
diff --git a/swh/loader/package/debian.py b/swh/loader/package/debian.py
index 0396c8d..eb2f74c 100644
--- a/swh/loader/package/debian.py
+++ b/swh/loader/package/debian.py
@@ -1,342 +1,356 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import email.utils
import iso8601
import logging
import re
import subprocess
from dateutil.parser import parse as parse_date
from debian.changelog import Changelog
from debian.deb822 import Dsc
from os import path
-from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
+from typing import (
+ Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple
+)
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import download
logger = logging.getLogger(__name__)
UPLOADERS_SPLIT = re.compile(r'(?<=\>)\s*,\s*')
class DebianLoader(PackageLoader):
"""Load debian origins into swh archive.
"""
visit_type = 'debian'
def __init__(self, url: str, date: str, packages: Mapping[str, Any]):
super().__init__(url=url)
self.packages = packages
def get_versions(self) -> Sequence[str]:
"""Returns the keys of the packages input (e.g.
stretch/contrib/0.7.2-3, etc...)
"""
return self.packages.keys()
def get_default_release(self) -> str:
"""Take the first version as default release
"""
return list(self.packages.keys())[0]
- def get_artifacts(self, version: str) -> Generator[
- Tuple[Mapping[str, Any], Dict], None, None]:
- a_metadata = self.packages[version]
- artifacts_package_info = a_metadata.copy()
- artifacts_package_info['filename'] = version
- yield artifacts_package_info, a_metadata
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
+ meta = self.packages[version]
+ p_info = meta.copy()
+ p_info['raw'] = meta
+ yield 'releases/%s' % version, p_info
def resolve_revision_from(
self, known_package_artifacts: Dict, artifact_metadata: Dict) \
-> Optional[bytes]:
artifacts_to_fetch = artifact_metadata['files']
logger.debug('k_p_artifacts: %s', known_package_artifacts)
logger.debug('artifacts_to_fetch: %s', artifacts_to_fetch)
for rev_id, known_artifacts in known_package_artifacts.items():
logger.debug('Revision: %s', rev_id)
logger.debug('Associated known_artifacts: %s', known_artifacts)
known_artifacts = known_artifacts['extrinsic']['raw']['files']
rev_found = True
for a_name, k_artifact in known_artifacts.items():
artifact_to_fetch = artifacts_to_fetch.get(a_name)
logger.debug('artifact_to_fetch: %s', artifact_to_fetch)
if artifact_to_fetch is None:
# as soon as we do not see an artifact, we consider we need
# to check the other revisions
rev_found = False
continue # nothing to compare against, skip the hash check
if k_artifact['sha256'] != artifact_to_fetch['sha256']:
# Hash is different, we consider we need to check the other
# revisions
rev_found = False
if rev_found:
logger.debug('Existing revision %s found for new artifacts.',
rev_id)
return rev_id
# if we reach this point, no known revision matched the new artifacts
logger.debug('No existing revision found for the new artifacts.')
- def download_package(self, a_p_info: str, tmpdir: str) -> Tuple[str, Dict]:
+ def download_package(self, p_info: Mapping[str, Any],
+ tmpdir: str) -> List[Tuple[str, Dict]]:
"""Contrary to other package loaders (1 package, 1 artifact),
`p_info` represents the package's set of data files to fetch:
- <package-version>.orig.tar.gz
- <package-version>.dsc
- <package-version>.diff.gz
This is delegated to the `download_package` function.
"""
- logger.debug('debian: artifactS_package_info: %s', a_p_info)
- return tmpdir, download_package(a_p_info, tmpdir)
-
- def uncompress(self, a_path: str, tmpdir: str, a_metadata: Dict) -> str:
- return extract_package(a_metadata, tmpdir)
-
- def read_intrinsic_metadata(self, a_metadata: Dict,
- a_uncompressed_path: str) -> Dict:
- _, dsc_name = dsc_information(a_metadata)
- dsc_path = path.join(path.dirname(a_uncompressed_path), dsc_name)
- return get_package_metadata(
- a_metadata, dsc_path, a_uncompressed_path)
-
- def build_revision(
- self, a_metadata: Dict, i_metadata: Dict) -> Dict:
- dsc_url, _ = dsc_information(a_metadata)
+ all_hashes = download_package(p_info, tmpdir)
+ logger.debug('all_hashes: %s', all_hashes)
+ res = []
+ for hashes in all_hashes.values():
+ res.append((tmpdir, hashes))
+ logger.debug('res: %s', res)
+ return res
+
+ def uncompress(self, dl_artifacts: List[Tuple[str, Dict]], dest: str) -> str:
+ logger.debug('dl_artifacts: %s', dl_artifacts)
+ return extract_package(dl_artifacts, dest=dest)
+
+ def build_revision(self, a_metadata: Mapping[str, Any],
+ uncompressed_path: str) -> Dict:
+ dsc_url, dsc_name = dsc_information(a_metadata)
+ dsc_path = path.join(path.dirname(uncompressed_path), dsc_name)
+ i_metadata = get_package_metadata(
+ a_metadata, dsc_path, uncompressed_path)
+
logger.debug('i_metadata: %s', i_metadata)
logger.debug('a_metadata: %s', a_metadata)
msg = 'Synthetic revision for Debian source package %s version %s' % (
a_metadata['name'], a_metadata['version'])
date = iso8601.parse_date(i_metadata['changelog']['date'])
author = prepare_person(i_metadata['changelog']['person'])
# inspired from swh.loader.debian.converters.package_metadata_to_revision # noqa
return {
'type': 'dsc',
'message': msg.encode('utf-8'),
'author': author,
'date': date,
'committer': author,
'committer_date': date,
'parents': [],
'metadata': {
'intrinsic': {
'tool': 'dsc',
'raw': i_metadata,
},
'extrinsic': {
'provider': dsc_url,
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
}
}
def uid_to_person(uid: str) -> Mapping[str, str]:
"""Convert an uid to a person suitable for insertion.
Args:
uid: an uid of the form "Name <email@ddress>"
Returns:
a dictionary with the following keys:
- name: the name associated to the uid
- email: the mail associated to the uid
- fullname: the actual uid input
"""
logger.debug('uid: %s', uid)
ret = {
'name': '',
'email': '',
'fullname': uid,
}
name, mail = email.utils.parseaddr(uid)
if name and mail:
ret['name'] = name
ret['email'] = mail
else:
ret['name'] = uid
return ret
def prepare_person(person: Mapping[str, str]) -> Mapping[str, bytes]:
"""Prepare person for swh serialization...
Args:
person: A person dict
Returns:
A person dict ready for storage
"""
ret = {}
for key, value in person.items():
ret[key] = value.encode('utf-8')
return ret
def download_package(
package: Mapping[str, Any], tmpdir: Any) -> Mapping[str, Any]:
"""Fetch a source package in a temporary directory and check the checksums
for all files.
Args:
package: Dict defining the set of files representing a debian package
tmpdir: Where to download and extract the files to ingest
Returns:
Dict of swh hashes per filename key
"""
all_hashes = {}
for filename, fileinfo in package['files'].items():
uri = fileinfo['uri']
logger.debug('fileinfo: %s', fileinfo)
extrinsic_hashes = {'sha256': fileinfo['sha256']}
logger.debug('extrinsic_hashes(%s): %s', filename, extrinsic_hashes)
filepath, hashes = download(uri, dest=tmpdir, filename=filename,
hashes=extrinsic_hashes)
all_hashes[filename] = hashes
logger.debug('all_hashes: %s', all_hashes)
return all_hashes
def dsc_information(package: Mapping[str, Any]) -> Tuple[str, str]:
"""Retrieve dsc information from a package.
Args:
package: Package metadata information
Returns:
Tuple of dsc file's uri, dsc's full disk path
"""
dsc_name = None
dsc_url = None
for filename, fileinfo in package['files'].items():
if filename.endswith('.dsc'):
if dsc_name:
raise ValueError(
'Package %s_%s references several dsc files' %
(package['name'], package['version'])
)
dsc_url = fileinfo['uri']
dsc_name = filename
return dsc_url, dsc_name
-def extract_package(package: Mapping[str, Any], tmpdir: str) -> str:
+def extract_package(dl_artifacts: List[Tuple[str, Dict]], dest: str) -> str:
"""Extract a Debian source package to a given directory.
Note that after extraction the target directory will be the root of the
extracted package, rather than containing it.
Args:
- package (dict): package information dictionary
- tmpdir (str): directory where the package files are stored
+ dl_artifacts: list of (download directory, artifact hashes dict) tuples
+ dest: directory where the package gets extracted
Returns:
Package extraction directory
"""
- _, dsc_name = dsc_information(package)
- dsc_path = path.join(tmpdir, dsc_name)
- destdir = path.join(tmpdir, 'extracted')
- logfile = path.join(tmpdir, 'extract.log')
+ a_path = dl_artifacts[0][0]
+ logger.debug('dl_artifacts: %s', dl_artifacts)
+ for _, hashes in dl_artifacts:
+ logger.debug('hashes: %s', hashes)
+ filename = hashes['filename']
+ if filename.endswith('.dsc'):
+ dsc_name = filename
+ break
+ else:
+ raise ValueError('No .dsc file found in %s' % dl_artifacts)
+
+ dsc_path = path.join(a_path, dsc_name)
+ destdir = path.join(dest, 'extracted')
+ logfile = path.join(dest, 'extract.log')
logger.debug('extract Debian source package %s in %s' %
(dsc_path, destdir), extra={
'swh_type': 'deb_extract',
'swh_dsc': dsc_path,
'swh_destdir': destdir,
})
cmd = ['dpkg-source',
'--no-copy', '--no-check',
'--ignore-bad-version',
'-x', dsc_path,
destdir]
try:
with open(logfile, 'w') as stdout:
subprocess.check_call(cmd, stdout=stdout, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
with open(logfile, 'r') as f:
logdata = f.read()
raise ValueError('dpkg-source exited with code %s: %s' %
(e.returncode, logdata)) from None
return destdir
def get_package_metadata(package: Mapping[str, Any], dsc_path: str,
extracted_path: str) -> Mapping[str, Any]:
"""Get the package metadata from the source package at dsc_path,
extracted in extracted_path.
Args:
package: the package dict (with a dsc_path key)
dsc_path: path to the package's dsc file
extracted_path: the path where the package got extracted
Returns:
dict: a dictionary with the following keys:
- history: list of (package_name, package_version) tuples parsed from
the package changelog
"""
with open(dsc_path, 'rb') as dsc:
parsed_dsc = Dsc(dsc)
# Parse the changelog to retrieve the rest of the package information
changelog_path = path.join(extracted_path, 'debian/changelog')
with open(changelog_path, 'rb') as changelog:
try:
parsed_changelog = Changelog(changelog)
except UnicodeDecodeError:
logger.warning('Unknown encoding for changelog %s,'
' falling back to iso' %
changelog_path, extra={
'swh_type': 'deb_changelog_encoding',
'swh_name': package['name'],
'swh_version': str(package['version']),
'swh_changelog': changelog_path,
})
})
# need to reset as Changelog scrolls to the end of the file
changelog.seek(0)
parsed_changelog = Changelog(changelog, encoding='iso-8859-15')
package_info = {
'name': package['name'],
'version': str(package['version']),
'changelog': {
'person': uid_to_person(parsed_changelog.author),
'date': parse_date(parsed_changelog.date).isoformat(),
'history': [(block.package, str(block.version))
for block in parsed_changelog][1:],
}
}
maintainers = [
uid_to_person(parsed_dsc['Maintainer']),
]
maintainers.extend(
uid_to_person(person)
for person in UPLOADERS_SPLIT.split(parsed_dsc.get('Uploaders', ''))
)
package_info['maintainers'] = maintainers
return package_info
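
Note (reviewer sketch, not part of this patch): the uid parsing above delegates to email.utils.parseaddr, with a fallback when the uid cannot be split. A minimal standalone version of that logic, with the `name and mail` fix applied; the helper name to_person is hypothetical:

import email.utils

def to_person(uid: str) -> dict:
    # parseaddr returns ('', '') or ('', uid) when it cannot split a uid
    name, mail = email.utils.parseaddr(uid)
    if name and mail:
        return {'name': name, 'email': mail, 'fullname': uid}
    # fallback: keep the whole uid as the name
    return {'name': uid, 'email': '', 'fullname': uid}

assert to_person('Someone Name <someone@orga.org>') == {
    'name': 'Someone Name',
    'email': 'someone@orga.org',
    'fullname': 'Someone Name <someone@orga.org>',
}
assert to_person('no-address-here')['name'] == 'no-address-here'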
diff --git a/swh/loader/package/deposit.py b/swh/loader/package/deposit.py
index 1a74c0a..160819c 100644
--- a/swh/loader/package/deposit.py
+++ b/swh/loader/package/deposit.py
@@ -1,153 +1,154 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
-from typing import Dict, Generator, Mapping, Sequence, Tuple
+from typing import Any, Dict, Generator, Mapping, Sequence, Tuple
from swh.model.hashutil import hash_to_hex
from swh.loader.package.loader import PackageLoader
from swh.deposit.client import PrivateApiDepositClient as ApiClient
logger = logging.getLogger(__name__)
class DepositLoader(PackageLoader):
"""Load pypi origin's artifact releases into swh archive.
"""
visit_type = 'deposit'
def __init__(self, url: str, deposit_id: str):
"""Constructor
Args:
url: Origin url to associate the artifacts/metadata to
deposit_id: Deposit identity
"""
super().__init__(url=url)
# For now build back existing api urls
# archive_url: Private api url to retrieve archive artifact
self.archive_url = '/%s/raw/' % deposit_id
# metadata_url: Private api url to retrieve the deposit metadata
self.metadata_url = '/%s/meta/' % deposit_id
# deposit_update_url: Private api to push pids and status update on the
# deposit id
self.deposit_update_url = '/%s/update/' % deposit_id
self.client = ApiClient()
self._metadata = None
@property
def metadata(self):
if self._metadata is None:
self._metadata = self.client.metadata_get(self.metadata_url)
return self._metadata
def get_versions(self) -> Sequence[str]:
# only 1 branch 'HEAD' with no alias since we only have 1 snapshot
# branch
return ['HEAD']
- def get_artifacts(self, version: str) -> Generator[
- Tuple[Mapping[str, str], Dict], None, None]:
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
- artifact_package_info = {
+ p_info = {
'url': self.client.base_url + self.archive_url,
'filename': 'archive.zip',
+ 'raw': self.metadata,
}
- yield artifact_package_info, self.metadata
+ yield 'HEAD', p_info
def build_revision(
- self, a_metadata: Dict, i_metadata: Dict) -> Dict:
+ self, a_metadata: Dict, uncompressed_path: str) -> Dict:
revision = a_metadata.pop('revision')
metadata = {
'extrinsic': {
'provider': '%s/%s' % (
self.client.base_url, self.metadata_url),
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
}
# FIXME: the deposit no longer needs to build the revision
revision['metadata'].update(metadata)
revision['author'] = parse_author(revision['author'])
revision['committer'] = parse_author(revision['committer'])
revision['message'] = revision['message'].encode('utf-8')
revision['type'] = 'tar'
return revision
def load(self) -> Dict:
# Usual loading
r = super().load()
success = r['status'] != 'failed'
if success:
# Update archive with metadata information
origin_metadata = self.metadata['origin_metadata']
logger.debug('origin_metadata: %s', origin_metadata)
tools = self.storage.tool_add([origin_metadata['tool']])
logger.debug('tools: %s', tools)
tool_id = tools[0]['id']
provider = origin_metadata['provider']
# FIXME: Shall we delete this info?
provider_id = self.storage.metadata_provider_add(
provider['provider_name'],
provider['provider_type'],
provider['provider_url'],
metadata=None)
metadata = origin_metadata['metadata']
self.storage.origin_metadata_add(
self.url, self.visit_date, provider_id, tool_id, metadata)
# Update deposit status
try:
if not success:
self.client.status_update(
self.deposit_update_url, status='failed')
return r
snapshot_id = r['snapshot_id']
branches = self.storage.snapshot_get(snapshot_id)['branches']
logger.debug('branches: %s', branches)
if not branches:
return r
rev_id = branches[b'HEAD']['target']
revision = next(self.storage.revision_get([rev_id]))
# Retrieve the revision identifier
dir_id = revision['directory']
# update the deposit's status to success with its
# revision-id and directory-id
self.client.status_update(
self.deposit_update_url,
status='done',
revision_id=hash_to_hex(rev_id),
directory_id=hash_to_hex(dir_id),
origin_url=self.url)
except Exception:
logger.exception(
'Problem when trying to update the deposit\'s status')
return {'status': 'failed'}
return r
def parse_author(author):
"""See prior fixme
"""
return {
'fullname': author['fullname'].encode('utf-8'),
'name': author['name'].encode('utf-8'),
'email': author['email'].encode('utf-8'),
}
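
Note (sketch, not part of this patch): parse_author above is a plain str-to-bytes conversion of the three author fields, since the storage layer expects bytes. It is equivalent to the following, shown on an illustrative author dict:

author = {
    'fullname': 'Jane Doe <jane@example.org>',
    'name': 'Jane Doe',
    'email': 'jane@example.org',
}
# encode every field to utf-8 bytes, as parse_author does field by field
encoded = {k: v.encode('utf-8') for k, v in author.items()}
assert encoded['fullname'] == b'Jane Doe <jane@example.org>'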
diff --git a/swh/loader/package/gnu.py b/swh/loader/package/gnu.py
index ce194c6..189041c 100644
--- a/swh/loader/package/gnu.py
+++ b/swh/loader/package/gnu.py
@@ -1,191 +1,195 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import re
from os import path
-from typing import Dict, Generator, Mapping, Optional, Sequence, Tuple
+from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
from swh.loader.package.loader import PackageLoader
from swh.model.identifiers import normalize_timestamp
logger = logging.getLogger(__name__)
# to recognize existing naming pattern
extensions = [
'zip',
'tar',
'gz', 'tgz',
'bz2', 'bzip2',
'lzma', 'lz',
'xz',
'Z',
]
version_keywords = [
'cygwin_me',
'w32', 'win32', 'nt', 'cygwin', 'mingw',
'latest', 'alpha', 'beta',
'release', 'stable',
'hppa',
'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
'aix', 'ibm', 'rs6000',
'i386', 'i686',
'linux', 'redhat', 'linuxlibc',
'mips',
'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
'unknown',
'netbsd', 'freebsd',
'sgi', 'irix',
]
# Match a filename into components.
#
# We use Debian's release number heuristic: A release number starts
# with a digit, and is followed by alphanumeric characters or any of
# ., +, :, ~ and -
#
# We hardcode a list of possible extensions, as this release number
# scheme would match them too... We match on any combination of those.
#
# Greedy matching is done right to left (we only match the extension
# greedily with +, software_name and release_number are matched lazily
# with +? and *?).
pattern = r'''
^
(?:
# We have a software name and a release number, separated with a
# -, _ or dot.
(?P<software_name1>.+?[-_.])
(?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
|
# We couldn't match a release number, put everything in the
# software name.
(?P<software_name2>.+?)
)
(?P<extension>(?:\.(?:%(extensions)s))+)
$
''' % {
'extensions': '|'.join(extensions),
'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords),
}
def get_version(url: str) -> str:
"""Extract branch name from tarball url
Args:
url (str): Tarball URL
Returns:
byte: Branch name
Example:
For url = https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz
>>> get_version(url)
'0.2.0'
"""
filename = path.split(url)[-1]
m = re.match(pattern, filename,
flags=re.VERBOSE | re.IGNORECASE)
if m:
d = m.groupdict()
if d['software_name1'] and d['release_number']:
return d['release_number']
if d['software_name2']:
return d['software_name2']
return ''
class GNULoader(PackageLoader):
visit_type = 'gnu'
SWH_PERSON = {
'name': b'Software Heritage',
'fullname': b'Software Heritage',
'email': b'robot@softwareheritage.org'
}
REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
def __init__(self, package_url: str, tarballs: Sequence):
"""Loader constructor.
For now, this is the lister's task output.
Args:
package_url: Origin url
tarballs: List of dicts with keys `time` (timestamp) and `archive`
(str), the url to retrieve one versioned archive
"""
super().__init__(url=package_url)
self.tarballs = list(sorted(tarballs, key=lambda v: v['time']))
def get_versions(self) -> Sequence[str]:
versions = []
for archive in self.tarballs:
v = get_version(archive['archive'])
if v:
versions.append(v)
return versions
def get_default_release(self) -> str:
# It's the most recent, so for this loader, it's the last one
return get_version(self.tarballs[-1]['archive'])
- def get_artifacts(self, version: str) -> Generator[
- Tuple[Mapping[str, str], Dict], None, None]:
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
for a_metadata in self.tarballs:
url = a_metadata['archive']
- artifact_version = get_version(url)
- if version == artifact_version:
- artifact_package_info = {
+ package_version = get_version(url)
+ if version == package_version:
+ p_info = {
'url': url,
- 'filename': path.split(url)[-1]
+ 'filename': path.split(url)[-1],
+ 'raw': a_metadata,
}
- yield artifact_package_info, a_metadata
+ # FIXME: this code assumes we have only 1 artifact per
+ # versioned package
+ yield 'releases/%s' % version, p_info
def resolve_revision_from(
self, known_artifacts: Dict, artifact_metadata: Dict) \
-> Optional[bytes]:
def pk(d):
return [d.get(k) for k in ['time', 'archive', 'length']]
artifact_pk = pk(artifact_metadata)
for rev_id, known_artifact in known_artifacts.items():
logging.debug('known_artifact: %s', known_artifact)
known_pk = pk(known_artifact['extrinsic']['raw'])
if artifact_pk == known_pk:
return rev_id
def build_revision(
- self, a_metadata: Dict, i_metadata: Dict) -> Dict:
+ self, a_metadata: Mapping[str, Any],
+ uncompressed_path: str) -> Dict:
normalized_date = normalize_timestamp(int(a_metadata['time']))
return {
'type': 'tar',
'message': self.REVISION_MESSAGE,
'date': normalized_date,
'author': self.SWH_PERSON,
'committer': self.SWH_PERSON,
'committer_date': normalized_date,
'parents': [],
'metadata': {
'intrinsic': {},
'extrinsic': {
'provider': self.url,
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
},
}
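
Note (standalone sketch, not part of this patch): the filename-to-version heuristic used by get_version can be exercised in isolation. Here the extension list is cut down to a handful of entries and the version keywords are omitted; the helper name version_of is hypothetical:

import re

extensions = ['zip', 'tar', 'gz', 'tgz', 'bz2', 'xz']
pattern = r'''
^
(?:
    # software name, then a release number starting with a digit
    (?P<software_name1>.+?[-_.])
    (?P<release_number>[0-9][0-9a-zA-Z_.+:~-]*?)
    |
    # no release number: everything is the software name
    (?P<software_name2>.+?)
)
(?P<extension>(?:\.(?:%s))+)
$
''' % '|'.join(extensions)

def version_of(filename: str) -> str:
    m = re.match(pattern, filename, flags=re.VERBOSE | re.IGNORECASE)
    if not m:
        return ''
    d = m.groupdict()
    if d['software_name1'] and d['release_number']:
        return d['release_number']
    return d['software_name2'] or ''

assert version_of('8sync-0.2.0.tar.gz') == '0.2.0'
assert version_of('gnu-pw-mgr-2.0.tar.xz') == '2.0'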
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
index 061c8b3..2c5a083 100644
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -1,409 +1,384 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import logging
import tempfile
import os
-from typing import Dict, Generator, List, Mapping, Optional, Sequence, Tuple
+from typing import (
+ Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple
+)
from swh.core.tarball import uncompress
from swh.core.config import SWHConfig
from swh.model.from_disk import Directory
from swh.model.identifiers import (
revision_identifier, snapshot_identifier, identifier_to_bytes
)
from swh.storage import get_storage
from swh.storage.algos.snapshot import snapshot_get_all_branches
from swh.loader.core.converters import content_for_storage
from swh.loader.package.utils import download
logger = logging.getLogger(__name__)
# Not implemented yet:
# - clean up disk routines from previous killed workers (when OOMkilled)
# -> separation of concern would like this to be abstracted from the code
# -> experience tells us it's complicated to do as such (T903, T964, T982,
# etc...)
#
# - model: swh.model.merkle.from_disk should output swh.model.model.* objects
# to avoid this layer's conversion routine call
# -> Take this up within swh.model's current implementation
class PackageLoader:
# Origin visit type (str) set by the loader
visit_type = ''
def __init__(self, url):
"""Loader's constructor. This raises exception if the minimal required
configuration is missing (cf. fn:`check` method).
Args:
url (str): Origin url to load data from
"""
# This expects to use the environment variable SWH_CONFIG_FILENAME
self.config = SWHConfig.parse_config_file()
self._check_configuration()
self.storage = get_storage(**self.config['storage'])
self.url = url
self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
def _check_configuration(self):
"""Checks the minimal configuration required is set for the loader.
If some required configuration is missing, exception detailing the
issue is raised.
"""
if 'storage' not in self.config:
raise ValueError(
'Misconfiguration, at least the storage key should be set')
def get_versions(self) -> Sequence[str]:
"""Return the list of all published package versions.
Returns:
Sequence of published versions
"""
return []
- def get_artifacts(self, version: str) -> Generator[
- Tuple[str, str, Dict], None, None]:
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
"""Given a release version of a package, retrieve the associated
- artifact information for such version.
+ package information for such version.
Args:
version: Package version
Returns:
- (artifact filename, artifact uri, raw artifact metadata)
+ (branch name, package metadata)
"""
yield from {}
- def build_revision(
- self, a_metadata: Dict, i_metadata: Dict) -> Dict:
- """Build the revision dict from the archive metadata (extrinsic
- artifact metadata) and the intrinsic metadata.
+ def build_revision(self, a_metadata: Dict,
+ uncompressed_path: str) -> Dict:
+ """Build the revision dict from the archive metadata (extrinsic
+ artifact metadata) and the intrinsic metadata read from
+ uncompressed_path.
Returns:
SWH data dict
"""
return {}
def get_default_release(self) -> str:
"""Retrieve the latest release version
Returns:
Latest version
"""
return ''
def last_snapshot(self) -> Optional[Dict]:
"""Retrieve the last snapshot
"""
visit = self.storage.origin_visit_get_latest(
self.url, require_snapshot=True)
if visit:
return snapshot_get_all_branches(
self.storage, visit['snapshot']['id'])
def known_artifacts(self, snapshot: Dict) -> Dict:
"""Retrieve the known releases/artifacts for the origin.
Args:
snapshot: snapshot for the visit
Returns:
Dict of keys revision id (bytes), values a metadata Dict.
"""
if not snapshot or 'branches' not in snapshot:
return {}
# retrieve only revisions (e.g the alias we do not want here)
revs = [rev['target']
for rev in snapshot['branches'].values()
if rev and rev['target_type'] == 'revision']
known_revisions = self.storage.revision_get(revs)
ret = {}
for revision in known_revisions:
if not revision: # revision_get can return None
continue
ret[revision['id']] = revision['metadata']
return ret
def resolve_revision_from(
self, known_artifacts: Dict, artifact_metadata: Dict) \
-> Optional[bytes]:
"""Resolve the revision from a snapshot and an artifact metadata dict.
If the artifact has already been downloaded, this will return the
existing revision targeting that uncompressed artifact directory.
Otherwise, this returns None.
Args:
known_artifacts: Artifacts known from a previous snapshot, keyed
by revision id
artifact_metadata: Information dict on the new artifact
Returns:
None or revision identifier
"""
return None
- def download_package(self, artifacts_package_info: Mapping[str, str],
- tmpdir: str) -> Tuple[str, Dict]:
- """Download artifacts for a specific package. All downloads happen in the
- the tmpdir folder.
+ def download_package(self, p_info: Mapping[str, Any],
+ tmpdir: str) -> List[Tuple[str, Dict]]:
+ """Download artifacts for a specific package. All downloads happen
+ in the tmpdir folder.
Default implementation expects the artifacts package info to be
about one artifact per package.
Note that most implementations have 1 artifact per package. But some
implementations have multiple artifacts per package (debian), and for
some there is none, the package being the artifact (gnu).
Args:
p_info: Information on the package artifacts to
- download (uri, filename, etc...)
+ download (url, filename, etc...)
tmpdir: Location to retrieve such artifacts
- Note:
-
- """
- a_uri = artifacts_package_info['url']
- filename = artifacts_package_info.get('filename')
- return download(a_uri, dest=tmpdir, filename=filename)
-
- def read_intrinsic_metadata(
- self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
- """Read intrinsic metadata from either the a_metadata or
- the uncompressed path.
-
- Depending on the implementations, some extracts directly from the
- artifacts to ingest (pypi, npm...), some use api to access directly
- their intrinsic metadata (debian exposes a dsc through uri) or some
- have none (gnu).
+ Returns:
+ List of (path, computed hashes)
"""
- return {}
+ a_uri = p_info['url']
+ filename = p_info.get('filename')
+ return [download(a_uri, dest=tmpdir, filename=filename)]
- def uncompress(
- self, a_path: str, tmpdir: str, a_metadata: Dict) -> str:
- """Uncompress the artfifact(s) stored at a_path to tmpdir.
+ def uncompress(self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]],
+ dest: str) -> str:
+ """Uncompress the artifact(s) in the destination folder dest.
- Optionally, this could need to use the a_metadata dict for some more
- information (debian).
+ Optionally, some loaders (e.g. debian) may need extra information
+ from the p_info dict.
"""
- uncompressed_path = os.path.join(tmpdir, 'src')
- uncompress(a_path, dest=uncompressed_path)
+ uncompressed_path = os.path.join(dest, 'src')
+ for a_path, _ in dl_artifacts:
+ uncompress(a_path, dest=uncompressed_path)
return uncompressed_path
def load(self) -> Dict:
"""Load for a specific origin the associated contents.
for each package version of the origin
1. Fetch the files for one package version. By default, this can be
implemented as a simple HTTP request. Loaders with more specific
requirements can override this, e.g.: the PyPI loader checks the
integrity of the downloaded files; the Debian loader has to download
and check several files for one package version.
2. Extract the downloaded files. By default, this would be a universal
archive/tarball extraction.
Loaders for specific formats can override this method (for instance,
the Debian loader uses dpkg-source -x).
3. Convert the extracted directory to a set of Software Heritage
objects, using swh.model.from_disk.
4. Extract the metadata from the unpacked directories. This would only
be applicable for "smart" loaders like npm (parsing the
package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing
debian/changelog and debian/control).
On "minimal-metadata" sources such as the GNU archive, the lister
should provide the minimal set of metadata needed to populate the
revision/release objects (authors, dates) as an argument to the
task.
5. Generate the revision/release objects for the given version, from
the data generated at steps 3 and 4.
end for each
6. Generate and load the snapshot for the visit: using the
revisions/releases collected at step 5 and the branch information
gathered above, generate a snapshot and load it into the
Software Heritage archive
"""
status_load = 'uneventful' # either: eventful, uneventful, failed
status_visit = 'full' # either: partial, full
tmp_revisions = {} # type: Dict[str, List]
snapshot = None
try:
# Prepare origin and origin_visit
origin = {'url': self.url}
self.storage.origin_add([origin])
visit_id = self.storage.origin_visit_add(
origin=self.url,
date=self.visit_date,
type=self.visit_type)['visit']
last_snapshot = self.last_snapshot()
logger.debug('last snapshot: %s', last_snapshot)
known_artifacts = self.known_artifacts(last_snapshot)
logger.debug('known artifacts: %s', known_artifacts)
# Retrieve the default release (the "latest" one)
default_release = self.get_default_release()
logger.debug('default release: %s', default_release)
for version in self.get_versions(): # for each
logger.debug('version: %s', version)
tmp_revisions[version] = []
- # `a_` stands for `artifact(s)_`, `p_` stands for `package_`
- for a_p_info, a_metadata in self.get_artifacts(version):
- logger.debug('a_p_info: %s', a_p_info)
- logger.debug('a_metadata: %s', a_metadata)
+ # `p_` stands for `package_`
+ for branch_name, p_info in self.get_package_info(version):
+ logger.debug('package_info: %s', p_info)
revision_id = self.resolve_revision_from(
- known_artifacts, a_metadata)
+ known_artifacts, p_info['raw'])
if revision_id is None:
with tempfile.TemporaryDirectory() as tmpdir:
try:
- # a_c_: archive_computed_
- a_path, a_c_metadata = self.download_package(
- a_p_info, tmpdir)
+ dl_artifacts = self.download_package(
+ p_info, tmpdir)
except Exception:
logger.exception('Unable to retrieve %s',
- a_p_info['url'])
+ p_info)
status_visit = 'partial'
continue
- logger.debug('archive_path: %s', a_path)
- logger.debug('archive_computed_metadata: %s',
- a_c_metadata)
-
uncompressed_path = self.uncompress(
- a_path, tmpdir, a_metadata)
+ dl_artifacts, dest=tmpdir)
logger.debug('uncompressed_path: %s',
uncompressed_path)
directory = Directory.from_disk(
path=uncompressed_path.encode('utf-8'),
data=True) # noqa
# FIXME: Try not to load the full raw content in
# memory
objects = directory.collect()
contents = objects['content'].values()
logger.debug('Number of contents: %s',
len(contents))
self.storage.content_add(
map(content_for_storage, contents))
status_load = 'eventful'
directories = objects['directory'].values()
logger.debug('Number of directories: %s',
len(directories))
self.storage.directory_add(directories)
- i_metadata = self.read_intrinsic_metadata(
- a_metadata, uncompressed_path)
-
# FIXME: This should be release. cf. D409
revision = self.build_revision(
- a_metadata, i_metadata)
+ p_info['raw'], uncompressed_path)
revision.update({
'synthetic': True,
'directory': directory.hash,
})
revision['metadata'].update({
- 'original_artifact': a_c_metadata,
+ 'original_artifact': [
+ hashes for _, hashes in dl_artifacts
+ ],
})
revision['id'] = revision_id = identifier_to_bytes(
revision_identifier(revision))
logger.debug('Revision: %s', revision)
self.storage.revision_add([revision])
- tmp_revisions[version].append(
- (a_p_info['filename'], revision_id))
+ tmp_revisions[version].append((branch_name, revision_id))
# Build and load the snapshot
branches = {}
- for version, v_branches in tmp_revisions.items():
- if len(v_branches) == 1:
- branch_name = (
- version if version == 'HEAD'
- else 'releases/%s' % version).encode('utf-8')
- if version == default_release:
+ for version, branch_name_revisions in tmp_revisions.items():
+ if len(branch_name_revisions) == 1:
+ branch_name, target = branch_name_revisions[0]
+ if branch_name != 'HEAD':
branches[b'HEAD'] = {
'target_type': 'alias',
- 'target': branch_name,
+ 'target': branch_name.encode('utf-8'),
}
+ for branch_name, target in branch_name_revisions:
+ branch_name = branch_name.encode('utf-8')
branches[branch_name] = {
'target_type': 'revision',
- 'target': v_branches[0][1],
+ 'target': target,
}
- else:
- for filename, target in v_branches:
- branch_name = ('releases/%s/%s' % (
- version, filename)).encode('utf-8')
- branches[branch_name] = {
- 'target_type': 'revision',
- 'target': target,
- }
snapshot = {
'branches': branches
}
logger.debug('snapshot: %s', snapshot)
snapshot['id'] = identifier_to_bytes(
snapshot_identifier(snapshot))
logger.debug('snapshot: %s', snapshot)
self.storage.snapshot_add([snapshot])
if hasattr(self.storage, 'flush'):
self.storage.flush()
except Exception:
logger.exception('Failed to load %s', self.url)
status_visit = 'partial'
status_load = 'failed'
finally:
self.storage.origin_visit_update(
origin=self.url, visit_id=visit_id, status=status_visit,
snapshot=snapshot)
result = {
'status': status_load,
}
if snapshot:
result['snapshot_id'] = snapshot['id']
return result
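
Note (toy sketch, not part of this patch): the new snapshot branch-building step, isolated as a pure function (the name build_branches is hypothetical), illustrates the branch naming scheme: one branch per collected revision, plus a 'HEAD' alias when a version has a single branch:

from typing import Dict, List, Tuple

def build_branches(
        tmp_revisions: Dict[str, List[Tuple[str, bytes]]]) -> Dict:
    branches = {}  # type: Dict[bytes, Dict]
    for version, branch_name_revisions in tmp_revisions.items():
        if len(branch_name_revisions) == 1:
            branch_name, target = branch_name_revisions[0]
            if branch_name != 'HEAD':
                # alias HEAD to the single branch for that version
                branches[b'HEAD'] = {
                    'target_type': 'alias',
                    'target': branch_name.encode('utf-8'),
                }
        for branch_name, target in branch_name_revisions:
            branches[branch_name.encode('utf-8')] = {
                'target_type': 'revision',
                'target': target,
            }
    return branches

branches = build_branches({'1.0': [('releases/1.0', b'\x01' * 20)]})
assert branches[b'HEAD'] == {'target_type': 'alias',
                             'target': b'releases/1.0'}

Note that, as in the patched code, when several versions each carry a single branch the HEAD alias is overwritten on each iteration, so the last version inserted wins.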
diff --git a/swh/loader/package/npm.py b/swh/loader/package/npm.py
index ae1fa3c..09cdfdd 100644
--- a/swh/loader/package/npm.py
+++ b/swh/loader/package/npm.py
@@ -1,298 +1,295 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import logging
import os
import re
from codecs import BOM_UTF8
-from typing import Dict, Generator, Mapping, Sequence, Tuple, Optional
+from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional
import chardet
import iso8601
from swh.model.identifiers import normalize_timestamp
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import api_info
logger = logging.getLogger(__name__)
_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
# https://github.com/jonschlinkert/author-regex
_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
def parse_npm_package_author(author_str):
"""
Parse npm package author string.
It works with a flexible range of formats, as detailed below::
name
name <email> (url)
name <email>(url)
name<email> (url)
name<email>(url)
name (url) <email>
name (url)<email>
name(url) <email>
name(url)<email>
name (url)
name(url)
name <email>
name<email>
<email> (url)
<email>(url)
(url) <email>
(url)<email>
<email>
(url)
Args:
author_str (str): input author string
Returns:
dict: A dict that may contain the following keys:
* name
* email
* url
"""
author = {}
matches = re.findall(_author_regexp,
author_str.replace('<>', '').replace('()', ''),
re.M)
for match in matches:
if match[0].strip():
author['name'] = match[0].strip()
if match[1].strip():
author['email'] = match[1].strip()
if match[2].strip():
author['url'] = match[2].strip()
return author
def extract_npm_package_author(package_json):
"""
Extract package author from a ``package.json`` file content and
return it in swh format.
Args:
package_json (dict): Dict holding the content of parsed
``package.json`` file
Returns:
dict: A dict with the following keys:
* fullname
* name
* email
"""
def _author_str(author_data):
if type(author_data) is dict:
author_str = ''
if 'name' in author_data:
author_str += author_data['name']
if 'email' in author_data:
author_str += ' <%s>' % author_data['email']
return author_str
elif type(author_data) is list:
return _author_str(author_data[0]) if len(author_data) > 0 else ''
else:
return author_data
author_data = {}
for author_key in ('author', 'authors'):
if author_key in package_json:
author_str = _author_str(package_json[author_key])
author_data = parse_npm_package_author(author_str)
name = author_data.get('name')
email = author_data.get('email')
fullname = None
if name and email:
fullname = '%s <%s>' % (name, email)
elif name:
fullname = name
if not fullname:
return _EMPTY_AUTHOR
if fullname:
fullname = fullname.encode('utf-8')
if name:
name = name.encode('utf-8')
if email:
email = email.encode('utf-8')
return {'fullname': fullname, 'name': name, 'email': email}
def _lstrip_bom(s, bom=BOM_UTF8):
if s.startswith(bom):
return s[len(bom):]
else:
return s
def load_json(json_bytes):
"""
Try to load JSON from bytes and return a dictionary.
First try to decode from utf-8. If the decoding failed,
try to detect the encoding and decode again with replace
error handling.
If JSON is malformed, an empty dictionary will be returned.
Args:
json_bytes (bytes): binary content of a JSON file
Returns:
dict: JSON data loaded in a dictionary
"""
json_data = {}
try:
json_str = _lstrip_bom(json_bytes).decode('utf-8')
except UnicodeDecodeError:
encoding = chardet.detect(json_bytes)['encoding']
if not encoding:
return json_data
json_str = json_bytes.decode(encoding, 'replace')
try:
json_data = json.loads(json_str)
except json.decoder.JSONDecodeError:
pass
return json_data
def extract_intrinsic_metadata(dir_path: str) -> Dict:
"""Given an uncompressed path holding the pkginfo file, returns a
pkginfo parsed structure as a dict.
The release artifact contains at their root one folder. For example:
$ tar tvf zprint-0.0.6.tar.gz
drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/
...
Args:
dir_path (str): Path to the uncompressed directory
representing a release artifact from npm.
Returns:
the pkginfo parsed structure as a dict if any or None if
none was present.
"""
# Retrieve the root folder of the archive
if not os.path.exists(dir_path):
return {}
lst = os.listdir(dir_path)
if len(lst) == 0:
return {}
project_dirname = lst[0]
package_json_path = os.path.join(dir_path, project_dirname, 'package.json')
if not os.path.exists(package_json_path):
return {}
with open(package_json_path, 'rb') as package_json_file:
package_json_bytes = package_json_file.read()
return load_json(package_json_bytes)
class NpmLoader(PackageLoader):
visit_type = 'npm'
def __init__(self, package_name, package_url, package_metadata_url):
super().__init__(url=package_url)
self.provider_url = package_metadata_url
self._info = None
self._versions = None
# if package_url is None:
# package_url = 'https://www.npmjs.com/package/%s' % package_name
# if package_metadata_url is None:
# package_metadata_url = 'https://replicate.npmjs.com/%s/' %\
# quote(package_name, safe='')
@property
def info(self) -> Dict:
"""Return the project metadata information (fetched from npm registry)
"""
if not self._info:
self._info = api_info(self.provider_url)
return self._info
def get_versions(self) -> Sequence[str]:
return sorted(self.info['versions'].keys())
def get_default_release(self) -> str:
return self.info['dist-tags'].get('latest', '')
- def get_artifacts(self, version: str) -> Generator[
- Tuple[Mapping[str, str], Dict], None, None]:
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
meta = self.info['versions'][version]
url = meta['dist']['tarball']
- artifact_package_info = {
+ p_info = {
'url': url,
'filename': os.path.basename(url),
+ 'raw': meta,
}
- yield artifact_package_info, meta
+ yield 'releases/%s' % version, p_info
def resolve_revision_from(
self, known_artifacts: Dict, artifact_metadata: Dict) \
-> Optional[bytes]:
shasum = artifact_metadata['dist']['shasum']
for rev_id, known_artifact in known_artifacts.items():
- original_artifact = known_artifact['original_artifact']
+ original_artifact = known_artifact['original_artifact'][0]
if shasum == original_artifact['checksums']['sha1']:
return rev_id
- def read_intrinsic_metadata(self, a_metadata: Dict,
- a_uncompressed_path: str) -> Dict:
- return extract_intrinsic_metadata(a_uncompressed_path)
-
def build_revision(
- self, a_metadata: Dict, i_metadata: Dict) -> Dict:
-
+ self, a_metadata: Dict, uncompressed_path: str) -> Dict:
+ i_metadata = extract_intrinsic_metadata(uncompressed_path)
# from intrinsic metadata
author = extract_npm_package_author(i_metadata)
version = i_metadata['version']
# from extrinsic metadata
date = self.info['time'][version]
date = iso8601.parse_date(date)
date = normalize_timestamp(int(date.timestamp()))
message = version.encode('ascii')
return {
'type': 'tar',
'message': message,
'author': author,
'date': date,
'committer': author,
'committer_date': date,
'parents': [],
'metadata': {
'intrinsic': {
'tool': 'package.json',
'raw': i_metadata,
},
'extrinsic': {
'provider': self.provider_url,
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
},
}
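
Note (standalone sketch, not part of this patch): the author regex above can be checked in isolation with the same pattern and a minimal parse loop (the helper name parse is hypothetical):

import re

_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'

def parse(author_str):
    author = {}
    # same pre-cleaning and group handling as parse_npm_package_author
    matches = re.findall(_author_regexp,
                         author_str.replace('<>', '').replace('()', ''),
                         re.M)
    for match in matches:
        if match[0].strip():
            author['name'] = match[0].strip()
        if match[1].strip():
            author['email'] = match[1].strip()
        if match[2].strip():
            author['url'] = match[2].strip()
    return author

assert parse('John Doe <jdoe@example.org> (http://example.org)') == {
    'name': 'John Doe',
    'email': 'jdoe@example.org',
    'url': 'http://example.org',
}
assert parse('John Doe') == {'name': 'John Doe'}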
diff --git a/swh/loader/package/pypi.py b/swh/loader/package/pypi.py
index 156ab2f..820b79c 100644
--- a/swh/loader/package/pypi.py
+++ b/swh/loader/package/pypi.py
@@ -1,186 +1,193 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
-from typing import Generator, Dict, Mapping, Optional, Sequence, Tuple
+from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
from urllib.parse import urlparse
from pkginfo import UnpackedSDist
import iso8601
from swh.model.identifiers import normalize_timestamp
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import api_info
def pypi_api_url(url: str) -> str:
"""Compute api url from a project url
Args:
url (str): PyPI instance's url (e.g: https://pypi.org/project/requests)
This deals with correctly transforming the project's api url (e.g
https://pypi.org/pypi/requests/json)
Returns:
api url
"""
p_url = urlparse(url)
project_name = p_url.path.split('/')[-1]
url = '%s://%s/pypi/%s/json' % (p_url.scheme, p_url.netloc, project_name)
return url
def extract_intrinsic_metadata(dir_path: str) -> Dict:
"""Given an uncompressed path holding the pkginfo file, returns a
pkginfo parsed structure as a dict.
The release artifact contains at their root one folder. For example:
$ tar tvf zprint-0.0.6.tar.gz
drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/
...
Args:
dir_path (str): Path to the uncompressed directory
representing a release artifact from pypi.
Returns:
the pkginfo parsed structure as a dict if any or None if
none was present.
"""
# Retrieve the root folder of the archive
if not os.path.exists(dir_path):
return {}
lst = os.listdir(dir_path)
if len(lst) != 1:
return {}
project_dirname = lst[0]
pkginfo_path = os.path.join(dir_path, project_dirname, 'PKG-INFO')
if not os.path.exists(pkginfo_path):
return {}
pkginfo = UnpackedSDist(pkginfo_path)
raw = pkginfo.__dict__
raw.pop('filename') # this gets added with the ondisk location
return raw
def author(data: Dict) -> Dict:
"""Given a dict of project/release artifact information (coming from
PyPI), returns an author subset.
Args:
data (dict): Representing either artifact information or
release information.
Returns:
swh-model dict representing a person.
"""
name = data.get('author')
email = data.get('author_email')
if email:
fullname = '%s <%s>' % (name, email)
else:
fullname = name
if not fullname:
return {'fullname': b'', 'name': None, 'email': None}
fullname = fullname.encode('utf-8')
if name is not None:
name = name.encode('utf-8')
if email is not None:
email = email.encode('utf-8')
return {'fullname': fullname, 'name': name, 'email': email}
class PyPILoader(PackageLoader):
"""Load pypi origin's artifact releases into swh archive.
"""
visit_type = 'pypi'
def __init__(self, url):
super().__init__(url=url)
self._info = None
self.provider_url = pypi_api_url(self.url)
@property
def info(self) -> Dict:
"""Return the project metadata information (fetched from pypi registry)
"""
if not self._info:
self._info = api_info(self.provider_url)
return self._info
def get_versions(self) -> Sequence[str]:
return self.info['releases'].keys()
def get_default_release(self) -> str:
return self.info['info']['version']
- def get_artifacts(self, version: str) -> Generator[
- Tuple[Mapping[str, str], Dict], None, None]:
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
+ res = []
for meta in self.info['releases'][version]:
- artifact_package_info = {
+ filename = meta['filename']
+ p_info = {
'url': meta['url'],
- 'filename': meta['filename'],
+ 'filename': filename,
+ 'raw': meta,
}
- yield artifact_package_info, meta
+ res.append((version, p_info))
+
+ if len(res) == 1:
+ version, p_info = res[0]
+ yield 'releases/%s' % version, p_info
+ else:
+ for version, p_info in res:
+ yield 'releases/%s/%s' % (version, p_info['filename']), p_info
def resolve_revision_from(
self, known_artifacts: Dict, artifact_metadata: Dict) \
-> Optional[bytes]:
sha256 = artifact_metadata['digests']['sha256']
for rev_id, known_artifact in known_artifacts.items():
- original_artifact = known_artifact['original_artifact']
- if sha256 == original_artifact['checksums']['sha256']:
- return rev_id
-
- def read_intrinsic_metadata(self, a_metadata: Dict,
- a_uncompressed_path: str) -> Dict:
- return extract_intrinsic_metadata(a_uncompressed_path)
+ for original_artifact in known_artifact['original_artifact']:
+ if sha256 == original_artifact['checksums']['sha256']:
+ return rev_id
def build_revision(
- self, a_metadata: Dict, i_metadata: Dict) -> Dict:
+ self, a_metadata: Dict, uncompressed_path: str) -> Dict:
+ i_metadata = extract_intrinsic_metadata(uncompressed_path)
# from intrinsic metadata
name = i_metadata['version']
_author = author(i_metadata)
# from extrinsic metadata
message = a_metadata.get('comment_text', '')
message = '%s: %s' % (name, message) if message else name
date = normalize_timestamp(
int(iso8601.parse_date(a_metadata['upload_time']).timestamp()))
return {
'type': 'tar',
'message': message.encode('utf-8'),
'author': _author,
'date': date,
'committer': _author,
'committer_date': date,
'parents': [],
'metadata': {
'intrinsic': {
'tool': 'PKG-INFO',
'raw': i_metadata,
},
'extrinsic': {
'provider': self.provider_url,
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
}
}
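
Note (sketch, not part of this patch): the project-url to api-url mapping implemented by pypi_api_url boils down to swapping the /project/ path segment for /pypi/<name>/json; a standalone equivalent (the helper name api_url is hypothetical):

from urllib.parse import urlparse

def api_url(url: str) -> str:
    p_url = urlparse(url)
    project_name = p_url.path.split('/')[-1]
    # keep scheme and host, rewrite the path to the json api endpoint
    return '%s://%s/pypi/%s/json' % (p_url.scheme, p_url.netloc,
                                     project_name)

assert api_url('https://pypi.org/project/requests') == \
    'https://pypi.org/pypi/requests/json'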
diff --git a/swh/loader/package/tests/test_debian.py b/swh/loader/package/tests/test_debian.py
index 85663ac..b453c28 100644
--- a/swh/loader/package/tests/test_debian.py
+++ b/swh/loader/package/tests/test_debian.py
@@ -1,316 +1,318 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import copy
import logging
import pytest
from os import path
from swh.loader.package.debian import (
DebianLoader, download_package, dsc_information, uid_to_person,
prepare_person, get_package_metadata, extract_package
)
from swh.loader.package.tests.common import check_snapshot
logger = logging.getLogger(__name__)
PACKAGE_FILES = {
'files': {
'cicero_0.7.2-3.diff.gz': {
'md5sum': 'a93661b6a48db48d59ba7d26796fc9ce',
'name': 'cicero_0.7.2-3.diff.gz',
'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c', # noqa
'size': 3964,
'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2-3.diff.gz' # noqa
},
'cicero_0.7.2-3.dsc': {
'md5sum': 'd5dac83eb9cfc9bb52a15eb618b4670a',
'name': 'cicero_0.7.2-3.dsc',
'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03', # noqa
'size': 1864,
'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2-3.dsc'}, # noqa
'cicero_0.7.2.orig.tar.gz': {
'md5sum': '4353dede07c5728319ba7f5595a7230a',
'name': 'cicero_0.7.2.orig.tar.gz',
'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786', # noqa
'size': 96527,
'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz' # noqa
}
},
'id': 23,
'name': 'cicero',
'revision_id': None,
'version': '0.7.2-3'
}
PACKAGE_PER_VERSION = {
'stretch/contrib/0.7.2-3': PACKAGE_FILES
}
def test_debian_first_visit(
swh_config, requests_mock_datadir):
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
loader = DebianLoader(
url='deb://Debian/packages/cicero',
date='2019-10-12T05:58:09.165557+00:00',
packages=PACKAGE_PER_VERSION)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': 42,
'directory': 2,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 1, # all artifacts under 1 revision
'skipped_content': 0,
'snapshot': 1
} == stats
expected_snapshot = {
'id': 'a59ec49a01ff329dcbbc63fd36a5654143aef240',
'branches': {
'HEAD': {
'target_type': 'alias',
'target': 'releases/stretch/contrib/0.7.2-3'
},
'releases/stretch/contrib/0.7.2-3': {
'target_type': 'revision',
'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
}
},
} # different than the previous loader as no release is done
check_snapshot(expected_snapshot, loader.storage)
def test_debian_first_visit_then_another_visit(
swh_config, requests_mock_datadir):
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
url = 'deb://Debian/packages/cicero'
loader = DebianLoader(
url=url,
date='2019-10-12T05:58:09.165557+00:00',
packages=PACKAGE_PER_VERSION)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'full'
stats = loader.storage.stat_counters()
assert {
'content': 42,
'directory': 2,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 1, # all artifacts under 1 revision
'skipped_content': 0,
'snapshot': 1
} == stats
expected_snapshot = {
'id': 'a59ec49a01ff329dcbbc63fd36a5654143aef240',
'branches': {
'HEAD': {
'target_type': 'alias',
'target': 'releases/stretch/contrib/0.7.2-3'
},
'releases/stretch/contrib/0.7.2-3': {
'target_type': 'revision',
'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
}
},
} # different than the previous loader as no release is done
check_snapshot(expected_snapshot, loader.storage)
# No change in between load
actual_load_status2 = loader.load()
assert actual_load_status2['status'] == 'uneventful'
origin_visit2 = list(loader.storage.origin_visit_get(url))
assert origin_visit2[-1]['status'] == 'full'
stats2 = loader.storage.stat_counters()
assert {
'content': 42 + 0,
'directory': 2 + 0,
'origin': 1,
'origin_visit': 1 + 1, # a new visit occurred
'person': 1,
'release': 0,
'revision': 1,
'skipped_content': 0,
'snapshot': 1, # same snapshot across 2 visits
} == stats2
urls = [
m.url for m in requests_mock_datadir.request_history
if m.url.startswith('http://deb.debian.org')
]
# each package artifact is downloaded exactly once across the 2 visits
assert len(urls) == len(set(urls))
def test_uid_to_person():
uid = 'Someone Name <someone@orga.org>'
actual_person = uid_to_person(uid)
assert actual_person == {
'name': 'Someone Name',
'email': 'someone@orga.org',
'fullname': uid,
}
def test_prepare_person():
actual_author = prepare_person({
'name': 'Someone Name',
'email': 'someone@orga.org',
'fullname': 'Someone Name <someone@orga.org>',
})
assert actual_author == {
'name': b'Someone Name',
'email': b'someone@orga.org',
'fullname': b'Someone Name <someone@orga.org>',
}
def test_download_package(datadir, tmpdir, requests_mock_datadir):
tmpdir = str(tmpdir) # py3.5 work around (LocalPath issue)
all_hashes = download_package(PACKAGE_FILES, tmpdir)
assert all_hashes == {
'cicero_0.7.2-3.diff.gz': {
'checksums': {
'blake2s256': '08b1c438e70d2474bab843d826515147fa4a817f8c4baaf3ddfbeb5132183f21', # noqa
'sha1': '0815282053f21601b0ec4adf7a8fe47eace3c0bc',
'sha1_git': '834ac91da3a9da8f23f47004bb456dd5bd16fe49',
'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c' # noqa
},
'filename': 'cicero_0.7.2-3.diff.gz',
'length': 3964},
'cicero_0.7.2-3.dsc': {
'checksums': {
'blake2s256': '8c002bead3e35818eaa9d00826f3d141345707c58fb073beaa8abecf4bde45d2', # noqa
'sha1': 'abbec4e8efbbc80278236e1dd136831eac08accd',
'sha1_git': '1f94b2086fa1142c2df6b94092f5c5fa11093a8e',
'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03' # noqa
},
'filename': 'cicero_0.7.2-3.dsc',
'length': 1864},
'cicero_0.7.2.orig.tar.gz': {
'checksums': {
'blake2s256': '9809aa8d2e2dad7f34cef72883db42b0456ab7c8f1418a636eebd30ab71a15a6', # noqa
'sha1': 'a286efd63fe2c9c9f7bb30255c3d6fcdcf390b43',
'sha1_git': 'aa0a38978dce86d531b5b0299b4a616b95c64c74',
'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786' # noqa
},
'filename': 'cicero_0.7.2.orig.tar.gz',
'length': 96527
}
}
def test_dsc_information_ok():
fname = 'cicero_0.7.2-3.dsc'
dsc_url, dsc_name = dsc_information(PACKAGE_FILES)
assert dsc_url == PACKAGE_FILES['files'][fname]['uri']
assert dsc_name == PACKAGE_FILES['files'][fname]['name']
def test_dsc_information_not_found():
fname = 'cicero_0.7.2-3.dsc'
package_files = copy.deepcopy(PACKAGE_FILES)
package_files['files'].pop(fname)
dsc_url, dsc_name = dsc_information(package_files)
assert dsc_url is None
assert dsc_name is None
def test_dsc_information_too_many_dsc_entries():
# craft an extra dsc file
fname = 'cicero_0.7.2-3.dsc'
package_files = copy.deepcopy(PACKAGE_FILES)
data = package_files['files'][fname]
fname2 = fname.replace('cicero', 'ciceroo')
package_files['files'][fname2] = data
with pytest.raises(
ValueError, match='Package %s_%s references several dsc' % (
package_files['name'], package_files['version'])):
dsc_information(package_files)
def test_get_package_metadata(requests_mock_datadir, datadir, tmp_path):
tmp_path = str(tmp_path) # py3.5 compat.
package = PACKAGE_FILES
logger.debug('package: %s', package)
# download the packages
- download_package(package, tmp_path)
+ all_hashes = download_package(package, tmp_path)
# Retrieve information from package
_, dsc_name = dsc_information(package)
+ dl_artifacts = [(tmp_path, hashes) for hashes in all_hashes.values()]
+
# Extract information from package
- extracted_path = extract_package(package, tmp_path)
+ extracted_path = extract_package(dl_artifacts, tmp_path)
# Retrieve information on package
dsc_path = path.join(path.dirname(extracted_path), dsc_name)
actual_package_info = get_package_metadata(
package, dsc_path, extracted_path)
logger.debug('actual_package_info: %s', actual_package_info)
assert actual_package_info == {
'changelog': {
'date': '2014-10-19T16:52:35+02:00',
'history': [
('cicero', '0.7.2-2'),
('cicero', '0.7.2-1'),
('cicero', '0.7-1')
],
'person': {
'email': 'sthibault@debian.org',
'fullname': 'Samuel Thibault <sthibault@debian.org>',
'name': 'Samuel Thibault'
}
},
'maintainers': [
{
'email': 'debian-accessibility@lists.debian.org',
'fullname': 'Debian Accessibility Team '
'<debian-accessibility@lists.debian.org>',
'name': 'Debian Accessibility Team'
},
{
'email': 'sthibault@debian.org',
'fullname': 'Samuel Thibault <sthibault@debian.org>',
'name': 'Samuel Thibault'
}
],
'name': 'cicero',
'version': '0.7.2-3'
}
diff --git a/swh/loader/package/tests/test_deposit.py b/swh/loader/package/tests/test_deposit.py
index 8cc5723..2e999b0 100644
--- a/swh/loader/package/tests/test_deposit.py
+++ b/swh/loader/package/tests/test_deposit.py
@@ -1,199 +1,204 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.deposit import DepositLoader
from swh.loader.package.tests.common import (
check_snapshot, check_metadata_paths
)
from swh.core.pytest_plugin import requests_mock_datadir_factory
def test_deposit_init_ok(swh_config):
url = 'some-url'
deposit_id = 999
loader = DepositLoader(url, deposit_id) # Something that does not exist
assert loader.url == url
assert loader.archive_url == '/%s/raw/' % deposit_id
assert loader.metadata_url == '/%s/meta/' % deposit_id
assert loader.deposit_update_url == '/%s/update/' % deposit_id
assert loader.client is not None
def test_deposit_loading_failure_to_fetch_metadata(swh_config):
"""Error during fetching artifact ends us with failed/partial visit
"""
# private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
url = 'some-url'
unknown_deposit_id = 666
loader = DepositLoader(url, unknown_deposit_id) # does not exist
actual_load_status = loader.load()
assert actual_load_status['status'] == 'failed'
stats = loader.storage.stat_counters()
assert {
'content': 0,
'directory': 0,
'origin': 1,
'origin_visit': 1,
'person': 0,
'release': 0,
'revision': 0,
'skipped_content': 0,
'snapshot': 0,
} == stats
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[
'https://deposit.softwareheritage.org/1/private/666/raw/',
])
def test_deposit_loading_failure_to_retrieve_1_artifact(
swh_config, requests_mock_datadir_missing_one):
"""Deposit with missing artifact ends up with an uneventful/partial visit
"""
# private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
url = 'some-url-2'
deposit_id = 666
loader = DepositLoader(url, deposit_id)
assert loader.archive_url
actual_load_status = loader.load()
assert actual_load_status['status'] == 'uneventful'
stats = loader.storage.stat_counters()
assert {
'content': 0,
'directory': 0,
'origin': 1,
'origin_visit': 1,
'person': 0,
'release': 0,
'revision': 0,
'skipped_content': 0,
'snapshot': 1,
} == stats
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
def test_revision_metadata_structure(swh_config, requests_mock_datadir):
# do not care for deposit update query
requests_mock_datadir.put(re.compile('https'))
url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
deposit_id = 666
loader = DepositLoader(url, deposit_id)
assert loader.archive_url
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
expected_revision_id = hash_to_bytes(
'9471c606239bccb1f269564c9ea114e1eeab9eb4')
revision = list(loader.storage.revision_get([expected_revision_id]))[0]
assert revision is not None
check_metadata_paths(revision['metadata'], paths=[
('extrinsic.provider', str),
('extrinsic.when', str),
('extrinsic.raw', dict),
- ('original_artifact.filename', str),
- ('original_artifact.length', int),
- ('original_artifact.checksums', dict),
+ ('original_artifact', list),
])
+ for original_artifact in revision['metadata']['original_artifact']:
+ check_metadata_paths(original_artifact, paths=[
+ ('filename', str),
+ ('length', int),
+ ('checksums', dict),
+ ])
+
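# The hunks in this diff rely on check_metadata_paths walking dotted key
# paths and type-checking the leaf values; a sketch of that assumed
# behavior (the real helper lives in swh.loader.package.tests.common):
def check_metadata_paths_sketch(metadata, paths):
    for dotted_path, expected_type in paths:
        value = metadata
        for key in dotted_path.split('.'):
            assert isinstance(value, dict) and key in value, dotted_path
            value = value[key]
        assert isinstance(value, expected_type), dotted_path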
def test_deposit_loading_ok(swh_config, requests_mock_datadir):
requests_mock_datadir.put(re.compile('https')) # do not care for put
url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
deposit_id = 666
loader = DepositLoader(url, deposit_id)
assert loader.archive_url
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': 303,
'directory': 12,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 1,
'skipped_content': 0,
'snapshot': 1,
} == stats
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'full'
expected_branches = {
'HEAD': {
'target': '9471c606239bccb1f269564c9ea114e1eeab9eb4',
'target_type': 'revision',
},
}
expected_snapshot = {
'id': '453f455d0efb69586143cd6b6e5897f9906b53a7',
'branches': expected_branches,
}
check_snapshot(expected_snapshot, storage=loader.storage)
# check metadata
tool = {
"name": "swh-deposit",
"version": "0.0.1",
"configuration": {
"sword_version": "2",
}
}
tool = loader.storage.tool_get(tool)
assert tool is not None
assert tool['id'] is not None
provider = {
"provider_name": "hal",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
"metadata": None,
}
provider = loader.storage.metadata_provider_get_by(provider)
assert provider is not None
assert provider['id'] is not None
metadata = loader.storage.origin_metadata_get_by(
url, provider_type='deposit_client')
assert metadata is not None
assert isinstance(metadata, list)
assert len(metadata) == 1
metadata0 = metadata[0]
assert metadata0['provider_id'] == provider['id']
assert metadata0['provider_type'] == 'deposit_client'
assert metadata0['tool_id'] == tool['id']
diff --git a/swh/loader/package/tests/test_gnu.py b/swh/loader/package/tests/test_gnu.py
index ea70a83..3be6610 100644
--- a/swh/loader/package/tests/test_gnu.py
+++ b/swh/loader/package/tests/test_gnu.py
@@ -1,349 +1,354 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import re
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.gnu import GNULoader, get_version
from swh.loader.package.tests.common import (
check_snapshot, check_metadata_paths
)
def test_get_version():
"""From url to branch name should yield something relevant
"""
for url, expected_branchname in [
('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'),
('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'),
('https://sthg.org/gnu/sthg.tar.gz', 'sthg'),
('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'),
('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'),
('https://ftp.org/gnu/aris-w32.zip', 'w32'),
('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'),
('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'),
('https://ftp.org/gnu/crypto-build-demo.tar.gz',
'crypto-build-demo'),
('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz',
'clue+clio+xit.clisp'),
('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz',
'clue+clio.for-pcl'),
('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz',
'hppa2.0-hp-hpux10.20'),
('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'),
('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'),
('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'),
('clisp-powerpc-unknown-linuxlibc6.tar.gz',
'powerpc-unknown-linuxlibc6'),
('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'),
('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'),
('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'),
('clisp-sparc-sun-sunos4.1.3_U1.tar.gz',
'sparc-sun-sunos4.1.3_U1'),
('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz',
'2.25.1-powerpc-apple-MacOSX'),
('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz',
'2.27-PowerMacintosh-powerpc-Darwin-1.3.7'),
('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz',
'2.27-i686-unknown-Linux-2.2.19'),
('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz',
'2.28-i386-i386-freebsd-4.3-RELEASE'),
('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz',
'2.28-i686-unknown-cygwin_me-4.90-1.3.10'),
('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz',
'2.29-i386-i386-freebsd-4.6-STABLE'),
('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz',
'2.29-i686-unknown-cygwin_nt-5.0-1.3.12'),
('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip',
'2.5.3-ansi-japi-xdr.20030701_mingw32'),
('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'),
('sather-logo_images.tar.gz', 'sather-logo_images'),
('sather-specification-000328.html.tar.gz', '000328.html')
]:
actual_branchname = get_version(url)
assert actual_branchname == expected_branchname
_expected_new_contents_first_visit = [
'e9258d81faf5881a2f96a77ba609396f82cb97ad',
'1170cf105b04b7e2822a0e09d2acf71da7b9a130',
'fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac',
'0057bec9b5422aff9256af240b177ac0e3ac2608',
'2b8d0d0b43a1078fc708930c8ddc2956a86c566e',
'27de3b3bc6545d2a797aeeb4657c0e215a0c2e55',
'2e6db43f5cd764e677f416ff0d0c78c7a82ef19b',
'ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62',
'edeb33282b2bffa0e608e9d2fd960fd08093c0ea',
'd64e64d4c73679323f8d4cde2643331ba6c20af9',
'7a756602914be889c0a2d3952c710144b3e64cb0',
'84fb589b554fcb7f32b806951dcf19518d67b08f',
'8624bcdae55baeef00cd11d5dfcfa60f68710a02',
'e08441aeab02704cfbd435d6445f7c072f8f524e',
'f67935bc3a83a67259cda4b2d43373bd56703844',
'809788434b433eb2e3cfabd5d591c9a659d5e3d8',
'7d7c6c8c5ebaeff879f61f37083a3854184f6c41',
'b99fec102eb24bffd53ab61fc30d59e810f116a2',
'7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68',
'f0c97052e567948adf03e641301e9983c478ccff',
'7fb724242e2b62b85ca64190c31dcae5303e19b3',
'4f9709e64a9134fe8aefb36fd827b84d8b617ab5',
'7350628ccf194c2c3afba4ac588c33e3f3ac778d',
'0bb892d9391aa706dc2c3b1906567df43cbe06a2',
'49d4c0ce1a16601f1e265d446b6c5ea6b512f27c',
'6b5cc594ac466351450f7f64a0b79fdaf4435ad3',
'3046e5d1f70297e2a507b98224b6222c9688d610',
'1572607d456d7f633bc6065a2b3048496d679a31',
]
_expected_new_directories_first_visit = [
'daabc65ec75d487b1335ffc101c0ac11c803f8fc',
'263be23b4a8101d3ad0d9831319a3e0f2b065f36',
'7f6e63ba6eb3e2236f65892cd822041f1a01dd5c',
'4db0a3ecbc976083e2dac01a62f93729698429a3',
'dfef1c80e1098dd5deda664bb44a9ab1f738af13',
'eca971d346ea54d95a6e19d5051f900237fafdaa',
'3aebc29ed1fccc4a6f2f2010fb8e57882406b528',
]
_expected_new_revisions_first_visit = {
'44183488c0774ce3c957fa19ba695cf18a4a42b3':
'3aebc29ed1fccc4a6f2f2010fb8e57882406b528'
}
_expected_branches_first_visit = {
'HEAD': {
'target_type': 'alias',
'target': 'releases/0.1.0',
},
'releases/0.1.0': {
'target_type': 'revision',
'target': '44183488c0774ce3c957fa19ba695cf18a4a42b3',
},
}
# hash is different than before as we changed the snapshot
# gnu used to use `release/` (singular) instead of plural
_expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5' # noqa
def test_visit_with_no_artifact_found(swh_config, requests_mock):
package_url = 'https://ftp.gnu.org/gnu/8sync/'
tarballs = [{
'time': '944729610',
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
'length': 221837,
}]
loader = GNULoader(package_url, tarballs)
requests_mock.get(re.compile('https://'), status_code=404)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'uneventful'
stats = loader.storage.stat_counters()
assert {
'content': 0,
'directory': 0,
'origin': 1,
'origin_visit': 1,
'person': 0,
'release': 0,
'revision': 0,
'skipped_content': 0,
'snapshot': 1,
} == stats
origin_visit = next(loader.storage.origin_visit_get(package_url))
assert origin_visit['status'] == 'partial'
def test_check_revision_metadata_structure(swh_config, requests_mock_datadir):
package_url = 'https://ftp.gnu.org/gnu/8sync/'
tarballs = [{
'time': '944729610',
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
'length': 221837,
}]
loader = GNULoader(package_url, tarballs)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
expected_revision_id = hash_to_bytes(
'44183488c0774ce3c957fa19ba695cf18a4a42b3')
revision = list(loader.storage.revision_get([expected_revision_id]))[0]
assert revision is not None
check_metadata_paths(revision['metadata'], paths=[
('intrinsic', dict),
('extrinsic.provider', str),
('extrinsic.when', str),
('extrinsic.raw', dict),
- ('original_artifact.filename', str),
- ('original_artifact.length', int),
- ('original_artifact.checksums', dict),
+ ('original_artifact', list),
])
+ for original_artifact in revision['metadata']['original_artifact']:
+ check_metadata_paths(original_artifact, paths=[
+ ('filename', str),
+ ('length', int),
+ ('checksums', dict),
+ ])
+
def test_visit_with_release_artifact_no_prior_visit(
swh_config, requests_mock_datadir):
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
package_url = 'https://ftp.gnu.org/gnu/8sync/'
tarballs = [{
'time': 944729610,
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
'length': 221837,
}]
loader = GNULoader(package_url, tarballs)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': len(_expected_new_contents_first_visit),
'directory': len(_expected_new_directories_first_visit),
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': len(_expected_new_revisions_first_visit),
'skipped_content': 0,
'snapshot': 1
} == stats
expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit)
assert list(loader.storage.content_missing_per_sha1(expected_contents)) \
== []
expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit)
assert list(loader.storage.directory_missing(expected_dirs)) == []
expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit)
assert list(loader.storage.revision_missing(expected_revs)) == []
expected_snapshot = {
'id': _expected_new_snapshot_first_visit_id,
'branches': _expected_branches_first_visit,
}
check_snapshot(expected_snapshot, loader.storage)
def test_2_visits_without_change(swh_config, requests_mock_datadir):
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
url = 'https://ftp.gnu.org/gnu/8sync/'
tarballs = [{
'time': 944729610,
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
'length': 221837,
}]
loader = GNULoader(url, tarballs)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit['status'] == 'full'
actual_load_status2 = loader.load()
assert actual_load_status2['status'] == 'uneventful'
origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit2['status'] == 'full'
urls = [
m.url for m in requests_mock_datadir.request_history
if m.url.startswith('https://ftp.gnu.org')
]
assert len(urls) == 1
def test_2_visits_with_new_artifact(swh_config, requests_mock_datadir):
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
url = 'https://ftp.gnu.org/gnu/8sync/'
tarball1 = {
'time': 944729610,
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
'length': 221837,
}
loader = GNULoader(url, [tarball1])
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit['status'] == 'full'
stats = loader.storage.stat_counters()
assert {
'content': len(_expected_new_contents_first_visit),
'directory': len(_expected_new_directories_first_visit),
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': len(_expected_new_revisions_first_visit),
'skipped_content': 0,
'snapshot': 1
} == stats
urls = [
m.url for m in requests_mock_datadir.request_history
if m.url.startswith('https://ftp.gnu.org')
]
assert len(urls) == 1
tarball2 = {
'time': 1480991830,
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
'length': 238466,
}
loader2 = GNULoader(url, [tarball1, tarball2])
# implementation detail: share the storage in between visits
loader2.storage = loader.storage
stats2 = loader2.storage.stat_counters()
assert stats == stats2 # ensure we share the storage
actual_load_status2 = loader2.load()
assert actual_load_status2['status'] == 'eventful'
stats2 = loader.storage.stat_counters()
assert {
'content': len(_expected_new_contents_first_visit) + 14,
'directory': len(_expected_new_directories_first_visit) + 8,
'origin': 1,
'origin_visit': 1 + 1,
'person': 1,
'release': 0,
'revision': len(_expected_new_revisions_first_visit) + 1,
'skipped_content': 0,
'snapshot': 1 + 1,
} == stats2
origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit2['status'] == 'full'
urls = [
m.url for m in requests_mock_datadir.request_history
if m.url.startswith('https://ftp.gnu.org')
]
# 1 artifact (2nd time no modification) + 1 new artifact
assert len(urls) == 2
diff --git a/swh/loader/package/tests/test_npm.py b/swh/loader/package/tests/test_npm.py
index 1c253c0..654c472 100644
--- a/swh/loader/package/tests/test_npm.py
+++ b/swh/loader/package/tests/test_npm.py
@@ -1,526 +1,531 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import os
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.npm import (
parse_npm_package_author, extract_npm_package_author
)
from swh.loader.package.tests.common import (
check_snapshot, check_metadata_paths
)
from swh.loader.package.npm import NpmLoader
def _parse_author_string_test(author_str, expected_result):
assert parse_npm_package_author(author_str) == expected_result
assert parse_npm_package_author(' %s' % author_str) == expected_result
assert parse_npm_package_author('%s ' % author_str) == expected_result
def test_parse_npm_package_author():
_parse_author_string_test(
'John Doe',
{
'name': 'John Doe'
}
)
_parse_author_string_test(
'<john.doe@foo.bar>',
{
'email': 'john.doe@foo.bar'
}
)
_parse_author_string_test(
'(https://john.doe)',
{
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe <john.doe@foo.bar>',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar'
}
)
_parse_author_string_test(
'John Doe<john.doe@foo.bar>',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar'
}
)
_parse_author_string_test(
'John Doe (https://john.doe)',
{
'name': 'John Doe',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe(https://john.doe)',
{
'name': 'John Doe',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'<john.doe@foo.bar> (https://john.doe)',
{
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'(https://john.doe) <john.doe@foo.bar>',
{
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe <john.doe@foo.bar> (https://john.doe)',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe (https://john.doe) <john.doe@foo.bar>',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe<john.doe@foo.bar> (https://john.doe)',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe<john.doe@foo.bar>(https://john.doe)',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test('', {})
_parse_author_string_test('<>', {})
_parse_author_string_test(' <>', {})
_parse_author_string_test('<>()', {})
_parse_author_string_test('<> ()', {})
_parse_author_string_test('()', {})
_parse_author_string_test(' ()', {})
_parse_author_string_test(
'John Doe <> ()',
{
'name': 'John Doe'
}
)
_parse_author_string_test(
'John Doe <>',
{
'name': 'John Doe'
}
)
_parse_author_string_test(
'John Doe ()',
{
'name': 'John Doe'
}
)
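# All the cases above follow npm's conventional 'name <email> (url)' author
# string; a minimal parser satisfying them could look like the sketch below
# (a hypothetical re-implementation, not the loader's actual code):
import re

def parse_npm_package_author_sketch(author_str):
    result = {}
    m = re.search(r'\(([^)]*)\)', author_str)  # extract the (url) part
    if m:
        if m.group(1):
            result['url'] = m.group(1)
        author_str = author_str[:m.start()] + author_str[m.end():]
    m = re.search(r'<([^>]*)>', author_str)  # extract the <email> part
    if m:
        if m.group(1):
            result['email'] = m.group(1)
        author_str = author_str[:m.start()] + author_str[m.end():]
    name = author_str.strip()  # whatever remains is the name
    if name:
        result['name'] = name
    return result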
def test_extract_npm_package_author(datadir):
package_metadata_filepath = os.path.join(
datadir, 'https_replicate.npmjs.com', 'org_visit1')
with open(package_metadata_filepath) as json_file:
package_metadata = json.load(json_file)
assert (
extract_npm_package_author(package_metadata['versions']['0.0.2']) ==
{
'fullname': b'mooz <stillpedant@gmail.com>',
'name': b'mooz',
'email': b'stillpedant@gmail.com'
}
)
assert (
extract_npm_package_author(package_metadata['versions']['0.0.3']) ==
{
'fullname': b'Masafumi Oyamada <stillpedant@gmail.com>',
'name': b'Masafumi Oyamada',
'email': b'stillpedant@gmail.com'
}
)
package_json = json.loads('''
{
"name": "highlightjs-line-numbers.js",
"version": "2.7.0",
"description": "Highlight.js line numbers plugin.",
"main": "src/highlightjs-line-numbers.js",
"dependencies": {},
"devDependencies": {
"gulp": "^4.0.0",
"gulp-rename": "^1.4.0",
"gulp-replace": "^0.6.1",
"gulp-uglify": "^1.2.0"
},
"repository": {
"type": "git",
"url": "https://github.com/wcoder/highlightjs-line-numbers.js.git"
},
"author": "Yauheni Pakala <evgeniy.pakalo@gmail.com>",
"license": "MIT",
"bugs": {
"url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues"
},
"homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/"
}''') # noqa
assert extract_npm_package_author(package_json) == \
{
'fullname': b'Yauheni Pakala <evgeniy.pakalo@gmail.com>',
'name': b'Yauheni Pakala',
'email': b'evgeniy.pakalo@gmail.com'
}
package_json = json.loads('''
{
"name": "3-way-diff",
"version": "0.0.1",
"description": "3-way diffing of JavaScript objects",
"main": "index.js",
"authors": [
{
"name": "Shawn Walsh",
"url": "https://github.com/shawnpwalsh"
},
{
"name": "Markham F Rollins IV",
"url": "https://github.com/mrollinsiv"
}
],
"keywords": [
"3-way diff",
"3 way diff",
"three-way diff",
"three way diff"
],
"devDependencies": {
"babel-core": "^6.20.0",
"babel-preset-es2015": "^6.18.0",
"mocha": "^3.0.2"
},
"dependencies": {
"lodash": "^4.15.0"
}
}''')
assert extract_npm_package_author(package_json) == \
{
'fullname': b'Shawn Walsh',
'name': b'Shawn Walsh',
'email': None
}
package_json = json.loads('''
{
"name": "yfe-ynpm",
"version": "1.0.0",
"homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm",
"repository": {
"type": "git",
"url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git"
},
"author": [
"fengmk2 <fengmk2@gmail.com> (https://fengmk2.com)",
"xufuzi <xufuzi@ywwl.com> (https://7993.org)"
],
"license": "MIT"
}''')
assert extract_npm_package_author(package_json) == \
{
'fullname': b'fengmk2 <fengmk2@gmail.com>',
'name': b'fengmk2',
'email': b'fengmk2@gmail.com'
}
package_json = json.loads('''
{
"name": "umi-plugin-whale",
"version": "0.0.8",
"description": "Internal contract component",
"authors": {
"name": "xiaohuoni",
"email": "448627663@qq.com"
},
"repository": "alitajs/whale",
"devDependencies": {
"np": "^3.0.4",
"umi-tools": "*"
},
"license": "MIT"
}''')
assert extract_npm_package_author(package_json) == \
{
'fullname': b'xiaohuoni <448627663@qq.com>',
'name': b'xiaohuoni',
'email': b'448627663@qq.com'
}
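# A sketch of the extract_npm_package_author behavior these assertions rely
# on, inferred from the expected values above (not the actual npm.py code):
# pick the first author entry, whatever its shape, and normalize it to the
# bytes-based model format, reusing the imported parse_npm_package_author.
def extract_npm_package_author_sketch(package_json):
    author = package_json.get('author', package_json.get('authors', ''))
    if isinstance(author, list):
        author = author[0] if author else ''
    if isinstance(author, str):
        author = parse_npm_package_author(author)
    name, email = author.get('name'), author.get('email')
    fullname = '%s <%s>' % (name, email) if name and email else (name or '')
    return {
        'fullname': fullname.encode('utf-8'),
        'name': name.encode('utf-8') if name else None,
        'email': email.encode('utf-8') if email else None,
    }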
def normalize_hashes(hashes):
if isinstance(hashes, str):
return hash_to_bytes(hashes)
if isinstance(hashes, list):
return [hash_to_bytes(x) for x in hashes]
return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()}
_expected_new_contents_first_visit = normalize_hashes([
'4ce3058e16ab3d7e077f65aabf855c34895bf17c',
'858c3ceee84c8311adc808f8cdb30d233ddc9d18',
'0fa33b4f5a4e0496da6843a38ff1af8b61541996',
'85a410f8ef8eb8920f2c384a9555566ad4a2e21b',
'9163ac8025923d5a45aaac482262893955c9b37b',
'692cf623b8dd2c5df2c2998fd95ae4ec99882fb4',
'18c03aac6d3e910efb20039c15d70ab5e0297101',
'41265c42446aac17ca769e67d1704f99e5a1394d',
'783ff33f5882813dca9239452c4a7cadd4dba778',
'b029cfb85107aee4590c2434a3329bfcf36f8fa1',
'112d1900b4c2e3e9351050d1b542c9744f9793f3',
'5439bbc4bd9a996f1a38244e6892b71850bc98fd',
'd83097a2f994b503185adf4e719d154123150159',
'd0939b4898e83090ee55fd9d8a60e312cfadfbaf',
'b3523a26f7147e4af40d9d462adaae6d49eda13e',
'cd065fb435d6fb204a8871bcd623d0d0e673088c',
'2854a40855ad839a54f4b08f5cff0cf52fca4399',
'b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe',
'0f73d56e1cf480bded8a1ecf20ec6fc53c574713',
'0d9882b2dfafdce31f4e77fe307d41a44a74cefe',
'585fc5caab9ead178a327d3660d35851db713df1',
'e8cd41a48d79101977e3036a87aeb1aac730686f',
'5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7',
'9c3cc2763bf9e9e37067d3607302c4776502df98',
'3649a68410e354c83cd4a38b66bd314de4c8f5c9',
'e96ed0c091de1ebdf587104eaf63400d1974a1fe',
'078ca03d2f99e4e6eab16f7b75fbb7afb699c86c',
'38de737da99514de6559ff163c988198bc91367a',
])
_expected_new_directories_first_visit = normalize_hashes([
'3370d20d6f96dc1c9e50f083e2134881db110f4f',
'42753c0c2ab00c4501b552ac4671c68f3cf5aece',
'd7895533ef5edbcffdea3f057d9fef3a1ef845ce',
'80579be563e2ef3e385226fe7a3f079b377f142c',
'3b0ddc6a9e58b4b53c222da4e27b280b6cda591c',
'bcad03ce58ac136f26f000990fc9064e559fe1c0',
'5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca',
'e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd',
'584b5b4b6cf7f038095e820b99386a9c232de931',
'184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a',
'bb5f4ee143c970367eb409f2e4c1104898048b9d',
'1b95491047add1103db0dfdfa84a9735dcb11e88',
'a00c6de13471a2d66e64aca140ddb21ef5521e62',
'5ce6c1cd5cda2d546db513aaad8c72a44c7771e2',
'c337091e349b6ac10d38a49cdf8c2401ef9bb0f2',
'202fafcd7c0f8230e89d5496ad7f44ab12b807bf',
'775cc516543be86c15c1dc172f49c0d4e6e78235',
'ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e',
])
_expected_new_revisions_first_visit = normalize_hashes({
'd8a1c7474d2956ac598a19f0f27d52f7015f117e':
'42753c0c2ab00c4501b552ac4671c68f3cf5aece',
'5f9eb78af37ffd12949f235e86fac04898f9f72a':
'3370d20d6f96dc1c9e50f083e2134881db110f4f',
'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a':
'd7895533ef5edbcffdea3f057d9fef3a1ef845ce'}
)
_expected_new_snapshot_first_visit_id = normalize_hashes(
'd0587e1195aed5a8800411a008f2f2d627f18e2d')
_expected_branches_first_visit = {
'HEAD': {
'target': 'releases/0.0.4',
'target_type': 'alias'
},
'releases/0.0.2': {
'target': 'd8a1c7474d2956ac598a19f0f27d52f7015f117e',
'target_type': 'revision'
},
'releases/0.0.3': {
'target': '5f9eb78af37ffd12949f235e86fac04898f9f72a',
'target_type': 'revision'
},
'releases/0.0.4': {
'target': 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a',
'target_type': 'revision'
}
}
def package_url(package):
return 'https://www.npmjs.com/package/%s' % package
def package_metadata_url(package):
return 'https://replicate.npmjs.com/%s/' % package
def test_revision_metadata_structure(swh_config, requests_mock_datadir):
package = 'org'
loader = NpmLoader(package,
package_url(package),
package_metadata_url(package))
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
expected_revision_id = hash_to_bytes(
'd8a1c7474d2956ac598a19f0f27d52f7015f117e')
revision = list(loader.storage.revision_get([expected_revision_id]))[0]
assert revision is not None
check_metadata_paths(revision['metadata'], paths=[
('intrinsic.tool', str),
('intrinsic.raw', dict),
('extrinsic.provider', str),
('extrinsic.when', str),
('extrinsic.raw', dict),
- ('original_artifact.filename', str),
- ('original_artifact.length', int),
- ('original_artifact.checksums', dict),
+ ('original_artifact', list),
])
+ for original_artifact in revision['metadata']['original_artifact']:
+ check_metadata_paths(original_artifact, paths=[
+ ('filename', str),
+ ('length', int),
+ ('checksums', dict),
+ ])
+
def test_npm_loader_first_visit(swh_config, requests_mock_datadir):
package = 'org'
loader = NpmLoader(package,
package_url(package),
package_metadata_url(package))
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': len(_expected_new_contents_first_visit),
'directory': len(_expected_new_directories_first_visit),
'origin': 1,
'origin_visit': 1,
'person': 2,
'release': 0,
'revision': len(_expected_new_revisions_first_visit),
'skipped_content': 0,
'snapshot': 1,
} == stats
assert len(list(loader.storage.content_get(
_expected_new_contents_first_visit))) == len(
_expected_new_contents_first_visit)
assert list(loader.storage.directory_missing(
_expected_new_directories_first_visit)) == []
assert list(loader.storage.revision_missing(
_expected_new_revisions_first_visit)) == []
expected_snapshot = {
'id': _expected_new_snapshot_first_visit_id,
'branches': _expected_branches_first_visit,
}
check_snapshot(expected_snapshot, loader.storage)
def test_npm_loader_incremental_visit(
swh_config, requests_mock_datadir_visits):
package = 'org'
url = package_url(package)
metadata_url = package_metadata_url(package)
loader = NpmLoader(package, url, metadata_url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit['status'] == 'full'
stats = loader.storage.stat_counters()
assert {
'content': len(_expected_new_contents_first_visit),
'directory': len(_expected_new_directories_first_visit),
'origin': 1,
'origin_visit': 1,
'person': 2,
'release': 0,
'revision': len(_expected_new_revisions_first_visit),
'skipped_content': 0,
'snapshot': 1,
} == stats
loader._info = None # reset loader internal state
actual_load_status2 = loader.load()
assert actual_load_status2['status'] == 'eventful'
origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit2['status'] == 'full'
stats = loader.storage.stat_counters()
assert { # 3 new release artifacts
'content': len(_expected_new_contents_first_visit) + 14,
'directory': len(_expected_new_directories_first_visit) + 15,
'origin': 1,
'origin_visit': 2,
'person': 2,
'release': 0,
'revision': len(_expected_new_revisions_first_visit) + 3,
'skipped_content': 0,
'snapshot': 2,
} == stats
urls = [
m.url for m in requests_mock_datadir_visits.request_history
if m.url.startswith('https://registry.npmjs.org')
]
assert len(urls) == len(set(urls)) # we visited each artifact once across the 2 visits
diff --git a/swh/loader/package/tests/test_pypi.py b/swh/loader/package/tests/test_pypi.py
index aba1814..74b3a70 100644
--- a/swh/loader/package/tests/test_pypi.py
+++ b/swh/loader/package/tests/test_pypi.py
@@ -1,654 +1,659 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from os import path
import pytest
from unittest.mock import patch
from swh.core.tarball import uncompress
from swh.core.pytest_plugin import requests_mock_datadir_factory
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.pypi import (
PyPILoader, pypi_api_url, author, extract_intrinsic_metadata
)
from swh.loader.package.tests.common import (
check_snapshot, check_metadata_paths
)
def test_author_basic():
data = {
'author': "i-am-groot",
'author_email': 'iam@groot.org',
}
actual_author = author(data)
expected_author = {
'fullname': b'i-am-groot <iam@groot.org>',
'name': b'i-am-groot',
'email': b'iam@groot.org',
}
assert actual_author == expected_author
def test_author_empty_email():
data = {
'author': 'i-am-groot',
'author_email': '',
}
actual_author = author(data)
expected_author = {
'fullname': b'i-am-groot',
'name': b'i-am-groot',
'email': b'',
}
assert actual_author == expected_author
def test_author_empty_name():
data = {
'author': "",
'author_email': 'iam@groot.org',
}
actual_author = author(data)
expected_author = {
'fullname': b' <iam@groot.org>',
'name': b'',
'email': b'iam@groot.org',
}
assert actual_author == expected_author
def test_author_malformed():
data = {
'author': "['pierre', 'paul', 'jacques']",
'author_email': None,
}
actual_author = author(data)
expected_author = {
'fullname': b"['pierre', 'paul', 'jacques']",
'name': b"['pierre', 'paul', 'jacques']",
'email': None,
}
assert actual_author == expected_author
def test_author_malformed_2():
data = {
'author': '[marie, jeanne]',
'author_email': '[marie@some, jeanne@thing]',
}
actual_author = author(data)
expected_author = {
'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>',
'name': b'[marie, jeanne]',
'email': b'[marie@some, jeanne@thing]',
}
assert actual_author == expected_author
def test_author_malformed_3():
data = {
'author': '[marie, jeanne, pierre]',
'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
}
actual_author = author(data)
expected_author = {
'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>', # noqa
'name': b'[marie, jeanne, pierre]',
'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
}
assert actual_author == expected_author
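# The expected values above pin down a simple normalization rule; a sketch
# of it (an inference from these tests, not the actual pypi.py author()):
def author_sketch(data):
    name = data['author']
    email = data['author_email']
    fullname = '%s <%s>' % (name, email) if email else name
    return {
        'fullname': fullname.encode('utf-8'),
        'name': name.encode('utf-8'),
        'email': email.encode('utf-8') if email is not None else None,
    }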
# configuration error #
def test_badly_configured_loader_raise(monkeypatch):
"""Badly configured loader should raise"""
monkeypatch.delenv('SWH_CONFIG_FILENAME', raising=False)
with pytest.raises(ValueError) as e:
PyPILoader(url='some-url')
assert 'Misconfiguration' in e.value.args[0]
def test_pypi_api_url():
"""Compute pypi api url from the pypi project url should be ok"""
url = pypi_api_url('https://pypi.org/project/requests')
assert url == 'https://pypi.org/pypi/requests/json'
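# A sketch of the url rewriting this test checks (assumed shape; the real
# pypi_api_url lives in swh.loader.package.pypi):
from urllib.parse import urlparse

def pypi_api_url_sketch(url):
    parsed = urlparse(url)
    project_name = parsed.path.rstrip('/').split('/')[-1]
    return '%s://%s/pypi/%s/json' % (
        parsed.scheme, parsed.netloc, project_name)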
@pytest.mark.fs
def test_extract_intrinsic_metadata(tmp_path, datadir):
"""Parsing existing archive's PKG-INFO should yield results"""
uncompressed_archive_path = str(tmp_path)
archive_path = path.join(
datadir, 'https_files.pythonhosted.org', '0805nexter-1.1.0.zip')
uncompress(archive_path, dest=uncompressed_archive_path)
actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path)
expected_metadata = {
'metadata_version': '1.0',
'name': '0805nexter',
'version': '1.1.0',
'summary': 'a simple printer of nested lest',
'home_page': 'http://www.hp.com',
'author': 'hgtkpython',
'author_email': '2868989685@qq.com',
'platforms': ['UNKNOWN'],
}
assert actual_metadata == expected_metadata
@pytest.mark.fs
def test_extract_intrinsic_metadata_failures(tmp_path):
"""Parsing inexistant path/archive/PKG-INFO yield None"""
tmp_path = str(tmp_path) # py3.5 work around (PosixPath issue)
# inexistant first level path
assert extract_intrinsic_metadata('/something-inexistant') == {}
# nonexistent second-level path (as expected by pypi archives)
assert extract_intrinsic_metadata(tmp_path) == {}
# nonexistent PKG-INFO within the second-level path
existing_path_no_pkginfo = path.join(tmp_path, 'something')
os.mkdir(existing_path_no_pkginfo)
assert extract_intrinsic_metadata(tmp_path) == {}
# LOADER SCENARIO #
# "edge" cases (for the same origin) #
# no release artifact:
# {visit full, status: uneventful, no contents, etc...}
requests_mock_datadir_missing_all = requests_mock_datadir_factory(ignore_urls=[
'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa
'https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip', # noqa
])
def test_no_release_artifact(swh_config, requests_mock_datadir_missing_all):
"""Load a pypi project with all artifacts missing ends up with no snapshot
"""
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'uneventful'
stats = loader.storage.stat_counters()
assert {
'content': 0,
'directory': 0,
'origin': 1,
'origin_visit': 1,
'person': 0,
'release': 0,
'revision': 0,
'skipped_content': 0,
'snapshot': 1,
} == stats
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
# problem during loading:
# {visit: partial, status: uneventful, no snapshot}
def test_release_with_traceback(swh_config):
url = 'https://pypi.org/project/0805nexter'
with patch('swh.loader.package.pypi.PyPILoader.get_default_release',
side_effect=ValueError('Problem')):
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'failed'
stats = loader.storage.stat_counters()
assert {
'content': 0,
'directory': 0,
'origin': 1,
'origin_visit': 1,
'person': 0,
'release': 0,
'revision': 0,
'skipped_content': 0,
'snapshot': 0,
} == stats
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
# problem during loading: failure early enough that only some swh objects
# (contents, directories, etc...) have been written in storage
# {visit: partial, status: eventful, no snapshot}
# problem during loading: failure late enough we can have snapshots (some
# revisions are written in storage already)
# {visit: partial, status: eventful, snapshot}
# "normal" cases (for the same origin) #
requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[
'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa
])
# some missing release artifacts:
# {visit partial, status: eventful, 1 snapshot}
def test_revision_metadata_structure(swh_config, requests_mock_datadir):
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
expected_revision_id = hash_to_bytes(
'e445da4da22b31bfebb6ffc4383dbf839a074d21')
revision = list(loader.storage.revision_get([expected_revision_id]))[0]
assert revision is not None
check_metadata_paths(revision['metadata'], paths=[
('intrinsic.tool', str),
('intrinsic.raw', dict),
('extrinsic.provider', str),
('extrinsic.when', str),
('extrinsic.raw', dict),
- ('original_artifact.filename', str),
- ('original_artifact.length', int),
- ('original_artifact.checksums', dict),
+ ('original_artifact', list),
])
+ for original_artifact in revision['metadata']['original_artifact']:
+ check_metadata_paths(original_artifact, paths=[
+ ('filename', str),
+ ('length', int),
+ ('checksums', dict),
+ ])
+
def test_visit_with_missing_artifact(
swh_config, requests_mock_datadir_missing_one):
"""Load a pypi project with some missing artifacts ends up with 1 snapshot
"""
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': 3,
'directory': 2,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 1,
'skipped_content': 0,
'snapshot': 1
} == stats
expected_contents = map(hash_to_bytes, [
'405859113963cb7a797642b45f171d6360425d16',
'e5686aa568fdb1d19d7f1329267082fe40482d31',
'83ecf6ec1114fd260ca7a833a2d165e71258c338',
])
assert list(loader.storage.content_missing_per_sha1(expected_contents))\
== []
expected_dirs = map(hash_to_bytes, [
'b178b66bd22383d5f16f4f5c923d39ca798861b4',
'c3a58f8b57433a4b56caaa5033ae2e0931405338',
])
assert list(loader.storage.directory_missing(expected_dirs)) == []
# {revision hash: directory hash}
expected_revs = {
hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa
}
assert list(loader.storage.revision_missing(expected_revs)) == []
expected_branches = {
'releases/1.2.0': {
'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
'target_type': 'revision',
},
'HEAD': {
'target': 'releases/1.2.0',
'target_type': 'alias',
},
}
expected_snapshot = {
'id': 'dd0e4201a232b1c104433741dbf45895b8ac9355',
'branches': expected_branches,
}
check_snapshot(expected_snapshot, storage=loader.storage)
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
def test_visit_with_1_release_artifact(swh_config, requests_mock_datadir):
"""With no prior visit, load a pypi project ends up with 1 snapshot
"""
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': 6,
'directory': 4,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 2,
'skipped_content': 0,
'snapshot': 1
} == stats
expected_contents = map(hash_to_bytes, [
'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
'938c33483285fd8ad57f15497f538320df82aeb8',
'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
'405859113963cb7a797642b45f171d6360425d16',
'e5686aa568fdb1d19d7f1329267082fe40482d31',
'83ecf6ec1114fd260ca7a833a2d165e71258c338',
])
assert list(loader.storage.content_missing_per_sha1(expected_contents))\
== []
expected_dirs = map(hash_to_bytes, [
'05219ba38bc542d4345d5638af1ed56c7d43ca7d',
'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
'b178b66bd22383d5f16f4f5c923d39ca798861b4',
'c3a58f8b57433a4b56caaa5033ae2e0931405338',
])
assert list(loader.storage.directory_missing(expected_dirs)) == []
# {revision hash: directory hash}
expected_revs = {
hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa
hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa
}
assert list(loader.storage.revision_missing(expected_revs)) == []
expected_branches = {
'releases/1.1.0': {
'target': '4c99891f93b81450385777235a37b5e966dd1571',
'target_type': 'revision',
},
'releases/1.2.0': {
'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
'target_type': 'revision',
},
'HEAD': {
'target': 'releases/1.2.0',
'target_type': 'alias',
},
}
expected_snapshot = {
'id': 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a',
'branches': expected_branches,
}
check_snapshot(expected_snapshot, loader.storage)
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'full'
def test_multiple_visits_with_no_change(swh_config, requests_mock_datadir):
"""Multiple visits with no changes results in 1 same snapshot
"""
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': 6,
'directory': 4,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 2,
'skipped_content': 0,
'snapshot': 1
} == stats
expected_branches = {
'releases/1.1.0': {
'target': '4c99891f93b81450385777235a37b5e966dd1571',
'target_type': 'revision',
},
'releases/1.2.0': {
'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
'target_type': 'revision',
},
'HEAD': {
'target': 'releases/1.2.0',
'target_type': 'alias',
},
}
snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a'
expected_snapshot = {
'id': snapshot_id,
'branches': expected_branches,
}
check_snapshot(expected_snapshot, loader.storage)
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'full'
actual_load_status2 = loader.load()
assert actual_load_status2['status'] == 'uneventful'
stats2 = loader.storage.stat_counters()
expected_stats2 = stats.copy()
expected_stats2['origin_visit'] = 1 + 1
assert expected_stats2 == stats2
# same snapshot
actual_snapshot_id = origin_visit['snapshot']['id']
assert actual_snapshot_id == hash_to_bytes(snapshot_id)
def test_incremental_visit(swh_config, requests_mock_datadir_visits):
"""With prior visit, 2nd load will result with a different snapshot
"""
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
visit1_actual_load_status = loader.load()
visit1_stats = loader.storage.stat_counters()
assert visit1_actual_load_status['status'] == 'eventful'
origin_visit1 = next(loader.storage.origin_visit_get(url))
assert origin_visit1['status'] == 'full'
assert {
'content': 6,
'directory': 4,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 2,
'skipped_content': 0,
'snapshot': 1
} == visit1_stats
# Reset internal state
loader._info = None
visit2_actual_load_status = loader.load()
visit2_stats = loader.storage.stat_counters()
assert visit2_actual_load_status['status'] == 'eventful'
visits = list(loader.storage.origin_visit_get(url))
assert len(visits) == 2
assert visits[1]['status'] == 'full'
assert {
'content': 6 + 1, # 1 more content
'directory': 4 + 2, # 2 more directories
'origin': 1,
'origin_visit': 1 + 1,
'person': 1,
'release': 0,
'revision': 2 + 1, # 1 more revision
'skipped_content': 0,
'snapshot': 1 + 1, # 1 more snapshot
} == visit2_stats
expected_contents = map(hash_to_bytes, [
'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
'938c33483285fd8ad57f15497f538320df82aeb8',
'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
'405859113963cb7a797642b45f171d6360425d16',
'e5686aa568fdb1d19d7f1329267082fe40482d31',
'83ecf6ec1114fd260ca7a833a2d165e71258c338',
'92689fa2b7fb4d4fc6fb195bf73a50c87c030639'
])
assert list(loader.storage.content_missing_per_sha1(expected_contents))\
== []
expected_dirs = map(hash_to_bytes, [
'05219ba38bc542d4345d5638af1ed56c7d43ca7d',
'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
'b178b66bd22383d5f16f4f5c923d39ca798861b4',
'c3a58f8b57433a4b56caaa5033ae2e0931405338',
'e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a',
'52604d46843b898f5a43208045d09fcf8731631b',
])
assert list(loader.storage.directory_missing(expected_dirs)) == []
# {revision hash: directory hash}
expected_revs = {
hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa
hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa
hash_to_bytes('51247143b01445c9348afa9edfae31bf7c5d86b1'): hash_to_bytes('e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a'), # noqa
}
assert list(loader.storage.revision_missing(expected_revs)) == []
expected_branches = {
'releases/1.1.0': {
'target': '4c99891f93b81450385777235a37b5e966dd1571',
'target_type': 'revision',
},
'releases/1.2.0': {
'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
'target_type': 'revision',
},
'releases/1.3.0': {
'target': '51247143b01445c9348afa9edfae31bf7c5d86b1',
'target_type': 'revision',
},
'HEAD': {
'target': 'releases/1.3.0',
'target_type': 'alias',
},
}
expected_snapshot = {
'id': '2e5149a7b0725d18231a37b342e9b7c4e121f283',
'branches': expected_branches,
}
check_snapshot(expected_snapshot, loader.storage)
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit['status'] == 'full'
urls = [
m.url for m in requests_mock_datadir_visits.request_history
if m.url.startswith('https://files.pythonhosted.org')
]
# visited each artifact once across 2 visits
assert len(urls) == len(set(urls))
# release artifact, no new artifact
# {visit full, status uneventful, same snapshot as before}
# release artifact, old artifact with different checksums
# {visit full, status full, new snapshot with shared history and some new
# different history}
# release with multiple sdist artifacts per pypi "version"
# snapshot branch output is different
def test_visit_1_release_with_2_artifacts(swh_config, requests_mock_datadir):
"""With no prior visit, load a pypi project ends up with 1 snapshot
"""
url = 'https://pypi.org/project/nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
expected_branches = {
'releases/1.1.0/nexter-1.1.0.zip': {
'target': '4c99891f93b81450385777235a37b5e966dd1571',
'target_type': 'revision',
},
'releases/1.1.0/nexter-1.1.0.tar.gz': {
'target': '0bf88f5760cca7665d0af4d6575d9301134fe11a',
'target_type': 'revision',
},
}
expected_snapshot = {
'id': 'a27e638a4dad6fbfa273c6ebec1c4bf320fb84c6',
'branches': expected_branches,
}
check_snapshot(expected_snapshot, loader.storage)
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'full'
