diff --git a/swh/loader/package/debian.py b/swh/loader/package/debian.py
index 0396c8d..eb2f74c 100644
--- a/swh/loader/package/debian.py
+++ b/swh/loader/package/debian.py
@@ -1,342 +1,356 @@
# Copyright (C) 2017-2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import email.utils
import iso8601
import logging
import re
import subprocess
from dateutil.parser import parse as parse_date
from debian.changelog import Changelog
from debian.deb822 import Dsc
from os import path
-from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
+from typing import (
+ Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple
+)
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import download
logger = logging.getLogger(__name__)
UPLOADERS_SPLIT = re.compile(r'(?<=\>)\s*,\s*')
class DebianLoader(PackageLoader):
"""Load debian origins into swh archive.
"""
visit_type = 'debian'
def __init__(self, url: str, date: str, packages: Mapping[str, Any]):
super().__init__(url=url)
self.packages = packages
def get_versions(self) -> Sequence[str]:
"""Returns the keys of the packages input (e.g.
stretch/contrib/0.7.2-3, etc...)
"""
return self.packages.keys()
def get_default_release(self) -> str:
"""Take the first version as default release
"""
return list(self.packages.keys())[0]
- def get_artifacts(self, version: str) -> Generator[
- Tuple[Mapping[str, Any], Dict], None, None]:
- a_metadata = self.packages[version]
- artifacts_package_info = a_metadata.copy()
- artifacts_package_info['filename'] = version
- yield artifacts_package_info, a_metadata
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
+ meta = self.packages[version]
+ p_info = meta.copy()
+ p_info['raw'] = meta
+ yield 'releases/%s' % version, p_info
def resolve_revision_from(
self, known_package_artifacts: Dict, artifact_metadata: Dict) \
-> Optional[bytes]:
artifacts_to_fetch = artifact_metadata['files']
logger.debug('k_p_artifacts: %s', known_package_artifacts)
logger.debug('artifacts_to_fetch: %s', artifacts_to_fetch)
for rev_id, known_artifacts in known_package_artifacts.items():
logger.debug('Revision: %s', rev_id)
logger.debug('Associated known_artifacts: %s', known_artifacts)
known_artifacts = known_artifacts['extrinsic']['raw']['files']
rev_found = True
for a_name, k_artifact in known_artifacts.items():
artifact_to_fetch = artifacts_to_fetch.get(a_name)
logger.debug('artifact_to_fetch: %s', artifact_to_fetch)
if artifact_to_fetch is None:
# as soon as we do not see an artifact, we consider we need
# to check the other revisions
rev_found = False
continue
if k_artifact['sha256'] != artifact_to_fetch['sha256']:
# Hash is different, we consider we need to check the other
# revisions
rev_found = False
if rev_found:
logger.debug('Existing revision %s found for new artifacts.',
rev_id)
return rev_id
# if we reach this point, no existing revision matched the new artifacts
logger.debug('No existing revision found for the new artifacts.')
- def download_package(self, a_p_info: str, tmpdir: str) -> Tuple[str, Dict]:
+ def download_package(self, p_info: Mapping[str, Any],
+ tmpdir: str) -> List[Tuple[str, Dict]]:
"""Contrary to other package loaders (1 package, 1 artifact),
`a_metadata` represents the package's datafiles set to fetch:
- <package-version>.orig.tar.gz
- <package-version>.dsc
- <package-version>.diff.gz
This is delegated to the `download_package` function.
"""
- logger.debug('debian: artifactS_package_info: %s', a_p_info)
- return tmpdir, download_package(a_p_info, tmpdir)
-
- def uncompress(self, a_path: str, tmpdir: str, a_metadata: Dict) -> str:
- return extract_package(a_metadata, tmpdir)
-
- def read_intrinsic_metadata(self, a_metadata: Dict,
- a_uncompressed_path: str) -> Dict:
- _, dsc_name = dsc_information(a_metadata)
- dsc_path = path.join(path.dirname(a_uncompressed_path), dsc_name)
- return get_package_metadata(
- a_metadata, dsc_path, a_uncompressed_path)
-
- def build_revision(
- self, a_metadata: Dict, i_metadata: Dict) -> Dict:
- dsc_url, _ = dsc_information(a_metadata)
+ all_hashes = download_package(p_info, tmpdir)
+ logger.debug('all_hashes: %s', all_hashes)
+ res = []
+ for hashes in all_hashes.values():
+ res.append((tmpdir, hashes))
+ logger.debug('res: %s', res)
+ return res
+
+ def uncompress(self, dl_artifacts: List[Tuple[str, Dict]], dest: str) -> str:
+ logger.debug('dl_artifacts: %s', dl_artifacts)
+ return extract_package(dl_artifacts, dest=dest)
+
+ def build_revision(self, a_metadata: Mapping[str, Any],
+ uncompressed_path: str) -> Dict:
+ dsc_url, dsc_name = dsc_information(a_metadata)
+ dsc_path = path.join(path.dirname(uncompressed_path), dsc_name)
+ i_metadata = get_package_metadata(
+ a_metadata, dsc_path, uncompressed_path)
+
logger.debug('i_metadata: %s', i_metadata)
logger.debug('a_metadata: %s', a_metadata)
msg = 'Synthetic revision for Debian source package %s version %s' % (
a_metadata['name'], a_metadata['version'])
date = iso8601.parse_date(i_metadata['changelog']['date'])
author = prepare_person(i_metadata['changelog']['person'])
# inspired from swh.loader.debian.converters.package_metadata_to_revision # noqa
return {
'type': 'dsc',
'message': msg.encode('utf-8'),
'author': author,
'date': date,
'committer': author,
'committer_date': date,
'parents': [],
'metadata': {
'intrinsic': {
'tool': 'dsc',
'raw': i_metadata,
},
'extrinsic': {
'provider': dsc_url,
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
}
}
def uid_to_person(uid: str) -> Mapping[str, str]:
"""Convert an uid to a person suitable for insertion.
Args:
uid: an uid of the form "Name <email@ddress>"
Returns:
a dictionary with the following keys:
- name: the name associated to the uid
- email: the mail associated to the uid
- fullname: the actual uid input
"""
logger.debug('uid: %s', uid)
ret = {
'name': '',
'email': '',
'fullname': uid,
}
name, mail = email.utils.parseaddr(uid)
if name and mail:
ret['name'] = name
ret['email'] = mail
else:
ret['name'] = uid
return ret
def prepare_person(person: Mapping[str, str]) -> Mapping[str, bytes]:
"""Prepare person for swh serialization...
Args:
A person dict
Returns:
A person dict ready for storage
"""
ret = {}
for key, value in person.items():
ret[key] = value.encode('utf-8')
return ret
def download_package(
package: Mapping[str, Any], tmpdir: str) -> Mapping[str, Any]:
"""Fetch a source package in a temporary directory and check the checksums
for all files.
Args:
package: Dict defining the set of files representing a debian package
tmpdir: Where to download and extract the files to ingest
Returns:
Dict of swh hashes per filename key
"""
all_hashes = {}
for filename, fileinfo in package['files'].items():
uri = fileinfo['uri']
logger.debug('fileinfo: %s', fileinfo)
extrinsic_hashes = {'sha256': fileinfo['sha256']}
logger.debug('extrinsic_hashes(%s): %s', filename, extrinsic_hashes)
filepath, hashes = download(uri, dest=tmpdir, filename=filename,
hashes=extrinsic_hashes)
all_hashes[filename] = hashes
logger.debug('all_hashes: %s', all_hashes)
return all_hashes
def dsc_information(package: Mapping[str, Any]) -> Tuple[str, str]:
"""Retrieve dsc information from a package.
Args:
package: Package metadata information
Returns:
Tuple of dsc file's uri, dsc's full disk path
"""
dsc_name = None
dsc_url = None
for filename, fileinfo in package['files'].items():
if filename.endswith('.dsc'):
if dsc_name:
raise ValueError(
'Package %s_%s references several dsc files' %
(package['name'], package['version'])
)
dsc_url = fileinfo['uri']
dsc_name = filename
return dsc_url, dsc_name
-def extract_package(package: Mapping[str, Any], tmpdir: str) -> str:
+def extract_package(dl_artifacts: List[Tuple[str, Dict]], dest: str) -> str:
"""Extract a Debian source package to a given directory.
Note that after extraction the target directory will be the root of the
extracted package, rather than containing it.
Args:
- package (dict): package information dictionary
- tmpdir (str): directory where the package files are stored
+ dl_artifacts: list of (parent directory, hashes) tuples, as returned
+ by `download_package`
+ dest: directory where to extract the package
Returns:
Package extraction directory
"""
- _, dsc_name = dsc_information(package)
- dsc_path = path.join(tmpdir, dsc_name)
- destdir = path.join(tmpdir, 'extracted')
- logfile = path.join(tmpdir, 'extract.log')
+ a_path = dl_artifacts[0][0]
+ logger.debug('dl_artifacts: %s', dl_artifacts)
+ dsc_name = None
+ for _, hashes in dl_artifacts:
+ logger.debug('hashes: %s', hashes)
+ filename = hashes['filename']
+ if filename.endswith('.dsc'):
+ dsc_name = filename
+ break
+ if dsc_name is None:
+ raise ValueError('No .dsc file found in %s' % a_path)
+
+ dsc_path = path.join(a_path, dsc_name)
+ destdir = path.join(dest, 'extracted')
+ logfile = path.join(dest, 'extract.log')
logger.debug('extract Debian source package %s in %s' %
(dsc_path, destdir), extra={
'swh_type': 'deb_extract',
'swh_dsc': dsc_path,
'swh_destdir': destdir,
})
cmd = ['dpkg-source',
'--no-copy', '--no-check',
'--ignore-bad-version',
'-x', dsc_path,
destdir]
try:
with open(logfile, 'w') as stdout:
subprocess.check_call(cmd, stdout=stdout, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
with open(logfile, 'r') as f:
logdata = f.read()
raise ValueError('dpkg-source exited with code %s: %s' %
(e.returncode, logdata)) from None
return destdir
def get_package_metadata(package: Mapping[str, Any], dsc_path: str,
extracted_path: str) -> Mapping[str, Any]:
"""Get the package metadata from the source package at dsc_path,
extracted in extracted_path.
Args:
package: the package metadata dict
dsc_path: path to the package's dsc file
extracted_path: the path where the package got extracted
Returns:
dict: a dictionary with the package's name, version, maintainers and
changelog information; the changelog history is a list of
(package_name, package_version) tuples parsed from the changelog
"""
with open(dsc_path, 'rb') as dsc:
parsed_dsc = Dsc(dsc)
# Parse the changelog to retrieve the rest of the package information
changelog_path = path.join(extracted_path, 'debian/changelog')
with open(changelog_path, 'rb') as changelog:
try:
parsed_changelog = Changelog(changelog)
except UnicodeDecodeError:
logger.warning('Unknown encoding for changelog %s,'
' falling back to iso' %
changelog_path, extra={
'swh_type': 'deb_changelog_encoding',
'swh_name': package['name'],
'swh_version': str(package['version']),
'swh_changelog': changelog_path,
})
# need to reset as Changelog scrolls to the end of the file
changelog.seek(0)
parsed_changelog = Changelog(changelog, encoding='iso-8859-15')
package_info = {
'name': package['name'],
'version': str(package['version']),
'changelog': {
'person': uid_to_person(parsed_changelog.author),
'date': parse_date(parsed_changelog.date).isoformat(),
'history': [(block.package, str(block.version))
for block in parsed_changelog][1:],
}
}
maintainers = [
uid_to_person(parsed_dsc['Maintainer']),
]
maintainers.extend(
uid_to_person(person)
for person in UPLOADERS_SPLIT.split(parsed_dsc.get('Uploaders', ''))
)
package_info['maintainers'] = maintainers
return package_info
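To see how the new pieces fit together end to end, here is a minimal sketch of the Debian flow, mirroring test_get_package_metadata below; it fetches a real source package, so it assumes network access and the dpkg-source tool:

import tempfile
from swh.loader.package.debian import download_package, extract_package
from swh.loader.package.tests.test_debian import PACKAGE_FILES

with tempfile.TemporaryDirectory() as tmpdir:
    # one hashes dict per file of the source package
    # (.dsc, .diff.gz, .orig.tar.gz)
    all_hashes = download_package(PACKAGE_FILES, tmpdir)
    # the loader now passes a list of (directory, hashes) tuples around
    dl_artifacts = [(tmpdir, hashes) for hashes in all_hashes.values()]
    # extract_package locates the .dsc among dl_artifacts and runs
    # `dpkg-source -x` into <dest>/extracted
    extracted_path = extract_package(dl_artifacts, dest=tmpdir)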
diff --git a/swh/loader/package/deposit.py b/swh/loader/package/deposit.py
index 1a74c0a..160819c 100644
--- a/swh/loader/package/deposit.py
+++ b/swh/loader/package/deposit.py
@@ -1,153 +1,154 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Dict, Generator, Mapping, Sequence, Tuple
from swh.model.hashutil import hash_to_hex
from swh.loader.package.loader import PackageLoader
from swh.deposit.client import PrivateApiDepositClient as ApiClient
logger = logging.getLogger(__name__)
class DepositLoader(PackageLoader):
"""Load pypi origin's artifact releases into swh archive.
"""
visit_type = 'deposit'
def __init__(self, url: str, deposit_id: str):
"""Constructor
Args:
url: Origin url to associate the artifacts/metadata to
deposit_id: Deposit identity
"""
super().__init__(url=url)
# For now build back existing api urls
# archive_url: Private api url to retrieve archive artifact
self.archive_url = '/%s/raw/' % deposit_id
# metadata_url: Private api url to retrieve the deposit metadata
self.metadata_url = '/%s/meta/' % deposit_id
# deposit_update_url: Private api to push pids and status update on the
# deposit id
self.deposit_update_url = '/%s/update/' % deposit_id
self.client = ApiClient()
self._metadata = None
@property
def metadata(self):
if self._metadata is None:
self._metadata = self.client.metadata_get(self.metadata_url)
return self._metadata
def get_versions(self) -> Sequence[str]:
# only 1 branch 'HEAD' with no alias since we only have 1 snapshot
# branch
return ['HEAD']
- def get_artifacts(self, version: str) -> Generator[
+ def get_package_info(self, version: str) -> Generator[
Tuple[str, Mapping[str, Any]], None, None]:
- artifact_package_info = {
+ p_info = {
'url': self.client.base_url + self.archive_url,
'filename': 'archive.zip',
+ 'raw': self.metadata,
}
- yield artifact_package_info, self.metadata
+ yield 'HEAD', p_info
def build_revision(
- self, a_metadata: Dict, i_metadata: Dict) -> Dict:
+ self, a_metadata: Dict, uncompressed_path: str) -> Dict:
revision = a_metadata.pop('revision')
metadata = {
'extrinsic': {
'provider': '%s/%s' % (
self.client.base_url, self.metadata_url),
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
}
# FIXME: the deposit no longer needs to build the revision
revision['metadata'].update(metadata)
revision['author'] = parse_author(revision['author'])
revision['committer'] = parse_author(revision['committer'])
revision['message'] = revision['message'].encode('utf-8')
revision['type'] = 'tar'
return revision
def load(self) -> Dict:
# Usual loading
r = super().load()
success = r['status'] != 'failed'
if success:
# Update archive with metadata information
origin_metadata = self.metadata['origin_metadata']
logger.debug('origin_metadata: %s', origin_metadata)
tools = self.storage.tool_add([origin_metadata['tool']])
logger.debug('tools: %s', tools)
tool_id = tools[0]['id']
provider = origin_metadata['provider']
# FIXME: Shall we delete this info?
provider_id = self.storage.metadata_provider_add(
provider['provider_name'],
provider['provider_type'],
provider['provider_url'],
metadata=None)
metadata = origin_metadata['metadata']
self.storage.origin_metadata_add(
self.url, self.visit_date, provider_id, tool_id, metadata)
# Update deposit status
try:
if not success:
self.client.status_update(
self.deposit_update_url, status='failed')
return r
snapshot_id = r['snapshot_id']
branches = self.storage.snapshot_get(snapshot_id)['branches']
logger.debug('branches: %s', branches)
if not branches:
return r
rev_id = branches[b'HEAD']['target']
revision = next(self.storage.revision_get([rev_id]))
# Retrieve the revision identifier
dir_id = revision['directory']
# update the deposit's status to success with its
# revision-id and directory-id
self.client.status_update(
self.deposit_update_url,
status='done',
revision_id=hash_to_hex(rev_id),
directory_id=hash_to_hex(dir_id),
origin_url=self.url)
except Exception:
logger.exception(
'Problem when trying to update the deposit\'s status')
return {'status': 'failed'}
return r
def parse_author(author):
"""See prior fixme
"""
return {
'fullname': author['fullname'].encode('utf-8'),
'name': author['name'].encode('utf-8'),
'email': author['email'].encode('utf-8'),
}
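The parse_author helper above is a plain byte-encoding of a person dict; a quick usage sketch (input values made up):

from swh.loader.package.deposit import parse_author

author = parse_author({
    'fullname': 'Jane Doe <jane@example.org>',
    'name': 'Jane Doe',
    'email': 'jane@example.org',
})
# each field is utf-8 encoded, as the storage layer expects bytes
assert author['fullname'] == b'Jane Doe <jane@example.org>'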
diff --git a/swh/loader/package/gnu.py b/swh/loader/package/gnu.py
index ce194c6..189041c 100644
--- a/swh/loader/package/gnu.py
+++ b/swh/loader/package/gnu.py
@@ -1,191 +1,195 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import re
from os import path
-from typing import Dict, Generator, Mapping, Optional, Sequence, Tuple
+from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
from swh.loader.package.loader import PackageLoader
from swh.model.identifiers import normalize_timestamp
logger = logging.getLogger(__name__)
# to recognize existing naming pattern
extensions = [
'zip',
'tar',
'gz', 'tgz',
'bz2', 'bzip2',
'lzma', 'lz',
'xz',
'Z',
]
version_keywords = [
'cygwin_me',
'w32', 'win32', 'nt', 'cygwin', 'mingw',
'latest', 'alpha', 'beta',
'release', 'stable',
'hppa',
'solaris', 'sunos', 'sun4u', 'sparc', 'sun',
'aix', 'ibm', 'rs6000',
'i386', 'i686',
'linux', 'redhat', 'linuxlibc',
'mips',
'powerpc', 'macos', 'apple', 'darwin', 'macosx', 'powermacintosh',
'unknown',
'netbsd', 'freebsd',
'sgi', 'irix',
]
# Match a filename into components.
#
# We use Debian's release number heuristic: A release number starts
# with a digit, and is followed by alphanumeric characters or any of
# ., +, :, ~ and -
#
# We hardcode a list of possible extensions, as this release number
# scheme would match them too... We match on any combination of those.
#
# Greedy matching is done right to left (we only match the extension
# greedily with +, software_name and release_number are matched lazily
# with +? and *?).
pattern = r'''
^
(?:
# We have a software name and a release number, separated with a
# -, _ or dot.
(?P<software_name1>.+?[-_.])
(?P<release_number>(%(vkeywords)s|[0-9][0-9a-zA-Z_.+:~-]*?)+)
|
# We couldn't match a release number, put everything in the
# software name.
(?P<software_name2>.+?)
)
(?P<extension>(?:\.(?:%(extensions)s))+)
$
''' % {
'extensions': '|'.join(extensions),
'vkeywords': '|'.join('%s[-]?' % k for k in version_keywords),
}
def get_version(url: str) -> str:
"""Extract branch name from tarball url
Args:
url (str): Tarball URL
Returns:
byte: Branch name
Example:
For url = https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz
>>> get_version(url)
'0.2.0'
"""
filename = path.split(url)[-1]
m = re.match(pattern, filename,
flags=re.VERBOSE | re.IGNORECASE)
if m:
d = m.groupdict()
if d['software_name1'] and d['release_number']:
return d['release_number']
if d['software_name2']:
return d['software_name2']
return ''
class GNULoader(PackageLoader):
visit_type = 'gnu'
SWH_PERSON = {
'name': b'Software Heritage',
'fullname': b'Software Heritage',
'email': b'robot@softwareheritage.org'
}
REVISION_MESSAGE = b'swh-loader-package: synthetic revision message'
def __init__(self, package_url: str, tarballs: Sequence):
"""Loader constructor.
For now, this is the lister's task output.
Args:
package_url: Origin url
tarballs: List of dicts with keys `time` (timestamp) and `archive`
(str), the url to retrieve one versioned archive
"""
super().__init__(url=package_url)
self.tarballs = list(sorted(tarballs, key=lambda v: v['time']))
def get_versions(self) -> Sequence[str]:
versions = []
for archive in self.tarballs:
v = get_version(archive['archive'])
if v:
versions.append(v)
return versions
def get_default_release(self) -> str:
# It's the most recent, so for this loader, it's the last one
return get_version(self.tarballs[-1]['archive'])
- def get_artifacts(self, version: str) -> Generator[
- Tuple[Mapping[str, str], Dict], None, None]:
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
for a_metadata in self.tarballs:
url = a_metadata['archive']
- artifact_version = get_version(url)
- if version == artifact_version:
- artifact_package_info = {
+ package_version = get_version(url)
+ if version == package_version:
+ p_info = {
'url': url,
- 'filename': path.split(url)[-1]
+ 'filename': path.split(url)[-1],
+ 'raw': a_metadata,
}
- yield artifact_package_info, a_metadata
+ # FIXME: this code assumes we have only 1 artifact per
+ # versioned package
+ yield 'releases/%s' % version, p_info
def resolve_revision_from(
self, known_artifacts: Dict, artifact_metadata: Dict) \
-> Optional[bytes]:
def pk(d):
return [d.get(k) for k in ['time', 'archive', 'length']]
artifact_pk = pk(artifact_metadata)
for rev_id, known_artifact in known_artifacts.items():
logging.debug('known_artifact: %s', known_artifact)
known_pk = pk(known_artifact['extrinsic']['raw'])
if artifact_pk == known_pk:
return rev_id
def build_revision(
- self, a_metadata: Dict, i_metadata: Dict) -> Dict:
+ self, a_metadata: Mapping[str, Any],
+ uncompressed_path: str) -> Dict:
normalized_date = normalize_timestamp(int(a_metadata['time']))
return {
'type': 'tar',
'message': self.REVISION_MESSAGE,
'date': normalized_date,
'author': self.SWH_PERSON,
'committer': self.SWH_PERSON,
'committer_date': normalized_date,
'parents': [],
'metadata': {
'intrinsic': {},
'extrinsic': {
'provider': self.url,
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
},
}
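As a sanity check of the filename heuristic, the first example below is the one from the get_version docstring; the second is an assumed variation on the same naming scheme:

from swh.loader.package.gnu import get_version

# the release number is captured to the right of the last name separator
assert get_version('https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz') == '0.2.0'
assert get_version('https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz') == '0.1.0'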
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
index 061c8b3..2c5a083 100644
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -1,409 +1,384 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import logging
import tempfile
import os
-from typing import Dict, Generator, List, Mapping, Optional, Sequence, Tuple
+from typing import (
+ Any, Dict, Generator, List, Mapping, Optional, Sequence, Tuple
+)
from swh.core.tarball import uncompress
from swh.core.config import SWHConfig
from swh.model.from_disk import Directory
from swh.model.identifiers import (
revision_identifier, snapshot_identifier, identifier_to_bytes
)
from swh.storage import get_storage
from swh.storage.algos.snapshot import snapshot_get_all_branches
from swh.loader.core.converters import content_for_storage
from swh.loader.package.utils import download
logger = logging.getLogger(__name__)
# Not implemented yet:
# - clean up disk routines from previous killed workers (when OOMkilled)
# -> separation of concern would like this to be abstracted from the code
# -> experience tells us it's complicated to do as such (T903, T964, T982,
# etc...)
#
# - model: swh.model.merkle.from_disk should output swh.model.model.* objects
# to avoid this layer's conversion routine call
# -> Take this up within swh.model's current implementation
class PackageLoader:
# Origin visit type (str) set by the loader
visit_type = ''
def __init__(self, url):
"""Loader's constructor. This raises exception if the minimal required
configuration is missing (cf. fn:`check` method).
Args:
url (str): Origin url to load data from
"""
# This expects to use the environment variable SWH_CONFIG_FILENAME
self.config = SWHConfig.parse_config_file()
self._check_configuration()
self.storage = get_storage(**self.config['storage'])
self.url = url
self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc)
def _check_configuration(self):
"""Checks the minimal configuration required is set for the loader.
If some required configuration is missing, an exception detailing the
issue is raised.
"""
if 'storage' not in self.config:
raise ValueError(
'Misconfiguration, at least the storage key should be set')
def get_versions(self) -> Sequence[str]:
"""Return the list of all published package versions.
Returns:
Sequence of published versions
"""
return []
- def get_artifacts(self, version: str) -> Generator[
- Tuple[str, str, Dict], None, None]:
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
"""Given a release version of a package, retrieve the associated
- artifact information for such version.
+ package information for such version.
Args:
version: Package version
Returns:
- (artifact filename, artifact uri, raw artifact metadata)
+ (branch name, package metadata)
"""
yield from {}
def build_revision(
self, a_metadata: Dict, uncompressed_path: str) -> Dict:
"""Build the revision dict from the archive metadata (extrinsic
artifact metadata) and the intrinsic metadata read from the
uncompressed path.
Returns:
SWH data dict
"""
return {}
def get_default_release(self) -> str:
"""Retrieve the latest release version
Returns:
Latest version
"""
return ''
def last_snapshot(self) -> Optional[Dict]:
"""Retrieve the last snapshot
"""
visit = self.storage.origin_visit_get_latest(
self.url, require_snapshot=True)
if visit:
return snapshot_get_all_branches(
self.storage, visit['snapshot']['id'])
def known_artifacts(self, snapshot: Dict) -> Dict:
"""Retrieve the known releases/artifacts for the origin.
Args:
snapshot: snapshot for the visit
Returns:
Dict of keys revision id (bytes), values a metadata Dict.
"""
if not snapshot or 'branches' not in snapshot:
return {}
# retrieve only revisions (e.g the alias we do not want here)
revs = [rev['target']
for rev in snapshot['branches'].values()
if rev and rev['target_type'] == 'revision']
known_revisions = self.storage.revision_get(revs)
ret = {}
for revision in known_revisions:
if not revision: # revision_get can return None
continue
ret[revision['id']] = revision['metadata']
return ret
def resolve_revision_from(
self, known_artifacts: Dict, artifact_metadata: Dict) \
-> Optional[bytes]:
"""Resolve the revision from a snapshot and an artifact metadata dict.
If the artifact has already been downloaded, this will return the
existing revision targeting that uncompressed artifact directory.
Otherwise, this returns None.
Args:
known_artifacts: Dict of revision id to known artifact metadata
artifact_metadata: Information dict
Returns:
None or revision identifier
"""
return None
- def download_package(self, artifacts_package_info: Mapping[str, str],
- tmpdir: str) -> Tuple[str, Dict]:
- """Download artifacts for a specific package. All downloads happen in the
- the tmpdir folder.
+ def download_package(self, p_info: Mapping[str, Any],
+ tmpdir: str) -> List[Tuple[str, Dict]]:
+ """Download artifacts for a specific package. All downloads happen in
+ the tmpdir folder.
Default implementation expects the artifacts package info to be
about one artifact per package.
Note that most implementations have 1 artifact per package. But some
implementations have multiple artifacts per package (debian), and some
have none, where the package is the artifact (gnu).
Args:
p_info: Information on the package artifact(s) to
- download (uri, filename, etc...)
+ download (url, filename, etc...)
tmpdir: Location to retrieve such artifacts
- Note:
-
- """
- a_uri = artifacts_package_info['url']
- filename = artifacts_package_info.get('filename')
- return download(a_uri, dest=tmpdir, filename=filename)
-
- def read_intrinsic_metadata(
- self, a_metadata: Dict, a_uncompressed_path: str) -> Dict:
- """Read intrinsic metadata from either the a_metadata or
- the uncompressed path.
-
- Depending on the implementations, some extracts directly from the
- artifacts to ingest (pypi, npm...), some use api to access directly
- their intrinsic metadata (debian exposes a dsc through uri) or some
- have none (gnu).
+ Returns:
+ List of (path, computed hashes)
"""
- return {}
+ a_uri = p_info['url']
+ filename = p_info.get('filename')
+ return [download(a_uri, dest=tmpdir, filename=filename)]
- def uncompress(
- self, a_path: str, tmpdir: str, a_metadata: Dict) -> str:
- """Uncompress the artfifact(s) stored at a_path to tmpdir.
+ def uncompress(self, dl_artifacts: List[Tuple[str, Mapping[str, Any]]],
+ dest: str) -> str:
+ """Uncompress the artifact(s) in the destination folder dest.
- Optionally, this could need to use the a_metadata dict for some more
+ Subclasses may override this when the artifacts need more specific
+ handling (debian).
"""
- uncompressed_path = os.path.join(tmpdir, 'src')
- uncompress(a_path, dest=uncompressed_path)
+ uncompressed_path = os.path.join(dest, 'src')
+ for a_path, _ in dl_artifacts:
+ uncompress(a_path, dest=uncompressed_path)
return uncompressed_path
def load(self) -> Dict:
"""Load for a specific origin the associated contents.
for each package version of the origin
1. Fetch the files for one package version By default, this can be
implemented as a simple HTTP request. Loaders with more specific
requirements can override this, e.g.: the PyPI loader checks the
integrity of the downloaded files; the Debian loader has to download
and check several files for one package version.
2. Extract the downloaded files By default, this would be a universal
archive/tarball extraction.
Loaders for specific formats can override this method (for instance,
the Debian loader uses dpkg-source -x).
3. Convert the extracted directory to a set of Software Heritage
objects Using swh.model.from_disk.
4. Extract the metadata from the unpacked directories This would only
be applicable for "smart" loaders like npm (parsing the
package.json), PyPI (parsing the PKG-INFO file) or Debian (parsing
debian/changelog and debian/control).
On "minimal-metadata" sources such as the GNU archive, the lister
should provide the minimal set of metadata needed to populate the
revision/release objects (authors, dates) as an argument to the
task.
5. Generate the revision/release objects for the given version. From
the data generated at steps 3 and 4.
end for each
6. Generate and load the snapshot for the visit
Using the revisions/releases collected at step 5., and the branch
information from step 0., generate a snapshot and load it into the
Software Heritage archive
"""
status_load = 'uneventful' # either: eventful, uneventful, failed
status_visit = 'full' # either: partial, full
tmp_revisions = {} # type: Dict[str, List]
snapshot = None
try:
# Prepare origin and origin_visit
origin = {'url': self.url}
self.storage.origin_add([origin])
visit_id = self.storage.origin_visit_add(
origin=self.url,
date=self.visit_date,
type=self.visit_type)['visit']
last_snapshot = self.last_snapshot()
logger.debug('last snapshot: %s', last_snapshot)
known_artifacts = self.known_artifacts(last_snapshot)
logger.debug('known artifacts: %s', known_artifacts)
# Retrieve the default release (the "latest" one)
default_release = self.get_default_release()
logger.debug('default release: %s', default_release)
for version in self.get_versions(): # for each
logger.debug('version: %s', version)
tmp_revisions[version] = []
- # `a_` stands for `artifact(s)_`, `p_` stands for `package_`
- for a_p_info, a_metadata in self.get_artifacts(version):
- logger.debug('a_p_info: %s', a_p_info)
- logger.debug('a_metadata: %s', a_metadata)
+ # `p_` stands for `package_`
+ for branch_name, p_info in self.get_package_info(version):
+ logger.debug('package_info: %s', p_info)
revision_id = self.resolve_revision_from(
- known_artifacts, a_metadata)
+ known_artifacts, p_info['raw'])
if revision_id is None:
with tempfile.TemporaryDirectory() as tmpdir:
try:
- # a_c_: archive_computed_
- a_path, a_c_metadata = self.download_package(
- a_p_info, tmpdir)
+ dl_artifacts = self.download_package(
+ p_info, tmpdir)
except Exception:
logger.exception('Unable to retrieve %s',
- a_p_info['url'])
+ p_info)
status_visit = 'partial'
continue
- logger.debug('archive_path: %s', a_path)
- logger.debug('archive_computed_metadata: %s',
- a_c_metadata)
-
uncompressed_path = self.uncompress(
- a_path, tmpdir, a_metadata)
+ dl_artifacts, dest=tmpdir)
logger.debug('uncompressed_path: %s',
uncompressed_path)
directory = Directory.from_disk(
path=uncompressed_path.encode('utf-8'),
data=True) # noqa
# FIXME: Try not to load the full raw content in
# memory
objects = directory.collect()
contents = objects['content'].values()
logger.debug('Number of contents: %s',
len(contents))
self.storage.content_add(
map(content_for_storage, contents))
status_load = 'eventful'
directories = objects['directory'].values()
logger.debug('Number of directories: %s',
len(directories))
self.storage.directory_add(directories)
- i_metadata = self.read_intrinsic_metadata(
- a_metadata, uncompressed_path)
-
# FIXME: This should be release. cf. D409
revision = self.build_revision(
- a_metadata, i_metadata)
+ p_info['raw'], uncompressed_path)
revision.update({
'synthetic': True,
'directory': directory.hash,
})
revision['metadata'].update({
- 'original_artifact': a_c_metadata,
+ 'original_artifact': [
+ hashes for _, hashes in dl_artifacts
+ ],
})
revision['id'] = revision_id = identifier_to_bytes(
revision_identifier(revision))
logger.debug('Revision: %s', revision)
self.storage.revision_add([revision])
- tmp_revisions[version].append(
- (a_p_info['filename'], revision_id))
+ tmp_revisions[version].append((branch_name, revision_id))
# Build and load the snapshot
branches = {}
- for version, v_branches in tmp_revisions.items():
- if len(v_branches) == 1:
- branch_name = (
- version if version == 'HEAD'
- else 'releases/%s' % version).encode('utf-8')
- if version == default_release:
+ for version, branch_name_revisions in tmp_revisions.items():
+ if len(branch_name_revisions) == 1:
+ branch_name, target = branch_name_revisions[0]
+ if branch_name != 'HEAD':
branches[b'HEAD'] = {
'target_type': 'alias',
- 'target': branch_name,
+ 'target': branch_name.encode('utf-8'),
}
+ for branch_name, target in branch_name_revisions:
+ branch_name = branch_name.encode('utf-8')
branches[branch_name] = {
'target_type': 'revision',
- 'target': v_branches[0][1],
+ 'target': target,
}
- else:
- for filename, target in v_branches:
- branch_name = ('releases/%s/%s' % (
- version, filename)).encode('utf-8')
- branches[branch_name] = {
- 'target_type': 'revision',
- 'target': target,
- }
snapshot = {
'branches': branches
}
logger.debug('snapshot: %s', snapshot)
snapshot['id'] = identifier_to_bytes(
snapshot_identifier(snapshot))
logger.debug('snapshot: %s', snapshot)
self.storage.snapshot_add([snapshot])
if hasattr(self.storage, 'flush'):
self.storage.flush()
except Exception:
logger.exception('Failed to load %s', self.url)
status_visit = 'partial'
status_load = 'failed'
finally:
self.storage.origin_visit_update(
origin=self.url, visit_id=visit_id, status=status_visit,
snapshot=snapshot)
result = {
'status': status_load,
}
if snapshot:
result['snapshot_id'] = snapshot['id']
return result
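With this refactoring, a new loader only has to provide get_versions, get_default_release, get_package_info and build_revision; downloading, uncompressing and snapshot building come from the base class. A hypothetical minimal subclass illustrating the new (branch name, package info) protocol (all names and values made up; the revision fields mirror the gnu loader's synthetic revision):

from typing import Any, Dict, Generator, Mapping, Sequence, Tuple

from swh.model.identifiers import normalize_timestamp
from swh.loader.package.loader import PackageLoader


class SingleTarballLoader(PackageLoader):
    """Hypothetical loader: one origin, one tarball, one version."""
    visit_type = 'single-tarball'

    def __init__(self, url: str, tarball_url: str):
        super().__init__(url=url)
        self.tarball_url = tarball_url

    def get_versions(self) -> Sequence[str]:
        return ['1.0']

    def get_default_release(self) -> str:
        return '1.0'

    def get_package_info(self, version: str) -> Generator[
            Tuple[str, Mapping[str, Any]], None, None]:
        # branch name first, then the package info; 'raw' carries the
        # extrinsic metadata handed back to build_revision by load()
        yield 'releases/%s' % version, {
            'url': self.tarball_url,
            'filename': 'archive.tar.gz',
            'raw': {'url': self.tarball_url, 'version': version},
        }

    def build_revision(self, a_metadata: Dict,
                       uncompressed_path: str) -> Dict:
        date = normalize_timestamp(0)  # placeholder timestamp
        person = {'fullname': b'', 'name': None, 'email': None}
        return {
            'type': 'tar',
            'message': b'synthetic revision',
            'author': person, 'date': date,
            'committer': person, 'committer_date': date,
            'parents': [],
            'metadata': {'extrinsic': {'raw': a_metadata}},
        }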
diff --git a/swh/loader/package/npm.py b/swh/loader/package/npm.py
index ae1fa3c..09cdfdd 100644
--- a/swh/loader/package/npm.py
+++ b/swh/loader/package/npm.py
@@ -1,298 +1,295 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import logging
import os
import re
from codecs import BOM_UTF8
-from typing import Dict, Generator, Mapping, Sequence, Tuple, Optional
+from typing import Any, Dict, Generator, Mapping, Sequence, Tuple, Optional
import chardet
import iso8601
from swh.model.identifiers import normalize_timestamp
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import api_info
logger = logging.getLogger(__name__)
_EMPTY_AUTHOR = {'fullname': b'', 'name': None, 'email': None}
# https://github.com/jonschlinkert/author-regex
_author_regexp = r'([^<(]+?)?[ \t]*(?:<([^>(]+?)>)?[ \t]*(?:\(([^)]+?)\)|$)'
def parse_npm_package_author(author_str):
"""
Parse npm package author string.
It works with a flexible range of formats, as detailed below::
name
name <email> (url)
name <email>(url)
name<email> (url)
name<email>(url)
name (url) <email>
name (url)<email>
name(url) <email>
name(url)<email>
name (url)
name(url)
name <email>
name<email>
<email> (url)
<email>(url)
(url) <email>
(url)<email>
<email>
(url)
Args:
author_str (str): input author string
Returns:
dict: A dict that may contain the following keys:
* name
* email
* url
"""
author = {}
matches = re.findall(_author_regexp,
author_str.replace('<>', '').replace('()', ''),
re.M)
for match in matches:
if match[0].strip():
author['name'] = match[0].strip()
if match[1].strip():
author['email'] = match[1].strip()
if match[2].strip():
author['url'] = match[2].strip()
return author
def extract_npm_package_author(package_json):
"""
Extract package author from a ``package.json`` file content and
return it in swh format.
Args:
package_json (dict): Dict holding the content of parsed
``package.json`` file
Returns:
dict: A dict with the following keys:
* fullname
* name
* email
"""
def _author_str(author_data):
if type(author_data) is dict:
author_str = ''
if 'name' in author_data:
author_str += author_data['name']
if 'email' in author_data:
author_str += ' <%s>' % author_data['email']
return author_str
elif type(author_data) is list:
return _author_str(author_data[0]) if len(author_data) > 0 else ''
else:
return author_data
author_data = {}
for author_key in ('author', 'authors'):
if author_key in package_json:
author_str = _author_str(package_json[author_key])
author_data = parse_npm_package_author(author_str)
name = author_data.get('name')
email = author_data.get('email')
fullname = None
if name and email:
fullname = '%s <%s>' % (name, email)
elif name:
fullname = name
if not fullname:
return _EMPTY_AUTHOR
if fullname:
fullname = fullname.encode('utf-8')
if name:
name = name.encode('utf-8')
if email:
email = email.encode('utf-8')
return {'fullname': fullname, 'name': name, 'email': email}
def _lstrip_bom(s, bom=BOM_UTF8):
if s.startswith(bom):
return s[len(bom):]
else:
return s
def load_json(json_bytes):
"""
Try to load JSON from bytes and return a dictionary.
First try to decode from utf-8. If the decoding failed,
try to detect the encoding and decode again with replace
error handling.
If JSON is malformed, an empty dictionary will be returned.
Args:
json_bytes (bytes): binary content of a JSON file
Returns:
dict: JSON data loaded in a dictionary
"""
json_data = {}
json_str = ''
try:
json_str = _lstrip_bom(json_bytes).decode('utf-8')
except UnicodeDecodeError:
encoding = chardet.detect(json_bytes)['encoding']
if encoding:
json_str = json_bytes.decode(encoding, 'replace')
try:
json_data = json.loads(json_str)
except json.decoder.JSONDecodeError:
pass
return json_data
def extract_intrinsic_metadata(dir_path: str) -> Dict:
"""Given an uncompressed path holding the pkginfo file, returns a
pkginfo parsed structure as a dict.
The release artifact contains at their root one folder. For example:
$ tar tvf zprint-0.0.6.tar.gz
drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/
...
Args:
dir_path (str): Path to the uncompressed directory
representing a release artifact from npm.
Returns:
the package.json parsed structure as a dict, or an empty dict if
none was present.
"""
# Retrieve the root folder of the archive
if not os.path.exists(dir_path):
return {}
lst = os.listdir(dir_path)
if len(lst) == 0:
return {}
project_dirname = lst[0]
package_json_path = os.path.join(dir_path, project_dirname, 'package.json')
if not os.path.exists(package_json_path):
return {}
with open(package_json_path, 'rb') as package_json_file:
package_json_bytes = package_json_file.read()
return load_json(package_json_bytes)
class NpmLoader(PackageLoader):
visit_type = 'npm'
def __init__(self, package_name, package_url, package_metadata_url):
super().__init__(url=package_url)
self.provider_url = package_metadata_url
self._info = None
self._versions = None
# if package_url is None:
# package_url = 'https://www.npmjs.com/package/%s' % package_name
# if package_metadata_url is None:
# package_metadata_url = 'https://replicate.npmjs.com/%s/' %\
# quote(package_name, safe='')
@property
def info(self) -> Dict:
"""Return the project metadata information (fetched from npm registry)
"""
if not self._info:
self._info = api_info(self.provider_url)
return self._info
def get_versions(self) -> Sequence[str]:
return sorted(self.info['versions'].keys())
def get_default_release(self) -> str:
return self.info['dist-tags'].get('latest', '')
- def get_artifacts(self, version: str) -> Generator[
- Tuple[Mapping[str, str], Dict], None, None]:
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
meta = self.info['versions'][version]
url = meta['dist']['tarball']
- artifact_package_info = {
+ p_info = {
'url': url,
'filename': os.path.basename(url),
+ 'raw': meta,
}
- yield artifact_package_info, meta
+ yield 'releases/%s' % version, p_info
def resolve_revision_from(
self, known_artifacts: Dict, artifact_metadata: Dict) \
-> Optional[bytes]:
shasum = artifact_metadata['dist']['shasum']
for rev_id, known_artifact in known_artifacts.items():
- original_artifact = known_artifact['original_artifact']
+ original_artifact = known_artifact['original_artifact'][0]
if shasum == original_artifact['checksums']['sha1']:
return rev_id
- def read_intrinsic_metadata(self, a_metadata: Dict,
- a_uncompressed_path: str) -> Dict:
- return extract_intrinsic_metadata(a_uncompressed_path)
-
def build_revision(
- self, a_metadata: Dict, i_metadata: Dict) -> Dict:
-
+ self, a_metadata: Dict, uncompressed_path: str) -> Dict:
+ i_metadata = extract_intrinsic_metadata(uncompressed_path)
# from intrinsic metadata
author = extract_npm_package_author(i_metadata)
# extrinsic metadata
version = i_metadata['version']
date = self.info['time'][version]
date = iso8601.parse_date(date)
date = normalize_timestamp(int(date.timestamp()))
message = version.encode('ascii')
return {
'type': 'tar',
'message': message,
'author': author,
'date': date,
'committer': author,
'committer_date': date,
'parents': [],
'metadata': {
'intrinsic': {
'tool': 'package.json',
'raw': i_metadata,
},
'extrinsic': {
'provider': self.provider_url,
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
},
}
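extract_npm_package_author is pure, so its behavior is easy to illustrate; an example call (package content made up, following the formats listed in the parse_npm_package_author docstring):

from swh.loader.package.npm import extract_npm_package_author

package_json = {
    'name': 'some-package',
    'author': 'John Doe <jdoe@example.org> (https://example.org)',
}
# name and email are combined into the swh fullname; all fields are bytes
assert extract_npm_package_author(package_json) == {
    'fullname': b'John Doe <jdoe@example.org>',
    'name': b'John Doe',
    'email': b'jdoe@example.org',
}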
diff --git a/swh/loader/package/pypi.py b/swh/loader/package/pypi.py
index 156ab2f..820b79c 100644
--- a/swh/loader/package/pypi.py
+++ b/swh/loader/package/pypi.py
@@ -1,186 +1,193 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
-from typing import Generator, Dict, Mapping, Optional, Sequence, Tuple
+from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
from urllib.parse import urlparse
from pkginfo import UnpackedSDist
import iso8601
from swh.model.identifiers import normalize_timestamp
from swh.loader.package.loader import PackageLoader
from swh.loader.package.utils import api_info
def pypi_api_url(url: str) -> str:
"""Compute api url from a project url
Args:
url (str): PyPI instance's url (e.g: https://pypi.org/project/requests)
This deals with correctly transforming the project's api url (e.g
https://pypi.org/pypi/requests/json)
Returns:
api url
"""
p_url = urlparse(url)
project_name = p_url.path.split('/')[-1]
url = '%s://%s/pypi/%s/json' % (p_url.scheme, p_url.netloc, project_name)
return url
def extract_intrinsic_metadata(dir_path: str) -> Dict:
"""Given an uncompressed path holding the pkginfo file, returns a
pkginfo parsed structure as a dict.
The release artifact contains at its root one folder. For example:
$ tar tvf zprint-0.0.6.tar.gz
drwxr-xr-x root/root 0 2018-08-22 11:01 zprint-0.0.6/
...
Args:
dir_path (str): Path to the uncompressed directory
representing a release artifact from pypi.
Returns:
the pkginfo parsed structure as a dict, or an empty dict if none
was present.
"""
# Retrieve the root folder of the archive
if not os.path.exists(dir_path):
return {}
lst = os.listdir(dir_path)
if len(lst) != 1:
return {}
project_dirname = lst[0]
pkginfo_path = os.path.join(dir_path, project_dirname, 'PKG-INFO')
if not os.path.exists(pkginfo_path):
return {}
pkginfo = UnpackedSDist(pkginfo_path)
raw = pkginfo.__dict__
raw.pop('filename') # this gets added with the ondisk location
return raw
def author(data: Dict) -> Dict:
"""Given a dict of project/release artifact information (coming from
PyPI), returns an author subset.
Args:
data (dict): Representing either artifact information or
release information.
Returns:
swh-model dict representing a person.
"""
name = data.get('author')
email = data.get('author_email')
if email:
fullname = '%s <%s>' % (name, email)
else:
fullname = name
if not fullname:
return {'fullname': b'', 'name': None, 'email': None}
fullname = fullname.encode('utf-8')
if name is not None:
name = name.encode('utf-8')
if email is not None:
email = email.encode('utf-8')
return {'fullname': fullname, 'name': name, 'email': email}
class PyPILoader(PackageLoader):
"""Load pypi origin's artifact releases into swh archive.
"""
visit_type = 'pypi'
def __init__(self, url):
super().__init__(url=url)
self._info = None
self.provider_url = pypi_api_url(self.url)
@property
def info(self) -> Dict:
"""Return the project metadata information (fetched from pypi registry)
"""
if not self._info:
self._info = api_info(self.provider_url)
return self._info
def get_versions(self) -> Sequence[str]:
return self.info['releases'].keys()
def get_default_release(self) -> str:
return self.info['info']['version']
- def get_artifacts(self, version: str) -> Generator[
- Tuple[Mapping[str, str], Dict], None, None]:
+ def get_package_info(self, version: str) -> Generator[
+ Tuple[str, Mapping[str, Any]], None, None]:
+ res = []
for meta in self.info['releases'][version]:
- artifact_package_info = {
+ filename = meta['filename']
+ p_info = {
'url': meta['url'],
- 'filename': meta['filename'],
+ 'filename': filename,
+ 'raw': meta,
}
- yield artifact_package_info, meta
+ res.append((version, p_info))
+
+ if len(res) == 1:
+ version, p_info = res[0]
+ yield 'releases/%s' % version, p_info
+ else:
+ for version, p_info in res:
+ yield 'releases/%s/%s' % (version, p_info['filename']), p_info
def resolve_revision_from(
self, known_artifacts: Dict, artifact_metadata: Dict) \
-> Optional[bytes]:
sha256 = artifact_metadata['digests']['sha256']
for rev_id, known_artifact in known_artifacts.items():
- original_artifact = known_artifact['original_artifact']
- if sha256 == original_artifact['checksums']['sha256']:
- return rev_id
-
- def read_intrinsic_metadata(self, a_metadata: Dict,
- a_uncompressed_path: str) -> Dict:
- return extract_intrinsic_metadata(a_uncompressed_path)
+ for original_artifact in known_artifact['original_artifact']:
+ if sha256 == original_artifact['checksums']['sha256']:
+ return rev_id
def build_revision(
- self, a_metadata: Dict, i_metadata: Dict) -> Dict:
+ self, a_metadata: Dict, uncompressed_path: str) -> Dict:
+ i_metadata = extract_intrinsic_metadata(uncompressed_path)
# from intrinsic metadata
name = i_metadata['version']
_author = author(i_metadata)
# from extrinsic metadata
message = a_metadata.get('comment_text', '')
message = '%s: %s' % (name, message) if message else name
date = normalize_timestamp(
int(iso8601.parse_date(a_metadata['upload_time']).timestamp()))
return {
'type': 'tar',
'message': message.encode('utf-8'),
'author': _author,
'date': date,
'committer': _author,
'committer_date': date,
'parents': [],
'metadata': {
'intrinsic': {
'tool': 'PKG-INFO',
'raw': i_metadata,
},
'extrinsic': {
'provider': self.provider_url,
'when': self.visit_date.isoformat(),
'raw': a_metadata,
},
}
}
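pypi_api_url is the one pure helper here; a short check of its mapping (the example is the one from its docstring), plus a summary of the new branch naming rule with hypothetical filenames:

from swh.loader.package.pypi import pypi_api_url

# project page url -> json api url
assert pypi_api_url('https://pypi.org/project/requests') == \
    'https://pypi.org/pypi/requests/json'

# branch naming after this change (filenames hypothetical):
#   one artifact for a version -> releases/1.2.0
#   several artifacts          -> releases/1.3.0/pkg-1.3.0.tar.gz and
#                                 releases/1.3.0/pkg-1.3.0-py3-none-any.whl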
diff --git a/swh/loader/package/tests/test_debian.py b/swh/loader/package/tests/test_debian.py
index 85663ac..b453c28 100644
--- a/swh/loader/package/tests/test_debian.py
+++ b/swh/loader/package/tests/test_debian.py
@@ -1,316 +1,318 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import copy
import logging
import pytest
from os import path
from swh.loader.package.debian import (
DebianLoader, download_package, dsc_information, uid_to_person,
prepare_person, get_package_metadata, extract_package
)
from swh.loader.package.tests.common import check_snapshot
logger = logging.getLogger(__name__)
PACKAGE_FILES = {
'files': {
'cicero_0.7.2-3.diff.gz': {
'md5sum': 'a93661b6a48db48d59ba7d26796fc9ce',
'name': 'cicero_0.7.2-3.diff.gz',
'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c', # noqa
'size': 3964,
'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2-3.diff.gz' # noqa
},
'cicero_0.7.2-3.dsc': {
'md5sum': 'd5dac83eb9cfc9bb52a15eb618b4670a',
'name': 'cicero_0.7.2-3.dsc',
'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03', # noqa
'size': 1864,
'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2-3.dsc'}, # noqa
'cicero_0.7.2.orig.tar.gz': {
'md5sum': '4353dede07c5728319ba7f5595a7230a',
'name': 'cicero_0.7.2.orig.tar.gz',
'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786', # noqa
'size': 96527,
'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz' # noqa
}
},
'id': 23,
'name': 'cicero',
'revision_id': None,
'version': '0.7.2-3'
}
PACKAGE_PER_VERSION = {
'stretch/contrib/0.7.2-3': PACKAGE_FILES
}
def test_debian_first_visit(
swh_config, requests_mock_datadir):
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
loader = DebianLoader(
url='deb://Debian/packages/cicero',
date='2019-10-12T05:58:09.165557+00:00',
packages=PACKAGE_PER_VERSION)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': 42,
'directory': 2,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 1, # all artifacts under 1 revision
'skipped_content': 0,
'snapshot': 1
} == stats
expected_snapshot = {
'id': 'a59ec49a01ff329dcbbc63fd36a5654143aef240',
'branches': {
'HEAD': {
'target_type': 'alias',
'target': 'releases/stretch/contrib/0.7.2-3'
},
'releases/stretch/contrib/0.7.2-3': {
'target_type': 'revision',
'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
}
},
} # different than the previous loader as no release is done
check_snapshot(expected_snapshot, loader.storage)
def test_debian_first_visit_then_another_visit(
swh_config, requests_mock_datadir):
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
url = 'deb://Debian/packages/cicero'
loader = DebianLoader(
url=url,
date='2019-10-12T05:58:09.165557+00:00',
packages=PACKAGE_PER_VERSION)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'full'
stats = loader.storage.stat_counters()
assert {
'content': 42,
'directory': 2,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 1, # all artifacts under 1 revision
'skipped_content': 0,
'snapshot': 1
} == stats
expected_snapshot = {
'id': 'a59ec49a01ff329dcbbc63fd36a5654143aef240',
'branches': {
'HEAD': {
'target_type': 'alias',
'target': 'releases/stretch/contrib/0.7.2-3'
},
'releases/stretch/contrib/0.7.2-3': {
'target_type': 'revision',
'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
}
},
} # different than the previous loader as no release is done
check_snapshot(expected_snapshot, loader.storage)
# No change in between load
actual_load_status2 = loader.load()
assert actual_load_status2['status'] == 'uneventful'
origin_visit2 = list(loader.storage.origin_visit_get(url))
assert origin_visit2[-1]['status'] == 'full'
stats2 = loader.storage.stat_counters()
assert {
'content': 42 + 0,
'directory': 2 + 0,
'origin': 1,
'origin_visit': 1 + 1, # a new visit occurred
'person': 1,
'release': 0,
'revision': 1,
'skipped_content': 0,
'snapshot': 1, # same snapshot across 2 visits
} == stats2
urls = [
m.url for m in requests_mock_datadir.request_history
if m.url.startswith('http://deb.debian.org')
]
# each package artifact was fetched only once across the 2 visits
assert len(urls) == len(set(urls))
def test_uid_to_person():
uid = 'Someone Name <someone@orga.org>'
actual_person = uid_to_person(uid)
assert actual_person == {
'name': 'Someone Name',
'email': 'someone@orga.org',
'fullname': uid,
}
def test_prepare_person():
actual_author = prepare_person({
'name': 'Someone Name',
'email': 'someone@orga.org',
'fullname': 'Someone Name <someone@orga.org>',
})
assert actual_author == {
'name': b'Someone Name',
'email': b'someone@orga.org',
'fullname': b'Someone Name <someone@orga.org>',
}
def test_download_package(datadir, tmpdir, requests_mock_datadir):
tmpdir = str(tmpdir) # py3.5 workaround (LocalPath issue)
all_hashes = download_package(PACKAGE_FILES, tmpdir)
assert all_hashes == {
'cicero_0.7.2-3.diff.gz': {
'checksums': {
'blake2s256': '08b1c438e70d2474bab843d826515147fa4a817f8c4baaf3ddfbeb5132183f21', # noqa
'sha1': '0815282053f21601b0ec4adf7a8fe47eace3c0bc',
'sha1_git': '834ac91da3a9da8f23f47004bb456dd5bd16fe49',
'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c' # noqa
},
'filename': 'cicero_0.7.2-3.diff.gz',
'length': 3964},
'cicero_0.7.2-3.dsc': {
'checksums': {
'blake2s256': '8c002bead3e35818eaa9d00826f3d141345707c58fb073beaa8abecf4bde45d2', # noqa
'sha1': 'abbec4e8efbbc80278236e1dd136831eac08accd',
'sha1_git': '1f94b2086fa1142c2df6b94092f5c5fa11093a8e',
'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03' # noqa
},
'filename': 'cicero_0.7.2-3.dsc',
'length': 1864},
'cicero_0.7.2.orig.tar.gz': {
'checksums': {
'blake2s256': '9809aa8d2e2dad7f34cef72883db42b0456ab7c8f1418a636eebd30ab71a15a6', # noqa
'sha1': 'a286efd63fe2c9c9f7bb30255c3d6fcdcf390b43',
'sha1_git': 'aa0a38978dce86d531b5b0299b4a616b95c64c74',
'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786' # noqa
},
'filename': 'cicero_0.7.2.orig.tar.gz',
'length': 96527
}
}
def test_dsc_information_ok():
fname = 'cicero_0.7.2-3.dsc'
dsc_url, dsc_name = dsc_information(PACKAGE_FILES)
assert dsc_url == PACKAGE_FILES['files'][fname]['uri']
assert dsc_name == PACKAGE_FILES['files'][fname]['name']
def test_dsc_information_not_found():
fname = 'cicero_0.7.2-3.dsc'
package_files = copy.deepcopy(PACKAGE_FILES)
package_files['files'].pop(fname)
dsc_url, dsc_name = dsc_information(package_files)
assert dsc_url is None
assert dsc_name is None
def test_dsc_information_too_many_dsc_entries():
# craft an extra dsc file
fname = 'cicero_0.7.2-3.dsc'
package_files = copy.deepcopy(PACKAGE_FILES)
data = package_files['files'][fname]
fname2 = fname.replace('cicero', 'ciceroo')
package_files['files'][fname2] = data
with pytest.raises(
ValueError, match='Package %s_%s references several dsc' % (
package_files['name'], package_files['version'])):
dsc_information(package_files)
def test_get_package_metadata(requests_mock_datadir, datadir, tmp_path):
tmp_path = str(tmp_path) # py3.5 compat.
package = PACKAGE_FILES
logger.debug('package: %s', package)
# download the packages
- download_package(package, tmp_path)
+ all_hashes = download_package(package, tmp_path)
# Retrieve information from package
_, dsc_name = dsc_information(package)
+ dl_artifacts = [(tmp_path, hashes) for hashes in all_hashes.values()]
+
# Extract information from package
- extracted_path = extract_package(package, tmp_path)
+ extracted_path = extract_package(dl_artifacts, tmp_path)
# Retrieve information on package
dsc_path = path.join(path.dirname(extracted_path), dsc_name)
actual_package_info = get_package_metadata(
package, dsc_path, extracted_path)
logger.debug('actual_package_info: %s', actual_package_info)
assert actual_package_info == {
'changelog': {
'date': '2014-10-19T16:52:35+02:00',
'history': [
('cicero', '0.7.2-2'),
('cicero', '0.7.2-1'),
('cicero', '0.7-1')
],
'person': {
'email': 'sthibault@debian.org',
'fullname': 'Samuel Thibault <sthibault@debian.org>',
'name': 'Samuel Thibault'
}
},
'maintainers': [
{
'email': 'debian-accessibility@lists.debian.org',
'fullname': 'Debian Accessibility Team '
'<debian-accessibility@lists.debian.org>',
'name': 'Debian Accessibility Team'
},
{
'email': 'sthibault@debian.org',
'fullname': 'Samuel Thibault <sthibault@debian.org>',
'name': 'Samuel Thibault'
}
],
'name': 'cicero',
'version': '0.7.2-3'
}
diff --git a/swh/loader/package/tests/test_deposit.py b/swh/loader/package/tests/test_deposit.py
index 8cc5723..2e999b0 100644
--- a/swh/loader/package/tests/test_deposit.py
+++ b/swh/loader/package/tests/test_deposit.py
@@ -1,199 +1,204 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.deposit import DepositLoader
from swh.loader.package.tests.common import (
check_snapshot, check_metadata_paths
)
from swh.core.pytest_plugin import requests_mock_datadir_factory
def test_deposit_init_ok(swh_config):
url = 'some-url'
deposit_id = 999
loader = DepositLoader(url, deposit_id) # Something that does not exist
assert loader.url == url
assert loader.archive_url == '/%s/raw/' % deposit_id
assert loader.metadata_url == '/%s/meta/' % deposit_id
assert loader.deposit_update_url == '/%s/update/' % deposit_id
assert loader.client is not None
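The three URL attributes asserted here are simple templates keyed on the deposit id. A hypothetical sketch of how a loader could derive them (the relative-path form is an assumption taken from this test):

def deposit_urls_sketch(deposit_id):
    # Relative private-API paths, matching test_deposit_init_ok
    base = '/%s' % deposit_id
    return {
        'archive_url': base + '/raw/',
        'metadata_url': base + '/meta/',
        'deposit_update_url': base + '/update/',
    }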
def test_deposit_loading_failure_to_fetch_metadata(swh_config):
"""Error during fetching artifact ends us with failed/partial visit
"""
# private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
url = 'some-url'
unknown_deposit_id = 666
loader = DepositLoader(url, unknown_deposit_id) # does not exist
actual_load_status = loader.load()
assert actual_load_status['status'] == 'failed'
stats = loader.storage.stat_counters()
assert {
'content': 0,
'directory': 0,
'origin': 1,
'origin_visit': 1,
'person': 0,
'release': 0,
'revision': 0,
'skipped_content': 0,
'snapshot': 0,
} == stats
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[
'https://deposit.softwareheritage.org/1/private/666/raw/',
])
def test_deposit_loading_failure_to_retrieve_1_artifact(
swh_config, requests_mock_datadir_missing_one):
"""Deposit with missing artifact ends up with an uneventful/partial visit
"""
# private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
url = 'some-url-2'
deposit_id = 666
loader = DepositLoader(url, deposit_id)
assert loader.archive_url
actual_load_status = loader.load()
assert actual_load_status['status'] == 'uneventful'
stats = loader.storage.stat_counters()
assert {
'content': 0,
'directory': 0,
'origin': 1,
'origin_visit': 1,
'person': 0,
'release': 0,
'revision': 0,
'skipped_content': 0,
'snapshot': 1,
} == stats
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
def test_revision_metadata_structure(swh_config, requests_mock_datadir):
# do not care for deposit update query
requests_mock_datadir.put(re.compile('https'))
url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
deposit_id = 666
loader = DepositLoader(url, deposit_id)
assert loader.archive_url
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
expected_revision_id = hash_to_bytes(
'9471c606239bccb1f269564c9ea114e1eeab9eb4')
revision = list(loader.storage.revision_get([expected_revision_id]))[0]
assert revision is not None
check_metadata_paths(revision['metadata'], paths=[
('extrinsic.provider', str),
('extrinsic.when', str),
('extrinsic.raw', dict),
- ('original_artifact.filename', str),
- ('original_artifact.length', int),
- ('original_artifact.checksums', dict),
+ ('original_artifact', list),
])
+ for original_artifact in revision['metadata']['original_artifact']:
+ check_metadata_paths(original_artifact, paths=[
+ ('filename', str),
+ ('length', int),
+ ('checksums', dict),
+ ])
+
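Hunks of this shape recur in every loader test below: original_artifact changes from a single mapping to a list, so the structure check asserts the top-level key is a list and then validates each element. For reference, a minimal sketch of a dotted-path checker compatible with these assertions (an assumption about check_metadata_paths, not its actual code):

def check_metadata_paths_sketch(metadata, paths):
    # Each dotted path must resolve through nested dicts to a value
    # of the expected type, e.g. ('extrinsic.provider', str).
    for dotted_path, expected_type in paths:
        value = metadata
        for key in dotted_path.split('.'):
            assert isinstance(value, dict), dotted_path
            value = value[key]
        assert isinstance(value, expected_type), dotted_path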
def test_deposit_loading_ok(swh_config, requests_mock_datadir):
requests_mock_datadir.put(re.compile('https')) # do not care for put
url = 'https://hal-test.archives-ouvertes.fr/some-external-id'
deposit_id = 666
loader = DepositLoader(url, deposit_id)
assert loader.archive_url
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': 303,
'directory': 12,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 1,
'skipped_content': 0,
'snapshot': 1,
} == stats
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'full'
expected_branches = {
'HEAD': {
'target': '9471c606239bccb1f269564c9ea114e1eeab9eb4',
'target_type': 'revision',
},
}
expected_snapshot = {
'id': '453f455d0efb69586143cd6b6e5897f9906b53a7',
'branches': expected_branches,
}
check_snapshot(expected_snapshot, storage=loader.storage)
# check metadata
tool = {
"name": "swh-deposit",
"version": "0.0.1",
"configuration": {
"sword_version": "2",
}
}
tool = loader.storage.tool_get(tool)
assert tool is not None
assert tool['id'] is not None
provider = {
"provider_name": "hal",
"provider_type": "deposit_client",
"provider_url": "https://hal-test.archives-ouvertes.fr/",
"metadata": None,
}
provider = loader.storage.metadata_provider_get_by(provider)
assert provider is not None
assert provider['id'] is not None
metadata = loader.storage.origin_metadata_get_by(
url, provider_type='deposit_client')
assert metadata is not None
assert isinstance(metadata, list)
assert len(metadata) == 1
metadata0 = metadata[0]
assert metadata0['provider_id'] == provider['id']
assert metadata0['provider_type'] == 'deposit_client'
assert metadata0['tool_id'] == tool['id']
diff --git a/swh/loader/package/tests/test_gnu.py b/swh/loader/package/tests/test_gnu.py
index ea70a83..3be6610 100644
--- a/swh/loader/package/tests/test_gnu.py
+++ b/swh/loader/package/tests/test_gnu.py
@@ -1,349 +1,354 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import re
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.gnu import GNULoader, get_version
from swh.loader.package.tests.common import (
check_snapshot, check_metadata_paths
)
def test_get_version():
"""From url to branch name should yield something relevant
"""
for url, expected_branchname in [
('https://gnu.org/sthg/info-2.1.0.tar.gz', '2.1.0'),
('https://gnu.org/sthg/info-2.1.2.zip', '2.1.2'),
('https://sthg.org/gnu/sthg.tar.gz', 'sthg'),
('https://sthg.org/gnu/DLDF-1.1.4.tar.gz', '1.1.4'),
('https://sthg.org/gnu/anubis-latest.tar.bz2', 'latest'),
('https://ftp.org/gnu/aris-w32.zip', 'w32'),
('https://ftp.org/gnu/aris-w32-2.2.zip', 'w32-2.2'),
('https://ftp.org/gnu/autogen.info.tar.gz', 'autogen.info'),
('https://ftp.org/gnu/crypto-build-demo.tar.gz',
'crypto-build-demo'),
('https://ftp.org/gnu/clue+clio+xit.clisp.tar.gz',
'clue+clio+xit.clisp'),
('https://ftp.org/gnu/clue+clio.for-pcl.tar.gz',
'clue+clio.for-pcl'),
('https://ftp.org/gnu/clisp-hppa2.0-hp-hpux10.20.tar.gz',
'hppa2.0-hp-hpux10.20'),
('clisp-i386-solaris2.6.tar.gz', 'i386-solaris2.6'),
('clisp-mips-sgi-irix6.5.tar.gz', 'mips-sgi-irix6.5'),
('clisp-powerpc-apple-macos.tar.gz', 'powerpc-apple-macos'),
('clisp-powerpc-unknown-linuxlibc6.tar.gz',
'powerpc-unknown-linuxlibc6'),
('clisp-rs6000-ibm-aix3.2.5.tar.gz', 'rs6000-ibm-aix3.2.5'),
('clisp-sparc-redhat51-linux.tar.gz', 'sparc-redhat51-linux'),
('clisp-sparc-sun-solaris2.4.tar.gz', 'sparc-sun-solaris2.4'),
('clisp-sparc-sun-sunos4.1.3_U1.tar.gz',
'sparc-sun-sunos4.1.3_U1'),
('clisp-2.25.1-powerpc-apple-MacOSX.tar.gz',
'2.25.1-powerpc-apple-MacOSX'),
('clisp-2.27-PowerMacintosh-powerpc-Darwin-1.3.7.tar.gz',
'2.27-PowerMacintosh-powerpc-Darwin-1.3.7'),
('clisp-2.27-i686-unknown-Linux-2.2.19.tar.gz',
'2.27-i686-unknown-Linux-2.2.19'),
('clisp-2.28-i386-i386-freebsd-4.3-RELEASE.tar.gz',
'2.28-i386-i386-freebsd-4.3-RELEASE'),
('clisp-2.28-i686-unknown-cygwin_me-4.90-1.3.10.tar.gz',
'2.28-i686-unknown-cygwin_me-4.90-1.3.10'),
('clisp-2.29-i386-i386-freebsd-4.6-STABLE.tar.gz',
'2.29-i386-i386-freebsd-4.6-STABLE'),
('clisp-2.29-i686-unknown-cygwin_nt-5.0-1.3.12.tar.gz',
'2.29-i686-unknown-cygwin_nt-5.0-1.3.12'),
('gcl-2.5.3-ansi-japi-xdr.20030701_mingw32.zip',
'2.5.3-ansi-japi-xdr.20030701_mingw32'),
('gettext-runtime-0.13.1.bin.woe32.zip', '0.13.1.bin.woe32'),
('sather-logo_images.tar.gz', 'sather-logo_images'),
('sather-specification-000328.html.tar.gz', '000328.html')
]:
actual_branchname = get_version(url)
assert actual_branchname == expected_branchname
_expected_new_contents_first_visit = [
'e9258d81faf5881a2f96a77ba609396f82cb97ad',
'1170cf105b04b7e2822a0e09d2acf71da7b9a130',
'fbd27c3f41f2668624ffc80b7ba5db9b92ff27ac',
'0057bec9b5422aff9256af240b177ac0e3ac2608',
'2b8d0d0b43a1078fc708930c8ddc2956a86c566e',
'27de3b3bc6545d2a797aeeb4657c0e215a0c2e55',
'2e6db43f5cd764e677f416ff0d0c78c7a82ef19b',
'ae9be03bd2a06ed8f4f118d3fe76330bb1d77f62',
'edeb33282b2bffa0e608e9d2fd960fd08093c0ea',
'd64e64d4c73679323f8d4cde2643331ba6c20af9',
'7a756602914be889c0a2d3952c710144b3e64cb0',
'84fb589b554fcb7f32b806951dcf19518d67b08f',
'8624bcdae55baeef00cd11d5dfcfa60f68710a02',
'e08441aeab02704cfbd435d6445f7c072f8f524e',
'f67935bc3a83a67259cda4b2d43373bd56703844',
'809788434b433eb2e3cfabd5d591c9a659d5e3d8',
'7d7c6c8c5ebaeff879f61f37083a3854184f6c41',
'b99fec102eb24bffd53ab61fc30d59e810f116a2',
'7d149b28eaa228b3871c91f0d5a95a2fa7cb0c68',
'f0c97052e567948adf03e641301e9983c478ccff',
'7fb724242e2b62b85ca64190c31dcae5303e19b3',
'4f9709e64a9134fe8aefb36fd827b84d8b617ab5',
'7350628ccf194c2c3afba4ac588c33e3f3ac778d',
'0bb892d9391aa706dc2c3b1906567df43cbe06a2',
'49d4c0ce1a16601f1e265d446b6c5ea6b512f27c',
'6b5cc594ac466351450f7f64a0b79fdaf4435ad3',
'3046e5d1f70297e2a507b98224b6222c9688d610',
'1572607d456d7f633bc6065a2b3048496d679a31',
]
_expected_new_directories_first_visit = [
'daabc65ec75d487b1335ffc101c0ac11c803f8fc',
'263be23b4a8101d3ad0d9831319a3e0f2b065f36',
'7f6e63ba6eb3e2236f65892cd822041f1a01dd5c',
'4db0a3ecbc976083e2dac01a62f93729698429a3',
'dfef1c80e1098dd5deda664bb44a9ab1f738af13',
'eca971d346ea54d95a6e19d5051f900237fafdaa',
'3aebc29ed1fccc4a6f2f2010fb8e57882406b528',
]
_expected_new_revisions_first_visit = {
'44183488c0774ce3c957fa19ba695cf18a4a42b3':
'3aebc29ed1fccc4a6f2f2010fb8e57882406b528'
}
_expected_branches_first_visit = {
'HEAD': {
'target_type': 'alias',
'target': 'releases/0.1.0',
},
'releases/0.1.0': {
'target_type': 'revision',
'target': '44183488c0774ce3c957fa19ba695cf18a4a42b3',
},
}
# hash is different than before as we changed the snapshot:
# gnu used to use `release/` (singular) instead of plural
_expected_new_snapshot_first_visit_id = 'c419397fd912039825ebdbea378bc6283f006bf5' # noqa
def test_visit_with_no_artifact_found(swh_config, requests_mock):
package_url = 'https://ftp.gnu.org/gnu/8sync/'
tarballs = [{
'time': '944729610',
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
'length': 221837,
}]
loader = GNULoader(package_url, tarballs)
requests_mock.get(re.compile('https://'), status_code=404)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'uneventful'
stats = loader.storage.stat_counters()
assert {
'content': 0,
'directory': 0,
'origin': 1,
'origin_visit': 1,
'person': 0,
'release': 0,
'revision': 0,
'skipped_content': 0,
'snapshot': 1,
} == stats
origin_visit = next(loader.storage.origin_visit_get(package_url))
assert origin_visit['status'] == 'partial'
def test_check_revision_metadata_structure(swh_config, requests_mock_datadir):
package_url = 'https://ftp.gnu.org/gnu/8sync/'
tarballs = [{
'time': '944729610',
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
'length': 221837,
}]
loader = GNULoader(package_url, tarballs)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
expected_revision_id = hash_to_bytes(
'44183488c0774ce3c957fa19ba695cf18a4a42b3')
revision = list(loader.storage.revision_get([expected_revision_id]))[0]
assert revision is not None
check_metadata_paths(revision['metadata'], paths=[
('intrinsic', dict),
('extrinsic.provider', str),
('extrinsic.when', str),
('extrinsic.raw', dict),
- ('original_artifact.filename', str),
- ('original_artifact.length', int),
- ('original_artifact.checksums', dict),
+ ('original_artifact', list),
])
+ for original_artifact in revision['metadata']['original_artifact']:
+ check_metadata_paths(original_artifact, paths=[
+ ('filename', str),
+ ('length', int),
+ ('checksums', dict),
+ ])
+
def test_visit_with_release_artifact_no_prior_visit(
swh_config, requests_mock_datadir):
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
package_url = 'https://ftp.gnu.org/gnu/8sync/'
tarballs = [{
'time': 944729610,
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
'length': 221837,
}]
loader = GNULoader(package_url, tarballs)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': len(_expected_new_contents_first_visit),
'directory': len(_expected_new_directories_first_visit),
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': len(_expected_new_revisions_first_visit),
'skipped_content': 0,
'snapshot': 1
} == stats
expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit)
assert list(loader.storage.content_missing_per_sha1(expected_contents)) \
== []
expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit)
assert list(loader.storage.directory_missing(expected_dirs)) == []
expected_revs = map(hash_to_bytes, _expected_new_revisions_first_visit)
assert list(loader.storage.revision_missing(expected_revs)) == []
expected_snapshot = {
'id': _expected_new_snapshot_first_visit_id,
'branches': _expected_branches_first_visit,
}
check_snapshot(expected_snapshot, loader.storage)
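check_snapshot, used throughout these tests, compares an expected snapshot written with hex ids and str branch names against what the storage holds. A hypothetical equivalent, assuming the storage exposes snapshot_get and stores bytes-keyed branches:

from swh.model.hashutil import hash_to_bytes

def check_snapshot_sketch(expected_snapshot, storage):
    # Fetch the snapshot by id, then compare each expected branch after
    # converting the str/hex values to their stored bytes form.
    snapshot = storage.snapshot_get(hash_to_bytes(expected_snapshot['id']))
    assert snapshot is not None
    for name, expected in expected_snapshot['branches'].items():
        branch = snapshot['branches'][name.encode()]
        assert branch['target_type'] == expected['target_type']
        if expected['target_type'] == 'alias':
            assert branch['target'] == expected['target'].encode()
        else:  # a revision target is written as a hex id
            assert branch['target'] == hash_to_bytes(expected['target'])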
def test_2_visits_without_change(swh_config, requests_mock_datadir):
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
url = 'https://ftp.gnu.org/gnu/8sync/'
tarballs = [{
'time': 944729610,
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
'length': 221837,
}]
loader = GNULoader(url, tarballs)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit['status'] == 'full'
actual_load_status2 = loader.load()
assert actual_load_status2['status'] == 'uneventful'
origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit2['status'] == 'full'
urls = [
m.url for m in requests_mock_datadir.request_history
if m.url.startswith('https://ftp.gnu.org')
]
assert len(urls) == 1
def test_2_visits_with_new_artifact(swh_config, requests_mock_datadir):
"""With no prior visit, load a gnu project ends up with 1 snapshot
"""
assert 'SWH_CONFIG_FILENAME' in os.environ # cf. tox.ini
url = 'https://ftp.gnu.org/gnu/8sync/'
tarball1 = {
'time': 944729610,
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
'length': 221837,
}
loader = GNULoader(url, [tarball1])
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit['status'] == 'full'
stats = loader.storage.stat_counters()
assert {
'content': len(_expected_new_contents_first_visit),
'directory': len(_expected_new_directories_first_visit),
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': len(_expected_new_revisions_first_visit),
'skipped_content': 0,
'snapshot': 1
} == stats
urls = [
m.url for m in requests_mock_datadir.request_history
if m.url.startswith('https://ftp.gnu.org')
]
assert len(urls) == 1
tarball2 = {
'time': 1480991830,
'archive': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
'length': 238466,
}
loader2 = GNULoader(url, [tarball1, tarball2])
# implementation detail: share the storage between visits
loader2.storage = loader.storage
stats2 = loader2.storage.stat_counters()
assert stats == stats2 # ensure we share the storage
actual_load_status2 = loader2.load()
assert actual_load_status2['status'] == 'eventful'
stats2 = loader.storage.stat_counters()
assert {
'content': len(_expected_new_contents_first_visit) + 14,
'directory': len(_expected_new_directories_first_visit) + 8,
'origin': 1,
'origin_visit': 1 + 1,
'person': 1,
'release': 0,
'revision': len(_expected_new_revisions_first_visit) + 1,
'skipped_content': 0,
'snapshot': 1 + 1,
} == stats2
origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit2['status'] == 'full'
urls = [
m.url for m in requests_mock_datadir.request_history
if m.url.startswith('https://ftp.gnu.org')
]
# 1 artifact (2nd time no modification) + 1 new artifact
assert len(urls) == 2
diff --git a/swh/loader/package/tests/test_npm.py b/swh/loader/package/tests/test_npm.py
index 1c253c0..654c472 100644
--- a/swh/loader/package/tests/test_npm.py
+++ b/swh/loader/package/tests/test_npm.py
@@ -1,526 +1,531 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import json
import os
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.npm import (
parse_npm_package_author, extract_npm_package_author
)
from swh.loader.package.tests.common import (
check_snapshot, check_metadata_paths
)
from swh.loader.package.npm import NpmLoader
def _parse_author_string_test(author_str, expected_result):
assert parse_npm_package_author(author_str) == expected_result
assert parse_npm_package_author(' %s' % author_str) == expected_result
assert parse_npm_package_author('%s ' % author_str) == expected_result
def test_parse_npm_package_author():
_parse_author_string_test(
'John Doe',
{
'name': 'John Doe'
}
)
_parse_author_string_test(
'<john.doe@foo.bar>',
{
'email': 'john.doe@foo.bar'
}
)
_parse_author_string_test(
'(https://john.doe)',
{
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe <john.doe@foo.bar>',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar'
}
)
_parse_author_string_test(
'John Doe<john.doe@foo.bar>',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar'
}
)
_parse_author_string_test(
'John Doe (https://john.doe)',
{
'name': 'John Doe',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe(https://john.doe)',
{
'name': 'John Doe',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'<john.doe@foo.bar> (https://john.doe)',
{
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'(https://john.doe) <john.doe@foo.bar>',
{
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe <john.doe@foo.bar> (https://john.doe)',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe (https://john.doe) <john.doe@foo.bar>',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe<john.doe@foo.bar> (https://john.doe)',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test(
'John Doe<john.doe@foo.bar>(https://john.doe)',
{
'name': 'John Doe',
'email': 'john.doe@foo.bar',
'url': 'https://john.doe'
}
)
_parse_author_string_test('', {})
_parse_author_string_test('<>', {})
_parse_author_string_test(' <>', {})
_parse_author_string_test('<>()', {})
_parse_author_string_test('<> ()', {})
_parse_author_string_test('()', {})
_parse_author_string_test(' ()', {})
_parse_author_string_test(
'John Doe <> ()',
{
'name': 'John Doe'
}
)
_parse_author_string_test(
'John Doe <>',
{
'name': 'John Doe'
}
)
_parse_author_string_test(
'John Doe ()',
{
'name': 'John Doe'
}
)
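The cases above fully specify the 'Name <email> (url)' grammar, including empty components and arbitrary spacing. A hypothetical parser consistent with all of them:

import re

def parse_npm_package_author_sketch(author_str):
    # Pull out an optional <email> and (url); whatever remains after
    # removing them is the name. Empty components are dropped.
    author = {}
    email_match = re.search(r'<(.*?)>', author_str)
    if email_match and email_match.group(1).strip():
        author['email'] = email_match.group(1).strip()
    url_match = re.search(r'\((.*?)\)', author_str)
    if url_match and url_match.group(1).strip():
        author['url'] = url_match.group(1).strip()
    name = re.sub(r'<.*?>|\(.*?\)', '', author_str).strip()
    if name:
        author['name'] = name
    return author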
def test_extract_npm_package_author(datadir):
package_metadata_filepath = os.path.join(
datadir, 'https_replicate.npmjs.com', 'org_visit1')
with open(package_metadata_filepath) as json_file:
package_metadata = json.load(json_file)
assert extract_npm_package_author(package_metadata['versions']['0.0.2']) == \
{
'fullname': b'mooz <stillpedant@gmail.com>',
'name': b'mooz',
'email': b'stillpedant@gmail.com'
}
assert (
extract_npm_package_author(package_metadata['versions']['0.0.3']) ==
{
'fullname': b'Masafumi Oyamada <stillpedant@gmail.com>',
'name': b'Masafumi Oyamada',
'email': b'stillpedant@gmail.com'
}
)
package_json = json.loads('''
{
"name": "highlightjs-line-numbers.js",
"version": "2.7.0",
"description": "Highlight.js line numbers plugin.",
"main": "src/highlightjs-line-numbers.js",
"dependencies": {},
"devDependencies": {
"gulp": "^4.0.0",
"gulp-rename": "^1.4.0",
"gulp-replace": "^0.6.1",
"gulp-uglify": "^1.2.0"
},
"repository": {
"type": "git",
"url": "https://github.com/wcoder/highlightjs-line-numbers.js.git"
},
"author": "Yauheni Pakala <evgeniy.pakalo@gmail.com>",
"license": "MIT",
"bugs": {
"url": "https://github.com/wcoder/highlightjs-line-numbers.js/issues"
},
"homepage": "http://wcoder.github.io/highlightjs-line-numbers.js/"
}''') # noqa
assert extract_npm_package_author(package_json) == \
{
'fullname': b'Yauheni Pakala <evgeniy.pakalo@gmail.com>',
'name': b'Yauheni Pakala',
'email': b'evgeniy.pakalo@gmail.com'
}
package_json = json.loads('''
{
"name": "3-way-diff",
"version": "0.0.1",
"description": "3-way diffing of JavaScript objects",
"main": "index.js",
"authors": [
{
"name": "Shawn Walsh",
"url": "https://github.com/shawnpwalsh"
},
{
"name": "Markham F Rollins IV",
"url": "https://github.com/mrollinsiv"
}
],
"keywords": [
"3-way diff",
"3 way diff",
"three-way diff",
"three way diff"
],
"devDependencies": {
"babel-core": "^6.20.0",
"babel-preset-es2015": "^6.18.0",
"mocha": "^3.0.2"
},
"dependencies": {
"lodash": "^4.15.0"
}
}''')
assert extract_npm_package_author(package_json) == \
{
'fullname': b'Shawn Walsh',
'name': b'Shawn Walsh',
'email': None
}
package_json = json.loads('''
{
"name": "yfe-ynpm",
"version": "1.0.0",
"homepage": "http://gitlab.ywwl.com/yfe/yfe-ynpm",
"repository": {
"type": "git",
"url": "git@gitlab.ywwl.com:yfe/yfe-ynpm.git"
},
"author": [
"fengmk2 <fengmk2@gmail.com> (https://fengmk2.com)",
"xufuzi <xufuzi@ywwl.com> (https://7993.org)"
],
"license": "MIT"
}''')
assert extract_npm_package_author(package_json) == \
{
'fullname': b'fengmk2 <fengmk2@gmail.com>',
'name': b'fengmk2',
'email': b'fengmk2@gmail.com'
}
package_json = json.loads('''
{
"name": "umi-plugin-whale",
"version": "0.0.8",
"description": "Internal contract component",
"authors": {
"name": "xiaohuoni",
"email": "448627663@qq.com"
},
"repository": "alitajs/whale",
"devDependencies": {
"np": "^3.0.4",
"umi-tools": "*"
},
"license": "MIT"
}''')
assert extract_npm_package_author(package_json) == \
{
'fullname': b'xiaohuoni <448627663@qq.com>',
'name': b'xiaohuoni',
'email': b'448627663@qq.com'
}
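These assertions imply how the various npm author shapes (a plain string, a list of strings, a list of dicts, a single dict) are normalized. A hypothetical re-implementation consistent with them, reusing the parser tested above:

def extract_npm_package_author_sketch(package_json):
    # Normalize 'author'/'authors' to one parsed author; lists keep
    # only their first entry, strings go through the string parser.
    author_data = package_json.get('author', package_json.get('authors'))
    if isinstance(author_data, list):
        author_data = author_data[0]
    if isinstance(author_data, str):
        author_data = parse_npm_package_author(author_data)
    name = author_data.get('name')
    email = author_data.get('email')
    if name and email:
        fullname = '%s <%s>' % (name, email)
    else:
        fullname = name or email or ''
    return {
        'fullname': fullname.encode('utf-8'),
        'name': name.encode('utf-8') if name else None,
        'email': email.encode('utf-8') if email else None,
    }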
def normalize_hashes(hashes):
if isinstance(hashes, str):
return hash_to_bytes(hashes)
if isinstance(hashes, list):
return [hash_to_bytes(x) for x in hashes]
return {hash_to_bytes(k): hash_to_bytes(v) for k, v in hashes.items()}
_expected_new_contents_first_visit = normalize_hashes([
'4ce3058e16ab3d7e077f65aabf855c34895bf17c',
'858c3ceee84c8311adc808f8cdb30d233ddc9d18',
'0fa33b4f5a4e0496da6843a38ff1af8b61541996',
'85a410f8ef8eb8920f2c384a9555566ad4a2e21b',
'9163ac8025923d5a45aaac482262893955c9b37b',
'692cf623b8dd2c5df2c2998fd95ae4ec99882fb4',
'18c03aac6d3e910efb20039c15d70ab5e0297101',
'41265c42446aac17ca769e67d1704f99e5a1394d',
'783ff33f5882813dca9239452c4a7cadd4dba778',
'b029cfb85107aee4590c2434a3329bfcf36f8fa1',
'112d1900b4c2e3e9351050d1b542c9744f9793f3',
'5439bbc4bd9a996f1a38244e6892b71850bc98fd',
'd83097a2f994b503185adf4e719d154123150159',
'd0939b4898e83090ee55fd9d8a60e312cfadfbaf',
'b3523a26f7147e4af40d9d462adaae6d49eda13e',
'cd065fb435d6fb204a8871bcd623d0d0e673088c',
'2854a40855ad839a54f4b08f5cff0cf52fca4399',
'b8a53bbaac34ebb8c6169d11a4b9f13b05c583fe',
'0f73d56e1cf480bded8a1ecf20ec6fc53c574713',
'0d9882b2dfafdce31f4e77fe307d41a44a74cefe',
'585fc5caab9ead178a327d3660d35851db713df1',
'e8cd41a48d79101977e3036a87aeb1aac730686f',
'5414efaef33cceb9f3c9eb5c4cc1682cd62d14f7',
'9c3cc2763bf9e9e37067d3607302c4776502df98',
'3649a68410e354c83cd4a38b66bd314de4c8f5c9',
'e96ed0c091de1ebdf587104eaf63400d1974a1fe',
'078ca03d2f99e4e6eab16f7b75fbb7afb699c86c',
'38de737da99514de6559ff163c988198bc91367a',
])
_expected_new_directories_first_visit = normalize_hashes([
'3370d20d6f96dc1c9e50f083e2134881db110f4f',
'42753c0c2ab00c4501b552ac4671c68f3cf5aece',
'd7895533ef5edbcffdea3f057d9fef3a1ef845ce',
'80579be563e2ef3e385226fe7a3f079b377f142c',
'3b0ddc6a9e58b4b53c222da4e27b280b6cda591c',
'bcad03ce58ac136f26f000990fc9064e559fe1c0',
'5fc7e82a1bc72e074665c6078c6d3fad2f13d7ca',
'e3cd26beba9b1e02f6762ef54bd9ac80cc5f25fd',
'584b5b4b6cf7f038095e820b99386a9c232de931',
'184c8d6d0d242f2b1792ef9d3bf396a5434b7f7a',
'bb5f4ee143c970367eb409f2e4c1104898048b9d',
'1b95491047add1103db0dfdfa84a9735dcb11e88',
'a00c6de13471a2d66e64aca140ddb21ef5521e62',
'5ce6c1cd5cda2d546db513aaad8c72a44c7771e2',
'c337091e349b6ac10d38a49cdf8c2401ef9bb0f2',
'202fafcd7c0f8230e89d5496ad7f44ab12b807bf',
'775cc516543be86c15c1dc172f49c0d4e6e78235',
'ff3d1ead85a14f891e8b3fa3a89de39db1b8de2e',
])
_expected_new_revisions_first_visit = normalize_hashes({
'd8a1c7474d2956ac598a19f0f27d52f7015f117e':
'42753c0c2ab00c4501b552ac4671c68f3cf5aece',
'5f9eb78af37ffd12949f235e86fac04898f9f72a':
'3370d20d6f96dc1c9e50f083e2134881db110f4f',
'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a':
'd7895533ef5edbcffdea3f057d9fef3a1ef845ce'}
)
_expected_new_snapshot_first_visit_id = normalize_hashes(
'd0587e1195aed5a8800411a008f2f2d627f18e2d')
_expected_branches_first_visit = {
'HEAD': {
'target': 'releases/0.0.4',
'target_type': 'alias'
},
'releases/0.0.2': {
'target': 'd8a1c7474d2956ac598a19f0f27d52f7015f117e',
'target_type': 'revision'
},
'releases/0.0.3': {
'target': '5f9eb78af37ffd12949f235e86fac04898f9f72a',
'target_type': 'revision'
},
'releases/0.0.4': {
'target': 'ba019b192bdb94bd0b5bd68b3a5f92b5acc2239a',
'target_type': 'revision'
}
}
def package_url(package):
return 'https://www.npmjs.com/package/%s' % package
def package_metadata_url(package):
return 'https://replicate.npmjs.com/%s/' % package
def test_revision_metadata_structure(swh_config, requests_mock_datadir):
package = 'org'
loader = NpmLoader(package,
package_url(package),
package_metadata_url(package))
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
expected_revision_id = hash_to_bytes(
'd8a1c7474d2956ac598a19f0f27d52f7015f117e')
revision = list(loader.storage.revision_get([expected_revision_id]))[0]
assert revision is not None
check_metadata_paths(revision['metadata'], paths=[
('intrinsic.tool', str),
('intrinsic.raw', dict),
('extrinsic.provider', str),
('extrinsic.when', str),
('extrinsic.raw', dict),
- ('original_artifact.filename', str),
- ('original_artifact.length', int),
- ('original_artifact.checksums', dict),
+ ('original_artifact', list),
])
+ for original_artifact in revision['metadata']['original_artifact']:
+ check_metadata_paths(original_artifact, paths=[
+ ('filename', str),
+ ('length', int),
+ ('checksums', dict),
+ ])
+
def test_npm_loader_first_visit(swh_config, requests_mock_datadir):
package = 'org'
loader = NpmLoader(package,
package_url(package),
package_metadata_url(package))
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': len(_expected_new_contents_first_visit),
'directory': len(_expected_new_directories_first_visit),
'origin': 1,
'origin_visit': 1,
'person': 2,
'release': 0,
'revision': len(_expected_new_revisions_first_visit),
'skipped_content': 0,
'snapshot': 1,
} == stats
assert len(list(loader.storage.content_get(
_expected_new_contents_first_visit))) == len(
_expected_new_contents_first_visit)
assert list(loader.storage.directory_missing(
_expected_new_directories_first_visit)) == []
assert list(loader.storage.revision_missing(
_expected_new_revisions_first_visit)) == []
expected_snapshot = {
'id': _expected_new_snapshot_first_visit_id,
'branches': _expected_branches_first_visit,
}
check_snapshot(expected_snapshot, loader.storage)
def test_npm_loader_incremental_visit(
swh_config, requests_mock_datadir_visits):
package = 'org'
url = package_url(package)
metadata_url = package_metadata_url(package)
loader = NpmLoader(package, url, metadata_url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit['status'] == 'full'
stats = loader.storage.stat_counters()
assert {
'content': len(_expected_new_contents_first_visit),
'directory': len(_expected_new_directories_first_visit),
'origin': 1,
'origin_visit': 1,
'person': 2,
'release': 0,
'revision': len(_expected_new_revisions_first_visit),
'skipped_content': 0,
'snapshot': 1,
} == stats
loader._info = None # reset loader internal state
actual_load_status2 = loader.load()
assert actual_load_status2['status'] == 'eventful'
origin_visit2 = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit2['status'] == 'full'
stats = loader.storage.stat_counters()
assert { # 3 new release artifacts
'content': len(_expected_new_contents_first_visit) + 14,
'directory': len(_expected_new_directories_first_visit) + 15,
'origin': 1,
'origin_visit': 2,
'person': 2,
'release': 0,
'revision': len(_expected_new_revisions_first_visit) + 3,
'skipped_content': 0,
'snapshot': 2,
} == stats
urls = [
m.url for m in requests_mock_datadir_visits.request_history
if m.url.startswith('https://registry.npmjs.org')
]
assert len(urls) == len(set(urls)) # we visited each artifact once across the 2 visits
diff --git a/swh/loader/package/tests/test_pypi.py b/swh/loader/package/tests/test_pypi.py
index aba1814..74b3a70 100644
--- a/swh/loader/package/tests/test_pypi.py
+++ b/swh/loader/package/tests/test_pypi.py
@@ -1,654 +1,659 @@
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from os import path
import pytest
from unittest.mock import patch
from swh.core.tarball import uncompress
from swh.core.pytest_plugin import requests_mock_datadir_factory
from swh.model.hashutil import hash_to_bytes
from swh.loader.package.pypi import (
PyPILoader, pypi_api_url, author, extract_intrinsic_metadata
)
from swh.loader.package.tests.common import (
check_snapshot, check_metadata_paths
)
def test_author_basic():
data = {
'author': "i-am-groot",
'author_email': 'iam@groot.org',
}
actual_author = author(data)
expected_author = {
'fullname': b'i-am-groot <iam@groot.org>',
'name': b'i-am-groot',
'email': b'iam@groot.org',
}
assert actual_author == expected_author
def test_author_empty_email():
data = {
'author': 'i-am-groot',
'author_email': '',
}
actual_author = author(data)
expected_author = {
'fullname': b'i-am-groot',
'name': b'i-am-groot',
'email': b'',
}
assert actual_author == expected_author
def test_author_empty_name():
data = {
'author': "",
'author_email': 'iam@groot.org',
}
actual_author = author(data)
expected_author = {
'fullname': b' <iam@groot.org>',
'name': b'',
'email': b'iam@groot.org',
}
assert actual_author == expected_author
def test_author_malformed():
data = {
'author': "['pierre', 'paul', 'jacques']",
'author_email': None,
}
actual_author = author(data)
expected_author = {
'fullname': b"['pierre', 'paul', 'jacques']",
'name': b"['pierre', 'paul', 'jacques']",
'email': None,
}
assert actual_author == expected_author
def test_author_malformed_2():
data = {
'author': '[marie, jeanne]',
'author_email': '[marie@some, jeanne@thing]',
}
actual_author = author(data)
expected_author = {
'fullname': b'[marie, jeanne] <[marie@some, jeanne@thing]>',
'name': b'[marie, jeanne]',
'email': b'[marie@some, jeanne@thing]',
}
assert actual_author == expected_author
def test_author_malformed_3():
data = {
'author': '[marie, jeanne, pierre]',
'author_email': '[marie@somewhere.org, jeanne@somewhere.org]',
}
actual_author = author(data)
expected_author = {
'fullname': b'[marie, jeanne, pierre] <[marie@somewhere.org, jeanne@somewhere.org]>', # noqa
'name': b'[marie, jeanne, pierre]',
'email': b'[marie@somewhere.org, jeanne@somewhere.org]',
}
assert actual_author == expected_author
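These author tests fully determine the composition rule; a hypothetical re-implementation consistent with all of them:

def author_sketch(data):
    # Assumed behavior: fullname is 'name <email>' when an email is
    # present, otherwise just the name; str fields become utf-8 bytes.
    name = data.get('author') or ''
    email = data.get('author_email')
    fullname = '%s <%s>' % (name, email) if email else name
    return {
        'fullname': fullname.encode('utf-8'),
        'name': name.encode('utf-8'),
        'email': email.encode('utf-8') if email is not None else None,
    }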
# configuration error #
def test_badly_configured_loader_raise(monkeypatch):
"""Badly configured loader should raise"""
monkeypatch.delenv('SWH_CONFIG_FILENAME', raising=False)
with pytest.raises(ValueError) as e:
PyPILoader(url='some-url')
assert 'Misconfiguration' in e.value.args[0]
def test_pypi_api_url():
"""Compute pypi api url from the pypi project url should be ok"""
url = pypi_api_url('https://pypi.org/project/requests')
assert url == 'https://pypi.org/pypi/requests/json'
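The transformation asserted here is purely lexical; a hypothetical helper with the same effect on this input:

def pypi_api_url_sketch(project_url):
    # Assumed: .../project/<name> maps to https://pypi.org/pypi/<name>/json
    project_name = project_url.rstrip('/').split('/')[-1]
    return 'https://pypi.org/pypi/%s/json' % project_name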
@pytest.mark.fs
def test_extract_intrinsic_metadata(tmp_path, datadir):
"""Parsing existing archive's PKG-INFO should yield results"""
uncompressed_archive_path = str(tmp_path)
archive_path = path.join(
datadir, 'https_files.pythonhosted.org', '0805nexter-1.1.0.zip')
uncompress(archive_path, dest=uncompressed_archive_path)
actual_metadata = extract_intrinsic_metadata(uncompressed_archive_path)
expected_metadata = {
'metadata_version': '1.0',
'name': '0805nexter',
'version': '1.1.0',
'summary': 'a simple printer of nested lest',
'home_page': 'http://www.hp.com',
'author': 'hgtkpython',
'author_email': '2868989685@qq.com',
'platforms': ['UNKNOWN'],
}
assert actual_metadata == expected_metadata
@pytest.mark.fs
def test_extract_intrinsic_metadata_failures(tmp_path):
"""Parsing inexistant path/archive/PKG-INFO yield None"""
tmp_path = str(tmp_path) # py3.5 work around (PosixPath issue)
# inexistant first level path
assert extract_intrinsic_metadata('/something-inexistant') == {}
# inexistant second level path (as expected by pypi archives)
assert extract_intrinsic_metadata(tmp_path) == {}
# inexistant PKG-INFO within second level path
existing_path_no_pkginfo = path.join(tmp_path, 'something')
os.mkdir(existing_path_no_pkginfo)
assert extract_intrinsic_metadata(tmp_path) == {}
# LOADER SCENARIO #
# "edge" cases (for the same origin) #
# no release artifact:
# {visit full, status: uneventful, no contents, etc...}
requests_mock_datadir_missing_all = requests_mock_datadir_factory(ignore_urls=[
'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa
'https://files.pythonhosted.org/packages/c4/a0/4562cda161dc4ecbbe9e2a11eb365400c0461845c5be70d73869786809c4/0805nexter-1.2.0.zip', # noqa
])
def test_no_release_artifact(swh_config, requests_mock_datadir_missing_all):
"""Load a pypi project with all artifacts missing ends up with no snapshot
"""
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'uneventful'
stats = loader.storage.stat_counters()
assert {
'content': 0,
'directory': 0,
'origin': 1,
'origin_visit': 1,
'person': 0,
'release': 0,
'revision': 0,
'skipped_content': 0,
'snapshot': 1,
} == stats
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
# problem during loading:
# {visit: partial, status: failed, no snapshot}
def test_release_with_traceback(swh_config):
url = 'https://pypi.org/project/0805nexter'
with patch('swh.loader.package.pypi.PyPILoader.get_default_release',
side_effect=ValueError('Problem')):
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'failed'
stats = loader.storage.stat_counters()
assert {
'content': 0,
'directory': 0,
'origin': 1,
'origin_visit': 1,
'person': 0,
'release': 0,
'revision': 0,
'skipped_content': 0,
'snapshot': 0,
} == stats
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
# problem during loading: failure early enough that only some swh objects
# (contents, directories, etc.) have been written to storage
# {visit: partial, status: eventful, no snapshot}
# problem during loading: failure late enough we can have snapshots (some
# revisions are written in storage already)
# {visit: partial, status: eventful, snapshot}
# "normal" cases (for the same origin) #
requests_mock_datadir_missing_one = requests_mock_datadir_factory(ignore_urls=[
'https://files.pythonhosted.org/packages/ec/65/c0116953c9a3f47de89e71964d6c7b0c783b01f29fa3390584dbf3046b4d/0805nexter-1.1.0.zip', # noqa
])
# some missing release artifacts:
# {visit partial, status: eventful, 1 snapshot}
def test_revision_metadata_structure(swh_config, requests_mock_datadir):
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
expected_revision_id = hash_to_bytes(
'e445da4da22b31bfebb6ffc4383dbf839a074d21')
revision = list(loader.storage.revision_get([expected_revision_id]))[0]
assert revision is not None
check_metadata_paths(revision['metadata'], paths=[
('intrinsic.tool', str),
('intrinsic.raw', dict),
('extrinsic.provider', str),
('extrinsic.when', str),
('extrinsic.raw', dict),
- ('original_artifact.filename', str),
- ('original_artifact.length', int),
- ('original_artifact.checksums', dict),
+ ('original_artifact', list),
])
+ for original_artifact in revision['metadata']['original_artifact']:
+ check_metadata_paths(original_artifact, paths=[
+ ('filename', str),
+ ('length', int),
+ ('checksums', dict),
+ ])
+
def test_visit_with_missing_artifact(
swh_config, requests_mock_datadir_missing_one):
"""Load a pypi project with some missing artifacts ends up with 1 snapshot
"""
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': 3,
'directory': 2,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 1,
'skipped_content': 0,
'snapshot': 1
} == stats
expected_contents = map(hash_to_bytes, [
'405859113963cb7a797642b45f171d6360425d16',
'e5686aa568fdb1d19d7f1329267082fe40482d31',
'83ecf6ec1114fd260ca7a833a2d165e71258c338',
])
assert list(loader.storage.content_missing_per_sha1(expected_contents))\
== []
expected_dirs = map(hash_to_bytes, [
'b178b66bd22383d5f16f4f5c923d39ca798861b4',
'c3a58f8b57433a4b56caaa5033ae2e0931405338',
])
assert list(loader.storage.directory_missing(expected_dirs)) == []
# {revision hash: directory hash}
expected_revs = {
hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa
}
assert list(loader.storage.revision_missing(expected_revs)) == []
expected_branches = {
'releases/1.2.0': {
'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
'target_type': 'revision',
},
'HEAD': {
'target': 'releases/1.2.0',
'target_type': 'alias',
},
}
expected_snapshot = {
'id': 'dd0e4201a232b1c104433741dbf45895b8ac9355',
'branches': expected_branches,
}
check_snapshot(expected_snapshot, storage=loader.storage)
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'partial'
def test_visit_with_1_release_artifact(swh_config, requests_mock_datadir):
"""With no prior visit, load a pypi project ends up with 1 snapshot
"""
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': 6,
'directory': 4,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 2,
'skipped_content': 0,
'snapshot': 1
} == stats
expected_contents = map(hash_to_bytes, [
'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
'938c33483285fd8ad57f15497f538320df82aeb8',
'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
'405859113963cb7a797642b45f171d6360425d16',
'e5686aa568fdb1d19d7f1329267082fe40482d31',
'83ecf6ec1114fd260ca7a833a2d165e71258c338',
])
assert list(loader.storage.content_missing_per_sha1(expected_contents))\
== []
expected_dirs = map(hash_to_bytes, [
'05219ba38bc542d4345d5638af1ed56c7d43ca7d',
'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
'b178b66bd22383d5f16f4f5c923d39ca798861b4',
'c3a58f8b57433a4b56caaa5033ae2e0931405338',
])
assert list(loader.storage.directory_missing(expected_dirs)) == []
# {revision hash: directory hash}
expected_revs = {
hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa
hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa
}
assert list(loader.storage.revision_missing(expected_revs)) == []
expected_branches = {
'releases/1.1.0': {
'target': '4c99891f93b81450385777235a37b5e966dd1571',
'target_type': 'revision',
},
'releases/1.2.0': {
'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
'target_type': 'revision',
},
'HEAD': {
'target': 'releases/1.2.0',
'target_type': 'alias',
},
}
expected_snapshot = {
'id': 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a',
'branches': expected_branches,
}
check_snapshot(expected_snapshot, loader.storage)
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'full'
def test_multiple_visits_with_no_change(swh_config, requests_mock_datadir):
"""Multiple visits with no changes results in 1 same snapshot
"""
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
stats = loader.storage.stat_counters()
assert {
'content': 6,
'directory': 4,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 2,
'skipped_content': 0,
'snapshot': 1
} == stats
expected_branches = {
'releases/1.1.0': {
'target': '4c99891f93b81450385777235a37b5e966dd1571',
'target_type': 'revision',
},
'releases/1.2.0': {
'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
'target_type': 'revision',
},
'HEAD': {
'target': 'releases/1.2.0',
'target_type': 'alias',
},
}
snapshot_id = 'ba6e158ada75d0b3cfb209ffdf6daa4ed34a227a'
expected_snapshot = {
'id': snapshot_id,
'branches': expected_branches,
}
check_snapshot(expected_snapshot, loader.storage)
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'full'
actual_load_status2 = loader.load()
assert actual_load_status2['status'] == 'uneventful'
stats2 = loader.storage.stat_counters()
expected_stats2 = stats.copy()
expected_stats2['origin_visit'] = 1 + 1
assert expected_stats2 == stats2
# same snapshot
actual_snapshot_id = origin_visit['snapshot']['id']
assert actual_snapshot_id == hash_to_bytes(snapshot_id)
def test_incremental_visit(swh_config, requests_mock_datadir_visits):
"""With prior visit, 2nd load will result with a different snapshot
"""
url = 'https://pypi.org/project/0805nexter'
loader = PyPILoader(url)
visit1_actual_load_status = loader.load()
visit1_stats = loader.storage.stat_counters()
assert visit1_actual_load_status['status'] == 'eventful'
origin_visit1 = next(loader.storage.origin_visit_get(url))
assert origin_visit1['status'] == 'full'
assert {
'content': 6,
'directory': 4,
'origin': 1,
'origin_visit': 1,
'person': 1,
'release': 0,
'revision': 2,
'skipped_content': 0,
'snapshot': 1
} == visit1_stats
# Reset internal state
loader._info = None
visit2_actual_load_status = loader.load()
visit2_stats = loader.storage.stat_counters()
assert visit2_actual_load_status['status'] == 'eventful'
visits = list(loader.storage.origin_visit_get(url))
assert len(visits) == 2
assert visits[1]['status'] == 'full'
assert {
'content': 6 + 1, # 1 more content
'directory': 4 + 2, # 2 more directories
'origin': 1,
'origin_visit': 1 + 1,
'person': 1,
'release': 0,
'revision': 2 + 1, # 1 more revision
'skipped_content': 0,
'snapshot': 1 + 1, # 1 more snapshot
} == visit2_stats
expected_contents = map(hash_to_bytes, [
'a61e24cdfdab3bb7817f6be85d37a3e666b34566',
'938c33483285fd8ad57f15497f538320df82aeb8',
'a27576d60e08c94a05006d2e6d540c0fdb5f38c8',
'405859113963cb7a797642b45f171d6360425d16',
'e5686aa568fdb1d19d7f1329267082fe40482d31',
'83ecf6ec1114fd260ca7a833a2d165e71258c338',
'92689fa2b7fb4d4fc6fb195bf73a50c87c030639'
])
assert list(loader.storage.content_missing_per_sha1(expected_contents))\
== []
expected_dirs = map(hash_to_bytes, [
'05219ba38bc542d4345d5638af1ed56c7d43ca7d',
'cf019eb456cf6f78d8c4674596f1c9a97ece8f44',
'b178b66bd22383d5f16f4f5c923d39ca798861b4',
'c3a58f8b57433a4b56caaa5033ae2e0931405338',
'e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a',
'52604d46843b898f5a43208045d09fcf8731631b',
])
assert list(loader.storage.directory_missing(expected_dirs)) == []
# {revision hash: directory hash}
expected_revs = {
hash_to_bytes('4c99891f93b81450385777235a37b5e966dd1571'): hash_to_bytes('05219ba38bc542d4345d5638af1ed56c7d43ca7d'), # noqa
hash_to_bytes('e445da4da22b31bfebb6ffc4383dbf839a074d21'): hash_to_bytes('b178b66bd22383d5f16f4f5c923d39ca798861b4'), # noqa
hash_to_bytes('51247143b01445c9348afa9edfae31bf7c5d86b1'): hash_to_bytes('e226e7e4ad03b4fc1403d69a18ebdd6f2edd2b3a'), # noqa
}
assert list(loader.storage.revision_missing(expected_revs)) == []
expected_branches = {
'releases/1.1.0': {
'target': '4c99891f93b81450385777235a37b5e966dd1571',
'target_type': 'revision',
},
'releases/1.2.0': {
'target': 'e445da4da22b31bfebb6ffc4383dbf839a074d21',
'target_type': 'revision',
},
'releases/1.3.0': {
'target': '51247143b01445c9348afa9edfae31bf7c5d86b1',
'target_type': 'revision',
},
'HEAD': {
'target': 'releases/1.3.0',
'target_type': 'alias',
},
}
expected_snapshot = {
'id': '2e5149a7b0725d18231a37b342e9b7c4e121f283',
'branches': expected_branches,
}
check_snapshot(expected_snapshot, loader.storage)
origin_visit = list(loader.storage.origin_visit_get(url))[-1]
assert origin_visit['status'] == 'full'
urls = [
m.url for m in requests_mock_datadir_visits.request_history
if m.url.startswith('https://files.pythonhosted.org')
]
# visited each artifact once across 2 visits
assert len(urls) == len(set(urls))
# release artifact, no new artifact
# {visit full, status uneventful, same snapshot as before}
# release artifact, old artifact with different checksums
# {visit full, status eventful, new snapshot with shared history and some
# new divergent history}
# release with multiple sdist artifacts per pypi "version"
# snapshot branch output is different
def test_visit_1_release_with_2_artifacts(swh_config, requests_mock_datadir):
"""With no prior visit, load a pypi project ends up with 1 snapshot
"""
url = 'https://pypi.org/project/nexter'
loader = PyPILoader(url)
actual_load_status = loader.load()
assert actual_load_status['status'] == 'eventful'
expected_branches = {
'releases/1.1.0/nexter-1.1.0.zip': {
'target': '4c99891f93b81450385777235a37b5e966dd1571',
'target_type': 'revision',
},
'releases/1.1.0/nexter-1.1.0.tar.gz': {
'target': '0bf88f5760cca7665d0af4d6575d9301134fe11a',
'target_type': 'revision',
},
}
expected_snapshot = {
'id': 'a27e638a4dad6fbfa273c6ebec1c4bf320fb84c6',
'branches': expected_branches,
}
check_snapshot(expected_snapshot, loader.storage)
origin_visit = next(loader.storage.origin_visit_get(url))
assert origin_visit['status'] == 'full'