diff --git a/swh/loader/package/debian.py b/swh/loader/package/debian.py
index c600040..94d4776 100644
--- a/swh/loader/package/debian.py
+++ b/swh/loader/package/debian.py
@@ -1,357 +1,367 @@
 # Copyright (C) 2017-2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
 import copy
 import datetime
 import email.utils
 import logging
 import re
 import subprocess
 
 from dateutil.parser import parse as parse_date
 from debian.changelog import Changelog
 from debian.deb822 import Dsc
+from os import path
 from typing import Any, Dict, Generator, Mapping, Optional, Sequence, Tuple
 
 from swh.model import hashutil
 
 from swh.loader.package.loader import PackageLoader
 from swh.loader.package.utils import download
 
 
 logger = logging.getLogger(__name__)
 
 UPLOADERS_SPLIT = re.compile(r'(?<=\>)\s*,\s*')
 
 
 def uid_to_person(uid, encode=True):
     """Convert an uid to a person suitable for insertion.
 
     Args:
         uid: an uid of the form "Name <email@address>"
         encode: whether to convert the output to bytes or not
 
     Returns:
         dict: a dictionary with the following keys:
 
         - name: the name associated to the uid
         - email: the mail associated to the uid
 
     """
     ret = {
         'name': '',
         'email': '',
         'fullname': uid,
     }
 
     name, mail = email.utils.parseaddr(uid)
     if name and email:
         ret['name'] = name
         ret['email'] = mail
     else:
         ret['name'] = uid
 
     if encode:
         for key in list(ret):
             ret[key] = ret[key].encode('utf-8')
 
     return ret
 
 
 def download_package(package: Dict, tmpdir: Any) -> Mapping[str, Dict]:
     """Fetch a source package in a temporary directory and check the checksums
     for all files.
 
     Args:
         package: Dict defining the set of files representing a debian package
         tmpdir: Where to download and extract the files to ingest
 
     Returns:
         Dict of swh hashes per filename key
 
     """
     all_hashes = {}
     for filename, fileinfo in package['files'].items():
         uri = fileinfo['uri']
         logger.debug('fileinfo: %s', fileinfo)
         extrinsic_hashes = {'sha256': fileinfo['sha256']}
         logger.debug('extrinsic_hashes(%s): %s', filename, extrinsic_hashes)
         filepath, hashes = download(uri, dest=tmpdir, filename=filename,
                                     hashes=extrinsic_hashes)
         all_hashes[filename] = hashes
 
     logger.debug('all_hashes: %s', all_hashes)
     return all_hashes
 
 
-def extract_package(package: Dict, tmpdir: str) -> Tuple[str, str, str]:
-    """Extract a Debian source package to a given directory.
-
-    Note that after extraction the target directory will be the root of the
-    extracted package, rather than containing it.
+def dsc_information(package: Dict) -> Tuple[str, str]:
+    """Retrieve dsc information from a package.
 
     Args:
-        package (dict): package information dictionary
-        tmpdir (str): directory where the package files are stored
+        package: Package metadata information
 
     Returns:
-        tuple: path to the dsc, uri used to retrieve the dsc, extraction
-        directory
+        Tuple of dsc file's uri, dsc's full disk path
 
     """
     dsc_name = None
     dsc_url = None
-
     for filename, fileinfo in package['files'].items():
         if filename.endswith('.dsc'):
             if dsc_name:
                 raise ValueError(
                     'Package %s_%s references several dsc files' %
                     (package['name'], package['version'])
                 )
             dsc_url = fileinfo['uri']
             dsc_name = filename
 
+    return dsc_url, dsc_name
+
+
+def extract_package(package: Dict, tmpdir: str) -> str:
+    """Extract a Debian source package to a given directory.
+
+    Note that after extraction the target directory will be the root of the
+    extracted package, rather than containing it.
+
+    Args:
+        package (dict): package information dictionary
+        tmpdir (str): directory where the package files are stored
+
+    Returns:
+        Package extraction directory
+
+    """
+    _, dsc_name = dsc_information(package)
     dsc_path = os.path.join(tmpdir, dsc_name)
     destdir = os.path.join(tmpdir, 'extracted')
     logfile = os.path.join(tmpdir, 'extract.log')
-
     logger.debug('extract Debian source package %s in %s' %
                  (dsc_path, destdir), extra={
                      'swh_type': 'deb_extract',
                      'swh_dsc': dsc_path,
                      'swh_destdir': destdir,
                  })
 
     cmd = ['dpkg-source',
            '--no-copy', '--no-check',
            '--ignore-bad-version',
            '-x', dsc_path,
            destdir]
 
     try:
         with open(logfile, 'w') as stdout:
             subprocess.check_call(cmd, stdout=stdout,
                                   stderr=subprocess.STDOUT)
     except subprocess.CalledProcessError as e:
         logdata = open(logfile, 'r').read()
         raise ValueError('dpkg-source exited with code %s: %s' %
                          (e.returncode, logdata)) from None
 
-    return dsc_path, dsc_url, destdir
+    return destdir
 
 
 def get_file_info(filepath):
     """Retrieve the original file information from the file at filepath.
 
     Args:
         filepath: the path to the original file
 
     Returns:
         dict: information about the original file, in a dictionary with the
         following keys
 
         - name: the file name
         - sha1, sha1_git, sha256: original file hashes
         - length: original file length
 
     """
     name = os.path.basename(filepath)
     if isinstance(name, bytes):
         name = name.decode('utf-8')
 
     hashes = hashutil.MultiHash.from_path(filepath).hexdigest()
     hashes['name'] = name
     hashes['length'] = os.path.getsize(filepath)
     return hashes
 
 
 def get_package_metadata(package, dsc_path, extracted_path):
     """Get the package metadata from the source package at dsc_path,
     extracted in extracted_path.
 
     Args:
         package: the package dict (with a dsc_path key)
         dsc_path: path to the package's dsc file
         extracted_path: the path where the package got extracted
 
     Returns:
         dict: a dictionary with the following keys:
 
         - history: list of (package_name, package_version) tuples parsed from
           the package changelog
         - source_files: information about all the files in the source package
 
     """
     ret = {}
 
     with open(dsc_path, 'rb') as dsc:
         parsed_dsc = Dsc(dsc)
 
     source_files = [get_file_info(dsc_path)]
 
     dsc_dir = os.path.dirname(dsc_path)
     for filename in package['files']:
         file_path = os.path.join(dsc_dir, filename)
         file_info = get_file_info(file_path)
         source_files.append(file_info)
 
     ret['original_artifact'] = source_files
 
     # Parse the changelog to retrieve the rest of the package information
     changelog_path = os.path.join(extracted_path, 'debian/changelog')
     with open(changelog_path, 'rb') as changelog:
         try:
             parsed_changelog = Changelog(changelog)
         except UnicodeDecodeError:
             logger.warning('Unknown encoding for changelog %s,'
                            ' falling back to iso' %
                            changelog_path.decode('utf-8'), extra={
                                'swh_type': 'deb_changelog_encoding',
                                'swh_name': package['name'],
                                'swh_version': str(package['version']),
                                'swh_changelog': changelog_path.decode('utf-8'),
                            })
 
             # need to reset as Changelog scrolls to the end of the file
             changelog.seek(0)
             parsed_changelog = Changelog(changelog, encoding='iso-8859-15')
 
     package_info = {
         'name': package['name'],
         'version': str(package['version']),
         'changelog': {
             'person': uid_to_person(parsed_changelog.author),
             'date': parse_date(parsed_changelog.date),
             'history': [(block.package, str(block.version))
                         for block in parsed_changelog][1:],
         }
     }
 
     maintainers = [
         uid_to_person(parsed_dsc['Maintainer'], encode=False),
     ]
     maintainers.extend(
         uid_to_person(person, encode=False)
         for person in UPLOADERS_SPLIT.split(parsed_dsc.get('Uploaders', ''))
     )
     package_info['maintainers'] = maintainers
 
     ret['package_info'] = package_info
 
     return ret
 
 
 class DebianLoader(PackageLoader):
     """Load debian origins into swh archive.
 
     """
     visit_type = 'debian'
 
     def __init__(self, url: str, date: str, packages: Mapping[str, Dict]):
         super().__init__(url=url)
         self._info = None
         self.packages = packages
-        self.dsc_path = None
-        self.dsc_url = None
 
     def get_versions(self) -> Sequence[str]:
         """Returns the keys of the packages input (e.g.
            stretch/contrib/0.7.2-3, etc...)
 
         """
         return self.packages.keys()
 
     def get_default_release(self) -> str:
         """Take the first version as default release
 
         """
         return list(self.packages.keys())[0]
 
     def get_artifacts(self, version: str) -> Generator[
             Tuple[Mapping[str, Any], Dict], None, None]:
         a_metadata = self.packages[version]
         artifacts_package_info = a_metadata.copy()
         artifacts_package_info['filename'] = version
         yield artifacts_package_info, a_metadata
 
     def resolve_revision_from(
             self, known_artifacts: Dict, artifact_metadata: Dict) \
             -> Optional[bytes]:
         pass  # for now
 
     def download_package(self, a_p_info: str,
                          tmpdir: str) -> Tuple[str, Dict]:
         """Contrary to other package loaders (1 package, 1 artifact),
         `a_metadata` represents the package's datafiles set to fetch:
         - .orig.tar.gz
         - .dsc
         - .diff.gz
 
         This is delegated to the `download_package` function.
 
         """
         logger.debug('debian: artifactS_package_info: %s', a_p_info)
         a_c_metadata = download_package(a_p_info, tmpdir)
         return tmpdir, a_c_metadata
 
     def uncompress(self, a_path: str, tmpdir: str, a_metadata: Dict) -> str:
-        self.dsc_path, self.dsc_url, a_uncompressed_path = extract_package(
-            a_metadata, tmpdir)
+        a_uncompressed_path = extract_package(a_metadata, tmpdir)
         return a_uncompressed_path
 
     def read_intrinsic_metadata(self, a_metadata: Dict,
                                 a_uncompressed_path: str) -> Dict:
-        dsc_path = self.dsc_path  # XXX
+        _, dsc_name = dsc_information(a_metadata)
+        dsc_path = path.join(path.dirname(a_uncompressed_path), dsc_name)
         i_metadata = get_package_metadata(
             a_metadata, dsc_path, a_uncompressed_path)
         return i_metadata
 
     def build_revision(
             self, a_metadata: Dict, i_metadata: Dict) -> Dict:
-
+        dsc_url, _ = dsc_information(a_metadata)
         logger.debug('i_metadata: %s', i_metadata)
         logger.debug('a_metadata: %s', a_metadata)
 
         def prepare(obj):
             if isinstance(obj, list):
                 return [prepare(item) for item in obj]
             elif isinstance(obj, dict):
                 return {k: prepare(v) for k, v in obj.items()}
             elif isinstance(obj, datetime.datetime):
                 return obj.isoformat()
             elif isinstance(obj, bytes):
                 return obj.decode('utf-8')
             else:
                 return copy.deepcopy(obj)
 
         package_info = i_metadata['package_info']
 
         msg = 'Synthetic revision for Debian source package %s version %s' % (
             a_metadata['name'], a_metadata['version'])
 
         date = package_info['changelog']['date']
         author = package_info['changelog']['person']
 
         # inspired from swh.loader.debian.converters.package_metadata_to_revision  # noqa
         return {
             'type': 'dsc',
             'message': msg.encode('utf-8'),
             'author': author,
             'date': date,
             'committer': author,
             'committer_date': date,
             'parents': [],
             'metadata': {
                 'intrinsic': {
                     'tool': 'dsc',
                     'raw': prepare(package_info),
                 },
                 'extrinsic': {
-                    'provider': self.dsc_url,
+                    'provider': dsc_url,
                     'when': self.visit_date.isoformat(),
                     'raw': a_metadata,
                 },
             }
         }
diff --git a/swh/loader/package/tests/test_debian.py b/swh/loader/package/tests/test_debian.py
index bd17a30..7894bae 100644
--- a/swh/loader/package/tests/test_debian.py
+++ b/swh/loader/package/tests/test_debian.py
@@ -1,137 +1,173 @@
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
+import copy
+import pytest
+
 from os import path
 
 from swh.loader.package.debian import (
-    DebianLoader, get_file_info, download_package
+    DebianLoader, get_file_info, download_package, dsc_information
 )
 
 from swh.loader.package.tests.common import check_snapshot
 
 
 PACKAGE_FILES = {
     'files': {
         'cicero_0.7.2-3.diff.gz': {
             'md5sum': 'a93661b6a48db48d59ba7d26796fc9ce',
             'name': 'cicero_0.7.2-3.diff.gz',
             'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c',  # noqa
             'size': 3964,
             'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2-3.diff.gz'  # noqa
         },
         'cicero_0.7.2-3.dsc': {
             'md5sum': 'd5dac83eb9cfc9bb52a15eb618b4670a',
             'name': 'cicero_0.7.2-3.dsc',
             'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03',  # noqa
             'size': 1864,
             'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2-3.dsc'},  # noqa
         'cicero_0.7.2.orig.tar.gz': {
             'md5sum': '4353dede07c5728319ba7f5595a7230a',
             'name': 'cicero_0.7.2.orig.tar.gz',
             'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786',  # noqa
             'size': 96527,
             'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz'  # noqa
         }
     },
     'id': 23,
     'name': 'cicero',
     'revision_id': None,
     'version': '0.7.2-3'
 }
 
 PACKAGE_PER_VERSION = {
     'stretch/contrib/0.7.2-3': PACKAGE_FILES
 }
 
 
 def test_get_file_info(datadir):
     filepath = path.join(datadir, 'deb.debian.org', 'onefile.txt')
 
     actual_info = get_file_info(filepath)
 
     expected_info = {
         'name': 'onefile.txt',
         'length': 62,
         'sha1': '135572f4ac013f49e624612301f9076af1eacef2',
         'sha1_git': '1d62cd247ef251d52d98bbd931d44ad1f967ea99',
         'sha256': '40f1a3cbe9355879319759bae1a6ba09cbf34056e79e951cd2dc0adbff169b9f',  # noqa
         'blake2s256': '4072cf9a0017ad7705a9995bbfbbc098276e6a3afea8d84ab54bff6381c897ab',  # noqa
     }
 
     assert actual_info == expected_info
 
 
 def test_download_package(datadir, tmpdir, requests_mock_http_datadir):
     tmpdir = str(tmpdir)  # py3.5 work around (LocalPath issue)
     all_hashes = download_package(PACKAGE_FILES, tmpdir)
     assert all_hashes == {
         'cicero_0.7.2-3.diff.gz': {
             'checksums': {
                 'blake2s256': '08b1c438e70d2474bab843d826515147fa4a817f8c4baaf3ddfbeb5132183f21',  # noqa
                 'sha1': '0815282053f21601b0ec4adf7a8fe47eace3c0bc',
                 'sha1_git': '834ac91da3a9da8f23f47004bb456dd5bd16fe49',
                 'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c'  # noqa
             },
             'filename': 'cicero_0.7.2-3.diff.gz',
             'length': 3964},
         'cicero_0.7.2-3.dsc': {
             'checksums': {
                 'blake2s256': '8c002bead3e35818eaa9d00826f3d141345707c58fb073beaa8abecf4bde45d2',  # noqa
                 'sha1': 'abbec4e8efbbc80278236e1dd136831eac08accd',
                 'sha1_git': '1f94b2086fa1142c2df6b94092f5c5fa11093a8e',
                 'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03'  # noqa
             },
             'filename': 'cicero_0.7.2-3.dsc',
             'length': 1864},
         'cicero_0.7.2.orig.tar.gz': {
             'checksums': {
                 'blake2s256': '9809aa8d2e2dad7f34cef72883db42b0456ab7c8f1418a636eebd30ab71a15a6',  # noqa
                 'sha1': 'a286efd63fe2c9c9f7bb30255c3d6fcdcf390b43',
                 'sha1_git': 'aa0a38978dce86d531b5b0299b4a616b95c64c74',
                 'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786'  # noqa
             },
             'filename': 'cicero_0.7.2.orig.tar.gz',
             'length': 96527}}
 
 
+def test_dsc_information_ok():
+    fname = 'cicero_0.7.2-3.dsc'
+    dsc_url, dsc_name = dsc_information(PACKAGE_FILES)
+
+    assert dsc_url == PACKAGE_FILES['files'][fname]['uri']
+    assert dsc_name == PACKAGE_FILES['files'][fname]['name']
+
+
+def test_dsc_information_not_found():
+    fname = 'cicero_0.7.2-3.dsc'
+    package_files = copy.deepcopy(PACKAGE_FILES)
+    package_files['files'].pop(fname)
+
+    dsc_url, dsc_name = dsc_information(package_files)
+
+    assert dsc_url is None
+    assert dsc_name is None
+
+
+def test_dsc_information_too_many_dsc_entries():
+    # craft an extra dsc file
+    fname = 'cicero_0.7.2-3.dsc'
+    package_files = copy.deepcopy(PACKAGE_FILES)
+    data = package_files['files'][fname]
+    fname2 = fname.replace('cicero', 'ciceroo')
+    package_files['files'][fname2] = data
+
+    with pytest.raises(
+            ValueError, match='Package %s_%s references several dsc' % (
+                package_files['name'], package_files['version'])):
+        dsc_information(package_files)
+
+
 def test_debian_first_visit(
         swh_config, requests_mock_http_datadir):
     """With no prior visit, load a gnu project ends up with 1 snapshot
 
     """
     loader = DebianLoader(
         url='deb://Debian/packages/cicero',
         date='2019-10-12T05:58:09.165557+00:00',
         packages=PACKAGE_PER_VERSION)
 
     actual_load_status = loader.load()
 
     assert actual_load_status['status'] == 'eventful'
 
     stats = loader.storage.stat_counters()
     assert {
         'content': 42,
         'directory': 2,
         'origin': 1,
         'origin_visit': 1,
         'person': 1,
         'release': 0,
         'revision': 1,  # all artifacts under 1 revision
         'skipped_content': 0,
         'snapshot': 1
     } == stats
 
     expected_snapshot = {
         'id': 'a59ec49a01ff329dcbbc63fd36a5654143aef240',
         'branches': {
             'HEAD': {
                 'target_type': 'alias',
                 'target': 'releases/stretch/contrib/0.7.2-3'
             },
             'releases/stretch/contrib/0.7.2-3': {
                 'target_type': 'revision',
                 'target': '2807f5b3f84368b4889a9ae827fe85854ffecf07',
             }
         },
     }
 
     # different than the previous loader as no release is done
     check_snapshot(expected_snapshot, loader.storage)
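Review note (not part of the patch): a minimal sketch of how the refactored helpers compose now that the loader no longer keeps `self.dsc_path`/`self.dsc_url` state. Here `package` stands in for a PACKAGE_FILES-shaped metadata dict, `tmpdir` for a directory already populated by `download_package()`, and `intrinsic_metadata_inputs` is a hypothetical name used only for illustration.

```python
from os import path

from swh.loader.package.debian import dsc_information, extract_package


def intrinsic_metadata_inputs(package, tmpdir):
    """Recompute what read_intrinsic_metadata() needs from the package
    metadata alone, instead of reading it back from loader state."""
    # <tmpdir>/extracted, the root of the unpacked source package
    extracted_path = extract_package(package, tmpdir)
    # the single .dsc entry referenced by the package metadata
    dsc_url, dsc_name = dsc_information(package)
    # the .dsc itself sits next to the extraction directory, i.e. in tmpdir
    dsc_path = path.join(path.dirname(extracted_path), dsc_name)
    return dsc_url, dsc_path, extracted_path
```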