def uid_to_person(uid, encode=True):
    """Convert an uid to a person suitable for insertion.

    Args:
        uid: an uid of the form "Name <email@address>"
        encode: whether to convert the output values to utf-8 bytes or not

    Returns:
        dict: a dictionary with the following keys:

        - name: the name associated to the uid; the whole uid when it could
          not be split into both a name and a mail address
        - email: the mail associated to the uid ('' when the uid could not
          be split)
        - fullname: the uid, as provided
    """
    name, mail = email.utils.parseaddr(uid)

    # Test the parsed address `mail`, not the `email` module: the module
    # object is always truthy, which made the fallback branch unreachable
    # for uids carrying a name but no address.
    if name and mail:
        person = {'name': name, 'email': mail, 'fullname': uid}
    else:
        person = {'name': uid, 'email': '', 'fullname': uid}

    if encode:
        person = {key: value.encode('utf-8')
                  for key, value in person.items()}

    return person
def extract_package(package: Dict, tmpdir: str) -> Tuple[str, str, str]:
    """Extract a Debian source package to a given directory.

    Note that after extraction the target directory will be the root of the
    extracted package, rather than containing it.

    Args:
        package (dict): package information dictionary; its 'files' entry
            maps each filename to a dict holding (at least) the file 'uri'
        tmpdir (str): directory where the package files are stored

    Returns:
        tuple: path to the dsc, uri used to retrieve the dsc, extraction
        directory

    Raises:
        ValueError: if the package references no dsc file or several of
            them, or if dpkg-source exits with a non-zero code (the
            message then embeds the content of the extraction log)

    """
    dsc_name = None
    dsc_url = None

    for filename, fileinfo in package['files'].items():
        if not filename.endswith('.dsc'):
            continue
        if dsc_name:
            raise ValueError(
                'Package %s_%s references several dsc files' %
                (package['name'], package['version'])
            )
        dsc_url = fileinfo['uri']
        dsc_name = filename

    # Without this guard os.path.join(tmpdir, None) below would fail with
    # an unrelated TypeError.
    if dsc_name is None:
        raise ValueError(
            'Package %s_%s references no dsc file' %
            (package['name'], package['version'])
        )

    dsc_path = os.path.join(tmpdir, dsc_name)
    destdir = os.path.join(tmpdir, 'extracted')
    logfile = os.path.join(tmpdir, 'extract.log')

    logger.debug('extract Debian source package %s in %s',
                 dsc_path, destdir, extra={
                     'swh_type': 'deb_extract',
                     'swh_dsc': dsc_path,
                     'swh_destdir': destdir,
                 })

    cmd = ['dpkg-source',
           '--no-copy', '--no-check',
           '--ignore-bad-version',
           '-x', dsc_path,
           destdir]

    try:
        # stdout and stderr both end up in the log file so that a failure
        # can be reported with the full dpkg-source output.
        with open(logfile, 'w') as stdout:
            subprocess.check_call(cmd, stdout=stdout,
                                  stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        with open(logfile, 'r') as log:
            logdata = log.read()
        raise ValueError('dpkg-source exited with code %s: %s' %
                         (e.returncode, logdata)) from None

    return dsc_path, dsc_url, destdir
+ + Args: + package: the package dict (with a dsc_path key) + dsc_path: path to the package's dsc file + extracted_path: the path where the package got extracted + + Returns: + dict: a dictionary with the following keys: + + - history: list of (package_name, package_version) tuples parsed from + the package changelog + - source_files: information about all the files in the source package + + """ + ret = {} + + with open(dsc_path, 'rb') as dsc: + parsed_dsc = Dsc(dsc) + + source_files = [get_file_info(dsc_path)] + + dsc_dir = os.path.dirname(dsc_path) + for filename in package['files']: + file_path = os.path.join(dsc_dir, filename) + file_info = get_file_info(file_path) + source_files.append(file_info) + + ret['original_artifact'] = source_files + + # Parse the changelog to retrieve the rest of the package information + changelog_path = os.path.join(extracted_path, 'debian/changelog') + with open(changelog_path, 'rb') as changelog: + try: + parsed_changelog = Changelog(changelog) + except UnicodeDecodeError: + logger.warning('Unknown encoding for changelog %s,' + ' falling back to iso' % + changelog_path.decode('utf-8'), extra={ + 'swh_type': 'deb_changelog_encoding', + 'swh_name': package['name'], + 'swh_version': str(package['version']), + 'swh_changelog': changelog_path.decode('utf-8'), + }) + + # need to reset as Changelog scrolls to the end of the file + changelog.seek(0) + parsed_changelog = Changelog(changelog, encoding='iso-8859-15') + + package_info = { + 'name': package['name'], + 'version': str(package['version']), + 'changelog': { + 'person': uid_to_person(parsed_changelog.author), + 'date': parse_date(parsed_changelog.date), + 'history': [(block.package, str(block.version)) + for block in parsed_changelog][1:], + } + } + + maintainers = [ + uid_to_person(parsed_dsc['Maintainer'], encode=False), + ] + maintainers.extend( + uid_to_person(person, encode=False) + for person in UPLOADERS_SPLIT.split(parsed_dsc.get('Uploaders', '')) + ) + 
class DebianLoader(PackageLoader):
    """Load debian origins into swh archive.

    Contrary to other package loaders (1 package, 1 artifact), one version
    of a debian package is a set of files (.dsc, .orig.tar.gz, .diff.gz)
    which are downloaded and extracted together.

    """
    visit_type = 'debian'

    def __init__(self, url: str, date: str, packages: Mapping[str, Dict]):
        """Initialize the loader.

        Args:
            url: origin url, forwarded to the PackageLoader constructor
            date: NOTE(review): accepted but not used by this class
            packages: package information dicts indexed by version key
                (e.g. stretch/contrib/0.7.2-3)

        """
        super().__init__(url=url)
        self._info = None
        self.packages = packages
        # Both are set by `uncompress` and read back afterwards by
        # `read_intrinsic_metadata` and `build_revision`.
        self.dsc_path = None
        self.dsc_url = None

    def get_versions(self) -> Sequence[str]:
        """Returns the keys of the packages input (e.g.
        stretch/contrib/0.7.2-3, etc...)

        """
        # A keys view is not a Sequence (it cannot be indexed); return a
        # list so the value matches the declared type.
        return list(self.packages)

    def get_default_release(self) -> str:
        """Take the first version as default release

        """
        return list(self.packages)[0]

    def get_artifacts(self, version: str) -> Generator[
            Tuple[str, str, Dict], None, None]:
        """Yield the single (version, url, metadata) artifact of a version.

        The url is left empty: the files to fetch are listed in the
        metadata itself.

        """
        url = ''  # url is not useful to retrieve the package files here
        a_metadata = self.packages[version]
        yield version, url, a_metadata  # we care only for version, a_metadata

    def resolve_revision_from(
            self, known_artifacts: Dict, artifact_metadata: Dict) \
            -> Optional[bytes]:
        """Not implemented for now: always returns None."""
        return None

    def download_package(self, a_uri: str, tmpdir: str, filename: str,
                         a_metadata: Dict) -> Tuple[str, Dict]:
        """Contrary to other package loaders (1 package, 1 artifact),
        `a_metadata` represents the package's datafiles set to fetch:
        - <package-version>.orig.tar.gz
        - <package-version>.dsc
        - <package-version>.diff.gz

        This is delegated to the `download_package` function.

        Returns:
            the directory holding the downloaded files (tmpdir) and the
            computed hashes per filename

        """
        logger.debug('debian: a_metadata: %s', a_metadata)
        a_c_metadata = download_package(a_metadata, tmpdir)
        return tmpdir, a_c_metadata

    def uncompress(self, a_path: str, tmpdir: str, a_metadata: Dict) -> str:
        """Extract the source package stored in tmpdir, remember the dsc
        path and dsc url for the later steps, and return the extraction
        directory.

        """
        self.dsc_path, self.dsc_url, a_uncompressed_path = extract_package(
            a_metadata, tmpdir)
        return a_uncompressed_path

    def read_intrinsic_metadata(self, a_metadata: Dict,
                                a_uncompressed_path: str) -> Dict:
        """Read the package metadata out of the dsc and the extracted tree.

        Relies on `uncompress` having been called beforehand, as it reads
        the dsc path stored on the instance.

        """
        return get_package_metadata(
            a_metadata, self.dsc_path, a_uncompressed_path)

    def build_revision(
            self, a_metadata: Dict, i_metadata: Dict) -> Dict:
        """Build the synthetic revision dict of a source package version.

        Args:
            a_metadata: package information dict (extrinsic metadata)
            i_metadata: intrinsic metadata as returned by
                `read_intrinsic_metadata`; its 'package_info' entry is used

        Returns:
            the revision dict; author and dates come from the changelog

        """
        logger.debug('i_metadata: %s', i_metadata)
        logger.debug('a_metadata: %s', a_metadata)

        def prepare(obj):
            # Recursively copy obj, turning datetimes into iso strings and
            # bytes into str.
            if isinstance(obj, list):
                return [prepare(item) for item in obj]
            if isinstance(obj, dict):
                return {k: prepare(v) for k, v in obj.items()}
            if isinstance(obj, datetime.datetime):
                return obj.isoformat()
            if isinstance(obj, bytes):
                return obj.decode('utf-8')
            return copy.deepcopy(obj)

        package_info = i_metadata['package_info']

        msg = 'Synthetic revision for Debian source package %s version %s' % (
            a_metadata['name'], a_metadata['version'])

        date = package_info['changelog']['date']
        author = package_info['changelog']['person']

        # inspired from swh.loader.debian.converters.package_metadata_to_revision  # noqa
        return {
            'type': 'dsc',
            'message': msg.encode('utf-8'),
            'author': author,
            'date': date,
            'committer': author,
            'committer_date': date,
            'parents': [],
            'metadata': {
                'intrinsic': {
                    'tool': 'dsc',
                    'raw': prepare(package_info),
                },
                'extrinsic': {
                    'provider': self.dsc_url,
                    # NOTE(review): visit_date is not set in this class;
                    # presumably provided by PackageLoader - confirm.
                    'when': self.visit_date.isoformat(),
                    'raw': a_metadata,
                },
            }
        }
build_revision( - self, a_metadata: Dict, a_uncompressed_path: str) -> Dict: + self, a_metadata: Dict, i_metadata: Dict) -> Dict: revision = a_metadata.pop('revision') metadata = { 'extrinsic': { @@ -76,6 +76,7 @@ revision['author'] = parse_author(revision['author']) revision['committer'] = parse_author(revision['committer']) revision['message'] = revision['message'].encode('utf-8') + revision['type'] = 'tar' return revision diff --git a/swh/loader/package/gnu.py b/swh/loader/package/gnu.py --- a/swh/loader/package/gnu.py +++ b/swh/loader/package/gnu.py @@ -167,9 +167,10 @@ return rev_id def build_revision( - self, a_metadata: Dict, a_uncompressed_path: str) -> Dict: + self, a_metadata: Dict, i_metadata: Dict) -> Dict: normalized_date = normalize_timestamp(int(a_metadata['time'])) return { + 'type': 'tar', 'message': self.REVISION_MESSAGE, 'date': normalized_date, 'author': self.SWH_PERSON, diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py --- a/swh/loader/package/loader.py +++ b/swh/loader/package/loader.py @@ -90,8 +90,9 @@ yield from {} def build_revision( - self, a_metadata: Dict, a_uncompressed_path: str) -> Dict: - """Build the revision dict + self, a_metadata: Dict, i_metadata: Dict) -> Dict: + """Build the revision dict from the archive metadata (extrinsic + artifact metadata) and the intrinsic metadata. Returns: SWH data dict @@ -164,6 +165,43 @@ """ return None + def download_package(self, a_uri: str, tmpdir: str, filename: str, + a_metadata: Dict) -> Tuple[str, Dict]: + """Download package from uri within the tmpdir (with name filename). + Optionally, this can also use the a_metadata information to retrieve + more information. 
+ + Note: Default implementation does not use the a_metadata (debian + implementation does) + + """ + return download(a_uri, dest=tmpdir, filename=filename) + + def read_intrinsic_metadata( + self, a_metadata: Dict, a_uncompressed_path: str) -> Dict: + """Read intrinsic metadata from either the a_metadata or + the uncompressed path. + + Depending on the implementations, some extracts directly from the + artifacts to ingest (pypi, npm...), some use api to access directly + their intrinsic metadata (debian exposes a dsc through uri) or some + have none (gnu). + + """ + return {} + + def uncompress( + self, a_path: str, tmpdir: str, a_metadata: Dict) -> str: + """Uncompress the artfifact(s) stored at a_path to tmpdir. + + Optionally, this could need to use the a_metadata dict for some more + information (debian). + + """ + uncompressed_path = os.path.join(tmpdir, 'src') + uncompress(a_path, dest=uncompressed_path) + return uncompressed_path + def load(self) -> Dict: """Load for a specific origin the associated contents. 
@@ -241,8 +279,9 @@ with tempfile.TemporaryDirectory() as tmpdir: try: # a_c_: archive_computed_ - a_path, a_c_metadata = download( - a_uri, dest=tmpdir, filename=a_filename) + a_path, a_c_metadata = self.download_package( + a_uri, tmpdir, a_filename, + a_metadata=a_metadata) except Exception: logger.exception('Unable to retrieve %s', a_uri) @@ -253,14 +292,14 @@ logger.debug('archive_computed_metadata: %s', a_c_metadata) - uncompressed_path = os.path.join(tmpdir, 'src') - uncompress(a_path, dest=uncompressed_path) - + uncompressed_path = self.uncompress( + a_path, tmpdir, a_metadata) logger.debug('uncompressed_path: %s', uncompressed_path) directory = Directory.from_disk( - path=uncompressed_path.encode('utf-8'), data=True) # noqa + path=uncompressed_path.encode('utf-8'), + data=True) # noqa # FIXME: Try not to load the full raw content in # memory objects = directory.collect() @@ -280,11 +319,13 @@ self.storage.directory_add(directories) + i_metadata = self.read_intrinsic_metadata( + a_metadata, uncompressed_path) + # FIXME: This should be release. cf. 
D409 revision = self.build_revision( - a_metadata, uncompressed_path) + a_metadata, i_metadata) revision.update({ - 'type': 'tar', 'synthetic': True, 'directory': directory.hash, }) diff --git a/swh/loader/package/npm.py b/swh/loader/package/npm.py --- a/swh/loader/package/npm.py +++ b/swh/loader/package/npm.py @@ -257,10 +257,12 @@ if shasum == original_artifact['checksums']['sha1']: return rev_id + def read_intrinsic_metadata(self, a_metadata: Dict, + a_uncompressed_path: str) -> Dict: + return extract_intrinsic_metadata(a_uncompressed_path) + def build_revision( - self, a_metadata: Dict, a_uncompressed_path: str) -> Dict: - # Parse metadata (project, artifact metadata) - i_metadata = extract_intrinsic_metadata(a_uncompressed_path) + self, a_metadata: Dict, i_metadata: Dict) -> Dict: # from intrinsic metadata author = extract_npm_package_author(i_metadata) @@ -272,11 +274,12 @@ message = version.encode('ascii') return { + 'type': 'tar', + 'message': message, 'author': author, 'date': date, 'committer': author, 'committer_date': date, - 'message': message, 'parents': [], 'metadata': { 'intrinsic': { diff --git a/swh/loader/package/pypi.py b/swh/loader/package/pypi.py --- a/swh/loader/package/pypi.py +++ b/swh/loader/package/pypi.py @@ -143,14 +143,16 @@ if sha256 == original_artifact['checksums']['sha256']: return rev_id + def read_intrinsic_metadata(self, a_metadata: Dict, + a_uncompressed_path: str) -> Dict: + return extract_intrinsic_metadata(a_uncompressed_path) + def build_revision( - self, a_metadata: Dict, a_uncompressed_path: str) -> Dict: - # Parse metadata (project, artifact metadata) - metadata = extract_intrinsic_metadata(a_uncompressed_path) + self, a_metadata: Dict, i_metadata: Dict) -> Dict: # from intrinsic metadata - name = metadata['version'] - _author = author(metadata) + name = i_metadata['version'] + _author = author(i_metadata) # from extrinsic metadata message = a_metadata.get('comment_text', '') @@ -159,6 +161,7 @@ 
int(iso8601.parse_date(a_metadata['upload_time']).timestamp())) return { + 'type': 'tar', 'message': message.encode('utf-8'), 'author': _author, 'date': date, @@ -168,7 +171,7 @@ 'metadata': { 'intrinsic': { 'tool': 'PKG-INFO', - 'raw': metadata, + 'raw': i_metadata, }, 'extrinsic': { 'provider': self.provider_url, diff --git a/swh/loader/package/tests/conftest.py b/swh/loader/package/tests/conftest.py --- a/swh/loader/package/tests/conftest.py +++ b/swh/loader/package/tests/conftest.py @@ -4,12 +4,25 @@ # See top-level LICENSE file for more information import pytest +import re +from functools import partial from os import path +from swh.core.pytest_plugin import get_response_cb + @pytest.fixture def swh_config(monkeypatch, datadir): conffile = path.join(datadir, 'loader.yml') monkeypatch.setenv('SWH_CONFIG_FILENAME', conffile) return conffile + + +@pytest.fixture +def requests_mock_http_datadir(requests_mock_datadir, datadir): + # hack: main fixture does not support http query yet + requests_mock_datadir.get(re.compile('http://'), body=partial( + get_response_cb, datadir=datadir)) + + return requests_mock_datadir diff --git a/swh/loader/package/tests/data/deb.debian.org/debian__pool_contrib_c_cicero_cicero_0.7.2-3.diff.gz b/swh/loader/package/tests/data/deb.debian.org/debian__pool_contrib_c_cicero_cicero_0.7.2-3.diff.gz new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@ +Uploaders: Samuel Thibault +Homepage: http://pages.infinit.net/fluxnic/cicero/ +Standards-Version: 3.9.6 +Vcs-Browser: http://git.debian.org/?p=pkg-a11y/cicero.git;a=summary +Vcs-Git: git://git.debian.org/git/pkg-a11y/cicero.git +Build-Depends: debhelper (>= 7) +Package-List: + cicero deb contrib/sound optional arch=all +Checksums-Sha1: + a286efd63fe2c9c9f7bb30255c3d6fcdcf390b43 96527 cicero_0.7.2.orig.tar.gz + 0815282053f21601b0ec4adf7a8fe47eace3c0bc 3964 cicero_0.7.2-3.diff.gz +Checksums-Sha256: + 
63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786 96527 cicero_0.7.2.orig.tar.gz + f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c 3964 cicero_0.7.2-3.diff.gz +Files: + 4353dede07c5728319ba7f5595a7230a 96527 cicero_0.7.2.orig.tar.gz + a93661b6a48db48d59ba7d26796fc9ce 3964 cicero_0.7.2-3.diff.gz + +-----BEGIN PGP SIGNATURE----- +Version: GnuPG v1 + +iQIcBAEBCgAGBQJUQ9GjAAoJEBH0lP5vGG7NTFMQAIk5Wkicp5/GQOfkFh5qT7X7 +cKd98i/7t/0HznGCPv1iaQwsky5wbdqimMaW+vnKWEj8P2AEOLmyfGAjAKGSj0yW +r28dB0+vaiy1rFbtuTL+AVrtO2b/uVuh9eA2ZhDgLekv//bSzpMorIF+uqdQS18d +x2y9ZyKOucVPc+ARTcTrOmPbKR7ywIZEaj3E0Lq5p1e50BkqHVbZzzM7dMZuyatH +FcTsoCjz9kiulGx4LGzItajMBOdA2lIK4TlBRsO6wApOIvOnhSEQr5CqwbVwzwGv +N//8EoiNbs5bpweOGXOLN/RzvRPaEp/8W5P+E7jKyyiGkBeBrQeDlJA5cqBXcz1G +63zVmLyp3AYDrRaQ1AvgUyoL91mQIsDwc2gwT3YRYc4TE4HtYCAD85e/NGCAG5mk +vy+WH6NaaU6mb17IN7V+mGgb/l5pgwPADP4VaFugjrZK7nJp6I2xK2FmgDlGw8gj +qC2LUVuI/ijxTkxS9KdGSwtF4YLw6hbhUIv+19n5ajJ8MpTniv3hPiG4ZYY0qc7j +oejiRGszAR9syTjPKHhYpBnKwTVg8dkaOI+Jw+uwlK5W0opKoDt4Kr4ceCxuxsvU +L1I0MtaTGsGABJTX6utGvklYROApAcqMzGYozNeYOuGlWpvBx5QqdTmo6yv515cq +vWwMF6ldOni8Da5B/7Q9 +=XtIw +-----END PGP SIGNATURE----- diff --git a/swh/loader/package/tests/data/deb.debian.org/debian__pool_contrib_c_cicero_cicero_0.7.2.orig.tar.gz b/swh/loader/package/tests/data/deb.debian.org/debian__pool_contrib_c_cicero_cicero_0.7.2.orig.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@