diff --git a/swh/loader/debian/loader.py b/swh/loader/debian/loader.py
index 21688fd..3133bd0 100644
--- a/swh/loader/debian/loader.py
+++ b/swh/loader/debian/loader.py
@@ -1,510 +1,527 @@
 # Copyright (C) 2015-2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import copy
 import hashlib
 import logging
 import os
 import re
 import subprocess
 import tempfile

 from dateutil.parser import parse as parse_date
 from debian.changelog import Changelog
 from debian.deb822 import Dsc
 import requests

 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker

 from swh.loader.core.loader import BufferedLoader
 from swh.storage.schemata.distribution import Package
 from swh.model import hashutil
 from swh.model.from_disk import Directory
 from swh.model.identifiers import identifier_to_bytes, snapshot_identifier

 from . import converters

 UPLOADERS_SPLIT = re.compile(r'(?<=\>)\s*,\s*')

 log = logging.getLogger(__name__)


 class DebianLoaderException(Exception):
     pass


 class PackageDownloadFailed(DebianLoaderException):
     """Raise this exception when a package download failed"""
     pass


 class PackageExtractionFailed(DebianLoaderException):
     """Raise this exception when a package extraction failed"""
     pass


 def _debian_to_hashlib(hashname):
     """Convert Debian hash names to hashlib-compatible names"""
     return {
         'md5sum': 'md5',
     }.get(hashname, hashname)


 def download_package(package):
     """Fetch a source package in a temporary directory and check the
     checksums for all files"""
     tempdir = tempfile.TemporaryDirectory(
         prefix='swh.loader.debian.%s.' % package['name']
     )

     for filename, fileinfo in copy.deepcopy(package['files']).items():
         uri = fileinfo.pop('uri')
         hashes = {
             hashname: hashlib.new(_debian_to_hashlib(hashname))
             for hashname in fileinfo
             if hashname not in ('name', 'size')
         }

         r = requests.get(uri, stream=True)
         if r.status_code != 200:
             raise PackageDownloadFailed(
                 'Download of %s returned status_code %s: %s' % (
                     uri, r.status_code, r.text)
             )

         size = 0
         with open(os.path.join(tempdir.name, filename), 'wb') as f:
             for chunk in r.iter_content(chunk_size=1024 * 1024):
                 size += len(chunk)
                 f.write(chunk)
                 for hash in hashes.values():
                     hash.update(chunk)

         downloadinfo = {
             'name': filename,
             'size': size,
         }
         for hashname, hash in hashes.items():
             downloadinfo[hashname] = hash.hexdigest()

         if fileinfo != downloadinfo:
             raise PackageDownloadFailed(
                 'Checksum mismatch: fetched %s, expected %s' % (
                     downloadinfo, fileinfo)
             )

     return tempdir


 def extract_package(package, tempdir):
     """Extract a Debian source package to a given directory

     Note that after extraction the target directory will be the root
     of the extracted package, rather than containing it.

     Args:
         package (dict): package information dictionary
         tempdir (str): directory where the package files are stored

     Returns:
         tuple: path to the dsc (str) and extraction directory (str)
     """
     dsc_name = None
     for filename in package['files']:
         if filename.endswith('.dsc'):
             if dsc_name:
                 raise PackageExtractionFailed(
                     'Package %s_%s references several dsc files' % (
                         package['name'], package['version'])
                 )
             dsc_name = filename

     dsc_path = os.path.join(tempdir.name, dsc_name)
     destdir = os.path.join(tempdir.name, 'extracted')
     logfile = os.path.join(tempdir.name, 'extract.log')
     log.debug('extract Debian source package %s in %s' %
               (dsc_path, destdir), extra={
                   'swh_type': 'deb_extract',
                   'swh_dsc': dsc_path,
                   'swh_destdir': destdir,
               })

     cmd = ['dpkg-source',
            '--no-copy', '--no-check',
            '--ignore-bad-version',
            '-x', dsc_path,
            destdir]

     try:
         with open(logfile, 'w') as stdout:
             subprocess.check_call(cmd, stdout=stdout,
                                   stderr=subprocess.STDOUT)
     except subprocess.CalledProcessError as e:
         with open(logfile, 'r') as f:
             logdata = f.read()
         raise PackageExtractionFailed(
             'dpkg-source exited with code %s: %s' % (e.returncode, logdata)
         ) from None

     return dsc_path, destdir


 def get_file_info(filepath):
     """Retrieve the original file information from the file at filepath.

     Args:
         filepath: the path to the original file

     Returns:
         dict: information about the original file, in a dictionary
         with the following keys

         - name: the file name
         - sha1, sha1_git, sha256: original file hashes
         - length: original file length
     """
     name = os.path.basename(filepath)
     if isinstance(name, bytes):
         name = name.decode('utf-8')

     hashes = hashutil.MultiHash.from_path(filepath).hexdigest()
     hashes['name'] = name
     hashes['length'] = os.path.getsize(filepath)
     return hashes


 def get_package_metadata(package, dsc_path, extracted_path):
     """Get the package metadata from the source package at dsc_path,
     extracted in extracted_path.

     Args:
         package: the package dict (with a 'files' key)
         dsc_path: path to the package's dsc file
         extracted_path: the path where the package got extracted

     Returns:
         dict: a dictionary with the following keys:

         - original_artifact: information about all the files in the
           source package
         - package_info: name, version, changelog and maintainers,
           parsed from the dsc file and the package changelog
     """
     ret = {}

     with open(dsc_path, 'rb') as dsc:
         parsed_dsc = Dsc(dsc)

     source_files = [get_file_info(dsc_path)]

     dsc_dir = os.path.dirname(dsc_path)
     for filename in package['files']:
         file_path = os.path.join(dsc_dir, filename)
         file_info = get_file_info(file_path)
         source_files.append(file_info)

     ret['original_artifact'] = source_files

     # Parse the changelog to retrieve the rest of the package information
     changelog_path = os.path.join(extracted_path, 'debian/changelog')
     with open(changelog_path, 'rb') as changelog:
         try:
             parsed_changelog = Changelog(changelog)
         except UnicodeDecodeError:
             log.warning('Unknown encoding for changelog %s,'
                         ' falling back to iso' % changelog_path,
                         extra={
                             'swh_type': 'deb_changelog_encoding',
                             'swh_name': package['name'],
                             'swh_version': str(package['version']),
                             'swh_changelog': changelog_path,
                         })

             # need to reset as Changelog scrolls to the end of the file
             changelog.seek(0)
             parsed_changelog = Changelog(changelog, encoding='iso-8859-15')

     package_info = {
         'name': package['name'],
         'version': str(package['version']),
         'changelog': {
             'person': converters.uid_to_person(parsed_changelog.author),
             'date': parse_date(parsed_changelog.date),
             'history': [(block.package, str(block.version))
                         for block in parsed_changelog][1:],
         }
     }

     maintainers = [
         converters.uid_to_person(parsed_dsc['Maintainer'], encode=False),
     ]
     maintainers.extend(
         converters.uid_to_person(person, encode=False)
         for person in UPLOADERS_SPLIT.split(parsed_dsc.get('Uploaders', ''))
     )
     package_info['maintainers'] = maintainers

     ret['package_info'] = package_info

     return ret


 def process_package(package):
     """Process a source package into its constituent components.

     The source package will be decompressed in a temporary directory.

     Args:
         package (dict): a dict with the following keys:

         - name: source package name
         - version: source package version
         - files: the source files to fetch and verify

     Returns:
         tuple: a (directory, metadata, tempdir) tuple:

         - directory: a swh.model.from_disk.Directory for the root
           directory of the extracted package
         - metadata: the metadata from get_package_metadata
         - tempdir: the temporary directory holding the downloaded and
           extracted files

     Raises:
         FileNotFoundError: if the dsc file does not exist
         PackageDownloadFailed: if the package download failed
         PackageExtractionFailed: if package extraction failed

     """
     log.info("Processing package %s_%s" %
              (package['name'], str(package['version'])),
              extra={
                  'swh_type': 'deb_process_start',
                  'swh_name': package['name'],
                  'swh_version': str(package['version']),
              })

     tempdir = download_package(package)
     dsc, debdir = extract_package(package, tempdir)

     directory = Directory.from_disk(path=os.fsencode(debdir),
                                     save_path=True)
     metadata = get_package_metadata(package, dsc, debdir)

     return directory, metadata, tempdir


 class DebianLoader(BufferedLoader):
     """A loader for Debian packages"""

     CONFIG_BASE_FILENAME = 'loader/debian'
     ADDITIONAL_CONFIG = {
         'lister_db_url': ('str', 'postgresql:///lister-debian'),
     }

     visit_type = 'deb'

     def __init__(self, config=None):
         super().__init__(logging_class=None, config=config)
         self.db_engine = create_engine(self.config['lister_db_url'])
         self.mk_session = sessionmaker(bind=self.db_engine)
         self.db_session = self.mk_session()

     def load(self, *, origin, date, packages):
         return super().load(origin=origin, date=date, packages=packages)

     def prepare_origin_visit(self, *, origin, date, packages):
-        self.origin = origin
+        self.origin = {'url': origin, 'type': 'deb'}
         self.visit_date = date

     def prepare(self, *, origin, date, packages):
         self.packages = packages

         # Deduplicate branches according to equivalent files
         branches_files = {}
         branches_revs = {}
         equiv_branch = {}
         for branch, package in packages.items():
             if 'files' not in package:
                 # already loaded, use default values
                 branches_revs[branch] = identifier_to_bytes(
                     package['revision_id']
                 )
                 equiv_branch[branch] = branch
                 continue

             for eq_branch, files in branches_files.items():
                 if package['files'] == files:
                     equiv_branch[branch] = eq_branch
                     if (not branches_revs[eq_branch]
                             and package['revision_id']):
                         branches_revs[eq_branch] = identifier_to_bytes(
                             package['revision_id']
                         )
                     break
             else:
                 # No match: new entry
                 equiv_branch[branch] = branch
                 branches_files[branch] = package['files']
                 if package['revision_id']:
                     branches_revs[branch] = identifier_to_bytes(
                         package['revision_id']
                     )
                 else:
                     branches_revs[branch] = None

         self.equivs = {
             'branches': equiv_branch,
             'revisions': branches_revs,
         }

         self.versions_to_load = [
             (branch, self.packages[branch])
             for branch in sorted(branches_revs)
             if not branches_revs[branch]
         ]
         self.version_idx = 0
         self.done = self.version_idx >= len(self.versions_to_load)
         self.current_data = {}
         self.tempdirs = []
         self.partial = False

     def fetch_data(self):
         if self.done:
             return False

         branch, package = self.versions_to_load[self.version_idx]
         self.version_idx += 1

         try:
             directory, metadata, tempdir = process_package(package)
             self.tempdirs.append(tempdir)
             self.current_data = directory.collect()
             revision = converters.package_metadata_to_revision(
                 package, directory, metadata
             )
             self.current_data['revision'] = {
                 revision['id']: revision,
             }
             self.equivs['revisions'][branch] = revision['id']
         except DebianLoaderException:
             log.exception('Package %s_%s failed to load' %
                           (package['name'], package['version']))
             self.partial = True

         self.done = self.version_idx >= len(self.versions_to_load)
         return not self.done
     def store_data(self):
         self.maybe_load_contents(
             self.current_data.get('content', {}).values())
         self.maybe_load_directories(
             self.current_data.get('directory', {}).values())
         self.maybe_load_revisions(
             self.current_data.get('revision', {}).values())

         self.current_data = {}

         if self.done:
             self.flush()
             self.update_packages()
             self.generate_and_load_snapshot()

     def update_packages(self):
         for branch in self.packages:
             package = self.packages[branch]
             if package['revision_id']:
                 continue
             rev = self.equivs['revisions'][self.equivs['branches'][branch]]
             if not rev:
                 continue
             db_package = self.db_session.query(Package)\
                                         .filter(Package.id == package['id'])\
                                         .one()
             db_package.revision_id = rev

         self.db_session.commit()

     def generate_and_load_snapshot(self):
         """Create a SWH archive "snapshot" of the package being loaded, and
         send it to the archive.
         """
         branches = {}

         for branch in self.packages:
             rev = self.equivs['revisions'][self.equivs['branches'][branch]]
             if rev:
                 target = {
                     'target_type': 'revision',
                     'target': rev,
                 }
             else:
                 self.partial = True
                 target = None

             branches[branch.encode('utf-8')] = target

         snapshot = {'branches': branches}
         snapshot['id'] = identifier_to_bytes(snapshot_identifier(snapshot))
         self.maybe_load_snapshot(snapshot)

     def load_status(self):
         status = 'eventful' if self.versions_to_load else 'uneventful'
         return {
             'status': status if not self.partial else 'failed',
         }

     def visit_status(self):
         return 'partial' if self.partial else 'full'

     def cleanup(self):
         for d in self.tempdirs:
             d.cleanup()


 if __name__ == '__main__':
     import click
     import logging
     logging.basicConfig(
         level=logging.DEBUG,
         format='%(asctime)s %(process)d %(message)s'
     )

     @click.command()
     @click.option('--origin-url', required=True,
                   help='Origin url to associate')
     @click.option('--packages', help='Debian packages to load')
     @click.option('--visit-date', default=None,
                   help='Visit date time override')
     def main(origin_url, packages, visit_date):
         """Loading debian package tryout."""
-        origin = {'url': origin_url, 'type': 'deb'}
         if not packages:
             packages = {
-                "buster/main/3.2.3-1": {
-                    "id": 178584,
-                    "name": "alex",
-                    "version": "3.2.3-1",
-                    "revision_id": "e8b2fe173ab909aa49d40b59292a44c2668e8a26"
-                },
-                "jessie/main/3.1.3-1": {
-                    "id": 230994,
-                    "name": "alex",
-                    "version": "3.1.3-1",
-                    "revision_id": "9a7c853d4cb2521f59108d8d5f21f26a800038ca"
-                },
+                'stretch/contrib/0.7.2-3': {
+                    'files': {
+                        'cicero_0.7.2-3.diff.gz': {
+                            'md5sum': 'a93661b6a48db48d59ba7d26796fc9ce',
+                            'name': 'cicero_0.7.2-3.diff.gz',
+                            'sha256': 'f039c9642fe15c75bed5254315e2a29f9f2700da0e29d9b0729b3ffc46c8971c',  # noqa
+                            'size': 3964,
+                            'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2-3.diff.gz'  # noqa
+                        },
+                        'cicero_0.7.2-3.dsc': {
+                            'md5sum': 'd5dac83eb9cfc9bb52a15eb618b4670a',
+                            'name': 'cicero_0.7.2-3.dsc',
+                            'sha256': '35b7f1048010c67adfd8d70e4961aefd8800eb9a83a4d1cc68088da0009d9a03',  # noqa
+                            'size': 1864,
+                            'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2-3.dsc'},  # noqa
+                        'cicero_0.7.2.orig.tar.gz': {
+                            'md5sum': '4353dede07c5728319ba7f5595a7230a',
+                            'name': 'cicero_0.7.2.orig.tar.gz',
+                            'sha256': '63f40f2436ea9f67b44e2d4bd669dbabe90e2635a204526c20e0b3c8ee957786',  # noqa
+                            'size': 96527,
+                            'uri': 'http://deb.debian.org/debian//pool/contrib/c/cicero/cicero_0.7.2.orig.tar.gz'  # noqa
+                        }
+                    },
+                    'id': 23,
+                    'name': 'cicero',
+                    'revision_id': None,
+                    'version': '0.7.2-3'
+                }
             }
-
-        DebianLoader().load(origin=origin, date=visit_date, packages=packages)
+
+        DebianLoader().load(origin=origin_url, date=visit_date,
+                            packages=packages)

     main()
diff --git a/swh/loader/debian/tests/test_loader.py b/swh/loader/debian/tests/test_loader.py
index 9c5b8a0..e616021 100644
--- a/swh/loader/debian/tests/test_loader.py
+++ b/swh/loader/debian/tests/test_loader.py
@@ -1,148 +1,148 @@
 # Copyright (C) 2018 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import os

 from unittest import TestCase

 import pytest
 import requests_mock

 from swh.core.db.tests.db_testing import SingleDbTestFixture
 from swh.model.hashutil import hash_to_bytes
 from swh.storage.schemata.distribution import SQLBase

 from swh.loader.core.tests import BaseLoaderTest
 from swh.loader.debian.loader import get_file_info, DebianLoader

 from . import TEST_LOADER_CONFIG

 RESOURCES_PATH = os.path.join(os.path.dirname(__file__), 'resources')


 class DebianLoaderTest(DebianLoader):
     def parse_config_file(self, *args, **kwargs):
         return TEST_LOADER_CONFIG


 @pytest.mark.fs
 class TestFileInfo(TestCase):
     def test_get_file_info(self):
         path = '%s/%s' % (RESOURCES_PATH, 'onefile.txt')

         actual_info = get_file_info(path)

         expected_info = {
             'name': 'onefile.txt',
             'length': 62,
             'sha1': '135572f4ac013f49e624612301f9076af1eacef2',
             'sha1_git': '1d62cd247ef251d52d98bbd931d44ad1f967ea99',
             'sha256': '40f1a3cbe9355879319759bae1a6ba09cbf34056e79e951cd2dc0adbff169b9f',  # noqa
             'blake2s256': '4072cf9a0017ad7705a9995bbfbbc098276e6a3afea8d84ab54bff6381c897ab',  # noqa
         }

         self.assertEqual(actual_info, expected_info)


 @pytest.mark.fs
 class TestDebianLoader(SingleDbTestFixture, BaseLoaderTest):
     TEST_DB_NAME = 'test-lister-debian'
     TEST_DB_DUMP = []

     def setUp(self):
         super().setUp(archive_name='', start_path=os.path.dirname(__file__),
                       uncompress_archive=False)
         self.loader = DebianLoaderTest()
         SQLBase.metadata.create_all(self.loader.db_engine)
         self.storage = self.loader.storage

         make_uri = ('file://' + RESOURCES_PATH + '/').__add__
         self.debian_src_name = 'hello_2.10-1+deb9u1.debian.tar.xz'
         self.orig_src_name = 'hello_2.10.orig.tar.gz'
         self.dsc_name = 'hello_2.10-1+deb9u1.dsc'
         self.files = {
             self.dsc_name: {
                 'name': self.dsc_name,
                 'uri': make_uri(self.dsc_name),
                 'size': 1866,
             },
             self.debian_src_name: {
                 'name': self.debian_src_name,
                 'uri': make_uri(self.debian_src_name),
                 'size': 6156,
             },
             self.orig_src_name: {
                 'name': self.orig_src_name,
                 'uri': make_uri(self.orig_src_name),
                 'size': 725946,
             },
         }

         self._fill_db()

     def tearDown(self):
         SQLBase.metadata.drop_all(self.loader.db_engine)
         self.loader.db_session.close()
         self.loader.db_engine.dispose()  # close the connection pool
         super().tearDown()

     def _fill_db(self):
         from swh.storage.schemata.distribution import \
             Area, Distribution, Package

         dist = Distribution(name='Debian',
                             type='deb',
                             mirror_uri='devnull://')
         area = Area(distribution=dist,
                     name='main')
         pkg = Package(area=area,
                       name='hello',
                       version='2.10-1+deb9u1',
                       directory='dir',
                       files=self.files)
         self.loader.db_session.add_all([area, pkg])
         self.loader.db_session.commit()

         self.pkg_id = pkg.id

     def _load(self):
         self.loader.load(
-            origin={'url': self.repo_url, 'type': 'deb'},
+            origin=self.repo_url,
             date='2018-12-14 16:45:00+00',
             packages={
                 'stretch/main/2.10-1+deb9u1': {
                     'id': self.pkg_id,
                     'name': 'hello',
                     'version': '2.10-1+deb9u1',
                     'revision_id': None,
                     'files': self.files,
                 }
             }
         )

     def test_load(self):
         with requests_mock.Mocker() as m:
             for file_ in self.files.values():
                 path = os.path.join(RESOURCES_PATH, file_['name'])
                 with open(path, 'rb') as fd:
                     m.get(file_['uri'], content=fd.read())
             self._load()

         self.assertCountSnapshots(1)
         self.assertCountReleases(0)  # FIXME: Why not 1?
         self.assertCountRevisions(1)
         self.assertCountDirectories(14)
         self.assertCountContents(315)

         # Check the root dir was loaded, and contains 'src/'
         root_hash = 'c906789049d2327a69b81cca6a1c1737321c836f'
         ls_root = list(self.storage.directory_ls(
             hash_to_bytes(root_hash)))
         src_dirs = [x for x in ls_root if x['name'] == b'src']
         self.assertEqual(len(src_dirs), 1, src_dirs)

         # Check 'src/hello.c' exists
         src_dir_hash = src_dirs[0]['target']
         ls_src = list(self.storage.directory_ls(src_dir_hash))
         hello_c = [x for x in ls_src if x['name'] == b'hello.c']
         self.assertEqual(len(hello_c), 1, hello_c)

         # Check 'src/hello.c' was loaded and has the right hash
         hello_c_hash = 'b60a061ac9dd25b29d57b756b5959aadc1fe6386'
         self.assertEqual(hello_c[0]['sha1'], hash_to_bytes(hello_c_hash))

         missing = list(self.storage.content_missing(
             [{'sha1': hash_to_bytes(hello_c_hash)}]))
         self.assertEqual(missing, [])
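
Note on the branch deduplication in DebianLoader.prepare(): several suites
can ship the same source package (identical files), so equivalent branches
are collapsed onto one revision and only branches without a known revision
are queued for loading. The standalone sketch below illustrates that logic
on toy data; it is not part of the patch, and identifier_to_bytes is stubbed
with a plain hex decoder so the example runs without swh.model installed.

# Standalone sketch of the branch deduplication done in prepare().
# Assumption: identifier_to_bytes is replaced by a hex stand-in.

def identifier_to_bytes(identifier):
    return bytes.fromhex(identifier)  # simplified stand-in for swh.model

def deduplicate(packages):
    branches_files = {}
    branches_revs = {}
    equiv_branch = {}
    for branch, package in packages.items():
        if 'files' not in package:
            # already loaded: trust the recorded revision id
            branches_revs[branch] = identifier_to_bytes(
                package['revision_id'])
            equiv_branch[branch] = branch
            continue
        for eq_branch, files in branches_files.items():
            if package['files'] == files:
                # same source files: alias this branch to the earlier one
                equiv_branch[branch] = eq_branch
                if not branches_revs[eq_branch] and package['revision_id']:
                    branches_revs[eq_branch] = identifier_to_bytes(
                        package['revision_id'])
                break
        else:
            # no match: this branch starts its own equivalence class
            equiv_branch[branch] = branch
            branches_files[branch] = package['files']
            branches_revs[branch] = (
                identifier_to_bytes(package['revision_id'])
                if package['revision_id'] else None)
    return equiv_branch, branches_revs

# Two suites shipping the same source files, one already loaded:
packages = {
    'stretch/main/2.10-1': {'revision_id': '00' * 20,
                            'files': {'hello.dsc': {'size': 1866}}},
    'buster/main/2.10-1': {'revision_id': None,
                           'files': {'hello.dsc': {'size': 1866}}},
}
equiv, revs = deduplicate(packages)
# equiv maps 'buster/main/2.10-1' onto 'stretch/main/2.10-1'; only
# branches whose revision is still None would end up in versions_to_load.
print(equiv, revs)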