# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import logging
import os
import tempfile
import shutil
import sys
import traceback

from swh.core import hashutil

from swh.loader.dir import loader
from swh.loader.tar import tarball, utils


class TarLoader(loader.DirLoader):
    """A tarball loader.

    Uncompresses a tarball into a temporary directory and delegates the
    actual loading of that directory to the parent directory loader.

    """
    def __init__(self, config):
        super().__init__(config)
        self.log = logging.getLogger('swh.loader.tar.TarLoader')

    def process(self, tarpath, origin, revision, release, occurrences):
        """Load a tarball in backend.

        This will:
        - persist the origin if it does not exist.
        - write an entry in fetch_history to mark the loading tarball start
        - uncompress locally the tarballs in a temporary location
        - process the content of the tarballs to persist on swh storage
        - clean up the temporary location
        - write an entry in fetch_history to mark the loading tarball end

        Args:
            - tarpath: path to the tarball to uncompress
            - origin: Dictionary origin
              - url: url origin we fetched
              - type: type of the origin
            - revision: Dictionary of information needed, keys are:
              - author_name: revision's author name
              - author_email: revision's author email
              - author_date: timestamp (e.g. 1444054085)
              - author_offset: date offset e.g. -0220, +0100
              - committer_name: revision's committer name
              - committer_email: revision's committer email
              - committer_date: timestamp
              - committer_offset: date offset e.g. -0220, +0100
              - type: type of revision dir, tar
              - message: synthetic message for the revision
            - release: Dictionary of information needed, keys are:
              - name: release name
              - date: release timestamp (e.g. 1444054085)
              - offset: release date offset e.g. -0220, +0100
              - author_name: release author's name
              - author_email: release author's email
              - comment: release's comment message
            - occurrences: List of occurrence dictionary.
              Information needed, keys are:
              - branch: occurrence's branch name
              - authority_id: authority id (e.g. 1 for swh)
              - validity: validity date (e.g. 2015-01-01 00:00:00+00)

        Side effects on the arguments:
            - origin gets a 'type' key (defaulting to 'tar') and an 'id' key.
            - revision gets its 'metadata' key replaced by
              {'checksums': <hex checksums of the tarball>}.

        Raises:
            Any exception raised while hashing, uncompressing or loading is
            re-raised after the temporary directory has been removed and
            the fetch history entry has been closed.

        """
        if 'type' not in origin:  # let the type flow if present
            origin['type'] = 'tar'

        origin['id'] = self.storage.origin_add_one(origin)

        # Mark the start of the loading
        fetch_history_id = self.open_fetch_history(origin['id'])

        # Prepare the extraction path
        extraction_dir = self.config['extraction_dir']
        os.makedirs(extraction_dir, 0o755, exist_ok=True)
        dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-',
                                    dir=extraction_dir)

        # T62:
        # - create tarball as content in storage
        # - transit the information to the loader dir

        # for edge cases (NotImplemented...)
        result = {'status': False, 'stderr': ''}

        try:
            # add checksums in revision
            # Done inside the try block: the temporary directory and the
            # fetch history entry already exist at this point, so a hashing
            # failure (e.g. unreadable tarball) must go through the same
            # cleanup path as an extraction failure.
            hashes = utils.convert_to_hex(hashutil.hashfile(tarpath))
            revision['metadata'] = {'checksums': hashes}

            self.log.info('Uncompress %s to %s', tarpath, dir_path)
            tarball.uncompress(tarpath, dir_path)
            result = super().process(dir_path, origin, revision, release,
                                     occurrences)
        except BaseException:
            # Deliberately catches everything: the error is only recorded
            # here and is always re-raised below.
            e_info = sys.exc_info()
            if not result['status']:
                # Enrich the error message with the tarball
                result['stderr'] = 'reason:%s\ntrace:%s\n%s' % (
                    e_info[1],
                    ''.join(traceback.format_tb(e_info[2])),
                    result.get('stderr', ''))
            raise
        finally:
            shutil.rmtree(dir_path)

            if not result['status']:
                result['stderr'] = 'archive:%s\nreason:%s' % (
                    tarpath, result.get('stderr', ''))

            # mark the end of the loading
            self.close_fetch_history(fetch_history_id, result)
# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import unittest

from nose.tools import istest

from swh.loader.tar import utils


class TestUtils(unittest.TestCase):
    """Tests for the filename parsing and conversion helpers of
    swh.loader.tar.utils.

    """
    @classmethod
    def setUpClass(cls):
        # Maps a filename to its expected
        # (software_name, release_number, extension) decomposition.
        cls.files = {
            'free-ipmi-1.2.2.tar': ('free-ipmi-', '1.2.2', '.tar'),
            'free-ipmi-1.2.2.tar.gz': ('free-ipmi-', '1.2.2', '.tar.gz'),
            'free-ipmi-1.2.2.tar.tgz': ('free-ipmi-', '1.2.2', '.tar.tgz'),
            'gcc-testsuite-4.4.2-4.4.3.diff.bz2': (
                'gcc-testsuite-', '4.4.2-4.4.3', '.diff.bz2'),
            'gcc-java-4.0.4.tar.gz': ('gcc-java-', '4.0.4', '.tar.gz'),
            'gmp-2.0.tar.lzma': ('gmp-', '2.0', '.tar.lzma'),
            'win-gerwin-0.6.zip': ('win-gerwin-', '0.6', '.zip'),
            'ballandpaddle-0.8.0.tar.xz': (
                'ballandpaddle-', '0.8.0', '.tar.xz'),
            'mail-1.1.1.some.lz': ('mail-', '1.1.1.some', '.lz'),
            'gmp-4.1.1-4.1.2.diff.tar.Z': (
                'gmp-', '4.1.1-4.1.2', '.diff.tar.Z'),
            'findutils-4.2.18.tar.bzip2': (
                'findutils-', '4.2.18', '.tar.bzip2'),
            'gnunet-java-0.9.4.jar': ('gnunet-java-', '0.9.4', '.jar'),
            'pycdio-0.15-py2.5-linux-i686.egg': (
                'pycdio-', '0.15-py2.5-linux-i686', '.egg'),
            'rbcdio-0.04.gem': ('rbcdio-', '0.04', '.gem'),
            'librejs-6.0.5.xpi': ('librejs-', '6.0.5', '.xpi'),
            'icecat-31.8.0.csb.langpack.xpi': (
                'icecat-', '31.8.0.csb.langpack', '.xpi'),
            'icecatmobile-31.8.0.en-US.android-arm.apk': (
                'icecatmobile-', '31.8.0.en-US.android-arm', '.apk'),
            'icecat-31.8.0.en-US.mac.dmg': (
                'icecat-', '31.8.0.en-US.mac', '.dmg'),
            'gnutls-3.0.21-1gn.DevPak': ('gnutls-', '3.0.21-1gn', '.DevPak'),

            # . separator
            'greg-1.4.tar.gz': ('greg-', '1.4', '.tar.gz'),

            # number in software product
            'aspell6-pt_BR-20070411-0.tar.bz2': (
                'aspell6-pt_BR-', '20070411-0', '.tar.bz2'),
            'libosip2-3.3.0.tar.gz': ('libosip2-', '3.3.0', '.tar.gz'),

            # other cases
            'hurd-F2-main.iso': ('hurd-F2-main', None, '.iso'),

            'winboard-4_0_5.exe': ('winboard-', '4_0_5', '.exe'),

            # particular patterns...
            'gift-0.1.9+3epsilon.tar.gz': (
                'gift-', '0.1.9+3epsilon', '.tar.gz'),
            'gift-0.1.6pre2.tgz': ('gift-', '0.1.6pre2', '.tgz'),
            'binutils-2.19.1a.tar.bz2': ('binutils-', '2.19.1a', '.tar.bz2'),
            'readline-4.2-4.2a.diff.gz': ('readline-', '4.2-4.2a', '.diff.gz'),

            # with arch patterns
            'cvs-1.12.6-BSD.bin.gz': ('cvs-', '1.12.6-BSD.bin', '.gz'),
            'cvs-1.12.12-SunOS-5.8-i386.gz': (
                'cvs-', '1.12.12-SunOS-5.8-i386', '.gz'),
            'gnutls-3.0.20-w32.zip': ('gnutls-', '3.0.20-w32', '.zip'),
            'mit-scheme_7.7.90+20080130-0gutsy1.diff.gz': (
                'mit-scheme_', '7.7.90+20080130-0gutsy1', '.diff.gz'),

            # no release number
            'gnu.ps.gz': ('gnu', None, '.ps.gz'),
            'direvent-latest.tar.gz': ('direvent-latest', None, '.tar.gz'),
        }

        # Filenames made of an extension only: nothing to parse.
        cls.files_error = ['.tar', '.anything']

    @istest
    def parse_filename(self):
        for f in self.files:
            # when
            actual_components = utils.parse_filename(f)

            # then
            name, version, ext = self.files[f]
            expected_components = {
                'software_name': name,
                'release_number': version,
                'extension': ext,
            }

            self.assertEqual(actual_components, expected_components)

    @istest
    def parse_filename_not_parseable_file(self):
        for f in self.files_error:
            with self.assertRaises(ValueError):
                utils.parse_filename(f)

    @istest
    def release_number(self):
        for f in self.files.keys():
            # when
            actual_rel_num = utils.release_number(f)

            # then
            _, expected_rel_num, _ = self.files[f]
            self.assertEqual(
                actual_rel_num,
                expected_rel_num,
                'for %s, the version should be %s' % (f, expected_rel_num))

    @istest
    def commonname(self):
        # when
        actual_commonname = utils.commonname('/some/where/to/',
                                             '/some/where/to/go/to')
        # then
        self.assertEqual('go/to', actual_commonname)

        # when
        actual_commonname2 = utils.commonname(b'/some/where/to/',
                                              b'/some/where/to/go/to')
        # then
        self.assertEqual(b'go/to', actual_commonname2)

    @istest
    def convert_to_hex(self):
        # given
        input_dict = {
            'sha1_git': b'\xf6\xb7 \x8b+\xcd \x9fq5E\xe6\x03\xffg\x87\xd7\xb9D\xa1',  # noqa
            'sha1': b'\xf4O\xf0\xd4\xc0\xb0\xae\xca\xe4C\xab%\x10\xf7\x12h\x1e\x9f\xac\xeb',  # noqa
            'sha256': b'\xa8\xf9=\xf3\xfek\xa2$\xee\xc7\x1b\xc2\x83\xca\x96\xae8\xaf&\xab\x08\xfa\xb1\x13\xec(.s]\xf6Yb'}  # noqa

        expected_dict = {'sha1_git': 'f6b7208b2bcd209f713545e603ff6'
                                     '787d7b944a1',
                         'sha1': 'f44ff0d4c0b0aecae443ab2510f712681e'
                                 '9faceb',
                         'sha256': 'a8f93df3fe6ba224eec71bc283ca96ae3'
                                   '8af26ab08fab113ec282e735df65962'}

        # when
        actual_dict = utils.convert_to_hex(input_dict)

        # then
        self.assertDictEqual(actual_dict, expected_dict)

    @istest
    def convert_to_hex_edge_cases(self):
        # when
        actual_dict = utils.convert_to_hex({})
        # then
        self.assertDictEqual(actual_dict, {})

        self.assertIsNone(utils.convert_to_hex(None))
# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re


# FIXME; extract this in property
# to recognize existing naming pattern
extensions = [
    'ps',
    'zip',
    'tar',
    'gz',
    'tgz',
    'bz2',
    'bzip2',
    'lzma',
    'lz',
    'xz',
    'Z',
    'diff',
    'iso',
    'exe',
    'jar',
    'egg',
    'gem',
    'xpi',
    'apk',
    'dmg',
    'DevPak',
]


# The group names below are the keys parse_filename reads from
# match.groupdict(); they must stay in sync with that function.
pattern = re.compile(r'''
^
(?:
    # We have a software name and a release number, separated with a
    # -, _ or dot.
    (?P<software_name1>.+?[-_.])
    (?P<release_number>[0-9][0-9a-zA-Z_.+:~-]*?)
|
    # We couldn't match a release number, put everything in the
    # software name.
    (?P<software_name2>.+?)
)
(?P<extension>(?:\.(?:%s))+)
$
''' % '|'.join(extensions),
                     flags=re.VERBOSE)


def parse_filename(filename):
    """Parse a filename into its components.

    Parsing policy:
    We use Debian's release number heuristic: A release number starts
    with a digit, and is followed by alphanumeric characters or any of
    ., +, :, ~ and -

    We hardcode a list of possible extensions, as this release number
    scheme would match them too... We match on any combination of those.

    Greedy matching is done right to left (we only match the extension
    greedily with +, software_name and release_number are matched lazily
    with +? and *?).

    Args:
        filename: filename without path.

    Returns:
        Dictionary with the following keys:
        - software_name
        - release_number: can be None if it could not be found.
        - extension

    Raises:
        ValueError if the filename could not be parsed.

    """
    m = pattern.match(filename)
    if not m:
        raise ValueError('Filename %s could not be parsed.' % filename)

    d = m.groupdict()
    return {
        # Exactly one of the two software name groups matched, depending
        # on whether a release number was found.
        'software_name': d['software_name1'] or d['software_name2'],
        'release_number': d['release_number'],
        'extension': d['extension'],
    }


def release_number(filename):
    """Compute the release number from the filename.

    cf. parse_filename's docstring

    Args:
        filename: filename without path.

    Returns:
        The release number as a string, or None if none could be found.

    Raises:
        ValueError if the filename could not be parsed.

    """
    return parse_filename(filename)['release_number']


def commonname(path0, path1, as_str=False):
    """Compute the commonname between the path0 and path1.

    Args:
        path0: leading part to strip; str or bytes, same type as path1.
        path1: full path containing path0.
        as_str: currently unused, kept for backward compatibility.

    Returns:
        What follows the first occurrence of path0 in path1, with the
        same type as path1.

    Raises:
        IndexError if path0 does not occur in path1.
        ValueError if path0 is empty.

    """
    # Split only once: a later repetition of path0 inside path1 belongs to
    # the result and must not truncate it.
    return path1.split(path0, 1)[1]


def convert_to_hex(d):
    """Convert a flat dictionary with bytes in values to the same dictionary
       with hex as values.

    Args:
        d: flat dictionary with sha bytes in their values.

    Returns:
        Mirror dictionary with values as string hex. A falsy input (None
        or an empty dictionary) is returned as is.

    """
    if not d:
        return d

    # Imported here so the filename helpers of this module remain usable
    # without swh.core being importable.
    from swh.core import hashutil

    return {key_hash: hashutil.hash_to_hex(d[key_hash]) for key_hash in d}