Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/swh/loader/tar/loader.py b/swh/loader/tar/loader.py
index ab17f0b..5ccae39 100644
--- a/swh/loader/tar/loader.py
+++ b/swh/loader/tar/loader.py
@@ -1,113 +1,116 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import os
import tempfile
import shutil
import sys
import traceback
+from swh.core import hashutil
from swh.loader.dir import loader
-from swh.loader.tar import tarball
+from swh.loader.tar import tarball, utils
class TarLoader(loader.DirLoader):
"""A tarball loader.

Specializes loader.DirLoader: the tarball is uncompressed into a temporary
directory and that directory is then handed to the parent's process().
"""
def __init__(self, config):
# Let the parent loader consume the configuration, then attach a logger
# named after this class.
super().__init__(config)
self.log = logging.getLogger('swh.loader.tar.TarLoader')
def process(self, tarpath, origin, revision, release, occurrences):
"""Load a tarball into the storage backend.

This will:
- persist the origin if it does not exist.
- write an entry in fetch_history to mark the loading tarball start
- compute the tarball's checksums and attach them to the revision
- uncompress locally the tarballs in a temporary location
- process the content of the tarballs to persist on swh storage
- clean up the temporary location
- write an entry in fetch_history to mark the loading tarball end

Note that `origin` and `revision` are modified in place: `origin` gains
the keys 'type' (when absent) and 'id', and `revision['metadata']` is
set to a dictionary holding the tarball's checksums.

Args:
- tarpath: path to the tarball to uncompress
- origin: Dictionary origin
- url: url origin we fetched
- type: type of the origin (defaults to 'tar' when absent)
- revision: Dictionary of information needed, keys are:
- author_name: revision's author name
- author_email: revision's author email
- author_date: timestamp (e.g. 1444054085)
- author_offset: date offset e.g. -0220, +0100
- committer_name: revision's committer name
- committer_email: revision's committer email
- committer_date: timestamp
- committer_offset: date offset e.g. -0220, +0100
- type: type of revision dir, tar
- message: synthetic message for the revision
- release: Dictionary of information needed, keys are:
- name: release name
- date: release timestamp (e.g. 1444054085)
- offset: release date offset e.g. -0220, +0100
- author_name: release author's name
- author_email: release author's email
- comment: release's comment message
- occurrences: List of occurrence dictionary.
Information needed, keys are:
- branch: occurrence's branch name
- authority_id: authority id (e.g. 1 for swh)
- validity: validity date (e.g. 2015-01-01 00:00:00+00)

Raises:
Any exception raised while uncompressing or while the parent's
process() runs is re-raised after the error details have been
recorded in the result dictionary.
"""
if 'type' not in origin: # let the type flow if present
origin['type'] = 'tar'
# Keep the identifier returned by the storage on the origin itself; it is
# needed to open the fetch history entry below.
origin['id'] = self.storage.origin_add_one(origin)
# Mark the start of the loading
fetch_history_id = self.open_fetch_history(origin['id'])
# Prepare the extraction path
extraction_dir = self.config['extraction_dir']
# exist_ok=True: an already existing extraction directory is not an error.
os.makedirs(extraction_dir, 0o755, exist_ok=True)
# Every call extracts into its own freshly created subdirectory.
dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-',
dir=extraction_dir)
# T62:
# - create tarball as content in storage
# - transit the information to the loader dir
- # T22: add checksums in revision
+ # add checksums in revision
# The result of hashutil.hashfile goes through utils.convert_to_hex before
# being stored. The assignment below replaces any 'metadata' entry the
# caller may already have put on the revision.
# NOTE(review): this runs before the `try` below, so an exception raised
# here presumably leaves dir_path on disk and the fetch history entry
# open - confirm and consider moving it inside the `try`.
+ hashes = utils.convert_to_hex(hashutil.hashfile(tarpath))
+ revision['metadata'] = {'checksums': hashes}
# for edge cases (NotImplemented...)
# Default, failed result: used as is when an exception occurs before the
# parent's process() has returned.
result = {'status': False, 'stderr': ''}
try:
self.log.info('Uncompress %s to %s' % (tarpath, dir_path))
tarball.uncompress(tarpath, dir_path)
result = super().process(dir_path, origin, revision, release,
occurrences)
# Deliberately catches everything: the exception is only inspected to
# enrich the result, then re-raised unchanged.
except:
e_info = sys.exc_info()
if not result['status']:
# Enrich the error message with the tarball
result['stderr'] = 'reason:%s\ntrace:%s\n%s' % (
e_info[1],
''.join(traceback.format_tb(e_info[2])),
result.get('stderr', ''))
raise
finally:
# Always remove the temporary extraction directory.
shutil.rmtree(dir_path)
# NOTE(review): indentation is not visible in this listing; the status
# check and close_fetch_history() below presumably belong to the `finally`
# clause (otherwise they would be skipped when the exception is
# re-raised) - confirm against the repository.
if not result['status']:
# Prefix the failure details with the path of the offending archive.
result['stderr'] = 'archive:%s\nreason:%s' % (
tarpath,
result.get('stderr', ''))
# mark the end of the loading
self.close_fetch_history(fetch_history_id, result)
diff --git a/swh/loader/tar/tests/test_utils.py b/swh/loader/tar/tests/test_utils.py
index 0dcdaeb..3dd870b 100644
--- a/swh/loader/tar/tests/test_utils.py
+++ b/swh/loader/tar/tests/test_utils.py
@@ -1,127 +1,157 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
from nose.tools import istest
from swh.loader.tar import utils
class TestUtils(unittest.TestCase):
    """Tests for the helpers of swh.loader.tar.utils.

    Covers filename parsing (parse_filename, release_number), commonname and
    convert_to_hex.
    """

    @classmethod
    def setupClass(cls):
        # Maps a filename to the (software_name, release_number, extension)
        # triple that is expected to be extracted from it.
        cls.files = {
            'free-ipmi-1.2.2.tar': ('free-ipmi-', '1.2.2', '.tar'),
            'free-ipmi-1.2.2.tar.gz': ('free-ipmi-', '1.2.2', '.tar.gz'),
            'free-ipmi-1.2.2.tar.tgz': ('free-ipmi-', '1.2.2', '.tar.tgz'),
            'gcc-testsuite-4.4.2-4.4.3.diff.bz2': (
                'gcc-testsuite-', '4.4.2-4.4.3', '.diff.bz2'),
            'gcc-java-4.0.4.tar.gz': ('gcc-java-', '4.0.4', '.tar.gz'),
            'gmp-2.0.tar.lzma': ('gmp-', '2.0', '.tar.lzma'),
            'win-gerwin-0.6.zip': ('win-gerwin-', '0.6', '.zip'),
            'ballandpaddle-0.8.0.tar.xz': (
                'ballandpaddle-', '0.8.0', '.tar.xz'),
            'mail-1.1.1.some.lz': ('mail-', '1.1.1.some', '.lz'),
            'gmp-4.1.1-4.1.2.diff.tar.Z': (
                'gmp-', '4.1.1-4.1.2', '.diff.tar.Z'),
            'findutils-4.2.18.tar.bzip2': (
                'findutils-', '4.2.18', '.tar.bzip2'),
            'gnunet-java-0.9.4.jar': ('gnunet-java-', '0.9.4', '.jar'),
            'pycdio-0.15-py2.5-linux-i686.egg': (
                'pycdio-', '0.15-py2.5-linux-i686', '.egg'),
            'rbcdio-0.04.gem': ('rbcdio-', '0.04', '.gem'),
            'librejs-6.0.5.xpi': ('librejs-', '6.0.5', '.xpi'),
            'icecat-31.8.0.csb.langpack.xpi': (
                'icecat-', '31.8.0.csb.langpack', '.xpi'),
            'icecatmobile-31.8.0.en-US.android-arm.apk': (
                'icecatmobile-', '31.8.0.en-US.android-arm', '.apk'),
            'icecat-31.8.0.en-US.mac.dmg': (
                'icecat-', '31.8.0.en-US.mac', '.dmg'),
            'gnutls-3.0.21-1gn.DevPak': ('gnutls-', '3.0.21-1gn', '.DevPak'),

            # . separator
            'greg-1.4.tar.gz': ('greg-', '1.4', '.tar.gz'),

            # number in software product
            'aspell6-pt_BR-20070411-0.tar.bz2': (
                'aspell6-pt_BR-', '20070411-0', '.tar.bz2'),
            'libosip2-3.3.0.tar.gz': ('libosip2-', '3.3.0', '.tar.gz'),

            # other cases
            'hurd-F2-main.iso': ('hurd-F2-main', None, '.iso'),

            'winboard-4_0_5.exe': ('winboard-', '4_0_5', '.exe'),

            # particular patterns...
            'gift-0.1.9+3epsilon.tar.gz': (
                'gift-', '0.1.9+3epsilon', '.tar.gz'),
            'gift-0.1.6pre2.tgz': ('gift-', '0.1.6pre2', '.tgz'),
            'binutils-2.19.1a.tar.bz2': ('binutils-', '2.19.1a', '.tar.bz2'),
            'readline-4.2-4.2a.diff.gz': ('readline-', '4.2-4.2a', '.diff.gz'),

            # with arch patterns
            'cvs-1.12.6-BSD.bin.gz': ('cvs-', '1.12.6-BSD.bin', '.gz'),
            'cvs-1.12.12-SunOS-5.8-i386.gz': (
                'cvs-', '1.12.12-SunOS-5.8-i386', '.gz'),
            'gnutls-3.0.20-w32.zip': ('gnutls-', '3.0.20-w32', '.zip'),
            'mit-scheme_7.7.90+20080130-0gutsy1.diff.gz': (
                'mit-scheme_', '7.7.90+20080130-0gutsy1', '.diff.gz'),

            # no release number
            'gnu.ps.gz': ('gnu', None, '.ps.gz'),
            'direvent-latest.tar.gz': ('direvent-latest', None, '.tar.gz'),
        }

        # Filenames for which parsing is expected to fail.
        cls.files_error = ['.tar', '.anything']

    # Every known filename is split into the expected three components.
    @istest
    def parse_filename(self):
        for f in self.files:
            # when
            actual_components = utils.parse_filename(f)

            # then
            name, version, ext = self.files[f]
            expected_components = {
                'software_name': name,
                'release_number': version,
                'extension': ext,
            }

            # assertEqual: the assertEquals alias is deprecated and was
            # removed from unittest in Python 3.12.
            self.assertEqual(actual_components, expected_components)

    # Unparseable filenames are reported with a ValueError.
    @istest
    def parse_filename_not_parseable_file(self):
        for f in self.files_error:
            with self.assertRaises(ValueError):
                utils.parse_filename(f)

    # release_number returns only the version part of the parsed filename.
    @istest
    def release_number(self):
        for f in self.files:
            # when
            actual_ext = utils.release_number(f)

            # then
            _, expected_rel_num, _ = self.files[f]
            self.assertEqual(
                actual_ext,
                expected_rel_num,
                'for %s, the version should be %s' % (f, expected_rel_num))

    # commonname works the same way on str and on bytes paths.
    @istest
    def commonname(self):
        # when
        actual_commonname = utils.commonname('/some/where/to/',
                                             '/some/where/to/go/to')
        # then
        self.assertEqual('go/to', actual_commonname)

        # when
        actual_commonname2 = utils.commonname(b'/some/where/to/',
                                              b'/some/where/to/go/to')
        # then
        self.assertEqual(b'go/to', actual_commonname2)

    # Raw digests are turned into their hexadecimal string representation.
    @istest
    def convert_to_hex(self):
        # given
        input_dict = {
            'sha1_git': b'\xf6\xb7 \x8b+\xcd \x9fq5E\xe6\x03\xffg\x87\xd7\xb9D\xa1',  # noqa
            'sha1': b'\xf4O\xf0\xd4\xc0\xb0\xae\xca\xe4C\xab%\x10\xf7\x12h\x1e\x9f\xac\xeb',  # noqa
            'sha256': b'\xa8\xf9=\xf3\xfek\xa2$\xee\xc7\x1b\xc2\x83\xca\x96\xae8\xaf&\xab\x08\xfa\xb1\x13\xec(.s]\xf6Yb'}  # noqa

        expected_dict = {'sha1_git': 'f6b7208b2bcd209f713545e603ff6'
                                     '787d7b944a1',
                         'sha1': 'f44ff0d4c0b0aecae443ab2510f712681e'
                                 '9faceb',
                         'sha256': 'a8f93df3fe6ba224eec71bc283ca96ae3'
                                   '8af26ab08fab113ec282e735df65962'}

        # when
        actual_dict = utils.convert_to_hex(input_dict)

        # then
        self.assertDictEqual(actual_dict, expected_dict)

    # Empty and None inputs are handed back untouched.
    @istest
    def convert_to_hex_edge_cases(self):
        # when
        actual_dict = utils.convert_to_hex({})
        # then
        self.assertDictEqual(actual_dict, {})

        self.assertIsNone(utils.convert_to_hex(None))
diff --git a/swh/loader/tar/utils.py b/swh/loader/tar/utils.py
index 5440bca..4f5d9db 100644
--- a/swh/loader/tar/utils.py
+++ b/swh/loader/tar/utils.py
@@ -1,104 +1,127 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import re
+from swh.core import hashutil
+
# FIXME: extract this list into a property instead of hardcoding it.
# Known extensions, listed so that the existing naming patterns can be
# recognized (see the `pattern` regular expression below).
extensions = [
    'ps', 'zip', 'tar',
    'gz', 'tgz', 'bz2', 'bzip2', 'lzma', 'lz', 'xz', 'Z',
    'diff', 'iso', 'exe', 'jar', 'egg', 'gem', 'xpi', 'apk', 'dmg',
    'DevPak',
]


# Verbose regular expression splitting a filename into a software name, an
# optional release number and a run of one or more known extensions. The
# alternation of known extensions is spliced in through the %s placeholder.
pattern = re.compile(
    r'''
^
(?:
# We have a software name and a release number, separated with a
# -, _ or dot.
(?P<software_name1>.+?[-_.])
(?P<release_number>[0-9][0-9a-zA-Z_.+:~-]*?)
|
# We couldn't match a release number, put everything in the
# software name.
(?P<software_name2>.+?)
)
(?P<extension>(?:\.(?:%s))+)
$
''' % '|'.join(extensions),
    re.VERBOSE)
def parse_filename(filename):
    """Split a filename into software name, release number and extension.

    Parsing policy:

    The release number follows Debian's heuristic: it starts with a digit
    and continues with alphanumeric characters or any of ., +, :, ~ and -

    Since that scheme would also swallow file extensions, the possible
    extensions are hardcoded and any combination of them is accepted at
    the end of the filename.

    Only the extension is matched greedily (with +); software_name and
    release_number are matched lazily (with +? and *?), so the greedy
    matching effectively goes from right to left.

    Args:
        filename: filename without path.

    Returns:
        Dictionary with the following keys:
        - software_name
        - release_number: None when no release number could be found.
        - extension

    Raises:
        ValueError if the filename could not be parsed.

    """
    match = pattern.match(filename)
    if match is None:
        raise ValueError('Filename %s could not be parsed.' % filename)

    groups = match.groupdict()

    # Only one of the two software name groups is filled in, depending on
    # whether a release number was recognized.
    software_name = groups['software_name1'] or groups['software_name2']

    return dict(
        software_name=software_name,
        release_number=groups['release_number'],
        extension=groups['extension'],
    )
def release_number(filename):
    """Extract the release number from a filename.

    Thin wrapper around parse_filename (see its docstring for the parsing
    policy); the result is None when no release number was found.

    """
    components = parse_filename(filename)
    return components['release_number']
def commonname(path0, path1, as_str=False):
    """Compute the commonname between the path0 and path1.

    The commonname is what follows the first occurrence of path0 inside
    path1, e.g. 'go/to' for '/some/where/to/' and '/some/where/to/go/to'.

    Args:
        path0: prefix to strip (str or bytes, same type as path1).
        path1: path containing path0.
        as_str: unused, kept for backward compatibility.

    Returns:
        The part of path1 located after the first occurrence of path0.

    Raises:
        IndexError if path0 does not occur in path1.
        ValueError if path0 is empty.

    """
    # Split once only: without maxsplit, a later occurrence of path0 inside
    # path1 would cut the result short (e.g. '/a/' and '/a/b/a/c' gave 'b'
    # instead of 'b/a/c').
    return path1.split(path0, 1)[1]
+
+
def convert_to_hex(d):
    """Build the hexadecimal counterpart of a flat dictionary of hashes.

    Args:
        d: flat dictionary whose values are sha bytes.

    Returns:
        A new dictionary with the same keys, each value converted with
        hashutil.hash_to_hex. A falsy input (None, empty dictionary) is
        returned as is.

    """
    if not d:
        return d

    return {key_hash: hashutil.hash_to_hex(d[key_hash]) for key_hash in d}

File Metadata

Mime Type
text/x-diff
Expires
Jul 4 2025, 6:32 PM (5 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3236124

Event Timeline