#!/usr/bin/env python3

# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Walk a mirror directory and submit one loading task per tarball found."""

import os
import sys
import tarfile

from swh.core import config

from swh.loader.dir import producer
from swh.loader.dir import tasks


# Static setup
EPOCH = 0
UTC_OFFSET = '+0000'
SWH_PERSON = 'Software Heritage'
SWH_MAIL = 'robot@swh.org'
REVISION_MESSAGE = 'synthetic message'
REVISION_TYPE = 'tar'
REVISION = {
    'author_date': EPOCH,
    'author_offset': UTC_OFFSET,
    'author_name': SWH_PERSON,
    'author_email': SWH_MAIL,
    'committer_date': EPOCH,
    'committer_offset': UTC_OFFSET,
    'committer_name': SWH_PERSON,
    'committer_email': SWH_MAIL,
    'type': REVISION_TYPE,
    'message': REVISION_MESSAGE,
}
SWH_AUTHORITY = 1
GNU_AUTHORITY = 2

# Maximum number of messages to submit; any falsy value disables the cap.
LIMIT = 1
# LIMIT = None

# Configuration file used when none is given on the command line, or when
# the given one does not exist.
DEFAULT_CONF_FILE = '../resources/producer/tar.ini'


def compute_origin(config, filepath):
    """Compute the origin.

    Args:
        - config: configuration dict with url_scheme and type keys.
          (The name shadows the imported ``config`` module inside this
          function only.)
        - filepath: file's path

    Returns:
        Dictionary origin with keys:
       - url: origin's url
       - type: origin's type

    """
    # NOTE(review): os.path.join discards config['url_scheme'] entirely when
    # filepath is absolute -- confirm the mirror root is configured so that
    # the resulting url is the intended one.
    return {
        'url': os.path.join(config['url_scheme'], filepath),
        'type': config['type'],
    }


def _occurrence(filepath, authority):
    """Build the occurrence dict shared by the swh and gnu variants."""
    return {
        'branch': os.path.dirname(filepath),
        'authority': authority,
        'validity': time_from_file(filepath),  # FIXME: Use the right time
    }


def swh_occurrence(filepath):
    """Compute the occurrence from the filepath with swh authority.

    Args:
        filepath: file's path

    Return:
        Occurrence.
        - branch: occurrence's branch name (the file's directory)
        - authority: swh authority
        - validity: the file's modification time

    """
    return _occurrence(filepath, SWH_AUTHORITY)


def gnu_occurrence(filepath):
    """Compute the occurrence from the filepath with gnu authority.

    Args:
        filepath: file's path

    Return:
        Occurrence.
        - branch: occurrence's branch name (the file's directory)
        - authority: gnu authority
        - validity: the file's modification time

    """
    return _occurrence(filepath, GNU_AUTHORITY)


def compute_occurrences(filepath):
    """Compute the occurrences from filepath.

    Args:
        filepath: file's path

    Returns:
        list of occurrences from filepath: the gnu one, then the swh one.

    """
    return [gnu_occurrence(filepath), swh_occurrence(filepath)]


def time_from_file(filepath):
    """Extract time from filepath.

    Args:
        filepath: path to the file we want to extract metadata

    Returns:
        Modification time of filepath, as reported by os.lstat (symbolic
        links are therefore not followed).

    """
    return os.lstat(filepath).st_mtime


def compute_release(filename, filepath):
    """Compute a release from a given filepath, filename.

    If the filepath does not contain a recognizable release number, the
    release can be skipped.

    Args:
        filename: file's name without path
        filepath: file's absolute path

    Returns:
        None if the release number cannot be extracted from the filename.
        Otherwise a synthetic release is computed with the following keys:
            - name: the release computed from the filename
            - date: the modification timestamp as returned by os.lstat
            - offset: +0000
            - author_name: ''
            - author_email: ''
            - comment: ''

    """
    release_number = producer.release_number(filename)
    if not release_number:
        return None

    return {
        'name': release_number,
        'date': time_from_file(filepath),
        'offset': UTC_OFFSET,
        'author_name': '',
        'author_email': '',
        'comment': '',
    }


def _is_tarball(filepath):
    """Tell whether filepath is a regular file that tarfile can read."""
    # Only regular files are probed: opening a FIFO or a device node could
    # block, and dangling symlinks cannot be opened at all.
    if not os.path.isfile(filepath):
        return False
    try:
        return tarfile.is_tarfile(filepath)
    except OSError as e:
        # is_tarfile only swallows tarfile errors; an unreadable file must
        # not abort the whole walk, so report it and move on.
        print('Skipping unreadable file %s: %s' % (filepath, e),
              file=sys.stderr)
        return False


def list_archives_from(path):
    """Walk path and yield every tarball found below it.

    A file is selected on its content (tarfile.is_tarfile), not on its
    name.

    Args:
        path: top directory to list archives from.

    Yields:
        (dirpath, filename) pairs, one per tarball.

    """
    for dirpath, _dirnames, filenames in os.walk(path):
        for fname in filenames:
            if _is_tarball(os.path.join(dirpath, fname)):
                yield dirpath, fname


def compute_message_from(conf, dirpath, filename):
    """Post the message to workers.

    Args:
        conf: dictionary holding static metadata
        dirpath: directory containing the filename
        filename: filename without any path

    Returns:
        None

    """
    filepath = os.path.join(dirpath, filename)
    origin = compute_origin(conf, filepath)
    occurrences = compute_occurrences(filepath)
    release = compute_release(filename, filepath)

    task = tasks.LoadTarRepository()
    # task = tasks.LoadTarRepositoryPrint()  # debug variant
    task.apply_async((filepath, origin, REVISION, release, occurrences))


def produce_archive_messages(conf, path):
    """From path, produce archive tarball message to celery.

    At most LIMIT messages are produced when LIMIT is set; a falsy LIMIT
    means every archive found is submitted.

    Args:
        conf: dictionary holding static metadata
        path: top directory to list archives from.

    Returns:
        None

    """
    for sent, (dirpath, filename) in enumerate(list_archives_from(path)):
        # Check before sending so that exactly LIMIT messages go out.
        if LIMIT and sent >= LIMIT:
            break
        compute_message_from(conf, dirpath, filename)


def main(argv):
    """Read the configuration and produce the messages.

    Args:
        argv: command line; argv[1], when present and existing, is the
            configuration file. Otherwise DEFAULT_CONF_FILE is used.

    """
    conf_file = argv[1] if len(argv) > 1 else DEFAULT_CONF_FILE
    if not os.path.exists(conf_file):
        conf_file = DEFAULT_CONF_FILE

    conf = config.read(conf_file)
    produce_archive_messages(conf, conf['mirror_root_directory'])


if __name__ == '__main__':
    main(sys.argv)
# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Helpers extracting a release number from an archive's filename."""

import re
import itertools


def init_archive_extension_pattern(exts):
    """Given a list of extensions, return the regexp for exts.

    Double extensions (every ordered pair of exts, e.g. ``.tar.gz``) are
    listed before single ones so that the longest form is tried first by
    the regexp alternation.

    Args:
        exts: extensions without their leading dot. They are inserted
            verbatim in the pattern (no escaping).

    Returns:
        The alternation pattern as a string ('' when exts is empty).

    """
    doubles = [r'\.' + r'\.'.join(pair)
               for pair in itertools.product(exts, repeat=2)]
    singles = [r'\.' + ext for ext in exts]
    return '|'.join(doubles + singles)


# FIXME; extract this in property
# to recognize existing naming pattern
archive_extension_patterns = [
    'zip',
    'tar',
    'gz', 'tgz',
    'bz2', 'bzip2',
    'lzma', 'lz',
    'xz',
    'Z',
]


re_archive_patterns = re.compile(
    init_archive_extension_pattern(archive_extension_patterns),
    flags=re.IGNORECASE)

software_name_pattern = re.compile('([a-zA-Z-_]*[0-9]*[a-zA-Z-_]*)')

digit_pattern = re.compile('[0-9]')

release_pattern = re.compile('[0-9.]+')


def _extension(filename):
    """Return the first archive extension found in filename, else None."""
    m = re_archive_patterns.search(filename)
    if m:
        return m.group()
    return None


def release_number(filename):
    """Compute the release number from the filename.

    The release is what remains of the filename once the software name
    and the archive extension are removed.

    Args:
        filename: the filename without any path,
            e.g. 'free-ipmi-1.2.2.tar.gz'

    Returns:
        The release as a string (e.g. '1.2.2'), or None when the filename
        has no known archive extension or when what remains does not
        start like a release number.

    """
    name = _software_name(filename)
    ext = _extension(filename)
    if not ext:
        return None

    version = filename.replace(name, '').replace(ext, '')
    if not version:
        return None

    # some filename use . for delimitation
    # not caught by regexp so filtered here
    if version[0] == '.':
        version = version[1:]

    # what remains must start like a release number
    if not release_pattern.match(version):
        return None
    return version


def _software_name(filename):
    """Compute the software name from the filename.

    Returns:
        The leading part of filename matched by software_name_pattern,
        without a trailing digit (which belongs to the version). This is
        '' when the filename starts with a character the pattern does not
        accept.

    """
    res = software_name_pattern.match(filename).group()
    # The pattern can match the empty string (e.g. '.hidden.tar.gz'), so
    # check res before looking at its last character.
    if res and digit_pattern.match(res[-1]):  # remains first version number
        return res[:-1]
    return res
# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
import shutil
import tempfile
import tarfile

from swh.core.scheduling import Task

from swh.loader.dir.loader import DirLoader


class LoadDirRepository(Task):
    """Import a directory to Software Heritage

    """
    task_queue = 'swh_loader_dir'

    CONFIG_BASE_FILENAME = 'loader/dir.ini'
    ADDITIONAL_CONFIG = {}

    def __init__(self):
        # Subclasses customise the configuration through the two class
        # attributes above.
        self.config = DirLoader.parse_config_file(
            base_filename=self.CONFIG_BASE_FILENAME,
            additional_configs=[self.ADDITIONAL_CONFIG],
        )

    def run(self, dir_path, origin, revision, release, occurrences):
        """Import a directory.

        Args:
            cf. swh.loader.dir.loader.run docstring

        """
        loader = DirLoader(self.config)
        loader.log = self.log
        loader.process(dir_path, origin, revision, release, occurrences)


def untar(tar_path, dir_path):
    """Decompress an archive tar_path to dir_path.

    At the end of this call, dir_path contains the tarball's
    uncompressed content.

    Args:
        tar_path: the path to access the tarball
        dir_path: The path where to extract the tarball's content.

    Raises:
        ValueError: if a member's path resolves outside dir_path. Nothing
            is extracted in that case.
        tarfile.TarError: if tar_path cannot be read as a tarball.

    """
    root = os.path.realpath(dir_path)
    with tarfile.open(tar_path) as tarball:
        # extractall trusts member names as-is; refuse archives holding
        # absolute or '..' paths that would write outside of dir_path.
        for member in tarball.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if os.path.commonpath([root, target]) != root:
                raise ValueError(
                    'Refusing to extract %s: member %s escapes %s' % (
                        tar_path, member.name, dir_path))
        tarball.extractall(path=dir_path)


class LoadTarRepository(LoadDirRepository):
    """Import a tarball to Software Heritage

    """
    task_queue = 'swh_loader_tar'

    CONFIG_BASE_FILENAME = 'loader/tar.ini'
    ADDITIONAL_CONFIG = {
        'extraction_dir': ('str', '/tmp/swh.loader.tar/'),
    }

    def run(self, tar_path, origin, revision, release, occurrences):
        """Import a tarball tar_path.

        The tarball is extracted in a fresh temporary directory below the
        configured extraction_dir, loaded as a directory, and the
        temporary directory is removed whatever the outcome.

        Args:
            - tar_path: path access to the tarball
            - origin, revision, release, occurrences:
              see LoadDirRepository.run

        """
        extraction_dir = self.config['extraction_dir']
        # mkdtemp does not create its parent directory.
        os.makedirs(extraction_dir, exist_ok=True)
        dir_path = tempfile.mkdtemp(prefix='swh.loader.tar',
                                    dir=extraction_dir)

        if 'type' not in origin:  # let the type flow if present
            origin['type'] = 'tar'

        try:
            # Extraction is inside the try block so that a corrupt tarball
            # does not leave its temporary directory behind.
            untar(tar_path, dir_path)
            super().run(dir_path, origin, revision, release, occurrences)
        finally:  # always clean up
            shutil.rmtree(dir_path)


class LoadTarRepositoryPrint(LoadDirRepository):
    """Import a tarball to Software Heritage

    DEBUG purposes: the arguments are printed, nothing is loaded.

    """
    task_queue = 'swh_loader_tar_print'

    def run(self, tar_path, origin, revision, release, occurrences):
        """Print the arguments LoadTarRepository.run would receive.

        Args:
            - tar_path: path access to the tarball
            - origin, revision, release, occurrences:
              see LoadDirRepository.run

        """
        print(tar_path, origin, revision, release, occurrences)
# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import unittest

from nose.tools import istest

from swh.loader.dir import producer


class TestProducer(unittest.TestCase):
    """Check the release number extraction on real-world filenames."""

    @istest
    def compute_basic_release_number(self):
        # given: filename -> expected release number (None when there is
        # no release number to extract)
        files = {
            'free-ipmi-1.2.2.tar': '1.2.2',
            'free-ipmi-1.2.2.tar.gz': '1.2.2',
            'free-ipmi-1.2.2.tar.tgz': '1.2.2',
            'gcc-testsuite-4.4.2-4.4.3.diff.bz2': '4.4.2-4.4.3.diff',
            'gcc-java-4.0.4.tar.gz': '4.0.4',
            'gmp-2.0.tar.lzma': '2.0',
            'win-gerwin-0.6.zip': '0.6',
            'ballandpaddle-0.8.0.tar.xz': '0.8.0',
            'mail-1.1.1.some.lz': '1.1.1.some',
            'gmp-4.1.1-4.1.2.diff.tar.Z': '4.1.1-4.1.2.diff',
            'findutils-4.2.18.tar.bzip2': '4.2.18',
            'greg-1.4.tar.gz': '1.4',

            # . separator
            'greg.1.4.tar.gz': '1.4',

            # number in software product
            'aspell6-pt_BR-20070411-0.tar.bz2': '20070411-0',
            'libosip2-3.3.0.tar.gz': '3.3.0',

            # particular patterns...
            'gift-0.1.9+3epsilon.tar.gz': '0.1.9+3epsilon',
            'gift-0.1.6pre2.tgz': '0.1.6pre2',
            'binutils-2.19.1a.tar.bz2': '2.19.1a',
            'readline-4.2-4.2a.diff.gz': '4.2-4.2a.diff',

            # with arch patterns
            'cvs-1.12.6-BSD.bin.gz': '1.12.6-BSD.bin',
            'cvs-1.12.12-SunOS-5.8-i386.gz': '1.12.12-SunOS-5.8-i386',
            'gnutls-3.0.20-w32.zip': '3.0.20-w32',
            'mit-scheme_7.7.90+20080130-0gutsy1.diff.gz':
            '7.7.90+20080130-0gutsy1.diff',

            # no release number
            'gnu.ps.gz': None,
            'direvent-latest.tar.gz': None,
        }

        # then
        for filename, expected in files.items():
            # subTest reports every failing filename instead of stopping
            # at the first one.
            with self.subTest(filename=filename):
                rel_num = producer.release_number(filename)
                self.assertEqual(
                    expected,
                    rel_num,
                    'for %s, the version should be %s' % (
                        filename, expected))