diff --git a/bin/swh-loader-tar-producer b/bin/swh-loader-tar-producer
new file mode 100755
index 0000000..d2da48c
--- /dev/null
+++ b/bin/swh-loader-tar-producer
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import sys
+import os
+
+from swh.core import config
+from swh.loader.dir import producer
+
+
+DEFAULT_CONF_FILE = '../resources/producer/tar.ini'
+
+# Use the default configuration when no argument is given or when the given
+# file does not exist.
+conf_file = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_CONF_FILE
+if not os.path.exists(conf_file):
+    conf_file = DEFAULT_CONF_FILE
+
+
+conf = config.read(conf_file)
+
+
+def compute_origin(config, filename):
+    """Compute the origin of the archive named filename.
+
+    Args:
+        config: mapping with the 'origin_url_scheme' and 'origin_type' keys.
+        filename: name of the archive, appended to the origin url scheme.
+
+    Returns:
+        A dictionary with the 'origin_url' and 'origin_type' keys.
+
+    """
+    return {
+        'origin_url': os.path.join(config['origin_url_scheme'], filename),
+        'origin_type': config['origin_type'],
+    }
+
+
+def compute_release_number(filename):
+    """Compute the release number from a given filename.
+
+    """
+    pass
+
+
+def compute_occurrence(filepath):
+    pass
+
+
+def compute_occurrences(filepath):
+    pass
+
+
+def compute_release(filepath):
+    """Compute a release from a given filepath.
+    If the filepath does not contain a recognizable release number, the release
+    can be skipped.
+    """
+    pass
+
+
+def compute_revision(filepath):
+    pass
+
+
+def compute_from_filepath(filepath):
+    pass
+
+
+def list_archives_from(path):
+    """List the archives found under path, recursively.
+
+    Args:
+        path: top directory to list archives from.
+
+    Yields:
+        Tuples (dirpath, filename), one per archive found.
+
+    """
+    for dirpath, _, filenames in os.walk(path):
+        for fname in filenames:
+            if producer.is_archive(fname):
+                yield dirpath, fname
+
+
+# Maximum number of archives to process; None (or 0) means no limit.
+# LIMIT = 100
+LIMIT = None
+
+
+def compute_message_from(dirpath, filename):
+    """Print the archive filename and its release number, '|' separated.
+
+    Args:
+        dirpath: directory holding the archive (not used yet).
+        filename: name of the archive.
+
+    """
+    # filepath = os.path.join(dirpath, filename)
+
+    # release_number returns None when it finds no release number, which
+    # str.join does not accept.
+    version = producer.release_number(filename) or ''
+    print('|'.join(['', filename, version, '']))
+
+
+def produce_archive_messages(path):
+    """Produce one message per archive found under path.
+
+    At most LIMIT archives are processed when LIMIT is set.
+
+    Args:
+        path: top directory to list archives from.
+
+    """
+    for count, (dirpath, filename) in enumerate(list_archives_from(path)):
+        if LIMIT and count >= LIMIT:
+            return
+        compute_message_from(dirpath, filename)
+
+
+produce_archive_messages(conf['mirror_root_directory'])
diff --git a/resources/producer/tar.ini b/resources/producer/tar.ini
new file mode 100644
index 0000000..a5eb895
--- /dev/null
+++ b/resources/producer/tar.ini
@@ -0,0 +1,15 @@
+[main]
+
+# mirror's root directory from which to produce the archive messages to load
+mirror_root_directory=/home/storage/space/mirrors/gnu.org/gnu
+# mirror_root_directory=/tmp/storage/space/mirrors/gnu.org/gnu
+
+# archive extension patterns (not yet used)
+archive_extensions = gz, tgz, bz2, bzip2, Z, lzma, lz, tar, xz, zip
+
+# special pattern cases (not yet used)
+archive_special = x86, x86_64, x64, i386, i686, AIX, BSD, SGI, SUN, HP-UX, HP, SunOS, w32, win32, pre, alpha, epsilon, beta
+
+
+origin_url_scheme = rsync://ftp.gnu.org/ftp/
+origin_type = ftp
diff --git a/swh/loader/dir/producer.py b/swh/loader/dir/producer.py
new file mode 100644
index 0000000..7f33f3e
--- /dev/null
+++ b/swh/loader/dir/producer.py
@@ -0,0 +1,191 @@
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import re
+import itertools
+
+
+def init_archive_extension_pattern(exts):
+    """Given a list of extensions, return the regexp for exts.
+
+    The regexp matches either two chained extensions (e.g. '.tar.gz') or a
+    single one (e.g. '.zip'). Chained extensions come first in the
+    alternation so that they take precedence over single ones.
+
+    Args:
+        exts: list of extensions, without the leading dot.
+
+    Returns:
+        The regexp, as a string, matching any of those extensions.
+
+    """
+    escaped = [re.escape(ext) for ext in exts]
+    res = [r'\.%s\.%s' % (p, pp)
+           for p, pp in itertools.product(escaped, repeat=2)]
+    res.extend(r'\.' + p for p in escaped)
+
+    return '|'.join(res)
+
+
+# FIXME; extract this in property
+# to recognize existing naming pattern
+archive_extension_patterns = [
+    'zip',
+    'tar',
+    'gz', 'tgz',
+    'bz2', 'bzip2',
+    'lzma', 'lz',
+    'xz',
+    'Z',
+]
+
+
+re_archive_patterns = re.compile(
+    init_archive_extension_pattern(archive_extension_patterns),
+    flags=re.IGNORECASE)
+software_name_pattern = re.compile(r'([a-zA-Z_-]*[0-9]*[a-zA-Z_-]*)')
+digit_pattern = re.compile(r'[0-9]')
+release_pattern = re.compile(r'[0-9.]+')
+
+
+def is_archive(filename):
+    """Determine if the filename is an archive or not.
+
+    This is dependent on the filename only.
+
+    Args:
+        filename: the filename without any paths.
+
+    Returns:
+        Boolean True if an archive, False otherwise.
+
+    """
+    # the leading dot matters, 'gunzip' is not a zip archive
+    return filename.endswith(
+        tuple('.' + ext for ext in archive_extension_patterns))
+
+
+def _extension(filename):
+    """Compute the archive extension of the filename.
+
+    Returns:
+        The first archive extension found in filename (e.g. '.tar.gz'),
+        None if there is none.
+
+    """
+    m = re_archive_patterns.search(filename)
+    if m:
+        return m.group()
+    return None
+
+
+def release_number(filename):
+    """Compute the release number from the filename.
+
+    Args:
+        filename: the filename without any paths.
+
+    Returns:
+        The release number as a string, None if the filename holds no
+        archive extension or no recognizable release number.
+
+    """
+    name = _software_name(filename)
+    ext = _extension(filename)
+    if not ext:
+        return None
+    # name is a prefix of filename, so only strip it from the start
+    version = filename[len(name):].replace(ext, '')
+    # some filename use . for delimitation
+    # not caught by regexp so filtered here
+    if version.startswith('.'):
+        version = version[1:]
+    if not release_pattern.match(version):  # check pattern release
+        return None
+    return version
+
+
+def _software_name(filename):
+    """Compute the software name from the filename.
+
+    Returns:
+        The leading part of filename, without the digits starting the
+        version number. It is possibly empty.
+
+    """
+    m = software_name_pattern.match(filename)
+    # every part of the pattern is optional, so it always matches
+    # drop all the trailing digits, the version may start with several
+    return m.group().rstrip('0123456789')
+
+
+# def filter_out_release_number(filename):
+#     filtered_data = filter(lambda x: len(x) > 1,
+#                            re.findall('[-.a-zA-Z_]*', filename))
+#     return list(filtered_data)
+
+
+# def compute_release_software_ext(filename):
+#     return filter_out_release_number(filename)[-1]
+
+
+# def compute_release_number_2(filename):
+#     data_to_filter = filter_out_release_number(filename)
+#     version_number = filename
+#     for s in data_to_filter:
+#         version_number = version_number.strip(s)
+
+#     return version_number if version_number else None
+
+
+# def compute_release_number_3(filename):
+#     res = re.findall('[-_]([0-9.a-z+-]+)(\.*){1,2}', filename)
+#     if res:
+#         return res[0]
+
+# def release_number(filename):
+#     """Compute the release number from a filename.
+
+#     First implementation without all use cases ok.
+
+#     """
+#     filtered_version = list(filter(lambda s: len(s) > 2,
+#                                    re.split('[a-zA-Z]', filename)))
+#     if not filtered_version:
+#         return None
+
+#     version = filtered_version[0][1:-1]
+
+#     if version[0] == '-':  # package name contains a number in name
+#         return version[1:]
+
+#     if version[-1] == '-':
+#         return version[0:-1]
+
+#     if version[-1] in ['.', '+']:  # string alongside version
+#         return release_number_2(filename)
+
+#     return version
+
+# special_case_patterns = [
+#     'x86',
+#     'x86_64',
+#     'x64',
+#     'i386',
+#     'i686',
+#     'AIX',
+#     'BSD',
+#     'SGI',
+#     'SUN',
+#     'HP-UX',
+#     'HP',
+#     'SunOS',
+#     'w32',
+#     'win32',
+#     'pre',
+#     'alpha',
+#     'epsilon',
+#     'beta',
+# ]
diff --git a/swh/loader/dir/tests/test_producer.py b/swh/loader/dir/tests/test_producer.py
new file mode 100644
index 0000000..1ab3712
--- /dev/null
+++ b/swh/loader/dir/tests/test_producer.py
@@ -0,0 +1,114 @@
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+
+from nose.tools import istest
+
+from swh.loader.dir import producer
+
+
+class TestProducer(unittest.TestCase):
+    @istest
+    def is_archive(self):
+        # given
+        file_tryouts = [
+            'free-ipmi-1.2.2.tar',
+            'free-ipmi-1.2.2.tar.gz',
+            'free-ipmi-1.2.2.tar.tgz',
+            'gcc-testsuite-4.4.2-4.4.3.diff.bz2',
+            'gcc-java-4.0.4.tar.gz',
+            'gmp-2.0.tar.lzma',
+            'win-gerwin-0.6.zip',
+            'ballandpaddle-0.8.0.tar.xz',
+            'mail-1.1.1.some.lz',
+            'gmp-4.1.1-4.1.2.diff.tar.blah.foo.bar.Z',
+            'findutils-4.2.18.tar.bzip2'
+        ]
+
+        # then
+        for f in file_tryouts:
+            res = producer.is_archive(f)
+            self.assertTrue(res,
+                            '%s should be identified as archive' % f)
+
+    @istest
+    def is_archive_not(self):
+        # given
+        file_tryouts = [
+            'free-ipmi-1.2.2.gz.sig',
+            'free-ipmi-1.2.2.bz3',
+            'free-ipmi-1.2.2.blah',
+            'free-ipmi-1.2.2.other',
+            'free-ipmi-1.2.2.md5',
+            'free-ipmi-1.2.2.rpm',
+            'free-ipmi-1.2.2.dpkg',
+            'free-ipmi-1.2.2.deb',
+            'free-ipmi-1.2.2.7z',
+            'free-ipmi-1.2.2.foobar',
+            'apl_1.3-1_i386.deb.sig',
+            # extension without the leading dot
+            'gunzip',
+            'README-tar',
+        ]
+
+        # then
+        for f in file_tryouts:
+            self.assertFalse(
+                producer.is_archive(f),
+                '%s should not be identified as archive' % f)
+
+    @istest
+    def compute_basic_release_number(self):
+        files = {
+            'free-ipmi-1.2.2.tar': '1.2.2',
+            'free-ipmi-1.2.2.tar.gz': '1.2.2',
+            'free-ipmi-1.2.2.tar.tgz': '1.2.2',
+            'gcc-testsuite-4.4.2-4.4.3.diff.bz2': '4.4.2-4.4.3.diff',
+            'gcc-java-4.0.4.tar.gz': '4.0.4',
+            'gmp-2.0.tar.lzma': '2.0',
+            'win-gerwin-0.6.zip': '0.6',
+            'ballandpaddle-0.8.0.tar.xz': '0.8.0',
+            'mail-1.1.1.some.lz': '1.1.1.some',
+            'gmp-4.1.1-4.1.2.diff.tar.Z': '4.1.1-4.1.2.diff',
+            'findutils-4.2.18.tar.bzip2': '4.2.18',
+            'greg-1.4.tar.gz': '1.4',
+
+            # . separator
+            'greg.1.4.tar.gz': '1.4',
+
+            # number in software product
+            'aspell6-pt_BR-20070411-0.tar.bz2': '20070411-0',
+            'libosip2-3.3.0.tar.gz': '3.3.0',
+
+            # several digits starting the version
+            'emacs-21.4a.tar.gz': '21.4a',
+            'gcc-10.2.0.tar.xz': '10.2.0',
+
+            # particular patterns...
+            'gift-0.1.9+3epsilon.tar.gz': '0.1.9+3epsilon',
+            'gift-0.1.6pre2.tgz': '0.1.6pre2',
+            'binutils-2.19.1a.tar.bz2': '2.19.1a',
+            'readline-4.2-4.2a.diff.gz': '4.2-4.2a.diff',
+
+            # with arch patterns
+            'cvs-1.12.6-BSD.bin.gz': '1.12.6-BSD.bin',
+            'cvs-1.12.12-SunOS-5.8-i386.gz': '1.12.12-SunOS-5.8-i386',
+            'gnutls-3.0.20-w32.zip': '3.0.20-w32',
+            'mit-scheme_7.7.90+20080130-0gutsy1.diff.gz':
+            '7.7.90+20080130-0gutsy1.diff',
+
+            # no release number
+            'gnu.ps.gz': None,
+            'direvent-latest.tar.gz': None,
+        }
+
+        # then
+        for f, expected in files.items():
+            rel_num = producer.release_number(f)
+            self.assertEqual(
+                expected,
+                rel_num,
+                'for %s, the version should be %s' % (f, expected))