# swh/loader/dir/producer.py
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import re
import itertools


def init_archive_extension_pattern(exts):
    """Build the regexp source matching the given archive extensions.

    Double extensions (e.g. ``.tar.gz``) are emitted before single ones
    so that, at a given position, the alternation tries the longer form
    first.

    Args:
        exts: iterable of extension names without the leading dot. They
            are inserted verbatim (not regexp-escaped).

    Returns:
        The alternation as a string (empty when exts is empty).

    """
    exts = list(exts)  # iterated twice below
    doubles = [r'\.' + r'\.'.join(pair)
               for pair in itertools.product(exts, repeat=2)]
    singles = [r'\.' + ext for ext in exts]
    return '|'.join(doubles + singles)


# FIXME; extract this in property
# to recognize existing naming pattern
archive_extension_patterns = [
    'zip',
    'tar',
    'gz', 'tgz',
    'bz2', 'bzip2',
    'lzma', 'lz',
    'xz',
    'Z',
]


re_archive_patterns = re.compile(
    init_archive_extension_pattern(archive_extension_patterns),
    flags=re.IGNORECASE)

# letters/dash/underscore, optional digits, letters/dash/underscore again;
# the dash is placed last in the class so it cannot be read as a range
software_name_pattern = re.compile(r'([a-zA-Z_-]*[0-9]*[a-zA-Z_-]*)')

digit_pattern = re.compile(r'[0-9]')

release_pattern = re.compile(r'[0-9.]+')


def _extension(filename):
    """Return the first archive extension found in filename, or None.

    The search is not anchored: the extension does not have to sit at
    the very end of the filename.

    """
    m = re_archive_patterns.search(filename)
    return m.group() if m else None


def release_number(filename):
    """Compute the release number from the filename.

    Args:
        filename: archive file name, e.g. ``foo-1.0.tar.gz``.

    Returns:
        What is left of filename once the software name prefix and the
        archive extension are removed (e.g. ``1.0``), or None when there
        is no known archive extension, nothing is left, or the remainder
        does not start with a digit or a dot.

    """
    name = _software_name(filename)
    ext = _extension(filename)
    if not ext:
        return None

    # name is always a prefix of filename: slice it off instead of
    # str.replace, which would also delete later occurrences of it
    version = filename[len(name):].replace(ext, '')
    if not version:
        return None

    # some filename use . for delimitation
    # not caught by regexp so filtered here
    if version.startswith('.'):
        version = version[1:]

    # only the beginning of the remainder is checked
    if not release_pattern.match(version):
        return None
    return version


def _software_name(filename):
    """Compute the software name from the filename.

    Returns:
        The leading part of filename matched by software_name_pattern,
        without any trailing digits (those are the beginning of the
        version number). Empty string when nothing matches.

    """
    m = software_name_pattern.match(filename)
    # every part of the pattern is optional, so the match may be empty
    res = m.group() if m else ''
    # drop the whole trailing run of digits, not just the last one,
    # otherwise 'foo-10' would keep the '1' of version '10'
    return res.rstrip('0123456789')
# swh/loader/dir/tasks.py
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import os
import shutil
import tempfile
import tarfile

from swh.core.scheduling import Task

from swh.loader.dir.loader import DirLoader


class LoadDirRepository(Task):
    """Import a directory to Software Heritage

    """
    task_queue = 'swh_loader_dir'

    # passed to DirLoader.parse_config_file in __init__; subclasses
    # override them to point at their own configuration
    CONFIG_BASE_FILENAME = 'loader/dir.ini'
    ADDITIONAL_CONFIG = {}

    def __init__(self):
        self.config = DirLoader.parse_config_file(
            base_filename=self.CONFIG_BASE_FILENAME,
            additional_configs=[self.ADDITIONAL_CONFIG],
        )

    def run(self, dir_path, origin, revision, release, occurrences):
        """Import a directory.

        Args:
            cf. swh.loader.dir.loader.run docstring

        """
        loader = DirLoader(self.config)
        loader.log = self.log
        loader.process(dir_path, origin, revision, release, occurrences)


def untar(tar_path, dir_path):
    """Decompress an archive tar_path to dir_path.

    At the end of this call, dir_path contains the tarball's
    uncompressed content.

    Args:
        tar_path: the path to access the tarball
        dir_path: The path where to extract the tarball's content.

    Raises:
        ValueError: when a member's name resolves outside dir_path
            (absolute path or '..' components); nothing is extracted
            in that case.

    """
    root = os.path.realpath(dir_path)
    with tarfile.open(tar_path) as tarball:
        # NOTE(review): this only validates member names; escapes through
        # symlinks created by the archive itself are not detected here.
        for member in tarball.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if target != root and not target.startswith(root + os.sep):
                raise ValueError(
                    'Refusing to extract %r from %s: path escapes %s' % (
                        member.name, tar_path, dir_path))
        tarball.extractall(path=dir_path)


class LoadTarRepository(LoadDirRepository):
    """Import a tarball to Software Heritage

    """
    task_queue = 'swh_loader_tar'

    CONFIG_BASE_FILENAME = 'loader/tar.ini'
    # presumably (type, default value) pairs -- confirm against
    # DirLoader.parse_config_file
    ADDITIONAL_CONFIG = {
        'extraction_dir': ('str', '/tmp/swh.loader.tar/'),
    }

    def run(self, tar_path, origin, revision, release, occurrences):
        """Import a tarball tar_path.

        The tarball is extracted into a fresh temporary directory below
        the configured 'extraction_dir', loaded as a directory, and the
        temporary directory is removed afterwards, whether extraction
        and loading succeeded or not.

        Args:
            - tar_path: path access to the tarball
            - origin, revision, release, occurrences:
              see LoadDirRepository.run
              (origin gets a 'type' key set to 'tar' in place when it
              has none)

        """
        extraction_dir = self.config['extraction_dir']
        # mkdtemp does not create its parent directory
        os.makedirs(extraction_dir, exist_ok=True)
        dir_path = tempfile.mkdtemp(prefix='swh.loader.tar',
                                    dir=extraction_dir)

        try:
            # unarchive in dir_path; kept inside the try so that a failed
            # extraction does not leave the temporary directory behind
            untar(tar_path, dir_path)

            if 'type' not in origin:  # let the type flow if present
                origin['type'] = 'tar'

            super().run(dir_path, origin, revision, release, occurrences)
        finally:  # always clean up
            shutil.rmtree(dir_path)


class LoadTarRepositoryPrint(LoadDirRepository):
    """Import a tarball to Software Heritage

    DEBUG purposes

    """
    task_queue = 'swh_loader_tar_print'

    def run(self, tar_path, origin, revision, release, occurrences):
        """Print the arguments instead of importing anything.

        Args:
            - tar_path: path access to the tarball
            - origin, revision, release, occurrences:
              see LoadDirRepository.run

        """
        print(tar_path, origin, revision, release, occurrences)