diff --git a/README b/README index 2c7ce8c..3a4f721 100644 --- a/README +++ b/README @@ -1,55 +1,51 @@ SWH-loader-dir ============== -The Software Heritage Loader Dir is a tool and a library to walk a local -Directory and inject into the SWH dataset all unknown contained files. - -It can also deal with: -- a tarball -- ... +The Software Heritage Directory Loader is a tool and a library to walk a local +directory and inject into the SWH dataset all unknown contained files. Configuration sample ==================== -### tar +### dir -Sample tar.ini: +Sample dir.ini: [main] - dir_path = /tmp/swh/loader/tar/ + dir_path = /tmp/swh/loader/dir/ storage_class = remote_storage storage_args = http://localhost:5000/ send_contents = True send_directories = True send_revisions = True send_releases = True send_occurrences = True content_packet_size = 10000 content_packet_size_bytes = 1073741824 directory_packet_size = 25000 revision_packet_size = 100000 release_packet_size = 100000 occurrence_packet_size = 100000 Present in possible locations: -- ~/.config/swh/loader/tar.ini -- ~/.swh/loader/tar.ini -- /etc/softwareheritage/loader/tar.ini +- ~/.config/swh/loader/dir.ini +- ~/.swh/loader/dir.ini +- /etc/softwareheritage/loader/dir.ini -### Load tarball +### Load directory #### toplevel - from swh.loader.dir.tasks import LoadTarRepository - LoadTarRepository().run('/path/to/tarball.tgz') + from swh.loader.dir.tasks import LoadDirRepository + LoadDirRepository().run('/path/to/dir') #### celery Providing you have a celery up and running (cf. https://forge.softwareheritage.org/diffusion/DCORE/browse/master/README.md) - from swh.loader.dir.tasks import LoadTarRepository - LoadTarRepository().delay('/path/to/tarball.tgz') + from swh.loader.dir.tasks import LoadDirRepository + LoadDirRepository().delay('/path/to/dir') diff --git a/bin/swh-loader-tar-producer b/bin/swh-loader-tar-producer deleted file mode 100755 index 55b3414..0000000 --- a/bin/swh-loader-tar-producer +++ /dev/null @@ -1,241 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import sys -import os -import tarfile -import datetime - -from swh.core import config -from swh.loader.dir import producer -from swh.loader.dir import tasks - - -# Static setup -EPOCH = 0 -UTC_OFFSET = '+0000' -SWH_PERSON = 'Software Heritage' -SWH_MAIL = 'robot@swh.org' -REVISION_MESSAGE = 'synthetic message' -REVISION_TYPE = 'tar' -REVISION = { - 'author_date': EPOCH, - 'author_offset': UTC_OFFSET, - 'author_name': SWH_PERSON, - 'author_email': SWH_MAIL, - 'committer_date': EPOCH, - 'committer_offset': UTC_OFFSET, - 'committer_name': SWH_PERSON, - 'committer_email': SWH_MAIL, - 'type': REVISION_TYPE, - 'message': REVISION_MESSAGE, -} -SWH_AUTHORITY = 1 -GNU_AUTHORITY = 2 - - -def compute_origin(url_scheme, url_type, root_dirpath, tarpath): - """Compute the origin. - - Args: - - config: configuration dict with url_scheme and type keys. - - tarpath: file's path - - Returns: - Dictionary origin with keys: - - url: origin's url - - type: origin's type - - """ - relative_path = tarpath.split(root_dirpath)[1] - return { - 'url': ''.join([url_scheme, os.path.dirname(relative_path)]), - 'type': url_type, - } - - -def _build_occurrence(tarpath, authority_id, validity_ts): - """Build an occurrence from branch_name, authority_id and validity_ts. - - Args: - - tarpath: file's path - - authority_id: swh authority id (as per swh's storage values in - organization table) - - validity_ts: validity timestamp - - Returns: - Occurrence dictionary - - tarpath: file's path - - authority: swh authority - - validity: validity date (e.g. 2015-01-01 00:00:00+00) - """ - validity = '%s+00' % datetime.datetime.utcfromtimestamp(validity_ts) - return { - 'branch': os.path.basename(tarpath), - 'authority': authority_id, - 'validity': validity - } - - -def swh_occurrence(tarpath): - """Compute the occurrence from the tarpath with swh authority. - - Args: - tarpath: file's path - - Returns: - Occurrence dictionary (cf. _build_occurrence) - - """ - validity_ts = os.lstat(tarpath).st_atime - return _build_occurrence(tarpath, SWH_AUTHORITY, validity_ts) - - -def gnu_occurrence(tarpath): - """Compute the occurrence from the tarpath with gnu authority. - - Args: - tarpath: file's path - - Return: - Occurrence dictionary (cf. _build_occurrence) - - """ - validity_ts = os.lstat(tarpath).st_mtime - return _build_occurrence(tarpath, GNU_AUTHORITY, validity_ts) - - -def compute_release(filename, tarpath): - """Compute a release from a given tarpath, filename. - If the tarpath does not contain a recognizable release number, the release - can be skipped. - - Args: - filename: file's name without path - tarpath: file's absolute path - - Returns: - None if the release number cannot be extracted from the filename. - Otherwise a synthetic release is computed with the following keys: - - name: the release computed from the filename - - date: the modification timestamp as returned by a fstat call - - offset: +0000 - - author_name: '' - - author_email: '' - - comment: '' - - """ - release_number = producer.release_number(filename) - if release_number: - return { - 'name': release_number, - 'date': os.lstat(tarpath).st_mtime, - 'offset': UTC_OFFSET, - 'author_name': '', - 'author_email': '', - 'comment': '', - } - return None - - -def list_archives_from(path): - """From path, produce archive tarball message to celery. - - Args: - path: top directory to list archives from. - - """ - for dirpath, dirnames, filenames in os.walk(path): - for fname in filenames: - tarpath = os.path.join(dirpath, fname) - if os.path.exists(tarpath) and tarfile.is_tarfile(tarpath): - yield dirpath, fname - - -def compute_message_from(conf, dirpath, filename): - """Post the message to workers. - - Args: - conf: dictionary holding static metadata - dirpath: directory containing the filename - filename: filename without any path - - Returns: - None - - """ - tarpath = os.path.join(dirpath, filename) - root_dirpath = conf['mirror_root_directory'] - - origin = compute_origin(conf['url_scheme'], - conf['type'], - root_dirpath, - tarpath) - occurrences = [gnu_occurrence(tarpath), swh_occurrence(tarpath)] - release = compute_release(filename, tarpath) - - task = tasks.LoadTarRepository() # tasks.LoadTarRepositoryPrint() - task.apply_async((tarpath, - origin, - REVISION, - release, - occurrences)) - - -def produce_archive_messages(conf, path): - """From path, produce archive tarball message to celery. - - Args: - path: top directory to list archives from. - - Returns: - None - """ - LIMIT = conf['limit'] - count = 0 - - for dirpath, filename in list_archives_from(path): - count += 1 - compute_message_from(conf, dirpath, filename) - if LIMIT and count >= LIMIT: - return count - - return count - - -def load_config(conf_file): - """Load the configuration from file. - - """ - conf = config.read(conf_file, - default_conf={'limit': ('int', None)}) - url_scheme = conf['url_scheme'] - mirror_dir = conf['mirror_root_directory'] - - # remove trailing / in configuration (to ease ulterior computation) - if url_scheme[-1] == '/': - conf.update({ - 'url_scheme': url_scheme[0:-1] - }) - - if mirror_dir[-1] == '/': - conf.update({ - 'mirror_root_directory': mirror_dir[0:-1] - }) - - return conf - -if __name__ == '__main__': - conf_file = sys.argv[1] - if not os.path.exists(conf_file): - conf_file = '../resources/producer/tar.ini' - - conf = load_config(conf_file) - - nb_tarballs = produce_archive_messages(conf, conf['mirror_root_directory']) - - print('%s tasks sent!' % nb_tarballs) diff --git a/debian/control b/debian/control index 7f08397..813affc 100644 --- a/debian/control +++ b/debian/control @@ -1,19 +1,19 @@ Source: swh-loader-dir Maintainer: Software Heritage developers Section: python Priority: optional Build-Depends: debhelper (>= 9), dh-python, python3-all, python3-nose, python3-setuptools, python3-swh.core, python3-swh.storage, python3-vcversioner Standards-Version: 3.9.6 Homepage: https://forge.softwareheritage.org/diffusion/DLDDIR/ Package: python3-swh.loader.dir Architecture: all Depends: ${misc:Depends}, ${python3:Depends} -Description: Software Heritage Loader Directory +Description: Software Heritage Directory Loader diff --git a/resources/loader/tar.ini b/resources/loader/tar.ini deleted file mode 100644 index c0aaf45..0000000 --- a/resources/loader/tar.ini +++ /dev/null @@ -1,39 +0,0 @@ -[main] - -# NOT FOR PRODUCTION -tar_path = /home/tony/work/inria/repo/linux.tgz -dir_path = /tmp/linux/ - -# synthetic origin -origin_url = file:///dev/null -origin_type = tar - -# occurrence -occurrence_branch = master2 -occurrence_authority = 1 -occurrence_validity = 2015-01-01 00:00:00+00 - -# occurrence 2 -occurrence2_branch = dev -occurrence2_authority = 2 -occurrence2_validity = 2015-01-01 00:00:00+00 - -# synthetic revision -revision_author_name = swh author -revision_author_email = swh@inria.fr -revision_author_date = 1444054085 -revision_author_offset = +0200 -revision_committer_name = swh committer -revision_committer_email = swh@inria.fr -revision_committer_date = 1444054085 -revision_committer_offset = +0200 -revision_type = tar -revision_message = synthetic revision message - -# synthetic release -release_name = v0.0.1 -release_date = 1444054085 -release_offset = +0200 -release_author_name = swh author -release_author_email = swh@inria.fr -release_comment = synthetic release diff --git a/resources/producer/tar.ini b/resources/producer/tar.ini deleted file mode 100644 index 96fa248..0000000 --- a/resources/producer/tar.ini +++ /dev/null @@ -1,21 +0,0 @@ -[main] - -# mirror's root directory from which producing archive messages to load -mirror_root_directory = /home/storage/space/mirrors/gnu.org/gnu/ -# mirror_root_directory = /home/storage/space/mirrors/gnu.org/old-gnu/ - -# archive extensions patterns (not yet used) -# archive_extensions = gz, tgz, bz2, bzip2, Z, lzma, lz, lzma, tar, xz, zip - -# special pattern cases (not yet used). -# archive_special = x86, x86_64, x64, i386, i686, AIX, BSD, SGI, SUN, HP-UX, HP, SunOS, w32, win32, pre, alpha, epsilon, beta - -# origin setup's possible scheme url -#url_scheme = http://ftp.gnu.org/gnu/ -url_scheme = rsync://ftp.gnu.org/gnu/ -#url_scheme = rsync://ftp.gnu.org/old-gnu/ - -type = ftp - -# For tryouts purposes (no limit if not specified) -# limit = 1 diff --git a/scratch/count_tarballs.py b/scratch/count_tarballs.py deleted file mode 100755 index 666c6d2..0000000 --- a/scratch/count_tarballs.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 - -import os - -from swh.loader.dir import producer - - -def is_tarball(filename): - """Determine if the filename is an tarball or not. - - This is dependent on the filename only. - - Args: - filename: the filename without any paths. - - Returns: - Boolean True if an tarball, False otherwise. - - """ - return any(map(lambda ext: filename.endswith(ext), - producer.archive_extension_patterns)) - -def list_tarballs_from(path): - """From path, produce tarball tarball message to celery. - - Args: - path: top directory to list tarballs from. - - """ - for dirpath, dirnames, filenames in os.walk(path): - for fname in filenames: - if is_tarball(fname): - yield dirpath, fname - -def count_tarballs_from(path): - count = 0 - for dirpath, fname in list_tarballs_from(path): - count += 1 - - return count - -if __name__ == '__main__': - for path in ['/home/storage/space/mirrors/gnu.org/gnu', - '/home/storage/space/mirrors/gnu.org/old-gnu']: - print("%s %s" % (path, count_tarballs_from(path))) diff --git a/scratch/producer.py b/scratch/producer.py deleted file mode 100644 index 72bc641..0000000 --- a/scratch/producer.py +++ /dev/null @@ -1,8 +0,0 @@ -from swh.loader.dir.tasks import LoadTarRepository - -# Create a load tar instance (this will load a potential configuration file -# from ~/.config/swh/loader/tar.ini) -loadertar = LoadTarRepository() - -tar_path = '/home/tony/work/inria/repo/linux.tgz' -loadertar.delay(tar_path) diff --git a/setup.py b/setup.py index 44e9d7f..246b038 100644 --- a/setup.py +++ b/setup.py @@ -1,28 +1,28 @@ from setuptools import setup def parse_requirements(): requirements = [] with open('requirements.txt') as f: for line in f.readlines(): line = line.strip() if not line or line.startswith('#'): continue requirements.append(line) return requirements setup( name='swh.loader.dir', - description='Software Heritage Loader Directory', + description='Software Heritage Directory Loader', author='Software Heritage developers', author_email='swh-devel@inria.fr', url='https://forge.softwareheritage.org/diffusion/DLDDIR', packages=['swh.loader.dir', 'swh.loader.dir.tests'], scripts=['bin/swh-loader-dir'], install_requires=parse_requirements(), setup_requires=['vcversioner'], vcversioner={}, include_package_data=True, ) diff --git a/swh/loader/dir/producer.py b/swh/loader/dir/producer.py deleted file mode 100644 index 6837163..0000000 --- a/swh/loader/dir/producer.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import re -import itertools - - -def init_archive_extension_pattern(exts): - """Given a list of extensions, return the regexp for exts. - - """ - res = [] - for p, pp in itertools.product(exts, repeat=2): - res.append('\.' + '\.'.join([p, pp])) - for p in exts: - res.append(''.join(['\.' + p])) - - return '|'.join(res) - - -# FIXME; extract this in property -# to recognize existing naming pattern -archive_extension_patterns = [ - 'zip', - 'tar', - 'gz', 'tgz', - 'bz2', 'bzip2', - 'lzma', 'lz', - 'xz', - 'Z', -] - - -re_archive_patterns = re.compile( - init_archive_extension_pattern(archive_extension_patterns), - flags=re.IGNORECASE) -software_name_pattern = re.compile('([a-zA-Z-_]*[0-9]*[a-zA-Z-_]*)') -digit_pattern = re.compile('[0-9]') -release_pattern = re.compile('[0-9.]+') - - -def _extension(filename): - m = re_archive_patterns.search(filename) - if m: - return m.group() - - -def release_number(filename): - """Compute the release number from the filename. - - """ - name = _software_name(filename) - ext = _extension(filename) - if not ext: - return None - version = filename.replace(name, '').replace(ext, '') - if version: - # some filename use . for delimitation - # not caught by regexp so filtered here - if version[0] == '.': - version = version[1:] # arf - if not release_pattern.match(version): # check pattern release - return None - return version - return None - - -def _software_name(filename): - """Compute the software name from the filename. - - """ - m = software_name_pattern.match(filename) - res = m.group() - if digit_pattern.match(res[-1]): # remains first version number - return res[0:-1] - return res diff --git a/swh/loader/dir/tasks.py b/swh/loader/dir/tasks.py index 4306818..aec8997 100644 --- a/swh/loader/dir/tasks.py +++ b/swh/loader/dir/tasks.py @@ -1,106 +1,35 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import shutil -import tempfile -import tarfile - from swh.core.scheduling import Task from swh.loader.dir.loader import DirLoader class LoadDirRepository(Task): """Import a directory to Software Heritage """ task_queue = 'swh_loader_dir' CONFIG_BASE_FILENAME = 'loader/dir.ini' ADDITIONAL_CONFIG = {} def __init__(self): self.config = DirLoader.parse_config_file( base_filename=self.CONFIG_BASE_FILENAME, additional_configs=[self.ADDITIONAL_CONFIG], ) def run(self, dir_path, origin, revision, release, occurrences): """Import a directory. Args: cf. swh.loader.dir.loader.run docstring """ loader = DirLoader(self.config) loader.log = self.log loader.process(dir_path, origin, revision, release, occurrences) - - -def uncompress(tar_path, dir_path): - """Decompress an archive tar_path to dir_path. - - At the end of this call, dir_path contains the tarball's - uncompressed content. - - Args: - tar_path: the path to access the tarball - dir_path: The path where to extract the tarball's content. - """ - with tarfile.open(tar_path) as tarball: - tarball.extractall(path=dir_path) - - -class LoadTarRepository(LoadDirRepository): - """Import a tarball to Software Heritage - - """ - task_queue = 'swh_loader_tar' - - CONFIG_BASE_FILENAME = 'loader/tar.ini' - ADDITIONAL_CONFIG = { - 'extraction_dir': ('str', '/tmp/swh.loader.tar/'), - } - - def run(self, tar_path, origin, revision, release, occurrences): - """Import a tarball tar_path. - - Args: - - tar_path: path access to the tarball - - origin, revision, release, occurrences: see LoadDirRepository.run - - """ - extraction_dir = self.config['extraction_dir'] - dir_path = tempfile.mkdtemp(prefix='swh.loader.tar-', - dir=extraction_dir) - - self.log.info('Uncompress %s to %s' % (tar_path, dir_path)) - uncompress(tar_path, dir_path) - - if 'type' not in origin: # let the type flow if present - origin['type'] = 'tar' - - try: - super().run(dir_path, origin, revision, release, occurrences) - finally: # always clean up - shutil.rmtree(dir_path) - - -class LoadTarRepositoryPrint(LoadDirRepository): - """Import a tarball to Software Heritage - - DEBUG purposes - """ - task_queue = 'swh_loader_tar_print' - - def run(self, tar_path, origin, revision, release, occurrences): - """Import a tarball tar_path. - - Args: - - tar_path: path access to the tarball - - origin, revision, release, occurrences: see LoadDirRepository.run - - """ - print(tar_path, origin, revision, release, occurrences) diff --git a/swh/loader/dir/tests/test_producer.py b/swh/loader/dir/tests/test_producer.py deleted file mode 100644 index f474198..0000000 --- a/swh/loader/dir/tests/test_producer.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (C) 2015 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import unittest - -from nose.tools import istest - -from swh.loader.dir import producer - - -class TestProducer(unittest.TestCase): - @istest - def compute_basic_release_number(self): - files = { - 'free-ipmi-1.2.2.tar': '1.2.2', - 'free-ipmi-1.2.2.tar.gz': '1.2.2', - 'free-ipmi-1.2.2.tar.tgz': '1.2.2', - 'gcc-testsuite-4.4.2-4.4.3.diff.bz2': '4.4.2-4.4.3.diff', - 'gcc-java-4.0.4.tar.gz': '4.0.4', - 'gmp-2.0.tar.lzma': '2.0', - 'win-gerwin-0.6.zip': '0.6', - 'ballandpaddle-0.8.0.tar.xz': '0.8.0', - 'mail-1.1.1.some.lz': '1.1.1.some', - 'gmp-4.1.1-4.1.2.diff.tar.Z': '4.1.1-4.1.2.diff', - 'findutils-4.2.18.tar.bzip2': '4.2.18', - 'greg-1.4.tar.gz': '1.4', - - # . separator - 'greg.1.4.tar.gz': '1.4', - - # number in software product - 'aspell6-pt_BR-20070411-0.tar.bz2': '20070411-0', - 'libosip2-3.3.0.tar.gz': '3.3.0', - - # particular patterns... - 'gift-0.1.9+3epsilon.tar.gz': '0.1.9+3epsilon', - 'gift-0.1.6pre2.tgz': '0.1.6pre2', - 'binutils-2.19.1a.tar.bz2': '2.19.1a', - 'readline-4.2-4.2a.diff.gz': '4.2-4.2a.diff', - - # with arch patterns - 'cvs-1.12.6-BSD.bin.gz': '1.12.6-BSD.bin', - 'cvs-1.12.12-SunOS-5.8-i386.gz': '1.12.12-SunOS-5.8-i386', - 'gnutls-3.0.20-w32.zip': '3.0.20-w32', - 'mit-scheme_7.7.90+20080130-0gutsy1.diff.gz': - '7.7.90+20080130-0gutsy1.diff', - - # no release number - 'gnu.ps.gz': None, - 'direvent-latest.tar.gz': None, - } - - # then - for f in files.keys(): - rel_num = producer.release_number(f) - self.assertEquals( - files[f], - rel_num, - 'for %s, the version should be %s' % (f, files[f]))