diff --git a/bin/swh-loader-tar-producer b/bin/swh-loader-tar-producer index 8e21b73..42b9a54 100755 --- a/bin/swh-loader-tar-producer +++ b/bin/swh-loader-tar-producer @@ -1,156 +1,166 @@ #!/usr/bin/env python3 # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import sys import os from swh.core import config from swh.loader.dir import producer # Static setup EPOCH = 0 UTC_OFFSET = '+0000' SWH_PERSON = 'Software Heritage' SWH_MAIL = 'robot@swh.org' REVISION_MESSAGE = 'synthetic message' REVISION_TYPE = 'tar' REVISION = { 'author_date': EPOCH, 'author_offset': UTC_OFFSET, 'author_name': SWH_PERSON, 'author_email': SWH_MAIL, 'committer_date': EPOCH, 'committer_offset': UTC_OFFSET, 'committer_name': SWH_PERSON, 'committer_email': SWH_MAIL, 'type': REVISION_TYPE, 'message': REVISION_MESSAGE, } -conf_file = sys.argv[1] -if not os.path.exists(conf_file): - conf_file = '../resources/producer/tar.ini' - +def compute_origin(config, filename): + """Compute the origin. -conf = config.read(conf_file) + Args: + - config: configuration read for its 'url_scheme' and 'type' entries + - filename: name joined onto config['url_scheme'] to build the url + Returns: + Dictionary origin with keys: + - url: origin's url + - type: origin's type -def compute_origin(config, filename): + """ return { - 'origin_url': os.path.join(config['origin_url_scheme'], filename), - 'origin_type': config['origin_type'], + 'url': os.path.join(config['url_scheme'], filename), + 'type': config['type'], } def compute_occurrence(filepath): pass def compute_occurrences(filepath): pass def time_from_file(filepath): """Extract time from filepath. Args: filepath: path to the file we want to extract metadata Returns: Modification time from filepath. """ return os.lstat(filepath).st_mtime def compute_release(filepath): """Compute a release from a given filepath. 
If the filepath does not contain a recognizable release number, the release can be skipped. Args: filepath: file's absolute path Returns: None if the release number cannot be extracted from the filename. Otherwise a synthetic release is computed with the following keys: - name: the release computed from the filename - date: the modification timestamp as returned by a fstat call - offset: +0000 - author_name: '' - author_email: '' - comment: '' """ filename = os.path.basename(filepath) release_number = producer.release_number(filename) if release_number: return { 'name': release_number, 'date': time_from_file(filepath), 'offset': UTC_OFFSET, 'author_name': '', 'author_email': '', 'comment': '', } return None def compute_revision(filepath): """Compute the revision from filepath. Args: filepath: file's absolute path Returns: Synthetic revision. """ return REVISION def list_archives_from(path): """From path, produce archive tarball message to celery. Args: path: top directory to list archives from. """ for dirpath, dirnames, filenames in os.walk(path): for fname in [f for f in filenames if producer.is_archive(f)]: yield dirpath, fname # LIMIT = 100 LIMIT = None def compute_message_from(dirpath, filename): # filepath = os.path.join(dirpath, filename) version = producer.release_number(filename) print('|'.join(['', filename, version, ''])) def produce_archive_messages(path): """From path, produce archive tarball message to celery. Args: path: top directory to list archives from. 
""" limit = 0 for dirpath, filename in list_archives_from(path): compute_message_from(dirpath, filename) if LIMIT and limit > LIMIT: return limit += 1 +conf_file = sys.argv[1] +if not os.path.exists(conf_file): + conf_file = '../resources/producer/tar.ini' + +conf = config.read(conf_file) + produce_archive_messages(conf['mirror_root_directory']) diff --git a/resources/producer/tar.ini b/resources/producer/tar.ini index a5eb895..4999471 100644 --- a/resources/producer/tar.ini +++ b/resources/producer/tar.ini @@ -1,18 +1,18 @@ [main] # mirror's root directory from which producing archive messages to load mirror_root_directory=/home/storage/space/mirrors/gnu.org/gnu # mirror_root_directory=/tmp/storage/space/mirrors/gnu.org/gnu # archive extensions patterns (not yet used) archive_extensions = gz, tgz, bz2, bzip2, Z, lzma, lz, lzma, tar, xz, zip # special pattern cases (not yet used). archive_special = x86, x86_64, x64, i386, i686, AIX, BSD, SGI, SUN, HP-UX, HP, SunOS, w32, win32, pre, alpha, epsilon, beta -origin_url_scheme = rsync://ftp.gnu.org/ftp/ -origin_type = ftp +url_scheme = rsync://ftp.gnu.org/ftp/ +type = ftp