diff --git a/bin/swh-loader-tar-producer b/bin/swh-loader-tar-producer index dae9622..087fa32 100755 --- a/bin/swh-loader-tar-producer +++ b/bin/swh-loader-tar-producer @@ -1,245 +1,235 @@ #!/usr/bin/env python3 # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import sys import os import tarfile import datetime from swh.core import config from swh.loader.dir import producer from swh.loader.dir import tasks # Static setup EPOCH = 0 UTC_OFFSET = '+0000' SWH_PERSON = 'Software Heritage' SWH_MAIL = 'robot@swh.org' REVISION_MESSAGE = 'synthetic message' REVISION_TYPE = 'tar' REVISION = { 'author_date': EPOCH, 'author_offset': UTC_OFFSET, 'author_name': SWH_PERSON, 'author_email': SWH_MAIL, 'committer_date': EPOCH, 'committer_offset': UTC_OFFSET, 'committer_name': SWH_PERSON, 'committer_email': SWH_MAIL, 'type': REVISION_TYPE, 'message': REVISION_MESSAGE, } SWH_AUTHORITY = 1 GNU_AUTHORITY = 2 -LIMIT = 1 +LIMIT = 10 # LIMIT = None -def relative_path(root_dirpath, tarpath): - """Compute the relative_path from root_dirpath and tarpath. - - """ - return tarpath.split(root_dirpath)[1] - - def compute_origin(url_scheme, url_type, root_dirpath, tarpath): """Compute the origin. Args: - config: configuration dict with url_scheme and type keys. - tarpath: file's path Returns: Dictionary origin with keys: - url: origin's url - type: origin's type """ + relative_path = tarpath.split(root_dirpath)[1] return { - 'url': ''.join([url_scheme, relative_path(root_dirpath, tarpath)]), + 'url': ''.join([url_scheme, os.path.dirname(relative_path)]), 'type': url_type, } -def _build_occurrence(branch_name, authority_id, validity_ts): +def _build_occurrence(tarpath, authority_id, validity_ts): """Build an occurrence from branch_name, authority_id and validity_ts. 
Returns: Occurrence dictionary - - branch: occurrence's branch name + - branch: occurrence's branch name (basename of tarpath) - authority: swh authority - validity: validity date (e.g. 2015-01-01 00:00:00+00) """ validity = '%s+00' % datetime.datetime.utcfromtimestamp(validity_ts) return { - 'branch': branch_name, + 'branch': os.path.basename(tarpath), 'authority': authority_id, 'validity': validity } -def swh_occurrence(root_dirpath, tarpath): +def swh_occurrence(tarpath): """Compute the occurrence from the tarpath with swh authority. Args: tarpath: file's path Returns: Occurrence dictionary (cf. _build_occurrence) """ validity_ts = os.lstat(tarpath).st_atime - branch_name = relative_path(root_dirpath, tarpath) - return _build_occurrence(branch_name, SWH_AUTHORITY, validity_ts) + return _build_occurrence(tarpath, SWH_AUTHORITY, validity_ts) -def gnu_occurrence(root_dirpath, tarpath): +def gnu_occurrence(tarpath): """Compute the occurrence from the tarpath with gnu authority. Args: - root_dirpath: root_dirpath containing the tarpath. tarpath: file's path Return: Occurrence dictionary (cf. _build_occurrence) """ validity_ts = os.lstat(tarpath).st_mtime - branch_name = relative_path(root_dirpath, tarpath) - return _build_occurrence(branch_name, GNU_AUTHORITY, validity_ts) + return _build_occurrence(tarpath, GNU_AUTHORITY, validity_ts) def compute_release(filename, tarpath): """Compute a release from a given tarpath, filename. If the tarpath does not contain a recognizable release number, the release can be skipped. Args: filename: file's name without path tarpath: file's absolute path Returns: None if the release number cannot be extracted from the filename. 
Otherwise a synthetic release is computed with the following keys: - name: the release computed from the filename - date: the modification timestamp as returned by a fstat call - offset: +0000 - author_name: '' - author_email: '' - comment: '' """ release_number = producer.release_number(filename) if release_number: return { 'name': release_number, 'date': os.lstat(tarpath).st_mtime, 'offset': UTC_OFFSET, 'author_name': '', 'author_email': '', 'comment': '', } return None def list_archives_from(path): """From path, produce archive tarball message to celery. Args: path: top directory to list archives from. """ for dirpath, dirnames, filenames in os.walk(path): for fname in filenames: tarpath = os.path.join(dirpath, fname) if os.path.exists(tarpath) and tarfile.is_tarfile(tarpath): yield dirpath, fname def compute_message_from(conf, dirpath, filename): """Post the message to workers. Args: conf: dictionary holding static metadata dirpath: directory containing the filename filename: filename without any path Returns: None """ tarpath = os.path.join(dirpath, filename) root_dirpath = conf['mirror_root_directory'] origin = compute_origin(conf['url_scheme'], conf['type'], root_dirpath, tarpath) - occurrences = [gnu_occurrence(root_dirpath, tarpath), - swh_occurrence(root_dirpath, tarpath)] + occurrences = [gnu_occurrence(tarpath), swh_occurrence(tarpath)] release = compute_release(filename, tarpath) task = tasks.LoadTarRepository() # tasks.LoadTarRepositoryPrint() task.apply_async((tarpath, origin, REVISION, release, occurrences)) def produce_archive_messages(conf, path): """From path, produce archive tarball message to celery. Args: path: top directory to list archives from. Returns: None """ limit = 0 for dirpath, filename in list_archives_from(path): limit += 1 compute_message_from(conf, dirpath, filename) if LIMIT and limit >= LIMIT: return limit return limit def load_config(conf_file): """Load the configuration from file. 
""" conf = config.read(conf_file) url_scheme = conf['url_scheme'] mirror_dir = conf['mirror_root_directory'] # remove trailing / in configuration (to ease ulterior computation) if url_scheme[-1] == '/': conf.update({ 'url_scheme': url_scheme[0:-1] }) if mirror_dir[-1] == '/': conf.update({ 'mirror_root_directory': mirror_dir[0:-1] }) return conf conf_file = sys.argv[1] if not os.path.exists(conf_file): conf_file = '../resources/producer/tar.ini' conf = load_config(conf_file) nb_tarballs = produce_archive_messages(conf, conf['mirror_root_directory']) print('%s sent to celery!' % nb_tarballs) diff --git a/resources/producer/tar.ini b/resources/producer/tar.ini index c23818d..4e58d62 100644 --- a/resources/producer/tar.ini +++ b/resources/producer/tar.ini @@ -1,14 +1,15 @@ [main] # mirror's root directory from which producing archive messages to load mirror_root_directory = /home/storage/space/mirrors/gnu.org/gnu/ # archive extensions patterns (not yet used) # archive_extensions = gz, tgz, bz2, bzip2, Z, lzma, lz, lzma, tar, xz, zip # special pattern cases (not yet used). # archive_special = x86, x86_64, x64, i386, i686, AIX, BSD, SGI, SUN, HP-UX, HP, SunOS, w32, win32, pre, alpha, epsilon, beta -# origin setup -url_scheme = rsync://ftp.gnu.org/gnu/ +# origin setup's possible scheme url +#url_scheme = http://ftp.gnu.org/gnu/ +url_scheme = rsync://ftp.gnu.org/ftp/ type = ftp