#!/usr/bin/env python3

# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Walk a mirror directory and send one celery loading task per tarball."""

import itertools
import os
import sys
import tarfile

from swh.core import config
from swh.loader.dir import producer
from swh.loader.dir import tasks


# Static setup
EPOCH = 0
UTC_OFFSET = '+0000'
SWH_PERSON = 'Software Heritage'
SWH_MAIL = 'robot@swh.org'
REVISION_MESSAGE = 'synthetic message'
REVISION_TYPE = 'tar'
# Synthetic revision sent unchanged with every tarball message.
REVISION = {
    'author_date': EPOCH,
    'author_offset': UTC_OFFSET,
    'author_name': SWH_PERSON,
    'author_email': SWH_MAIL,
    'committer_date': EPOCH,
    'committer_offset': UTC_OFFSET,
    'committer_name': SWH_PERSON,
    'committer_email': SWH_MAIL,
    'type': REVISION_TYPE,
    'message': REVISION_MESSAGE,
}
SWH_AUTHORITY = 1
GNU_AUTHORITY = 2

# Maximum number of messages to send in one run.
# Set to None (or 0) to send a message for every archive found.
LIMIT = 1

# Configuration file used when none is given or the given one is missing.
DEFAULT_CONF_FILE = '../resources/producer/tar.ini'


def compute_origin(config, tarpath):
    """Compute the origin.

    Args:
        - config: configuration dict with url_scheme and type keys.
          (This parameter shadows the imported `config` module inside this
          function; the module is not needed here.)
        - tarpath: file's path

    Returns:
        Dictionary origin with keys:
       - url: origin's url
       - type: origin's type

    """
    # NOTE(review): os.path.join discards config['url_scheme'] whenever
    # tarpath is absolute, so the url is then just the local path.
    # Confirm whether tarpath should be made relative to the mirror root.
    return {
        'url': os.path.join(config['url_scheme'], tarpath),
        'type': config['type'],
    }


def _occurrence(tarpath, authority):
    """Build the occurrence of tarpath for the given authority id."""
    return {
        'branch': os.path.dirname(tarpath),
        'authority': authority,
        'validity': time_from_file(tarpath),
    }


def swh_occurrence(tarpath):
    """Compute the occurrence from the tarpath with swh authority.

    Args:
        tarpath: file's path

    Return:
        Occurrence.
        - branch: occurrence's branch name (directory part of tarpath)
        - authority: swh authority
        - validity: modification time of tarpath

    """
    # FIXME: Use the right time for the validity
    return _occurrence(tarpath, SWH_AUTHORITY)


def gnu_occurrence(tarpath):
    """Compute the occurrence from the tarpath with gnu authority.

    Args:
        tarpath: file's path

    Return:
        Occurrence.
        - branch: occurrence's branch name (directory part of tarpath)
        - authority: gnu authority
        - validity: modification time of tarpath

    """
    return _occurrence(tarpath, GNU_AUTHORITY)


def compute_occurrences(tarpath):
    """Compute the occurrences from tarpath.

    Args:
        tarpath: file's path

    Returns:
        list of occurrences from tarpath: the gnu one first, then the swh
        one.

    """
    return [gnu_occurrence(tarpath), swh_occurrence(tarpath)]


def time_from_file(tarpath):
    """Extract time from tarpath.

    Args:
        tarpath: path to the file we want to extract metadata

    Returns:
        Modification time (st_mtime) of tarpath as reported by os.lstat,
        i.e. symbolic links are not followed.

    """
    return os.lstat(tarpath).st_mtime


def compute_release(filename, tarpath):
    """Compute a release from a given tarpath, filename.
    If the filename does not contain a recognizable release number, the
    release can be skipped.

    Args:
        filename: file's name without path
        tarpath: file's absolute path

    Returns:
        None if producer.release_number returns a falsy value for the
        filename.  Otherwise a synthetic release is computed with the
        following keys:
            - name: the release computed from the filename
            - date: the modification timestamp as returned by an lstat call
            - offset: +0000
            - author_name: ''
            - author_email: ''
            - comment: ''

    """
    release_number = producer.release_number(filename)
    if not release_number:
        return None

    return {
        'name': release_number,
        'date': time_from_file(tarpath),
        'offset': UTC_OFFSET,
        'author_name': '',
        'author_email': '',
        'comment': '',
    }


def list_archives_from(path):
    """List the tarballs found below path.

    Args:
        path: top directory to list archives from.

    Yields:
        (dirpath, filename) for every existing file below path that
        tarfile.is_tarfile recognizes.

    """
    for dirpath, _dirnames, filenames in os.walk(path):
        for fname in filenames:
            tarpath = os.path.join(dirpath, fname)
            # The existence check skips entries such as dangling symlinks,
            # which is_tarfile would fail to open.
            if os.path.exists(tarpath) and tarfile.is_tarfile(tarpath):
                yield dirpath, fname


def compute_message_from(conf, dirpath, filename):
    """Post the message to workers.

    Args:
        conf: dictionary holding static metadata
        dirpath: directory containing the filename
        filename: filename without any path

    Returns:
        None

    """
    tarpath = os.path.join(dirpath, filename)

    origin = compute_origin(conf, tarpath)
    occurrences = compute_occurrences(tarpath)
    release = compute_release(filename, tarpath)

    task = tasks.LoadTarRepository()
    task.apply_async((tarpath,
                      origin,
                      REVISION,
                      release,
                      occurrences))


def produce_archive_messages(conf, path):
    """From path, produce archive tarball message to celery.

    At most LIMIT messages are sent; a LIMIT of None or 0 means no cap.

    Args:
        conf: dictionary holding static metadata
        path: top directory to list archives from.

    Returns:
        None

    """
    # islice stops consuming the walk as soon as the cap is reached;
    # a stop of None makes it iterate over every archive.
    archives = itertools.islice(list_archives_from(path), LIMIT or None)
    for dirpath, filename in archives:
        compute_message_from(conf, dirpath, filename)


def main(argv=None):
    """Read the configuration and send the messages.

    Args:
        argv: command line as a list (defaults to sys.argv); argv[1], when
            present, is the path to the configuration file.

    The default configuration file is used when no path is given or when
    the given path does not exist.

    """
    argv = sys.argv if argv is None else argv

    conf_file = argv[1] if len(argv) > 1 else DEFAULT_CONF_FILE
    if not os.path.exists(conf_file):
        conf_file = DEFAULT_CONF_FILE

    conf = config.read(conf_file)
    produce_archive_messages(conf, conf['mirror_root_directory'])


if __name__ == '__main__':
    main()