diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
--- a/swh/loader/package/loader.py
+++ b/swh/loader/package/loader.py
@@ -1,10 +1,10 @@
-
 # Copyright (C) 2019 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
+import shutil
 
 from abc import abstractmethod
 from tempfile import mkdtemp
@@ -14,6 +14,12 @@
 from swh.loader.core.loader import BufferedLoader
 from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
 from swh.storage.algos.snapshot import snapshot_get_all_branches
+from swh.model.from_disk import Directory
+
+from swh.model.identifiers import (
+    identifier_to_bytes, revision_identifier
+)
+
 
 from .build_revision import BuildRevision
 
@@ -335,3 +341,106 @@
         except Exception:
             self.tarball_invalid = True
             return None
+
+    def fetch_data(self):
+        """Fetch the next release artifact and compute its swh objects.
+
+        Called once per release artifact version (there can be many for
+        one release). Each call:
+
+        - pulls the next ``(package_source_data, dir_path)`` pair from
+          ``self.new_versions``
+        - computes the swh objects (contents, directories, revision) from
+          the unpacked files found at ``dir_path``
+        - removes the unpacked files from disk
+
+        Returns:
+            bool: True as long as data to fetch exist, False otherwise
+
+        """
+        if self.done:
+            return False
+
+        try:
+            data = next(self.new_versions)
+        except StopIteration:
+            self.done = True
+            return False
+
+        self._load_status = 'eventful'
+        package_source_data, dir_path = data
+
+        # package release tarball was corrupted: skip it, nothing to compute
+        if self.tarball_invalid:
+            return not self.done
+
+        dir_path = dir_path.encode('utf-8')
+        try:
+            directory = Directory.from_disk(path=dir_path, data=True)
+            objects = self.check_objects(directory.collect())
+
+            self.package_contents = objects['content'].values()
+            self.package_directories = objects['directory'].values()
+
+            revision = self.compute_revision(directory,
+                                             package_source_data)
+            revision['id'] = identifier_to_bytes(
+                revision_identifier(revision))
+            self.package_revisions.append(revision)
+
+            self.update_known_version(package_source_data, revision['id'])
+        finally:
+            # Always clean up, even when computing the objects failed, so
+            # unpacked files do not pile up on disk
+            self.log.debug('Removing unpacked package files at %s', dir_path)
+            shutil.rmtree(dir_path)
+
+        return not self.done
+
+    def check_objects(self, objects):
+        """Check the objects for necessary fields and initialise them if
+        not already present.
+
+        Args:
+            objects (dict): swh objects as returned by `collect()`; it
+                is updated in place
+
+        Returns:
+            dict: the same `objects`, with the 'content' and 'directory'
+            keys guaranteed to be present
+
+        """
+        objects.setdefault('content', {})
+        objects.setdefault('directory', {})
+        return objects
+
+    def update_known_version(self, package_source_data, revision_id):
+        """Update the `known_versions` variable with new discovered versions
+
+        Args:
+            package_source_data (dict): Metadata available for a particular
+                package version
+            revision_id (bytes): Id of the revision of the focused package
+                version (`fetch_data` passes the output of
+                `identifier_to_bytes`)
+
+        """
+        key = self.get_key()
+        package_key = package_source_data[key]
+        self.known_versions[package_key] = revision_id  # SEE ME
+
+    def store_data(self):
+        """Store fetched data in the database.
+
+        Contents, directories and revisions computed by `fetch_data` are
+        handed to the `maybe_load_*` methods on every call. Once
+        `self.done` is set, the snapshot is generated and loaded as well.
+
+        """
+        self.maybe_load_contents(self.package_contents)
+        self.maybe_load_directories(self.package_directories)
+        self.maybe_load_revisions(self.package_revisions)
+
+        if self.done:
+            self.generate_and_load_snapshot()
+            self.flush()