Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/loader.py
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import os | import os | ||||
import shutil | |||||
from abc import abstractmethod | from abc import abstractmethod | ||||
from tempfile import mkdtemp | from tempfile import mkdtemp | ||||
from swh.core import tarball | from swh.core import tarball | ||||
from swh.loader.core.utils import clean_dangling_folders | from swh.loader.core.utils import clean_dangling_folders | ||||
from swh.loader.core.loader import BufferedLoader | from swh.loader.core.loader import BufferedLoader | ||||
from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE | from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE | ||||
from swh.storage.algos.snapshot import snapshot_get_all_branches | from swh.storage.algos.snapshot import snapshot_get_all_branches | ||||
from swh.model.from_disk import Directory | |||||
from swh.model.identifiers import ( | |||||
identifier_to_bytes, revision_identifier | |||||
) | |||||
from .build_revision import BuildRevision | from .build_revision import BuildRevision | ||||
DEBUG_MODE = '** DEBUG MODE **' | DEBUG_MODE = '** DEBUG MODE **' | ||||
class PackageLoader(BufferedLoader, BuildRevision): | class PackageLoader(BufferedLoader, BuildRevision): | ||||
"""Package loader class for package manager loader | """Package loader class for package manager loader | ||||
▲ Show 20 Lines • Show All 305 Lines • ▼ Show 20 Lines | def uncompress_tarball(self, filepath, path): | ||||
""" | """ | ||||
# filepath = tempdir + url | # filepath = tempdir + url | ||||
try: | try: | ||||
self.tarball_invalid = False | self.tarball_invalid = False | ||||
return tarball.uncompress(filepath, path) | return tarball.uncompress(filepath, path) | ||||
except Exception: | except Exception: | ||||
self.tarball_invalid = True | self.tarball_invalid = True | ||||
return None | return None | ||||
def fetch_data(self):
    """Fetch and process the next release artifact.

    Called once per release artifact version (can be many for one
    release).

    This will for each call:
    - retrieve a release artifact (associated to a release version)
      from ``self.new_versions``
    - compute the swh objects (contents, directories, revision)
    - remove the unpacked package files, even when computing the
      objects fails

    Returns:
        True as long as data to fetch exist

    """
    if self.done:
        return False

    # Only the iterator advance can legitimately raise StopIteration.
    try:
        data = next(self.new_versions)
    except StopIteration:
        self.done = True
        return False
    self._load_status = 'eventful'

    package_source_data, dir_path = data

    # package release tarball was corrupted
    if self.tarball_invalid:
        return not self.done

    dir_path = dir_path.encode('utf-8')
    try:
        directory = Directory.from_disk(path=dir_path, data=True)
        objects = self.check_objects(directory.collect())

        self.package_contents = objects['content'].values()
        self.package_directories = objects['directory'].values()

        revision = self.compute_revision(directory,
                                         package_source_data)
        revision['id'] = identifier_to_bytes(
            revision_identifier(revision))
        self.package_revisions.append(revision)

        self.update_known_version(package_source_data, revision['id'])
    finally:
        # Always drop the unpacked files so a failure on one artifact
        # does not leave its temporary directory behind.
        self.log.debug('Removing unpacked package files at %s', dir_path)
        shutil.rmtree(dir_path)

    return not self.done
def check_objects(self, objects):
    """Check the objects for necessary fields and initialise them if
    not already present.

    Args:
        objects (dict): Collected objects; updated in place so that the
            'content' and 'directory' keys always exist

    Returns:
        dict: The same `objects` mapping, with an empty dict stored
        under each key that was missing

    """
    for object_type in ('content', 'directory'):
        objects.setdefault(object_type, {})
    return objects
def update_known_version(self, package_source_data, revision_id):
    """Register a newly discovered version in `known_versions`.

    The entry is indexed by the value that `package_source_data` holds
    under the field name returned by `get_key()`.

    Args:
        package_source_data (dict): Metadata available for a particular
            package version
        revision_id: Id of the revision computed for that package
            version

    """
    version_field = self.get_key()
    version_id = package_source_data[version_field]
    self.known_versions[version_id] = revision_id
def store_data(self):
    """Send the objects gathered by the last fetch to the storage.

    Contents, directories and revisions are handed over in that order.
    Once every version has been processed (`self.done`), the snapshot
    is generated and loaded, and the remaining buffered objects are
    flushed.

    """
    self.maybe_load_contents(self.package_contents)
    self.maybe_load_directories(self.package_directories)
    self.maybe_load_revisions(self.package_revisions)

    # More versions to come: keep buffering, no snapshot yet.
    if not self.done:
        return

    self.generate_and_load_snapshot()
    self.flush()