D1694.id5856.diff
diff --git a/swh/loader/base/__init__.py b/swh/loader/base/__init__.py
new file mode 100644
diff --git a/swh/loader/base/build_revision.py b/swh/loader/base/build_revision.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/build_revision.py
@@ -0,0 +1,104 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime
+from dateutil import parser as date_parser
+from swh.model.identifiers import normalize_timestamp
+
+
+class construct_revision:
+    """Construct a revision from the metadata of a package version.
+
+    The revision for a package version is built from the metadata
+    provided. Several hookpoints can be overridden to suit the needs
+    of a particular package manager.
+
+    """
+
+ UTC_OFFSET = 0
+ SWH_PERSON = {
+ 'name': b'Software Heritage',
+ 'fullname': b'Software Heritage',
+ 'email': b'robot@softwareheritage.org'
+ }
+ REVISION_MESSAGE = b'swh-loader-base: synthetic revision message'
+
+ def modify_revision(self, revision):
+        """Modify the revision after it has been created.
+
+        If the revision of a package manager needs to be in a specific
+        format, this method can be overridden to perform that adjustment
+        instead of overriding `compute_revision()`.
+
+ Args:
+ revision (dict): Created revision
+
+ Returns:
+ dict: Modified revision
+ """
+ return revision
+
+ def compute_revision(self, directory, package_source_data):
+ """Compute a revision.
+
+ Args:
+            directory (from_disk.Directory): Directory object built from
+                the uncompressed tarball
+            package_source_data (dict): Information about the package
+                release version
+
+ Returns:
+ dict: Revision
+
+ """
+ revision = {
+ 'metadata': self.find_metadata(package_source_data),
+ 'date': {
+ 'timestamp': self.find_date(package_source_data),
+ 'offset': self.UTC_OFFSET,
+ },
+ 'committer_date': {
+ 'timestamp': self.find_date(package_source_data),
+ 'offset': self.UTC_OFFSET,
+ },
+ 'author': self.find_author(package_source_data),
+ 'committer': self.find_author(package_source_data),
+ 'type': self.find_type(package_source_data),
+ 'message': self.find_message(package_source_data),
+ 'directory': self.directory(directory),
+ 'synthetic': self.find_synthetic(),
+ 'parents': [],
+ }
+
+ return self.modify_revision(revision)
+
+ def find_synthetic(self):
+ return True
+
+ def find_type(self, package_source_data):
+ return package_source_data['nature']
+
+ def find_message(self, package_source_data):
+ return self.REVISION_MESSAGE
+
+ def directory(self, directory):
+ return directory.hash
+
+ def find_author(self, package_source_data):
+ if 'author' in package_source_data:
+ return package_source_data['author']
+ return self.SWH_PERSON
+
+ def find_metadata(self, package_source_data):
+ return {
+ 'package': package_source_data
+ }
+
+    def find_date(self, package_source_data):
+        try:
+            date = date_parser.parse(package_source_data['date'])
+        except (KeyError, TypeError, ValueError, OverflowError):
+            # no usable date in the metadata; fall back to the visit time
+            date = datetime.now()
+        return normalize_timestamp(int(date.timestamp()))
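The hookpoints above are the intended extension points. As a rough illustration (this mixin and its author normalization are hypothetical, not part of this diff), a package loader could customize revision construction like this:

    from swh.loader.base.build_revision import construct_revision

    class ExampleRevisionMixin(construct_revision):
        # hypothetical subclass, for illustration only

        def find_author(self, package_source_data):
            # normalize a plain-string author into the swh person format
            author = package_source_data.get('author')
            if isinstance(author, str):
                name = author.encode('utf-8')
                return {'name': name, 'fullname': name, 'email': b''}
            return super().find_author(package_source_data)

        def modify_revision(self, revision):
            # annotate the synthetic revision's metadata
            revision['metadata']['provider'] = 'example'
            return revision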
diff --git a/swh/loader/base/dowload.py b/swh/loader/base/dowload.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/dowload.py
@@ -0,0 +1,370 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import time
+import requests
+
+try:
+ from _version import __version__
+except ImportError:
+ __version__ = 'devel'
+
+
+# This file contains classes to detect and skip already archived package
+# versions
+
+
+class If_Modified_Since:
+    """Use the `If-Modified-Since` header to check whether a package
+    version was previously archived.
+
+    This class identifies, and avoids reprocessing, previously archived
+    package versions when the metadata provides no reliable field that
+    could do the job.
+
+    The following operations are performed:
+    - Retrieve known versions and store them in a dict with the tarball
+      `url` as key (can be changed by overriding the `get_key` method)
+    - Check whether the tarballs are present in the known versions:
+      * If a match is found, send a request with the `If-Modified-Since`
+        header to confirm it
+      * If no match is found, send a plain request
+    - Store the response and the visit time for further processing
+    - Instantiate a generator to process each package release version
+
+    """
+ def __init__(self):
+ self.session = requests.session()
+ self.time_last_visit = {}
+ self.params = {
+ 'headers': {
+ 'User-Agent': 'Software Heritage Loader (%s)' % (
+ __version__
+ )
+ }
+ }
+
+ def get_artifact(self, revision):
+        """Fetch the artifact from a revision.
+
+ Args:
+ revision (dict): Previous revision
+
+ Returns:
+ dict: metadata present in the revision
+
+ """
+ return revision['metadata']['package']
+
+ def get_key(self):
+ """Returns the key to be used to identify known revisions
+
+ """
+ return 'url'
+
+ def known_versions(self, last_snapshot):
+ """
+        Retrieve the known release versions for the package
+        (i.e. those already ingested into the archive).
+
+        Args:
+            last_snapshot (dict): Last snapshot for the visit
+
+        Returns:
+            dict: Dict whose keys are urls and whose values are
+                revision ids.
+
+ """
+ if not last_snapshot or 'branches' not in last_snapshot:
+ return {}
+
+        # retrieve only the revisions (we do not want the aliases here)
+ revs = [rev['target']
+ for rev in last_snapshot['branches'].values()
+ if rev and rev['target_type'] == 'revision']
+ known_revisions = self.storage.revision_get(revs)
+ ret = {}
+ key = self.get_key()
+ for revision in known_revisions:
+ if not revision: # revision_get can return None
+ continue
+
+ artifact = self.get_artifact(revision)
+ ret[artifact[key]] = revision['id']
+ self.time_last_visit[artifact[key]] = artifact['time_last_visit']
+ return ret
+
+ def filter_package_versions(self, tarballs, known_versions):
+ """Return the available tarballs that are not previously archived.
+
+ Args:
+ tarballs (list): a list of dicts containing information about the
+ respective tarball that is provided by lister.
+ known_versions (dict): may be provided by the loader, it enables
+ to filter out versions already ingested in the archive.
+
+ Returns:
+ A list of dicts containing information about the respective
+ tarballs that are not previously archived.
+
+ """
+ versions = []
+ key = self.get_key()
+ for release in tarballs:
+ tarball_url = release['url']
+
+ if release[key] in known_versions:
+ tarball_request = self._request(
+ tarball_url,
+ time_last_visit=self.time_last_visit[release[key]],
+ throw_error=False)
+ else:
+ tarball_request = self._request(
+ tarball_url, time_last_visit=None, throw_error=False)
+
+ if tarball_request.status_code == 304:
+ continue
+
+ elif tarball_request.status_code != 200:
+                self.log.debug("Failed to query '%s'. Reason: %s" % (
+ tarball_url, tarball_request.status_code))
+ continue
+
+ new_release = self.update_release_info(release, tarball_request)
+ versions.append(new_release)
+
+ return versions
+
+ def update_release_info(self, release, tarball_request):
+        """Attach the server response and the current visit time to the
+        release information.
+
+        """
+ release['response'] = tarball_request
+ time_now = time.time()
+ time_now = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
+ time.gmtime(time_now))
+ release['time_last_visit'] = time_now
+ return release
+
+ def _request(self, url, time_last_visit=None, throw_error=True):
+ """Request the remote tarball url.
+
+        Args:
+            url (str): Url (file or http*)
+            time_last_visit (str): HTTP date of the last visit; when
+                provided, it is sent as an `If-Modified-Since` header
+            throw_error (bool): unused here; kept for signature parity
+                with `compareField._request`
+
+        Returns:
+            server response
+
+        """
+        headers = dict(self.params['headers'])
+        if time_last_visit:
+            # use a copy of the headers so the conditional header does
+            # not leak into later unconditional requests
+            headers['If-Modified-Since'] = time_last_visit
+        response = self.session.get(url, headers=headers, stream=True)
+        return response
+
+ def prepare_package_versions(self, tarballs, known_versions=None):
+ """
+ Instantiate a generator that will process a specific package released
+ version at each iteration step. The following operations will be
+ performed:
+
+ 1. Create a temporary directory to download and extract the
+ release tarball
+ 2. Download the tarball
+ 3. Uncompress the tarball
+ 4. Parse the file associated to the package version to extract
+ metadata (optional)
+ 5. Delete unnecessary files (optional)
+
+ Args:
+ tarballs (list): a list of dicts containing information about the
+ respective tarball that is provided by lister.
+ known_versions (dict): may be provided by the loader, it enables
+ to filter out versions already ingested in the archive.
+
+ Yields:
+ Tuple[dict, str]: tuples containing the following
+ members:
+
+ * a dict holding package tarball information and metadata
+ * a string holding the path of the uncompressed package to
+ load into the archive
+
+ """
+ new_versions = self.filter_package_versions(tarballs, known_versions)
+ for package_source_data in new_versions:
+ tarball_request = package_source_data['response']
+
+            # drop the response so it does not end up in the revision
+            # metadata
+            del package_source_data['response']
+ yield self._prepare_package_version(package_source_data,
+ tarball_request)
+
+
+class compareField:
+    """Use a field present in the metadata to check whether a package
+    version was previously archived.
+
+    This class identifies, and avoids reprocessing, previously archived
+    package versions using a field provided by the API in the metadata
+    of each package version.
+
+    The following operations are performed:
+    - Retrieve known versions and store them in a dict keyed by the
+      field named in `compare_field`
+    - Check whether the tarballs are present in the known versions
+    - Instantiate a generator to process each package release version
+
+    """
+    compare_field = """Field used to identify if the package
+    version is previously archived"""
+    # e.g. for the PyPI loader, compare_field = 'sha'
+
+ def __init__(self):
+ self.session = requests.session()
+ self.params = {
+ 'headers': {
+ 'User-Agent': 'Software Heritage Loader (%s)' % (
+ __version__
+ )
+ }
+ }
+
+ def get_key(self):
+ """Returns the key to be used to identify known revisions
+
+ """
+ return self.compare_field
+
+ def _request(self, url, throw_error=True):
+ """Request the remote tarball url.
+
+ Args:
+ url (str): Url (file or http*)
+
+ Raises:
+ ValueError in case of failing to query
+
+        Returns:
+            server response
+
+        """
+ response = self.session.get(url, **self.params, stream=True)
+ if response.status_code != 200 and throw_error:
+            raise ValueError("Failed to query '%s'. Reason: %s" % (
+ url, response.status_code))
+ return response
+
+ def known_versions(self, last_snapshot):
+ """
+        Retrieve the known release versions for the package
+        (i.e. those already ingested into the archive).
+
+        Args:
+            last_snapshot (dict): Last snapshot for the visit
+
+        Returns:
+            dict: Dict whose keys are the values of the field chosen for
+                checking archived artifacts, and whose values are
+                revision ids.
+
+ """
+ if not last_snapshot or 'branches' not in last_snapshot:
+ return {}
+
+        # retrieve only the revisions (we do not want the aliases here)
+ revs = [rev['target']
+ for rev in last_snapshot['branches'].values()
+ if rev and rev['target_type'] == 'revision']
+ known_revisions = self.storage.revision_get(revs)
+ ret = {}
+ for revision in known_revisions:
+ if not revision: # revision_get can return None
+ continue
+ artifact = self.artifact_from_revision(revision)
+ ret[artifact[self.compare_field]] = revision['id'] # Check me
+ return ret
+
+ def artifact_from_revision(self, revision):
+        """Extract the artifact from the revision metadata.
+        """
+ # Can be overridden if change in standard revision pattern
+ if 'package' in revision['metadata']:
+ return revision['metadata']['package']
+
+ def filter_package_versions(self, tarballs, known_versions):
+ """
+ Return the available tarballs that are not previously archived.
+
+ Args:
+ tarballs (list): a list of dicts containing information about the
+ respective tarball that is provided by lister.
+ known_versions (dict): may be provided by the loader, it enables
+ to filter out versions already ingested in the archive.
+
+ Returns:
+ A list of dicts containing information about the respective
+ tarballs that are not previously archived.
+
+ """
+ versions = []
+
+ for release in tarballs:
+ if release[self.compare_field] in known_versions:
+ continue
+ versions.append(release)
+
+ return versions
+
+ def prepare_package_versions(self, tarballs, known_versions=None):
+ """
+ Instantiate a generator that will process a specific package released
+ version at each iteration step. The following operations will be
+ performed:
+
+ 1. Create a temporary directory to download and extract the
+ release tarball
+ 2. Download the tarball
+ 3. Uncompress the tarball
+ 4. Parse the file associated to the package version to extract
+ metadata (optional)
+ 5. Delete unnecessary files (optional)
+
+ Args:
+ tarballs (list): a list of dicts containing information about the
+ respective tarball that is provided by lister.
+ known_versions (dict): may be provided by the loader, it enables
+ to filter out versions already ingested in the archive.
+
+ Yields:
+ Tuple[dict, str]: tuples containing the following
+ members:
+
+ * a dict holding package tarball information and metadata
+ * a string holding the path of the uncompressed package to
+ load into the archive
+
+ """
+ new_versions = self.filter_package_versions(tarballs, known_versions)
+ for package_source_data in new_versions:
+            # filter out versions with a missing tarball; the package
+            # visit will be marked as partial at the end of the loading
+            # process
+
+ tarball_url = package_source_data['url']
+ tarball_request = self._request(tarball_url,
+ throw_error=False)
+ if tarball_request.status_code == 404:
+ self.log.debug('Tarball url %s returns a 404 error.',
+ tarball_url)
+ continue
+
+ yield self._prepare_package_version(package_source_data,
+ tarball_request)
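The If_Modified_Since class builds on the conditional-GET support of requests. A minimal standalone sketch of that mechanism, with a placeholder url:

    import time
    import requests

    session = requests.session()
    url = 'https://example.org/pkg/pkg-1.0.tar.gz'  # placeholder

    first = session.get(url, stream=True)  # body would be streamed here
    # remember the visit time, in HTTP date format
    time_last_visit = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                    time.gmtime())

    # on a later visit, an unchanged tarball yields a 304 with no body
    second = session.get(url, stream=True,
                         headers={'If-Modified-Since': time_last_visit})
    if second.status_code == 304:
        print('not modified since the last visit; skip re-archiving')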
diff --git a/swh/loader/base/loader.py b/swh/loader/base/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/loader.py
@@ -0,0 +1,471 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import shutil
+
+from abc import abstractmethod
+from tempfile import mkdtemp
+
+from swh.core import tarball
+from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.core.loader import BufferedLoader
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.storage.algos.snapshot import snapshot_get_all_branches
+from swh.model.from_disk import Directory
+
+from .build_revision import construct_revision
+
+from swh.model.identifiers import (
+ identifier_to_bytes, revision_identifier, snapshot_identifier
+)
+
+DEBUG_MODE = '** DEBUG MODE **'
+
+
+class BaseLoader(BufferedLoader, construct_revision):
+    """Base class for package manager loaders.
+
+    Required overrides:
+        loader_name
+        class_name
+        convert_to_standard_format()
+
+    Optional overrides:
+        cleanup_artifact()
+        extract_metadata()
+
+    """
+
+    loader_name = """Name of the package manager"""  # e.g. pypi
+    class_name = """Name of the loader class"""  # e.g. PyPILoader
+
+ def __init__(self):
+ self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name
+ self.CONFIG_BASE_FILENAME = 'loader/%s' % self.loader_name
+
+ self.ADDITIONAL_CONFIG = {
+ 'temp_directory': ('str',
+ '/tmp/swh.loader.%s/' % self.loader_name),
+ 'debug': ('bool', False), # NOT FOR PRODUCTION
+ }
+ super().__init__(logging_class='swh.loader.%s.%s' % (self.loader_name,
+ self.class_name))
+
+ self.local_cache = None
+ self.dir_path = None
+
+ temp_directory = self.config['temp_directory']
+ os.makedirs(temp_directory, exist_ok=True)
+
+ self.temp_directory = mkdtemp(
+ suffix='-%s' % os.getpid(),
+ prefix=self.TEMPORARY_DIR_PREFIX_PATTERN,
+ dir=temp_directory)
+
+ self.debug = self.config.get('debug', False)
+
+ @abstractmethod
+ def convert_to_standard_format(self, **kwargs):
+        """Fetch the metadata and convert it into a standard format.
+
+        The standard format is a dict with the keys:
+            `name` (str): Name of the package
+            `origin_url` (str): Origin url of the package
+            `tarballs` (list): A list of dicts, each of which holds
+                information about a single version of the package. The
+                `url` key (the tarball url) is mandatory; other keys are
+                optional, depending on the availability of metadata.
+
+        Note: the keys `nature` and `response` are reserved and cannot
+        be used in the dicts under the `tarballs` key.
+
+        Args:
+            **kwargs: Arbitrary keyword arguments passed by the lister.
+
+        Returns:
+            dict: Containing the information as per the guidelines
+                mentioned above
+
+        Example:
+            {
+                'name': '8sync',
+                'origin_url': 'https://ftp.gnu.org/gnu/8sync/',
+                'tarballs': [
+                    {'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+                     'time_modified': 1562878592},
+                    {'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+                     'time_modified': 1599887203},
+                    ...
+                ]
+            }
+
+ """
+ pass
+
+ def cleanup_artifact(self, uncompressed_path):
+        """Clean up unnecessary files from the downloaded tarball, and
+        perform any special operations if needed.
+
+        The implementation of this method depends on the file structure
+        of the tarball. It is used to remove files from the uncompressed
+        tarball that are not meant to be archived (e.g. binaries).
+
+ Args:
+ uncompressed_path (str): Path of uncompressed tarball
+
+ Returns:
+ uncompressed_path (str): Path of uncompressed tarball after
+ removing unnecessary files
+
+ """
+ return uncompressed_path
+
+ def extract_metadata(self, package_path, package_source_data):
+ """Fetch the metadata from the downloaded file.
+
+ """
+ return package_source_data
+
+ def prepare_origin_visit(self, **kwargs):
+ """Prepare package visit.
+
+ Args:
+ **kwargs: Arbitrary keyword arguments passed by the lister.
+
+ """
+ # reset statuses
+ self._load_status = 'uneventful'
+ self._visit_status = 'full'
+ self.done = False
+        # fetch the package metadata and convert it to the standard format
+ self.package_details = self.convert_to_standard_format(**kwargs)
+ self.set_origin()
+ self.visit_date = None # loader core will populate it
+
+ def set_origin(self):
+ """Assign value to self.origin
+ """
+ self.origin = {
+ 'url': self.package_details['origin_url'],
+ 'type': self.loader_name,
+ }
+
+ def prepare(self, **kwargs):
+ """Prepare effective loading of source tarballs for a package manager
+ package.
+
+ Args:
+ **kwargs: Arbitrary keyword arguments passed by the lister.
+
+ """
+ self.package_contents = []
+ self.package_directories = []
+ self.package_revisions = []
+ self.package_source_data = []
+ self.package_temp_dir = os.path.join(self.temp_directory,
+ self.package_details['name'])
+
+ last_snapshot = self.last_snapshot()
+        # stored under a different name so that the known_versions()
+        # method is not shadowed by its own result
+        self.known_artifacts = self.known_versions(last_snapshot)
+
+        self.new_artifacts = self.prepare_package_versions(
+            self.package_details['tarballs'], self.known_artifacts)
+
+ def last_snapshot(self):
+ """Retrieve the last snapshot of the package if any.
+
+ """
+ visit = self.storage.origin_visit_get_latest(
+ self.origin['url'], require_snapshot=True)
+ if visit:
+ return snapshot_get_all_branches(self.storage, visit['snapshot'])
+
+ def _prepare_package_version(self, package_source_data, tarball_request):
+ """Process the package release version.
+
+ The following operations are performed:
+
+ 1. Download the tarball
+ 2. Uncompress the tarball
+ 3. Delete unnecessary files (optional)
+ 4. Parse the file associated to the package version to extract
+ metadata (optional)
+
+ Args:
+            package_source_data (dict): information about the tarball,
+                as provided by the lister
+            tarball_request (Response): server response for the tarball
+                url
+
+ Return:
+ Tuple[dict, str]: tuples containing the following
+ members:
+
+ * a dict holding package tarball information and metadata
+ * a string holding the path of the uncompressed package to
+ load into the archive
+
+ """
+ url = package_source_data['url']
+ tarball_path, hashes = self.download_generate_hash(tarball_request,
+ url)
+ uncompressed_path = os.path.join(self.package_temp_dir, 'uncompressed',
+ os.path.basename(url)) # SEE ME
+ package_source_data['nature'] = self.uncompress_tarball(
+ tarball_path, uncompressed_path)
+
+ # remove tarball
+ os.remove(tarball_path)
+
+ if self.tarball_invalid:
+ return None, None
+
+ package_path = self.cleanup_artifact(uncompressed_path)
+ package_source_data = self.extract_metadata(package_path,
+ package_source_data)
+ self.package_source_data.append(package_source_data)
+ return package_source_data, package_path
+
+    def download_generate_hash(self, response, url):
+        """Store the file in a temp directory and compute the hashes of
+        its content.
+
+ Args:
+ response (Response): Server response of the url
+ url (str): Url of the tarball
+
+ Returns:
+ Tuple of local (filepath, hashes of filepath)
+
+ """
+ length = int(response.headers['content-length'])
+ os.makedirs(self.package_temp_dir, exist_ok=True)
+ # SEE ME
+ filepath = os.path.join(self.package_temp_dir, os.path.basename(url))
+
+ h = self.write_file(filepath, length, response)
+
+ self.check_file(filepath, length)
+
+ hashes = {
+ 'length': length,
+ **h.hexdigest()
+ }
+ return filepath, hashes
+
+ def write_file(self, filepath, length, response):
+        """Write the response content to `filepath`, feeding a MultiHash
+        along the way.
+
+        """
+ h = MultiHash(length=length)
+ with open(filepath, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+ h.update(chunk)
+ f.write(chunk)
+ return h
+
+    def check_file(self, filepath, length):
+        """Check that the size of the file on disk matches the expected
+        content length.
+
+        """
+ actual_length = os.path.getsize(filepath)
+ if length != actual_length:
+ raise ValueError('Error when checking size: %s != %s' % (
+ length, actual_length))
+
+ def uncompress_tarball(self, filepath, path):
+ """Uncompress a tarball.
+
+ Args:
+ filepath (str): Path of tarball to uncompress
+ path (str): The destination folder where to uncompress the tarball
+ Returns:
+ The nature of the tarball, zip or tar.
+
+ """
+ try:
+ self.tarball_invalid = False
+ return tarball.uncompress(filepath, path)
+ except Exception:
+ self.tarball_invalid = True
+ return None
+
+ def fetch_data(self):
+ """Called once per release artifact version (can be many for one
+ release).
+
+        Each call will:
+        - retrieve a release artifact (associated to a release version)
+        - compute the swh objects
+
+ Returns:
+ True as long as data to fetch exist
+
+ """
+ data = None
+ if self.done:
+ return False
+
+ try:
+ data = next(self.new_artifacts)
+ self._load_status = 'eventful'
+ except StopIteration:
+ self.done = True
+ return False
+
+ package_source_data, dir_path = data
+
+ # package release tarball was corrupted
+ if self.tarball_invalid:
+ return not self.done
+
+ dir_path = dir_path.encode('utf-8')
+ directory = Directory.from_disk(path=dir_path, data=True)
+ objects = directory.collect()
+
+ objects = self.check_objects(objects)
+
+ self.package_contents = objects['content'].values()
+ self.package_directories = objects['directory'].values()
+
+ revision = self.compute_revision(directory,
+ package_source_data)
+
+ revision['id'] = identifier_to_bytes(
+ revision_identifier(revision))
+ self.package_revisions.append(revision)
+
+ self.update_known_version(package_source_data, revision)
+
+ self.log.debug('Removing unpacked package files at %s', dir_path)
+ shutil.rmtree(dir_path)
+
+ return not self.done
+
+ def update_known_version(self, package_source_data, revision):
+        """Update the known artifacts with the newly created revision.
+
+        """
+        key = self.get_key()
+        package_key = package_source_data[key]
+        self.known_artifacts[package_key] = revision['id']  # SEE ME
+
+ def check_objects(self, objects):
+        """Ensure the `content` and `directory` keys are present in the
+        collected objects.
+        """
+ if 'content' not in objects:
+ objects['content'] = {}
+ if 'directory' not in objects:
+ objects['directory'] = {}
+ return objects
+
+ def store_data(self):
+ """Store fetched data in the database.
+
+ """
+ self.maybe_load_contents(self.package_contents)
+ self.maybe_load_directories(self.package_directories)
+ self.maybe_load_revisions(self.package_revisions)
+
+ if self.done:
+ self.generate_and_load_snapshot()
+ self.flush()
+
+ def generate_and_load_snapshot(self):
+ """Generate and load snapshot for the package visit.
+
+ """
+ snapshot = {
+ 'branches': self.generate_branches(),
+ }
+ snapshot['id'] = identifier_to_bytes(snapshot_identifier(snapshot))
+ self.maybe_load_snapshot(snapshot)
+
+ def generate_branches(self):
+        """Build the snapshot branches from the processed versions.
+        """
+        branches = {}
+        key = self.get_key()
+        for version in self.package_source_data:
+            branch_name = self.branch_name(version)
+            target = self.target_from_version(version[key])
+            branches[branch_name] = target
+            branches = self.find_head(branches, branch_name)
+            # How to find HEAD and branch name?
+
+            if not target:
+                self._visit_status = 'partial'
+
+ return branches
+
+ def find_head(self, branches, branch_name):
+        """Attach a HEAD alias to the branches.
+        """
+        if True:  # I don't know what to do here
+ branches[b'HEAD'] = {
+ 'target_type': 'alias',
+ 'target': branch_name,
+ }
+ return branches
+
+ def target_from_version(self, key_value):
+        """Build a snapshot target from a known revision id, if any.
+        """
+        target = self.known_artifacts.get(key_value)
+ return {
+ 'target': target,
+ 'target_type': 'revision',
+ } if target else None
+
+ def branch_name(self, version):
+ """Find branch name
+
+ Args:
+ version (dict): All the data related to a particular version
+
+ Returns:
+ Branch name encoded in ascii
+
+ """
+ # How to tackle this
+ pass
+
+ def pre_cleanup(self):
+ """To prevent disk explosion if some other workers exploded
+ in mid-air (OOM killed), we try and clean up dangling files.
+
+ """
+        if self.debug:
+            self.log.warning('%s Will not pre-clean up temp dir %s' % (
+ DEBUG_MODE, self.temp_directory
+ ))
+ return
+ clean_dangling_folders(self.config['temp_directory'],
+ pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
+ log=self.log)
+
+ def cleanup(self):
+ """Clean up temporary disk use after downloading and extracting
+ package tarballs.
+
+ """
+        if self.debug:
+            self.log.warning('%s Will not clean up temp dir %s' % (
+ DEBUG_MODE, self.temp_directory
+ ))
+ return
+ if os.path.exists(self.temp_directory):
+ self.log.debug('Clean up %s' % self.temp_directory)
+ shutil.rmtree(self.temp_directory)
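For orientation, BufferedLoader drives the fetch_data()/store_data() pair defined above; its load() entry point follows roughly this control flow (a simplified sketch of the swh-loader-core behaviour, not a verbatim copy of it):

    # simplified sketch, assuming the BufferedLoader contract
    def load(self, **kwargs):
        self.pre_cleanup()
        self.prepare_origin_visit(**kwargs)
        self.prepare(**kwargs)
        while True:
            more_data = self.fetch_data()  # one release artifact per call
            self.store_data()              # loads the snapshot once done
            if not more_data:
                break
        self.cleanup()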
diff --git a/swh/loader/base/tests/__init__.py b/swh/loader/base/tests/__init__.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_build_revision.py b/swh/loader/base/tests/test_build_revision.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/base/tests/test_build_revision.py
@@ -0,0 +1,41 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+from unittest.mock import Mock
+
+from dateutil import parser as date_parser
+
+from swh.loader.base import build_revision as build
+from swh.model.identifiers import normalize_timestamp
+
+
+class TestConstructRevision(unittest.TestCase):
+    def test_compute_revision(self):
+        builder = build.construct_revision()
+        package_source_data = {
+            'nature': 'tar',
+            'date': '2019-01-01T00:00:00+00:00',
+        }
+        # construct_revision.directory() reads the `hash` attribute of
+        # the from_disk.Directory object; a mock stands in for it here
+        directory = Mock(hash=b'some-dir-hash')
+
+        actual_revision = builder.compute_revision(directory,
+                                                   package_source_data)
+
+        expected_date = normalize_timestamp(int(
+            date_parser.parse(package_source_data['date']).timestamp()))
+        expected_revision = {
+            'metadata': {'package': package_source_data},
+            'date': {
+                'timestamp': expected_date,
+                'offset': builder.UTC_OFFSET,
+            },
+            'committer_date': {
+                'timestamp': expected_date,
+                'offset': builder.UTC_OFFSET,
+            },
+            'author': builder.SWH_PERSON,
+            'committer': builder.SWH_PERSON,
+            'type': 'tar',
+            'message': builder.REVISION_MESSAGE,
+            'directory': b'some-dir-hash',
+            'synthetic': True,
+            'parents': [],
+        }
+
+        self.assertEqual(actual_revision, expected_revision)
diff --git a/swh/loader/base/tests/test_download.py b/swh/loader/base/tests/test_download.py
new file mode 100644
diff --git a/swh/loader/base/tests/test_loader.py b/swh/loader/base/tests/test_loader.py
new file mode 100644
diff --git a/swh/loader/cran/loader.py b/swh/loader/cran/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/cran/loader.py
@@ -0,0 +1,18 @@
+from swh.loader.base.loader import BaseLoader
+from swh.loader.base.dowload import If_Modified_Since
+
+
+class CRANLoader(BaseLoader, If_Modified_Since):
+ loader_name = 'cran'
+ class_name = 'CRANLoader'
+
+ def __init__(self):
+ BaseLoader.__init__(self)
+ If_Modified_Since.__init__(self)
+
+ def convert_to_standard_format(self, **kwargs):
+ return {
+ 'name': kwargs['name'],
+ 'origin_url': kwargs['origin_url'],
+ 'tarballs': kwargs['tarballs']
+ }
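Assuming BufferedLoader.load() forwards its keyword arguments to prepare_origin_visit() and prepare(), as the methods above expect, a hypothetical invocation could look like this (all argument values are illustrative):

    loader = CRANLoader()
    loader.load(
        name='A3',
        origin_url='https://cran.r-project.org/package=A3',
        tarballs=[
            {'url': 'https://cran.r-project.org/src/contrib/A3_1.0.0.tar.gz'},
        ])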