D1694.id6019.diff

diff --git a/swh/loader/package/__init__.py b/swh/loader/package/__init__.py
new file mode 100644
diff --git a/swh/loader/package/dowload.py b/swh/loader/package/dowload.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/dowload.py
@@ -0,0 +1,379 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import requests
+import time
+
+try:
+ from _version import __version__
+except ImportError:
+ __version__ = 'devel'
+
+
+# This file contains classes to detect and filter out previously archived package versions
+
+
+class IfModifiedSince:
+ """Uses the If-Modified-Since header to check whether a package version
+ was previously archived.
+
+ This class is used to identify and avoid the reprocessing of
+ previously archived package versions when the metadata provides no
+ reliable field that could serve that purpose.
+
+ It uses the If-Modified-Since header to find out whether the file has
+ changed since the last visit.
+
+ The following operations are performed:
+ - Retrieve known versions and store them in a dict with the tarball
+ `url` as key (can be changed by overriding the `get_key` method)
+ - Check whether the tarballs are present in the known versions:
+ * If a match is found, send a request with the If-Modified-Since
+ header to confirm the match
+ * If no match is found, send a plain request
+ - Store the response and the request time for further processing
+ - Instantiate a generator to process a specific package release
+ version
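+
+ Example (hypothetical mixin usage; `GNULoader` and its attributes are
+ illustrative and not part of this diff):
+
+ class GNULoader(IfModifiedSince, PackageLoader):
+ loader_name = 'gnu'
+ class_name = 'GNULoader'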
+
+ """
+ def __init__(self):
+ self.session = requests.session()
+ self.time_last_visit = {}
+ self.params = {
+ 'headers': {
+ 'User-Agent': 'Software Heritage Loader (%s)' % (
+ __version__
+ )
+ }
+ }
+
+ def get_artifact(self, revision):
+ """Fetch artifact from revision
+
+ Args:
+ revision (dict): Previous revision
+
+ Returns:
+ dict: metadata present in the revision
+
+ """
+ return revision['metadata']['package']
+
+ def get_key(self):
+ """Returns the key to be used to identify known revisions
+
+ """
+ return 'url'
+
+ def get_known_versions(self, last_snapshot):
+ """Retrieve the known release versions for the package
+ (i.e. those already ingested into the archive).
+
+ Args:
+ last_snapshot (dict): Last snapshot for the visit
+
+ Returns:
+ dict: Dict whose keys are tarball urls and whose values are
+ revision ids.
+
+ """
+ if not last_snapshot or 'branches' not in last_snapshot:
+ return {}
+
+ # retrieve only revisions (e.g. we do not want aliases here)
+ revs = [rev['target']
+ for rev in last_snapshot['branches'].values()
+ if rev and rev['target_type'] == 'revision']
+ known_revisions = self.storage.revision_get(revs)
+ ret = {}
+ key = self.get_key()
+ for revision in known_revisions:
+ if not revision: # revision_get can return None
+ continue
+
+ artifact = self.get_artifact(revision)
+ ret[artifact[key]] = revision['id']
+ self.time_last_visit[artifact[key]] = artifact['time_last_visit']
+ return ret
+
+ def filter_package_versions(self, tarballs, known_versions):
+ """Find the available tarballs that were not previously archived.
+
+ Args:
+ tarballs (list): a list of dicts containing information about
+ each tarball, as provided by the lister.
+ known_versions (dict): may be provided by the loader; it is used
+ to filter out versions already ingested in the archive.
+
+ Returns:
+ A list of dicts containing information about the tarballs
+ that were not previously archived.
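+
+ Example (illustrative values; actual responses depend on the
+ server):
+
+ tarballs = [{'url': 'https://example.org/pkg-0.1.tar.gz'}]
+ new = self.filter_package_versions(tarballs, known_versions={})
+ # new == [{'url': 'https://example.org/pkg-0.1.tar.gz',
+ # 'response': <Response [200]>,
+ # 'time_last_visit': 'Thu, 11 Jul 2019 09:00:00 GMT'}]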
+
+ """
+ versions = []
+ key = self.get_key()
+ for release in tarballs:
+ tarball_url = release['url']
+
+ if release[key] in known_versions:
+ tarball_request = self._request(
+ tarball_url,
+ time_last_visit=self.time_last_visit[release[key]],
+ throw_error=False)
+ else:
+ tarball_request = self._request(
+ tarball_url, time_last_visit=None, throw_error=False)
+
+ if tarball_request.status_code == 304:
+ continue
+
+ elif tarball_request.status_code != 200:
+ self.log.debug("Fail to query '%s'. Reason: %s" % (
+ tarball_url, tarball_request.status_code))
+ continue
+
+ new_release = self.update_release_info(release, tarball_request)
+ versions.append(new_release)
+
+ return versions
+
+ def update_release_info(self, release, tarball_request):
+ """Update metadata of the package version with time_last_visit and the
+ server response
+
+ Args:
+ release (dict): Metadata of the focused package version
+ tarball_request (Request): Server response of the tarball url
+
+ Returns:
+ dict: release with updated information
+
+ """
+ release['response'] = tarball_request
+ time_now = time.time()
+ time_now = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
+ time.gmtime(time_now))
+ release['time_last_visit'] = time_now
+ return release
+
+ def _request(self, url, time_last_visit=None, throw_error=True):
+ """Request the remote tarball url.
+
+ Args:
+ url (str): Url (file or http*)
+ time_last_visit (str): HTTP date of the last visit, sent as the
+ If-Modified-Since header value (e.g.
+ 'Thu, 11 Jul 2019 09:00:00 GMT')
+
+ Returns:
+ server response
+
+ """
+ if time_last_visit:
+ self.params['headers']['If-Modified-Since'] = time_last_visit
+ else:
+ # do not let a previous If-Modified-Since header leak into
+ # unconditional requests
+ self.params['headers'].pop('If-Modified-Since', None)
+ response = self.session.get(url, **self.params, stream=True)
+ return response
+
+ def prepare_package_versions(self, tarballs, known_versions=None):
+ """
+ Instantiate a generator that will process a specific package release
+ version at each iteration step. The following operations will be
+ performed:
+
+ 1. Create a temporary directory to download and extract the
+ release tarball
+ 2. Download the tarball
+ 3. Uncompress the tarball
+ 4. Parse the file associated to the package version to extract
+ metadata (optional)
+ 5. Delete unnecessary files (optional)
+
+ Args:
+ tarballs (list): a list of dicts containing information about the
+ respective tarball that is provided by lister.
+ known_versions (dict): may be provided by the loader; it enables
+ filtering out versions already ingested in the archive.
+
+ Yields:
+ Tuple[dict, str]: tuples containing the following
+ members:
+
+ * a dict holding package tarball information and metadata
+ * a string holding the path of the uncompressed package to
+ load into the archive
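+
+ Example (illustrative):
+
+ for package_source_data, path in self.prepare_package_versions(
+ tarballs, known_versions):
+ ... # ingest the tree at `path` into the archive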
+
+ """
+ new_versions = self.filter_package_versions(tarballs, known_versions)
+ for package_source_data in new_versions:
+ tarball_request = package_source_data['response']
+
+ # To make things simple while creating revisions
+ del package_source_data['response']
+ yield self._prepare_package_version(package_source_data,
+ tarball_request)
+
+
+class CompareField:
+ """Uses a field present in the metadata to check whether the package
+ version was previously archived.
+
+ This class is used to identify and avoid the reprocessing of
+ previously archived package versions, using a field provided by the
+ API as part of the package version metadata.
+
+ The following operations are performed:
+ - Retrieve known versions and store them in a dict keyed by the
+ field named in `compare_field`
+ - Check whether the tarballs are present in the known versions
+ - Instantiate a generator to process a specific package release
+ version
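+
+ Example (hypothetical subclass; the field name is illustrative):
+
+ class PyPILoader(CompareField, PackageLoader):
+ compare_field = 'sha'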
+
+ """
+ compare_field = None
+ """Field used to identify if the package version is previously archived"""
+ # e.g. for the PyPI loader, compare_field = 'sha'
+
+ def __init__(self):
+ self.session = requests.session()
+ self.params = {
+ 'headers': {
+ 'User-Agent': 'Software Heritage Loader (%s)' % (
+ __version__
+ )
+ }
+ }
+
+ def get_key(self):
+ """Returns the key to be used to identify known revisions.
+
+ """
+ return self.compare_field
+
+ def _request(self, url, throw_error=True):
+ """Request the remote tarball url.
+
+ Args:
+ url (str): Url (file or http*).
+
+ Raises:
+ ValueError in case of failing to query.
+
+ Returns:
+ server response
+
+ """
+ response = self.session.get(url, **self.params, stream=True)
+ if response.status_code != 200 and throw_error:
+ raise ValueError("Fail to query '%s'. Reason: %s" % (
+ url, response.status_code))
+ return response
+
+ def get_known_versions(self, last_snapshot):
+ """
+ Retrieve the known release versions for the package
+ (i.e. those already ingested into the archive).
+
+ Args:
+ last_snapshot (dict): Last snapshot for the visit.
+
+ Returns:
+ dict: Dict whose key is the value of field chosen for
+ checking archived artifacts and values are revision ids.
+
+ """
+ if not last_snapshot or 'branches' not in last_snapshot:
+ return {}
+
+ # retrieve only revisions (e.g. we do not want aliases here)
+ revs = [rev['target']
+ for rev in last_snapshot['branches'].values()
+ if rev and rev['target_type'] == 'revision']
+ known_revisions = self.storage.revision_get(revs)
+ ret = {}
+ for revision in known_revisions:
+ if not revision: # revision_get can return None
+ continue
+ artifact = self.artifact_from_revision(revision)
+ ret[artifact[self.compare_field]] = revision['id'] # Check me
+ return ret
+
+ def artifact_from_revision(self, revision):
+ """Find artifacts from the revision.
+
+ """
+ # Can be overridden if change in standard revision pattern
+ if 'package' in revision['metadata']:
+ return revision['metadata']['package']
+
+ def filter_package_versions(self, tarballs, known_versions):
+ """
+ Return the available tarballs that are not previously archived.
+
+ Args:
+ tarballs (list): a list of dicts containing information about
+ each tarball, as provided by the lister.
+ known_versions (dict): may be provided by the loader; it enables
+ filtering out versions already ingested in the archive.
+
+ Returns:
+ A list of dicts containing information about the respective
+ tarballs that are not previously archived.
+
+ """
+ versions = []
+
+ for release in tarballs:
+ if release[self.compare_field] in known_versions:
+ continue
+ versions.append(release)
+
+ return versions
+
+ def prepare_package_versions(self, tarballs, known_versions=None):
+ """
+ Instantiate a generator that will process a specific package release
+ version at each iteration step. The following operations will be
+ performed:
+
+ 1. Create a temporary directory to download and extract the
+ release tarball.
+ 2. Download the tarball.
+ 3. Uncompress the tarball.
+ 4. Parse the file associated to the package version to extract
+ metadata (optional).
+ 5. Delete unnecessary files (optional).
+
+ Args:
+ tarballs (list): a list of dicts containing information about
+ each tarball, as provided by the lister.
+ known_versions (dict): may be provided by the loader; it enables
+ filtering out versions already ingested in the archive.
+
+ Yields:
+ Tuple[dict, str]: tuples containing the following
+ members:
+
+ * a dict holding package tarball information and metadata
+ * a string holding the path of the uncompressed package to
+ load into the archive
+
+ """
+ if known_versions is None:
+ known_versions = {}
+ new_versions = self.filter_package_versions(tarballs, known_versions)
+ for package_source_data in new_versions:
+ # filter out versions with a missing tarball; the package
+ # visit will be marked as partial at the end of the loading
+ # process
+
+ tarball_url = package_source_data['url']
+ tarball_request = self._request(tarball_url,
+ throw_error=False)
+ if tarball_request.status_code == 404:
+ self.log.debug('Tarball url %s returned a 404 error.',
+ tarball_url)
+ continue
+
+ yield self._prepare_package_version(package_source_data,
+ tarball_request)
diff --git a/swh/loader/package/loader.py b/swh/loader/package/loader.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/loader.py
@@ -0,0 +1,518 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import os
+import shutil
+
+from abc import abstractmethod
+from tempfile import mkdtemp
+
+from swh.core import tarball
+from swh.loader.core.utils import clean_dangling_folders
+from swh.loader.core.loader import BufferedLoader
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+from swh.storage.algos.snapshot import snapshot_get_all_branches
+from swh.model.from_disk import Directory
+
+from .revision import ConstructRevision
+
+from swh.model.identifiers import (
+ identifier_to_bytes, revision_identifier, snapshot_identifier
+)
+
+DEBUG_MODE = '** DEBUG MODE **'
+
+
+class PackageLoader(BufferedLoader, ConstructRevision):
+ """Base class for package manager loaders
+
+ A loader is a component of the Software Heritage architecture
+ responsible for reading a source code origin and adding new file
+ contents to the object storage and new repository structures to the
+ storage database.
+
+ The task of a loader is broadly similar across package managers:
+ notably, it includes querying an API to get metadata, retrieving the
+ package source code, and ingesting it into the archive.
+
+ The steps involved in the ingestion of package source code are
+ automated by this class, which eases the creation of new loaders.
+
+ Querying the API and obtaining the metadata for a package is done
+ separately for each new loader, by overriding the
+ :func:`convert_to_standard_format` method. That method returns all
+ the information about the package in a specific format, after which
+ downloading, decompressing, and creating and loading snapshots are
+ automated by this class.
+
+ Required overrides:
+ loader_name
+ class_name
+ def convert_to_standard_format
+
+ Optional overrides:
+ def cleanup_artifact
+ def extract_metadata
+
+ """
+
+ loader_name = None
+ """Package manager name""" # e.g. pypi
+ class_name = None
+ """Loader class name""" # e.g. PyPILoader
+
+ def __init__(self):
+ self.TEMPORARY_DIR_PREFIX_PATTERN = 'swh.loader.%s.' % self.loader_name
+
+ super().__init__(logging_class='swh.loader.%s.%s' % (self.loader_name,
+ self.class_name))
+
+ self.local_cache = None
+ self.dir_path = None
+
+ temp_directory = self.config['temp_directory']
+ os.makedirs(temp_directory, exist_ok=True)
+
+ self.temp_directory = mkdtemp(
+ suffix='-%s' % os.getpid(),
+ prefix=self.TEMPORARY_DIR_PREFIX_PATTERN,
+ dir=temp_directory)
+
+ self.debug = self.config.get('debug', False)
+
+ @abstractmethod
+ def convert_to_standard_format(self, kwargs):
+ """Fetch the metadata and convert it into a standard format
+
+ This method serves two purposes:
+ * Make an API call to get the package versions and metadata (if
+ needed)
+ * Convert the information received from the lister and the API call
+ into a standard format
+
+ The standard format is a dict with the keys
+ `name` (str): Name of the package
+ `origin_url` (str): Origin url of the package
+ `tarballs` (list): A list of dicts where each dict contains
+ information related to a single version of the
+ package. The `url` key in the dict is mandatory and holds
+ the tarball url. Other keys are optional, depending on the
+ availability of metadata.
+
+ Note: The keys `nature` and `response` are reserved and cannot be
+ used in the dicts present under the `tarballs` key.
+
+ Args:
+ kwargs (dict): Dict of arbitrary keyword arguments passed by
+ the lister.
+
+ Returns:
+ dict: Containing information following the guidelines
+ described above
+
+ Example:
+ {
+ 'name': '8sync',
+ 'origin_url': 'https://ftp.gnu.org/gnu/8sync/',
+ 'tarballs': [
+ {'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz',
+ 'time_modified': 1562878592},
+ {'url': 'https://ftp.gnu.org/gnu/8sync/8sync-0.2.0.tar.gz',
+ 'time_modified': 1599887203},
+ ...
+ ]
+ }
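+
+ A minimal override might look as follows (a sketch; the GNU urls
+ and the `fetch_gnu_versions` helper are hypothetical, not part of
+ this diff):
+
+ def convert_to_standard_format(self, kwargs):
+ name = kwargs['name']
+ return {
+ 'name': name,
+ 'origin_url': 'https://ftp.gnu.org/gnu/%s/' % name,
+ 'tarballs': fetch_gnu_versions(name),
+ }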
+
+ """
+ pass
+
+ def cleanup_artifact(self, uncompressed_path):
+ """Clean up unnecessary files from the downloaded tarball
+ also some special operation if needed.
+
+ Implementation of this method depends on the file structure of the
+ tarball. It is used to clean up files from the uncompressed tarball
+ that are not to be archived(eg binaries).
+
+ Args:
+ uncompressed_path (str): Path of uncompressed tarball
+
+ Returns:
+ uncompressed_path (str): Path of uncompressed tarball after
+ removing unnecessary files
+
+ """
+ return uncompressed_path
+
+ def extract_metadata(self, package_path, package_source_data):
+ """Fetch the metadata from the downloaded file.
+
+ Override this method to perform metadata extraction for each version
+ of a package from the uncompressed package, by parsing the file
+ containing the metadata (package.json, PKG-INFO, ...).
+ Add the extracted metadata to the given `package_source_data`, which
+ contains information related to the focused package version.
+
+ Args:
+ package_path (str): Uncompressed package version path
+ package_source_data (dict): Information about the focused package
+ version.
+
+ Returns:
+ dict: Updated information about the focused package
+ version.
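+
+ A sketch of an override parsing a PKG-INFO style file (the file
+ name and parsing are assumptions for illustration):
+
+ def extract_metadata(self, package_path, package_source_data):
+ pkg_info = os.path.join(package_path, 'PKG-INFO')
+ if os.path.exists(pkg_info):
+ with open(pkg_info) as f:
+ package_source_data['pkg_info'] = f.read()
+ return package_source_data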
+
+ """
+ return package_source_data
+
+ def prepare_origin_visit(self, *args, **kwargs):
+ """Prepare package visit.
+
+ Args:
+ **kwargs: Arbitrary keyword arguments passed by the lister.
+
+ """
+ # reset statuses
+ self._load_status = 'uneventful'
+ self._visit_status = 'full'
+ self.done = False
+ # fetch the package metadata from the registry
+ self.package_details = self.convert_to_standard_format(kwargs)
+ self.set_origin()
+ self.visit_date = None # loader core will populate it
+
+ def set_origin(self):
+ """Assign value to self.origin.
+
+ """
+ self.origin = {
+ 'url': self.package_details['origin_url'],
+ 'type': self.loader_name,
+ }
+
+ def prepare(self, *args, **kwargs):
+ """Prepare effective loading of source tarballs for a package manager
+ package.
+
+ Args:
+ **kwargs: Arbitrary keyword arguments passed by the lister.
+
+ """
+ self.package_contents = []
+ self.package_directories = []
+ self.package_revisions = []
+ self.package_source_data = []
+ self.package_temp_dir = os.path.join(self.temp_directory,
+ self.package_details['name'])
+
+ last_snapshot = self.last_snapshot()
+ self.known_versions = self.get_known_versions(last_snapshot)
+
+ self.new_versions = \
+ self.prepare_package_versions(self.package_details['tarballs'],
+ self.known_versions)
+
+ def last_snapshot(self):
+ """Retrieve the last snapshot of the package if any.
+
+ """
+ visit = self.storage.origin_visit_get_latest(
+ self.origin['url'], require_snapshot=True)
+ if visit:
+ return snapshot_get_all_branches(self.storage, visit['snapshot'])
+
+ def _prepare_package_version(self, package_source_data, tarball_request):
+ """Process the package release version.
+
+ The following operations are performed:
+
+ 1. Download the tarball
+ 2. Uncompress the tarball
+ 3. Delete unnecessary files (optional)
+ 4. Parse the file associated to the package version to extract
+ metadata (optional)
+
+ Args:
+ package_source_data (dict): containing information about the
+ focused package version.
+ tarball_request (Response): server response for the tarball
+ url.
+
+ Returns:
+ Tuple[dict, str]: tuples containing the following
+ members:
+
+ * a dict holding package tarball information and metadata
+ * a string holding the path of the uncompressed package to
+ load into the archive
+
+ """
+ url = package_source_data['url']
+ tarball_path, hashes = self.download_generate_hash(tarball_request,
+ url)
+ uncompressed_path = os.path.join(self.package_temp_dir, 'uncompressed',
+ os.path.basename(url)) # SEE ME
+ package_source_data['nature'] = self.uncompress_tarball(
+ tarball_path, uncompressed_path)
+
+ # remove tarball
+ os.remove(tarball_path)
+
+ if self.tarball_invalid:
+ return None, None
+
+ package_path = self.cleanup_artifact(uncompressed_path)
+ package_source_data = self.extract_metadata(package_path,
+ package_source_data)
+ self.package_source_data.append(package_source_data)
+ return package_source_data, package_path
+
+ def download_generate_hash(self, response, url):
+ """Store file in temp directory and computes hash of its filepath.
+
+ Args:
+ response (Response): Server response of the url
+ url (str): Url of the tarball
+
+ Returns:
+ Tuple of local (filepath, hashes of filepath)
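+
+ Example of the returned hashes (illustrative values; the
+ algorithm set follows the MultiHash defaults):
+
+ {'length': 1024,
+ 'sha1': '...', 'sha1_git': '...',
+ 'sha256': '...', 'blake2s256': '...'}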
+
+ """
+ length = int(response.headers['content-length'])
+ os.makedirs(self.package_temp_dir, exist_ok=True)
+ # SEE ME
+ filepath = os.path.join(self.package_temp_dir, os.path.basename(url))
+ h = self.write_file(filepath, length, response)
+ self.check_file(filepath, length)
+
+ hashes = {
+ 'length': length,
+ **h.hexdigest()
+ }
+ return filepath, hashes
+
+ def write_file(self, filepath, length, response):
+ """Convert the server response to a file.
+
+ """
+ h = MultiHash(length=length)
+ with open(filepath, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+ h.update(chunk)
+ f.write(chunk)
+ return h
+
+ def check_file(self, filepath, length):
+ """Check for the validity of the tarball downloaded.
+
+ """
+ actual_length = os.path.getsize(filepath)
+ if length != actual_length:
+ raise ValueError('Error when checking size: %s != %s' % (
+ length, actual_length))
+
+ def uncompress_tarball(self, filepath, path):
+ """Uncompress a tarball.
+
+ Args:
+ filepath (str): Path of tarball to uncompress
+ path (str): The destination folder where to uncompress the tarball
+ Returns:
+ The nature of the tarball, zip or tar.
+
+ """
+ # filepath is the downloaded tarball inside the temp directory
+ try:
+ self.tarball_invalid = False
+ return tarball.uncompress(filepath, path)
+ except Exception:
+ self.tarball_invalid = True
+ return None
+
+ def fetch_data(self):
+ """Called once per release artifact version (can be many for one
+ release).
+
+ Each call will:
+ - retrieve a release artifact (associated with a release version)
+ - compute the swh objects
+
+ Returns:
+ True as long as data to fetch exist
+
+ """
+ data = None
+ if self.done:
+ return False
+
+ try:
+ data = next(self.new_versions)
+ self._load_status = 'eventful'
+ except StopIteration:
+ self.done = True
+ return False
+
+ package_source_data, dir_path = data
+
+ # package release tarball was corrupted
+ if self.tarball_invalid:
+ return not self.done
+
+ dir_path = dir_path.encode('utf-8')
+ directory = Directory.from_disk(path=dir_path, data=True)
+ objects = directory.collect()
+
+ objects = self.check_objects(objects)
+
+ self.package_contents = objects['content'].values()
+ self.package_directories = objects['directory'].values()
+
+ revision = self.compute_revision(directory,
+ package_source_data)
+
+ revision['id'] = identifier_to_bytes(
+ revision_identifier(revision))
+ self.package_revisions.append(revision)
+
+ self.update_known_version(package_source_data, revision['id'])
+
+ self.log.debug('Removing unpacked package files at %s', dir_path)
+ shutil.rmtree(dir_path)
+
+ return not self.done
+
+ def update_known_version(self, package_source_data, revision_id):
+ """Update the `known_versions` variable with new discovered versions
+
+ Args:
+ package_source_data (dict): Metadata available for a particular
+ package version
+ revision_id (str): Revision id of the revision for the focused
+ package version
+
+ """
+ key = self.get_key()
+ package_key = package_source_data[key]
+ self.known_versions[package_key] = revision_id # SEE ME
+
+ def check_objects(self, objects):
+ """The the object for necessary fields and initialise it if not
+ already present.
+
+ """
+ if 'content' not in objects:
+ objects['content'] = {}
+ if 'directory' not in objects:
+ objects['directory'] = {}
+ return objects
+
+ def store_data(self):
+ """Store fetched data in the database.
+
+ """
+ self.maybe_load_contents(self.package_contents)
+ self.maybe_load_directories(self.package_directories)
+ self.maybe_load_revisions(self.package_revisions)
+
+ if self.done:
+ self.generate_and_load_snapshot()
+ self.flush()
+
+ def generate_and_load_snapshot(self):
+ """Generate and load snapshot for the package visit.
+
+ """
+ snapshot = {
+ 'branches': self.generate_branches(),
+ }
+
+ snapshot['id'] = identifier_to_bytes(snapshot_identifier(snapshot))
+ self.maybe_load_snapshot(snapshot)
+
+ def generate_branches(self):
+ """Generate branches for the focused package
+
+ """
+ branches = {}
+ key = self.get_key()
+ for version in self.package_source_data:
+ branch_name = self.branch_name(version)
+ target = self.target_from_version(version[key])
+ branches[branch_name] = target
+ branches = self.find_head(branches, branch_name)
+ # How to find HEAD and branch name?
+
+ if not target:
+ self._visit_status = 'partial'
+
+ return branches
+
+ def find_head(self, branches, branch_name):
+ """Make branch head.
+
+ Checks if the current version is the latest version. Make it as head
+ if it is the latest version.
+
+ Args:
+ branches (dict): Branches for the focused package.
+ branch_name (str): Branch name
+
+ Returns:
+ dict: Branches for the focused package
+
+ """
+ if True: # I don't know what to do here
+ # we need some condition here to check whether the version is
+ # the latest one. I don't know how to check that, because not
+ # all package managers provide a version-like field
+ branches[b'HEAD'] = {
+ 'target_type': 'alias',
+ 'target': branch_name,
+ }
+ return branches
+
+ def branch_name(self, version):
+ """Find branch name.
+
+ Args:
+ version (dict): Information related to a particular package version
+
+ Returns:
+ Branch name encoded in ascii
+
+ """
+ # How to tackle this
+ pass
+
+ def target_from_version(self, key_value):
+ target = self.known_versions.get(key_value)
+ return {
+ 'target': target,
+ 'target_type': 'revision',
+ } if target else None
+
+ def pre_cleanup(self):
+ """To prevent disk explosion if some other workers exploded
+ in mid-air (OOM killed), we try and clean up dangling files.
+
+ """
+ if self.debug:
+ self.log.warning('%s Will not pre-clean up temp dir %s' % (
+ DEBUG_MODE, self.temp_directory
+ ))
+ return
+ clean_dangling_folders(self.config['temp_directory'],
+ pattern_check=self.TEMPORARY_DIR_PREFIX_PATTERN,
+ log=self.log)
+
+ def cleanup(self):
+ """Clean up temporary disk use after downloading and extracting
+ package tarballs.
+
+ """
+ if self.debug:
+ self.log.warning('%s Will not clean up temp dir %s' % (
+ DEBUG_MODE, self.temp_directory
+ ))
+ return
+ if os.path.exists(self.temp_directory):
+ self.log.debug('Clean up %s' % self.temp_directory)
+ shutil.rmtree(self.temp_directory)
diff --git a/swh/loader/package/revision.py b/swh/loader/package/revision.py
new file mode 100644
--- /dev/null
+++ b/swh/loader/package/revision.py
@@ -0,0 +1,97 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from datetime import datetime
+from swh.model.identifiers import normalize_timestamp
+from dateutil import parser as date_parser
+
+
+class ConstructRevision:
+ """Construct revision from the metadata of the package version
+
+ Construct the revision for a package version using the metadata provided.
+ There are several hookpoints that can be overridden as per the need of
+ package manager.
+
+ """
+
+ SWH_PERSON = {
+ 'name': b'Software Heritage',
+ 'fullname': b'Software Heritage',
+ 'email': b'robot@softwareheritage.org'
+ }
+ REVISION_MESSAGE = b'swh-loader-base: synthetic revision message'
+
+ def modify_revision(self, revision):
+ """Make modification on revision created
+ If the revision of a package manager needs to be in a specific format,
+ this method can be overridden to perform that operation insted of
+ overriding the `compute_revision()`.
+
+ Args:
+ revision (dict): Created revision
+
+ Returns:
+ dict: Modified revision
+ """
+ return revision
+
+ def compute_revision(self, directory, package_source_data):
+ """Compute a revision.
+
+ Args:
+ directory (Directory): `from_disk.Directory` object for the
+ uncompressed tarball
+ package_source_data (dict): Information about the package
+ release version
+
+ Returns:
+ dict: Revision
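+
+ Example of the computed shape (illustrative values):
+
+ {
+ 'type': 'tar',
+ 'synthetic': True,
+ 'message': b'swh-loader-base: synthetic revision message',
+ 'author': {'name': b'Software Heritage', ...},
+ 'parents': [],
+ ...
+ }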
+
+ """
+ revision = {
+ 'metadata': self.find_metadata(package_source_data),
+ 'date': self.find_date(package_source_data),
+ 'committer_date': self.find_date(package_source_data),
+ 'author': self.find_author(package_source_data),
+ 'committer': self.find_author(package_source_data),
+ 'type': self.find_type(package_source_data),
+ 'message': self.find_message(package_source_data),
+ 'directory': self.directory(directory),
+ 'synthetic': self.find_synthetic(),
+ 'parents': [],
+ }
+
+ return self.modify_revision(revision)
+
+ def find_synthetic(self):
+ return True
+
+ def find_type(self, package_source_data):
+ return package_source_data['nature']
+
+ def find_message(self, package_source_data):
+ return self.REVISION_MESSAGE
+
+ def directory(self, directory):
+ return directory.hash
+
+ def find_author(self, package_source_data):
+ if 'author' in package_source_data:
+ return package_source_data['author']
+ return self.SWH_PERSON
+
+ def find_metadata(self, package_source_data):
+ return {
+ 'package': package_source_data
+ }
+
+ def find_date(self, package_source_data):
+ try:
+ # used when a `date` key is present in package_source_data
+ date = date_parser.parse(package_source_data['date'])
+ return normalize_timestamp(int(date.timestamp()))
+ except Exception:
+ now = datetime.now()
+ return normalize_timestamp(int(datetime.timestamp(now)))
