Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/download.py
- This file was added.
# Copyright (C) 2019 The Software Heritage developers | |||||
# See the AUTHORS file at the top-level directory of this distribution | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
import requests | |||||
try: | |||||
from _version import __version__ | |||||
except ImportError: | |||||
__version__ = 'devel' | |||||
# This file contains helpers to detect package versions that were previously
# archived and to filter them out before loading.
class compareField:
    """Use a field present in the metadata to check whether a package
    version was previously archived.

    This class is to be used to identify and avoid the reprocessing of
    previously archived package versions, using a field provided by the
    API as the metadata of the package version.

    The following operations are performed:

    - Retrieve known versions and store them in a dict keyed by the value
      of the field named in ``compare_field``.
    - Check if the tarballs are present in the known versions.
    - Instantiate a generator to process a specific package released
      version.

    NOTE(review): the methods below use ``self.storage``, ``self.log``,
    ``self._request`` and ``self._prepare_package_version``, none of which
    is defined in this class; presumably they are supplied by the loader
    class this one is combined with -- confirm against the callers.

    """
    compare_field = None
    """Field used to identify if the package version is previously archived"""
    # e.g. for the pypi loader compare_field = 'sha'

    def __init__(self):
        # requests.session() is only a deprecated alias kept for backward
        # compatibility; instantiate the Session class directly.
        self.session = requests.Session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage Loader (%s)' % (
                    __version__
                )
            }
        }

    def get_key(self):
        """Returns the key to be used to identify known revisions.

        """
        return self.compare_field

    def get_known_versions(self, last_snapshot):
        """
        Retrieve the known release versions for the package
        (i.e. those already ingested into the archive).

        Revisions that cannot be resolved by the storage, or whose artifact
        metadata is missing or does not hold ``compare_field``, are skipped:
        they cannot be matched against incoming tarballs anyway.

        Args:
            last_snapshot (dict): Last snapshot for the visit.

        Returns:
            dict: Dict whose key is the value of field chosen for
            checking archived artifacts and values are revision ids.

        """
        if not last_snapshot or 'branches' not in last_snapshot:
            return {}

        # retrieve only revisions (e.g. the alias we do not want here);
        # a branch entry may itself be None, hence the truthiness check
        revs = [rev['target']
                for rev in last_snapshot['branches'].values()
                if rev and rev['target_type'] == 'revision']
        known_revisions = self.storage.revision_get(revs)

        ret = {}
        for revision in known_revisions:
            if not revision:  # revision_get can return None
                continue
            artifact = self.artifact_from_revision(revision)
            # artifact_from_revision returns None when the revision holds
            # no package metadata; such a revision has no comparison key
            if not artifact or self.compare_field not in artifact:
                continue
            ret[artifact[self.compare_field]] = revision['id']
        return ret

    def artifact_from_revision(self, revision):
        """Find artifacts from the revision.

        Can be overridden if the standard revision pattern changes.

        Args:
            revision (dict): revision holding a 'metadata' entry.

        Returns:
            The value stored under ``revision['metadata']['package']``, or
            None when the metadata is empty or has no 'package' entry.

        """
        # 'metadata' may be None, in which case there is nothing to look up
        metadata = revision['metadata'] or {}
        return metadata.get('package')

    def filter_package_versions(self, tarballs, known_versions):
        """
        Return the available tarballs that are not previously archived.

        Args:
            tarballs (list): a list of dicts containing information about the
                respective tarball that is provided by lister.
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.

        Returns:
            A list of dicts containing information about the respective
            tarballs that are not previously archived.

        Raises:
            KeyError: if a tarball dict has no ``compare_field`` entry.

        """
        return [release for release in tarballs
                if release[self.compare_field] not in known_versions]

    def prepare_package_versions(self, tarballs, known_versions=None):
        """
        Instantiate a generator that will process a specific package release
        version at each iteration step.

        Tarballs already present in ``known_versions`` are filtered out,
        each remaining tarball url is requested, and those answering with a
        404 are skipped. Every other response is handed, together with the
        tarball information, to ``self._prepare_package_version``, which is
        expected (not visible here -- confirm) to perform the following
        operations:

            1. Create a temporary directory to download and extract the
               release tarball.
            2. Download the tarball.
            3. Uncompress the tarball.
            4. Parse the file associated to the package version to extract
               metadata (optional).
            5. Delete unnecessary files (optional).

        Args:
            tarballs (list): a list of dicts containing information about the
                respective tarball that is provided by lister.
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.
                Defaults to no known version.

        Yields:
            Tuple[dict, str]: tuples containing the following
            members:

                * a dict holding package tarball information and metadata
                * a string holding the path of the uncompressed package to
                  load into the archive

        """
        # None instead of a `{}` default: a mutable default would be shared
        # between every call of the method
        if known_versions is None:
            known_versions = {}

        new_versions = self.filter_package_versions(tarballs, known_versions)
        for package_source_data in new_versions:
            # filter out version with missing tarball,
            # package visit will be marked as partial at the end of
            # the loading process
            tarball_url = package_source_data['url']
            tarball_request = self._request(tarball_url,
                                            throw_error=False)
            if tarball_request.status_code == 404:
                self.log.debug('Tarball url %s returns a 404 error.',
                               tarball_url)
                continue

            yield self._prepare_package_version(package_source_data,
                                                tarball_request)