Changeset View
Changeset View
Standalone View
Standalone View
swh/loader/package/dowload.py
- This file was added.
# Copyright (C) 2019 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import requests
import time
# Resolve the loader version for the User-Agent header; fall back to
# 'devel' when the generated _version module is unavailable (e.g. when
# running from a plain source checkout).
try:
    from _version import __version__
except ImportError:
    __version__ = 'devel'
# This file contains classes used to detect and skip package versions
# that were already archived.
class IfModifiedSince:
    """Use the ``If-Modified-Since`` header to check whether a package was
    previously archived.

    This mixin is used to identify and avoid the reprocessing of previously
    archived package versions when the metadata provides no reliable field
    that can serve that purpose.  It relies on the HTTP
    ``If-Modified-Since`` header to find out whether a file changed since
    the last visit.

    The following operations are performed:

    - Retrieve known versions and store them in a dict keyed by the
      tarball `url` (can be changed by overriding the `get_key` method)
    - Check if the tarballs are present in the known versions:

      * If a match is found, send a request with the `If-Modified-Since`
        header to confirm the match
      * If no match is found, send a plain request

    - Store the response and the visit time for further processing
    - Instantiate a generator to process a specific released package
      version

    NOTE(review): classes mixing this in are expected to provide
    ``self.storage``, ``self.log`` and ``self._prepare_package_version``.
    """
    def __init__(self):
        self.session = requests.session()
        # Maps get_key() values (tarball urls by default) to the timestamp
        # of the last visit, as recovered from the stored revisions.
        self.time_last_visit = {}
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage Loader (%s)' % (
                    __version__
                )
            }
        }

    def get_artifact(self, revision):
        """Fetch the artifact metadata stored in a revision.

        Args:
            revision (dict): Previous revision

        Returns:
            dict: metadata present in the revision
        """
        return revision['metadata']['package']

    def get_key(self):
        """Return the metadata key used to identify known revisions."""
        return 'url'

    def get_known_versions(self, last_snapshot):
        """Retrieve the known release versions for the package
        (i.e. those already ingested into the archive).

        Also populates ``self.time_last_visit`` with the last visit time of
        each known artifact, for use in later conditional requests.

        Args:
            last_snapshot (dict): Last snapshot for the visit

        Returns:
            dict: Dict whose keys are urls and whose values are revision
            ids.
        """
        if not last_snapshot or 'branches' not in last_snapshot:
            return {}
        # Retrieve only revisions (e.g. we do not want the aliases here)
        revs = [rev['target']
                for rev in last_snapshot['branches'].values()
                if rev and rev['target_type'] == 'revision']
        known_revisions = self.storage.revision_get(revs)
        ret = {}
        key = self.get_key()
        for revision in known_revisions:
            if not revision:  # revision_get can return None entries
                continue
            artifact = self.get_artifact(revision)
            ret[artifact[key]] = revision['id']
            # Remember when this artifact was last seen so the next visit
            # can send an If-Modified-Since request for it.
            self.time_last_visit[artifact[key]] = artifact['time_last_visit']
        return ret

    def filter_package_versions(self, tarballs, known_versions):
        """Find the available tarballs that are not previously archived.

        Args:
            tarballs (list): a list of dicts containing information about
                the respective tarball that is provided by the lister.
            known_versions (dict): may be provided by the loader, it is
                used to filter out versions already ingested in the
                archive.  ``None`` is treated as an empty dict.

        Returns:
            A list of dicts containing information about the respective
            tarballs that are not previously archived.
        """
        if known_versions is None:  # robustness: callers may pass None
            known_versions = {}
        versions = []
        key = self.get_key()
        for release in tarballs:
            tarball_url = release['url']
            if release[key] in known_versions:
                # Known artifact: send a conditional request so the server
                # can answer 304 Not Modified if nothing changed since the
                # last visit.
                tarball_request = self._request(
                    tarball_url,
                    time_last_visit=self.time_last_visit[release[key]],
                    throw_error=False)
            else:
                tarball_request = self._request(
                    tarball_url, time_last_visit=None, throw_error=False)
            if tarball_request.status_code == 304:
                # Unchanged since last visit: nothing new to archive
                continue
            elif tarball_request.status_code != 200:
                self.log.debug("Fail to query '%s'. Reason: %s" % (
                    tarball_url, tarball_request.status_code))
                continue
            new_release = self.update_release_info(release, tarball_request)
            versions.append(new_release)
        return versions

    def update_release_info(self, release, tarball_request):
        """Update the package version metadata with ``time_last_visit`` and
        the server response.

        Args:
            release (dict): Metadata of the focused package version
            tarball_request (Request): Server response for the tarball url

        Returns:
            dict: release with updated information
        """
        release['response'] = tarball_request
        # RFC 7231 HTTP-date format, suitable for reuse in a later
        # If-Modified-Since header.
        release['time_last_visit'] = time.strftime(
            '%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
        return release

    def _request(self, url, time_last_visit=None, throw_error=True):
        """Request the remote tarball url.

        Args:
            url (str): Url (file or http*)
            time_last_visit (str): HTTP-date to send as the
                ``If-Modified-Since`` header, or None for an unconditional
                request
            throw_error (bool): when True, raise on non-200 responses

        Raises:
            ValueError in case of failing to query

        Returns:
            server response
        """
        # Build per-request headers instead of mutating self.params: the
        # original code set If-Modified-Since on the shared dict, leaking a
        # stale header into every subsequent unconditional request.
        headers = dict(self.params['headers'])
        if time_last_visit:
            headers['If-Modified-Since'] = time_last_visit
        params = dict(self.params, headers=headers)
        response = self.session.get(url, **params, stream=True)
        if response.status_code != 200 and throw_error:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))
        return response

    def prepare_package_versions(self, tarballs, known_versions=None):
        """Instantiate a generator that will process a specific package
        release version at each iteration step.  The following operations
        will be performed:

        1. Create a temporary directory to download and extract the
           release tarball
        2. Download the tarball
        3. Uncompress the tarball
        4. Parse the file associated to the package version to extract
           metadata (optional)
        5. Delete unnecessary files (optional)

        Args:
            tarballs (list): a list of dicts containing information about
                the respective tarball that is provided by the lister.
            known_versions (dict): may be provided by the loader, it
                enables to filter out versions already ingested in the
                archive.

        Yields:
            Tuple[dict, str]: tuples containing the following members:

            * a dict holding package tarball information and metadata
            * a string holding the path of the uncompressed package to
              load into the archive
        """
        new_versions = self.filter_package_versions(tarballs, known_versions)
        for package_source_data in new_versions:
            tarball_request = package_source_data['response']
            # Drop the raw response to keep revision metadata simple
            del package_source_data['response']
            yield self._prepare_package_version(package_source_data,
                                                tarball_request)
class compareField:
    """Use a metadata field to check whether a package was previously
    archived.

    This mixin is used to identify and avoid the reprocessing of
    previously archived package versions, using a field provided by the
    API as part of the package version metadata.

    The following operations are performed:

    - Retrieve known versions and store them in a dict keyed by the field
      named in `compare_field`
    - Check if the tarballs are present in the known versions
    - Instantiate a generator to process a specific released package
      version

    NOTE(review): classes mixing this in are expected to provide
    ``self.storage``, ``self.log`` and ``self._prepare_package_version``.
    """
    # Field used to identify if the package version was previously
    # archived, e.g. for the PyPI loader: compare_field = 'sha'
    compare_field = None

    def __init__(self):
        self.session = requests.session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage Loader (%s)' % (
                    __version__
                )
            }
        }

    def get_key(self):
        """Return the metadata key used to identify known revisions."""
        return self.compare_field

    def _request(self, url, throw_error=True):
        """Request the remote tarball url.

        Args:
            url (str): Url (file or http*).
            throw_error (bool): when True, raise on non-200 responses.

        Raises:
            ValueError in case of failing to query.

        Returns:
            server response
        """
        response = self.session.get(url, **self.params, stream=True)
        if response.status_code != 200 and throw_error:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))
        return response

    def get_known_versions(self, last_snapshot):
        """Retrieve the known release versions for the package
        (i.e. those already ingested into the archive).

        Args:
            last_snapshot (dict): Last snapshot for the visit.

        Returns:
            dict: Dict whose keys are the values of the field chosen for
            checking archived artifacts and whose values are revision ids.
        """
        if not last_snapshot or 'branches' not in last_snapshot:
            return {}
        # Retrieve only revisions (e.g. we do not want the aliases here)
        revs = [rev['target']
                for rev in last_snapshot['branches'].values()
                if rev and rev['target_type'] == 'revision']
        known_revisions = self.storage.revision_get(revs)
        ret = {}
        # Honor get_key() overrides, consistently with IfModifiedSince
        key = self.get_key()
        for revision in known_revisions:
            if not revision:  # revision_get can return None entries
                continue
            artifact = self.artifact_from_revision(revision)
            if not artifact:  # revision carries no package metadata
                continue
            ret[artifact[key]] = revision['id']
        return ret

    def artifact_from_revision(self, revision):
        """Find the artifact metadata in a revision.

        Can be overridden if the standard revision pattern changes.

        Args:
            revision (dict): Previous revision.

        Returns:
            dict: the package metadata, or None when the revision carries
            no package metadata.
        """
        metadata = revision['metadata']
        if 'package' in metadata:
            return metadata['package']
        return None

    def filter_package_versions(self, tarballs, known_versions):
        """Return the available tarballs that are not previously archived.

        Args:
            tarballs (list): a list of dicts containing information about
                the respective tarball that is provided by the lister.
            known_versions (dict): may be provided by the loader, it
                enables to filter out versions already ingested in the
                archive.  ``None`` is treated as an empty dict.

        Returns:
            A list of dicts containing information about the respective
            tarballs that are not previously archived.
        """
        if known_versions is None:  # robustness: callers may pass None
            known_versions = {}
        key = self.get_key()
        return [release for release in tarballs
                if release[key] not in known_versions]

    def prepare_package_versions(self, tarballs, known_versions=None):
        """Instantiate a generator that will process a specific package
        release version at each iteration step.  The following operations
        will be performed:

        1. Create a temporary directory to download and extract the
           release tarball.
        2. Download the tarball.
        3. Uncompress the tarball.
        4. Parse the file associated to the package version to extract
           metadata (optional).
        5. Delete unnecessary files (optional).

        Args:
            tarballs (list): a list of dicts containing information about
                the respective tarball that is provided by the lister.
            known_versions (dict): may be provided by the loader, it
                enables to filter out versions already ingested in the
                archive.

        Yields:
            Tuple[dict, str]: tuples containing the following members:

            * a dict holding package tarball information and metadata
            * a string holding the path of the uncompressed package to
              load into the archive
        """
        # The default was a shared mutable dict ({}); use the None
        # sentinel to avoid the mutable-default-argument pitfall.
        if known_versions is None:
            known_versions = {}
        new_versions = self.filter_package_versions(tarballs, known_versions)
        for package_source_data in new_versions:
            # Filter out versions with a missing tarball; the package
            # visit will be marked as partial at the end of the loading
            # process.
            tarball_url = package_source_data['url']
            tarball_request = self._request(tarball_url,
                                            throw_error=False)
            if tarball_request.status_code == 404:
                self.log.debug('Tarball url %s returns a 404 error.',
                               tarball_url)
                continue
            yield self._prepare_package_version(package_source_data,
                                                tarball_request)