Changeset View
Standalone View
swh/loader/base/dowload.py
- This file was added.
# Copyright (C) 2019  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

Review thread — anlambert: I am not a big fan of using inheritance in derived package loader implementations (CRAN, GNU, ...).
Reply: Yes, thank you. I did not find the right way to make that explicit. We must try to keep separate what is logically separate (artifact retrieval, ingestion, etc.).
ardumont: Yes, that's what I call a collaborator.
import requests | |||||
import time | |||||
from .abstractattribute import AbstractAttribute | |||||
try: | |||||
from _version import __version__ | |||||
except ImportError: | |||||
__version__ = 'devel' | |||||
# This file contains helper classes to detect and skip package versions
# that were already archived on a previous visit.
class If_Modified_Since:
    """Detect already archived package versions via HTTP conditional requests.

    Uses the ``If-Modified-Since`` request header to check whether a package
    tarball changed since the previous visit.  This class is to be used to
    identify and avoid the reprocessing of previously archived package
    versions when the metadata provides no reliable field that can do the
    job.

    The following operations are performed:

    - Retrieve known versions and store them in a dict with the tarball
      ``url`` as key (can be changed by overriding :meth:`get_key`)
    - Check if the tarballs are present in known versions:

      * if a match is found, send a request with the ``If-Modified-Since``
        header to confirm the match (a 304 answer means "unchanged")
      * if no match is found, send a plain request

    - Store the response and the request time for further processing
    - Instantiate a generator to process a specific package released version
    """

    def __init__(self):
        self.session = requests.session()
        # Maps an artifact key (see get_key) to the HTTP-date of the last
        # visit; used as the If-Modified-Since header value.
        self.time_last_visit = {}
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage Loader (%s)' % (
                    __version__
                )
            }
        }

    def get_artifact(self, revision):
        """Fetch the artifact metadata stored in a previous revision.

        Args:
            revision (dict): Previous revision

        Returns:
            dict: metadata present in the revision
        """
        return revision['metadata']['package']

    def get_key(self):
        """Return the artifact field used to identify known revisions."""
        return 'url'

    def known_versions(self, last_snapshot):
        """Retrieve the release versions already ingested into the archive.

        Args:
            last_snapshot (dict): Last snapshot for the visit

        Returns:
            dict: Dict whose keys are artifact keys (urls by default) and
            whose values are revision ids.
        """
        if not last_snapshot or 'branches' not in last_snapshot:
            return {}

        # Retrieve only revisions (e.g. the aliases we do not want here)
        revs = [rev['target']
                for rev in last_snapshot['branches'].values()
                if rev and rev['target_type'] == 'revision']
        known_revisions = self.storage.revision_get(revs)
        ret = {}
        key = self.get_key()
        for revision in known_revisions:
            if not revision:  # revision_get can return None
                continue
            artifact = self.get_artifact(revision)
            ret[artifact[key]] = revision['id']
            self.time_last_visit[artifact[key]] = artifact['time_last_visit']
        return ret

    def filter_package_versions(self, tarballs, known_versions):
        """Return the available tarballs that are not previously archived.

        Args:
            tarballs (list): a list of dicts containing information about
                the respective tarball that is provided by the lister.
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.
                May be None, in which case nothing is filtered out.

        Returns:
            A list of dicts containing information about the respective
            tarballs that are not previously archived.
        """
        known_versions = known_versions or {}  # tolerate a None default
        versions = []
        key = self.get_key()
        for release in tarballs:
            tarball_url = release['url']
            if release[key] in known_versions:
                # Known artifact: send a conditional request so the server
                # can answer 304 if nothing changed since the last visit.
                # NOTE: index by the release's key value, not by the key
                # name itself (known_versions stores entries that way).
                tarball_request = self._request(
                    tarball_url,
                    time_last_visit=self.time_last_visit[release[key]],
                    throw_error=False)
            else:
                tarball_request = self._request(
                    tarball_url, time_last_visit=None, throw_error=False)

            if tarball_request.status_code == 304:
                # Not modified since the last visit: nothing new to ingest
                continue
            elif tarball_request.status_code == 404:
                self.log.debug('Tarball url %s returns a 404 error.',
                               tarball_url)
                continue
            release['response'] = tarball_request
            # Record the visit time in HTTP-date format so a later visit
            # can send it back as the If-Modified-Since value.
            timestamp = time.time()
            timestamp = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                                      time.gmtime(timestamp))
            release['time_last_visit'] = timestamp
            versions.append(release)
        return versions

    def _request(self, url, time_last_visit=None, throw_error=True):
        """Request the remote tarball url.

        Args:
            url (str): Url (file or http*)
            time_last_visit (str): HTTP-date of the previous visit, sent as
                the If-Modified-Since header when provided
            throw_error (bool): when True, raise on non-200 responses;
                when False, return the response for the caller to inspect

        Raises:
            ValueError in case of failing to query (only if throw_error)

        Returns:
            server response
        """
        if time_last_visit:
            self.params['headers']['If-Modified-Since'] = time_last_visit
        else:
            # Drop any header left over from a previous call so unknown
            # versions are fetched unconditionally.
            self.params['headers'].pop('If-Modified-Since', None)
        response = self.session.get(url, **self.params, stream=True)
        # Honor throw_error: filter_package_versions needs to inspect
        # 304/404 statuses itself instead of getting a ValueError.
        if response.status_code != 200 and throw_error:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))
        return response

    def prepare_package_versions(self, tarballs, known_versions=None):
        """Instantiate a generator that will process a specific package
        released version at each iteration step.

        The following operations will be performed:

        1. Create a temporary directory to download and extract the
           release tarball
        2. Download the tarball
        3. Uncompress the tarball
        4. Parse the file associated to the package version to extract
           metadata (optional)
        5. Delete unnecessary files (optional)

        Args:
            tarballs (list): a list of dicts containing information about
                the respective tarball that is provided by the lister.
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.

        Yields:
            Tuple[dict, str]: tuples containing the following members:

            * a dict holding package tarball information and metadata
            * a string holding the path of the uncompressed package to
              load into the archive
        """
        new_versions = self.filter_package_versions(tarballs, known_versions)
        for package_source_data in new_versions:
            tarball_request = package_source_data['response']
            # Drop the response object to keep revision metadata simple
            del package_source_data['response']
            yield self._prepare_package_version(package_source_data,
                                                tarball_request)
class compare_field:
    """Use a metadata field to detect previously archived package versions.

    This class is to be used to identify and avoid the reprocessing of
    previously archived package versions, using a field provided by the
    API in the metadata of each package version.

    The following operations are performed:

    - Retrieve known versions and store them in a dict keyed by the field
      named in ``compare_field``
    - Filter out the tarballs already present in the known versions
    - Instantiate a generator to process a specific package released version
    """

    # Field used to identify whether a package version was previously
    # archived; subclasses must override it.
    # E.g. for the PyPI loader: compare_field = 'sha'
    compare_field = None

    def __init__(self):
        self.session = requests.session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage Loader (%s)' % (
                    __version__
                )
            }
        }

    def get_key(self):
        """Return the field used to identify known revisions."""
        return self.compare_field

    def _request(self, url, throw_error=True):
        """Request the remote tarball url.

        Args:
            url (str): Url (file or http*)
            throw_error (bool): when True, raise on non-200 responses;
                when False, return the response for the caller to inspect

        Raises:
            ValueError in case of failing to query (only if throw_error)

        Returns:
            server response
        """
        response = self.session.get(url, **self.params, stream=True)
        if response.status_code != 200 and throw_error:
            raise ValueError("Fail to query '%s'. Reason: %s" % (
                url, response.status_code))
        return response

    def known_versions(self, last_snapshot):
        """Retrieve the release versions already ingested into the archive.

        Args:
            last_snapshot (dict): Last snapshot for the visit

        Returns:
            dict: Dict whose keys are the values of the field chosen for
            checking archived artifacts and whose values are revision ids.
        """
        if not last_snapshot or 'branches' not in last_snapshot:
            return {}

        # Retrieve only revisions (e.g. the aliases we do not want here)
        revs = [rev['target']
                for rev in last_snapshot['branches'].values()
                if rev and rev['target_type'] == 'revision']
        known_revisions = self.storage.revision_get(revs)
        ret = {}
        for revision in known_revisions:
            if not revision:  # revision_get can return None
                continue
            artifact = self.artifact_from_revision(revision)
            ret[artifact[self.compare_field]] = revision['id']
        return ret

    def artifact_from_revision(self, revision):
        """Find the artifact metadata stored in a revision.

        Can be overridden if the standard revision pattern changes.
        """
        if 'package' in revision['metadata']:
            return revision['metadata']['package']

    def filter_package_versions(self, tarballs, known_versions):
        """Return the available tarballs that are not previously archived.

        Args:
            tarballs (list): a list of dicts containing information about
                the respective tarball that is provided by the lister.
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.
                May be None, in which case nothing is filtered out.

        Returns:
            A list of dicts containing information about the respective
            tarballs that are not previously archived.
        """
        known_versions = known_versions or {}  # tolerate a None default
        versions = []
        for release in tarballs:
            if release[self.compare_field] in known_versions:
                continue
            versions.append(release)
        return versions

    def prepare_package_versions(self, tarballs, known_versions=None):
        """Instantiate a generator that will process a specific package
        released version at each iteration step.

        The following operations will be performed:

        1. Create a temporary directory to download and extract the
           release tarball
        2. Download the tarball
        3. Uncompress the tarball
        4. Parse the file associated to the package version to extract
           metadata (optional)
        5. Delete unnecessary files (optional)

        Args:
            tarballs (list): a list of dicts containing information about
                the respective tarball that is provided by the lister.
            known_versions (dict): may be provided by the loader, it enables
                to filter out versions already ingested in the archive.

        Yields:
            Tuple[dict, str]: tuples containing the following members:

            * a dict holding package tarball information and metadata
            * a string holding the path of the uncompressed package to
              load into the archive
        """
        new_versions = self.filter_package_versions(tarballs, known_versions)
        for package_source_data in new_versions:
            # Filter out versions with a missing tarball; the package
            # visit will be marked as partial at the end of the loading
            # process.
            tarball_url = package_source_data['url']
            tarball_request = self._request(tarball_url,
                                            throw_error=False)
            if tarball_request.status_code == 404:
                self.log.debug('Tarball url %s returns a 404 error.',
                               tarball_url)
                continue
            yield self._prepare_package_version(package_source_data,
                                                tarball_request)
I am not a big fan of using inheritance in derived package loader implementations (CRAN, GNU, ...) to declare the package-tarball retrieval behavior.
I would rather use a single class, named PackageDownloader for instance, that the PackageLoader class would use internally through composition instead.
Derived classes implementing real package loaders (GNU, CRAN, ...) should only inherit from the PackageLoader class and reimplement some hook methods to override the default behavior.
I need to further analyze what is implemented in that file before suggesting improvements.