# Copyright (C) 2019  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

"""Python client for the Software Heritage Web API

Light wrapper around requests for the archive API, taking care of data
conversions and pagination.

>>> from webclient import WebAPIClient
>>> cli = WebAPIClient()
>>> cli.get('swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6')
{'id': PersistentId(namespace='swh', scheme_version=1, object_type='revision',
                    object_id='aafb16d69fd30ff58afdd69036a26047f3aebdc6',
                    metadata={}),
 'author': {
     'fullname': 'Nicolas Dandrimont <nicolas.dandrimont@crans.org>',
     'name': 'Nicolas Dandrimont',
     'email': 'nicolas.dandrimont@crans.org'
 },
 'date': datetime.datetime(2014, 8, 18, 18, 18, 25,
                           tzinfo=tzoffset(None, 7200)),
 'committer': {
     'fullname': 'Nicolas Dandrimont <nicolas.dandrimont@crans.org>',
     'name': 'Nicolas Dandrimont',
     'email': 'nicolas.dandrimont@crans.org'
 },
 'committer_date': datetime.datetime(2014, 8, 18, 18, 18, 25,
                                     tzinfo=tzoffset(None, 7200)),
 'type': 'git',
 'directory': PersistentId(namespace='swh', scheme_version=1,
                           object_type='directory',
                           object_id='9f2e5898e00a66e6ac11033959d7e05b1593353b',
                           metadata={}),
 'message': "Merge branch 'master' into pr/584\\n",
 'metadata': {},
 'synthetic': False,
 'parents': [
     {'id': PersistentId(namespace='swh', scheme_version=1,
                         object_type='revision',
                         object_id='26307d261279861c2d9c9eca3bb38519f951bea4',
                         metadata={}),
      'url': '/api/1/revision/26307d261279861c2d9c9eca3bb38519f951bea4/'},
     {'id': PersistentId(namespace='swh', scheme_version=1,
                         object_type='revision',
                         object_id='37fc9e08d0c4b71807a4f1ecb06112e78d91c283',
                         metadata={}),
      'url': '/api/1/revision/37fc9e08d0c4b71807a4f1ecb06112e78d91c283/'}
 ],
 'merge': True,
 'url': '/api/1/revision/aafb16d69fd30ff58afdd69036a26047f3aebdc6/',
 'history_url': '/api/1/revision/aafb16d69fd30ff58afdd69036a26047f3aebdc6/log/',
 'directory_url': '/api/1/directory/9f2e5898e00a66e6ac11033959d7e05b1593353b/'
}

"""

from typing import Any, Dict, Generator, List, Union
from urllib.parse import urlparse

import dateutil.parser
import requests

from swh.model.identifiers import \
    SNAPSHOT, REVISION, RELEASE, DIRECTORY, CONTENT
from swh.model.identifiers import PersistentId as PID
from swh.model.identifiers import parse_persistent_identifier as parse_pid


# anything accepted where an identifier is expected: either an already parsed
# PersistentId or its textual form (e.g. 'swh:1:rev:...')
PIDish = Union[PID, str]


def _get_pid(pidish: PIDish) -> PID:
    """parse string to PID if needed

    Args:
        pidish: a PersistentId instance, or a string to be parsed into one

    Returns:
        pidish itself if it is not a string, the parsed identifier otherwise

    """
    if isinstance(pidish, str):
        return parse_pid(pidish)
    return pidish


def typify(json: Any, obj_type: str) -> Any:
    """type json data using pythonic types where appropriate

    the following conversions are performed:

    - identifiers are converted from strings to PersistentId instances
    - timestamps are converted from strings to datetime.datetime objects

    note that conversions happen in place: the passed data structure is
    modified and then returned

    Args:
        json: decoded JSON data, as returned by the Web API for an object of
            type obj_type
        obj_type: type of the object described by json, one of the object
            type constants of swh.model.identifiers

    Returns:
        the same json object, with values converted as described above

    Raises:
        ValueError: if obj_type is not a known object type, or if a directory
            entry has an unknown entry type

    """
    # directory entry type (as found in API output) -> object type
    entry_obj_types = {
        'file': CONTENT,
        'dir': DIRECTORY,
        'rev': REVISION,
    }

    def to_pid(object_type, s):
        return PID(object_type=object_type, object_id=s)

    def to_date(s):
        # tolerate a missing (null) timestamp, should the API return one,
        # instead of crashing inside the date parser
        if s is None:
            return None
        return dateutil.parser.parse(s)

    def obj_type_of_entry_type(s):
        try:
            return entry_obj_types[s]
        except KeyError:
            raise ValueError(f'invalid directory entry type: {s}') from None

    if obj_type == SNAPSHOT:
        json['id'] = to_pid(obj_type, json['id'])
        for target in json['branches'].values():
            # alias branches point to another branch by name rather than to
            # an archived object, so there is no identifier to build for them
            if target['target_type'] != 'alias':
                target['target'] = to_pid(target['target_type'],
                                          target['target'])
    elif obj_type == REVISION:
        json['id'] = to_pid(obj_type, json['id'])
        json['directory'] = to_pid(DIRECTORY, json['directory'])
        for key in ('date', 'committer_date'):
            json[key] = to_date(json[key])
        for parent in json['parents']:
            parent['id'] = to_pid(REVISION, parent['id'])
    elif obj_type == RELEASE:
        json['id'] = to_pid(obj_type, json['id'])
        json['date'] = to_date(json['date'])
        json['target'] = to_pid(json['target_type'], json['target'])
    elif obj_type == DIRECTORY:
        # the identifier built from the first entry's dir_id is reused for
        # all entries of the listing
        dir_pid = None
        for entry in json:
            if dir_pid is None:
                dir_pid = to_pid(obj_type, entry['dir_id'])
            entry['dir_id'] = dir_pid
            entry['target'] = to_pid(obj_type_of_entry_type(entry['type']),
                                     entry['target'])
    elif obj_type == CONTENT:
        pass  # nothing to do for contents
    else:
        raise ValueError(f'invalid object type: {obj_type}')

    return json


def jsonify(res: requests.Response, obj_type: str) -> Any:
    """interpret res body as JSON and return it as (typed) Python data

    Args:
        res: HTTP response whose body is JSON
        obj_type: type of the object described by the response, see typify()

    Returns:
        the decoded body, after conversion by typify()

    """
    return typify(res.json(), obj_type=obj_type)


class WebAPIClient:
    """client for the Software Heritage archive Web API, see

    https://archive.softwareheritage.org/api/

    """

    def __init__(
            self,
            api_url: str = 'https://archive.softwareheritage.org/api/1',
    ) -> None:
        """create a client for the Software Heritage Web API

        see: https://archive.softwareheritage.org/api/

        Args:
            api_url: base URL for API calls (default:
                "https://archive.softwareheritage.org/api/1")

        """
        api_url = api_url.rstrip('/')

        self.api_url = api_url
        # path component of the base URL (e.g., "/api/1"), used to turn the
        # absolute paths found in pagination links back into relative queries
        self.api_path = urlparse(api_url).path

    def _call(self, query: str, http_method: str = 'get',
              **req_args) -> requests.models.Response:
        """dispatcher for archive API invocation

        Args:
            query: API method to be invoked, rooted at api_url
            http_method: HTTP method to be invoked, one of: 'get', 'head'
            req_args: extra keyword arguments for requests.get()/.head()

        Returns:
            the HTTP response

        Raises:
            requests.HTTPError: if HTTP request fails and http_method is 'get'
            ValueError: if http_method is neither 'get' nor 'head'

        """
        url = '/'.join([self.api_url, query])

        if http_method == 'get':
            r = requests.get(url, **req_args)
            r.raise_for_status()
        elif http_method == 'head':
            # no raise_for_status() here: callers of 'head' inspect the
            # response status themselves
            r = requests.head(url, **req_args)
        else:
            raise ValueError(f'unsupported HTTP method: {http_method}')

        return r

    def get(self, pid: PIDish, **req_args) -> Any:
        """retrieve information about an object of any kind

        dispatcher method over the more specific methods content(),
        directory(), etc.

        note that this method will buffer the entire output in case of long,
        iterable output (e.g., for snapshot()), see the iter() method for
        streaming

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            what the method specific to the object type returns; for
            snapshots, the list of all partial snapshots

        Raises:
            requests.HTTPError: if HTTP request fails
            ValueError: if the object type of pid is not supported

        """
        pid_ = _get_pid(pid)
        obj_type = pid_.object_type
        if obj_type == SNAPSHOT:
            return list(self.snapshot(pid_, **req_args))
        elif obj_type == REVISION:
            return self.revision(pid_, **req_args)
        elif obj_type == RELEASE:
            return self.release(pid_, **req_args)
        elif obj_type == DIRECTORY:
            return self.directory(pid_, **req_args)
        elif obj_type == CONTENT:
            return self.content(pid_, **req_args)
        else:
            raise ValueError(f'invalid object type: {obj_type}')

    def iter(self, pid: PIDish, **req_args) -> Generator[Dict[str, Any],
                                                         None, None]:
        """stream over the information about an object of any kind

        streaming variant of get()

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            an iterator over: partial snapshots (for snapshots), directory
            entries (for directories), or the single object description (for
            all other object types)

        Raises:
            requests.HTTPError: if HTTP request fails
            ValueError: if the object type of pid is not supported

        """
        pid_ = _get_pid(pid)
        obj_type = pid_.object_type
        if obj_type == SNAPSHOT:
            yield from self.snapshot(pid_, **req_args)
        elif obj_type == REVISION:
            yield self.revision(pid_, **req_args)
        elif obj_type == RELEASE:
            yield self.release(pid_, **req_args)
        elif obj_type == DIRECTORY:
            yield from self.directory(pid_, **req_args)
        elif obj_type == CONTENT:
            yield self.content(pid_, **req_args)
        else:
            raise ValueError(f'invalid object type: {obj_type}')

    def content(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """retrieve information about a content object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            the decoded API response

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return jsonify(
            self._call(f'content/sha1_git:{_get_pid(pid).object_id}/',
                       **req_args),
            CONTENT)

    def directory(self, pid: PIDish, **req_args) -> List[Dict[str, Any]]:
        """retrieve information about a directory object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            the decoded API response, with identifiers converted to
            PersistentId instances (see typify())

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return jsonify(
            self._call(f'directory/{_get_pid(pid).object_id}/', **req_args),
            DIRECTORY)

    def revision(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """retrieve information about a revision object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            the decoded API response, with identifiers and timestamps
            converted to Python types (see typify())

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return jsonify(
            self._call(f'revision/{_get_pid(pid).object_id}/', **req_args),
            REVISION)

    def release(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """retrieve information about a release object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            the decoded API response, with identifiers and timestamps
            converted to Python types (see typify())

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return jsonify(
            self._call(f'release/{_get_pid(pid).object_id}/', **req_args),
            RELEASE)

    def snapshot(self, pid: PIDish,
                 **req_args) -> Generator[Dict[str, Any], None, None]:
        """retrieve information about a snapshot object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            an iterator over partial snapshots, each containing a subset of
            available branches

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        query = f'snapshot/{_get_pid(pid).object_id}/'

        while True:
            r = self._call(query, http_method='get', **req_args)
            yield jsonify(r, SNAPSHOT)

            # follow the "next" relation of the Link header, if any
            next_link = r.links.get('next')
            if next_link is None or 'url' not in next_link:
                return

            query = next_link['url']
            if query.startswith(self.api_path):
                # XXX hackish URL cleaning while we wait for swh-web API to
                # return complete URLs (a-la GitHub/GitLab) in Link headers
                # instead of absolute paths rooted at https://archive.s.o/
                # cf. https://forge.softwareheritage.org/T2147
                query = query[len(self.api_path):].lstrip('/')

    def content_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a content object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request for the object succeeds (HTTP status
            code below 400), False otherwise; HTTP error statuses are hence
            reported as False rather than raised

        """
        return self._call(f'content/sha1_git:{_get_pid(pid).object_id}/',
                          http_method='head', **req_args).ok

    def directory_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a directory object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request for the object succeeds (HTTP status
            code below 400), False otherwise; HTTP error statuses are hence
            reported as False rather than raised

        """
        return self._call(f'directory/{_get_pid(pid).object_id}/',
                          http_method='head', **req_args).ok

    def revision_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a revision object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request for the object succeeds (HTTP status
            code below 400), False otherwise; HTTP error statuses are hence
            reported as False rather than raised

        """
        return self._call(f'revision/{_get_pid(pid).object_id}/',
                          http_method='head', **req_args).ok

    def release_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a release object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request for the object succeeds (HTTP status
            code below 400), False otherwise; HTTP error statuses are hence
            reported as False rather than raised

        """
        return self._call(f'release/{_get_pid(pid).object_id}/',
                          http_method='head', **req_args).ok

    def snapshot_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a snapshot object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request for the object succeeds (HTTP status
            code below 400), False otherwise; HTTP error statuses are hence
            reported as False rather than raised

        """
        return self._call(f'snapshot/{_get_pid(pid).object_id}/',
                          http_method='head', **req_args).ok

    def content_raw(self, pid: PIDish,
                    **req_args) -> Generator[bytes, None, None]:
        """iterate over the raw content of a content object

        note that, this method being a generator, the HTTP request is only
        issued when iteration starts

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            an iterator over chunks of the (undecoded) content

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        # _call() already raises on HTTP errors for 'get' requests
        r = self._call(f'content/sha1_git:{_get_pid(pid).object_id}/raw/',
                       stream=True, **req_args)

        yield from r.iter_content(chunk_size=None, decode_unicode=False)