Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9696835
D2461.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
14 KB
Subscribers
None
D2461.id.diff
View Options
diff --git a/zack/webclient.py b/zack/webclient.py
new file mode 100644
--- /dev/null
+++ b/zack/webclient.py
@@ -0,0 +1,412 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+"""Python client for the Software Heritage Web API
+
+Light wrapper around requests for the archive API, taking care of data
+conversions and pagination.
+
+>>> from webclient import WebAPIClient
+>>> cli = WebAPIClient()
+>>> cli.get('swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6')
+{'id': PersistentId(namespace='swh', scheme_version=1, object_type='revision',
+ object_id='aafb16d69fd30ff58afdd69036a26047f3aebdc6',
+ metadata={}),
+ 'author': {
+ 'fullname': 'Nicolas Dandrimont <nicolas.dandrimont@crans.org>',
+ 'name': 'Nicolas Dandrimont',
+ 'email': 'nicolas.dandrimont@crans.org'
+ },
+ 'date': datetime.datetime(2014, 8, 18, 18, 18, 25,
+ tzinfo=tzoffset(None, 7200)),
+ 'committer': {
+ 'fullname': 'Nicolas Dandrimont <nicolas.dandrimont@crans.org>',
+ 'name': 'Nicolas Dandrimont',
+ 'email': 'nicolas.dandrimont@crans.org'
+ },
+ 'committer_date': datetime.datetime(2014, 8, 18, 18, 18, 25,
+                                    tzinfo=tzoffset(None, 7200)),
+ 'type': 'git',
+ 'directory': PersistentId(namespace='swh', scheme_version=1,
+ object_type='directory',
+ object_id='9f2e5898e00a66e6ac11033959d7e05b1593353b',
+ metadata={}),
+ 'message': "Merge branch 'master' into pr/584\n",
+ 'metadata': {},
+ 'synthetic': False,
+ 'parents': [
+ {'id': PersistentId(namespace='swh', scheme_version=1,
+ object_type='revision',
+ object_id='26307d261279861c2d9c9eca3bb38519f951bea4',
+ metadata={}),
+ 'url': '/api/1/revision/26307d261279861c2d9c9eca3bb38519f951bea4/'},
+ {'id': PersistentId(namespace='swh', scheme_version=1,
+ object_type='revision',
+ object_id='37fc9e08d0c4b71807a4f1ecb06112e78d91c283',
+ metadata={}),
+ 'url': '/api/1/revision/37fc9e08d0c4b71807a4f1ecb06112e78d91c283/'}
+ ],
+ 'merge': True,
+ 'url': '/api/1/revision/aafb16d69fd30ff58afdd69036a26047f3aebdc6/',
+ 'history_url': '/api/1/revision/aafb16d69fd30ff58afdd69036a26047f3aebdc6/log/',
+ 'directory_url': '/api/1/directory/9f2e5898e00a66e6ac11033959d7e05b1593353b/'
+}
+
+"""
+
+from typing import Any, Dict, Generator, List, Union
+from urllib.parse import urlparse
+
+import dateutil.parser
+import requests
+
+from swh.model.identifiers import \
+ SNAPSHOT, REVISION, RELEASE, DIRECTORY, CONTENT
+from swh.model.identifiers import PersistentId as PID
+from swh.model.identifiers import parse_persistent_identifier as parse_pid
+
+
+PIDish = Union[PID, str]
+
+
def _get_pid(pidish: PIDish) -> PID:
    """Return a PersistentId, parsing the argument first if it is a string."""
    return parse_pid(pidish) if isinstance(pidish, str) else pidish
+
+
def typify(data: Any, obj_type: str) -> Any:
    """type API responses using pythonic types where appropriate

    the following conversions are performed, in place on mutable input:

    - identifiers are converted from strings to PersistentId instances
    - timestamps are converted from strings to datetime.datetime objects

    Args:
        data: API response payload (a dict, or a list of dicts for
            directories)
        obj_type: kind of object described by the payload, one of:
            SNAPSHOT, REVISION, RELEASE, DIRECTORY, CONTENT

    Returns:
        the (possibly mutated) input data

    Raises:
        ValueError: if obj_type, or a directory entry type, is invalid

    """
    def to_pid(object_type, s):
        return PID(object_type=object_type, object_id=s)

    def to_date(s):
        return dateutil.parser.parse(s)

    # mapping from directory entry types to PID object types
    entry_obj_types = {'file': CONTENT, 'dir': DIRECTORY, 'rev': REVISION}

    def obj_type_of_entry_type(s):
        if s not in entry_obj_types:
            raise ValueError(f'invalid directory entry type: {s}')
        return entry_obj_types[s]

    if obj_type == SNAPSHOT:
        # branch names are irrelevant here; only targets need typing
        for target in data.values():
            target['target'] = to_pid(target['target_type'], target['target'])
    elif obj_type == REVISION:
        data['id'] = to_pid(obj_type, data['id'])
        data['directory'] = to_pid(DIRECTORY, data['directory'])
        for key in ('date', 'committer_date'):
            data[key] = to_date(data[key])
        for parent in data['parents']:
            parent['id'] = to_pid(REVISION, parent['id'])
    elif obj_type == RELEASE:
        data['id'] = to_pid(obj_type, data['id'])
        data['date'] = to_date(data['date'])
        data['target'] = to_pid(data['target_type'], data['target'])
    elif obj_type == DIRECTORY:
        # all entries share the same dir_id: build its PID only once
        dir_pid = None
        for entry in data:
            dir_pid = dir_pid or to_pid(obj_type, entry['dir_id'])
            entry['dir_id'] = dir_pid
            entry['target'] = to_pid(obj_type_of_entry_type(entry['type']),
                                     entry['target'])
    elif obj_type == CONTENT:
        pass  # nothing to do for contents
    else:
        raise ValueError(f'invalid object type: {obj_type}')

    return data
+
+
class WebAPIClient:
    """client for the Software Heritage archive Web API, see

    https://archive.softwareheritage.org/api/

    """

    def __init__(self, api_url='https://archive.softwareheritage.org/api/1'):
        """create a client for the Software Heritage Web API

        see: https://archive.softwareheritage.org/api/

        Args:
            api_url: base URL for API calls (default:
                "https://archive.softwareheritage.org/api/1")

        """
        api_url = api_url.rstrip('/')
        u = urlparse(api_url)

        self.api_url = api_url
        # path component of the base URL; snapshot() uses it to strip
        # absolute-path pagination URLs returned in Link headers
        self.api_path = u.path

    def _call(self, query: str, http_method: str = 'get',
              **req_args) -> requests.models.Response:
        """dispatcher for archive API invocation

        Args:
            query: API method to be invoked, rooted at api_url
            http_method: HTTP method to be invoked, one of: 'get', 'head'
            req_args: extra keyword arguments for requests.get()/.head()

        Raises:
            requests.HTTPError: if HTTP request fails and http_method is 'get'
            ValueError: if http_method is not supported

        """
        url = '/'.join([self.api_url, query])

        if http_method == 'get':
            r = requests.get(url, **req_args)
            r.raise_for_status()
        elif http_method == 'head':
            # deliberately no raise_for_status() here: the *_exists()
            # methods rely on response truthiness (falsy on 4xx/5xx)
            r = requests.head(url, **req_args)
        else:
            raise ValueError(f'unsupported HTTP method: {http_method}')

        return r

    def get(self, pid: PIDish, **req_args) -> Any:
        """retrieve information about an object of any kind

        dispatcher method over the more specific methods content(),
        directory(), etc.

        note that this method will buffer the entire output in case of long,
        iterable output (e.g., for snapshot()), see the iter() method for
        streaming

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        pid_ = _get_pid(pid)
        getters = {
            CONTENT: self.content,
            DIRECTORY: self.directory,
            RELEASE: self.release,
            REVISION: self.revision,
            SNAPSHOT: lambda pid, **kw: dict(self.snapshot(pid, **kw)),
        }
        # forward req_args so that, e.g., timeouts/headers are honored here
        # just as they are when calling the specific methods directly
        return getters[pid_.object_type](pid_, **req_args)

    def iter(self, pid: PIDish, **req_args) -> Generator[Dict[str, Any],
                                                         None, None]:
        """stream over the information about an object of any kind

        streaming variant of get()

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails
            ValueError: if the object type of pid is invalid

        """
        pid_ = _get_pid(pid)
        obj_type = pid_.object_type
        if obj_type == SNAPSHOT:
            yield from self.snapshot(pid_, **req_args)
        elif obj_type == REVISION:
            yield self.revision(pid_, **req_args)
        elif obj_type == RELEASE:
            yield self.release(pid_, **req_args)
        elif obj_type == DIRECTORY:
            yield from self.directory(pid_, **req_args)
        elif obj_type == CONTENT:
            yield self.content(pid_, **req_args)
        else:
            raise ValueError(f'invalid object type: {obj_type}')

    def content(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """retrieve information about a content object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'content/sha1_git:{_get_pid(pid).object_id}/',
                       **req_args).json(),
            CONTENT)

    def directory(self, pid: PIDish, **req_args) -> List[Dict[str, Any]]:
        """retrieve information about a directory object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'directory/{_get_pid(pid).object_id}/',
                       **req_args).json(),
            DIRECTORY)

    def revision(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """retrieve information about a revision object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'revision/{_get_pid(pid).object_id}/',
                       **req_args).json(),
            REVISION)

    def release(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """retrieve information about a release object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f'release/{_get_pid(pid).object_id}/',
                       **req_args).json(),
            RELEASE)

    def snapshot(self, pid: PIDish,
                 **req_args) -> Generator[Dict[str, Any], None, None]:
        """retrieve information about a snapshot object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            an iterator over partial snapshots (branch name/target pairs),
            each page containing a subset of available branches

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        done = False
        query = f'snapshot/{_get_pid(pid).object_id}/'

        while not done:
            r = self._call(query, http_method='get', **req_args)
            yield from typify(r.json()['branches'], SNAPSHOT).items()
            if 'next' in r.links and 'url' in r.links['next']:
                query = r.links['next']['url']
                if query.startswith(self.api_path):
                    # XXX hackish URL cleaning while we wait for swh-web API
                    # to return complete URLs (a-la GitHub/GitLab) in Link
                    # headers instead of absolute paths rooted at
                    # https://archive.s.o/
                    # cf. https://forge.softwareheritage.org/T2147
                    query = query[len(self.api_path):].lstrip('/')
            else:
                done = True

    def content_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a content object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the object exists (HEAD request succeeded), False
            otherwise; HTTP errors are mapped to False rather than raised

        """
        return bool(self._call(f'content/sha1_git:{_get_pid(pid).object_id}/',
                               http_method='head', **req_args))

    def directory_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a directory object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the object exists (HEAD request succeeded), False
            otherwise; HTTP errors are mapped to False rather than raised

        """
        return bool(self._call(f'directory/{_get_pid(pid).object_id}/',
                               http_method='head', **req_args))

    def revision_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a revision object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the object exists (HEAD request succeeded), False
            otherwise; HTTP errors are mapped to False rather than raised

        """
        return bool(self._call(f'revision/{_get_pid(pid).object_id}/',
                               http_method='head', **req_args))

    def release_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a release object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the object exists (HEAD request succeeded), False
            otherwise; HTTP errors are mapped to False rather than raised

        """
        return bool(self._call(f'release/{_get_pid(pid).object_id}/',
                               http_method='head', **req_args))

    def snapshot_exists(self, pid: PIDish, **req_args) -> bool:
        """check if a snapshot object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the object exists (HEAD request succeeded), False
            otherwise; HTTP errors are mapped to False rather than raised

        """
        return bool(self._call(f'snapshot/{_get_pid(pid).object_id}/',
                               http_method='head', **req_args))

    def content_raw(self, pid: PIDish,
                    **req_args) -> Generator[bytes, None, None]:
        """iterate over the raw content of a content object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        # _call() already invokes raise_for_status() on the 'get' path,
        # so no extra status check is needed here
        r = self._call(f'content/sha1_git:{_get_pid(pid).object_id}/raw/',
                       stream=True, **req_args)

        yield from r.iter_content(chunk_size=None, decode_unicode=False)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Aug 17, 9:44 PM (13 h, 3 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3216544
Attached To
D2461: add Python client for the archive WEB API
Event Timeline
Log In to Comment