diff --git a/docs/README.rst b/docs/README.rst
index 5f1355e..02cd253 100644
--- a/docs/README.rst
+++ b/docs/README.rst
@@ -1,26 +1,26 @@
 Software Heritage - Web client
 ==============================

 Client for Software Heritage Web applications, via their APIs.

 Sample usage
 ------------

 .. code-block:: python

    from swh.web.client.client import WebAPIClient
    cli = WebAPIClient()

-   # retrieve any archived object via its PID
+   # retrieve any archived object via its SWHID
    cli.get('swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6')

    # same, but for specific object types
    cli.revision('swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6')

    # get() always retrieve entire objects, following pagination
    # WARNING: this might *not* be what you want for large objects
    cli.get('swh:1:snp:6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a')

    # type-specific methods support explicit iteration through pages
    next(cli.snapshot('swh:1:snp:cabcc7d7bf639bbe1cc3b41989e1806618dd5764'))
diff --git a/swh/web/client/client.py b/swh/web/client/client.py
index 5630d67..8d5250a 100644
--- a/swh/web/client/client.py
+++ b/swh/web/client/client.py
@@ -1,475 +1,489 @@
 # Copyright (C) 2019-2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 """Python client for the Software Heritage Web API

 Light wrapper around requests for the archive API, taking care of data
 conversions and pagination.

 .. code-block:: python

    from swh.web.client.client import WebAPIClient
    cli = WebAPIClient()

-   # retrieve any archived object via its PID
+   # retrieve any archived object via its SWHID
    cli.get('swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6')

    # same, but for specific object types
    cli.revision('swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6')

    # get() always retrieve entire objects, following pagination
    # WARNING: this might *not* be what you want for large objects
    cli.get('swh:1:snp:6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a')

    # type-specific methods support explicit iteration through pages
    next(cli.snapshot('swh:1:snp:cabcc7d7bf639bbe1cc3b41989e1806618dd5764'))

 """

-from typing import Any, Callable, Dict, Generator, List, Optional, Union
+from typing import Any, Callable, Dict, Iterator, List, Optional, Union
 from urllib.parse import urlparse

 import dateutil.parser
 import requests

-from swh.model.identifiers import CONTENT, DIRECTORY, RELEASE, REVISION, SNAPSHOT
-from swh.model.identifiers import PersistentId as PID
-from swh.model.identifiers import parse_persistent_identifier as parse_pid
+from swh.model.identifiers import (
+    CONTENT,
+    DIRECTORY,
+    RELEASE,
+    REVISION,
+    SNAPSHOT,
+    SWHID,
+    parse_swhid,
+)

-PIDish = Union[PID, str]
+SWHIDish = Union[SWHID, str]

 ORIGIN_VISIT = "origin_visit"


-def _get_pid(pidish: PIDish) -> PID:
-    """Parse string to PID if needed"""
-    if isinstance(pidish, str):
-        return parse_pid(pidish)
+def _get_swhid(swhidish: SWHIDish) -> SWHID:
+    """Parse string to SWHID if needed"""
+    if isinstance(swhidish, str):
+        return parse_swhid(swhidish)
     else:
-        return pidish
+        return swhidish


 def typify(data: Any, obj_type: str) -> Any:
     """Type API responses using pythonic types where appropriate

     The following conversions are performed:

-    - identifiers are converted from strings to PersistentId instances
+    - identifiers are converted from strings to SWHID instances
     - timestamps are converted from strings to datetime.datetime objects

     """

-    def to_pid(object_type, s):
-        return PID(object_type=object_type, object_id=s)
+    def to_swhid(object_type, s):
+        return SWHID(object_type=object_type, object_id=s)

     def to_date(s):
         return dateutil.parser.parse(s)

     def obj_type_of_entry_type(s):
         if s == "file":
             return CONTENT
         elif s == "dir":
             return DIRECTORY
         elif s == "rev":
             return REVISION
         else:
             raise ValueError(f"invalid directory entry type: {s}")

     if obj_type == SNAPSHOT:
         for name, target in data.items():
             if target["target_type"] != "alias":
-                # alias targets do not point to objects via PIDs; others do
-                target["target"] = to_pid(target["target_type"], target["target"])
+                # alias targets do not point to objects via SWHIDs; others do
+                target["target"] = to_swhid(target["target_type"], target["target"])
     elif obj_type == REVISION:
-        data["id"] = to_pid(obj_type, data["id"])
-        data["directory"] = to_pid(DIRECTORY, data["directory"])
+        data["id"] = to_swhid(obj_type, data["id"])
+        data["directory"] = to_swhid(DIRECTORY, data["directory"])
         for key in ("date", "committer_date"):
             data[key] = to_date(data[key])
         for parent in data["parents"]:
-            parent["id"] = to_pid(REVISION, parent["id"])
+            parent["id"] = to_swhid(REVISION, parent["id"])
     elif obj_type == RELEASE:
-        data["id"] = to_pid(obj_type, data["id"])
+        data["id"] = to_swhid(obj_type, data["id"])
         data["date"] = to_date(data["date"])
-        data["target"] = to_pid(data["target_type"], data["target"])
+        data["target"] = to_swhid(data["target_type"], data["target"])
     elif obj_type == DIRECTORY:
-        dir_pid = None
+        dir_swhid = None
         for entry in data:
-            dir_pid = dir_pid or to_pid(obj_type, entry["dir_id"])
-            entry["dir_id"] = dir_pid
-            entry["target"] = to_pid(
+            dir_swhid = dir_swhid or to_swhid(obj_type, entry["dir_id"])
+            entry["dir_id"] = dir_swhid
+            entry["target"] = to_swhid(
                 obj_type_of_entry_type(entry["type"]), entry["target"]
             )
     elif obj_type == CONTENT:
         pass  # nothing to do for contents
     elif obj_type == ORIGIN_VISIT:
         data["date"] = to_date(data["date"])
         if data["snapshot"] is not None:
-            data["snapshot"] = to_pid(SNAPSHOT, data["snapshot"])
+            data["snapshot"] = to_swhid(SNAPSHOT, data["snapshot"])
     else:
         raise ValueError(f"invalid object type: {obj_type}")

     return data


 class WebAPIClient:
     """Client for the Software Heritage archive Web API, see

     https://archive.softwareheritage.org/api/

     """

     def __init__(
         self,
         api_url: str = "https://archive.softwareheritage.org/api/1",
         bearer_token: Optional[str] = None,
     ):
         """Create a client for the Software Heritage Web API

         See: https://archive.softwareheritage.org/api/

         Args:
             api_url: base URL for API calls (default:
                 "https://archive.softwareheritage.org/api/1")
             bearer_token: optional bearer token to do authenticated API calls

         """
         api_url = api_url.rstrip("/")
         u = urlparse(api_url)

         self.api_url = api_url
         self.api_path = u.path
         self.bearer_token = bearer_token

-        self._getters: Dict[str, Callable[[PIDish], Any]] = {
+        self._getters: Dict[str, Callable[[SWHIDish], Any]] = {
             CONTENT: self.content,
             DIRECTORY: self.directory,
             RELEASE: self.release,
             REVISION: self.revision,
             SNAPSHOT: self._get_snapshot,
         }

     def _call(
         self, query: str, http_method: str = "get", **req_args
     ) -> requests.models.Response:
         """Dispatcher for archive API invocation

         Args:
             query: API method to be invoked, rooted at api_url
             http_method: HTTP method to be invoked, one of: 'get', 'head'
             req_args: extra keyword arguments for requests.get()/.head()

         Raises:
             requests.HTTPError: if HTTP request fails and http_method is 'get'

         """
         url = None
         if urlparse(query).scheme:  # absolute URL
             url = query
         else:  # relative URL; prepend base API URL
             url = "/".join([self.api_url, query])
         r = None

         headers = {}
         if self.bearer_token is not None:
             headers = {"Authorization": f"Bearer {self.bearer_token}"}

         if http_method == "get":
             r = requests.get(url, **req_args, headers=headers)
             r.raise_for_status()
         elif http_method == "head":
             r = requests.head(url, **req_args, headers=headers)
         else:
             raise ValueError(f"unsupported HTTP method: {http_method}")

         return r

-    def _get_snapshot(self, pid: PIDish) -> Dict[str, Any]:
+    def _get_snapshot(self, swhid: SWHIDish) -> Dict[str, Any]:
         """Analogous to self.snapshot(), but zipping through partial snapshots,
         merging them together before returning

         """
         snapshot = {}
-        for snp in self.snapshot(pid):
+        for snp in self.snapshot(swhid):
             snapshot.update(snp)

         return snapshot

-    def get(self, pid: PIDish, **req_args) -> Any:
+    def get(self, swhid: SWHIDish, **req_args) -> Any:
         """Retrieve information about an object of any kind

         Dispatcher method over the more specific methods content(),
         directory(), etc.

         Note that this method will buffer the entire output in case of long,
         iterable output (e.g., for snapshot()), see the iter() method for
         streaming.

         """
-        pid_ = _get_pid(pid)
-        return self._getters[pid_.object_type](pid_)
+        swhid_ = _get_swhid(swhid)
+        return self._getters[swhid_.object_type](swhid_)

-    def iter(self, pid: PIDish, **req_args) -> Generator[Dict[str, Any], None, None]:
+    def iter(self, swhid: SWHIDish, **req_args) -> Iterator[Dict[str, Any]]:
         """Stream over the information about an object of any kind

         Streaming variant of get()

         """
-        pid_ = _get_pid(pid)
-        obj_type = pid_.object_type
+        swhid_ = _get_swhid(swhid)
+        obj_type = swhid_.object_type
         if obj_type == SNAPSHOT:
-            yield from self.snapshot(pid_)
+            yield from self.snapshot(swhid_)
         elif obj_type == REVISION:
-            yield from [self.revision(pid_)]
+            yield from [self.revision(swhid_)]
         elif obj_type == RELEASE:
-            yield from [self.release(pid_)]
+            yield from [self.release(swhid_)]
         elif obj_type == DIRECTORY:
-            yield from self.directory(pid_)
+            yield from self.directory(swhid_)
         elif obj_type == CONTENT:
-            yield from [self.content(pid_)]
+            yield from [self.content(swhid_)]
         else:
             raise ValueError(f"invalid object type: {obj_type}")

-    def content(self, pid: PIDish, **req_args) -> Dict[str, Any]:
+    def content(self, swhid: SWHIDish, **req_args) -> Dict[str, Any]:
         """Retrieve information about a content object

         Args:
-            pid: object identifier
+            swhid: object persistent identifier
             req_args: extra keyword arguments for requests.get()

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         return typify(
             self._call(
-                f"content/sha1_git:{_get_pid(pid).object_id}/", **req_args
+                f"content/sha1_git:{_get_swhid(swhid).object_id}/", **req_args
             ).json(),
             CONTENT,
         )

-    def directory(self, pid: PIDish, **req_args) -> List[Dict[str, Any]]:
+    def directory(self, swhid: SWHIDish, **req_args) -> List[Dict[str, Any]]:
         """Retrieve information about a directory object

         Args:
-            pid: object identifier
+            swhid: object persistent identifier
             req_args: extra keyword arguments for requests.get()

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         return typify(
-            self._call(f"directory/{_get_pid(pid).object_id}/", **req_args).json(),
+            self._call(f"directory/{_get_swhid(swhid).object_id}/", **req_args).json(),
             DIRECTORY,
         )

-    def revision(self, pid: PIDish, **req_args) -> Dict[str, Any]:
+    def revision(self, swhid: SWHIDish, **req_args) -> Dict[str, Any]:
         """Retrieve information about a revision object

         Args:
-            pid: object identifier
+            swhid: object persistent identifier
             req_args: extra keyword arguments for requests.get()

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         return typify(
-            self._call(f"revision/{_get_pid(pid).object_id}/", **req_args).json(),
+            self._call(f"revision/{_get_swhid(swhid).object_id}/", **req_args).json(),
             REVISION,
         )

-    def release(self, pid: PIDish, **req_args) -> Dict[str, Any]:
+    def release(self, swhid: SWHIDish, **req_args) -> Dict[str, Any]:
         """Retrieve information about a release object

         Args:
-            pid: object identifier
+            swhid: object persistent identifier
             req_args: extra keyword arguments for requests.get()

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         return typify(
-            self._call(f"release/{_get_pid(pid).object_id}/", **req_args).json(),
+            self._call(f"release/{_get_swhid(swhid).object_id}/", **req_args).json(),
             RELEASE,
         )

-    def snapshot(
-        self, pid: PIDish, **req_args
-    ) -> Generator[Dict[str, Any], None, None]:
+    def snapshot(self, swhid: SWHIDish, **req_args) -> Iterator[Dict[str, Any]]:
         """Retrieve information about a snapshot object

         Args:
-            pid: object identifier
+            swhid: object persistent identifier
             req_args: extra keyword arguments for requests.get()

         Returns:
             an iterator over partial snapshots (dictionaries mapping branch
             names to information about where they point to), each containing a
             subset of available branches

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         done = False
         r = None
-        query = f"snapshot/{_get_pid(pid).object_id}/"
+        query = f"snapshot/{_get_swhid(swhid).object_id}/"

         while not done:
             r = self._call(query, http_method="get", **req_args)
             yield typify(r.json()["branches"], SNAPSHOT)
             if "next" in r.links and "url" in r.links["next"]:
                 query = r.links["next"]["url"]
             else:
                 done = True

     def visits(
         self,
         origin: str,
         per_page: Optional[int] = None,
         last_visit: Optional[int] = None,
         **req_args,
-    ) -> Generator[Dict[str, Any], None, None]:
+    ) -> Iterator[Dict[str, Any]]:
         """List visits of an origin

         Args:
             origin: the URL of a software origin
             per_page: the number of visits to list
             last_visit: visit to start listing from
             req_args: extra keyword arguments for requests.get()

         Returns:
             an iterator over visits of the origin

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         done = False
         r = None
         params = []
         if last_visit is not None:
             params.append(("last_visit", last_visit))
         if per_page is not None:
             params.append(("per_page", per_page))

         query = f"origin/{origin}/visits/"

         while not done:
             r = self._call(query, http_method="get", params=params, **req_args)
             yield from [typify(v, ORIGIN_VISIT) for v in r.json()]
             if "next" in r.links and "url" in r.links["next"]:
                 params = []
                 query = r.links["next"]["url"]
             else:
                 done = True

-    def content_exists(self, pid: PIDish, **req_args) -> bool:
+    def content_exists(self, swhid: SWHIDish, **req_args) -> bool:
         """Check if a content object exists in the archive

         Args:
-            pid: object identifier
+            swhid: object persistent identifier
             req_args: extra keyword arguments for requests.head()

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         return bool(
             self._call(
-                f"content/sha1_git:{_get_pid(pid).object_id}/",
+                f"content/sha1_git:{_get_swhid(swhid).object_id}/",
                 http_method="head",
                 **req_args,
             )
         )

-    def directory_exists(self, pid: PIDish, **req_args) -> bool:
+    def directory_exists(self, swhid: SWHIDish, **req_args) -> bool:
         """Check if a directory object exists in the archive

         Args:
-            pid: object identifier
+            swhid: object persistent identifier
             req_args: extra keyword arguments for requests.head()

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         return bool(
             self._call(
-                f"directory/{_get_pid(pid).object_id}/", http_method="head", **req_args
+                f"directory/{_get_swhid(swhid).object_id}/",
+                http_method="head",
+                **req_args,
             )
         )

-    def revision_exists(self, pid: PIDish, **req_args) -> bool:
+    def revision_exists(self, swhid: SWHIDish, **req_args) -> bool:
         """Check if a revision object exists in the archive

         Args:
-            pid: object identifier
+            swhid: object persistent identifier
             req_args: extra keyword arguments for requests.head()

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         return bool(
             self._call(
-                f"revision/{_get_pid(pid).object_id}/", http_method="head", **req_args
+                f"revision/{_get_swhid(swhid).object_id}/",
+                http_method="head",
+                **req_args,
             )
         )

-    def release_exists(self, pid: PIDish, **req_args) -> bool:
+    def release_exists(self, swhid: SWHIDish, **req_args) -> bool:
         """Check if a release object exists in the archive

         Args:
-            pid: object identifier
+            swhid: object persistent identifier
             req_args: extra keyword arguments for requests.head()

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         return bool(
             self._call(
-                f"release/{_get_pid(pid).object_id}/", http_method="head", **req_args
+                f"release/{_get_swhid(swhid).object_id}/",
+                http_method="head",
+                **req_args,
             )
         )

-    def snapshot_exists(self, pid: PIDish, **req_args) -> bool:
+    def snapshot_exists(self, swhid: SWHIDish, **req_args) -> bool:
         """Check if a snapshot object exists in the archive

         Args:
-            pid: object identifier
+            swhid: object persistent identifier
             req_args: extra keyword arguments for requests.head()

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         return bool(
             self._call(
-                f"snapshot/{_get_pid(pid).object_id}/", http_method="head", **req_args
+                f"snapshot/{_get_swhid(swhid).object_id}/",
+                http_method="head",
+                **req_args,
             )
         )

-    def content_raw(self, pid: PIDish, **req_args) -> Generator[bytes, None, None]:
+    def content_raw(self, swhid: SWHIDish, **req_args) -> Iterator[bytes]:
         """Iterate over the raw content of a content object

         Args:
-            pid: object identifier
+            swhid: object persistent identifier
             req_args: extra keyword arguments for requests.get()

         Raises:
             requests.HTTPError: if HTTP request fails

         """
         r = self._call(
-            f"content/sha1_git:{_get_pid(pid).object_id}/raw/", stream=True, **req_args
+            f"content/sha1_git:{_get_swhid(swhid).object_id}/raw/",
+            stream=True,
+            **req_args,
         )
         r.raise_for_status()
         yield from r.iter_content(chunk_size=None, decode_unicode=False)
diff --git a/swh/web/client/tests/test_web_api_client.py b/swh/web/client/tests/test_web_api_client.py
index 9f06782..082afbf 100644
--- a/swh/web/client/tests/test_web_api_client.py
+++ b/swh/web/client/tests/test_web_api_client.py
@@ -1,145 +1,145 @@
 # Copyright (C) 2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 from dateutil.parser import parse as parse_date

-from swh.model.identifiers import parse_persistent_identifier as parse_pid
+from swh.model.identifiers import parse_swhid


 def test_get_content(web_api_client, web_api_mock):
-    pid = parse_pid("swh:1:cnt:fe95a46679d128ff167b7c55df5d02356c5a1ae1")
-    obj = web_api_client.get(pid)
+    swhid = parse_swhid("swh:1:cnt:fe95a46679d128ff167b7c55df5d02356c5a1ae1")
+    obj = web_api_client.get(swhid)

     assert obj["length"] == 151810
     for key in ("length", "status", "checksums", "data_url"):
         assert key in obj
-    assert obj["checksums"]["sha1_git"] == str(pid).split(":")[3]
+    assert obj["checksums"]["sha1_git"] == str(swhid).split(":")[3]
     assert obj["checksums"]["sha1"] == "dc2830a9e72f23c1dfebef4413003221baa5fb62"

-    assert obj == web_api_client.content(pid)
+    assert obj == web_api_client.content(swhid)


 def test_get_directory(web_api_client, web_api_mock):
-    pid = parse_pid("swh:1:dir:977fc4b98c0e85816348cebd3b12026407c368b6")
-    obj = web_api_client.get(pid)
+    swhid = parse_swhid("swh:1:dir:977fc4b98c0e85816348cebd3b12026407c368b6")
+    obj = web_api_client.get(swhid)

     assert len(obj) == 35  # number of directory entries
-    assert all(map(lambda entry: entry["dir_id"] == pid, obj))
+    assert all(map(lambda entry: entry["dir_id"] == swhid, obj))
     dir_entry = obj[0]
     assert dir_entry["type"] == "file"
-    assert dir_entry["target"] == parse_pid(
+    assert dir_entry["target"] == parse_swhid(
         "swh:1:cnt:58471109208922c9ee8c4b06135725f03ed16814"
     )
     assert dir_entry["name"] == ".bzrignore"
     assert dir_entry["length"] == 582

-    assert obj == web_api_client.directory(pid)
+    assert obj == web_api_client.directory(swhid)


 def test_get_release(web_api_client, web_api_mock):
-    pid = parse_pid("swh:1:rel:b9db10d00835e9a43e2eebef2db1d04d4ae82342")
-    obj = web_api_client.get(pid)
+    swhid = parse_swhid("swh:1:rel:b9db10d00835e9a43e2eebef2db1d04d4ae82342")
+    obj = web_api_client.get(swhid)

-    assert obj["id"] == pid
+    assert obj["id"] == swhid
     assert obj["author"]["fullname"] == "Paul Tagliamonte "
     assert obj["author"]["name"] == "Paul Tagliamonte"
     assert obj["date"] == parse_date("2013-07-06T19:34:11-04:00")
     assert obj["name"] == "0.9.9"
     assert obj["target_type"] == "revision"
-    assert obj["target"] == parse_pid(
+    assert obj["target"] == parse_swhid(
         "swh:1:rev:e005cb773c769436709ca6a1d625dc784dbc1636"
     )
     assert not obj["synthetic"]

-    assert obj == web_api_client.release(pid)
+    assert obj == web_api_client.release(swhid)


 def test_get_revision(web_api_client, web_api_mock):
-    pid = parse_pid("swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6")
-    obj = web_api_client.get(pid)
+    swhid = parse_swhid("swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6")
+    obj = web_api_client.get(swhid)

-    assert obj["id"] == pid
+    assert obj["id"] == swhid
     for role in ("author", "committer"):
         assert (
             obj[role]["fullname"] == "Nicolas Dandrimont "
         )
         assert obj[role]["name"] == "Nicolas Dandrimont"

     timestamp = parse_date("2014-08-18T18:18:25+02:00")
     assert obj["date"] == timestamp
     assert obj["committer_date"] == timestamp

     assert obj["message"].startswith("Merge branch")
     assert obj["merge"]

     assert len(obj["parents"]) == 2
-    assert obj["parents"][0]["id"] == parse_pid(
+    assert obj["parents"][0]["id"] == parse_swhid(
         "swh:1:rev:26307d261279861c2d9c9eca3bb38519f951bea4"
     )
-    assert obj["parents"][1]["id"] == parse_pid(
+    assert obj["parents"][1]["id"] == parse_swhid(
         "swh:1:rev:37fc9e08d0c4b71807a4f1ecb06112e78d91c283"
     )

-    assert obj == web_api_client.revision(pid)
+    assert obj == web_api_client.revision(swhid)


 def test_get_snapshot(web_api_client, web_api_mock):
     # small snapshot, the one from Web API doc
-    pid = parse_pid("swh:1:snp:6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a")
-    obj = web_api_client.get(pid)
+    swhid = parse_swhid("swh:1:snp:6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a")
+    obj = web_api_client.get(swhid)

     assert len(obj) == 4
     assert obj["refs/heads/master"]["target_type"] == "revision"
-    assert obj["refs/heads/master"]["target"] == parse_pid(
+    assert obj["refs/heads/master"]["target"] == parse_swhid(
         "swh:1:rev:83c20a6a63a7ebc1a549d367bc07a61b926cecf3"
     )
     assert obj["refs/tags/dpkt-1.7"]["target_type"] == "revision"
-    assert obj["refs/tags/dpkt-1.7"]["target"] == parse_pid(
+    assert obj["refs/tags/dpkt-1.7"]["target"] == parse_swhid(
         "swh:1:rev:0c9dbfbc0974ec8ac1d8253aa1092366a03633a8"
     )


 def test_iter_snapshot(web_api_client, web_api_mock):
     # large snapshot from the Linux kernel, usually spanning two pages
-    pid = parse_pid("swh:1:snp:cabcc7d7bf639bbe1cc3b41989e1806618dd5764")
-    obj = web_api_client.snapshot(pid)
+    swhid = parse_swhid("swh:1:snp:cabcc7d7bf639bbe1cc3b41989e1806618dd5764")
+    obj = web_api_client.snapshot(swhid)

     snp = {}
     for partial in obj:
         snp.update(partial)

     assert len(snp) == 1391


 def test_authentication(web_api_client, web_api_mock):
     rel_id = "b9db10d00835e9a43e2eebef2db1d04d4ae82342"
     url = f"{web_api_client.api_url}/release/{rel_id}/"

     refresh_token = "user-refresh-token"
     web_api_client.bearer_token = refresh_token

-    pid = parse_pid(f"swh:1:rel:{rel_id}")
-    web_api_client.get(pid)
+    swhid = parse_swhid(f"swh:1:rel:{rel_id}")
+    web_api_client.get(swhid)

     sent_request = web_api_mock._adapter.last_request

     assert sent_request.url == url
     assert "Authorization" in sent_request.headers
     assert sent_request.headers["Authorization"] == f"Bearer {refresh_token}"


 def test_get_visits(web_api_client, web_api_mock):
     obj = web_api_client.visits(
         "https://github.com/NixOS/nixpkgs", last_visit=50, per_page=10
     )
     visits = [v for v in obj]
     assert len(visits) == 20

     timestamp = parse_date("2018-07-31 04:34:23.298931+00:00")
     assert visits[0]["date"] == timestamp

     assert visits[0]["snapshot"] is None
-    snapshot_pid = "swh:1:snp:456550ea74af4e2eecaa406629efaaf0b9b5f976"
-    assert visits[7]["snapshot"] == parse_pid(snapshot_pid)
+    snapshot_swhid = "swh:1:snp:456550ea74af4e2eecaa406629efaaf0b9b5f976"
+    assert visits[7]["snapshot"] == parse_swhid(snapshot_swhid)