Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F8394978
client.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
14 KB
Subscribers
None
client.py
View Options
# Copyright (C) 2019-2020 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""Python client for the Software Heritage Web API
Light wrapper around requests for the archive API, taking care of data
conversions and pagination.

.. code-block:: python

   from swh.web.client.client import WebAPIClient
   cli = WebAPIClient()

   # retrieve any archived object via its PID
   cli.get('swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6')

   # same, but for specific object types
   cli.revision('swh:1:rev:aafb16d69fd30ff58afdd69036a26047f3aebdc6')

   # get() always retrieves entire objects, following pagination
   # WARNING: this might *not* be what you want for large objects
   cli.get('swh:1:snp:6a3a2cf0b2b90ce7ae1cf0a221ed68035b686f5a')

   # type-specific methods support explicit iteration through pages
   next(cli.snapshot('swh:1:snp:cabcc7d7bf639bbe1cc3b41989e1806618dd5764'))

"""
from
typing
import
Any
,
Callable
,
Dict
,
Generator
,
List
,
Optional
,
Union
from
urllib.parse
import
urlparse
import
dateutil.parser
import
requests
from
swh.model.identifiers
import
CONTENT
,
DIRECTORY
,
RELEASE
,
REVISION
,
SNAPSHOT
from
swh.model.identifiers
import
PersistentId
as
PID
from
swh.model.identifiers
import
parse_persistent_identifier
as
parse_pid
# Anything accepted where an object identifier is expected: either an
# already parsed PID or a string (strings are parsed by _get_pid() below).
PIDish = Union[PID, str]


# Pseudo object type understood by typify() for origin visit records, in
# addition to the object types imported from swh.model.identifiers.
ORIGIN_VISIT = "origin_visit"
def _get_pid(pidish: PIDish) -> PID:
    """Return ``pidish`` as a PID, parsing it first when it is a string

    Args:
        pidish: either a PID object, returned unchanged, or a string, which
            is handed over to parse_pid()

    """
    return parse_pid(pidish) if isinstance(pidish, str) else pidish
def typify(data: Any, obj_type: str) -> Any:
    """Type API responses using pythonic types where appropriate

    The conversion is done in place: ``data`` is mutated and then returned.

    The following conversions are performed:

    - identifiers are converted from strings to PersistentId instances
    - timestamps are converted from strings to datetime.datetime objects;
      null (None) timestamps are left as None

    Args:
        data: decoded JSON payload of an API response; a mapping for all
            object types except directories, for which it is an iterable of
            entries (mappings)
        obj_type: type of the object described by data: CONTENT, DIRECTORY,
            RELEASE, REVISION, SNAPSHOT or ORIGIN_VISIT

    Returns:
        the very same ``data`` object, after conversion

    Raises:
        ValueError: if obj_type, or the type of a directory entry, is not a
            known one

    """

    def to_pid(object_type, s):
        return PID(object_type=object_type, object_id=s)

    def to_date(s):
        # a null timestamp cannot be parsed: pass it through unchanged, the
        # same way null snapshots of origin visits are handled below
        if s is None:
            return None
        return dateutil.parser.parse(s)

    def obj_type_of_entry_type(s):
        if s == "file":
            return CONTENT
        elif s == "dir":
            return DIRECTORY
        elif s == "rev":
            return REVISION
        else:
            raise ValueError(f"invalid directory entry type: {s}")

    if obj_type == SNAPSHOT:
        # branch names (the keys) need no conversion, only what they point to
        for target in data.values():
            if target is None:
                # NOTE(review): presumably the API reports dangling branches
                # as null; confirm against the Web API documentation
                continue
            if target["target_type"] != "alias":
                # alias targets do not point to objects via PIDs; others do
                target["target"] = to_pid(target["target_type"], target["target"])
    elif obj_type == REVISION:
        data["id"] = to_pid(obj_type, data["id"])
        data["directory"] = to_pid(DIRECTORY, data["directory"])
        for key in ("date", "committer_date"):
            data[key] = to_date(data[key])
        for parent in data["parents"]:
            parent["id"] = to_pid(REVISION, parent["id"])
    elif obj_type == RELEASE:
        data["id"] = to_pid(obj_type, data["id"])
        data["date"] = to_date(data["date"])
        data["target"] = to_pid(data["target_type"], data["target"])
    elif obj_type == DIRECTORY:
        # the PID built from the first entry's "dir_id" is reused for every
        # entry of the listing
        dir_pid = None
        for entry in data:
            dir_pid = dir_pid or to_pid(obj_type, entry["dir_id"])
            entry["dir_id"] = dir_pid
            entry["target"] = to_pid(
                obj_type_of_entry_type(entry["type"]), entry["target"]
            )
    elif obj_type == CONTENT:
        pass  # nothing to do for contents
    elif obj_type == ORIGIN_VISIT:
        data["date"] = to_date(data["date"])
        if data["snapshot"] is not None:
            data["snapshot"] = to_pid(SNAPSHOT, data["snapshot"])
    else:
        raise ValueError(f"invalid object type: {obj_type}")

    return data
class WebAPIClient:
    """Client for the Software Heritage archive Web API, see

    https://archive.softwareheritage.org/api/

    """

    def __init__(
        self,
        api_url: str = "https://archive.softwareheritage.org/api/1",
        bearer_token: Optional[str] = None,
    ):
        """Create a client for the Software Heritage Web API

        See: https://archive.softwareheritage.org/api/

        Args:
            api_url: base URL for API calls (default:
                "https://archive.softwareheritage.org/api/1")
            bearer_token: optional bearer token to do authenticated API calls

        """
        # trailing slashes are dropped because _call() joins the base URL and
        # the query with a single "/"
        api_url = api_url.rstrip("/")
        u = urlparse(api_url)

        self.api_url = api_url
        self.api_path = u.path
        self.bearer_token = bearer_token

        # per object type getters used by get(); each one takes a PID (or its
        # string form) plus extra keyword arguments for requests
        self._getters: Dict[str, Callable[..., Any]] = {
            CONTENT: self.content,
            DIRECTORY: self.directory,
            RELEASE: self.release,
            REVISION: self.revision,
            SNAPSHOT: self._get_snapshot,
        }

    def _call(
        self, query: str, http_method: str = "get", **req_args
    ) -> requests.models.Response:
        """Dispatcher for archive API invocation

        Args:
            query: API method to be invoked, rooted at api_url; a query that
                is an absolute URL (i.e., has a scheme) is used as is
            http_method: HTTP method to be invoked, one of: 'get', 'head'
            req_args: extra keyword arguments for requests.get()/.head(); a
                "headers" argument is merged with the authentication header

        Returns:
            the response object returned by requests

        Raises:
            requests.HTTPError: if HTTP request fails and http_method is 'get'
            ValueError: if http_method is not a supported one

        """
        if urlparse(query).scheme:
            # absolute URL
            url = query
        else:
            # relative URL; prepend base API URL
            url = "/".join([self.api_url, query])

        # merge caller-supplied headers with the authentication one, instead
        # of handing the "headers" keyword to requests twice (a TypeError)
        headers = dict(req_args.pop("headers", None) or {})
        if self.bearer_token is not None:
            headers["Authorization"] = f"Bearer {self.bearer_token}"

        if http_method == "get":
            r = requests.get(url, **req_args, headers=headers)
            r.raise_for_status()
        elif http_method == "head":
            # no raise_for_status() here: callers inspect the response status
            r = requests.head(url, **req_args, headers=headers)
        else:
            raise ValueError(f"unsupported HTTP method: {http_method}")

        return r

    def _exists(self, query: str, **req_args) -> bool:
        """Tell whether a HEAD request on query gets a non-error answer"""
        return self._call(query, http_method="head", **req_args).ok

    def _get_snapshot(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """Analogous to self.snapshot(), but zipping through partial snapshots,
        merging them together before returning

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        snapshot: Dict[str, Any] = {}
        for snp in self.snapshot(pid, **req_args):
            snapshot.update(snp)

        return snapshot

    def get(self, pid: PIDish, **req_args) -> Any:
        """Retrieve information about an object of any kind

        Dispatcher method over the more specific methods content(),
        directory(), etc.

        Note that this method will buffer the entire output in case of long,
        iterable output (e.g., for snapshot()), see the iter() method for
        streaming.

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get(), forwarded
                to the type-specific method

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        pid_ = _get_pid(pid)
        return self._getters[pid_.object_type](pid_, **req_args)

    def iter(
        self, pid: PIDish, **req_args
    ) -> Generator[Dict[str, Any], None, None]:
        """Stream over the information about an object of any kind

        Streaming variant of get()

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get(), forwarded
                to the type-specific method

        Raises:
            requests.HTTPError: if HTTP request fails
            ValueError: if the object type of pid is not a known one

        """
        pid_ = _get_pid(pid)
        obj_type = pid_.object_type
        if obj_type == SNAPSHOT:
            yield from self.snapshot(pid_, **req_args)
        elif obj_type == REVISION:
            yield self.revision(pid_, **req_args)
        elif obj_type == RELEASE:
            yield self.release(pid_, **req_args)
        elif obj_type == DIRECTORY:
            # directories are streamed one entry at a time
            yield from self.directory(pid_, **req_args)
        elif obj_type == CONTENT:
            yield self.content(pid_, **req_args)
        else:
            raise ValueError(f"invalid object type: {obj_type}")

    def content(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """Retrieve information about a content object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(
                f"content/sha1_git:{_get_pid(pid).object_id}/", **req_args
            ).json(),
            CONTENT,
        )

    def directory(self, pid: PIDish, **req_args) -> List[Dict[str, Any]]:
        """Retrieve information about a directory object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f"directory/{_get_pid(pid).object_id}/", **req_args).json(),
            DIRECTORY,
        )

    def revision(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """Retrieve information about a revision object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f"revision/{_get_pid(pid).object_id}/", **req_args).json(),
            REVISION,
        )

    def release(self, pid: PIDish, **req_args) -> Dict[str, Any]:
        """Retrieve information about a release object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        return typify(
            self._call(f"release/{_get_pid(pid).object_id}/", **req_args).json(),
            RELEASE,
        )

    def snapshot(
        self, pid: PIDish, **req_args
    ) -> Generator[Dict[str, Any], None, None]:
        """Retrieve information about a snapshot object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Returns:
            an iterator over partial snapshots (dictionaries mapping branch
            names to information about where they point to), each containing a
            subset of available branches

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        query = f"snapshot/{_get_pid(pid).object_id}/"

        while True:
            r = self._call(query, http_method="get", **req_args)
            yield typify(r.json()["branches"], SNAPSHOT)
            # keep going for as long as the response links to a next page
            if "next" in r.links and "url" in r.links["next"]:
                query = r.links["next"]["url"]
            else:
                break

    def visits(
        self,
        origin: str,
        per_page: Optional[int] = None,
        last_visit: Optional[int] = None,
        **req_args,
    ) -> Generator[Dict[str, Any], None, None]:
        """List visits of an origin

        Args:
            origin: the URL of a software origin
            per_page: the number of visits to list per page (sent as the
                "per_page" parameter of the first request)
            last_visit: visit to start listing from
            req_args: extra keyword arguments for requests.get()

        Returns:
            an iterator over visits of the origin

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        params = []
        if last_visit is not None:
            params.append(("last_visit", last_visit))
        if per_page is not None:
            params.append(("per_page", per_page))

        query = f"origin/{origin}/visits/"

        while True:
            r = self._call(query, http_method="get", params=params, **req_args)
            for v in r.json():
                yield typify(v, ORIGIN_VISIT)
            if "next" in r.links and "url" in r.links["next"]:
                # parameters are only sent along with the first request; the
                # "next" URL is then used as is (presumably it already embeds
                # the pagination parameters; confirm against the API docs)
                params = []
                query = r.links["next"]["url"]
            else:
                break

    def content_exists(self, pid: PIDish, **req_args) -> bool:
        """Check if a content object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request is answered with a non-error HTTP
            status, False otherwise (no requests.HTTPError is raised for
            error statuses)

        """
        return self._exists(
            f"content/sha1_git:{_get_pid(pid).object_id}/", **req_args
        )

    def directory_exists(self, pid: PIDish, **req_args) -> bool:
        """Check if a directory object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request is answered with a non-error HTTP
            status, False otherwise (no requests.HTTPError is raised for
            error statuses)

        """
        return self._exists(f"directory/{_get_pid(pid).object_id}/", **req_args)

    def revision_exists(self, pid: PIDish, **req_args) -> bool:
        """Check if a revision object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request is answered with a non-error HTTP
            status, False otherwise (no requests.HTTPError is raised for
            error statuses)

        """
        return self._exists(f"revision/{_get_pid(pid).object_id}/", **req_args)

    def release_exists(self, pid: PIDish, **req_args) -> bool:
        """Check if a release object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request is answered with a non-error HTTP
            status, False otherwise (no requests.HTTPError is raised for
            error statuses)

        """
        return self._exists(f"release/{_get_pid(pid).object_id}/", **req_args)

    def snapshot_exists(self, pid: PIDish, **req_args) -> bool:
        """Check if a snapshot object exists in the archive

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.head()

        Returns:
            True if the HEAD request is answered with a non-error HTTP
            status, False otherwise (no requests.HTTPError is raised for
            error statuses)

        """
        return self._exists(f"snapshot/{_get_pid(pid).object_id}/", **req_args)

    def content_raw(self, pid: PIDish, **req_args) -> Generator[bytes, None, None]:
        """Iterate over the raw content of a content object

        Args:
            pid: object identifier
            req_args: extra keyword arguments for requests.get()

        Raises:
            requests.HTTPError: if HTTP request fails

        """
        # _call() already raises on HTTP errors for GET requests; stream=True
        # lets the body be consumed chunk by chunk below
        r = self._call(
            f"content/sha1_git:{_get_pid(pid).object_id}/raw/",
            stream=True,
            **req_args,
        )

        yield from r.iter_content(chunk_size=None, decode_unicode=False)
File Metadata
Details
Attached
Mime Type
text/x-python
Expires
Jun 4 2025, 7:32 PM (9 w, 6 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3347908
Attached To
rDWCLI Web client
Event Timeline
Log In to Comment