diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -4,8 +4,8 @@
Implementation of the Data model of the Software Heritage project, used to
archive source code artifacts.
-This module defines the notion of Persistent Identifier (PID) and provides
-tools to compute them:
+This module defines the notion of SoftWare Heritage persistent IDentifiers
+(SWHIDs) and provides tools to compute them:
```sh
$ swh-identify fork.c kmod.c sched/deadline.c
diff --git a/docs/persistent-identifiers.rst b/docs/persistent-identifiers.rst
--- a/docs/persistent-identifiers.rst
+++ b/docs/persistent-identifiers.rst
@@ -331,7 +331,7 @@
A **dedicated** ``/resolve`` **endpoint** of the Software Heritage `Web API
<https://archive.softwareheritage.org/api/>`_ is also available to
-programmatically resolve SWHIDs; see: :http:get:`/api/1/resolve/(swh_id)/`.
+programmatically resolve SWHIDs; see: :http:get:`/api/1/resolve/(swhid)/`.
Examples:
diff --git a/mypy.ini b/mypy.ini
--- a/mypy.ini
+++ b/mypy.ini
@@ -7,6 +7,9 @@
[mypy-attrs_strict.*] # a bit sad, but...
ignore_missing_imports = True
+[mypy-deprecated.*]
+ignore_missing_imports = True
+
[mypy-django.*] # false positive, only used my hypotesis' extras
ignore_missing_imports = True
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@
# dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html
attrs
attrs_strict >= 0.0.7
+deprecated
hypothesis
iso8601
python-dateutil
diff --git a/swh/model/cli.py b/swh/model/cli.py
--- a/swh/model/cli.py
+++ b/swh/model/cli.py
@@ -12,7 +12,15 @@
from urllib.parse import urlparse
from swh.model import hashutil
-from swh.model import identifiers as pids
+from swh.model.identifiers import (
+ origin_identifier,
+ snapshot_identifier,
+ parse_swhid,
+ swhid,
+ SWHID,
+ CONTENT,
+ DIRECTORY,
+)
from swh.model.exceptions import ValidationError
from swh.model.from_disk import Content, Directory
@@ -29,40 +37,38 @@
}
-class PidParamType(click.ParamType):
+class SWHIDParamType(click.ParamType):
name = "persistent identifier"
def convert(self, value, param, ctx):
try:
- pids.parse_persistent_identifier(value)
+ parse_swhid(value)
return value # return as string, as we need just that
except ValidationError as e:
self.fail("%s is not a valid SWHID. %s." % (value, e), param, ctx)
-def pid_of_file(path):
+def swhid_of_file(path):
object = Content.from_file(path=path).get_data()
- return pids.persistent_identifier(pids.CONTENT, object)
+ return swhid(CONTENT, object)
-def pid_of_file_content(data):
+def swhid_of_file_content(data):
object = Content.from_bytes(mode=644, data=data).get_data()
- return pids.persistent_identifier(pids.CONTENT, object)
+ return swhid(CONTENT, object)
-def pid_of_dir(path):
+def swhid_of_dir(path):
object = Directory.from_disk(path=path).get_data()
- return pids.persistent_identifier(pids.DIRECTORY, object)
+ return swhid(DIRECTORY, object)
-def pid_of_origin(url):
- pid = pids.PersistentId(
- object_type="origin", object_id=pids.origin_identifier({"url": url})
- )
- return str(pid)
+def swhid_of_origin(url):
+ swhid = SWHID(object_type="origin", object_id=origin_identifier({"url": url}))
+ return str(swhid)
-def pid_of_git_repo(path):
+def swhid_of_git_repo(path):
repo = dulwich.repo.Repo(path)
branches = {}
@@ -84,10 +90,8 @@
snapshot = {"branches": branches}
- pid = pids.PersistentId(
- object_type="snapshot", object_id=pids.snapshot_identifier(snapshot)
- )
- return str(pid)
+ swhid = SWHID(object_type="snapshot", object_id=snapshot_identifier(snapshot))
+ return str(swhid)
def identify_object(obj_type, follow_symlinks, obj):
@@ -105,29 +109,29 @@
except ValueError:
raise click.BadParameter("cannot detect object type for %s" % obj)
- pid = None
+ swhid = None
if obj == "-":
content = sys.stdin.buffer.read()
- pid = pid_of_file_content(content)
+ swhid = swhid_of_file_content(content)
elif obj_type in ["content", "directory"]:
path = obj.encode(sys.getfilesystemencoding())
if follow_symlinks and os.path.islink(obj):
path = os.path.realpath(obj)
if obj_type == "content":
- pid = pid_of_file(path)
+ swhid = swhid_of_file(path)
elif obj_type == "directory":
- pid = pid_of_dir(path)
+ swhid = swhid_of_dir(path)
elif obj_type == "origin":
- pid = pid_of_origin(obj)
+ swhid = swhid_of_origin(obj)
elif obj_type == "snapshot":
- pid = pid_of_git_repo(obj)
+ swhid = swhid_of_git_repo(obj)
else: # shouldn't happen, due to option validation
raise click.BadParameter("invalid object type: " + obj_type)
# note: we return original obj instead of path here, to preserve user-given
# file name in output
- return (obj, pid)
+ return (obj, swhid)
@click.command(context_settings=CONTEXT_SETTINGS)
@@ -156,7 +160,7 @@
"--verify",
"-v",
metavar="SWHID",
- type=PidParamType(),
+ type=SWHIDParamType(),
help="reference identifier to be compared with computed one",
)
@click.argument("objects", nargs=-1, required=True)
@@ -197,18 +201,18 @@
results = map(partial(identify_object, obj_type, follow_symlinks), objects)
if verify:
- pid = next(results)[1]
- if verify == pid:
- click.echo("SWHID match: %s" % pid)
+ swhid = next(results)[1]
+ if verify == swhid:
+ click.echo("SWHID match: %s" % swhid)
sys.exit(0)
else:
- click.echo("SWHID mismatch: %s != %s" % (verify, pid))
+ click.echo("SWHID mismatch: %s != %s" % (verify, swhid))
sys.exit(1)
else:
- for (obj, pid) in results:
- msg = pid
+ for (obj, swhid) in results:
+ msg = swhid
if show_filename:
- msg = "%s\t%s" % (pid, os.fsdecode(obj))
+ msg = "%s\t%s" % (swhid, os.fsdecode(obj))
click.echo(msg)
diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py
--- a/swh/model/identifiers.py
+++ b/swh/model/identifiers.py
@@ -10,6 +10,8 @@
from functools import lru_cache
from typing import Any, Dict, NamedTuple
+from deprecated import deprecated
+
from .exceptions import ValidationError
from .fields.hashes import validate_sha1
from .hashutil import hash_git_data, hash_to_hex, MultiHash
@@ -22,11 +24,18 @@
DIRECTORY = "directory"
CONTENT = "content"
-PID_NAMESPACE = "swh"
-PID_VERSION = 1
-PID_TYPES = ["ori", "snp", "rel", "rev", "dir", "cnt"]
-PID_SEP = ":"
-PID_CTXT_SEP = ";"
+SWHID_NAMESPACE = "swh"
+SWHID_VERSION = 1
+SWHID_TYPES = ["ori", "snp", "rel", "rev", "dir", "cnt"]
+SWHID_SEP = ":"
+SWHID_CTXT_SEP = ";"
+
+# deprecated variables
+PID_NAMESPACE = SWHID_NAMESPACE
+PID_VERSION = SWHID_VERSION
+PID_TYPES = SWHID_TYPES
+PID_SEP = SWHID_SEP
+PID_CTXT_SEP = SWHID_CTXT_SEP
@lru_cache()
@@ -649,8 +658,8 @@
}
-_PersistentId = NamedTuple(
- "PersistentId",
+_SWHID = NamedTuple(
+ "SWHID",
[
("namespace", str),
("scheme_version", int),
@@ -661,25 +670,23 @@
)
-class PersistentId(_PersistentId):
+class SWHID(_SWHID):
"""
- Named tuple holding the relevant info associated to a Software Heritage
- persistent identifier.
+ Named tuple holding the relevant info associated to a SoftWare Heritage
+ persistent IDentifier (SWHID)
Args:
- namespace (str): the namespace of the identifier, defaults to 'swh'
+ namespace (str): the namespace of the identifier, defaults to ``swh``
scheme_version (int): the scheme version of the identifier,
defaults to 1
object_type (str): the type of object the identifier points to,
- either 'content', 'directory', 'release', 'revision' or 'snapshot'
- object_id (dict/bytes/str): object's dict representation or
- object identifier
+ either ``content``, ``directory``, ``release``, ``revision`` or ``snapshot``
+ object_id (str): object's identifier
metadata (dict): optional dict filled with metadata related to
pointed object
Raises:
- swh.model.exceptions.ValidationError: In case of invalid object type
- or id
+ swh.model.exceptions.ValidationError: In case of invalid object type or id
Once created, it contains the following attributes:
@@ -690,14 +697,14 @@
object_id (str): hexadecimal representation of the object hash
metadata (dict): metadata related to the pointed object
- To get the raw persistent identifier string from an instance of
- this named tuple, use the :func:`str` function::
+ To get the raw SWHID string from an instance of this named tuple,
+ use the :func:`str` function::
- pid = PersistentId(
+ swhid = SWHID(
object_type='content',
object_id='8ff44f081d43176474b267de5451f2c2e88089d0'
)
- pid_str = str(pid)
+ swhid_str = str(swhid)
# 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0'
"""
@@ -705,79 +712,110 @@
def __new__(
cls,
- namespace=PID_NAMESPACE,
- scheme_version=PID_VERSION,
- object_type="",
- object_id="",
- metadata={},
+ namespace: str = SWHID_NAMESPACE,
+ scheme_version: int = SWHID_VERSION,
+ object_type: str = "",
+ object_id: str = "",
+ metadata: Dict[str, Any] = {},
):
o = _object_type_map.get(object_type)
if not o:
raise ValidationError(
"Wrong input: Supported types are %s" % (list(_object_type_map.keys()))
)
- if namespace != PID_NAMESPACE:
+ if namespace != SWHID_NAMESPACE:
raise ValidationError(
- "Wrong format: only supported namespace is '%s'" % PID_NAMESPACE
+ "Wrong format: only supported namespace is '%s'" % SWHID_NAMESPACE
)
- if scheme_version != PID_VERSION:
+ if scheme_version != SWHID_VERSION:
raise ValidationError(
- "Wrong format: only supported version is %d" % PID_VERSION
+ "Wrong format: only supported version is %d" % SWHID_VERSION
)
+
# internal swh representation resolution
if isinstance(object_id, dict):
object_id = object_id[o["key_id"]]
+
validate_sha1(object_id) # can raise if invalid hash
object_id = hash_to_hex(object_id)
- return super(cls, PersistentId).__new__(
+ return super().__new__(
cls, namespace, scheme_version, object_type, object_id, metadata
)
- def __str__(self):
+ def __str__(self) -> str:
o = _object_type_map.get(self.object_type)
- pid = PID_SEP.join(
+ assert o
+ swhid = SWHID_SEP.join(
[self.namespace, str(self.scheme_version), o["short_name"], self.object_id]
)
if self.metadata:
for k, v in self.metadata.items():
- pid += "%s%s=%s" % (PID_CTXT_SEP, k, v)
- return pid
+ swhid += "%s%s=%s" % (SWHID_CTXT_SEP, k, v)
+ return swhid
+
+
+@deprecated("Use swh.model.identifiers.SWHID instead")
+class PersistentId(SWHID):
+    """
+    Named tuple holding the relevant info associated to a SoftWare Heritage
+    persistent IDentifier.
+
+    Backward-compatibility alias of :class:`SWHID`: all construction
+    arguments are forwarded unchanged to the parent class, which performs
+    the validation.
+
+    .. deprecated:: 0.3.8
+        Use :class:`swh.model.identifiers.SWHID` instead
+
+    """
+
+    def __new__(cls, *args, **kwargs):
+        # Zero-argument super() (as in SWHID.__new__) resolves the parent from
+        # the defining class. The former super(cls, PersistentId) had its
+        # arguments swapped: it only worked when cls was PersistentId itself
+        # and raised TypeError for any subclass.
+        return super().__new__(cls, *args, **kwargs)
-def persistent_identifier(object_type, object_id, scheme_version=1, metadata={}):
- """Compute :ref:`SWHID ` persistent identifiers.
+def swhid(
+ object_type: str,
+ object_id: str,
+ scheme_version: int = 1,
+ metadata: Dict[str, Any] = {},
+) -> str:
+ """Compute :ref:`persistent-identifiers`
Args:
- object_type (str): object's type, either 'content', 'directory',
- 'release', 'revision' or 'snapshot'
- object_id (dict/bytes/str): object's dict representation or object
- identifier
- scheme_version (int): persistent identifier scheme version,
- defaults to 1
- metadata (dict): metadata related to the pointed object
+ object_type: object's type, either ``content``, ``directory``,
+ ``release``, ``revision`` or ``snapshot``
+ object_id: object's identifier
+ scheme_version: SWHID scheme version, defaults to 1
+ metadata: metadata related to the pointed object
Raises:
- swh.model.exceptions.ValidationError: In case of invalid object type
- or id
+ swh.model.exceptions.ValidationError: In case of invalid object type or id
Returns:
- str: the persistent identifier
+ the SWHID of the object
"""
- pid = PersistentId(
+ swhid = SWHID(
scheme_version=scheme_version,
object_type=object_type,
object_id=object_id,
metadata=metadata,
)
- return str(pid)
+ return str(swhid)
+
+@deprecated("Use swh.model.identifiers.swhid instead")
+def persistent_identifier(*args, **kwargs) -> str:
+ """Compute :ref:`persistent-identifiers`
+
+ Backward-compatibility wrapper kept for callers of the old name: every
+ positional and keyword argument is forwarded unchanged to
+ :func:`swh.model.identifiers.swhid`, and its result is returned as is.
+ Any exception raised by :func:`swhid` propagates to the caller.
+
+ .. deprecated:: 0.3.8
+ Use :func:`swh.model.identifiers.swhid` instead
+
+ """
+ return swhid(*args, **kwargs)
-def parse_persistent_identifier(persistent_id):
- """Parse :ref:`SWHID ` persistent identifiers.
+
+def parse_swhid(swhid: str) -> SWHID:
+ """Parse :ref:`persistent-identifiers`.
Args:
- persistent_id (str): A persistent identifier
+ swhid (str): A persistent identifier
Raises:
swh.model.exceptions.ValidationError: in case of:
@@ -790,35 +828,43 @@
* invalid hash identifier supplied
Returns:
- PersistentId: a named tuple holding the parsing result
+ a named tuple holding the parsing result
"""
- # ;
- persistent_id_parts = persistent_id.split(PID_CTXT_SEP)
- pid_data = persistent_id_parts.pop(0).split(":")
+ # <swhid>;<contextual-information>
+ swhid_parts = swhid.split(SWHID_CTXT_SEP)
+ swhid_data = swhid_parts.pop(0).split(":")
- if len(pid_data) != 4:
+ if len(swhid_data) != 4:
raise ValidationError("Wrong format: There should be 4 mandatory values")
# Checking for parsing errors
- _ns, _version, _type, _id = pid_data
- pid_data[1] = int(pid_data[1])
+ _ns, _version, _type, _id = swhid_data
for otype, data in _object_type_map.items():
if _type == data["short_name"]:
- pid_data[2] = otype
+ _type = otype
break
if not _id:
raise ValidationError("Wrong format: Identifier should be present")
- persistent_id_metadata = {}
- for part in persistent_id_parts:
+ _metadata = {}
+ for part in swhid_parts:
try:
key, val = part.split("=")
- persistent_id_metadata[key] = val
+ _metadata[key] = val
except Exception:
msg = "Contextual data is badly formatted, form key=val expected"
raise ValidationError(msg)
- pid_data.append(persistent_id_metadata)
- return PersistentId(*pid_data)
+ return SWHID(_ns, int(_version), _type, _id, _metadata)
+
+
+@deprecated("Use swh.model.identifiers.parse_swhid instead")
+def parse_persistent_identifier(persistent_id: str) -> PersistentId:
+ """Parse :ref:`persistent-identifiers`.
+
+ Backward-compatibility wrapper around
+ :func:`swh.model.identifiers.parse_swhid`.
+
+ Args:
+ persistent_id: the identifier string handed to :func:`parse_swhid`
+
+ Returns:
+ a :class:`PersistentId` built from the fields of the parsed
+ :class:`SWHID`
+
+ Exceptions raised by :func:`parse_swhid` are not caught here and
+ propagate to the caller.
+
+ .. deprecated:: 0.3.8
+ Use :func:`swh.model.identifiers.parse_swhid` instead
+ """
+ # Re-wrap the parsed fields so legacy callers still receive the old
+ # PersistentId type rather than a plain SWHID.
+ return PersistentId(**parse_swhid(persistent_id)._asdict())
diff --git a/swh/model/tests/test_cli.py b/swh/model/tests/test_cli.py
--- a/swh/model/tests/test_cli.py
+++ b/swh/model/tests/test_cli.py
@@ -22,9 +22,9 @@
super().setUp()
self.runner = CliRunner()
- def assertPidOK(self, result, pid):
+ def assertSWHID(self, result, swhid):
self.assertEqual(result.exit_code, 0)
- self.assertEqual(result.output.split()[0], pid)
+ self.assertEqual(result.output.split()[0], swhid)
def test_no_args(self):
result = self.runner.invoke(cli.identify)
@@ -36,21 +36,21 @@
for filename, content in self.contents.items():
path = os.path.join(self.tmpdir_name, filename)
result = self.runner.invoke(cli.identify, ["--type", "content", path])
- self.assertPidOK(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
+ self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
def test_content_id_from_stdin(self):
"""identify file content"""
self.make_contents(self.tmpdir_name)
for _, content in self.contents.items():
result = self.runner.invoke(cli.identify, ["-"], input=content["data"])
- self.assertPidOK(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
+ self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
def test_directory_id(self):
"""identify an entire directory"""
self.make_from_tarball(self.tmpdir_name)
path = os.path.join(self.tmpdir_name, b"sample-folder")
result = self.runner.invoke(cli.identify, ["--type", "directory", path])
- self.assertPidOK(result, "swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759")
+ self.assertSWHID(result, "swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759")
def test_snapshot_id(self):
"""identify a snapshot"""
@@ -64,7 +64,7 @@
result = self.runner.invoke(
cli.identify, ["--type", "snapshot", repo_dir]
)
- self.assertPidOK(
+ self.assertSWHID(
result, "swh:1:snp:abc888898124270905a0ef3c67e872ce08e7e0c1"
)
@@ -72,7 +72,7 @@
"""identify an origin URL"""
url = "https://github.com/torvalds/linux"
result = self.runner.invoke(cli.identify, ["--type", "origin", url])
- self.assertPidOK(result, "swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f")
+ self.assertSWHID(result, "swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f")
def test_symlink(self):
"""identify symlink --- both itself and target"""
@@ -82,10 +82,10 @@
os.symlink(os.path.basename(regular), link)
result = self.runner.invoke(cli.identify, [link])
- self.assertPidOK(result, "swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99")
+ self.assertSWHID(result, "swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99")
result = self.runner.invoke(cli.identify, ["--no-dereference", link])
- self.assertPidOK(result, "swh:1:cnt:996f1789ff67c0e3f69ef5933a55d54c5d0e9954")
+ self.assertSWHID(result, "swh:1:cnt:996f1789ff67c0e3f69ef5933a55d54c5d0e9954")
def test_show_filename(self):
"""filename is shown by default"""
@@ -108,7 +108,7 @@
result = self.runner.invoke(
cli.identify, ["--type", "content", "--no-filename", path]
)
- self.assertPidOK(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
+ self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"]))
def test_auto_content(self):
"""automatic object type detection: content"""
diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py
--- a/swh/model/tests/test_identifiers.py
+++ b/swh/model/tests/test_identifiers.py
@@ -17,7 +17,7 @@
RELEASE,
REVISION,
SNAPSHOT,
- PersistentId,
+ SWHID,
normalize_timestamp,
)
@@ -739,7 +739,7 @@
identifiers.identifier_to_str(self.all_types["id"]),
)
- def test_persistent_identifier(self):
+ def test_swhid(self):
_snapshot_id = _x("c7c108084bc0bf3d81436bf980b46e98bd338453")
_release_id = "22ece559cc7cc2364edc5e5593d63ae8bd229f9f"
_revision_id = "309cf2674ee7a0749978cf8265ab91a60aea0f7d"
@@ -751,7 +751,7 @@
_directory = {"id": _directory_id}
_content = {"sha1_git": _content_id}
- for full_type, _hash, expected_persistent_id, version, _meta in [
+ for full_type, _hash, expected_swhid, version, _meta in [
(
SNAPSHOT,
_snapshot_id,
@@ -831,17 +831,15 @@
),
]:
if version:
- actual_value = identifiers.persistent_identifier(
+ actual_value = identifiers.swhid(
full_type, _hash, version, metadata=_meta
)
else:
- actual_value = identifiers.persistent_identifier(
- full_type, _hash, metadata=_meta
- )
+ actual_value = identifiers.swhid(full_type, _hash, metadata=_meta)
- self.assertEqual(actual_value, expected_persistent_id)
+ self.assertEqual(actual_value, expected_swhid)
- def test_persistent_identifier_wrong_input(self):
+ def test_swhid_wrong_input(self):
_snapshot_id = "notahash4bc0bf3d81436bf980b46e98bd338453"
_snapshot = {"id": _snapshot_id}
@@ -851,10 +849,10 @@
("foo", ""),
]:
with self.assertRaises(ValidationError):
- identifiers.persistent_identifier(_type, _hash)
+ identifiers.swhid(_type, _hash)
- def test_parse_persistent_identifier(self):
- for pid, _type, _version, _hash in [
+ def test_parse_swhid(self):
+ for swhid, _type, _version, _hash in [
(
"swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2",
CONTENT,
@@ -886,17 +884,17 @@
"c7c108084bc0bf3d81436bf980b46e98bd338453",
),
]:
- expected_result = PersistentId(
+ expected_result = SWHID(
namespace="swh",
scheme_version=_version,
object_type=_type,
object_id=_hash,
metadata={},
)
- actual_result = identifiers.parse_persistent_identifier(pid)
+ actual_result = identifiers.parse_swhid(swhid)
self.assertEqual(actual_result, expected_result)
- for pid, _type, _version, _hash, _metadata in [
+ for swhid, _type, _version, _hash, _metadata in [
(
"swh:1:cnt:9c95815d9e9d91b8dae8e05d8bbc696fe19f796b;lines=1-18;origin=https://github.com/python/cpython", # noqa
CONTENT,
@@ -912,18 +910,18 @@
{"origin": "deb://Debian/packages/linuxdoc-tools"},
),
]:
- expected_result = PersistentId(
+ expected_result = SWHID(
namespace="swh",
scheme_version=_version,
object_type=_type,
object_id=_hash,
metadata=_metadata,
)
- actual_result = identifiers.parse_persistent_identifier(pid)
+ actual_result = identifiers.parse_swhid(swhid)
self.assertEqual(actual_result, expected_result)
- def test_parse_persistent_identifier_parsing_error(self):
- for pid in [
+ def test_parse_swhid_parsing_error(self):
+ for swhid in [
("swh:1:cnt"),
("swh:1:"),
("swh:"),
@@ -936,7 +934,7 @@
("swh:1:snp:foo"),
]:
with self.assertRaises(ValidationError):
- identifiers.parse_persistent_identifier(pid)
+ identifiers.parse_swhid(swhid)
def test_persistentid_class_validation_error(self):
for _ns, _version, _type, _id in [
@@ -946,7 +944,7 @@
("swh", 1, SNAPSHOT, "gh6959356d30f1a4e9b7f6bca59b9a336464c03d"),
]:
with self.assertRaises(ValidationError):
- PersistentId(
+ SWHID(
namespace=_ns,
scheme_version=_version,
object_type=_type,