diff --git a/README.md b/README.md --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ Implementation of the Data model of the Software Heritage project, used to archive source code artifacts. -This module defines the notion of Persistent Identifier (PID) and provides -tools to compute them: +This module defines the notion of SoftWare Heritage persistent IDentifiers +(SWHIDs) and provides tools to compute them: ```sh $ swh-identify fork.c kmod.c sched/deadline.c diff --git a/docs/persistent-identifiers.rst b/docs/persistent-identifiers.rst --- a/docs/persistent-identifiers.rst +++ b/docs/persistent-identifiers.rst @@ -331,7 +331,7 @@ A **dedicated** ``/resolve`` **endpoint** of the Software Heritage `Web API `_ is also available to -programmatically resolve SWHIDs; see: :http:get:`/api/1/resolve/(swh_id)/`. +programmatically resolve SWHIDs; see: :http:get:`/api/1/resolve/(swhid)/`. Examples: diff --git a/mypy.ini b/mypy.ini --- a/mypy.ini +++ b/mypy.ini @@ -7,6 +7,9 @@ [mypy-attrs_strict.*] # a bit sad, but... ignore_missing_imports = True +[mypy-deprecated.*] +ignore_missing_imports = True + [mypy-django.*] # false positive, only used my hypotesis' extras ignore_missing_imports = True diff --git a/requirements.txt b/requirements.txt --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ # dependency lines, see https://pip.readthedocs.org/en/1.1/requirements.html attrs attrs_strict >= 0.0.7 +deprecated hypothesis iso8601 python-dateutil diff --git a/swh/model/cli.py b/swh/model/cli.py --- a/swh/model/cli.py +++ b/swh/model/cli.py @@ -12,7 +12,15 @@ from urllib.parse import urlparse from swh.model import hashutil -from swh.model import identifiers as pids +from swh.model.identifiers import ( + origin_identifier, + snapshot_identifier, + parse_swhid, + swhid, + SWHID, + CONTENT, + DIRECTORY, +) from swh.model.exceptions import ValidationError from swh.model.from_disk import Content, Directory @@ -29,40 +37,38 @@ } -class PidParamType(click.ParamType): +class SWHIDParamType(click.ParamType): name = "persistent identifier" def convert(self, value, param, ctx): try: - pids.parse_persistent_identifier(value) + parse_swhid(value) return value # return as string, as we need just that except ValidationError as e: self.fail("%s is not a valid SWHID. %s." % (value, e), param, ctx) -def pid_of_file(path): +def swhid_of_file(path): object = Content.from_file(path=path).get_data() - return pids.persistent_identifier(pids.CONTENT, object) + return swhid(CONTENT, object) -def pid_of_file_content(data): +def swhid_of_file_content(data): object = Content.from_bytes(mode=644, data=data).get_data() - return pids.persistent_identifier(pids.CONTENT, object) + return swhid(CONTENT, object) -def pid_of_dir(path): +def swhid_of_dir(path): object = Directory.from_disk(path=path).get_data() - return pids.persistent_identifier(pids.DIRECTORY, object) + return swhid(DIRECTORY, object) -def pid_of_origin(url): - pid = pids.PersistentId( - object_type="origin", object_id=pids.origin_identifier({"url": url}) - ) - return str(pid) +def swhid_of_origin(url): + swhid = SWHID(object_type="origin", object_id=origin_identifier({"url": url})) + return str(swhid) -def pid_of_git_repo(path): +def swhid_of_git_repo(path): repo = dulwich.repo.Repo(path) branches = {} @@ -84,10 +90,8 @@ snapshot = {"branches": branches} - pid = pids.PersistentId( - object_type="snapshot", object_id=pids.snapshot_identifier(snapshot) - ) - return str(pid) + swhid = SWHID(object_type="snapshot", object_id=snapshot_identifier(snapshot)) + return str(swhid) def identify_object(obj_type, follow_symlinks, obj): @@ -105,29 +109,29 @@ except ValueError: raise click.BadParameter("cannot detect object type for %s" % obj) - pid = None + swhid = None if obj == "-": content = sys.stdin.buffer.read() - pid = pid_of_file_content(content) + swhid = swhid_of_file_content(content) elif obj_type in ["content", "directory"]: path = obj.encode(sys.getfilesystemencoding()) if follow_symlinks and os.path.islink(obj): path = os.path.realpath(obj) if obj_type == "content": - pid = pid_of_file(path) + swhid = swhid_of_file(path) elif obj_type == "directory": - pid = pid_of_dir(path) + swhid = swhid_of_dir(path) elif obj_type == "origin": - pid = pid_of_origin(obj) + swhid = swhid_of_origin(obj) elif obj_type == "snapshot": - pid = pid_of_git_repo(obj) + swhid = swhid_of_git_repo(obj) else: # shouldn't happen, due to option validation raise click.BadParameter("invalid object type: " + obj_type) # note: we return original obj instead of path here, to preserve user-given # file name in output - return (obj, pid) + return (obj, swhid) @click.command(context_settings=CONTEXT_SETTINGS) @@ -156,7 +160,7 @@ "--verify", "-v", metavar="SWHID", - type=PidParamType(), + type=SWHIDParamType(), help="reference identifier to be compared with computed one", ) @click.argument("objects", nargs=-1, required=True) @@ -197,18 +201,18 @@ results = map(partial(identify_object, obj_type, follow_symlinks), objects) if verify: - pid = next(results)[1] - if verify == pid: - click.echo("SWHID match: %s" % pid) + swhid = next(results)[1] + if verify == swhid: + click.echo("SWHID match: %s" % swhid) sys.exit(0) else: - click.echo("SWHID mismatch: %s != %s" % (verify, pid)) + click.echo("SWHID mismatch: %s != %s" % (verify, swhid)) sys.exit(1) else: - for (obj, pid) in results: - msg = pid + for (obj, swhid) in results: + msg = swhid if show_filename: - msg = "%s\t%s" % (pid, os.fsdecode(obj)) + msg = "%s\t%s" % (swhid, os.fsdecode(obj)) click.echo(msg) diff --git a/swh/model/identifiers.py b/swh/model/identifiers.py --- a/swh/model/identifiers.py +++ b/swh/model/identifiers.py @@ -10,6 +10,8 @@ from functools import lru_cache from typing import Any, Dict, NamedTuple +from deprecated import deprecated + from .exceptions import ValidationError from .fields.hashes import validate_sha1 from .hashutil import hash_git_data, hash_to_hex, MultiHash @@ -22,11 +24,18 @@ DIRECTORY = "directory" CONTENT = "content" -PID_NAMESPACE = "swh" -PID_VERSION = 1 -PID_TYPES = ["ori", "snp", "rel", "rev", "dir", "cnt"] -PID_SEP = ":" -PID_CTXT_SEP = ";" +SWHID_NAMESPACE = "swh" +SWHID_VERSION = 1 +SWHID_TYPES = ["ori", "snp", "rel", "rev", "dir", "cnt"] +SWHID_SEP = ":" +SWHID_CTXT_SEP = ";" + +# deprecated variables +PID_NAMESPACE = SWHID_NAMESPACE +PID_VERSION = SWHID_VERSION +PID_TYPES = SWHID_TYPES +PID_SEP = SWHID_SEP +PID_CTXT_SEP = SWHID_CTXT_SEP @lru_cache() @@ -649,8 +658,8 @@ } -_PersistentId = NamedTuple( - "PersistentId", +_SWHID = NamedTuple( + "SWHID", [ ("namespace", str), ("scheme_version", int), @@ -661,25 +670,23 @@ ) -class PersistentId(_PersistentId): +class SWHID(_SWHID): """ - Named tuple holding the relevant info associated to a Software Heritage - persistent identifier. + Named tuple holding the relevant info associated to a SoftWare Heritage + persistent IDentifier (SWHID) Args: - namespace (str): the namespace of the identifier, defaults to 'swh' + namespace (str): the namespace of the identifier, defaults to ``swh`` scheme_version (int): the scheme version of the identifier, defaults to 1 object_type (str): the type of object the identifier points to, - either 'content', 'directory', 'release', 'revision' or 'snapshot' - object_id (dict/bytes/str): object's dict representation or - object identifier + either ``content``, ``directory``, ``release``, ``revision`` or ``snapshot`` + object_id (str): object's identifier metadata (dict): optional dict filled with metadata related to pointed object Raises: - swh.model.exceptions.ValidationError: In case of invalid object type - or id + swh.model.exceptions.ValidationError: In case of invalid object type or id Once created, it contains the following attributes: @@ -690,14 +697,14 @@ object_id (str): hexadecimal representation of the object hash metadata (dict): metadata related to the pointed object - To get the raw persistent identifier string from an instance of - this named tuple, use the :func:`str` function:: + To get the raw SWHID string from an instance of this named tuple, + use the :func:`str` function:: - pid = PersistentId( + swhid = SWHID( object_type='content', object_id='8ff44f081d43176474b267de5451f2c2e88089d0' ) - pid_str = str(pid) + swhid_str = str(swhid) # 'swh:1:cnt:8ff44f081d43176474b267de5451f2c2e88089d0' """ @@ -705,79 +712,110 @@ def __new__( cls, - namespace=PID_NAMESPACE, - scheme_version=PID_VERSION, - object_type="", - object_id="", - metadata={}, + namespace: str = SWHID_NAMESPACE, + scheme_version: int = SWHID_VERSION, + object_type: str = "", + object_id: str = "", + metadata: Dict[str, Any] = {}, ): o = _object_type_map.get(object_type) if not o: raise ValidationError( "Wrong input: Supported types are %s" % (list(_object_type_map.keys())) ) - if namespace != PID_NAMESPACE: + if namespace != SWHID_NAMESPACE: raise ValidationError( - "Wrong format: only supported namespace is '%s'" % PID_NAMESPACE + "Wrong format: only supported namespace is '%s'" % SWHID_NAMESPACE ) - if scheme_version != PID_VERSION: + if scheme_version != SWHID_VERSION: raise ValidationError( - "Wrong format: only supported version is %d" % PID_VERSION + "Wrong format: only supported version is %d" % SWHID_VERSION ) + # internal swh representation resolution if isinstance(object_id, dict): object_id = object_id[o["key_id"]] + validate_sha1(object_id) # can raise if invalid hash object_id = hash_to_hex(object_id) - return super(cls, PersistentId).__new__( + return super().__new__( cls, namespace, scheme_version, object_type, object_id, metadata ) - def __str__(self): + def __str__(self) -> str: o = _object_type_map.get(self.object_type) - pid = PID_SEP.join( + assert o + swhid = SWHID_SEP.join( [self.namespace, str(self.scheme_version), o["short_name"], self.object_id] ) if self.metadata: for k, v in self.metadata.items(): - pid += "%s%s=%s" % (PID_CTXT_SEP, k, v) - return pid + swhid += "%s%s=%s" % (SWHID_CTXT_SEP, k, v) + return swhid + + +@deprecated("Use swh.model.identifiers.SWHID instead") +class PersistentId(SWHID): + """ + Named tuple holding the relevant info associated to a SoftWare Heritage + persistent IDentifier. + + .. deprecated:: 0.3.8 + Use :class:`swh.model.identifiers.SWHID` instead + + """ + + def __new__(cls, *args, **kwargs): + return super(cls, PersistentId).__new__(cls, *args, **kwargs) -def persistent_identifier(object_type, object_id, scheme_version=1, metadata={}): - """Compute :ref:`SWHID ` persistent identifiers. +def swhid( + object_type: str, + object_id: str, + scheme_version: int = 1, + metadata: Dict[str, Any] = {}, +) -> str: + """Compute :ref:`persistent-identifiers` Args: - object_type (str): object's type, either 'content', 'directory', - 'release', 'revision' or 'snapshot' - object_id (dict/bytes/str): object's dict representation or object - identifier - scheme_version (int): persistent identifier scheme version, - defaults to 1 - metadata (dict): metadata related to the pointed object + object_type: object's type, either ``content``, ``directory``, + ``release``, ``revision`` or ``snapshot`` + object_id: object's identifier + scheme_version: SWHID scheme version, defaults to 1 + metadata: metadata related to the pointed object Raises: - swh.model.exceptions.ValidationError: In case of invalid object type - or id + swh.model.exceptions.ValidationError: In case of invalid object type or id Returns: - str: the persistent identifier + the SWHID of the object """ - pid = PersistentId( + swhid = SWHID( scheme_version=scheme_version, object_type=object_type, object_id=object_id, metadata=metadata, ) - return str(pid) + return str(swhid) + +@deprecated("Use swh.model.identifiers.swhid instead") +def persistent_identifier(*args, **kwargs) -> str: + """Compute :ref:`persistent-identifiers` + + .. deprecated:: 0.3.8 + Use :func:`swh.model.identifiers.swhid` instead + + """ + return swhid(*args, **kwargs) -def parse_persistent_identifier(persistent_id): - """Parse :ref:`SWHID ` persistent identifiers. + +def parse_swhid(swhid: str) -> SWHID: + """Parse :ref:`persistent-identifiers`. Args: - persistent_id (str): A persistent identifier + swhid (str): A persistent identifier Raises: swh.model.exceptions.ValidationError: in case of: @@ -790,35 +828,43 @@ * invalid hash identifier supplied Returns: - PersistentId: a named tuple holding the parsing result + a named tuple holding the parsing result """ - # ; - persistent_id_parts = persistent_id.split(PID_CTXT_SEP) - pid_data = persistent_id_parts.pop(0).split(":") + # ; + swhid_parts = swhid.split(SWHID_CTXT_SEP) + swhid_data = swhid_parts.pop(0).split(":") - if len(pid_data) != 4: + if len(swhid_data) != 4: raise ValidationError("Wrong format: There should be 4 mandatory values") # Checking for parsing errors - _ns, _version, _type, _id = pid_data - pid_data[1] = int(pid_data[1]) + _ns, _version, _type, _id = swhid_data for otype, data in _object_type_map.items(): if _type == data["short_name"]: - pid_data[2] = otype + _type = otype break if not _id: raise ValidationError("Wrong format: Identifier should be present") - persistent_id_metadata = {} - for part in persistent_id_parts: + _metadata = {} + for part in swhid_parts: try: key, val = part.split("=") - persistent_id_metadata[key] = val + _metadata[key] = val except Exception: msg = "Contextual data is badly formatted, form key=val expected" raise ValidationError(msg) - pid_data.append(persistent_id_metadata) - return PersistentId(*pid_data) + return SWHID(_ns, int(_version), _type, _id, _metadata) + + +@deprecated("Use swh.model.identifiers.parse_swhid instead") +def parse_persistent_identifier(persistent_id: str) -> PersistentId: + """Parse :ref:`persistent-identifiers`. + + .. deprecated:: 0.3.8 + Use :func:`swh.model.identifiers.parse_swhid` instead + """ + return PersistentId(**parse_swhid(persistent_id)._asdict()) diff --git a/swh/model/tests/test_cli.py b/swh/model/tests/test_cli.py --- a/swh/model/tests/test_cli.py +++ b/swh/model/tests/test_cli.py @@ -22,9 +22,9 @@ super().setUp() self.runner = CliRunner() - def assertPidOK(self, result, pid): + def assertSWHID(self, result, swhid): self.assertEqual(result.exit_code, 0) - self.assertEqual(result.output.split()[0], pid) + self.assertEqual(result.output.split()[0], swhid) def test_no_args(self): result = self.runner.invoke(cli.identify) @@ -36,21 +36,21 @@ for filename, content in self.contents.items(): path = os.path.join(self.tmpdir_name, filename) result = self.runner.invoke(cli.identify, ["--type", "content", path]) - self.assertPidOK(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"])) + self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"])) def test_content_id_from_stdin(self): """identify file content""" self.make_contents(self.tmpdir_name) for _, content in self.contents.items(): result = self.runner.invoke(cli.identify, ["-"], input=content["data"]) - self.assertPidOK(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"])) + self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"])) def test_directory_id(self): """identify an entire directory""" self.make_from_tarball(self.tmpdir_name) path = os.path.join(self.tmpdir_name, b"sample-folder") result = self.runner.invoke(cli.identify, ["--type", "directory", path]) - self.assertPidOK(result, "swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759") + self.assertSWHID(result, "swh:1:dir:e8b0f1466af8608c8a3fb9879db172b887e80759") def test_snapshot_id(self): """identify a snapshot""" @@ -64,7 +64,7 @@ result = self.runner.invoke( cli.identify, ["--type", "snapshot", repo_dir] ) - self.assertPidOK( + self.assertSWHID( result, "swh:1:snp:abc888898124270905a0ef3c67e872ce08e7e0c1" ) @@ -72,7 +72,7 @@ """identify an origin URL""" url = "https://github.com/torvalds/linux" result = self.runner.invoke(cli.identify, ["--type", "origin", url]) - self.assertPidOK(result, "swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f") + self.assertSWHID(result, "swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f") def test_symlink(self): """identify symlink --- both itself and target""" @@ -82,10 +82,10 @@ os.symlink(os.path.basename(regular), link) result = self.runner.invoke(cli.identify, [link]) - self.assertPidOK(result, "swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99") + self.assertSWHID(result, "swh:1:cnt:257cc5642cb1a054f08cc83f2d943e56fd3ebe99") result = self.runner.invoke(cli.identify, ["--no-dereference", link]) - self.assertPidOK(result, "swh:1:cnt:996f1789ff67c0e3f69ef5933a55d54c5d0e9954") + self.assertSWHID(result, "swh:1:cnt:996f1789ff67c0e3f69ef5933a55d54c5d0e9954") def test_show_filename(self): """filename is shown by default""" @@ -108,7 +108,7 @@ result = self.runner.invoke( cli.identify, ["--type", "content", "--no-filename", path] ) - self.assertPidOK(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"])) + self.assertSWHID(result, "swh:1:cnt:" + hash_to_hex(content["sha1_git"])) def test_auto_content(self): """automatic object type detection: content""" diff --git a/swh/model/tests/test_identifiers.py b/swh/model/tests/test_identifiers.py --- a/swh/model/tests/test_identifiers.py +++ b/swh/model/tests/test_identifiers.py @@ -17,7 +17,7 @@ RELEASE, REVISION, SNAPSHOT, - PersistentId, + SWHID, normalize_timestamp, ) @@ -739,7 +739,7 @@ identifiers.identifier_to_str(self.all_types["id"]), ) - def test_persistent_identifier(self): + def test_swhid(self): _snapshot_id = _x("c7c108084bc0bf3d81436bf980b46e98bd338453") _release_id = "22ece559cc7cc2364edc5e5593d63ae8bd229f9f" _revision_id = "309cf2674ee7a0749978cf8265ab91a60aea0f7d" @@ -751,7 +751,7 @@ _directory = {"id": _directory_id} _content = {"sha1_git": _content_id} - for full_type, _hash, expected_persistent_id, version, _meta in [ + for full_type, _hash, expected_swhid, version, _meta in [ ( SNAPSHOT, _snapshot_id, @@ -831,17 +831,15 @@ ), ]: if version: - actual_value = identifiers.persistent_identifier( + actual_value = identifiers.swhid( full_type, _hash, version, metadata=_meta ) else: - actual_value = identifiers.persistent_identifier( - full_type, _hash, metadata=_meta - ) + actual_value = identifiers.swhid(full_type, _hash, metadata=_meta) - self.assertEqual(actual_value, expected_persistent_id) + self.assertEqual(actual_value, expected_swhid) - def test_persistent_identifier_wrong_input(self): + def test_swhid_wrong_input(self): _snapshot_id = "notahash4bc0bf3d81436bf980b46e98bd338453" _snapshot = {"id": _snapshot_id} @@ -851,10 +849,10 @@ ("foo", ""), ]: with self.assertRaises(ValidationError): - identifiers.persistent_identifier(_type, _hash) + identifiers.swhid(_type, _hash) - def test_parse_persistent_identifier(self): - for pid, _type, _version, _hash in [ + def test_parse_swhid(self): + for swhid, _type, _version, _hash in [ ( "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", CONTENT, @@ -886,17 +884,17 @@ "c7c108084bc0bf3d81436bf980b46e98bd338453", ), ]: - expected_result = PersistentId( + expected_result = SWHID( namespace="swh", scheme_version=_version, object_type=_type, object_id=_hash, metadata={}, ) - actual_result = identifiers.parse_persistent_identifier(pid) + actual_result = identifiers.parse_swhid(swhid) self.assertEqual(actual_result, expected_result) - for pid, _type, _version, _hash, _metadata in [ + for swhid, _type, _version, _hash, _metadata in [ ( "swh:1:cnt:9c95815d9e9d91b8dae8e05d8bbc696fe19f796b;lines=1-18;origin=https://github.com/python/cpython", # noqa CONTENT, @@ -912,18 +910,18 @@ {"origin": "deb://Debian/packages/linuxdoc-tools"}, ), ]: - expected_result = PersistentId( + expected_result = SWHID( namespace="swh", scheme_version=_version, object_type=_type, object_id=_hash, metadata=_metadata, ) - actual_result = identifiers.parse_persistent_identifier(pid) + actual_result = identifiers.parse_swhid(swhid) self.assertEqual(actual_result, expected_result) - def test_parse_persistent_identifier_parsing_error(self): - for pid in [ + def test_parse_swhid_parsing_error(self): + for swhid in [ ("swh:1:cnt"), ("swh:1:"), ("swh:"), @@ -936,7 +934,7 @@ ("swh:1:snp:foo"), ]: with self.assertRaises(ValidationError): - identifiers.parse_persistent_identifier(pid) + identifiers.parse_swhid(swhid) def test_persistentid_class_validation_error(self): for _ns, _version, _type, _id in [ @@ -946,7 +944,7 @@ ("swh", 1, SNAPSHOT, "gh6959356d30f1a4e9b7f6bca59b9a336464c03d"), ]: with self.assertRaises(ValidationError): - PersistentId( + SWHID( namespace=_ns, scheme_version=_version, object_type=_type,