Changeset View
Changeset View
Standalone View
Standalone View
swh/graph/swhid.py
- This file was moved from swh/graph/pid.py.
# Copyright (C) 2019 The Software Heritage developers | # Copyright (C) 2019-2020 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
import mmap | import mmap | ||||
import os | import os | ||||
import struct | import struct | ||||
from collections.abc import MutableMapping | from collections.abc import MutableMapping | ||||
from enum import Enum | from enum import Enum | ||||
from mmap import MAP_SHARED, MAP_PRIVATE | from mmap import MAP_SHARED, MAP_PRIVATE | ||||
from typing import BinaryIO, Iterator, Tuple | from typing import BinaryIO, Iterator, Tuple | ||||
from swh.model.identifiers import PersistentId, parse_persistent_identifier | from swh.model.identifiers import PersistentId, parse_persistent_identifier | ||||
PID_BIN_FMT = "BB20s" # 2 unsigned chars + 20 bytes | SWHID_BIN_FMT = "BB20s" # 2 unsigned chars + 20 bytes | ||||
INT_BIN_FMT = ">q" # big endian, 8-byte integer | INT_BIN_FMT = ">q" # big endian, 8-byte integer | ||||
PID_BIN_SIZE = 22 # in bytes | SWHID_BIN_SIZE = 22 # in bytes | ||||
INT_BIN_SIZE = 8 # in bytes | INT_BIN_SIZE = 8 # in bytes | ||||
class PidType(Enum): | class SwhidType(Enum): | ||||
"""types of existing PIDs, used to serialize PID type as a (char) integer | """types of existing SWHIDs, used to serialize SWHID type as a (char) integer | ||||
Note that the order does matter also for driving the binary search in | Note that the order does matter also for driving the binary search in | ||||
PID-indexed maps. Integer values also matter, for compatibility with the | SWHID-indexed maps. Integer values also matter, for compatibility with the | ||||
Java layer. | Java layer. | ||||
""" | """ | ||||
content = 0 | content = 0 | ||||
directory = 1 | directory = 1 | ||||
origin = 2 | origin = 2 | ||||
release = 3 | release = 3 | ||||
revision = 4 | revision = 4 | ||||
snapshot = 5 | snapshot = 5 | ||||
def str_to_bytes(pid_str: str) -> bytes: | def str_to_bytes(swhid_str: str) -> bytes: | ||||
"""Convert a PID to a byte sequence | """Convert a SWHID to a byte sequence | ||||
The binary format used to represent PIDs as 22-byte long byte sequences as | The binary format used to represent SWHIDs as 22-byte long byte sequences as | ||||
follows: | follows: | ||||
- 1 byte for the namespace version represented as a C `unsigned char` | - 1 byte for the namespace version represented as a C `unsigned char` | ||||
- 1 byte for the object type, as the int value of :class:`PidType` enums, | - 1 byte for the object type, as the int value of :class:`SwhidType` enums, | ||||
represented as a C `unsigned char` | represented as a C `unsigned char` | ||||
- 20 bytes for the SHA1 digest as a byte sequence | - 20 bytes for the SHA1 digest as a byte sequence | ||||
Args: | Args: | ||||
pid: persistent identifier | swhid: SWHID identifier | ||||
Returns: | Returns: | ||||
bytes: byte sequence representation of pid | bytes: byte sequence representation of swhid | ||||
""" | """ | ||||
pid = parse_persistent_identifier(pid_str) | swhid = parse_persistent_identifier(swhid_str) | ||||
return struct.pack( | return struct.pack( | ||||
PID_BIN_FMT, | SWHID_BIN_FMT, | ||||
pid.scheme_version, | swhid.scheme_version, | ||||
PidType[pid.object_type].value, | SwhidType[swhid.object_type].value, | ||||
bytes.fromhex(pid.object_id), | bytes.fromhex(swhid.object_id), | ||||
) | ) | ||||
def bytes_to_str(bytes: bytes) -> str: | def bytes_to_str(bytes: bytes) -> str: | ||||
"""Inverse function of :func:`str_to_bytes` | """Inverse function of :func:`str_to_bytes` | ||||
See :func:`str_to_bytes` for a description of the binary PID format. | See :func:`str_to_bytes` for a description of the binary SWHID format. | ||||
Args: | Args: | ||||
bytes: byte sequence representation of pid | bytes: byte sequence representation of swhid | ||||
Returns: | Returns: | ||||
pid: persistent identifier | swhid: SWHID identifier | ||||
""" | """ | ||||
(version, type, bin_digest) = struct.unpack(PID_BIN_FMT, bytes) | (version, type, bin_digest) = struct.unpack(SWHID_BIN_FMT, bytes) | ||||
pid = PersistentId(object_type=PidType(type).name, object_id=bin_digest) | swhid = PersistentId(object_type=SwhidType(type).name, object_id=bin_digest) | ||||
return str(pid) | return str(swhid) | ||||
class _OnDiskMap: | class _OnDiskMap: | ||||
"""mmap-ed on-disk sequence of fixed size records | """mmap-ed on-disk sequence of fixed size records | ||||
""" | """ | ||||
def __init__( | def __init__( | ||||
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines | class _OnDiskMap: | ||||
def __len__(self) -> int: | def __len__(self) -> int: | ||||
return self.length | return self.length | ||||
def __delitem__(self, pos: int) -> None: | def __delitem__(self, pos: int) -> None: | ||||
raise NotImplementedError("cannot delete records from fixed-size map") | raise NotImplementedError("cannot delete records from fixed-size map") | ||||
class PidToNodeMap(_OnDiskMap, MutableMapping): | class SwhidToNodeMap(_OnDiskMap, MutableMapping): | ||||
"""memory mapped map from :ref:`SWHIDs <persistent-identifiers>` to a | """memory mapped map from :ref:`SWHIDs <persistent-identifiers>` to a | ||||
continuous range 0..N of (8-byte long) integers | continuous range 0..N of (8-byte long) integers | ||||
This is the converse mapping of :class:`NodeToPidMap`. | This is the converse mapping of :class:`NodeToSwhidMap`. | ||||
The on-disk serialization format is a sequence of fixed length (30 bytes) | The on-disk serialization format is a sequence of fixed length (30 bytes) | ||||
records with the following fields: | records with the following fields: | ||||
- PID (22 bytes): binary PID representation as per :func:`str_to_bytes` | - SWHID (22 bytes): binary SWHID representation as per :func:`str_to_bytes` | ||||
- long (8 bytes): big endian long integer | - long (8 bytes): big endian long integer | ||||
The records are sorted lexicographically by PID type and checksum, where | The records are sorted lexicographically by SWHID type and checksum, where | ||||
type is the integer value of :class:`PidType`. PID lookup in the map is | type is the integer value of :class:`SwhidType`. SWHID lookup in the map is | ||||
performed via binary search. Hence a huge map with, say, 11 B entries, | performed via binary search. Hence a huge map with, say, 11 B entries, | ||||
will require ~30 disk seeks. | will require ~30 disk seeks. | ||||
Note that, due to fixed size + ordering, it is not possible to create these | Note that, due to fixed size + ordering, it is not possible to create these | ||||
maps by random writing. Hence, __setitem__ can be used only to *update* the | maps by random writing. Hence, __setitem__ can be used only to *update* the | ||||
value associated to an existing key, rather than to add a missing item. To | value associated to an existing key, rather than to add a missing item. To | ||||
create an entire map from scratch, you should do so *sequentially*, using | create an entire map from scratch, you should do so *sequentially*, using | ||||
static method :meth:`write_record` (or, at your own risk, by hand via the | static method :meth:`write_record` (or, at your own risk, by hand via the | ||||
mmap :attr:`mm`). | mmap :attr:`mm`). | ||||
""" | """ | ||||
# record binary format: PID + a big endian 8-byte big endian integer | # record binary format: SWHID + a big endian 8-byte big endian integer | ||||
RECORD_BIN_FMT = ">" + PID_BIN_FMT + "q" | RECORD_BIN_FMT = ">" + SWHID_BIN_FMT + "q" | ||||
RECORD_SIZE = PID_BIN_SIZE + INT_BIN_SIZE | RECORD_SIZE = SWHID_BIN_SIZE + INT_BIN_SIZE | ||||
def __init__(self, fname: str, mode: str = "rb", length: int = None): | def __init__(self, fname: str, mode: str = "rb", length: int = None): | ||||
"""open an existing on-disk map | """open an existing on-disk map | ||||
Args: | Args: | ||||
fname: path to the on-disk map | fname: path to the on-disk map | ||||
mode: file open mode, usually either 'rb' for read-only maps, 'wb' | mode: file open mode, usually either 'rb' for read-only maps, 'wb' | ||||
for creating new maps, or 'rb+' for updating existing ones | for creating new maps, or 'rb+' for updating existing ones | ||||
Show All 10 Lines | def _get_bin_record(self, pos: int) -> Tuple[bytes, bytes]: | ||||
see :func:`_get_record` for an equivalent function with additional | see :func:`_get_record` for an equivalent function with additional | ||||
deserialization | deserialization | ||||
Args: | Args: | ||||
pos: 0-based record number | pos: 0-based record number | ||||
Returns: | Returns: | ||||
a pair `(pid, int)`, where pid and int are bytes | a pair `(swhid, int)`, where swhid and int are bytes | ||||
""" | """ | ||||
rec_pos = pos * self.RECORD_SIZE | rec_pos = pos * self.RECORD_SIZE | ||||
int_pos = rec_pos + PID_BIN_SIZE | int_pos = rec_pos + SWHID_BIN_SIZE | ||||
return (self.mm[rec_pos:int_pos], self.mm[int_pos : int_pos + INT_BIN_SIZE]) | return (self.mm[rec_pos:int_pos], self.mm[int_pos : int_pos + INT_BIN_SIZE]) | ||||
def _get_record(self, pos: int) -> Tuple[str, int]: | def _get_record(self, pos: int) -> Tuple[str, int]: | ||||
"""seek and return the record at a given (logical) position | """seek and return the record at a given (logical) position | ||||
moral equivalent of :func:`_get_bin_record`, with additional | moral equivalent of :func:`_get_bin_record`, with additional | ||||
deserialization to non-bytes types | deserialization to non-bytes types | ||||
Args: | Args: | ||||
pos: 0-based record number | pos: 0-based record number | ||||
Returns: | Returns: | ||||
a pair `(pid, int)`, where pid is a string-based PID and int the | a pair `(swhid, int)`, where swhid is a string-based SWHID and int the | ||||
corresponding integer identifier | corresponding integer identifier | ||||
""" | """ | ||||
(pid_bytes, int_bytes) = self._get_bin_record(pos) | (swhid_bytes, int_bytes) = self._get_bin_record(pos) | ||||
return (bytes_to_str(pid_bytes), struct.unpack(INT_BIN_FMT, int_bytes)[0]) | return (bytes_to_str(swhid_bytes), struct.unpack(INT_BIN_FMT, int_bytes)[0]) | ||||
@classmethod | @classmethod | ||||
def write_record(cls, f: BinaryIO, pid: str, int: int) -> None: | def write_record(cls, f: BinaryIO, swhid: str, int: int) -> None: | ||||
"""write a logical record to a file-like object | """write a logical record to a file-like object | ||||
Args: | Args: | ||||
f: file-like object to write the record to | f: file-like object to write the record to | ||||
pid: textual PID | swhid: textual SWHID | ||||
int: PID integer identifier | int: SWHID integer identifier | ||||
""" | """ | ||||
f.write(str_to_bytes(pid)) | f.write(str_to_bytes(swhid)) | ||||
f.write(struct.pack(INT_BIN_FMT, int)) | f.write(struct.pack(INT_BIN_FMT, int)) | ||||
def _bisect_pos(self, pid_str: str) -> int: | def _bisect_pos(self, swhid_str: str) -> int: | ||||
"""bisect the position of the given identifier. If the identifier is | """bisect the position of the given identifier. If the identifier is | ||||
not found, the position of the pid immediately after is returned. | not found, the position of the swhid immediately after is returned. | ||||
Args: | Args: | ||||
pid_str: the pid as a string | swhid_str: the swhid as a string | ||||
Returns: | Returns: | ||||
the logical record of the bisected position in the map | the logical record of the bisected position in the map | ||||
""" | """ | ||||
if not isinstance(pid_str, str): | if not isinstance(swhid_str, str): | ||||
raise TypeError("PID must be a str, not {}".format(type(pid_str))) | raise TypeError("SWHID must be a str, not {}".format(type(swhid_str))) | ||||
try: | try: | ||||
target = str_to_bytes(pid_str) # desired PID as bytes | target = str_to_bytes(swhid_str) # desired SWHID as bytes | ||||
except ValueError: | except ValueError: | ||||
raise ValueError('invalid PID: "{}"'.format(pid_str)) | raise ValueError('invalid SWHID: "{}"'.format(swhid_str)) | ||||
lo = 0 | lo = 0 | ||||
hi = self.length - 1 | hi = self.length - 1 | ||||
while lo < hi: | while lo < hi: | ||||
mid = (lo + hi) // 2 | mid = (lo + hi) // 2 | ||||
(pid, _value) = self._get_bin_record(mid) | (swhid, _value) = self._get_bin_record(mid) | ||||
if pid < target: | if swhid < target: | ||||
lo = mid + 1 | lo = mid + 1 | ||||
else: | else: | ||||
hi = mid | hi = mid | ||||
return lo | return lo | ||||
def _find(self, pid_str: str) -> Tuple[int, int]: | def _find(self, swhid_str: str) -> Tuple[int, int]: | ||||
"""lookup the integer identifier of a pid and its position | """lookup the integer identifier of a swhid and its position | ||||
Args: | Args: | ||||
pid_str: the pid as a string | swhid_str: the swhid as a string | ||||
Returns: | Returns: | ||||
a pair `(pid, pos)` with pid integer identifier and its logical | a pair `(swhid, pos)` with swhid integer identifier and its logical | ||||
record position in the map | record position in the map | ||||
""" | """ | ||||
pos = self._bisect_pos(pid_str) | pos = self._bisect_pos(swhid_str) | ||||
pid_found, value = self._get_record(pos) | swhid_found, value = self._get_record(pos) | ||||
if pid_found == pid_str: | if swhid_found == swhid_str: | ||||
return (value, pos) | return (value, pos) | ||||
raise KeyError(pid_str) | raise KeyError(swhid_str) | ||||
def __getitem__(self, pid_str: str) -> int: | def __getitem__(self, swhid_str: str) -> int: | ||||
"""lookup the integer identifier of a PID | """lookup the integer identifier of a SWHID | ||||
Args: | Args: | ||||
pid: the PID as a string | swhid: the SWHID as a string | ||||
Returns: | Returns: | ||||
the integer identifier of pid | the integer identifier of swhid | ||||
""" | """ | ||||
return self._find(pid_str)[0] # return element, ignore position | return self._find(swhid_str)[0] # return element, ignore position | ||||
def __setitem__(self, pid_str: str, int: str) -> None: | def __setitem__(self, swhid_str: str, int: str) -> None: | ||||
(_pid, pos) = self._find(pid_str) # might raise KeyError and that's OK | (_swhid, pos) = self._find(swhid_str) # might raise KeyError and that's OK | ||||
rec_pos = pos * self.RECORD_SIZE | rec_pos = pos * self.RECORD_SIZE | ||||
int_pos = rec_pos + PID_BIN_SIZE | int_pos = rec_pos + SWHID_BIN_SIZE | ||||
self.mm[rec_pos:int_pos] = str_to_bytes(pid_str) | self.mm[rec_pos:int_pos] = str_to_bytes(swhid_str) | ||||
self.mm[int_pos : int_pos + INT_BIN_SIZE] = struct.pack(INT_BIN_FMT, int) | self.mm[int_pos : int_pos + INT_BIN_SIZE] = struct.pack(INT_BIN_FMT, int) | ||||
def __iter__(self) -> Iterator[Tuple[str, int]]: | def __iter__(self) -> Iterator[Tuple[str, int]]: | ||||
for pos in range(self.length): | for pos in range(self.length): | ||||
yield self._get_record(pos) | yield self._get_record(pos) | ||||
def iter_prefix(self, prefix: str): | def iter_prefix(self, prefix: str): | ||||
swh, n, t, sha = prefix.split(":") | swh, n, t, sha = prefix.split(":") | ||||
sha = sha.ljust(40, "0") | sha = sha.ljust(40, "0") | ||||
start_pid = ":".join([swh, n, t, sha]) | start_swhid = ":".join([swh, n, t, sha]) | ||||
start = self._bisect_pos(start_pid) | start = self._bisect_pos(start_swhid) | ||||
for pos in range(start, self.length): | for pos in range(start, self.length): | ||||
pid, value = self._get_record(pos) | swhid, value = self._get_record(pos) | ||||
if not pid.startswith(prefix): | if not swhid.startswith(prefix): | ||||
break | break | ||||
yield pid, value | yield swhid, value | ||||
def iter_type(self, pid_type: str) -> Iterator[Tuple[str, int]]: | def iter_type(self, swhid_type: str) -> Iterator[Tuple[str, int]]: | ||||
prefix = "swh:1:{}:".format(pid_type) | prefix = "swh:1:{}:".format(swhid_type) | ||||
yield from self.iter_prefix(prefix) | yield from self.iter_prefix(prefix) | ||||
class NodeToPidMap(_OnDiskMap, MutableMapping): | class NodeToSwhidMap(_OnDiskMap, MutableMapping): | ||||
"""memory mapped map from a continuous range of 0..N (8-byte long) integers to | """memory mapped map from a continuous range of 0..N (8-byte long) integers to | ||||
:ref:`SWHIDs <persistent-identifiers>` | :ref:`SWHIDs <persistent-identifiers>` | ||||
This is the converse mapping of :class:`PidToNodeMap`. | This is the converse mapping of :class:`SwhidToNodeMap`. | ||||
The on-disk serialization format is a sequence of fixed length records (22 | The on-disk serialization format is a sequence of fixed length records (22 | ||||
bytes), each being the binary representation of a PID as per | bytes), each being the binary representation of a SWHID as per | ||||
:func:`str_to_bytes`. | :func:`str_to_bytes`. | ||||
The records are sorted by long integer, so that integer lookup is possible | The records are sorted by long integer, so that integer lookup is possible | ||||
via fixed-offset seek. | via fixed-offset seek. | ||||
""" | """ | ||||
RECORD_BIN_FMT = PID_BIN_FMT | RECORD_BIN_FMT = SWHID_BIN_FMT | ||||
RECORD_SIZE = PID_BIN_SIZE | RECORD_SIZE = SWHID_BIN_SIZE | ||||
def __init__(self, fname: str, mode: str = "rb", length: int = None): | def __init__(self, fname: str, mode: str = "rb", length: int = None): | ||||
"""open an existing on-disk map | """open an existing on-disk map | ||||
Args: | Args: | ||||
fname: path to the on-disk map | fname: path to the on-disk map | ||||
mode: file open mode, usually either 'rb' for read-only maps, 'wb' | mode: file open mode, usually either 'rb' for read-only maps, 'wb' | ||||
for creating new maps, or 'rb+' for updating existing ones | for creating new maps, or 'rb+' for updating existing ones | ||||
(default: 'rb') | (default: 'rb') | ||||
size: map size in number of logical records; used to initialize | size: map size in number of logical records; used to initialize | ||||
read-write maps at creation time. Must be given when mode is | read-write maps at creation time. Must be given when mode is | ||||
'wb'; ignored otherwise | 'wb'; ignored otherwise | ||||
length: passed to :class:`_OnDiskMap` | length: passed to :class:`_OnDiskMap` | ||||
""" | """ | ||||
super().__init__(self.RECORD_SIZE, fname, mode=mode, length=length) | super().__init__(self.RECORD_SIZE, fname, mode=mode, length=length) | ||||
def _get_bin_record(self, pos: int) -> bytes: | def _get_bin_record(self, pos: int) -> bytes: | ||||
"""seek and return the (binary) PID at a given (logical) position | """seek and return the (binary) SWHID at a given (logical) position | ||||
Args: | Args: | ||||
pos: 0-based record number | pos: 0-based record number | ||||
Returns: | Returns: | ||||
PID as a byte sequence | SWHID as a byte sequence | ||||
""" | """ | ||||
rec_pos = pos * self.RECORD_SIZE | rec_pos = pos * self.RECORD_SIZE | ||||
return self.mm[rec_pos : rec_pos + self.RECORD_SIZE] | return self.mm[rec_pos : rec_pos + self.RECORD_SIZE] | ||||
@classmethod | @classmethod | ||||
def write_record(cls, f: BinaryIO, pid: str) -> None: | def write_record(cls, f: BinaryIO, swhid: str) -> None: | ||||
"""write a PID to a file-like object | """write a SWHID to a file-like object | ||||
Args: | Args: | ||||
f: file-like object to write the record to | f: file-like object to write the record to | ||||
pid: textual PID | swhid: textual SWHID | ||||
""" | """ | ||||
f.write(str_to_bytes(pid)) | f.write(str_to_bytes(swhid)) | ||||
def __getitem__(self, pos: int) -> str: | def __getitem__(self, pos: int) -> str: | ||||
orig_pos = pos | orig_pos = pos | ||||
if pos < 0: | if pos < 0: | ||||
pos = len(self) + pos | pos = len(self) + pos | ||||
if not (0 <= pos < len(self)): | if not (0 <= pos < len(self)): | ||||
raise IndexError(orig_pos) | raise IndexError(orig_pos) | ||||
return bytes_to_str(self._get_bin_record(pos)) | return bytes_to_str(self._get_bin_record(pos)) | ||||
def __setitem__(self, pos: int, pid: str) -> None: | def __setitem__(self, pos: int, swhid: str) -> None: | ||||
rec_pos = pos * self.RECORD_SIZE | rec_pos = pos * self.RECORD_SIZE | ||||
self.mm[rec_pos : rec_pos + self.RECORD_SIZE] = str_to_bytes(pid) | self.mm[rec_pos : rec_pos + self.RECORD_SIZE] = str_to_bytes(swhid) | ||||
def __iter__(self) -> Iterator[Tuple[int, str]]: | def __iter__(self) -> Iterator[Tuple[int, str]]: | ||||
for pos in range(self.length): | for pos in range(self.length): | ||||
yield (pos, self[pos]) | yield (pos, self[pos]) |