Changeset View
Standalone View
swh/graph/pid.py
- This file was added.
# Copyright (C) 2019 The Software Heritage developers | |||||
# See the AUTHORS file at the top-level directory of this distribution | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
import mmap | |||||
import os | |||||
import struct | |||||
from collections.abc import Mapping, Sequence | |||||
from enum import Enum | |||||
from swh.model.identifiers import PID_NAMESPACE, PID_SEP | |||||
PID_BIN_FMT = 'BB20s' # 2 unsigned chars + 20 bytes | |||||
INT_BIN_FMT = '>q' # big endian, 8-byte integer | |||||
PID_BIN_SIZE = 22 # in bytes | |||||
INT_BIN_SIZE = 8 # in bytes | |||||
class PidType(Enum): | |||||
ori = 1 | |||||
snp = 2 | |||||
rev = 3 | |||||
rel = 4 | |||||
dir = 5 | |||||
cnt = 6 | |||||
haltode: Anticipating the Java part, it would be best to follow the ordering already defined in https… | |||||
Done Inline ActionsAh, good point. Not much the Java compatibility (we're gonna change the serialization on the Java side anyway), but the fact the previous one I used was not alphabetic on textual PIDs. Having that is a good property, because we already have alpha-sorted PID lists, e.g., in the nodes file. Being able to easily translate those files to binary serializations is a feature that we want to keep. Diff updated to fix this. zack: Ah, good point. Not much the Java compatibility (we're gonna change the serialization on the… | |||||
def str_to_bytes(pid): | |||||
"""Convert a PID to a byte sequence | |||||
The binary format used to represent PIDs as byte sequences is as follows: | |||||
- 1 byte for the namespace version represented as a C `unsigned char` | |||||
- 1 byte for the object type, as the int value of :class:`PidType` enums, | |||||
represented as a C `unsigned char` | |||||
- 20 bytes for the SHA1 digest as a byte sequence | |||||
Args: | |||||
pid (str): persistent identifier | |||||
Not Done Inline ActionsIs it useful to include namespace version? (ie. will we ever mix two different namespace versions in the same file?) vlorentz: Is it useful to include namespace version? (ie. will we ever mix two different namespace… | |||||
Done Inline ActionsUnlikely in these maps, but the serialization is meant to be general purpose. As such, it should contain the same information that exist in a PID, which includes the version number. I can imagine in the future using this serialization format elsewhere, where you cannot necessarily expect uniformity. zack: Unlikely in these maps, but the serialization is meant to be general purpose. As such, it… | |||||
Not Done Inline ActionsIf it's general purpose, shouldn't it be in swh.model.identifiers? vlorentz: If it's general purpose, shouldn't it be in `swh.model.identifiers`? | |||||
Done Inline ActionsI've raised this on #swh-devel the other day (although only @olasd participated). The consensus was that it could go there eventually, but until it's needed there is no point in doing so. I'm personally fine with waiting for a second user to promote this as something general purposes. But if there is the desire from other to move it there right now, I wouldn't mind doing so either. zack: I've raised this on #swh-devel the other day (although only @olasd participated). The consensus… | |||||
Returns: | |||||
bytes (bytes): byte sequence representation of pid | |||||
""" | |||||
(_scheme, version, type, hex_digest) = pid.split(':', 3) | |||||
return struct.pack(PID_BIN_FMT, int(version), PidType[type].value, | |||||
bytes.fromhex(hex_digest)) | |||||
def bytes_to_str(bytes): | |||||
"""Inverse function of :func:`str_to_bytes` | |||||
Done Inline Actionsstr_to_bytes probably seirl: `str_to_bytes` probably | |||||
Done Inline ActionsThere's a function in swh.model.identifiers to parse pids. vlorentz: There's a function in `swh.model.identifiers` to parse pids. | |||||
See :func:`str_to_bytes` for a description of the binary PID format. | |||||
Args: | |||||
bytes (bytes): byte sequence representation of pid | |||||
Returns: | |||||
pid (str): persistent identifier | |||||
""" | |||||
(version, type, bin_digest) = struct.unpack(PID_BIN_FMT, bytes) | |||||
return PID_SEP.join([PID_NAMESPACE, str(version), PidType(type).name, | |||||
bin_digest.hex()]) | |||||
class _OnDiskMap(): | |||||
"""mmap-ed on-disk sequence of fixed size records | |||||
""" | |||||
Done Inline ActionsThere's also a function for this. vlorentz: There's also a function for this. | |||||
def __init__(self, record_size, fname): | |||||
"""open an existing on-disk map | |||||
Args: | |||||
record_size (int): size of each record in bytes | |||||
fname (str): path to the on-disk map | |||||
""" | |||||
self.record_size = record_size | |||||
self.size = os.path.getsize(fname) | |||||
self.length = self.size // record_size | |||||
if self.size % record_size: | |||||
raise ValueError( | |||||
'map size {} is not a multiple of the record size {}'.format( | |||||
self.size, record_size)) | |||||
self.f = open(fname, 'rb') | |||||
self.mm = mmap.mmap(self.f.fileno(), self.size, mmap.MAP_PRIVATE) | |||||
Done Inline Actions(self.length, remainder) = divmod(self.size, record_size) vlorentz: `(self.length, remainder) = divmod(self.size, record_size)` | |||||
def close(self): | |||||
"""close the map | |||||
shuts down both the mmap and the underlying file descriptor | |||||
""" | |||||
if not self.mm.closed: | |||||
self.mm.close() | |||||
if not self.f.closed: | |||||
self.f.close() | |||||
class PidToIntMap(_OnDiskMap, Mapping): | |||||
"""memory mapped map from PID (:ref:`persistent-identifiers`) to a continuous | |||||
range 0..N of (8-byte long) integers | |||||
This is the converse mapping of :class:`IntToPidMap`. | |||||
The on-disk serialization format is a sequence of fixed length (30 bytes) | |||||
records with the following fields: | |||||
- PID (22 bytes): binary PID representation as per :func:`str_to_bytes` | |||||
- long (8 bytes): big endian long integer | |||||
The records are sorted lexicographically by PID type and checksum, where | |||||
type is the integer value of :class:`PidType`. PID lookup in the map is | |||||
performed via binary search. Hence a huge map with, say, 11 B entries, | |||||
will require ~30 disk seeks. | |||||
""" | |||||
# record binary format: PID + a big endian 8-byte big endian integer | |||||
RECORD_BIN_FMT = '>' + PID_BIN_FMT + 'q' | |||||
RECORD_SIZE = PID_BIN_SIZE + INT_BIN_SIZE | |||||
def __init__(self, fname): | |||||
"""open an existing on-disk map | |||||
Args: | |||||
fname (str): path to the on-disk map | |||||
""" | |||||
super().__init__(self.RECORD_SIZE, fname) | |||||
def _get_bin_record(self, pos): | |||||
"""seek and return the (binary) record at a given (logical) position | |||||
see :func:`_get_record` for an equivalent function with additional | |||||
deserialization | |||||
Args: | |||||
pos: 0-based record number | |||||
Returns: | |||||
tuple: a pair `(pid, int)`, where pid and int are bytes | |||||
""" | |||||
rec_pos = pos * self.RECORD_SIZE | |||||
int_pos = rec_pos + PID_BIN_SIZE | |||||
return (self.mm[rec_pos:int_pos], | |||||
self.mm[int_pos:int_pos+INT_BIN_SIZE]) | |||||
def _get_record(self, pos): | |||||
"""seek and return the record at a given (logical) position | |||||
moral equivalent of :func:`_get_bin_record`, with additional | |||||
deserialization to non-bytes types | |||||
Args: | |||||
pos: 0-based record number | |||||
Returns: | |||||
tuple: a pair `(pid, int)`, where pid is a string-based PID and int | |||||
an integer | |||||
""" | |||||
(pid_bytes, int_bytes) = self._get_bin_record(pos) | |||||
return (bytes_to_str(pid_bytes), | |||||
struct.unpack(INT_BIN_FMT, int_bytes)[0]) | |||||
@classmethod | |||||
def write_record(cls, f, pid, int): | |||||
"""write a logical record to a file-like object | |||||
Args: | |||||
f: file-like object to write the record to | |||||
pid (str): textual PID | |||||
int (int): PID integer identifier | |||||
""" | |||||
f.write(str_to_bytes(pid)) | |||||
f.write(struct.pack(INT_BIN_FMT, int)) | |||||
def __getitem__(self, pid_str): | |||||
"""lookup the integer identifier of a PID | |||||
Args: | |||||
pid (str): the PID as a string | |||||
Returns: | |||||
int: the integer identifier of pid | |||||
""" | |||||
try: | |||||
target = str_to_bytes(pid_str) # desired PID as bytes | |||||
except ValueError: | |||||
raise ValueError('invalid PID: "{}"'.format(pid_str)) | |||||
min = 0 | |||||
max = self.length - 1 | |||||
while (min <= max): | |||||
mid = (min + max) // 2 | |||||
(pid, int) = self._get_bin_record(mid) | |||||
if pid < target: | |||||
min = mid + 1 | |||||
elif pid > target: | |||||
max = mid - 1 | |||||
else: # pid == target | |||||
return struct.unpack(INT_BIN_FMT, int)[0] | |||||
raise KeyError(pid_str) | |||||
def __iter__(self): | |||||
for pos in range(self.length): | |||||
yield self._get_record(pos) | |||||
def __len__(self): | |||||
return self.length | |||||
class IntToPidMap(_OnDiskMap, Sequence): | |||||
"""memory mapped map from a continuous range of 0..N (8-byte long) integers to | |||||
PIDs (:ref:`persistent-identifiers`) | |||||
This is the converse mapping of :class:`PidToIntMap`. | |||||
The on-disk serialization format is a sequence of fixed length (22 bytes), | |||||
where each record is the binary representation of PID as per | |||||
:func:`str_to_bytes`. | |||||
The records are sorted by long integer, so that integer lookup is possible | |||||
via fixed-offset seek. | |||||
""" | |||||
RECORD_BIN_FMT = PID_BIN_FMT | |||||
RECORD_SIZE = PID_BIN_SIZE | |||||
def __init__(self, fname): | |||||
"""open an existing on-disk map | |||||
Args: | |||||
fname (str): path to the on-disk map | |||||
""" | |||||
super().__init__(self.RECORD_SIZE, fname) | |||||
def _get_bin_record(self, pos): | |||||
"""seek and return the (binary) PID at a given (logical) position | |||||
Args: | |||||
pos: 0-based record number | |||||
Returns: | |||||
bytes: PID as a byte sequence | |||||
""" | |||||
rec_pos = pos * self.RECORD_SIZE | |||||
return self.mm[rec_pos:rec_pos+self.RECORD_SIZE] | |||||
@classmethod | |||||
def write_record(cls, f, pid): | |||||
"""write a PID to a file-like object | |||||
Args: | |||||
f: file-like object to write the record to | |||||
pid (str): textual PID | |||||
""" | |||||
f.write(str_to_bytes(pid)) | |||||
def __getitem__(self, pos): | |||||
if pos < 0: | |||||
pos = len(self) + pos | |||||
if not (0 <= pos < len(self)): | |||||
raise IndexError(pos) | |||||
return bytes_to_str(self._get_bin_record(pos)) | |||||
def __len__(self): | |||||
return self.length | |||||
Done Inline ActionsThe error should probably show the original pos, not len(self) + <original pos>. It might give surprising errors otherwise. vlorentz: The error should probably show the original `pos`, not `len(self) + <original pos>`. It might… |
Anticipating the Java part, it would be best to follow the ordering already defined in https://forge.softwareheritage.org/source/swh-graph/browse/master/java/server/src/main/java/org/softwareheritage/graph/Node.java (or at least make sure in the future Java part diff that the order is the same in both Java/Python code).