Changeset View
Changeset View
Standalone View
Standalone View
swh/graph/swhid.py
# Copyright (C) 2019-2021 The Software Heritage developers | # Copyright (C) 2019-2021 The Software Heritage developers | ||||
# See the AUTHORS file at the top-level directory of this distribution | # See the AUTHORS file at the top-level directory of this distribution | ||||
# License: GNU General Public License version 3, or any later version | # License: GNU General Public License version 3, or any later version | ||||
# See top-level LICENSE file for more information | # See top-level LICENSE file for more information | ||||
from __future__ import annotations | from __future__ import annotations | ||||
from collections.abc import MutableMapping | from collections.abc import MutableMapping | ||||
from enum import Enum | from enum import Enum | ||||
import mmap | import mmap | ||||
from mmap import MAP_SHARED, PROT_READ, PROT_WRITE | from mmap import MAP_SHARED, PROT_READ, PROT_WRITE | ||||
import os | import os | ||||
import struct | import struct | ||||
from typing import BinaryIO, Iterator, Tuple | from typing import BinaryIO, Iterator, Tuple | ||||
from swh.model.hashutil import hash_to_hex | |||||
from swh.model.identifiers import ExtendedObjectType, ExtendedSWHID | from swh.model.identifiers import ExtendedObjectType, ExtendedSWHID | ||||
SWHID_BIN_FMT = "BB20s" # 2 unsigned chars + 20 bytes | SWHID_BIN_FMT = "BB20s" # 2 unsigned chars + 20 bytes | ||||
INT_BIN_FMT = ">q" # big endian, 8-byte integer | INT_BIN_FMT = ">q" # big endian, 8-byte integer | ||||
SWHID_BIN_SIZE = 22 # in bytes | SWHID_BIN_SIZE = 22 # in bytes | ||||
INT_BIN_SIZE = 8 # in bytes | INT_BIN_SIZE = 8 # in bytes | ||||
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines | def bytes_to_str(bytes: bytes) -> str: | ||||
Args: | Args: | ||||
bytes: byte sequence representation of swhid | bytes: byte sequence representation of swhid | ||||
Returns: | Returns: | ||||
swhid: persistent identifier | swhid: persistent identifier | ||||
""" | """ | ||||
(version, type, bin_digest) = struct.unpack(SWHID_BIN_FMT, bytes) | (version, type, bin_digest) = struct.unpack(SWHID_BIN_FMT, bytes) | ||||
swhid = ExtendedSWHID( | |||||
object_type=SwhidType(type).to_extended_object_type(), object_id=bin_digest | # The following is equivalent to: | ||||
) | # return str(ExtendedSWHID( | ||||
return str(swhid) | # object_type=SwhidType(type).to_extended_object_type(), object_id=bin_digest | ||||
# ) | |||||
# but more efficient, because ExtendedSWHID.__init__ is extremely slow. | |||||
object_type = ExtendedObjectType[SwhidType(type).name.upper()] | |||||
douardda: if you want speed, why not also cut the hash_to_hex call and simply use `.hex()` ?
quick… | |||||
Done Inline Actions: `hash_to_hex` is cached vlorentz: `hash_to_hex` is cached | |||||
Not Done Inline Actions: yeah, well, it's currently cached with the default lru_cache maxsize, which is a very small 128, so I'm not sure it's a lifesaver here. And you could just lru_cache this bytes_to_str function :-) Do we have an idea of the average cache-hit ratio when it is used in swh-graph? douardda: yeah, well, it's currently cached with the default lru_cache maxsize, which is a very small 128, so… | |||||
Done Inline Actions: I don't, but it's probably very low. vlorentz: I don't, but it's probably very low. | |||||
return f"swh:1:{object_type.value}:{hash_to_hex(bin_digest)}" | |||||
class _OnDiskMap: | class _OnDiskMap: | ||||
"""mmap-ed on-disk sequence of fixed size records""" | """mmap-ed on-disk sequence of fixed size records""" | ||||
def __init__( | def __init__( | ||||
self, record_size: int, fname: str, mode: str = "rb", length: int = None | self, record_size: int, fname: str, mode: str = "rb", length: int = None | ||||
): | ): | ||||
▲ Show 20 Lines • Show All 314 Lines • Show Last 20 Lines |
If you want speed, why not also cut the hash_to_hex call and simply use .hex()?
A quick, crude test showed a 2x factor between the two on my laptop (just a timeit in IPython of building a list of 1k SWHIDs).