diff --git a/docs/api.rst b/docs/api.rst index 70982fd..b618b5d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,360 +1,357 @@ Graph REST API ============== Terminology ----------- This API uses the following notions: -- **Node**: a node in the `Software Heritage graph - `_, - represented by a `persistent identifier - `_ - (abbreviated as *SWH PID*, or simply *PID*). +- **Node**: a node in the :ref:`Software Heritage graph `, + represented by a :ref:`SWHID `. -- **Node type**: the 3-letter specifier from the node PID (``cnt``, ``dir``, +- **Node type**: the 3-letter specifier from the node SWHID (``cnt``, ``dir``, ``rel``, ``rev``, ``snp``, ``ori``), or ``*`` for all node types. - **Edge type**: a pair ``src:dst`` where ``src`` and ``dst`` are either node types, or ``*`` to denote all node types. - **Edge restrictions**: a textual specification of which edges can be followed during graph traversal. Either ``*`` to denote that all edges can be followed or a comma separated list of edge types to allow following only those edges. Note that when traversing the *backward* (i.e., transposed) graph, edge types are reversed too. So, for instance, ``ori:snp`` makes sense when traversing the forward graph, but useless (due to lack of matching edges in the graph) when traversing the backward graph; conversely ``snp:ori`` is useful when traversing the backward graph, but not in the forward one. For the same reason ``dir:dir`` allows following edges from parent directories to sub-directories when traversing the forward graph, but the same restriction allows following edges from sub-directories to parent directories. Examples ~~~~~~~~ -- ``swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2`` the PID of a node of +- ``swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2`` the SWHID of a node of type content containing the full text of the GPL3 license. -- ``swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35`` the PID of a node of +- ``swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35`` the SWHID of a node of type revision corresponding to the commit in Linux that merged the 'x86/urgent' branch on 31 December 2017. - ``"dir:dir,dir:cnt"`` node types allowing edges from directories to directories nodes, or directories to contents nodes. - ``"rev:rev,dir:*"`` node types allowing edges from revisions to revisions nodes, or from directories nodes. - ``"*:rel"`` node types allowing all edges to releases. Leaves ------ .. http:get:: /graph/leaves/:src Performs a graph traversal and returns the leaves of the subgraph rooted at the specified source node. - :param string src: source node specified as a SWH PID + :param string src: source node specified as a SWHID :query string edges: edges types the traversal can follow; default to ``"*"`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found **Example:** .. sourcecode:: http GET /graph/leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323 HTTP/1.1 Content-Type: text/plain Transfer-Encoding: chunked .. sourcecode:: http HTTP/1.1 200 OK swh:1:cnt:540faad6b1e02e2db4f349a4845192db521ff2bd swh:1:cnt:630585fc6d34e5e121139e2aee0a64e83dc9aae6 swh:1:cnt:f8634ced669f0a9155c8cab1b2621d57d778215e swh:1:cnt:ba6daa801ad3ea587904b1abe9161dceedb2e0bd ... Neighbors --------- .. http:get:: /graph/neighbors/:src Returns node direct neighbors (linked with exactly one edge) in the graph. - :param string src: source node specified as a SWH PID + :param string src: source node specified as a SWHID :query string edges: edges types allowed to be listed as neighbors; default to ``"*"`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found **Example:** .. sourcecode:: http GET /graph/neighbors/swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35 HTTP/1.1 Content-Type: text/plain Transfer-Encoding: chunked .. sourcecode:: http HTTP/1.1 200 OK swh:1:rev:a31e58e129f73ab5b04016330b13ed51fde7a961 swh:1:dir:b5d2aa0746b70300ebbca82a8132af386cc5986d swh:1:rev:52c90f2d32bfa7d6eccd66a56c44ace1f78fbadd ... Walk ---- .. .. http:get:: /graph/walk/:src/:dst Performs a graph traversal and returns the first found path from source to destination (final destination node included). - :param string src: starting node specified as a SWH PID - :param string dst: destination node, either as a node PID or a node type. - The traversal will stop at the first node encountered matching the - desired destination. + :param string src: starting node specified as a SWHID + :param string dst: destination node, either as a node SWHID or a node + type. The traversal will stop at the first node encountered matching + the desired destination. :query string edges: edges types the traversal can follow; default to ``"*"`` :query string traversal: traversal algorithm; can be either ``dfs`` or ``bfs``, default to ``dfs`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found **Example:** .. sourcecode:: http HTTP/1.1 200 OK swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35 swh:1:rev:52c90f2d32bfa7d6eccd66a56c44ace1f78fbadd swh:1:rev:cea92e843e40452c08ba313abc39f59efbb4c29c swh:1:rev:8d517bdfb57154b8a11d7f1682ecc0f79abf8e02 ... .. http:get:: /graph/randomwalk/:src/:dst Performs a graph *random* traversal, i.e., picking one random successor node at each hop, from source to destination (final destination node included). - :param string src: starting node specified as a SWH PID - :param string dst: destination node, either as a node PID or a node type. + :param string src: starting node specified as a SWHID + :param string dst: destination node, either as a node SWHID or a node type. The traversal will stop at the first node encountered matching the desired destination. :query string edges: edges types the traversal can follow; default to ``"*"`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :query int limit: limit the number of nodes returned. You can use positive numbers to get the first N results, or negative numbers to get the last N results starting from the tail; default to ``0``, meaning no limit. :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found **Example:** .. sourcecode:: http GET /graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward HTTP/1.1 Content-Type: text/plain Transfer-Encoding: chunked .. sourcecode:: http HTTP/1.1 200 OK swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2 swh:1:dir:8de8a8823a0780524529c94464ee6ef60b98e2ed swh:1:dir:7146ea6cbd5ffbfec58cc8df5e0552da45e69cb7 swh:1:rev:b12563e00026b48b817fd3532fc3df2db2a0f460 swh:1:rev:13e8ebe80fb878bade776131e738d5772aa0ad1b swh:1:rev:cb39b849f167c70c1f86d4356f02d1285d49ee13 ... swh:1:rev:ff70949f336593d6c59b18e4989edf24d7f0f254 swh:1:snp:a511810642b7795e725033febdd82075064ed863 swh:1:ori:98aa0e71f5c789b12673717a97f6e9fa20aa1161 **Limit example:** .. sourcecode:: http GET /graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward&limit=-2 HTTP/1.1 Content-Type: text/plain Transfer-Encoding: chunked .. sourcecode:: http HTTP/1.1 200 OK swh:1:ori:98aa0e71f5c789b12673717a97f6e9fa20aa1161 swh:1:snp:a511810642b7795e725033febdd82075064ed863 Visit ----- .. http:get:: /graph/visit/nodes/:src .. http:get:: /graph/visit/paths/:src Performs a graph traversal and returns explored nodes or paths (in the order of the traversal). - :param string src: starting node specified as a SWH PID + :param string src: starting node specified as a SWHID :query string edges: edges types the traversal can follow; default to ``"*"`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found **Example:** .. sourcecode:: http GET /graph/visit/nodes/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc HTTP/1.1 Content-Type: text/plain Transfer-Encoding: chunked .. sourcecode:: http HTTP/1.1 200 OK swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:cfab784723a6c2d33468c9ed8a566fd5e2abd8c9 swh:1:rev:53e5df0e7a6b7bd4919074c081a173655c0da164 swh:1:rev:f85647f14b8243532283eff3e08f4ee96c35945f swh:1:rev:fe5f9ef854715fc59b9ec22f9878f11498cfcdbf swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb swh:1:cnt:c8cece50beae7a954f4ea27e3ae7bf941dc6d0c0 swh:1:dir:a358d0cf89821227d4c00b0ced5e0a8b3756b5db swh:1:cnt:cc407b7e24dd300d2e1a77d8f04af89b3f962a51 swh:1:cnt:701bd0a63e11b3390a547ce8515d28c6bab8a201 ... **Example:** .. sourcecode:: http GET /graph/visit/nodes/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb HTTP/1.1 Content-Type: application/x-ndjson Transfer-Encoding: chunked .. sourcecode:: http HTTP/1.1 200 OK ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:cnt:acfb7cabd63b368a03a9df87670ece1488c8bce0"] ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:cnt:2a0837708151d76edf28fdbb90dc3eabc676cff3"] ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:cnt:eaf025ad54b94b2fdda26af75594cfae3491ec75"] ... ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:dir:2ebd4b96fa5665ff74f2b27ae41aecdc43af4463", "swh:1:cnt:1d3b6575fb7bf2a147d228e78ffd77ea193c3639"] ... Counting results ---------------- The following method variants, with trailing `/count` added, behave like their already discussed counterparts but, instead of returning results, return the *amount* of results that would have been returned: .. http:get:: /graph/leaves/count/:src Return the amount of :http:get:`/graph/leaves/:src` results .. http:get:: /graph/neighbors/count/:src Return the amount of :http:get:`/graph/neighbors/:src` results .. http:get:: /graph/visit/nodes/count/:src Return the amount of :http:get:`/graph/visit/nodes/:src` results Stats ----- .. http:get:: /graph/stats Returns statistics on the compressed graph. :statuscode 200: success **Example** .. sourcecode:: http GET /graph/stats HTTP/1.1 Content-Type: application/json .. sourcecode:: http HTTP/1.1 200 OK { "counts": { "nodes": 16222788, "edges": 9907464 }, "ratios": { "compression": 0.367, "bits_per_node": 5.846, "bits_per_edge": 9.573, "avg_locality": 270.369 }, "indegree": { "min": 0, "max": 12382, "avg": 0.6107127825377487 }, "outdegree": { "min": 0, "max": 1, "avg": 0.6107127825377487 } } diff --git a/docs/compression.rst b/docs/compression.rst index 8d9ebcd..e46e7ec 100644 --- a/docs/compression.rst +++ b/docs/compression.rst @@ -1,122 +1,123 @@ Graph compression ================= The compression process is a pipeline implemented for the most part on top of the `WebGraph framework `_ and ecosystem libraries. The compression pipeline consists of the following steps: .. figure:: images/compression_steps.png :align: center :alt: Compression steps Compression steps Each of these steps is briefly described below. For more details see the following paper: .. note:: Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli. `Ultra-Large-Scale Repository Analysis via Graph Compression `_. In proceedings of `SANER 2020 `_: The 27th IEEE International Conference on Software Analysis, Evolution and Reengineering. IEEE 2020. Links: `preprint `_, `bibtex `_. In order to practically perform graph compression, install the ``swh.graph`` module and use the ``swh graph compress`` command line interface of the compression driver, that will conduct the various steps in the right order. See ``swh graph compress --help`` for usage details. 1. MPH ------ -A node in the Software Heritage :ref:`data-model` is identified using its PID -(see :ref:`persistent-identifiers`). However, WebGraph internally uses integers -to refer to node ids. +A node in the Software Heritage :ref:`data model ` is identified +using its SWHID (see :ref:`persistent identifiers +`). However, WebGraph internally uses integers to refer +to node ids. Mapping between the strings and longs ids is needed before compressing the graph. From the `Sux4J `_ utility tool, we use the `GOVMinimalPerfectHashFunction `_ class, mapping with no collisions N keys to N consecutive integers. The step produces a ``.mph`` file (MPH stands for *Minimal Perfect Hash-function*) storing the hash function taking as input a string and returning a unique integer. 2. BV compress -------------- This is the first actual compression step, building a compressed version of the input graph using WebGraph techniques presented in the framework paper. We use the `ScatteredArcsASCIIGraph `_ class, from WebGraph. The resulting BV graph is stored as a set of files: - ``.graph``: the compressed graph in the BV format - ``.offsets``: offsets values to read the bit stream graph file - ``.obl``: offsets cache to load the graph faster - ``.properties``: entries used to correctly decode graph and offset files 3. BFS ------- In the LLP paper, authors propose an empirical analysis linking node ordering and high compression ratio: it is important to use an ordering of nodes ids such that vertices from the same host are close to one another. Building on this insight, the previous compression results in the BV compress step are improved by re-ordering nodes ids using a BFS traversal order. We use the `BFS `_ class from the `LAW `_ library. The resulting ordering is stored in the ``.order`` file, listing nodes ids in order of traversal. 4. Permute ---------- Once the order is computed (BFS or another ordering technique), the final compressed graph is created based on the initial BV compress result, and using the new node order mapping. The permutation uses the `Transform `_ class from WebGraph framework. The final compressed graph is only stored in the resulting ``.graph``, ``.offsets``, ``.obl``, and ``.properties`` files. 5. Stats -------- Compute various statistics on the final compressed graph: - ``.stats``: entries such as number of nodes, edges, avg/min/max degree, average locality, etc. - ``.indegree``: graph indegree distribution - ``.outdegree``: graph outdegree distribution This step uses the `Stats `_ class from WebGraph. 6. Transpose ------------ Create a transposed graph to allow backward traversal, using the `Transform `_ class from WebGraph. diff --git a/docs/docker.rst b/docs/docker.rst index ddaccaf..4931a6b 100644 --- a/docs/docker.rst +++ b/docs/docker.rst @@ -1,70 +1,70 @@ Docker environment ================== Build ----- .. code:: bash $ git clone https://forge.softwareheritage.org/source/swh-graph.git $ cd swh-graph $ docker build --tag swh-graph docker/ ``docker/build.sh`` also exists as an alias for the last line. Run --- Given a graph ``g`` specified by: - ``g.edges.csv.zst``: zstd-compressed CSV file with one edge per line, as a - "SRC_ID SPACE DST_ID" string, where identifiers are the - :ref:`persistent-identifiers` of each node. + "SRC_ID SPACE DST_ID" string, where identifiers are the :ref:`SWHIDs + ` of each node. - ``g.nodes.csv.zst``: sorted list of unique node identifiers appearing in the corresponding ``g.edges.csv.zst`` file. The format is a zst-compressed CSV file (single column) with one persistent identifier per line. .. code:: bash $ docker run -ti \ --volume /PATH/TO/GRAPH/:/srv/softwareheritage/graph/data \ --publish 127.0.0.1:5009:5009 \ swh-graph:latest \ bash or, as a shortcut: .. code:: bash $ docker/run.sh /PATH/TO/GRAPH/ Where ``/PATH/TO/GRAPH`` is a directory containing the ``g.edges.csv.zst`` and ``g.nodes.csv.zst`` files. By default, when entering the container the current working directory will be ``/srv/softwareheritage/graph``; all relative paths found below are intended to be relative to that dir. Graph compression ~~~~~~~~~~~~~~~~~ To compress the graph (from within the docker container): .. code:: bash $ docker/run.sh /PATH/TO/GRAPH/ root@7f3306806861:/srv/softwareheritage/graph# \ swh graph compress --graph data/g --outdir data/compressed Graph server ~~~~~~~~~~~~ To start the swh-graph server (from within the docker container): .. code:: bash $ docker/run.sh /PATH/TO/GRAPH/ root@7f3306806861:/srv/softwareheritage/graph# \ swh graph rpc-serve --graph data/compressed/g diff --git a/docs/use-cases.rst b/docs/use-cases.rst index 983568a..ce01d8c 100644 --- a/docs/use-cases.rst +++ b/docs/use-cases.rst @@ -1,167 +1,167 @@ ========= Use cases ========= This document lists use cases and benchmark scenarios for the Software Heritage graph service. Conventions =========== - **Node identification**: in the following, nodes are always identified by - their Software Heritage :ref:`persistent-identifiers` (SWH PIDs). + their :ref:`SWHIDs `. Use cases ========= Browsing -------- The following use cases require traversing the *forward graph*. - **ls**: given a directory node, list (non recursively) all linked nodes of type directory and content Implementation:: /graph/neighbors/:DIR_ID?edges=dir:cnt,dir:dir - **ls -R**: given a directory node, recursively list all linked nodes of type directory and content Implementation:: /graph/visit/paths/:DIR_ID?edges=dir:cnt,dir:dir - **git log**: given a revision node, recursively list all linked nodes of type revision Implementation:: /graph/visit/nodes/:REV_ID?edges=rev:rev Vault ----- The following use cases require traversing the *forward graph*. - **tarball** (same as *ls -R* above) - **git bundle**: given a node, recursively list all linked nodes of any kind Implementation:: /graph/visit/nodes/:NODE_ID?edges=* Provenance ---------- The following use cases require traversing the *backward (transposed) graph*. - **commit provenance**: given a content or directory node, return *a* commit whose directory (recursively) contains it Implementation:: /graph/walk/:NODE_ID/rev?direction=backward&edges=dir:dir,cnt:dir,dir:rev - **complete commit provenance**: given a content or directory node, return *all* commits whose directory (recursively) contains it Implementation:: /graph/leaves/:NODE_ID?direction=backward&edges=dir:dir,cnt:dir,dir:rev - **origin provenance**: given a content, directory, or commit node, return *an* origin that has at least one snapshot that (recursively) contains it Implementation:: /graph/walk/:NODE_ID/ori?direction=backward&edges=* - **complete origin provenance**: given a content, directory, or commit node, return *all* origins that have at least one snapshot that (recursively) contains it Implementation:: /graph/leaves/:NODE_ID?direction=backward&edges=* - *SLOC tracking*: left as future work Provenance statistics ~~~~~~~~~~~~~~~~~~~~~ The following use cases require traversing the *backward (transposed) graph*. - **content popularity across commits**: for each content, count the number of commits (or *commit popularity*) that link to a directory that (recursively) includes it. Plot the distribution of content popularity across commits. Implementation: apply *complete commit provenance* to each content node, count the returned commits, aggregate. - **commit popularity across origins**: for each commit, count the number of origins (or *origin popularity*) that have a snapshot that (recursively) includes it. Plot the distribution of commit popularity across origins. Implementation: apply *complete origin provenance* to each commit node, count the returned origins, aggregate. - *SLOC popularity across contents*: left as future work The following use cases require traversing the *forward graph*. - **revision size** (as n. of contents) distribution: for each revision, count the number of contents that are (recursively) reachable from it. Plot the distribution of revision sizes. - **origin size** (as n. of revisions) distribution: for each origin, count the number of revisions that are (recursively) reachable from it. Plot the distribution of origin sizes. Benchmarks ========== Notes on how to benchmark graph access: - separate pure-graph timings from timings related to use additional mappings (e.g., node types), no matter if the mappings are in-memory or on-disk - separate in-memory timings from on-disk timings; in particular, separate the - timing of translating node identifiers between internal integers and SWH PIDs + timing of translating node identifiers between internal integers and SWHIDs - for each use case that requires a node as input, we will randomize the choice of the input node and repeat the experiment a suitable number of times; where possible we will aggregate results computing basic statistics (average, standard deviation), as well as normalize results w.r.t. the “size” of the chosen node (e.g., number of nodes/path length in the resulting visit) Basic benchmarks ---------------- - **Edge traversal**: given a node, retrieve the first node in its adjacency list. For reference: Apostolico, Drovandi in *Graph Compression by BFS* report times to retrieve the adjacency list of a node (and/or test if an edge exists between two nodes) in the 2-3 us range, for the largest graph in their experiments (22 M nodes, 600 M edges). Each use case is a benchmark ---------------------------- In addition to abstract benchmark, we will use each use case above as a scenario-based benchmark. diff --git a/swh/graph/pid.py b/swh/graph/pid.py index 330ba25..83a1dff 100644 --- a/swh/graph/pid.py +++ b/swh/graph/pid.py @@ -1,403 +1,403 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import mmap import os import struct from collections.abc import MutableMapping from enum import Enum from mmap import MAP_SHARED, MAP_PRIVATE from typing import BinaryIO, Iterator, Tuple from swh.model.identifiers import PersistentId, parse_persistent_identifier PID_BIN_FMT = "BB20s" # 2 unsigned chars + 20 bytes INT_BIN_FMT = ">q" # big endian, 8-byte integer PID_BIN_SIZE = 22 # in bytes INT_BIN_SIZE = 8 # in bytes class PidType(Enum): """types of existing PIDs, used to serialize PID type as a (char) integer Note that the order does matter also for driving the binary search in PID-indexed maps. Integer values also matter, for compatibility with the Java layer. """ content = 0 directory = 1 origin = 2 release = 3 revision = 4 snapshot = 5 def str_to_bytes(pid_str: str) -> bytes: """Convert a PID to a byte sequence The binary format used to represent PIDs as 22-byte long byte sequences as follows: - 1 byte for the namespace version represented as a C `unsigned char` - 1 byte for the object type, as the int value of :class:`PidType` enums, represented as a C `unsigned char` - 20 bytes for the SHA1 digest as a byte sequence Args: pid: persistent identifier Returns: bytes: byte sequence representation of pid """ pid = parse_persistent_identifier(pid_str) return struct.pack( PID_BIN_FMT, pid.scheme_version, PidType[pid.object_type].value, bytes.fromhex(pid.object_id), ) def bytes_to_str(bytes: bytes) -> str: """Inverse function of :func:`str_to_bytes` See :func:`str_to_bytes` for a description of the binary PID format. Args: bytes: byte sequence representation of pid Returns: pid: persistent identifier """ (version, type, bin_digest) = struct.unpack(PID_BIN_FMT, bytes) pid = PersistentId(object_type=PidType(type).name, object_id=bin_digest) return str(pid) class _OnDiskMap: """mmap-ed on-disk sequence of fixed size records """ def __init__( self, record_size: int, fname: str, mode: str = "rb", length: int = None ): """open an existing on-disk map Args: record_size: size of each record in bytes fname: path to the on-disk map mode: file open mode, usually either 'rb' for read-only maps, 'wb' for creating new maps, or 'rb+' for updating existing ones (default: 'rb') length: map size in number of logical records; used to initialize writable maps at creation time. Must be given when mode is 'wb' and the map doesn't exist on disk; ignored otherwise """ os_modes = {"rb": os.O_RDONLY, "wb": os.O_RDWR | os.O_CREAT, "rb+": os.O_RDWR} if mode not in os_modes: raise ValueError("invalid file open mode: " + mode) new_map = mode == "wb" writable_map = mode in ["wb", "rb+"] self.record_size = record_size self.fd = os.open(fname, os_modes[mode]) if new_map: if length is None: raise ValueError("missing length when creating new map") os.truncate(self.fd, length * self.record_size) self.size = os.path.getsize(fname) (self.length, remainder) = divmod(self.size, record_size) if remainder: raise ValueError( "map size {} is not a multiple of the record size {}".format( self.size, record_size ) ) self.mm = mmap.mmap( self.fd, self.size, flags=MAP_SHARED if writable_map else MAP_PRIVATE ) def close(self) -> None: """close the map shuts down both the mmap and the underlying file descriptor """ if not self.mm.closed: self.mm.close() os.close(self.fd) def __len__(self) -> int: return self.length def __delitem__(self, pos: int) -> None: raise NotImplementedError("cannot delete records from fixed-size map") class PidToNodeMap(_OnDiskMap, MutableMapping): - """memory mapped map from PID (:ref:`persistent-identifiers`) to a continuous - range 0..N of (8-byte long) integers + """memory mapped map from :ref:`SWHIDs ` to a + continuous range 0..N of (8-byte long) integers This is the converse mapping of :class:`NodeToPidMap`. The on-disk serialization format is a sequence of fixed length (30 bytes) records with the following fields: - PID (22 bytes): binary PID representation as per :func:`str_to_bytes` - long (8 bytes): big endian long integer The records are sorted lexicographically by PID type and checksum, where type is the integer value of :class:`PidType`. PID lookup in the map is performed via binary search. Hence a huge map with, say, 11 B entries, will require ~30 disk seeks. Note that, due to fixed size + ordering, it is not possible to create these maps by random writing. Hence, __setitem__ can be used only to *update* the value associated to an existing key, rather than to add a missing item. To create an entire map from scratch, you should do so *sequentially*, using static method :meth:`write_record` (or, at your own risk, by hand via the mmap :attr:`mm`). """ # record binary format: PID + a big endian 8-byte big endian integer RECORD_BIN_FMT = ">" + PID_BIN_FMT + "q" RECORD_SIZE = PID_BIN_SIZE + INT_BIN_SIZE def __init__(self, fname: str, mode: str = "rb", length: int = None): """open an existing on-disk map Args: fname: path to the on-disk map mode: file open mode, usually either 'rb' for read-only maps, 'wb' for creating new maps, or 'rb+' for updating existing ones (default: 'rb') length: map size in number of logical records; used to initialize read-write maps at creation time. Must be given when mode is 'wb'; ignored otherwise """ super().__init__(self.RECORD_SIZE, fname, mode=mode, length=length) def _get_bin_record(self, pos: int) -> Tuple[bytes, bytes]: """seek and return the (binary) record at a given (logical) position see :func:`_get_record` for an equivalent function with additional deserialization Args: pos: 0-based record number Returns: a pair `(pid, int)`, where pid and int are bytes """ rec_pos = pos * self.RECORD_SIZE int_pos = rec_pos + PID_BIN_SIZE return (self.mm[rec_pos:int_pos], self.mm[int_pos : int_pos + INT_BIN_SIZE]) def _get_record(self, pos: int) -> Tuple[str, int]: """seek and return the record at a given (logical) position moral equivalent of :func:`_get_bin_record`, with additional deserialization to non-bytes types Args: pos: 0-based record number Returns: a pair `(pid, int)`, where pid is a string-based PID and int the corresponding integer identifier """ (pid_bytes, int_bytes) = self._get_bin_record(pos) return (bytes_to_str(pid_bytes), struct.unpack(INT_BIN_FMT, int_bytes)[0]) @classmethod def write_record(cls, f: BinaryIO, pid: str, int: int) -> None: """write a logical record to a file-like object Args: f: file-like object to write the record to pid: textual PID int: PID integer identifier """ f.write(str_to_bytes(pid)) f.write(struct.pack(INT_BIN_FMT, int)) def _bisect_pos(self, pid_str: str) -> int: """bisect the position of the given identifier. If the identifier is not found, the position of the pid immediately after is returned. Args: pid_str: the pid as a string Returns: the logical record of the bisected position in the map """ if not isinstance(pid_str, str): raise TypeError("PID must be a str, not {}".format(type(pid_str))) try: target = str_to_bytes(pid_str) # desired PID as bytes except ValueError: raise ValueError('invalid PID: "{}"'.format(pid_str)) lo = 0 hi = self.length - 1 while lo < hi: mid = (lo + hi) // 2 (pid, _value) = self._get_bin_record(mid) if pid < target: lo = mid + 1 else: hi = mid return lo def _find(self, pid_str: str) -> Tuple[int, int]: """lookup the integer identifier of a pid and its position Args: pid_str: the pid as a string Returns: a pair `(pid, pos)` with pid integer identifier and its logical record position in the map """ pos = self._bisect_pos(pid_str) pid_found, value = self._get_record(pos) if pid_found == pid_str: return (value, pos) raise KeyError(pid_str) def __getitem__(self, pid_str: str) -> int: """lookup the integer identifier of a PID Args: pid: the PID as a string Returns: the integer identifier of pid """ return self._find(pid_str)[0] # return element, ignore position def __setitem__(self, pid_str: str, int: str) -> None: (_pid, pos) = self._find(pid_str) # might raise KeyError and that's OK rec_pos = pos * self.RECORD_SIZE int_pos = rec_pos + PID_BIN_SIZE self.mm[rec_pos:int_pos] = str_to_bytes(pid_str) self.mm[int_pos : int_pos + INT_BIN_SIZE] = struct.pack(INT_BIN_FMT, int) def __iter__(self) -> Iterator[Tuple[str, int]]: for pos in range(self.length): yield self._get_record(pos) def iter_prefix(self, prefix: str): swh, n, t, sha = prefix.split(":") sha = sha.ljust(40, "0") start_pid = ":".join([swh, n, t, sha]) start = self._bisect_pos(start_pid) for pos in range(start, self.length): pid, value = self._get_record(pos) if not pid.startswith(prefix): break yield pid, value def iter_type(self, pid_type: str) -> Iterator[Tuple[str, int]]: prefix = "swh:1:{}:".format(pid_type) yield from self.iter_prefix(prefix) class NodeToPidMap(_OnDiskMap, MutableMapping): """memory mapped map from a continuous range of 0..N (8-byte long) integers to - PIDs (:ref:`persistent-identifiers`) + :ref:`SWHIDs ` This is the converse mapping of :class:`PidToNodeMap`. The on-disk serialization format is a sequence of fixed length records (22 bytes), each being the binary representation of a PID as per :func:`str_to_bytes`. The records are sorted by long integer, so that integer lookup is possible via fixed-offset seek. """ RECORD_BIN_FMT = PID_BIN_FMT RECORD_SIZE = PID_BIN_SIZE def __init__(self, fname: str, mode: str = "rb", length: int = None): """open an existing on-disk map Args: fname: path to the on-disk map mode: file open mode, usually either 'rb' for read-only maps, 'wb' for creating new maps, or 'rb+' for updating existing ones (default: 'rb') size: map size in number of logical records; used to initialize read-write maps at creation time. Must be given when mode is 'wb'; ignored otherwise length: passed to :class:`_OnDiskMap` """ super().__init__(self.RECORD_SIZE, fname, mode=mode, length=length) def _get_bin_record(self, pos: int) -> bytes: """seek and return the (binary) PID at a given (logical) position Args: pos: 0-based record number Returns: PID as a byte sequence """ rec_pos = pos * self.RECORD_SIZE return self.mm[rec_pos : rec_pos + self.RECORD_SIZE] @classmethod def write_record(cls, f: BinaryIO, pid: str) -> None: """write a PID to a file-like object Args: f: file-like object to write the record to pid: textual PID """ f.write(str_to_bytes(pid)) def __getitem__(self, pos: int) -> str: orig_pos = pos if pos < 0: pos = len(self) + pos if not (0 <= pos < len(self)): raise IndexError(orig_pos) return bytes_to_str(self._get_bin_record(pos)) def __setitem__(self, pos: int, pid: str) -> None: rec_pos = pos * self.RECORD_SIZE self.mm[rec_pos : rec_pos + self.RECORD_SIZE] = str_to_bytes(pid) def __iter__(self) -> Iterator[Tuple[int, str]]: for pos in range(self.length): yield (pos, self[pos])