diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000..ec16ee1 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,5 @@ +# Enable black +4c3c6d839b642009ba1eeee4acf4a58f209580e6 + +# python: Reformat code with black 22.3.0 +1efea9bb9035e1d04191f8cd25a3f7ff9ad6d8f3 diff --git a/.gitignore b/.gitignore index 3c564c7..8416519 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,13 @@ *.pyc *.sw? *~ .coverage .eggs/ __pycache__ *.egg-info/ build/ dist/ version.txt .tox .mypy_cache/ +compressed/logs diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7a64a7f..5bf56ae 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,59 +1,52 @@ repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.4.0 - hooks: - - id: trailing-whitespace - - id: check-json - - id: check-yaml - -- repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.3 - hooks: - - id: flake8 - -- repo: https://github.com/codespell-project/codespell - rev: v1.16.0 - hooks: - - id: codespell - args: ["-L te,wth,alledges"] - -- repo: local - hooks: - - id: mypy - name: mypy - entry: mypy - args: [swh] - pass_filenames: false - language: system - types: [python] - -- repo: https://github.com/PyCQA/isort - rev: 5.5.2 - hooks: - - id: isort - -- repo: https://github.com/python/black - rev: 19.10b0 - hooks: - - id: black - -- repo: local - hooks: - - id: java-coding-style - name: java style - entry: mvn - args: ["-f", "java/pom.xml", "spotless:apply"] - pass_filenames: false - language: system - -# unfortunately, we are far from being able to enable this... -# - repo: https://github.com/PyCQA/pydocstyle.git -# rev: 4.0.0 -# hooks: -# - id: pydocstyle -# name: pydocstyle -# description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions. -# entry: pydocstyle --convention=google -# language: python -# types: [python] - + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.1.0 + hooks: + - id: trailing-whitespace + - id: check-json + - id: check-yaml + + - repo: https://gitlab.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + additional_dependencies: [flake8-bugbear==22.3.23] + + - repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + name: Check source code spelling + args: ["-L te,wth,alledges,afterall"] + stages: [commit] + + - repo: local + hooks: + - id: mypy + name: mypy + entry: mypy + args: [swh] + pass_filenames: false + language: system + types: [python] + + - repo: https://github.com/PyCQA/isort + rev: 5.10.1 + hooks: + - id: isort + + - repo: https://github.com/python/black + rev: 22.3.0 + hooks: + - id: black + + - repo: local + hooks: + - id: java-coding-style + name: java style + entry: mvn + args: ["-f", "java/pom.xml", "spotless:apply"] + pass_filenames: false + language: system + +exclude: ^swh/graph/rpc/ diff --git a/Makefile.local b/Makefile.local index 034d1c7..1181cea 100644 --- a/Makefile.local +++ b/Makefile.local @@ -1,14 +1,17 @@ POM_PATH=java/pom.xml java: mvn -f $(POM_PATH) compile assembly:single java-doc: mvn -f $(POM_PATH) javadoc:javadoc java-%: mvn -f $(POM_PATH) $* +protoc: + python -m grpc_tools.protoc -I. --python_out=. --mypy_out=. --grpc_python_out=. 
swh/graph/rpc/*.proto + clean-java: java-clean .PHONY: java clean-java diff --git a/PKG-INFO b/PKG-INFO index 4839ff0..d6f1fb7 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,56 +1,52 @@ Metadata-Version: 2.1 Name: swh.graph -Version: 0.5.2 +Version: 1.0.0 Summary: Software Heritage graph service Home-page: https://forge.softwareheritage.org/diffusion/DGRPH Author: Software Heritage developers Author-email: swh-devel@inria.fr -License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-graph Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-graph/ -Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/x-rst Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - graph service ================================= Tooling and services, collectively known as ``swh-graph``, providing fast access to the graph representation of the `Software Heritage `_ `archive `_. The service is in-memory, based on a compressed representation of the Software Heritage Merkle DAG. Bibliography ------------ In addition to accompanying technical documentation, ``swh-graph`` is also described in the following scientific paper. If you publish results based on ``swh-graph``, please acknowledge it by citing the paper as follows: .. note:: Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli. `Ultra-Large-Scale Repository Analysis via Graph Compression `_. In proceedings of `SANER 2020 `_: The 27th IEEE International Conference on Software Analysis, Evolution and Reengineering, pages 184-194. IEEE 2020. Links: `preprint `_, `bibtex `_. - - diff --git a/docs/api.rst b/docs/api.rst index 0b7f1a2..3face8d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,407 +1,541 @@ .. _swh-graph-api: -Graph RPC API -============= +Graph Querying HTTP API +======================= + +The Graph Querying API is a high-level HTTP API intended to run common, +relatively simple traversal queries on the compressed graph. + +The client/server architecture allows it to only load the graph in memory once +then serve multiple different requests. However, it is limited in expressivity; +more complex or resource-intensive queries should rather use the +:ref:`Low-level Java API ` to run them as standalone +programs. Terminology ----------- This API uses the following notions: - **Node**: a node in the :ref:`Software Heritage graph `, represented by a :ref:`SWHID `. - **Node type**: the 3-letter specifier from the node SWHID (``cnt``, ``dir``, ``rel``, ``rev``, ``snp``, ``ori``), or ``*`` for all node types. - **Edge type**: a pair ``src:dst`` where ``src`` and ``dst`` are either node types, or ``*`` to denote all node types. - **Edge restrictions**: a textual specification of which edges can be followed during graph traversal. Either ``*`` to denote that all edges can be followed or a comma separated list of edge types to allow following only those edges. Note that when traversing the *backward* (i.e., transposed) graph, edge types are reversed too. 
So, for instance, ``ori:snp`` makes sense when traversing the forward graph, but is useless (due to lack of matching edges in the graph) when traversing the backward graph; conversely ``snp:ori`` is useful when traversing the backward graph, but not in the forward one. For the same reason ``dir:dir`` allows following edges from parent directories to sub-directories when traversing the forward graph, but the same restriction allows following edges from sub-directories to parent directories when traversing the backward graph. - **Node restrictions**: a textual specification of which type of nodes can be returned after a request. Either ``*`` to denote that all types of nodes can be returned or a comma separated list of node types to allow returning only those node types. Examples ~~~~~~~~ - ``swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2`` the SWHID of a node of type content containing the full text of the GPL3 license. - ``swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35`` the SWHID of a node of type revision corresponding to the commit in Linux that merged the 'x86/urgent' branch on 31 December 2017. - ``"dir:dir,dir:cnt"`` edge types allowing edges from directories to directories nodes, or directories to contents nodes. - ``"rev:rev,dir:*"`` edge types allowing edges from revisions to revisions nodes, or any edges from directories nodes. - ``"*:rel"`` edge types allowing all edges to releases. - ``"cnt,snp"`` accepted node types returned in the query results. +Endpoints +--------- + Leaves ------- +~~~~~~ .. http:get:: /graph/leaves/:src Performs a graph traversal and returns the leaves of the subgraph rooted at the specified source node. :param string src: source node specified as a SWHID :query string edges: edges types the traversal can follow; default to ``"*"`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :query integer max_edges: how many edges can be traversed during the visit; default to 0 (not restricted) :query string return_types: only return the nodes matching this type; default to ``"*"`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found **Example:** .. sourcecode:: http GET /graph/leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323 HTTP/1.1 Content-Type: text/plain Transfer-Encoding: chunked .. sourcecode:: http HTTP/1.1 200 OK swh:1:cnt:540faad6b1e02e2db4f349a4845192db521ff2bd swh:1:cnt:630585fc6d34e5e121139e2aee0a64e83dc9aae6 swh:1:cnt:f8634ced669f0a9155c8cab1b2621d57d778215e swh:1:cnt:ba6daa801ad3ea587904b1abe9161dceedb2e0bd ... Neighbors ---------- +~~~~~~~~~ .. http:get:: /graph/neighbors/:src Returns the direct neighbors of a node (linked with exactly one edge) in the graph. :param string src: source node specified as a SWHID :query string edges: edges types allowed to be listed as neighbors; default to ``"*"`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :query integer max_edges: how many edges can be traversed during the visit; default to 0 (not restricted) :query string return_types: only return the nodes matching this type; default to ``"*"`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found **Example:** .. sourcecode:: http GET /graph/neighbors/swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35 HTTP/1.1 Content-Type: text/plain Transfer-Encoding: chunked ..
sourcecode:: http HTTP/1.1 200 OK swh:1:rev:a31e58e129f73ab5b04016330b13ed51fde7a961 swh:1:dir:b5d2aa0746b70300ebbca82a8132af386cc5986d swh:1:rev:52c90f2d32bfa7d6eccd66a56c44ace1f78fbadd ... Walk ----- +~~~~ .. .. http:get:: /graph/walk/:src/:dst Performs a graph traversal and returns the first found path from source to destination (final destination node included). :param string src: starting node specified as a SWHID :param string dst: destination node, either as a node SWHID or a node type. The traversal will stop at the first node encountered matching the desired destination. :query string edges: edges types the traversal can follow; default to ``"*"`` :query string traversal: traversal algorithm; can be either ``dfs`` or ``bfs``, default to ``dfs`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :query string return_types: types of nodes we want to be displayed; default to ``"*"`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found **Example:** .. sourcecode:: http HTTP/1.1 200 OK swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35 swh:1:rev:52c90f2d32bfa7d6eccd66a56c44ace1f78fbadd swh:1:rev:cea92e843e40452c08ba313abc39f59efbb4c29c swh:1:rev:8d517bdfb57154b8a11d7f1682ecc0f79abf8e02 ... .. http:get:: /graph/randomwalk/:src/:dst Performs a graph *random* traversal, i.e., picking one random successor node at each hop, from source to destination (final destination node included). :param string src: starting node specified as a SWHID :param string dst: destination node, either as a node SWHID or a node type. The traversal will stop at the first node encountered matching the desired destination. :query string edges: edges types the traversal can follow; default to ``"*"`` :query string direction: direction in which graph edges will be followed; can be either ``forward`` or ``backward``, default to ``forward`` :query int limit: limit the number of nodes returned. You can use positive numbers to get the first N results, or negative numbers to get the last N results starting from the tail; default to ``0``, meaning no limit. :query integer max_edges: how many edges can be traversed during the visit; default to 0 (not restricted) :query string return_types: only return the nodes matching this type; default to ``"*"`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found **Example:** .. sourcecode:: http GET /graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward HTTP/1.1 Content-Type: text/plain Transfer-Encoding: chunked .. sourcecode:: http HTTP/1.1 200 OK swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2 swh:1:dir:8de8a8823a0780524529c94464ee6ef60b98e2ed swh:1:dir:7146ea6cbd5ffbfec58cc8df5e0552da45e69cb7 swh:1:rev:b12563e00026b48b817fd3532fc3df2db2a0f460 swh:1:rev:13e8ebe80fb878bade776131e738d5772aa0ad1b swh:1:rev:cb39b849f167c70c1f86d4356f02d1285d49ee13 ... swh:1:rev:ff70949f336593d6c59b18e4989edf24d7f0f254 swh:1:snp:a511810642b7795e725033febdd82075064ed863 swh:1:ori:98aa0e71f5c789b12673717a97f6e9fa20aa1161 **Limit example:** .. sourcecode:: http GET /graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward&limit=-2 HTTP/1.1 Content-Type: text/plain Transfer-Encoding: chunked .. 
sourcecode:: http HTTP/1.1 200 OK swh:1:ori:98aa0e71f5c789b12673717a97f6e9fa20aa1161 swh:1:snp:a511810642b7795e725033febdd82075064ed863 Visit ------ +~~~~~ .. http:get:: /graph/visit/nodes/:src .. http:get:: /graph/visit/edges/:src .. http:get:: /graph/visit/paths/:src Performs a graph traversal and returns explored nodes, edges or paths (in the order of the traversal). :param string src: starting node specified as a SWHID :query string edges: edges types the traversal can follow; default to ``"*"`` :query integer max_edges: how many edges can be traversed during the visit; default to 0 (not restricted) :query string return_types: only return the nodes matching this type; default to ``"*"`` :statuscode 200: success :statuscode 400: invalid query string provided :statuscode 404: starting node cannot be found **Example:** .. sourcecode:: http GET /graph/visit/nodes/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc HTTP/1.1 Content-Type: text/plain Transfer-Encoding: chunked .. sourcecode:: http HTTP/1.1 200 OK swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:cfab784723a6c2d33468c9ed8a566fd5e2abd8c9 swh:1:rev:53e5df0e7a6b7bd4919074c081a173655c0da164 swh:1:rev:f85647f14b8243532283eff3e08f4ee96c35945f swh:1:rev:fe5f9ef854715fc59b9ec22f9878f11498cfcdbf swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb swh:1:cnt:c8cece50beae7a954f4ea27e3ae7bf941dc6d0c0 swh:1:dir:a358d0cf89821227d4c00b0ced5e0a8b3756b5db swh:1:cnt:cc407b7e24dd300d2e1a77d8f04af89b3f962a51 swh:1:cnt:701bd0a63e11b3390a547ce8515d28c6bab8a201 ... **Example:** .. sourcecode:: http GET /graph/visit/edges/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc HTTP/1.1 Content-Type: text/plain Transfer-Encoding: chunked .. sourcecode:: http HTTP/1.1 200 OK swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:61f92a7db95f5a6d1fcb94d2b897ed3797584d7b swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:00e81c89c29ff3e58745fdaf7abb68daa1389e85 swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:7596fdc31c9aa00aed281ccb026a74cabf2383bb swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:ec7a2341ac3d9d8b571bbdfb90a089d4e54dea56 swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:1c5b5eac61eda2454034a43eb124ab490885ef3a swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:4dfa88ca55e04e8afe05e8543ddddee32dde7236 swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:d56ae79e43ff1b37534370911c8a78ec7f38d437 swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:19ba5d6203a040a39ecc4a77b165d3f097c1e662 swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:9c56102eefea23c95405533e1de23da4b873ecc4 swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:3f54e816b46c2e179cd164e17fea93b3013a9db4 ... **Example:** .. sourcecode:: http GET /graph/visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb HTTP/1.1 Content-Type: application/x-ndjson Transfer-Encoding: chunked .. sourcecode:: http HTTP/1.1 200 OK ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:cnt:acfb7cabd63b368a03a9df87670ece1488c8bce0"] ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:cnt:2a0837708151d76edf28fdbb90dc3eabc676cff3"] ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:cnt:eaf025ad54b94b2fdda26af75594cfae3491ec75"] ... ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:dir:2ebd4b96fa5665ff74f2b27ae41aecdc43af4463", "swh:1:cnt:1d3b6575fb7bf2a147d228e78ffd77ea193c3639"] ... 
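+As an illustration, these streaming endpoints can be consumed from any HTTP
+client that supports chunked responses. The following minimal sketch is not
+part of ``swh-graph`` itself: the server address and the use of the Python
+``requests`` library are assumptions to adapt to your local setup. It streams
+the results of a ``visit/nodes`` traversal line by line:
+
+.. code-block:: python
+
+   import requests
+
+   GRAPH_API = "http://localhost:5009/graph"  # assumed local server address
+
+   def visit_nodes(src, edges="*", direction="forward"):
+       """Yield SWHIDs reachable from `src`, streamed as the server sends them."""
+       url = f"{GRAPH_API}/visit/nodes/{src}"
+       params = {"edges": edges, "direction": direction}
+       with requests.get(url, params=params, stream=True) as response:
+           response.raise_for_status()
+           for line in response.iter_lines():
+               if line:
+                   yield line.decode()
+
+   for swhid in visit_nodes("swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc"):
+       print(swhid)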
Counting results ----------------- +~~~~~~~~~~~~~~~~ The following method variants, with trailing `/count` added, behave like their already discussed counterparts but, instead of returning results, return the *amount* of results that would have been returned: .. http:get:: /graph/leaves/count/:src Return the amount of :http:get:`/graph/leaves/:src` results .. http:get:: /graph/neighbors/count/:src Return the amount of :http:get:`/graph/neighbors/:src` results .. http:get:: /graph/visit/nodes/count/:src Return the amount of :http:get:`/graph/visit/nodes/:src` results Stats ------ +~~~~~ .. http:get:: /graph/stats Returns statistics on the compressed graph. :statuscode 200: success **Example** .. sourcecode:: http GET /graph/stats HTTP/1.1 Content-Type: application/json .. sourcecode:: http HTTP/1.1 200 OK { "counts": { "nodes": 16222788, "edges": 9907464 }, "ratios": { "compression": 0.367, "bits_per_node": 5.846, "bits_per_edge": 9.573, "avg_locality": 270.369 }, "indegree": { "min": 0, "max": 12382, "avg": 0.6107127825377487 }, "outdegree": { "min": 0, "max": 1, "avg": 0.6107127825377487 } } + + +Use-case examples +----------------- + +This section showcases how to leverage the endpoints of the HTTP API described +above for some common use-cases. + + +Browsing +~~~~~~~~ + +The following use cases require traversing the *forward graph*. + +- **ls**: given a directory node, list (non recursively) all linked nodes of + type directory and content + + Endpoint:: + + /graph/neighbors/:DIR_ID?edges=dir:cnt,dir:dir + +- **ls -R**: given a directory node, recursively list all linked nodes of type + directory and content + + Endpoint:: + + /graph/visit/paths/:DIR_ID?edges=dir:cnt,dir:dir + +- **git log**: given a revision node, recursively list all linked nodes of type + revision + + Endpoint:: + + /graph/visit/nodes/:REV_ID?edges=rev:rev + + +Vault +~~~~~ + +The following use cases require traversing the *forward graph*. + +- **tarball** (same as *ls -R* above) + +- **git bundle**: given a node, recursively list all linked nodes of any kind + + Endpoint:: + + /graph/visit/nodes/:NODE_ID?edges=* + + +Provenance +~~~~~~~~~~ + +The following use cases require traversing the *backward (transposed) +graph*. + +- **commit provenance**: given a content or directory node, return *a* commit + whose directory (recursively) contains it + + Endpoint:: + + /graph/walk/:NODE_ID/rev?direction=backward&edges=dir:dir,cnt:dir,dir:rev + +- **complete commit provenance**: given a content or directory node, return + *all* commits whose directory (recursively) contains it + + Endpoint:: + + /graph/leaves/:NODE_ID?direction=backward&edges=dir:dir,cnt:dir,dir:rev + +- **origin provenance**: given a content, directory, or commit node, return + *an* origin that has at least one snapshot that (recursively) contains it + + Endpoint:: + + /graph/walk/:NODE_ID/ori?direction=backward&edges=* + +- **complete origin provenance**: given a content, directory, or commit node, + return *all* origins that have at least one snapshot that (recursively) + contains it + + Endpoint:: + + /graph/leaves/:NODE_ID?direction=backward&edges=* + + +Provenance statistics +~~~~~~~~~~~~~~~~~~~~~ + +The following use cases require traversing the *backward (transposed) +graph*. + +- **content popularity across commits**: count the number of commits (or + *commit popularity*) that link to a directory that (recursively) includes a + given content. 
+ + Endpoint:: + + /graph/count/leaves/:NODE_ID?direction=backward&edges=cnt:dir,dir:dir,dir:rev + +- **commit popularity across origins**: count the number of origins (or *origin + popularity*) that have a snapshot that (recursively) includes a given commit. + + Endpoint:: + + /graph/count/leaves/:NODE_ID?direction=backward&edges=* + +The following use cases require traversing the *forward graph*. + +- **revision size** (as n. of contents) distribution: the number of contents + that are (recursively) reachable from a given revision. + + Endpoint:: + + /graph/count/leaves/:NODE_ID?edges=* + +- **origin size** (as n. of revisions) distribution: count the number of + revisions that are (recursively) reachable from a given origin. + + Endpoint:: + + /graph/count/leaves/:NODE_ID?edges=ori:snp,snp:rel,snp:rev,rel:rev,rev:rev diff --git a/docs/compression.rst b/docs/compression.rst index edca8a7..bfd6c9e 100644 --- a/docs/compression.rst +++ b/docs/compression.rst @@ -1,125 +1,611 @@ .. _graph-compression: +================= Graph compression ================= -The compression process is a pipeline implemented for the most part on top of -the `WebGraph framework `_ and ecosystem -libraries. The compression pipeline consists of the following steps: +The compression pipeline is implemented on top of the `WebGraph framework +`_. It takes an ORC Graph Dataset as input, +such as the ones found in the :ref:`Graph Dataset List `, +and generates a compressed graph suitable for high-intensity analyses on +large servers. -.. figure:: images/compression_steps.png - :align: center - :alt: Compression steps - Compression steps +Running the compression pipeline +================================ -Each of these steps is briefly described below. For more details see the -following paper: +Dependencies +------------ + +To compress a graph, you will need to install the ``swh.graph`` tool as well as +a recent JRE, as described in the :ref:`swh-graph-quickstart` page. + +You will also need the zstd_ compression tool:: + + $ sudo apt install zstd + +.. _zstd: https://facebook.github.io/zstd/ + + +Hardware Requirements +--------------------- + +The compression pipeline is even more demanding than the graph server in terms +of hardware requirements, especially RAM. Notably, the BFS compression step +loads a graph compressed in random order in memory, which is usually more than +a TiB for the full graph. While it is possible to do this step with a memory +mapping, our experiments show that this could take a very long time (several +months) on hard drives. + +The LLP compression step requires 13 bytes of RAM per node, which could amount +to storing hundreds of gigabytes in RAM in addition to loading the graph +itself. + +Some steps also involve sorting the entire set of edges and their labels, by +using large on-disk buffer files, sometimes reaching the size of the input +dataset itself. + +The machine we used to compress the entire graph (dataset version 2022-04-25) +has the following hardware specs: + +- 2 TiB of RAM (DDR4 ECC 2400MHz) +- 64 vCPUs (Dual AMD EPYC 7302 16-Core) +- 24 TiB of SSD (NVMe) + +The server we rented is from the +`HGR-HCI-4 `_ +series from OVH. + + +Input dataset +------------- + +First, you need to retrieve a graph to compress, in ORC format. The :ref:`Graph +Dataset List ` has a list of datasets made available by the +Software Heritage archive, including "teaser" subdatasets which have a more +manageable size and are thus very useful for prototyping with fewer hardware +resources.
+ +The datasets can be retrieved from S3 or the annex, in a similar fashion to +what is described in :ref:`swh-graph-retrieving-compressed`, by simply +replacing "compressed" by "orc": + +.. code:: console + + (venv) $ mkdir -p 2021-03-23-popular-3k-python/orc + (venv) $ cd 2021-03-23-popular-3k-python/ + (venv) $ aws s3 cp --recursive s3://softwareheritage/graph/2021-03-23-popular-3k-python/orc/ orc + +Alternatively, any custom ORC dataset can be used as long as it respects :ref:`the schema ` of the Software Heritage Graph Dataset. + +**Note:** for testing purposes, a fake test dataset is available in the ``swh-graph`` repository, with just a few dozen nodes. The ORC tables are available in ``swh-graph/swh/graph/tests/dataset/orc/``. + + +Compression ----------- + +You can compress your dataset by using the ``swh graph compress`` command. It will run all the various steps of the pipeline in the right order. + +.. code:: console + + + (venv) $ swh graph compress --input-dataset orc/ --outdir compressed/ + [...] + (venv) $ ls compressed/ + graph.edges.count.txt + graph.edges.stats.txt + graph.graph + graph.indegree + graph-labelled.labeloffsets + graph-labelled.labels + [...] + graph-transposed.obl + graph-transposed.offsets + graph-transposed.properties -.. note:: - Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli. - `Ultra-Large-Scale Repository Analysis via Graph Compression `_. In - proceedings of `SANER 2020 `_: The 27th IEEE - International Conference on Software Analysis, Evolution and - Reengineering. IEEE 2020. +(The purpose of each of these files is detailed in the +:ref:`swh-graph-java-api` page.) - Links: `preprint - `_, - `bibtex - `_. +For sufficiently large graphs, this command can take several weeks. It is highly +recommended to run it in a systemd service or in a tmux session. -In order to practically perform graph compression, install the ``swh.graph`` -module and use the ``swh graph compress`` command line interface of the -compression driver, that will conduct the various steps in the right order. -See ``swh graph compress --help`` for usage details. +It is also possible to run single steps or step ranges from the CLI: +.. code:: bash - swh graph compress -i orc/ -o compressed/ --steps mph-bfs + +See ``swh graph compress --help`` for syntax and usage details. + + +Compression steps ================= + +The compression pipeline consists of the following steps: + +.. figure:: images/compression_steps.png + :align: center + :alt: Compression steps + :scale: 20% + + Compression steps + +Each of these steps is briefly described below. For more details see the original Software Heritage graph compression paper [SWHGraphCompression2020]_, as well as chapters 9 and 10 of Antoine Pietri's PhD thesis [PietriThesis2021]_. + +.. [SWHGraphCompression2020] + | Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli. + | `Ultra-Large-Scale Repository Analysis via Graph Compression `_. + | In proceedings of `SANER 2020 `_: The 27th IEEE International Conference on Software Analysis, Evolution and Reengineering. IEEE 2020. + | Links: `preprint `_, `bibtex `_. + + + +.. [PietriThesis2021] + | Antoine Pietri + | `Organizing the graph of public software development for large-scale mining `_. + | Doctoral dissertation. Inria, 2021. + + +1. 
EXTRACT_NODES +---------------- + +This step reads a graph dataset and extracts all the unique node SWHIDs it +contains, including the ones that are not stored as actual objects in the +graph, but only *referred to* by the edges. Additionally, it extracts the set +of all unique edge labels in the graph. + +**Rationale:** Because the graph can contain holes, loose objects and dangling +objects, some nodes that are referred to as destinations in the edge +relationships might not actually be stored in the graph itself. However, to +compress the graph using a graph compression library, it is necessary to have a +list of *all* the nodes in the graph, including the ones that are simply +referred to by the edges but not actually stored as concrete objects. + +This step reads the entire graph dataset, and uses ``sort -u`` to extract the +set of all the unique nodes and unique labels that will be needed as an input +for the compression process. It also writes object count statistics in various +files: + +- The set of nodes is written in ``graph.nodes.csv.zst``, as a zst-compressed + sorted list of SWHIDs, one per line. +- The set of edge labels is written in ``graph.labels.csv.zst``, as a + zst-compressed sorted list of labels encoded in base64, one per line. +- The number of unique nodes referred to in the graph is written in a text + file, ``graph.nodes.count.txt`` +- The number of unique edges referred to in the graph is written in a text + file, ``graph.edges.count.txt`` +- The number of unique edge labels is written in a text file, + ``graph.labels.count.txt`` +- Statistics on the number of nodes of each type are written in a text file, + ``graph.nodes.stats.txt`` +- Statistics on the number of edges of each type are written in a text file, + ``graph.edges.stats.txt`` + + +2. MPH ------ -A node in the Software Heritage :ref:`data model ` is identified -using its SWHID (see :ref:`persistent identifiers -`). However, WebGraph internally uses integers to refer -to node ids. +As discussed in :ref:`swh-graph-java-basics`, a node in the Software Heritage +:ref:`data model ` is identified by its SWHID (see :ref:`persistent +identifiers `), but WebGraph internally uses integers +to refer to node ids. -Mapping between the strings and longs ids is needed before compressing the -graph. From the `Sux4J `_ utility tool, we use the +To create a mapping between integer node IDs and SWHIDs, we use the `GOVMinimalPerfectHashFunction `_ -class, mapping with no collisions N keys to N consecutive integers. +class of the `Sux4J `_ library, which maps N keys to N +consecutive integers. + +We run this function on the list of SWHIDs stored in the +``graph.nodes.csv.zst`` file generated in the previous step. +This allows us to generate a bijection from the set of all the *n* SWHIDs in the +graph to the set of integers :math:`[0, n - 1]`. -The step produces a ``.mph`` file (MPH stands for *Minimal Perfect -Hash-function*) storing the hash function taking as input a string and returning -a unique integer. +The step produces a ``graph.mph`` file (MPH stands for *Minimal Perfect +Hash-function*), containing a function which takes a SWHID (as a bytestring) +and returns its associated node ID. -2. BV compress +3. BV compress -------------- -This is the first actual compression step, building a compressed version of the -input graph using WebGraph techniques presented in the framework paper.
We use -the `ScatteredArcsASCIIGraph `_ -class, from WebGraph. +This is the first actual compression step, where we build a compressed version +of the input graph dataset. + +We use a ScatteredArcsORCGraph to load the dataset +(implementation inspired by the `ScatteredArcsASCIIGraph `_ +class in WebGraph). +This class wraps the ORC Graph dataset and exposes a *virtual* ImmutableGraph, +whose nodes and edges can be iterated sequentially as if it was any other +standard graph. To do so, it puts all the edges in batches and sorts them in an +aggressively parallel fashion, then stores them as ``.bitstream`` files, and +returns a `BatchGraph +` +created from these batches. + +Finally, it uses the ``BVGraph.store()`` method, which compresses the input +graph as a `BVGraph +`_, +using the compression techniques described in the article *The WebGraph +Framework I: Compression Techniques* cited above. The resulting BV graph is stored as a set of files: -- ``.graph``: the compressed graph in the BV format -- ``.offsets``: offsets values to read the bit stream graph file -- ``.obl``: offsets cache to load the graph faster -- ``.properties``: entries used to correctly decode graph and offset files +- ``graph-base.graph``: the compressed graph in the BV format +- ``graph-base.offsets``: offsets values to read the bit stream graph file +- ``graph-base.properties``: entries used to correctly decode graph and offset + files -3. BFS ------- +4. BFS ------ + +In [LLP]_, the paper authors empirically demonstrate that a high graph +compression ratio can be achieved for the graph of the Web by ordering nodes +such that vertices from the same host are close to each other. -In the LLP paper, authors propose an empirical analysis linking node ordering -and high compression ratio: it is important to use an ordering of nodes ids such -that vertices from the same host are close to one another. +In Software Heritage, there is no notion of "host" that can be used to generate +these compression-friendly orderings, because the identifiers are just +uniformly random cryptographic hashes. However, we can generate these orderings +by running algorithms to inform us on which nodes are close to each other. -Building on this insight, the previous compression results in the BV compress -step are improved by re-ordering nodes ids using a BFS traversal order. We use -the `BFS `_ class from the `LAW `_ library. -The resulting ordering is stored in the ``.order`` file, listing nodes ids in -order of traversal. +In this step, we run a BFS traversal on the entire graph to get a more +compression-friendly ordering of nodes. We use the `BFS `_ class from the `LAW `_ library. + +The resulting ordering is stored in a ``graph-bfs.order`` file, which contains +all the node IDs in the order of traversal. -4. Permute ---------- +5. PERMUTE_BFS +-------------- +Once the BFS order is computed, we permute the initial "base" graph using +this new ordering. The permutation uses the `Transform `_ class from WebGraph framework. -Once the order is computed (BFS or another ordering technique), the final -compressed graph is created based on the initial BV compress result, and using -the new node order mapping. -The final compressed graph is only stored in the resulting ``.graph``, -``.offsets``, ``.obl``, and ``.properties`` files. +The BFS-compressed graph is stored in the files +``graph-bfs.{graph,offsets,properties}``. +6. TRANSPOSE_BFS +---------------- -5. 
Stats -------- +We transpose the BFS-compressed graph, using the `Transform `_ class from WebGraph. +This step is a prerequisite for LLP compression. + +7. SIMPLIFY +----------- + +This step creates a loopless and symmetric version of the BFS-compressed graph, using the `Transform `_ class from WebGraph. +This step is a prerequisite for LLP compression. + +8. LLP +------ + +Better compression ratios can be achieved by using the Layered Label Propagation +(LLP) algorithm to reorder nodes. This algorithm is described in [LLP]_. +The LLP algorithm finds locality-preserving orders by clustering together nodes +in close proximity. Similar to the BFS, this algorithm is particularly +interesting for our use case as it is unsupervised, and does not rely on prior +information on the clusters present in the graph. The idea behind the +clustering algorithm is to randomly distribute communities to the nodes in the +graph, then iteratively assign to each node the community most represented in +its neighbors. + +.. [LLP] Paolo Boldi, Marco Rosa, Massimo Santini, Sebastiano Vigna. + *Layered label propagation: a multiresolution coordinate-free ordering for compressing social networks.* + WWW 2011: 587-596 + DOI: https://doi.org/10.1145/1963405.1963488 + preprint: https://arxiv.org/abs/1011.5425 + +LLP is more costly than simple BFS-based compression in both time and memory. +Even though the algorithm has a linear time complexity, it does multiple +iterations on the graph and is significantly slower than the BFS which is just +one single traversal. Moreover, keeping track of the communities requires a +total of 13 bytes per node, which increases the RAM requirements. +Because of these constraints, it is unrealistic to run the LLP algorithm on the +uncompressed version of the graph; this is why we do an intermediate +compression with the BFS ordering first, then compress the entire graph *again* +with an even better ordering. + +The LLP algorithm takes a simplified (loopless, symmetric) graph as an input, +which we already computed in the previous steps. + +The algorithm is also parameterized by a list of γ values, a "resolution" parameter +which defines the shapes of the clustering it produces: either small, but +denser pieces, or larger, but unavoidably sparser pieces. The algorithm then +combines the different clusterings together to generate the output reordering. +γ values are given to the algorithm in the form :math:`\frac{j}{2^k}`; by +default, 12 different values of γ are used. However, the combination procedure +is very slow, and using that many γ values could take several months in our +case. +We thus narrowed down a smaller set of γ values that empirically give good +compression results, which are used by default in the pipeline. In general, +smaller values of γ seem to generate better compression ratios. The effect of a +given γ is that the density of the sparsest cluster is at least +:math:`\frac{\gamma}{\gamma+1}`, so large +γ values imply smaller, denser clusters. It is reasonable to assume that +since the graph is very sparse to start with, such clusters are not that +useful. + +The resulting ordering is stored in a ``graph-llp.order`` file. + +9. PERMUTE_LLP +-------------- + +Once the LLP order is computed, we permute the BFS-compressed graph using +this new ordering. The LLP-compressed graph, which is our final compressed +graph, is stored in the files ``graph.{graph,offsets,properties}``. + +10. OBL ------- + +Cache the BVGraph offsets of the forward graph to make loading faster.
The +resulting offset big list is stored in the ``graph.obl`` file. + +11. COMPOSE_ORDERS +------------------ + +To be able to translate the initial MPH inputs to their resulting rank in the +LLP-compressed graph, we need to use the two order permutations successively: +the base → BFS permutation, then the BFS → LLP permutation. + +To make this less wasteful, we *compose* the two permutations into a single +one. We use the `composePermutationsInPlace +`_ +function of the dsiutils library. The resulting permutation is stored as a +``graph.order`` file. Hashing a SWHID with the ``graph.mph`` function, then +permuting the result using the ``graph.order`` permutation yields the integer +node ID matching the input SWHID in the graph. -Compute various statistics on the final compressed graph: +12. STATS --------- -- ``.stats``: entries such as number of nodes, edges, avg/min/max degree, +This step computes various statistics on the compressed graph: + +- ``.stats``: statistics such as number of nodes, edges, avg/min/max degree, average locality, etc. - ``.indegree``: graph indegree distribution - ``.outdegree``: graph outdegree distribution This step uses the `Stats `_ class from WebGraph. -6. Transpose ------------ +13. TRANSPOSE ------------- -Transpose the graph to allow backward traversal, using the `Transform `_ class from WebGraph. The resulting transposed graph is stored as the ``graph-transposed.{graph,offsets,properties}`` files. + + +14. TRANSPOSE_OBL +----------------- + +Same as OBL, but for the transposed graph. The resulting offset big list is stored in the ``graph-transposed.obl`` file. + + +15. MAPS +-------- + +This step generates the *node mappings* described in :ref:`swh-graph-java-node-mappings`. In particular, it generates: + +- ``graph.node2swhid.bin``: a compact binary representation of all the SWHIDs in the graph, ordered by their rank in the graph file. +- ``graph.node2type.bin``: a `LongBigArrayBitVector `_ which stores the type of each node. + +It does so by reading all the SWHIDs in the ``graph.nodes.csv.zst`` file generated in the EXTRACT_NODES step, then getting their corresponding node IDs (using the ``.mph`` and ``.order`` files), then sorting all the SWHIDs according to their node ID. It then writes these SWHIDs in order, in a compact but seekable binary format, which can be used to return the SWHID corresponding to any given node in O(1). + + +16. EXTRACT_PERSONS +------------------- + +This step reads the ORC graph dataset and extracts all the unique persons it contains. Here, "persons" are defined as the set of unique pairs of name + email, potentially pseudonymized, found either as revision authors, revision committers or release authors. + +The ExtractPersons class reads all the persons from revision and release tables, then uses ``sort -u`` to get a sorted list without any duplicates. The resulting sorted list of authors is stored in the ``graph.persons.csv.zst`` file. + + +17. MPH_PERSONS +--------------- + +This step computes a Minimal Perfect Hash function on the set of all the unique persons extracted in the EXTRACT_PERSONS step. Each individual person is mapped to a unique integer in :math:`[0, n-1]` where *n* is the total number of persons. The resulting function is serialized and stored in the ``graph.persons.mph`` file. + + +18. 
NODE_PROPERTIES +------------------- + +This step generates the *node property files*, as described in +:ref:`swh-graph-java-node-properties`. +The nodes in the Software Heritage Graph each have associated *properties* +(e.g., commit timestamps, authors, messages, ...). The values of these +properties for each node in the graph are compressed and stored in files +alongside the compressed graph. + +The WriteNodeProperties class reads all the properties from the ORC Graph +Dataset, then serializes each of them in a representation suitable for +efficient random access (e.g., large binary arrays) and stores them on disk. + +For persons (authors, committers etc), the MPH computed in the MPH_PERSONS step +is used to store them as a single pseudonymized integer ID, which uniquely +represents a full name + email. + +The results are stored in the following list of files: + +- ``graph.property.author_id.bin`` +- ``graph.property.author_timestamp.bin`` +- ``graph.property.author_timestamp_offset.bin`` +- ``graph.property.committer_id.bin`` +- ``graph.property.committer_timestamp.bin`` +- ``graph.property.committer_timestamp_offset.bin`` +- ``graph.property.content.is_skipped.bin`` +- ``graph.property.content.length.bin`` +- ``graph.property.message.bin`` +- ``graph.property.message.offset.bin`` +- ``graph.property.tag_name.bin`` +- ``graph.property.tag_name.offset.bin`` + + +19. MPH_LABELS +-------------- + +This step computes a **monotone** Minimal Perfect Hash function on the set of +all the unique *arc label names* extracted in the EXTRACT_NODES step. Each +individual arc label name (i.e., directory entry names and snapshot branch +names) is monotonely mapped to a unique integer in :math:`[0, n-1]`, where *n* +is the total number of unique arc label names, which corresponds to their +**lexical rank** in the set of all arc labels. + +In other words, this MPH being monotone means that the hash of the *k*-th item +in the sorted input list of arc labels will always be *k*. +We use the `LcpMonotoneMinimalPerfectHashFunction +`_ +of Sux4J to generate this function. + +The rationale for using a monotone function here is that it will allow us to +quickly get back the arc label from its hash without having to store an +additional permutation. +The resulting MPH function is serialized and stored in the ``graph.labels.mph`` +file. + + +20. FCL_LABELS +-------------- + +This step computes a *reverse-mapping* for arc labels, i.e., a way to +efficiently get the arc label name from its hash computed with the monotone MPH +of the MPH_LABELS step. + +Thanks to the MPH being monotone, this boils down to storing all the labels in +lexicographic order in a string list format that allows O(1) access to its +elements. For this purpose, we use the `MappedFrontCodedStringBigList +`_ +class from the dsiutils library, using the ``graph.labels.csv.zst`` file as its +input. It stores the label names in a compact way by using front-coding +compression, which is particularly efficient here because the strings are +already in lexicographic order. The resulting FCL files are stored as +``graph.labels.fcl.*``, and they can be loaded using memory mapping. + + +21. EDGE_LABELS +--------------- + + +This step generates the *edge property files*, as described in +:ref:`swh-graph-java-edge-properties`. These files allow us to get the *edge +labels* as we iterate on the edges of the graph. The files essentially contain +compressed sorted triplets of the form (source, destination, label), with +additional offsets to allow random access. 
+ +To generate these files, the LabelMapBuilder class starts by reading in +parallel the labelled edges in the ORC dataset, which can be thought of as +quadruplets containing the source SWHID, the destination SWHID, the label name +and the entry permission if applicable: + +.. code-block:: text + + swh:1:snp:4548a5… swh:1:rev:0d6834… cmVmcy9oZWFkcy9tYXN0ZXI= + swh:1:dir:05faa1… swh:1:cnt:a35136… dGVzdC5j 33188 + swh:1:dir:05faa1… swh:1:dir:d0ff82… dGVzdA== 16384 + ... + +Using the ``graph.mph`` and the ``graph.order`` files, we hash and permute the +source and destination nodes. We also monotonically hash the labels using the +``graph.labels.mph`` function to obtain the arc label identifiers. The +permissions are normalized as one of the 6 possible values in the +``DirEntry.Permission.Type`` enum, and are then stored in the 3 lowest bits of +the label field. + +.. code-block:: text + + 4421 14773 154 + 1877 21441 1134 + 1877 14143 1141 + ... + +These hashed edges and their compact-form labels are then put in large batches +sorted in an aggressively parallel fashion, which are then stored as +``.bitstream`` files. These batch files are put in a heap structure to perform +a merge sort on the fly on all the batches. + +Then, the LabelMapBuilder loads the graph and starts iterating on its edges. It +synchronizes the stream of edges read from the graph with the stream of sorted +edges and labels read from the bitstreams in the heap. At this point, it writes +the labels to the following output files: + +- ``graph-labelled.properties``: a property file describing the graph, notably + containing the basename of the wrapped graph. +- ``graph-labelled.labels``: the compressed labels +- ``graph-labelled.labeloffsets``: the offsets used to access the labels in + random order. + +It then does the same with backward edge batches to get the transposed +equivalent of these files: +``graph-transposed-labelled.{properties,labels,labeloffsets}``. + + +22. EDGE_LABELS_OBL +------------------- + +Cache the label offsets of the forward labelled graph to make loading faster. +The resulting label offset big list is stored in the +``graph-labelled.labelobl`` file. + + +23. EDGE_LABELS_TRANSPOSE_OBL +----------------------------- + +Same as EDGE_LABELS_OBL, but for the transposed labelled graph. +The resulting label offset big list is stored in the +``graph-transposed-labelled.labelobl`` file. + + +24. CLEAN_TMP +------------- + +This step reclaims space by deleting the temporary directory, as well as all +the intermediate outputs that are no longer necessary now that the final graph +has been compressed (shown in gray in the step diagram). diff --git a/docs/grpc-api.rst b/docs/grpc-api.rst new file mode 100644 index 0000000..70c197d --- /dev/null +++ b/docs/grpc-api.rst @@ -0,0 +1,556 @@ +.. _swh-graph-grpc-api: + +================== +Using the GRPC API +================== + +The GRPC API is the core API used to query the graph remotely. It uses the +`GRPC framework `_ to provide high-performance graph +traversal methods with server streaming. + +It is more expressive than the :ref:`HTTP API ` (which itself +uses the GRPC API under the hood to serve queries); however, it can only be +used internally or with a local setup, and is never exposed publicly. + +Its major features include: returning node and edge properties, performing BFS +traversals, including traversals with more than one starting node, finding +shortest paths, common ancestors, etc.
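+The RPC interface is defined by the protobuf files under ``swh/graph/rpc/``.
+Python stubs can be regenerated from them with the repository's ``protoc``
+Makefile target; a sketch of invoking it (the ``grpcio-tools`` and
+``mypy-protobuf`` prerequisites are assumptions inferred from the plugins the
+target uses):
+
+.. code-block:: console
+
+   $ pip install grpcio-tools mypy-protobuf  # provides grpc_tools.protoc and --mypy_out
+   $ make protoc  # runs grpc_tools.protoc on swh/graph/rpc/*.proto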
+ +Quickstart +========== + +Starting the server +------------------- + +The GRPC server is automatically started on port 50091 when the HTTP server +is started with ``swh graph rpc-serve``. It can also be started directly with +Java, instead of going through the Python layer, by using the fat-jar shipped +with swh-graph: + +.. code-block:: console + + $ java -cp swh-graph-XXX.jar org.softwareheritage.graph.rpc.GraphServer + +(See :ref:`swh-graph-java-api` and :ref:`swh-graph-memory` for more +information on Java process options and JVM tuning.) + +Running queries +--------------- + +The `gRPC command line tool +`_ +can be an easy way to query the GRPC API from the command line. It is +invoked with the ``grpc_cli`` command. Of course, it is also possible to use +a generated RPC client in any programming language supported by GRPC. + +All RPC methods are defined in the service ``swh.graph.TraversalService``. +The available endpoints can be listed with ``ls``: + +.. code-block:: console + + $ grpc_cli ls localhost:50091 swh.graph.TraversalService + Traverse + FindPathTo + FindPathBetween + CountNodes + CountEdges + Stats + GetNode + +A RPC method can be called with the ``call`` subcommand. + +.. code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.Stats "" + connecting to localhost:50091 + num_nodes: 21 + num_edges: 23 + compression: 1.412 + bits_per_node: 8.524 + [...] + Rpc succeeded with OK status + +The ``--json-output`` flag can also be used to make the results easier to +parse. + +.. code-block:: console + + $ grpc_cli --json_output call localhost:50091 swh.graph.TraversalService.Stats "" + connecting to localhost:50091 + { + "numNodes": "21", + "numEdges": "23", + [...] + } + Rpc succeeded with OK status + + +**Note**: grpc_cli's outputs in this document are slightly modified for +readability's sake. + +Simple queries +============== + +For a full documentation of all the endpoints, as well as the request and +response messages, see :ref:`swh-graph-grpc-api-protobuf`. + +Querying a single node +---------------------- + +The **GetNode** endpoint can be used to return information on a single +node of the graph, including all its node properties, from its SWHID. Here +are a few examples from the test graph: + +Content +~~~~~~~ + +.. code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \ + 'swhid: "swh:1:cnt:0000000000000000000000000000000000000001"' + +.. code-block:: javascript + + swhid: "swh:1:cnt:0000000000000000000000000000000000000001" + cnt { + length: 42 + is_skipped: false + } + +Revision +~~~~~~~~ + +.. code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \ + 'swhid: "swh:1:rev:0000000000000000000000000000000000000009"' + +.. code-block:: javascript + + swhid: "swh:1:rev:0000000000000000000000000000000000000009" + rev { + author: 2 + author_date: 1111140840 + author_date_offset: 120 + committer: 2 + committer_date: 1111151950 + committer_date_offset: 120 + message: "Add parser" + } + +Release +~~~~~~~ + +.. code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \ + 'swhid: "swh:1:rel:0000000000000000000000000000000000000010"' + +.. code-block:: javascript + + swhid: "swh:1:rel:0000000000000000000000000000000000000010" + rel { + author: 0 + author_date: 1234564290 + author_date_offset: 120 + message: "Version 1.0" + } + +Origin +~~~~~~ + +.. 
code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \ + 'swhid: "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054"' + +.. code-block:: javascript + + swhid: "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054" + ori { + url: "https://example.com/swh/graph" + } + + +Checking the presence of a node +------------------------------- + +The **GetNode** endpoint can also be used to check if a node exists in the +graph. If the node does not exist, the RPC will fail with the +``INVALID_ARGUMENT`` code and a detailed error message. + +.. code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \ + 'swhid: "swh:1:ori:ffffffffffffffffffffffffffffffffffffffff"' + Rpc failed with status code 3, error message: Unknown SWHID: swh:1:ori:ffffffffffffffffffffffffffffffffffffffff + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \ + 'swhid: "invalidswhid"' + Rpc failed with status code 3, error message: malformed SWHID: invalidswhid + + +Selecting returned fields with FieldMask +---------------------------------------- + +Many endpoints, including **GetNode**, contain a ``mask`` field of type +`FieldMask +`_, +which can be used to select which fields should be returned in the response. + +This is particularly interesting for traversal queries that return a large +number of nodes, because property access is quite costly from the compressed +graph (at least compared to regular node access). It is therefore recommended +that clients systematically use FieldMasks to only request the properties that +they will consume. + +A FieldMask is represented as a set of "field paths" in dotted notation. For +instance, ``paths: ["swhid", "rev.message"]`` will only request the swhid and +the message of a given node. An empty mask will return an empty object. + +Example: + +.. code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \ + 'swhid: "swh:1:rev:0000000000000000000000000000000000000009", mask: {paths: ["swhid"]}' + swhid: "swh:1:rev:0000000000000000000000000000000000000009" + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \ + 'swhid: "swh:1:rev:0000000000000000000000000000000000000009", mask: {paths: ["swhid", "rev.message", "rev.author"]}' + swhid: "swh:1:rev:0000000000000000000000000000000000000009" + rev { + author: 2 + message: "Add parser" + } + + +Getting statistics on the graph +------------------------------- + +The **Stats** endpoint returns overall statistics on the entire compressed +graph. Most notably, the total number of nodes and edges, as well as the +range of indegrees and outdegrees, and some compression-related statistics. + +.. code-block:: console + + $ grpc_cli --json_output call localhost:50091 swh.graph.TraversalService.Stats "" + +.. code-block:: json + + { + "numNodes": "21", + "numEdges": "23", + "compression": 1.412, + "bitsPerNode": 8.524, + "bitsPerEdge": 7.783, + "avgLocality": 2.522, + "indegreeMax": "3", + "indegreeAvg": 1.0952380952380953, + "outdegreeMax": "3", + "outdegreeAvg": 1.0952380952380953 + } + + +Graph traversals +================ + +Breadth-first traversal +----------------------- + +The **Traverse** endpoint performs a breadth-first traversal from a set of +source nodes, and `streams +`_ all +the nodes it encounters on the way. All the node properties are stored in the +result nodes.
Additionally, the *edge properties* (e.g., directory entry names +and permissions) are stored as a list in the ``successor`` field of each node. + +For instance, here we run a traversal from a directory that contains two +contents: + +.. code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \ + "src: 'swh:1:dir:0000000000000000000000000000000000000006'" + +We get the following stream of nodes: first, the source directory (including +its properties, successor list and their labels), then the contents themselves +and their respective properties. + +.. code-block:: javascript + + swhid: "swh:1:dir:0000000000000000000000000000000000000006" + successor { + swhid: "swh:1:cnt:0000000000000000000000000000000000000005" + label { + name: "parser.c" + permission: 33188 + } + } + successor { + swhid: "swh:1:cnt:0000000000000000000000000000000000000004" + label { + name: "README.md" + permission: 33188 + } + } + num_successors: 2 + +.. code-block:: javascript + + swhid: "swh:1:cnt:0000000000000000000000000000000000000005" + cnt { + length: 1337 + is_skipped: false + } + +.. code-block:: javascript + + swhid: "swh:1:cnt:0000000000000000000000000000000000000004" + cnt { + length: 404 + is_skipped: false + } + +Again, it is possible to use a FieldMask to restrict which fields get returned. +For instance, if we only care about the SWHIDs: + +.. code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \ + "src: 'swh:1:dir:0000000000000000000000000000000000000006', mask: {paths: ['swhid']}" + swhid: "swh:1:dir:0000000000000000000000000000000000000006" + swhid: "swh:1:cnt:0000000000000000000000000000000000000005" + swhid: "swh:1:cnt:0000000000000000000000000000000000000004" + + +Graph direction +~~~~~~~~~~~~~~~ + +For many purposes, especially that of finding the provenance of software +artifacts, it is useful to query the backward (or transposed) graph instead, +which is the same as the forward graph except all the edges are reversed. +To achieve this, the ``direction`` field can be used to specify a direction +from the ``GraphDirection`` enum (either ``FORWARD`` or ``BACKWARD``). + +This query returns all the nodes reachable from a given directory in the +*backward* (or "transposed") graph: + +.. code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \ + "src: 'swh:1:dir:0000000000000000000000000000000000000006', direction: BACKWARD, mask: {paths: ['swhid']}" + swhid: "swh:1:dir:0000000000000000000000000000000000000006" + swhid: "swh:1:dir:0000000000000000000000000000000000000008" + swhid: "swh:1:dir:0000000000000000000000000000000000000012" + swhid: "swh:1:rev:0000000000000000000000000000000000000009" + swhid: "swh:1:rev:0000000000000000000000000000000000000013" + swhid: "swh:1:rel:0000000000000000000000000000000000000010" + swhid: "swh:1:snp:0000000000000000000000000000000000000020" + swhid: "swh:1:rev:0000000000000000000000000000000000000018" + swhid: "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054" + swhid: "swh:1:rel:0000000000000000000000000000000000000019" + + +Edge restrictions +~~~~~~~~~~~~~~~~~ + +To constrain the types of edges that can be followed during the graph +traversal, it is possible to specify an edge restriction string in the ``edge`` +field. It is a comma-separated list of edge types that will be followed (e.g. +``"rev:dir,dir:cnt"`` to only follow revision → directory and directory → +content edges). +By default (or when ``"*"`` is provided), all edges can be followed. 
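+These traversals can also be issued from any language with a generated GRPC
+client. As an illustration, here is a minimal Python sketch that performs the
+same kind of edge-restricted traversal; the stub module and message names are
+assumptions derived from the ``swh/graph/rpc`` protobuf definitions, not a
+verified API:
+
+.. code-block:: python
+
+   import grpc
+   from google.protobuf.field_mask_pb2 import FieldMask
+
+   # Assumed stub modules, generated from swh/graph/rpc/*.proto (e.g. via `make protoc`)
+   from swh.graph.rpc import swhgraph_pb2, swhgraph_pb2_grpc
+
+   def commit_log(swhid, address="localhost:50091"):
+       """Stream the SWHIDs of revisions reachable from `swhid` via rev:rev edges."""
+       with grpc.insecure_channel(address) as channel:
+           stub = swhgraph_pb2_grpc.TraversalServiceStub(channel)
+           request = swhgraph_pb2.TraversalRequest(
+               src=[swhid],
+               edges="rev:rev",
+               mask=FieldMask(paths=["swhid"]),
+           )
+           for node in stub.Traverse(request):  # server-streaming RPC
+               yield node.swhid
+
+   for rev in commit_log("swh:1:rev:0000000000000000000000000000000000000018"):
+       print(rev)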
+
+This query traverses the parent revisions of a given revision only (i.e., it
+outputs the *commit log* from a given commit):
+
+.. code-block:: console
+
+    $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \
+        "src: 'swh:1:rev:0000000000000000000000000000000000000018', edges: 'rev:rev', mask: {paths: ['swhid']}"
+    swhid: "swh:1:rev:0000000000000000000000000000000000000018"
+    swhid: "swh:1:rev:0000000000000000000000000000000000000013"
+    swhid: "swh:1:rev:0000000000000000000000000000000000000009"
+    swhid: "swh:1:rev:0000000000000000000000000000000000000003"
+
+
+Limiting the traversal
+~~~~~~~~~~~~~~~~~~~~~~
+
+To avoid using up too much memory or resources, a traversal can be limited
+in two different ways:
+
+- the ``max_depth`` attribute defines the maximum depth of the traversal.
+- the ``max_edges`` attribute defines the maximum number of edges that can be
+  fetched by the traversal.
+
+When these limits are reached, the traversal will simply stop. While these
+options have obvious use cases for anti-abuse, they can also be semantically
+useful: for instance, specifying ``max_depth: 1`` will only return the
+*neighbors* of the source node.
+
+
+Filtering returned nodes
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+In many cases, clients might not want to get all the traversed nodes in the
+response stream. With the ``return_nodes`` field (of type ``NodeFilter``), it
+is possible to specify various *criteria* for which nodes should be sent to the
+stream. By default, all nodes are returned.
+
+One common filter is to request that only specific *node types* be returned,
+which can be done with the ``types`` field of ``NodeFilter``. This field
+contains a node type restriction string (e.g. "dir,cnt,rev"), and defaults to
+"*" (all).
+For instance, to find the list of origins in which a given directory can be
+found:
+
+.. code-block:: console
+
+    $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \
+        "src: 'swh:1:dir:0000000000000000000000000000000000000006', return_nodes: {types: 'ori'}, direction: BACKWARD, mask: {paths: ['swhid']}"
+    swhid: "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054"
+
+
+Traversal from multiple sources
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Traversals can have multiple starting nodes, specified by listing several
+source nodes in the ``src`` field. For instance, this BFS starts from two
+different directories, and explores the graph in parallel from these multiple
+starting points:
+
+.. code-block:: console
+
+    $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \
+        "src: ['swh:1:dir:0000000000000000000000000000000000000006', 'swh:1:dir:0000000000000000000000000000000000000017'], mask: {paths: ['swhid']}"
+    swhid: "swh:1:dir:0000000000000000000000000000000000000006"
+    swhid: "swh:1:dir:0000000000000000000000000000000000000017"
+    swhid: "swh:1:cnt:0000000000000000000000000000000000000005"
+    swhid: "swh:1:cnt:0000000000000000000000000000000000000004"
+    swhid: "swh:1:cnt:0000000000000000000000000000000000000014"
+    swhid: "swh:1:dir:0000000000000000000000000000000000000016"
+    swhid: "swh:1:cnt:0000000000000000000000000000000000000015"
+
+
+Finding a path to a node matching given criteria
+------------------------------------------------
+
+The **FindPathTo** endpoint searches for a shortest path between a set of
+source nodes and any node that matches specific *criteria*.
+It does so by performing a breadth-first search from the source node,
+until any node that matches the given criteria is found, then follows
+its parents back to return a shortest path from the source set to that
+node.
+
+The criteria can be specified in the ``target`` field of the
+``FindPathToRequest``, which is of type ``NodeFilter``.
+
+As an example, a common use-case for content provenance is to find the shortest
+path from a content to an origin in the transposed graph. This query can be
+run like this:
+
+.. code-block:: console
+
+    $ grpc_cli call localhost:50091 swh.graph.TraversalService.FindPathTo \
+        "src: 'swh:1:cnt:0000000000000000000000000000000000000001', target: {types: 'ori'}, direction: BACKWARD, mask: {paths: ['swhid']}"
+    swhid: "swh:1:cnt:0000000000000000000000000000000000000001"
+    swhid: "swh:1:dir:0000000000000000000000000000000000000008"
+    swhid: "swh:1:rev:0000000000000000000000000000000000000009"
+    swhid: "swh:1:snp:0000000000000000000000000000000000000020"
+    swhid: "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054"
+
+As soon as the request finds an origin, it stops and returns the path from the
+source set to this origin.
+
+As with the **Traverse** endpoint, it is possible to specify edge
+restrictions and graph directions, as well as multiple source nodes.
+
+
+Finding a path between two sets of nodes
+----------------------------------------
+
+The **FindPathBetween** endpoint searches for a shortest path between a set of
+source nodes and a set of destination nodes.
+
+It does so by performing a *bidirectional breadth-first search*, i.e.,
+two parallel breadth-first searches, one from the source set ("src-BFS")
+and one from the destination set ("dst-BFS"), until both searches find a
+common node that joins their visited sets. This node is called the
+"midpoint node".
+The path returned is the path src -> ... -> midpoint -> ... -> dst,
+which is always a shortest path between src and dst.
+
+The graph direction of each BFS can be configured separately. By
+default, the dst-BFS will use the graph in the opposite direction than
+the src-BFS (if direction = FORWARD, by default direction_reverse =
+BACKWARD, and vice-versa). The default behavior is thus to search for
+a shortest path between two nodes in a given direction. However, one
+can also specify FORWARD or BACKWARD for *both* the src-BFS and the
+dst-BFS. This will search for a common descendant or a common ancestor
+between the two sets, respectively. These will be the midpoints of the
+returned path.
+
+As with the **Traverse** endpoint, it is also possible to specify edge
+restrictions.
+
+**Example 1**: shortest path from a snapshot to a content (forward graph):
+
+.. code-block:: console
+
+    $ grpc_cli call localhost:50091 swh.graph.TraversalService.FindPathBetween \
+        "src: 'swh:1:snp:0000000000000000000000000000000000000020', dst: 'swh:1:cnt:0000000000000000000000000000000000000004', mask: {paths: ['swhid']}"
+    swhid: "swh:1:snp:0000000000000000000000000000000000000020"
+    swhid: "swh:1:rev:0000000000000000000000000000000000000009"
+    swhid: "swh:1:dir:0000000000000000000000000000000000000008"
+    swhid: "swh:1:dir:0000000000000000000000000000000000000006"
+    swhid: "swh:1:cnt:0000000000000000000000000000000000000004"
+
+**Example 2**: shortest path from a directory to a release (backward graph):
+
+.. 
code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.FindPathBetween \ + "src: 'swh:1:dir:0000000000000000000000000000000000000006', dst: 'swh:1:rel:0000000000000000000000000000000000000019', direction: BACKWARD, mask: {paths: ['swhid']}" + swhid: "swh:1:dir:0000000000000000000000000000000000000006" + swhid: "swh:1:dir:0000000000000000000000000000000000000008" + swhid: "swh:1:dir:0000000000000000000000000000000000000012" + swhid: "swh:1:rev:0000000000000000000000000000000000000013" + swhid: "swh:1:rev:0000000000000000000000000000000000000018" + swhid: "swh:1:rel:0000000000000000000000000000000000000019" + +**Example 3**: common ancestor of two contents: + +.. code-block:: console + + $ grpc_cli call localhost:50091 swh.graph.TraversalService.FindPathBetween \ + "src: 'swh:1:cnt:0000000000000000000000000000000000000004', dst: 'swh:1:cnt:0000000000000000000000000000000000000015', direction: BACKWARD, direction_reverse: BACKWARD, mask: {paths: ['swhid']}" + swhid: "swh:1:cnt:0000000000000000000000000000000000000004" + swhid: "swh:1:dir:0000000000000000000000000000000000000006" + swhid: "swh:1:dir:0000000000000000000000000000000000000008" + swhid: "swh:1:dir:0000000000000000000000000000000000000012" + swhid: "swh:1:rev:0000000000000000000000000000000000000013" + swhid: "swh:1:rev:0000000000000000000000000000000000000018" + swhid: "swh:1:dir:0000000000000000000000000000000000000017" + swhid: "swh:1:dir:0000000000000000000000000000000000000016" + swhid: "swh:1:cnt:0000000000000000000000000000000000000015" + middle_node_index: 5 + +Because ``middle_node_index = 5``, the common ancestor is +``swh:1:rev:0000000000000000000000000000000000000018``. + + +.. _swh-graph-grpc-api-protobuf: + +Protobuf API Reference +====================== + +The GRPC API is specified in a single self-documenting +`protobuf `_ file, which is +available in the ``proto/swhgraph.proto`` file of the swh-graph repository: + +https://forge.softwareheritage.org/source/swh-graph/browse/master/proto/swhgraph.proto + +.. + .. 
literalinclude:: swhgraph.proto
+      :language: protobuf
diff --git a/docs/images/Makefile b/docs/images/Makefile
index 01fbfa2..9cb29d6 100644
--- a/docs/images/Makefile
+++ b/docs/images/Makefile
@@ -1,13 +1,13 @@
 all: compression_steps.png compression_steps.svg
 %.png: %.dot
-	dot -Gdpi=300 -Tpng $< -o $@
+	dot -Gdpi=150 -Tpng $< -o $@
 %.svg: %.dot
 	dot -Tsvg $< -o $@
 .PHONY: clean
 clean:
 	rm -f compression_steps.png
 	rm -f compression_steps.svg
diff --git a/docs/images/compression_steps.dot b/docs/images/compression_steps.dot
index 7156f62..c1beb77 100644
--- a/docs/images/compression_steps.dot
+++ b/docs/images/compression_steps.dot
@@ -1,51 +1,111 @@
 digraph "Compression steps" {
-    // Horizontal graph
-    rankdir=LR;
+    node [shape = none];
+
+    orc_dataset [label="ORC Graph\nDataset"];
+    nodes_csv [label="graph.nodes.csv.zst"];
+    labels_csv [label="graph.labels.csv.zst"];
+    graph_mph [label="graph.mph"];
     subgraph {
-        input_edges [label="swh.edges.csv.gz", fontsize=9, shape=none];
-        input_nodes [label="swh.nodes.csv.gz", fontsize=9, shape=none];
-        {rank=same; input_edges; input_nodes;}
+        node [fontcolor=darkgray];
+        graph_base [label="graph-base.graph"]
+        graph_bfs_order [label="graph-bfs.order"]
+        graph_bfs [label="graph-bfs.graph"]
+        graph_bfs_transposed [label="graph-bfs-transposed.graph"]
+        graph_bfs_simplified [label="graph-bfs-simplified.graph"]
+        graph_llp_order [label="graph-llp.order"]
     }
-    mph [label="MPH", shape=box];
-    mph_out [label="swh.mph", fontsize=9, shape=none];
-
-    bv_compress [label="BV compress", shape=box];
-    bv_compress_out
-        [label="swh-bv.graph\lswh-bv.offsets\lswh-bv.obl\lswh-bv.properties",
-         fontsize=9, shape=none];
-
-    bfs [label="BFS", shape=box];
-    bfs_out [label="swh.order", fontsize=9, shape=none];
+    graph_llp [label="graph.graph"]
+    graph_llp_transposed [label="graph-transposed.graph"]
+    graph_order [label="graph.order"]
+    graph_obl [label="graph.obl"]
+    graph_transposed_obl [label="graph-transposed.obl"]
+    stats [label="graph.stats"]
+    swhidmap [label="graph.node2swhid.bin"]
+    typemap [label="graph.node2type.bin"]
+    persons_csv [label="graph.persons.csv.zst"];
+    persons_mph [label="graph.persons.mph"];
+    node_properties [label="graph.property.*"];
+    labels_mph [label="graph.labels.mph"];
+    labels_fcl [label="graph.labels.fcl"];
+    graph_labelled [label="graph-labelled.*"];
+    graph_transposed_labelled [label="graph-transposed-labelled.*"];
+    graph_labelled_obl [label="graph-labelled.labelobl"];
+    graph_transposed_labelled_obl [label="graph-transposed-labelled.labelobl"];
-    permute [label="Permute", shape=box];
-    permute_out
-        [label="swh.graph\lswh.offsets\lswh.obl\lswh.properties",
-         fontsize=9, shape=none];
-
-    stats [label="Stats", shape=box];
-    stats_out
-        [label="swh.stats\lswh.indegree\lswh.outdegree",
-         fontsize=9, shape=none];
+    subgraph {
+        node [shape=box, fontname="Courier New"];
+        EXTRACT_NODES;
+        MPH;
+        BV;
+        BFS;
+        PERMUTE_BFS;
+        TRANSPOSE_BFS;
+        SIMPLIFY;
+        LLP;
+        PERMUTE_LLP;
+        COMPOSE_ORDERS;
+        STATS;
+        TRANSPOSE;
+        OBL;
+        TRANSPOSE_OBL;
+        NODE_MAP;
+        EXTRACT_PERSONS;
+        MPH_PERSONS;
+        NODE_PROPERTIES;
+        MPH_LABELS;
+        FCL_LABELS;
+        EDGE_LABELS;
+        EDGE_LABELS_OBL;
+        EDGE_LABELS_TRANSPOSE_OBL;
+    }
-    transpose [label="Transpose", shape=box];
-    transpose_out
-        [label="swh-transposed.graph\lswh-transposed.offsets\lswh-transposed.obl\lswh-transposed.properties",
-         fontsize=9, shape=none];
-    input_nodes -> mph;
-    input_edges -> bv_compress;
-    mph -> mph_out;
-    mph_out -> bv_compress;
-    bv_compress -> bv_compress_out;
-    bv_compress_out-> bfs;
-    
bv_compress_out-> permute;
-    bfs -> bfs_out;
-    bfs_out -> permute;
-    permute -> permute_out;
-    permute_out -> stats;
-    permute_out -> transpose;
-    stats -> stats_out;
-    transpose -> transpose_out;
+    orc_dataset -> EXTRACT_NODES;
+    EXTRACT_NODES -> nodes_csv;
+    EXTRACT_NODES -> labels_csv;
+    nodes_csv -> MPH -> graph_mph;
+    graph_mph -> BV;
+    orc_dataset -> BV -> graph_base;
+    graph_base -> BFS -> graph_bfs_order;
+    graph_bfs_order -> PERMUTE_BFS;
+    graph_base -> PERMUTE_BFS -> graph_bfs;
+    graph_bfs -> TRANSPOSE_BFS -> graph_bfs_transposed;
+    graph_bfs_transposed -> SIMPLIFY;
+    graph_bfs -> SIMPLIFY -> graph_bfs_simplified;
+    graph_bfs_simplified -> LLP -> graph_llp_order;
+    graph_llp_order -> PERMUTE_LLP;
+    graph_bfs -> PERMUTE_LLP -> graph_llp;
+    graph_bfs_order -> COMPOSE_ORDERS;
+    graph_llp_order -> COMPOSE_ORDERS -> graph_order;
+    graph_llp -> TRANSPOSE -> graph_llp_transposed;
+    graph_llp -> OBL -> graph_obl;
+    graph_llp_transposed -> TRANSPOSE_OBL -> graph_transposed_obl;
+    graph_llp -> STATS -> stats;
+    graph_llp -> NODE_MAP;
+    nodes_csv -> NODE_MAP;
+    graph_mph -> NODE_MAP;
+    graph_order -> NODE_MAP;
+    NODE_MAP -> swhidmap;
+    NODE_MAP -> typemap;
+    orc_dataset -> EXTRACT_PERSONS -> persons_csv;
+    persons_csv -> MPH_PERSONS -> persons_mph;
+    orc_dataset -> NODE_PROPERTIES;
+    persons_mph -> NODE_PROPERTIES;
+    graph_mph -> NODE_PROPERTIES;
+    graph_order -> NODE_PROPERTIES;
+    NODE_PROPERTIES -> node_properties;
+    labels_csv -> MPH_LABELS -> labels_mph;
+    labels_mph -> FCL_LABELS;
+    labels_csv -> FCL_LABELS -> labels_fcl;
+    orc_dataset -> EDGE_LABELS;
+    labels_mph -> EDGE_LABELS;
+    graph_llp -> EDGE_LABELS;
+    graph_mph -> EDGE_LABELS;
+    graph_order -> EDGE_LABELS;
+    EDGE_LABELS -> graph_labelled;
+    EDGE_LABELS -> graph_transposed_labelled;
+    graph_labelled -> EDGE_LABELS_OBL -> graph_labelled_obl;
+    graph_transposed_labelled -> EDGE_LABELS_TRANSPOSE_OBL -> graph_transposed_labelled_obl;
 }
diff --git a/docs/index.rst b/docs/index.rst
index 9bf477d..07e1068 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,17 +1,19 @@
 .. _swh-graph:
 .. include:: README.rst
 .. toctree::
    :maxdepth: 1
    :caption: Overview
    quickstart
+   api
+   grpc-api
+   java-api
+   memory
    compression
    cli
-   api
-   use-cases
    docker
    git2graph
    /apidoc/swh.graph
diff --git a/docs/java-api.rst b/docs/java-api.rst
new file mode 100644
index 0000000..982236a
--- /dev/null
+++ b/docs/java-api.rst
@@ -0,0 +1,744 @@
+.. _swh-graph-java-api:
+
+Using the Java API
+==================
+
+.. highlight:: java
+
+While the :ref:`HTTP API <swh-graph-api>` is useful for many common use-cases,
+it is often not sufficient to implement more complex algorithms. This section
+describes the low-level Java API that ``swh-graph`` provides on top of the
+WebGraph framework to manipulate the compressed graph of Software Heritage.
+
+A cursory understanding of the `WebGraph framework
+`_ and its API is helpful to understand the
+notions detailed here.
+
+.. _swh-graph-java-basics:
+
+Basics
+------
+
+In the WebGraph framework, graphs are generally subclasses of
+`ImmutableGraph
+`_,
+the abstract class providing the core API to manipulate and iterate on graphs.
+Under the hood, compressed graphs are stored as the `BVGraph
+`_
+class, which contains the actual codec used to compress and decompress
+adjacency lists.
+
+Graph **nodes** are mapped to a contiguous set of integers :math:`[0, n - 1]`
+where *n* is the total number of nodes in the graph.
+Each node has an associated *adjacency list*, i.e., the list of
+destination nodes of all the arcs going out of that source node. 
Together, these
+lists represent the **edges** (or **arcs**) of the graph.
+
+**Note**: edges are always directed. Undirected graphs are internally stored as
+a pair of directed edges (src → dst, dst → src), and are called "symmetric"
+graphs.
+
+On disk, a simple BVGraph with the basename ``graph`` would be represented as
+the following set of files:
+
+- ``graph.graph``: contains the compressed adjacency lists of each node, which
+  can be decompressed by the BVGraph codec.
+- ``graph.properties``: contains metadata on the graph, such as the number of
+  nodes and arcs, as well as additional loading information needed by the
+  BVGraph codec.
+- ``graph.offsets``: a list of offsets indicating where the adjacency list of
+  each node is stored in the main graph file.
+- ``graph.obl``: optionally, an "offset big-list file" which can be used to
+  load graphs faster.
+
+An ImmutableGraph can be loaded using different *load methods*, which each have
+different performance implications:
+
+- ``load()``: the entire graph is loaded in RAM and supports random access.
+- ``loadMapped()``: the graph is loaded by memory-mapping it from disk (see
+  ``mmap(1)``), at the cost of being potentially slower, especially when doing
+  random access on slow storage.
+- ``loadOffline()``: no data is actually loaded in memory; only sequential
+  iteration is possible.
+
+The following code loads a graph stored on disk under the ``compressed/graph``
+basename, using the memory-mapped loading mode, and stores it as a generic
+ImmutableGraph:
+
+.. code-block:: java
+
+    ImmutableGraph graph = ImmutableGraph.loadMapped("compressed/graph");
+
+Note that most of the time you will want to use the SWH-specific subclass
+**SwhUnidirectionalGraph** instead, which has the same API as an ImmutableGraph
+but adds SWH-specific methods. It is described later in the
+:ref:`swh-graph-java-node-mappings` section.
+
+
+Running the code
+----------------
+
+To run a piece of Java code written using the Java API, you need to run it with
+all the dependencies in your classpath (the WebGraph libraries and the
+swh-graph library). The easiest way to do so is to use the *fat jar*
+shipped with the swh-graph package on PyPI, which contains all the required
+dependencies.
+
+.. code-block:: console
+
+    $ java -cp venv/share/swh-graph/swh-graph-0.5.2.jar MyAlgo.java
+
+
+Note that to load bigger graphs, the default heap size of the JVM is likely to
+be insufficient to load entire graphs in memory. It is advised to increase this
+heap size with the ``-Xmx`` flag:
+
+.. code-block:: console
+
+    $ java -Xmx300G -cp venv/share/swh-graph/swh-graph-0.5.2.jar MyAlgo.java
+
+For more information on performance tuning and memory considerations, you
+should also read the :ref:`swh-graph-memory` page, in which we recommend
+additional JVM options for loading large graphs.
+
+
+Simple traversal
+----------------
+
+The ImmutableGraph class provides primitives to iterate and traverse graphs. It
+contains the following methods:
+
+- ``graph.numNodes()`` returns the number of nodes in the graph (*n*).
+- ``graph.numArcs()`` returns the number of arcs in the graph.
+
+And, given a node ID :math:`k \in [0, n - 1]`:
+
+- ``graph.successors(k)`` returns a LazyLongIterator on the nodes that are
+  *adjacent* to *k* (i.e., its outgoing *neighbors*).
+- ``graph.outdegree(k)`` returns the number of outgoing neighbors of *k*.
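+
+For instance, here is a minimal sketch that uses these primitives to print the
+outdegree and the successors of a given node (the helper name is ours, not part
+of the API); note that a LazyLongIterator signals exhaustion by returning -1
+rather than implementing ``hasNext()``:
+
+.. code-block:: java
+
+    public static void printNeighborhood(ImmutableGraph graph, long nodeId) {
+        // number of outgoing neighbors of the node
+        System.out.println("outdegree: " + graph.outdegree(nodeId));
+        // iterate on the adjacency list until the -1 sentinel
+        LazyLongIterator it = graph.successors(nodeId);
+        for (long succ; (succ = it.nextLong()) != -1;) {
+            System.out.println(nodeId + " -> " + succ);
+        }
+    }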
+
+
+Example: Average outdegree
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following code can be used to compute the average
+outdegree of a graph, which is a useful measure of its density:
+
+.. code-block:: java
+
+    public static long averageOutdegree(ImmutableGraph graph) {
+        return ((long) graph.numArcs()) / graph.numNodes();
+    }
+
+
+Example: Degree distributions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Using the ``outdegree()`` primitive, we can compute the outdegree distribution
+of the graph by iterating on all its nodes. The distribution will be returned
+as a map that associates to each degree *d* the number of nodes with outdegree
+*d*.
+
+.. code-block:: java
+
+    public static Map<Long, Long> outdegreeDistribution(ImmutableGraph graph) {
+        HashMap<Long, Long> distribution = new HashMap<>();
+        for (long k = 0; k < graph.numNodes(); ++k) {
+            distribution.merge(graph.outdegree(k), 1L, Long::sum);
+        }
+        return distribution;
+    }
+
+
+Example: Depth-First Traversal
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``successors`` primitive can be used to write a simple stack-based DFS
+traversal on the graph which starts from a given node and prints all the
+descendant nodes in its transitive closure:
+
+.. code-block:: java
+    :emphasize-lines: 10
+
+    public static void visitNodesDFS(ImmutableGraph graph, long srcNodeId) {
+        Stack<Long> stack = new Stack<>();
+        HashSet<Long> visited = new HashSet<>();
+        stack.push(srcNodeId);
+        visited.add(srcNodeId);
+
+        while (!stack.isEmpty()) {
+            long currentNodeId = stack.pop();
+            System.out.println(currentNodeId);
+
+            LazyLongIterator it = graph.successors(currentNodeId);
+            for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
+                if (!visited.contains(neighborNodeId)) {
+                    stack.push(neighborNodeId);
+                    visited.add(neighborNodeId);
+                }
+            }
+        }
+    }
+
+Example: Breadth-First Traversal
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Swapping the stack for a queue changes the traversal order from depth-first
+to breadth-first:
+
+.. code-block:: java
+    :emphasize-lines: 2
+
+    public static void visitNodesBFS(ImmutableGraph graph, long srcNodeId) {
+        Queue<Long> queue = new ArrayDeque<>();
+        HashSet<Long> visited = new HashSet<>();
+        queue.add(srcNodeId);
+        visited.add(srcNodeId);
+
+        while (!queue.isEmpty()) {
+            long currentNodeId = queue.poll();
+            System.out.println(currentNodeId);
+
+            LazyLongIterator it = graph.successors(currentNodeId);
+            for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
+                if (!visited.contains(neighborNodeId)) {
+                    queue.add(neighborNodeId);
+                    visited.add(neighborNodeId);
+                }
+            }
+        }
+    }
+
+
+.. _swh-graph-java-node-mappings:
+
+Node types and SWHIDs
+---------------------
+
+In the Software Heritage archive, nodes are not represented by a simple
+integer, but by a :ref:`SWHID `, which contains both the
+*type* of the node (revision, directory, blob...) and its unique identifier. We
+use **node mappings** which allow us to translate between SWHIDs and the
+compact node IDs used in the compressed graph.
+
+Most notably, we use an MPH (Minimal Perfect Hash) function implemented in the
+`GOVMinimalPerfectHashFunction
+`_
+class of the Sux4J library, which maps N keys to N consecutive integers with no
+collisions.
+
+The following files are used to store the mappings between the nodes and their
+types:
+
+- ``graph.mph``: contains a serialized minimal perfect hash function computed
+  on the list of all the SWHIDs in the graph.
+- ``graph.order``: contains the permutation that associates with each output of
+  the MPH the node ID to which it corresponds.
+- ``graph.node2swhid.bin``: contains a compact binary representation of all the
+  SWHIDs in the graph, ordered by their rank in the graph file.
+- ``graph.node2type.bin``: contains a `LongBigArrayBitVector
+  `_
+  which stores the type of each node.
+
+To use these mappings easily, we provide the class **SwhUnidirectionalGraph**,
+an ImmutableGraph which wraps the underlying graph and adds a few
+utility methods to obtain SWH-specific information on the graph.
+
+A SwhUnidirectionalGraph can be loaded in a similar way to any ImmutableGraph,
+as long as the mapping files listed above are present::
+
+    SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.load(basename);
+
+This class exposes the same graph primitives as an ImmutableGraph, but it
+additionally contains the following methods:
+
+- ``SWHID getSWHID(long nodeId)``: returns the SWHID associated with a given
+  node ID. This function does a lookup of the SWHID at offset *i* in the file
+  ``graph.node2swhid.bin``.
+
+- ``long getNodeID(SWHID swhid)``: returns the node ID associated with a given
+  SWHID. It works by hashing the SWHID with the function stored in
+  ``graph.mph``, then permuting it using the permutation stored in
+  ``graph.order``. It does additional domain-checking by calling ``getSWHID()``
+  on its own result to check that the input SWHID was valid.
+
+- ``SwhType getNodeType(long nodeID)``: returns the type of a given node, as
+  an enum of all the different object types in the Software Heritage data
+  model. It does so by looking up the value at offset *i* in the bit vector
+  stored in ``graph.node2type.bin``.
+
+
+Example: Find the target directory of a revision
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As an example, we use the methods mentioned above to perform the
+following task: "given a revision, return its target directory". To do so, we
+first look up the node ID of the given revision in the compressed graph. We
+iterate on the successors of that node, and return the SWHID of the first
+destination node that has the "directory" type.
+
+
+.. code-block:: java
+    :emphasize-lines: 2
+
+    public SWHID findDirectoryOfRevision(SwhUnidirectionalGraph graph, SWHID revSwhid) {
+        long src = graph.getNodeID(revSwhid);
+        assert graph.getNodeType(src) == SwhType.REV;
+        LazyLongIterator it = graph.successors(src);
+        for (long dst; (dst = it.nextLong()) != -1;) {
+            if (graph.getNodeType(dst) == SwhType.DIR) {
+                return graph.getSWHID(dst);
+            }
+        }
+        throw new RuntimeException("Revision has no target directory");
+    }
+
+.. _swh-graph-java-node-properties:
+
+Node properties
+---------------
+
+The Software Heritage Graph is a *property graph*, which means it has various
+properties associated with its nodes and edges (e.g., commit timestamps,
+authors, messages, ...). We compress these properties and store them in files
+alongside the compressed graph. This allows you to write traversal algorithms
+that depend on these properties.
+
+By default, properties are not assumed to be present and are not loaded when
+the graph itself is loaded. If you want to use a property, you need to
+explicitly load it first.
As an example, this is how you load the "content
+length" property to get the length of a given blob::
+
+    SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.load(basename);
+    graph.loadContentLength();
+    long blobSize = graph.getContentLength(graph.getNodeID(swhid));
+
+The documentation of the SwhGraphProperties class (**TODO: link**) lists all
+the different properties, their types, and the methods used to load them and to
+get their value for a specific node.
+
+A few things of note:
+
+- A single loading call can load multiple properties at once; this is because
+  they are stored in the same file to be more space efficient.
+
+- Persons (authors, committers, etc.) are exported as a single pseudonymized
+  integer ID, which uniquely represents a full name + email.
+
+- Timestamps are stored as a long integer (for the timestamp itself) and a
+  short integer (for the UTC offset).
+
+
+.. _swh-graph-java-edge-properties:
+
+Edge labels
+-----------
+
+While looking up graph properties on the *nodes* of the graph is relatively
+straightforward, doing so for labels on the *arcs* is comparatively more
+difficult. These include the names and permissions of directory entries, as
+well as the branch names of snapshots.
+
+The `ArcLabelledImmutableGraph
+`_
+class in WebGraph wraps an ImmutableGraph, but augments its iterators by making
+them *labelled iterators*, which essentially allow us to look up the label of
+the arcs while iterating on them.
+
+This labelled graph is stored in the following files:
+
+- ``graph-labelled.properties``: a property file describing the graph, notably
+  containing the basename of the wrapped graph.
+- ``graph-labelled.labels``: the compressed labels.
+- ``graph-labelled.labeloffsets``: the offsets used to access the labels in
+  random order.
+
+The SwhUnidirectionalGraph class contains *labelled* loading methods
+(``loadLabelled()``, ``loadLabelledMapped()``, ...). When these loading methods
+are used instead of the standard non-labelled ones, the graph is loaded as an
+ArcLabelledImmutableGraph instead of an ImmutableGraph. The following methods
+can then be used:
+
+- ``labelledSuccessors(k)`` returns a `LabelledArcIterator
+  `_
+  which is used in the same way as a LazyLongIterator except it also contains a
+  ``label()`` method to get the label of the currently traversed arc.
+- ``labelledNodeIterator()`` returns an `ArcLabelledNodeIterator
+  `_
+  of all the nodes in the graph, which replaces the LazyLongIterator of the
+  ``successors()`` function with a LabelledArcIterator similar to the above.
+
+
+Label format
+~~~~~~~~~~~~
+
+The labels of each arc are returned as a ``DirEntry[]`` array. They encode
+both the name of a directory entry and its permissions. For snapshot branches,
+only the "name" field is useful.
+
+Arc label names are encoded as an integer ID representing each unique
+entry/branch name present in the graph. To retrieve the actual name associated
+with a given label ID, one needs to load the reverse mapping similarly to how
+you would for a normal property::
+
+    SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.loadLabelled(basename);
+    graph.loadLabelNames();
+
+The byte array representing the actual label name can then be loaded with::
+
+    byte[] name = graph.getLabelName(label.filenameId);
+
+
+Multiedges
+~~~~~~~~~~
+
+The Software Heritage graph is not a *simple graph*, where at most one edge can
+exist between two vertices, but a *multigraph*, where multiple edges can be
+incident to the same two vertices.
Consider for instance the case of a single directory
+``test/`` containing the same file blob twice (e.g., the empty file), under two
+different names (e.g., ``ISSUES.txt`` and ``TODO.txt``, both completely empty).
+The simple graph view of this directory will represent it as a single edge
+``test`` → *empty file*, while the multigraph view will represent it as *two*
+edges between the same nodes.
+
+Due to the copy-list model of compression, WebGraph only stores simple graphs,
+and thus stores multiedges as single edges, to which we cannot associate
+a single label name (in our example, we need to associate both names
+``ISSUES.txt`` and ``TODO.txt``).
+To represent multiple relationships between two identical nodes, each arc label
+is therefore stored as an *array* of DirEntry, with one record per relationship
+between the two nodes.
+
+
+Example: Printing all the entries of a directory
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following code showcases how one can print all the entries (name,
+permission and target SWHID) of a given directory, using the labelled methods
+seen above.
+
+.. code-block:: java
+
+    public static void printEntries(SwhUnidirectionalGraph g, long dirNode) {
+        LabelledArcIterator s = g.labelledSuccessors(dirNode);
+        for (long dst; (dst = s.nextLong()) >= 0;) {
+            DirEntry[] labels = (DirEntry[]) s.label().get();
+            for (DirEntry label : labels) {
+                System.out.format(
+                    "%s %s %d\n",
+                    g.getSWHID(dst),
+                    new String(g.getLabelName(label.filenameId)),
+                    label.permission
+                );
+            }
+        }
+    }
+
+    // Usage: $PROGRAM <graph basename> <directory SWHID>
+    public static void main(String[] args) {
+        SwhUnidirectionalGraph g = SwhUnidirectionalGraph.loadLabelledMapped(args[0]);
+        g.loadLabelNames();
+        long dirNode = g.getNodeID(new SWHID(args[1]));
+        printEntries(g, dirNode);
+    }
+
+
+Transposed graph
+----------------
+
+Up until now, we have only looked at how to traverse the *forward* graph, i.e.,
+the directed graph whose edges are in the same direction as the Merkle DAG of
+the Software Heritage archive.
+For many purposes, especially that of finding the *provenance* of software
+artifacts, it is useful to query the *backward* (or *transposed*) graph
+instead, which is the same as the forward graph except all the edges are
+reversed.
+
+The transposed graph has its own set of files, counterparts to the files needed
+for the forward graph:
+
+- ``graph-transposed.graph``
+- ``graph-transposed.properties``
+- ``graph-transposed.offsets``
+- ``graph-transposed.obl``
+- ``graph-transposed-labelled.labels``
+- ``graph-transposed-labelled.labeloffsets``
+- ``graph-transposed-labelled.properties``
+
+However, because node IDs are the same in the forward and the backward graph,
+all the files that pertain to mappings between the node IDs and various
+properties (SWHIDs, property data, node permutations etc) remain the same.
+
+
+Example: Earliest revision containing a given blob
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following code loads all the committer timestamps of the revisions in the
+graph, then walks the *transposed* graph to return the earliest revision
+containing a given object.
+
+.. 
code-block:: java
+
+    public static long findEarliestRevisionContaining(SwhUnidirectionalGraph g, long src) {
+        long oldestRev = -1;
+        long oldestRevTs = Long.MAX_VALUE;
+
+        Stack<Long> stack = new Stack<>();
+        HashSet<Long> visited = new HashSet<>();
+        stack.push(src);
+        visited.add(src);
+        while (!stack.isEmpty()) {
+            long currentNodeId = stack.pop();
+            LazyLongIterator it = g.successors(currentNodeId);
+            for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
+                if (!visited.contains(neighborNodeId)) {
+                    stack.push(neighborNodeId);
+                    visited.add(neighborNodeId);
+                    if (g.getNodeType(neighborNodeId) == SwhType.REV) {
+                        Long ts = g.getCommitterTimestamp(neighborNodeId);
+                        if (ts != null && ts < oldestRevTs) {
+                            oldestRev = neighborNodeId;
+                            oldestRevTs = ts;
+                        }
+                    }
+                }
+            }
+        }
+        return oldestRev;
+    }
+
+    // Usage: $PROGRAM <graph basename> <object SWHID>
+    public static void main(String[] args) {
+        // Load the backward (= transposed) graph as a SwhUnidirectionalGraph
+        SwhUnidirectionalGraph g = SwhUnidirectionalGraph.loadMapped(args[0] + "-transposed");
+        g.loadCommitterTimestamps();
+        long node = g.getNodeID(new SWHID(args[1]));
+        long oldestRev = findEarliestRevisionContaining(g, node);
+        System.out.println(g.getSWHID(oldestRev));
+    }
+
+
+
+
+Bidirectional Graph
+-------------------
+
+
+BidirectionalImmutableGraph
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+While ``graph-transposed`` can be loaded as a simple SwhUnidirectionalGraph and
+then manipulated just like the forward graph, it is often convenient to have
+*both* the forward and the backward graph in memory. Some traversal algorithms
+require first going down in the forward graph to select some nodes, then going
+up to find their provenance.
+
+To achieve that, we use the `BidirectionalImmutableGraph
+`_
+class from WebGraph, which stores both a graph and its transpose.
+This class provides the following methods to iterate on the **backward** graph,
+shown here with their counterparts for the forward graph:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Forward graph operation
+     - Backward graph operation
+
+   * - ``outdegree(k)``
+     - ``indegree(k)``
+
+   * - ``successors(k)``
+     - ``predecessors(k)``
+
+In addition, the class offers a few convenience methods which are generally
+useful when you have both a graph and its transpose:
+
+- ``transpose()`` returns the transpose of the BidirectionalImmutableGraph by
+  inverting the references to the forward and the backward graphs. Successors
+  become predecessors, and vice-versa.
+- ``symmetrize()`` returns the symmetric (= undirected) version of the
+  bidirectional graph. It is implemented by a union between the forward and the
+  backward graph, which basically boils down to removing the directionality of
+  the edges (the successors of a node are also its predecessors), as
+  illustrated in the sketch below.
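+
+Here is a minimal sketch of the symmetrized view (the helper name is ours, not
+part of the API): on the graph returned by ``symmetrize()``, iterating on the
+successors of a node yields its undirected *neighbors*, whatever the original
+direction of the arcs was:
+
+.. code-block:: java
+
+    public static void printUndirectedNeighbors(BidirectionalImmutableGraph graph, long nodeId) {
+        // on the symmetrized graph, successors and predecessors coincide
+        LazyLongIterator it = graph.symmetrize().successors(nodeId);
+        for (long neighbor; (neighbor = it.nextLong()) != -1;) {
+            System.out.println(neighbor);
+        }
+    }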
+
+
+SwhBidirectionalGraph
+~~~~~~~~~~~~~~~~~~~~~
+
+As with ImmutableGraph, we extend the BidirectionalImmutableGraph with
+SWH-specific methods, in the subclass ``SwhBidirectionalGraph``. Notably, it
+contains the method ``labelledPredecessors()``, the equivalent of
+``labelledSuccessors()`` but on the backward graph.
+
+Because SwhUnidirectionalGraph inherits from ImmutableGraph, and
+SwhBidirectionalGraph inherits from BidirectionalImmutableGraph, we put the
+common behavior between the two classes in a SwhGraph interface, which can
+represent either a unidirectional or a bidirectional graph.
+
+To avoid loading the node properties two times (once for each direction), they
+are stored in a separate class called SwhGraphProperties. In a
+SwhBidirectionalGraph, the two SwhUnidirectionalGraph instances share their
+node properties in memory by storing references to the same SwhGraphProperties
+object.
+
+.. code-block:: text
+
+
+                    ┌──────────────┐
+                    │ImmutableGraph◄────────┐
+                    └────▲─────────┘        │extends
+                         │                  │
+                         │       ┌──────────┴────────────────┐
+                  extends│       │BidirectionalImmutableGraph│
+                         │       └────────────▲──────────────┘
+                         │                    │extends
+          ┌──────────────┴───────┐     ┌──────┴──────────────┐
+          │SwhUnidirectionalGraph│◄────┤SwhBidirectionalGraph│
+          └──┬──────────────┬────┘     └────────┬───────────┬┘
+             │              │    contains x2    │           │
+             │              │                   │           │
+             │    implements│                   │implements │
+             │             ┌▼──────────┐        │           │
+             │             │SwhGraph(I)◄────────┘           │
+     contains│             └───────────┘                    │contains
+             │                                              │
+             │            ┌──────────────────┐              │
+             └────────────►SwhGraphProperties◄──────────────┘
+                          └──────────────────┘
+
+
+Example: Find all the shared-commit forks of a given origin
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It is possible to define the *forks* of an origin as being the set of origins
+which share at least one revision with that origin.
+
+The following code loads the graph in both directions using a
+SwhBidirectionalGraph. Given an origin SWHID, it first walks the *forward*
+graph to find all its root revisions. It then walks the *backward* graph to
+find all the origins containing these root revisions, i.e., its *forks*.
+
+.. code-block:: java
+
+    public static void findSharedCommitForks(SwhBidirectionalGraph g, long srcOrigin) {
+        Stack<Long> forwardStack = new Stack<>();
+        HashSet<Long> forwardVisited = new HashSet<>();
+        Stack<Long> backwardStack = new Stack<>();
+        HashSet<Long> backwardVisited = new HashSet<>();
+
+        // First traversal (forward graph): find all the root revisions of the
+        // origin
+        forwardStack.push(srcOrigin);
+        forwardVisited.add(srcOrigin);
+        while (!forwardStack.isEmpty()) {
+            long curr = forwardStack.pop();
+            LazyLongIterator it = g.successors(curr);
+            boolean isRootRevision = true;
+            for (long succ; (succ = it.nextLong()) != -1;) {
+                SwhType nt = g.getNodeType(succ);
+                if (!forwardVisited.contains(succ)
+                        && nt != SwhType.DIR && nt != SwhType.CNT) {
+                    forwardStack.push(succ);
+                    forwardVisited.add(succ);
+                    isRootRevision = false;
+                }
+            }
+            if (g.getNodeType(curr) == SwhType.REV && isRootRevision) {
+                // Found a root revision, add it to the second stack
+                backwardStack.push(curr);
+                backwardVisited.add(curr);
+            }
+        }
+
+        // Second traversal (backward graph): find all the origins containing
+        // any of these root revisions and print them
+        while (!backwardStack.isEmpty()) {
+            long curr = backwardStack.pop();
+            LazyLongIterator it = g.predecessors(curr);
+            for (long succ; (succ = it.nextLong()) != -1;) {
+                SwhType nt = g.getNodeType(succ);
+                if (!backwardVisited.contains(succ)) {
+                    backwardStack.push(succ);
+                    backwardVisited.add(succ);
+                    if (nt == SwhType.ORI) {
+                        // Found an origin, print it.
+                        System.out.println(g.getSWHID(succ));
+                    }
+                }
+            }
+        }
+    }
+
+    // Usage: $PROGRAM <graph basename> <origin SWHID>
+    public static void main(String[] args) {
+        // Load both forward and backward graphs as a SwhBidirectionalGraph
+        SwhBidirectionalGraph g = SwhBidirectionalGraph.loadMapped(args[0]);
+        long node = g.getNodeID(new SWHID(args[1]));
+        findSharedCommitForks(g, node);
+    }
+
+
+Large-scale processing
+----------------------
+
+Multithreading
+~~~~~~~~~~~~~~
+
+ImmutableGraph is not thread-safe. When writing multithreaded algorithms,
+calling ``successors()`` on the same graph from multiple threads will return
+garbage.
+
+Instead, each thread should create its own "lightweight copy" of the graph by
+calling ``.copy()``. This will not actually copy the entire graph data, which
+will remain shared across threads, but it will create new instances of the
+iterators so that each thread can independently iterate on the graph data.
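+
+A minimal sketch of this pattern (the fixed-size thread pool from
+``java.util.concurrent`` is our choice, not a requirement):
+
+.. code-block:: java
+
+    ImmutableGraph graph = ImmutableGraph.loadMapped(basename);
+    ExecutorService executor = Executors.newFixedThreadPool(16);
+    for (int i = 0; i < 16; ++i) {
+        executor.submit(() -> {
+            // lightweight copy: new iterator instances, shared graph data
+            ImmutableGraph g = graph.copy();
+            // ... run this thread's traversal on g ...
+        });
+    }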
+
+
+Data structures for large traversals
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When doing very large traversals, such as a BFS on the entire graph, the
+usual data structures (HashSet, Stack, ArrayDeque, etc.) will be quite
+inefficient. If you know you are going to traverse large parts of the graph,
+it is better to use more appropriate data structures, many of which can be
+found in the dsiutils library. In particular:
+
+- `LongArrayBitVector
+  `_
+  is an efficient bit-vector implementation, which can be used to store the
+  nodes that have already been seen in the visit. Its memory footprint is too
+  big to use for small traversals, but it is very efficient for traversing the
+  full graph, as every node only takes a single bit.
+
+- `ByteDiskQueue
+  `_ can
+  be used to efficiently store the queue of nodes to visit on disk, when it is
+  too large to fit in RAM.
+
+Other types in dsiutils and fastutil can save significant memory:
+``LongArrayList`` saves at least 8 bytes per entry over ``ArrayList<Long>``,
+and ``Long2LongOpenHashMap`` saves at least 16 bytes for every entry over
+``HashMap<Long, Long>``. We strongly recommend reading the documentation of the
+unimi libraries and looking at the code for usage examples.
+
+
+BigArrays
+~~~~~~~~~
+
+When working with the Software Heritage graph, it is often necessary to store
+large arrays of values, with a size exceeding 2^32 items. Unfortunately,
+standard Java arrays do not support this.
+
+To circumvent this, WebGraph uses the `BigArrays scheme
+`_ from
+the fastutil library: "big arrays" are stored as arrays of arrays, supporting
+quadrillions of records.
+
+A BigArray ``long[][] a`` can be used with the following methods:
+
+- ``BigArrays.get(a, i)`` to get the value at index *i*.
+- ``BigArrays.set(a, i, v)`` to set the value at index *i* to *v*.
+- ``BigArrays.length(a)`` to get the total length of the bigarray.
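+
+For instance, a small sketch (the sizes are arbitrary) allocating a big array
+of longs with fastutil and accessing it through these methods:
+
+.. code-block:: java
+
+    // allocate 5 billion longs, stored internally as an array of arrays
+    long[][] a = LongBigArrays.newBigArray(5_000_000_000L);
+    BigArrays.set(a, 4_999_999_999L, 42);
+    long v = BigArrays.get(a, 4_999_999_999L);  // 42
+    long size = BigArrays.length(a);            // 5_000_000_000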
diff --git a/docs/memory.rst b/docs/memory.rst
new file mode 100644
index 0000000..f30f9c4
--- /dev/null
+++ b/docs/memory.rst
@@ -0,0 +1,130 @@
+.. _swh-graph-memory:
+
+Memory & Performance tuning
+===========================
+
+This page discusses various considerations related to memory usage and
+performance tuning when using the ``swh-graph`` library to load large
+compressed graphs.
+
+JVM options
+-----------
+
+In production, we tend to use very large servers which have enough RAM to load
+the entire graph in RAM. In these setups, the default JVM options are often
+suboptimal. We recommend to start the JVM with the following options, which
+tend to significantly improve performance::
+
+    java \
+        -ea \
+        -server \
+        -XX:PretenureSizeThreshold=512M \
+        -XX:MaxNewSize=4G \
+        -XX:+UseLargePages \
+        -XX:+UseTransparentHugePages \
+        -XX:+UseNUMA \
+        -XX:+UseTLAB \
+        -XX:+ResizeTLAB
+
+These options are documented in the ``java(1)`` manual and in the Oracle
+documentation.
+
+
+Temporary directory
+-------------------
+
+Many of the graph algorithms (either for compression or traversal) tend to
+offload some of their run-time memory to disk. For instance, the `BFS
+`_
+algorithm in the LAW library uses a temporary directory to write its queue of
+nodes to visit.
+
+Because these can be quite large and sometimes overflow the default ``/tmp``
+partition, it is advised to systematically specify a path to a local temporary
+directory with enough space to accommodate the needs of the Java programs. This
+can be done using the ``-Djava.io.tmpdir`` parameter on the Java CLI::
+
+    java -Djava.io.tmpdir=/srv/softwareheritage/ssd/tmp
+
+
+Memory mapping vs Direct loading
+--------------------------------
+
+The main dial you can use to manage your memory usage is to choose between
+memory-mapping and direct-loading the graph data. The different loading modes
+available when loading the graph are documented in :ref:`swh-graph-java-api`.
+
+Loading in mapped mode will not load any extra data in RAM, but will instead
+use the ``mmap(1)`` syscall to put the graph file located on disk in the
+virtual address space. The Linux kernel will then be free to arbitrarily cache
+the file, either partially or in its entirety, depending on the available
+memory space.
+
+In our experiments, memory-mapping a small graph from an SSD only incurs a
+relatively small slowdown (about 15-20%). However, when the graph is too big to
+fit in RAM, the kernel has to constantly invalidate pages to cache newly
+accessed sections, which incurs a very large performance penalty. A full
+traversal of a large graph that usually takes about 20 hours when loaded in
+main memory could take more than a year when mapped from a hard drive!
+
+When deciding what to direct-load and what to memory-map, here are a few rules
+of thumb:
+
+- If you don't need random access to the graph edges, you can consider using
+  the "offline" loading mode. The offsets won't be loaded which will save
+  dozens of gigabytes of RAM.
+
+- If you only need to query some specific nodes or run trivial traversals,
+  memory-mapping the graph from a HDD should be a reasonable solution that
+  doesn't take an inordinate amount of time. It might be bad for your disks,
+  though.
+
+- If you are constrained in available RAM, memory-mapping the graph from an SSD
+  offers reasonable performance for reasonably complex algorithms.
+
+- If you have a heavy workload (e.g., running a full traversal of the entire
+  graph) and you can afford the RAM, direct loading will be orders of magnitude
+  faster than all the above options.
+
+
+Sharing mapped data across processes
+------------------------------------
+
+Often, multiple processes can be working on the same data (mappings or the
+graph itself), for instance when running different experiments on the same
+graph. This is problematic in terms of RAM usage when using direct memory
+loading, as the same data of potentially hundreds of gigabytes is loaded in
+memory twice.
+As we have seen, memory-mapping can be used to avoid storing redundant data in
+RAM, but comes at the cost of potentially slower I/O as the data is no longer
+guaranteed to be loaded in main memory and is reliant on kernel heuristics.
+
+To efficiently share data across two different compressed graph processes,
+another option is to copy graph data to a ``tmpfs`` not backed by a disk swap,
+which forces the kernel to load it entirely in RAM. Subsequent memory-mappings
+of the files stored in the tmpfs will simply map the data stored in RAM to
+virtual memory pages, and return a pointer to the in-memory structure.
+
+To do so, we create a directory in ``/dev/shm`` in which we **copy** all the
+files that we want to direct-load in RAM, and **symlink** all the others.
Then, +we load the graph using the memory-mapped loading mode, which makes it use the +shared memory stored in the tmpfs under the hood. + +Here is a systemd service that can be used to perform this task automatically: + +.. code-block:: ini + + [Unit] + Description=swh-graph memory sharing in tmpfs + + [Service] + Type=oneshot + RemainAfterExit=yes + ExecStart=mkdir -p /dev/shm/swh-graph/default + ExecStart=sh -c "ln -s /.../compressed/* /dev/shm/swh-graph/default" + ExecStart=cp --remove-destination /.../compressed/graph.graph /dev/shm/swh-graph/default + ExecStart=cp --remove-destination /.../compressed/graph-transposed.graph /dev/shm/swh-graph/default + ExecStop=rm -rf /dev/shm/swh-graph/default + + [Install] + WantedBy=multi-user.target diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 7ac51bd..425a547 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -1,174 +1,132 @@ +.. _swh-graph-quickstart: + Quickstart ========== -This quick tutorial shows how to compress and browse a graph using ``swh.graph``. - -It does not cover the technical details behind the graph compression techniques -(refer to :ref:`graph-compression`). - +This quick tutorial shows how to start the ``swh.graph`` service to query +an existing compressed graph with the high-level HTTP API. Dependencies ------------ In order to run the ``swh.graph`` tool, you will need Python (>= 3.7) and Java -JRE, you do not need the JDK if you install the package from pypi, but may want -to install it if you want to hack the code or install it from this git -repository. To compress a graph, you will need zstd_ compression tools. - -It is highly recommended to install this package in a virtualenv. - -On a Debian stable (buster) system: - -.. code:: bash - - $ sudo apt install python3-virtualenv default-jre zstd +JRE. On a Debian system: +.. code:: console -.. _zstd: https://facebook.github.io/zstd/ + $ sudo apt install python3 python3-venv default-jre - -Install -------- +Installing swh.graph +-------------------- Create a virtualenv and activate it: -.. code:: bash +.. code:: console - ~/tmp$ mkdir swh-graph-tests - ~/tmp$ cd swh-graph-tests - ~/t/swh-graph-tests$ virtualenv swhenv - ~/t/swh-graph-tests$ . swhenv/bin/activate + $ python3 -m venv .venv + $ source .venv/bin/activate Install the ``swh.graph`` python package: -.. code:: bash +.. code:: console - (swhenv) ~/t/swh-graph-tests$ pip install swh.graph + (venv) $ pip install swh.graph [...] - (swhenv) ~/t/swh-graph-tests swh graph --help + (venv) $ swh graph --help Usage: swh graph [OPTIONS] COMMAND [ARGS]... Software Heritage graph tools. Options: -C, --config-file FILE YAML configuration file -h, --help Show this message and exit. Commands: - api-client client for the graph RPC service - cachemount Cache the mmapped files of the compressed graph in a tmpfs. compress Compress a graph using WebGraph Input: a pair of files... - map Manage swh-graph on-disk maps rpc-serve run the graph RPC service -Compression ------------ - -Existing datasets -^^^^^^^^^^^^^^^^^ - -You can directly use compressed graph datasets provided by Software Heritage. -Here is a small and realistic dataset (3.1GB): - - https://annex.softwareheritage.org/public/dataset/graph/latest/popular-3k-python/python3kcompress.tar - -.. 
code:: bash
-    (swhenv) ~/t/swh-graph-tests$ curl -O https://annex.softwareheritage.org/public/dataset/graph/latest/popular-3k-python/python3kcompress.tar
-    (swhenv) ~/t/swh-graph-tests$ tar xvf python3kcompress.tar
-    (swhenv) ~/t/swh-graph-tests$ touch python3kcompress/*.obl # fix the mtime of cached offset files to allow faster loading
+.. _swh-graph-retrieving-compressed:
-Note: not for the faint heart, but the full dataset is available at:
+Retrieving a compressed graph
+-----------------------------
-  https://annex.softwareheritage.org/public/dataset/graph/latest/compressed/
+Software Heritage provides a list of off-the-shelf datasets that can be used
+for various research or prototyping purposes. Most of them are available in
+*compressed* representation, i.e., in a format suitable to be loaded and
+queried by the ``swh-graph`` library.
-Own datasets
-^^^^^^^^^^^^
+All the publicly available datasets are documented on this page:
+https://docs.softwareheritage.org/devel/swh-dataset/graph/dataset.html
-A graph is described as both its adjacency list and the set of nodes
-identifiers in plain text format. Such graph example can be found in the
-``swh/graph/tests/dataset/`` folder.
+A good way of retrieving these datasets is to use the `AWS S3 CLI
+`_.
-You can compress the example graph on the command line like this:
+Here is an example with the dataset ``2021-03-23-popular-3k-python``, which has
+a relatively reasonable size (~15 GiB including property data, with
+the compressed graph itself being less than 700 MiB):
-.. code:: bash
+.. code:: console
+    (venv) $ pip install awscli
+    [...]
+    (venv) $ mkdir -p 2021-03-23-popular-3k-python/compressed
+    (venv) $ cd 2021-03-23-popular-3k-python/
+    (venv) $ aws s3 cp --recursive s3://softwareheritage/graph/2021-03-23-popular-3k-python/compressed/ compressed
-    (swhenv) ~/t/swh-graph-tests$ swh graph compress --graph swh/graph/tests/dataset/example --outdir output/
-    [...]
+You can also retrieve larger graphs, but note that these graphs are generally
+intended to be loaded fully in RAM, and do not fit on ordinary desktop
+machines. The server we use in production to run the graph service has more
+than 700 GiB of RAM. These memory considerations are discussed in more detail
+in :ref:`swh-graph-memory`.
-    (swhenv) ~/t/swh-graph-tests$ ls output/
-    example-bv.properties example.mph example.obl example.outdegree example.swhid2node.bin example-transposed.offsets
-    example.graph example.node2swhid.bin example.offsets example.properties example-transposed.graph example-transposed.properties
-    example.indegree example.node2type.map example.order example.stats example-transposed.obl
+**Note:** for testing purposes, a fake test dataset is available in the
+``swh-graph`` repository, with just a few dozen nodes. Its basename is
+``swh-graph/swh/graph/tests/dataset/compressed/example``.
 API server
 ----------
-To start a ``swh.graph`` API server of a compressed graph dataset, run:
+To start a ``swh.graph`` API server of a compressed graph dataset, you need to
+use the ``rpc-serve`` command with the basename of the graph, which is the path
+prefix of all the graph files (e.g., with the basename ``compressed/graph``, it
+will attempt to load the files located at
+``compressed/graph.{graph,properties,offsets,...}``).
-.. code:: bash
+In our example:
+
+.. code:: console
-    (swhenv) ~/t/swh-graph-tests$ swh graph rpc-serve -g output/example
-    Loading graph output/example ...
+    (venv) $ swh graph rpc-serve -g compressed/graph
+    Loading graph compressed/graph ...
Graph loaded. ======== Running on http://0.0.0.0:5009 ======== (Press CTRL+C to quit) From there you can use this endpoint to query the compressed graph, for example -with httpie_ (``sudo apt install``) from another terminal: +with httpie_ (``sudo apt install httpie``): .. _httpie: https://httpie.org .. code:: bash - ~/tmp$ http :5009/graph/visit/nodes/swh:1:rel:0000000000000000000000000000000000000010 - HTTP/1.1 200 OK - Content-Type: text/plain - Date: Tue, 15 Sep 2020 08:33:25 GMT - Server: Python/3.8 aiohttp/3.6.2 - Transfer-Encoding: chunked - - swh:1:rel:0000000000000000000000000000000000000010 - swh:1:rev:0000000000000000000000000000000000000009 - swh:1:rev:0000000000000000000000000000000000000003 - swh:1:dir:0000000000000000000000000000000000000002 - swh:1:cnt:0000000000000000000000000000000000000001 - swh:1:dir:0000000000000000000000000000000000000008 - swh:1:dir:0000000000000000000000000000000000000006 - swh:1:cnt:0000000000000000000000000000000000000004 - swh:1:cnt:0000000000000000000000000000000000000005 - swh:1:cnt:0000000000000000000000000000000000000007 - - -Running the existing ``python3kcompress`` dataset: - -.. code:: bash - - (swhenv) ~/t/swh-graph-tests$ swh graph rpc-serve -g python3kcompress/python3k - Loading graph python3kcompress/python3k ... - Graph loaded. - ======== Running on http://0.0.0.0:5009 ======== - (Press CTRL+C to quit) - - ~/tmp$ http :5009/graph/leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323 HTTP/1.1 200 OK Content-Type: text/plain Date: Tue, 15 Sep 2020 08:35:19 GMT Server: Python/3.8 aiohttp/3.6.2 Transfer-Encoding: chunked swh:1:cnt:33af56e02dd970873d8058154bf016ec73b35dfb swh:1:cnt:b03b4ffd7189ae5457d8e1c2ee0490b1938fd79f swh:1:cnt:74d127c2186f7f0e8b14a27249247085c49d548a swh:1:cnt:c0139aa8e79b338e865a438326629fa22fa8f472 [...] swh:1:cnt:a6b60e797063fef707bbaa4f90cfb4a2cbbddd4a swh:1:cnt:cc0a1deca559c1dd2240c08156d31cde1d8ed406 - -See the documentation of the :ref:`API ` for more details. +See the documentation of the :ref:`API ` for more details on how +to use the HTTP graph querying API. diff --git a/docs/use-cases.rst b/docs/use-cases.rst deleted file mode 100644 index ce01d8c..0000000 --- a/docs/use-cases.rst +++ /dev/null @@ -1,167 +0,0 @@ -========= -Use cases -========= - - -This document lists use cases and benchmark scenarios for the Software Heritage -graph service. - - -Conventions -=========== - -- **Node identification**: in the following, nodes are always identified by - their :ref:`SWHIDs `. - - -Use cases -========= - - -Browsing --------- - -The following use cases require traversing the *forward graph*. - -- **ls**: given a directory node, list (non recursively) all linked nodes of - type directory and content - - Implementation:: - - /graph/neighbors/:DIR_ID?edges=dir:cnt,dir:dir - -- **ls -R**: given a directory node, recursively list all linked nodes of type - directory and content - - Implementation:: - - /graph/visit/paths/:DIR_ID?edges=dir:cnt,dir:dir - -- **git log**: given a revision node, recursively list all linked nodes of type - revision - - Implementation:: - - /graph/visit/nodes/:REV_ID?edges=rev:rev - - -Vault ------ - -The following use cases require traversing the *forward graph*. - -- **tarball** (same as *ls -R* above) - -- **git bundle**: given a node, recursively list all linked nodes of any kind - - Implementation:: - - /graph/visit/nodes/:NODE_ID?edges=* - - -Provenance ----------- - -The following use cases require traversing the *backward (transposed) -graph*. 
- -- **commit provenance**: given a content or directory node, return *a* commit - whose directory (recursively) contains it - - Implementation:: - - /graph/walk/:NODE_ID/rev?direction=backward&edges=dir:dir,cnt:dir,dir:rev - -- **complete commit provenance**: given a content or directory node, return - *all* commits whose directory (recursively) contains it - - Implementation:: - - /graph/leaves/:NODE_ID?direction=backward&edges=dir:dir,cnt:dir,dir:rev - -- **origin provenance**: given a content, directory, or commit node, return - *an* origin that has at least one snapshot that (recursively) contains it - - Implementation:: - - /graph/walk/:NODE_ID/ori?direction=backward&edges=* - -- **complete origin provenance**: given a content, directory, or commit node, - return *all* origins that have at least one snapshot that (recursively) - contains it - - Implementation:: - - /graph/leaves/:NODE_ID?direction=backward&edges=* - -- *SLOC tracking*: left as future work - - -Provenance statistics -~~~~~~~~~~~~~~~~~~~~~ - -The following use cases require traversing the *backward (transposed) -graph*. - -- **content popularity across commits**: for each content, count the number of - commits (or *commit popularity*) that link to a directory that (recursively) - includes it. Plot the distribution of content popularity across commits. - - Implementation: apply *complete commit provenance* to each content node, - count the returned commits, aggregate. - -- **commit popularity across origins**: for each commit, count the number of - origins (or *origin popularity*) that have a snapshot that (recursively) - includes it. Plot the distribution of commit popularity across origins. - - Implementation: apply *complete origin provenance* to each commit node, count - the returned origins, aggregate. - -- *SLOC popularity across contents*: left as future work - -The following use cases require traversing the *forward graph*. - -- **revision size** (as n. of contents) distribution: for each revision, count - the number of contents that are (recursively) reachable from it. Plot the - distribution of revision sizes. - -- **origin size** (as n. of revisions) distribution: for each origin, count the - number of revisions that are (recursively) reachable from it. Plot the - distribution of origin sizes. - - -Benchmarks -========== - -Notes on how to benchmark graph access: - -- separate pure-graph timings from timings related to use additional mappings - (e.g., node types), no matter if the mappings are in-memory or on-disk - -- separate in-memory timings from on-disk timings; in particular, separate the - timing of translating node identifiers between internal integers and SWHIDs - -- for each use case that requires a node as input, we will randomize the choice - of the input node and repeat the experiment a suitable number of times; where - possible we will aggregate results computing basic statistics (average, - standard deviation), as well as normalize results w.r.t. the “size” of the - chosen node (e.g., number of nodes/path length in the resulting visit) - - -Basic benchmarks ----------------- - -- **Edge traversal**: given a node, retrieve the first node in its adjacency - list. - - For reference: Apostolico, Drovandi in *Graph Compression by BFS* report - times to retrieve the adjacency list of a node (and/or test if an edge exists - between two nodes) in the 2-3 us range, for the largest graph in their - experiments (22 M nodes, 600 M edges). 
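For illustration only, a minimal sketch of this edge-traversal timing on top of
the Java API shipped with this patch could look as follows (the graph basename,
seed and iteration count are placeholders, not part of the original scenario):

.. code:: java

   import java.util.Random;

   import org.softwareheritage.graph.SwhBidirectionalGraph;

   public class EdgeTraversalBench {
       public static void main(String[] args) throws Exception {
           // Memory-map a compressed graph from its basename (placeholder path).
           SwhBidirectionalGraph g = SwhBidirectionalGraph.loadMapped("compressed/graph");
           Random rng = new Random(42);
           long n = g.numNodes();
           int iters = 1_000_000;
           long checksum = 0; // keeps the JIT from eliminating the loop
           long start = System.nanoTime();
           for (int i = 0; i < iters; i++) {
               long node = (rng.nextLong() >>> 1) % n; // pseudo-random node id
               checksum += g.successors(node).nextLong(); // first successor, or -1 if none
           }
           double nsPerOp = (System.nanoTime() - start) / (double) iters;
           System.out.printf("%.1f ns per edge traversal (checksum %d)%n", nsPerOp, checksum);
       }
   }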
- - -Each use case is a benchmark ----------------------------- - -In addition to abstract benchmark, we will use each use case above as a -scenario-based benchmark. diff --git a/java/README.md b/java/README.md index 623e98e..7276284 100644 --- a/java/README.md +++ b/java/README.md @@ -1,51 +1,49 @@ Graph service - Java backend ============================ Server side Java RPC API. Build ----- ```bash $ mvn compile assembly:single ``` Start RPC API ------------- ```bash $ java -cp target/swh-graph-*.jar \ - org.softwareheritage.graph.server.App \ + org.softwareheritage.graph.rpc.GraphServer \ ``` -Default port is 5009 (use the `--port` option to change port number). If you -need timings metadata send back to the client in addition to the result, use the -`--timings` flag. +Default port is 50091 (use the `--port` option to change port number). Tests ----- Unit tests rely on test data that are already available in the Git repository (under `src/swh/graph/tests/dataset/`). You generally only need to run them using Maven: ```bash $ mvn test ``` In case you want to regenerate the test data: ```bash # Graph compression $ cd src/swh/graph/tests/dataset $ ./generate_graph.sh $ cd ../../../.. $ mvn compile assembly:single # Dump mapping files $ java -cp target/swh-graph-*.jar \ - org.softwareheritage.graph.maps.NodeMapBuilder \ + org.softwareheritage.graph.compress.NodeMapBuilder \ src/swh/graph/tests/dataset/example.nodes.csv.gz \ src/swh/graph/tests/dataset/output/example ``` diff --git a/java/pom.xml b/java/pom.xml index 26f0ade..405ce93 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -1,274 +1,403 @@ 4.0.0 org.softwareheritage.graph swh-graph ${git.closest.tag.name} swh-graph https://forge.softwareheritage.org/source/swh-graph/ UTF-8 11 + 3.21.1 + 1.47.0 ch.qos.logback logback-classic 1.2.3 org.junit.jupiter junit-jupiter-api 5.7.0 test org.junit.jupiter junit-jupiter-engine 5.7.0 test - - org.hamcrest - hamcrest - 2.2 - test - - - io.javalin - javalin - 3.0.0 - org.slf4j slf4j-simple 1.7.26 - - com.fasterxml.jackson.core - jackson-databind - 2.13.0 - it.unimi.dsi webgraph-big - 3.6.6 + 3.7.0 it.unimi.dsi fastutil - 8.5.6 + 8.5.8 it.unimi.dsi dsiutils - 2.6.17 + 2.7.2 it.unimi.dsi sux4j - 5.2.3 + 5.3.1 it.unimi.dsi law 2.7.2 org.apache.hadoop hadoop-common org.umlgraph umlgraph org.eclipse.jetty.aggregate jetty-all it.unimi.di mg4j it.unimi.di mg4j-big com.martiansoftware jsap 2.1 - - net.sf.py4j - py4j - 0.10.9.3 - commons-codec commons-codec 1.15 + + com.github.luben + zstd-jni + 1.5.1-1 + + + org.apache.orc + orc-core + 1.7.1 + + + org.apache.hadoop + hadoop-common + 3.3.1 + + + org.apache.hadoop + hadoop-client-runtime + 3.3.1 + + + com.google.protobuf + protobuf-java + ${protobuf.version} + + + io.grpc + grpc-netty-shaded + ${grpc.version} + + + io.grpc + grpc-protobuf + ${grpc.version} + + + io.grpc + grpc-stub + ${grpc.version} + + + io.grpc + grpc-services + ${grpc.version} + + + io.grpc + grpc-testing + ${grpc.version} + + + javax.annotation + javax.annotation-api + 1.3.2 + + + com.google.protobuf + protobuf-java-util + ${protobuf.version} + maven-clean-plugin 3.1.0 maven-resources-plugin 3.0.2 maven-compiler-plugin 3.8.0 11 11 -verbose -Xlint:all maven-surefire-plugin 2.22.2 maven-failsafe-plugin 2.22.2 maven-jar-plugin 3.0.2 maven-install-plugin 2.5.2 maven-deploy-plugin 2.8.2 maven-site-plugin 3.7.1 maven-project-info-reports-plugin 3.0.0 + + maven-dependency-plugin + 3.1.2 + maven-assembly-plugin 3.3.0 - org.softwareheritage.graph.server.App + 
org.softwareheritage.graph.rpc.GraphServer jar-with-dependencies false make-assembly package single com.diffplug.spotless spotless-maven-plugin - 2.4.1 + 2.22.1 *.md .gitignore true 4 4.16.0 .coding-style.xml pl.project13.maven git-commit-id-plugin 3.0.1 get-the-git-infos revision initialize true true true true v* git.closest.tag.name ^v true - - - - + + maven-source-plugin + 2.1.1 + + + bundle-sources + package + + jar-no-fork + test-jar-no-fork + + + + org.apache.maven.plugins maven-javadoc-plugin - 3.1.1 + 3.3.1 + + + resource-bundles + package + + + resource-bundle + + + test-resource-bundle + + + false + + + + javadoc-jar + package + + jar + + + + + true + + it.unimi.dsi:webgraph-big:* + + + https://webgraph.di.unimi.it/docs-big/ + https://dsiutils.di.unimi.it/docs/ + https://fastutil.di.unimi.it/docs/ + https://law.di.unimi.it/software/law-docs/ + + + + implSpec + a + Implementation Requirements: + + + implNote + a + Implementation Note: + + + + + + org.xolstice.maven.plugins + protobuf-maven-plugin + 0.6.1 + + com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier} + grpc-java + io.grpc:protoc-gen-grpc-java:${grpc.version}:exe:${os.detected.classifier} + + + + + compile + compile-custom + test-compile + test-compile-custom + + + - + + + kr.motd.maven + os-maven-plugin + 1.6.2 + + + diff --git a/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java b/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java index 737148a..a91276f 100644 --- a/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java +++ b/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java @@ -1,74 +1,98 @@ +/* + * Copyright (c) 2019-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import java.util.ArrayList; /** * Edge restriction based on node types, used when visiting the graph. *

* Software Heritage * graph contains multiple node types (contents, directories, revisions, ...) and restricting * the traversal to specific node types is necessary for many querying operations: * use cases. * * @author The Software Heritage developers */ public class AllowedEdges { /** * 2D boolean matrix storing access rights for all combination of src/dst node types (first * dimension is source, second dimension is destination), when edge restriction is not enforced this * array is set to null for early bypass. */ public boolean[][] restrictedTo; /** * Constructor. * * @param edgesFmt a formatted string describing allowed * edges */ public AllowedEdges(String edgesFmt) { - int nbNodeTypes = Node.Type.values().length; + int nbNodeTypes = SwhType.values().length; this.restrictedTo = new boolean[nbNodeTypes][nbNodeTypes]; // Special values (null, empty, "*") if (edgesFmt == null || edgesFmt.isEmpty()) { return; } if (edgesFmt.equals("*")) { // Allows for quick bypass (with simple null check) when no edge restriction restrictedTo = null; return; } // Format: "src1:dst1,src2:dst2,[...]" String[] edgeTypes = edgesFmt.split(","); for (String edgeType : edgeTypes) { String[] nodeTypes = edgeType.split(":"); if (nodeTypes.length != 2) { throw new IllegalArgumentException("Cannot parse edge type: " + edgeType); } - ArrayList srcTypes = Node.Type.parse(nodeTypes[0]); - ArrayList dstTypes = Node.Type.parse(nodeTypes[1]); - for (Node.Type srcType : srcTypes) { - for (Node.Type dstType : dstTypes) { + ArrayList srcTypes = SwhType.parse(nodeTypes[0]); + ArrayList dstTypes = SwhType.parse(nodeTypes[1]); + for (SwhType srcType : srcTypes) { + for (SwhType dstType : dstTypes) { restrictedTo[srcType.ordinal()][dstType.ordinal()] = true; } } } } /** * Checks if a given edge can be followed during graph traversal. * * @param srcType edge source type * @param dstType edge destination type * @return true if allowed and false otherwise */ - public boolean isAllowed(Node.Type srcType, Node.Type dstType) { + public boolean isAllowed(SwhType srcType, SwhType dstType) { if (restrictedTo == null) return true; return restrictedTo[srcType.ordinal()][dstType.ordinal()]; } + + /** + * Return a new AllowedEdges instance with reversed edge restrictions. e.g. "src1:dst1,src2:dst2" + * becomes "dst1:src1,dst2:src2" + * + * @return a new AllowedEdges instance with reversed edge restrictions + */ + public AllowedEdges reverse() { + AllowedEdges reversed = new AllowedEdges(null); + reversed.restrictedTo = new boolean[restrictedTo.length][restrictedTo[0].length]; + for (int i = 0; i < restrictedTo.length; i++) { + for (int j = 0; j < restrictedTo[0].length; j++) { + reversed.restrictedTo[i][j] = restrictedTo[j][i]; + } + } + return reversed; + } } diff --git a/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java b/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java index 40f473a..d80cae4 100644 --- a/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java +++ b/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java @@ -1,50 +1,57 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; /** - * TODO + * Node type restriction, useful to implement filtering of returned nodes during traversal. 
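+ *
+ * A minimal usage sketch; {@code graph} and {@code nodeId} are assumed to come from an
+ * already-loaded graph and are illustrative only:
+ *
+ * {@code
+ * AllowedNodes allowed = new AllowedNodes("cnt,dir"); // only keep contents and directories
+ * if (allowed.isAllowed(graph.getNodeType(nodeId))) {
+ *     // ... keep nodeId in the results ...
+ * }
+ * }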
* * @author The Software Heritage developers */ public class AllowedNodes { public boolean[] restrictedTo; /** * Constructor. * * @param nodesFmt a formatted string describing allowed nodes */ public AllowedNodes(String nodesFmt) { - int nbNodeTypes = Node.Type.values().length; + int nbNodeTypes = SwhType.values().length; this.restrictedTo = new boolean[nbNodeTypes]; // Special values (null, empty, "*") if (nodesFmt == null || nodesFmt.isEmpty()) { return; } if (nodesFmt.equals("*")) { // Allows for quick bypass (with simple null check) when no node restriction restrictedTo = null; return; } // Format: "nodeType1,nodeType2,[...]" String[] nodeTypesStr = nodesFmt.split(","); for (String nodeTypeStr : nodeTypesStr) { - for (Node.Type nodeType : Node.Type.parse(nodeTypeStr)) { - this.restrictedTo[Node.Type.toInt(nodeType)] = true; + for (SwhType nodeType : SwhType.parse(nodeTypeStr)) { + this.restrictedTo[SwhType.toInt(nodeType)] = true; } } } /** * Checks if a given node type is allowed. * * @param nodeType node type to check * @return true if allowed and false otherwise */ - public boolean isAllowed(Node.Type nodeType) { + public boolean isAllowed(SwhType nodeType) { if (restrictedTo == null) return true; - return restrictedTo[Node.Type.toInt(nodeType)]; + return restrictedTo[SwhType.toInt(nodeType)]; } } diff --git a/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java b/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java deleted file mode 100644 index be19956..0000000 --- a/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java +++ /dev/null @@ -1,128 +0,0 @@ -package org.softwareheritage.graph; - -import it.unimi.dsi.big.webgraph.ImmutableGraph; -import it.unimi.dsi.big.webgraph.LazyLongIterator; -import it.unimi.dsi.big.webgraph.Transform; -import it.unimi.dsi.fastutil.longs.LongIterator; - -/** - * A directed immutable graph which can be iterated in both directions (forward and backward). It - * exposes the backward equivalents of the ImmutableGraph primitives (indegree() and - * predecessors()). This is implemented by passing two graphs, one in the forward and one in the - * backward direction. - */ -public class BidirectionalImmutableGraph extends ImmutableGraph { - private final ImmutableGraph forwardGraph; - private final ImmutableGraph backwardGraph; - - /** - * Creates a bidirectional immutable graph - * - * @param forwardGraph The graph in the forward direction - * @param backwardGraph The graph in the backward direction - */ - protected BidirectionalImmutableGraph(ImmutableGraph forwardGraph, ImmutableGraph backwardGraph) { - this.forwardGraph = forwardGraph; - this.backwardGraph = backwardGraph; - } - - @Override - public long numNodes() { - assert forwardGraph.numNodes() == backwardGraph.numNodes(); - return this.forwardGraph.numNodes(); - } - - @Override - public long numArcs() { - assert forwardGraph.numArcs() == backwardGraph.numArcs(); - return this.forwardGraph.numArcs(); - } - - @Override - public boolean randomAccess() { - return this.forwardGraph.randomAccess() && this.backwardGraph.randomAccess(); - } - - @Override - public boolean hasCopiableIterators() { - return forwardGraph.hasCopiableIterators() && backwardGraph.hasCopiableIterators(); - } - - @Override - public BidirectionalImmutableGraph copy() { - return new BidirectionalImmutableGraph(this.forwardGraph.copy(), this.backwardGraph.copy()); - } - - /** - * Returns the transposed version of the bidirectional graph. 
Successors become predecessors, and - * vice-versa. - */ - public BidirectionalImmutableGraph transpose() { - return new BidirectionalImmutableGraph(backwardGraph, forwardGraph); - } - - /** - * Returns the symmetric version of the bidirectional graph. It returns the (lazy) union of the - * forward graph and the backward graph. This is equivalent to removing the directionality of the - * edges: the successors of a node are also its predecessors. - * - * @return a symmetric, undirected BidirectionalImmutableGraph. - */ - public BidirectionalImmutableGraph symmetrize() { - ImmutableGraph symmetric = Transform.union(forwardGraph, backwardGraph); - return new BidirectionalImmutableGraph(symmetric, symmetric); - } - - /** - * Returns the simplified version of the bidirectional graph. Works like symmetrize(), but also - * removes the loop edges. - * - * @return a simplified (loopless and symmetric) BidirectionalImmutableGraph - */ - public BidirectionalImmutableGraph simplify() { - ImmutableGraph simplified = Transform.simplify(forwardGraph, backwardGraph); - return new BidirectionalImmutableGraph(simplified, simplified); - } - - /** Returns the outdegree of a node */ - @Override - public long outdegree(long l) { - return forwardGraph.outdegree(l); - } - - /** Returns the indegree of a node */ - public long indegree(long l) { - return backwardGraph.outdegree(l); - } - - /** Returns a lazy iterator over the successors of a given node. */ - @Override - public LazyLongIterator successors(long nodeId) { - return forwardGraph.successors(nodeId); - } - - /** Returns a lazy iterator over the predecessors of a given node. */ - public LazyLongIterator predecessors(long nodeId) { - return backwardGraph.successors(nodeId); - } - - /** Returns a reference to an array containing the predecessors of a given node. */ - public long[][] predecessorBigArray(long x) { - return backwardGraph.successorBigArray(x); - } - - /** Returns an iterator enumerating the indegrees of the nodes of this graph. */ - public LongIterator indegrees() { - return backwardGraph.outdegrees(); - } - - /** Returns the underlying ImmutableGraph in the forward direction. */ - public ImmutableGraph getForwardGraph() { - return forwardGraph; - } - - /** Returns the underlying ImmutableGraph in the backward direction. 
*/ - public ImmutableGraph getBackwardGraph() { - return backwardGraph; - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/Entry.java b/java/src/main/java/org/softwareheritage/graph/Entry.java deleted file mode 100644 index a2d3f5a..0000000 --- a/java/src/main/java/org/softwareheritage/graph/Entry.java +++ /dev/null @@ -1,193 +0,0 @@ -package org.softwareheritage.graph; - -import java.io.*; -import java.util.ArrayList; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.PropertyNamingStrategy; - -public class Entry { - private Graph graph; - - public void load_graph(String graphBasename) throws IOException { - System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = Graph.loadMapped(graphBasename); - System.err.println("Graph loaded."); - } - - public Graph get_graph() { - return graph.copy(); - } - - public String stats() { - try { - Stats stats = new Stats(graph.getPath()); - ObjectMapper objectMapper = new ObjectMapper(); - objectMapper.setPropertyNamingStrategy(PropertyNamingStrategy.SNAKE_CASE); - return objectMapper.writeValueAsString(stats); - } catch (IOException e) { - throw new RuntimeException("Cannot read stats: " + e); - } - } - - public void check_swhid(String src) { - graph.getNodeId(new SWHID(src)); - } - - private int count_visitor(NodeCountVisitor f, long srcNodeId) { - int[] count = {0}; - f.accept(srcNodeId, (node) -> { - count[0]++; - }); - return count[0]; - } - - public int count_leaves(String direction, String edgesFmt, String src, long maxEdges) { - long srcNodeId = graph.getNodeId(new SWHID(src)); - Traversal t = new Traversal(graph.copy(), direction, edgesFmt, maxEdges); - return count_visitor(t::leavesVisitor, srcNodeId); - } - - public int count_neighbors(String direction, String edgesFmt, String src, long maxEdges) { - long srcNodeId = graph.getNodeId(new SWHID(src)); - Traversal t = new Traversal(graph.copy(), direction, edgesFmt, maxEdges); - return count_visitor(t::neighborsVisitor, srcNodeId); - } - - public int count_visit_nodes(String direction, String edgesFmt, String src, long maxEdges) { - long srcNodeId = graph.getNodeId(new SWHID(src)); - Traversal t = new Traversal(graph.copy(), direction, edgesFmt, maxEdges); - return count_visitor(t::visitNodesVisitor, srcNodeId); - } - - public QueryHandler get_handler(String clientFIFO) { - return new QueryHandler(graph.copy(), clientFIFO); - } - - private interface NodeCountVisitor { - void accept(long nodeId, Traversal.NodeIdConsumer consumer); - } - - public class QueryHandler { - Graph graph; - BufferedWriter out; - String clientFIFO; - - public QueryHandler(Graph graph, String clientFIFO) { - this.graph = graph; - this.clientFIFO = clientFIFO; - this.out = null; - } - - public void writeNode(SWHID swhid) { - try { - out.write(swhid.toString() + "\n"); - } catch (IOException e) { - throw new RuntimeException("Cannot write response to client: " + e); - } - } - - public void writeEdge(SWHID src, SWHID dst) { - try { - out.write(src.toString() + " " + dst.toString() + "\n"); - } catch (IOException e) { - throw new RuntimeException("Cannot write response to client: " + e); - } - } - - public void open() { - try { - FileOutputStream file = new FileOutputStream(this.clientFIFO); - this.out = new BufferedWriter(new OutputStreamWriter(file)); - } catch (IOException e) { - throw new RuntimeException("Cannot open client FIFO: " + e); - } - } - - public void close() { - try { - out.close(); - } catch (IOException e) { - throw new 
RuntimeException("Cannot write response to client: " + e); - } - } - - public void leaves(String direction, String edgesFmt, String src, long maxEdges, String returnTypes) { - long srcNodeId = graph.getNodeId(new SWHID(src)); - open(); - Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges, returnTypes); - for (Long nodeId : t.leaves(srcNodeId)) { - writeNode(graph.getSWHID(nodeId)); - } - close(); - } - - public void neighbors(String direction, String edgesFmt, String src, long maxEdges, String returnTypes) { - long srcNodeId = graph.getNodeId(new SWHID(src)); - open(); - Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges, returnTypes); - for (Long nodeId : t.neighbors(srcNodeId)) { - writeNode(graph.getSWHID(nodeId)); - } - close(); - } - - public void visit_nodes(String direction, String edgesFmt, String src, long maxEdges, String returnTypes) { - long srcNodeId = graph.getNodeId(new SWHID(src)); - open(); - Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges, returnTypes); - for (Long nodeId : t.visitNodes(srcNodeId)) { - writeNode(graph.getSWHID(nodeId)); - } - close(); - } - - public void visit_edges(String direction, String edgesFmt, String src, long maxEdges, String returnTypes) { - long srcNodeId = graph.getNodeId(new SWHID(src)); - open(); - Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges); - t.visitNodesVisitor(srcNodeId, null, (srcId, dstId) -> { - writeEdge(graph.getSWHID(srcId), graph.getSWHID(dstId)); - }); - close(); - } - - public void walk(String direction, String edgesFmt, String algorithm, String src, String dst, long maxEdges, - String returnTypes) { - long srcNodeId = graph.getNodeId(new SWHID(src)); - open(); - ArrayList res; - Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges, returnTypes); - if (dst.matches("ori|snp|rel|rev|dir|cnt")) { - Node.Type dstType = Node.Type.fromStr(dst); - res = t.walk(srcNodeId, dstType, algorithm); - } else { - long dstNodeId = graph.getNodeId(new SWHID(dst)); - res = t.walk(srcNodeId, dstNodeId, algorithm); - } - for (Long nodeId : res) { - writeNode(graph.getSWHID(nodeId)); - } - close(); - } - - public void random_walk(String direction, String edgesFmt, int retries, String src, String dst, long maxEdges, - String returnTypes) { - long srcNodeId = graph.getNodeId(new SWHID(src)); - open(); - ArrayList res; - Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges, returnTypes); - if (dst.matches("ori|snp|rel|rev|dir|cnt")) { - Node.Type dstType = Node.Type.fromStr(dst); - res = t.randomWalk(srcNodeId, dstType, retries); - } else { - long dstNodeId = graph.getNodeId(new SWHID(dst)); - res = t.randomWalk(srcNodeId, dstNodeId, retries); - } - for (Long nodeId : res) { - writeNode(graph.getSWHID(nodeId)); - } - close(); - } - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/Graph.java b/java/src/main/java/org/softwareheritage/graph/Graph.java deleted file mode 100644 index 8d9acf1..0000000 --- a/java/src/main/java/org/softwareheritage/graph/Graph.java +++ /dev/null @@ -1,304 +0,0 @@ -package org.softwareheritage.graph; - -import it.unimi.dsi.big.webgraph.ImmutableGraph; -import it.unimi.dsi.big.webgraph.LazyLongIterator; -import it.unimi.dsi.logging.ProgressLogger; -import org.softwareheritage.graph.maps.NodeIdMap; -import org.softwareheritage.graph.maps.NodeTypesMap; - -import java.io.IOException; - -/** - * Main class storing the compressed graph and node id mappings. - *

- * The compressed graph is stored using the WebGraph - * ecosystem. Additional mappings are necessary because Software Heritage uses string based persistent - * identifiers (SWHID) while WebGraph uses integers internally. These two mappings (long id - * ↔ SWHID) are used for the input (users refer to the graph using SWHID) and the output - * (convert back to SWHID for users results). However, since graph traversal can be restricted - * depending on the node type (see {@link AllowedEdges}), a long id → node type map is stored - * as well to avoid a full SWHID lookup. - * - * @author The Software Heritage developers - * @see org.softwareheritage.graph.AllowedEdges - * @see org.softwareheritage.graph.maps.NodeIdMap - * @see org.softwareheritage.graph.maps.NodeTypesMap - */ - -public class Graph extends ImmutableGraph { - /** - * Bidirectional graph containing two compressed {@link it.unimi.dsi.big.webgraph.BVGraph} one for - * each direction - */ - BidirectionalImmutableGraph graph; - - /** Path and basename of the compressed graph */ - String path; - /** Mapping long id ↔ SWHIDs */ - NodeIdMap nodeIdMap; - /** Mapping long id → node types */ - NodeTypesMap nodeTypesMap; - - /** - * Constructor. - * - * @param path path and basename of the compressed graph to load - */ - - private Graph(String path) throws IOException { - loadInternal(path, null, LoadMethod.MAPPED); - } - - /** - * Loading mechanisms - */ - - enum LoadMethod { - MEMORY, MAPPED, OFFLINE, - } - - protected Graph loadInternal(String path, ProgressLogger pl, LoadMethod method) throws IOException { - this.path = path; - ImmutableGraph direct = null; - ImmutableGraph transposed = null; - if (method == LoadMethod.MEMORY) { - direct = ImmutableGraph.load(path, pl); - transposed = ImmutableGraph.load(path + "-transposed", pl); - } else if (method == LoadMethod.MAPPED) { - direct = ImmutableGraph.load(path, pl); - transposed = ImmutableGraph.loadMapped(path + "-transposed", pl); - } else if (method == LoadMethod.OFFLINE) { - direct = ImmutableGraph.loadOffline(path, pl); - transposed = ImmutableGraph.loadOffline(path + "-transposed", pl); - } - this.graph = new BidirectionalImmutableGraph(direct, transposed); - this.nodeTypesMap = new NodeTypesMap(path); - this.nodeIdMap = new NodeIdMap(path, numNodes()); - return this; - } - - protected Graph() { - } - - public static Graph load(String path, ProgressLogger pl) throws IOException { - return new Graph().loadInternal(path, pl, LoadMethod.MEMORY); - } - - public static Graph loadMapped(String path, ProgressLogger pl) throws IOException { - return new Graph().loadInternal(path, pl, LoadMethod.MAPPED); - } - - public static Graph loadOffline(String path, ProgressLogger pl) throws IOException { - return new Graph().loadInternal(path, null, LoadMethod.OFFLINE); - } - - public static Graph load(String path) throws IOException { - return new Graph().loadInternal(path, null, LoadMethod.MEMORY); - } - - public static Graph loadMapped(String path) throws IOException { - return new Graph().loadInternal(path, null, LoadMethod.MAPPED); - } - - public static Graph loadOffline(String path) throws IOException { - return new Graph().loadInternal(path, null, LoadMethod.OFFLINE); - } - - /** - * Constructor used for copy() - */ - protected Graph(BidirectionalImmutableGraph graph, String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) { - this.graph = graph; - this.path = path; - this.nodeIdMap = nodeIdMap; - this.nodeTypesMap = nodeTypesMap; - } - - /** - * Return a flyweight copy of the graph. 
- */ - @Override - public Graph copy() { - return new Graph(this.graph.copy(), this.path, this.nodeIdMap, this.nodeTypesMap); - } - - @Override - public boolean randomAccess() { - return graph.randomAccess(); - } - - /** - * Return a transposed version of the graph. - */ - public Graph transpose() { - return new Graph(this.graph.transpose(), this.path, this.nodeIdMap, this.nodeTypesMap); - } - - /** - * Return a symmetric version of the graph. - */ - public Graph symmetrize() { - return new Graph(this.graph.symmetrize(), this.path, this.nodeIdMap, this.nodeTypesMap); - } - - /** - * Cleans up graph resources after use. - */ - public void cleanUp() throws IOException { - nodeIdMap.close(); - } - - /** - * Returns number of nodes in the graph. - * - * @return number of nodes in the graph - */ - @Override - public long numNodes() { - return graph.numNodes(); - } - - /** - * Returns number of edges in the graph. - * - * @return number of edges in the graph - */ - @Override - public long numArcs() { - return graph.numArcs(); - } - - /** - * Returns lazy iterator of successors of a node. - * - * @param nodeId node specified as a long id - * @return lazy iterator of successors of the node, specified as a - * WebGraph LazyLongIterator - */ - @Override - public LazyLongIterator successors(long nodeId) { - return graph.successors(nodeId); - } - - /** - * Returns lazy iterator of successors of a node while following a specific set of edge types. - * - * @param nodeId node specified as a long id - * @param allowedEdges the specification of which edges can be traversed - * @return lazy iterator of successors of the node, specified as a - * WebGraph LazyLongIterator - */ - public LazyLongIterator successors(long nodeId, AllowedEdges allowedEdges) { - if (allowedEdges.restrictedTo == null) { - // All edges are allowed, bypass edge check - return this.successors(nodeId); - } else { - LazyLongIterator allSuccessors = this.successors(nodeId); - Graph thisGraph = this; - return new LazyLongIterator() { - @Override - public long nextLong() { - long neighbor; - while ((neighbor = allSuccessors.nextLong()) != -1) { - if (allowedEdges.isAllowed(thisGraph.getNodeType(nodeId), thisGraph.getNodeType(neighbor))) { - return neighbor; - } - } - return -1; - } - - @Override - public long skip(final long n) { - long i; - for (i = 0; i < n && nextLong() != -1; i++) - ; - return i; - } - }; - } - } - - /** - * Returns the outdegree of a node. - * - * @param nodeId node specified as a long id - * @return outdegree of a node - */ - @Override - public long outdegree(long nodeId) { - return graph.outdegree(nodeId); - } - - /** - * Returns lazy iterator of predecessors of a node. - * - * @param nodeId node specified as a long id - * @return lazy iterator of predecessors of the node, specified as a - * WebGraph LazyLongIterator - */ - public LazyLongIterator predecessors(long nodeId) { - return graph.predecessors(nodeId); - } - - /** - * Returns the indegree of a node. - * - * @param nodeId node specified as a long id - * @return indegree of a node - */ - public long indegree(long nodeId) { - return graph.indegree(nodeId); - } - - /** - * Returns the underlying BidirectionalImmutableGraph. - * - * @return WebGraph ImmutableGraph - */ - public ImmutableGraph getGraph() { - return this.graph; - } - - /** - * Returns the graph full path. - * - * @return graph full path - */ - public String getPath() { - return path; - } - - /** - * Converts {@link SWHID} node to long. 
- * - * @param swhid node specified as a {@link SWHID} - * @return internal long node id - * @see SWHID - */ - public long getNodeId(SWHID swhid) { - return nodeIdMap.getNodeId(swhid); - } - - /** - * Converts long id node to {@link SWHID}. - * - * @param nodeId node specified as a long id - * @return external SWHID - * @see SWHID - */ - public SWHID getSWHID(long nodeId) { - return nodeIdMap.getSWHID(nodeId); - } - - /** - * Returns node type. - * - * @param nodeId node specified as a long id - * @return corresponding node type - * @see org.softwareheritage.graph.Node.Type - */ - public Node.Type getNodeType(long nodeId) { - return nodeTypesMap.getType(nodeId); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/Node.java b/java/src/main/java/org/softwareheritage/graph/Node.java deleted file mode 100644 index e4a61d3..0000000 --- a/java/src/main/java/org/softwareheritage/graph/Node.java +++ /dev/null @@ -1,116 +0,0 @@ -package org.softwareheritage.graph; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * A node in the Software Heritage graph. - * - * @author The Software Heritage developers - */ - -public class Node { - /** - * Software Heritage graph node types, as described in the - * data model. - */ - public enum Type { - /** Content node */ - CNT, - /** Directory node */ - DIR, - /** Origin node */ - ORI, - /** Release node */ - REL, - /** Revision node */ - REV, - /** Snapshot node */ - SNP; - - /** - * Converts integer to corresponding SWH node type. - * - * @param intType node type represented as an integer - * @return the corresponding {@link Node.Type} value - * @see org.softwareheritage.graph.Node.Type - */ - public static Node.Type fromInt(int intType) { - switch (intType) { - case 0: - return CNT; - case 1: - return DIR; - case 2: - return ORI; - case 3: - return REL; - case 4: - return REV; - case 5: - return SNP; - } - return null; - } - - /** - * Converts node types to the corresponding int value - * - * @param type node type as an enum - * @return the corresponding int value - */ - public static int toInt(Node.Type type) { - switch (type) { - case CNT: - return 0; - case DIR: - return 1; - case ORI: - return 2; - case REL: - return 3; - case REV: - return 4; - case SNP: - return 5; - } - throw new IllegalArgumentException("Unknown node type: " + type); - } - - /** - * Converts string to corresponding SWH node type. - * - * @param strType node type represented as a string - * @return the corresponding {@link Node.Type} value - * @see org.softwareheritage.graph.Node.Type - */ - public static Node.Type fromStr(String strType) { - if (!strType.matches("cnt|dir|ori|rel|rev|snp")) { - throw new IllegalArgumentException("Unknown node type: " + strType); - } - return Node.Type.valueOf(strType.toUpperCase()); - } - - /** - * Parses SWH node type possible values from formatted string (see the - * API syntax). 
- * - * @param strFmtType node types represented as a formatted string - * @return a list containing the {@link Node.Type} values - * @see org.softwareheritage.graph.Node.Type - */ - public static ArrayList parse(String strFmtType) { - ArrayList types = new ArrayList<>(); - - if (strFmtType.equals("*")) { - List nodeTypes = Arrays.asList(Node.Type.values()); - types.addAll(nodeTypes); - } else { - types.add(Node.Type.fromStr(strFmtType)); - } - - return types; - } - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/NodesFiltering.java b/java/src/main/java/org/softwareheritage/graph/NodesFiltering.java deleted file mode 100644 index 3f3e7a3..0000000 --- a/java/src/main/java/org/softwareheritage/graph/NodesFiltering.java +++ /dev/null @@ -1,107 +0,0 @@ -package org.softwareheritage.graph; - -import java.util.ArrayList; - -/** - *

NodesFiltering

- *

- * Class that manages the filtering of nodes that have been returned after a visit of the graph. It is
- * parameterized by a string that represents either no filtering (*) or a set of node types.

- * - *
    - * - *
- * • graph/visit/nodes/swh:1:rel:0000000000000000000000000000000000000010 return_types==rev will
- * only return 'rev' nodes.
- *
- * • graph/visit/nodes/swh:1:rel:0000000000000000000000000000000000000010
- * return_types==rev,snp,cnt will only return 'rev', 'snp' and 'cnt' nodes.
- *
- * • graph/visit/nodes/swh:1:rel:0000000000000000000000000000000000000010 return_types==* will
- * return all the nodes.
- *
- * How to use NodesFiltering:
- *
- * {@code
- *  Long id1 = .... // graph.getNodeType(id1) == CNT
- *  Long id2 = .... // graph.getNodeType(id2) == SNP
- *  Long id3 = .... // graph.getNodeType(id3) == ORI
- *  ArrayList<Long> nodeIds = new ArrayList<>();
- *  nodeIds.add(id1); nodeIds.add(id2); nodeIds.add(id3);
- *
- *  NodesFiltering nds = new NodesFiltering("snp,ori"); // we allow only snp and ori node types to be shown
- *  System.out.println(nds.filterByNodeTypes(nodeIds,graph)); // will print id2, id3
- *
- *  nds = new NodesFiltering("*");
- *  System.out.println(nds.filterByNodeTypes(nodeIds,graph)); // will print id1, id2, id3
- *
- * }
- * 
- */ - -public class NodesFiltering { - - boolean restricted; - ArrayList allowedNodesTypes; - - /** - * Default constructor, in order to handle the * case (all types of nodes are allowed to be - * returned). allowedNodesTypes will contains [SNP,CNT....] all types of nodes. - * - */ - public NodesFiltering() { - restricted = false; - allowedNodesTypes = Node.Type.parse("*"); - } - - /** - * Constructor - * - * @param strTypes a formatted string describing the types of nodes we want to allow to be shown. - * - * NodesFilterind("cnt,snp") will set allowedNodesTypes to [CNT,SNP] - * - */ - public NodesFiltering(String strTypes) { - restricted = true; - allowedNodesTypes = new ArrayList(); - String[] types = strTypes.split(","); - for (String type : types) { - allowedNodesTypes.add(Node.Type.fromStr(type)); - } - } - - /** - * Check if the type given in parameter is in the list of allowed types. - * - * @param typ the type of the node. - */ - public boolean typeIsAllowed(Node.Type typ) { - return this.allowedNodesTypes.contains(typ); - } - - /** - *

- * The function that filters the returned nodes: we browse the list of nodes found during a visit and
- * create a new list with only the nodes whose type is contained in the list of
- * allowed types (allowedNodesTypes).
- *

- * - * @param nodeIds the nodes founded during the visit - * @param g the graph in order to find the types of nodes from their id in nodeIds - * @return a new list with the id of node which have a type in allowedTypes - * - * - */ - public ArrayList filterByNodeTypes(ArrayList nodeIds, Graph g) { - ArrayList filteredNodes = new ArrayList(); - for (Long node : nodeIds) { - if (this.typeIsAllowed(g.getNodeType(node))) { - filteredNodes.add(node); - } - } - return filteredNodes; - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/SWHID.java b/java/src/main/java/org/softwareheritage/graph/SWHID.java index 16aff83..3ccb90a 100644 --- a/java/src/main/java/org/softwareheritage/graph/SWHID.java +++ b/java/src/main/java/org/softwareheritage/graph/SWHID.java @@ -1,118 +1,125 @@ +/* + * Copyright (c) 2019 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import com.fasterxml.jackson.annotation.JsonValue; import org.apache.commons.codec.DecoderException; import org.apache.commons.codec.binary.Hex; /** * A Software Heritage persistent identifier (SWHID), see persistent * identifier documentation. * * @author The Software Heritage developers */ public class SWHID { /** Fixed hash length of the SWHID */ public static final int HASH_LENGTH = 40; /** Full SWHID as a string */ String swhid; /** SWHID node type */ - Node.Type type; + SwhType type; /** * Constructor. * * @param swhid full SWHID as a string */ public SWHID(String swhid) { this.swhid = swhid; // SWHID format: 'swh:1:type:hash' String[] parts = swhid.split(":"); if (parts.length != 4 || !parts[0].equals("swh") || !parts[1].equals("1")) { throw new IllegalArgumentException("malformed SWHID: " + swhid); } - this.type = Node.Type.fromStr(parts[2]); + this.type = SwhType.fromStr(parts[2]); if (!parts[3].matches("[0-9a-f]{" + HASH_LENGTH + "}")) { throw new IllegalArgumentException("malformed SWHID: " + swhid); } } /** * Creates a SWHID from a compact binary representation. *

* The binary format is specified in the Python module swh.graph.swhid:str_to_bytes . */ public static SWHID fromBytes(byte[] input) { byte[] digest = new byte[20]; System.arraycopy(input, 2, digest, 0, digest.length); - String swhidStr = String.format("swh:%d:%s:%s", input[0], Node.Type.fromInt(input[1]).toString().toLowerCase(), + String swhidStr = String.format("swh:%d:%s:%s", input[0], SwhType.fromInt(input[1]).toString().toLowerCase(), Hex.encodeHexString(digest)); return new SWHID(swhidStr); } @Override public boolean equals(Object otherObj) { if (otherObj == this) return true; if (!(otherObj instanceof SWHID)) return false; SWHID other = (SWHID) otherObj; return swhid.equals(other.getSWHID()); } @Override public int hashCode() { return swhid.hashCode(); } @Override public String toString() { return swhid; } /** * Converts SWHID to a compact binary representation. *

* The binary format is specified in the Python module swh.graph.swhid:str_to_bytes . */ public byte[] toBytes() { byte[] bytes = new byte[22]; byte[] digest; bytes[0] = (byte) 1; // namespace version - bytes[1] = (byte) Node.Type.toInt(this.type); // SWHID type + bytes[1] = (byte) SwhType.toInt(this.type); // SWHID type try { digest = Hex.decodeHex(this.swhid.substring(10)); // SHA1 hash System.arraycopy(digest, 0, bytes, 2, digest.length); } catch (DecoderException e) { throw new IllegalArgumentException("invalid hex sequence in SWHID: " + this.swhid); } return bytes; } /** * Returns full SWHID as a string. * * @return full SWHID string */ @JsonValue public String getSWHID() { return swhid; } /** * Returns SWHID node type. * - * @return SWHID corresponding {@link Node.Type} - * @see org.softwareheritage.graph.Node.Type + * @return SWHID corresponding {@link SwhType} + * @see SwhType */ - public Node.Type getType() { + public SwhType getType() { return type; } } diff --git a/java/src/main/java/org/softwareheritage/graph/Stats.java b/java/src/main/java/org/softwareheritage/graph/Stats.java deleted file mode 100644 index 1c1cb0f..0000000 --- a/java/src/main/java/org/softwareheritage/graph/Stats.java +++ /dev/null @@ -1,67 +0,0 @@ -package org.softwareheritage.graph; - -import java.io.FileInputStream; -import java.io.IOException; -import java.util.Properties; - -/** - * Statistics on the compressed graph. - *

- * These statistics are not computed but directly read from - * WebGraph generated .stats and .properties files. - * - * @author The Software Heritage developers - */ - -public class Stats { - public Counts counts; - public Ratios ratios; - public Degree indegree; - public Degree outdegree; - /** - * Constructor. - * - * @param graphPath path and basename of compressed graph - */ - public Stats(String graphPath) throws IOException { - Properties properties = new Properties(); - properties.load(new FileInputStream(graphPath + ".properties")); - properties.load(new FileInputStream(graphPath + ".stats")); - - this.counts = new Counts(); - this.ratios = new Ratios(); - this.indegree = new Degree(); - this.outdegree = new Degree(); - - this.counts.nodes = Long.parseLong(properties.getProperty("nodes")); - this.counts.edges = Long.parseLong(properties.getProperty("arcs")); - this.ratios.compression = Double.parseDouble(properties.getProperty("compratio")); - this.ratios.bitsPerNode = Double.parseDouble(properties.getProperty("bitspernode")); - this.ratios.bitsPerEdge = Double.parseDouble(properties.getProperty("bitsperlink")); - this.ratios.avgLocality = Double.parseDouble(properties.getProperty("avglocality")); - this.indegree.min = Long.parseLong(properties.getProperty("minindegree")); - this.indegree.max = Long.parseLong(properties.getProperty("maxindegree")); - this.indegree.avg = Double.parseDouble(properties.getProperty("avgindegree")); - this.outdegree.min = Long.parseLong(properties.getProperty("minoutdegree")); - this.outdegree.max = Long.parseLong(properties.getProperty("maxoutdegree")); - this.outdegree.avg = Double.parseDouble(properties.getProperty("avgoutdegree")); - } - - public static class Counts { - public long nodes; - public long edges; - } - - public static class Ratios { - public double compression; - public double bitsPerNode; - public double bitsPerEdge; - public double avgLocality; - } - - public static class Degree { - public long min; - public long max; - public double avg; - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/Subgraph.java b/java/src/main/java/org/softwareheritage/graph/Subgraph.java index 3e7e7fd..9cafc0b 100644 --- a/java/src/main/java/org/softwareheritage/graph/Subgraph.java +++ b/java/src/main/java/org/softwareheritage/graph/Subgraph.java @@ -1,224 +1,231 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.NodeIterator; import java.util.NoSuchElementException; public class Subgraph extends ImmutableGraph { - private final Graph underlyingGraph; + private final SwhBidirectionalGraph underlyingGraph; public final AllowedNodes allowedNodeTypes; private long nodeCount = -1; /** * Constructor. * */ - public Subgraph(Graph underlyingGraph, AllowedNodes allowedNodeTypes) { + public Subgraph(SwhBidirectionalGraph underlyingGraph, AllowedNodes allowedNodeTypes) { this.underlyingGraph = underlyingGraph.copy(); this.allowedNodeTypes = allowedNodeTypes; } /** * Return a flyweight copy of the graph. 
*/ @Override public Subgraph copy() { return new Subgraph(this.underlyingGraph.copy(), allowedNodeTypes); } @Override public boolean randomAccess() { return underlyingGraph.randomAccess(); } /** * Return a transposed version of the graph. */ public Subgraph transpose() { return new Subgraph(underlyingGraph.transpose(), allowedNodeTypes); } /** * Return a symmetric version of the graph. */ public Subgraph symmetrize() { return new Subgraph(underlyingGraph.symmetrize(), allowedNodeTypes); } /** * Returns number of nodes in the graph. * * @return number of nodes in the graph */ @Override public long numNodes() { if (nodeCount == -1) { for (long i = 0; i < underlyingGraph.numNodes(); ++i) { if (nodeExists(i)) ++nodeCount; } } return nodeCount; } /** * Returns number of edges in the graph. * * @return number of edges in the graph */ @Override public long numArcs() { throw new UnsupportedOperationException("Cannot determine the number of arcs in a subgraph"); } public long maxNodeNumber() { return underlyingGraph.numNodes(); } public boolean nodeExists(long node) { return allowedNodeTypes.isAllowed(underlyingGraph.getNodeType(node)); } /** * Returns lazy iterator of successors of a node. * * @param nodeId node specified as a long id * @return lazy iterator of successors of the node, specified as a * WebGraph LazyLongIterator */ @Override public LazyLongIterator successors(long nodeId) { if (!nodeExists(nodeId)) { throw new IllegalArgumentException("Node " + nodeId + " not in subgraph"); } LazyLongIterator allSuccessors = underlyingGraph.successors(nodeId); return new LazyLongIterator() { @Override public long nextLong() { long neighbor; while ((neighbor = allSuccessors.nextLong()) != -1) { if (nodeExists(neighbor)) { return neighbor; } } return -1; } @Override public long skip(final long n) { long i; for (i = 0; i < n && nextLong() != -1; i++) ; return i; } }; } /** * Returns the outdegree of a node. * * @param nodeId node specified as a long id * @return outdegree of a node */ @Override public long outdegree(long nodeId) { long deg = 0; for (LazyLongIterator allSuccessors = successors(nodeId); allSuccessors.nextLong() != -1; ++deg) ; return deg; } @Override public NodeIterator nodeIterator() { return new NodeIterator() { final long n = numNodes(); long i = -1; long done = 0; @Override public boolean hasNext() { return done <= n; } @Override public long nextLong() { if (!hasNext()) throw new NoSuchElementException(); do { ++i; if (i >= underlyingGraph.numNodes()) throw new NoSuchElementException(); } while (!nodeExists(i)); ++done; return i; } @Override public long outdegree() { return Subgraph.this.outdegree(i); } @Override public LazyLongIterator successors() { return Subgraph.this.successors(i); } }; } /** * Returns lazy iterator of predecessors of a node. * * @param nodeId node specified as a long id * @return lazy iterator of predecessors of the node, specified as a * WebGraph LazyLongIterator */ public LazyLongIterator predecessors(long nodeId) { return this.transpose().successors(nodeId); } /** * Returns the indegree of a node. * * @param nodeId node specified as a long id * @return indegree of a node */ public long indegree(long nodeId) { return this.transpose().outdegree(nodeId); } /** * Converts {@link SWHID} node to long. * * @param swhid node specified as a {@link SWHID} * @return internal long node id * @see SWHID */ public long getNodeId(SWHID swhid) { return underlyingGraph.getNodeId(swhid); } /** * Converts long id node to {@link SWHID}. 
* * @param nodeId node specified as a long id * @return external SWHID * @see SWHID */ public SWHID getSWHID(long nodeId) { return underlyingGraph.getSWHID(nodeId); } /** * Returns node type. * * @param nodeId node specified as a long id * @return corresponding node type - * @see Node.Type + * @see SwhType */ - public Node.Type getNodeType(long nodeId) { + public SwhType getNodeType(long nodeId) { return underlyingGraph.getNodeType(nodeId); } } diff --git a/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java new file mode 100644 index 0000000..04b2a8c --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2021-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph; + +import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; +import it.unimi.dsi.big.webgraph.BidirectionalImmutableGraph; +import it.unimi.dsi.logging.ProgressLogger; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Class representing the compressed Software Heritage graph in both directions (forward and + * backward). + * + * This class uses the {@link BidirectionalImmutableGraph} class internally to implement the + * backward equivalent of graph operations ({@link SwhBidirectionalGraph#indegree(long)}, + * {@link SwhBidirectionalGraph#predecessors(long)}, etc.) by holding a reference to two + * {@link SwhUnidirectionalGraph} (a forward graph and a backward graph). + * + * Both graphs share their graph properties in memory by storing references to the same + * {@link SwhGraphProperties} object. + * + *

+ *                 ┌──────────────┐
+ *                 │ImmutableGraph◄────────┐
+ *                 └────▲─────────┘        │extends
+ *                      │                  │
+ *                      │       ┌──────────┴────────────────┐
+ *               extends│       │BidirectionalImmutableGraph│
+ *                      │       └────────────▲──────────────┘
+ *                      │                    │extends
+ *       ┌──────────────┴───────┐     ┌──────┴──────────────┐
+ *       │SwhUnidirectionalGraph│◄────┤SwhBidirectionalGraph│
+ *       └──┬──────────────┬────┘     └────────┬───────────┬┘
+ *          │              │    contains x2    │           │
+ *          │              │                   │           │
+ *          │    implements│                   │implements │
+ *          │             ┌▼──────────┐        │           │
+ *          │             │SwhGraph(I)◄────────┘           │
+ * contains │             └───────────┘                    │contains
+ *          │                                              │
+ *          │            ┌──────────────────┐              │
+ *          └────────────►SwhGraphProperties◄──────────────┘
+ *                       └──────────────────┘
+ * 
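+ *
+ * A minimal usage sketch (the basename and the SWHID are placeholders):
+ *
+ * {@code
+ * SwhBidirectionalGraph g = SwhBidirectionalGraph.loadMapped("compressed/graph");
+ * long nodeId = g.getNodeId(new SWHID("swh:1:rev:0000000000000000000000000000000000000009"));
+ * LazyLongIterator it = g.predecessors(nodeId); // traverse the backward direction
+ * long pred;
+ * while ((pred = it.nextLong()) != -1) {
+ *     System.out.println(g.getSWHID(pred));
+ * }
+ * }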
+ * + * @author The Software Heritage developers + * @see SwhUnidirectionalGraph + */ + +public class SwhBidirectionalGraph extends BidirectionalImmutableGraph implements SwhGraph { + /** Property data of the graph (id/type mappings etc.) */ + public final SwhGraphProperties properties; + + private final SwhUnidirectionalGraph forwardGraph; + private final SwhUnidirectionalGraph backwardGraph; + + public SwhBidirectionalGraph(SwhUnidirectionalGraph forwardGraph, SwhUnidirectionalGraph backwardGraph, + SwhGraphProperties properties) { + super(forwardGraph, backwardGraph); + this.forwardGraph = forwardGraph; + this.backwardGraph = backwardGraph; + this.properties = properties; + } + + private SwhBidirectionalGraph(BidirectionalImmutableGraph graph, SwhGraphProperties properties) { + super(graph.forward, graph.backward); + this.forwardGraph = new SwhUnidirectionalGraph(graph.forward, properties); + this.backwardGraph = new SwhUnidirectionalGraph(graph.backward, properties); + this.properties = properties; + } + + public static SwhBidirectionalGraph load(LoadMethod method, String path, InputStream is, ProgressLogger pl) + throws IOException { + SwhUnidirectionalGraph forward = SwhUnidirectionalGraph.loadGraphOnly(method, path, is, pl); + SwhUnidirectionalGraph backward = SwhUnidirectionalGraph.loadGraphOnly(method, path + "-transposed", is, pl); + SwhGraphProperties properties = SwhGraphProperties.load(path); + forward.setProperties(properties); + backward.setProperties(properties); + return new SwhBidirectionalGraph(forward, backward, properties); + } + + public static SwhBidirectionalGraph loadLabelled(LoadMethod method, String path, InputStream is, ProgressLogger pl) + throws IOException { + SwhUnidirectionalGraph forward = SwhUnidirectionalGraph.loadLabelledGraphOnly(method, path, is, pl); + SwhUnidirectionalGraph backward = SwhUnidirectionalGraph.loadLabelledGraphOnly(method, path + "-transposed", is, + pl); + SwhGraphProperties properties = SwhGraphProperties.load(path); + forward.setProperties(properties); + backward.setProperties(properties); + return new SwhBidirectionalGraph(forward, backward, properties); + } + + // loadXXX methods from ImmutableGraph + public static SwhBidirectionalGraph load(String path, ProgressLogger pl) throws IOException { + return load(LoadMethod.STANDARD, path, null, pl); + } + public static SwhBidirectionalGraph load(String path) throws IOException { + return load(LoadMethod.STANDARD, path, null, null); + } + public static SwhBidirectionalGraph loadMapped(String path, ProgressLogger pl) throws IOException { + return load(LoadMethod.MAPPED, path, null, pl); + } + public static SwhBidirectionalGraph loadMapped(String path) throws IOException { + return load(LoadMethod.MAPPED, path, null, null); + } + public static SwhBidirectionalGraph loadOffline(String path, ProgressLogger pl) throws IOException { + return load(LoadMethod.OFFLINE, path, null, pl); + } + public static SwhBidirectionalGraph loadOffline(String path) throws IOException { + return load(LoadMethod.OFFLINE, path, null, null); + } + + // Labelled versions of the loadXXX methods from ImmutableGraph + public static SwhBidirectionalGraph loadLabelled(String path, ProgressLogger pl) throws IOException { + return loadLabelled(LoadMethod.STANDARD, path, null, pl); + } + public static SwhBidirectionalGraph loadLabelled(String path) throws IOException { + return loadLabelled(LoadMethod.STANDARD, path, null, null); + } + public static SwhBidirectionalGraph loadLabelledMapped(String path, ProgressLogger pl) 
throws IOException { + return loadLabelled(LoadMethod.MAPPED, path, null, pl); + } + public static SwhBidirectionalGraph loadLabelledMapped(String path) throws IOException { + return loadLabelled(LoadMethod.MAPPED, path, null, null); + } + public static SwhBidirectionalGraph loadLabelledOffline(String path, ProgressLogger pl) throws IOException { + return loadLabelled(LoadMethod.OFFLINE, path, null, pl); + } + public static SwhBidirectionalGraph loadLabelledOffline(String path) throws IOException { + return loadLabelled(LoadMethod.OFFLINE, path, null, null); + } + + @Override + public SwhBidirectionalGraph copy() { + return new SwhBidirectionalGraph(forwardGraph.copy(), backwardGraph.copy(), this.properties); + } + + @Override + public SwhBidirectionalGraph transpose() { + return new SwhBidirectionalGraph(super.transpose(), this.properties); + } + + @Override + public SwhBidirectionalGraph symmetrize() { + return new SwhBidirectionalGraph(super.symmetrize(), this.properties); + } + + public SwhUnidirectionalGraph getForwardGraph() { + return this.forwardGraph; + } + + public SwhUnidirectionalGraph getBackwardGraph() { + return this.backwardGraph; + } + + /** + * Returns a *labelled* lazy iterator over the successors of a given node. The iteration terminates + * when -1 is returned. + */ + public ArcLabelledNodeIterator.LabelledArcIterator labelledSuccessors(long x) { + return forwardGraph.labelledSuccessors(x); + } + + /** + * Returns a *labelled* lazy iterator over the predecessors of a given node. The iteration + * terminates when -1 is returned. + */ + public ArcLabelledNodeIterator.LabelledArcIterator labelledPredecessors(long x) { + return backwardGraph.labelledSuccessors(x); + } + + public void close() throws IOException { + this.properties.close(); + } + + @Override + public SwhGraphProperties getProperties() { + return properties; + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java new file mode 100644 index 0000000..aee50cd --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2021-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph; + +import java.io.IOException; + +/** + * Common interface for SWH graph classes. + * + * This interface forwards all property loading/access methods to the SwhGraphProperties object + * returned by the getProperties() method of the implementing class. This allows API users to write + * graph.getNodeType() instead of graph.getProperties().getNodeType(). + */ +public interface SwhGraph { + /** + * Cleans up graph resources after use. + */ + void close() throws IOException; + + /** + * Returns the SWH graph properties object of this graph. 
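+     *
+     * <p>
+     * A minimal illustrative sketch (given some {@code SwhGraph graph}; the node id 42 is
+     * hypothetical): since the default methods of this interface forward to the returned
+     * object, the two calls below are equivalent:
+     *
+     * <pre>{@code
+     * SwhType t1 = graph.getNodeType(42);
+     * SwhType t2 = graph.getProperties().getNodeType(42);
+     * }</pre>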
+ * + * @return graph properties + */ + SwhGraphProperties getProperties(); + + /** @see SwhGraphProperties#getPath() */ + default String getPath() { + return getProperties().getPath(); + } + + /** @see SwhGraphProperties#getNodeId(SWHID) */ + default long getNodeId(SWHID swhid) { + return getProperties().getNodeId(swhid); + } + + /** @see SwhGraphProperties#getSWHID(long) */ + default SWHID getSWHID(long nodeId) { + return getProperties().getSWHID(nodeId); + } + + /** @see SwhGraphProperties#getNodeType(long) */ + default SwhType getNodeType(long nodeId) { + return getProperties().getNodeType(nodeId); + } + + /** @see SwhGraphProperties#loadContentLength() */ + default void loadContentLength() throws IOException { + getProperties().loadContentLength(); + } + + /** @see SwhGraphProperties#getContentLength(long) */ + default Long getContentLength(long nodeId) { + return getProperties().getContentLength(nodeId); + } + + /** @see SwhGraphProperties#loadPersonIds() */ + default void loadPersonIds() throws IOException { + getProperties().loadPersonIds(); + } + + /** @see SwhGraphProperties#getAuthorId(long) */ + default Long getAuthorId(long nodeId) { + return getProperties().getAuthorId(nodeId); + } + + /** @see SwhGraphProperties#getCommitterId(long) */ + default Long getCommitterId(long nodeId) { + return getProperties().getCommitterId(nodeId); + } + + /** @see SwhGraphProperties#loadContentIsSkipped() */ + default void loadContentIsSkipped() throws IOException { + getProperties().loadContentIsSkipped(); + } + + /** @see SwhGraphProperties#isContentSkipped(long) */ + default boolean isContentSkipped(long nodeId) { + return getProperties().isContentSkipped(nodeId); + } + + /** @see SwhGraphProperties#loadAuthorTimestamps() */ + default void loadAuthorTimestamps() throws IOException { + getProperties().loadAuthorTimestamps(); + } + + /** @see SwhGraphProperties#getAuthorTimestamp(long) */ + default Long getAuthorTimestamp(long nodeId) { + return getProperties().getAuthorTimestamp(nodeId); + } + + /** @see SwhGraphProperties#getAuthorTimestampOffset(long) */ + default Short getAuthorTimestampOffset(long nodeId) { + return getProperties().getAuthorTimestampOffset(nodeId); + } + + /** @see SwhGraphProperties#loadCommitterTimestamps() */ + default void loadCommitterTimestamps() throws IOException { + getProperties().loadCommitterTimestamps(); + } + + /** @see SwhGraphProperties#getCommitterTimestamp(long) */ + default Long getCommitterTimestamp(long nodeId) { + return getProperties().getCommitterTimestamp(nodeId); + } + + /** @see SwhGraphProperties#getCommitterTimestampOffset(long) */ + default Short getCommitterTimestampOffset(long nodeId) { + return getProperties().getCommitterTimestampOffset(nodeId); + } + + /** @see SwhGraphProperties#loadMessages() */ + default void loadMessages() throws IOException { + getProperties().loadMessages(); + } + + /** @see SwhGraphProperties#getMessage(long) */ + default byte[] getMessage(long nodeId) { + return getProperties().getMessage(nodeId); + } + + /** @see SwhGraphProperties#getUrl(long) */ + default String getUrl(long nodeId) { + return getProperties().getUrl(nodeId); + } + + /** @see SwhGraphProperties#loadTagNames() */ + default void loadTagNames() throws IOException { + getProperties().loadTagNames(); + } + + /** @see SwhGraphProperties#getTagName(long) */ + default byte[] getTagName(long nodeId) { + return getProperties().getTagName(nodeId); + } + + /** @see SwhGraphProperties#loadLabelNames() */ + default void loadLabelNames() throws IOException { + 
getProperties().loadLabelNames();
+    }
+
+    /** @see SwhGraphProperties#getLabelName(long) */
+    default byte[] getLabelName(long labelId) {
+        return getProperties().getLabelName(labelId);
+    }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
new file mode 100644
index 0000000..3372947
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2021-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph;
+
+import it.unimi.dsi.big.util.MappedFrontCodedStringBigList;
+import it.unimi.dsi.bits.LongArrayBitVector;
+import it.unimi.dsi.fastutil.bytes.ByteBigList;
+import it.unimi.dsi.fastutil.bytes.ByteMappedBigList;
+import it.unimi.dsi.fastutil.ints.IntBigList;
+import it.unimi.dsi.fastutil.ints.IntMappedBigList;
+import it.unimi.dsi.fastutil.io.BinIO;
+import it.unimi.dsi.fastutil.longs.LongBigList;
+import it.unimi.dsi.fastutil.longs.LongMappedBigList;
+import it.unimi.dsi.fastutil.shorts.ShortBigList;
+import it.unimi.dsi.fastutil.shorts.ShortMappedBigList;
+import it.unimi.dsi.sux4j.util.EliasFanoLongBigList;
+import org.apache.commons.configuration2.ex.ConfigurationException;
+import org.softwareheritage.graph.maps.NodeIdMap;
+import org.softwareheritage.graph.maps.NodeTypesMap;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.Base64;
+
+/**
+ * This object contains SWH graph properties such as node labels.
+ *
+ * Some property mappings are necessary because Software Heritage uses string-based persistent
+ * identifiers (SWHID) while WebGraph uses integers internally.
+ *
+ * The two node ID mappings (long id ↔ SWHID) are used for the input (users refer to the graph
+ * using SWHID) and the output (convert back to SWHID for user results).
+ *
+ * Since graph traversal can be restricted depending on the node type (see {@link AllowedEdges}), a
+ * long id → node type map is stored as well to avoid a full SWHID lookup.
+ *
+ * @see NodeIdMap
+ * @see NodeTypesMap
+ */
+public class SwhGraphProperties {
+    private final String path;
+
+    private final NodeIdMap nodeIdMap;
+    private final NodeTypesMap nodeTypesMap;
+    private LongBigList authorTimestamp;
+    private ShortBigList authorTimestampOffset;
+    private LongBigList committerTimestamp;
+    private ShortBigList committerTimestampOffset;
+    private LongBigList contentLength;
+    private LongArrayBitVector contentIsSkipped;
+    private IntBigList authorId;
+    private IntBigList committerId;
+    private ByteBigList messageBuffer;
+    private LongBigList messageOffsets;
+    private ByteBigList tagNameBuffer;
+    private LongBigList tagNameOffsets;
+    private MappedFrontCodedStringBigList edgeLabelNames;
+
+    protected SwhGraphProperties(String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) {
+        this.path = path;
+        this.nodeIdMap = nodeIdMap;
+        this.nodeTypesMap = nodeTypesMap;
+    }
+
+    public static SwhGraphProperties load(String path) throws IOException {
+        return new SwhGraphProperties(path, new NodeIdMap(path), new NodeTypesMap(path));
+    }
+
+    /**
+     * Cleans up resources after use.
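+     *
+     * <p>
+     * Sketch of the intended load/use/close lifecycle (the basename and node id below are
+     * hypothetical):
+     *
+     * <pre>{@code
+     * SwhGraphProperties props = SwhGraphProperties.load("/srv/graph/graph");
+     * props.loadMessages();
+     * byte[] msg = props.getMessage(nodeId);
+     * props.close();
+     * }</pre>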
+ */ + public void close() throws IOException { + edgeLabelNames.close(); + } + + /** Return the basename of the compressed graph */ + public String getPath() { + return path; + } + + /** + * Converts {@link SWHID} node to long. + * + * @param swhid node specified as a {@link SWHID} + * @return internal long node id + * @see SWHID + */ + public long getNodeId(SWHID swhid) { + return nodeIdMap.getNodeId(swhid); + } + + /** + * Converts long id node to {@link SWHID}. + * + * @param nodeId node specified as a long id + * @return external SWHID + * @see SWHID + */ + public SWHID getSWHID(long nodeId) { + return nodeIdMap.getSWHID(nodeId); + } + + /** + * Returns node type. + * + * @param nodeId node specified as a long id + * @return corresponding node type + * @see SwhType + */ + public SwhType getNodeType(long nodeId) { + return nodeTypesMap.getType(nodeId); + } + + private static LongBigList loadMappedLongs(String path) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { + return LongMappedBigList.map(raf.getChannel()); + } + } + + private static IntBigList loadMappedInts(String path) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { + return IntMappedBigList.map(raf.getChannel()); + } + } + + private static ShortBigList loadMappedShorts(String path) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { + return ShortMappedBigList.map(raf.getChannel()); + } + } + + private static ByteBigList loadMappedBytes(String path) throws IOException { + try (RandomAccessFile raf = new RandomAccessFile(path, "r")) { + return ByteMappedBigList.map(raf.getChannel()); + } + } + + private static LongBigList loadEFLongs(String path) throws IOException { + try { + return (EliasFanoLongBigList) BinIO.loadObject(path); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } + + private static byte[] getLine(ByteBigList byteArray, long start) { + long end = start; + while (end < byteArray.size64() && byteArray.getByte(end) != '\n') { + end++; + } + int length = (int) (end - start); + byte[] buffer = new byte[length]; + byteArray.getElements(start, buffer, 0, length); + return buffer; + } + + /** Load the sizes of the content nodes */ + public void loadContentLength() throws IOException { + contentLength = loadMappedLongs(path + ".property.content.length.bin"); + } + + /** Get the size (in bytes) of the given content node */ + public Long getContentLength(long nodeId) { + if (contentLength == null) { + throw new IllegalStateException("Content lengths not loaded"); + } + long res = contentLength.getLong(nodeId); + return (res >= 0) ? res : null; + } + + /** Load the IDs of the persons (authors and committers) */ + public void loadPersonIds() throws IOException { + authorId = loadMappedInts(path + ".property.author_id.bin"); + committerId = loadMappedInts(path + ".property.committer_id.bin"); + } + + /** Get a unique integer ID representing the author of the given revision or release node */ + public Long getAuthorId(long nodeId) { + if (authorId == null) { + throw new IllegalStateException("Author IDs not loaded"); + } + long res = authorId.getInt(nodeId); + return (res >= 0) ? res : null; + } + + /** Get a unique integer ID representing the committer of the given revision node */ + public Long getCommitterId(long nodeId) { + if (committerId == null) { + throw new IllegalStateException("Committer IDs not loaded"); + } + long res = committerId.getInt(nodeId); + return (res >= 0) ? 
res : null; + } + + /** + * Loads a boolean array indicating whether the given content node was skipped during archive + * ingestion + */ + public void loadContentIsSkipped() throws IOException { + try { + contentIsSkipped = (LongArrayBitVector) BinIO.loadObject(path + ".property.content.is_skipped.bin"); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } + + /** Returns whether the given content node was skipped during archive ingestion */ + public boolean isContentSkipped(long nodeId) { + if (contentIsSkipped == null) { + throw new IllegalStateException("Skipped content array not loaded"); + } + return contentIsSkipped.getBoolean(nodeId); + } + + /** Load the timestamps at which the releases and revisions were authored */ + public void loadAuthorTimestamps() throws IOException { + authorTimestamp = loadMappedLongs(path + ".property.author_timestamp.bin"); + authorTimestampOffset = loadMappedShorts(path + ".property.author_timestamp_offset.bin"); + } + + /** Return the timestamp at which the given revision or release was authored */ + public Long getAuthorTimestamp(long nodeId) { + if (authorTimestamp == null) { + throw new IllegalStateException("Author timestamps not loaded"); + } + long res = authorTimestamp.getLong(nodeId); + return (res > Long.MIN_VALUE) ? res : null; + } + + /** Return the timestamp offset at which the given revision or release was authored */ + public Short getAuthorTimestampOffset(long nodeId) { + if (authorTimestampOffset == null) { + throw new IllegalStateException("Author timestamp offsets not loaded"); + } + short res = authorTimestampOffset.getShort(nodeId); + return (res > Short.MIN_VALUE) ? res : null; + } + + /** Load the timestamps at which the releases and revisions were committed */ + public void loadCommitterTimestamps() throws IOException { + committerTimestamp = loadMappedLongs(path + ".property.committer_timestamp.bin"); + committerTimestampOffset = loadMappedShorts(path + ".property.committer_timestamp_offset.bin"); + } + + /** Return the timestamp at which the given revision was committed */ + public Long getCommitterTimestamp(long nodeId) { + if (committerTimestamp == null) { + throw new IllegalStateException("Committer timestamps not loaded"); + } + long res = committerTimestamp.getLong(nodeId); + return (res > Long.MIN_VALUE) ? res : null; + } + + /** Return the timestamp offset at which the given revision was committed */ + public Short getCommitterTimestampOffset(long nodeId) { + if (committerTimestampOffset == null) { + throw new IllegalStateException("Committer timestamp offsets not loaded"); + } + short res = committerTimestampOffset.getShort(nodeId); + return (res > Short.MIN_VALUE) ? res : null; + } + + /** Load the revision messages, the release messages and the origin URLs */ + public void loadMessages() throws IOException { + messageBuffer = loadMappedBytes(path + ".property.message.bin"); + messageOffsets = loadMappedLongs(path + ".property.message.offset.bin"); + } + + /** Get the message of the given revision or release node */ + public byte[] getMessage(long nodeId) { + if (messageBuffer == null || messageOffsets == null) { + throw new IllegalStateException("Messages not loaded"); + } + long startOffset = messageOffsets.getLong(nodeId); + if (startOffset == -1) { + return null; + } + return Base64.getDecoder().decode(getLine(messageBuffer, startOffset)); + } + + /** Get the URL of the given origin node */ + public String getUrl(long nodeId) { + byte[] url = getMessage(nodeId); + return (url != null) ? 
new String(url) : null; + } + + /** Load the release names */ + public void loadTagNames() throws IOException { + tagNameBuffer = loadMappedBytes(path + ".property.tag_name.bin"); + tagNameOffsets = loadMappedLongs(path + ".property.tag_name.offset.bin"); + } + + /** Get the name of the given release node */ + public byte[] getTagName(long nodeId) { + if (tagNameBuffer == null || tagNameOffsets == null) { + throw new IllegalStateException("Tag names not loaded"); + } + long startOffset = tagNameOffsets.getLong(nodeId); + if (startOffset == -1) { + return null; + } + return Base64.getDecoder().decode(getLine(tagNameBuffer, startOffset)); + } + + /** Load the arc label names (directory entry names and snapshot branch names) */ + public void loadLabelNames() throws IOException { + try { + edgeLabelNames = MappedFrontCodedStringBigList.load(path + ".labels.fcl"); + } catch (ConfigurationException e) { + throw new IOException(e); + } + } + + /** + * Get the arc label name (either a directory entry name or snapshot branch name) associated with + * the given label ID + */ + public byte[] getLabelName(long labelId) { + if (edgeLabelNames == null) { + throw new IllegalStateException("Label names not loaded"); + } + return Base64.getDecoder().decode(edgeLabelNames.getArray(labelId)); + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/SwhPath.java b/java/src/main/java/org/softwareheritage/graph/SwhPath.java deleted file mode 100644 index 8693e02..0000000 --- a/java/src/main/java/org/softwareheritage/graph/SwhPath.java +++ /dev/null @@ -1,122 +0,0 @@ -package org.softwareheritage.graph; - -import com.fasterxml.jackson.annotation.JsonValue; - -import java.util.ArrayList; - -/** - * Wrapper class to store a list of {@link SWHID}. - * - * @author The Software Heritage developers - * @see SWHID - */ - -public class SwhPath { - /** Internal list of {@link SWHID} */ - ArrayList path; - - /** - * Constructor. - */ - public SwhPath() { - this.path = new ArrayList<>(); - } - - /** - * Constructor. - * - * @param swhids variable number of string SWHIDs to initialize this path with - */ - public SwhPath(String... swhids) { - this(); - for (String swhid : swhids) { - add(new SWHID(swhid)); - } - } - - /** - * Constructor. - * - * @param swhids variable number of {@link SWHID} to initialize this path with - * @see SWHID - */ - public SwhPath(SWHID... swhids) { - this(); - for (SWHID swhid : swhids) { - add(swhid); - } - } - - /** - * Returns this path as a list of {@link SWHID}. - * - * @return list of {@link SWHID} constituting the path - * @see SWHID - */ - @JsonValue - public ArrayList getPath() { - return path; - } - - /** - * Adds a {@link SWHID} to this path. - * - * @param swhid {@link SWHID} to add to this path - * @see SWHID - */ - public void add(SWHID swhid) { - path.add(swhid); - } - - /** - * Returns the {@link SWHID} at the specified position in this path. - * - * @param index position of the {@link SWHID} to return - * @return {@link SWHID} at the specified position - * @see SWHID - */ - public SWHID get(int index) { - return path.get(index); - } - - /** - * Returns the number of elements in this path. 
- * - * @return number of elements in this path - */ - public int size() { - return path.size(); - } - - @Override - public boolean equals(Object otherObj) { - if (otherObj == this) - return true; - if (!(otherObj instanceof SwhPath)) - return false; - - SwhPath other = (SwhPath) otherObj; - if (size() != other.size()) { - return false; - } - - for (int i = 0; i < size(); i++) { - SWHID thisSWHID = get(i); - SWHID otherSWHID = other.get(i); - if (!thisSWHID.equals(otherSWHID)) { - return false; - } - } - - return true; - } - - @Override - public String toString() { - StringBuilder str = new StringBuilder(); - for (SWHID swhid : path) { - str.append(swhid).append("/"); - } - return str.toString(); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/SwhType.java b/java/src/main/java/org/softwareheritage/graph/SwhType.java new file mode 100644 index 0000000..5578837 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/SwhType.java @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Software Heritage graph node types, as described in the + * data model. + */ +public enum SwhType { + /** Content node */ + CNT, + /** Directory node */ + DIR, + /** Origin node */ + ORI, + /** Release node */ + REL, + /** Revision node */ + REV, + /** Snapshot node */ + SNP; + + /** + * Converts integer to corresponding SWH node type. + * + * @param intType node type represented as an integer + * @return the corresponding {@link SwhType} value + * @see SwhType + */ + public static SwhType fromInt(int intType) { + switch (intType) { + case 0: + return CNT; + case 1: + return DIR; + case 2: + return ORI; + case 3: + return REL; + case 4: + return REV; + case 5: + return SNP; + } + return null; + } + + /** + * Converts node types to the corresponding int value + * + * @param type node type as an enum + * @return the corresponding int value + */ + public static int toInt(SwhType type) { + switch (type) { + case CNT: + return 0; + case DIR: + return 1; + case ORI: + return 2; + case REL: + return 3; + case REV: + return 4; + case SNP: + return 5; + } + throw new IllegalArgumentException("Unknown node type: " + type); + } + + /** + * Converts string to corresponding SWH node type. + * + * @param strType node type represented as a string + * @return the corresponding {@link SwhType} value + * @see SwhType + */ + public static SwhType fromStr(String strType) { + if (!strType.matches("cnt|dir|ori|rel|rev|snp")) { + throw new IllegalArgumentException("Unknown node type: " + strType); + } + return SwhType.valueOf(strType.toUpperCase()); + } + + /** + * Converts byte array name to the int code of the corresponding SWH node type. Used for + * performance-critical deserialization. + * + * @param name node type represented as a byte array (e.g. 
b"cnt")
+     * @return the ordinal value of the corresponding {@link SwhType}
+     * @see SwhType
+     */
+    public static int byteNameToInt(byte[] name) {
+        if (Arrays.equals(name, "cnt".getBytes())) {
+            return 0;
+        } else if (Arrays.equals(name, "dir".getBytes())) {
+            return 1;
+        } else if (Arrays.equals(name, "ori".getBytes())) {
+            return 2;
+        } else if (Arrays.equals(name, "rel".getBytes())) {
+            return 3;
+        } else if (Arrays.equals(name, "rev".getBytes())) {
+            return 4;
+        } else if (Arrays.equals(name, "snp".getBytes())) {
+            return 5;
+        } else
+            return -1;
+    }
+
+    /**
+     * Parses SWH node type possible values from formatted string (see the
+     * API syntax).
+     *
+     * @param strFmtType node types represented as a formatted string
+     * @return a list containing the {@link SwhType} values
+     * @see SwhType
+     */
+    public static ArrayList<SwhType> parse(String strFmtType) {
+        ArrayList<SwhType> types = new ArrayList<>();
+
+        if (strFmtType.equals("*")) {
+            List<SwhType> nodeTypes = Arrays.asList(SwhType.values());
+            types.addAll(nodeTypes);
+        } else {
+            types.add(SwhType.fromStr(strFmtType));
+        }
+
+        return types;
+    }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java
new file mode 100644
index 0000000..3f865d0
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2019-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph;
+
+import it.unimi.dsi.big.webgraph.ImmutableGraph;
+import it.unimi.dsi.big.webgraph.LazyLongIterator;
+import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
+import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph;
+import it.unimi.dsi.logging.ProgressLogger;
+import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Class representing the compressed Software Heritage graph in a single direction.
+ * <p>
+ * The compressed graph is stored using the <a href="http://webgraph.di.unimi.it/">WebGraph</a>
+ * framework. This class contains an {@link ImmutableGraph} representing the graph itself, as well
+ * as a reference to the object containing the graph properties (e.g. node labels). Optionally,
+ * arc labels (properties stored on the graph edges) can also be loaded with the
+ * loadLabelled...() function family.
+ *
+ * @author The Software Heritage developers
+ * @see SwhGraphProperties
+ * @see SwhBidirectionalGraph
+ */
+
+public class SwhUnidirectionalGraph extends ImmutableGraph implements SwhGraph {
+    /** Underlying ImmutableGraph */
+    private final ImmutableGraph graph;
+
+    /** Labelled ImmutableGraph, null if labels are not loaded */
+    private ArcLabelledImmutableGraph labelledGraph;
+
+    /** Property data of the graph (id/type mappings etc.) */
+    public SwhGraphProperties properties;
+
+    public SwhUnidirectionalGraph(ImmutableGraph graph, SwhGraphProperties properties) {
+        this.graph = graph;
+        this.properties = properties;
+    }
+
+    protected SwhUnidirectionalGraph(ImmutableGraph graph, ArcLabelledImmutableGraph labelledGraph,
+            SwhGraphProperties properties) {
+        this.graph = graph;
+        this.labelledGraph = labelledGraph;
+        this.properties = properties;
+    }
+
+    /**
+     * Load the (unlabelled) graph only, without the SWH properties.
+     */
+    public static SwhUnidirectionalGraph loadGraphOnly(LoadMethod method, String path, InputStream is,
+            ProgressLogger pl) throws IOException {
+        return new SwhUnidirectionalGraph(ImmutableGraph.load(method, path, is, pl), null);
+    }
+
+    /**
+     * Load the labelled graph only, without the SWH properties.
+     */
+    public static SwhUnidirectionalGraph loadLabelledGraphOnly(LoadMethod method, String path, InputStream is,
+            ProgressLogger pl) throws IOException {
+        BitStreamArcLabelledImmutableGraph g = (BitStreamArcLabelledImmutableGraph) BitStreamArcLabelledImmutableGraph
+                .load(method, path + "-labelled", is, pl);
+        return new SwhUnidirectionalGraph(g.g, g, null);
+    }
+
+    /**
+     * Load the SWH properties of the graph from a given path.
+     */
+    public void loadProperties(String path) throws IOException {
+        properties = SwhGraphProperties.load(path);
+    }
+
+    /**
+     * Setter for the SWH graph properties.
+     *
+     * @param properties The {@link SwhGraphProperties} object containing the graph properties
+     */
+    public void setProperties(SwhGraphProperties properties) {
+        this.properties = properties;
+    }
+
+    /**
+     * Load the unlabelled graph and its SWH properties.
+     */
+    public static SwhUnidirectionalGraph load(LoadMethod method, String path, InputStream is, ProgressLogger pl)
+            throws IOException {
+        SwhUnidirectionalGraph g = loadGraphOnly(method, path, is, pl);
+        g.loadProperties(path);
+        return g;
+    }
+
+    /**
+     * Load the labelled graph and its SWH properties.
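+     *
+     * <p>
+     * Illustrative usage sketch (the basename below is hypothetical); the
+     * loadLabelled(String, ...) convenience methods below wrap this method:
+     *
+     * <pre>{@code
+     * SwhUnidirectionalGraph g = SwhUnidirectionalGraph.loadLabelledMapped("/srv/graph/graph");
+     * ArcLabelledNodeIterator.LabelledArcIterator it = g.labelledSuccessors(nodeId);
+     * }</pre>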
+ */ + public static SwhUnidirectionalGraph loadLabelled(LoadMethod method, String path, InputStream is, ProgressLogger pl) + throws IOException { + SwhUnidirectionalGraph g = loadLabelledGraphOnly(method, path, is, pl); + g.loadProperties(path); + return g; + } + + // loadXXX methods of ImmutableGraph + public static SwhUnidirectionalGraph load(String path, ProgressLogger pl) throws IOException { + return load(LoadMethod.STANDARD, path, null, pl); + } + public static SwhUnidirectionalGraph load(String path) throws IOException { + return load(LoadMethod.STANDARD, path, null, null); + } + public static SwhUnidirectionalGraph loadMapped(String path, ProgressLogger pl) throws IOException { + return load(LoadMethod.MAPPED, path, null, pl); + } + public static SwhUnidirectionalGraph loadMapped(String path) throws IOException { + return load(LoadMethod.MAPPED, path, null, null); + } + public static SwhUnidirectionalGraph loadOffline(String path, ProgressLogger pl) throws IOException { + return load(LoadMethod.OFFLINE, path, null, pl); + } + public static SwhUnidirectionalGraph loadOffline(String path) throws IOException { + return load(LoadMethod.OFFLINE, path, null, null); + } + + // Labelled versions of the loadXXX methods from ImmutableGraph + public static SwhUnidirectionalGraph loadLabelled(String path, ProgressLogger pl) throws IOException { + return loadLabelled(LoadMethod.STANDARD, path, null, pl); + } + public static SwhUnidirectionalGraph loadLabelled(String path) throws IOException { + return loadLabelled(LoadMethod.STANDARD, path, null, null); + } + public static SwhUnidirectionalGraph loadLabelledMapped(String path, ProgressLogger pl) throws IOException { + return loadLabelled(LoadMethod.MAPPED, path, null, pl); + } + public static SwhUnidirectionalGraph loadLabelledMapped(String path) throws IOException { + return loadLabelled(LoadMethod.MAPPED, path, null, null); + } + public static SwhUnidirectionalGraph loadLabelledOffline(String path, ProgressLogger pl) throws IOException { + return loadLabelled(LoadMethod.OFFLINE, path, null, pl); + } + public static SwhUnidirectionalGraph loadLabelledOffline(String path) throws IOException { + return loadLabelled(LoadMethod.OFFLINE, path, null, null); + } + + @Override + public SwhUnidirectionalGraph copy() { + return new SwhUnidirectionalGraph(this.graph.copy(), + this.labelledGraph != null ? this.labelledGraph.copy() : null, this.properties); + } + + @Override + public boolean randomAccess() { + return graph.randomAccess(); + } + + public void close() throws IOException { + this.properties.close(); + } + + @Override + public long numNodes() { + return graph.numNodes(); + } + + @Override + public long numArcs() { + return graph.numArcs(); + } + + @Override + public LazyLongIterator successors(long nodeId) { + return graph.successors(nodeId); + } + + /** + * Returns a labelled node iterator for scanning the graph sequentially, starting from the + * first node. + */ + public ArcLabelledNodeIterator labelledNodeIterator() { + if (labelledGraph == null) { + throw new RuntimeException("Calling labelledNodeIterator() but labels were not loaded."); + } + return labelledGraph.nodeIterator(); + } + + /** + * Returns a labelled node iterator for scanning the graph sequentially, starting from a + * given node. 
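+     *
+     * <p>
+     * A minimal usage sketch (assumes the graph {@code g} was loaded with one of the
+     * loadLabelled...() methods; {@code start} is hypothetical):
+     *
+     * <pre>{@code
+     * ArcLabelledNodeIterator it = g.labelledNodeIterator(start);
+     * while (it.hasNext()) {
+     *     long node = it.nextLong();
+     *     ArcLabelledNodeIterator.LabelledArcIterator succ = it.successors();
+     *     for (long dst; (dst = succ.nextLong()) != -1;) {
+     *         // succ.label() holds the label of the arc node -> dst
+     *     }
+     * }
+     * }</pre>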
+     */
+    public ArcLabelledNodeIterator labelledNodeIterator(long from) {
+        if (labelledGraph == null) {
+            throw new RuntimeException("Calling labelledNodeIterator() but labels were not loaded.");
+        }
+        return labelledGraph.nodeIterator(from);
+    }
+
+    /**
+     * Returns a labelled lazy iterator over the successors of a given node. The iteration
+     * terminates when -1 is returned.
+     */
+    public ArcLabelledNodeIterator.LabelledArcIterator labelledSuccessors(long x) {
+        if (labelledGraph == null) {
+            throw new RuntimeException("Calling labelledSuccessors() but labels were not loaded.");
+        }
+        return labelledGraph.successors(x);
+    }
+
+    @Override
+    public long outdegree(long nodeId) {
+        return graph.outdegree(nodeId);
+    }
+
+    @Override
+    public SwhGraphProperties getProperties() {
+        return properties;
+    }
+
+    public ImmutableGraph underlyingGraph() {
+        return graph;
+    }
+
+    public ArcLabelledImmutableGraph underlyingLabelledGraph() {
+        return labelledGraph;
+    }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/Traversal.java b/java/src/main/java/org/softwareheritage/graph/Traversal.java
deleted file mode 100644
index 4c8c669..0000000
--- a/java/src/main/java/org/softwareheritage/graph/Traversal.java
+++ /dev/null
@@ -1,580 +0,0 @@
-package org.softwareheritage.graph;
-
-import java.util.ArrayDeque;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.Map;
-import java.util.Queue;
-import java.util.Random;
-import java.util.Stack;
-import java.util.function.Consumer;
-import java.util.function.LongConsumer;
-
-import org.softwareheritage.graph.server.Endpoint;
-
-import it.unimi.dsi.big.webgraph.LazyLongIterator;
-
-/**
- * Traversal algorithms on the compressed graph.
- *

- * Internal implementation of the traversal API endpoints. These methods only input/output internal - * long ids, which are converted in the {@link Endpoint} higher-level class to {@link SWHID}. - * - * @author The Software Heritage developers - * @see Endpoint - */ - -public class Traversal { - /** Graph used in the traversal */ - Graph graph; - /** Graph edge restriction */ - AllowedEdges edges; - - /** Hash set storing if we have visited a node */ - HashSet visited; - /** Hash map storing parent node id for each nodes during a traversal */ - Map parentNode; - /** Number of edges accessed during traversal */ - long nbEdgesAccessed; - - /** The anti Dos limit of edges traversed while a visit */ - long maxEdges; - /** The string represent the set of type restriction */ - NodesFiltering ndsfilter; - - /** random number generator, for random walks */ - Random rng; - - /** - * Constructor. - * - * @param graph graph used in the traversal - * @param direction a string (either "forward" or "backward") specifying edge orientation - * @param edgesFmt a formatted string describing allowed - * edges - */ - - public Traversal(Graph graph, String direction, String edgesFmt) { - this(graph, direction, edgesFmt, 0); - } - - public Traversal(Graph graph, String direction, String edgesFmt, long maxEdges) { - this(graph, direction, edgesFmt, maxEdges, "*"); - } - - public Traversal(Graph graph, String direction, String edgesFmt, long maxEdges, String returnTypes) { - if (!direction.matches("forward|backward")) { - throw new IllegalArgumentException("Unknown traversal direction: " + direction); - } - - if (direction.equals("backward")) { - this.graph = graph.transpose(); - } else { - this.graph = graph; - } - this.edges = new AllowedEdges(edgesFmt); - - this.visited = new HashSet<>(); - this.parentNode = new HashMap<>(); - this.nbEdgesAccessed = 0; - this.maxEdges = maxEdges; - this.rng = new Random(); - - if (returnTypes.equals("*")) { - this.ndsfilter = new NodesFiltering(); - } else { - this.ndsfilter = new NodesFiltering(returnTypes); - } - } - - /** - * Returns number of accessed edges during traversal. - * - * @return number of edges accessed in last traversal - */ - public long getNbEdgesAccessed() { - return nbEdgesAccessed; - } - - /** - * Returns number of accessed nodes during traversal. - * - * @return number of nodes accessed in last traversal - */ - public long getNbNodesAccessed() { - return this.visited.size(); - } - - /** - * Push version of {@link #leaves} will fire passed callback for each leaf. - */ - public void leavesVisitor(long srcNodeId, NodeIdConsumer cb) { - Stack stack = new Stack<>(); - this.nbEdgesAccessed = 0; - - stack.push(srcNodeId); - visited.add(srcNodeId); - - while (!stack.isEmpty()) { - long currentNodeId = stack.pop(); - - long neighborsCnt = 0; - nbEdgesAccessed += graph.outdegree(currentNodeId); - if (this.maxEdges > 0) { - if (nbEdgesAccessed >= this.maxEdges) { - break; - } - } - LazyLongIterator it = graph.successors(currentNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { - neighborsCnt++; - if (!visited.contains(neighborNodeId)) { - stack.push(neighborNodeId); - visited.add(neighborNodeId); - } - } - - if (neighborsCnt == 0) { - cb.accept(currentNodeId); - } - } - } - - /** - * Returns the leaves of a subgraph rooted at the specified source node. 
- * - * @param srcNodeId source node - * @return list of node ids corresponding to the leaves - */ - public ArrayList leaves(long srcNodeId) { - ArrayList nodeIds = new ArrayList(); - leavesVisitor(srcNodeId, nodeIds::add); - if (ndsfilter.restricted) { - return ndsfilter.filterByNodeTypes(nodeIds, graph); - } - return nodeIds; - } - - /** - * Push version of {@link #neighbors}: will fire passed callback on each neighbor. - */ - public void neighborsVisitor(long srcNodeId, NodeIdConsumer cb) { - this.nbEdgesAccessed = graph.outdegree(srcNodeId); - if (this.maxEdges > 0) { - if (nbEdgesAccessed >= this.maxEdges) { - return; - } - } - LazyLongIterator it = graph.successors(srcNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { - cb.accept(neighborNodeId); - } - } - - /** - * Returns node direct neighbors (linked with exactly one edge). - * - * @param srcNodeId source node - * @return list of node ids corresponding to the neighbors - */ - public ArrayList neighbors(long srcNodeId) { - ArrayList nodeIds = new ArrayList<>(); - neighborsVisitor(srcNodeId, nodeIds::add); - if (ndsfilter.restricted) { - return ndsfilter.filterByNodeTypes(nodeIds, graph); - } - return nodeIds; - } - - /** - * Push version of {@link #visitNodes}: will fire passed callback on each visited node. - */ - public void visitNodesVisitor(long srcNodeId, NodeIdConsumer nodeCb, EdgeIdConsumer edgeCb) { - Stack stack = new Stack<>(); - this.nbEdgesAccessed = 0; - - stack.push(srcNodeId); - visited.add(srcNodeId); - - while (!stack.isEmpty()) { - long currentNodeId = stack.pop(); - if (nodeCb != null) { - nodeCb.accept(currentNodeId); - } - nbEdgesAccessed += graph.outdegree(currentNodeId); - if (this.maxEdges > 0) { - if (nbEdgesAccessed >= this.maxEdges) { - break; - } - } - LazyLongIterator it = graph.successors(currentNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { - if (edgeCb != null) { - edgeCb.accept(currentNodeId, neighborNodeId); - } - if (!visited.contains(neighborNodeId)) { - stack.push(neighborNodeId); - visited.add(neighborNodeId); - } - } - } - } - - /** One-argument version to handle callbacks properly */ - public void visitNodesVisitor(long srcNodeId, NodeIdConsumer cb) { - visitNodesVisitor(srcNodeId, cb, null); - } - - /** - * Performs a graph traversal and returns explored nodes. - * - * @param srcNodeId source node - * @return list of explored node ids - */ - public ArrayList visitNodes(long srcNodeId) { - ArrayList nodeIds = new ArrayList<>(); - visitNodesVisitor(srcNodeId, nodeIds::add); - if (ndsfilter.restricted) { - return ndsfilter.filterByNodeTypes(nodeIds, graph); - } - return nodeIds; - } - - /** - * Push version of {@link #visitPaths}: will fire passed callback on each discovered (complete) - * path. - */ - public void visitPathsVisitor(long srcNodeId, PathConsumer cb) { - Stack currentPath = new Stack<>(); - this.nbEdgesAccessed = 0; - visitPathsInternalVisitor(srcNodeId, currentPath, cb); - } - - /** - * Performs a graph traversal and returns explored paths. 
- * - * @param srcNodeId source node - * @return list of explored paths (represented as a list of node ids) - */ - public ArrayList> visitPaths(long srcNodeId) { - ArrayList> paths = new ArrayList<>(); - visitPathsVisitor(srcNodeId, paths::add); - return paths; - } - - private void visitPathsInternalVisitor(long currentNodeId, Stack currentPath, PathConsumer cb) { - currentPath.push(currentNodeId); - - long visitedNeighbors = 0; - - nbEdgesAccessed += graph.outdegree(currentNodeId); - if (this.maxEdges > 0) { - if (nbEdgesAccessed >= this.maxEdges) { - currentPath.pop(); - return; - } - } - LazyLongIterator it = graph.successors(currentNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { - visitPathsInternalVisitor(neighborNodeId, currentPath, cb); - visitedNeighbors++; - } - - if (visitedNeighbors == 0) { - ArrayList path = new ArrayList<>(currentPath); - cb.accept(path); - } - - currentPath.pop(); - } - - /** - * Performs a graph traversal with backtracking, and returns the first found path from source to - * destination. - * - * @param srcNodeId source node - * @param dst destination (either a node or a node type) - * @return found path as a list of node ids - */ - public ArrayList walk(long srcNodeId, T dst, String visitOrder) { - long dstNodeId; - if (visitOrder.equals("dfs")) { - dstNodeId = walkInternalDFS(srcNodeId, dst); - } else if (visitOrder.equals("bfs")) { - dstNodeId = walkInternalBFS(srcNodeId, dst); - } else { - throw new IllegalArgumentException("Unknown visit order: " + visitOrder); - } - - if (dstNodeId == -1) { - throw new IllegalArgumentException("Cannot find destination: " + dst); - } - - return backtracking(srcNodeId, dstNodeId); - } - - /** - * Performs a random walk (picking a random successor at each step) from source to destination. - * - * @param srcNodeId source node - * @param dst destination (either a node or a node type) - * @return found path as a list of node ids or an empty path to indicate that no suitable path have - * been found - */ - public ArrayList randomWalk(long srcNodeId, T dst) { - return randomWalk(srcNodeId, dst, 0); - } - - /** - * Performs a stubborn random walk (picking a random successor at each step) from source to - * destination. The walk is "stubborn" in the sense that it will not give up the first time if a - * satisfying target node is found, but it will retry up to a limited amount of times. 
- * - * @param srcNodeId source node - * @param dst destination (either a node or a node type) - * @param retries number of times to retry; 0 means no retries (single walk) - * @return found path as a list of node ids or an empty path to indicate that no suitable path have - * been found - */ - public ArrayList randomWalk(long srcNodeId, T dst, int retries) { - long curNodeId = srcNodeId; - ArrayList path = new ArrayList<>(); - this.nbEdgesAccessed = 0; - boolean found; - - if (retries < 0) { - throw new IllegalArgumentException("Negative number of retries given: " + retries); - } - - while (true) { - path.add(curNodeId); - LazyLongIterator successors = graph.successors(curNodeId, edges); - curNodeId = randomPick(successors); - if (curNodeId < 0) { - found = false; - break; - } - if (isDstNode(curNodeId, dst)) { - path.add(curNodeId); - found = true; - break; - } - } - - if (found) { - if (ndsfilter.restricted) { - return ndsfilter.filterByNodeTypes(path, graph); - } - return path; - } else if (retries > 0) { // try again - return randomWalk(srcNodeId, dst, retries - 1); - } else { // not found and no retries left - path.clear(); - return path; - } - } - - /** - * Randomly choose an element from an iterator over Longs using reservoir sampling - * - * @param elements iterator over selection domain - * @return randomly chosen element or -1 if no suitable element was found - */ - private long randomPick(LazyLongIterator elements) { - long curPick = -1; - long seenCandidates = 0; - - for (long element; (element = elements.nextLong()) != -1;) { - seenCandidates++; - if (Math.round(rng.nextFloat() * (seenCandidates - 1)) == 0) { - curPick = element; - } - } - - return curPick; - } - - /** - * Internal DFS function of {@link #walk}. - * - * @param srcNodeId source node - * @param dst destination (either a node or a node type) - * @return final destination node or -1 if no path found - */ - private long walkInternalDFS(long srcNodeId, T dst) { - Stack stack = new Stack<>(); - this.nbEdgesAccessed = 0; - - stack.push(srcNodeId); - visited.add(srcNodeId); - - while (!stack.isEmpty()) { - long currentNodeId = stack.pop(); - if (isDstNode(currentNodeId, dst)) { - return currentNodeId; - } - - nbEdgesAccessed += graph.outdegree(currentNodeId); - LazyLongIterator it = graph.successors(currentNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { - if (!visited.contains(neighborNodeId)) { - stack.push(neighborNodeId); - visited.add(neighborNodeId); - parentNode.put(neighborNodeId, currentNodeId); - } - } - } - - return -1; - } - - /** - * Internal BFS function of {@link #walk}. 
- * - * @param srcNodeId source node - * @param dst destination (either a node or a node type) - * @return final destination node or -1 if no path found - */ - private long walkInternalBFS(long srcNodeId, T dst) { - Queue queue = new LinkedList<>(); - this.nbEdgesAccessed = 0; - - queue.add(srcNodeId); - visited.add(srcNodeId); - - while (!queue.isEmpty()) { - long currentNodeId = queue.poll(); - if (isDstNode(currentNodeId, dst)) { - return currentNodeId; - } - - nbEdgesAccessed += graph.outdegree(currentNodeId); - LazyLongIterator it = graph.successors(currentNodeId, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { - if (!visited.contains(neighborNodeId)) { - queue.add(neighborNodeId); - visited.add(neighborNodeId); - parentNode.put(neighborNodeId, currentNodeId); - } - } - } - - return -1; - } - - /** - * Internal function of {@link #walk} to check if a node corresponds to the destination. - * - * @param nodeId current node - * @param dst destination (either a node or a node type) - * @return true if the node is a destination, or false otherwise - */ - private boolean isDstNode(long nodeId, T dst) { - if (dst instanceof Long) { - long dstNodeId = (Long) dst; - return nodeId == dstNodeId; - } else if (dst instanceof Node.Type) { - Node.Type dstType = (Node.Type) dst; - return graph.getNodeType(nodeId) == dstType; - } else { - return false; - } - } - - /** - * Internal backtracking function of {@link #walk}. - * - * @param srcNodeId source node - * @param dstNodeId destination node - * @return the found path, as a list of node ids - */ - private ArrayList backtracking(long srcNodeId, long dstNodeId) { - ArrayList path = new ArrayList<>(); - long currentNodeId = dstNodeId; - while (currentNodeId != srcNodeId) { - path.add(currentNodeId); - currentNodeId = parentNode.get(currentNodeId); - } - path.add(srcNodeId); - Collections.reverse(path); - return path; - } - - /** - * Find a common descendant between two given nodes using two parallel BFS - * - * @param lhsNode the first node - * @param rhsNode the second node - * @return the found path, as a list of node ids - */ - public Long findCommonDescendant(long lhsNode, long rhsNode) { - Queue lhsStack = new ArrayDeque<>(); - Queue rhsStack = new ArrayDeque<>(); - HashSet lhsVisited = new HashSet<>(); - HashSet rhsVisited = new HashSet<>(); - lhsStack.add(lhsNode); - rhsStack.add(rhsNode); - lhsVisited.add(lhsNode); - rhsVisited.add(rhsNode); - - this.nbEdgesAccessed = 0; - Long curNode; - - while (!lhsStack.isEmpty() || !rhsStack.isEmpty()) { - if (!lhsStack.isEmpty()) { - curNode = lhsStack.poll(); - nbEdgesAccessed += graph.outdegree(curNode); - LazyLongIterator it = graph.successors(curNode, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { - if (!lhsVisited.contains(neighborNodeId)) { - if (rhsVisited.contains(neighborNodeId)) - return neighborNodeId; - lhsStack.add(neighborNodeId); - lhsVisited.add(neighborNodeId); - } - } - } - - if (!rhsStack.isEmpty()) { - curNode = rhsStack.poll(); - nbEdgesAccessed += graph.outdegree(curNode); - LazyLongIterator it = graph.successors(curNode, edges); - for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { - if (!rhsVisited.contains(neighborNodeId)) { - if (lhsVisited.contains(neighborNodeId)) - return neighborNodeId; - rhsStack.add(neighborNodeId); - rhsVisited.add(neighborNodeId); - } - } - } - } - - return null; - } - - public interface NodeIdConsumer extends LongConsumer { - /** - * Callback for incrementally receiving node 
identifiers during a graph visit. - */ - void accept(long nodeId); - } - - public interface EdgeIdConsumer { - /** - * Callback for incrementally receiving edge identifiers during a graph visit. - */ - void accept(long srcId, long dstId); - } - - public interface PathConsumer extends Consumer> { - /** - * Callback for incrementally receiving node paths (made of node identifiers) during a graph visit. - */ - void accept(ArrayList path); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/algo/TopologicalTraversal.java b/java/src/main/java/org/softwareheritage/graph/algo/TopologicalTraversal.java deleted file mode 100644 index bdbb60d..0000000 --- a/java/src/main/java/org/softwareheritage/graph/algo/TopologicalTraversal.java +++ /dev/null @@ -1,70 +0,0 @@ -package org.softwareheritage.graph.algo; - -import com.google.common.primitives.Longs; -import it.unimi.dsi.big.webgraph.LazyLongIterator; -import it.unimi.dsi.bits.LongArrayBitVector; -import it.unimi.dsi.fastutil.Arrays; -import it.unimi.dsi.fastutil.BigArrays; -import it.unimi.dsi.fastutil.longs.LongBigArrays; -import it.unimi.dsi.io.ByteDiskQueue; -import it.unimi.dsi.logging.ProgressLogger; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Traversal; -import org.softwareheritage.graph.experiments.forks.ForkCC; - -import java.io.File; -import java.io.IOException; - -public class TopologicalTraversal { - public static void run(final Graph graph, Traversal.NodeIdConsumer cb) throws IOException { - final long[][] indegree = LongBigArrays.newBigArray(graph.numNodes()); - final ProgressLogger pl = new ProgressLogger(); - - pl.itemsName = "nodes"; - pl.expectedUpdates = graph.numNodes(); - - pl.start("Fetching indegrees..."); - long n = graph.numNodes(); - for (long i = 0; i < graph.numNodes(); ++i) { - BigArrays.add(indegree, i, graph.indegree(i)); - } - pl.done(); - - LongArrayBitVector visited = LongArrayBitVector.ofLength(n); - - int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); - final File queueFile = File.createTempFile(ForkCC.class.getSimpleName(), "queue"); - final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); - final byte[] byteBuf = new byte[Long.BYTES]; - - pl.start("Traversal in topological order..."); - for (long i = 0; i < graph.numNodes(); ++i) { - if (visited.getBoolean(i) || BigArrays.get(indegree, i) != 0L) { - continue; - } - - queue.enqueue(Longs.toByteArray(i)); - visited.set(i); - - while (!queue.isEmpty()) { - queue.dequeue(byteBuf); - final long currentNode = Longs.fromByteArray(byteBuf); - - cb.accept(currentNode); - - final LazyLongIterator iterator = graph.successors(currentNode); - long succ; - while ((succ = iterator.nextLong()) != -1) { - BigArrays.add(indegree, succ, -1L); - if (visited.getBoolean(succ) || BigArrays.get(indegree, succ) != 0) - continue; - visited.set(succ); - queue.enqueue(Longs.toByteArray(succ)); - } - - pl.update(); - } - } - pl.done(); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java b/java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java deleted file mode 100644 index 9397de7..0000000 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java +++ /dev/null @@ -1,45 +0,0 @@ -package org.softwareheritage.graph.benchmark; - -import com.martiansoftware.jsap.JSAPException; -import it.unimi.dsi.big.webgraph.LazyLongIterator; -import org.softwareheritage.graph.Graph; -import 
org.softwareheritage.graph.benchmark.utils.Statistics; -import org.softwareheritage.graph.benchmark.utils.Timing; - -import java.io.IOException; -import java.util.ArrayList; - -/** - * Benchmark to time edge access time. - * - * @author The Software Heritage developers - */ - -public class AccessEdge { - /** - * Main entrypoint. - * - * @param args command line arguments - */ - public static void main(String[] args) throws IOException, JSAPException { - Benchmark bench = new Benchmark(); - bench.parseCommandLineArgs(args); - - Graph graph = Graph.loadMapped(bench.args.graphPath); - - long[] nodeIds = bench.args.random.generateNodeIds(graph, bench.args.nbNodes); - - ArrayList timings = new ArrayList<>(); - for (long nodeId : nodeIds) { - long startTime = Timing.start(); - LazyLongIterator neighbors = graph.successors(nodeId); - long firstNeighbor = neighbors.nextLong(); - double duration = Timing.stop(startTime); - timings.add(duration); - } - - System.out.println("Used " + bench.args.nbNodes + " random edges (results are in seconds):"); - Statistics stats = new Statistics(timings); - stats.printAll(); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java b/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java deleted file mode 100644 index 43aec2e..0000000 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java +++ /dev/null @@ -1,107 +0,0 @@ -package org.softwareheritage.graph.benchmark; - -import com.google.common.primitives.Longs; -import com.martiansoftware.jsap.*; -import it.unimi.dsi.big.webgraph.ImmutableGraph; -import it.unimi.dsi.big.webgraph.LazyLongIterator; -import it.unimi.dsi.bits.LongArrayBitVector; -import it.unimi.dsi.fastutil.Arrays; -import it.unimi.dsi.io.ByteDiskQueue; -import it.unimi.dsi.logging.ProgressLogger; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.softwareheritage.graph.Graph; - -import java.io.File; -import java.io.IOException; - -public class BFS { - private final static Logger LOGGER = LoggerFactory.getLogger(BFS.class); - private final ImmutableGraph graph; - - public BFS(ImmutableGraph graph) { - this.graph = graph; - } - - private static JSAPResult parse_args(String[] args) { - JSAPResult config = null; - try { - SimpleJSAP jsap = new SimpleJSAP(BFS.class.getName(), "", - new Parameter[]{ - new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', - "graph", "Basename of the compressed graph"), - - new FlaggedOption("useTransposed", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, 'T', - "transposed", "Use transposed graph (default: false)"),}); - - config = jsap.parse(args); - if (jsap.messagePrinted()) { - System.exit(1); - } - } catch (JSAPException e) { - e.printStackTrace(); - } - return config; - } - - public static void main(String[] args) throws IOException { - JSAPResult config = parse_args(args); - String graphPath = config.getString("graphPath"); - boolean useTransposed = config.getBoolean("useTransposed"); - - System.err.println("Loading graph " + graphPath + " ..."); - Graph graph = Graph.loadMapped(graphPath); - System.err.println("Graph loaded."); - - if (useTransposed) - graph = graph.transpose(); - - BFS bfs = new BFS(graph); - bfs.bfsperm(); - } - - // Partly inlined from it.unimi.dsi.law.big.graph.BFS - private void bfsperm() throws IOException { - final long n = graph.numNodes(); - // Allow enough memory to behave like in-memory queue - int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); - - // Use a 
disk based queue to store BFS frontier - final File queueFile = File.createTempFile(BFS.class.getSimpleName(), "queue"); - final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); - final byte[] byteBuf = new byte[Long.BYTES]; - // WARNING: no 64-bit version of this data-structure, but it can support - // indices up to 2^37 - final LongArrayBitVector visited = LongArrayBitVector.ofLength(n); - final ProgressLogger pl = new ProgressLogger(LOGGER); - pl.expectedUpdates = n; - pl.itemsName = "nodes"; - pl.start("Starting breadth-first visit..."); - - for (long i = 0; i < n; i++) { - if (visited.getBoolean(i)) - continue; - queue.enqueue(Longs.toByteArray(i)); - visited.set(i); - - while (!queue.isEmpty()) { - queue.dequeue(byteBuf); - final long currentNode = Longs.fromByteArray(byteBuf); - - final LazyLongIterator iterator = graph.successors(currentNode); - long succ; - while ((succ = iterator.nextLong()) != -1) { - if (!visited.getBoolean(succ)) { - visited.set(succ); - queue.enqueue(Longs.toByteArray(succ)); - } - } - - pl.update(); - } - } - - pl.done(); - queue.close(); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java deleted file mode 100644 index 98dd854..0000000 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java +++ /dev/null @@ -1,154 +0,0 @@ -package org.softwareheritage.graph.benchmark; - -import com.martiansoftware.jsap.*; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.SWHID; -import org.softwareheritage.graph.benchmark.utils.Random; -import org.softwareheritage.graph.benchmark.utils.Statistics; -import org.softwareheritage.graph.server.Endpoint; - -import java.io.BufferedWriter; -import java.io.FileWriter; -import java.io.IOException; -import java.io.Writer; -import java.util.ArrayList; -import java.util.StringJoiner; -import java.util.function.Function; - -/** - * Benchmark common utility functions. - * - * @author The Software Heritage developers - */ - -public class Benchmark { - /** CSV separator for log file */ - final String CSV_SEPARATOR = ";"; - /** Command line arguments */ - public Args args; - /** - * Constructor. - */ - public Benchmark() { - this.args = new Args(); - } - - /** - * Parses benchmark command line arguments. - * - * @param args command line arguments - */ - public void parseCommandLineArgs(String[] args) throws JSAPException { - SimpleJSAP jsap = new SimpleJSAP(Benchmark.class.getName(), - "Benchmark tool for Software Heritage use-cases scenarios.", - new Parameter[]{ - new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - JSAP.NOT_GREEDY, "The basename of the compressed graph."), - new FlaggedOption("nbNodes", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'n', - "nb-nodes", "Number of random nodes used to do the benchmark."), - new FlaggedOption("logFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'l', - "log-file", "File name to output CSV format benchmark log."), - new FlaggedOption("seed", JSAP.LONG_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "seed", - "Random generator seed."),}); - - JSAPResult config = jsap.parse(args); - if (jsap.messagePrinted()) { - System.exit(1); - } - - this.args.graphPath = config.getString("graphPath"); - this.args.nbNodes = config.getInt("nbNodes"); - this.args.logFile = config.getString("logFile"); - this.args.random = config.contains("seed") ? 
new Random(config.getLong("seed")) : new Random(); - } - - /** - * Creates CSV file for log output. - */ - public void createCSVLogFile() throws IOException { - try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile))) { - StringJoiner csvHeader = new StringJoiner(CSV_SEPARATOR); - csvHeader.add("use case name").add("SWHID").add("number of edges accessed").add("traversal timing") - .add("swhid2node timing").add("node2swhid timing"); - csvLog.write(csvHeader.toString() + "\n"); - } - } - - /** - * Times a specific endpoint and outputs individual datapoints along with aggregated statistics. - * - * @param useCaseName benchmark use-case name - * @param graph compressed graph used in the benchmark - * @param nodeIds node ids to use as starting point for the endpoint traversal - * @param operation endpoint function to benchmark - * @param dstFmt destination formatted string as described in the - * API - * @param algorithm traversal algorithm used in endpoint call (either "dfs" or "bfs") - */ - public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds, - Function operation, String dstFmt, String algorithm) throws IOException { - ArrayList timings = new ArrayList<>(); - ArrayList timingsNormalized = new ArrayList<>(); - ArrayList nbEdgesAccessed = new ArrayList<>(); - - final boolean append = true; - try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile, append))) { - for (long nodeId : nodeIds) { - SWHID swhid = graph.getSWHID(nodeId); - - Endpoint.Output output = (dstFmt == null) - ? operation.apply(new Endpoint.Input(swhid)) - : operation.apply(new Endpoint.Input(swhid, dstFmt, algorithm)); - - StringJoiner csvLine = new StringJoiner(CSV_SEPARATOR); - csvLine.add(useCaseName).add(swhid.toString()).add(Long.toString(output.meta.nbEdgesAccessed)) - .add(Double.toString(output.meta.timings.traversal)) - .add(Double.toString(output.meta.timings.swhid2node)) - .add(Double.toString(output.meta.timings.node2swhid)); - csvLog.write(csvLine.toString() + "\n"); - - timings.add(output.meta.timings.traversal); - nbEdgesAccessed.add((double) output.meta.nbEdgesAccessed); - if (output.meta.nbEdgesAccessed != 0) { - timingsNormalized.add(output.meta.timings.traversal / output.meta.nbEdgesAccessed); - } - } - } - - System.out.println("\n" + useCaseName + " use-case:"); - - System.out.println("timings:"); - Statistics stats = new Statistics(timings); - stats.printAll(); - - System.out.println("timings normalized:"); - Statistics statsNormalized = new Statistics(timingsNormalized); - statsNormalized.printAll(); - - System.out.println("nb edges accessed:"); - Statistics statsNbEdgesAccessed = new Statistics(nbEdgesAccessed); - statsNbEdgesAccessed.printAll(); - } - - /** - * Same as {@link #timeEndpoint} but without destination or algorithm specified to endpoint call. - */ - public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds, - Function operation) throws IOException { - timeEndpoint(useCaseName, graph, nodeIds, operation, null, null); - } - - /** - * Input arguments. 
- */ - public class Args { - /** Basename of the compressed graph */ - public String graphPath; - /** Number of random nodes to use for the benchmark */ - public int nbNodes; - /** File name for CSV format benchmark log */ - public String logFile; - /** Random generator */ - public Random random; - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java deleted file mode 100644 index 6a0cf58..0000000 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java +++ /dev/null @@ -1,42 +0,0 @@ -package org.softwareheritage.graph.benchmark; - -import com.martiansoftware.jsap.JSAPException; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; -import org.softwareheritage.graph.server.Endpoint; - -import java.io.IOException; - -/** - * Benchmark Software Heritage - * browsing - * use-cases scenarios. - * - * @author The Software Heritage developers - */ - -public class Browsing { - /** - * Main entrypoint. - * - * @param args command line arguments - */ - public static void main(String[] args) throws IOException, JSAPException { - Benchmark bench = new Benchmark(); - bench.parseCommandLineArgs(args); - - Graph graph = Graph.loadMapped(bench.args.graphPath); - - long[] dirNodeIds = bench.args.random.generateNodeIdsOfType(graph, bench.args.nbNodes, Node.Type.DIR); - long[] revNodeIds = bench.args.random.generateNodeIdsOfType(graph, bench.args.nbNodes, Node.Type.REV); - - Endpoint dirEndpoint = new Endpoint(graph, "forward", "dir:cnt,dir:dir"); - Endpoint revEndpoint = new Endpoint(graph, "forward", "rev:rev"); - - System.out.println("Used " + bench.args.nbNodes + " random nodes (results are in seconds):"); - bench.createCSVLogFile(); - bench.timeEndpoint("ls", graph, dirNodeIds, dirEndpoint::neighbors); - bench.timeEndpoint("ls -R", graph, dirNodeIds, dirEndpoint::visitPaths); - bench.timeEndpoint("git log", graph, revNodeIds, revEndpoint::visitNodes); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java deleted file mode 100644 index 9b3c4c9..0000000 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java +++ /dev/null @@ -1,45 +0,0 @@ -package org.softwareheritage.graph.benchmark; - -import com.martiansoftware.jsap.JSAPException; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.server.Endpoint; - -import java.io.IOException; - -/** - * Benchmark Software Heritage - * provenance - * use-cases scenarios. - * - * @author The Software Heritage developers - */ - -public class Provenance { - /** - * Main entrypoint. 
- * - * @param args command line arguments - */ - public static void main(String[] args) throws IOException, JSAPException { - Benchmark bench = new Benchmark(); - bench.parseCommandLineArgs(args); - - Graph graph = Graph.loadMapped(bench.args.graphPath); - - long[] nodeIds = bench.args.random.generateNodeIds(graph, bench.args.nbNodes); - - Endpoint commitProvenanceEndpoint = new Endpoint(graph, "backward", "dir:dir,cnt:dir,dir:rev"); - Endpoint originProvenanceEndpoint = new Endpoint(graph, "backward", "*"); - - System.out.println("Used " + bench.args.nbNodes + " random nodes (results are in seconds):"); - bench.createCSVLogFile(); - - bench.timeEndpoint("commit provenance (dfs)", graph, nodeIds, commitProvenanceEndpoint::walk, "rev", "dfs"); - bench.timeEndpoint("commit provenance (bfs)", graph, nodeIds, commitProvenanceEndpoint::walk, "rev", "bfs"); - bench.timeEndpoint("complete commit provenance", graph, nodeIds, commitProvenanceEndpoint::leaves); - - bench.timeEndpoint("origin provenance (dfs)", graph, nodeIds, originProvenanceEndpoint::walk, "ori", "dfs"); - bench.timeEndpoint("origin provenance (bfs)", graph, nodeIds, originProvenanceEndpoint::walk, "ori", "bfs"); - bench.timeEndpoint("complete origin provenance", graph, nodeIds, originProvenanceEndpoint::leaves); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java deleted file mode 100644 index c0e19f6..0000000 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java +++ /dev/null @@ -1,37 +0,0 @@ -package org.softwareheritage.graph.benchmark; - -import com.martiansoftware.jsap.JSAPException; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.server.Endpoint; - -import java.io.IOException; - -/** - * Benchmark Software Heritage - * vault use-case - * scenario. - * - * @author The Software Heritage developers - */ - -public class Vault { - /** - * Main entrypoint. - * - * @param args command line arguments - */ - public static void main(String[] args) throws IOException, JSAPException { - Benchmark bench = new Benchmark(); - bench.parseCommandLineArgs(args); - - Graph graph = Graph.loadMapped(bench.args.graphPath); - - long[] nodeIds = bench.args.random.generateNodeIds(graph, bench.args.nbNodes); - - Endpoint endpoint = new Endpoint(graph, "forward", "*"); - - System.out.println("Used " + bench.args.nbNodes + " random nodes (results are in seconds):"); - bench.createCSVLogFile(); - bench.timeEndpoint("git bundle", graph, nodeIds, endpoint::visitNodes); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java deleted file mode 100644 index ee4c530..0000000 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java +++ /dev/null @@ -1,67 +0,0 @@ -package org.softwareheritage.graph.benchmark.utils; - -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; - -import java.util.PrimitiveIterator; - -/** - * Random related utility class. - * - * @author The Software Heritage developers - */ - -public class Random { - /** Internal pseudorandom generator */ - java.util.Random random; - - /** - * Constructor. - */ - public Random() { - this.random = new java.util.Random(); - } - - /** - * Constructor. 
- * - * @param seed random generator seed - */ - public Random(long seed) { - this.random = new java.util.Random(seed); - } - - /** - * Generates random node ids. - * - * @param graph graph used to pick node ids - * @param nbNodes number of node ids to generate - * @return an array of random node ids - */ - public long[] generateNodeIds(Graph graph, int nbNodes) { - return random.longs(nbNodes, 0, graph.numNodes()).toArray(); - } - - /** - * Generates random node ids with a specific type. - * - * @param graph graph used to pick node ids - * @param nbNodes number of node ids to generate - * @param expectedType specific node type to pick - * @return an array of random node ids - */ - public long[] generateNodeIdsOfType(Graph graph, int nbNodes, Node.Type expectedType) { - PrimitiveIterator.OfLong nodes = random.longs(0, graph.numNodes()).iterator(); - long[] nodeIds = new long[nbNodes]; - - long nextId; - for (int i = 0; i < nbNodes; i++) { - do { - nextId = nodes.nextLong(); - } while (graph.getNodeType(nextId) != expectedType); - nodeIds[i] = nextId; - } - - return nodeIds; - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Statistics.java b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Statistics.java deleted file mode 100644 index 96bdfd0..0000000 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Statistics.java +++ /dev/null @@ -1,104 +0,0 @@ -package org.softwareheritage.graph.benchmark.utils; - -import java.util.ArrayList; -import java.util.Collections; - -/** - * Compute various statistics on a list of values. - * - * @author The Software Heritage developers - */ - -public class Statistics { - /** Input values */ - ArrayList values; - - /** - * Constructor. - * - * @param values input values - */ - public Statistics(ArrayList values) { - this.values = values; - } - - /** - * Returns the minimum value. - * - * @return minimum value - */ - public double getMin() { - double min = Double.POSITIVE_INFINITY; - for (double v : values) { - min = Math.min(min, v); - } - return min; - } - - /** - * Returns the maximum value. - * - * @return maximum value - */ - public double getMax() { - double max = Double.NEGATIVE_INFINITY; - for (double v : values) { - max = Math.max(max, v); - } - return max; - } - - /** - * Computes the average. - * - * @return average value - */ - public double getAverage() { - double sum = 0; - for (double v : values) { - sum += v; - } - return sum / (double) values.size(); - } - - /** - * Returns the median value. - * - * @return median value - */ - public double getMedian() { - Collections.sort(values); - int length = values.size(); - if (length % 2 == 0) { - return (values.get(length / 2) + values.get(length / 2 - 1)) / 2; - } else { - return values.get(length / 2); - } - } - - /** - * Computes the standard deviation. - * - * @return standard deviation value - */ - public double getStandardDeviation() { - double average = getAverage(); - double variance = 0; - for (double v : values) { - variance += (v - average) * (v - average); - } - variance /= (double) values.size(); - return Math.sqrt(variance); - } - - /** - * Computes and prints all statistical values. 
- */ - public void printAll() { - System.out.println("min value: " + getMin()); - System.out.println("max value: " + getMax()); - System.out.println("average: " + getAverage()); - System.out.println("median: " + getMedian()); - System.out.println("standard deviation: " + getStandardDeviation()); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Timing.java b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Timing.java deleted file mode 100644 index de5de6c..0000000 --- a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Timing.java +++ /dev/null @@ -1,30 +0,0 @@ -package org.softwareheritage.graph.benchmark.utils; - -/** - * Time measurement utility class. - * - * @author The Software Heritage developers - */ - -public class Timing { - /** - * Returns measurement starting timestamp. - * - * @return timestamp used for time measurement - */ - public static long start() { - return System.nanoTime(); - } - - /** - * Ends timing measurement and returns total duration in seconds. - * - * @param startTime measurement starting timestamp - * @return time in seconds elapsed since starting point - */ - public static double stop(long startTime) { - long endTime = System.nanoTime(); - double duration = (double) (endTime - startTime) / 1_000_000_000; - return duration; - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java new file mode 100644 index 0000000..1f12744 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; + +import com.github.luben.zstd.ZstdInputStream; +import it.unimi.dsi.fastutil.bytes.ByteArrays; +import it.unimi.dsi.fastutil.io.FastBufferedInputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; + +/** + * A graph dataset in (zstd-compressed) CSV format. + * + * This format does not contain any properties apart from the SWHIDs of the nodes, and optionally + * the labels of the edges and the permissions of the directory entries. + * + * The structure of the dataset is as follows: one directory per object type, each containing: + * + *
+ * <ul>
+ * <li>a number of files <code>*.nodes.csv.zst</code> containing the SWHIDs of the objects stored in
+ * the graph, one per line.</li>
+ * <li>a number of files <code>*.edges.csv.zst</code> containing the edges of the graph, one per
+ * line. The format of each edge is as follows:
+ * <code>SRC_SWHID DST_SWHID [BASE64_LABEL] [INT_PERMISSION]</code>.</li>
+ * </ul>
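+ *
+ * <p>
+ * A minimal usage sketch, assuming a dataset directory laid out as above (the path and the
+ * printing callbacks are illustrative, not part of the API):
+ * </p>
+ *
+ * <pre>{@code
+ * // Hypothetical dataset path; readEdges() streams every node and edge to the callbacks.
+ * GraphDataset dataset = new CSVEdgeDataset("/srv/dataset/csv");
+ * dataset.readEdges(
+ *     (node) -> System.out.println("node: " + new String(node)),
+ *     (src, dst, label, perm) -> System.out.println(new String(src) + " -> " + new String(dst)));
+ * }</pre>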
+ * + */ +public class CSVEdgeDataset implements GraphDataset { + final static Logger logger = LoggerFactory.getLogger(CSVEdgeDataset.class); + + final private File datasetDir; + + public CSVEdgeDataset(String datasetPath) { + this(new File(datasetPath)); + } + + public CSVEdgeDataset(File datasetDir) { + if (!datasetDir.exists()) { + throw new IllegalArgumentException("Dataset " + datasetDir.getName() + " does not exist"); + } + this.datasetDir = datasetDir; + } + + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + File[] allTables = datasetDir.listFiles(); + if (allTables == null) { + return; + } + for (File tableFile : allTables) { + File[] allCsvFiles = tableFile.listFiles(); + if (allCsvFiles == null) { + continue; + } + for (File csvFile : allCsvFiles) { + if (csvFile.getName().endsWith(".edges.csv.zst")) { + readEdgesCsvZst(csvFile.getPath(), edgeCb); + } else if (csvFile.getName().endsWith(".nodes.csv.zst")) { + readNodesCsvZst(csvFile.getPath(), nodeCb); + } + } + } + } + + public static void readEdgesCsvZst(String csvZstPath, GraphDataset.EdgeCallback cb) throws IOException { + InputStream csvInputStream = new ZstdInputStream(new BufferedInputStream(new FileInputStream(csvZstPath))); + readEdgesCsv(csvInputStream, cb); + } + + public static void readEdgesCsv(InputStream csvInputStream, GraphDataset.EdgeCallback cb) throws IOException { + FastBufferedInputStream csvReader = new FastBufferedInputStream(csvInputStream); + + Charset charset = StandardCharsets.US_ASCII; + byte[] array = new byte[1024]; + for (long line = 0;; line++) { + int start = 0, len; + while ((len = csvReader.readLine(array, start, array.length - start, + FastBufferedInputStream.ALL_TERMINATORS)) == array.length - start) { + start += len; + array = ByteArrays.grow(array, array.length + 1); + } + if (len == -1) + break; // EOF + final int lineLength = start + len; + + // Skip whitespace at the start of the line. + int offset = 0; + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') + offset++; + if (offset == lineLength) { + continue; + } + if (array[0] == '#') + continue; + + // Scan source id. + start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) + offset++; + final byte[] ss = Arrays.copyOfRange(array, start, offset); + + // Skip whitespace between identifiers. + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') + offset++; + if (offset == lineLength) { + logger.error("Error at line " + line + ": no target"); + continue; + } + + // Scan target ID + start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) + offset++; + final byte[] ts = Arrays.copyOfRange(array, start, offset); + + // Skip whitespace between identifiers. + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') + offset++; + // Scan label + byte[] ls = null; + if (offset < lineLength) { + start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) + offset++; + ls = Arrays.copyOfRange(array, start, offset); + } + + // Skip whitespace between identifiers. 
+ while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') + offset++; + // Scan permission + int permission = 0; + if (offset < lineLength) { + start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) + offset++; + permission = Integer.parseInt(new String(array, start, offset - start, charset)); + } + + cb.onEdge(ss, ts, ls, permission); + } + } + + public static void readNodesCsvZst(String csvZstPath, GraphDataset.NodeCallback cb) throws IOException { + InputStream csvInputStream = new ZstdInputStream(new BufferedInputStream(new FileInputStream(csvZstPath))); + readNodesCsv(csvInputStream, cb); + } + + public static void readNodesCsv(InputStream csvInputStream, GraphDataset.NodeCallback cb) throws IOException { + FastBufferedInputStream csvReader = new FastBufferedInputStream(csvInputStream); + + byte[] array = new byte[1024]; + for (long line = 0;; line++) { + int start = 0, len; + while ((len = csvReader.readLine(array, start, array.length - start, + FastBufferedInputStream.ALL_TERMINATORS)) == array.length - start) { + start += len; + array = ByteArrays.grow(array, array.length + 1); + } + if (len == -1) + break; // EOF + final int lineLength = start + len; + + // Skip whitespace at the start of the line. + int offset = 0; + while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ') + offset++; + if (offset == lineLength) { + continue; + } + if (array[0] == '#') + continue; + + // Scan source id. + start = offset; + while (offset < lineLength && (array[offset] < 0 || array[offset] > ' ')) + offset++; + final byte[] ss = Arrays.copyOfRange(array, start, offset); + + cb.onNode(ss); + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ComposePermutations.java b/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java similarity index 85% rename from java/src/main/java/org/softwareheritage/graph/utils/ComposePermutations.java rename to java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java index 3e094e8..62d3460 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/ComposePermutations.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java @@ -1,51 +1,58 @@ -package org.softwareheritage.graph.utils; +/* + * Copyright (c) 2021-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; import com.martiansoftware.jsap.*; import it.unimi.dsi.Util; import it.unimi.dsi.fastutil.io.BinIO; import java.io.File; import java.io.IOException; /** * CLI program used to compose two on-disk permutations. * * It takes two on-disk permutations as parameters, p1 and p2, and writes on disk (p1 o p2) at the - * given location. This is useful for multi-step compression (e.g. Unordered -> BFS -> LLP), as it + * given location. This is useful for multi-step compression (e.g., Unordered -> BFS -> LLP), as it * can be used to merge all the intermediate permutations. 
*/ public class ComposePermutations { private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{ new UnflaggedOption("firstPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED, "The first permutation"), new UnflaggedOption("secondPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED, "The second permutation"), new UnflaggedOption("outputPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED, "The output permutation"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) throws IOException, ClassNotFoundException { JSAPResult config = parse_args(args); String firstPermFilename = config.getString("firstPermutation"); String secondPermFilename = config.getString("secondPermutation"); String outputPermFilename = config.getString("outputPermutation"); long[][] firstPerm = BinIO.loadLongsBig(new File(firstPermFilename)); long[][] secondPerm = BinIO.loadLongsBig(new File(secondPermFilename)); long[][] outputPerm = Util.composePermutationsInPlace(firstPerm, secondPerm); BinIO.storeLongs(outputPerm, outputPermFilename); } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java b/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java new file mode 100644 index 0000000..9e2ad40 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; + +import com.github.luben.zstd.ZstdOutputStream; +import com.martiansoftware.jsap.*; +import it.unimi.dsi.logging.ProgressLogger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.SwhType; +import org.softwareheritage.graph.utils.Sort; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicLongArray; + +/** + * Read a graph dataset and extract all the unique node SWHIDs it contains, including the ones that + * are not stored as actual objects in the graph, but only referred to by the edges. + * Additionally, extract the set of all unique edge labels in the graph. + * + *
+ * <ul>
+ * <li>The set of nodes is written in <code>${outputBasename}.nodes.csv.zst</code>, as a
+ * zst-compressed sorted list of SWHIDs, one per line.</li>
+ * <li>The set of edge labels is written in <code>${outputBasename}.labels.csv.zst</code>, as a
+ * zst-compressed sorted list of labels encoded in base64, one per line.</li>
+ * <li>The number of unique nodes referred to in the graph is written in a text file,
+ * <code>${outputBasename}.nodes.count.txt</code></li>
+ * <li>The number of unique edges referred to in the graph is written in a text file,
+ * <code>${outputBasename}.edges.count.txt</code></li>
+ * <li>The number of unique edge labels is written in a text file,
+ * <code>${outputBasename}.labels.count.txt</code></li>
+ * <li>Statistics on the number of nodes of each type are written in a text file,
+ * <code>${outputBasename}.nodes.stats.txt</code></li>
+ * <li>Statistics on the number of edges of each type are written in a text file,
+ * <code>${outputBasename}.edges.stats.txt</code></li>
+ * </ul>
+ *
+ * <p>
+ * Rationale: Because the graph can contain holes, loose objects and dangling
+ * objects, some nodes that are referred to as destinations in the edge relationships might not
+ * actually be stored in the graph itself. However, to compress the graph using a graph compression
+ * library, it is necessary to have a list of all the nodes in the graph, including the
+ * ones that are simply referred to by the edges but not actually stored as concrete objects.
+ * </p>
+ *
+ * <p>
+ * This class reads the entire graph dataset, and uses <code>sort -u</code> to extract the set of
+ * all the unique nodes and unique labels that will be needed as an input for the compression
+ * process.
+ * </p>
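+ *
+ * <p>
+ * A programmatic sketch (the paths, the buffer size and the temporary directory are hypothetical;
+ * <code>extractNodes</code> is the entry point also used by <code>main</code>):
+ * </p>
+ *
+ * <pre>{@code
+ * // "100M" is the memory buffer size given to each spawned sort process.
+ * GraphDataset dataset = new ORCGraphDataset("/srv/dataset/orc");
+ * ExtractNodes.extractNodes(dataset, "/srv/out/graph", "100M", new File("/tmp/extract-nodes"));
+ * }</pre>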
+ */ +public class ExtractNodes { + private final static Logger logger = LoggerFactory.getLogger(ExtractNodes.class); + + // Create one thread per processor. + final static int numThreads = Runtime.getRuntime().availableProcessors(); + + // Allocate up to 20% of maximum memory for sorting subprocesses. + final static long sortBufferSize = (long) (Runtime.getRuntime().maxMemory() * 0.2 / numThreads / 2); + + private static JSAPResult parseArgs(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{ + new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the edges dataset"), + new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, + "Basename of the output files"), + + new FlaggedOption("format", JSAP.STRING_PARSER, "orc", JSAP.NOT_REQUIRED, 'f', "format", + "Format of the input dataset (orc, csv)"), + new FlaggedOption("sortBufferSize", JSAP.STRING_PARSER, String.valueOf(sortBufferSize) + "b", + JSAP.NOT_REQUIRED, 'S', "sort-buffer-size", + "Size of the memory buffer used by each sort process"), + new FlaggedOption("sortTmpDir", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'T', "temp-dir", + "Path to the temporary directory used by sort")}); + + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch (JSAPException e) { + System.err.println("Usage error: " + e.getMessage()); + System.exit(1); + } + return config; + } + + public static void main(String[] args) throws IOException, InterruptedException { + JSAPResult parsedArgs = parseArgs(args); + String datasetPath = parsedArgs.getString("dataset"); + String outputBasename = parsedArgs.getString("outputBasename"); + + String datasetFormat = parsedArgs.getString("format"); + String sortBufferSize = parsedArgs.getString("sortBufferSize"); + String sortTmpPath = parsedArgs.getString("sortTmpDir", null); + + File sortTmpDir = new File(sortTmpPath); + sortTmpDir.mkdirs(); + + // Open edge dataset + GraphDataset dataset; + if (datasetFormat.equals("orc")) { + dataset = new ORCGraphDataset(datasetPath); + } else if (datasetFormat.equals("csv")) { + dataset = new CSVEdgeDataset(datasetPath); + } else { + throw new IllegalArgumentException("Unknown dataset format: " + datasetFormat); + } + + extractNodes(dataset, outputBasename, sortBufferSize, sortTmpDir); + } + + public static void extractNodes(GraphDataset dataset, String outputBasename, String sortBufferSize, File sortTmpDir) + throws IOException, InterruptedException { + // Read the dataset and write the nodes and labels to the sorting processes + AtomicLong edgeCount = new AtomicLong(0); + AtomicLongArray edgeCountByType = new AtomicLongArray(SwhType.values().length * SwhType.values().length); + + int numThreads = Runtime.getRuntime().availableProcessors(); + ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); + + Process[] nodeSorters = new Process[numThreads]; + File[] nodeBatchPaths = new File[numThreads]; + Process[] labelSorters = new Process[numThreads]; + File[] labelBatches = new File[numThreads]; + long[] progressCounts = new long[numThreads]; + + AtomicInteger nextThreadId = new AtomicInteger(0); + ThreadLocal threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement); + + ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS); + pl.itemsName = "edges"; + pl.start("Reading node/edge files and writing sorted batches."); + + GraphDataset.NodeCallback nodeCallback = (node) -> { + int threadId = 
threadLocalId.get(); + if (nodeSorters[threadId] == null) { + nodeBatchPaths[threadId] = File.createTempFile("nodes", ".txt", sortTmpDir); + nodeSorters[threadId] = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), + List.of("-o", nodeBatchPaths[threadId].getPath())); + } + OutputStream nodeOutputStream = nodeSorters[threadId].getOutputStream(); + nodeOutputStream.write(node); + nodeOutputStream.write('\n'); + }; + + GraphDataset.NodeCallback labelCallback = (label) -> { + int threadId = threadLocalId.get(); + if (labelSorters[threadId] == null) { + labelBatches[threadId] = File.createTempFile("labels", ".txt", sortTmpDir); + labelSorters[threadId] = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), + List.of("-o", labelBatches[threadId].getPath())); + } + OutputStream labelOutputStream = labelSorters[threadId].getOutputStream(); + labelOutputStream.write(label); + labelOutputStream.write('\n'); + }; + + try { + forkJoinPool.submit(() -> { + try { + dataset.readEdges((node) -> { + nodeCallback.onNode(node); + }, (src, dst, label, perm) -> { + nodeCallback.onNode(src); + nodeCallback.onNode(dst); + + if (label != null) { + labelCallback.onNode(label); + } + edgeCount.incrementAndGet(); + // Extract type of src and dst from their SWHID: swh:1:XXX + byte[] srcTypeBytes = Arrays.copyOfRange(src, 6, 6 + 3); + byte[] dstTypeBytes = Arrays.copyOfRange(dst, 6, 6 + 3); + int srcType = SwhType.byteNameToInt(srcTypeBytes); + int dstType = SwhType.byteNameToInt(dstTypeBytes); + if (srcType != -1 && dstType != -1) { + edgeCountByType.incrementAndGet(srcType * SwhType.values().length + dstType); + } else { + System.err.println("Invalid edge type: " + new String(srcTypeBytes) + " -> " + + new String(dstTypeBytes)); + System.exit(1); + } + + int threadId = threadLocalId.get(); + if (++progressCounts[threadId] > 1000) { + synchronized (pl) { + pl.update(progressCounts[threadId]); + } + progressCounts[threadId] = 0; + } + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + }).get(); + } catch (ExecutionException e) { + throw new RuntimeException(e); + } + + // Close all the sorters stdin + for (int i = 0; i < numThreads; i++) { + if (nodeSorters[i] != null) { + nodeSorters[i].getOutputStream().close(); + } + if (labelSorters[i] != null) { + labelSorters[i].getOutputStream().close(); + } + } + + // Wait for sorting processes to finish + for (int i = 0; i < numThreads; i++) { + if (nodeSorters[i] != null) { + nodeSorters[i].waitFor(); + } + if (labelSorters[i] != null) { + labelSorters[i].waitFor(); + } + } + pl.done(); + + ArrayList nodeSortMergerOptions = new ArrayList<>(List.of("-m")); + ArrayList labelSortMergerOptions = new ArrayList<>(List.of("-m")); + for (int i = 0; i < numThreads; i++) { + if (nodeBatchPaths[i] != null) { + nodeSortMergerOptions.add(nodeBatchPaths[i].getPath()); + } + if (labelBatches[i] != null) { + labelSortMergerOptions.add(labelBatches[i].getPath()); + } + } + + // Spawn node merge-sorting process + Process nodeSortMerger = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), nodeSortMergerOptions); + nodeSortMerger.getOutputStream().close(); + OutputStream nodesFileOutputStream = new ZstdOutputStream( + new BufferedOutputStream(new FileOutputStream(outputBasename + ".nodes.csv.zst"))); + NodesOutputThread nodesOutputThread = new NodesOutputThread( + new BufferedInputStream(nodeSortMerger.getInputStream()), nodesFileOutputStream); + nodesOutputThread.start(); + + // Spawn label merge-sorting process + Process labelSortMerger = 
Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), labelSortMergerOptions); + labelSortMerger.getOutputStream().close(); + OutputStream labelsFileOutputStream = new ZstdOutputStream( + new BufferedOutputStream(new FileOutputStream(outputBasename + ".labels.csv.zst"))); + LabelsOutputThread labelsOutputThread = new LabelsOutputThread( + new BufferedInputStream(labelSortMerger.getInputStream()), labelsFileOutputStream); + labelsOutputThread.start(); + + pl.logger().info("Waiting for merge-sort and writing output files..."); + nodeSortMerger.waitFor(); + labelSortMerger.waitFor(); + nodesOutputThread.join(); + labelsOutputThread.join(); + + long[][] edgeCountByTypeArray = new long[SwhType.values().length][SwhType.values().length]; + for (int i = 0; i < edgeCountByTypeArray.length; i++) { + for (int j = 0; j < edgeCountByTypeArray[i].length; j++) { + edgeCountByTypeArray[i][j] = edgeCountByType.get(i * SwhType.values().length + j); + } + } + + // Write node, edge and label counts/statistics + printEdgeCounts(outputBasename, edgeCount.get(), edgeCountByTypeArray); + printNodeCounts(outputBasename, nodesOutputThread.getNodeCount(), nodesOutputThread.getNodeTypeCounts()); + printLabelCounts(outputBasename, labelsOutputThread.getLabelCount()); + + // Clean up sorted batches + for (int i = 0; i < numThreads; i++) { + if (nodeBatchPaths[i] != null) { + nodeBatchPaths[i].delete(); + } + if (labelBatches[i] != null) { + labelBatches[i].delete(); + } + } + } + + private static void printEdgeCounts(String basename, long edgeCount, long[][] edgeTypeCounts) throws IOException { + PrintWriter nodeCountWriter = new PrintWriter(basename + ".edges.count.txt"); + nodeCountWriter.println(edgeCount); + nodeCountWriter.close(); + + PrintWriter nodeTypesCountWriter = new PrintWriter(basename + ".edges.stats.txt"); + TreeMap edgeTypeCountsMap = new TreeMap<>(); + for (SwhType src : SwhType.values()) { + for (SwhType dst : SwhType.values()) { + long cnt = edgeTypeCounts[SwhType.toInt(src)][SwhType.toInt(dst)]; + if (cnt > 0) + edgeTypeCountsMap.put(src.toString().toLowerCase() + ":" + dst.toString().toLowerCase(), cnt); + } + } + for (Map.Entry entry : edgeTypeCountsMap.entrySet()) { + nodeTypesCountWriter.println(entry.getKey() + " " + entry.getValue()); + } + nodeTypesCountWriter.close(); + } + + private static void printNodeCounts(String basename, long nodeCount, long[] nodeTypeCounts) throws IOException { + PrintWriter nodeCountWriter = new PrintWriter(basename + ".nodes.count.txt"); + nodeCountWriter.println(nodeCount); + nodeCountWriter.close(); + + PrintWriter nodeTypesCountWriter = new PrintWriter(basename + ".nodes.stats.txt"); + TreeMap nodeTypeCountsMap = new TreeMap<>(); + for (SwhType v : SwhType.values()) { + nodeTypeCountsMap.put(v.toString().toLowerCase(), nodeTypeCounts[SwhType.toInt(v)]); + } + for (Map.Entry entry : nodeTypeCountsMap.entrySet()) { + nodeTypesCountWriter.println(entry.getKey() + " " + entry.getValue()); + } + nodeTypesCountWriter.close(); + } + + private static void printLabelCounts(String basename, long labelCount) throws IOException { + PrintWriter nodeCountWriter = new PrintWriter(basename + ".labels.count.txt"); + nodeCountWriter.println(labelCount); + nodeCountWriter.close(); + } + + private static class NodesOutputThread extends Thread { + private final InputStream sortedNodesStream; + private final OutputStream nodesOutputStream; + + private long nodeCount = 0; + private final long[] nodeTypeCounts = new long[SwhType.values().length]; + + NodesOutputThread(InputStream 
sortedNodesStream, OutputStream nodesOutputStream) { + this.sortedNodesStream = sortedNodesStream; + this.nodesOutputStream = nodesOutputStream; + } + + @Override + public void run() { + BufferedReader reader = new BufferedReader( + new InputStreamReader(sortedNodesStream, StandardCharsets.UTF_8)); + try { + String line; + while ((line = reader.readLine()) != null) { + nodesOutputStream.write(line.getBytes(StandardCharsets.UTF_8)); + nodesOutputStream.write('\n'); + nodeCount++; + try { + SwhType nodeType = SwhType.fromStr(line.split(":")[2]); + nodeTypeCounts[SwhType.toInt(nodeType)]++; + } catch (ArrayIndexOutOfBoundsException e) { + System.err.println("Error parsing SWHID: " + line); + System.exit(1); + } + } + nodesOutputStream.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public long getNodeCount() { + return nodeCount; + } + + public long[] getNodeTypeCounts() { + return nodeTypeCounts; + } + } + + private static class LabelsOutputThread extends Thread { + private final InputStream sortedLabelsStream; + private final OutputStream labelsOutputStream; + + private long labelCount = 0; + + LabelsOutputThread(InputStream sortedLabelsStream, OutputStream labelsOutputStream) { + this.labelsOutputStream = labelsOutputStream; + this.sortedLabelsStream = sortedLabelsStream; + } + + @Override + public void run() { + BufferedReader reader = new BufferedReader( + new InputStreamReader(sortedLabelsStream, StandardCharsets.UTF_8)); + try { + String line; + while ((line = reader.readLine()) != null) { + labelsOutputStream.write(line.getBytes(StandardCharsets.UTF_8)); + labelsOutputStream.write('\n'); + labelCount++; + } + labelsOutputStream.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public long getLabelCount() { + return labelCount; + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java b/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java new file mode 100644 index 0000000..fc5cc5b --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; + +import com.github.luben.zstd.ZstdOutputStream; +import com.martiansoftware.jsap.*; +import org.softwareheritage.graph.utils.Sort; + +import java.io.*; +import java.nio.charset.StandardCharsets; + +/** + * Read a graph dataset and extract all the unique authors it contains. + * + *
+ * <p>
+ * This class reads the revision and release tables of the graph dataset, and uses
+ * <code>sort -u</code> to extract the set of all the unique persons (name + email, potentially
+ * pseudonymized) and store them in a file.
+ * </p>
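+ *
+ * <p>
+ * A programmatic sketch (paths are hypothetical; "30%" mirrors the default sort buffer size of
+ * the command-line interface):
+ * </p>
+ *
+ * <pre>{@code
+ * // Writes /srv/out/graph.persons.csv.zst and /srv/out/graph.persons.count.txt.
+ * ORCGraphDataset dataset = new ORCGraphDataset("/srv/dataset/orc");
+ * ExtractPersons.extractPersons(dataset, "/srv/out/graph", "30%", "/tmp/extract-persons");
+ * }</pre>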
+ */ +public class ExtractPersons { + private static JSAPResult parseArgs(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{ + new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC dataset"), + new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, + "Basename of the output files"), + + new FlaggedOption("sortBufferSize", JSAP.STRING_PARSER, "30%", JSAP.NOT_REQUIRED, 'S', + "sort-buffer-size", "Size of the memory buffer used by sort"), + new FlaggedOption("sortTmpDir", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'T', "temp-dir", + "Path to the temporary directory used by sort")}); + + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch (JSAPException e) { + System.err.println("Usage error: " + e.getMessage()); + System.exit(1); + } + return config; + } + + private static void processAuthorColumn(ORCGraphDataset.SwhOrcTable table, String columnName, OutputStream stream) + throws IOException { + table.readBytes64Column(columnName, (swhid, personBase64) -> { + stream.write(personBase64); + stream.write('\n'); + }); + } + + public static void main(String[] args) throws IOException, InterruptedException { + JSAPResult parsedArgs = parseArgs(args); + String datasetPath = parsedArgs.getString("dataset"); + String outputBasename = parsedArgs.getString("outputBasename"); + + String sortBufferSize = parsedArgs.getString("sortBufferSize"); + String sortTmpDir = parsedArgs.getString("sortTmpDir", null); + + ORCGraphDataset dataset = new ORCGraphDataset(datasetPath); + + extractPersons(dataset, outputBasename, sortBufferSize, sortTmpDir); + } + + public static void extractPersons(ORCGraphDataset dataset, String outputBasename, String sortBufferSize, + String sortTmpDir) throws IOException, InterruptedException { + (new File(sortTmpDir)).mkdirs(); + + // Spawn person sorting process + Process personSort = Sort.spawnSort(sortBufferSize, sortTmpDir); + BufferedOutputStream personSortStdin = new BufferedOutputStream(personSort.getOutputStream()); + BufferedInputStream personSortStdout = new BufferedInputStream(personSort.getInputStream()); + OutputStream personsFileOutputStream = new ZstdOutputStream( + new BufferedOutputStream(new FileOutputStream(outputBasename + ".persons.csv.zst"))); + PersonsOutputThread personsOutputThread = new PersonsOutputThread(personSortStdout, personsFileOutputStream); + personsOutputThread.start(); + + processAuthorColumn(dataset.getTable("release"), "author", personSortStdin); + processAuthorColumn(dataset.getTable("revision"), "author", personSortStdin); + processAuthorColumn(dataset.getTable("revision"), "committer", personSortStdin); + + // Wait for sorting processes to finish + personSortStdin.close(); + personSort.waitFor(); + personsOutputThread.join(); + + // Write person count statistics + printPersonsCounts(outputBasename, personsOutputThread.getPersonCount()); + } + + private static void printPersonsCounts(String basename, long labelCount) throws IOException { + PrintWriter nodeCountWriter = new PrintWriter(basename + ".persons.count.txt"); + nodeCountWriter.println(labelCount); + nodeCountWriter.close(); + } + + private static class PersonsOutputThread extends Thread { + private final InputStream sortedPersonsStream; + private final OutputStream personsOutputStream; + + private long personCount = 0; + + PersonsOutputThread(InputStream sortedNodesStream, OutputStream nodesOutputStream) { 
+ this.sortedPersonsStream = sortedNodesStream; + this.personsOutputStream = nodesOutputStream; + } + + @Override + public void run() { + BufferedReader reader = new BufferedReader( + new InputStreamReader(sortedPersonsStream, StandardCharsets.UTF_8)); + try { + String line; + while ((line = reader.readLine()) != null) { + personsOutputStream.write(line.getBytes(StandardCharsets.UTF_8)); + personsOutputStream.write('\n'); + personCount++; + } + personsOutputStream.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public long getPersonCount() { + return personCount; + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java new file mode 100644 index 0000000..ae38cda --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; + +import java.io.IOException; + +/** + * GraphDataset is a common interface to represent on-disk graph datasets in various formats, + * usually extracted from the SWH archive with the swh-dataset tool. + */ +public interface GraphDataset { + interface NodeCallback { + void onNode(byte[] node) throws IOException; + } + + interface EdgeCallback { + void onEdge(byte[] src, byte[] dst, byte[] label, int permission) throws IOException; + } + + /** + * Read the graph dataset and call the callback methods for each node and edge encountered. + * + *
+ * <ul>
+ * <li>The node callback is called for each object stored in the graph.</li>
+ * <li>The edge callback is called for each relationship (between two nodes) stored in the
+ * graph.</li>
+ * </ul>
+ *
+ * <p>
+ * Note that because the graph can contain holes, loose objects and dangling objects, the edge
+ * callback may be called with parameters representing nodes that are not stored in the graph. This
+ * is because some nodes that are referred to as destinations in the dataset might not be present in
+ * the archive (e.g., a revision entry in a directory pointing to a revision that we have not
+ * crawled yet).
+ * </p>
+ *
+ * <p>
+ * In order to generate a complete set of all the nodes that are referred to in the graph
+ * dataset, see the {@link ExtractNodes} class.
+ * </p>
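+ *
+ * <p>
+ * An illustrative sketch (the counters are hypothetical, not part of the interface):
+ * </p>
+ *
+ * <pre>{@code
+ * // Count the nodes and edges of any GraphDataset implementation.
+ * AtomicLong nodes = new AtomicLong();
+ * AtomicLong edges = new AtomicLong();
+ * dataset.readEdges(
+ *     (node) -> nodes.incrementAndGet(),
+ *     (src, dst, label, perm) -> edges.incrementAndGet());
+ * }</pre>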
+ * + * @param nodeCb callback for each node + * @param edgeCb callback for each edge + */ + void readEdges(NodeCallback nodeCb, EdgeCallback edgeCb) throws IOException; + + interface TimestampCallback { + void onTimestamp(byte[] swhid, long timestamp, short offset) throws IOException; + } + + interface LongCallback { + void onLong(byte[] swhid, long value) throws IOException; + } + + interface BytesCallback { + void onBytes(byte[] swhid, byte[] value) throws IOException; + } + + interface HashedEdgeCallback { + void onHashedEdge(long src, long dst, long label, int permission) throws IOException; + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java new file mode 100644 index 0000000..31531ec --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2020-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; + +import com.martiansoftware.jsap.*; +import it.unimi.dsi.big.webgraph.LazyLongIterator; +import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; +import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; +import it.unimi.dsi.fastutil.Arrays; +import it.unimi.dsi.fastutil.BigArrays; +import it.unimi.dsi.fastutil.Size64; +import it.unimi.dsi.fastutil.longs.LongBigArrays; +import it.unimi.dsi.fastutil.longs.LongHeapSemiIndirectPriorityQueue; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import it.unimi.dsi.io.InputBitStream; +import it.unimi.dsi.io.OutputBitStream; +import it.unimi.dsi.logging.ProgressLogger; +import it.unimi.dsi.big.webgraph.ImmutableGraph; +import it.unimi.dsi.big.webgraph.NodeIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.labels.DirEntry; +import org.softwareheritage.graph.labels.SwhLabel; +import org.softwareheritage.graph.maps.NodeIdMap; +import org.softwareheritage.graph.utils.ForkJoinBigQuickSort2; +import org.softwareheritage.graph.utils.ForkJoinQuickSort3; + +import java.io.*; +import java.nio.file.Paths; +import java.util.*; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.IntStream; + +public class LabelMapBuilder { + final static Logger logger = LoggerFactory.getLogger(LabelMapBuilder.class); + + // Create one thread per processor. + final static int numThreads = Runtime.getRuntime().availableProcessors(); + // Allocate up to 40% of maximum memory. 
+ final static int DEFAULT_BATCH_SIZE = Math + .min((int) (Runtime.getRuntime().maxMemory() * 0.4 / (numThreads * 8 * 3)), Arrays.MAX_ARRAY_SIZE); + + String orcDatasetPath; + String graphPath; + String outputGraphPath; + String tmpDir; + int batchSize; + + long numNodes; + long numArcs; + + NodeIdMap nodeIdMap; + Object2LongFunction filenameMph; + long numFilenames; + int totalLabelWidth; + + public LabelMapBuilder(String orcDatasetPath, String graphPath, String outputGraphPath, int batchSize, + String tmpDir) throws IOException { + this.orcDatasetPath = orcDatasetPath; + this.graphPath = graphPath; + this.outputGraphPath = (outputGraphPath == null) ? graphPath : outputGraphPath; + this.batchSize = batchSize; + this.tmpDir = tmpDir; + + ImmutableGraph graph = ImmutableGraph.loadOffline(graphPath); + this.numArcs = graph.numArcs(); + this.numNodes = graph.numNodes(); + + this.nodeIdMap = new NodeIdMap(graphPath); + + filenameMph = NodeIdMap.loadMph(graphPath + ".labels.mph"); + numFilenames = getMPHSize(filenameMph); + totalLabelWidth = DirEntry.labelWidth(numFilenames); + } + + private static JSAPResult parse_args(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "", new Parameter[]{ + new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"), + new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.REQUIRED, "Basename of the output graph"), + new FlaggedOption("outputGraphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', + "output-graph", "Basename of the output graph, same as --graph if not specified"), + new FlaggedOption("batchSize", JSAP.INTEGER_PARSER, String.valueOf(DEFAULT_BATCH_SIZE), + JSAP.NOT_REQUIRED, 'b', "batch-size", "Number of triplets held in memory in each batch"), + new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 'T', "temp-dir", + "Temporary directory path"),}); + + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch (JSAPException e) { + e.printStackTrace(); + } + return config; + } + + public static void main(String[] args) throws IOException, InterruptedException { + JSAPResult config = parse_args(args); + String orcDataset = config.getString("dataset"); + String graphPath = config.getString("graphPath"); + String outputGraphPath = config.getString("outputGraphPath"); + int batchSize = config.getInt("batchSize"); + String tmpDir = config.getString("tmpDir"); + + LabelMapBuilder builder = new LabelMapBuilder(orcDataset, graphPath, outputGraphPath, batchSize, tmpDir); + + builder.computeLabelMap(); + } + + static long getMPHSize(Object2LongFunction mph) { + return (mph instanceof Size64) ? 
((Size64) mph).size64() : mph.size(); + } + + void computeLabelMap() throws IOException { + File tempDirFile = new File(tmpDir); + ObjectArrayList forwardBatches = new ObjectArrayList<>(); + ObjectArrayList backwardBatches = new ObjectArrayList<>(); + genSortedBatches(forwardBatches, backwardBatches, tempDirFile); + + BatchEdgeLabelLineIterator forwardBatchHeapIterator = new BatchEdgeLabelLineIterator(forwardBatches); + writeLabels(forwardBatchHeapIterator, graphPath, outputGraphPath); + for (File batch : forwardBatches) { + batch.delete(); + } + + BatchEdgeLabelLineIterator backwardBatchHeapIterator = new BatchEdgeLabelLineIterator(backwardBatches); + writeLabels(backwardBatchHeapIterator, graphPath + "-transposed", outputGraphPath + "-transposed"); + for (File batch : backwardBatches) { + batch.delete(); + } + + logger.info("Done"); + } + + void genSortedBatches(ObjectArrayList forwardBatches, ObjectArrayList backwardBatches, File tempDirFile) + throws IOException { + logger.info("Initializing batch arrays."); + long[][] srcArrays = new long[numThreads][batchSize]; + long[][] dstArrays = new long[numThreads][batchSize]; + long[][] labelArrays = new long[numThreads][batchSize]; + int[] indexes = new int[numThreads]; + long[] progressCounts = new long[numThreads]; + + ProgressLogger plSortingBatches = new ProgressLogger(logger, 10, TimeUnit.SECONDS); + plSortingBatches.itemsName = "edges"; + plSortingBatches.expectedUpdates = this.numArcs; + plSortingBatches.start("Reading edges and writing sorted batches."); + + AtomicInteger nextThreadId = new AtomicInteger(0); + ThreadLocal threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement); + + readHashedEdgeLabels((src, dst, label, perms) -> { + // System.err.println("0. Input " + src + " " + dst + " " + label + " " + perms); + int threadId = threadLocalId.get(); + int idx = indexes[threadId]++; + srcArrays[threadId][idx] = src; + dstArrays[threadId][idx] = dst; + labelArrays[threadId][idx] = DirEntry.toEncoded(label, perms); + if (++progressCounts[threadId] > 1000) { + synchronized (plSortingBatches) { + plSortingBatches.update(progressCounts[threadId]); + } + progressCounts[threadId] = 0; + } + + if (idx == batchSize - 1) { + processBidirectionalBatches(batchSize, srcArrays[threadId], dstArrays[threadId], labelArrays[threadId], + tempDirFile, forwardBatches, backwardBatches); + indexes[threadId] = 0; + } + }); + + IntStream.range(0, numThreads).parallel().forEach(t -> { + int idx = indexes[t]; + if (idx > 0) { + try { + processBidirectionalBatches(idx, srcArrays[t], dstArrays[t], labelArrays[t], tempDirFile, + forwardBatches, backwardBatches); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }); + + // Trigger the GC to free up the large arrays + for (int i = 0; i < numThreads; i++) { + srcArrays[i] = null; + dstArrays[i] = null; + labelArrays[i] = null; + } + + logger.info("Created " + forwardBatches.size() + " forward batches and " + backwardBatches.size() + + " backward batches."); + } + + void readHashedEdgeLabels(GraphDataset.HashedEdgeCallback cb) throws IOException { + ORCGraphDataset dataset = new ORCGraphDataset(orcDatasetPath); + ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); + try { + forkJoinPool.submit(() -> { + try { + dataset.readEdges((node) -> { + }, (src, dst, label, perms) -> { + if (label == null) { + return; + } + long srcNode = nodeIdMap.getNodeId(src); + long dstNode = nodeIdMap.getNodeId(dst); + long labelId = filenameMph.getLong(label); + cb.onHashedEdge(srcNode, dstNode, 
labelId, perms); + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + }).get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + } + + void processBidirectionalBatches(final int n, final long[] source, final long[] target, final long[] labels, + final File tempDir, final List forwardBatches, final List backwardBatches) throws IOException { + processBatch(n, source, target, labels, tempDir, forwardBatches); + processBatch(n, target, source, labels, tempDir, backwardBatches); + } + + void processBatch(final int n, final long[] source, final long[] target, final long[] labels, final File tempDir, + final List batches) throws IOException { + if (n == 0) { + return; + } + ForkJoinQuickSort3.parallelQuickSort(source, target, labels, 0, n); + + final File batchFile = File.createTempFile("batch", ".bitstream", tempDir); + batchFile.deleteOnExit(); + batches.add(batchFile); + final OutputBitStream batch = new OutputBitStream(batchFile); + + // Compute unique triplets + int u = 1; + for (int i = n - 1; i-- != 0;) { + if (source[i] != source[i + 1] || target[i] != target[i + 1] || labels[i] != labels[i + 1]) { + u++; + } + } + batch.writeDelta(u); + + // Write batch + long prevSource = source[0]; + batch.writeLongDelta(prevSource); + batch.writeLongDelta(target[0]); + batch.writeLongDelta(labels[0]); + // System.err.println("1. Wrote " + prevSource + " " + target[0] + " " + labels[0]); + + for (int i = 1; i < n; i++) { + if (source[i] != prevSource) { + // Default case, we write (source - prevsource, target, label) + batch.writeLongDelta(source[i] - prevSource); + batch.writeLongDelta(target[i]); + batch.writeLongDelta(labels[i]); + prevSource = source[i]; + } else if (target[i] != target[i - 1] || labels[i] != labels[i - 1]) { + // Case where source is identical with prevsource, but target or label differ. + // We write (0, target - prevtarget, label) + batch.writeLongDelta(0); + batch.writeLongDelta(target[i] - target[i - 1]); + batch.writeLongDelta(labels[i]); + } else { + continue; + } + // System.err.println("1. 
Wrote " + source[i] + " " + target[i] + " " + labels[i]); + } + batch.close(); + } + + void writeLabels(EdgeLabelLineIterator mapLines, String graphBasename, String outputGraphBasename) + throws IOException { + // Loading the graph to iterate + ImmutableGraph graph = ImmutableGraph.loadMapped(graphBasename); + + // Get the sorted output and write the labels and label offsets + ProgressLogger plLabels = new ProgressLogger(logger, 10, TimeUnit.SECONDS); + plLabels.itemsName = "edges"; + plLabels.expectedUpdates = this.numArcs; + plLabels.start("Writing the labels to the label file: " + outputGraphBasename + "-labelled.*"); + + OutputBitStream labels = new OutputBitStream( + new File(outputGraphBasename + "-labelled" + BitStreamArcLabelledImmutableGraph.LABELS_EXTENSION)); + OutputBitStream offsets = new OutputBitStream(new File( + outputGraphBasename + "-labelled" + BitStreamArcLabelledImmutableGraph.LABEL_OFFSETS_EXTENSION)); + offsets.writeGamma(0); + + EdgeLabelLine line = new EdgeLabelLine(-1, -1, -1, -1); + + NodeIterator it = graph.nodeIterator(); + boolean started = false; + + ArrayList labelBuffer = new ArrayList<>(128); + while (it.hasNext()) { + long srcNode = it.nextLong(); + + long bits = 0; + LazyLongIterator s = it.successors(); + long dstNode; + while ((dstNode = s.nextLong()) >= 0) { + while (line != null && line.srcNode <= srcNode && line.dstNode <= dstNode) { + if (line.srcNode == srcNode && line.dstNode == dstNode) { + labelBuffer.add(new DirEntry(line.filenameId, line.permission)); + } + + if (!mapLines.hasNext()) + break; + + line = mapLines.next(); + if (!started) { + plLabels.start("Writing label map to file..."); + started = true; + } + } + + SwhLabel l = new SwhLabel("edgelabel", totalLabelWidth, labelBuffer.toArray(new DirEntry[0])); + labelBuffer.clear(); + bits += l.toBitStream(labels, -1); + plLabels.lightUpdate(); + } + offsets.writeLongGamma(bits); + } + + labels.close(); + offsets.close(); + plLabels.done(); + + graph = null; + + PrintWriter pw = new PrintWriter(new FileWriter(outputGraphBasename + "-labelled.properties")); + pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName()); + pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + SwhLabel.class.getName() + + "(DirEntry," + totalLabelWidth + ")"); + pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = " + + Paths.get(outputGraphBasename).getFileName()); + pw.close(); + } + + public static class EdgeLabelLine { + public long srcNode; + public long dstNode; + public long filenameId; + public int permission; + + public EdgeLabelLine(long labelSrcNode, long labelDstNode, long labelFilenameId, int labelPermission) { + this.srcNode = labelSrcNode; + this.dstNode = labelDstNode; + this.filenameId = labelFilenameId; + this.permission = labelPermission; + } + } + + public abstract static class EdgeLabelLineIterator implements Iterator { + @Override + public abstract boolean hasNext(); + + @Override + public abstract EdgeLabelLine next(); + } + + public static class BatchEdgeLabelLineIterator extends EdgeLabelLineIterator { + private static final int STD_BUFFER_SIZE = 128 * 1024; + + private final InputBitStream[] batchIbs; + private final int[] inputStreamLength; + private final long[] refArray; + private final LongHeapSemiIndirectPriorityQueue queue; + private final long[] prevTarget; + + /** The last returned node (-1 if no node has been returned yet). 
*/ + private long lastNode; + private long[][] lastNodeSuccessors = LongBigArrays.EMPTY_BIG_ARRAY; + private long[][] lastNodeLabels = LongBigArrays.EMPTY_BIG_ARRAY; + private long lastNodeOutdegree; + private long lastNodeCurrentSuccessor; + + public BatchEdgeLabelLineIterator(final List batches) throws IOException { + this.batchIbs = new InputBitStream[batches.size()]; + this.refArray = new long[batches.size()]; + this.prevTarget = new long[batches.size()]; + this.queue = new LongHeapSemiIndirectPriorityQueue(refArray); + this.inputStreamLength = new int[batches.size()]; + + for (int i = 0; i < batches.size(); i++) { + batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE); + this.inputStreamLength[i] = batchIbs[i].readDelta(); + this.refArray[i] = batchIbs[i].readLongDelta(); + queue.enqueue(i); + } + + this.lastNode = -1; + this.lastNodeOutdegree = 0; + this.lastNodeCurrentSuccessor = 0; + } + + public boolean hasNextNode() { + return !queue.isEmpty(); + } + + private void readNextNode() throws IOException { + assert hasNext(); + + int i; + lastNode++; + lastNodeOutdegree = 0; + lastNodeCurrentSuccessor = 0; + + /* + * We extract elements from the queue as long as their target is equal to last. If during the + * process we exhaust a batch, we close it. + */ + while (!queue.isEmpty() && refArray[i = queue.first()] == lastNode) { + lastNodeSuccessors = BigArrays.grow(lastNodeSuccessors, lastNodeOutdegree + 1); + lastNodeLabels = BigArrays.grow(lastNodeLabels, lastNodeOutdegree + 1); + + long target = prevTarget[i] += batchIbs[i].readLongDelta(); + long label = batchIbs[i].readLongDelta(); + BigArrays.set(lastNodeSuccessors, lastNodeOutdegree, target); + BigArrays.set(lastNodeLabels, lastNodeOutdegree, label); + + // System.err.println("2. Read " + lastNode + " " + target + " " + label); + if (--inputStreamLength[i] == 0) { + queue.dequeue(); + batchIbs[i].close(); + batchIbs[i] = null; + } else { + // We read a new source and update the queue. + final long sourceDelta = batchIbs[i].readLongDelta(); + if (sourceDelta != 0) { + refArray[i] += sourceDelta; + prevTarget[i] = 0; + queue.changed(); + } + } + lastNodeOutdegree++; + } + + // Neither quicksort nor heaps are stable, so we reestablish order here. + // LongBigArrays.radixSort(lastNodeSuccessors, lastNodeLabels, 0, lastNodeOutdegree); + ForkJoinBigQuickSort2.parallelQuickSort(lastNodeSuccessors, lastNodeLabels, 0, lastNodeOutdegree); + } + + @Override + public boolean hasNext() { + return lastNodeCurrentSuccessor < lastNodeOutdegree || hasNextNode(); + } + + @Override + public EdgeLabelLine next() { + if (lastNode == -1 || lastNodeCurrentSuccessor >= lastNodeOutdegree) { + try { + do { + readNextNode(); + } while (hasNextNode() && lastNodeOutdegree == 0); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + long src = lastNode; + long dst = BigArrays.get(lastNodeSuccessors, lastNodeCurrentSuccessor); + long compressedLabel = BigArrays.get(lastNodeLabels, lastNodeCurrentSuccessor); + long labelName = DirEntry.labelNameFromEncoded(compressedLabel); + int permission = DirEntry.permissionFromEncoded(compressedLabel); + // System.err.println("3. Output (encoded): " + src + " " + dst + " " + compressedLabel); + // System.err.println("4. 
Output (decoded): " + src + " " + dst + " " + labelName + " " + + // permission); + lastNodeCurrentSuccessor++; + return new EdgeLabelLine(src, dst, labelName, permission); + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java similarity index 90% rename from java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java rename to java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java index 626c747..105a921 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java +++ b/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java @@ -1,191 +1,201 @@ -package org.softwareheritage.graph.maps; +/* + * Copyright (c) 2019-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; +import com.github.luben.zstd.ZstdInputStream; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.Size64; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.fastutil.longs.LongBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.softwareheritage.graph.Node; import org.softwareheritage.graph.SWHID; +import org.softwareheritage.graph.SwhType; +import org.softwareheritage.graph.maps.NodeIdMap; +import org.softwareheritage.graph.maps.NodeTypesMap; import java.io.*; import java.nio.charset.StandardCharsets; import java.util.Scanner; import java.util.concurrent.TimeUnit; /** * Create maps needed at runtime by the graph service, in particular: *

* <p>
* <ul>
- * <li>SWHID → WebGraph long node id</li>
- * <li>WebGraph long node id → SWHID (converse of the former)</li>
+ * <li>WebGraph long node id → SWHID</li>
* <li>WebGraph long node id → SWH node type (enum)</li>
* </ul>
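* <p>
* A hypothetical lookup once these maps are built, using the {@code NodeIdMap} constructor and accessor that appear elsewhere in this diff (the SWHID value is a placeholder):
* <pre>{@code
* NodeIdMap nodeIdMap = new NodeIdMap("example/graph-basename");
* long nodeId = nodeIdMap.getNodeId("swh:1:rev:0000000000000000000000000000000000000000".getBytes());
* }</pre>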
* * @author The Software Heritage developers */ public class NodeMapBuilder { final static String SORT_BUFFER_SIZE = "40%"; final static Logger logger = LoggerFactory.getLogger(NodeMapBuilder.class); /** * Main entrypoint. * * @param args command line arguments */ public static void main(String[] args) throws IOException { if (args.length != 2) { logger.error("Usage: COMPRESSED_GRAPH_BASE_NAME TEMP_DIR < NODES_CSV"); System.exit(1); } String graphPath = args[0]; String tmpDir = args[1]; logger.info("starting maps generation..."); precomputeNodeIdMap(graphPath, tmpDir); logger.info("maps generation completed"); } /** * Computes and dumps on disk mapping files. * * @param graphPath path of the compressed graph */ static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException { ProgressLogger plSWHID2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS); ProgressLogger plNode2SWHID = new ProgressLogger(logger, 10, TimeUnit.SECONDS); - plSWHID2Node.itemsName = "Hashing swhid→node"; - plNode2SWHID.itemsName = "Building map node→swhid"; + plSWHID2Node.itemsName = "nodes"; + plNode2SWHID.itemsName = "nodes"; // first half of SWHID->node mapping: SWHID -> WebGraph MPH (long) Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph"); long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); plSWHID2Node.expectedUpdates = nbIds; plNode2SWHID.expectedUpdates = nbIds; // second half of SWHID->node mapping: WebGraph MPH (long) -> BFS order (long) long[][] bfsMap = LongBigArrays.newBigArray(nbIds); logger.info("loading BFS order file..."); long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap); logger.info("BFS order file loaded"); if (loaded != nbIds) { logger.error("graph contains " + nbIds + " nodes, but read " + loaded); System.exit(2); } /* * Read on stdin a list of SWHIDs, hash them with MPH, then permute them according to the .order * file */ - FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); + FastBufferedReader buffer = new FastBufferedReader( + new InputStreamReader(new ZstdInputStream(new BufferedInputStream(System.in)))); LineIterator swhidIterator = new LineIterator(buffer); /* * The WebGraph node id -> SWHID mapping can be obtained from the SWHID->node one by numerically * sorting on node id and sequentially writing obtained SWHIDs to a binary map. Delegates the * sorting job to /usr/bin/sort via pipes */ ProcessBuilder processBuilder = new ProcessBuilder(); processBuilder.command("sort", "--numeric-sort", "--key", "2", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); Process sort = processBuilder.start(); BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); // for the binary format of nodeToSwhidMap, see Python module swh.graph.swhid:IntToSwhidMap try (BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream( new FileOutputStream(graphPath + NodeIdMap.NODE_TO_SWHID))) { /* * background handler for sort output, it will be fed SWHID/node pairs, and will itself fill * nodeToSwhidMap as soon as data from sort is ready. */ SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToSwhidMap, plNode2SWHID); outputHandler.start(); /* * Type map from WebGraph node ID to SWH type. Used at runtime by pure Java graph traversals to * efficiently check edge restrictions. 
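* For example, with the 6 SWH node types (cnt, dir, ori, rel, rev, snp), this allocates ceil(log2(6)) = 3 bits per node.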
*/ - final int nbBitsPerNodeType = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2)); + final int nbBitsPerNodeType = (int) Math.ceil(Math.log(SwhType.values().length) / Math.log(2)); LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds); LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType); plSWHID2Node.start("Hashing SWHIDs to fill sort input"); for (long iNode = 0; iNode < nbIds && swhidIterator.hasNext(); iNode++) { String swhidStr = swhidIterator.next().toString(); SWHID swhid = new SWHID(swhidStr); long mphId = mphMap.getLong(swhidStr.getBytes(StandardCharsets.US_ASCII)); long nodeId = BigArrays.get(bfsMap, mphId); sort_stdin.write((swhidStr + "\t" + nodeId + "\n").getBytes(StandardCharsets.US_ASCII)); nodeTypesMap.set(nodeId, swhid.getType().ordinal()); plSWHID2Node.lightUpdate(); } plSWHID2Node.done(); sort_stdin.close(); // write type map logger.info("storing type map"); BinIO.storeObject(nodeTypesMap, graphPath + NodeTypesMap.NODE_TO_TYPE); logger.info("type map stored"); // wait for nodeToSwhidMap filling try { logger.info("waiting for node2swhid map..."); int sortExitCode = sort.waitFor(); if (sortExitCode != 0) { logger.error("sort returned non-zero exit code: " + sortExitCode); System.exit(2); } outputHandler.join(); } catch (InterruptedException e) { logger.error("processing of sort output failed with: " + e); System.exit(2); } } } private static class SortOutputHandler extends Thread { private final Scanner input; private final OutputStream output; private final ProgressLogger pl; SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) { this.input = new Scanner(input, StandardCharsets.US_ASCII); this.output = output; this.pl = pl; } public void run() { boolean sortDone = false; logger.info("node2swhid: waiting for sort output..."); while (input.hasNextLine()) { if (!sortDone) { sortDone = true; this.pl.start("filling node2swhid map"); } String line = input.nextLine(); // format: SWHID NODE_ID SWHID swhid = new SWHID(line.split("\\t")[0]); // get SWHID try { output.write(swhid.toBytes()); } catch (IOException e) { logger.error("writing to node->SWHID map failed with: " + e); } this.pl.lightUpdate(); } this.pl.done(); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java new file mode 100644 index 0000000..d16b5ae --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; + +import com.github.luben.zstd.ZstdOutputStream; +import com.google.common.primitives.Bytes; +import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import 
org.apache.orc.RecordReader; +import org.apache.orc.TypeDescription; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.*; +import java.util.*; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.ForkJoinTask; + +/** + * A graph dataset in ORC format. + * + * This format of dataset is a full export of the graph, including all the edge and node properties. + * + * For convenience purposes, this class also provides a main method to print all the edges of the + * graph, so that the output can be piped to + * {@link it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph}. + * + * Reading edges from ORC files using this class is about ~2.5 times slower than reading them + * directly from a plaintext format. + */ +public class ORCGraphDataset implements GraphDataset { + final static Logger logger = LoggerFactory.getLogger(ORCGraphDataset.class); + + final static public int ORC_BATCH_SIZE = 16 * 1024; + + private File datasetDir; + + protected ORCGraphDataset() { + } + + public ORCGraphDataset(String datasetPath) { + this(new File(datasetPath)); + } + + public ORCGraphDataset(File datasetDir) { + if (!datasetDir.exists()) { + throw new IllegalArgumentException("Dataset " + datasetDir.getName() + " does not exist"); + } + this.datasetDir = datasetDir; + } + + /** + * Return the given table as a {@link SwhOrcTable}. The return value can be down-casted to the type + * of the specific table it represents. + */ + public SwhOrcTable getTable(String tableName) { + File tableDir = new File(datasetDir, tableName); + if (!tableDir.exists()) { + return null; + } + switch (tableName) { + case "skipped_content": + return new SkippedContentOrcTable(tableDir); + case "content": + return new ContentOrcTable(tableDir); + case "directory": + return new DirectoryOrcTable(tableDir); + case "directory_entry": + return new DirectoryEntryOrcTable(tableDir); + case "revision": + return new RevisionOrcTable(tableDir); + case "revision_history": + return new RevisionHistoryOrcTable(tableDir); + case "release": + return new ReleaseOrcTable(tableDir); + case "snapshot_branch": + return new SnapshotBranchOrcTable(tableDir); + case "snapshot": + return new SnapshotOrcTable(tableDir); + case "origin_visit_status": + return new OriginVisitStatusOrcTable(tableDir); + case "origin_visit": + return new OriginVisitOrcTable(tableDir); + case "origin": + return new OriginOrcTable(tableDir); + default : + return null; + } + } + + /** Return all the tables in this dataset as a map of {@link SwhOrcTable}. */ + public Map allTables() { + HashMap tables = new HashMap<>(); + File[] tableDirs = datasetDir.listFiles(); + if (tableDirs == null) { + return tables; + } + for (File tableDir : tableDirs) { + SwhOrcTable table = getTable(tableDir.getName()); + if (table != null) { + tables.put(tableDir.getName(), table); + } + } + return tables; + } + + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + Map tables = allTables(); + for (SwhOrcTable table : tables.values()) { + table.readEdges(nodeCb, edgeCb); + } + } + + /** + * A class representing an ORC table, stored on disk as a set of ORC files all in the same + * directory. 
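+ * <p>
+ * A minimal read sketch; the table path and column name below are illustrative assumptions, not part of this class:
+ *
+ * <pre>{@code
+ * ORCTable table = ORCTable.load(new File("dataset/origin"));
+ * table.readOrcTable((batch, columnMap) -> {
+ *     // Fetch the column vector once per batch, then iterate the rows.
+ *     BytesColumnVector urls = (BytesColumnVector) batch.cols[columnMap.get("url")];
+ *     for (int row = 0; row < batch.size; row++) {
+ *         byte[] url = ORCTable.getBytesRow(urls, row); // null for null cells
+ *     }
+ * }, Set.of("url"));
+ * }</pre>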
+ */ + public static class ORCTable { + private final File tableDir; + + public ORCTable(File tableDir) { + if (!tableDir.exists()) { + throw new IllegalArgumentException("Table " + tableDir.getName() + " does not exist"); + } + this.tableDir = tableDir; + } + + public static ORCTable load(File tableDir) { + return new ORCTable(tableDir); + } + + /** + * Utility function for byte columns. Return as a byte array the value of the given row in the + * column vector. + */ + public static byte[] getBytesRow(BytesColumnVector columnVector, int row) { + if (columnVector.isRepeating) { + row = 0; + } + if (columnVector.isNull[row]) { + return null; + } + return Arrays.copyOfRange(columnVector.vector[row], columnVector.start[row], + columnVector.start[row] + columnVector.length[row]); + } + + /** + * Utility function for long columns. Return as a long the value of the given row in the column + * vector. + */ + public static Long getLongRow(LongColumnVector columnVector, int row) { + if (columnVector.isRepeating) { + row = 0; + } + if (columnVector.isNull[row]) { + return null; + } + return columnVector.vector[row]; + } + + interface ReadOrcBatchHandler { + void accept(VectorizedRowBatch batch, Map columnMap) throws IOException; + } + + /** + * Read the table, calling the given handler for each new batch of rows. Optionally, if columns is + * not null, will only scan the columns present in this set instead of the entire table. + * + * If this method is called from within a ForkJoinPool, the ORC table will be read in parallel using + * that thread pool. Otherwise, the ORC files will be read sequentially. + */ + public void readOrcTable(ReadOrcBatchHandler batchHandler, Set columns) throws IOException { + File[] listing = tableDir.listFiles(); + if (listing == null) { + throw new IOException("No files found in " + tableDir.getName()); + } + ForkJoinPool forkJoinPool = ForkJoinTask.getPool(); + if (forkJoinPool == null) { + // Sequential case + for (File file : listing) { + readOrcFile(file.getPath(), batchHandler, columns); + } + } else { + // Parallel case + ArrayList listingArray = new ArrayList<>(Arrays.asList(listing)); + listingArray.parallelStream().forEach(file -> { + try { + readOrcFile(file.getPath(), batchHandler, columns); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + } + + private void readOrcFile(String path, ReadOrcBatchHandler batchHandler, Set columns) + throws IOException { + try (Reader reader = OrcFile.createReader(new Path(path), OrcFile.readerOptions(new Configuration()))) { + TypeDescription schema = reader.getSchema(); + + Reader.Options options = reader.options(); + if (columns != null) { + options.include(createColumnsToRead(schema, columns)); + } + Map columnMap = getColumnMap(schema); + + try (RecordReader records = reader.rows(options)) { + VectorizedRowBatch batch = reader.getSchema().createRowBatch(ORC_BATCH_SIZE); + while (records.nextBatch(batch)) { + batchHandler.accept(batch, columnMap); + } + } + } + } + + private static Map getColumnMap(TypeDescription schema) { + Map columnMap = new HashMap<>(); + List fieldNames = schema.getFieldNames(); + for (int i = 0; i < fieldNames.size(); i++) { + columnMap.put(fieldNames.get(i), i); + } + return columnMap; + } + + private static boolean[] createColumnsToRead(TypeDescription schema, Set columns) { + boolean[] columnsToRead = new boolean[schema.getMaximumId() + 1]; + List fieldNames = schema.getFieldNames(); + List columnTypes = schema.getChildren(); + for (int i = 0; i < fieldNames.size(); i++) 
{ + if (columns.contains(fieldNames.get(i))) { + logger.debug("Adding column " + fieldNames.get(i) + " with ID " + i + " to the read list"); + TypeDescription type = columnTypes.get(i); + for (int id = type.getId(); id <= type.getMaximumId(); id++) { + columnsToRead[id] = true; + } + } + } + return columnsToRead; + } + } + + /** Base class for SWH-specific ORC tables. */ + public static class SwhOrcTable { + protected ORCTable orcTable; + + protected static final byte[] cntPrefix = "swh:1:cnt:".getBytes(); + protected static final byte[] dirPrefix = "swh:1:dir:".getBytes(); + protected static final byte[] revPrefix = "swh:1:rev:".getBytes(); + protected static final byte[] relPrefix = "swh:1:rel:".getBytes(); + protected static final byte[] snpPrefix = "swh:1:snp:".getBytes(); + protected static final byte[] oriPrefix = "swh:1:ori:".getBytes(); + + protected String getIdColumn() { + return "id"; + } + protected byte[] getSwhidPrefix() { + throw new UnsupportedOperationException(); + } + protected byte[] idToSwhid(byte[] id) { + return Bytes.concat(getSwhidPrefix(), id); + } + + protected SwhOrcTable() { + } + + public SwhOrcTable(File tableDir) { + orcTable = new ORCTable(tableDir); + } + + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + // No nodes or edges to read in the table by default. + } + + protected static byte[] urlToOriginId(byte[] url) { + return DigestUtils.sha1Hex(url).getBytes(); + } + + public void readIdColumn(NodeCallback cb) throws IOException { + orcTable.readOrcTable((batch, columnMap) -> { + BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())]; + + for (int row = 0; row < batch.size; row++) { + byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row)); + cb.onNode(id); + } + }, Set.of(getIdColumn())); + } + + public void readLongColumn(String longColumn, LongCallback cb) throws IOException { + orcTable.readOrcTable((batch, columnMap) -> { + BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())]; + LongColumnVector dateVector = (LongColumnVector) batch.cols[columnMap.get(longColumn)]; + + for (int row = 0; row < batch.size; row++) { + byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row)); + Long date = ORCTable.getLongRow(dateVector, row); + if (date != null) { + cb.onLong(id, date); + } + } + }, Set.of(getIdColumn(), longColumn)); + } + + public void readTimestampColumn(String dateColumn, String dateOffsetColumn, TimestampCallback cb) + throws IOException { + orcTable.readOrcTable((batch, columnMap) -> { + BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())]; + TimestampColumnVector dateVector = (TimestampColumnVector) batch.cols[columnMap.get(dateColumn)]; + LongColumnVector dateOffsetVector = (LongColumnVector) batch.cols[columnMap.get(dateOffsetColumn)]; + + for (int row = 0; row < batch.size; row++) { + byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row)); + long date = dateVector.getTimestampAsLong(row); // rounded to seconds + Long dateOffset = ORCTable.getLongRow(dateOffsetVector, row); + if (dateOffset != null) { + cb.onTimestamp(id, date, dateOffset.shortValue()); + } + } + }, Set.of(getIdColumn(), dateColumn, dateOffsetColumn)); + } + + public void readBytes64Column(String longColumn, BytesCallback cb) throws IOException { + orcTable.readOrcTable((batch, columnMap) -> { + BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())]; + 
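+ // The value read below is re-encoded as base64 before reaching the callback, presumably so that arbitrary bytes become single-line, whitespace-free tokens for the line-oriented property files and the person MPH built downstream.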
BytesColumnVector valueVector = (BytesColumnVector) batch.cols[columnMap.get(longColumn)]; + + for (int row = 0; row < batch.size; row++) { + byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row)); + byte[] value = Base64.getEncoder().encode(ORCTable.getBytesRow(valueVector, row)); + cb.onBytes(id, value); + } + }, Set.of(getIdColumn(), longColumn)); + } + } + + public static class SkippedContentOrcTable extends SwhOrcTable { + public SkippedContentOrcTable(File tableDir) { + super(tableDir); + } + + @Override + protected String getIdColumn() { + return "sha1_git"; + } + + @Override + protected byte[] getSwhidPrefix() { + return cntPrefix; + } + + @Override + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + readIdColumn(nodeCb); + } + } + + public static class ContentOrcTable extends SwhOrcTable { + public ContentOrcTable(File tableDir) { + super(tableDir); + } + + @Override + protected String getIdColumn() { + return "sha1_git"; + } + + @Override + protected byte[] getSwhidPrefix() { + return cntPrefix; + } + + @Override + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + readIdColumn(nodeCb); + } + } + + public static class DirectoryOrcTable extends SwhOrcTable { + public DirectoryOrcTable(File tableDir) { + super(tableDir); + } + + @Override + protected byte[] getSwhidPrefix() { + return dirPrefix; + } + + @Override + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + readIdColumn(nodeCb); + } + } + + public static class DirectoryEntryOrcTable extends SwhOrcTable { + public DirectoryEntryOrcTable(File tableDir) { + super(tableDir); + } + + @Override + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + byte[] cntType = "file".getBytes(); + byte[] dirType = "dir".getBytes(); + byte[] revType = "rev".getBytes(); + + orcTable.readOrcTable((batch, columnMap) -> { + BytesColumnVector srcVector = (BytesColumnVector) batch.cols[columnMap.get("directory_id")]; + BytesColumnVector dstVector = (BytesColumnVector) batch.cols[columnMap.get("target")]; + BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("type")]; + BytesColumnVector labelVector = (BytesColumnVector) batch.cols[columnMap.get("name")]; + LongColumnVector permissionVector = (LongColumnVector) batch.cols[columnMap.get("perms")]; + + for (int row = 0; row < batch.size; row++) { + byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row); + byte[] targetPrefix; + if (Arrays.equals(targetType, cntType)) { + targetPrefix = cntPrefix; + } else if (Arrays.equals(targetType, dirType)) { + targetPrefix = dirPrefix; + } else if (Arrays.equals(targetType, revType)) { + targetPrefix = revPrefix; + } else { + continue; + } + + byte[] src = Bytes.concat(dirPrefix, ORCTable.getBytesRow(srcVector, row)); + byte[] dst = Bytes.concat(targetPrefix, ORCTable.getBytesRow(dstVector, row)); + byte[] label = Base64.getEncoder().encode(ORCTable.getBytesRow(labelVector, row)); + Long permission = ORCTable.getLongRow(permissionVector, row); + edgeCb.onEdge(src, dst, label, permission != null ? 
permission.intValue() : 0); + } + }, Set.of("directory_id", "target", "type", "name", "perms")); + } + } + + public static class RevisionOrcTable extends SwhOrcTable { + public RevisionOrcTable(File tableDir) { + super(tableDir); + } + + @Override + protected byte[] getSwhidPrefix() { + return revPrefix; + } + + @Override + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + orcTable.readOrcTable((batch, columnMap) -> { + BytesColumnVector revisionIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")]; + BytesColumnVector directoryIdVector = (BytesColumnVector) batch.cols[columnMap.get("directory")]; + for (int row = 0; row < batch.size; row++) { + byte[] revisionId = Bytes.concat(revPrefix, ORCTable.getBytesRow(revisionIdVector, row)); + byte[] directoryId = Bytes.concat(dirPrefix, ORCTable.getBytesRow(directoryIdVector, row)); + nodeCb.onNode(revisionId); + edgeCb.onEdge(revisionId, directoryId, null, -1); + } + }, Set.of("id", "directory")); + } + } + + public static class RevisionHistoryOrcTable extends SwhOrcTable { + public RevisionHistoryOrcTable(File tableDir) { + super(tableDir); + } + + @Override + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + orcTable.readOrcTable((batch, columnMap) -> { + BytesColumnVector revisionIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")]; + BytesColumnVector parentIdVector = (BytesColumnVector) batch.cols[columnMap.get("parent_id")]; + for (int row = 0; row < batch.size; row++) { + byte[] parentId = Bytes.concat(revPrefix, ORCTable.getBytesRow(parentIdVector, row)); + byte[] revisionId = Bytes.concat(revPrefix, ORCTable.getBytesRow(revisionIdVector, row)); + edgeCb.onEdge(revisionId, parentId, null, -1); + } + }, Set.of("id", "parent_id")); + } + } + + public static class ReleaseOrcTable extends SwhOrcTable { + public ReleaseOrcTable(File tableDir) { + super(tableDir); + } + + @Override + protected byte[] getSwhidPrefix() { + return relPrefix; + } + + @Override + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + byte[] cntType = "content".getBytes(); + byte[] dirType = "directory".getBytes(); + byte[] revType = "revision".getBytes(); + byte[] relType = "release".getBytes(); + + orcTable.readOrcTable((batch, columnMap) -> { + BytesColumnVector releaseIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")]; + BytesColumnVector targetIdVector = (BytesColumnVector) batch.cols[columnMap.get("target")]; + BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("target_type")]; + + for (int row = 0; row < batch.size; row++) { + byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row); + + byte[] targetPrefix; + if (Arrays.equals(targetType, cntType)) { + targetPrefix = cntPrefix; + } else if (Arrays.equals(targetType, dirType)) { + targetPrefix = dirPrefix; + } else if (Arrays.equals(targetType, revType)) { + targetPrefix = revPrefix; + } else if (Arrays.equals(targetType, relType)) { + targetPrefix = relPrefix; + } else { + continue; + } + + byte[] releaseId = Bytes.concat(relPrefix, ORCTable.getBytesRow(releaseIdVector, row)); + byte[] targetId = Bytes.concat(targetPrefix, ORCTable.getBytesRow(targetIdVector, row)); + nodeCb.onNode(releaseId); + edgeCb.onEdge(releaseId, targetId, null, -1); + } + }, Set.of("id", "target", "target_type")); + } + } + + public static class SnapshotOrcTable extends SwhOrcTable { + 
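+ // Snapshot nodes come from this table's id column only; their outgoing branch edges are read from the separate snapshot_branch table (SnapshotBranchOrcTable below).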
public SnapshotOrcTable(File tableDir) { + super(tableDir); + } + + @Override + protected byte[] getSwhidPrefix() { + return snpPrefix; + } + + @Override + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + readIdColumn(nodeCb); + } + } + + public static class SnapshotBranchOrcTable extends SwhOrcTable { + public SnapshotBranchOrcTable(File tableDir) { + super(tableDir); + } + + @Override + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + byte[] cntType = "content".getBytes(); + byte[] dirType = "directory".getBytes(); + byte[] revType = "revision".getBytes(); + byte[] relType = "release".getBytes(); + + orcTable.readOrcTable((batch, columnMap) -> { + BytesColumnVector snapshotIdVector = (BytesColumnVector) batch.cols[columnMap.get("snapshot_id")]; + BytesColumnVector targetIdVector = (BytesColumnVector) batch.cols[columnMap.get("target")]; + BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("target_type")]; + BytesColumnVector branchNameVector = (BytesColumnVector) batch.cols[columnMap.get("name")]; + + for (int row = 0; row < batch.size; row++) { + byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row); + byte[] targetPrefix; + if (Arrays.equals(targetType, cntType)) { + targetPrefix = cntPrefix; + } else if (Arrays.equals(targetType, dirType)) { + targetPrefix = dirPrefix; + } else if (Arrays.equals(targetType, revType)) { + targetPrefix = revPrefix; + } else if (Arrays.equals(targetType, relType)) { + targetPrefix = relPrefix; + } else { + continue; + } + + byte[] snapshotId = Bytes.concat(snpPrefix, ORCTable.getBytesRow(snapshotIdVector, row)); + byte[] targetId = Bytes.concat(targetPrefix, ORCTable.getBytesRow(targetIdVector, row)); + byte[] branchName = Base64.getEncoder().encode(ORCTable.getBytesRow(branchNameVector, row)); + nodeCb.onNode(snapshotId); + edgeCb.onEdge(snapshotId, targetId, branchName, -1); + } + }, Set.of("snapshot_id", "name", "target", "target_type")); + } + } + + public static class OriginVisitStatusOrcTable extends SwhOrcTable { + public OriginVisitStatusOrcTable(File tableDir) { + super(tableDir); + } + + @Override + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + orcTable.readOrcTable((batch, columnMap) -> { + BytesColumnVector originUrlVector = (BytesColumnVector) batch.cols[columnMap.get("origin")]; + BytesColumnVector snapshotIdVector = (BytesColumnVector) batch.cols[columnMap.get("snapshot")]; + + for (int row = 0; row < batch.size; row++) { + byte[] originId = urlToOriginId(ORCTable.getBytesRow(originUrlVector, row)); + byte[] snapshot_id = ORCTable.getBytesRow(snapshotIdVector, row); + if (snapshot_id == null || snapshot_id.length == 0) { + continue; + } + edgeCb.onEdge(Bytes.concat(oriPrefix, originId), Bytes.concat(snpPrefix, snapshot_id), null, -1); + } + }, Set.of("origin", "snapshot")); + } + } + + public static class OriginVisitOrcTable extends SwhOrcTable { + public OriginVisitOrcTable(File tableDir) { + super(tableDir); + } + } + + public static class OriginOrcTable extends SwhOrcTable { + public OriginOrcTable(File tableDir) { + super(tableDir); + } + + @Override + protected byte[] getSwhidPrefix() { + return oriPrefix; + } + + @Override + protected byte[] idToSwhid(byte[] id) { + return Bytes.concat(getSwhidPrefix(), urlToOriginId(id)); + } + + @Override + protected String getIdColumn() { + return "url"; + } + + 
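+ // Origin identifiers are not stored in the table itself: idToSwhid above derives them by SHA-1-hashing the origin URL (urlToOriginId), matching how OriginVisitStatusOrcTable builds its swh:1:ori: source ids.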
@Override + public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException { + readIdColumn(nodeCb); + } + + public void readURLs(BytesCallback cb) throws IOException { + orcTable.readOrcTable((batch, columnMap) -> { + BytesColumnVector urlVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())]; + + for (int row = 0; row < batch.size; row++) { + byte[] id = idToSwhid(ORCTable.getBytesRow(urlVector, row)); + byte[] url = Base64.getEncoder().encode(ORCTable.getBytesRow(urlVector, row)); + cb.onBytes(id, url); + } + }, Set.of(getIdColumn())); + } + } + + /** + * Export an ORC graph to the CSV edge dataset format as two different files, + * nodes.csv.zst and edges.csv.zst. + */ + public static void exportToCsvDataset(String orcDataset, String csvDatasetBasename) throws IOException { + ORCGraphDataset dataset = new ORCGraphDataset(orcDataset); + File nodesFile = new File(csvDatasetBasename + ".nodes.csv.zst"); + File edgesFile = new File(csvDatasetBasename + ".edges.csv.zst"); + FastBufferedOutputStream nodesOut = new FastBufferedOutputStream( + new ZstdOutputStream(new FileOutputStream(nodesFile))); + FastBufferedOutputStream edgesOut = new FastBufferedOutputStream( + new ZstdOutputStream(new FileOutputStream(edgesFile))); + dataset.readEdges((node) -> { + nodesOut.write(node); + nodesOut.write('\n'); + }, (src, dst, label, perms) -> { + edgesOut.write(src); + edgesOut.write(' '); + edgesOut.write(dst); + if (label != null) { + edgesOut.write(' '); + edgesOut.write(label); + edgesOut.write(' '); + } + if (perms != -1) { + edgesOut.write(' '); + edgesOut.write(Long.toString(perms).getBytes()); + } + edgesOut.write('\n'); + }); + } + + /** + * Print all the edges of the graph to stdout. Can be piped to + * {@link it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph} to import the graph dataset and convert + * it to a {@link it.unimi.dsi.big.webgraph.BVGraph}. 
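+ * <p>
+ * A hypothetical invocation (classpath elided, dataset path is an example); {@code main} below simply forwards its first argument to this method:
+ *
+ * <pre>
+ * java org.softwareheritage.graph.compress.ORCGraphDataset /srv/dataset/orc
+ * </pre>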
+ */ + public static void printSimpleEdges(String orcDataset) throws IOException { + ORCGraphDataset dataset = new ORCGraphDataset(orcDataset); + FastBufferedOutputStream out = new FastBufferedOutputStream(System.out); + dataset.readEdges((node) -> { + }, (src, dst, label, perms) -> { + out.write(src); + out.write(' '); + out.write(dst); + out.write('\n'); + }); + out.flush(); + } + + public static void main(String[] args) throws IOException { + printSimpleEdges(args[0]); + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java b/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java new file mode 100644 index 0000000..9320d98 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; + +import java.io.File; +import java.io.IOException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.IntStream; + +import it.unimi.dsi.big.webgraph.BVGraph; +import it.unimi.dsi.big.webgraph.ImmutableSequentialGraph; +import it.unimi.dsi.big.webgraph.NodeIterator; +import it.unimi.dsi.big.webgraph.Transform; +import it.unimi.dsi.fastutil.Arrays; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.martiansoftware.jsap.FlaggedOption; +import com.martiansoftware.jsap.JSAP; +import com.martiansoftware.jsap.JSAPException; +import com.martiansoftware.jsap.JSAPResult; +import com.martiansoftware.jsap.Parameter; +import com.martiansoftware.jsap.SimpleJSAP; +import com.martiansoftware.jsap.UnflaggedOption; + +import it.unimi.dsi.fastutil.Size64; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import it.unimi.dsi.logging.ProgressLogger; + +public class ScatteredArcsORCGraph extends ImmutableSequentialGraph { + private static final Logger LOGGER = LoggerFactory.getLogger(ScatteredArcsORCGraph.class); + + /** The default number of threads. */ + public static final int DEFAULT_NUM_THREADS = Runtime.getRuntime().availableProcessors(); + + /** The default batch size. */ + public static final int DEFAULT_BATCH_SIZE = Math + .min((int) (Runtime.getRuntime().maxMemory() * 0.4 / (DEFAULT_NUM_THREADS * 8 * 2)), Arrays.MAX_ARRAY_SIZE); + + /** The batch graph used to return node iterators. */ + private final Transform.BatchGraph batchGraph; + + /** + * Creates a scattered-arcs ORC graph. + * + * @param dataset the Swh ORC Graph dataset + * @param function an explicitly provided function from string representing nodes to node numbers, + * or null for the standard behaviour. + * @param n the number of nodes of the graph (used only if function is not + * null). + * @param numThreads the number of threads to use. + * @param batchSize the number of integers in a batch; two arrays of integers of this size will be + * allocated by each thread. + * @param tempDir a temporary directory for the batches, or null for + * {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice. 
+ * @param pl a progress logger, or null. + */ + public ScatteredArcsORCGraph(final ORCGraphDataset dataset, final Object2LongFunction function, + final long n, final int numThreads, final int batchSize, final File tempDir, final ProgressLogger pl) + throws IOException { + final ObjectArrayList batches = new ObjectArrayList<>(); + ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads); + + long[][] srcArrays = new long[numThreads][batchSize]; + long[][] dstArrays = new long[numThreads][batchSize]; + int[] indexes = new int[numThreads]; + long[] progressCounts = new long[numThreads]; + AtomicInteger pairs = new AtomicInteger(0); + + AtomicInteger nextThreadId = new AtomicInteger(0); + ThreadLocal threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement); + + if (pl != null) { + pl.itemsName = "arcs"; + pl.start("Creating sorted batches..."); + } + + try { + forkJoinPool.submit(() -> { + try { + dataset.readEdges((node) -> { + }, (src, dst, label, perms) -> { + long s = function.getLong(src); + long t = function.getLong(dst); + + int threadId = threadLocalId.get(); + int idx = indexes[threadId]++; + srcArrays[threadId][idx] = s; + dstArrays[threadId][idx] = t; + + if (idx == batchSize - 1) { + pairs.addAndGet(Transform.processBatch(batchSize, srcArrays[threadId], dstArrays[threadId], + tempDir, batches)); + indexes[threadId] = 0; + } + + if (pl != null && ++progressCounts[threadId] > 1000) { + synchronized (pl) { + pl.update(progressCounts[threadId]); + } + progressCounts[threadId] = 0; + } + + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + }).get(); + } catch (InterruptedException | ExecutionException e) { + throw new RuntimeException(e); + } + + IntStream.range(0, numThreads).parallel().forEach(t -> { + int idx = indexes[t]; + if (idx > 0) { + try { + pairs.addAndGet(Transform.processBatch(idx, srcArrays[t], dstArrays[t], tempDir, batches)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }); + + // Trigger the GC to free up the large arrays + for (int i = 0; i < numThreads; i++) { + srcArrays[i] = null; + dstArrays[i] = null; + } + + if (pl != null) { + pl.done(); + pl.logger().info("Created " + batches.size() + " batches."); + } + + batchGraph = new Transform.BatchGraph(n, pairs.get(), batches); + } + + @Override + public long numNodes() { + if (batchGraph == null) + throw new UnsupportedOperationException( + "The number of nodes is unknown (you need to generate all the batches first)."); + return batchGraph.numNodes(); + } + + @Override + public long numArcs() { + if (batchGraph == null) + throw new UnsupportedOperationException( + "The number of arcs is unknown (you need to generate all the batches first)."); + return batchGraph.numArcs(); + } + + @Override + public NodeIterator nodeIterator(final long from) { + return batchGraph.nodeIterator(from); + } + + @Override + public boolean hasCopiableIterators() { + return batchGraph.hasCopiableIterators(); + } + + @Override + public ScatteredArcsORCGraph copy() { + return this; + } + + @SuppressWarnings("unchecked") + public static void main(final String[] args) + throws IllegalArgumentException, SecurityException, IOException, JSAPException, ClassNotFoundException { + final SimpleJSAP jsap = new SimpleJSAP(ScatteredArcsORCGraph.class.getName(), + "Converts a scattered list of arcs from an ORC graph dataset into a BVGraph.", + new Parameter[]{ + new FlaggedOption("logInterval", JSAP.LONG_PARSER, + Long.toString(ProgressLogger.DEFAULT_LOG_INTERVAL), JSAP.NOT_REQUIRED, 'l', + 
"log-interval", "The minimum time interval between activity logs in milliseconds."), + new FlaggedOption("numThreads", JSAP.INTSIZE_PARSER, Integer.toString(DEFAULT_NUM_THREADS), + JSAP.NOT_REQUIRED, 't', "threads", "The number of threads to use."), + new FlaggedOption("batchSize", JSAP.INTSIZE_PARSER, Integer.toString(DEFAULT_BATCH_SIZE), + JSAP.NOT_REQUIRED, 's', "batch-size", "The maximum size of a batch, in arcs."), + new FlaggedOption("tempDir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T', + "temp-dir", "A directory for all temporary batch files."), + new FlaggedOption("function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f', + "function", + "A serialised function from strings to longs that will be used to translate identifiers to node numbers."), + new FlaggedOption("comp", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'c', "comp", + "A compression flag (may be specified several times).") + .setAllowMultipleDeclarations(true), + new FlaggedOption("windowSize", JSAP.INTEGER_PARSER, + String.valueOf(BVGraph.DEFAULT_WINDOW_SIZE), JSAP.NOT_REQUIRED, 'w', "window-size", + "Reference window size (0 to disable)."), + new FlaggedOption("maxRefCount", JSAP.INTEGER_PARSER, + String.valueOf(BVGraph.DEFAULT_MAX_REF_COUNT), JSAP.NOT_REQUIRED, 'm', "max-ref-count", + "Maximum number of backward references (-1 for ∞)."), + new FlaggedOption("minIntervalLength", JSAP.INTEGER_PARSER, + String.valueOf(BVGraph.DEFAULT_MIN_INTERVAL_LENGTH), JSAP.NOT_REQUIRED, 'i', + "min-interval-length", "Minimum length of an interval (0 to disable)."), + new FlaggedOption("zetaK", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_ZETA_K), + JSAP.NOT_REQUIRED, 'k', "zeta-k", "The k parameter for zeta-k codes."), + new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + JSAP.NOT_GREEDY, "The path to the ORC graph dataset."), + new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, + JSAP.NOT_GREEDY, "The basename of the output graph"),}); + + final JSAPResult jsapResult = jsap.parse(args); + if (jsap.messagePrinted()) + System.exit(1); + + String basename = jsapResult.getString("basename"); + String orcDatasetPath = jsapResult.getString("dataset"); + ORCGraphDataset orcDataset = new ORCGraphDataset(orcDatasetPath); + + int flags = 0; + for (final String compressionFlag : jsapResult.getStringArray("comp")) { + try { + flags |= BVGraph.class.getField(compressionFlag).getInt(BVGraph.class); + } catch (final Exception notFound) { + throw new JSAPException("Compression method " + compressionFlag + " unknown."); + } + } + + final int windowSize = jsapResult.getInt("windowSize"); + final int zetaK = jsapResult.getInt("zetaK"); + int maxRefCount = jsapResult.getInt("maxRefCount"); + if (maxRefCount == -1) + maxRefCount = Integer.MAX_VALUE; + final int minIntervalLength = jsapResult.getInt("minIntervalLength"); + + if (!jsapResult.userSpecified("function")) { + throw new IllegalArgumentException("Function must be specified."); + } + final Object2LongFunction function = (Object2LongFunction) BinIO + .loadObject(jsapResult.getString("function")); + long n = function instanceof Size64 ? 
((Size64) function).size64() : function.size(); + + File tempDir = null; + if (jsapResult.userSpecified("tempDir")) { + tempDir = new File(jsapResult.getString("tempDir")); + } + + final ProgressLogger pl = new ProgressLogger(LOGGER, jsapResult.getLong("logInterval"), TimeUnit.MILLISECONDS); + final int batchSize = jsapResult.getInt("batchSize"); + final int numThreads = jsapResult.getInt("numThreads"); + final ScatteredArcsORCGraph graph = new ScatteredArcsORCGraph(orcDataset, function, n, numThreads, batchSize, + tempDir, pl); + BVGraph.store(graph, basename, windowSize, maxRefCount, minIntervalLength, zetaK, flags, pl); + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java new file mode 100644 index 0000000..f06ba59 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; + +import com.martiansoftware.jsap.*; +import it.unimi.dsi.bits.LongArrayBitVector; +import it.unimi.dsi.fastutil.BigArrays; +import it.unimi.dsi.fastutil.ints.IntBigArrays; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.io.FastBufferedOutputStream; +import it.unimi.dsi.fastutil.longs.LongBigArrays; +import it.unimi.dsi.fastutil.objects.Object2LongFunction; +import it.unimi.dsi.fastutil.shorts.ShortBigArrays; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.maps.NodeIdMap; +import org.softwareheritage.graph.compress.ORCGraphDataset.*; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.util.*; +import java.util.concurrent.atomic.AtomicLong; + +/** + * This class is used to extract the node properties from the graph dataset, and write them to a set + * of property files. + * + * Note: because the nodes are not sorted by type, we have an incentive to minimize the number of + * "holes" in offset arrays. This is why many unrelated properties are cobbled together in the same + * files (e.g. commit messages, tag messages and origin URLs are all in a "message" property file). + * Once we migrate to a TypedImmutableGraph as the underlying storage of the graph, we can split all + * the different properties in their own files. 
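+ * <p>
+ * Sketch of the resulting on-disk layout for the message property, as written by {@code writeMessages} below ("graph" stands for the output basename):
+ *
+ * <pre>
+ * graph.property.message.bin:        base64(msg_0) '\n' base64(msg_1) '\n' ...
+ * graph.property.message.offset.bin: one long per node id, the byte offset of that node's line in message.bin, or -1 if the node has no message
+ * </pre>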
+ */ +public class WriteNodeProperties { + final static Logger logger = LoggerFactory.getLogger(WriteNodeProperties.class); + + private final ORCGraphDataset dataset; + private final String graphBasename; + private final NodeIdMap nodeIdMap; + private final long numNodes; + + public WriteNodeProperties(String dataset, String graphBasename, NodeIdMap nodeIdMap) { + this.dataset = new ORCGraphDataset(dataset); + this.graphBasename = graphBasename; + this.nodeIdMap = nodeIdMap; + this.numNodes = nodeIdMap.size64(); + } + + public static String[] PROPERTY_WRITERS = new String[]{"timestamps", "content_length", "content_is_skipped", + "person_ids", "messages", "tag_names",}; + + private static JSAPResult parseArgs(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{ + new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"), + new UnflaggedOption("graphBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, + "Basename of the output graph"), + new FlaggedOption("properties", JSAP.STRING_PARSER, "*", JSAP.NOT_REQUIRED, 'p', "properties", + "Properties to write, comma separated (default: all). Possible choices: " + + String.join(",", PROPERTY_WRITERS)),}); + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch (JSAPException e) { + System.err.println("Usage error: " + e.getMessage()); + System.exit(1); + } + return config; + } + + public static void main(String[] argv) throws IOException, ClassNotFoundException, NoSuchMethodException, + InvocationTargetException, IllegalAccessException { + JSAPResult args = parseArgs(argv); + String dataset = args.getString("dataset"); + String graphBasename = args.getString("graphBasename"); + NodeIdMap nodeIdMap = new NodeIdMap(graphBasename); + + Set properties; + if (args.getString("properties").equals("*")) { + properties = Set.of(PROPERTY_WRITERS); + } else { + properties = new HashSet<>(Arrays.asList(args.getString("properties").split(","))); + } + + WriteNodeProperties writer = new WriteNodeProperties(dataset, graphBasename, nodeIdMap); + if (properties.contains("timestamps")) { + writer.writeTimestamps(); + } + if (properties.contains("content_length")) { + writer.writeContentLength(); + } + if (properties.contains("content_is_skipped")) { + writer.writeContentIsSkipped(); + } + if (properties.contains("person_ids")) { + writer.writePersonIds(); + } + if (properties.contains("messages")) { + writer.writeMessages(); + } + if (properties.contains("tag_names")) { + writer.writeTagNames(); + } + } + + public void writeContentLength() throws IOException { + logger.info("Writing content lengths"); + long[][] valueArray = LongBigArrays.newBigArray(numNodes); + BigArrays.fill(valueArray, -1); + + for (String tableName : new String[]{"content", "skipped_content"}) { + SwhOrcTable table = dataset.getTable(tableName); + if (table == null) { + continue; + } + table.readLongColumn("length", (swhid, value) -> { + long id = nodeIdMap.getNodeId(swhid); + BigArrays.set(valueArray, id, value); + }); + } + + BinIO.storeLongs(valueArray, graphBasename + ".property.content.length.bin"); + } + + public void writeContentIsSkipped() throws IOException { + LongArrayBitVector isSkippedBitVector = LongArrayBitVector.ofLength(numNodes); + SwhOrcTable table = dataset.getTable("skipped_content"); + if (table != null) { + table.readIdColumn((swhid) -> { + long id = nodeIdMap.getNodeId(swhid); + isSkippedBitVector.set(id); + 
}); + } + BinIO.storeObject(isSkippedBitVector, graphBasename + ".property.content.is_skipped.bin"); + } + + public void writeTimestamps() throws IOException { + logger.info("Writing author/committer timestamps for release + revision"); + SwhOrcTable releaseTable = dataset.getTable("release"); + SwhOrcTable revisionTable = dataset.getTable("revision"); + + long[][] timestampArray = LongBigArrays.newBigArray(numNodes); + short[][] timestampOffsetArray = ShortBigArrays.newBigArray(numNodes); + + // Author timestamps + BigArrays.fill(timestampArray, Long.MIN_VALUE); + BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE); + releaseTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> { + long id = nodeIdMap.getNodeId(swhid); + BigArrays.set(timestampArray, id, date); + BigArrays.set(timestampOffsetArray, id, dateOffset); + }); + revisionTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> { + long id = nodeIdMap.getNodeId(swhid); + BigArrays.set(timestampArray, id, date); + BigArrays.set(timestampOffsetArray, id, dateOffset); + }); + BinIO.storeLongs(timestampArray, graphBasename + ".property.author_timestamp.bin"); + BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.author_timestamp_offset.bin"); + + // Committer timestamps + BigArrays.fill(timestampArray, Long.MIN_VALUE); + BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE); + revisionTable.readTimestampColumn("committer_date", "committer_offset", (swhid, date, dateOffset) -> { + long id = nodeIdMap.getNodeId(swhid); + BigArrays.set(timestampArray, id, date); + BigArrays.set(timestampOffsetArray, id, dateOffset); + }); + BinIO.storeLongs(timestampArray, graphBasename + ".property.committer_timestamp.bin"); + BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.committer_timestamp_offset.bin"); + } + + public void writePersonIds() throws IOException { + logger.info("Writing author/committer IDs for release + revision"); + Object2LongFunction personIdMap = NodeIdMap.loadMph(graphBasename + ".persons.mph"); + SwhOrcTable releaseTable = dataset.getTable("release"); + SwhOrcTable revisionTable = dataset.getTable("revision"); + + int[][] personArray = IntBigArrays.newBigArray(numNodes); + + // Author IDs + BigArrays.fill(personArray, -1); + releaseTable.readBytes64Column("author", (swhid, personBase64) -> { + long id = nodeIdMap.getNodeId(swhid); + BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64)); + }); + revisionTable.readBytes64Column("author", (swhid, personBase64) -> { + long id = nodeIdMap.getNodeId(swhid); + BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64)); + }); + BinIO.storeInts(personArray, graphBasename + ".property.author_id.bin"); + + // Committer IDs + BigArrays.fill(personArray, -1); + revisionTable.readBytes64Column("committer", (swhid, personBase64) -> { + long id = nodeIdMap.getNodeId(swhid); + BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64)); + }); + BinIO.storeInts(personArray, graphBasename + ".property.committer_id.bin"); + } + + public void writeMessages() throws IOException { + logger.info("Writing messages for release + revision, and URLs for origins"); + + long[][] messageOffsetArray = LongBigArrays.newBigArray(numNodes); + BigArrays.fill(messageOffsetArray, -1); + + FastBufferedOutputStream messageStream = new FastBufferedOutputStream( + new FileOutputStream(graphBasename + ".property.message.bin")); + AtomicLong offset = new AtomicLong(0L); + + SwhOrcTable releaseTable = 
dataset.getTable("release"); + releaseTable.readBytes64Column("message", (swhid, messageBase64) -> { + long id = nodeIdMap.getNodeId(swhid); + messageStream.write(messageBase64); + messageStream.write('\n'); + BigArrays.set(messageOffsetArray, id, offset.longValue()); + offset.addAndGet(messageBase64.length + 1); + }); + + SwhOrcTable revisionTable = dataset.getTable("revision"); + revisionTable.readBytes64Column("message", (swhid, messageBase64) -> { + long id = nodeIdMap.getNodeId(swhid); + messageStream.write(messageBase64); + messageStream.write('\n'); + BigArrays.set(messageOffsetArray, id, offset.longValue()); + offset.addAndGet(messageBase64.length + 1); + }); + + OriginOrcTable originTable = (OriginOrcTable) dataset.getTable("origin"); + originTable.readURLs((swhid, messageBase64) -> { + long id = nodeIdMap.getNodeId(swhid); + messageStream.write(messageBase64); + messageStream.write('\n'); + BigArrays.set(messageOffsetArray, id, offset.longValue()); + offset.addAndGet(messageBase64.length + 1); + }); + + // TODO: check which one is optimal in terms of memory/disk usage, EF vs mapped file + BinIO.storeLongs(messageOffsetArray, graphBasename + ".property.message.offset.bin"); + // EliasFanoLongBigList messageOffsetEF = new + // EliasFanoLongBigList(LongBigArrayBigList.wrap(messageOffsetArray)); + // BinIO.storeObject(messageOffsetEF, graphBasename + ".property.message.offset.bin"); + messageStream.close(); + } + + public void writeTagNames() throws IOException { + logger.info("Writing tag names for release"); + + long[][] tagNameOffsetArray = LongBigArrays.newBigArray(numNodes); + BigArrays.fill(tagNameOffsetArray, -1); + + FastBufferedOutputStream tagNameStream = new FastBufferedOutputStream( + new FileOutputStream(graphBasename + ".property.tag_name.bin")); + AtomicLong offset = new AtomicLong(0L); + + SwhOrcTable releaseTable = dataset.getTable("release"); + releaseTable.readBytes64Column("name", (swhid, tagNameBase64) -> { + long id = nodeIdMap.getNodeId(swhid); + tagNameStream.write(tagNameBase64); + tagNameStream.write('\n'); + BigArrays.set(tagNameOffsetArray, id, offset.longValue()); + offset.addAndGet(tagNameBase64.length + 1); + }); + + BinIO.storeLongs(tagNameOffsetArray, graphBasename + ".property.tag_name.offset.bin"); + // EliasFanoLongBigList tagNameOffsetEF = new + // EliasFanoLongBigList(LongBigArrayBigList.wrap(tagNameOffsetArray)); + // BinIO.storeObject(tagNameOffsetEF, graphBasename + ".property.tag_name.offset.bin"); + tagNameStream.close(); + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindCommonAncestor.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindCommonAncestor.java deleted file mode 100644 index f36ce88..0000000 --- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindCommonAncestor.java +++ /dev/null @@ -1,62 +0,0 @@ -package org.softwareheritage.graph.experiments.forks; - -import com.martiansoftware.jsap.*; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Traversal; - -import java.io.IOException; -import java.util.Scanner; - -public class FindCommonAncestor { - private Graph graph; - - private void load_graph(String graphBasename) throws IOException { - System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = Graph.loadMapped(graphBasename); - System.err.println("Graph loaded."); - } - - private static JSAPResult parse_args(String[] args) { - JSAPResult config = null; - try { - SimpleJSAP jsap = new 
SimpleJSAP(FindCommonAncestor.class.getName(), "", - new Parameter[]{ - new FlaggedOption("edgesFmt", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'e', - "edges", "Edges constraints"), - new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', - "graph", "Basename of the compressed graph"),}); - - config = jsap.parse(args); - if (jsap.messagePrinted()) { - System.exit(1); - } - } catch (JSAPException e) { - e.printStackTrace(); - } - return config; - } - - public static void main(String[] args) { - JSAPResult config = parse_args(args); - - String graphPath = config.getString("graphPath"); - String edgesFmt = config.getString("edgesFmt"); - - FindCommonAncestor fca = new FindCommonAncestor(); - try { - fca.load_graph(graphPath); - } catch (IOException e) { - System.out.println("Could not load graph: " + e); - System.exit(2); - } - - Scanner input = new Scanner(System.in); - while (input.hasNextLong()) { - long lhsNode = input.nextLong(); - long rhsNode = input.nextLong(); - - Traversal t = new Traversal(fca.graph.symmetrize(), "forward", edgesFmt); - System.out.println(t.findCommonDescendant(lhsNode, rhsNode)); - } - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindPath.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindPath.java deleted file mode 100644 index 2e5afd9..0000000 --- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindPath.java +++ /dev/null @@ -1,123 +0,0 @@ -package org.softwareheritage.graph.experiments.forks; - -import com.martiansoftware.jsap.*; -import it.unimi.dsi.big.webgraph.LazyLongIterator; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; - -import java.io.IOException; -import java.util.*; - -public class FindPath { - private Graph graph; - private Long emptySnapshot; - - private void load_graph(String graphBasename) throws IOException { - System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = Graph.loadMapped(graphBasename).symmetrize(); - System.err.println("Graph loaded."); - this.emptySnapshot = null; - } - - private static JSAPResult parse_args(String[] args) { - JSAPResult config = null; - try { - SimpleJSAP jsap = new SimpleJSAP(FindPath.class.getName(), "", - new Parameter[]{new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - 'g', "graph", "Basename of the compressed graph"),}); - - config = jsap.parse(args); - if (jsap.messagePrinted()) { - System.exit(1); - } - } catch (JSAPException e) { - e.printStackTrace(); - } - return config; - } - - private boolean nodeIsEmptySnapshot(Long node) { - if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP - && this.graph.outdegree(node) == 0) { - System.err.println("Found empty snapshot: " + node); - this.emptySnapshot = node; - } - return node.equals(this.emptySnapshot); - } - - private Boolean shouldVisit(Long node) { - Node.Type nt = this.graph.getNodeType(node); - if (nt != Node.Type.REV && nt != Node.Type.REL && nt != Node.Type.SNP && nt != Node.Type.ORI) { - return false; - } - if (this.nodeIsEmptySnapshot(node)) - return false; - return true; - } - - private ArrayList<Long> findPath(Long src, Long dst) { - HashSet<Long> visited = new HashSet<>(); - Queue<Long> queue = new ArrayDeque<>(); - Map<Long, Long> parentNode = new HashMap<>(); - - queue.add(src); - visited.add(src); - - while (!queue.isEmpty()) { - long currentNode = queue.poll(); - - final LazyLongIterator iterator = graph.successors(currentNode); - long
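/* Plain BFS from src over the symmetrized graph, restricted by shouldVisit() to rev/rel/snp/ori nodes and skipping the shared empty snapshot. parentNode records each node's BFS predecessor, so once dst is reached the path is rebuilt by walking parents back from dst to src and reversing the result. */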
succ; - while ((succ = iterator.nextLong()) != -1) { - if (!shouldVisit(succ) || visited.contains(succ)) - continue; - visited.add(succ); - queue.add(succ); - parentNode.put(succ, currentNode); - - if (succ == dst) { - ArrayList path = new ArrayList<>(); - long n = dst; - while (n != src) { - path.add(n); - n = parentNode.get(n); - } - path.add(src); - Collections.reverse(path); - return path; - } - } - } - return null; - } - - public static void main(String[] args) { - JSAPResult config = parse_args(args); - - String graphPath = config.getString("graphPath"); - - FindPath fpath = new FindPath(); - try { - fpath.load_graph(graphPath); - } catch (IOException e) { - System.out.println("Could not load graph: " + e); - System.exit(2); - } - - Scanner input = new Scanner(System.in); - while (input.hasNextLong()) { - long lhsNode = input.nextLong(); - long rhsNode = input.nextLong(); - - ArrayList path = fpath.findPath(lhsNode, rhsNode); - if (path != null) { - for (Long n : path) { - System.out.format("%d ", n); - } - System.out.println(); - } else { - System.out.println("null"); - } - } - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java index 714df2e..446b0e1 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java @@ -1,249 +1,256 @@ +/* + * Copyright (c) 2019 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.forks; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.logging.ProgressLogger; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; +import org.softwareheritage.graph.SwhBidirectionalGraph; +import org.softwareheritage.graph.SwhType; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; public class ForkCC { public Boolean includeRootDir; - private Graph graph; + private SwhBidirectionalGraph graph; private Long emptySnapshot; private LongArrayBitVector visited; private LongArrayBitVector whitelist; private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ForkCC.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 't', "whitelist", "Whitelist of origins"), new FlaggedOption("includeRootDir", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, 'R', "includerootdir", "Include root directory (default: false)"), new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', "outdir", "Directory where to put the results"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private static void printDistribution(ArrayList> components) { TreeMap distribution = new 
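/* Histogram of component sizes: distribution maps a component size to the number of components of that size; merge(size, 1L, Long::sum) increments the bucket, and the TreeMap keeps sizes sorted for output. */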
TreeMap<>(); for (ArrayList<Long> component : components) { distribution.merge((long) component.size(), 1L, Long::sum); } for (Map.Entry<Long, Long> entry : distribution.entrySet()) { System.out.format("%d %d\n", entry.getKey(), entry.getValue()); } } private static void printLargestComponent(ArrayList<ArrayList<Long>> components) { int indexLargest = 0; for (int i = 1; i < components.size(); ++i) { if (components.get(i).size() > components.get(indexLargest).size()) indexLargest = i; } ArrayList<Long> component = components.get(indexLargest); for (Long node : component) { System.out.println(node); } } private void load_graph(String graphBasename) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = Graph.loadMapped(graphBasename).symmetrize(); + this.graph = SwhBidirectionalGraph.loadMapped(graphBasename).symmetrize(); System.err.println("Graph loaded."); this.emptySnapshot = null; this.whitelist = null; this.visited = null; this.includeRootDir = null; } private boolean nodeIsEmptySnapshot(Long node) { - if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP + if (this.emptySnapshot == null && this.graph.getNodeType(node) == SwhType.SNP && this.graph.outdegree(node) == 0) { System.err.println("Found empty snapshot: " + node); this.emptySnapshot = node; } return node.equals(this.emptySnapshot); } private Boolean shouldVisit(Long node) { - Node.Type nt = this.graph.getNodeType(node); - if (nt == Node.Type.CNT) { + SwhType nt = this.graph.getNodeType(node); + if (nt == SwhType.CNT) { return false; } - if (nt == Node.Type.DIR && !includeRootDir) + if (nt == SwhType.DIR && !includeRootDir) return false; if (this.nodeIsEmptySnapshot(node)) return false; if (visited.getBoolean(node)) return false; return true; } private ArrayList<ArrayList<Long>> compute(ProgressLogger pl) throws IOException { final long n = graph.numNodes(); // Allow enough memory to behave like in-memory queue int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(ForkCC.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 visited = LongArrayBitVector.ofLength(n); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting connected components visit..."); ArrayList<ArrayList<Long>> components = new ArrayList<>(); for (long i = 0; i < n; i++) { - if (!shouldVisit(i) || this.graph.getNodeType(i) == Node.Type.DIR) + if (!shouldVisit(i) || this.graph.getNodeType(i) == SwhType.DIR) continue; ArrayList<Long> component = new ArrayList<>(); queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); - Node.Type cur_nt = this.graph.getNodeType(currentNode); - if (cur_nt == Node.Type.ORI && (this.whitelist == null || this.whitelist.getBoolean(currentNode))) { + SwhType cur_nt = this.graph.getNodeType(currentNode); + if (cur_nt == SwhType.ORI && (this.whitelist == null || this.whitelist.getBoolean(currentNode))) { // TODO: add a check that the origin has >=1 non-empty snapshot component.add(currentNode); } final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { if (!shouldVisit(succ)) continue; - if (this.graph.getNodeType(succ) == Node.Type.DIR && cur_nt != Node.Type.REV) +
if (this.graph.getNodeType(succ) == SwhType.DIR && cur_nt != SwhType.REV) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } pl.update(); } if (component.size() > 0) { components.add(component); } } pl.done(); queue.close(); return components; } private static void printDistribution(ArrayList> components, Formatter out) { TreeMap distribution = new TreeMap<>(); for (ArrayList component : components) { distribution.merge((long) component.size(), 1L, Long::sum); } for (Map.Entry entry : distribution.entrySet()) { out.format("%d %d\n", entry.getKey(), entry.getValue()); } } private static void printLargestComponent(ArrayList> components, Formatter out) { int indexLargest = 0; for (int i = 1; i < components.size(); ++i) { if (components.get(i).size() > components.get(indexLargest).size()) indexLargest = i; } ArrayList component = components.get(indexLargest); for (Long node : component) { out.format("%d\n", node); } } private static void printAllComponents(ArrayList> components, Formatter out) { for (int i = 1; i < components.size(); ++i) { ArrayList component = components.get(i); for (Long node : component) { out.format("%d ", node); } out.format("\n"); } } private void parseWhitelist(String path) { System.err.println("Loading whitelist " + path + " ..."); this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes()); Scanner scanner; try { scanner = new Scanner(new File(path)); while (scanner.hasNextLong()) { whitelist.set(scanner.nextLong()); } System.err.println("Whitelist loaded."); } catch (FileNotFoundException e) { e.printStackTrace(); } } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String whitelistPath = config.getString("whitelistPath"); boolean includeRootDir = config.getBoolean("includeRootDir"); String outdirPath = config.getString("outdir"); ForkCC forkCc = new ForkCC(); try { forkCc.load_graph(graphPath); forkCc.includeRootDir = includeRootDir; } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } if (whitelistPath != null) { forkCc.parseWhitelist(whitelistPath); } ProgressLogger logger = new ProgressLogger(); // noinspection ResultOfMethodCallIgnored new File(outdirPath).mkdirs(); try { ArrayList> components = forkCc.compute(logger); printDistribution(components, new Formatter(outdirPath + "/distribution.txt")); printLargestComponent(components, new Formatter(outdirPath + "/largest_clique.txt")); printAllComponents(components, new Formatter(outdirPath + "/all_cliques.txt")); } catch (IOException e) { e.printStackTrace(); } logger.done(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java index 4d749bd..746d51e 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java @@ -1,223 +1,230 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.forks; import ch.qos.logback.classic.Level; import ch.qos.logback.classic.Logger; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; import 
it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.logging.ProgressLogger; import org.slf4j.LoggerFactory; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; +import org.softwareheritage.graph.SwhBidirectionalGraph; +import org.softwareheritage.graph.SwhType; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.*; public class ForkCliques { - private Graph graph; + private SwhBidirectionalGraph graph; private LongArrayBitVector whitelist; private void load_graph(String graphBasename) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = Graph.loadMapped(graphBasename); + this.graph = SwhBidirectionalGraph.loadMapped(graphBasename); System.err.println("Graph loaded."); this.whitelist = null; } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ForkCliques.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 't', "whitelist", "Whitelist of origins"), new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', "outdir", "Directory where to put the results"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private ArrayList<Long> dfsAt(Long baseNode) { ArrayList<Long> res = new ArrayList<>(); final Deque<Long> stack = new ArrayDeque<>(); HashSet<Long> seen = new HashSet<>(); stack.push(baseNode); while (!stack.isEmpty()) { final Long currentNode = stack.pop(); final LazyLongIterator iterator = this.graph.predecessors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { if (!seen.contains(succ)) { - Node.Type nt = this.graph.getNodeType(succ); - if (nt == Node.Type.DIR || nt == Node.Type.CNT) + SwhType nt = this.graph.getNodeType(succ); + if (nt == SwhType.DIR || nt == SwhType.CNT) continue; - if (nt == Node.Type.ORI && (this.whitelist == null || this.whitelist.getBoolean(succ))) { + if (nt == SwhType.ORI && (this.whitelist == null || this.whitelist.getBoolean(succ))) { res.add(succ); } else { stack.push(succ); seen.add(succ); } } } } Collections.sort(res); return res; } private boolean isBaseRevision(Long node) { - if (this.graph.getNodeType(node) != Node.Type.REV) + if (this.graph.getNodeType(node) != SwhType.REV) return false; final LazyLongIterator iterator = this.graph.successors(node); long succ; while ((succ = iterator.nextLong()) != -1) { - if (this.graph.getNodeType(succ) == Node.Type.REV) + if (this.graph.getNodeType(succ) == SwhType.REV) return false; } return true; } static private String fingerprint(ArrayList<Long> cluster) { MessageDigest digest; try { digest = MessageDigest.getInstance("SHA-256"); } catch (NoSuchAlgorithmException e) { e.printStackTrace(); return null; } for (Long n : cluster) digest.update(Longs.toByteArray(n)); return new String(digest.digest()); } private ArrayList<ArrayList<Long>> compute(ProgressLogger pl) { final long n = this.graph.numNodes(); HashSet<String> fingerprints = new HashSet<>(); ArrayList<ArrayList<Long>> clusters = new ArrayList<>(); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting topological sort..."); for (long i = 0; i < n; i++)
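/* A "base revision" is a REV node with no REV successor, i.e. a parentless commit. For each one, dfsAt() walks predecessors (skipping DIR/CNT nodes) up to the whitelisted origins that can reach it; the sorted origin list is hashed with SHA-256 so that each distinct set of origins -- one fork "clique" -- is counted only once. */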
{ if (isBaseRevision(i)) { ArrayList currentCluster = dfsAt(i); String clusterFp = fingerprint(currentCluster); if (!fingerprints.contains(clusterFp)) { fingerprints.add(clusterFp); clusters.add(currentCluster); } } pl.update(); } pl.done(); return clusters; } private static void printDistribution(ArrayList> components, Formatter out) { TreeMap distribution = new TreeMap<>(); for (ArrayList component : components) { distribution.merge((long) component.size(), 1L, Long::sum); } for (Map.Entry entry : distribution.entrySet()) { out.format("%d %d\n", entry.getKey(), entry.getValue()); } } private static void printLargestComponent(ArrayList> components, Formatter out) { int indexLargest = 0; for (int i = 1; i < components.size(); ++i) { if (components.get(i).size() > components.get(indexLargest).size()) indexLargest = i; } ArrayList component = components.get(indexLargest); for (Long node : component) { out.format("%d\n", node); } } private static void printAllComponents(ArrayList> components, Formatter out) { for (int i = 1; i < components.size(); ++i) { ArrayList component = components.get(i); for (Long node : component) { out.format("%d ", node); } out.format("\n"); } } private void parseWhitelist(String path) { System.err.println("Loading whitelist " + path + " ..."); this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes()); Scanner scanner; try { scanner = new Scanner(new File(path)); while (scanner.hasNextLong()) { whitelist.set(scanner.nextLong()); } System.err.println("Whitelist loaded."); } catch (FileNotFoundException e) { e.printStackTrace(); } } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String whitelistPath = config.getString("whitelistPath"); String outdirPath = config.getString("outdir"); ForkCliques forkCliques = new ForkCliques(); try { forkCliques.load_graph(graphPath); } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } if (whitelistPath != null) { forkCliques.parseWhitelist(whitelistPath); } Logger rootLogger = (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME); rootLogger.setLevel(Level.DEBUG); ProgressLogger logger = new ProgressLogger(rootLogger); ArrayList> components = forkCliques.compute(logger); // noinspection ResultOfMethodCallIgnored new File(outdirPath).mkdirs(); try { printDistribution(components, new Formatter(outdirPath + "/distribution.txt")); printLargestComponent(components, new Formatter(outdirPath + "/largest_clique.txt")); printAllComponents(components, new Formatter(outdirPath + "/all_cliques.txt")); } catch (FileNotFoundException e) { e.printStackTrace(); } logger.done(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java index 332a908..0ffb690 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java @@ -1,88 +1,95 @@ +/* + * Copyright (c) 2019 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.forks; import com.martiansoftware.jsap.*; import 
it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; +import org.softwareheritage.graph.SwhBidirectionalGraph; +import org.softwareheritage.graph.SwhType; import java.io.IOException; import java.util.ArrayList; public class ListEmptyOrigins { - private Graph graph; + private SwhBidirectionalGraph graph; private Long emptySnapshot; private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ListEmptyOrigins.class.getName(), "", new Parameter[]{new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); ListEmptyOrigins leo = new ListEmptyOrigins(); try { leo.load_graph(graphPath); } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } ArrayList badlist = leo.compute(leo.graph); for (Long bad : badlist) { System.out.println(bad); } } private void load_graph(String graphBasename) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = Graph.loadMapped(graphBasename); + this.graph = SwhBidirectionalGraph.loadMapped(graphBasename); System.err.println("Graph loaded."); this.emptySnapshot = null; } private boolean nodeIsEmptySnapshot(Long node) { System.err.println(this.graph.getNodeType(node) + " " + this.graph.outdegree(node) + " " + node); - if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP + if (this.emptySnapshot == null && this.graph.getNodeType(node) == SwhType.SNP && this.graph.outdegree(node) == 0) { System.err.println("Found empty snapshot: " + node); this.emptySnapshot = node; } return node.equals(this.emptySnapshot); } private ArrayList compute(ImmutableGraph graph) { final long n = graph.numNodes(); ArrayList bad = new ArrayList<>(); for (long i = 0; i < n; i++) { - Node.Type nt = this.graph.getNodeType(i); - if (nt != Node.Type.ORI) + SwhType nt = this.graph.getNodeType(i); + if (nt != SwhType.ORI) continue; final LazyLongIterator iterator = graph.successors(i); long succ; boolean found = false; while ((succ = iterator.nextLong()) != -1) { if (this.graph.outdegree(succ) > 0) { found = true; } } if (!found) bad.add(i); } return bad; } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/multiplicationfactor/GenDistribution.java b/java/src/main/java/org/softwareheritage/graph/experiments/multiplicationfactor/GenDistribution.java deleted file mode 100644 index 89fd675..0000000 --- a/java/src/main/java/org/softwareheritage/graph/experiments/multiplicationfactor/GenDistribution.java +++ /dev/null @@ -1,130 +0,0 @@ -package org.softwareheritage.graph.experiments.multiplicationfactor; - -import com.martiansoftware.jsap.*; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; -import org.softwareheritage.graph.Traversal; -import org.softwareheritage.graph.benchmark.utils.Timing; - -import java.io.IOException; -import java.util.Scanner; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; - -public class GenDistribution { - private 
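/* GenDistribution (removed here) read source node ids from stdin, fanned them out to worker threads over a bounded queue (with -1L as an end-of-queue sentinel), and for each node timed a backward traversal while counting the reachable nodes of the requested destination type. */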
Graph graph; - - private static JSAPResult parse_args(String[] args) { - JSAPResult config = null; - try { - SimpleJSAP jsap = new SimpleJSAP(GenDistribution.class.getName(), "", - new Parameter[]{ - new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', - "graph", "Basename of the compressed graph"), - new FlaggedOption("srcType", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 's', - "srctype", "Source node type"), - new FlaggedOption("dstType", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'd', - "dsttype", "Destination node type"), - new FlaggedOption("edgesFmt", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'e', - "edges", "Edges constraints"), - - new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "128", JSAP.NOT_REQUIRED, 't', - "numthreads", "Number of threads"),}); - - config = jsap.parse(args); - if (jsap.messagePrinted()) { - System.exit(1); - } - } catch (JSAPException e) { - e.printStackTrace(); - } - return config; - } - - public static void main(String[] args) { - JSAPResult config = parse_args(args); - - String graphPath = config.getString("graphPath"); - Node.Type srcType = Node.Type.fromStr(config.getString("srcType")); - Node.Type dstType = Node.Type.fromStr(config.getString("dstType")); - String edgesFmt = config.getString("edgesFmt"); - int numThreads = config.getInt("numThreads"); - - GenDistribution tp = new GenDistribution(); - try { - tp.load_graph(graphPath); - } catch (IOException e) { - System.out.println("Could not load graph: " + e); - System.exit(2); - } - - final long END_OF_QUEUE = -1L; - - ArrayBlockingQueue queue = new ArrayBlockingQueue<>(numThreads); - ExecutorService service = Executors.newFixedThreadPool(numThreads + 1); - - service.submit(() -> { - try { - Scanner input = new Scanner(System.in); - while (input.hasNextLong()) { - long node = input.nextLong(); - if (tp.graph.getNodeType(node) == srcType) { - queue.put(node); - } - } - } catch (InterruptedException e) { - e.printStackTrace(); - } finally { - for (int i = 0; i < numThreads; ++i) { - try { - queue.put(END_OF_QUEUE); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } - }); - - for (int i = 0; i < numThreads; ++i) { - service.submit(() -> { - Graph thread_graph = tp.graph.copy(); - long startTime; - double totalTime; - - while (true) { - Long node = null; - try { - node = queue.take(); - } catch (InterruptedException e) { - e.printStackTrace(); - } - if (node == null || node == END_OF_QUEUE) { - return; - } - - Traversal t = new Traversal(thread_graph, "backward", edgesFmt); - int[] count = {0}; - - startTime = Timing.start(); - t.visitNodesVisitor(node, (curnode) -> { - if (tp.graph.getNodeType(curnode) == dstType) { - count[0]++; - } - }); - totalTime = Timing.stop(startTime); - System.out.format("%d %d %d %d %f\n", node, count[0], t.getNbNodesAccessed(), - t.getNbEdgesAccessed(), totalTime); - } - }); - } - - service.shutdown(); - } - - private void load_graph(String graphBasename) throws IOException { - System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = Graph.loadMapped(graphBasename); - System.err.println("Graph loaded."); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java index ad7eadf..dd8d203 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java +++ 
b/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java @@ -1,188 +1,195 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.topology; import com.martiansoftware.jsap.*; import it.unimi.dsi.Util; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.util.XoRoShiRo128PlusRandom; import org.softwareheritage.graph.*; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.*; import java.util.concurrent.*; public class AveragePaths { - private final Graph graph; + private final SwhBidirectionalGraph graph; private final Subgraph subgraph; private final ConcurrentHashMap result; private final String outdir; public AveragePaths(String graphBasename, String allowedNodes, String outdir) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); - this.graph = Graph.loadMapped(graphBasename); + this.graph = SwhBidirectionalGraph.loadMapped(graphBasename); this.subgraph = new Subgraph(this.graph, new AllowedNodes(allowedNodes)); this.outdir = outdir; System.err.println("Graph loaded."); result = new ConcurrentHashMap<>(); } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(AveragePaths.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("nodeTypes", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 's', "nodetypes", "Node type constraints"), new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', "outdir", "Directory where to put the results"), new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "32", JSAP.NOT_REQUIRED, 't', "numthreads", "Number of threads"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private void run(int numThreads) throws InterruptedException { final long END_OF_QUEUE = -1L; ArrayBlockingQueue queue = new ArrayBlockingQueue<>(numThreads); ExecutorService service = Executors.newFixedThreadPool(numThreads + 1); service.submit(() -> { try { - Graph thread_graph = graph.copy(); + SwhBidirectionalGraph thread_graph = graph.copy(); Subgraph thread_subgraph = subgraph.copy(); long[][] randomPerm = Util.identity(thread_graph.numNodes()); LongBigArrays.shuffle(randomPerm, new XoRoShiRo128PlusRandom()); long n = thread_graph.numNodes(); ProgressLogger pl = new ProgressLogger(); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Filling processor queue..."); for (long j = 0; j < n; ++j) { long node = BigArrays.get(randomPerm, j); if (thread_subgraph.nodeExists(node) && thread_subgraph.outdegree(node) == 0) { queue.put(node); } if (j % 10000 == 0) { printResult(); } pl.update(); } } catch (Exception e) { e.printStackTrace(); } finally { for (int i = 0; i < numThreads; ++i) { try { queue.put(END_OF_QUEUE); } catch (InterruptedException e) { e.printStackTrace(); } } } }); for (int i = 0; i < numThreads; ++i) { service.submit(() 
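/* Producer/consumer layout: a single producer thread shuffles all node ids (so consumers sample in random order) and enqueues only the subgraph's leaves (outdegree 0), while numThreads consumers each run bfsAt() on their own copy of the subgraph. bfsAt() measures distances with a level-order BFS on predecessors, re-enqueuing a -1 frontier marker at the end of each level to know when to increment the distance, and tallies the current distance every time it reaches a root (indegree 0). */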
-> { try { Subgraph thread_subgraph = subgraph.copy(); while (true) { Long node = null; try { node = queue.take(); } catch (InterruptedException e) { e.printStackTrace(); } if (node == null || node == END_OF_QUEUE) { return; } bfsAt(thread_subgraph, node); } } catch (Exception e) { e.printStackTrace(); } }); } service.shutdown(); service.awaitTermination(365, TimeUnit.DAYS); } private void bfsAt(Subgraph graph, long srcNodeId) { ArrayDeque queue = new ArrayDeque<>(); HashSet visited = new HashSet<>(); long FRONTIER_MARKER = -1; queue.addLast(srcNodeId); visited.add(srcNodeId); long distance = 0; queue.addLast(FRONTIER_MARKER); while (!queue.isEmpty()) { long currentNodeId = queue.removeFirst(); // System.err.println("curr: " + currentNodeId); if (currentNodeId == FRONTIER_MARKER) { if (queue.isEmpty()) // avoid infinite loops break; ++distance; queue.addLast(FRONTIER_MARKER); continue; } if (graph.indegree(currentNodeId) == 0) { result.merge(distance, 1L, Long::sum); } LazyLongIterator it = graph.predecessors(currentNodeId); for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { if (!visited.contains(neighborNodeId)) { queue.addLast(neighborNodeId); visited.add(neighborNodeId); } } } } public void printResult() throws IOException { new File(outdir).mkdirs(); PrintWriter f = new PrintWriter(new FileWriter(outdir + "/distribution.txt")); TreeMap sortedDistribution = new TreeMap<>(result); for (Map.Entry entry : sortedDistribution.entrySet()) { f.println(entry.getKey() + " " + entry.getValue()); } f.close(); } public static void main(String[] args) throws IOException, InterruptedException { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String outdir = config.getString("outdir"); String allowedNodes = config.getString("nodeTypes"); int numThreads = config.getInt("numThreads"); AveragePaths tp = new AveragePaths(graphPath, allowedNodes, outdir); tp.run(numThreads); tp.printResult(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java index 2e6fa0c..558aa39 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java @@ -1,325 +1,331 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.topology; import com.martiansoftware.jsap.*; import it.unimi.dsi.Util; import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.util.XoRoShiRo128PlusRandom; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; +import org.softwareheritage.graph.SwhBidirectionalGraph; +import org.softwareheritage.graph.SwhType; import java.io.*; import java.util.*; import java.util.concurrent.*; public class ClusteringCoefficient { - private final Graph graph; + private final SwhBidirectionalGraph graph; private final String outdirPath; private final ConcurrentHashMap result_full; private final 
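/* Triangle counts are bucketed into five per-layer histograms: all node types (full), the file-system layer (dir+cnt), the history layer (rev alone and rev+rel), and the hosting layer (ori+snp). Each map goes from a node's triangle count to the number of nodes having that count. */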
ConcurrentHashMap result_dircnt; private final ConcurrentHashMap result_rev; private final ConcurrentHashMap result_revrel; private final ConcurrentHashMap result_orisnp; public ClusteringCoefficient(String graphBasename, String outdirPath) throws IOException { this.outdirPath = outdirPath; System.err.println("Loading graph " + graphBasename + " ..."); - Graph directedGraph = Graph.loadMapped(graphBasename); + SwhBidirectionalGraph directedGraph = SwhBidirectionalGraph.loadMapped(graphBasename); this.graph = directedGraph.symmetrize(); System.err.println("Graph loaded."); result_full = new ConcurrentHashMap<>(); result_dircnt = new ConcurrentHashMap<>(); result_rev = new ConcurrentHashMap<>(); result_revrel = new ConcurrentHashMap<>(); result_orisnp = new ConcurrentHashMap<>(); } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(AveragePaths.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', "outdir", "Directory where to put the results"), new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "32", JSAP.NOT_REQUIRED, 't', "numthreads", "Number of threads"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private void run(int numThreads) throws InterruptedException { final long END_OF_QUEUE = -1L; ArrayBlockingQueue queue = new ArrayBlockingQueue<>(numThreads); ExecutorService service = Executors.newFixedThreadPool(numThreads + 1); service.submit(() -> { try { - Graph thread_graph = graph.copy(); + SwhBidirectionalGraph thread_graph = graph.copy(); long[][] randomPerm = Util.identity(thread_graph.numNodes()); LongBigArrays.shuffle(randomPerm, new XoRoShiRo128PlusRandom()); long n = thread_graph.numNodes(); ProgressLogger pl = new ProgressLogger(); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Filling processor queue..."); for (long j = 0; j < n; ++j) { long node = BigArrays.get(randomPerm, j); queue.put(node); if (j % 10000 == 0) { printResult(); } pl.update(); } } catch (Exception e) { e.printStackTrace(); } finally { for (int i = 0; i < numThreads; ++i) { try { queue.put(END_OF_QUEUE); } catch (InterruptedException e) { e.printStackTrace(); } } } }); for (int i = 0; i < numThreads; ++i) { service.submit(() -> { try { - Graph thread_graph = graph.copy(); + SwhBidirectionalGraph thread_graph = graph.copy(); while (true) { Long node = null; try { node = queue.take(); } catch (InterruptedException e) { e.printStackTrace(); } if (node == null || node == END_OF_QUEUE) { return; } computeAt(thread_graph, node); } } catch (Exception e) { e.printStackTrace(); } }); } service.shutdown(); service.awaitTermination(365, TimeUnit.DAYS); } - private void computeAt(Graph graph, long node) { + private void computeAt(SwhBidirectionalGraph graph, long node) { long d = graph.outdegree(node); if (d < 2) { return; } - Node.Type nodeType = graph.getNodeType(node); + SwhType nodeType = graph.getNodeType(node); HashSet neighborhood = new HashSet<>(); long succ; final LazyLongIterator iterator = graph.successors(node); while ((succ = iterator.nextLong()) != -1) { neighborhood.add(succ); } long triangles_full = 0; long triangles_dircnt = 0; long triangles_rev = 0; long triangles_revrel = 0; long triangles_orisnp = 0; for (Long neighbor : 
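/* Local triangle counting on the symmetrized graph: neighborhood holds the successors of node; for every neighbor, each of its successors that is itself in neighborhood closes a triangle, and the (node, neighbor, successor) type triple decides which per-layer counter to bump. */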
neighborhood) { - Node.Type neighborNodeType = graph.getNodeType(neighbor); + SwhType neighborNodeType = graph.getNodeType(neighbor); final LazyLongIterator it = graph.successors(neighbor); while ((succ = it.nextLong()) != -1) { if (neighborhood.contains(succ)) { - Node.Type succNodeType = graph.getNodeType(succ); + SwhType succNodeType = graph.getNodeType(succ); triangles_full++; - if ((nodeType == Node.Type.DIR || nodeType == Node.Type.CNT) - && (neighborNodeType == Node.Type.DIR || neighborNodeType == Node.Type.CNT) - && (succNodeType == Node.Type.DIR || succNodeType == Node.Type.CNT)) { + if ((nodeType == SwhType.DIR || nodeType == SwhType.CNT) + && (neighborNodeType == SwhType.DIR || neighborNodeType == SwhType.CNT) + && (succNodeType == SwhType.DIR || succNodeType == SwhType.CNT)) { triangles_dircnt++; - } else if ((nodeType == Node.Type.REV || nodeType == Node.Type.REL) - && (neighborNodeType == Node.Type.REV || neighborNodeType == Node.Type.REL) - && (succNodeType == Node.Type.REV || succNodeType == Node.Type.REL)) { + } else if ((nodeType == SwhType.REV || nodeType == SwhType.REL) + && (neighborNodeType == SwhType.REV || neighborNodeType == SwhType.REL) + && (succNodeType == SwhType.REV || succNodeType == SwhType.REL)) { triangles_revrel++; - if (nodeType == Node.Type.REV && neighborNodeType == Node.Type.REV - && succNodeType == Node.Type.REV) + if (nodeType == SwhType.REV && neighborNodeType == SwhType.REV && succNodeType == SwhType.REV) triangles_rev++; - } else if ((nodeType == Node.Type.ORI || nodeType == Node.Type.SNP) - && (neighborNodeType == Node.Type.ORI || neighborNodeType == Node.Type.SNP) - && (succNodeType == Node.Type.ORI || succNodeType == Node.Type.SNP)) { + } else if ((nodeType == SwhType.ORI || nodeType == SwhType.SNP) + && (neighborNodeType == SwhType.ORI || neighborNodeType == SwhType.SNP) + && (succNodeType == SwhType.ORI || succNodeType == SwhType.SNP)) { triangles_orisnp++; } } } } result_full.merge(triangles_full, 1L, Long::sum); result_dircnt.merge(triangles_dircnt, 1L, Long::sum); result_rev.merge(triangles_rev, 1L, Long::sum); result_revrel.merge(triangles_revrel, 1L, Long::sum); result_orisnp.merge(triangles_orisnp, 1L, Long::sum); } public void printSortedDistribution(String distribPath, Map distrib) throws IOException { PrintWriter f = new PrintWriter(new FileWriter(distribPath)); TreeMap sortedDistribution = new TreeMap<>(distrib); for (Map.Entry entry : sortedDistribution.entrySet()) { f.println(entry.getKey() + " " + entry.getValue()); } f.close(); } public void printResult() throws IOException { new File(outdirPath).mkdirs(); printSortedDistribution(outdirPath + "/distribution-full.txt", result_full); printSortedDistribution(outdirPath + "/distribution-dircnt.txt", result_dircnt); printSortedDistribution(outdirPath + "/distribution-rev.txt", result_rev); printSortedDistribution(outdirPath + "/distribution-relrev.txt", result_revrel); printSortedDistribution(outdirPath + "/distribution-orisnp.txt", result_orisnp); } public static void main(String[] args) throws IOException, InterruptedException { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String outdir = config.getString("outdir"); int numThreads = config.getInt("numThreads"); ClusteringCoefficient cc = new ClusteringCoefficient(graphPath, outdir); cc.run(numThreads); cc.printResult(); } // Old unused functions private long oppositeEdges(ImmutableGraph graph, long node) { HashSet neighborhood = new HashSet<>(); long succ; final LazyLongIterator iterator 
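/* oppositeEdges() counts, among the d*(d-1) ordered neighbor pairs of a node, how many are themselves connected, giving the local clustering coefficient lcc = m / (d * (d - 1)). compute_approx() below instead estimates the global coefficient by wedge sampling: draw a random node with outdegree >= 2, pick two distinct neighbors u and v, test whether the edge u -> v closes the wedge, and approximate C as triangles / trials. */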
= graph.successors(node); while ((succ = iterator.nextLong()) != -1) { neighborhood.add(succ); } long closed_triplets = 0; for (Long neighbor : neighborhood) { final LazyLongIterator it = graph.successors(neighbor); while ((succ = it.nextLong()) != -1) { if (neighborhood.contains(succ)) { closed_triplets++; } } } return closed_triplets; } public void compute(ProgressLogger pl, Formatter out_local, Formatter out_global) { final long n = this.graph.numNodes(); pl.expectedUpdates = n; pl.itemsName = "nodes"; long nodes_d2 = 0; double cum_lcc = 0; double cum_lcc_c0 = 0; double cum_lcc_c1 = 0; HashMap distribution = new HashMap<>(); for (long node = 0; node < n; node++) { long d = graph.outdegree(node); if (d >= 2) { double t = (d * (d - 1)); double m = oppositeEdges(graph, node); double lcc = m / t; distribution.merge(lcc, 1L, Long::sum); cum_lcc += lcc; nodes_d2++; } else { cum_lcc_c1++; } pl.update(); } pl.done(); for (Map.Entry entry : distribution.entrySet()) { out_local.format("%f %d\n", entry.getKey(), entry.getValue()); } double gC = cum_lcc / nodes_d2; double gC0 = cum_lcc_c0 / n; double gC1 = cum_lcc_c1 / n; out_global.format("C: %f\n", gC); out_global.format("C0: %f\n", gC0); out_global.format("C1: %f\n", gC1); } public void compute_approx(Formatter out_global) { final long n = this.graph.numNodes(); long trials = 0; long triangles = 0; while (true) { long node = ThreadLocalRandom.current().nextLong(0, n); long d = graph.outdegree(node); if (d >= 2) { Long u = null; Long v = null; long u_idx = ThreadLocalRandom.current().nextLong(0, d); long v_idx = ThreadLocalRandom.current().nextLong(0, d - 1); if (v_idx >= u_idx) { v_idx++; } long succ; final LazyLongIterator node_iterator = graph.successors(node); for (long succ_idx = 0; (succ = node_iterator.nextLong()) != -1; succ_idx++) { if (succ_idx == u_idx) { u = succ; } if (succ_idx == v_idx) { v = succ; } } final LazyLongIterator u_iterator = graph.successors(u); while ((succ = u_iterator.nextLong()) != -1) { if (succ == v) triangles++; } } trials++; if (trials % 100 == 0 || true) { double gC = (double) triangles / (double) trials; out_global.format("C: %f (triangles: %d, trials: %d)\n", gC, triangles, trials); System.out.format("C: %f (triangles: %d, trials: %d)\n", gC, triangles, trials); } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java index 4dc4c7d..36c9c52 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java @@ -1,200 +1,204 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.topology; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.logging.ProgressLogger; -import org.softwareheritage.graph.AllowedNodes; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; -import org.softwareheritage.graph.Subgraph; +import 
org.softwareheritage.graph.*; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.*; public class ConnectedComponents { private Subgraph graph; private void load_graph(String graphBasename, String nodeTypes) throws IOException { System.err.println("Loading graph " + graphBasename + " ..."); - var underlyingGraph = Graph.loadMapped(graphBasename); + var underlyingGraph = SwhBidirectionalGraph.loadMapped(graphBasename); var underlyingGraphSym = underlyingGraph.symmetrize(); graph = new Subgraph(underlyingGraphSym, new AllowedNodes(nodeTypes)); System.err.println("Graph loaded."); } private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(ConnectedComponents.class.getName(), "", new Parameter[]{ new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", "Basename of the compressed graph"), new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', "outdir", "Directory where to put the results"), new Switch("byOrigins", JSAP.NO_SHORTFLAG, "by-origins", "Compute size of components by number of origins"), new FlaggedOption("nodeTypes", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'n', "nodetypes", "Allowed node types (comma-separated)"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } private HashMap /* ArrayList> */ compute(ProgressLogger pl, boolean byOrigin) throws IOException { final long n = graph.numNodes(); final long maxN = graph.maxNodeNumber(); // Allow enough memory to behave like in-memory queue int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * maxN); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(ConnectedComponents.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 LongArrayBitVector visited = LongArrayBitVector.ofLength(maxN); pl.expectedUpdates = n; pl.itemsName = "nodes"; pl.start("Starting connected components visit..."); // ArrayList> components = new ArrayList<>(); HashMap componentDistribution = new HashMap<>(); var it = graph.nodeIterator(); while (it.hasNext()) { long i = it.nextLong(); if (visited.getBoolean(i)) continue; // ArrayList component = new ArrayList<>(); long componentNodes = 0; queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); // component.add(currentNode); - if (!byOrigin || graph.getNodeType(currentNode) == Node.Type.ORI) + if (!byOrigin || graph.getNodeType(currentNode) == SwhType.ORI) componentNodes += 1; final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { if (visited.getBoolean(succ)) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } pl.update(); } /* * if (component.size() > 0) { components.add(component); } */ if (componentNodes > 0) componentDistribution.merge(componentNodes, 1L, Long::sum); } pl.done(); // return components; return componentDistribution; } private static void printDistribution(ArrayList> components, Formatter out) { TreeMap distribution = new TreeMap<>(); for (ArrayList component : 
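/* The visit above uses a disk-backed ByteDiskQueue for the BFS frontier (node ids serialized as 8-byte records), so the frontier need not fit in RAM, and a LongArrayBitVector for the visited set. Instead of materializing components, componentDistribution maps each component size to the number of components of that size; with --by-origins, only ORI nodes contribute to a component's size. */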
components) { distribution.merge((long) component.size(), 1L, Long::sum); } for (Map.Entry entry : distribution.entrySet()) { out.format("%d %d\n", entry.getKey(), entry.getValue()); } out.close(); } private static void printLargestComponent(ArrayList> components, Formatter out) { int indexLargest = 0; for (int i = 1; i < components.size(); ++i) { if (components.get(i).size() > components.get(indexLargest).size()) indexLargest = i; } ArrayList component = components.get(indexLargest); for (Long node : component) { out.format("%d\n", node); } out.close(); } private static void printAllComponents(ArrayList> components, Formatter out) { for (int i = 1; i < components.size(); ++i) { ArrayList component = components.get(i); for (Long node : component) { out.format("%d ", node); } out.format("\n"); } out.close(); } public static void main(String[] args) { JSAPResult config = parse_args(args); String graphPath = config.getString("graphPath"); String outdirPath = config.getString("outdir"); String nodeTypes = config.getString("nodeTypes"); boolean byOrigin = config.getBoolean("byOrigins"); ConnectedComponents connectedComponents = new ConnectedComponents(); try { connectedComponents.load_graph(graphPath, nodeTypes); } catch (IOException e) { System.out.println("Could not load graph: " + e); System.exit(2); } ProgressLogger logger = new ProgressLogger(); // noinspection ResultOfMethodCallIgnored new File(outdirPath).mkdirs(); try { // ArrayList> components = connectedComponents.compute(logger); // components.sort(Comparator.comparing(ArrayList::size).reversed()); // printDistribution(components, new Formatter(outdirPath + "/distribution.txt")); // printLargestComponent(components, new Formatter(outdirPath + "/largest_component.txt")); // printAllComponents(components, new Formatter(outdirPath + "/all_components.txt")); HashMap componentDistribution = connectedComponents.compute(logger, byOrigin); PrintWriter f = new PrintWriter(new FileWriter(outdirPath + "/distribution.txt")); TreeMap sortedDistribution = new TreeMap<>(componentDistribution); for (Map.Entry entry : sortedDistribution.entrySet()) { f.println(entry.getKey() + " " + entry.getValue()); } f.close(); } catch (IOException e) { e.printStackTrace(); } logger.done(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java index 2d6ebdb..603be4d 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java @@ -1,239 +1,246 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.topology; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.lang.reflect.InvocationTargetException; import java.util.*; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; import com.martiansoftware.jsap.Parameter; import com.martiansoftware.jsap.SimpleJSAP; import com.martiansoftware.jsap.UnflaggedOption; import it.unimi.dsi.logging.ProgressLogger; -import org.softwareheritage.graph.Graph; -import 
org.softwareheritage.graph.Node; +import org.softwareheritage.graph.SwhBidirectionalGraph; +import org.softwareheritage.graph.SwhType; public class InOutDegree { private InOutDegree() { } - private static final int NODE_ARRAY_SIZE = Node.Type.values().length + 1; - private static final int TYPE_ALL = Node.Type.values().length; - private static final int TYPE_CNT = Node.Type.toInt(Node.Type.CNT); - private static final int TYPE_DIR = Node.Type.toInt(Node.Type.DIR); - private static final int TYPE_REV = Node.Type.toInt(Node.Type.REV); - private static final int TYPE_REL = Node.Type.toInt(Node.Type.REL); - private static final int TYPE_SNP = Node.Type.toInt(Node.Type.SNP); - private static final int TYPE_ORI = Node.Type.toInt(Node.Type.ORI); + private static final int NODE_ARRAY_SIZE = SwhType.values().length + 1; + private static final int TYPE_ALL = SwhType.values().length; + private static final int TYPE_CNT = SwhType.toInt(SwhType.CNT); + private static final int TYPE_DIR = SwhType.toInt(SwhType.DIR); + private static final int TYPE_REV = SwhType.toInt(SwhType.REV); + private static final int TYPE_REL = SwhType.toInt(SwhType.REL); + private static final int TYPE_SNP = SwhType.toInt(SwhType.SNP); + private static final int TYPE_ORI = SwhType.toInt(SwhType.ORI); - public static long[] outdegreeTypes(final Graph graph, long node) { + public static long[] outdegreeTypes(final SwhBidirectionalGraph graph, long node) { long[] out = new long[NODE_ARRAY_SIZE]; var successors = graph.successors(node); long neighbor; while ((neighbor = successors.nextLong()) != -1) { - out[Node.Type.toInt(graph.getNodeType(neighbor))]++; + out[SwhType.toInt(graph.getNodeType(neighbor))]++; out[TYPE_ALL]++; } return out; } - public static long[] indegreeTypes(final Graph graph, long node) { + public static long[] indegreeTypes(final SwhBidirectionalGraph graph, long node) { return outdegreeTypes(graph.transpose(), node); } public static void writeDistribution(HashMap<Long, Long> distribution, String filename) throws IOException { PrintWriter f = new PrintWriter(new FileWriter(filename)); TreeMap<Long, Long> sortedDistribution = new TreeMap<>(distribution); for (Map.Entry<Long, Long> entry : sortedDistribution.entrySet()) { f.println(entry.getKey() + " " + entry.getValue()); } f.close(); } - public static void run(final Graph graph, String resultsDir) throws IOException { + public static void run(final SwhBidirectionalGraph graph, String resultsDir) throws IOException { // Per-type var cnt_in_dir = new HashMap<Long, Long>(); var dir_in_dir = new HashMap<Long, Long>(); var dir_in_rev = new HashMap<Long, Long>(); var dir_in_all = new HashMap<Long, Long>(); var dir_out_all = new HashMap<Long, Long>(); var dir_out_dir = new HashMap<Long, Long>(); var dir_out_cnt = new HashMap<Long, Long>(); var dir_out_rev = new HashMap<Long, Long>(); var rev_in_dir = new HashMap<Long, Long>(); var rev_in_rel = new HashMap<Long, Long>(); var rev_in_rev = new HashMap<Long, Long>(); var rev_in_snp = new HashMap<Long, Long>(); var rev_in_all = new HashMap<Long, Long>(); var rev_out_rev = new HashMap<Long, Long>(); var rel_in_snp = new HashMap<Long, Long>(); var snp_in_ori = new HashMap<Long, Long>(); var snp_out_all = new HashMap<Long, Long>(); var snp_out_rel = new HashMap<Long, Long>(); var snp_out_rev = new HashMap<Long, Long>(); var ori_out_snp = new HashMap<Long, Long>(); // Aggregated per layer var full_in = new HashMap<Long, Long>(); var full_out = new HashMap<Long, Long>(); var dircnt_in = new HashMap<Long, Long>(); var dircnt_out = new HashMap<Long, Long>(); var orisnp_in = new HashMap<Long, Long>(); var orisnp_out = new HashMap<Long, Long>(); var relrev_in = new HashMap<Long, Long>(); var relrev_out = new HashMap<Long, Long>(); var rev_in = rev_in_rev; // alias for single-type layer var rev_out = rev_out_rev; final ProgressLogger pl = new ProgressLogger(); pl.itemsName = "nodes";
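/* One sequential scan computes, for every node, its in/out degree split by neighbor type: outdegreeTypes() fills a long[] indexed by SwhType ordinal (with the extra TYPE_ALL slot holding the total), and indegreeTypes() reuses it on the transposed graph. The switch below folds these counts into the per-type and per-layer histograms (full, dir+cnt, rel+rev, ori+snp) that writeDistribution() dumps as "degree count" lines. */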
pl.expectedUpdates = graph.numNodes(); pl.start("Scanning..."); long[] in; long[] out; for (long i = graph.numNodes(); i-- != 0;) { long d_in = graph.indegree(i); long d_out = graph.outdegree(i); full_in.merge(d_in, 1L, Long::sum); full_out.merge(d_out, 1L, Long::sum); switch (graph.getNodeType(i)) { case CNT: cnt_in_dir.merge(d_in, 1L, Long::sum); dircnt_in.merge(d_in, 1L, Long::sum); dircnt_out.merge(0L, 1L, Long::sum); break; case DIR: in = indegreeTypes(graph, i); out = outdegreeTypes(graph, i); dir_in_all.merge(in[TYPE_ALL], 1L, Long::sum); dir_out_all.merge(out[TYPE_ALL], 1L, Long::sum); dir_in_dir.merge(in[TYPE_DIR], 1L, Long::sum); dir_in_rev.merge(in[TYPE_REV], 1L, Long::sum); dir_out_cnt.merge(out[TYPE_CNT], 1L, Long::sum); dir_out_dir.merge(out[TYPE_DIR], 1L, Long::sum); dir_out_rev.merge(out[TYPE_REV], 1L, Long::sum); dircnt_in.merge(in[TYPE_DIR] + in[TYPE_CNT], 1L, Long::sum); dircnt_out.merge(out[TYPE_DIR] + out[TYPE_CNT], 1L, Long::sum); break; case REV: in = indegreeTypes(graph, i); out = outdegreeTypes(graph, i); rev_in_all.merge(in[TYPE_ALL], 1L, Long::sum); rev_in_dir.merge(in[TYPE_DIR], 1L, Long::sum); rev_in_rev.merge(in[TYPE_REV], 1L, Long::sum); rev_in_rel.merge(in[TYPE_REL], 1L, Long::sum); rev_in_snp.merge(in[TYPE_SNP], 1L, Long::sum); rev_out_rev.merge(out[TYPE_REV], 1L, Long::sum); relrev_in.merge(in[TYPE_REL] + in[TYPE_REV], 1L, Long::sum); relrev_out.merge(out[TYPE_REL] + out[TYPE_REV], 1L, Long::sum); break; case REL: rel_in_snp.merge(d_in, 1L, Long::sum); relrev_in.merge(0L, 1L, Long::sum); relrev_out.merge(d_out, 1L, Long::sum); break; case SNP: out = outdegreeTypes(graph, i); snp_in_ori.merge(d_in, 1L, Long::sum); snp_out_all.merge(out[TYPE_ALL], 1L, Long::sum); snp_out_rel.merge(out[TYPE_REL], 1L, Long::sum); snp_out_rev.merge(out[TYPE_REV], 1L, Long::sum); orisnp_in.merge(d_in, 1L, Long::sum); orisnp_out.merge(out[TYPE_REL] + out[TYPE_REV], 1L, Long::sum); break; case ORI: ori_out_snp.merge(d_out, 1L, Long::sum); orisnp_in.merge(0L, 1L, Long::sum); orisnp_out.merge(d_out, 1L, Long::sum); break; default : pl.logger().warn("Invalid node type at pos {}", i); break; } pl.update(); } pl.done(); (new File(resultsDir)).mkdir(); writeDistribution(full_in, resultsDir + "/full_in.txt"); writeDistribution(full_out, resultsDir + "/full_out.txt"); writeDistribution(dircnt_in, resultsDir + "/dir+cnt_in.txt"); writeDistribution(dircnt_out, resultsDir + "/dir+cnt_out.txt"); writeDistribution(relrev_in, resultsDir + "/rel+rev_in.txt"); writeDistribution(relrev_out, resultsDir + "/rel+rev_out.txt"); writeDistribution(orisnp_in, resultsDir + "/ori+snp_in.txt"); writeDistribution(orisnp_out, resultsDir + "/ori+snp_out.txt"); writeDistribution(rev_in, resultsDir + "/rev_in.txt"); writeDistribution(rev_out, resultsDir + "/rev_out.txt"); String resultsTypeDir = resultsDir + "/per_type"; (new File(resultsTypeDir)).mkdir(); writeDistribution(cnt_in_dir, resultsTypeDir + "/cnt_in_dir.txt"); writeDistribution(dir_in_dir, resultsTypeDir + "/dir_in_dir.txt"); writeDistribution(dir_in_rev, resultsTypeDir + "/dir_in_rev.txt"); writeDistribution(dir_in_all, resultsTypeDir + "/dir_in_all.txt"); writeDistribution(dir_out_all, resultsTypeDir + "/dir_out_all.txt"); writeDistribution(dir_out_dir, resultsTypeDir + "/dir_out_dir.txt"); writeDistribution(dir_out_cnt, resultsTypeDir + "/dir_out_cnt.txt"); writeDistribution(dir_out_rev, resultsTypeDir + "/dir_out_rev.txt"); writeDistribution(rev_in_dir, resultsTypeDir + "/rev_in_dir.txt"); writeDistribution(rev_in_rel, resultsTypeDir + 
"/rev_in_rel.txt"); writeDistribution(rev_in_rev, resultsTypeDir + "/rev_in_rev.txt"); writeDistribution(rev_in_snp, resultsTypeDir + "/rev_in_snp.txt"); writeDistribution(rev_in_all, resultsTypeDir + "/rev_in_all.txt"); writeDistribution(rev_out_rev, resultsTypeDir + "/rev_out_rev.txt"); writeDistribution(rel_in_snp, resultsTypeDir + "/rel_in_snp.txt"); writeDistribution(snp_in_ori, resultsTypeDir + "/snp_in_ori.txt"); writeDistribution(snp_out_all, resultsTypeDir + "/snp_out_all.txt"); writeDistribution(snp_out_rel, resultsTypeDir + "/snp_out_rel.txt"); writeDistribution(snp_out_rev, resultsTypeDir + "/snp_out_rev.txt"); writeDistribution(ori_out_snp, resultsTypeDir + "/ori_out_snp.txt"); } static public void main(final String[] arg) throws IllegalArgumentException, SecurityException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, JSAPException, IOException, ClassNotFoundException { final SimpleJSAP jsap = new SimpleJSAP(InOutDegree.class.getName(), "Computes in and out degrees of the given SWHGraph", new Parameter[]{ new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The basename of the graph."), new UnflaggedOption("resultsDir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "The directory of the resulting files."),}); final JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) System.exit(1); final String basename = jsapResult.getString("basename"); final String resultsDir = jsapResult.userSpecified("resultsDir") ? jsapResult.getString("resultsDir") : basename; final ProgressLogger pl = new ProgressLogger(); - Graph graph = Graph.loadMapped(basename); + SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(basename); run(graph, resultsDir); } } diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java index f897e00..189b3af 100644 --- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java +++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java @@ -1,98 +1,105 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.experiments.topology; import com.google.common.primitives.Longs; import com.martiansoftware.jsap.*; import it.unimi.dsi.Util; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.fastutil.BigArrays; import it.unimi.dsi.fastutil.longs.LongBigArrays; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.logging.ProgressLogger; import it.unimi.dsi.util.XoRoShiRo128PlusRandom; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; +import org.softwareheritage.graph.SwhBidirectionalGraph; +import org.softwareheritage.graph.SwhType; import org.softwareheritage.graph.experiments.forks.ForkCC; import java.io.*; public class SubdatasetSizeFunction { private SubdatasetSizeFunction() { } - public static void run(final Graph graph) throws IOException { + public static void run(final SwhBidirectionalGraph graph) throws IOException { final ProgressLogger pl = new 
ProgressLogger(); pl.itemsName = "nodes"; pl.expectedUpdates = graph.numNodes(); long n = graph.numNodes(); LongArrayBitVector visited = LongArrayBitVector.ofLength(n); int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); final File queueFile = File.createTempFile(ForkCC.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; long[][] randomPerm = Util.identity(graph.numNodes()); LongBigArrays.shuffle(randomPerm, new XoRoShiRo128PlusRandom()); long visitedNodes = 0; long visitedEdges = 0; long visitedOrigins = 0; long visitedContents = 0; pl.start("Running traversal starting from origins..."); for (long j = 0; j < n; ++j) { long i = BigArrays.get(randomPerm, j); - if (visited.getBoolean(i) || graph.getNodeType(i) != Node.Type.ORI) { + if (visited.getBoolean(i) || graph.getNodeType(i) != SwhType.ORI) { continue; } visitedOrigins++; queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); visitedNodes++; - if (graph.getNodeType(currentNode) == Node.Type.CNT) + if (graph.getNodeType(currentNode) == SwhType.CNT) visitedContents++; final LazyLongIterator iterator = graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { visitedEdges++; if (visited.getBoolean(succ)) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } pl.update(); } if (visitedOrigins % 10000 == 0) System.out.println(visitedNodes + " " + visitedEdges + " " + visitedContents); } pl.done(); } static public void main(final String[] arg) throws IllegalArgumentException, SecurityException, JSAPException, IOException { final SimpleJSAP jsap = new SimpleJSAP(SubdatasetSizeFunction.class.getName(), "Computes subdataset size functions using a random uniform order", new Parameter[]{new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The basename of the graph."),}); final JSAPResult jsapResult = jsap.parse(arg); if (jsap.messagePrinted()) System.exit(1); final String basename = jsapResult.getString("basename"); - Graph graph = Graph.loadMapped(basename); + SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(basename); run(graph); } } diff --git a/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java deleted file mode 100644 index d71d7d8..0000000 --- a/java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java +++ /dev/null @@ -1,103 +0,0 @@ -// TODO: should be in webgraph upstream -// package it.unimi.dsi.big.webgraph.labelling; -package org.softwareheritage.graph.labels; - -/* - * Copyright (C) 2020 TODO - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 3 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . 
- * - */ - -import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; -import it.unimi.dsi.big.webgraph.labelling.Label; - -import java.util.Arrays; - -/** - * An abstract (single-attribute) list-of-longs label. - * - *

- * This class provides basic methods for a label holding a list of longs. Concrete implementations - * may impose further requirements on the long. - * - *

- * Implementing subclasses must provide constructors, {@link Label#copy()}, - * {@link Label#fromBitStream(it.unimi.dsi.io.InputBitStream, int)}, - * {@link Label#toBitStream(it.unimi.dsi.io.OutputBitStream, int)} and possibly override - * {@link #toString()}. - */ - -public abstract class AbstractLongListLabel extends AbstractLabel implements Label { - /** The key of the attribute represented by this label. */ - protected final String key; - /** The values of the attribute represented by this label. */ - public long[] value; - - /** - * Creates an long label with given key and value. - * - * @param key the (only) key of this label. - * @param value the value of this label. - */ - public AbstractLongListLabel(String key, long[] value) { - this.key = key; - this.value = value; - } - - @Override - public String wellKnownAttributeKey() { - return key; - } - - @Override - public String[] attributeKeys() { - return new String[]{key}; - } - - @Override - public Class[] attributeTypes() { - return new Class[]{long[].class}; - } - - @Override - public Object get(String key) { - if (this.key.equals(key)) - return value; - throw new IllegalArgumentException(); - } - - @Override - public Object get() { - return value; - } - - @Override - public String toString() { - return key + ":" + Arrays.toString(value); - } - - @Override - public boolean equals(Object x) { - if (x instanceof AbstractLongListLabel) - return Arrays.equals(value, ((AbstractLongListLabel) x).value); - else - return false; - } - - @Override - public int hashCode() { - return Arrays.hashCode(value); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java index 878db82..2b30ecf 100644 --- a/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java +++ b/java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java @@ -1,135 +1,154 @@ +/* + * Copyright (c) 2021-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.labels; /** * Directory entries metadata are stored as edge labels on the graph. {@link DirEntry} can be * encoded in a single long type, to re-use Webgraph interface. 
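 * <p>
 * Illustrative sketch (added commentary, not part of the original sources): the encoding packs
 * the filename id in the high bits and the permission code in the low bits, so that:
 *
 * <pre>{@code
 * long encoded = DirEntry.toEncoded(filenameId, 0100644);
 * assert DirEntry.labelNameFromEncoded(encoded) == filenameId;
 * assert DirEntry.permissionFromEncoded(encoded) == 0100644;
 * }</pre>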
* * @author The Software Heritage developers */ public class DirEntry { public long filenameId; public int permission; public DirEntry(long filenameId, int permission) { this.filenameId = filenameId; this.permission = permission; } public DirEntry(long dirEntryEncoded) { - this.filenameId = dirEntryEncoded >> Permission.NB_BITS_PER_TYPE; - int dirBytes = (int) (dirEntryEncoded & ((1 << Permission.NB_BITS_PER_TYPE) - 1)); - this.permission = Permission.Type.fromEncoded(dirBytes); + this.filenameId = labelNameFromEncoded(dirEntryEncoded); + this.permission = permissionFromEncoded(dirEntryEncoded); } - public long toEncoded() { + public static long toEncoded(long filenameId, int permission) { return (filenameId << Permission.NB_BITS_PER_TYPE) + Permission.Type.toEncoded(permission); } + public static long labelNameFromEncoded(long labelEncoded) { + return labelEncoded >> Permission.NB_BITS_PER_TYPE; + } + + public static int permissionFromEncoded(long labelEncoded) { + int dirBytes = (int) (labelEncoded & ((1 << Permission.NB_BITS_PER_TYPE) - 1)); + return Permission.Type.fromEncoded(dirBytes); + } + + public long toEncoded() { + return toEncoded(filenameId, permission); + } + public static int labelWidth(long numLabels) { int filenameIdWidth = (int) Math.ceil(Math.log(numLabels) / Math.log(2)); if (filenameIdWidth > Long.SIZE - Permission.NB_BITS_PER_TYPE) { System.err.println("FIXME: Too many filenames, we can't handle more than 2^" + (Long.SIZE - Permission.NB_BITS_PER_TYPE) + " for now."); System.exit(2); } return filenameIdWidth + Permission.NB_BITS_PER_TYPE; } /** * Permission types present in the Software Heritage graph. * * @author The Software Heritage developers */ private static class Permission { public static final int NB_BITS_PER_TYPE = (int) Math .ceil(Math.log(Permission.Type.values().length) / Math.log(2)); public enum Type { NONE, CONTENT, EXECUTABLE_CONTENT, SYMLINK, DIRECTORY, REVISION; public static Permission.Type fromIntCode(int intCode) { switch (intCode) { case 0: return NONE; case 1: return CONTENT; case 2: return EXECUTABLE_CONTENT; case 3: return SYMLINK; case 4: return DIRECTORY; case 5: return REVISION; } throw new IllegalArgumentException("Unknown node permission code: " + intCode); } public static int toIntCode(Permission.Type type) { switch (type) { case NONE: return 0; case CONTENT: return 1; case EXECUTABLE_CONTENT: return 2; case SYMLINK: return 3; case DIRECTORY: return 4; case REVISION: return 5; } throw new IllegalArgumentException("Unknown node permission type: " + type); } public static Permission.Type fromIntPerm(int intPerm) { switch (intPerm) { case 0: return NONE; case 0100644: return CONTENT; case 0100755: return EXECUTABLE_CONTENT; case 0120000: return SYMLINK; case 0040000: return DIRECTORY; case 0160000: return REVISION; default : return NONE; } // throw new IllegalArgumentException("Unknown node permission: " + intPerm); // TODO: warning here instead? 
} public static int toIntPerm(Permission.Type type) { switch (type) { case NONE: return 0; case CONTENT: return 0100644; case EXECUTABLE_CONTENT: return 0100755; case SYMLINK: return 0120000; case DIRECTORY: return 0040000; case REVISION: return 0160000; } throw new IllegalArgumentException("Unknown node permission type: " + type); } public static int fromEncoded(int encoded) { return toIntPerm(fromIntCode(encoded)); } public static int toEncoded(int permission) { return toIntCode(fromIntPerm(permission)); } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java deleted file mode 100644 index f1672d9..0000000 --- a/java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java +++ /dev/null @@ -1,115 +0,0 @@ -// TODO: should be in webgraph upstream -// package it.unimi.dsi.big.webgraph.labelling; -package org.softwareheritage.graph.labels; - -/* - * Copyright (C) 2020 TODO - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 3 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY - * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - */ - -import it.unimi.dsi.big.webgraph.labelling.Label; -import it.unimi.dsi.fastutil.longs.LongArrays; -import it.unimi.dsi.io.InputBitStream; -import it.unimi.dsi.io.OutputBitStream; - -import java.io.IOException; -import java.util.Arrays; - -/** - * A list of longs represented in fixed width. Each list is prefixed by its length written in - * {@linkplain OutputBitStream#writeGamma(int) γ coding}. - */ - -public class FixedWidthLongListLabel extends org.softwareheritage.graph.labels.AbstractLongListLabel { - /** The bit width used to represent the value of this label. */ - private final int width; - - /** - * Creates a new fixed-width long label. - * - * @param key the (only) key of this label. - * @param width the label width (in bits). - * @param value the value of this label. - */ - public FixedWidthLongListLabel(String key, int width, long[] value) { - super(key, value); - for (int i = value.length; i-- != 0;) - if (value[i] < 0 || value[i] >= 1L << width) - throw new IllegalArgumentException("Value out of range: " + Long.toString(value[i])); - this.width = width; - } - - /** - * Creates a new fixed-width label with an empty list. - * - * @param key the (only) key of this label. - * @param width the label width (in bits). - */ - public FixedWidthLongListLabel(String key, int width) { - this(key, width, LongArrays.EMPTY_ARRAY); - } - - /** - * Creates a new fixed-width long label using the given key and width with an empty list. - * - * @param arg two strings containing the key and the width of this label. - */ - public FixedWidthLongListLabel(String... 
arg) { - this(arg[0], Integer.parseInt(arg[1])); - } - - @Override - public Label copy() { - return new FixedWidthLongListLabel(key, width, value.clone()); - } - - @Override - public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { - long readBits = inputBitStream.readBits(); - value = new long[inputBitStream.readGamma()]; - for (int i = 0; i < value.length; i++) - value[i] = inputBitStream.readLong(width); - return (int) (inputBitStream.readBits() - readBits); - } - - @Override - public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { - int bits = outputBitStream.writeGamma(value.length); - for (int i = 0; i < value.length; i++) - bits += outputBitStream.writeLong(value[i], width); - return bits; - } - - /** - * Returns -1 (the fixed width refers to a single long, not to the entire list). - * - * @return -1; - */ - @Override - public int fixedWidth() { - return -1; - } - - @Override - public String toString() { - return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; - } - - @Override - public String toSpec() { - return this.getClass().getName() + "(" + key + "," + width + ")"; - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java index 41f7d5f..f1a2c18 100644 --- a/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java +++ b/java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java @@ -1,109 +1,117 @@ +/* + * Copyright (c) 2021-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.labels; import it.unimi.dsi.big.webgraph.labelling.AbstractLabel; +import it.unimi.dsi.big.webgraph.labelling.FixedWidthLongListLabel; import it.unimi.dsi.big.webgraph.labelling.Label; import it.unimi.dsi.io.InputBitStream; import it.unimi.dsi.io.OutputBitStream; import java.io.IOException; import java.util.Arrays; /** * Software Heritage graph edge labels following Webgraph labels convention. * * @author The Software Heritage developers */ public class SwhLabel extends AbstractLabel { private final String key; private final int width; // TODO: in the future we would like this to be edge type dependent (eg: having a similar SnpEntry // to store branch names) public DirEntry[] value; // Use existing Webgraph class to represent a list of DirEntry as a list of encoded long private final FixedWidthLongListLabel longList; private static final DirEntry[] EMPTY_ARRAY = {}; public SwhLabel(String key, int width, DirEntry[] value) { this.key = key; this.width = width; this.value = value; long[] valueEncoded = new long[value.length]; for (int i = 0; i < value.length; i++) valueEncoded[i] = value[i].toEncoded(); this.longList = new FixedWidthLongListLabel(key, width, valueEncoded); } public SwhLabel(String key, int width) { this(key, width, EMPTY_ARRAY); } public SwhLabel(String... 
arg) { this(arg[0], Integer.parseInt(arg[1])); } @Override public int fromBitStream(InputBitStream inputBitStream, final long sourceUnused) throws IOException { int ret = longList.fromBitStream(inputBitStream, sourceUnused); // Decode values from their internal long representation value = new DirEntry[longList.value.length]; for (int i = 0; i < value.length; i++) value[i] = new DirEntry(longList.value[i]); return ret; } @Override public int toBitStream(OutputBitStream outputBitStream, final long sourceUnused) throws IOException { // Values have already been encoded in the SwhLabel constructor return longList.toBitStream(outputBitStream, sourceUnused); } @Override public String wellKnownAttributeKey() { return key; } @Override public String[] attributeKeys() { return new String[]{key}; } @Override public Class[] attributeTypes() { return new Class[]{DirEntry[].class}; } @Override public Object get(String s) { - if (this.key.equals(key)) + if (this.key.equals(s)) return value; throw new IllegalArgumentException(); } @Override public Object get() { return value; } @Override public Label copy() { return new SwhLabel(key, width, value.clone()); } @Override public int fixedWidth() { return -1; } @Override public String toString() { return key + ":" + Arrays.toString(value) + " (width:" + width + ")"; } @Override public String toSpec() { return this.getClass().getName() + "(" + key + "," + width + ")"; } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java deleted file mode 100644 index 04bde71..0000000 --- a/java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java +++ /dev/null @@ -1,527 +0,0 @@ -package org.softwareheritage.graph.maps; - -import com.martiansoftware.jsap.*; -import it.unimi.dsi.big.webgraph.LazyLongIterator; -import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; -import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; -import it.unimi.dsi.fastutil.Size64; -import it.unimi.dsi.fastutil.bytes.ByteArrays; -import it.unimi.dsi.fastutil.io.FastBufferedInputStream; -import it.unimi.dsi.fastutil.objects.Object2LongFunction; -import it.unimi.dsi.io.OutputBitStream; -import it.unimi.dsi.logging.ProgressLogger; -import it.unimi.dsi.big.webgraph.BVGraph; -import it.unimi.dsi.big.webgraph.ImmutableGraph; -import it.unimi.dsi.big.webgraph.NodeIterator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.softwareheritage.graph.labels.DirEntry; -import org.softwareheritage.graph.labels.SwhLabel; - -import java.io.*; -import java.nio.ByteBuffer; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.util.*; -import java.util.concurrent.TimeUnit; - -public class LabelMapBuilder { - final static String SORT_BUFFER_SIZE = "40%"; - final static Logger logger = LoggerFactory.getLogger(LabelMapBuilder.class); - - String graphPath; - String outputGraphPath; - String debugPath; - String tmpDir; - ImmutableGraph graph; - long numNodes; - long numArcs; - - NodeIdMap nodeIdMap; - Object2LongFunction filenameMph; - long numFilenames; - int totalLabelWidth; - - public LabelMapBuilder(String graphPath, String debugPath, String outputGraphPath, String tmpDir) - throws IOException { - this.graphPath = graphPath; - if (outputGraphPath == null) { - this.outputGraphPath = graphPath; - } else { - this.outputGraphPath = outputGraphPath; - } - this.debugPath = debugPath; - this.tmpDir = tmpDir; - - // 
Load the graph in offline mode to retrieve the number of nodes/edges, - // then immediately destroy it. XXX: not even needed? - // ImmutableGraph graphOffline = BVGraph.loadMapped(graphPath); - - graph = BVGraph.loadMapped(graphPath); - numArcs = graph.numArcs(); - numNodes = graph.numNodes(); - - nodeIdMap = new NodeIdMap(graphPath, numNodes); - - filenameMph = NodeIdMap.loadMph(graphPath + "-labels.mph"); - numFilenames = getMPHSize(filenameMph); - totalLabelWidth = DirEntry.labelWidth(numFilenames); - } - - private static JSAPResult parse_args(String[] args) { - JSAPResult config = null; - try { - SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "", new Parameter[]{ - new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g', "graph", - "Basename of the compressed graph"), - new FlaggedOption("debugPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'd', - "debug-path", "Store the intermediate representation here for debug"), - new FlaggedOption("outputGraphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', - "output-graph", "Basename of the output graph, same as --graph if not specified"), - - new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 't', "tmp", - "Temporary directory path"),}); - - config = jsap.parse(args); - if (jsap.messagePrinted()) { - System.exit(1); - } - } catch (JSAPException e) { - e.printStackTrace(); - } - return config; - } - - public static void main(String[] args) throws IOException, InterruptedException { - JSAPResult config = parse_args(args); - String graphPath = config.getString("graphPath"); - String outputGraphPath = config.getString("outputGraphPath"); - String tmpDir = config.getString("tmpDir"); - String debugPath = config.getString("debugPath"); - - LabelMapBuilder builder = new LabelMapBuilder(graphPath, debugPath, outputGraphPath, tmpDir); - - logger.info("Loading graph and MPH functions..."); - builder.computeLabelMap(); - } - - static long getMPHSize(Object2LongFunction mph) { - return (mph instanceof Size64) ? ((Size64) mph).size64() : mph.size(); - } - - void computeLabelMap() throws IOException, InterruptedException { - this.loadGraph(); - // this.computeLabelMapSort(); - this.computeLabelMapBsort(); - } - - void computeLabelMapSort() throws IOException { - // Pass the intermediate representation to sort(1) so that we see the labels in the order they will - // appear in the label file. 
- ProcessBuilder processBuilder = new ProcessBuilder(); - processBuilder.command("sort", "-k1,1n", "-k2,2n", // Numerical sort - "--numeric-sort", "--buffer-size", SORT_BUFFER_SIZE, "--temporary-directory", tmpDir); - Process sort = processBuilder.start(); - BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream()); - // BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream()); - FastBufferedInputStream sort_stdout = new FastBufferedInputStream(sort.getInputStream()); - - final FastBufferedInputStream fbis = new FastBufferedInputStream(System.in); - hashLabelStream(fbis, new EdgeLabelLineWriter() { - @Override - public void writeLine(long src, long dst, long filenameId, int permission) throws IOException { - sort_stdin.write((src + "\t" + dst + "\t" + filenameId + "\t" + permission + "\n") - .getBytes(StandardCharsets.US_ASCII)); - } - }); - sort_stdin.close(); - - EdgeLabelLineIterator mapLines = new TextualEdgeLabelLineIterator(sort_stdout); - writeLabels(mapLines); - logger.info("Done"); - } - - void computeLabelMapBsort() throws IOException, InterruptedException { - // Pass the intermediate representation to bsort(1) so that we see the labels in the order they will - // appear in the label file. - - String tmpFile = tmpDir + "/labelsToSort.bin"; - final FastBufferedInputStream fbis = new FastBufferedInputStream(System.in); - final DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tmpFile))); - - // Number of bytes to represent a node. - final int nodeBytes = (Long.SIZE - Long.numberOfLeadingZeros(graph.numNodes())) / 8 + 1; - ByteBuffer buffer = ByteBuffer.allocate(Long.BYTES); - logger.info("Writing labels to a packed binary files (node bytes: {})", nodeBytes); - - hashLabelStream(fbis, new EdgeLabelLineWriter() { - @Override - public void writeLine(long src, long dst, long filenameId, int permission) throws IOException { - buffer.putLong(0, src); - out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); - buffer.putLong(0, dst); - out.write(buffer.array(), Long.BYTES - nodeBytes, nodeBytes); - out.writeLong(filenameId); - out.writeInt(permission); - } - }); - - ProcessBuilder processBuilder = new ProcessBuilder(); - processBuilder.command("/home/seirl/bsort/src/bsort", "-v", "-r", - String.valueOf(nodeBytes * 2 + Long.BYTES + Integer.BYTES), "-k", String.valueOf(nodeBytes * 2), - tmpFile); - Process sort = processBuilder.start(); - sort.waitFor(); - - final DataInputStream sortedLabels = new DataInputStream(new BufferedInputStream(new FileInputStream(tmpFile))); - BinaryEdgeLabelLineIterator mapLines = new BinaryEdgeLabelLineIterator(sortedLabels, nodeBytes); - writeLabels(mapLines); - - logger.info("Done"); - } - - void loadGraph() throws IOException { - } - - void hashLabelStream(FastBufferedInputStream input, EdgeLabelLineWriter writer) throws IOException { - // Compute intermediate representation and write it on : - // "

- * Java has a limit for mmap()-ed files because of unsupported 64-bit indexing. The - * dsiutils ByteBufferInputStream is used to overcome - * this Java limit. - * - * @author The Software Heritage developers - */ - -public class MapFile { - /** Memory-mapped file buffer */ - ByteBufferInputStream bufferMap; - /** Fixed line length of the mmap()-ed file */ - int lineLength; - - /** - * Constructor. - * - * @param path file path to mmap() - * @param lineLength fixed length of a line in the file - */ - public MapFile(String path, int lineLength) throws IOException { - this.bufferMap = null; - this.lineLength = lineLength; - - try (RandomAccessFile mapFile = new RandomAccessFile(new File(path), "r")) { - FileChannel fileChannel = mapFile.getChannel(); - bufferMap = ByteBufferInputStream.map(fileChannel, FileChannel.MapMode.READ_ONLY); - } - } - - /** - * Returns a specific line in the file. - * - * @param lineIndex line number in the file - * @return the line at the specified position - */ - public byte[] readAtLine(long lineIndex) { - byte[] buffer = new byte[lineLength]; - long position = lineIndex * (long) lineLength; - bufferMap.position(position); - bufferMap.read(buffer, 0, lineLength); - return buffer; - } - - /** - * Closes the mmap()-ed file. - */ - public void close() throws IOException { - bufferMap.close(); - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java index 2a2c50f..fb65937 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java @@ -1,186 +1,196 @@ +/* + * Copyright (c) 2019-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.maps; import it.unimi.dsi.fastutil.Size64; +import it.unimi.dsi.fastutil.bytes.ByteBigList; +import it.unimi.dsi.fastutil.bytes.ByteMappedBigList; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; +import it.unimi.dsi.fastutil.longs.LongMappedBigList; import it.unimi.dsi.fastutil.objects.Object2LongFunction; -import it.unimi.dsi.util.ByteBufferLongBigList; import org.softwareheritage.graph.SWHID; +import org.softwareheritage.graph.compress.NodeMapBuilder; -import java.io.FileInputStream; +import java.io.File; import java.io.IOException; +import java.io.RandomAccessFile; import java.nio.charset.StandardCharsets; /** * Mapping between internal long node id and external SWHID. *

* The SWHID -> node mapping is obtained from hashing the SWHID with a MPH, then permuting it using * an mmap()-ed .order file containing the graph permutation. * * The node -> SWHID reverse mapping is pre-computed and dumped on disk in the * {@link NodeMapBuilder} class, then it is loaded here using mmap(). * * @author The Software Heritage developers * @see NodeMapBuilder */ -public class NodeIdMap { +public class NodeIdMap implements Size64 { /** Fixed length of binary SWHID buffer */ public static final int SWHID_BIN_SIZE = 22; /** File extension for the long node id to SWHID map */ public static final String NODE_TO_SWHID = ".node2swhid.bin"; /** Graph path and basename */ String graphPath; - /** Number of ids to map */ - long nbIds; /** mmap()-ed NODE_TO_SWHID file */ - MapFile nodeToSwhMap; + ByteBigList nodeToSwhMap; /** Minimal perfect hash (MPH) function SWHID -> initial order */ Object2LongFunction<byte[]> mph; /** mmap()-ed long list with the permutation initial order -> graph order */ LongBigList orderMap; - /** FileInputStream containing the permutation */ - FileInputStream orderInputStream; /** * Constructor. * * @param graphPath full graph path - * @param nbNodes number of nodes in the graph */ - public NodeIdMap(String graphPath, long nbNodes) throws IOException { + public NodeIdMap(String graphPath) throws IOException { this.graphPath = graphPath; - this.nbIds = nbNodes; // node -> SWHID - this.nodeToSwhMap = new MapFile(graphPath + NODE_TO_SWHID, SWHID_BIN_SIZE); + try (RandomAccessFile raf = new RandomAccessFile(graphPath + NODE_TO_SWHID, "r")) { + this.nodeToSwhMap = ByteMappedBigList.map(raf.getChannel()); + } // SWHID -> node this.mph = loadMph(graphPath + ".mph"); - this.orderInputStream = new FileInputStream(graphPath + ".order"); - this.orderMap = ByteBufferLongBigList.map(orderInputStream.getChannel()); + try (RandomAccessFile mapFile = new RandomAccessFile(new File(graphPath + ".order"), "r")) { + this.orderMap = LongMappedBigList.map(mapFile.getChannel()); + } } @SuppressWarnings("unchecked") public static Object2LongFunction<byte[]> loadMph(String path) throws IOException { Object obj; try { obj = BinIO.loadObject(path); } catch (ClassNotFoundException e) { throw new IOException(e.getMessage()); } Object2LongFunction<byte[]> res = (Object2LongFunction<byte[]>) obj; // Backward-compatibility for old maps parametrized with <String>. // New maps should be parametrized with <byte[]>, which is faster. try { // Try to call it with bytes, will fail if it's a O2LF<String>. res.getLong("42".getBytes(StandardCharsets.UTF_8)); } catch (ClassCastException e) { class StringCompatibleByteFunction implements Object2LongFunction<byte[]>, Size64 { private final Object2LongFunction<String> legacyFunction; public StringCompatibleByteFunction(Object2LongFunction<String> legacyFunction) { this.legacyFunction = legacyFunction; } @Override public long getLong(Object o) { byte[] bi = (byte[]) o; return legacyFunction.getLong(new String(bi, StandardCharsets.UTF_8)); } + @SuppressWarnings("deprecation") @Override public int size() { return legacyFunction.size(); } @Override public long size64() { return (legacyFunction instanceof Size64) ? ((Size64) legacyFunction).size64() : legacyFunction.size(); } } Object2LongFunction<String> mphLegacy = (Object2LongFunction<String>) obj; return new StringCompatibleByteFunction(mphLegacy); } // End of backward-compatibility block return res; } /** * Converts byte-form SWHID to corresponding long node id. Low-level function, does not check if the * SWHID is valid.
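     * <p>
     * Illustrative sketch of the two-step lookup implemented below (added commentary; the SWHID
     * literal and the {@code nodeIdMap} instance are hypothetical):
     *
     * <pre>{@code
     * byte[] swhid = "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2"
     *         .getBytes(StandardCharsets.US_ASCII);
     * long nodeId = nodeIdMap.getNodeId(swhid); // MPH hash, then .order permutation
     * }</pre>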
* * @param swhid node represented as bytes * @return corresponding node as a long id */ public long getNodeId(byte[] swhid) { // 1. Hash the SWHID with the MPH to get its original ID long origNodeId = mph.getLong(swhid); // 2. Use the order permutation to get the position in the permuted graph return this.orderMap.getLong(origNodeId); } /** * Converts SWHID to corresponding long node id. * * @param swhid node represented as a {@link SWHID} * @param checkExists if true, error if the SWHID is not present in the graph, if false the check * will be skipped and invalid data will be returned for non-existing SWHIDs. * @return corresponding node as a long id * @see SWHID */ public long getNodeId(SWHID swhid, boolean checkExists) { // Convert the SWHID to bytes and call getNodeId() long nodeId = getNodeId(swhid.toString().getBytes(StandardCharsets.US_ASCII)); // Check that the position effectively corresponds to a real node using the reverse map. // This is necessary because the MPH makes no guarantees on whether the input SWHID is valid. if (!checkExists || getSWHID(nodeId).equals(swhid)) { return nodeId; } else { throw new IllegalArgumentException("Unknown SWHID: " + swhid); } } public long getNodeId(SWHID swhid) { return getNodeId(swhid, true); } /** * Converts a node long id to corresponding SWHID. * * @param nodeId node as a long id * @return corresponding node as a {@link SWHID} * @see SWHID */ public SWHID getSWHID(long nodeId) { /* * Each line in NODE_TO_SWHID is formatted as: swhid The file is ordered by nodeId, meaning node0's * swhid is at line 0, hence we can read the nodeId-th line to get corresponding swhid */ - if (nodeId < 0 || nodeId >= nbIds) { - throw new IllegalArgumentException("Node id " + nodeId + " should be between 0 and " + nbIds); + if (nodeId < 0 || nodeId >= nodeToSwhMap.size64()) { + throw new IllegalArgumentException( + "Node id " + nodeId + " should be between 0 and " + nodeToSwhMap.size64()); } - return SWHID.fromBytes(nodeToSwhMap.readAtLine(nodeId)); + byte[] swhid = new byte[SWHID_BIN_SIZE]; + nodeToSwhMap.getElements(nodeId * SWHID_BIN_SIZE, swhid, 0, SWHID_BIN_SIZE); + return SWHID.fromBytes(swhid); } - /** - * Closes the mapping files. - */ - public void close() throws IOException { - orderInputStream.close(); - nodeToSwhMap.close(); + /** Return the number of nodes in the map. */ + @Override + public long size64() { + return nodeToSwhMap.size64(); } } diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java b/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java index befe094..40aafb1 100644 --- a/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java +++ b/java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java @@ -1,54 +1,62 @@ +/* + * Copyright (c) 2019-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.maps; import it.unimi.dsi.fastutil.io.BinIO; import it.unimi.dsi.fastutil.longs.LongBigList; -import org.softwareheritage.graph.Node; +import org.softwareheritage.graph.SwhType; import java.io.IOException; /** * Mapping between long node id and SWH node type as described in the * data model. *

- * The type mapping is pre-computed and dumped on disk in the {@link NodeMapBuilder} class, then it - * is loaded in-memory here using fastutil LongBigList. - * To be space-efficient, the mapping is stored as a bitmap using minimum number of bits per - * {@link Node.Type}. + * The type mapping is pre-computed and dumped on disk in the + * {@link org.softwareheritage.graph.compress.NodeMapBuilder} class, then it is loaded in-memory + * here using fastutil LongBigList. To be + * space-efficient, the mapping is stored as a bitmap using minimum number of bits per + * {@link SwhType}. * * @author The Software Heritage developers */ public class NodeTypesMap { /** File extension for the long node id to node type map */ public static final String NODE_TO_TYPE = ".node2type.map"; /** * Array storing for each node its type */ public LongBigList nodeTypesMap; /** * Constructor. * * @param graphPath path and basename of the compressed graph */ public NodeTypesMap(String graphPath) throws IOException { try { nodeTypesMap = (LongBigList) BinIO.loadObject(graphPath + NODE_TO_TYPE); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Unknown class object: " + e); } } /** * Returns node type from a node long id. * * @param nodeId node as a long id - * @return corresponding {@link Node.Type} value - * @see org.softwareheritage.graph.Node.Type + * @return corresponding {@link SwhType} value + * @see SwhType */ - public Node.Type getType(long nodeId) { + public SwhType getType(long nodeId) { long type = nodeTypesMap.getLong(nodeId); - return Node.Type.fromInt((int) type); + return SwhType.fromInt((int) type); } } diff --git a/java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java b/java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java new file mode 100644 index 0000000..470f6da --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.rpc; + +import com.google.protobuf.FieldMask; +import com.martiansoftware.jsap.*; +import io.grpc.Server; +import io.grpc.Status; +import io.grpc.netty.shaded.io.grpc.netty.NettyServerBuilder; +import io.grpc.netty.shaded.io.netty.channel.ChannelOption; +import io.grpc.stub.StreamObserver; +import io.grpc.protobuf.services.ProtoReflectionService; +import it.unimi.dsi.logging.ProgressLogger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.SWHID; +import org.softwareheritage.graph.SwhBidirectionalGraph; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Properties; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Server that manages startup/shutdown of the graph traversal gRPC server.
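+ * <p>
+ * Illustrative launch command (added commentary; the jar and graph paths are hypothetical):
+ *
+ * <pre>{@code
+ * java -cp swh-graph.jar org.softwareheritage.graph.rpc.GraphServer \
+ *         --port 50091 --threads 4 /srv/softwareheritage/graph/graph
+ * }</pre>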
+ */ +public class GraphServer { + private final static Logger logger = LoggerFactory.getLogger(GraphServer.class); + + private final SwhBidirectionalGraph graph; + private final int port; + private final int threads; + private Server server; + + /** + * @param graphBasename the basename of the SWH graph to load + * @param port the port on which the GRPC server will listen + * @param threads the number of threads to use in the server threadpool + */ + public GraphServer(String graphBasename, int port, int threads) throws IOException { + this.graph = loadGraph(graphBasename); + this.port = port; + this.threads = threads; + } + + /** Load a graph and all its properties. */ + public static SwhBidirectionalGraph loadGraph(String basename) throws IOException { + SwhBidirectionalGraph g = SwhBidirectionalGraph.loadLabelledMapped(basename, new ProgressLogger(logger)); + g.loadContentLength(); + g.loadContentIsSkipped(); + g.loadPersonIds(); + g.loadAuthorTimestamps(); + g.loadCommitterTimestamps(); + g.loadMessages(); + g.loadTagNames(); + g.loadLabelNames(); + return g; + } + + /** Start the RPC server. */ + private void start() throws IOException { + server = NettyServerBuilder.forPort(port).withChildOption(ChannelOption.SO_REUSEADDR, true) + .executor(Executors.newFixedThreadPool(threads)).addService(new TraversalService(graph)) + .addService(ProtoReflectionService.newInstance()).build().start(); + logger.info("Server started, listening on " + port); + Runtime.getRuntime().addShutdownHook(new Thread(() -> { + try { + GraphServer.this.stop(); + } catch (InterruptedException e) { + e.printStackTrace(System.err); + } + })); + } + + private void stop() throws InterruptedException { + if (server != null) { + server.shutdown().awaitTermination(30, TimeUnit.SECONDS); + } + } + + /** + * Await termination on the main thread since the grpc library uses daemon threads. + */ + private void blockUntilShutdown() throws InterruptedException { + if (server != null) { + server.awaitTermination(); + } + } + + private static JSAPResult parseArgs(String[] args) { + JSAPResult config = null; + try { + SimpleJSAP jsap = new SimpleJSAP(GraphServer.class.getName(), "", + new Parameter[]{ + new FlaggedOption("port", JSAP.INTEGER_PARSER, "50091", JSAP.NOT_REQUIRED, 'p', "port", + "The port on which the server should listen."), + new FlaggedOption("threads", JSAP.INTEGER_PARSER, "0", JSAP.NOT_REQUIRED, 't', "threads", + "The number of concurrent threads. 0 = number of cores."), + new UnflaggedOption("graphBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, + "Basename of the compressed graph to load")}); + + config = jsap.parse(args); + if (jsap.messagePrinted()) { + System.exit(1); + } + } catch (JSAPException e) { + e.printStackTrace(); + } + return config; + } + + /** Main launches the server from the command line. */ + public static void main(String[] args) throws IOException, InterruptedException { + JSAPResult config = parseArgs(args); + String graphBasename = config.getString("graphBasename"); + int port = config.getInt("port"); + int threads = config.getInt("threads"); + if (threads == 0) { + threads = Runtime.getRuntime().availableProcessors(); + } + + final GraphServer server = new GraphServer(graphBasename, port, threads); + server.start(); + server.blockUntilShutdown(); + } + + /** Implementation of the Traversal service, which contains all the graph querying endpoints.
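+ * <p>
+ * Hypothetical client-side sketch (added commentary; assumes the stubs generated by grpc-java
+ * from the service definition and a grpc-java {@code ManagedChannelBuilder} on the classpath):
+ *
+ * <pre>{@code
+ * ManagedChannel channel = ManagedChannelBuilder.forAddress("localhost", 50091).usePlaintext().build();
+ * TraversalServiceGrpc.TraversalServiceBlockingStub stub = TraversalServiceGrpc.newBlockingStub(channel);
+ * StatsResponse stats = stub.stats(StatsRequest.getDefaultInstance());
+ * }</pre>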
*/ + static class TraversalService extends TraversalServiceGrpc.TraversalServiceImplBase { + SwhBidirectionalGraph graph; + + public TraversalService(SwhBidirectionalGraph graph) { + this.graph = graph; + } + + /** Return various statistics on the overall graph. */ + @Override + public void stats(StatsRequest request, StreamObserver<StatsResponse> responseObserver) { + StatsResponse.Builder response = StatsResponse.newBuilder(); + response.setNumNodes(graph.numNodes()); + response.setNumEdges(graph.numArcs()); + + Properties properties = new Properties(); + try { + properties.load(new FileInputStream(graph.getPath() + ".properties")); + properties.load(new FileInputStream(graph.getPath() + ".stats")); + } catch (IOException e) { + throw new RuntimeException(e); + } + response.setCompressionRatio(Double.parseDouble(properties.getProperty("compratio"))); + response.setBitsPerNode(Double.parseDouble(properties.getProperty("bitspernode"))); + response.setBitsPerEdge(Double.parseDouble(properties.getProperty("bitsperlink"))); + response.setAvgLocality(Double.parseDouble(properties.getProperty("avglocality"))); + response.setIndegreeMin(Long.parseLong(properties.getProperty("minindegree"))); + response.setIndegreeMax(Long.parseLong(properties.getProperty("maxindegree"))); + response.setIndegreeAvg(Double.parseDouble(properties.getProperty("avgindegree"))); + response.setOutdegreeMin(Long.parseLong(properties.getProperty("minoutdegree"))); + response.setOutdegreeMax(Long.parseLong(properties.getProperty("maxoutdegree"))); + response.setOutdegreeAvg(Double.parseDouble(properties.getProperty("avgoutdegree"))); + responseObserver.onNext(response.build()); + responseObserver.onCompleted(); + } + + /** Return a single node and its properties. */ + @Override + public void getNode(GetNodeRequest request, StreamObserver<Node> responseObserver) { + SwhBidirectionalGraph g = graph.copy(); + long nodeId; + try { + nodeId = g.getNodeId(new SWHID(request.getSwhid())); + } catch (IllegalArgumentException e) { + responseObserver + .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); + return; + } + Node.Builder builder = Node.newBuilder(); + NodePropertyBuilder.buildNodeProperties(g.getForwardGraph(), request.hasMask() ? request.getMask() : null, + builder, nodeId); + responseObserver.onNext(builder.build()); + responseObserver.onCompleted(); + } + + /** Perform a BFS traversal from a set of source nodes and stream the nodes encountered. */ + @Override + public void traverse(TraversalRequest request, StreamObserver<Node> responseObserver) { + SwhBidirectionalGraph g = graph.copy(); + Traversal.SimpleTraversal t; + try { + t = new Traversal.SimpleTraversal(g, request, responseObserver::onNext); + } catch (IllegalArgumentException e) { + responseObserver + .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); + return; + } + t.visit(); + responseObserver.onCompleted(); + } + + /** + * Find the shortest path between a set of source nodes and a node that matches a given criteria + * using a BFS.
+ */ + @Override + public void findPathTo(FindPathToRequest request, StreamObserver<Path> responseObserver) { + SwhBidirectionalGraph g = graph.copy(); + Traversal.FindPathTo t; + try { + t = new Traversal.FindPathTo(g, request); + } catch (IllegalArgumentException e) { + responseObserver + .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); + return; + } + t.visit(); + Path path = t.getPath(); + if (path == null) { + responseObserver.onError(Status.NOT_FOUND.asException()); + } else { + responseObserver.onNext(path); + responseObserver.onCompleted(); + } + } + + /** + * Find the shortest path between a set of source nodes and a set of destination nodes using a + * bidirectional BFS. + */ + @Override + public void findPathBetween(FindPathBetweenRequest request, StreamObserver<Path> responseObserver) { + SwhBidirectionalGraph g = graph.copy(); + Traversal.FindPathBetween t; + try { + t = new Traversal.FindPathBetween(g, request); + } catch (IllegalArgumentException e) { + responseObserver + .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); + return; + } + t.visit(); + Path path = t.getPath(); + if (path == null) { + responseObserver.onError(Status.NOT_FOUND.asException()); + } else { + responseObserver.onNext(path); + responseObserver.onCompleted(); + } + } + + /** Return the number of nodes traversed by a BFS traversal. */ + @Override + public void countNodes(TraversalRequest request, StreamObserver<CountResponse> responseObserver) { + AtomicLong count = new AtomicLong(0); + SwhBidirectionalGraph g = graph.copy(); + TraversalRequest fixedReq = TraversalRequest.newBuilder(request) + // Ignore return fields, just count nodes + .setMask(FieldMask.getDefaultInstance()).build(); + Traversal.SimpleTraversal t; + try { + t = new Traversal.SimpleTraversal(g, fixedReq, n -> count.incrementAndGet()); + } catch (IllegalArgumentException e) { + responseObserver + .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); + return; + } + t.visit(); + CountResponse response = CountResponse.newBuilder().setCount(count.get()).build(); + responseObserver.onNext(response); + responseObserver.onCompleted(); + } + + /** Return the number of edges traversed by a BFS traversal.
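+ * <p>
+ * Hypothetical client call (added commentary; the {@code src} field name is assumed from the
+ * request message and the SWHID is an illustrative placeholder):
+ *
+ * <pre>{@code
+ * CountResponse resp = stub.countEdges(
+ *         TraversalRequest.newBuilder().addSrc("swh:1:ori:...").build());
+ * long edges = resp.getCount();
+ * }</pre>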
*/ + @Override + public void countEdges(TraversalRequest request, StreamObserver<CountResponse> responseObserver) { + AtomicLong count = new AtomicLong(0); + SwhBidirectionalGraph g = graph.copy(); + TraversalRequest fixedReq = TraversalRequest.newBuilder(request) + // Force return empty successors to count the edges + .setMask(FieldMask.newBuilder().addPaths("num_successors").build()).build(); + Traversal.SimpleTraversal t; + try { + t = new Traversal.SimpleTraversal(g, fixedReq, n -> count.addAndGet(n.getNumSuccessors())); + } catch (IllegalArgumentException e) { + responseObserver + .onError(Status.INVALID_ARGUMENT.withDescription(e.getMessage()).withCause(e).asException()); + return; + } + t.visit(); + CountResponse response = CountResponse.newBuilder().setCount(count.get()).build(); + responseObserver.onNext(response); + responseObserver.onCompleted(); + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/rpc/NodePropertyBuilder.java b/java/src/main/java/org/softwareheritage/graph/rpc/NodePropertyBuilder.java new file mode 100644 index 0000000..1cedeb9 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/rpc/NodePropertyBuilder.java @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.rpc; + +import com.google.protobuf.ByteString; +import com.google.protobuf.FieldMask; +import com.google.protobuf.util.FieldMaskUtil; +import it.unimi.dsi.big.webgraph.labelling.Label; +import org.softwareheritage.graph.SwhUnidirectionalGraph; +import org.softwareheritage.graph.labels.DirEntry; + +import java.util.*; + +/** + * NodePropertyBuilder is a helper class to enrich {@link Node} messages with node and edge + * properties. It is used by {@link GraphServer.TraversalService} to build the response messages or + * streams. Because property access is disk-based and slow, particular care is taken to avoid + * loading unnecessary properties. We use a FieldMask object to check which properties are requested + * by the client, and only load these. + */ +public class NodePropertyBuilder { + /** + * NodeDataMask caches a FieldMask into a more efficient representation (booleans). This avoids the + * need of parsing the FieldMask for each node in the stream.
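+ * <p>
+ * For example (added commentary), a mask selecting only the SWHID and content length caches to
+ * exactly two {@code true} flags:
+ *
+ * <pre>{@code
+ * FieldMask mask = FieldMask.newBuilder().addPaths("swhid").addPaths("cnt.length").build();
+ * NodeDataMask cached = new NodeDataMask(mask);
+ * // cached.swhid and cached.cntLength are true, every other flag is false
+ * }</pre>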
+ */ + public static class NodeDataMask { + public boolean swhid; + public boolean successor; + public boolean successorSwhid; + public boolean successorLabel; + public boolean numSuccessors; + public boolean cntLength; + public boolean cntIsSkipped; + public boolean revAuthor; + public boolean revAuthorDate; + public boolean revAuthorDateOffset; + public boolean revCommitter; + public boolean revCommitterDate; + public boolean revCommitterDateOffset; + public boolean revMessage; + public boolean relAuthor; + public boolean relAuthorDate; + public boolean relAuthorDateOffset; + public boolean relName; + public boolean relMessage; + public boolean oriUrl; + + public NodeDataMask(FieldMask mask) { + Set<String> allowedFields = null; + if (mask != null) { + mask = FieldMaskUtil.normalize(mask); + allowedFields = new HashSet<>(mask.getPathsList()); + } + this.swhid = allowedFields == null || allowedFields.contains("swhid"); + this.successorSwhid = allowedFields == null || allowedFields.contains("successor") + || allowedFields.contains("successor.swhid"); + this.successorLabel = allowedFields == null || allowedFields.contains("successor") + || allowedFields.contains("successor.label"); + this.successor = this.successorSwhid || this.successorLabel; + this.numSuccessors = allowedFields == null || allowedFields.contains("num_successors"); + this.cntLength = allowedFields == null || allowedFields.contains("cnt.length"); + this.cntIsSkipped = allowedFields == null || allowedFields.contains("cnt.is_skipped"); + this.revAuthor = allowedFields == null || allowedFields.contains("rev.author"); + this.revAuthorDate = allowedFields == null || allowedFields.contains("rev.author_date"); + this.revAuthorDateOffset = allowedFields == null || allowedFields.contains("rev.author_date_offset"); + this.revCommitter = allowedFields == null || allowedFields.contains("rev.committer"); + this.revCommitterDate = allowedFields == null || allowedFields.contains("rev.committer_date"); + this.revCommitterDateOffset = allowedFields == null || allowedFields.contains("rev.committer_date_offset"); + this.revMessage = allowedFields == null || allowedFields.contains("rev.message"); + this.relAuthor = allowedFields == null || allowedFields.contains("rel.author"); + this.relAuthorDate = allowedFields == null || allowedFields.contains("rel.author_date"); + this.relAuthorDateOffset = allowedFields == null || allowedFields.contains("rel.author_date_offset"); + this.relName = allowedFields == null || allowedFields.contains("rel.name"); + this.relMessage = allowedFields == null || allowedFields.contains("rel.message"); + this.oriUrl = allowedFields == null || allowedFields.contains("ori.url"); + } + } + + /** Enrich a Node message with node properties requested in the NodeDataMask.
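+ * <p>
+ * Typical call shape (added commentary; {@code graph}, {@code mask} and {@code nodeId} are
+ * hypothetical):
+ *
+ * <pre>{@code
+ * Node.Builder builder = Node.newBuilder();
+ * NodePropertyBuilder.buildNodeProperties(graph, mask, builder, nodeId);
+ * Node node = builder.build();
+ * }</pre>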
*/ + public static void buildNodeProperties(SwhUnidirectionalGraph graph, NodeDataMask mask, Node.Builder nodeBuilder, + long node) { + if (mask.swhid) { + nodeBuilder.setSwhid(graph.getSWHID(node).toString()); + } + + switch (graph.getNodeType(node)) { + case CNT: + ContentData.Builder cntBuilder = ContentData.newBuilder(); + if (mask.cntLength) { + cntBuilder.setLength(graph.getContentLength(node)); + } + if (mask.cntIsSkipped) { + cntBuilder.setIsSkipped(graph.isContentSkipped(node)); + } + nodeBuilder.setCnt(cntBuilder.build()); + break; + case REV: + RevisionData.Builder revBuilder = RevisionData.newBuilder(); + if (mask.revAuthor) { + revBuilder.setAuthor(graph.getAuthorId(node)); + } + if (mask.revAuthorDate) { + revBuilder.setAuthorDate(graph.getAuthorTimestamp(node)); + } + if (mask.revAuthorDateOffset) { + revBuilder.setAuthorDateOffset(graph.getAuthorTimestampOffset(node)); + } + if (mask.revCommitter) { + revBuilder.setCommitter(graph.getCommitterId(node)); + } + if (mask.revCommitterDate) { + revBuilder.setCommitterDate(graph.getCommitterTimestamp(node)); + } + if (mask.revCommitterDateOffset) { + revBuilder.setCommitterDateOffset(graph.getCommitterTimestampOffset(node)); + } + if (mask.revMessage) { + byte[] msg = graph.getMessage(node); + if (msg != null) { + revBuilder.setMessage(ByteString.copyFrom(msg)); + } + } + nodeBuilder.setRev(revBuilder.build()); + break; + case REL: + ReleaseData.Builder relBuilder = ReleaseData.newBuilder(); + if (mask.relAuthor) { + relBuilder.setAuthor(graph.getAuthorId(node)); + } + if (mask.relAuthorDate) { + relBuilder.setAuthorDate(graph.getAuthorTimestamp(node)); + } + if (mask.relAuthorDateOffset) { + relBuilder.setAuthorDateOffset(graph.getAuthorTimestampOffset(node)); + } + if (mask.relName) { + byte[] name = graph.getTagName(node); + if (name != null) { + relBuilder.setName(ByteString.copyFrom(name)); + } + } + if (mask.relMessage) { + byte[] msg = graph.getMessage(node); + if (msg != null) { + relBuilder.setMessage(ByteString.copyFrom(msg)); + } + } + nodeBuilder.setRel(relBuilder.build()); + break; + case ORI: + OriginData.Builder oriBuilder = OriginData.newBuilder(); + if (mask.oriUrl) { + String url = graph.getUrl(node); + if (url != null) { + oriBuilder.setUrl(url); + } + } + nodeBuilder.setOri(oriBuilder.build()); + } + } + + /** Enrich a Node message with node properties requested in the FieldMask. */ + public static void buildNodeProperties(SwhUnidirectionalGraph graph, FieldMask mask, Node.Builder nodeBuilder, + long node) { + NodeDataMask nodeMask = new NodeDataMask(mask); + buildNodeProperties(graph, nodeMask, nodeBuilder, node); + } + + /** + * Enrich a Node message with edge properties requested in the NodeDataMask, for a specific edge.
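+ * When num_successors is requested, the node's counter is incremented once per visited edge, which + * lets callers such as countEdges() count edges without materializing successor messages.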
+ */ + public static void buildSuccessorProperties(SwhUnidirectionalGraph graph, NodeDataMask mask, + Node.Builder nodeBuilder, long src, long dst, Label label) { + if (nodeBuilder != null) { + Successor.Builder successorBuilder = Successor.newBuilder(); + if (mask.successorSwhid) { + successorBuilder.setSwhid(graph.getSWHID(dst).toString()); + } + if (mask.successorLabel) { + DirEntry[] entries = (DirEntry[]) label.get(); + for (DirEntry entry : entries) { + EdgeLabel.Builder builder = EdgeLabel.newBuilder(); + builder.setName(ByteString.copyFrom(graph.getLabelName(entry.filenameId))); + builder.setPermission(entry.permission); + successorBuilder.addLabel(builder.build()); + } + } + Successor successor = successorBuilder.build(); + // Compare with equals(): build() returns a fresh instance, so a reference comparison with the default instance would always succeed. + if (!successor.equals(Successor.getDefaultInstance())) { + nodeBuilder.addSuccessor(successor); + } + + if (mask.numSuccessors) { + nodeBuilder.setNumSuccessors(nodeBuilder.getNumSuccessors() + 1); + } + } + } + + /** Enrich a Node message with edge properties requested in the FieldMask, for a specific edge. */ + public static void buildSuccessorProperties(SwhUnidirectionalGraph graph, FieldMask mask, Node.Builder nodeBuilder, + long src, long dst, Label label) { + NodeDataMask nodeMask = new NodeDataMask(mask); + buildSuccessorProperties(graph, nodeMask, nodeBuilder, src, dst, label); + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java b/java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java new file mode 100644 index 0000000..bbdf4fa --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.rpc; + +import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; +import it.unimi.dsi.big.webgraph.labelling.Label; +import org.softwareheritage.graph.*; + +import java.util.*; + +/** Traversal contains all the algorithms used for graph traversals. */ +public class Traversal { + /** + * Wrapper around g.successors() that only follows edges allowed by the given + * {@link AllowedEdges} object.
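+ * Filtering is done lazily while iterating: disallowed successors are skipped one at a time and + * no successor list is ever materialized.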
+ */ + private static ArcLabelledNodeIterator.LabelledArcIterator filterLabelledSuccessors(SwhUnidirectionalGraph g, + long nodeId, AllowedEdges allowedEdges) { + if (allowedEdges.restrictedTo == null) { + // All edges are allowed, bypass edge check + return g.labelledSuccessors(nodeId); + } else { + ArcLabelledNodeIterator.LabelledArcIterator allSuccessors = g.labelledSuccessors(nodeId); + return new ArcLabelledNodeIterator.LabelledArcIterator() { + @Override + public Label label() { + return allSuccessors.label(); + } + + @Override + public long nextLong() { + long neighbor; + while ((neighbor = allSuccessors.nextLong()) != -1) { + if (allowedEdges.isAllowed(g.getNodeType(nodeId), g.getNodeType(neighbor))) { + return neighbor; + } + } + return -1; + } + + @Override + public long skip(final long n) { + long i = 0; + while (i < n && nextLong() != -1) + i++; + return i; + } + }; + } + } + + /** Helper class to check that a given node is "valid" for some given {@link NodeFilter} */ + private static class NodeFilterChecker { + private final SwhUnidirectionalGraph g; + private final NodeFilter filter; + private final AllowedNodes allowedNodes; + + private NodeFilterChecker(SwhUnidirectionalGraph graph, NodeFilter filter) { + this.g = graph; + this.filter = filter; + this.allowedNodes = new AllowedNodes(filter.hasTypes() ? filter.getTypes() : "*"); + } + + public boolean allowed(long nodeId) { + if (filter == null) { + return true; + } + if (!this.allowedNodes.isAllowed(g.getNodeType(nodeId))) { + return false; + } + + return true; + } + } + + /** Returns the unidirectional graph from a bidirectional graph and a {@link GraphDirection}. */ + public static SwhUnidirectionalGraph getDirectedGraph(SwhBidirectionalGraph g, GraphDirection direction) { + switch (direction) { + case FORWARD: + return g.getForwardGraph(); + case BACKWARD: + return g.getBackwardGraph(); + /* + * TODO: add support for BOTH case BOTH: return new SwhUnidirectionalGraph(g.symmetrize(), + * g.getProperties()); + */ + default : + throw new IllegalArgumentException("Unknown direction: " + direction); + } + } + + /** Returns the opposite of a given {@link GraphDirection} (equivalent to a graph transposition). */ + public static GraphDirection reverseDirection(GraphDirection direction) { + switch (direction) { + case FORWARD: + return GraphDirection.BACKWARD; + case BACKWARD: + return GraphDirection.FORWARD; + /* + * TODO: add support for BOTH case BOTH: return GraphDirection.BOTH; + */ + default : + throw new IllegalArgumentException("Unknown direction: " + direction); + } + } + + /** Dummy exception to short-circuit and interrupt a graph traversal. */ + static class StopTraversalException extends RuntimeException { + } + + /** Generic BFS traversal algorithm. */ + static class BFSVisitor { + /** The graph to traverse. */ + protected final SwhUnidirectionalGraph g; + /** Depth of the node currently being visited */ + protected long depth = 0; + /** + * Number of traversal successors (i.e., successors that will be considered by the traversal) of the + * node currently being visited + */ + protected long traversalSuccessors = 0; + /** Number of edges accessed since the beginning of the traversal */ + protected long edgesAccessed = 0; + + /** + * Map from a node ID to its parent node ID. The key set can be used as the set of all visited + * nodes. + */ + protected HashMap<Long, Long> parents = new HashMap<>(); + /** Queue of nodes to visit (also called "frontier", "open set", "wavefront" etc.)
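+ * A -1 sentinel enqueued between levels is used by visitStep() to advance the depth counter.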
*/ + protected ArrayDeque<Long> queue = new ArrayDeque<>(); + /** If > 0, the maximum depth of the traversal. */ + private long maxDepth = -1; + /** If > 0, the maximum number of edges to traverse. */ + private long maxEdges = -1; + + BFSVisitor(SwhUnidirectionalGraph g) { + this.g = g; + } + + /** Add a new source node to the initial queue. */ + public void addSource(long nodeId) { + queue.add(nodeId); + parents.put(nodeId, -1L); + } + + /** Set the maximum depth of the traversal. */ + public void setMaxDepth(long depth) { + maxDepth = depth; + } + + /** Set the maximum number of edges to traverse. */ + public void setMaxEdges(long edges) { + maxEdges = edges; + } + + /** Set up the visit counters and depth sentinel. */ + public void visitSetup() { + edgesAccessed = 0; + depth = 0; + queue.add(-1L); // depth sentinel + } + + /** Perform the visit */ + public void visit() { + visitSetup(); + while (!queue.isEmpty()) { + visitStep(); + } + } + + /** Single "step" of a visit. Advance the frontier of exactly one node. */ + public void visitStep() { + try { + assert !queue.isEmpty(); + long curr = queue.poll(); + if (curr == -1L) { + ++depth; + if (!queue.isEmpty()) { + queue.add(-1L); + visitStep(); + } + return; + } + if (maxDepth >= 0 && depth > maxDepth) { + throw new StopTraversalException(); + } + edgesAccessed += g.outdegree(curr); + if (maxEdges >= 0 && edgesAccessed > maxEdges) { + throw new StopTraversalException(); + } + visitNode(curr); + } catch (StopTraversalException e) { + // Traversal is over, clear the to-do queue. + queue.clear(); + } + } + + /** + * Get the successors of a node. Override this function if you want to filter which successors are + * considered during the traversal. + */ + protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) { + return g.labelledSuccessors(nodeId); + } + + /** Visit a node. Override to do additional processing on the node. */ + protected void visitNode(long node) { + ArcLabelledNodeIterator.LabelledArcIterator it = getSuccessors(node); + traversalSuccessors = 0; + for (long succ; (succ = it.nextLong()) != -1;) { + traversalSuccessors++; + visitEdge(node, succ, it.label()); + } + } + + /** Visit an edge. Override to do additional processing on the edge. */ + protected void visitEdge(long src, long dst, Label label) { + if (!parents.containsKey(dst)) { + queue.add(dst); + parents.put(dst, src); + } + } + } + + /** + * SimpleTraversal is used by the Traverse endpoint. It extends BFSVisitor with additional + * processing, notably related to graph properties and filters. + */ + static class SimpleTraversal extends BFSVisitor { + private final NodeFilterChecker nodeReturnChecker; + private final AllowedEdges allowedEdges; + private final TraversalRequest request; + private final NodePropertyBuilder.NodeDataMask nodeDataMask; + private final NodeObserver nodeObserver; + + private Node.Builder nodeBuilder; + + SimpleTraversal(SwhBidirectionalGraph bidirectionalGraph, TraversalRequest request, NodeObserver nodeObserver) { + super(getDirectedGraph(bidirectionalGraph, request.getDirection())); + this.request = request; + this.nodeObserver = nodeObserver; + this.nodeReturnChecker = new NodeFilterChecker(g, request.getReturnNodes()); + this.nodeDataMask = new NodePropertyBuilder.NodeDataMask(request.hasMask() ? request.getMask() : null); + this.allowedEdges = new AllowedEdges(request.hasEdges() ?
request.getEdges() : "*"); + request.getSrcList().forEach(srcSwhid -> { + long srcNodeId = g.getNodeId(new SWHID(srcSwhid)); + addSource(srcNodeId); + }); + if (request.hasMaxDepth()) { + setMaxDepth(request.getMaxDepth()); + } + if (request.hasMaxEdges()) { + setMaxEdges(request.getMaxEdges()); + } + } + + @Override + protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) { + return filterLabelledSuccessors(g, nodeId, allowedEdges); + } + + @Override + public void visitNode(long node) { + nodeBuilder = null; + if (nodeReturnChecker.allowed(node) && (!request.hasMinDepth() || depth >= request.getMinDepth())) { + nodeBuilder = Node.newBuilder(); + NodePropertyBuilder.buildNodeProperties(g, nodeDataMask, nodeBuilder, node); + } + super.visitNode(node); + if (request.getReturnNodes().hasMinTraversalSuccessors() + && traversalSuccessors < request.getReturnNodes().getMinTraversalSuccessors() + || request.getReturnNodes().hasMaxTraversalSuccessors() + && traversalSuccessors > request.getReturnNodes().getMaxTraversalSuccessors()) { + nodeBuilder = null; + } + if (nodeBuilder != null) { + nodeObserver.onNext(nodeBuilder.build()); + } + } + + @Override + protected void visitEdge(long src, long dst, Label label) { + super.visitEdge(src, dst, label); + NodePropertyBuilder.buildSuccessorProperties(g, nodeDataMask, nodeBuilder, src, dst, label); + } + } + + /** + * FindPathTo searches for a path from a source node to a node matching a given criterion. It + * extends BFSVisitor with additional processing, and makes the traversal stop as soon as a node + * matching the given criterion is found. + */ + static class FindPathTo extends BFSVisitor { + private final AllowedEdges allowedEdges; + private final FindPathToRequest request; + private final NodePropertyBuilder.NodeDataMask nodeDataMask; + private final NodeFilterChecker targetChecker; + private Long targetNode = null; + + FindPathTo(SwhBidirectionalGraph bidirectionalGraph, FindPathToRequest request) { + super(getDirectedGraph(bidirectionalGraph, request.getDirection())); + this.request = request; + this.targetChecker = new NodeFilterChecker(g, request.getTarget()); + this.nodeDataMask = new NodePropertyBuilder.NodeDataMask(request.hasMask() ? request.getMask() : null); + this.allowedEdges = new AllowedEdges(request.hasEdges() ? request.getEdges() : "*"); + if (request.hasMaxDepth()) { + setMaxDepth(request.getMaxDepth()); + } + if (request.hasMaxEdges()) { + setMaxEdges(request.getMaxEdges()); + } + request.getSrcList().forEach(srcSwhid -> { + long srcNodeId = g.getNodeId(new SWHID(srcSwhid)); + addSource(srcNodeId); + }); + } + + @Override + protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) { + return filterLabelledSuccessors(g, nodeId, allowedEdges); + } + + @Override + public void visitNode(long node) { + if (targetChecker.allowed(node)) { + targetNode = node; + throw new StopTraversalException(); + } + super.visitNode(node); + } + + /** + * Once the visit has been performed and a matching node has been found, return the shortest path + * from the source set to that node. To do so, we need to backtrack the parents of the node until we + * find one of the source nodes (whose parent is -1). + */ + public Path getPath() { + if (targetNode == null) { + return null; // No path found.
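+ // (targetNode is set by visitNode() as soon as a node matches the target filter.)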
+ } + + /* Backtrack from targetNode to a source node */ + long curNode = targetNode; + ArrayList<Long> path = new ArrayList<>(); + while (curNode != -1) { + path.add(curNode); + curNode = parents.get(curNode); + } + Collections.reverse(path); + + /* Enrich path with node properties */ + Path.Builder pathBuilder = Path.newBuilder(); + for (long nodeId : path) { + Node.Builder nodeBuilder = Node.newBuilder(); + NodePropertyBuilder.buildNodeProperties(g, nodeDataMask, nodeBuilder, nodeId); + pathBuilder.addNode(nodeBuilder.build()); + } + return pathBuilder.build(); + } + } + + /** + * FindPathBetween searches for a shortest path between a set of source nodes and a set of + * destination nodes. + * + * It does so by performing a *bidirectional breadth-first search*, i.e., two parallel breadth-first + * searches, one from the source set ("src-BFS") and one from the destination set ("dst-BFS"), until + * both searches find a common node that joins their visited sets. This node is called the "midpoint + * node". The path returned is the path src -> ... -> midpoint -> ... -> dst, which is always a + * shortest path between src and dst. + * + * The graph direction of both BFS can be configured separately. By default, the dst-BFS will use + * the graph in the opposite direction from the src-BFS (if direction = FORWARD, by default + * direction_reverse = BACKWARD, and vice-versa). The default behavior is thus to search for a + * shortest path between two nodes in a given direction. However, one can also specify FORWARD or + * BACKWARD for *both* the src-BFS and the dst-BFS. This will search for a common descendant or a + * common ancestor between the two sets, respectively. These will be the midpoints of the returned + * path. + */ + static class FindPathBetween extends BFSVisitor { + private final FindPathBetweenRequest request; + private final NodePropertyBuilder.NodeDataMask nodeDataMask; + private final AllowedEdges allowedEdgesSrc; + private final AllowedEdges allowedEdgesDst; + + private final BFSVisitor srcVisitor; + private final BFSVisitor dstVisitor; + private Long middleNode = null; + + FindPathBetween(SwhBidirectionalGraph bidirectionalGraph, FindPathBetweenRequest request) { + super(getDirectedGraph(bidirectionalGraph, request.getDirection())); + this.request = request; + this.nodeDataMask = new NodePropertyBuilder.NodeDataMask(request.hasMask() ? request.getMask() : null); + + GraphDirection direction = request.getDirection(); + // if direction_reverse is not specified, use the opposite direction of direction + GraphDirection directionReverse = request.hasDirectionReverse() + ? request.getDirectionReverse() + : reverseDirection(request.getDirection()); + SwhUnidirectionalGraph srcGraph = getDirectedGraph(bidirectionalGraph, direction); + SwhUnidirectionalGraph dstGraph = getDirectedGraph(bidirectionalGraph, directionReverse); + + this.allowedEdgesSrc = new AllowedEdges(request.hasEdges() ? request.getEdges() : "*"); + /* + * If edges_reverse is not specified: - If `edges` is not specified either, defaults to "*" - If + * direction == direction_reverse, defaults to `edges` - If direction != direction_reverse, defaults + * to the reverse of `edges` (e.g. "rev:dir" becomes "dir:rev"). + */ + this.allowedEdgesDst = request.hasEdgesReverse() + ? new AllowedEdges(request.getEdgesReverse()) + : (request.hasEdges() + ? (direction == directionReverse + ? new AllowedEdges(request.getEdges()) + : new AllowedEdges(request.getEdges()).reverse()) + : new AllowedEdges("*")); + + /* + * Source sub-visitor.
Aborts as soon as it finds a node already visited by the destination + * sub-visitor. + */ + this.srcVisitor = new BFSVisitor(srcGraph) { + @Override + protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) { + return filterLabelledSuccessors(g, nodeId, allowedEdgesSrc); + } + + @Override + public void visitNode(long node) { + if (dstVisitor.parents.containsKey(node)) { + middleNode = node; + throw new StopTraversalException(); + } + super.visitNode(node); + } + }; + + /* + * Destination sub-visitor. Aborts as soon as it finds a node already visited by the source + * sub-visitor. + */ + this.dstVisitor = new BFSVisitor(dstGraph) { + @Override + protected ArcLabelledNodeIterator.LabelledArcIterator getSuccessors(long nodeId) { + return filterLabelledSuccessors(g, nodeId, allowedEdgesDst); + } + + @Override + public void visitNode(long node) { + if (srcVisitor.parents.containsKey(node)) { + middleNode = node; + throw new StopTraversalException(); + } + super.visitNode(node); + } + }; + if (request.hasMaxDepth()) { + this.srcVisitor.setMaxDepth(request.getMaxDepth()); + this.dstVisitor.setMaxDepth(request.getMaxDepth()); + } + if (request.hasMaxEdges()) { + this.srcVisitor.setMaxEdges(request.getMaxEdges()); + this.dstVisitor.setMaxEdges(request.getMaxEdges()); + } + request.getSrcList().forEach(srcSwhid -> { + long srcNodeId = g.getNodeId(new SWHID(srcSwhid)); + srcVisitor.addSource(srcNodeId); + }); + request.getDstList().forEach(srcSwhid -> { + long srcNodeId = g.getNodeId(new SWHID(srcSwhid)); + dstVisitor.addSource(srcNodeId); + }); + } + + @Override + public void visit() { + /* + * Bidirectional BFS: maintain two sub-visitors, and alternately run a visit step in each of them. + */ + srcVisitor.visitSetup(); + dstVisitor.visitSetup(); + while (!srcVisitor.queue.isEmpty() || !dstVisitor.queue.isEmpty()) { + if (!srcVisitor.queue.isEmpty()) { + srcVisitor.visitStep(); + } + if (!dstVisitor.queue.isEmpty()) { + dstVisitor.visitStep(); + } + } + } + + public Path getPath() { + if (middleNode == null) { + return null; // No path found. 
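+ // (middleNode is only set once the two BFS frontiers meet; see the sub-visitors above.)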
+ } + Path.Builder pathBuilder = Path.newBuilder(); + ArrayList<Long> path = new ArrayList<>(); + + /* First section of the path: src -> midpoint */ + long curNode = middleNode; + while (curNode != -1) { + path.add(curNode); + curNode = srcVisitor.parents.get(curNode); + } + pathBuilder.setMidpointIndex(path.size() - 1); + Collections.reverse(path); + + /* Second section of the path: midpoint -> dst */ + curNode = dstVisitor.parents.get(middleNode); + while (curNode != -1) { + path.add(curNode); + curNode = dstVisitor.parents.get(curNode); + } + + /* Enrich path with node properties */ + for (long nodeId : path) { + Node.Builder nodeBuilder = Node.newBuilder(); + NodePropertyBuilder.buildNodeProperties(g, nodeDataMask, nodeBuilder, nodeId); + pathBuilder.addNode(nodeBuilder.build()); + } + return pathBuilder.build(); + } + } + + public interface NodeObserver { + void onNext(Node nodeId); + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/server/App.java b/java/src/main/java/org/softwareheritage/graph/server/App.java deleted file mode 100644 index bb2ce5b..0000000 --- a/java/src/main/java/org/softwareheritage/graph/server/App.java +++ /dev/null @@ -1,196 +0,0 @@ -package org.softwareheritage.graph.server; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.PropertyNamingStrategy; -import com.martiansoftware.jsap.*; -import io.javalin.Javalin; -import io.javalin.http.Context; -import io.javalin.plugin.json.JavalinJackson; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Stats; -import org.softwareheritage.graph.SWHID; - -import java.io.IOException; -import java.util.List; -import java.util.Map; - -/** - * Web framework of the swh-graph server RPC API. - * - * @author The Software Heritage developers - */ - -public class App { - /** - * Main entrypoint. - * - * @param args command line arguments - */ - public static void main(String[] args) throws IOException, JSAPException { - SimpleJSAP jsap = new SimpleJSAP(App.class.getName(), - "Server to load and query a compressed graph representation of Software Heritage archive.", - new Parameter[]{ - new FlaggedOption("port", JSAP.INTEGER_PARSER, "5009", JSAP.NOT_REQUIRED, 'p', "port", - "Binding port of the server."), - new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, - JSAP.NOT_GREEDY, "The basename of the compressed graph."), - new Switch("timings", 't', "timings", "Show timings in API result metadata."),}); - - JSAPResult config = jsap.parse(args); - if (jsap.messagePrinted()) { - System.exit(1); - } - - String graphPath = config.getString("graphPath"); - int port = config.getInt("port"); - boolean showTimings = config.getBoolean("timings"); - - startServer(graphPath, port, showTimings); - } - - /** - * Loads compressed graph and starts the web server to query it.
- * - * @param graphPath basename of the compressed graph - * @param port binding port of the server - * @param showTimings true if timings should be in results metadata, false otherwise - */ - private static void startServer(String graphPath, int port, boolean showTimings) throws IOException { - Graph graph = Graph.loadMapped(graphPath); - Stats stats = new Stats(graphPath); - - // Clean up on exit - Runtime.getRuntime().addShutdownHook(new Thread() { - public void run() { - try { - graph.cleanUp(); - } catch (IOException e) { - System.out.println("Could not clean up graph on exit: " + e); - } - } - }); - - // Configure Jackson JSON to use snake case naming style - ObjectMapper objectMapper = JavalinJackson.getObjectMapper(); - objectMapper.setPropertyNamingStrategy(PropertyNamingStrategy.SNAKE_CASE); - JavalinJackson.configure(objectMapper); - - Javalin app = Javalin.create().start(port); - - app.before("/stats/*", ctx -> { - checkQueryStrings(ctx, ""); - }); - app.before("/leaves/*", ctx -> { - checkQueryStrings(ctx, "direction|edges"); - }); - app.before("/neighbors/*", ctx -> { - checkQueryStrings(ctx, "direction|edges"); - }); - app.before("/visit/*", ctx -> { - checkQueryStrings(ctx, "direction|edges"); - }); - app.before("/walk/*", ctx -> { - checkQueryStrings(ctx, "direction|edges|traversal"); - }); - - app.get("/stats/", ctx -> { - ctx.json(stats); - }); - - // Graph traversal endpoints - // By default the traversal is a forward DFS using all edges - - app.get("/leaves/:src", ctx -> { - SWHID src = new SWHID(ctx.pathParam("src")); - String direction = ctx.queryParam("direction", "forward"); - String edgesFmt = ctx.queryParam("edges", "*"); - - Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); - Endpoint.Output output = endpoint.leaves(new Endpoint.Input(src)); - ctx.json(formatEndpointOutput(output, showTimings)); - }); - - app.get("/neighbors/:src", ctx -> { - SWHID src = new SWHID(ctx.pathParam("src")); - String direction = ctx.queryParam("direction", "forward"); - String edgesFmt = ctx.queryParam("edges", "*"); - - Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); - Endpoint.Output output = endpoint.neighbors(new Endpoint.Input(src)); - ctx.json(formatEndpointOutput(output, showTimings)); - }); - - app.get("/visit/nodes/:src", ctx -> { - SWHID src = new SWHID(ctx.pathParam("src")); - String direction = ctx.queryParam("direction", "forward"); - String edgesFmt = ctx.queryParam("edges", "*"); - - Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); - Endpoint.Output output = endpoint.visitNodes(new Endpoint.Input(src)); - ctx.json(formatEndpointOutput(output, showTimings)); - }); - - app.get("/visit/paths/:src", ctx -> { - SWHID src = new SWHID(ctx.pathParam("src")); - String direction = ctx.queryParam("direction", "forward"); - String edgesFmt = ctx.queryParam("edges", "*"); - - Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); - Endpoint.Output output = endpoint.visitPaths(new Endpoint.Input(src)); - ctx.json(formatEndpointOutput(output, showTimings)); - }); - - app.get("/walk/:src/:dst", ctx -> { - SWHID src = new SWHID(ctx.pathParam("src")); - String dstFmt = ctx.pathParam("dst"); - String direction = ctx.queryParam("direction", "forward"); - String edgesFmt = ctx.queryParam("edges", "*"); - String algorithm = ctx.queryParam("traversal", "dfs"); - - Endpoint endpoint = new Endpoint(graph, direction, edgesFmt); - Endpoint.Output output = endpoint.walk(new Endpoint.Input(src, dstFmt, algorithm)); - 
ctx.json(formatEndpointOutput(output, showTimings)); - }); - - app.exception(IllegalArgumentException.class, (e, ctx) -> { - ctx.status(400); - ctx.result(e.getMessage()); - }); - } - - /** - * Checks query strings names provided to the RPC API. - * - * @param ctx Javalin HTTP request context - * @param allowedFmt a regular expression describing allowed query strings names - * @throws IllegalArgumentException unknown query string provided - */ - private static void checkQueryStrings(Context ctx, String allowedFmt) { - Map> queryParamMap = ctx.queryParamMap(); - for (String key : queryParamMap.keySet()) { - if (!key.matches(allowedFmt)) { - throw new IllegalArgumentException("Unknown query string: " + key); - } - } - } - - /** - * Formats endpoint result into final JSON for the RPC API. - *

- * Removes unwanted information if necessary, such as timings (to prevent use of side channels - * attacks). - * - * @param output endpoint operation output which needs formatting - * @param showTimings true if timings should be in results metadata, false otherwise - * @return final Object with desired JSON format - */ - private static Object formatEndpointOutput(Endpoint.Output output, boolean showTimings) { - if (showTimings) { - return output; - } else { - Map metaNoTimings = Map.of("nb_edges_accessed", output.meta.nbEdgesAccessed); - Map outputNoTimings = Map.of("result", output.result, "meta", metaNoTimings); - return outputNoTimings; - } - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/server/Endpoint.java b/java/src/main/java/org/softwareheritage/graph/server/Endpoint.java deleted file mode 100644 index 97664d6..0000000 --- a/java/src/main/java/org/softwareheritage/graph/server/Endpoint.java +++ /dev/null @@ -1,309 +0,0 @@ -package org.softwareheritage.graph.server; - -import org.softwareheritage.graph.*; -import org.softwareheritage.graph.benchmark.utils.Timing; - -import java.util.ArrayList; - -/** - * RPC API endpoints wrapper functions. - *

- * Graph operations are segmented between high-level class (this one) and the low-level class - * ({@link Traversal}). The {@link Endpoint} class creates wrappers for each endpoints by performing - * all the input/output node ids conversions and logging timings. - * - * @author The Software Heritage developers - * @see Traversal - */ - -public class Endpoint { - /** Graph where traversal endpoint is performed */ - Graph graph; - /** Internal traversal API */ - Traversal traversal; - - /** - * Constructor. - * - * @param graph the graph used for traversal endpoint - * @param direction a string (either "forward" or "backward") specifying edge orientation - * @param edgesFmt a formatted string describing allowed - * edges - */ - public Endpoint(Graph graph, String direction, String edgesFmt) { - this.graph = graph; - this.traversal = new Traversal(graph, direction, edgesFmt); - } - - /** - * Converts a list of (internal) long node ids to a list of corresponding (external) SWHIDs. - * - * @param nodeIds the list of long node ids - * @return a list of corresponding SWHIDs - */ - private ArrayList convertNodesToSWHIDs(ArrayList nodeIds) { - ArrayList swhids = new ArrayList<>(); - for (long nodeId : nodeIds) { - swhids.add(graph.getSWHID(nodeId)); - } - return swhids; - } - - /** - * Converts a list of (internal) long node ids to the corresponding {@link SwhPath}. - * - * @param nodeIds the list of long node ids - * @return the corresponding {@link SwhPath} - * @see org.softwareheritage.graph.SwhPath - */ - private SwhPath convertNodesToSwhPath(ArrayList nodeIds) { - SwhPath path = new SwhPath(); - for (long nodeId : nodeIds) { - path.add(graph.getSWHID(nodeId)); - } - return path; - } - - /** - * Converts a list of paths made of (internal) long node ids to one made of {@link SwhPath}-s. - * - * @param pathsNodeId the list of paths with long node ids - * @return a list of corresponding {@link SwhPath} - * @see org.softwareheritage.graph.SwhPath - */ - private ArrayList convertPathsToSWHIDs(ArrayList> pathsNodeId) { - ArrayList paths = new ArrayList<>(); - for (ArrayList path : pathsNodeId) { - paths.add(convertNodesToSwhPath(path)); - } - return paths; - } - - /** - * Leaves endpoint wrapper. - * - * @param input input parameters for the underlying endpoint call - * @return the resulting list of {@link SWHID} from endpoint call and operation metadata - * @see SWHID - * @see Traversal#leaves(long) - */ - public Output leaves(Input input) { - Output> output = new Output<>(); - long startTime; - - startTime = Timing.start(); - long srcNodeId = graph.getNodeId(input.src); - output.meta.timings.swhid2node = Timing.stop(startTime); - - startTime = Timing.start(); - ArrayList nodeIds = traversal.leaves(srcNodeId); - output.meta.timings.traversal = Timing.stop(startTime); - output.meta.nbEdgesAccessed = traversal.getNbEdgesAccessed(); - - startTime = Timing.start(); - output.result = convertNodesToSWHIDs(nodeIds); - output.meta.timings.node2swhid = Timing.stop(startTime); - - return output; - } - - /** - * Neighbors endpoint wrapper. 
- * - * @param input input parameters for the underlying endpoint call - * @return the resulting list of {@link SWHID} from endpoint call and operation metadata - * @see SWHID - * @see Traversal#neighbors(long) - */ - public Output neighbors(Input input) { - Output> output = new Output<>(); - long startTime; - - startTime = Timing.start(); - long srcNodeId = graph.getNodeId(input.src); - output.meta.timings.swhid2node = Timing.stop(startTime); - - startTime = Timing.start(); - ArrayList nodeIds = traversal.neighbors(srcNodeId); - output.meta.timings.traversal = Timing.stop(startTime); - output.meta.nbEdgesAccessed = traversal.getNbEdgesAccessed(); - - startTime = Timing.start(); - output.result = convertNodesToSWHIDs(nodeIds); - output.meta.timings.node2swhid = Timing.stop(startTime); - - return output; - } - - /** - * Walk endpoint wrapper. - * - * @param input input parameters for the underlying endpoint call - * @return the resulting {@link SwhPath} from endpoint call and operation metadata - * @see SWHID - * @see org.softwareheritage.graph.SwhPath - * @see Traversal#walk - */ - public Output walk(Input input) { - Output output = new Output<>(); - long startTime; - - startTime = Timing.start(); - long srcNodeId = graph.getNodeId(input.src); - output.meta.timings.swhid2node = Timing.stop(startTime); - - ArrayList nodeIds = new ArrayList(); - - // Destination is either a SWHID or a node type - try { - SWHID dstSWHID = new SWHID(input.dstFmt); - long dstNodeId = graph.getNodeId(dstSWHID); - - startTime = Timing.start(); - nodeIds = traversal.walk(srcNodeId, dstNodeId, input.algorithm); - output.meta.timings.traversal = Timing.stop(startTime); - } catch (IllegalArgumentException ignored1) { - try { - Node.Type dstType = Node.Type.fromStr(input.dstFmt); - - startTime = Timing.start(); - nodeIds = traversal.walk(srcNodeId, dstType, input.algorithm); - output.meta.timings.traversal = Timing.stop(startTime); - } catch (IllegalArgumentException ignored2) { - } - } - - output.meta.nbEdgesAccessed = traversal.getNbEdgesAccessed(); - - startTime = Timing.start(); - output.result = convertNodesToSwhPath(nodeIds); - output.meta.timings.node2swhid = Timing.stop(startTime); - - return output; - } - - /** - * VisitNodes endpoint wrapper. - * - * @param input input parameters for the underlying endpoint call - * @return the resulting list of {@link SWHID} from endpoint call and operation metadata - * @see SWHID - * @see Traversal#visitNodes(long) - */ - public Output visitNodes(Input input) { - Output> output = new Output<>(); - long startTime; - - startTime = Timing.start(); - long srcNodeId = graph.getNodeId(input.src); - output.meta.timings.swhid2node = Timing.stop(startTime); - - startTime = Timing.start(); - ArrayList nodeIds = traversal.visitNodes(srcNodeId); - output.meta.timings.traversal = Timing.stop(startTime); - output.meta.nbEdgesAccessed = traversal.getNbEdgesAccessed(); - - startTime = Timing.start(); - output.result = convertNodesToSWHIDs(nodeIds); - output.meta.timings.node2swhid = Timing.stop(startTime); - - return output; - } - - /** - * VisitPaths endpoint wrapper. 
- * - * @param input input parameters for the underlying endpoint call - * @return the resulting list of {@link SwhPath} from endpoint call and operation metadata - * @see SWHID - * @see org.softwareheritage.graph.SwhPath - * @see Traversal#visitPaths(long) - */ - public Output visitPaths(Input input) { - Output> output = new Output<>(); - long startTime; - - startTime = Timing.start(); - long srcNodeId = graph.getNodeId(input.src); - output.meta.timings.swhid2node = Timing.stop(startTime); - - startTime = Timing.start(); - ArrayList> paths = traversal.visitPaths(srcNodeId); - output.meta.timings.traversal = Timing.stop(startTime); - output.meta.nbEdgesAccessed = traversal.getNbEdgesAccessed(); - - startTime = Timing.start(); - output.result = convertPathsToSWHIDs(paths); - output.meta.timings.node2swhid = Timing.stop(startTime); - - return output; - } - - /** - * Wrapper class to unify traversal methods input signatures. - */ - public static class Input { - /** Source node of endpoint call specified as a {@link SWHID} */ - public SWHID src; - /** - * Destination formatted string as described in the - * API - */ - public String dstFmt; - /** Traversal algorithm used in endpoint call (either "dfs" or "bfs") */ - public String algorithm; - - public Input(SWHID src) { - this.src = src; - } - - public Input(SWHID src, String dstFmt, String algorithm) { - this.src = src; - this.dstFmt = dstFmt; - this.algorithm = algorithm; - } - } - - /** - * Wrapper class to return both the endpoint result and metadata (such as timings). - */ - public static class Output { - /** The result content itself */ - public T result; - /** Various metadata about the result */ - public Meta meta; - - public Output() { - this.result = null; - this.meta = new Meta(); - } - - /** - * Endpoint result metadata. - */ - public class Meta { - /** Operations timings */ - public Timings timings; - /** Number of edges accessed during traversal */ - public long nbEdgesAccessed; - - public Meta() { - this.timings = new Timings(); - this.nbEdgesAccessed = 0; - } - - /** - * Wrapper class for JSON output format. 
- */ - public class Timings { - /** Time in seconds to do the traversal */ - public double traversal; - /** Time in seconds to convert input SWHID to node id */ - public double swhid2node; - /** Time in seconds to convert output node ids to SWHIDs */ - public double node2swhid; - } - } - } -} diff --git a/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java new file mode 100644 index 0000000..62d0f52 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.utils; + +import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; +import it.unimi.dsi.logging.ProgressLogger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.SwhUnidirectionalGraph; +import org.softwareheritage.graph.labels.DirEntry; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; + +public class DumpProperties { + final static Logger logger = LoggerFactory.getLogger(DumpProperties.class); + + public static void main(String[] args) throws IOException { + String graphPath = args[0]; + + ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS); + SwhUnidirectionalGraph graph; + if (args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"))) { + graph = SwhUnidirectionalGraph.loadLabelledMapped(graphPath, pl); + } else { + graph = SwhUnidirectionalGraph.loadLabelled(graphPath, pl); + } + graph.loadContentLength(); + graph.loadContentIsSkipped(); + graph.loadPersonIds(); + graph.loadAuthorTimestamps(); + graph.loadCommitterTimestamps(); + graph.loadMessages(); + graph.loadTagNames(); + graph.loadLabelNames(); + + ArcLabelledNodeIterator it = graph.labelledNodeIterator(); + while (it.hasNext()) { + long node = it.nextLong(); + System.out.format("%s: %s\n", node, graph.getSWHID(node)); + + var s = it.successors(); + System.out.println(" successors:"); + for (long succ; (succ = s.nextLong()) >= 0;) { + DirEntry[] labels = (DirEntry[]) s.label().get(); + if (labels.length > 0) { + for (DirEntry label : labels) { + System.out.format(" %s %s [perms: %s]\n", graph.getSWHID(succ), + new String(graph.getLabelName(label.filenameId)), label.permission); + } + } else { + System.out.format(" %s\n", graph.getSWHID(succ)); + } + } + + switch (graph.getNodeType(node)) { + case CNT: + System.out.format(" length: %s\n", graph.getContentLength(node)); + System.out.format(" is_skipped: %s\n", graph.isContentSkipped(node)); + break; + case REV: + System.out.format(" author: %s\n", graph.getAuthorId(node)); + System.out.format(" committer: %s\n", graph.getCommitterId(node)); + System.out.format(" date: %s (offset: %s)\n", graph.getAuthorTimestamp(node), + graph.getAuthorTimestampOffset(node)); + System.out.format(" committer_date: %s (offset: %s)\n", graph.getCommitterTimestamp(node), + graph.getCommitterTimestampOffset(node)); + byte[] msg = graph.getMessage(node); + if (msg != null) { + System.out.format(" message: %s\n", (new String(msg)).replace("\n", "\\n")); + } + break; + case REL: + System.out.format(" author: %s\n", graph.getAuthorId(node)); + System.out.format(" date: %s (offset: %s)\n", 
graph.getAuthorTimestamp(node), + graph.getAuthorTimestampOffset(node)); + byte[] tagMsg = graph.getMessage(node); + if (tagMsg != null) { + System.out.format(" message: %s\n", (new String(tagMsg)).replace("\n", "\\n")); + } + byte[] tagName = graph.getTagName(node); + if (tagName != null) { + System.out.format(" name: %s\n", (new String(tagName))); + } + break; + case ORI: + System.out.format(" url: %s\n", graph.getUrl(node)); + } + + System.out.println(); + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java index b88f8b6..a4e017b 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java @@ -1,76 +1,83 @@ +/* + * Copyright (c) 2021 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import com.google.common.primitives.Longs; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.bits.LongArrayBitVector; import it.unimi.dsi.fastutil.Arrays; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.ByteDiskQueue; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; -import org.softwareheritage.graph.Graph; +import org.softwareheritage.graph.SwhBidirectionalGraph; import org.softwareheritage.graph.SWHID; import org.softwareheritage.graph.experiments.topology.ConnectedComponents; import org.softwareheritage.graph.maps.NodeIdMap; import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; public class ExportSubdataset { public static void main(String[] args) throws IOException, ClassNotFoundException { System.err.print("Loading everything..."); String graphPath = args[0]; - Graph graph = Graph.loadMapped(graphPath); + SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(graphPath); Object2LongFunction<byte[]> mphMap = NodeIdMap.loadMph(graphPath + ".mph"); System.err.println(" done."); final long n = graph.numNodes(); // Allow enough memory to behave like in-memory queue int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n); // Use a disk based queue to store BFS frontier final File queueFile = File.createTempFile(ConnectedComponents.class.getSimpleName(), "queue"); final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true); final byte[] byteBuf = new byte[Long.BYTES]; // WARNING: no 64-bit version of this data-structure, but it can support // indices up to 2^37 LongArrayBitVector visited = LongArrayBitVector.ofLength(n); FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); LineIterator lineIterator = new LineIterator(buffer); while (lineIterator.hasNext()) { String line = lineIterator.next().toString(); long i; try { // i = mphMap.getLong(line.getBytes(StandardCharsets.UTF_8)); i = graph.getNodeId(new SWHID(line)); } catch (IllegalArgumentException e) { continue; } queue.enqueue(Longs.toByteArray(i)); visited.set(i); while (!queue.isEmpty()) { queue.dequeue(byteBuf); final long currentNode = Longs.fromByteArray(byteBuf); SWHID currentNodeSWHID = graph.getSWHID(currentNode); final LazyLongIterator iterator =
graph.successors(currentNode); long succ; while ((succ = iterator.nextLong()) != -1) { System.out.format("%s %s\n", currentNodeSWHID, graph.getSWHID(succ)); if (visited.getBoolean(succ)) continue; visited.set(succ); queue.enqueue(Longs.toByteArray(succ)); } } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java b/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java index 71379f2..7ff29b8 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java @@ -1,116 +1,120 @@ +/* + * Copyright (c) 2021 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import it.unimi.dsi.big.webgraph.LazyLongIterator; -import it.unimi.dsi.fastutil.BigArrays; -import it.unimi.dsi.fastutil.io.BinIO; -import org.softwareheritage.graph.AllowedEdges; -import org.softwareheritage.graph.Graph; -import org.softwareheritage.graph.Node; -import org.softwareheritage.graph.SWHID; +import org.softwareheritage.graph.*; import java.io.IOException; import java.time.Duration; import java.util.HashSet; import java.util.Scanner; import java.util.Stack; /* sample invocation on granet.internal.softwareheritage.org for benchmarking * purposes, with the main swh-graph service already running: * * $ java -cp ~/swh-environment/swh-graph/java/target/swh-graph-0.3.0.jar -Xmx300G -XX:PretenureSizeThreshold=512M -XX:MaxNewSize=4G -XX:+UseLargePages -XX:+UseTransparentHugePages -XX:+UseNUMA -XX:+UseTLAB -XX:+ResizeTLAB org.softwareheritage.graph.utils.FindEarliestRevision --timing /dev/shm/swh-graph/default/graph * */ public class FindEarliestRevision { public static void main(String[] args) throws IOException, ClassNotFoundException { String graphPath = args[0]; boolean timing = false; long ts, elapsedNanos; Duration elapsed; if (args.length >= 2 && (args[0].equals("-t") || args[0].equals("--timing"))) { timing = true; graphPath = args[1]; System.err.println("started with timing option, will keep track of elapsed time"); } System.err.println("loading transposed graph..."); ts = System.nanoTime(); - Graph graph = Graph.loadMapped(graphPath).transpose(); + SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(graphPath).transpose(); elapsed = Duration.ofNanos(System.nanoTime() - ts); System.err.println(String.format("transposed graph loaded (duration: %s).", elapsed)); System.err.println("loading revision timestamps..."); ts = System.nanoTime(); - long[][] committerTimestamps = BinIO.loadLongsBig(graphPath + "-rev_committer_timestamps.bin"); + graph.loadCommitterTimestamps(); elapsed = Duration.ofNanos(System.nanoTime() - ts); System.err.println(String.format("revision timestamps loaded (duration: %s).", elapsed)); Scanner stdin = new Scanner(System.in); AllowedEdges edges = new AllowedEdges("cnt:dir,dir:dir,dir:rev"); String rawSWHID = null; SWHID srcSWHID = null; long lineCount = 0; long srcNodeId = -1; if (timing) { System.err.println("starting SWHID processing..."); elapsed = Duration.ZERO; } while (stdin.hasNextLine()) { if (timing) ts = System.nanoTime(); rawSWHID = stdin.nextLine().strip(); lineCount++; try { srcSWHID = new SWHID(rawSWHID); srcNodeId = graph.getNodeId(srcSWHID); } catch (IllegalArgumentException e) { 
System.err .println(String.format("skipping invalid or unknown SWHID %s on line %d", rawSWHID, lineCount)); continue; } if (timing) System.err.println("starting traversal for: " + srcSWHID.toString()); Stack<Long> stack = new Stack<>(); HashSet<Long> visited = new HashSet<>(); stack.push(srcNodeId); visited.add(srcNodeId); long minRevId = -1; long minTimestamp = Long.MAX_VALUE; while (!stack.isEmpty()) { long currentNodeId = stack.pop(); - if (graph.getNodeType(currentNodeId) == Node.Type.REV) { - long committerTs = BigArrays.get(committerTimestamps, currentNodeId); + if (graph.getNodeType(currentNodeId) == SwhType.REV) { + long committerTs = graph.getCommitterTimestamp(currentNodeId); if (committerTs < minTimestamp) { minRevId = currentNodeId; minTimestamp = committerTs; } } - LazyLongIterator it = graph.successors(currentNodeId, edges); + LazyLongIterator it = graph.successors(currentNodeId); for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) { + if (!edges.isAllowed(graph.getNodeType(currentNodeId), graph.getNodeType(neighborNodeId))) { + continue; + } if (!visited.contains(neighborNodeId)) { stack.push(neighborNodeId); visited.add(neighborNodeId); } } } if (minRevId == -1) { System.err.println("no revision found containing: " + srcSWHID.toString()); } else { System.out.println(srcSWHID.toString() + "\t" + graph.getSWHID(minRevId).toString()); } if (timing) { elapsedNanos = System.nanoTime() - ts; // processing time for current SWHID elapsed = elapsed.plus(Duration.ofNanos(elapsedNanos)); // cumulative processing time for all SWHIDs - System.err.println(String.format("visit time (s):\t%.6f", (double) elapsedNanos / 1_000_000_000)); + System.err.printf("visit time (s):\t%.6f\n", (double) elapsedNanos / 1_000_000_000); } } if (timing) - System.err.println(String.format("processed %d SWHIDs in %s (%s avg)", lineCount, elapsed, - elapsed.dividedBy(lineCount))); + System.err.printf("processed %d SWHIDs in %s (%s avg)\n", lineCount, elapsed, elapsed.dividedBy(lineCount)); } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java new file mode 100644 index 0000000..dadaa51 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.utils; + +import it.unimi.dsi.fastutil.BigArrays; + +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.ForkJoinTask; +import java.util.concurrent.RecursiveAction; + +public class ForkJoinBigQuickSort2 extends RecursiveAction { + private static final long serialVersionUID = 1L; + private final long from; + private final long to; + private final long[][] x, y; + + private static final int QUICKSORT_NO_REC = 16; + private static final int PARALLEL_QUICKSORT_NO_FORK = 8192; + private static final int QUICKSORT_MEDIAN_OF_9 = 128; + + public ForkJoinBigQuickSort2(final long[][] x, final long[][] y, final long from, final long to) { + this.from = from; + this.to = to; + this.x = x; + this.y = y; + } + + @Override + protected void compute() { + final long[][] x = this.x; + final long[][] y = this.y; + final long len = to - from; + if (len < PARALLEL_QUICKSORT_NO_FORK) {
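+ // Below the fork threshold, sorting sequentially is cheaper than spawning subtasks.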
quickSort(x, y, from, to); + return; + } + // Choose a partition element, v + long m = from + len / 2; + long l = from; + long n = to - 1; + long s = len / 8; + l = med3(x, y, l, l + s, l + 2 * s); + m = med3(x, y, m - s, m, m + s); + n = med3(x, y, n - 2 * s, n - s, n); + m = med3(x, y, l, m, n); + final long xm = BigArrays.get(x, m), ym = BigArrays.get(y, m); + // Establish Invariant: v* (v)* v* + long a = from, b = a, c = to - 1, d = c; + while (true) { + int comparison; + while (b <= c && (comparison = compare(x, y, b, xm, ym)) <= 0) { + if (comparison == 0) + swap(x, y, a++, b); + b++; + } + while (c >= b && (comparison = compare(x, y, c, xm, ym)) >= 0) { + if (comparison == 0) + swap(x, y, c, d--); + c--; + } + if (b > c) + break; + swap(x, y, b++, c--); + } + // Swap partition elements back to middle + long t; + s = Math.min(a - from, b - a); + swap(x, y, from, b - s, s); + s = Math.min(d - c, to - d - 1); + swap(x, y, b, to - s, s); + s = b - a; + t = d - c; + // Recursively sort non-partition-elements + if (s > 1 && t > 1) + invokeAll(new ForkJoinBigQuickSort2(x, y, from, from + s), new ForkJoinBigQuickSort2(x, y, to - t, to)); + else if (s > 1) + invokeAll(new ForkJoinBigQuickSort2(x, y, from, from + s)); + else + invokeAll(new ForkJoinBigQuickSort2(x, y, to - t, to)); + } + + public static void quickSort(final long[][] x, final long[][] y, final long from, final long to) { + final long len = to - from; + if (len < QUICKSORT_NO_REC) { + selectionSort(x, y, from, to); + return; + } + // Choose a partition element, v + long m = from + len / 2; + long l = from; + long n = to - 1; + if (len > QUICKSORT_MEDIAN_OF_9) { // Big arrays, pseudomedian of 9 + long s = len / 8; + l = med3(x, y, l, l + s, l + 2 * s); + m = med3(x, y, m - s, m, m + s); + n = med3(x, y, n - 2 * s, n - s, n); + } + m = med3(x, y, l, m, n); // Mid-size, med of 3 + // Establish Invariant: v* (v)* v* + long a = from, b = a, c = to - 1, d = c; + final long xm = BigArrays.get(x, m), ym = BigArrays.get(y, m); + while (true) { + long comparison; + while (b <= c && (comparison = compare(x, y, b, xm, ym)) <= 0) { + if (comparison == 0) + swap(x, y, a++, b); + b++; + } + while (c >= b && (comparison = compare(x, y, c, xm, ym)) >= 0) { + if (comparison == 0) + swap(x, y, c, d--); + c--; + } + if (b > c) + break; + swap(x, y, b++, c--); + } + // Swap partition elements back to middle + long s; + s = Math.min(a - from, b - a); + swap(x, y, from, b - s, s); + s = Math.min(d - c, to - d - 1); + swap(x, y, b, to - s, s); + // Recursively sort non-partition-elements + if ((s = b - a) > 1) + quickSort(x, y, from, from + s); + if ((s = d - c) > 1) + quickSort(x, y, to - s, to); + } + + public static void quickSort(final long[][] x, final long[][] y) { + quickSort(x, y, 0, x.length); + } + + private static int compare(final long[][] x, final long[][] y, final long u, final long v) { + int tx; + return (tx = Long.compare(BigArrays.get(x, u), BigArrays.get(x, v))) != 0 + ? tx + : Long.compare(BigArrays.get(y, u), BigArrays.get(y, v)); + } + + private static int compare(final long[][] x, final long[][] y, final long i, final long xm, final long ym) { + int tx; + return (tx = Long.compare(BigArrays.get(x, i), xm)) != 0 ? 
tx : Long.compare(BigArrays.get(y, i), ym); + } + + private static void swap(final long[][] x, final long[][] y, final long a, final long b) { + BigArrays.swap(x, a, b); + BigArrays.swap(y, a, b); + } + + private static void swap(final long[][] x, final long[][] y, long a, long b, final long n) { + for (long i = 0; i < n; i++, a++, b++) + swap(x, y, a, b); + } + + private static long med3(final long[][] x, final long[][] y, final long a, final long b, final long c) { + final int ab = compare(x, y, a, b); + final int ac = compare(x, y, a, c); + final int bc = compare(x, y, b, c); + return (ab < 0 ? (bc < 0 ? b : ac < 0 ? c : a) : (bc > 0 ? b : ac > 0 ? c : a)); + } + + public static void selectionSort(final long[][] a, final long[][] b, final long from, final long to) { + for (long i = from; i < to - 1; i++) { + long m = i; + for (long j = i + 1; j < to; j++) + if (compare(a, b, j, m) < 0) + m = j; + if (m != i) { + BigArrays.swap(a, i, m); + BigArrays.swap(b, i, m); + } + } + } + + public static void selectionSort(final long[][] x, final long[][] y) { + selectionSort(x, y, 0, x.length); + } + + public static ForkJoinPool getPool() { + ForkJoinPool current = ForkJoinTask.getPool(); + return current == null ? ForkJoinPool.commonPool() : current; + } + + public static void parallelQuickSort(final long[][] x, final long[][] y) { + BigArrays.ensureSameLength(x, y); + parallelQuickSort(x, y, 0, x.length); + } + + public static void parallelQuickSort(final long[][] x, final long[][] y, final long from, final long to) { + ForkJoinPool pool = getPool(); + if (to - from < PARALLEL_QUICKSORT_NO_FORK || pool.getParallelism() == 1) + quickSort(x, y, from, to); + else { + pool.invoke(new ForkJoinBigQuickSort2(x, y, from, to)); + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java new file mode 100644 index 0000000..57ae71d --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.utils; + +import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.ForkJoinTask; +import java.util.concurrent.RecursiveAction; + +import static it.unimi.dsi.fastutil.longs.LongArrays.ensureSameLength; + +public class ForkJoinQuickSort3 extends RecursiveAction { + private static final long serialVersionUID = 1L; + private final int from; + private final int to; + private final long[] x, y, z; + + private static final int QUICKSORT_NO_REC = 16; + private static final int PARALLEL_QUICKSORT_NO_FORK = 8192; + private static final int QUICKSORT_MEDIAN_OF_9 = 128; + + public ForkJoinQuickSort3(final long[] x, final long[] y, final long z[], final int from, final int to) { + this.from = from; + this.to = to; + this.x = x; + this.y = y; + this.z = z; + } + + @Override + protected void compute() { + final long[] x = this.x; + final long[] y = this.y; + final long[] z = this.z; + final int len = to - from; + if (len < PARALLEL_QUICKSORT_NO_FORK) { + quickSort(x, y, z, from, to); + return; + } + // Choose a partition element, v + int m = from + len / 2; + int l = from; + int n = to - 1; + int s = len / 8; + l = med3(x, y, z, l, l + s, l + 2 * s); + m = 
med3(x, y, z, m - s, m, m + s); + n = med3(x, y, z, n - 2 * s, n - s, n); + m = med3(x, y, z, l, m, n); + final long xm = x[m], ym = y[m], zm = z[m]; + // Establish Invariant: v* (v)* v* + int a = from, b = a, c = to - 1, d = c; + while (true) { + int comparison, t; + while (b <= c && (comparison = compare(x, y, z, b, xm, ym, zm)) <= 0) { + if (comparison == 0) + swap(x, y, z, a++, b); + b++; + } + while (c >= b && (comparison = compare(x, y, z, c, xm, ym, zm)) >= 0) { + if (comparison == 0) + swap(x, y, z, c, d--); + c--; + } + if (b > c) + break; + swap(x, y, z, b++, c--); + } + // Swap partition elements back to middle + int t; + s = Math.min(a - from, b - a); + swap(x, y, z, from, b - s, s); + s = Math.min(d - c, to - d - 1); + swap(x, y, z, b, to - s, s); + s = b - a; + t = d - c; + // Recursively sort non-partition-elements + if (s > 1 && t > 1) + invokeAll(new ForkJoinQuickSort3(x, y, z, from, from + s), new ForkJoinQuickSort3(x, y, z, to - t, to)); + else if (s > 1) + invokeAll(new ForkJoinQuickSort3(x, y, z, from, from + s)); + else + invokeAll(new ForkJoinQuickSort3(x, y, z, to - t, to)); + } + + public static void quickSort(final long[] x, final long[] y, final long[] z, final int from, final int to) { + final int len = to - from; + if (len < QUICKSORT_NO_REC) { + selectionSort(x, y, z, from, to); + return; + } + // Choose a partition element, v + int m = from + len / 2; + int l = from; + int n = to - 1; + if (len > QUICKSORT_MEDIAN_OF_9) { // Big arrays, pseudomedian of 9 + int s = len / 8; + l = med3(x, y, z, l, l + s, l + 2 * s); + m = med3(x, y, z, m - s, m, m + s); + n = med3(x, y, z, n - 2 * s, n - s, n); + } + m = med3(x, y, z, l, m, n); // Mid-size, med of 3 + // Establish Invariant: v* (v)* v* + int a = from, b = a, c = to - 1, d = c; + final long xm = x[m], ym = y[m], zm = z[m]; + while (true) { + int comparison; + while (b <= c && (comparison = compare(x, y, z, b, xm, ym, zm)) <= 0) { + if (comparison == 0) + swap(x, y, z, a++, b); + b++; + } + while (c >= b && (comparison = compare(x, y, z, c, xm, ym, zm)) >= 0) { + if (comparison == 0) + swap(x, y, z, c, d--); + c--; + } + if (b > c) + break; + swap(x, y, z, b++, c--); + } + // Swap partition elements back to middle + int s; + s = Math.min(a - from, b - a); + swap(x, y, z, from, b - s, s); + s = Math.min(d - c, to - d - 1); + swap(x, y, z, b, to - s, s); + // Recursively sort non-partition-elements + if ((s = b - a) > 1) + quickSort(x, y, z, from, from + s); + if ((s = d - c) > 1) + quickSort(x, y, z, to - s, to); + } + + public static void quickSort(final long[] x, final long[] y, final long[] z) { + quickSort(x, y, z, 0, x.length); + } + + private static int compare(final long[] x, final long[] y, final long[] z, final int u, final int v) { + int tx, ty; + return (tx = Long.compare(x[u], x[v])) != 0 + ? tx + : ((ty = Long.compare(y[u], y[v])) != 0 ? ty : Long.compare(z[u], z[v])); + } + + private static int compare(final long[] x, final long[] y, final long[] z, final int i, final long xm, + final long ym, final long zm) { + int tx, ty; + return (tx = Long.compare(x[i], xm)) != 0 + ? tx + : ((ty = Long.compare(y[i], ym)) != 0 ? 
ty : Long.compare(z[i], zm)); + } + + private static void swap(final long[] x, final long[] y, final long[] z, final int a, final int b) { + final long t = x[a]; + final long u = y[a]; + final long v = z[a]; + x[a] = x[b]; + y[a] = y[b]; + z[a] = z[b]; + x[b] = t; + y[b] = u; + z[b] = v; + } + + private static void swap(final long[] x, final long[] y, final long[] z, int a, int b, final int n) { + for (int i = 0; i < n; i++, a++, b++) + swap(x, y, z, a, b); + } + + private static int med3(final long[] x, final long[] y, final long[] z, final int a, final int b, final int c) { + final int ab = compare(x, y, z, a, b); + final int ac = compare(x, y, z, a, c); + final int bc = compare(x, y, z, b, c); + return (ab < 0 ? (bc < 0 ? b : ac < 0 ? c : a) : (bc > 0 ? b : ac > 0 ? c : a)); + } + + public static void selectionSort(final long[] a, final long[] b, long[] c, final int from, final int to) { + for (int i = from; i < to - 1; i++) { + int m = i; + for (int j = i + 1; j < to; j++) + if (compare(a, b, c, j, m) < 0) + m = j; + if (m != i) { + long t = a[i]; + a[i] = a[m]; + a[m] = t; + t = b[i]; + b[i] = b[m]; + b[m] = t; + t = c[i]; + c[i] = c[m]; + c[m] = t; + } + } + } + + public static void selectionSort(final long[] x, final long[] y, final long[] z) { + selectionSort(x, y, z, 0, x.length); + } + + public static ForkJoinPool getPool() { + ForkJoinPool current = ForkJoinTask.getPool(); + return current == null ? ForkJoinPool.commonPool() : current; + } + + public static void parallelQuickSort(final long[] x, final long[] y, final long[] z) { + ensureSameLength(x, y); + ensureSameLength(x, z); + parallelQuickSort(x, y, z, 0, x.length); + } + + public static void parallelQuickSort(final long[] x, final long[] y, final long[] z, final int from, final int to) { + ForkJoinPool pool = getPool(); + if (to - from < PARALLEL_QUICKSORT_NO_FORK || pool.getParallelism() == 1) + quickSort(x, y, z, from, to); + else { + pool.invoke(new ForkJoinQuickSort3(x, y, z, from, to)); + } + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java index 0d672e2..71d6dab 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java @@ -1,46 +1,53 @@ +/* + * Copyright (c) 2020 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; import com.martiansoftware.jsap.*; import it.unimi.dsi.fastutil.objects.Object2LongFunction; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import org.softwareheritage.graph.maps.NodeIdMap; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; public class MPHTranslate { private static JSAPResult parse_args(String[] args) { JSAPResult config = null; try { SimpleJSAP jsap = new SimpleJSAP(MPHTranslate.class.getName(), "", new Parameter[]{new UnflaggedOption("function", JSAP.STRING_PARSER, JSAP.REQUIRED, "Filename of the serialized MPH"),}); config = jsap.parse(args); if (jsap.messagePrinted()) { System.exit(1); } } catch (JSAPException e) { e.printStackTrace(); } return config; } public static void main(String[] args) throws IOException, ClassNotFoundException { JSAPResult config = 
parse_args(args); String mphPath = config.getString("function"); Object2LongFunction<byte[]> mphMap = NodeIdMap.loadMph(mphPath); // TODO: wasteful to convert to/from bytes FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); LineIterator lineIterator = new LineIterator(buffer); while (lineIterator.hasNext()) { String line = lineIterator.next().toString(); System.out.println(mphMap.getLong(line.getBytes(StandardCharsets.US_ASCII))); } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java index 545dc8f..7daec23 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java @@ -1,27 +1,47 @@ +/* + * Copyright (c) 2020-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; -import it.unimi.dsi.big.webgraph.ImmutableGraph; import it.unimi.dsi.big.webgraph.NodeIterator; -import org.softwareheritage.graph.maps.NodeIdMap; +import it.unimi.dsi.logging.ProgressLogger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.SwhUnidirectionalGraph; import java.io.IOException; +import java.util.concurrent.TimeUnit; public class ReadGraph { + final static Logger logger = LoggerFactory.getLogger(ReadGraph.class); + public static void main(String[] args) throws IOException { String graphPath = args[0]; - ImmutableGraph graph = ImmutableGraph.load(graphPath); - NodeIdMap nodeMap = new NodeIdMap(graphPath, graph.numNodes()); + SwhUnidirectionalGraph graph; + ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS); + if (args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"))) { + graph = SwhUnidirectionalGraph.loadMapped(graphPath, pl); + } else { + graph = SwhUnidirectionalGraph.load(graphPath, pl); + } + pl.expectedUpdates = graph.numArcs(); + pl.start("Reading graph..."); NodeIterator it = graph.nodeIterator(); while (it.hasNext()) { long srcNode = it.nextLong(); var s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { - System.out.format("%s %s\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode)); + System.out.format("%s %s\n", graph.getSWHID(srcNode), graph.getSWHID(dstNode)); + pl.lightUpdate(); } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java index 4b04992..c8e0a9f 100644 --- a/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java +++ b/java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java @@ -1,40 +1,55 @@ +/* + * Copyright (c) 2020-2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph.utils; -import it.unimi.dsi.big.util.FrontCodedStringBigList; -import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph; import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator; -import 
it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph; -import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.logging.ProgressLogger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.softwareheritage.graph.SwhUnidirectionalGraph; import org.softwareheritage.graph.labels.DirEntry; -import org.softwareheritage.graph.maps.NodeIdMap; import java.io.IOException; +import java.util.concurrent.TimeUnit; public class ReadLabelledGraph { + final static Logger logger = LoggerFactory.getLogger(ReadLabelledGraph.class); + public static void main(String[] args) throws IOException, ClassNotFoundException { String graphPath = args[0]; - ArcLabelledImmutableGraph graph = BitStreamArcLabelledImmutableGraph.loadOffline(graphPath + "-labelled"); - NodeIdMap nodeMap = new NodeIdMap(graphPath, graph.numNodes()); - FrontCodedStringBigList filenameMap = (FrontCodedStringBigList) BinIO.loadObject(graphPath + "-labels.fcl"); + ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS); + SwhUnidirectionalGraph graph; + if (args.length > 1 && (args[1].equals("--mapped") || args[1].equals("-m"))) { + graph = SwhUnidirectionalGraph.loadLabelledMapped(graphPath, pl); + } else { + graph = SwhUnidirectionalGraph.loadLabelled(graphPath, pl); + } + + graph.properties.loadLabelNames(); - ArcLabelledNodeIterator it = graph.nodeIterator(); + ArcLabelledNodeIterator it = graph.labelledNodeIterator(); while (it.hasNext()) { long srcNode = it.nextLong(); ArcLabelledNodeIterator.LabelledArcIterator s = it.successors(); long dstNode; while ((dstNode = s.nextLong()) >= 0) { DirEntry[] labels = (DirEntry[]) s.label().get(); if (labels.length > 0) { for (DirEntry label : labels) { - System.out.format("%s %s %s %d\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode), - filenameMap.get(label.filenameId), label.permission); + System.out.format("%s %s %s %d\n", graph.getSWHID(srcNode), graph.getSWHID(dstNode), + new String(graph.properties.getLabelName(label.filenameId)), label.permission); } } else { - System.out.format("%s %s\n", nodeMap.getSWHID(srcNode), nodeMap.getSWHID(dstNode)); + System.out.format("%s %s\n", graph.getSWHID(srcNode), graph.getSWHID(dstNode)); } } } } diff --git a/java/src/main/java/org/softwareheritage/graph/utils/Sort.java b/java/src/main/java/org/softwareheritage/graph/utils/Sort.java new file mode 100644 index 0000000..9a69b94 --- /dev/null +++ b/java/src/main/java/org/softwareheritage/graph/utils/Sort.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.utils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +public class Sort { + public static Process spawnSort(String sortBufferSize, String sortTmpDir) throws IOException { + return spawnSort(sortBufferSize, sortTmpDir, null); + } + + public static Process spawnSort(String sortBufferSize, String sortTmpDir, List<String> options) throws IOException { + ProcessBuilder sortProcessBuilder = new ProcessBuilder(); + sortProcessBuilder.redirectError(ProcessBuilder.Redirect.INHERIT); + ArrayList<String> command = new ArrayList<>(List.of("sort", "-u", "--buffer-size", sortBufferSize)); + if (sortTmpDir != null) { + command.add("--temporary-directory"); + command.add(sortTmpDir); + } + if (options != 
null) { + command.addAll(options); + } + sortProcessBuilder.command(command); + Map env = sortProcessBuilder.environment(); + env.put("LC_ALL", "C"); + env.put("LC_COLLATE", "C"); + env.put("LANG", "C"); + + return sortProcessBuilder.start(); + } +} diff --git a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java b/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java deleted file mode 100644 index 7d35574..0000000 --- a/java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java +++ /dev/null @@ -1,53 +0,0 @@ -package org.softwareheritage.graph.utils; - -import it.unimi.dsi.fastutil.BigArrays; -import it.unimi.dsi.fastutil.Size64; -import it.unimi.dsi.fastutil.io.BinIO; -import it.unimi.dsi.fastutil.longs.LongBigArrays; -import it.unimi.dsi.fastutil.objects.Object2LongFunction; -import it.unimi.dsi.io.FastBufferedReader; -import it.unimi.dsi.io.LineIterator; -import org.softwareheritage.graph.maps.NodeIdMap; - -import java.io.IOException; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; - -public class WriteRevisionTimestamps { - public static void main(String[] args) throws IOException, ClassNotFoundException { - System.err.print("Loading everything..."); - String graphPath = args[0]; - String outputFile = args[1]; - Object2LongFunction mphMap = NodeIdMap.loadMph(graphPath + ".mph"); - long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size(); - long[][] nodePerm = BinIO.loadLongsBig(graphPath + ".order"); - // NodeIdMap nodeIdMap = new NodeIdMap(graphPath, nbIds); - long[][] timestampArray = LongBigArrays.newBigArray(nbIds); - BigArrays.fill(timestampArray, Long.MIN_VALUE); - System.err.println(" done."); - - // TODO: wasteful to convert to/from bytes - FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII)); - LineIterator lineIterator = new LineIterator(buffer); - - while (lineIterator.hasNext()) { - String line = lineIterator.next().toString(); - String[] line_elements = line.split("[ \\t]"); - - // SWHID currentRev = new SWHID(line_elements[0].strip()); - long revId = -1; - long timestamp = -1; - try { - // revId = nodeIdMap.getNodeId(currentRev); - long revHash = mphMap.getLong(line_elements[0].strip().getBytes(StandardCharsets.US_ASCII)); - revId = BigArrays.get(nodePerm, revHash); - timestamp = Long.parseLong(line_elements[1].strip()); - } catch (IllegalArgumentException e) { - continue; - } - BigArrays.set(timestampArray, revId, timestamp); - // System.err.println(revId + " " + timestamp); - } - BinIO.storeLongs(timestampArray, outputFile); - } -} diff --git a/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java b/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java index f91f6ed..6cd9b68 100644 --- a/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java +++ b/java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java @@ -1,113 +1,120 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.util.ArrayList; public class AllowedEdgesTest extends GraphTest { static class EdgeType { - Node.Type src; - Node.Type dst; + 
SwhType src; + SwhType dst; - public EdgeType(Node.Type src, Node.Type dst) { + public EdgeType(SwhType src, SwhType dst) { this.src = src; this.dst = dst; } @Override public boolean equals(Object otherObj) { if (otherObj == this) return true; if (!(otherObj instanceof EdgeType)) return false; EdgeType other = (EdgeType) otherObj; return src == other.src && dst == other.dst; } } void assertEdgeRestriction(AllowedEdges edges, ArrayList expectedAllowed) { - Node.Type[] nodeTypes = Node.Type.values(); - for (Node.Type src : nodeTypes) { - for (Node.Type dst : nodeTypes) { + SwhType[] nodeTypes = SwhType.values(); + for (SwhType src : nodeTypes) { + for (SwhType dst : nodeTypes) { EdgeType edge = new EdgeType(src, dst); boolean isAllowed = edges.isAllowed(src, dst); boolean isExpected = false; for (EdgeType expected : expectedAllowed) { if (expected.equals(edge)) { isExpected = true; break; } } Assertions.assertEquals(isAllowed, isExpected, "Edge type: " + src + " -> " + dst); } } } @Test public void dirToDirDirToCntEdges() { AllowedEdges edges = new AllowedEdges("dir:dir,dir:cnt"); ArrayList expected = new ArrayList<>(); - expected.add(new EdgeType(Node.Type.DIR, Node.Type.DIR)); - expected.add(new EdgeType(Node.Type.DIR, Node.Type.CNT)); + expected.add(new EdgeType(SwhType.DIR, SwhType.DIR)); + expected.add(new EdgeType(SwhType.DIR, SwhType.CNT)); assertEdgeRestriction(edges, expected); } @Test public void relToRevRevToRevRevToDirEdges() { AllowedEdges edges = new AllowedEdges("rel:rev,rev:rev,rev:dir"); ArrayList expected = new ArrayList<>(); - expected.add(new EdgeType(Node.Type.REL, Node.Type.REV)); - expected.add(new EdgeType(Node.Type.REV, Node.Type.REV)); - expected.add(new EdgeType(Node.Type.REV, Node.Type.DIR)); + expected.add(new EdgeType(SwhType.REL, SwhType.REV)); + expected.add(new EdgeType(SwhType.REV, SwhType.REV)); + expected.add(new EdgeType(SwhType.REV, SwhType.DIR)); assertEdgeRestriction(edges, expected); } @Test public void revToAllDirToDirEdges() { AllowedEdges edges = new AllowedEdges("rev:*,dir:dir"); ArrayList expected = new ArrayList<>(); - for (Node.Type dst : Node.Type.values()) { - expected.add(new EdgeType(Node.Type.REV, dst)); + for (SwhType dst : SwhType.values()) { + expected.add(new EdgeType(SwhType.REV, dst)); } - expected.add(new EdgeType(Node.Type.DIR, Node.Type.DIR)); + expected.add(new EdgeType(SwhType.DIR, SwhType.DIR)); assertEdgeRestriction(edges, expected); } @Test public void allToCntEdges() { AllowedEdges edges = new AllowedEdges("*:cnt"); ArrayList expected = new ArrayList<>(); - for (Node.Type src : Node.Type.values()) { - expected.add(new EdgeType(src, Node.Type.CNT)); + for (SwhType src : SwhType.values()) { + expected.add(new EdgeType(src, SwhType.CNT)); } assertEdgeRestriction(edges, expected); } @Test public void allEdges() { AllowedEdges edges = new AllowedEdges("*:*"); ArrayList expected = new ArrayList<>(); - for (Node.Type src : Node.Type.values()) { - for (Node.Type dst : Node.Type.values()) { + for (SwhType src : SwhType.values()) { + for (SwhType dst : SwhType.values()) { expected.add(new EdgeType(src, dst)); } } assertEdgeRestriction(edges, expected); // Special null value used to quickly bypass edge check when no restriction AllowedEdges edges2 = new AllowedEdges("*"); Assertions.assertNull(edges2.restrictedTo); } @Test public void noEdges() { AllowedEdges edges = new AllowedEdges(""); AllowedEdges edges2 = new AllowedEdges(null); ArrayList expected = new ArrayList<>(); assertEdgeRestriction(edges, expected); 
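/*
 * Both "" and null parse to "no edges allowed", so edges2 is checked against
 * the same empty list right below. For context, the check these tests exercise
 * can be thought of as a boolean matrix indexed by (src, dst) node types, with
 * a null matrix standing for the unrestricted "*" case covered by allEdges()
 * above. A minimal sketch, assuming such a matrix layout (not necessarily the
 * verbatim AllowedEdges code):
 *
 *   boolean isAllowed(SwhType src, SwhType dst) {
 *       if (restrictedTo == null)
 *           return true; // "*": bypass the lookup entirely
 *       return restrictedTo[src.ordinal() * SwhType.values().length + dst.ordinal()];
 *   }
 */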
assertEdgeRestriction(edges2, expected); } } diff --git a/java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java b/java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java new file mode 100644 index 0000000..4da3c59 --- /dev/null +++ b/java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.Set; + +public class AllowedNodesTest extends GraphTest { + void assertNodeRestriction(AllowedNodes nodes, Set expectedAllowed) { + SwhType[] nodeTypes = SwhType.values(); + for (SwhType t : nodeTypes) { + boolean isAllowed = nodes.isAllowed(t); + boolean isExpected = expectedAllowed.contains(t); + Assertions.assertEquals(isAllowed, isExpected, "Node type: " + t); + } + } + + @Test + public void dirCntNodes() { + AllowedNodes edges = new AllowedNodes("dir,cnt"); + Set expected = Set.of(SwhType.DIR, SwhType.CNT); + assertNodeRestriction(edges, expected); + } + + @Test + public void revDirNodes() { + AllowedNodes edges = new AllowedNodes("rev,dir"); + Set expected = Set.of(SwhType.DIR, SwhType.REV); + assertNodeRestriction(edges, expected); + } + + @Test + public void relSnpCntNodes() { + AllowedNodes edges = new AllowedNodes("rel,snp,cnt"); + Set expected = Set.of(SwhType.REL, SwhType.SNP, SwhType.CNT); + assertNodeRestriction(edges, expected); + } + + @Test + public void allNodes() { + AllowedNodes edges = new AllowedNodes("*"); + Set expected = Set.of(SwhType.REL, SwhType.SNP, SwhType.CNT, SwhType.DIR, SwhType.REV, SwhType.ORI); + assertNodeRestriction(edges, expected); + } + + @Test + public void noNodes() { + AllowedNodes edges = new AllowedNodes(""); + Set expected = Set.of(); + assertNodeRestriction(edges, expected); + } +} diff --git a/java/src/test/java/org/softwareheritage/graph/GraphTest.java b/java/src/test/java/org/softwareheritage/graph/GraphTest.java index fba8ed8..872784f 100644 --- a/java/src/test/java/org/softwareheritage/graph/GraphTest.java +++ b/java/src/test/java/org/softwareheritage/graph/GraphTest.java @@ -1,44 +1,67 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; +import java.io.FileInputStream; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collection; +import java.util.Comparator; import java.util.Iterator; +import com.github.luben.zstd.ZstdInputStream; import it.unimi.dsi.big.webgraph.LazyLongIterator; import it.unimi.dsi.big.webgraph.LazyLongIterators; -import org.hamcrest.MatcherAssert; import org.junit.jupiter.api.BeforeAll; -import static org.hamcrest.collection.IsIterableContainingInAnyOrder.containsInAnyOrder; +import static org.junit.Assert.assertEquals; public class GraphTest { - static Graph graph; + static SwhBidirectionalGraph graph; + + final protected String TEST_ORIGIN_ID = "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054"; @BeforeAll public static void setUp() throws IOException { - 
Path graphPath = Paths.get("..", "swh", "graph", "tests", "dataset", "output", "example"); - graph = Graph.loadMapped(graphPath.toString()); + graph = SwhBidirectionalGraph.loadLabelled(getGraphPath().toString()); + } + + public static Path getGraphPath() { + return Paths.get("..", "swh", "graph", "tests", "dataset", "compressed", "example"); } - public Graph getGraph() { + public static SwhBidirectionalGraph getGraph() { return graph; } public static SWHID fakeSWHID(String type, int num) { return new SWHID(String.format("swh:1:%s:%040d", type, num)); } - public static void assertEqualsAnyOrder(Collection expecteds, Collection actuals) { - MatcherAssert.assertThat(expecteds, containsInAnyOrder(actuals.toArray())); + public static void assertEqualsAnyOrder(Collection expected, Collection actual) { + ArrayList expectedList = new ArrayList<>(expected); + ArrayList actualList = new ArrayList<>(actual); + expectedList.sort(Comparator.comparing(Object::toString)); + actualList.sort(Comparator.comparing(Object::toString)); + assertEquals(expectedList, actualList); } public static ArrayList lazyLongIteratorToList(LazyLongIterator input) { ArrayList inputList = new ArrayList<>(); Iterator inputIt = LazyLongIterators.eager(input); inputIt.forEachRemaining(inputList::add); return inputList; } + + public static String[] readZstFile(Path zstFile) throws IOException { + ZstdInputStream zis = new ZstdInputStream(new FileInputStream(zstFile.toFile())); + return (new String(zis.readAllBytes())).split("\n"); + } } diff --git a/java/src/test/java/org/softwareheritage/graph/NeighborsTest.java b/java/src/test/java/org/softwareheritage/graph/NeighborsTest.java deleted file mode 100644 index cf41aa4..0000000 --- a/java/src/test/java/org/softwareheritage/graph/NeighborsTest.java +++ /dev/null @@ -1,141 +0,0 @@ -package org.softwareheritage.graph; - -import java.util.ArrayList; - -import org.junit.jupiter.api.Test; -import org.softwareheritage.graph.server.Endpoint; - -// Avoid warnings concerning Endpoint.Output.result manual cast -@SuppressWarnings("unchecked") -public class NeighborsTest extends GraphTest { - @Test - public void zeroNeighbor() { - Graph graph = getGraph(); - ArrayList expectedNodes = new ArrayList<>(); - - SWHID src1 = new SWHID("swh:1:ori:0000000000000000000000000000000000000021"); - Endpoint endpoint1 = new Endpoint(graph, "backward", "*"); - ArrayList actuals1 = (ArrayList) endpoint1.neighbors(new Endpoint.Input(src1)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes, actuals1); - - SWHID src2 = new SWHID("swh:1:cnt:0000000000000000000000000000000000000004"); - Endpoint endpoint2 = new Endpoint(graph, "forward", "*"); - ArrayList actuals2 = (ArrayList) endpoint2.neighbors(new Endpoint.Input(src2)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes, actuals2); - - SWHID src3 = new SWHID("swh:1:cnt:0000000000000000000000000000000000000015"); - Endpoint endpoint3 = new Endpoint(graph, "forward", "*"); - ArrayList actuals3 = (ArrayList) endpoint3.neighbors(new Endpoint.Input(src3)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes, actuals3); - - SWHID src4 = new SWHID("swh:1:rel:0000000000000000000000000000000000000019"); - Endpoint endpoint4 = new Endpoint(graph, "backward", "*"); - ArrayList actuals4 = (ArrayList) endpoint4.neighbors(new Endpoint.Input(src4)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes, actuals4); - - SWHID src5 = new SWHID("swh:1:dir:0000000000000000000000000000000000000008"); - Endpoint endpoint5 = new Endpoint(graph, "forward", 
"snp:*,rev:*,rel:*"); - ArrayList actuals5 = (ArrayList) endpoint5.neighbors(new Endpoint.Input(src5)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes, actuals5); - } - - @Test - public void oneNeighbor() { - Graph graph = getGraph(); - - SWHID src1 = new SWHID("swh:1:rev:0000000000000000000000000000000000000003"); - Endpoint endpoint1 = new Endpoint(graph, "forward", "*"); - ArrayList expectedNodes1 = new ArrayList<>(); - expectedNodes1.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000002")); - ArrayList actuals1 = (ArrayList) endpoint1.neighbors(new Endpoint.Input(src1)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1); - - SWHID src2 = new SWHID("swh:1:dir:0000000000000000000000000000000000000017"); - Endpoint endpoint2 = new Endpoint(graph, "forward", "dir:cnt"); - ArrayList expectedNodes2 = new ArrayList<>(); - expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000014")); - ArrayList actuals2 = (ArrayList) endpoint2.neighbors(new Endpoint.Input(src2)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2); - - SWHID src3 = new SWHID("swh:1:dir:0000000000000000000000000000000000000012"); - Endpoint endpoint3 = new Endpoint(graph, "backward", "*"); - ArrayList expectedNodes3 = new ArrayList<>(); - expectedNodes3.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013")); - ArrayList actuals3 = (ArrayList) endpoint3.neighbors(new Endpoint.Input(src3)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes3, actuals3); - - SWHID src4 = new SWHID("swh:1:rev:0000000000000000000000000000000000000009"); - Endpoint endpoint4 = new Endpoint(graph, "backward", "rev:rev"); - ArrayList expectedNodes4 = new ArrayList<>(); - expectedNodes4.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013")); - ArrayList actuals4 = (ArrayList) endpoint4.neighbors(new Endpoint.Input(src4)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes4, actuals4); - - SWHID src5 = new SWHID("swh:1:snp:0000000000000000000000000000000000000020"); - Endpoint endpoint5 = new Endpoint(graph, "backward", "*"); - ArrayList expectedNodes5 = new ArrayList<>(); - expectedNodes5.add(new SWHID("swh:1:ori:0000000000000000000000000000000000000021")); - ArrayList actuals5 = (ArrayList) endpoint5.neighbors(new Endpoint.Input(src5)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes5, actuals5); - } - - @Test - public void twoNeighbors() { - Graph graph = getGraph(); - - SWHID src1 = new SWHID("swh:1:snp:0000000000000000000000000000000000000020"); - Endpoint endpoint1 = new Endpoint(graph, "forward", "*"); - ArrayList expectedNodes1 = new ArrayList<>(); - expectedNodes1.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010")); - expectedNodes1.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000009")); - ArrayList actuals1 = (ArrayList) endpoint1.neighbors(new Endpoint.Input(src1)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1); - - SWHID src2 = new SWHID("swh:1:dir:0000000000000000000000000000000000000008"); - Endpoint endpoint2 = new Endpoint(graph, "forward", "dir:cnt"); - ArrayList expectedNodes2 = new ArrayList<>(); - expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001")); - expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007")); - ArrayList actuals2 = (ArrayList) endpoint2.neighbors(new Endpoint.Input(src2)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2); - - SWHID src3 = new 
SWHID("swh:1:cnt:0000000000000000000000000000000000000001"); - Endpoint endpoint3 = new Endpoint(graph, "backward", "*"); - ArrayList expectedNodes3 = new ArrayList<>(); - expectedNodes3.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000008")); - expectedNodes3.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000002")); - ArrayList actuals3 = (ArrayList) endpoint3.neighbors(new Endpoint.Input(src3)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes3, actuals3); - - SWHID src4 = new SWHID("swh:1:rev:0000000000000000000000000000000000000009"); - Endpoint endpoint4 = new Endpoint(graph, "backward", "rev:snp,rev:rel"); - ArrayList expectedNodes4 = new ArrayList<>(); - expectedNodes4.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020")); - expectedNodes4.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010")); - ArrayList actuals4 = (ArrayList) endpoint4.neighbors(new Endpoint.Input(src4)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes4, actuals4); - } - - @Test - public void threeNeighbors() { - Graph graph = getGraph(); - - SWHID src1 = new SWHID("swh:1:dir:0000000000000000000000000000000000000008"); - Endpoint endpoint1 = new Endpoint(graph, "forward", "*"); - ArrayList expectedNodes1 = new ArrayList<>(); - expectedNodes1.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000006")); - expectedNodes1.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001")); - expectedNodes1.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007")); - ArrayList actuals1 = (ArrayList) endpoint1.neighbors(new Endpoint.Input(src1)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1); - - SWHID src2 = new SWHID("swh:1:rev:0000000000000000000000000000000000000009"); - Endpoint endpoint2 = new Endpoint(graph, "backward", "*"); - ArrayList expectedNodes2 = new ArrayList<>(); - expectedNodes2.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020")); - expectedNodes2.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010")); - expectedNodes2.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013")); - ArrayList actuals2 = (ArrayList) endpoint2.neighbors(new Endpoint.Input(src2)).result; - GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2); - } -} diff --git a/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java b/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java index 1f95ebe..cce1a45 100644 --- a/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java +++ b/java/src/test/java/org/softwareheritage/graph/SubgraphTest.java @@ -1,85 +1,92 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + package org.softwareheritage.graph; import java.util.*; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; public class SubgraphTest extends GraphTest { @Test public void noFilter() { - Graph g = getGraph(); + SwhBidirectionalGraph g = getGraph(); Subgraph sg = new Subgraph(g, new AllowedNodes("*")); for (long i = 0; i < g.numNodes(); ++i) { Assertions.assertEquals(g.outdegree(i), sg.outdegree(i)); } } @Test public void missingNode() { - Graph g = getGraph(); + SwhBidirectionalGraph g = getGraph(); Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori")); SWHID rev1 = fakeSWHID("rev", 18); 
Assertions.assertThrows(IllegalArgumentException.class, () -> { sg.outdegree(sg.getNodeId(rev1)); }); Assertions.assertThrows(IllegalArgumentException.class, () -> { sg.successors(sg.getNodeId(rev1)); }); } @Test public void outdegreeOnlyDirOri() { - Graph g = getGraph(); + SwhBidirectionalGraph g = getGraph(); Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori")); SWHID dir1 = fakeSWHID("dir", 17); Assertions.assertEquals(2, g.outdegree(g.getNodeId(dir1))); Assertions.assertEquals(1, sg.outdegree(sg.getNodeId(dir1))); SWHID dir2 = fakeSWHID("dir", 6); Assertions.assertEquals(2, g.outdegree(g.getNodeId(dir2))); Assertions.assertEquals(0, sg.outdegree(sg.getNodeId(dir2))); - SWHID ori1 = fakeSWHID("ori", 21); + SWHID ori1 = new SWHID(TEST_ORIGIN_ID); Assertions.assertEquals(1, g.outdegree(g.getNodeId(ori1))); Assertions.assertEquals(0, sg.outdegree(sg.getNodeId(ori1))); } @Test public void successorsOnlyDirOri() { - Graph g = getGraph(); + SwhBidirectionalGraph g = getGraph(); Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori")); SWHID dir1 = fakeSWHID("dir", 17); assertEqualsAnyOrder(Collections.singletonList(sg.getNodeId(fakeSWHID("dir", 16))), lazyLongIteratorToList(sg.successors(sg.getNodeId(dir1)))); SWHID dir2 = fakeSWHID("dir", 6); assertEqualsAnyOrder(Collections.emptyList(), lazyLongIteratorToList(sg.successors(sg.getNodeId(dir2)))); - SWHID ori1 = fakeSWHID("ori", 21); + SWHID ori1 = new SWHID(TEST_ORIGIN_ID); assertEqualsAnyOrder(Collections.emptyList(), lazyLongIteratorToList(sg.successors(sg.getNodeId(ori1)))); } @Test public void nodeIteratorOnlyOriDir() { - Graph g = getGraph(); + SwhBidirectionalGraph g = getGraph(); Subgraph sg = new Subgraph(g, new AllowedNodes("dir,ori")); ArrayList nodeList = new ArrayList<>(); Iterator nodeIt = sg.nodeIterator(); nodeIt.forEachRemaining(nodeList::add); - assertEqualsAnyOrder(Arrays.asList(sg.getNodeId(fakeSWHID("ori", 21)), sg.getNodeId(fakeSWHID("dir", 2)), + assertEqualsAnyOrder(Arrays.asList(sg.getNodeId(new SWHID(TEST_ORIGIN_ID)), sg.getNodeId(fakeSWHID("dir", 2)), sg.getNodeId(fakeSWHID("dir", 6)), sg.getNodeId(fakeSWHID("dir", 8)), sg.getNodeId(fakeSWHID("dir", 12)), sg.getNodeId(fakeSWHID("dir", 16)), sg.getNodeId(fakeSWHID("dir", 17))), nodeList); sg = new Subgraph(g, new AllowedNodes("snp,rel")); nodeList = new ArrayList<>(); nodeIt = sg.nodeIterator(); nodeIt.forEachRemaining(nodeList::add); assertEqualsAnyOrder(Arrays.asList(sg.getNodeId(fakeSWHID("snp", 20)), sg.getNodeId(fakeSWHID("rel", 10)), sg.getNodeId(fakeSWHID("rel", 19))), nodeList); } } diff --git a/java/src/test/java/org/softwareheritage/graph/VisitTest.java b/java/src/test/java/org/softwareheritage/graph/VisitTest.java deleted file mode 100644 index de5e8af..0000000 --- a/java/src/test/java/org/softwareheritage/graph/VisitTest.java +++ /dev/null @@ -1,420 +0,0 @@ -package org.softwareheritage.graph; - -import java.util.ArrayList; -import java.util.Set; -import java.util.HashSet; - -import org.junit.jupiter.api.Test; -import org.softwareheritage.graph.server.Endpoint; - -// Avoid warnings concerning Endpoint.Output.result manual cast -@SuppressWarnings("unchecked") -public class VisitTest extends GraphTest { - private void assertSameNodesFromPaths(ArrayList paths, ArrayList nodes) { - Set expectedNodes = new HashSet(); - for (SwhPath path : paths) { - expectedNodes.addAll(path.getPath()); - } - GraphTest.assertEqualsAnyOrder(expectedNodes, nodes); - } - - @Test - public void forwardFromRoot() { - Graph graph = getGraph(); - SWHID swhid = new 
SWHID("swh:1:ori:0000000000000000000000000000000000000021"); - Endpoint endpoint1 = new Endpoint(graph, "forward", "*"); - ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", "*"); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new ArrayList(); - expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000007")); - expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000001")); - expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000004")); - expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000005")); - expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rev:0000000000000000000000000000000000000003", - "swh:1:dir:0000000000000000000000000000000000000002", - "swh:1:cnt:0000000000000000000000000000000000000001")); - expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rel:0000000000000000000000000000000000000010", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000007")); - expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rel:0000000000000000000000000000000000000010", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000001")); - expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rel:0000000000000000000000000000000000000010", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000004")); - expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rel:0000000000000000000000000000000000000010", - "swh:1:rev:0000000000000000000000000000000000000009", - 
"swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000005")); - expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rel:0000000000000000000000000000000000000010", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rev:0000000000000000000000000000000000000003", - "swh:1:dir:0000000000000000000000000000000000000002", - "swh:1:cnt:0000000000000000000000000000000000000001")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void forwardFromMiddle() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:dir:0000000000000000000000000000000000000012"); - Endpoint endpoint1 = new Endpoint(graph, "forward", "*"); - ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", "*"); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new ArrayList(); - expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000007")); - expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000001")); - expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000004")); - expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000005")); - expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:cnt:0000000000000000000000000000000000000011")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void forwardFromLeaf() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:cnt:0000000000000000000000000000000000000004"); - Endpoint endpoint1 = new Endpoint(graph, "forward", "*"); - ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", "*"); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new ArrayList(); - expectedPaths.add(new SwhPath("swh:1:cnt:0000000000000000000000000000000000000004")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void backwardFromRoot() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:ori:0000000000000000000000000000000000000021"); - Endpoint endpoint1 = new Endpoint(graph, "backward", "*"); - ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "backward", "*"); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new 
ArrayList(); - expectedPaths.add(new SwhPath("swh:1:ori:0000000000000000000000000000000000000021")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void backwardFromMiddle() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:dir:0000000000000000000000000000000000000012"); - Endpoint endpoint1 = new Endpoint(graph, "backward", "*"); - ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "backward", "*"); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new ArrayList(); - expectedPaths.add(new SwhPath("swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:rev:0000000000000000000000000000000000000018", - "swh:1:rel:0000000000000000000000000000000000000019")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void backwardFromLeaf() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:cnt:0000000000000000000000000000000000000004"); - Endpoint endpoint1 = new Endpoint(graph, "backward", "*"); - ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "backward", "*"); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new ArrayList(); - expectedPaths.add(new SwhPath("swh:1:cnt:0000000000000000000000000000000000000004", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:rev:0000000000000000000000000000000000000018", - "swh:1:rel:0000000000000000000000000000000000000019")); - expectedPaths.add(new SwhPath("swh:1:cnt:0000000000000000000000000000000000000004", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:rev:0000000000000000000000000000000000000018", - "swh:1:rel:0000000000000000000000000000000000000019")); - expectedPaths.add(new SwhPath("swh:1:cnt:0000000000000000000000000000000000000004", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:ori:0000000000000000000000000000000000000021")); - expectedPaths.add(new SwhPath("swh:1:cnt:0000000000000000000000000000000000000004", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rel:0000000000000000000000000000000000000010", - "swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:ori:0000000000000000000000000000000000000021")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void forwardSnpToRev() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:snp:0000000000000000000000000000000000000020"); - Endpoint endpoint1 = new Endpoint(graph, "forward", "snp:rev"); - 
ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", "snp:rev"); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new ArrayList(); - expectedPaths.add(new SwhPath("swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rev:0000000000000000000000000000000000000009")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void forwardRelToRevRevToRev() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:rel:0000000000000000000000000000000000000010"); - Endpoint endpoint1 = new Endpoint(graph, "forward", "rel:rev,rev:rev"); - ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", "rel:rev,rev:rev"); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new ArrayList(); - expectedPaths.add(new SwhPath("swh:1:rel:0000000000000000000000000000000000000010", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rev:0000000000000000000000000000000000000003")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void forwardRevToAllDirToAll() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:rev:0000000000000000000000000000000000000013"); - Endpoint endpoint1 = new Endpoint(graph, "forward", "rev:*,dir:*"); - ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", "rev:*,dir:*"); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new ArrayList(); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000005")); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000005")); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000004")); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000004")); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000007")); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013", - 
"swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000007")); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:cnt:0000000000000000000000000000000000000011")); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rev:0000000000000000000000000000000000000003", - "swh:1:dir:0000000000000000000000000000000000000002", - "swh:1:cnt:0000000000000000000000000000000000000001")); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000001")); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:dir:0000000000000000000000000000000000000012", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000001")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void forwardSnpToAllRevToAll() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:snp:0000000000000000000000000000000000000020"); - Endpoint endpoint1 = new Endpoint(graph, "forward", "snp:*,rev:*"); - ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", "snp:*,rev:*"); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new ArrayList(); - expectedPaths.add(new SwhPath("swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rev:0000000000000000000000000000000000000003", - "swh:1:dir:0000000000000000000000000000000000000002")); - expectedPaths.add(new SwhPath("swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008")); - expectedPaths.add(new SwhPath("swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rel:0000000000000000000000000000000000000010")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void forwardNoEdges() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:snp:0000000000000000000000000000000000000020"); - Endpoint endpoint1 = new Endpoint(graph, "forward", ""); - ArrayList paths = (ArrayList) endpoint1.visitPaths(new Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", ""); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new ArrayList(); - expectedPaths.add(new SwhPath("swh:1:snp:0000000000000000000000000000000000000020")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void backwardRevToRevRevToRel() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:rev:0000000000000000000000000000000000000003"); - Endpoint endpoint1 = new Endpoint(graph, "backward", "rev:rev,rev:rel"); - ArrayList paths = (ArrayList) endpoint1.visitPaths(new 
Endpoint.Input(swhid)).result; - Endpoint endpoint2 = new Endpoint(graph, "backward", "rev:rev,rev:rel"); - ArrayList nodes = (ArrayList) endpoint2.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedPaths = new ArrayList(); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000003", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:rev:0000000000000000000000000000000000000018", - "swh:1:rel:0000000000000000000000000000000000000019")); - expectedPaths.add(new SwhPath("swh:1:rev:0000000000000000000000000000000000000003", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rel:0000000000000000000000000000000000000010")); - - GraphTest.assertEqualsAnyOrder(expectedPaths, paths); - assertSameNodesFromPaths(expectedPaths, nodes); - } - - @Test - public void forwardFromRootNodesOnly() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:ori:0000000000000000000000000000000000000021"); - Endpoint endpoint = new Endpoint(graph, "forward", "*"); - ArrayList nodes = (ArrayList) endpoint.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedNodes = new ArrayList(); - expectedNodes.add(new SWHID("swh:1:ori:0000000000000000000000000000000000000021")); - expectedNodes.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020")); - expectedNodes.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010")); - expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000009")); - expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000003")); - expectedNodes.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000002")); - expectedNodes.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001")); - expectedNodes.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000008")); - expectedNodes.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000006")); - expectedNodes.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000004")); - expectedNodes.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000005")); - expectedNodes.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007")); - - GraphTest.assertEqualsAnyOrder(expectedNodes, nodes); - } - - @Test - public void backwardRevToAllNodesOnly() { - Graph graph = getGraph(); - SWHID swhid = new SWHID("swh:1:rev:0000000000000000000000000000000000000003"); - Endpoint endpoint = new Endpoint(graph, "backward", "rev:*"); - ArrayList nodes = (ArrayList) endpoint.visitNodes(new Endpoint.Input(swhid)).result; - - ArrayList expectedNodes = new ArrayList(); - expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000003")); - expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000009")); - expectedNodes.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020")); - expectedNodes.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010")); - expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013")); - expectedNodes.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000018")); - expectedNodes.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000019")); - - GraphTest.assertEqualsAnyOrder(expectedNodes, nodes); - } -} diff --git a/java/src/test/java/org/softwareheritage/graph/WalkTest.java b/java/src/test/java/org/softwareheritage/graph/WalkTest.java deleted file mode 100644 index 8ddd0f9..0000000 --- 
a/java/src/test/java/org/softwareheritage/graph/WalkTest.java +++ /dev/null @@ -1,187 +0,0 @@ -package org.softwareheritage.graph; - -import java.util.Arrays; -import java.util.List; - -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; -import org.softwareheritage.graph.server.Endpoint; - -public class WalkTest extends GraphTest { - @Test - public void forwardRootToLeaf() { - Graph graph = getGraph(); - SWHID src = new SWHID("swh:1:snp:0000000000000000000000000000000000000020"); - String dstFmt = "swh:1:cnt:0000000000000000000000000000000000000005"; - - SwhPath solution1 = new SwhPath("swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000005"); - SwhPath solution2 = new SwhPath("swh:1:snp:0000000000000000000000000000000000000020", - "swh:1:rel:0000000000000000000000000000000000000010", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000005"); - - Endpoint endpoint1 = new Endpoint(graph, "forward", "*"); - SwhPath dfsPath = (SwhPath) endpoint1.walk(new Endpoint.Input(src, dstFmt, "dfs")).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", "*"); - SwhPath bfsPath = (SwhPath) endpoint2.walk(new Endpoint.Input(src, dstFmt, "bfs")).result; - - List possibleSolutions = Arrays.asList(solution1, solution2); - Assertions.assertTrue(possibleSolutions.contains(dfsPath)); - Assertions.assertTrue(possibleSolutions.contains(bfsPath)); - } - - @Test - public void forwardLeafToLeaf() { - Graph graph = getGraph(); - SWHID src = new SWHID("swh:1:cnt:0000000000000000000000000000000000000007"); - String dstFmt = "cnt"; - - SwhPath expectedPath = new SwhPath("swh:1:cnt:0000000000000000000000000000000000000007"); - - Endpoint endpoint1 = new Endpoint(graph, "forward", "*"); - SwhPath dfsPath = (SwhPath) endpoint1.walk(new Endpoint.Input(src, dstFmt, "dfs")).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", "*"); - SwhPath bfsPath = (SwhPath) endpoint2.walk(new Endpoint.Input(src, dstFmt, "bfs")).result; - - Assertions.assertEquals(dfsPath, expectedPath); - Assertions.assertEquals(bfsPath, expectedPath); - } - - @Test - public void forwardRevToRev() { - Graph graph = getGraph(); - SWHID src = new SWHID("swh:1:rev:0000000000000000000000000000000000000018"); - String dstFmt = "swh:1:rev:0000000000000000000000000000000000000003"; - - SwhPath expectedPath = new SwhPath("swh:1:rev:0000000000000000000000000000000000000018", - "swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rev:0000000000000000000000000000000000000003"); - - Endpoint endpoint1 = new Endpoint(graph, "forward", "rev:rev"); - SwhPath dfsPath = (SwhPath) endpoint1.walk(new Endpoint.Input(src, dstFmt, "dfs")).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", "rev:rev"); - SwhPath bfsPath = (SwhPath) endpoint2.walk(new Endpoint.Input(src, dstFmt, "bfs")).result; - - Assertions.assertEquals(dfsPath, expectedPath); - Assertions.assertEquals(bfsPath, expectedPath); - } - - @Test - public void backwardRevToRev() { - Graph graph = getGraph(); - SWHID src = new SWHID("swh:1:rev:0000000000000000000000000000000000000003"); - String dstFmt = 
"swh:1:rev:0000000000000000000000000000000000000018"; - - SwhPath expectedPath = new SwhPath("swh:1:rev:0000000000000000000000000000000000000003", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rev:0000000000000000000000000000000000000013", - "swh:1:rev:0000000000000000000000000000000000000018"); - - Endpoint endpoint1 = new Endpoint(graph, "backward", "rev:rev"); - SwhPath dfsPath = (SwhPath) endpoint1.walk(new Endpoint.Input(src, dstFmt, "dfs")).result; - Endpoint endpoint2 = new Endpoint(graph, "backward", "rev:rev"); - SwhPath bfsPath = (SwhPath) endpoint2.walk(new Endpoint.Input(src, dstFmt, "bfs")).result; - - Assertions.assertEquals(dfsPath, expectedPath); - Assertions.assertEquals(bfsPath, expectedPath); - } - - @Test - public void backwardCntToFirstSnp() { - Graph graph = getGraph(); - SWHID src = new SWHID("swh:1:cnt:0000000000000000000000000000000000000001"); - String dstFmt = "snp"; - - SwhPath solution1 = new SwhPath("swh:1:cnt:0000000000000000000000000000000000000001", - "swh:1:dir:0000000000000000000000000000000000000002", - "swh:1:rev:0000000000000000000000000000000000000003", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:snp:0000000000000000000000000000000000000020"); - SwhPath solution2 = new SwhPath("swh:1:cnt:0000000000000000000000000000000000000001", - "swh:1:dir:0000000000000000000000000000000000000002", - "swh:1:rev:0000000000000000000000000000000000000003", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rel:0000000000000000000000000000000000000010", - "swh:1:snp:0000000000000000000000000000000000000020"); - SwhPath solution3 = new SwhPath("swh:1:cnt:0000000000000000000000000000000000000001", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:snp:0000000000000000000000000000000000000020"); - SwhPath solution4 = new SwhPath("swh:1:cnt:0000000000000000000000000000000000000001", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rel:0000000000000000000000000000000000000010", - "swh:1:snp:0000000000000000000000000000000000000020"); - - Endpoint endpoint1 = new Endpoint(graph, "backward", "*"); - SwhPath dfsPath = (SwhPath) endpoint1.walk(new Endpoint.Input(src, dstFmt, "dfs")).result; - Endpoint endpoint2 = new Endpoint(graph, "backward", "*"); - SwhPath bfsPath = (SwhPath) endpoint2.walk(new Endpoint.Input(src, dstFmt, "bfs")).result; - - List possibleSolutions = Arrays.asList(solution1, solution2, solution3, solution4); - Assertions.assertTrue(possibleSolutions.contains(dfsPath)); - Assertions.assertTrue(possibleSolutions.contains(bfsPath)); - } - - @Test - public void forwardRevToFirstCnt() { - Graph graph = getGraph(); - SWHID src = new SWHID("swh:1:rev:0000000000000000000000000000000000000009"); - String dstFmt = "cnt"; - - SwhPath solution1 = new SwhPath("swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000007"); - SwhPath solution2 = new SwhPath("swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - "swh:1:cnt:0000000000000000000000000000000000000005"); - SwhPath solution3 = new SwhPath("swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:dir:0000000000000000000000000000000000000006", - 
"swh:1:cnt:0000000000000000000000000000000000000004"); - SwhPath solution4 = new SwhPath("swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:dir:0000000000000000000000000000000000000008", - "swh:1:cnt:0000000000000000000000000000000000000001"); - SwhPath solution5 = new SwhPath("swh:1:rev:0000000000000000000000000000000000000009", - "swh:1:rev:0000000000000000000000000000000000000003", - "swh:1:dir:0000000000000000000000000000000000000002", - "swh:1:cnt:0000000000000000000000000000000000000001"); - - Endpoint endpoint1 = new Endpoint(graph, "forward", "rev:*,dir:*"); - SwhPath dfsPath = (SwhPath) endpoint1.walk(new Endpoint.Input(src, dstFmt, "dfs")).result; - Endpoint endpoint2 = new Endpoint(graph, "forward", "rev:*,dir:*"); - SwhPath bfsPath = (SwhPath) endpoint2.walk(new Endpoint.Input(src, dstFmt, "bfs")).result; - - List possibleSolutions = Arrays.asList(solution1, solution2, solution3, solution4, solution5); - Assertions.assertTrue(possibleSolutions.contains(dfsPath)); - Assertions.assertTrue(possibleSolutions.contains(bfsPath)); - } - - @Test - public void backwardDirToFirstRel() { - Graph graph = getGraph(); - SWHID src = new SWHID("swh:1:dir:0000000000000000000000000000000000000016"); - String dstFmt = "rel"; - - SwhPath expectedPath = new SwhPath("swh:1:dir:0000000000000000000000000000000000000016", - "swh:1:dir:0000000000000000000000000000000000000017", - "swh:1:rev:0000000000000000000000000000000000000018", - "swh:1:rel:0000000000000000000000000000000000000019"); - - Endpoint endpoint1 = new Endpoint(graph, "backward", "dir:dir,dir:rev,rev:*"); - SwhPath dfsPath = (SwhPath) endpoint1.walk(new Endpoint.Input(src, dstFmt, "dfs")).result; - Endpoint endpoint2 = new Endpoint(graph, "backward", "dir:dir,dir:rev,rev:*"); - SwhPath bfsPath = (SwhPath) endpoint2.walk(new Endpoint.Input(src, dstFmt, "bfs")).result; - - Assertions.assertEquals(dfsPath, expectedPath); - Assertions.assertEquals(bfsPath, expectedPath); - } -} diff --git a/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java b/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java new file mode 100644 index 0000000..9338a2f --- /dev/null +++ b/java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; + +import org.apache.commons.codec.digest.DigestUtils; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.softwareheritage.graph.GraphTest; +import org.softwareheritage.graph.SwhType; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.TreeSet; + +public class ExtractNodesTest extends GraphTest { + /** Generate a fake SWHID for a given node type and numeric ID */ + private static byte[] f(String type, int id) { + String hash = new String(DigestUtils.sha1Hex(type + id).getBytes()); + return String.format("swh:1:%s:%s", type, hash).getBytes(); + } + + static class FakeDataset implements GraphDataset { + @Override + public void readEdges(NodeCallback nodeCb, EdgeCallback edgeCb) throws IOException { + // For each node type, write nodes {1..4} as present in the graph + for 
(SwhType type : SwhType.values()) { + for (int i = 1; i <= 4; i++) { + byte[] node = f(type.toString().toLowerCase(), i); + nodeCb.onNode(node); + } + } + + edgeCb.onEdge(f("ori", 1), f("snp", 1), null, -1); + edgeCb.onEdge(f("ori", 2), f("snp", 2), null, -1); + edgeCb.onEdge(f("ori", 3), f("snp", 3), null, -1); + edgeCb.onEdge(f("ori", 4), f("snp", 404), null, -1); + + edgeCb.onEdge(f("snp", 1), f("rev", 1), "dup1".getBytes(), -1); + edgeCb.onEdge(f("snp", 1), f("rev", 1), "dup2".getBytes(), -1); + edgeCb.onEdge(f("snp", 3), f("cnt", 1), "c1".getBytes(), -1); + edgeCb.onEdge(f("snp", 4), f("rel", 1), "r1".getBytes(), -1); + + edgeCb.onEdge(f("rel", 1), f("rel", 2), null, -1); + edgeCb.onEdge(f("rel", 2), f("rev", 1), null, -1); + edgeCb.onEdge(f("rel", 3), f("rev", 2), null, -1); + edgeCb.onEdge(f("rel", 4), f("dir", 1), null, -1); + + edgeCb.onEdge(f("rev", 1), f("rev", 1), null, -1); + edgeCb.onEdge(f("rev", 1), f("rev", 1), null, -1); + edgeCb.onEdge(f("rev", 1), f("rev", 2), null, -1); + edgeCb.onEdge(f("rev", 2), f("rev", 404), null, -1); + edgeCb.onEdge(f("rev", 3), f("rev", 2), null, -1); + edgeCb.onEdge(f("rev", 4), f("dir", 1), null, -1); + + edgeCb.onEdge(f("dir", 1), f("cnt", 1), "c1".getBytes(), 42); + edgeCb.onEdge(f("dir", 1), f("dir", 1), "d1".getBytes(), 1337); + edgeCb.onEdge(f("dir", 1), f("rev", 1), "r1".getBytes(), 0); + } + } + + @Test + public void testExtractNodes(@TempDir Path outputDir, @TempDir Path sortTmpDir) + throws IOException, InterruptedException { + FakeDataset dataset = new FakeDataset(); + ExtractNodes.extractNodes(dataset, outputDir.toString() + "/graph", "2M", sortTmpDir.toFile()); + + // Check count files + Long nodeCount = Long.parseLong(Files.readString(outputDir.resolve("graph.nodes.count.txt")).strip()); + Long edgeCount = Long.parseLong(Files.readString(outputDir.resolve("graph.edges.count.txt")).strip()); + Long labelCount = Long.parseLong(Files.readString(outputDir.resolve("graph.labels.count.txt")).strip()); + Assertions.assertEquals(26L, nodeCount); + Assertions.assertEquals(21L, edgeCount); + Assertions.assertEquals(5L, labelCount); + + // Check stat files + List nodeStats = Files.readAllLines(outputDir.resolve("graph.nodes.stats.txt")); + List edgeStats = Files.readAllLines(outputDir.resolve("graph.edges.stats.txt")); + Assertions.assertEquals(nodeStats, List.of("cnt 4", "dir 4", "ori 4", "rel 4", "rev 5", "snp 5")); + Assertions.assertEquals(edgeStats, List.of("dir:cnt 1", "dir:dir 1", "dir:rev 1", "ori:snp 4", "rel:dir 1", + "rel:rel 1", "rel:rev 2", "rev:dir 1", "rev:rev 5", "snp:cnt 1", "snp:rel 1", "snp:rev 2")); + + // Build ordered set of expected node IDs + TreeSet expectedNodes = new TreeSet<>(); + for (SwhType type : SwhType.values()) { + for (int i = 1; i <= 4; i++) { + byte[] node = f(type.toString().toLowerCase(), i); + expectedNodes.add(new String(node)); + } + } + expectedNodes.add(new String(f("snp", 404))); + expectedNodes.add(new String(f("rev", 404))); + String[] nodeLines = readZstFile(outputDir.resolve("graph.nodes.csv.zst")); + Assertions.assertArrayEquals(expectedNodes.toArray(new String[0]), nodeLines); + + // Build ordered set of expected label IDs + TreeSet expectedLabels = new TreeSet<>(); + expectedLabels.add("dup1"); + expectedLabels.add("dup2"); + expectedLabels.add("c1"); + expectedLabels.add("r1"); + expectedLabels.add("d1"); + String[] labelLines = readZstFile(outputDir.resolve("graph.labels.csv.zst")); + Assertions.assertArrayEquals(expectedLabels.toArray(new String[0]), labelLines); + } +} diff --git 
a/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java b/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java new file mode 100644 index 0000000..142d849 --- /dev/null +++ b/java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.compress; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.softwareheritage.graph.GraphTest; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; + +public class ExtractPersonsTest extends GraphTest { + private static class FakeORCDataset extends ORCGraphDataset { + private static class FakeSwhOrcTable extends ORCGraphDataset.SwhOrcTable { + private final String tableName; + + public FakeSwhOrcTable(String tableName) { + this.tableName = tableName; + } + + @Override + public void readBytes64Column(String longColumn, BytesCallback cb) throws IOException { + if (tableName.equals("revision") && longColumn.equals("author")) { + cb.onBytes(fakeSWHID("rev", 1).toBytes(), "rev_author_1".getBytes()); + cb.onBytes(fakeSWHID("rev", 2).toBytes(), "rev_author_1".getBytes()); + cb.onBytes(fakeSWHID("rev", 3).toBytes(), "rev_author_2".getBytes()); + cb.onBytes(fakeSWHID("rev", 4).toBytes(), "rev_author_1".getBytes()); + cb.onBytes(fakeSWHID("rev", 5).toBytes(), "rev_author_3".getBytes()); + } else if (tableName.equals("revision") && longColumn.equals("committer")) { + cb.onBytes(fakeSWHID("rev", 1).toBytes(), "rev_committer_1".getBytes()); + cb.onBytes(fakeSWHID("rev", 2).toBytes(), "rev_committer_1".getBytes()); + cb.onBytes(fakeSWHID("rev", 3).toBytes(), "rev_committer_2".getBytes()); + cb.onBytes(fakeSWHID("rev", 4).toBytes(), "rev_author_2".getBytes()); + cb.onBytes(fakeSWHID("rev", 5).toBytes(), "rev_author_1".getBytes()); + cb.onBytes(fakeSWHID("rev", 6).toBytes(), "rev_committer_1".getBytes()); + } else if (tableName.equals("release") && longColumn.equals("author")) { + cb.onBytes(fakeSWHID("rel", 1).toBytes(), "rel_committer_1".getBytes()); + cb.onBytes(fakeSWHID("rel", 2).toBytes(), "rel_committer_1".getBytes()); + cb.onBytes(fakeSWHID("rel", 3).toBytes(), "rel_committer_2".getBytes()); + cb.onBytes(fakeSWHID("rel", 4).toBytes(), "rev_author_2".getBytes()); + cb.onBytes(fakeSWHID("rel", 5).toBytes(), "rev_author_1".getBytes()); + cb.onBytes(fakeSWHID("rel", 6).toBytes(), "rev_committer_1".getBytes()); + cb.onBytes(fakeSWHID("rel", 7).toBytes(), "rel_committer_1".getBytes()); + } else { + throw new RuntimeException("Unknown table/column: " + tableName + "/" + longColumn); + } + } + } + + public SwhOrcTable getTable(String tableName) { + return new FakeSwhOrcTable(tableName); + } + } + + @Test + public void testExtractPersons(@TempDir Path outputDir, @TempDir Path sortTmpDir) + throws IOException, InterruptedException { + + FakeORCDataset fakeORCDataset = new FakeORCDataset(); + ExtractPersons.extractPersons(fakeORCDataset, outputDir.toString() + "/graph", "2M", sortTmpDir.toString()); + + ArrayList expectedPersons = new ArrayList<>(Arrays.asList("rev_author_1", "rev_author_2", + "rev_author_3", "rev_committer_1", 
"rev_committer_2", "rel_committer_1", "rel_committer_2")); + + // Check count files + Long personsCount = Long.parseLong(Files.readString(outputDir.resolve("graph.persons.count.txt")).strip()); + Assertions.assertEquals(expectedPersons.size(), personsCount); + + // Check persons + expectedPersons.sort(String::compareTo); + String[] personLines = readZstFile(outputDir.resolve("graph.persons.csv.zst")); + Assertions.assertArrayEquals(expectedPersons.toArray(new String[0]), personLines); + } +} diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/CountEdgesTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/CountEdgesTest.java new file mode 100644 index 0000000..7445671 --- /dev/null +++ b/java/src/test/java/org/softwareheritage/graph/rpc/CountEdgesTest.java @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.rpc; + +import com.google.protobuf.FieldMask; +import io.grpc.Status; +import io.grpc.StatusRuntimeException; +import org.junit.jupiter.api.Test; +import org.softwareheritage.graph.SWHID; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class CountEdgesTest extends TraversalServiceTest { + private TraversalRequest.Builder getTraversalRequestBuilder(SWHID src) { + return TraversalRequest.newBuilder().addSrc(src.toString()); + } + + @Test + public void testSwhidErrors() { + StatusRuntimeException thrown; + thrown = assertThrows(StatusRuntimeException.class, () -> client + .countEdges(TraversalRequest.newBuilder().addSrc(fakeSWHID("cnt", 404).toString()).build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + thrown = assertThrows(StatusRuntimeException.class, () -> client.countEdges( + TraversalRequest.newBuilder().addSrc("swh:1:lol:0000000000000000000000000000000000000001").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + thrown = assertThrows(StatusRuntimeException.class, () -> client.countEdges( + TraversalRequest.newBuilder().addSrc("swh:1:cnt:000000000000000000000000000000000000000z").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + } + + @Test + public void forwardFromRoot() { + CountResponse actual = client.countEdges(getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID)).build()); + assertEquals(13, actual.getCount()); + } + + @Test + public void forwardFromMiddle() { + CountResponse actual = client.countEdges(getTraversalRequestBuilder(fakeSWHID("dir", 12)).build()); + assertEquals(7, actual.getCount()); + } + + @Test + public void forwardRelRev() { + CountResponse actual = client + .countEdges(getTraversalRequestBuilder(fakeSWHID("rel", 10)).setEdges("rel:rev,rev:rev").build()); + assertEquals(2, actual.getCount()); + } + + @Test + public void backwardFromMiddle() { + CountResponse actual = client.countEdges( + getTraversalRequestBuilder(fakeSWHID("dir", 12)).setDirection(GraphDirection.BACKWARD).build()); + assertEquals(3, actual.getCount()); + } + + @Test + public void backwardFromLeaf() { + CountResponse actual = client.countEdges( + getTraversalRequestBuilder(fakeSWHID("cnt", 4)).setDirection(GraphDirection.BACKWARD).build()); + assertEquals(12, actual.getCount()); + } + + @Test + 
public void backwardRevToRevRevToRel() { + CountResponse actual = client.countEdges(getTraversalRequestBuilder(fakeSWHID("rev", 3)) + .setEdges("rev:rev,rev:rel").setDirection(GraphDirection.BACKWARD).build()); + assertEquals(5, actual.getCount()); + } + + @Test + public void testWithEmptyMask() { + CountResponse actual = client.countEdges( + getTraversalRequestBuilder(fakeSWHID("dir", 12)).setMask(FieldMask.getDefaultInstance()).build()); + assertEquals(7, actual.getCount()); + } +} diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/CountNodesTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/CountNodesTest.java new file mode 100644 index 0000000..a0bebc1 --- /dev/null +++ b/java/src/test/java/org/softwareheritage/graph/rpc/CountNodesTest.java @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.rpc; + +import com.google.protobuf.FieldMask; +import io.grpc.Status; +import io.grpc.StatusRuntimeException; +import org.junit.jupiter.api.Test; +import org.softwareheritage.graph.SWHID; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class CountNodesTest extends TraversalServiceTest { + private TraversalRequest.Builder getTraversalRequestBuilder(SWHID src) { + return TraversalRequest.newBuilder().addSrc(src.toString()); + } + + @Test + public void testSwhidErrors() { + StatusRuntimeException thrown; + thrown = assertThrows(StatusRuntimeException.class, () -> client + .countNodes(TraversalRequest.newBuilder().addSrc(fakeSWHID("cnt", 404).toString()).build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + thrown = assertThrows(StatusRuntimeException.class, () -> client.countNodes( + TraversalRequest.newBuilder().addSrc("swh:1:lol:0000000000000000000000000000000000000001").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + thrown = assertThrows(StatusRuntimeException.class, () -> client.countNodes( + TraversalRequest.newBuilder().addSrc("swh:1:cnt:000000000000000000000000000000000000000z").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + } + + @Test + public void forwardFromRoot() { + CountResponse actual = client.countNodes(getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID)).build()); + assertEquals(12, actual.getCount()); + } + + @Test + public void forwardFromMiddle() { + CountResponse actual = client.countNodes(getTraversalRequestBuilder(fakeSWHID("dir", 12)).build()); + assertEquals(8, actual.getCount()); + } + + @Test + public void forwardRelRev() { + CountResponse actual = client + .countNodes(getTraversalRequestBuilder(fakeSWHID("rel", 10)).setEdges("rel:rev,rev:rev").build()); + assertEquals(3, actual.getCount()); + } + + @Test + public void backwardFromMiddle() { + CountResponse actual = client.countNodes( + getTraversalRequestBuilder(fakeSWHID("dir", 12)).setDirection(GraphDirection.BACKWARD).build()); + assertEquals(4, actual.getCount()); + } + + @Test + public void backwardFromLeaf() { + CountResponse actual = client.countNodes( + getTraversalRequestBuilder(fakeSWHID("cnt", 4)).setDirection(GraphDirection.BACKWARD).build()); + assertEquals(11, actual.getCount()); + } + + @Test + public void 
backwardRevToRevRevToRel() { + CountResponse actual = client.countNodes(getTraversalRequestBuilder(fakeSWHID("rev", 3)) + .setEdges("rev:rev,rev:rel").setDirection(GraphDirection.BACKWARD).build()); + assertEquals(6, actual.getCount()); + } + + @Test + public void testWithEmptyMask() { + CountResponse actual = client.countNodes( + getTraversalRequestBuilder(fakeSWHID("dir", 12)).setMask(FieldMask.getDefaultInstance()).build()); + assertEquals(8, actual.getCount()); + } +} diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java new file mode 100644 index 0000000..218a79c --- /dev/null +++ b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.rpc; + +import io.grpc.Status; +import io.grpc.StatusRuntimeException; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.softwareheritage.graph.SWHID; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class FindPathBetweenTest extends TraversalServiceTest { + private FindPathBetweenRequest.Builder getRequestBuilder(SWHID src, SWHID dst) { + return FindPathBetweenRequest.newBuilder().addSrc(src.toString()).addDst(dst.toString()); + } + + @Test + public void testSwhidErrors() { + StatusRuntimeException thrown; + thrown = assertThrows(StatusRuntimeException.class, () -> client + .findPathBetween(FindPathBetweenRequest.newBuilder().addSrc(fakeSWHID("cnt", 404).toString()).build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathBetween(FindPathBetweenRequest + .newBuilder().addSrc("swh:1:lol:0000000000000000000000000000000000000001").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathBetween(FindPathBetweenRequest + .newBuilder().addSrc("swh:1:cnt:000000000000000000000000000000000000000z").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + thrown = assertThrows(StatusRuntimeException.class, + () -> client.findPathBetween(FindPathBetweenRequest.newBuilder().addSrc(TEST_ORIGIN_ID) + .addDst("swh:1:cnt:000000000000000000000000000000000000000z").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + } + + @Test + public void testEdgeErrors() { + StatusRuntimeException thrown; + thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathBetween(FindPathBetweenRequest + .newBuilder().addSrc(TEST_ORIGIN_ID).addDst(TEST_ORIGIN_ID).setEdges("batracien:reptile").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + } + + // Test path between ori 1 and cnt 4 (forward graph) + @Test + public void forwardRootToLeaf() { + ArrayList actual = getSWHIDs( + client.findPathBetween(getRequestBuilder(new SWHID(TEST_ORIGIN_ID), fakeSWHID("cnt", 4)).build())); + List expected = List.of(new 
SWHID(TEST_ORIGIN_ID), fakeSWHID("snp", 20), fakeSWHID("rev", 9), + fakeSWHID("dir", 8), fakeSWHID("dir", 6), fakeSWHID("cnt", 4)); + Assertions.assertEquals(expected, actual); + } + + // Test path between rev 18 and rev 3 (forward graph) + @Test + public void forwardRevToRev() { + ArrayList actual = getSWHIDs( + client.findPathBetween(getRequestBuilder(fakeSWHID("rev", 18), fakeSWHID("rev", 3)).build())); + List expected = List.of(fakeSWHID("rev", 18), fakeSWHID("rev", 13), fakeSWHID("rev", 9), + fakeSWHID("rev", 3)); + Assertions.assertEquals(expected, actual); + } + + // Test path between rev 3 and rev 18 (backward graph) + @Test + public void backwardRevToRev() { + ArrayList actual = getSWHIDs( + client.findPathBetween(getRequestBuilder(fakeSWHID("rev", 3), fakeSWHID("rev", 18)) + .setDirection(GraphDirection.BACKWARD).build())); + List expected = List.of(fakeSWHID("rev", 3), fakeSWHID("rev", 9), fakeSWHID("rev", 13), + fakeSWHID("rev", 18)); + Assertions.assertEquals(expected, actual); + } + + // Test path between cnt 4 and itself (forward graph) + @Test + public void forwardCntToItself() { + ArrayList actual = getSWHIDs( + client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("cnt", 4)).build())); + List expected = List.of(fakeSWHID("cnt", 4)); + Assertions.assertEquals(expected, actual); + } + + // Start from ori and rel 19 and find cnt 14 or cnt 7 (forward graph) + @Test + public void forwardMultipleSourcesDest() { + ArrayList actual = getSWHIDs( + client.findPathBetween(getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 14)) + .addSrc(TEST_ORIGIN_ID).addDst(fakeSWHID("cnt", 7).toString()).build())); + List expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17), + fakeSWHID("cnt", 14)); + Assertions.assertEquals(expected, actual); + } + + // Start from cnt 4 and cnt 11 and find rev 13 or rev 9 (backward graph) + @Test + public void backwardMultipleSourcesDest() { + ArrayList actual = getSWHIDs(client.findPathBetween( + getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("rev", 13)).setDirection(GraphDirection.BACKWARD) + .addSrc(fakeSWHID("cnt", 11).toString()).addDst(fakeSWHID("rev", 9).toString()).build())); + List expected = List.of(fakeSWHID("cnt", 11), fakeSWHID("dir", 12), fakeSWHID("rev", 13)); + Assertions.assertEquals(expected, actual); + } + + // Start from all directories and find the origin (backward graph) + @Test + public void backwardMultipleSourcesAllDirToOri() { + ArrayList actual = getSWHIDs( + client.findPathBetween(getRequestBuilder(fakeSWHID("dir", 2), new SWHID(TEST_ORIGIN_ID)) + .addSrc(fakeSWHID("dir", 6).toString()).addSrc(fakeSWHID("dir", 8).toString()) + .addSrc(fakeSWHID("dir", 12).toString()).addSrc(fakeSWHID("dir", 16).toString()) + .addSrc(fakeSWHID("dir", 17).toString()).setDirection(GraphDirection.BACKWARD).build())); + List expected = List.of(fakeSWHID("dir", 8), fakeSWHID("rev", 9), fakeSWHID("snp", 20), + new SWHID(TEST_ORIGIN_ID)); + Assertions.assertEquals(expected, actual); + } + + // Start from cnt 4 and find any rev (backward graph) + @Test + public void backwardCntToAnyRev() { + ArrayList actual = getSWHIDs( + client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("rev", 3)) + .addDst(fakeSWHID("rev", 9).toString()).addDst(fakeSWHID("rev", 13).toString()) + .addDst(fakeSWHID("rev", 18).toString()).setDirection(GraphDirection.BACKWARD).build())); + List expected = List.of(fakeSWHID("cnt", 4), fakeSWHID("dir", 6), fakeSWHID("dir", 8), + fakeSWHID("rev", 9)); + Assertions.assertEquals(expected, actual); + } + + // 
Impossible path between rev 9 and cnt 14 + @Test + public void forwardImpossiblePath() { + StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> { + client.findPathBetween(getRequestBuilder(fakeSWHID("rev", 9), fakeSWHID("cnt", 14)).build()); + }); + Assertions.assertEquals(thrown.getStatus().getCode(), Status.NOT_FOUND.getCode()); + + // Reverse direction + thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> { + client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 14), fakeSWHID("rev", 9)) + .setDirection(GraphDirection.BACKWARD).build()); + }); + Assertions.assertEquals(thrown.getStatus().getCode(), Status.NOT_FOUND.getCode()); + } + + // Common ancestor between cnt 4 and cnt 15 : rev 18 + @Test + public void commonAncestorBackwardBackward() { + Path p = client.findPathBetween(getRequestBuilder(fakeSWHID("cnt", 4), fakeSWHID("cnt", 15)) + .setDirection(GraphDirection.BACKWARD).setDirectionReverse(GraphDirection.BACKWARD).build()); + ArrayList actual = getSWHIDs(p); + SWHID expected = fakeSWHID("rev", 18); + Assertions.assertEquals(expected, actual.get(p.getMidpointIndex())); + } + + // Common descendant between rev 13 and rev 3 : cnt 1 (with rev:dir,dir:dir,dir:cnt) + @Test + public void commonDescendantForwardForward() { + Path p = client.findPathBetween( + getRequestBuilder(fakeSWHID("rev", 13), fakeSWHID("rev", 3)).setDirection(GraphDirection.FORWARD) + .setDirectionReverse(GraphDirection.FORWARD).setEdges("rev:dir,dir:dir,dir:cnt").build()); + ArrayList actual = getSWHIDs(p); + SWHID expected = fakeSWHID("cnt", 1); + Assertions.assertEquals(expected, actual.get(p.getMidpointIndex())); + } + + // Path between rel 19 and cnt 15 with various max depths + @Test + public void maxDepth() { + // Works with max_depth = 2 + ArrayList actual = getSWHIDs(client + .findPathBetween(getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxDepth(2).build())); + List expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17), + fakeSWHID("dir", 16), fakeSWHID("cnt", 15)); + Assertions.assertEquals(expected, actual); + + // Check that it throws NOT_FOUND with max depth = 1 + StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> { + client.findPathBetween( + getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxDepth(1).build()); + }); + Assertions.assertEquals(thrown.getStatus().getCode(), Status.NOT_FOUND.getCode()); + } + + // Path between rel 19 and cnt 15 with various max edges + @Test + public void maxEdges() { + // Works with max_edges = 3 + ArrayList actual = getSWHIDs(client + .findPathBetween(getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxEdges(3).build())); + List expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17), + fakeSWHID("dir", 16), fakeSWHID("cnt", 15)); + Assertions.assertEquals(expected, actual); + + // Check that it throws NOT_FOUND with max_edges = 2 + StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> { + client.findPathBetween( + getRequestBuilder(fakeSWHID("rel", 19), fakeSWHID("cnt", 15)).setMaxEdges(2).build()); + }); + Assertions.assertEquals(thrown.getStatus().getCode(), Status.NOT_FOUND.getCode()); + } +} diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java new file mode 100644 index 0000000..54d358f --- /dev/null +++ 
b/java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.rpc; + +import io.grpc.Status; +import io.grpc.StatusRuntimeException; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.softwareheritage.graph.SWHID; + +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class FindPathToTest extends TraversalServiceTest { + private FindPathToRequest.Builder getRequestBuilder(SWHID src, String allowedNodes) { + return FindPathToRequest.newBuilder().addSrc(src.toString()) + .setTarget(NodeFilter.newBuilder().setTypes(allowedNodes).build()); + } + + @Test + public void testSrcErrors() { + StatusRuntimeException thrown; + thrown = assertThrows(StatusRuntimeException.class, () -> client + .findPathTo(FindPathToRequest.newBuilder().addSrc(fakeSWHID("cnt", 404).toString()).build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathTo( + FindPathToRequest.newBuilder().addSrc("swh:1:lol:0000000000000000000000000000000000000001").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathTo( + FindPathToRequest.newBuilder().addSrc("swh:1:cnt:000000000000000000000000000000000000000z").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + } + + @Test + public void testEdgeErrors() { + StatusRuntimeException thrown; + thrown = assertThrows(StatusRuntimeException.class, () -> client.findPathTo( + FindPathToRequest.newBuilder().addSrc(TEST_ORIGIN_ID).setEdges("batracien:reptile").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + } + + @Test + public void testTargetErrors() { + StatusRuntimeException thrown; + thrown = assertThrows(StatusRuntimeException.class, + () -> client.findPathTo(FindPathToRequest.newBuilder().addSrc(TEST_ORIGIN_ID) + .setTarget(NodeFilter.newBuilder().setTypes("argoumante,eglomatique").build()).build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + } + + // Test path between ori 1 and any dir (forward graph) + @Test + public void forwardOriToFirstDir() { + ArrayList actual = getSWHIDs( + client.findPathTo(getRequestBuilder(new SWHID(TEST_ORIGIN_ID), "dir").build())); + List expected = List.of(new SWHID(TEST_ORIGIN_ID), fakeSWHID("snp", 20), fakeSWHID("rev", 9), + fakeSWHID("dir", 8)); + Assertions.assertEquals(expected, actual); + } + + // Test path between rel 19 and any cnt (forward graph) + @Test + public void forwardRelToFirstCnt() { + ArrayList actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("rel", 19), "cnt").build())); + List expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17), + fakeSWHID("cnt", 14)); + Assertions.assertEquals(expected, actual); + } + + // Test path between dir 16 and any rel (backward graph) + @Test + public void backwardDirToFirstRel() { + ArrayList actual = 
getSWHIDs(client.findPathTo( + getRequestBuilder(fakeSWHID("dir", 16), "rel").setDirection(GraphDirection.BACKWARD).build())); + List expected = List.of(fakeSWHID("dir", 16), fakeSWHID("dir", 17), fakeSWHID("rev", 18), + fakeSWHID("rel", 19)); + Assertions.assertEquals(expected, actual); + } + + // Test path between cnt 4 and itself (forward graph) + @Test + public void forwardCntToItself() { + ArrayList actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 4), "cnt").build())); + List expected = List.of(fakeSWHID("cnt", 4)); + Assertions.assertEquals(expected, actual); + } + + // Start from ori and rel 19 and find any cnt (forward graph) + @Test + public void forwardMultipleSources() { + ArrayList actual = getSWHIDs( + client.findPathTo(getRequestBuilder(fakeSWHID("rel", 19), "cnt").addSrc(TEST_ORIGIN_ID).build())); + List expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("dir", 17), + fakeSWHID("cnt", 14)); + Assertions.assertEquals(expected, actual); + } + + // Start from cnt 4 and cnt 11 and find any rev (backward graph) + @Test + public void backwardMultipleSources() { + ArrayList actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 4), "rev") + .addSrc(fakeSWHID("cnt", 11).toString()).setDirection(GraphDirection.BACKWARD).build())); + List expected = List.of(fakeSWHID("cnt", 11), fakeSWHID("dir", 12), fakeSWHID("rev", 13)); + Assertions.assertEquals(expected, actual); + } + + // Start from all directories and find any origin (backward graph) + @Test + public void backwardMultipleSourcesAllDirToOri() { + ArrayList actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("dir", 2), "ori") + .addSrc(fakeSWHID("dir", 6).toString()).addSrc(fakeSWHID("dir", 8).toString()) + .addSrc(fakeSWHID("dir", 12).toString()).addSrc(fakeSWHID("dir", 16).toString()) + .addSrc(fakeSWHID("dir", 17).toString()).setDirection(GraphDirection.BACKWARD).build())); + List expected = List.of(fakeSWHID("dir", 8), fakeSWHID("rev", 9), fakeSWHID("snp", 20), + new SWHID(TEST_ORIGIN_ID)); + Assertions.assertEquals(expected, actual); + } + + // Impossible path between rev 9 and any release (forward graph) + @Test + public void forwardImpossiblePath() { + // Check that the returned status is NOT_FOUND + StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> { + client.findPathTo(getRequestBuilder(fakeSWHID("rev", 9), "rel").build()); + }); + Assertions.assertEquals(thrown.getStatus().getCode(), Status.NOT_FOUND.getCode()); + } + + // Path from cnt 15 to any rel with various max depths + @Test + public void maxDepth() { + // Works with max_depth = 4 + ArrayList actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 15), "rel") + .setDirection(GraphDirection.BACKWARD).setMaxDepth(4).build())); + List expected = List.of(fakeSWHID("cnt", 15), fakeSWHID("dir", 16), fakeSWHID("dir", 17), + fakeSWHID("rev", 18), fakeSWHID("rel", 19)); + Assertions.assertEquals(expected, actual); + + // Check that it throws NOT_FOUND with max_depth = 3 + StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> { + client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 15), "rel").setDirection(GraphDirection.BACKWARD) + .setMaxDepth(3).build()); + }); + Assertions.assertEquals(thrown.getStatus().getCode(), Status.NOT_FOUND.getCode()); + } + + // Path from cnt 15 to any rel with various max edges + @Test + public void maxEdges() { + ArrayList actual = getSWHIDs(client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 15), "rel") + 
.setDirection(GraphDirection.BACKWARD).setMaxEdges(4).build())); + List expected = List.of(fakeSWHID("cnt", 15), fakeSWHID("dir", 16), fakeSWHID("dir", 17), + fakeSWHID("rev", 18), fakeSWHID("rel", 19)); + Assertions.assertEquals(expected, actual); + + StatusRuntimeException thrown = Assertions.assertThrows(StatusRuntimeException.class, () -> { + client.findPathTo(getRequestBuilder(fakeSWHID("cnt", 15), "rel").setDirection(GraphDirection.BACKWARD) + .setMaxEdges(3).build()); + }); + Assertions.assertEquals(thrown.getStatus().getCode(), Status.NOT_FOUND.getCode()); + } +} diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/GetNodeTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/GetNodeTest.java new file mode 100644 index 0000000..22e3a54 --- /dev/null +++ b/java/src/test/java/org/softwareheritage/graph/rpc/GetNodeTest.java @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.rpc; + +import com.google.protobuf.Descriptors; +import com.google.protobuf.FieldMask; +import io.grpc.Status; +import io.grpc.StatusRuntimeException; +import org.junit.jupiter.api.Test; +import org.softwareheritage.graph.SWHID; + +import java.util.*; + +import static org.junit.jupiter.api.Assertions.*; + +public class GetNodeTest extends TraversalServiceTest { + @Test + public void testNotFound() { + StatusRuntimeException thrown = assertThrows(StatusRuntimeException.class, + () -> client.getNode(GetNodeRequest.newBuilder().setSwhid(fakeSWHID("cnt", 404).toString()).build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + } + + @Test + public void testInvalidSwhid() { + StatusRuntimeException thrown; + thrown = assertThrows(StatusRuntimeException.class, () -> client.getNode( + GetNodeRequest.newBuilder().setSwhid("swh:1:lol:0000000000000000000000000000000000000001").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + thrown = assertThrows(StatusRuntimeException.class, () -> client.getNode( + GetNodeRequest.newBuilder().setSwhid("swh:1:cnt:000000000000000000000000000000000000000z").build())); + assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode()); + } + + @Test + public void testContents() { + List expectedCnts = List.of(1, 4, 5, 7, 11, 14, 15); + Map expectedLengths = Map.of(1, 42, 4, 404, 5, 1337, 7, 666, 11, 313, 14, 14, 15, 404); + Set expectedSkipped = Set.of(15); + + for (Integer cntId : expectedCnts) { + Node n = client.getNode(GetNodeRequest.newBuilder().setSwhid(fakeSWHID("cnt", cntId).toString()).build()); + assertTrue(n.hasCnt()); + assertTrue(n.getCnt().hasLength()); + assertEquals((long) expectedLengths.get(cntId), n.getCnt().getLength()); + assertTrue(n.getCnt().hasIsSkipped()); + assertEquals(expectedSkipped.contains(cntId), n.getCnt().getIsSkipped()); + } + } + + @Test + public void testRevisions() { + List expectedRevs = List.of(3, 9, 13, 18); + Map expectedMessages = Map.of(3, "Initial commit", 9, "Add parser", 13, "Add tests", 18, + "Refactor codebase"); + + Map expectedAuthors = Map.of(3, "foo", 9, "bar", 13, "foo", 18, "baz"); + Map expectedCommitters = Map.of(3, "foo", 9, "bar", 13, "bar", 18, "foo"); + + Map expectedAuthorTimestamps = Map.of(3, 1111122220L, 9, 1111144440L, 13, 1111166660L, 18, + 1111177770L); + 
Map expectedCommitterTimestamps = Map.of(3, 1111122220L, 9, 1111155550L, 13, 1111166660L, 18, + 1111177770L); + Map expectedAuthorTimestampOffsets = Map.of(3, 120, 9, 120, 13, 120, 18, 0); + Map expectedCommitterTimestampOffsets = Map.of(3, 120, 9, 120, 13, 120, 18, 0); + + HashMap personMapping = new HashMap<>(); + for (Integer revId : expectedRevs) { + Node n = client.getNode(GetNodeRequest.newBuilder().setSwhid(fakeSWHID("rev", revId).toString()).build()); + assertTrue(n.hasRev()); + assertTrue(n.getRev().hasMessage()); + assertEquals(expectedMessages.get(revId), n.getRev().getMessage().toStringUtf8()); + + // Persons are anonymized, we just need to check that the mapping is self-consistent + assertTrue(n.getRev().hasAuthor()); + assertTrue(n.getRev().hasCommitter()); + int[] actualPersons = new int[]{(int) n.getRev().getAuthor(), (int) n.getRev().getCommitter()}; + String[] expectedPersons = new String[]{expectedAuthors.get(revId), expectedCommitters.get(revId)}; + for (int i = 0; i < actualPersons.length; i++) { + int actualPerson = actualPersons[i]; + String expectedPerson = expectedPersons[i]; + assertTrue(actualPerson >= 0); + if (personMapping.containsKey(actualPerson)) { + assertEquals(personMapping.get(actualPerson), expectedPerson); + } else { + personMapping.put(actualPerson, expectedPerson); + } + } + + assertTrue(n.getRev().hasAuthorDate()); + assertTrue(n.getRev().hasAuthorDateOffset()); + assertTrue(n.getRev().hasCommitterDate()); + assertTrue(n.getRev().hasCommitterDateOffset()); + + // FIXME: all the timestamps are one hour off?! + // System.err.println(revId + " " + n.getRev().getAuthorDate() + " " + + // n.getRev().getAuthorDateOffset()); + // System.err.println(revId + " " + n.getRev().getCommitterDate() + " " + + // n.getRev().getCommitterDateOffset()); + + // assertEquals(expectedAuthorTimestamps.get(revId), n.getRev().getAuthorDate()); + assertEquals(expectedAuthorTimestampOffsets.get(revId), n.getRev().getAuthorDateOffset()); + // assertEquals(expectedCommitterTimestamps.get(revId), n.getRev().getCommitterDate()); + assertEquals(expectedCommitterTimestampOffsets.get(revId), n.getRev().getCommitterDateOffset()); + } + } + + @Test + public void testReleases() { + List expectedRels = List.of(10, 19); + Map expectedMessages = Map.of(10, "Version 1.0", 19, "Version 2.0"); + Map expectedNames = Map.of(10, "v1.0", 19, "v2.0"); + + Map expectedAuthors = Map.of(10, "foo", 19, "bar"); + + Map expectedAuthorTimestamps = Map.of(10, 1234567890L); + Map expectedAuthorTimestampOffsets = Map.of(3, 120); + + HashMap personMapping = new HashMap<>(); + for (Integer relId : expectedRels) { + Node n = client.getNode(GetNodeRequest.newBuilder().setSwhid(fakeSWHID("rel", relId).toString()).build()); + assertTrue(n.hasRel()); + assertTrue(n.getRel().hasMessage()); + assertEquals(expectedMessages.get(relId), n.getRel().getMessage().toStringUtf8()); + // FIXME: names are always empty?! 
+ // System.err.println(relId + " " + n.getRel().getName()); + // assertEquals(expectedNames.get(relId), n.getRel().getName().toStringUtf8()); + + // Persons are anonymized, we just need to check that the mapping is self-consistent + assertTrue(n.getRel().hasAuthor()); + int actualPerson = (int) n.getRel().getAuthor(); + String expectedPerson = expectedAuthors.get(relId); + assertTrue(actualPerson >= 0); + if (personMapping.containsKey(actualPerson)) { + assertEquals(personMapping.get(actualPerson), expectedPerson); + } else { + personMapping.put(actualPerson, expectedPerson); + } + + assertTrue(n.getRel().hasAuthorDate()); + assertTrue(n.getRel().hasAuthorDateOffset()); + + // FIXME: all the timestamps are one hour off?! + // if (expectedAuthorTimestamps.containsKey(relId)) { + // assertEquals(expectedAuthorTimestamps.get(relId), n.getRel().getAuthorDate()); + // } + if (expectedAuthorTimestampOffsets.containsKey(relId)) { + assertEquals(expectedAuthorTimestampOffsets.get(relId), n.getRel().getAuthorDateOffset()); + } + } + } + + @Test + public void testOrigins() { + List expectedOris = List.of(new SWHID(TEST_ORIGIN_ID)); + Map expectedUrls = Map.of(new SWHID(TEST_ORIGIN_ID), "https://example.com/swh/graph"); + + for (SWHID oriSwhid : expectedOris) { + Node n = client.getNode(GetNodeRequest.newBuilder().setSwhid(oriSwhid.toString()).build()); + assertTrue(n.hasOri()); + assertTrue(n.getOri().hasUrl()); + assertEquals(expectedUrls.get(oriSwhid), n.getOri().getUrl()); + } + } + + @Test + public void testCntMask() { + Node n; + String swhid = fakeSWHID("cnt", 1).toString(); + + // No mask, all fields present + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).build()); + assertTrue(n.hasCnt()); + assertTrue(n.getCnt().hasLength()); + assertEquals(42, n.getCnt().getLength()); + assertTrue(n.getCnt().hasIsSkipped()); + assertFalse(n.getCnt().getIsSkipped()); + + // Empty mask, no fields present + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).setMask(FieldMask.getDefaultInstance()).build()); + assertFalse(n.getCnt().hasLength()); + assertFalse(n.getCnt().hasIsSkipped()); + + // Mask with length, no isSkipped + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid) + .setMask(FieldMask.newBuilder().addPaths("cnt.length").build()).build()); + assertTrue(n.getCnt().hasLength()); + assertFalse(n.getCnt().hasIsSkipped()); + + // Mask with isSkipped, no length + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid) + .setMask(FieldMask.newBuilder().addPaths("cnt.is_skipped").build()).build()); + assertFalse(n.getCnt().hasLength()); + assertTrue(n.getCnt().hasIsSkipped()); + } + + @Test + public void testRevMask() { + Node n; + String swhid = fakeSWHID("rev", 3).toString(); + + // No mask, all fields present + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).build()); + assertTrue(n.hasRev()); + assertTrue(n.getRev().hasMessage()); + assertTrue(n.getRev().hasAuthor()); + assertTrue(n.getRev().hasAuthorDate()); + assertTrue(n.getRev().hasAuthorDateOffset()); + assertTrue(n.getRev().hasCommitter()); + assertTrue(n.getRev().hasCommitterDate()); + assertTrue(n.getRev().hasCommitterDateOffset()); + + // Empty mask, no fields present + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).setMask(FieldMask.getDefaultInstance()).build()); + assertFalse(n.getRev().hasMessage()); + assertFalse(n.getRev().hasAuthor()); + assertFalse(n.getRev().hasAuthorDate()); + assertFalse(n.getRev().hasAuthorDateOffset()); + 
assertFalse(n.getRev().hasCommitter()); + assertFalse(n.getRev().hasCommitterDate()); + assertFalse(n.getRev().hasCommitterDateOffset()); + + // Test all masks with single fields + for (Descriptors.FieldDescriptor includedField : RevisionData.getDefaultInstance().getAllFields().keySet()) { + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid) + .setMask(FieldMask.newBuilder().addPaths("rev." + includedField.getName()).build()).build()); + for (Descriptors.FieldDescriptor f : n.getRev().getDescriptorForType().getFields()) { + assertEquals(n.getRev().hasField(f), f.getName().equals(includedField.getName())); + } + } + } + + @Test + public void testRelMask() { + Node n; + String swhid = fakeSWHID("rel", 19).toString(); + + // No mask, all fields present + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).build()); + assertTrue(n.hasRel()); + assertTrue(n.getRel().hasMessage()); + assertTrue(n.getRel().hasAuthor()); + assertTrue(n.getRel().hasAuthorDate()); + assertTrue(n.getRel().hasAuthorDateOffset()); + + // Empty mask, no fields present + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).setMask(FieldMask.getDefaultInstance()).build()); + assertFalse(n.getRel().hasMessage()); + assertFalse(n.getRel().hasAuthor()); + assertFalse(n.getRel().hasAuthorDate()); + assertFalse(n.getRel().hasAuthorDateOffset()); + + // Test all masks with single fields + for (Descriptors.FieldDescriptor includedField : ReleaseData.getDefaultInstance().getAllFields().keySet()) { + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid) + .setMask(FieldMask.newBuilder().addPaths("rel." + includedField.getName()).build()).build()); + for (Descriptors.FieldDescriptor f : n.getRel().getDescriptorForType().getFields()) { + assertEquals(n.getRel().hasField(f), f.getName().equals(includedField.getName())); + } + } + } + + @Test + public void testOriMask() { + Node n; + String swhid = TEST_ORIGIN_ID; + + // No mask, all fields present + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).build()); + assertTrue(n.hasOri()); + assertTrue(n.getOri().hasUrl()); + + // Empty mask, no fields present + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid).setMask(FieldMask.getDefaultInstance()).build()); + assertFalse(n.getOri().hasUrl()); + + // Test all masks with single fields + for (Descriptors.FieldDescriptor includedField : OriginData.getDefaultInstance().getAllFields().keySet()) { + n = client.getNode(GetNodeRequest.newBuilder().setSwhid(swhid) + .setMask(FieldMask.newBuilder().addPaths("ori." 
+ includedField.getName()).build()).build()); + for (Descriptors.FieldDescriptor f : n.getOri().getDescriptorForType().getFields()) { + assertEquals(n.getOri().hasField(f), f.getName().equals(includedField.getName())); + } + } + } +} diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/StatsTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/StatsTest.java new file mode 100644 index 0000000..e8224c3 --- /dev/null +++ b/java/src/test/java/org/softwareheritage/graph/rpc/StatsTest.java @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.rpc; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +public class StatsTest extends TraversalServiceTest { + @Test + public void testStats() { + StatsResponse stats = client.stats(StatsRequest.getDefaultInstance()); + assertEquals(stats.getNumNodes(), 21); + assertEquals(stats.getNumEdges(), 23); + assertEquals(stats.getIndegreeMin(), 0); + assertEquals(stats.getIndegreeMax(), 3); + assertEquals(stats.getOutdegreeMin(), 0); + assertEquals(stats.getOutdegreeMax(), 3); + } +} diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/TraversalServiceTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/TraversalServiceTest.java new file mode 100644 index 0000000..862e1ea --- /dev/null +++ b/java/src/test/java/org/softwareheritage/graph/rpc/TraversalServiceTest.java @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.rpc; + +import io.grpc.ManagedChannel; +import io.grpc.Server; +import io.grpc.inprocess.InProcessChannelBuilder; +import io.grpc.inprocess.InProcessServerBuilder; +import io.grpc.testing.GrpcCleanupRule; +import org.junit.Rule; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.softwareheritage.graph.GraphTest; +import org.softwareheritage.graph.SWHID; +import org.softwareheritage.graph.SwhBidirectionalGraph; + +import java.util.ArrayList; +import java.util.Iterator; + +public class TraversalServiceTest extends GraphTest { + @Rule + public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule(); + + private static Server server; + private static ManagedChannel channel; + protected static SwhBidirectionalGraph g; + protected static TraversalServiceGrpc.TraversalServiceBlockingStub client; + + @BeforeAll + static void setup() throws Exception { + String serverName = InProcessServerBuilder.generateName(); + g = GraphServer.loadGraph(getGraphPath().toString()); + server = InProcessServerBuilder.forName(serverName).directExecutor() + .addService(new GraphServer.TraversalService(g.copy())).build().start(); + channel = InProcessChannelBuilder.forName(serverName).directExecutor().build(); + client = TraversalServiceGrpc.newBlockingStub(channel); + } + + @AfterAll + static void teardown() { + channel.shutdownNow(); + server.shutdownNow(); + } + + public ArrayList getSWHIDs(Iterator it) { + ArrayList res = new ArrayList<>(); + it.forEachRemaining((Node n) -> { + res.add(new SWHID(n.getSwhid())); + }); + return res; + } 
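+ // Same as above, but collects the ordered node list of a Path message into SWHIDs.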
diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/TraversalServiceTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/TraversalServiceTest.java
new file mode 100644
index 0000000..862e1ea
--- /dev/null
+++ b/java/src/test/java/org/softwareheritage/graph/rpc/TraversalServiceTest.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.rpc;
+
+import io.grpc.ManagedChannel;
+import io.grpc.Server;
+import io.grpc.inprocess.InProcessChannelBuilder;
+import io.grpc.inprocess.InProcessServerBuilder;
+import io.grpc.testing.GrpcCleanupRule;
+import org.junit.Rule;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.softwareheritage.graph.GraphTest;
+import org.softwareheritage.graph.SWHID;
+import org.softwareheritage.graph.SwhBidirectionalGraph;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+
+public class TraversalServiceTest extends GraphTest {
+    @Rule
+    public final GrpcCleanupRule grpcCleanup = new GrpcCleanupRule();
+
+    private static Server server;
+    private static ManagedChannel channel;
+    protected static SwhBidirectionalGraph g;
+    protected static TraversalServiceGrpc.TraversalServiceBlockingStub client;
+
+    @BeforeAll
+    static void setup() throws Exception {
+        String serverName = InProcessServerBuilder.generateName();
+        g = GraphServer.loadGraph(getGraphPath().toString());
+        server = InProcessServerBuilder.forName(serverName).directExecutor()
+                .addService(new GraphServer.TraversalService(g.copy())).build().start();
+        channel = InProcessChannelBuilder.forName(serverName).directExecutor().build();
+        client = TraversalServiceGrpc.newBlockingStub(channel);
+    }
+
+    @AfterAll
+    static void teardown() {
+        channel.shutdownNow();
+        server.shutdownNow();
+    }
+
+    public ArrayList<SWHID> getSWHIDs(Iterator<Node> it) {
+        ArrayList<SWHID> res = new ArrayList<>();
+        it.forEachRemaining((Node n) -> {
+            res.add(new SWHID(n.getSwhid()));
+        });
+        return res;
+    }
+
+    public ArrayList<SWHID> getSWHIDs(Path p) {
+        ArrayList<SWHID> res = new ArrayList<>();
+        p.getNodeList().forEach((Node n) -> {
+            res.add(new SWHID(n.getSwhid()));
+        });
+        return res;
+    }
+}
diff --git a/java/src/test/java/org/softwareheritage/graph/LeavesTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseLeavesTest.java
similarity index 57%
rename from java/src/test/java/org/softwareheritage/graph/LeavesTest.java
rename to java/src/test/java/org/softwareheritage/graph/rpc/TraverseLeavesTest.java
index a288d03..6e8a7ee 100644
--- a/java/src/test/java/org/softwareheritage/graph/LeavesTest.java
+++ b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseLeavesTest.java
@@ -1,107 +1,100 @@
-package org.softwareheritage.graph;
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
-import java.util.ArrayList;
+package org.softwareheritage.graph.rpc;
 import org.junit.jupiter.api.Test;
-import org.softwareheritage.graph.server.Endpoint;
+import org.softwareheritage.graph.GraphTest;
+import org.softwareheritage.graph.SWHID;
+
+import java.util.ArrayList;
+
+public class TraverseLeavesTest extends TraversalServiceTest {
+    private TraversalRequest.Builder getLeavesRequestBuilder(SWHID src) {
+        return TraversalRequest.newBuilder().addSrc(src.toString())
+                .setReturnNodes(NodeFilter.newBuilder().setMaxTraversalSuccessors(0).build());
+    }
-
-// Avoid warnings concerning Endpoint.Output.result manual cast
-@SuppressWarnings("unchecked")
-public class LeavesTest extends GraphTest {
     @Test
     public void forwardFromSnp() {
-        Graph graph = getGraph();
-        SWHID src = new SWHID("swh:1:snp:0000000000000000000000000000000000000020");
-        Endpoint endpoint = new Endpoint(graph, "forward", "*");
+        TraversalRequest request = getLeavesRequestBuilder(fakeSWHID("snp", 20)).build();
 
         ArrayList<SWHID> expectedLeaves = new ArrayList<>();
         expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001"));
         expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000004"));
         expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000005"));
         expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007"));
 
-        ArrayList<SWHID> actualLeaves = (ArrayList<SWHID>) endpoint.leaves(new Endpoint.Input(src)).result;
+        ArrayList<SWHID> actualLeaves = getSWHIDs(client.traverse(request));
         GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves);
     }
 
     @Test
     public void forwardFromRel() {
-        Graph graph = getGraph();
-        SWHID src = new SWHID("swh:1:rel:0000000000000000000000000000000000000019");
-        Endpoint endpoint = new Endpoint(graph, "forward", "*");
-
+        TraversalRequest request = getLeavesRequestBuilder(fakeSWHID("rel", 19)).build();
+        ArrayList<SWHID> actualLeaves = getSWHIDs(client.traverse(request));
         ArrayList<SWHID> expectedLeaves = new ArrayList<>();
         expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000015"));
         expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000014"));
         expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001"));
         expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000004"));
         expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000005"));
         expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007"));
SWHID("swh:1:cnt:0000000000000000000000000000000000000011")); - ArrayList actualLeaves = (ArrayList) endpoint.leaves(new Endpoint.Input(src)).result; GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves); } @Test public void backwardFromLeaf() { - Graph graph = getGraph(); - - Endpoint endpoint1 = new Endpoint(graph, "backward", "*"); - SWHID src1 = new SWHID("swh:1:cnt:0000000000000000000000000000000000000015"); + TraversalRequest request1 = getLeavesRequestBuilder(fakeSWHID("cnt", 15)).setDirection(GraphDirection.BACKWARD) + .build(); + ArrayList actualLeaves1 = getSWHIDs(client.traverse(request1)); ArrayList expectedLeaves1 = new ArrayList<>(); expectedLeaves1.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000019")); - ArrayList actualLeaves1 = (ArrayList) endpoint1.leaves(new Endpoint.Input(src1)).result; GraphTest.assertEqualsAnyOrder(expectedLeaves1, actualLeaves1); - Endpoint endpoint2 = new Endpoint(graph, "backward", "*"); - SWHID src2 = new SWHID("swh:1:cnt:0000000000000000000000000000000000000004"); + TraversalRequest request2 = getLeavesRequestBuilder(fakeSWHID("cnt", 4)).setDirection(GraphDirection.BACKWARD) + .build(); + ArrayList actualLeaves2 = getSWHIDs(client.traverse(request2)); ArrayList expectedLeaves2 = new ArrayList<>(); - expectedLeaves2.add(new SWHID("swh:1:ori:0000000000000000000000000000000000000021")); + expectedLeaves2.add(new SWHID(TEST_ORIGIN_ID)); expectedLeaves2.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000019")); - ArrayList actualLeaves2 = (ArrayList) endpoint2.leaves(new Endpoint.Input(src2)).result; GraphTest.assertEqualsAnyOrder(expectedLeaves2, actualLeaves2); } @Test public void forwardRevToRevOnly() { - Graph graph = getGraph(); - SWHID src = new SWHID("swh:1:rev:0000000000000000000000000000000000000018"); - Endpoint endpoint = new Endpoint(graph, "forward", "rev:rev"); - + TraversalRequest request = getLeavesRequestBuilder(fakeSWHID("rev", 18)).setEdges("rev:rev").build(); + ArrayList actualLeaves = getSWHIDs(client.traverse(request)); ArrayList expectedLeaves = new ArrayList<>(); expectedLeaves.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000003")); - - ArrayList actualLeaves = (ArrayList) endpoint.leaves(new Endpoint.Input(src)).result; GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves); } @Test public void forwardDirToAll() { - Graph graph = getGraph(); - SWHID src = new SWHID("swh:1:dir:0000000000000000000000000000000000000008"); - Endpoint endpoint = new Endpoint(graph, "forward", "dir:*"); - + TraversalRequest request = getLeavesRequestBuilder(fakeSWHID("dir", 8)).setEdges("dir:*").build(); + ArrayList actualLeaves = getSWHIDs(client.traverse(request)); ArrayList expectedLeaves = new ArrayList<>(); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000004")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000005")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001")); expectedLeaves.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007")); - - ArrayList actualLeaves = (ArrayList) endpoint.leaves(new Endpoint.Input(src)).result; GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves); } @Test public void backwardCntToDirDirToDir() { - Graph graph = getGraph(); - SWHID src = new SWHID("swh:1:cnt:0000000000000000000000000000000000000005"); - Endpoint endpoint = new Endpoint(graph, "backward", "cnt:dir,dir:dir"); - + TraversalRequest request = getLeavesRequestBuilder(fakeSWHID("cnt", 
+                .setDirection(GraphDirection.BACKWARD).build();
+        ArrayList<SWHID> actualLeaves = getSWHIDs(client.traverse(request));
         ArrayList<SWHID> expectedLeaves = new ArrayList<>();
         expectedLeaves.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000012"));
-
-        ArrayList<SWHID> actualLeaves = (ArrayList<SWHID>) endpoint.leaves(new Endpoint.Input(src)).result;
         GraphTest.assertEqualsAnyOrder(expectedLeaves, actualLeaves);
     }
 }
diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNeighborsTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNeighborsTest.java
new file mode 100644
index 0000000..94c92fa
--- /dev/null
+++ b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNeighborsTest.java
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.rpc;
+
+import org.junit.jupiter.api.Test;
+import org.softwareheritage.graph.GraphTest;
+import org.softwareheritage.graph.SWHID;
+
+import java.util.ArrayList;
+
+public class TraverseNeighborsTest extends TraversalServiceTest {
+    private TraversalRequest.Builder getNeighborsRequestBuilder(SWHID src) {
+        return TraversalRequest.newBuilder().addSrc(src.toString()).setMinDepth(1).setMaxDepth(1);
+    }
+
+    @Test
+    public void zeroNeighbor() {
+        ArrayList<SWHID> expectedNodes = new ArrayList<>();
+
+        TraversalRequest request1 = getNeighborsRequestBuilder(new SWHID(TEST_ORIGIN_ID))
+                .setDirection(GraphDirection.BACKWARD).build();
+        ArrayList<SWHID> actuals1 = getSWHIDs(client.traverse(request1));
+        GraphTest.assertEqualsAnyOrder(expectedNodes, actuals1);
+
+        TraversalRequest request2 = getNeighborsRequestBuilder(fakeSWHID("cnt", 4)).build();
+        ArrayList<SWHID> actuals2 = getSWHIDs(client.traverse(request2));
+        GraphTest.assertEqualsAnyOrder(expectedNodes, actuals2);
+
+        TraversalRequest request3 = getNeighborsRequestBuilder(fakeSWHID("cnt", 15)).build();
+        ArrayList<SWHID> actuals3 = getSWHIDs(client.traverse(request3));
+        GraphTest.assertEqualsAnyOrder(expectedNodes, actuals3);
+
+        TraversalRequest request4 = getNeighborsRequestBuilder(fakeSWHID("rel", 19))
+                .setDirection(GraphDirection.BACKWARD).build();
+        ArrayList<SWHID> actuals4 = getSWHIDs(client.traverse(request4));
+        GraphTest.assertEqualsAnyOrder(expectedNodes, actuals4);
+
+        TraversalRequest request5 = getNeighborsRequestBuilder(fakeSWHID("dir", 8)).setEdges("snp:*,rev:*,rel:*")
+                .build();
+        ArrayList<SWHID> actuals5 = getSWHIDs(client.traverse(request5));
+        GraphTest.assertEqualsAnyOrder(expectedNodes, actuals5);
+    }
+
+    @Test
+    public void oneNeighbor() {
+        TraversalRequest request1 = getNeighborsRequestBuilder(fakeSWHID("rev", 3)).build();
+        ArrayList<SWHID> actuals1 = getSWHIDs(client.traverse(request1));
+        ArrayList<SWHID> expectedNodes1 = new ArrayList<>();
+        expectedNodes1.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000002"));
+        GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1);
+
+        TraversalRequest request2 = getNeighborsRequestBuilder(fakeSWHID("dir", 17)).setEdges("dir:cnt").build();
+        ArrayList<SWHID> actuals2 = getSWHIDs(client.traverse(request2));
+        ArrayList<SWHID> expectedNodes2 = new ArrayList<>();
+        expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000014"));
+        GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2);
+
+        TraversalRequest request3 = getNeighborsRequestBuilder(fakeSWHID("dir", 12))
+                .setDirection(GraphDirection.BACKWARD).build();
+        ArrayList<SWHID> actuals3 = getSWHIDs(client.traverse(request3));
+        ArrayList<SWHID> expectedNodes3 = new ArrayList<>();
+        expectedNodes3.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013"));
+        GraphTest.assertEqualsAnyOrder(expectedNodes3, actuals3);
+
+        TraversalRequest request4 = getNeighborsRequestBuilder(fakeSWHID("rev", 9))
+                .setDirection(GraphDirection.BACKWARD).setEdges("rev:rev").build();
+        ArrayList<SWHID> actuals4 = getSWHIDs(client.traverse(request4));
+        ArrayList<SWHID> expectedNodes4 = new ArrayList<>();
+        expectedNodes4.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013"));
+        GraphTest.assertEqualsAnyOrder(expectedNodes4, actuals4);
+
+        TraversalRequest request5 = getNeighborsRequestBuilder(fakeSWHID("snp", 20))
+                .setDirection(GraphDirection.BACKWARD).build();
+        ArrayList<SWHID> actuals5 = getSWHIDs(client.traverse(request5));
+        ArrayList<SWHID> expectedNodes5 = new ArrayList<>();
+        expectedNodes5.add(new SWHID(TEST_ORIGIN_ID));
+        GraphTest.assertEqualsAnyOrder(expectedNodes5, actuals5);
+    }
+
+    @Test
+    public void twoNeighbors() {
+        TraversalRequest request1 = getNeighborsRequestBuilder(fakeSWHID("snp", 20)).build();
+        ArrayList<SWHID> actuals1 = getSWHIDs(client.traverse(request1));
+        ArrayList<SWHID> expectedNodes1 = new ArrayList<>();
+        expectedNodes1.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010"));
+        expectedNodes1.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000009"));
+        GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1);
+
+        TraversalRequest request2 = getNeighborsRequestBuilder(fakeSWHID("dir", 8)).setEdges("dir:cnt").build();
+        ArrayList<SWHID> actuals2 = getSWHIDs(client.traverse(request2));
+        ArrayList<SWHID> expectedNodes2 = new ArrayList<>();
+        expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001"));
+        expectedNodes2.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007"));
+        GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2);
+
+        TraversalRequest request3 = getNeighborsRequestBuilder(fakeSWHID("cnt", 1))
+                .setDirection(GraphDirection.BACKWARD).build();
+        ArrayList<SWHID> actuals3 = getSWHIDs(client.traverse(request3));
+        ArrayList<SWHID> expectedNodes3 = new ArrayList<>();
+        expectedNodes3.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000008"));
+        expectedNodes3.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000002"));
+        GraphTest.assertEqualsAnyOrder(expectedNodes3, actuals3);
+
+        TraversalRequest request4 = getNeighborsRequestBuilder(fakeSWHID("rev", 9))
+                .setDirection(GraphDirection.BACKWARD).setEdges("rev:snp,rev:rel").build();
+        ArrayList<SWHID> actuals4 = getSWHIDs(client.traverse(request4));
+        ArrayList<SWHID> expectedNodes4 = new ArrayList<>();
+        expectedNodes4.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020"));
+        expectedNodes4.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010"));
+        GraphTest.assertEqualsAnyOrder(expectedNodes4, actuals4);
+    }
+
+    @Test
+    public void threeNeighbors() {
+        TraversalRequest request1 = getNeighborsRequestBuilder(fakeSWHID("dir", 8)).build();
+        ArrayList<SWHID> actuals1 = getSWHIDs(client.traverse(request1));
+        ArrayList<SWHID> expectedNodes1 = new ArrayList<>();
+        expectedNodes1.add(new SWHID("swh:1:dir:0000000000000000000000000000000000000006"));
+        expectedNodes1.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000001"));
+        expectedNodes1.add(new SWHID("swh:1:cnt:0000000000000000000000000000000000000007"));
+        GraphTest.assertEqualsAnyOrder(expectedNodes1, actuals1);
+
+        TraversalRequest request2 = getNeighborsRequestBuilder(fakeSWHID("rev", 9))
+                .setDirection(GraphDirection.BACKWARD).build();
+        ArrayList<SWHID> actuals2 = getSWHIDs(client.traverse(request2));
+        ArrayList<SWHID> expectedNodes2 = new ArrayList<>();
+        expectedNodes2.add(new SWHID("swh:1:snp:0000000000000000000000000000000000000020"));
+        expectedNodes2.add(new SWHID("swh:1:rel:0000000000000000000000000000000000000010"));
+        expectedNodes2.add(new SWHID("swh:1:rev:0000000000000000000000000000000000000013"));
+        GraphTest.assertEqualsAnyOrder(expectedNodes2, actuals2);
+    }
+}
diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesPropertiesTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesPropertiesTest.java
new file mode 100644
index 0000000..9a0ab38
--- /dev/null
+++ b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesPropertiesTest.java
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.rpc;
+
+import com.google.protobuf.Descriptors;
+import com.google.protobuf.FieldMask;
+import com.google.protobuf.Message;
+import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
+import org.junit.jupiter.api.Test;
+import org.softwareheritage.graph.SWHID;
+import org.softwareheritage.graph.SwhUnidirectionalGraph;
+import org.softwareheritage.graph.labels.DirEntry;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TraverseNodesPropertiesTest extends TraversalServiceTest {
+    private TraversalRequest.Builder getTraversalRequestBuilder(SWHID src) {
+        return TraversalRequest.newBuilder().addSrc(src.toString());
+    }
+
+    private void checkHasAllFields(Message m) {
+        for (Descriptors.FieldDescriptor fd : m.getAllFields().keySet()) {
+            assertTrue(m.hasField(fd));
+        }
+    }
+
+    private void checkHasAllFieldsOfType(Node node) {
+        if (node.hasCnt()) {
+            checkHasAllFields(node.getCnt());
+        }
+        if (node.hasRev()) {
+            checkHasAllFields(node.getRev());
+        }
+        if (node.hasRel()) {
+            checkHasAllFields(node.getRel());
+        }
+        if (node.hasOri()) {
+            checkHasAllFields(node.getOri());
+        }
+    }
+
+    private void checkSuccessors(SwhUnidirectionalGraph g, Node node) {
+        HashMap<String, DirEntry[]> graphSuccessors = new HashMap<>();
+        ArcLabelledNodeIterator.LabelledArcIterator it = g.labelledSuccessors(g.getNodeId(new SWHID(node.getSwhid())));
+        long succ;
+        while ((succ = it.nextLong()) != -1) {
+            graphSuccessors.put(g.getSWHID(succ).toString(), (DirEntry[]) it.label().get());
+        }
+
+        assertEquals(node.getSuccessorList().stream().map(Successor::getSwhid).collect(Collectors.toSet()),
+                graphSuccessors.keySet());
+
+        for (Successor successor : node.getSuccessorList()) {
+            DirEntry[] expectedArray = graphSuccessors.get(successor.getSwhid());
+            HashMap<String, Integer> expectedLabels = new HashMap<>();
+            for (DirEntry dirEntry : expectedArray) {
+                expectedLabels.put(new String(g.getLabelName(dirEntry.filenameId)), dirEntry.permission);
+            }
+            for (EdgeLabel edgeLabel : successor.getLabelList()) {
+                assertTrue(expectedLabels.containsKey(edgeLabel.getName().toStringUtf8()));
+                if (edgeLabel.getPermission() > 0) {
+                    assertEquals(edgeLabel.getPermission(), expectedLabels.get(edgeLabel.getName().toStringUtf8()));
+                }
+            }
+        }
+    }
+
+    @Test
+    public void forwardFromRoot() {
+        ArrayList<Node> response = new ArrayList<>();
+        client.traverse(getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID)).build()).forEachRemaining(response::add);
+        for (Node node : response) {
+            checkHasAllFieldsOfType(node);
+            checkSuccessors(g.getForwardGraph(), node);
+        }
+    }
+
+    @Test
+    public void backwardFromLeaf() {
+        ArrayList<Node> response = new ArrayList<>();
+        client.traverse(getTraversalRequestBuilder(fakeSWHID("cnt", 4)).setDirection(GraphDirection.BACKWARD).build())
+                .forEachRemaining(response::add);
+        for (Node node : response) {
+            checkHasAllFieldsOfType(node);
+            checkSuccessors(g.getBackwardGraph(), node);
+        }
+    }
+
+    @Test
+    public void forwardFromRootMaskedLabels() {
+        ArrayList<Node> response = new ArrayList<>();
+        client.traverse(getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID))
+                .setMask(FieldMask.newBuilder().addPaths("successor.swhid").addPaths("swhid").build()).build())
+                .forEachRemaining(response::add);
+        for (Node node : response) {
+            HashSet<String> graphSuccessors = new HashSet<>();
+            ArcLabelledNodeIterator.LabelledArcIterator it = g
+                    .labelledSuccessors(g.getNodeId(new SWHID(node.getSwhid())));
+            long succ;
+            while ((succ = it.nextLong()) != -1) {
+                graphSuccessors.add(g.getSWHID(succ).toString());
+            }
+
+            assertEquals(node.getSuccessorList().stream().map(Successor::getSwhid).collect(Collectors.toSet()),
+                    graphSuccessors);
+        }
+    }
+}
diff --git a/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesTest.java b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesTest.java
new file mode 100644
index 0000000..fe88c5f
--- /dev/null
+++ b/java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesTest.java
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.rpc;
+
+import io.grpc.Status;
+import io.grpc.StatusRuntimeException;
+import org.junit.jupiter.api.Test;
+import org.softwareheritage.graph.GraphTest;
+import org.softwareheritage.graph.SWHID;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class TraverseNodesTest extends TraversalServiceTest {
+    private TraversalRequest.Builder getTraversalRequestBuilder(SWHID src) {
+        return TraversalRequest.newBuilder().addSrc(src.toString());
+    }
+
+    @Test
+    public void testSrcErrors() {
+        StatusRuntimeException thrown;
+        thrown = assertThrows(StatusRuntimeException.class,
+                () -> client.traverse(TraversalRequest.newBuilder().addSrc(fakeSWHID("cnt", 404).toString()).build())
+                        .forEachRemaining((n) -> {
+                        }));
+        assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode());
+        thrown = assertThrows(StatusRuntimeException.class,
+                () -> client
+                        .traverse(TraversalRequest.newBuilder()
+                                .addSrc("swh:1:lol:0000000000000000000000000000000000000001").build())
+                        .forEachRemaining((n) -> {
+                        }));
+        assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode());
+        thrown = assertThrows(StatusRuntimeException.class,
+                () -> client
+                        .traverse(TraversalRequest.newBuilder()
+                                .addSrc("swh:1:cnt:000000000000000000000000000000000000000z").build())
+                        .forEachRemaining((n) -> {
+                        }));
+        assertEquals(Status.INVALID_ARGUMENT.getCode(), thrown.getStatus().getCode());
+    }
+
+    @Test
+    public void forwardFromRoot() {
+        ArrayList<SWHID> actual = getSWHIDs(
+                client.traverse(getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID)).build()));
+        List<SWHID> expected = List.of(fakeSWHID("cnt", 1), fakeSWHID("cnt", 4), fakeSWHID("cnt", 5),
+                fakeSWHID("cnt", 7), fakeSWHID("dir", 2), fakeSWHID("dir", 6), fakeSWHID("dir", 8),
+                fakeSWHID("rel", 10), fakeSWHID("rev", 3), fakeSWHID("rev", 9), fakeSWHID("snp", 20),
+                new SWHID(TEST_ORIGIN_ID));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void forwardFromMiddle() {
+        ArrayList<SWHID> actual = getSWHIDs(client.traverse(getTraversalRequestBuilder(fakeSWHID("dir", 12)).build()));
+        List<SWHID> expected = List.of(fakeSWHID("cnt", 1), fakeSWHID("cnt", 4), fakeSWHID("cnt", 5),
+                fakeSWHID("cnt", 7), fakeSWHID("cnt", 11), fakeSWHID("dir", 6), fakeSWHID("dir", 8),
+                fakeSWHID("dir", 12));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void forwardRelRev() {
+        ArrayList<SWHID> actual = getSWHIDs(
+                client.traverse(getTraversalRequestBuilder(fakeSWHID("rel", 10)).setEdges("rel:rev,rev:rev").build()));
+        List<SWHID> expected = List.of(fakeSWHID("rel", 10), fakeSWHID("rev", 9), fakeSWHID("rev", 3));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void forwardFilterReturnedNodesDir() {
+        ArrayList<SWHID> actual = getSWHIDs(client.traverse(getTraversalRequestBuilder(fakeSWHID("rel", 10))
+                .setReturnNodes(NodeFilter.newBuilder().setTypes("dir").build()).build()));
+        List<SWHID> expected = List.of(fakeSWHID("dir", 2), fakeSWHID("dir", 8), fakeSWHID("dir", 6));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void backwardFromRoot() {
+        ArrayList<SWHID> actual = getSWHIDs(client.traverse(
+                getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID)).setDirection(GraphDirection.BACKWARD).build()));
+        List<SWHID> expected = List.of(new SWHID(TEST_ORIGIN_ID));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void backwardFromMiddle() {
+        ArrayList<SWHID> actual = getSWHIDs(client.traverse(
+                getTraversalRequestBuilder(fakeSWHID("dir", 12)).setDirection(GraphDirection.BACKWARD).build()));
+        List<SWHID> expected = List.of(fakeSWHID("dir", 12), fakeSWHID("rel", 19), fakeSWHID("rev", 13),
+                fakeSWHID("rev", 18));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void backwardFromLeaf() {
+        ArrayList<SWHID> actual = getSWHIDs(client.traverse(
+                getTraversalRequestBuilder(fakeSWHID("cnt", 4)).setDirection(GraphDirection.BACKWARD).build()));
+        List<SWHID> expected = List.of(new SWHID(TEST_ORIGIN_ID), fakeSWHID("cnt", 4), fakeSWHID("dir", 6),
+                fakeSWHID("dir", 8), fakeSWHID("dir", 12), fakeSWHID("rel", 10), fakeSWHID("rel", 19),
+                fakeSWHID("rev", 9), fakeSWHID("rev", 13), fakeSWHID("rev", 18), fakeSWHID("snp", 20));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void forwardSnpToRev() {
+        ArrayList<SWHID> actual = getSWHIDs(
+                client.traverse(getTraversalRequestBuilder(fakeSWHID("snp", 20)).setEdges("snp:rev").build()));
+        List<SWHID> expected = List.of(fakeSWHID("rev", 9), fakeSWHID("snp", 20));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void forwardRelToRevRevToRev() {
+        ArrayList<SWHID> actual = getSWHIDs(
+                client.traverse(getTraversalRequestBuilder(fakeSWHID("rel", 10)).setEdges("rel:rev,rev:rev").build()));
+        List<SWHID> expected = List.of(fakeSWHID("rel", 10), fakeSWHID("rev", 3), fakeSWHID("rev", 9));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void forwardRevToAllDirToAll() {
+        ArrayList<SWHID> actual = getSWHIDs(
+                client.traverse(getTraversalRequestBuilder(fakeSWHID("rev", 13)).setEdges("rev:*,dir:*").build()));
+        List<SWHID> expected = List.of(fakeSWHID("cnt", 1), fakeSWHID("cnt", 4), fakeSWHID("cnt", 5),
+                fakeSWHID("cnt", 7), fakeSWHID("cnt", 11), fakeSWHID("dir", 2), fakeSWHID("dir", 6),
+                fakeSWHID("dir", 8), fakeSWHID("dir", 12), fakeSWHID("rev", 3), fakeSWHID("rev", 9),
+                fakeSWHID("rev", 13));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void forwardSnpToAllRevToAll() {
+        ArrayList<SWHID> actual = getSWHIDs(
+                client.traverse(getTraversalRequestBuilder(fakeSWHID("snp", 20)).setEdges("snp:*,rev:*").build()));
+        List<SWHID> expected = List.of(fakeSWHID("dir", 2), fakeSWHID("dir", 8), fakeSWHID("rel", 10),
+                fakeSWHID("rev", 3), fakeSWHID("rev", 9), fakeSWHID("snp", 20));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void forwardNoEdges() {
+        ArrayList<SWHID> actual = getSWHIDs(
+                client.traverse(getTraversalRequestBuilder(fakeSWHID("snp", 20)).setEdges("").build()));
+        List<SWHID> expected = List.of(fakeSWHID("snp", 20));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void backwardRevToRevRevToRel() {
+        ArrayList<SWHID> actual = getSWHIDs(client.traverse(getTraversalRequestBuilder(fakeSWHID("rev", 3))
+                .setEdges("rev:rev,rev:rel").setDirection(GraphDirection.BACKWARD).build()));
+        List<SWHID> expected = List.of(fakeSWHID("rel", 10), fakeSWHID("rel", 19), fakeSWHID("rev", 3),
+                fakeSWHID("rev", 9), fakeSWHID("rev", 13), fakeSWHID("rev", 18));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void forwardFromRootNodesOnly() {
+        ArrayList<SWHID> actual = getSWHIDs(
+                client.traverse(getTraversalRequestBuilder(new SWHID(TEST_ORIGIN_ID)).build()));
+        List<SWHID> expected = List.of(new SWHID(TEST_ORIGIN_ID), fakeSWHID("cnt", 1), fakeSWHID("cnt", 4),
+                fakeSWHID("cnt", 5), fakeSWHID("cnt", 7), fakeSWHID("dir", 2), fakeSWHID("dir", 6), fakeSWHID("dir", 8),
+                fakeSWHID("rel", 10), fakeSWHID("rev", 3), fakeSWHID("rev", 9), fakeSWHID("snp", 20));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void backwardRevToAllNodesOnly() {
+        ArrayList<SWHID> actual = getSWHIDs(client.traverse(getTraversalRequestBuilder(fakeSWHID("rev", 3))
+                .setDirection(GraphDirection.BACKWARD).setEdges("rev:*").build()));
+        List<SWHID> expected = List.of(fakeSWHID("rel", 10), fakeSWHID("rel", 19), fakeSWHID("rev", 3),
+                fakeSWHID("rev", 9), fakeSWHID("rev", 13), fakeSWHID("rev", 18), fakeSWHID("snp", 20));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void forwardMultipleSources() {
+        ArrayList<SWHID> actual = getSWHIDs(client.traverse(getTraversalRequestBuilder(fakeSWHID("snp", 20))
+                .addSrc(fakeSWHID("rel", 19).toString()).setMaxDepth(1).build()));
+        List<SWHID> expected = List.of(fakeSWHID("snp", 20), fakeSWHID("rel", 19), fakeSWHID("rel", 10),
+                fakeSWHID("rev", 9), fakeSWHID("rev", 18));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    @Test
+    public void backwardMultipleSources() {
+        ArrayList<SWHID> actual = getSWHIDs(client.traverse(getTraversalRequestBuilder(fakeSWHID("cnt", 5))
+                .addSrc(fakeSWHID("dir", 16).toString()).setMaxDepth(2).setDirection(GraphDirection.BACKWARD).build()));
+        List<SWHID> expected = List.of(fakeSWHID("cnt", 5), fakeSWHID("dir", 16), fakeSWHID("dir", 6),
+                fakeSWHID("dir", 8), fakeSWHID("dir", 17), fakeSWHID("rev", 18));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    // Go from rel 19 with various max depths
+    @Test
+    public void maxDepth() {
+        TraversalRequest.Builder builder = getTraversalRequestBuilder(fakeSWHID("rel", 19));
+
+        ArrayList<SWHID> actual;
+        List<SWHID> expected;
+
+        actual = getSWHIDs(client.traverse(builder.setMaxDepth(0).build()));
+        expected = List.of(fakeSWHID("rel", 19));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+
+        actual = getSWHIDs(client.traverse(builder.setMaxDepth(1).build()));
+        expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+
+        actual = getSWHIDs(client.traverse(builder.setMaxDepth(2).build()));
+        expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("rev", 13), fakeSWHID("dir", 17));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+
+        actual = getSWHIDs(client.traverse(builder.setMaxDepth(3).build()));
+        expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("rev", 13), fakeSWHID("dir", 17),
+                fakeSWHID("rev", 9), fakeSWHID("dir", 12), fakeSWHID("dir", 16), fakeSWHID("cnt", 14));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+
+    // Go from rel 19 with various max edges
+    @Test
+    public void maxEdges() {
+        TraversalRequest.Builder builder = getTraversalRequestBuilder(fakeSWHID("rel", 19));
+
+        ArrayList<SWHID> actual;
+        List<SWHID> expected;
+
+        actual = getSWHIDs(client.traverse(builder.setMaxEdges(1).build()));
+        expected = List.of(fakeSWHID("rel", 19));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+
+        actual = getSWHIDs(client.traverse(builder.setMaxEdges(3).build()));
+        expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+
+        actual = getSWHIDs(client.traverse(builder.setMaxEdges(7).build()));
+        expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("rev", 13), fakeSWHID("dir", 17),
+                fakeSWHID("cnt", 14));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+
+        actual = getSWHIDs(client.traverse(builder.setMaxEdges(12).build()));
+        expected = List.of(fakeSWHID("rel", 19), fakeSWHID("rev", 18), fakeSWHID("rev", 13), fakeSWHID("dir", 17),
+                fakeSWHID("rev", 9), fakeSWHID("dir", 12), fakeSWHID("dir", 16), fakeSWHID("cnt", 14),
+                fakeSWHID("cnt", 15));
+        GraphTest.assertEqualsAnyOrder(expected, actual);
+    }
+}
diff --git a/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java b/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java
new file mode 100644
index 0000000..ebc92a7
--- /dev/null
+++ b/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.utils;
+
+import it.unimi.dsi.fastutil.BigArrays;
+import it.unimi.dsi.fastutil.longs.LongArrays;
+import org.junit.jupiter.api.Test;
+
+import java.util.Random;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class ForkJoinBigQuickSort2Test {
+    private static long[] identity(final int n) {
+        final long[] perm = new long[n];
+        for (int i = perm.length; i-- != 0;)
+            perm[i] = i;
+        return perm;
+    }
+
+    private static void checkArraySorted(long[] x, long[] y) {
+        checkArraySorted(x, y, 0, x.length);
+    }
+
+    private static void checkArraySorted(long[] x, long[] y, int from, int to) {
+        for (int i = to - 1; i-- != from;)
+            assertTrue(x[i] < x[i + 1] || x[i] == x[i + 1] && (y[i] < y[i + 
1] || y[i] == y[i + 1]), + String.format("%d: <%d, %d>, <%d, %d>", i, x[i], y[i], x[i + 1], y[i + 1])); + } + + private static void sortBig2(long[] x, long[] y, long from, long to) { + ForkJoinBigQuickSort2.parallelQuickSort(BigArrays.wrap(x), BigArrays.wrap(y), from, to); + } + + private static void sortBig2(long[] x, long[] y) { + sortBig2(x, y, 0, x.length); + } + + @Test + public void testParallelQuickSort3() { + final long[][] d = new long[2][]; + + d[0] = new long[10]; + for (int i = d[0].length; i-- != 0;) + d[0][i] = 3 - i % 3; + d[1] = LongArrays.shuffle(identity(10), new Random(0)); + sortBig2(d[0], d[1]); + checkArraySorted(d[0], d[1]); + + d[0] = new long[100000]; + for (int i = d[0].length; i-- != 0;) + d[0][i] = 100 - i % 100; + d[1] = LongArrays.shuffle(identity(100000), new Random(6)); + sortBig2(d[0], d[1]); + checkArraySorted(d[0], d[1]); + + d[0] = new long[10]; + for (int i = d[0].length; i-- != 0;) + d[0][i] = i % 3 - 2; + Random random = new Random(0); + d[1] = new long[d[0].length]; + for (int i = d[1].length; i-- != 0;) + d[1][i] = random.nextInt(); + sortBig2(d[0], d[1]); + checkArraySorted(d[0], d[1]); + + d[0] = new long[100000]; + d[1] = new long[100000]; + sortBig2(d[0], d[1]); + checkArraySorted(d[0], d[1]); + + d[0] = new long[100000]; + random = new Random(0); + for (int i = d[0].length; i-- != 0;) + d[0][i] = random.nextInt(); + d[1] = new long[d[0].length]; + for (int i = d[1].length; i-- != 0;) + d[1][i] = random.nextInt(); + sortBig2(d[0], d[1]); + checkArraySorted(d[0], d[1]); + for (int i = 100; i-- != 10;) + d[0][i] = random.nextInt(); + for (int i = 100; i-- != 10;) + d[1][i] = random.nextInt(); + sortBig2(d[0], d[1], 10, 100); + checkArraySorted(d[0], d[1], 10, 100); + } +} diff --git a/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3Test.java b/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3Test.java new file mode 100644 index 0000000..1f1fa38 --- /dev/null +++ b/java/src/test/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3Test.java @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022 The Software Heritage developers + * See the AUTHORS file at the top-level directory of this distribution + * License: GNU General Public License version 3, or any later version + * See top-level LICENSE file for more information + */ + +package org.softwareheritage.graph.utils; + +import it.unimi.dsi.fastutil.longs.LongArrays; +import org.junit.jupiter.api.Test; + +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class ForkJoinQuickSort3Test { + private static long[] identity(final int n) { + final long[] perm = new long[n]; + for (int i = perm.length; i-- != 0;) + perm[i] = i; + return perm; + } + + private static void checkArraySorted(long[] x, long[] y, long[] z) { + checkArraySorted(x, y, z, 0, x.length); + } + + private static void checkArraySorted(long[] x, long[] y, long[] z, int from, int to) { + for (int i = to - 1; i-- != from;) + assertTrue(x[i] < x[i + 1] || x[i] == x[i + 1] && (y[i] < y[i + 1] || y[i] == y[i + 1] && z[i] <= z[i + 1]), + String.format("%d: <%d, %d, %d>, <%d, %d, %d>", i, x[i], y[i], z[i], x[i + 1], y[i + 1], z[i + 1])); + } + + @Test + public void testParallelQuickSort3() { + final long[][] d = new long[3][]; + + d[0] = new long[10]; + for (int i = d[0].length; i-- != 0;) + d[0][i] = 3 - i % 3; + d[1] = LongArrays.shuffle(identity(10), new Random(0)); + d[2] = LongArrays.shuffle(identity(10), new Random(1)); + 
ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2]); + checkArraySorted(d[0], d[1], d[2]); + + d[0] = new long[100000]; + for (int i = d[0].length; i-- != 0;) + d[0][i] = 100 - i % 100; + d[1] = LongArrays.shuffle(identity(100000), new Random(6)); + d[2] = LongArrays.shuffle(identity(100000), new Random(7)); + ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2]); + checkArraySorted(d[0], d[1], d[2]); + + d[0] = new long[10]; + for (int i = d[0].length; i-- != 0;) + d[0][i] = i % 3 - 2; + Random random = new Random(0); + d[1] = new long[d[0].length]; + for (int i = d[1].length; i-- != 0;) + d[1][i] = random.nextInt(); + d[2] = new long[d[0].length]; + for (int i = d[2].length; i-- != 0;) + d[2][i] = random.nextInt(); + ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2]); + checkArraySorted(d[0], d[1], d[2]); + + d[0] = new long[100000]; + d[1] = new long[100000]; + d[2] = new long[100000]; + for (int i = d[0].length; i-- != 0;) + d[2][i] = random.nextInt(); + ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2]); + checkArraySorted(d[0], d[1], d[2]); + + d[0] = new long[100000]; + random = new Random(0); + for (int i = d[0].length; i-- != 0;) + d[0][i] = random.nextInt(); + d[1] = new long[d[0].length]; + for (int i = d[1].length; i-- != 0;) + d[1][i] = random.nextInt(); + d[2] = new long[d[0].length]; + for (int i = d[2].length; i-- != 0;) + d[2][i] = random.nextInt(); + ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2]); + checkArraySorted(d[0], d[1], d[2]); + for (int i = 100; i-- != 10;) + d[0][i] = random.nextInt(); + for (int i = 100; i-- != 10;) + d[1][i] = random.nextInt(); + for (int i = 100; i-- != 10;) + d[2][i] = random.nextInt(); + ForkJoinQuickSort3.parallelQuickSort(d[0], d[1], d[2], 10, 100); + checkArraySorted(d[0], d[1], d[2], 10, 100); + } +} diff --git a/mypy.ini b/mypy.ini index 0f1b85c..147a027 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,18 +1,21 @@ [mypy] namespace_packages = True warn_unused_ignores = True +exclude = (?x)( + ^swh/graph/rpc + ) # 3rd party libraries without stubs (yet) [mypy-pkg_resources.*] ignore_missing_imports = True [mypy-psutil.*] ignore_missing_imports = True [mypy-py4j.*] ignore_missing_imports = True [mypy-pytest.*] ignore_missing_imports = True diff --git a/proto/swhgraph.proto b/proto/swhgraph.proto new file mode 100644 index 0000000..7c40a6e --- /dev/null +++ b/proto/swhgraph.proto @@ -0,0 +1,316 @@ +syntax = "proto3"; + +import "google/protobuf/field_mask.proto"; + +option java_multiple_files = true; +option java_package = "org.softwareheritage.graph.rpc"; +option java_outer_classname = "GraphService"; + +package swh.graph; + +/* Graph traversal service */ +service TraversalService { + /* GetNode returns a single Node and its properties. */ + rpc GetNode (GetNodeRequest) returns (Node); + + /* Traverse performs a breadth-first graph traversal from a set of source + * nodes, then streams the nodes it encounters (if they match a given + * return filter), along with their properties. + */ + rpc Traverse (TraversalRequest) returns (stream Node); + + /* FindPathTo searches for a shortest path between a set of source nodes + * and a node that matches a specific *criteria*. + * + * It does so by performing a breadth-first search from the source node, + * until any node that matches the given criteria is found, then follows + * back its parents to return a shortest path from the source set to that + * node. 
+     */
+    rpc FindPathTo (FindPathToRequest) returns (Path);
+
+    /* FindPathBetween searches for a shortest path between a set of source
+     * nodes and a set of destination nodes.
+     *
+     * It does so by performing a *bidirectional breadth-first search*, i.e.,
+     * two parallel breadth-first searches, one from the source set ("src-BFS")
+     * and one from the destination set ("dst-BFS"), until both searches find a
+     * common node that joins their visited sets. This node is called the
+     * "midpoint node".
+     * The path returned is the path src -> ... -> midpoint -> ... -> dst,
+     * which is always a shortest path between src and dst.
+     *
+     * The graph direction of both BFS can be configured separately. By
+     * default, the dst-BFS will use the graph in the opposite direction from
+     * the src-BFS (if direction = FORWARD, by default direction_reverse =
+     * BACKWARD, and vice-versa). The default behavior is thus to search for
+     * a shortest path between two nodes in a given direction. However, one
+     * can also specify FORWARD or BACKWARD for *both* the src-BFS and the
+     * dst-BFS. This will search for a common descendant or a common ancestor
+     * between the two sets, respectively. These will be the midpoints of the
+     * returned path.
+     */
+    rpc FindPathBetween (FindPathBetweenRequest) returns (Path);
+
+    /* CountNodes does the same as Traverse, but only returns the number of
+     * nodes accessed during the traversal. */
+    rpc CountNodes (TraversalRequest) returns (CountResponse);
+
+    /* CountEdges does the same as Traverse, but only returns the number of
+     * edges accessed during the traversal. */
+    rpc CountEdges (TraversalRequest) returns (CountResponse);
+
+    /* Stats returns various statistics on the overall graph. */
+    rpc Stats (StatsRequest) returns (StatsResponse);
+}
+
+/* Direction of the graph */
+enum GraphDirection {
+    /* Forward DAG: ori -> snp -> rel -> rev -> dir -> cnt */
+    FORWARD = 0;
+    /* Transposed DAG: cnt -> dir -> rev -> rel -> snp -> ori */
+    BACKWARD = 1;
+}
+
+/* Describes a node to return */
+message GetNodeRequest {
+    /* SWHID of the node to return */
+    string swhid = 1;
+    /* FieldMask of which fields are to be returned (e.g., "swhid,cnt.length").
+     * By default, all fields are returned. */
+    optional google.protobuf.FieldMask mask = 8;
+}
+
+/* TraversalRequest describes how a breadth-first traversal should be
+ * performed, and what should be returned to the client. */
+message TraversalRequest {
+    /* Set of source nodes (SWHIDs) */
+    repeated string src = 1;
+    /* Direction of the graph to traverse. Defaults to FORWARD. */
+    GraphDirection direction = 2;
+    /* Edge restriction string (e.g. "rev:dir,dir:cnt").
+     * Defaults to "*" (all). */
+    optional string edges = 3;
+    /* Maximum number of edges accessed in the traversal, after which it stops.
+     * Defaults to infinite. */
+    optional int64 max_edges = 4;
+    /* Do not return nodes with a depth lower than this number.
+     * By default, all depths are returned. */
+    optional int64 min_depth = 5;
+    /* Maximum depth of the traversal, after which it stops.
+     * Defaults to infinite. */
+    optional int64 max_depth = 6;
+    /* Filter which nodes will be sent to the stream. By default, all nodes are
+     * returned. */
+    optional NodeFilter return_nodes = 7;
+    /* FieldMask of which fields are to be returned (e.g., "swhid,cnt.length").
+     * By default, all fields are returned. */
+    optional google.protobuf.FieldMask mask = 8;
+}
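Together with the service definition above, TraversalRequest is all a Java client needs to run a traversal end to end. The following is a minimal sketch using the generated blocking stub (the same stub the test suite exercises in-process); the host and port are assumptions that depend on how GraphServer was started, and the release SWHID is a placeholder from the test graph:

    import com.google.protobuf.FieldMask;
    import io.grpc.ManagedChannel;
    import io.grpc.ManagedChannelBuilder;
    import org.softwareheritage.graph.rpc.*;

    public class TraverseSketch {
        public static void main(String[] args) {
            // Assumption: a GraphServer instance is listening on this address.
            ManagedChannel channel = ManagedChannelBuilder.forAddress("localhost", 50091).usePlaintext().build();
            TraversalServiceGrpc.TraversalServiceBlockingStub client = TraversalServiceGrpc.newBlockingStub(channel);

            TraversalRequest request = TraversalRequest.newBuilder()
                    .addSrc("swh:1:rel:0000000000000000000000000000000000000019")
                    .setEdges("rel:rev,rev:rev") // follow only rel->rev and rev->rev edges
                    .setMask(FieldMask.newBuilder().addPaths("swhid").build()) // return bare SWHIDs
                    .build();

            // Traverse streams Node messages as the breadth-first search reaches them.
            client.traverse(request).forEachRemaining(node -> System.out.println(node.getSwhid()));
            channel.shutdownNow();
        }
    }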
+
+/* FindPathToRequest describes a request to find a shortest path between a
+ * set of nodes and a given target criteria, as well as what should be returned
+ * in the path.
+ */
+message FindPathToRequest {
+    /* Set of source nodes (SWHIDs) */
+    repeated string src = 1;
+    /* Target criteria, i.e., what constitutes a valid path destination. */
+    NodeFilter target = 2;
+    /* Direction of the graph to traverse. Defaults to FORWARD. */
+    GraphDirection direction = 3;
+    /* Edge restriction string (e.g. "rev:dir,dir:cnt").
+     * Defaults to "*" (all). */
+    optional string edges = 4;
+    /* Maximum number of edges accessed in the traversal, after which it stops.
+     * Defaults to infinite. */
+    optional int64 max_edges = 5;
+    /* Maximum depth of the traversal, after which it stops.
+     * Defaults to infinite. */
+    optional int64 max_depth = 6;
+    /* FieldMask of which fields are to be returned (e.g., "swhid,cnt.length").
+     * By default, all fields are returned. */
+    optional google.protobuf.FieldMask mask = 7;
+}
+
+/* FindPathBetweenRequest describes a request to find a shortest path between a
+ * set of source nodes and a set of destination nodes. It works by performing a
+ * bidirectional breadth-first traversal from both sets at the same time.
+ */
+message FindPathBetweenRequest {
+    /* Set of source nodes (SWHIDs) */
+    repeated string src = 1;
+    /* Set of destination nodes (SWHIDs) */
+    repeated string dst = 2;
+    /* Direction of the graph to traverse from the source set. Defaults to
+     * FORWARD. */
+    GraphDirection direction = 3;
+    /* Direction of the graph to traverse from the destination set. Defaults to
+     * the opposite of `direction`. If direction and direction_reverse are
+     * identical, it will find the first common successor of both sets in the
+     * given direction. */
+    optional GraphDirection direction_reverse = 4;
+    /* Edge restriction string for the traversal from the source set.
+     * (e.g. "rev:dir,dir:cnt"). Defaults to "*" (all). */
+    optional string edges = 5;
+    /* Edge restriction string for the reverse traversal from the destination
+     * set.
+     * If not specified:
+     *   - If `edges` is not specified either, defaults to "*"
+     *   - If direction == direction_reverse, defaults to `edges`
+     *   - If direction != direction_reverse, defaults to the reverse of `edges`
+     *     (e.g. "rev:dir" becomes "dir:rev").
+     */
+    optional string edges_reverse = 6;
+    /* Maximum number of edges accessed in the traversal, after which it stops.
+     * Defaults to infinite. */
+    optional int64 max_edges = 7;
+    /* Maximum depth of the traversal, after which it stops.
+     * Defaults to infinite. */
+    optional int64 max_depth = 8;
+    /* FieldMask of which fields are to be returned (e.g., "swhid,cnt.length").
+     * By default, all fields are returned. */
+    optional google.protobuf.FieldMask mask = 9;
+}
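When direction and direction_reverse are set to the same value, FindPathBetween turns into the common-ancestor / common-descendant search described above, and midpoint_index (defined on Path further down) identifies the meeting node. A sketch under the same assumptions as the previous example, with two content SWHIDs from the test graph as placeholders:

    // Both searches walk the BACKWARD graph, so the midpoint where they meet is
    // a common ancestor of the two contents, i.e. a node (directory, revision,
    // release...) from which both are reachable in the forward graph.
    FindPathBetweenRequest request = FindPathBetweenRequest.newBuilder()
            .addSrc("swh:1:cnt:0000000000000000000000000000000000000004")
            .addDst("swh:1:cnt:0000000000000000000000000000000000000014")
            .setDirection(GraphDirection.BACKWARD)
            .setDirectionReverse(GraphDirection.BACKWARD)
            .build();
    Path path = client.findPathBetween(request);
    Node commonAncestor = path.getNode(path.getMidpointIndex());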
+
+/* Represents various criteria that make a given node "valid". A node is
+ * only valid if all the subcriteria present in this message are fulfilled.
+ */
+message NodeFilter {
+    /* Node restriction string. (e.g. "dir,cnt,rev"). Defaults to "*" (all). */
+    optional string types = 1;
+    /* Minimum number of successors encountered *during the traversal*.
+     * Default: no constraint */
+    optional int64 min_traversal_successors = 2;
+    /* Maximum number of successors encountered *during the traversal*.
+     * Default: no constraint */
+    optional int64 max_traversal_successors = 3;
+}
+
+/* Represents a node in the graph. */
+message Node {
+    /* The SWHID of the graph node. */
+    string swhid = 1;
+    /* List of relevant successors of this node. */
+    repeated Successor successor = 2;
+    /* Number of relevant successors. */
+    optional int64 num_successors = 9;
+    /* Node properties */
+    oneof data {
+        ContentData cnt = 3;
+        RevisionData rev = 5;
+        ReleaseData rel = 6;
+        OriginData ori = 8;
+    };
+}
+
+/* Represents a path in the graph. */
+message Path {
+    /* List of nodes in the path, from source to destination */
+    repeated Node node = 1;
+    /* Index of the "midpoint" of the path. For paths obtained with
+     * bidirectional search queries, this is the node that joined the two
+     * sets together. When looking for a common ancestor between two nodes by
+     * performing a FindPathBetween search with two backward graphs, this will
+     * be the index of the common ancestor in the path. */
+    optional int32 midpoint_index = 2;
+}
+
+/* Represents a successor of a given node. */
+message Successor {
+    /* The SWHID of the successor */
+    optional string swhid = 1;
+    /* A list of edge labels for the given edge */
+    repeated EdgeLabel label = 2;
+}
+
+/* Content node properties */
+message ContentData {
+    /* Length of the blob, in bytes */
+    optional int64 length = 1;
+    /* Whether the content was skipped during ingestion. */
+    optional bool is_skipped = 2;
+}
+
+/* Revision node properties */
+message RevisionData {
+    /* Revision author ID (anonymized) */
+    optional int64 author = 1;
+    /* UNIX timestamp of the revision date (UTC) */
+    optional int64 author_date = 2;
+    /* Timezone of the revision author date as an offset from UTC */
+    optional int32 author_date_offset = 3;
+    /* Revision committer ID (anonymized) */
+    optional int64 committer = 4;
+    /* UNIX timestamp of the revision committer date (UTC) */
+    optional int64 committer_date = 5;
+    /* Timezone of the revision committer date as an offset from UTC */
+    optional int32 committer_date_offset = 6;
+    /* Revision message */
+    optional bytes message = 7;
+}
+
+/* Release node properties */
+message ReleaseData {
+    /* Release author ID (anonymized) */
+    optional int64 author = 1;
+    /* UNIX timestamp of the release date (UTC) */
+    optional int64 author_date = 2;
+    /* Timezone of the release author date as an offset from UTC */
+    optional int32 author_date_offset = 3;
+    /* Release name */
+    optional bytes name = 4;
+    /* Release message */
+    optional bytes message = 5;
+}
+
+/* Origin node properties */
+message OriginData {
+    /* URL of the origin */
+    optional string url = 1;
+}
+
+message EdgeLabel {
+    /* Directory entry name for directories, branch name for snapshots */
+    bytes name = 1;
+    /* Entry permission (only set for directories). */
+    int32 permission = 2;
+}
+
+message CountResponse {
+    int64 count = 1;
+}
+
+message StatsRequest {
+}
+
+message StatsResponse {
+    /* Number of nodes in the graph */
+    int64 num_nodes = 1;
+    /* Number of edges in the graph */
+    int64 num_edges = 2;
+
+    /* Ratio between the graph size and the information-theoretical lower
+     * bound */
+    double compression_ratio = 3;
+    /* Number of bits per node (overall graph size in bits divided by the
+     * number of nodes) */
+    double bits_per_node = 4;
+    /* Number of bits per edge (overall graph size in bits divided by the
+     * number of arcs). 
*/ + double bits_per_edge = 5; + double avg_locality = 6; + + /* Smallest indegree */ + int64 indegree_min = 7; + /* Largest indegree */ + int64 indegree_max = 8; + /* Average indegree */ + double indegree_avg = 9; + /* Smallest outdegree */ + int64 outdegree_min = 10; + /* Largest outdegree */ + int64 outdegree_max = 11; + /* Average outdegree */ + double outdegree_avg = 12; +} diff --git a/pyproject.toml b/pyproject.toml index 69b8f4d..8c8af87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,16 @@ [tool.black] target-version = ['py37'] +extend-exclude = ''' +/( + | swh/graph/rpc +)/ +''' [tool.isort] multi_line_output = 3 include_trailing_comma = true force_grid_wrap = 0 use_parentheses = true ensure_newline_before_comments = true line_length = 88 force_sort_within_sections = true diff --git a/pytest.ini b/pytest.ini index 0f05569..ddc1b32 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,4 @@ [pytest] norecursedirs = build docs + +asyncio_mode = strict diff --git a/requirements-swh.txt b/requirements-swh.txt index 15b3bd5..9602b57 100644 --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,2 +1,3 @@ swh.core[http] >= 0.3 swh.model >= 0.13.0 +swh.dataset diff --git a/requirements-test.txt b/requirements-test.txt index c3a6f62..39099d2 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,5 +1,8 @@ pytest +pytest-asyncio types-click types-pyyaml types-requests +types-protobuf +grpc-stubs diff --git a/requirements.txt b/requirements.txt index c038073..3983067 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ aiohttp click py4j psutil +grpcio-tools +mypy-protobuf diff --git a/setup.cfg b/setup.cfg index 1d722c2..b3eac4d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,8 +1,10 @@ [flake8] -ignore = E203,E231,W503 +select = C,E,F,W,B950 +ignore = E203,E231,E501,W503 max-line-length = 88 +extend_exclude = swh/graph/rpc [egg_info] tag_build = tag_date = 0 diff --git a/swh.graph.egg-info/PKG-INFO b/swh.graph.egg-info/PKG-INFO index 4839ff0..d6f1fb7 100644 --- a/swh.graph.egg-info/PKG-INFO +++ b/swh.graph.egg-info/PKG-INFO @@ -1,56 +1,52 @@ Metadata-Version: 2.1 Name: swh.graph -Version: 0.5.2 +Version: 1.0.0 Summary: Software Heritage graph service Home-page: https://forge.softwareheritage.org/diffusion/DGRPH Author: Software Heritage developers Author-email: swh-devel@inria.fr -License: UNKNOWN Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest Project-URL: Funding, https://www.softwareheritage.org/donate Project-URL: Source, https://forge.softwareheritage.org/source/swh-graph Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-graph/ -Platform: UNKNOWN Classifier: Programming Language :: Python :: 3 Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3) Classifier: Operating System :: OS Independent Classifier: Development Status :: 3 - Alpha Requires-Python: >=3.7 Description-Content-Type: text/x-rst Provides-Extra: testing License-File: LICENSE License-File: AUTHORS Software Heritage - graph service ================================= Tooling and services, collectively known as ``swh-graph``, providing fast access to the graph representation of the `Software Heritage `_ `archive `_. The service is in-memory, based on a compressed representation of the Software Heritage Merkle DAG. Bibliography ------------ In addition to accompanying technical documentation, ``swh-graph`` is also described in the following scientific paper. 
If you publish results based on ``swh-graph``, please acknowledge it by citing the paper as follows: .. note:: Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli. `Ultra-Large-Scale Repository Analysis via Graph Compression `_. In proceedings of `SANER 2020 `_: The 27th IEEE International Conference on Software Analysis, Evolution and Reengineering, pages 184-194. IEEE 2020. Links: `preprint `_, `bibtex `_. - - diff --git a/swh.graph.egg-info/SOURCES.txt b/swh.graph.egg-info/SOURCES.txt index cca3c22..5a4af14 100644 --- a/swh.graph.egg-info/SOURCES.txt +++ b/swh.graph.egg-info/SOURCES.txt @@ -1,193 +1,256 @@ +.git-blame-ignore-revs .gitignore .pre-commit-config.yaml AUTHORS CODE_OF_CONDUCT.md CONTRIBUTORS LICENSE MANIFEST.in Makefile Makefile.local README.rst mypy.ini pyproject.toml pytest.ini requirements-swh.txt requirements-test.txt requirements.txt setup.cfg setup.py tox.ini docker/Dockerfile docker/build.sh docker/run.sh docs/.gitignore docs/Makefile docs/Makefile.local docs/README.rst docs/api.rst docs/cli.rst docs/compression.rst docs/conf.py docs/docker.rst docs/git2graph.md +docs/grpc-api.rst docs/index.rst +docs/java-api.rst +docs/memory.rst docs/quickstart.rst -docs/use-cases.rst docs/_static/.placeholder docs/_templates/.placeholder docs/images/.gitignore docs/images/Makefile docs/images/compression_steps.dot java/.coding-style.xml java/.gitignore java/AUTHORS java/LICENSE java/README.md java/pom.xml java/.mvn/jvm.config +java/src/main/proto java/src/main/java/org/softwareheritage/graph/AllowedEdges.java java/src/main/java/org/softwareheritage/graph/AllowedNodes.java -java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java -java/src/main/java/org/softwareheritage/graph/Entry.java -java/src/main/java/org/softwareheritage/graph/Graph.java -java/src/main/java/org/softwareheritage/graph/Node.java -java/src/main/java/org/softwareheritage/graph/NodesFiltering.java java/src/main/java/org/softwareheritage/graph/SWHID.java -java/src/main/java/org/softwareheritage/graph/Stats.java java/src/main/java/org/softwareheritage/graph/Subgraph.java -java/src/main/java/org/softwareheritage/graph/SwhPath.java -java/src/main/java/org/softwareheritage/graph/Traversal.java -java/src/main/java/org/softwareheritage/graph/algo/TopologicalTraversal.java -java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java -java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java -java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java -java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java -java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java -java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java -java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java -java/src/main/java/org/softwareheritage/graph/benchmark/utils/Statistics.java -java/src/main/java/org/softwareheritage/graph/benchmark/utils/Timing.java -java/src/main/java/org/softwareheritage/graph/experiments/forks/FindCommonAncestor.java -java/src/main/java/org/softwareheritage/graph/experiments/forks/FindPath.java +java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java +java/src/main/java/org/softwareheritage/graph/SwhGraph.java +java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java +java/src/main/java/org/softwareheritage/graph/SwhType.java +java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java +java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java 
+java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java +java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java +java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java +java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java +java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java +java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java +java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java +java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java +java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java -java/src/main/java/org/softwareheritage/graph/experiments/multiplicationfactor/GenDistribution.java java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java java/src/main/java/org/softwareheritage/graph/experiments/topology/ConnectedComponents.java java/src/main/java/org/softwareheritage/graph/experiments/topology/InOutDegree.java java/src/main/java/org/softwareheritage/graph/experiments/topology/SubdatasetSizeFunction.java -java/src/main/java/org/softwareheritage/graph/labels/AbstractLongListLabel.java java/src/main/java/org/softwareheritage/graph/labels/DirEntry.java -java/src/main/java/org/softwareheritage/graph/labels/FixedWidthLongListLabel.java java/src/main/java/org/softwareheritage/graph/labels/SwhLabel.java -java/src/main/java/org/softwareheritage/graph/maps/LabelMapBuilder.java -java/src/main/java/org/softwareheritage/graph/maps/MapFile.java java/src/main/java/org/softwareheritage/graph/maps/NodeIdMap.java -java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java java/src/main/java/org/softwareheritage/graph/maps/NodeTypesMap.java -java/src/main/java/org/softwareheritage/graph/server/App.java -java/src/main/java/org/softwareheritage/graph/server/Endpoint.java -java/src/main/java/org/softwareheritage/graph/utils/ComposePermutations.java +java/src/main/java/org/softwareheritage/graph/rpc/GraphServer.java +java/src/main/java/org/softwareheritage/graph/rpc/NodePropertyBuilder.java +java/src/main/java/org/softwareheritage/graph/rpc/Traversal.java +java/src/main/java/org/softwareheritage/graph/utils/DumpProperties.java java/src/main/java/org/softwareheritage/graph/utils/ExportSubdataset.java java/src/main/java/org/softwareheritage/graph/utils/FindEarliestRevision.java +java/src/main/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2.java +java/src/main/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3.java java/src/main/java/org/softwareheritage/graph/utils/MPHTranslate.java java/src/main/java/org/softwareheritage/graph/utils/ReadGraph.java java/src/main/java/org/softwareheritage/graph/utils/ReadLabelledGraph.java -java/src/main/java/org/softwareheritage/graph/utils/WriteRevisionTimestamps.java +java/src/main/java/org/softwareheritage/graph/utils/Sort.java java/src/test/java/org/softwareheritage/graph/AllowedEdgesTest.java +java/src/test/java/org/softwareheritage/graph/AllowedNodesTest.java java/src/test/java/org/softwareheritage/graph/GraphTest.java -java/src/test/java/org/softwareheritage/graph/LeavesTest.java 
-java/src/test/java/org/softwareheritage/graph/NeighborsTest.java java/src/test/java/org/softwareheritage/graph/SubgraphTest.java -java/src/test/java/org/softwareheritage/graph/VisitTest.java -java/src/test/java/org/softwareheritage/graph/WalkTest.java -java/target/swh-graph-0.5.2.jar +java/src/test/java/org/softwareheritage/graph/compress/ExtractNodesTest.java +java/src/test/java/org/softwareheritage/graph/compress/ExtractPersonsTest.java +java/src/test/java/org/softwareheritage/graph/rpc/CountEdgesTest.java +java/src/test/java/org/softwareheritage/graph/rpc/CountNodesTest.java +java/src/test/java/org/softwareheritage/graph/rpc/FindPathBetweenTest.java +java/src/test/java/org/softwareheritage/graph/rpc/FindPathToTest.java +java/src/test/java/org/softwareheritage/graph/rpc/GetNodeTest.java +java/src/test/java/org/softwareheritage/graph/rpc/StatsTest.java +java/src/test/java/org/softwareheritage/graph/rpc/TraversalServiceTest.java +java/src/test/java/org/softwareheritage/graph/rpc/TraverseLeavesTest.java +java/src/test/java/org/softwareheritage/graph/rpc/TraverseNeighborsTest.java +java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesPropertiesTest.java +java/src/test/java/org/softwareheritage/graph/rpc/TraverseNodesTest.java +java/src/test/java/org/softwareheritage/graph/utils/ForkJoinBigQuickSort2Test.java +java/src/test/java/org/softwareheritage/graph/utils/ForkJoinQuickSort3Test.java +java/target/swh-graph-1.0.0.jar +proto/swhgraph.proto reports/.gitignore reports/benchmarks/Makefile reports/benchmarks/benchmarks.tex reports/experiments/Makefile reports/experiments/experiments.tex reports/linux_log/LinuxLog.java reports/linux_log/Makefile reports/linux_log/linux_log.tex reports/node_mapping/Makefile reports/node_mapping/NodeIdMapHaloDB.java reports/node_mapping/NodeIdMapRocksDB.java reports/node_mapping/node_mapping.tex swh/__init__.py swh.graph.egg-info/PKG-INFO swh.graph.egg-info/SOURCES.txt swh.graph.egg-info/dependency_links.txt swh.graph.egg-info/entry_points.txt swh.graph.egg-info/requires.txt swh.graph.egg-info/top_level.txt swh/graph/__init__.py -swh/graph/backend.py swh/graph/cli.py swh/graph/client.py swh/graph/config.py -swh/graph/dot.py +swh/graph/http_client.py +swh/graph/http_naive_client.py +swh/graph/http_server.py swh/graph/naive_client.py swh/graph/py.typed -swh/graph/swhid.py +swh/graph/rpc_server.py swh/graph/webgraph.py -swh/graph/server/__init__.py -swh/graph/server/app.py +swh/graph/rpc/swhgraph.proto +swh/graph/rpc/swhgraph_pb2.py +swh/graph/rpc/swhgraph_pb2.pyi +swh/graph/rpc/swhgraph_pb2_grpc.py swh/graph/tests/__init__.py swh/graph/tests/conftest.py -swh/graph/tests/test_api_client.py swh/graph/tests/test_cli.py -swh/graph/tests/test_swhid.py -swh/graph/tests/dataset/.gitignore -swh/graph/tests/dataset/example.edges.csv -swh/graph/tests/dataset/example.edges.csv.zst -swh/graph/tests/dataset/example.nodes.csv -swh/graph/tests/dataset/example.nodes.csv.zst -swh/graph/tests/dataset/generate_graph.sh +swh/graph/tests/test_http_client.py +swh/graph/tests/dataset/generate_dataset.py +swh/graph/tests/dataset/compressed/example-labelled.labeloffsets +swh/graph/tests/dataset/compressed/example-labelled.labels +swh/graph/tests/dataset/compressed/example-labelled.properties +swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets +swh/graph/tests/dataset/compressed/example-transposed-labelled.labels +swh/graph/tests/dataset/compressed/example-transposed-labelled.properties +swh/graph/tests/dataset/compressed/example-transposed.graph 
+swh/graph/tests/dataset/compressed/example-transposed.obl +swh/graph/tests/dataset/compressed/example-transposed.offsets +swh/graph/tests/dataset/compressed/example-transposed.properties +swh/graph/tests/dataset/compressed/example.edges.count.txt +swh/graph/tests/dataset/compressed/example.edges.stats.txt +swh/graph/tests/dataset/compressed/example.graph +swh/graph/tests/dataset/compressed/example.indegree +swh/graph/tests/dataset/compressed/example.labels.count.txt +swh/graph/tests/dataset/compressed/example.labels.csv.zst +swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray +swh/graph/tests/dataset/compressed/example.labels.fcl.pointers +swh/graph/tests/dataset/compressed/example.labels.fcl.properties +swh/graph/tests/dataset/compressed/example.labels.mph +swh/graph/tests/dataset/compressed/example.mph +swh/graph/tests/dataset/compressed/example.node2swhid.bin +swh/graph/tests/dataset/compressed/example.node2type.map +swh/graph/tests/dataset/compressed/example.nodes.count.txt +swh/graph/tests/dataset/compressed/example.nodes.csv.zst +swh/graph/tests/dataset/compressed/example.nodes.stats.txt +swh/graph/tests/dataset/compressed/example.obl +swh/graph/tests/dataset/compressed/example.offsets +swh/graph/tests/dataset/compressed/example.order +swh/graph/tests/dataset/compressed/example.outdegree +swh/graph/tests/dataset/compressed/example.persons.count.txt +swh/graph/tests/dataset/compressed/example.persons.csv.zst +swh/graph/tests/dataset/compressed/example.persons.mph +swh/graph/tests/dataset/compressed/example.properties +swh/graph/tests/dataset/compressed/example.property.author_id.bin +swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin +swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin +swh/graph/tests/dataset/compressed/example.property.committer_id.bin +swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin +swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin +swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin +swh/graph/tests/dataset/compressed/example.property.content.length.bin +swh/graph/tests/dataset/compressed/example.property.message.bin +swh/graph/tests/dataset/compressed/example.property.message.offset.bin +swh/graph/tests/dataset/compressed/example.property.tag_name.bin +swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin +swh/graph/tests/dataset/compressed/example.stats +swh/graph/tests/dataset/edges/content/graph-all.edges.csv.zst +swh/graph/tests/dataset/edges/content/graph-all.nodes.csv.zst +swh/graph/tests/dataset/edges/directory/graph-all.edges.csv.zst +swh/graph/tests/dataset/edges/directory/graph-all.nodes.csv.zst +swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst +swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst +swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst +swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst +swh/graph/tests/dataset/edges/revision/graph-all.edges.csv.zst +swh/graph/tests/dataset/edges/revision/graph-all.nodes.csv.zst +swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst +swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst swh/graph/tests/dataset/img/.gitignore swh/graph/tests/dataset/img/Makefile swh/graph/tests/dataset/img/example.dot -swh/graph/tests/dataset/output/example-transposed.graph -swh/graph/tests/dataset/output/example-transposed.obl -swh/graph/tests/dataset/output/example-transposed.offsets 
-swh/graph/tests/dataset/output/example-transposed.properties -swh/graph/tests/dataset/output/example.graph -swh/graph/tests/dataset/output/example.indegree -swh/graph/tests/dataset/output/example.mph -swh/graph/tests/dataset/output/example.node2swhid.bin -swh/graph/tests/dataset/output/example.node2type.map -swh/graph/tests/dataset/output/example.obl -swh/graph/tests/dataset/output/example.offsets -swh/graph/tests/dataset/output/example.order -swh/graph/tests/dataset/output/example.outdegree -swh/graph/tests/dataset/output/example.properties -swh/graph/tests/dataset/output/example.stats +swh/graph/tests/dataset/orc/content/content-all.orc +swh/graph/tests/dataset/orc/directory/directory-all.orc +swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc +swh/graph/tests/dataset/orc/origin/origin-all.orc +swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc +swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc +swh/graph/tests/dataset/orc/release/release-all.orc +swh/graph/tests/dataset/orc/revision/revision-all.orc +swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc +swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc +swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc +swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc +swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc tools/dir2graph tools/swhid2int2int2swhid.sh tools/git2graph/.gitignore tools/git2graph/Makefile tools/git2graph/README.md tools/git2graph/git2graph.c tools/git2graph/tests/edge-filters.bats tools/git2graph/tests/full-graph.bats tools/git2graph/tests/node-filters.bats tools/git2graph/tests/repo_helper.bash tools/git2graph/tests/data/sample-repo.tgz tools/git2graph/tests/data/graphs/dir-nodes/edges.csv tools/git2graph/tests/data/graphs/dir-nodes/nodes.csv tools/git2graph/tests/data/graphs/from-dir-edges/edges.csv tools/git2graph/tests/data/graphs/from-dir-edges/nodes.csv tools/git2graph/tests/data/graphs/from-rel-edges/edges.csv tools/git2graph/tests/data/graphs/from-rel-edges/nodes.csv tools/git2graph/tests/data/graphs/fs-nodes/edges.csv tools/git2graph/tests/data/graphs/fs-nodes/nodes.csv tools/git2graph/tests/data/graphs/full/edges.csv tools/git2graph/tests/data/graphs/full/nodes.csv tools/git2graph/tests/data/graphs/rev-edges/edges.csv tools/git2graph/tests/data/graphs/rev-edges/nodes.csv tools/git2graph/tests/data/graphs/rev-nodes/edges.csv tools/git2graph/tests/data/graphs/rev-nodes/nodes.csv tools/git2graph/tests/data/graphs/to-rev-edges/edges.csv tools/git2graph/tests/data/graphs/to-rev-edges/nodes.csv \ No newline at end of file diff --git a/swh.graph.egg-info/requires.txt b/swh.graph.egg-info/requires.txt index aef777c..ad5da5d 100644 --- a/swh.graph.egg-info/requires.txt +++ b/swh.graph.egg-info/requires.txt @@ -1,12 +1,18 @@ aiohttp click py4j psutil +grpcio-tools +mypy-protobuf swh.core[http]>=0.3 swh.model>=0.13.0 +swh.dataset [testing] pytest +pytest-asyncio types-click types-pyyaml types-requests +types-protobuf +grpc-stubs diff --git a/swh/graph/backend.py b/swh/graph/backend.py deleted file mode 100644 index 5fb82f5..0000000 --- a/swh/graph/backend.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import asyncio -import contextlib -import io 
-import os -import re -import subprocess -import sys -import tempfile - -from py4j.java_gateway import JavaGateway -from py4j.protocol import Py4JJavaError - -from swh.graph.config import check_config - -BUF_LINES = 1024 - - -def _get_pipe_stderr(): - # Get stderr if possible, or pipe to stdout if running with Jupyter. - try: - sys.stderr.fileno() - except io.UnsupportedOperation: - return subprocess.STDOUT - else: - return sys.stderr - - -class Backend: - def __init__(self, graph_path, config=None): - self.gateway = None - self.entry = None - self.graph_path = graph_path - self.config = check_config(config or {}) - - def start_gateway(self): - self.gateway = JavaGateway.launch_gateway( - java_path=None, - javaopts=self.config["java_tool_options"].split(), - classpath=self.config["classpath"], - die_on_exit=True, - redirect_stdout=sys.stdout, - redirect_stderr=_get_pipe_stderr(), - ) - self.entry = self.gateway.jvm.org.softwareheritage.graph.Entry() - self.entry.load_graph(self.graph_path) - self.stream_proxy = JavaStreamProxy(self.entry) - - def stop_gateway(self): - self.gateway.shutdown() - - def __enter__(self): - self.start_gateway() - return self - - def __exit__(self, exc_type, exc_value, tb): - self.stop_gateway() - - def stats(self): - return self.entry.stats() - - def check_swhid(self, swhid): - try: - self.entry.check_swhid(swhid) - except Py4JJavaError as e: - m = re.search(r"malformed SWHID: (\w+)", str(e)) - if m: - raise ValueError(f"malformed SWHID: {m[1]}") - m = re.search(r"Unknown SWHID: (\w+)", str(e)) - if m: - raise NameError(f"Unknown SWHID: {m[1]}") - raise - - def count(self, ttype, *args): - method = getattr(self.entry, "count_" + ttype) - return method(*args) - - async def traversal(self, ttype, *args): - method = getattr(self.stream_proxy, ttype) - async for line in method(*args): - yield line.decode().rstrip("\n") - - -class JavaStreamProxy: - """A proxy class for the org.softwareheritage.graph.Entry Java class that - takes care of the setup and teardown of the named-pipe FIFO communication - between Python and Java. - - Initialize JavaStreamProxy using: - - proxy = JavaStreamProxy(swh_entry_class_instance) - - Then you can call an Entry method and iterate on the FIFO results like - this: - - async for value in proxy.java_method(arg1, arg2): - print(value) - """ - - def __init__(self, entry): - self.entry = entry - - async def read_node_ids(self, fname): - loop = asyncio.get_event_loop() - open_thread = loop.run_in_executor(None, open, fname, "rb") - - # Since the open() call on the FIFO is blocking until it is also opened - # on the Java side, we await it with a timeout in case there is an - # exception that prevents the write-side open(). 
- with (await asyncio.wait_for(open_thread, timeout=2)) as f: - - def read_n_lines(f, n): - buf = [] - for _ in range(n): - try: - buf.append(next(f)) - except StopIteration: - break - return buf - - while True: - lines = await loop.run_in_executor(None, read_n_lines, f, BUF_LINES) - if not lines: - break - for line in lines: - yield line - - class _HandlerWrapper: - def __init__(self, handler): - self._handler = handler - - def __getattr__(self, name): - func = getattr(self._handler, name) - - async def java_call(*args, **kwargs): - loop = asyncio.get_event_loop() - await loop.run_in_executor(None, lambda: func(*args, **kwargs)) - - def java_task(*args, **kwargs): - return asyncio.create_task(java_call(*args, **kwargs)) - - return java_task - - @contextlib.contextmanager - def get_handler(self): - with tempfile.TemporaryDirectory(prefix="swh-graph-") as tmpdirname: - cli_fifo = os.path.join(tmpdirname, "swh-graph.fifo") - os.mkfifo(cli_fifo) - reader = self.read_node_ids(cli_fifo) - query_handler = self.entry.get_handler(cli_fifo) - handler = self._HandlerWrapper(query_handler) - yield (handler, reader) - - def __getattr__(self, name): - async def java_call_iterator(*args, **kwargs): - with self.get_handler() as (handler, reader): - java_task = getattr(handler, name)(*args, **kwargs) - try: - async for value in reader: - yield value - except asyncio.TimeoutError: - # If the read-side open() timeouts, an exception on the - # Java side probably happened that prevented the - # write-side open(). We propagate this exception here if - # that is the case. - task_exc = java_task.exception() - if task_exc: - raise task_exc - raise - await java_task - - return java_call_iterator diff --git a/swh/graph/cli.py b/swh/graph/cli.py index 7d399ac..9eaf547 100644 --- a/swh/graph/cli.py +++ b/swh/graph/cli.py @@ -1,447 +1,200 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import logging from pathlib import Path -import sys from typing import TYPE_CHECKING, Any, Dict, Set, Tuple # WARNING: do not import unnecessary things here to keep cli startup time under # control import click from swh.core.cli import CONTEXT_SETTINGS, AliasedGroup from swh.core.cli import swh as swh_cli_group if TYPE_CHECKING: from swh.graph.webgraph import CompressionStep # noqa class StepOption(click.ParamType): """click type for specifying a compression step on the CLI parse either individual steps, specified as step names or integers, or step ranges """ name = "compression step" def convert(self, value, param, ctx): # type: (...) 
-> Set[CompressionStep] from swh.graph.webgraph import COMP_SEQ, CompressionStep # noqa steps: Set[CompressionStep] = set() specs = value.split(",") for spec in specs: if "-" in spec: # step range (raw_l, raw_r) = spec.split("-", maxsplit=1) if raw_l == "": # no left endpoint raw_l = COMP_SEQ[0].name if raw_r == "": # no right endpoint raw_r = COMP_SEQ[-1].name l_step = self.convert(raw_l, param, ctx) r_step = self.convert(raw_r, param, ctx) if len(l_step) != 1 or len(r_step) != 1: self.fail(f"invalid step specification: {value}, " f"see --help") l_idx = l_step.pop() r_idx = r_step.pop() steps = steps.union( set(CompressionStep(i) for i in range(l_idx.value, r_idx.value + 1)) ) else: # singleton step try: steps.add(CompressionStep(int(spec))) # integer step except ValueError: try: steps.add(CompressionStep[spec.upper()]) # step name except KeyError: self.fail( f"invalid step specification: {value}, " f"see --help" ) return steps class PathlibPath(click.Path): """A Click path argument that returns a pathlib Path, not a string""" def convert(self, value, param, ctx): return Path(super().convert(value, param, ctx)) DEFAULT_CONFIG: Dict[str, Tuple[str, Any]] = {"graph": ("dict", {})} @swh_cli_group.group(name="graph", context_settings=CONTEXT_SETTINGS, cls=AliasedGroup) @click.option( "--config-file", "-C", default=None, - type=click.Path(exists=True, dir_okay=False,), + type=click.Path( + exists=True, + dir_okay=False, + ), help="YAML configuration file", ) @click.pass_context def graph_cli_group(ctx, config_file): """Software Heritage graph tools.""" from swh.core import config ctx.ensure_object(dict) conf = config.read(config_file, DEFAULT_CONFIG) if "graph" not in conf: raise ValueError( 'no "graph" stanza found in configuration file %s' % config_file ) ctx.obj["config"] = conf -@graph_cli_group.command("api-client") -@click.option("--host", default="localhost", help="Graph server host") -@click.option("--port", default="5009", help="Graph server port") -@click.pass_context -def api_client(ctx, host, port): - """client for the graph RPC service""" - from swh.graph import client - - url = "http://{}:{}".format(host, port) - app = client.RemoteGraphClient(url) - - # TODO: run web app - print(app.stats()) - - -@graph_cli_group.group("map") -@click.pass_context -def map(ctx): - """Manage swh-graph on-disk maps""" - pass - - -def dump_swhid2node(filename): - from swh.graph.swhid import SwhidToNodeMap - - for (swhid, int) in SwhidToNodeMap(filename): - print("{}\t{}".format(swhid, int)) - - -def dump_node2swhid(filename): - from swh.graph.swhid import NodeToSwhidMap - - for (int, swhid) in NodeToSwhidMap(filename): - print("{}\t{}".format(int, swhid)) - - -def restore_swhid2node(filename): - """read a textual SWHID->int map from stdin and write its binary version to - filename - - """ - from swh.graph.swhid import SwhidToNodeMap - - with open(filename, "wb") as dst: - for line in sys.stdin: - (str_swhid, str_int) = line.split() - SwhidToNodeMap.write_record(dst, str_swhid, int(str_int)) - - -def restore_node2swhid(filename, length): - """read a textual int->SWHID map from stdin and write its binary version to - filename - - """ - from swh.graph.swhid import NodeToSwhidMap - - node2swhid = NodeToSwhidMap(filename, mode="wb", length=length) - for line in sys.stdin: - (str_int, str_swhid) = line.split() - node2swhid[int(str_int)] = str_swhid - node2swhid.close() - - -@map.command("dump") -@click.option( - "--type", - "-t", - "map_type", - required=True, - type=click.Choice(["swhid2node", 
"node2swhid"]), - help="type of map to dump", -) -@click.argument("filename", required=True, type=click.Path(exists=True)) -@click.pass_context -def dump_map(ctx, map_type, filename): - """Dump a binary SWHID<->node map to textual format.""" - if map_type == "swhid2node": - dump_swhid2node(filename) - elif map_type == "node2swhid": - dump_node2swhid(filename) - else: - raise ValueError("invalid map type: " + map_type) - pass - - -@map.command("restore") -@click.option( - "--type", - "-t", - "map_type", - required=True, - type=click.Choice(["swhid2node", "node2swhid"]), - help="type of map to dump", -) -@click.option( - "--length", - "-l", - type=int, - help="""map size in number of logical records - (required for node2swhid maps)""", -) -@click.argument("filename", required=True, type=click.Path()) -@click.pass_context -def restore_map(ctx, map_type, length, filename): - """Restore a binary SWHID<->node map from textual format.""" - if map_type == "swhid2node": - restore_swhid2node(filename) - elif map_type == "node2swhid": - if length is None: - raise click.UsageError( - "map length is required when restoring {} maps".format(map_type), ctx - ) - restore_node2swhid(filename, length) - else: - raise ValueError("invalid map type: " + map_type) - - -@map.command("write") -@click.option( - "--type", - "-t", - "map_type", - required=True, - type=click.Choice(["swhid2node", "node2swhid"]), - help="type of map to write", -) -@click.argument("filename", required=True, type=click.Path()) -@click.pass_context -def write(ctx, map_type, filename): - """Write a map to disk sequentially. - - read from stdin a textual SWHID->node mapping (for swhid2node, or a simple - sequence of SWHIDs for node2swhid) and write it to disk in the requested binary - map format - - note that no sorting is applied, so the input should already be sorted as - required by the chosen map type (by SWHID for swhid2node, by int for node2swhid) - - """ - from swh.graph.swhid import NodeToSwhidMap, SwhidToNodeMap - - with open(filename, "wb") as f: - if map_type == "swhid2node": - for line in sys.stdin: - (swhid, int_str) = line.rstrip().split(maxsplit=1) - SwhidToNodeMap.write_record(f, swhid, int(int_str)) - elif map_type == "node2swhid": - for line in sys.stdin: - swhid = line.rstrip() - NodeToSwhidMap.write_record(f, swhid) - else: - raise ValueError("invalid map type: " + map_type) - - -@map.command("lookup") -@click.option( - "--graph", "-g", required=True, metavar="GRAPH", help="compressed graph basename" -) -@click.argument("identifiers", nargs=-1) -def map_lookup(graph, identifiers): - """Lookup identifiers using on-disk maps. - - Depending on the identifier type lookup either a SWHID into a SWHID->node (and - return the node integer identifier) or, vice-versa, lookup a node integer - identifier into a node->SWHID (and return the SWHID). The desired behavior is - chosen depending on the syntax of each given identifier. - - Identifiers can be passed either directly on the command line or on - standard input, separate by blanks. Logical lines (as returned by - readline()) in stdin will be preserved in stdout. 
- - """ - from swh.graph.backend import NODE2SWHID_EXT, SWHID2NODE_EXT - from swh.graph.swhid import NodeToSwhidMap, SwhidToNodeMap - import swh.model.exceptions - from swh.model.swhids import ExtendedSWHID - - success = True # no identifiers failed to be looked up - swhid2node = SwhidToNodeMap(f"{graph}.{SWHID2NODE_EXT}") - node2swhid = NodeToSwhidMap(f"{graph}.{NODE2SWHID_EXT}") - - def lookup(identifier): - nonlocal success, swhid2node, node2swhid - is_swhid = None - try: - int(identifier) - is_swhid = False - except ValueError: - try: - ExtendedSWHID.from_string(identifier) - is_swhid = True - except swh.model.exceptions.ValidationError: - success = False - logging.error(f'invalid identifier: "{identifier}", skipping') - - try: - if is_swhid: - return str(swhid2node[identifier]) - else: - return node2swhid[int(identifier)] - except KeyError: - success = False - logging.error(f'identifier not found: "{identifier}", skipping') - - if identifiers: # lookup identifiers passed via CLI - for identifier in identifiers: - print(lookup(identifier)) - else: # lookup identifiers passed via stdin, preserving logical lines - for line in sys.stdin: - results = [lookup(id) for id in line.rstrip().split()] - if results: # might be empty if all IDs on the same line failed - print(" ".join(results)) - - sys.exit(0 if success else 1) - - @graph_cli_group.command(name="rpc-serve") @click.option( "--host", "-h", default="0.0.0.0", metavar="IP", show_default=True, help="host IP address to bind the server on", ) @click.option( "--port", "-p", default=5009, type=click.INT, metavar="PORT", show_default=True, help="port to bind the server on", ) @click.option( "--graph", "-g", required=True, metavar="GRAPH", help="compressed graph basename" ) @click.pass_context def serve(ctx, host, port, graph): """run the graph RPC service""" - import aiohttp + import aiohttp.web - from swh.graph.server.app import make_app + from swh.graph.http_server import make_app config = ctx.obj["config"] config.setdefault("graph", {}) config["graph"]["path"] = graph app = make_app(config=config) aiohttp.web.run_app(app, host=host, port=port) @graph_cli_group.command() @click.option( - "--graph", - "-g", + "--input-dataset", + "-i", required=True, - metavar="GRAPH", type=PathlibPath(), - help="input graph basename", + help="graph dataset directory, in ORC format", ) @click.option( - "--outdir", + "--output-directory", "-o", - "out_dir", required=True, - metavar="DIR", type=PathlibPath(), help="directory where to store compressed graph", ) +@click.option( + "--graph-name", + "-g", + default="graph", + metavar="NAME", + help="name of the output graph (default: 'graph')", +) @click.option( "--steps", "-s", metavar="STEPS", type=StepOption(), help="run only these compression steps (default: all steps)", ) @click.pass_context -def compress(ctx, graph, out_dir, steps): +def compress(ctx, input_dataset, output_directory, graph_name, steps): """Compress a graph using WebGraph - Input: a pair of files g.nodes.csv.gz, g.edges.csv.gz + Input: a directory containing a graph dataset in ORC format Output: a directory containing a WebGraph compressed graph - Compression steps are: (1) mph, (2) bv, (3) bfs, (4) permute_bfs, - (5) transpose_bfs, (6) simplify, (7) llp, (8) permute_llp, (9) obl, (10) - compose_orders, (11) stats, (12) transpose, (13) transpose_obl, (14) maps, - (15) clean_tmp. Compression steps can be selected by name or number using - --steps, separating them with commas; step ranges (e.g., 3-9, 6-, etc.) are - also supported. 
+ Compression steps are: (1) extract_nodes, (2) mph, (3) bv, (4) bfs, (5) + permute_bfs, (6) transpose_bfs, (7) simplify, (8) llp, (9) permute_llp, + (10) obl, (11) compose_orders, (12) stats, (13) transpose, (14) + transpose_obl, (15) maps, (16) extract_persons, (17) mph_persons, (18) + node_properties, (19) mph_labels, (20) fcl_labels, (21) edge_labels, (22) + edge_labels_obl, (23) edge_labels_transpose_obl, (24) clean_tmp. + Compression steps can be selected by name or number using --steps, + separating them with commas; step ranges (e.g., 3-9, 6-, etc.) are also + supported. """ from swh.graph import webgraph - graph_name = graph.name - in_dir = graph.parent try: conf = ctx.obj["config"]["graph"]["compress"] except KeyError: conf = {} # use defaults - webgraph.compress(graph_name, in_dir, out_dir, steps, conf) - - -@graph_cli_group.command(name="cachemount") -@click.option( - "--graph", "-g", required=True, metavar="GRAPH", help="compressed graph basename" -) -@click.option( - "--cache", - "-c", - default="/dev/shm/swh-graph/default", - metavar="CACHE", - type=PathlibPath(), - help="Memory cache path (defaults to /dev/shm/swh-graph/default)", -) -@click.pass_context -def cachemount(ctx, graph, cache): - """ - Cache the mmapped files of the compressed graph in a tmpfs. - - This command creates a new directory at the path given by CACHE that has - the same structure as the compressed graph basename, except it copies the - files that require mmap access (:file:`{*}.graph`) but uses symlinks from the source - for all the other files (:file:`{*}.map`, :file:`{*}.bin`, ...). - - The command outputs the path to the memory cache directory (particularly - useful when relying on the default value). - """ - import shutil - - cache.mkdir(parents=True) - for src in Path(graph).parent.glob("*"): - dst = cache / src.name - if src.suffix == ".graph": - shutil.copy2(src, dst) - else: - dst.symlink_to(src.resolve()) - print(cache) + webgraph.compress(graph_name, input_dataset, output_directory, steps, conf) def main(): return graph_cli_group(auto_envvar_prefix="SWH_GRAPH") if __name__ == "__main__": main() diff --git a/swh/graph/client.py b/swh/graph/client.py index aa66108..f1ea063 100644 --- a/swh/graph/client.py +++ b/swh/graph/client.py @@ -1,156 +1,15 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information +# Copyright (c) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information -import json -from swh.core.api import RPCClient +import warnings +from .http_client import * # noqa -class GraphAPIError(Exception): - """Graph API Error""" - - def __str__(self): - return """An unexpected error occurred - in the Graph backend: {}""".format( - self.args - ) - - -class GraphArgumentException(Exception): - def __init__(self, *args, response=None): - super().__init__(*args) - self.response = response - - -class RemoteGraphClient(RPCClient): - """Client to the Software Heritage Graph.""" - - def __init__(self, url, timeout=None): - super().__init__(api_exception=GraphAPIError, url=url, timeout=timeout) - - def raw_verb_lines(self, verb, endpoint, **kwargs): - response = self.raw_verb(verb, endpoint, stream=True, **kwargs) - 
self.raise_for_status(response) - for line in response.iter_lines(): - yield line.decode().lstrip("\n") - - def get_lines(self, endpoint, **kwargs): - yield from self.raw_verb_lines("get", endpoint, **kwargs) - - def raise_for_status(self, response) -> None: - if response.status_code // 100 == 4: - raise GraphArgumentException( - response.content.decode("ascii"), response=response - ) - super().raise_for_status(response) - - # Web API endpoints - - def stats(self): - return self.get("stats") - - def leaves( - self, src, edges="*", direction="forward", max_edges=0, return_types="*" - ): - return self.get_lines( - "leaves/{}".format(src), - params={ - "edges": edges, - "direction": direction, - "max_edges": max_edges, - "return_types": return_types, - }, - ) - - def neighbors( - self, src, edges="*", direction="forward", max_edges=0, return_types="*" - ): - return self.get_lines( - "neighbors/{}".format(src), - params={ - "edges": edges, - "direction": direction, - "max_edges": max_edges, - "return_types": return_types, - }, - ) - - def visit_nodes( - self, src, edges="*", direction="forward", max_edges=0, return_types="*" - ): - return self.get_lines( - "visit/nodes/{}".format(src), - params={ - "edges": edges, - "direction": direction, - "max_edges": max_edges, - "return_types": return_types, - }, - ) - - def visit_edges(self, src, edges="*", direction="forward", max_edges=0): - for edge in self.get_lines( - "visit/edges/{}".format(src), - params={"edges": edges, "direction": direction, "max_edges": max_edges}, - ): - yield tuple(edge.split()) - - def visit_paths(self, src, edges="*", direction="forward", max_edges=0): - def decode_path_wrapper(it): - for e in it: - yield json.loads(e) - - return decode_path_wrapper( - self.get_lines( - "visit/paths/{}".format(src), - params={"edges": edges, "direction": direction, "max_edges": max_edges}, - ) - ) - - def walk( - self, src, dst, edges="*", traversal="dfs", direction="forward", limit=None - ): - endpoint = "walk/{}/{}" - return self.get_lines( - endpoint.format(src, dst), - params={ - "edges": edges, - "traversal": traversal, - "direction": direction, - "limit": limit, - }, - ) - - def random_walk( - self, src, dst, edges="*", direction="forward", limit=None, return_types="*" - ): - endpoint = "randomwalk/{}/{}" - return self.get_lines( - endpoint.format(src, dst), - params={ - "edges": edges, - "direction": direction, - "limit": limit, - "return_types": return_types, - }, - ) - - def count_leaves(self, src, edges="*", direction="forward"): - return self.get( - "leaves/count/{}".format(src), - params={"edges": edges, "direction": direction}, - ) - - def count_neighbors(self, src, edges="*", direction="forward"): - return self.get( - "neighbors/count/{}".format(src), - params={"edges": edges, "direction": direction}, - ) - - def count_visit_nodes(self, src, edges="*", direction="forward"): - return self.get( - "visit/nodes/count/{}".format(src), - params={"edges": edges, "direction": direction}, - ) +warnings.warn( + "the swh.graph.client module is deprecated, use swh.graph.http_client instead", + DeprecationWarning, + stacklevel=2, +) diff --git a/swh/graph/config.py b/swh/graph/config.py index f144f26..12ac12c 100644 --- a/swh/graph/config.py +++ b/swh/graph/config.py @@ -1,115 +1,117 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import 
logging from pathlib import Path import sys import psutil def find_graph_jar(): """find swh-graph.jar, containing the Java part of swh-graph look both in development directories and installed data (for in-production deployments that fetched the JAR from pypi) """ swh_graph_root = Path(__file__).parents[2] try_paths = [ swh_graph_root / "java/target/", Path(sys.prefix) / "share/swh-graph/", Path(sys.prefix) / "local/share/swh-graph/", ] for path in try_paths: glob = list(path.glob("swh-graph-*.jar")) if glob: if len(glob) > 1: - logging.warn( + logging.warning( "found multiple swh-graph JARs, " "arbitrarily picking one" ) logging.info("using swh-graph JAR: {0}".format(glob[0])) return str(glob[0]) raise RuntimeError("swh-graph JAR not found. Have you run `make java`?") def check_config(conf): """check configuration and propagate defaults""" conf = conf.copy() if "batch_size" not in conf: # Use 0.1% of the RAM as a batch size: # ~1 billion for big servers, ~10 million for small desktop machines - conf["batch_size"] = int(psutil.virtual_memory().total / 1000) + conf["batch_size"] = min(int(psutil.virtual_memory().total / 1000), 2**30 - 1) if "llp_gammas" not in conf: conf["llp_gammas"] = "-0,-1,-2,-3,-4" if "max_ram" not in conf: - conf["max_ram"] = str(psutil.virtual_memory().total) + conf["max_ram"] = str(int(psutil.virtual_memory().total * 0.9)) if "java_tool_options" not in conf: conf["java_tool_options"] = " ".join( [ "-Xmx{max_ram}", "-XX:PretenureSizeThreshold=512M", "-XX:MaxNewSize=4G", "-XX:+UseLargePages", "-XX:+UseTransparentHugePages", "-XX:+UseNUMA", "-XX:+UseTLAB", "-XX:+ResizeTLAB", ] ) conf["java_tool_options"] = conf["java_tool_options"].format( max_ram=conf["max_ram"] ) if "java" not in conf: conf["java"] = "java" if "classpath" not in conf: conf["classpath"] = find_graph_jar() return conf def check_config_compress(config, graph_name, in_dir, out_dir): """check compression-specific configuration and initialize its execution environment.
""" conf = check_config(config) conf["graph_name"] = graph_name conf["in_dir"] = str(in_dir) conf["out_dir"] = str(out_dir) out_dir.mkdir(parents=True, exist_ok=True) if "tmp_dir" not in conf: tmp_dir = out_dir / "tmp" conf["tmp_dir"] = str(tmp_dir) else: tmp_dir = Path(conf["tmp_dir"]) tmp_dir.mkdir(parents=True, exist_ok=True) if "logback" not in conf: logback_confpath = tmp_dir / "logback.xml" with open(logback_confpath, "w") as conffile: conffile.write( """ <configuration> <appender name="STDERR" class="ch.qos.logback.core.ConsoleAppender"> <encoder> <pattern>%d %r %p [%t] %logger{1} - %m%n</pattern> </encoder> <target>System.err</target> </appender> <root level="debug"> <appender-ref ref="STDERR"/> </root> </configuration> """ ) conf["logback"] = str(logback_confpath) conf["java_tool_options"] += " -Dlogback.configurationFile={logback}" conf["java_tool_options"] += " -Djava.io.tmpdir={tmp_dir}" conf["java_tool_options"] = conf["java_tool_options"].format( - logback=conf["logback"], tmp_dir=conf["tmp_dir"], + logback=conf["logback"], + tmp_dir=conf["tmp_dir"], ) return conf
diff --git a/swh/graph/dot.py b/swh/graph/dot.py deleted file mode 100644 index 6a17150..0000000 --- a/swh/graph/dot.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (C) 2019 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -import collections -from functools import lru_cache -import subprocess - -KIND_TO_SHAPE = { - "ori": "egg", - "snp": "doubleoctagon", - "rel": "octagon", - "rev": "diamond", - "dir": "folder", - "cnt": "oval", -} - - -@lru_cache() -def dot_to_svg(dot): - try: - p = subprocess.run( - ["dot", "-Tsvg"], - input=dot, - universal_newlines=True, - capture_output=True, - check=True, - ) - except subprocess.CalledProcessError as e: - raise RuntimeError(e.stderr) from e - return p.stdout - - -def graph_dot(nodes): - ids = {n.id for n in nodes} - - by_kind = collections.defaultdict(list) - for n in nodes: - by_kind[n.kind].append(n) - - forward_edges = [ - (node.id, child.id) - for node in nodes - for child in node.children() - if child.id in ids - ] - backward_edges = [ - (parent.id, node.id) - for node in nodes - for parent in node.parents() - if parent.id in ids - ] - edges = set(forward_edges + backward_edges) - edges_fmt = "\n".join("{} -> {};".format(a, b) for a, b in edges) - nodes_fmt = "\n".join(node.dot_fragment() for node in nodes) - - s = """digraph G {{ - ranksep=1; - nodesep=0.5; - - {nodes} - {edges} - - }}""".format( - nodes=nodes_fmt, edges=edges_fmt - ) - return s
diff --git a/swh/graph/client.py b/swh/graph/http_client.py similarity index 100% copy from swh/graph/client.py copy to swh/graph/http_client.py diff --git a/swh/graph/naive_client.py b/swh/graph/http_naive_client.py similarity index 89% copy from swh/graph/naive_client.py copy to swh/graph/http_naive_client.py index 680df11..a94efe8 100644 --- a/swh/graph/naive_client.py +++ b/swh/graph/http_naive_client.py @@ -1,396 +1,395 @@ # Copyright (C) 2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import functools import inspect import re import statistics from typing import ( Callable, Dict, Iterable, Iterator, List, Optional, Set, Tuple, TypeVar, Union, ) from swh.model.swhids import CoreSWHID, ExtendedSWHID, ValidationError -from .client import GraphArgumentException +from .http_client import GraphArgumentException _NODE_TYPES = "ori|snp|rel|rev|dir|cnt" -NODES_RE =
re.compile(fr"(\*|{_NODE_TYPES})") -EDGES_RE = re.compile(fr"(\*|{_NODE_TYPES}):(\*|{_NODE_TYPES})") +NODES_RE = re.compile(rf"(\*|{_NODE_TYPES})") +EDGES_RE = re.compile(rf"(\*|{_NODE_TYPES}):(\*|{_NODE_TYPES})") T = TypeVar("T", bound=Callable) SWHIDlike = Union[CoreSWHID, ExtendedSWHID, str] def check_arguments(f: T) -> T: """Decorator for generic argument checking for methods of NaiveClient. Checks ``src`` is a valid and known SWHID, and ``edges`` has the right format.""" signature = inspect.signature(f) @functools.wraps(f) def newf(*args, **kwargs): __tracebackhide__ = True # for pytest try: bound_args = signature.bind(*args, **kwargs) except TypeError as e: # rethrow the exception from here so pytest doesn't flood the terminal # with signature.bind's call stack. raise TypeError(*e.args) from None self = bound_args.arguments["self"] src = bound_args.arguments.get("src") if src: self._check_swhid(src) edges = bound_args.arguments.get("edges") if edges: if edges != "*" and not EDGES_RE.match(edges): raise GraphArgumentException(f"invalid edge restriction: {edges}") return_types = bound_args.arguments.get("return_types") if return_types: if not NODES_RE.match(return_types): raise GraphArgumentException( f"invalid return_types restriction: {return_types}" ) return f(*args, **kwargs) return newf # type: ignore def filter_node_types(node_types: str, nodes: Iterable[str]) -> Iterator[str]: if node_types == "*": yield from nodes else: prefixes = tuple(f"swh:1:{type_}:" for type_ in node_types.split(",")) for node in nodes: if node.startswith(prefixes): yield node class NaiveClient: - """An alternative implementation of :class:`swh.graph.backend.Backend`, - written in pure-python and meant for simulating it in other components' test - cases; constructed from a list of nodes and (directed) edges, both - represented as SWHIDs. + """An alternative implementation of the graph server, written in + pure-python and meant for simulating it in other components' test cases; + constructed from a list of nodes and (directed) edges, both represented as + SWHIDs. It is NOT meant to be efficient in any way; only to be a very simple implementation that provides the same behavior. >>> nodes = [ ... "swh:1:rev:1111111111111111111111111111111111111111", ... "swh:1:rev:2222222222222222222222222222222222222222", ... "swh:1:rev:3333333333333333333333333333333333333333", ... ] >>> edges = [ ... ( ... "swh:1:rev:1111111111111111111111111111111111111111", ... "swh:1:rev:2222222222222222222222222222222222222222", ... ), ... ( ... "swh:1:rev:2222222222222222222222222222222222222222", ... "swh:1:rev:3333333333333333333333333333333333333333", ... ), ... 
] >>> c = NaiveClient(nodes=nodes, edges=edges) >>> list(c.leaves("swh:1:rev:1111111111111111111111111111111111111111")) ['swh:1:rev:3333333333333333333333333333333333333333'] """ def __init__( self, *, nodes: List[SWHIDlike], edges: List[Tuple[SWHIDlike, SWHIDlike]] ): self.graph = Graph(nodes, edges) def _check_swhid(self, swhid): try: ExtendedSWHID.from_string(swhid) except ValidationError as e: raise GraphArgumentException(*e.args) from None if swhid not in self.graph.nodes: raise GraphArgumentException(f"SWHID not found: {swhid}") def stats(self) -> Dict: return { - "counts": { - "nodes": len(self.graph.nodes), - "edges": sum(map(len, self.graph.forward_edges.values())), - }, - "ratios": { - "compression": 1.0, - "bits_per_edge": 100.0, - "bits_per_node": 100.0, - "avg_locality": 0.0, - }, - "indegree": { - "min": min(map(len, self.graph.backward_edges.values())), - "max": max(map(len, self.graph.backward_edges.values())), - "avg": statistics.mean(map(len, self.graph.backward_edges.values())), - }, - "outdegree": { - "min": min(map(len, self.graph.forward_edges.values())), - "max": max(map(len, self.graph.forward_edges.values())), - "avg": statistics.mean(map(len, self.graph.forward_edges.values())), - }, + "num_nodes": len(self.graph.nodes), + "num_edges": sum(map(len, self.graph.forward_edges.values())), + "compression_ratio": 1.0, + "bits_per_edge": 100.0, + "bits_per_node": 100.0, + "avg_locality": 0.0, + "indegree_min": min(map(len, self.graph.backward_edges.values())), + "indegree_max": max(map(len, self.graph.backward_edges.values())), + "indegree_avg": statistics.mean( + map(len, self.graph.backward_edges.values()) + ), + "outdegree_min": min(map(len, self.graph.forward_edges.values())), + "outdegree_max": max(map(len, self.graph.forward_edges.values())), + "outdegree_avg": statistics.mean( + map(len, self.graph.forward_edges.values()) + ), } @check_arguments def leaves( self, src: str, edges: str = "*", direction: str = "forward", max_edges: int = 0, return_types: str = "*", ) -> Iterator[str]: # TODO: max_edges yield from filter_node_types( return_types, [ node for node in self.graph.get_subgraph(src, edges, direction) if not self.graph.get_filtered_neighbors(node, edges, direction) ], ) @check_arguments def neighbors( self, src: str, edges: str = "*", direction: str = "forward", max_edges: int = 0, return_types: str = "*", ) -> Iterator[str]: # TODO: max_edges yield from filter_node_types( return_types, self.graph.get_filtered_neighbors(src, edges, direction) ) @check_arguments def visit_nodes( self, src: str, edges: str = "*", direction: str = "forward", max_edges: int = 0, return_types: str = "*", ) -> Iterator[str]: # TODO: max_edges yield from filter_node_types( return_types, self.graph.get_subgraph(src, edges, direction) ) @check_arguments def visit_edges( self, src: str, edges: str = "*", direction: str = "forward", max_edges: int = 0 ) -> Iterator[Tuple[str, str]]: if max_edges == 0: max_edges = None # type: ignore else: max_edges -= 1 yield from list(self.graph.iter_edges_dfs(direction, edges, src))[:max_edges] @check_arguments def visit_paths( self, src: str, edges: str = "*", direction: str = "forward", max_edges: int = 0 ) -> Iterator[List[str]]: # TODO: max_edges for path in self.graph.iter_paths_dfs(direction, edges, src): if path[-1] in self.leaves(src, edges, direction): yield list(path) @check_arguments def walk( self, src: str, dst: str, edges: str = "*", traversal: str = "dfs", direction: str = "forward", limit: Optional[int] = None, ) -> Iterator[str]: # 
TODO: implement algo="bfs" # TODO: limit match_path: Callable[[str], bool] if ":" in dst: match_path = dst.__eq__ self._check_swhid(dst) else: match_path = lambda node: node.startswith(f"swh:1:{dst}:") # noqa for path in self.graph.iter_paths_dfs(direction, edges, src): if match_path(path[-1]): if not limit: # 0 or None yield from path elif limit > 0: yield from path[0:limit] else: yield from path[limit:] @check_arguments def random_walk( self, src: str, dst: str, edges: str = "*", direction: str = "forward", limit: Optional[int] = None, ): # TODO: limit yield from self.walk(src, dst, edges, "dfs", direction, limit) @check_arguments def count_leaves( self, src: str, edges: str = "*", direction: str = "forward" ) -> int: return len(list(self.leaves(src, edges, direction))) @check_arguments def count_neighbors( self, src: str, edges: str = "*", direction: str = "forward" ) -> int: return len(self.graph.get_filtered_neighbors(src, edges, direction)) @check_arguments def count_visit_nodes( self, src: str, edges: str = "*", direction: str = "forward" ) -> int: return len(self.graph.get_subgraph(src, edges, direction)) class Graph: def __init__( self, nodes: List[SWHIDlike], edges: List[Tuple[SWHIDlike, SWHIDlike]] ): self.nodes = [str(node) for node in nodes] self.forward_edges: Dict[str, List[str]] = {} self.backward_edges: Dict[str, List[str]] = {} for node in nodes: self.forward_edges[str(node)] = [] self.backward_edges[str(node)] = [] for (src, dst) in edges: self.forward_edges[str(src)].append(str(dst)) self.backward_edges[str(dst)].append(str(src)) def get_filtered_neighbors( - self, src: str, edges_fmt: str, direction: str, + self, + src: str, + edges_fmt: str, + direction: str, ) -> Set[str]: if direction == "forward": edges = self.forward_edges elif direction == "backward": edges = self.backward_edges else: raise GraphArgumentException(f"invalid direction: {direction}") neighbors = edges.get(src, []) if edges_fmt == "*": return set(neighbors) else: filtered_neighbors: Set[str] = set() for edges_fmt_item in edges_fmt.split(","): (src_fmt, dst_fmt) = edges_fmt_item.split(":") if src_fmt != "*" and not src.startswith(f"swh:1:{src_fmt}:"): continue if dst_fmt == "*": filtered_neighbors.update(neighbors) else: prefix = f"swh:1:{dst_fmt}:" filtered_neighbors.update( n for n in neighbors if n.startswith(prefix) ) return filtered_neighbors def get_subgraph(self, src: str, edges_fmt: str, direction: str) -> Set[str]: seen = set() to_visit = {src} while to_visit: node = to_visit.pop() seen.add(node) neighbors = set(self.get_filtered_neighbors(node, edges_fmt, direction)) new_nodes = neighbors - seen to_visit.update(new_nodes) return seen def iter_paths_dfs( self, direction: str, edges_fmt: str, src: str ) -> Iterator[Tuple[str, ...]]: for (path, node) in DfsSubgraphIterator(self, direction, edges_fmt, src): yield path + (node,) def iter_edges_dfs( self, direction: str, edges_fmt: str, src: str ) -> Iterator[Tuple[str, str]]: for (path, node) in DfsSubgraphIterator(self, direction, edges_fmt, src): if len(path) > 0: yield (path[-1], node) class SubgraphIterator(Iterator[Tuple[Tuple[str, ...], str]]): def __init__(self, graph: Graph, direction: str, edges_fmt: str, src: str): self.graph = graph self.direction = direction self.edges_fmt = edges_fmt self.seen: Set[str] = set() self.src = src def more_work(self) -> bool: raise NotImplementedError() def pop(self) -> Tuple[Tuple[str, ...], str]: raise NotImplementedError() def push(self, new_path: Tuple[str, ...], neighbor: str) -> None: raise 
NotImplementedError() def __next__(self) -> Tuple[Tuple[str, ...], str]: # Stores (path, next_node) if not self.more_work(): raise StopIteration() (path, node) = self.pop() new_path = path + (node,) if node not in self.seen: neighbors = self.graph.get_filtered_neighbors( node, self.edges_fmt, self.direction ) # We want to visit the first neighbor first, and to_visit is a stack; # so we need to reversed() the list of neighbors to get it on top # of the stack. for neighbor in reversed(list(neighbors)): self.push(new_path, neighbor) self.seen.add(node) return (path, node) class DfsSubgraphIterator(SubgraphIterator): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.to_visit: List[Tuple[Tuple[str, ...], str]] = [((), self.src)] def more_work(self) -> bool: return bool(self.to_visit) def pop(self) -> Tuple[Tuple[str, ...], str]: return self.to_visit.pop() def push(self, new_path: Tuple[str, ...], neighbor: str) -> None: self.to_visit.append((new_path, neighbor)) diff --git a/swh/graph/server/app.py b/swh/graph/http_server.py similarity index 62% rename from swh/graph/server/app.py rename to swh/graph/http_server.py index 3a883e9..d06293b 100644 --- a/swh/graph/server/app.py +++ b/swh/graph/http_server.py @@ -1,373 +1,349 @@ # Copyright (C) 2019-2020 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """ A proxy HTTP server for swh-graph, talking to the Java code via py4j, and using FIFO as a transport to stream integers between the two languages. """ -import asyncio -from collections import deque +import json import os from typing import Optional +import aiohttp.test_utils import aiohttp.web +from google.protobuf import json_format +from google.protobuf.field_mask_pb2 import FieldMask +import grpc from swh.core.api.asynchronous import RPCServerApp from swh.core.config import read as config_read -from swh.graph.backend import Backend +from swh.graph.rpc.swhgraph_pb2 import ( + GetNodeRequest, + NodeFilter, + StatsRequest, + TraversalRequest, +) +from swh.graph.rpc.swhgraph_pb2_grpc import TraversalServiceStub +from swh.graph.rpc_server import spawn_java_rpc_server from swh.model.swhids import EXTENDED_SWHID_TYPES try: from contextlib import asynccontextmanager except ImportError: # Compatibility with 3.6 backport from async_generator import asynccontextmanager # type: ignore # maximum number of retries for random walks RANDOM_RETRIES = 10 # TODO make this configurable via rpc-serve configuration class GraphServerApp(RPCServerApp): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.on_startup.append(self._start_gateway) - self.on_shutdown.append(self._stop_gateway) + self.on_startup.append(self._start) + self.on_shutdown.append(self._stop) @staticmethod - async def _start_gateway(app): - # Equivalent to entering `with app["backend"]:` - app["backend"].start_gateway() + async def _start(app): + app["channel"] = grpc.aio.insecure_channel(app["rpc_url"]) + await app["channel"].__aenter__() + app["rpc_client"] = TraversalServiceStub(app["channel"]) + await app["rpc_client"].Stats(StatsRequest(), wait_for_ready=True) @staticmethod - async def _stop_gateway(app): - # Equivalent to exiting `with app["backend"]:` with no error - app["backend"].stop_gateway() + async def _stop(app): + await app["channel"].__aexit__(None, None, None) + if app.get("local_server"): + 
app["local_server"].terminate() async def index(request): return aiohttp.web.Response( content_type="text/html", body="""<html> <head><title>Software Heritage graph server</title></head> <body> <p>You have reached the Software Heritage graph API server.</p> <p>See its <a href="https://docs.softwareheritage.org/devel/swh-graph/">API documentation</a> for more information.</p> </body> </html>
""", ) class GraphView(aiohttp.web.View): """Base class for views working on the graph, with utility functions""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.backend = self.request.app["backend"] + self.rpc_client: TraversalServiceStub = self.request.app["rpc_client"] def get_direction(self): """Validate HTTP query parameter `direction`""" s = self.request.query.get("direction", "forward") if s not in ("forward", "backward"): raise aiohttp.web.HTTPBadRequest(text=f"invalid direction: {s}") - return s + return s.upper() def get_edges(self): """Validate HTTP query parameter `edges`, i.e., edge restrictions""" s = self.request.query.get("edges", "*") if any( [ node_type != "*" and node_type not in EXTENDED_SWHID_TYPES for edge in s.split(":") for node_type in edge.split(",", maxsplit=1) ] ): raise aiohttp.web.HTTPBadRequest(text=f"invalid edge restriction: {s}") return s def get_return_types(self): """Validate HTTP query parameter 'return types', i.e, a set of types which we will filter the query results with""" s = self.request.query.get("return_types", "*") if any( node_type != "*" and node_type not in EXTENDED_SWHID_TYPES for node_type in s.split(",") ): raise aiohttp.web.HTTPBadRequest( text=f"invalid type for filtering res: {s}" ) # if the user puts a star, # then we filter nothing, we don't need the other information if "*" in s: return "*" else: return s def get_traversal(self): """Validate HTTP query parameter `traversal`, i.e., visit order""" s = self.request.query.get("traversal", "dfs") if s not in ("bfs", "dfs"): raise aiohttp.web.HTTPBadRequest(text=f"invalid traversal order: {s}") return s def get_limit(self): """Validate HTTP query parameter `limit`, i.e., number of results""" s = self.request.query.get("limit", "0") try: return int(s) except ValueError: raise aiohttp.web.HTTPBadRequest(text=f"invalid limit value: {s}") def get_max_edges(self): """Validate HTTP query parameter 'max_edges', i.e., the limit of the number of edges that can be visited""" s = self.request.query.get("max_edges", "0") try: return int(s) except ValueError: raise aiohttp.web.HTTPBadRequest(text=f"invalid max_edges value: {s}") - def check_swhid(self, swhid): + async def check_swhid(self, swhid): """Validate that the given SWHID exists in the graph""" try: - self.backend.check_swhid(swhid) - except (NameError, ValueError) as e: - raise aiohttp.web.HTTPBadRequest(text=str(e)) + await self.rpc_client.GetNode( + GetNodeRequest(swhid=swhid, mask=FieldMask(paths=["swhid"])) + ) + except grpc.aio.AioRpcError as e: + if e.code() == grpc.StatusCode.INVALID_ARGUMENT: + raise aiohttp.web.HTTPBadRequest(text=str(e.details())) class StreamingGraphView(GraphView): """Base class for views streaming their response line by line.""" content_type = "text/plain" @asynccontextmanager async def response_streamer(self, *args, **kwargs): """Context manager to prepare then close a StreamResponse""" response = aiohttp.web.StreamResponse(*args, **kwargs) response.content_type = self.content_type await response.prepare(self.request) yield response await response.write_eof() async def get(self): await self.prepare_response() async with self.response_streamer() as self.response_stream: self._buf = [] try: await self.stream_response() finally: await self._flush_buffer() return self.response_stream async def prepare_response(self): """This can be overridden with some setup to be run before the response actually starts streaming. 
""" pass async def stream_response(self): """Override this to perform the response streaming. Implementations of this should await self.stream_line(line) to write each line. """ raise NotImplementedError async def stream_line(self, line): """Write a line in the response stream.""" self._buf.append(line) if len(self._buf) > 100: await self._flush_buffer() async def _flush_buffer(self): await self.response_stream.write("\n".join(self._buf).encode() + b"\n") self._buf = [] class StatsView(GraphView): """View showing some statistics on the graph""" async def get(self): - stats = self.backend.stats() - return aiohttp.web.Response(body=stats, content_type="application/json") + res = await self.rpc_client.Stats(StatsRequest()) + stats = json_format.MessageToDict( + res, including_default_value_fields=True, preserving_proto_field_name=True + ) + # Int64 fields are serialized as strings by default. + for descriptor in res.DESCRIPTOR.fields: + if descriptor.type == descriptor.TYPE_INT64: + try: + stats[descriptor.name] = int(stats[descriptor.name]) + except KeyError: + pass + json_body = json.dumps(stats, indent=4, sort_keys=True) + return aiohttp.web.Response(body=json_body, content_type="application/json") class SimpleTraversalView(StreamingGraphView): """Base class for views of simple traversals""" - simple_traversal_type: Optional[str] = None - async def prepare_response(self): - self.src = self.request.match_info["src"] - self.edges = self.get_edges() - self.direction = self.get_direction() - self.max_edges = self.get_max_edges() - self.return_types = self.get_return_types() - self.check_swhid(self.src) + src = self.request.match_info["src"] + self.traversal_request = TraversalRequest( + src=[src], + edges=self.get_edges(), + direction=self.get_direction(), + return_nodes=NodeFilter(types=self.get_return_types()), + mask=FieldMask(paths=["swhid"]), + ) + if self.get_max_edges(): + self.traversal_request.max_edges = self.get_max_edges() + await self.check_swhid(src) + self.configure_request() + + def configure_request(self): + pass async def stream_response(self): - async for res_line in self.backend.traversal( - self.simple_traversal_type, - self.direction, - self.edges, - self.src, - self.max_edges, - self.return_types, - ): - await self.stream_line(res_line) + async for node in self.rpc_client.Traverse(self.traversal_request): + await self.stream_line(node.swhid) class LeavesView(SimpleTraversalView): - simple_traversal_type = "leaves" + def configure_request(self): + self.traversal_request.return_nodes.max_traversal_successors = 0 class NeighborsView(SimpleTraversalView): - simple_traversal_type = "neighbors" + def configure_request(self): + self.traversal_request.min_depth = 1 + self.traversal_request.max_depth = 1 class VisitNodesView(SimpleTraversalView): - simple_traversal_type = "visit_nodes" + pass class VisitEdgesView(SimpleTraversalView): - simple_traversal_type = "visit_edges" - - -class WalkView(StreamingGraphView): - async def prepare_response(self): - self.src = self.request.match_info["src"] - self.dst = self.request.match_info["dst"] - - self.edges = self.get_edges() - self.direction = self.get_direction() - self.algo = self.get_traversal() - self.limit = self.get_limit() - self.max_edges = self.get_max_edges() - self.return_types = self.get_return_types() - - self.check_swhid(self.src) - if self.dst not in EXTENDED_SWHID_TYPES: - self.check_swhid(self.dst) - - async def get_walk_iterator(self): - return self.backend.traversal( - "walk", - self.direction, - self.edges, - 
self.algo, - self.src, - self.dst, - self.max_edges, - self.return_types, - ) + def configure_request(self): + self.traversal_request.mask.paths.extend(["successor", "successor.swhid"]) + # self.traversal_request.return_fields.successor = True async def stream_response(self): - it = self.get_walk_iterator() - if self.limit < 0: - queue = deque(maxlen=-self.limit) - async for res_swhid in it: - queue.append(res_swhid) - while queue: - await self.stream_line(queue.popleft()) - else: - count = 0 - async for res_swhid in it: - if self.limit == 0 or count < self.limit: - await self.stream_line(res_swhid) - count += 1 - else: - break - - -class RandomWalkView(WalkView): - def get_walk_iterator(self): - return self.backend.traversal( - "random_walk", - self.direction, - self.edges, - RANDOM_RETRIES, - self.src, - self.dst, - self.max_edges, - self.return_types, - ) + async for node in self.rpc_client.Traverse(self.traversal_request): + for succ in node.successor: + await self.stream_line(node.swhid + " " + succ.swhid) class CountView(GraphView): """Base class for counting views.""" count_type: Optional[str] = None async def get(self): - self.src = self.request.match_info["src"] - self.check_swhid(self.src) - - self.edges = self.get_edges() - self.direction = self.get_direction() - self.max_edges = self.get_max_edges() - - loop = asyncio.get_event_loop() - cnt = await loop.run_in_executor( - None, - self.backend.count, - self.count_type, - self.direction, - self.edges, - self.src, - self.max_edges, + src = self.request.match_info["src"] + self.traversal_request = TraversalRequest( + src=[src], + edges=self.get_edges(), + direction=self.get_direction(), + return_nodes=NodeFilter(types=self.get_return_types()), + mask=FieldMask(paths=["swhid"]), + ) + if self.get_max_edges(): + self.traversal_request.max_edges = self.get_max_edges() + self.configure_request() + res = await self.rpc_client.CountNodes(self.traversal_request) + return aiohttp.web.Response( + body=str(res.count), content_type="application/json" ) - return aiohttp.web.Response(body=str(cnt), content_type="application/json") + + def configure_request(self): + pass class CountNeighborsView(CountView): - count_type = "neighbors" + def configure_request(self): + self.traversal_request.min_depth = 1 + self.traversal_request.max_depth = 1 class CountLeavesView(CountView): - count_type = "leaves" + def configure_request(self): + self.traversal_request.return_nodes.max_traversal_successors = 0 class CountVisitNodesView(CountView): - count_type = "visit_nodes" + pass -def make_app(config=None, backend=None, **kwargs): - if (config is None) == (backend is None): - raise ValueError("make_app() expects exactly one of 'config' or 'backend'") - if backend is None: - backend = Backend(graph_path=config["graph"]["path"], config=config["graph"]) +def make_app(config=None, rpc_url=None, **kwargs): app = GraphServerApp(**kwargs) + + if rpc_url is None: + app["local_server"], port = spawn_java_rpc_server(config) + rpc_url = f"localhost:{port}" + app.add_routes( [ aiohttp.web.get("/", index), aiohttp.web.get("/graph", index), aiohttp.web.view("/graph/stats", StatsView), aiohttp.web.view("/graph/leaves/{src}", LeavesView), aiohttp.web.view("/graph/neighbors/{src}", NeighborsView), aiohttp.web.view("/graph/visit/nodes/{src}", VisitNodesView), aiohttp.web.view("/graph/visit/edges/{src}", VisitEdgesView), - # temporarily disabled in wait of a proper fix for T1969 - # aiohttp.web.view("/graph/walk/{src}/{dst}", WalkView) - 
aiohttp.web.view("/graph/randomwalk/{src}/{dst}", RandomWalkView), aiohttp.web.view("/graph/neighbors/count/{src}", CountNeighborsView), aiohttp.web.view("/graph/leaves/count/{src}", CountLeavesView), aiohttp.web.view("/graph/visit/nodes/count/{src}", CountVisitNodesView), ] ) - app["backend"] = backend + app["rpc_url"] = rpc_url return app def make_app_from_configfile(): - """Load configuration and then build application to run - - """ + """Load configuration and then build application to run""" config_file = os.environ.get("SWH_CONFIG_FILENAME") config = config_read(config_file) return make_app(config=config) diff --git a/swh/graph/naive_client.py b/swh/graph/naive_client.py index 680df11..5a070d3 100644 --- a/swh/graph/naive_client.py +++ b/swh/graph/naive_client.py @@ -1,396 +1,15 @@ -# Copyright (C) 2021 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information +# Copyright (c) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information -import functools -import inspect -import re -import statistics -from typing import ( - Callable, - Dict, - Iterable, - Iterator, - List, - Optional, - Set, - Tuple, - TypeVar, - Union, -) - -from swh.model.swhids import CoreSWHID, ExtendedSWHID, ValidationError - -from .client import GraphArgumentException - -_NODE_TYPES = "ori|snp|rel|rev|dir|cnt" -NODES_RE = re.compile(fr"(\*|{_NODE_TYPES})") -EDGES_RE = re.compile(fr"(\*|{_NODE_TYPES}):(\*|{_NODE_TYPES})") - - -T = TypeVar("T", bound=Callable) -SWHIDlike = Union[CoreSWHID, ExtendedSWHID, str] - - -def check_arguments(f: T) -> T: - """Decorator for generic argument checking for methods of NaiveClient. - Checks ``src`` is a valid and known SWHID, and ``edges`` has the right format.""" - signature = inspect.signature(f) - - @functools.wraps(f) - def newf(*args, **kwargs): - __tracebackhide__ = True # for pytest - try: - bound_args = signature.bind(*args, **kwargs) - except TypeError as e: - # rethrow the exception from here so pytest doesn't flood the terminal - # with signature.bind's call stack. - raise TypeError(*e.args) from None - self = bound_args.arguments["self"] - - src = bound_args.arguments.get("src") - if src: - self._check_swhid(src) - - edges = bound_args.arguments.get("edges") - if edges: - if edges != "*" and not EDGES_RE.match(edges): - raise GraphArgumentException(f"invalid edge restriction: {edges}") - - return_types = bound_args.arguments.get("return_types") - if return_types: - if not NODES_RE.match(return_types): - raise GraphArgumentException( - f"invalid return_types restriction: {return_types}" - ) - - return f(*args, **kwargs) - - return newf # type: ignore - - -def filter_node_types(node_types: str, nodes: Iterable[str]) -> Iterator[str]: - if node_types == "*": - yield from nodes - else: - prefixes = tuple(f"swh:1:{type_}:" for type_ in node_types.split(",")) - for node in nodes: - if node.startswith(prefixes): - yield node - - -class NaiveClient: - """An alternative implementation of :class:`swh.graph.backend.Backend`, - written in pure-python and meant for simulating it in other components' test - cases; constructed from a list of nodes and (directed) edges, both - represented as SWHIDs. 
- - It is NOT meant to be efficient in any way; only to be a very simple - implementation that provides the same behavior. - - >>> nodes = [ - ... "swh:1:rev:1111111111111111111111111111111111111111", - ... "swh:1:rev:2222222222222222222222222222222222222222", - ... "swh:1:rev:3333333333333333333333333333333333333333", - ... ] - >>> edges = [ - ... ( - ... "swh:1:rev:1111111111111111111111111111111111111111", - ... "swh:1:rev:2222222222222222222222222222222222222222", - ... ), - ... ( - ... "swh:1:rev:2222222222222222222222222222222222222222", - ... "swh:1:rev:3333333333333333333333333333333333333333", - ... ), - ... ] - >>> c = NaiveClient(nodes=nodes, edges=edges) - >>> list(c.leaves("swh:1:rev:1111111111111111111111111111111111111111")) - ['swh:1:rev:3333333333333333333333333333333333333333'] - """ - - def __init__( - self, *, nodes: List[SWHIDlike], edges: List[Tuple[SWHIDlike, SWHIDlike]] - ): - self.graph = Graph(nodes, edges) - - def _check_swhid(self, swhid): - try: - ExtendedSWHID.from_string(swhid) - except ValidationError as e: - raise GraphArgumentException(*e.args) from None - if swhid not in self.graph.nodes: - raise GraphArgumentException(f"SWHID not found: {swhid}") - - def stats(self) -> Dict: - return { - "counts": { - "nodes": len(self.graph.nodes), - "edges": sum(map(len, self.graph.forward_edges.values())), - }, - "ratios": { - "compression": 1.0, - "bits_per_edge": 100.0, - "bits_per_node": 100.0, - "avg_locality": 0.0, - }, - "indegree": { - "min": min(map(len, self.graph.backward_edges.values())), - "max": max(map(len, self.graph.backward_edges.values())), - "avg": statistics.mean(map(len, self.graph.backward_edges.values())), - }, - "outdegree": { - "min": min(map(len, self.graph.forward_edges.values())), - "max": max(map(len, self.graph.forward_edges.values())), - "avg": statistics.mean(map(len, self.graph.forward_edges.values())), - }, - } - - @check_arguments - def leaves( - self, - src: str, - edges: str = "*", - direction: str = "forward", - max_edges: int = 0, - return_types: str = "*", - ) -> Iterator[str]: - # TODO: max_edges - yield from filter_node_types( - return_types, - [ - node - for node in self.graph.get_subgraph(src, edges, direction) - if not self.graph.get_filtered_neighbors(node, edges, direction) - ], - ) - - @check_arguments - def neighbors( - self, - src: str, - edges: str = "*", - direction: str = "forward", - max_edges: int = 0, - return_types: str = "*", - ) -> Iterator[str]: - # TODO: max_edges - yield from filter_node_types( - return_types, self.graph.get_filtered_neighbors(src, edges, direction) - ) - - @check_arguments - def visit_nodes( - self, - src: str, - edges: str = "*", - direction: str = "forward", - max_edges: int = 0, - return_types: str = "*", - ) -> Iterator[str]: - # TODO: max_edges - yield from filter_node_types( - return_types, self.graph.get_subgraph(src, edges, direction) - ) - - @check_arguments - def visit_edges( - self, src: str, edges: str = "*", direction: str = "forward", max_edges: int = 0 - ) -> Iterator[Tuple[str, str]]: - if max_edges == 0: - max_edges = None # type: ignore - else: - max_edges -= 1 - yield from list(self.graph.iter_edges_dfs(direction, edges, src))[:max_edges] - - @check_arguments - def visit_paths( - self, src: str, edges: str = "*", direction: str = "forward", max_edges: int = 0 - ) -> Iterator[List[str]]: - # TODO: max_edges - for path in self.graph.iter_paths_dfs(direction, edges, src): - if path[-1] in self.leaves(src, edges, direction): - yield list(path) - @check_arguments - def walk( 
- self, - src: str, - dst: str, - edges: str = "*", - traversal: str = "dfs", - direction: str = "forward", - limit: Optional[int] = None, - ) -> Iterator[str]: - # TODO: implement algo="bfs" - # TODO: limit - match_path: Callable[[str], bool] - if ":" in dst: - match_path = dst.__eq__ - self._check_swhid(dst) - else: - match_path = lambda node: node.startswith(f"swh:1:{dst}:") # noqa - for path in self.graph.iter_paths_dfs(direction, edges, src): - if match_path(path[-1]): - if not limit: - # 0 or None - yield from path - elif limit > 0: - yield from path[0:limit] - else: - yield from path[limit:] +import warnings - @check_arguments - def random_walk( - self, - src: str, - dst: str, - edges: str = "*", - direction: str = "forward", - limit: Optional[int] = None, - ): - # TODO: limit - yield from self.walk(src, dst, edges, "dfs", direction, limit) +from .http_naive_client import * # noqa - @check_arguments - def count_leaves( - self, src: str, edges: str = "*", direction: str = "forward" - ) -> int: - return len(list(self.leaves(src, edges, direction))) - - @check_arguments - def count_neighbors( - self, src: str, edges: str = "*", direction: str = "forward" - ) -> int: - return len(self.graph.get_filtered_neighbors(src, edges, direction)) - - @check_arguments - def count_visit_nodes( - self, src: str, edges: str = "*", direction: str = "forward" - ) -> int: - return len(self.graph.get_subgraph(src, edges, direction)) - - -class Graph: - def __init__( - self, nodes: List[SWHIDlike], edges: List[Tuple[SWHIDlike, SWHIDlike]] - ): - self.nodes = [str(node) for node in nodes] - self.forward_edges: Dict[str, List[str]] = {} - self.backward_edges: Dict[str, List[str]] = {} - for node in nodes: - self.forward_edges[str(node)] = [] - self.backward_edges[str(node)] = [] - for (src, dst) in edges: - self.forward_edges[str(src)].append(str(dst)) - self.backward_edges[str(dst)].append(str(src)) - - def get_filtered_neighbors( - self, src: str, edges_fmt: str, direction: str, - ) -> Set[str]: - if direction == "forward": - edges = self.forward_edges - elif direction == "backward": - edges = self.backward_edges - else: - raise GraphArgumentException(f"invalid direction: {direction}") - - neighbors = edges.get(src, []) - - if edges_fmt == "*": - return set(neighbors) - else: - filtered_neighbors: Set[str] = set() - for edges_fmt_item in edges_fmt.split(","): - (src_fmt, dst_fmt) = edges_fmt_item.split(":") - if src_fmt != "*" and not src.startswith(f"swh:1:{src_fmt}:"): - continue - if dst_fmt == "*": - filtered_neighbors.update(neighbors) - else: - prefix = f"swh:1:{dst_fmt}:" - filtered_neighbors.update( - n for n in neighbors if n.startswith(prefix) - ) - return filtered_neighbors - - def get_subgraph(self, src: str, edges_fmt: str, direction: str) -> Set[str]: - seen = set() - to_visit = {src} - while to_visit: - node = to_visit.pop() - seen.add(node) - neighbors = set(self.get_filtered_neighbors(node, edges_fmt, direction)) - new_nodes = neighbors - seen - to_visit.update(new_nodes) - - return seen - - def iter_paths_dfs( - self, direction: str, edges_fmt: str, src: str - ) -> Iterator[Tuple[str, ...]]: - for (path, node) in DfsSubgraphIterator(self, direction, edges_fmt, src): - yield path + (node,) - - def iter_edges_dfs( - self, direction: str, edges_fmt: str, src: str - ) -> Iterator[Tuple[str, str]]: - for (path, node) in DfsSubgraphIterator(self, direction, edges_fmt, src): - if len(path) > 0: - yield (path[-1], node) - - -class SubgraphIterator(Iterator[Tuple[Tuple[str, ...], str]]): - def 
__init__(self, graph: Graph, direction: str, edges_fmt: str, src: str): - self.graph = graph - self.direction = direction - self.edges_fmt = edges_fmt - self.seen: Set[str] = set() - self.src = src - - def more_work(self) -> bool: - raise NotImplementedError() - - def pop(self) -> Tuple[Tuple[str, ...], str]: - raise NotImplementedError() - - def push(self, new_path: Tuple[str, ...], neighbor: str) -> None: - raise NotImplementedError() - - def __next__(self) -> Tuple[Tuple[str, ...], str]: - # Stores (path, next_node) - if not self.more_work(): - raise StopIteration() - - (path, node) = self.pop() - - new_path = path + (node,) - - if node not in self.seen: - neighbors = self.graph.get_filtered_neighbors( - node, self.edges_fmt, self.direction - ) - - # We want to visit the first neighbor first, and to_visit is a stack; - # so we need to reversed() the list of neighbors to get it on top - # of the stack. - for neighbor in reversed(list(neighbors)): - self.push(new_path, neighbor) - - self.seen.add(node) - return (path, node) - - -class DfsSubgraphIterator(SubgraphIterator): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.to_visit: List[Tuple[Tuple[str, ...], str]] = [((), self.src)] - - def more_work(self) -> bool: - return bool(self.to_visit) - - def pop(self) -> Tuple[Tuple[str, ...], str]: - return self.to_visit.pop() - - def push(self, new_path: Tuple[str, ...], neighbor: str) -> None: - self.to_visit.append((new_path, neighbor)) +warnings.warn( + "the swh.graph.naive_client module is deprecated, use swh.graph.http_naive_client instead", + DeprecationWarning, + stacklevel=2, +) diff --git a/swh/graph/rpc/swhgraph.proto b/swh/graph/rpc/swhgraph.proto new file mode 100644 index 0000000..7c40a6e --- /dev/null +++ b/swh/graph/rpc/swhgraph.proto @@ -0,0 +1,316 @@ +syntax = "proto3"; + +import "google/protobuf/field_mask.proto"; + +option java_multiple_files = true; +option java_package = "org.softwareheritage.graph.rpc"; +option java_outer_classname = "GraphService"; + +package swh.graph; + +/* Graph traversal service */ +service TraversalService { + /* GetNode returns a single Node and its properties. */ + rpc GetNode (GetNodeRequest) returns (Node); + + /* Traverse performs a breadth-first graph traversal from a set of source + * nodes, then streams the nodes it encounters (if they match a given + * return filter), along with their properties. + */ + rpc Traverse (TraversalRequest) returns (stream Node); + + /* FindPathTo searches for a shortest path between a set of source nodes + * and a node that matches a specific *criteria*. + * + * It does so by performing a breadth-first search from the source node, + * until any node that matches the given criteria is found, then follows + * back its parents to return a shortest path from the source set to that + * node. + */ + rpc FindPathTo (FindPathToRequest) returns (Path); + + /* FindPathBetween searches for a shortest path between a set of source + * nodes and a set of destination nodes. + * + * It does so by performing a *bidirectional breadth-first search*, i.e., + * two parallel breadth-first searches, one from the source set ("src-BFS") + * and one from the destination set ("dst-BFS"), until both searches find a + * common node that joins their visited sets. This node is called the + * "midpoint node". + * The path returned is the path src -> ... -> midpoint -> ... -> dst, + * which is always a shortest path between src and dst. + * + * The graph direction of both BFS can be configured separately. 
By + * default, the dst-BFS will use the graph in the opposite direction than + * the src-BFS (if direction = FORWARD, by default direction_reverse = + * BACKWARD, and vice-versa). The default behavior is thus to search for + * a shortest path between two nodes in a given direction. However, one + * can also specify FORWARD or BACKWARD for *both* the src-BFS and the + * dst-BFS. This will search for a common descendant or a common ancestor + * between the two sets, respectively. These will be the midpoints of the + * returned path. + */ + rpc FindPathBetween (FindPathBetweenRequest) returns (Path); + + /* CountNodes does the same as Traverse, but only returns the number of + * nodes accessed during the traversal. */ + rpc CountNodes (TraversalRequest) returns (CountResponse); + + /* CountEdges does the same as Traverse, but only returns the number of + * edges accessed during the traversal. */ + rpc CountEdges (TraversalRequest) returns (CountResponse); + + /* Stats returns various statistics on the overall graph. */ + rpc Stats (StatsRequest) returns (StatsResponse); +} + +/* Direction of the graph */ +enum GraphDirection { + /* Forward DAG: ori -> snp -> rel -> rev -> dir -> cnt */ + FORWARD = 0; + /* Transposed DAG: cnt -> dir -> rev -> rel -> snp -> ori */ + BACKWARD = 1; +} + +/* Describe a node to return */ +message GetNodeRequest { + /* SWHID of the node to return */ + string swhid = 1; + /* FieldMask of which fields are to be returned (e.g., "swhid,cnt.length"). + * By default, all fields are returned. */ + optional google.protobuf.FieldMask mask = 8; +} + +/* TraversalRequest describes how a breadth-first traversal should be + * performed, and what should be returned to the client. */ +message TraversalRequest { + /* Set of source nodes (SWHIDs) */ + repeated string src = 1; + /* Direction of the graph to traverse. Defaults to FORWARD. */ + GraphDirection direction = 2; + /* Edge restriction string (e.g. "rev:dir,dir:cnt"). + * Defaults to "*" (all). */ + optional string edges = 3; + /* Maximum number of edges accessed in the traversal, after which it stops. + * Defaults to infinite. */ + optional int64 max_edges = 4; + /* Do not return nodes with a depth lower than this number. + * By default, all depths are returned. */ + optional int64 min_depth = 5; + /* Maximum depth of the traversal, after which it stops. + * Defaults to infinite. */ + optional int64 max_depth = 6; + /* Filter which nodes will be sent to the stream. By default, all nodes are + * returned. */ + optional NodeFilter return_nodes = 7; + /* FieldMask of which fields are to be returned (e.g., "swhid,cnt.length"). + * By default, all fields are returned. */ + optional google.protobuf.FieldMask mask = 8; +} + +/* FindPathToRequest describes a request to find a shortest path between a + * set of nodes and a given target criteria, as well as what should be returned + * in the path. + */ +message FindPathToRequest { + /* Set of source nodes (SWHIDs) */ + repeated string src = 1; + /* Target criteria, i.e., what constitutes a valid path destination. */ + NodeFilter target = 2; + /* Direction of the graph to traverse. Defaults to FORWARD. */ + GraphDirection direction = 3; + /* Edge restriction string (e.g. "rev:dir,dir:cnt"). + * Defaults to "*" (all). */ + optional string edges = 4; + /* Maximum number of edges accessed in the traversal, after which it stops. + * Defaults to infinite. */ + optional int64 max_edges = 5; + /* Maximum depth of the traversal, after which it stops. + * Defaults to infinite. 
*/
+  optional int64 max_depth = 6;
+  /* FieldMask of which fields are to be returned (e.g., "swhid,cnt.length").
+   * By default, all fields are returned. */
+  optional google.protobuf.FieldMask mask = 7;
+}
+
+/* FindPathBetweenRequest describes a request to find a shortest path between
+ * a set of source nodes and a set of destination nodes. It works by
+ * performing a bidirectional breadth-first traversal from both sets at the
+ * same time.
+ */
+message FindPathBetweenRequest {
+  /* Set of source nodes (SWHIDs) */
+  repeated string src = 1;
+  /* Set of destination nodes (SWHIDs) */
+  repeated string dst = 2;
+  /* Direction of the graph to traverse from the source set. Defaults to
+   * FORWARD. */
+  GraphDirection direction = 3;
+  /* Direction of the graph to traverse from the destination set. Defaults to
+   * the opposite of `direction`. If direction and direction_reverse are
+   * identical, it will find the first common successor of both sets in the
+   * given direction. */
+  optional GraphDirection direction_reverse = 4;
+  /* Edge restriction string for the traversal from the source set
+   * (e.g. "rev:dir,dir:cnt"). Defaults to "*" (all). */
+  optional string edges = 5;
+  /* Edge restriction string for the reverse traversal from the destination
+   * set.
+   * If not specified:
+   *   - If `edges` is not specified either, defaults to "*"
+   *   - If direction == direction_reverse, defaults to `edges`
+   *   - If direction != direction_reverse, defaults to the reverse of `edges`
+   *     (e.g. "rev:dir" becomes "dir:rev").
+   */
+  optional string edges_reverse = 6;
+  /* Maximum number of edges accessed in the traversal, after which it stops.
+   * Defaults to infinite. */
+  optional int64 max_edges = 7;
+  /* Maximum depth of the traversal, after which it stops.
+   * Defaults to infinite. */
+  optional int64 max_depth = 8;
+  /* FieldMask of which fields are to be returned (e.g., "swhid,cnt.length").
+   * By default, all fields are returned. */
+  optional google.protobuf.FieldMask mask = 9;
+}
+
+/* Represents various criteria that make a given node "valid". A node is
+ * only valid if all the subcriteria present in this message are fulfilled.
+ */
+message NodeFilter {
+  /* Node restriction string (e.g. "dir,cnt,rev"). Defaults to "*" (all). */
+  optional string types = 1;
+  /* Minimum number of successors encountered *during the traversal*.
+   * Default: no constraint */
+  optional int64 min_traversal_successors = 2;
+  /* Maximum number of successors encountered *during the traversal*.
+   * Default: no constraint */
+  optional int64 max_traversal_successors = 3;
+}
+
+/* Represents a node in the graph. */
+message Node {
+  /* The SWHID of the graph node. */
+  string swhid = 1;
+  /* List of relevant successors of this node. */
+  repeated Successor successor = 2;
+  /* Number of relevant successors. */
+  optional int64 num_successors = 9;
+  /* Node properties */
+  oneof data {
+    ContentData cnt = 3;
+    RevisionData rev = 5;
+    ReleaseData rel = 6;
+    OriginData ori = 8;
+  };
+}
+
+/* Represents a path in the graph. */
+message Path {
+  /* List of nodes in the path, from source to destination */
+  repeated Node node = 1;
+  /* Index of the "midpoint" of the path. For paths obtained with
+   * bidirectional search queries, this is the node that joined the two
+   * sets together. When looking for a common ancestor between two nodes by
+   * performing a FindPathBetween search with two backward graphs, this will
+   * be the index of the common ancestor in the path.
*/ + optional int32 midpoint_index = 2; +} + +/* Represents a successor of a given node. */ +message Successor { + /* The SWHID of the successor */ + optional string swhid = 1; + /* A list of edge labels for the given edge */ + repeated EdgeLabel label = 2; +} + +/* Content node properties */ +message ContentData { + /* Length of the blob, in bytes */ + optional int64 length = 1; + /* Whether the content was skipped during ingestion. */ + optional bool is_skipped = 2; +} + +/* Revision node properties */ +message RevisionData { + /* Revision author ID (anonymized) */ + optional int64 author = 1; + /* UNIX timestamp of the revision date (UTC) */ + optional int64 author_date = 2; + /* Timezone of the revision author date as an offset from UTC */ + optional int32 author_date_offset = 3; + /* Revision committer ID (anonymized) */ + optional int64 committer = 4; + /* UNIX timestamp of the revision committer date (UTC) */ + optional int64 committer_date = 5; + /* Timezone of the revision committer date as an offset from UTC */ + optional int32 committer_date_offset = 6; + /* Revision message */ + optional bytes message = 7; +} + +/* Release node properties */ +message ReleaseData { + /* Release author ID (anonymized) */ + optional int64 author = 1; + /* UNIX timestamp of the release date (UTC) */ + optional int64 author_date = 2; + /* Timezone of the release author date as an offset from UTC */ + optional int32 author_date_offset = 3; + /* Release name */ + optional bytes name = 4; + /* Release message */ + optional bytes message = 5; +} + +/* Origin node properties */ +message OriginData { + /* URL of the origin */ + optional string url = 1; +} + +message EdgeLabel { + /* Directory entry name for directories, branch name for snapshots */ + bytes name = 1; + /* Entry permission (only set for directories). */ + int32 permission = 2; +} + +message CountResponse { + int64 count = 1; +} + +message StatsRequest { +} + +message StatsResponse { + /* Number of nodes in the graph */ + int64 num_nodes = 1; + /* Number of edges in the graph */ + int64 num_edges = 2; + + /* Ratio between the graph size and the information-theoretical lower + * bound */ + double compression_ratio = 3; + /* Number of bits per node (overall graph size in bits divided by the + * number of nodes) */ + double bits_per_node = 4; + /* Number of bits per edge (overall graph size in bits divided by the + * number of arcs). */ + double bits_per_edge = 5; + double avg_locality = 6; + + /* Smallest indegree */ + int64 indegree_min = 7; + /* Largest indegree */ + int64 indegree_max = 8; + /* Average indegree */ + double indegree_avg = 9; + /* Smallest outdegree */ + int64 outdegree_min = 10; + /* Largest outdegree */ + int64 outdegree_max = 11; + /* Average outdegree */ + double outdegree_avg = 12; +} diff --git a/swh/graph/rpc/swhgraph_pb2.py b/swh/graph/rpc/swhgraph_pb2.py new file mode 100644 index 0000000..b48ad97 --- /dev/null +++ b/swh/graph/rpc/swhgraph_pb2.py @@ -0,0 +1,196 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
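As a bridge between the `swhgraph.proto` definition above and the generated module below, here is an editorial sketch (not part of the diff) of how the request messages are built from Python. The SWHID is a placeholder; everything else follows from the proto definition, and enum fields such as `direction` accept the value name in message constructors:

```python
# Illustrative sketch only; the SWHID below is a placeholder.
from google.protobuf.field_mask_pb2 import FieldMask

from swh.graph.rpc.swhgraph_pb2 import NodeFilter, TraversalRequest

# Roughly what LeavesView builds for /graph/leaves/<src>?return_types=cnt:
# forward traversal, returning only nodes without successors, SWHIDs only.
request = TraversalRequest(
    src=["swh:1:rev:" + "00" * 20],
    direction="FORWARD",  # enum names are accepted in message constructors
    return_nodes=NodeFilter(types="cnt", max_traversal_successors=0),
    mask=FieldMask(paths=["swhid"]),  # only materialize the swhid field
)
```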
+# source: swh/graph/rpc/swhgraph.proto +"""Generated protocol buffer code.""" +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from google.protobuf import field_mask_pb2 as google_dot_protobuf_dot_field__mask__pb2 + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x1cswh/graph/rpc/swhgraph.proto\x12\tswh.graph\x1a google/protobuf/field_mask.proto\"W\n\x0eGetNodeRequest\x12\r\n\x05swhid\x18\x01 \x01(\t\x12-\n\x04mask\x18\x08 \x01(\x0b\x32\x1a.google.protobuf.FieldMaskH\x00\x88\x01\x01\x42\x07\n\x05_mask\"\xd8\x02\n\x10TraversalRequest\x12\x0b\n\x03src\x18\x01 \x03(\t\x12,\n\tdirection\x18\x02 \x01(\x0e\x32\x19.swh.graph.GraphDirection\x12\x12\n\x05\x65\x64ges\x18\x03 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tmax_edges\x18\x04 \x01(\x03H\x01\x88\x01\x01\x12\x16\n\tmin_depth\x18\x05 \x01(\x03H\x02\x88\x01\x01\x12\x16\n\tmax_depth\x18\x06 \x01(\x03H\x03\x88\x01\x01\x12\x30\n\x0creturn_nodes\x18\x07 \x01(\x0b\x32\x15.swh.graph.NodeFilterH\x04\x88\x01\x01\x12-\n\x04mask\x18\x08 \x01(\x0b\x32\x1a.google.protobuf.FieldMaskH\x05\x88\x01\x01\x42\x08\n\x06_edgesB\x0c\n\n_max_edgesB\x0c\n\n_min_depthB\x0c\n\n_max_depthB\x0f\n\r_return_nodesB\x07\n\x05_mask\"\x97\x02\n\x11\x46indPathToRequest\x12\x0b\n\x03src\x18\x01 \x03(\t\x12%\n\x06target\x18\x02 \x01(\x0b\x32\x15.swh.graph.NodeFilter\x12,\n\tdirection\x18\x03 \x01(\x0e\x32\x19.swh.graph.GraphDirection\x12\x12\n\x05\x65\x64ges\x18\x04 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tmax_edges\x18\x05 \x01(\x03H\x01\x88\x01\x01\x12\x16\n\tmax_depth\x18\x06 \x01(\x03H\x02\x88\x01\x01\x12-\n\x04mask\x18\x07 \x01(\x0b\x32\x1a.google.protobuf.FieldMaskH\x03\x88\x01\x01\x42\x08\n\x06_edgesB\x0c\n\n_max_edgesB\x0c\n\n_max_depthB\x07\n\x05_mask\"\x81\x03\n\x16\x46indPathBetweenRequest\x12\x0b\n\x03src\x18\x01 \x03(\t\x12\x0b\n\x03\x64st\x18\x02 \x03(\t\x12,\n\tdirection\x18\x03 \x01(\x0e\x32\x19.swh.graph.GraphDirection\x12\x39\n\x11\x64irection_reverse\x18\x04 \x01(\x0e\x32\x19.swh.graph.GraphDirectionH\x00\x88\x01\x01\x12\x12\n\x05\x65\x64ges\x18\x05 \x01(\tH\x01\x88\x01\x01\x12\x1a\n\redges_reverse\x18\x06 \x01(\tH\x02\x88\x01\x01\x12\x16\n\tmax_edges\x18\x07 \x01(\x03H\x03\x88\x01\x01\x12\x16\n\tmax_depth\x18\x08 \x01(\x03H\x04\x88\x01\x01\x12-\n\x04mask\x18\t \x01(\x0b\x32\x1a.google.protobuf.FieldMaskH\x05\x88\x01\x01\x42\x14\n\x12_direction_reverseB\x08\n\x06_edgesB\x10\n\x0e_edges_reverseB\x0c\n\n_max_edgesB\x0c\n\n_max_depthB\x07\n\x05_mask\"\xb2\x01\n\nNodeFilter\x12\x12\n\x05types\x18\x01 \x01(\tH\x00\x88\x01\x01\x12%\n\x18min_traversal_successors\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12%\n\x18max_traversal_successors\x18\x03 \x01(\x03H\x02\x88\x01\x01\x42\x08\n\x06_typesB\x1b\n\x19_min_traversal_successorsB\x1b\n\x19_max_traversal_successors\"\x92\x02\n\x04Node\x12\r\n\x05swhid\x18\x01 \x01(\t\x12\'\n\tsuccessor\x18\x02 \x03(\x0b\x32\x14.swh.graph.Successor\x12\x1b\n\x0enum_successors\x18\t \x01(\x03H\x01\x88\x01\x01\x12%\n\x03\x63nt\x18\x03 \x01(\x0b\x32\x16.swh.graph.ContentDataH\x00\x12&\n\x03rev\x18\x05 \x01(\x0b\x32\x17.swh.graph.RevisionDataH\x00\x12%\n\x03rel\x18\x06 \x01(\x0b\x32\x16.swh.graph.ReleaseDataH\x00\x12$\n\x03ori\x18\x08 
\x01(\x0b\x32\x15.swh.graph.OriginDataH\x00\x42\x06\n\x04\x64\x61taB\x11\n\x0f_num_successors\"U\n\x04Path\x12\x1d\n\x04node\x18\x01 \x03(\x0b\x32\x0f.swh.graph.Node\x12\x1b\n\x0emidpoint_index\x18\x02 \x01(\x05H\x00\x88\x01\x01\x42\x11\n\x0f_midpoint_index\"N\n\tSuccessor\x12\x12\n\x05swhid\x18\x01 \x01(\tH\x00\x88\x01\x01\x12#\n\x05label\x18\x02 \x03(\x0b\x32\x14.swh.graph.EdgeLabelB\x08\n\x06_swhid\"U\n\x0b\x43ontentData\x12\x13\n\x06length\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x17\n\nis_skipped\x18\x02 \x01(\x08H\x01\x88\x01\x01\x42\t\n\x07_lengthB\r\n\x0b_is_skipped\"\xc6\x02\n\x0cRevisionData\x12\x13\n\x06\x61uthor\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x0b\x61uthor_date\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x1f\n\x12\x61uthor_date_offset\x18\x03 \x01(\x05H\x02\x88\x01\x01\x12\x16\n\tcommitter\x18\x04 \x01(\x03H\x03\x88\x01\x01\x12\x1b\n\x0e\x63ommitter_date\x18\x05 \x01(\x03H\x04\x88\x01\x01\x12\"\n\x15\x63ommitter_date_offset\x18\x06 \x01(\x05H\x05\x88\x01\x01\x12\x14\n\x07message\x18\x07 \x01(\x0cH\x06\x88\x01\x01\x42\t\n\x07_authorB\x0e\n\x0c_author_dateB\x15\n\x13_author_date_offsetB\x0c\n\n_committerB\x11\n\x0f_committer_dateB\x18\n\x16_committer_date_offsetB\n\n\x08_message\"\xcd\x01\n\x0bReleaseData\x12\x13\n\x06\x61uthor\x18\x01 \x01(\x03H\x00\x88\x01\x01\x12\x18\n\x0b\x61uthor_date\x18\x02 \x01(\x03H\x01\x88\x01\x01\x12\x1f\n\x12\x61uthor_date_offset\x18\x03 \x01(\x05H\x02\x88\x01\x01\x12\x11\n\x04name\x18\x04 \x01(\x0cH\x03\x88\x01\x01\x12\x14\n\x07message\x18\x05 \x01(\x0cH\x04\x88\x01\x01\x42\t\n\x07_authorB\x0e\n\x0c_author_dateB\x15\n\x13_author_date_offsetB\x07\n\x05_nameB\n\n\x08_message\"&\n\nOriginData\x12\x10\n\x03url\x18\x01 \x01(\tH\x00\x88\x01\x01\x42\x06\n\x04_url\"-\n\tEdgeLabel\x12\x0c\n\x04name\x18\x01 \x01(\x0c\x12\x12\n\npermission\x18\x02 \x01(\x05\"\x1e\n\rCountResponse\x12\r\n\x05\x63ount\x18\x01 \x01(\x03\"\x0e\n\x0cStatsRequest\"\x9b\x02\n\rStatsResponse\x12\x11\n\tnum_nodes\x18\x01 \x01(\x03\x12\x11\n\tnum_edges\x18\x02 \x01(\x03\x12\x19\n\x11\x63ompression_ratio\x18\x03 \x01(\x01\x12\x15\n\rbits_per_node\x18\x04 \x01(\x01\x12\x15\n\rbits_per_edge\x18\x05 \x01(\x01\x12\x14\n\x0c\x61vg_locality\x18\x06 \x01(\x01\x12\x14\n\x0cindegree_min\x18\x07 \x01(\x03\x12\x14\n\x0cindegree_max\x18\x08 \x01(\x03\x12\x14\n\x0cindegree_avg\x18\t \x01(\x01\x12\x15\n\routdegree_min\x18\n \x01(\x03\x12\x15\n\routdegree_max\x18\x0b \x01(\x03\x12\x15\n\routdegree_avg\x18\x0c \x01(\x01*+\n\x0eGraphDirection\x12\x0b\n\x07\x46ORWARD\x10\x00\x12\x0c\n\x08\x42\x41\x43KWARD\x10\x01\x32\xcf\x03\n\x10TraversalService\x12\x35\n\x07GetNode\x12\x19.swh.graph.GetNodeRequest\x1a\x0f.swh.graph.Node\x12:\n\x08Traverse\x12\x1b.swh.graph.TraversalRequest\x1a\x0f.swh.graph.Node0\x01\x12;\n\nFindPathTo\x12\x1c.swh.graph.FindPathToRequest\x1a\x0f.swh.graph.Path\x12\x45\n\x0f\x46indPathBetween\x12!.swh.graph.FindPathBetweenRequest\x1a\x0f.swh.graph.Path\x12\x43\n\nCountNodes\x12\x1b.swh.graph.TraversalRequest\x1a\x18.swh.graph.CountResponse\x12\x43\n\nCountEdges\x12\x1b.swh.graph.TraversalRequest\x1a\x18.swh.graph.CountResponse\x12:\n\x05Stats\x12\x17.swh.graph.StatsRequest\x1a\x18.swh.graph.StatsResponseB0\n\x1eorg.softwareheritage.graph.rpcB\x0cGraphServiceP\x01\x62\x06proto3') + +_GRAPHDIRECTION = DESCRIPTOR.enum_types_by_name['GraphDirection'] +GraphDirection = enum_type_wrapper.EnumTypeWrapper(_GRAPHDIRECTION) +FORWARD = 0 +BACKWARD = 1 + + +_GETNODEREQUEST = DESCRIPTOR.message_types_by_name['GetNodeRequest'] +_TRAVERSALREQUEST = 
DESCRIPTOR.message_types_by_name['TraversalRequest'] +_FINDPATHTOREQUEST = DESCRIPTOR.message_types_by_name['FindPathToRequest'] +_FINDPATHBETWEENREQUEST = DESCRIPTOR.message_types_by_name['FindPathBetweenRequest'] +_NODEFILTER = DESCRIPTOR.message_types_by_name['NodeFilter'] +_NODE = DESCRIPTOR.message_types_by_name['Node'] +_PATH = DESCRIPTOR.message_types_by_name['Path'] +_SUCCESSOR = DESCRIPTOR.message_types_by_name['Successor'] +_CONTENTDATA = DESCRIPTOR.message_types_by_name['ContentData'] +_REVISIONDATA = DESCRIPTOR.message_types_by_name['RevisionData'] +_RELEASEDATA = DESCRIPTOR.message_types_by_name['ReleaseData'] +_ORIGINDATA = DESCRIPTOR.message_types_by_name['OriginData'] +_EDGELABEL = DESCRIPTOR.message_types_by_name['EdgeLabel'] +_COUNTRESPONSE = DESCRIPTOR.message_types_by_name['CountResponse'] +_STATSREQUEST = DESCRIPTOR.message_types_by_name['StatsRequest'] +_STATSRESPONSE = DESCRIPTOR.message_types_by_name['StatsResponse'] +GetNodeRequest = _reflection.GeneratedProtocolMessageType('GetNodeRequest', (_message.Message,), { + 'DESCRIPTOR' : _GETNODEREQUEST, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.GetNodeRequest) + }) +_sym_db.RegisterMessage(GetNodeRequest) + +TraversalRequest = _reflection.GeneratedProtocolMessageType('TraversalRequest', (_message.Message,), { + 'DESCRIPTOR' : _TRAVERSALREQUEST, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.TraversalRequest) + }) +_sym_db.RegisterMessage(TraversalRequest) + +FindPathToRequest = _reflection.GeneratedProtocolMessageType('FindPathToRequest', (_message.Message,), { + 'DESCRIPTOR' : _FINDPATHTOREQUEST, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.FindPathToRequest) + }) +_sym_db.RegisterMessage(FindPathToRequest) + +FindPathBetweenRequest = _reflection.GeneratedProtocolMessageType('FindPathBetweenRequest', (_message.Message,), { + 'DESCRIPTOR' : _FINDPATHBETWEENREQUEST, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.FindPathBetweenRequest) + }) +_sym_db.RegisterMessage(FindPathBetweenRequest) + +NodeFilter = _reflection.GeneratedProtocolMessageType('NodeFilter', (_message.Message,), { + 'DESCRIPTOR' : _NODEFILTER, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.NodeFilter) + }) +_sym_db.RegisterMessage(NodeFilter) + +Node = _reflection.GeneratedProtocolMessageType('Node', (_message.Message,), { + 'DESCRIPTOR' : _NODE, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.Node) + }) +_sym_db.RegisterMessage(Node) + +Path = _reflection.GeneratedProtocolMessageType('Path', (_message.Message,), { + 'DESCRIPTOR' : _PATH, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.Path) + }) +_sym_db.RegisterMessage(Path) + +Successor = _reflection.GeneratedProtocolMessageType('Successor', (_message.Message,), { + 'DESCRIPTOR' : _SUCCESSOR, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.Successor) + }) +_sym_db.RegisterMessage(Successor) + +ContentData = _reflection.GeneratedProtocolMessageType('ContentData', (_message.Message,), { + 'DESCRIPTOR' : _CONTENTDATA, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.ContentData) + }) +_sym_db.RegisterMessage(ContentData) + +RevisionData = 
_reflection.GeneratedProtocolMessageType('RevisionData', (_message.Message,), { + 'DESCRIPTOR' : _REVISIONDATA, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.RevisionData) + }) +_sym_db.RegisterMessage(RevisionData) + +ReleaseData = _reflection.GeneratedProtocolMessageType('ReleaseData', (_message.Message,), { + 'DESCRIPTOR' : _RELEASEDATA, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.ReleaseData) + }) +_sym_db.RegisterMessage(ReleaseData) + +OriginData = _reflection.GeneratedProtocolMessageType('OriginData', (_message.Message,), { + 'DESCRIPTOR' : _ORIGINDATA, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.OriginData) + }) +_sym_db.RegisterMessage(OriginData) + +EdgeLabel = _reflection.GeneratedProtocolMessageType('EdgeLabel', (_message.Message,), { + 'DESCRIPTOR' : _EDGELABEL, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.EdgeLabel) + }) +_sym_db.RegisterMessage(EdgeLabel) + +CountResponse = _reflection.GeneratedProtocolMessageType('CountResponse', (_message.Message,), { + 'DESCRIPTOR' : _COUNTRESPONSE, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.CountResponse) + }) +_sym_db.RegisterMessage(CountResponse) + +StatsRequest = _reflection.GeneratedProtocolMessageType('StatsRequest', (_message.Message,), { + 'DESCRIPTOR' : _STATSREQUEST, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.StatsRequest) + }) +_sym_db.RegisterMessage(StatsRequest) + +StatsResponse = _reflection.GeneratedProtocolMessageType('StatsResponse', (_message.Message,), { + 'DESCRIPTOR' : _STATSRESPONSE, + '__module__' : 'swh.graph.rpc.swhgraph_pb2' + # @@protoc_insertion_point(class_scope:swh.graph.StatsResponse) + }) +_sym_db.RegisterMessage(StatsResponse) + +_TRAVERSALSERVICE = DESCRIPTOR.services_by_name['TraversalService'] +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + DESCRIPTOR._serialized_options = b'\n\036org.softwareheritage.graph.rpcB\014GraphServiceP\001' + _GRAPHDIRECTION._serialized_start=2853 + _GRAPHDIRECTION._serialized_end=2896 + _GETNODEREQUEST._serialized_start=77 + _GETNODEREQUEST._serialized_end=164 + _TRAVERSALREQUEST._serialized_start=167 + _TRAVERSALREQUEST._serialized_end=511 + _FINDPATHTOREQUEST._serialized_start=514 + _FINDPATHTOREQUEST._serialized_end=793 + _FINDPATHBETWEENREQUEST._serialized_start=796 + _FINDPATHBETWEENREQUEST._serialized_end=1181 + _NODEFILTER._serialized_start=1184 + _NODEFILTER._serialized_end=1362 + _NODE._serialized_start=1365 + _NODE._serialized_end=1639 + _PATH._serialized_start=1641 + _PATH._serialized_end=1726 + _SUCCESSOR._serialized_start=1728 + _SUCCESSOR._serialized_end=1806 + _CONTENTDATA._serialized_start=1808 + _CONTENTDATA._serialized_end=1893 + _REVISIONDATA._serialized_start=1896 + _REVISIONDATA._serialized_end=2222 + _RELEASEDATA._serialized_start=2225 + _RELEASEDATA._serialized_end=2430 + _ORIGINDATA._serialized_start=2432 + _ORIGINDATA._serialized_end=2470 + _EDGELABEL._serialized_start=2472 + _EDGELABEL._serialized_end=2517 + _COUNTRESPONSE._serialized_start=2519 + _COUNTRESPONSE._serialized_end=2549 + _STATSREQUEST._serialized_start=2551 + _STATSREQUEST._serialized_end=2565 + _STATSRESPONSE._serialized_start=2568 + _STATSRESPONSE._serialized_end=2851 + _TRAVERSALSERVICE._serialized_start=2899 + 
_TRAVERSALSERVICE._serialized_end=3362 +# @@protoc_insertion_point(module_scope) diff --git a/swh/graph/rpc/swhgraph_pb2.pyi b/swh/graph/rpc/swhgraph_pb2.pyi new file mode 100644 index 0000000..1d099bf --- /dev/null +++ b/swh/graph/rpc/swhgraph_pb2.pyi @@ -0,0 +1,685 @@ +""" +@generated by mypy-protobuf. Do not edit manually! +isort:skip_file +""" +import builtins +import google.protobuf.descriptor +import google.protobuf.field_mask_pb2 +import google.protobuf.internal.containers +import google.protobuf.internal.enum_type_wrapper +import google.protobuf.message +import typing +import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +class _GraphDirection: + ValueType = typing.NewType('ValueType', builtins.int) + V: typing_extensions.TypeAlias = ValueType +class _GraphDirectionEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[_GraphDirection.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + FORWARD: _GraphDirection.ValueType # 0 + """Forward DAG: ori -> snp -> rel -> rev -> dir -> cnt""" + + BACKWARD: _GraphDirection.ValueType # 1 + """Transposed DAG: cnt -> dir -> rev -> rel -> snp -> ori""" + +class GraphDirection(_GraphDirection, metaclass=_GraphDirectionEnumTypeWrapper): + """Direction of the graph""" + pass + +FORWARD: GraphDirection.ValueType # 0 +"""Forward DAG: ori -> snp -> rel -> rev -> dir -> cnt""" + +BACKWARD: GraphDirection.ValueType # 1 +"""Transposed DAG: cnt -> dir -> rev -> rel -> snp -> ori""" + +global___GraphDirection = GraphDirection + + +class GetNodeRequest(google.protobuf.message.Message): + """Describe a node to return""" + DESCRIPTOR: google.protobuf.descriptor.Descriptor + SWHID_FIELD_NUMBER: builtins.int + MASK_FIELD_NUMBER: builtins.int + swhid: typing.Text + """SWHID of the node to return""" + + @property + def mask(self) -> google.protobuf.field_mask_pb2.FieldMask: + """FieldMask of which fields are to be returned (e.g., "swhid,cnt.length"). + By default, all fields are returned. + """ + pass + def __init__(self, + *, + swhid: typing.Text = ..., + mask: typing.Optional[google.protobuf.field_mask_pb2.FieldMask] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_mask",b"_mask","mask",b"mask"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_mask",b"_mask","mask",b"mask","swhid",b"swhid"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["_mask",b"_mask"]) -> typing.Optional[typing_extensions.Literal["mask"]]: ... +global___GetNodeRequest = GetNodeRequest + +class TraversalRequest(google.protobuf.message.Message): + """TraversalRequest describes how a breadth-first traversal should be + performed, and what should be returned to the client. + """ + DESCRIPTOR: google.protobuf.descriptor.Descriptor + SRC_FIELD_NUMBER: builtins.int + DIRECTION_FIELD_NUMBER: builtins.int + EDGES_FIELD_NUMBER: builtins.int + MAX_EDGES_FIELD_NUMBER: builtins.int + MIN_DEPTH_FIELD_NUMBER: builtins.int + MAX_DEPTH_FIELD_NUMBER: builtins.int + RETURN_NODES_FIELD_NUMBER: builtins.int + MASK_FIELD_NUMBER: builtins.int + @property + def src(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[typing.Text]: + """Set of source nodes (SWHIDs)""" + pass + direction: global___GraphDirection.ValueType + """Direction of the graph to traverse. Defaults to FORWARD.""" + + edges: typing.Text + """Edge restriction string (e.g. "rev:dir,dir:cnt"). + Defaults to "*" (all). 
+ """ + + max_edges: builtins.int + """Maximum number of edges accessed in the traversal, after which it stops. + Defaults to infinite. + """ + + min_depth: builtins.int + """Do not return nodes with a depth lower than this number. + By default, all depths are returned. + """ + + max_depth: builtins.int + """Maximum depth of the traversal, after which it stops. + Defaults to infinite. + """ + + @property + def return_nodes(self) -> global___NodeFilter: + """Filter which nodes will be sent to the stream. By default, all nodes are + returned. + """ + pass + @property + def mask(self) -> google.protobuf.field_mask_pb2.FieldMask: + """FieldMask of which fields are to be returned (e.g., "swhid,cnt.length"). + By default, all fields are returned. + """ + pass + def __init__(self, + *, + src: typing.Optional[typing.Iterable[typing.Text]] = ..., + direction: global___GraphDirection.ValueType = ..., + edges: typing.Optional[typing.Text] = ..., + max_edges: typing.Optional[builtins.int] = ..., + min_depth: typing.Optional[builtins.int] = ..., + max_depth: typing.Optional[builtins.int] = ..., + return_nodes: typing.Optional[global___NodeFilter] = ..., + mask: typing.Optional[google.protobuf.field_mask_pb2.FieldMask] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_edges",b"_edges","_mask",b"_mask","_max_depth",b"_max_depth","_max_edges",b"_max_edges","_min_depth",b"_min_depth","_return_nodes",b"_return_nodes","edges",b"edges","mask",b"mask","max_depth",b"max_depth","max_edges",b"max_edges","min_depth",b"min_depth","return_nodes",b"return_nodes"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_edges",b"_edges","_mask",b"_mask","_max_depth",b"_max_depth","_max_edges",b"_max_edges","_min_depth",b"_min_depth","_return_nodes",b"_return_nodes","direction",b"direction","edges",b"edges","mask",b"mask","max_depth",b"max_depth","max_edges",b"max_edges","min_depth",b"min_depth","return_nodes",b"return_nodes","src",b"src"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_edges",b"_edges"]) -> typing.Optional[typing_extensions.Literal["edges"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_mask",b"_mask"]) -> typing.Optional[typing_extensions.Literal["mask"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_max_depth",b"_max_depth"]) -> typing.Optional[typing_extensions.Literal["max_depth"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_max_edges",b"_max_edges"]) -> typing.Optional[typing_extensions.Literal["max_edges"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_min_depth",b"_min_depth"]) -> typing.Optional[typing_extensions.Literal["min_depth"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_return_nodes",b"_return_nodes"]) -> typing.Optional[typing_extensions.Literal["return_nodes"]]: ... +global___TraversalRequest = TraversalRequest + +class FindPathToRequest(google.protobuf.message.Message): + """FindPathToRequest describes a request to find a shortest path between a + set of nodes and a given target criteria, as well as what should be returned + in the path. 
+ """ + DESCRIPTOR: google.protobuf.descriptor.Descriptor + SRC_FIELD_NUMBER: builtins.int + TARGET_FIELD_NUMBER: builtins.int + DIRECTION_FIELD_NUMBER: builtins.int + EDGES_FIELD_NUMBER: builtins.int + MAX_EDGES_FIELD_NUMBER: builtins.int + MAX_DEPTH_FIELD_NUMBER: builtins.int + MASK_FIELD_NUMBER: builtins.int + @property + def src(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[typing.Text]: + """Set of source nodes (SWHIDs)""" + pass + @property + def target(self) -> global___NodeFilter: + """Target criteria, i.e., what constitutes a valid path destination.""" + pass + direction: global___GraphDirection.ValueType + """Direction of the graph to traverse. Defaults to FORWARD.""" + + edges: typing.Text + """Edge restriction string (e.g. "rev:dir,dir:cnt"). + Defaults to "*" (all). + """ + + max_edges: builtins.int + """Maximum number of edges accessed in the traversal, after which it stops. + Defaults to infinite. + """ + + max_depth: builtins.int + """Maximum depth of the traversal, after which it stops. + Defaults to infinite. + """ + + @property + def mask(self) -> google.protobuf.field_mask_pb2.FieldMask: + """FieldMask of which fields are to be returned (e.g., "swhid,cnt.length"). + By default, all fields are returned. + """ + pass + def __init__(self, + *, + src: typing.Optional[typing.Iterable[typing.Text]] = ..., + target: typing.Optional[global___NodeFilter] = ..., + direction: global___GraphDirection.ValueType = ..., + edges: typing.Optional[typing.Text] = ..., + max_edges: typing.Optional[builtins.int] = ..., + max_depth: typing.Optional[builtins.int] = ..., + mask: typing.Optional[google.protobuf.field_mask_pb2.FieldMask] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_edges",b"_edges","_mask",b"_mask","_max_depth",b"_max_depth","_max_edges",b"_max_edges","edges",b"edges","mask",b"mask","max_depth",b"max_depth","max_edges",b"max_edges","target",b"target"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_edges",b"_edges","_mask",b"_mask","_max_depth",b"_max_depth","_max_edges",b"_max_edges","direction",b"direction","edges",b"edges","mask",b"mask","max_depth",b"max_depth","max_edges",b"max_edges","src",b"src","target",b"target"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_edges",b"_edges"]) -> typing.Optional[typing_extensions.Literal["edges"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_mask",b"_mask"]) -> typing.Optional[typing_extensions.Literal["mask"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_max_depth",b"_max_depth"]) -> typing.Optional[typing_extensions.Literal["max_depth"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_max_edges",b"_max_edges"]) -> typing.Optional[typing_extensions.Literal["max_edges"]]: ... +global___FindPathToRequest = FindPathToRequest + +class FindPathBetweenRequest(google.protobuf.message.Message): + """FindPathToRequest describes a request to find a shortest path between a + set of source nodes and a set of destination nodes. It works by performing a + bidirectional breadth-first traversal from both sets at the same time. 
+ """ + DESCRIPTOR: google.protobuf.descriptor.Descriptor + SRC_FIELD_NUMBER: builtins.int + DST_FIELD_NUMBER: builtins.int + DIRECTION_FIELD_NUMBER: builtins.int + DIRECTION_REVERSE_FIELD_NUMBER: builtins.int + EDGES_FIELD_NUMBER: builtins.int + EDGES_REVERSE_FIELD_NUMBER: builtins.int + MAX_EDGES_FIELD_NUMBER: builtins.int + MAX_DEPTH_FIELD_NUMBER: builtins.int + MASK_FIELD_NUMBER: builtins.int + @property + def src(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[typing.Text]: + """Set of source nodes (SWHIDs)""" + pass + @property + def dst(self) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[typing.Text]: + """Set of destination nodes (SWHIDs)""" + pass + direction: global___GraphDirection.ValueType + """Direction of the graph to traverse from the source set. Defaults to + FORWARD. + """ + + direction_reverse: global___GraphDirection.ValueType + """Direction of the graph to traverse from the destination set. Defaults to + the opposite of `direction`. If direction and direction_reverse are + identical, it will find the first common successor of both sets in the + given direction. + """ + + edges: typing.Text + """Edge restriction string for the traversal from the source set. + (e.g. "rev:dir,dir:cnt"). Defaults to "*" (all). + """ + + edges_reverse: typing.Text + """Edge restriction string for the reverse traversal from the destination + set. + If not specified: + - If `edges` is not specified either, defaults to "*" + - If direction == direction_reverse, defaults to `edges` + - If direction != direction_reverse, defaults to the reverse of `edges` + (e.g. "rev:dir" becomes "dir:rev"). + """ + + max_edges: builtins.int + """Maximum number of edges accessed in the traversal, after which it stops. + Defaults to infinite. + """ + + max_depth: builtins.int + """Maximum depth of the traversal, after which it stops. + Defaults to infinite. + """ + + @property + def mask(self) -> google.protobuf.field_mask_pb2.FieldMask: + """FieldMask of which fields are to be returned (e.g., "swhid,cnt.length"). + By default, all fields are returned. + """ + pass + def __init__(self, + *, + src: typing.Optional[typing.Iterable[typing.Text]] = ..., + dst: typing.Optional[typing.Iterable[typing.Text]] = ..., + direction: global___GraphDirection.ValueType = ..., + direction_reverse: typing.Optional[global___GraphDirection.ValueType] = ..., + edges: typing.Optional[typing.Text] = ..., + edges_reverse: typing.Optional[typing.Text] = ..., + max_edges: typing.Optional[builtins.int] = ..., + max_depth: typing.Optional[builtins.int] = ..., + mask: typing.Optional[google.protobuf.field_mask_pb2.FieldMask] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_direction_reverse",b"_direction_reverse","_edges",b"_edges","_edges_reverse",b"_edges_reverse","_mask",b"_mask","_max_depth",b"_max_depth","_max_edges",b"_max_edges","direction_reverse",b"direction_reverse","edges",b"edges","edges_reverse",b"edges_reverse","mask",b"mask","max_depth",b"max_depth","max_edges",b"max_edges"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_direction_reverse",b"_direction_reverse","_edges",b"_edges","_edges_reverse",b"_edges_reverse","_mask",b"_mask","_max_depth",b"_max_depth","_max_edges",b"_max_edges","direction",b"direction","direction_reverse",b"direction_reverse","dst",b"dst","edges",b"edges","edges_reverse",b"edges_reverse","mask",b"mask","max_depth",b"max_depth","max_edges",b"max_edges","src",b"src"]) -> None: ... 
+ @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_direction_reverse",b"_direction_reverse"]) -> typing.Optional[typing_extensions.Literal["direction_reverse"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_edges",b"_edges"]) -> typing.Optional[typing_extensions.Literal["edges"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_edges_reverse",b"_edges_reverse"]) -> typing.Optional[typing_extensions.Literal["edges_reverse"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_mask",b"_mask"]) -> typing.Optional[typing_extensions.Literal["mask"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_max_depth",b"_max_depth"]) -> typing.Optional[typing_extensions.Literal["max_depth"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_max_edges",b"_max_edges"]) -> typing.Optional[typing_extensions.Literal["max_edges"]]: ... +global___FindPathBetweenRequest = FindPathBetweenRequest + +class NodeFilter(google.protobuf.message.Message): + """Represents various criteria that make a given node "valid". A node is + only valid if all the subcriteria present in this message are fulfilled. + """ + DESCRIPTOR: google.protobuf.descriptor.Descriptor + TYPES_FIELD_NUMBER: builtins.int + MIN_TRAVERSAL_SUCCESSORS_FIELD_NUMBER: builtins.int + MAX_TRAVERSAL_SUCCESSORS_FIELD_NUMBER: builtins.int + types: typing.Text + """Node restriction string. (e.g. "dir,cnt,rev"). Defaults to "*" (all).""" + + min_traversal_successors: builtins.int + """Minimum number of successors encountered *during the traversal*. + Default: no constraint + """ + + max_traversal_successors: builtins.int + """Maximum number of successors encountered *during the traversal*. + Default: no constraint + """ + + def __init__(self, + *, + types: typing.Optional[typing.Text] = ..., + min_traversal_successors: typing.Optional[builtins.int] = ..., + max_traversal_successors: typing.Optional[builtins.int] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_max_traversal_successors",b"_max_traversal_successors","_min_traversal_successors",b"_min_traversal_successors","_types",b"_types","max_traversal_successors",b"max_traversal_successors","min_traversal_successors",b"min_traversal_successors","types",b"types"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_max_traversal_successors",b"_max_traversal_successors","_min_traversal_successors",b"_min_traversal_successors","_types",b"_types","max_traversal_successors",b"max_traversal_successors","min_traversal_successors",b"min_traversal_successors","types",b"types"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_max_traversal_successors",b"_max_traversal_successors"]) -> typing.Optional[typing_extensions.Literal["max_traversal_successors"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_min_traversal_successors",b"_min_traversal_successors"]) -> typing.Optional[typing_extensions.Literal["min_traversal_successors"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_types",b"_types"]) -> typing.Optional[typing_extensions.Literal["types"]]: ... 
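The `_types`, `_min_traversal_successors` and `_max_traversal_successors` oneof groups above are the synthetic oneofs that proto3 generates for `optional` scalar fields; they are what makes field presence testable from Python. A small editorial sketch of the resulting API, assuming the generated module is importable:

```python
from swh.graph.rpc.swhgraph_pb2 import NodeFilter

# A "leaves only" filter, as configured by LeavesView/CountLeavesView above.
f = NodeFilter(max_traversal_successors=0)
assert f.HasField("max_traversal_successors")  # explicitly set, even if 0
assert not f.HasField("types")                 # unset, distinct from ""
assert f.WhichOneof("_types") is None          # synthetic oneof is empty
```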
+global___NodeFilter = NodeFilter + +class Node(google.protobuf.message.Message): + """Represents a node in the graph.""" + DESCRIPTOR: google.protobuf.descriptor.Descriptor + SWHID_FIELD_NUMBER: builtins.int + SUCCESSOR_FIELD_NUMBER: builtins.int + NUM_SUCCESSORS_FIELD_NUMBER: builtins.int + CNT_FIELD_NUMBER: builtins.int + REV_FIELD_NUMBER: builtins.int + REL_FIELD_NUMBER: builtins.int + ORI_FIELD_NUMBER: builtins.int + swhid: typing.Text + """The SWHID of the graph node.""" + + @property + def successor(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___Successor]: + """List of relevant successors of this node.""" + pass + num_successors: builtins.int + """Number of relevant successors.""" + + @property + def cnt(self) -> global___ContentData: ... + @property + def rev(self) -> global___RevisionData: ... + @property + def rel(self) -> global___ReleaseData: ... + @property + def ori(self) -> global___OriginData: ... + def __init__(self, + *, + swhid: typing.Text = ..., + successor: typing.Optional[typing.Iterable[global___Successor]] = ..., + num_successors: typing.Optional[builtins.int] = ..., + cnt: typing.Optional[global___ContentData] = ..., + rev: typing.Optional[global___RevisionData] = ..., + rel: typing.Optional[global___ReleaseData] = ..., + ori: typing.Optional[global___OriginData] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_num_successors",b"_num_successors","cnt",b"cnt","data",b"data","num_successors",b"num_successors","ori",b"ori","rel",b"rel","rev",b"rev"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_num_successors",b"_num_successors","cnt",b"cnt","data",b"data","num_successors",b"num_successors","ori",b"ori","rel",b"rel","rev",b"rev","successor",b"successor","swhid",b"swhid"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_num_successors",b"_num_successors"]) -> typing.Optional[typing_extensions.Literal["num_successors"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["data",b"data"]) -> typing.Optional[typing_extensions.Literal["cnt","rev","rel","ori"]]: ... +global___Node = Node + +class Path(google.protobuf.message.Message): + """Represents a path in the graph.""" + DESCRIPTOR: google.protobuf.descriptor.Descriptor + NODE_FIELD_NUMBER: builtins.int + MIDPOINT_INDEX_FIELD_NUMBER: builtins.int + @property + def node(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___Node]: + """List of nodes in the path, from source to destination""" + pass + midpoint_index: builtins.int + """Index of the "midpoint" of the path. For paths obtained with + bidirectional search queries, this is the node that joined the two + sets together. When looking for a common ancestor between two nodes by + performing a FindPathBetween search with two backward graphs, this will + be the index of the common ancestor in the path. + """ + + def __init__(self, + *, + node: typing.Optional[typing.Iterable[global___Node]] = ..., + midpoint_index: typing.Optional[builtins.int] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_midpoint_index",b"_midpoint_index","midpoint_index",b"midpoint_index"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_midpoint_index",b"_midpoint_index","midpoint_index",b"midpoint_index","node",b"node"]) -> None: ... 
+ def WhichOneof(self, oneof_group: typing_extensions.Literal["_midpoint_index",b"_midpoint_index"]) -> typing.Optional[typing_extensions.Literal["midpoint_index"]]: ... +global___Path = Path + +class Successor(google.protobuf.message.Message): + """Represents a successor of a given node.""" + DESCRIPTOR: google.protobuf.descriptor.Descriptor + SWHID_FIELD_NUMBER: builtins.int + LABEL_FIELD_NUMBER: builtins.int + swhid: typing.Text + """The SWHID of the successor""" + + @property + def label(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___EdgeLabel]: + """A list of edge labels for the given edge""" + pass + def __init__(self, + *, + swhid: typing.Optional[typing.Text] = ..., + label: typing.Optional[typing.Iterable[global___EdgeLabel]] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_swhid",b"_swhid","swhid",b"swhid"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_swhid",b"_swhid","label",b"label","swhid",b"swhid"]) -> None: ... + def WhichOneof(self, oneof_group: typing_extensions.Literal["_swhid",b"_swhid"]) -> typing.Optional[typing_extensions.Literal["swhid"]]: ... +global___Successor = Successor + +class ContentData(google.protobuf.message.Message): + """Content node properties""" + DESCRIPTOR: google.protobuf.descriptor.Descriptor + LENGTH_FIELD_NUMBER: builtins.int + IS_SKIPPED_FIELD_NUMBER: builtins.int + length: builtins.int + """Length of the blob, in bytes""" + + is_skipped: builtins.bool + """Whether the content was skipped during ingestion.""" + + def __init__(self, + *, + length: typing.Optional[builtins.int] = ..., + is_skipped: typing.Optional[builtins.bool] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_is_skipped",b"_is_skipped","_length",b"_length","is_skipped",b"is_skipped","length",b"length"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_is_skipped",b"_is_skipped","_length",b"_length","is_skipped",b"is_skipped","length",b"length"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_is_skipped",b"_is_skipped"]) -> typing.Optional[typing_extensions.Literal["is_skipped"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_length",b"_length"]) -> typing.Optional[typing_extensions.Literal["length"]]: ... 
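Successor and label data can be read straight off a returned Node. A sketch of listing a directory's entries follows; it assumes GetNodeRequest accepts the node's SWHID (with all fields returned when no mask is given), and the address and SWHID are placeholders:

# Sketch: enumerate a directory's entries with their names and permissions
# (EdgeLabel, defined below, carries the entry name and permission bits).
import grpc
from swh.graph.rpc import swhgraph_pb2, swhgraph_pb2_grpc

with grpc.insecure_channel("localhost:50091") as channel:
    stub = swhgraph_pb2_grpc.TraversalServiceStub(channel)
    node = stub.GetNode(
        swhgraph_pb2.GetNodeRequest(swhid="swh:1:dir:" + "0" * 40)
    )
    for succ in node.successor:
        for label in succ.label:
            print(label.name.decode(), oct(label.permission), succ.swhid)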
+global___ContentData = ContentData + +class RevisionData(google.protobuf.message.Message): + """Revision node properties""" + DESCRIPTOR: google.protobuf.descriptor.Descriptor + AUTHOR_FIELD_NUMBER: builtins.int + AUTHOR_DATE_FIELD_NUMBER: builtins.int + AUTHOR_DATE_OFFSET_FIELD_NUMBER: builtins.int + COMMITTER_FIELD_NUMBER: builtins.int + COMMITTER_DATE_FIELD_NUMBER: builtins.int + COMMITTER_DATE_OFFSET_FIELD_NUMBER: builtins.int + MESSAGE_FIELD_NUMBER: builtins.int + author: builtins.int + """Revision author ID (anonymized)""" + + author_date: builtins.int + """UNIX timestamp of the revision date (UTC)""" + + author_date_offset: builtins.int + """Timezone of the revision author date as an offset from UTC""" + + committer: builtins.int + """Revision committer ID (anonymized)""" + + committer_date: builtins.int + """UNIX timestamp of the revision committer date (UTC)""" + + committer_date_offset: builtins.int + """Timezone of the revision committer date as an offset from UTC""" + + message: builtins.bytes + """Revision message""" + + def __init__(self, + *, + author: typing.Optional[builtins.int] = ..., + author_date: typing.Optional[builtins.int] = ..., + author_date_offset: typing.Optional[builtins.int] = ..., + committer: typing.Optional[builtins.int] = ..., + committer_date: typing.Optional[builtins.int] = ..., + committer_date_offset: typing.Optional[builtins.int] = ..., + message: typing.Optional[builtins.bytes] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_author",b"_author","_author_date",b"_author_date","_author_date_offset",b"_author_date_offset","_committer",b"_committer","_committer_date",b"_committer_date","_committer_date_offset",b"_committer_date_offset","_message",b"_message","author",b"author","author_date",b"author_date","author_date_offset",b"author_date_offset","committer",b"committer","committer_date",b"committer_date","committer_date_offset",b"committer_date_offset","message",b"message"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_author",b"_author","_author_date",b"_author_date","_author_date_offset",b"_author_date_offset","_committer",b"_committer","_committer_date",b"_committer_date","_committer_date_offset",b"_committer_date_offset","_message",b"_message","author",b"author","author_date",b"author_date","author_date_offset",b"author_date_offset","committer",b"committer","committer_date",b"committer_date","committer_date_offset",b"committer_date_offset","message",b"message"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_author",b"_author"]) -> typing.Optional[typing_extensions.Literal["author"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_author_date",b"_author_date"]) -> typing.Optional[typing_extensions.Literal["author_date"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_author_date_offset",b"_author_date_offset"]) -> typing.Optional[typing_extensions.Literal["author_date_offset"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_committer",b"_committer"]) -> typing.Optional[typing_extensions.Literal["committer"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_committer_date",b"_committer_date"]) -> typing.Optional[typing_extensions.Literal["committer_date"]]: ... 
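The author and committer dates above are split into a UNIX timestamp and a UTC offset. A sketch of recombining them, under the assumption that the offset is expressed in minutes as in git:

# Sketch: turn a RevisionData message into an aware datetime
# (assumes author_date_offset is a UTC offset in minutes, git-style).
from datetime import datetime, timedelta, timezone

def author_datetime(rev):
    # rev is the `rev` field of a Node returned by the service
    tz = timezone(timedelta(minutes=rev.author_date_offset))
    return datetime.fromtimestamp(rev.author_date, tz)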
+ @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_committer_date_offset",b"_committer_date_offset"]) -> typing.Optional[typing_extensions.Literal["committer_date_offset"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_message",b"_message"]) -> typing.Optional[typing_extensions.Literal["message"]]: ... +global___RevisionData = RevisionData + +class ReleaseData(google.protobuf.message.Message): + """Release node properties""" + DESCRIPTOR: google.protobuf.descriptor.Descriptor + AUTHOR_FIELD_NUMBER: builtins.int + AUTHOR_DATE_FIELD_NUMBER: builtins.int + AUTHOR_DATE_OFFSET_FIELD_NUMBER: builtins.int + NAME_FIELD_NUMBER: builtins.int + MESSAGE_FIELD_NUMBER: builtins.int + author: builtins.int + """Release author ID (anonymized)""" + + author_date: builtins.int + """UNIX timestamp of the release date (UTC)""" + + author_date_offset: builtins.int + """Timezone of the release author date as an offset from UTC""" + + name: builtins.bytes + """Release name""" + + message: builtins.bytes + """Release message""" + + def __init__(self, + *, + author: typing.Optional[builtins.int] = ..., + author_date: typing.Optional[builtins.int] = ..., + author_date_offset: typing.Optional[builtins.int] = ..., + name: typing.Optional[builtins.bytes] = ..., + message: typing.Optional[builtins.bytes] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_author",b"_author","_author_date",b"_author_date","_author_date_offset",b"_author_date_offset","_message",b"_message","_name",b"_name","author",b"author","author_date",b"author_date","author_date_offset",b"author_date_offset","message",b"message","name",b"name"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_author",b"_author","_author_date",b"_author_date","_author_date_offset",b"_author_date_offset","_message",b"_message","_name",b"_name","author",b"author","author_date",b"author_date","author_date_offset",b"author_date_offset","message",b"message","name",b"name"]) -> None: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_author",b"_author"]) -> typing.Optional[typing_extensions.Literal["author"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_author_date",b"_author_date"]) -> typing.Optional[typing_extensions.Literal["author_date"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_author_date_offset",b"_author_date_offset"]) -> typing.Optional[typing_extensions.Literal["author_date_offset"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_message",b"_message"]) -> typing.Optional[typing_extensions.Literal["message"]]: ... + @typing.overload + def WhichOneof(self, oneof_group: typing_extensions.Literal["_name",b"_name"]) -> typing.Optional[typing_extensions.Literal["name"]]: ... +global___ReleaseData = ReleaseData + +class OriginData(google.protobuf.message.Message): + """Origin node properties""" + DESCRIPTOR: google.protobuf.descriptor.Descriptor + URL_FIELD_NUMBER: builtins.int + url: typing.Text + """URL of the origin""" + + def __init__(self, + *, + url: typing.Optional[typing.Text] = ..., + ) -> None: ... + def HasField(self, field_name: typing_extensions.Literal["_url",b"_url","url",b"url"]) -> builtins.bool: ... + def ClearField(self, field_name: typing_extensions.Literal["_url",b"_url","url",b"url"]) -> None: ... 
+ def WhichOneof(self, oneof_group: typing_extensions.Literal["_url",b"_url"]) -> typing.Optional[typing_extensions.Literal["url"]]: ... +global___OriginData = OriginData + +class EdgeLabel(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + NAME_FIELD_NUMBER: builtins.int + PERMISSION_FIELD_NUMBER: builtins.int + name: builtins.bytes + """Directory entry name for directories, branch name for snapshots""" + + permission: builtins.int + """Entry permission (only set for directories).""" + + def __init__(self, + *, + name: builtins.bytes = ..., + permission: builtins.int = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["name",b"name","permission",b"permission"]) -> None: ... +global___EdgeLabel = EdgeLabel + +class CountResponse(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + COUNT_FIELD_NUMBER: builtins.int + count: builtins.int + def __init__(self, + *, + count: builtins.int = ..., + ) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["count",b"count"]) -> None: ... +global___CountResponse = CountResponse + +class StatsRequest(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + def __init__(self, + ) -> None: ... +global___StatsRequest = StatsRequest + +class StatsResponse(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + NUM_NODES_FIELD_NUMBER: builtins.int + NUM_EDGES_FIELD_NUMBER: builtins.int + COMPRESSION_RATIO_FIELD_NUMBER: builtins.int + BITS_PER_NODE_FIELD_NUMBER: builtins.int + BITS_PER_EDGE_FIELD_NUMBER: builtins.int + AVG_LOCALITY_FIELD_NUMBER: builtins.int + INDEGREE_MIN_FIELD_NUMBER: builtins.int + INDEGREE_MAX_FIELD_NUMBER: builtins.int + INDEGREE_AVG_FIELD_NUMBER: builtins.int + OUTDEGREE_MIN_FIELD_NUMBER: builtins.int + OUTDEGREE_MAX_FIELD_NUMBER: builtins.int + OUTDEGREE_AVG_FIELD_NUMBER: builtins.int + num_nodes: builtins.int + """Number of nodes in the graph""" + + num_edges: builtins.int + """Number of edges in the graph""" + + compression_ratio: builtins.float + """Ratio between the graph size and the information-theoretical lower + bound + """ + + bits_per_node: builtins.float + """Number of bits per node (overall graph size in bits divided by the + number of nodes) + """ + + bits_per_edge: builtins.float + """Number of bits per edge (overall graph size in bits divided by the + number of arcs). + """ + + avg_locality: builtins.float + indegree_min: builtins.int + """Smallest indegree""" + + indegree_max: builtins.int + """Largest indegree""" + + indegree_avg: builtins.float + """Average indegree""" + + outdegree_min: builtins.int + """Smallest outdegree""" + + outdegree_max: builtins.int + """Largest outdegree""" + + outdegree_avg: builtins.float + """Average outdegree""" + + def __init__(self, + *, + num_nodes: builtins.int = ..., + num_edges: builtins.int = ..., + compression_ratio: builtins.float = ..., + bits_per_node: builtins.float = ..., + bits_per_edge: builtins.float = ..., + avg_locality: builtins.float = ..., + indegree_min: builtins.int = ..., + indegree_max: builtins.int = ..., + indegree_avg: builtins.float = ..., + outdegree_min: builtins.int = ..., + outdegree_max: builtins.int = ..., + outdegree_avg: builtins.float = ..., + ) -> None: ... 
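A sketch of querying these statistics, under the same placeholder localhost:50091 assumption as the sketches above:

# Sketch: fetch graph-wide statistics from a running server.
import grpc
from swh.graph.rpc import swhgraph_pb2, swhgraph_pb2_grpc

with grpc.insecure_channel("localhost:50091") as channel:
    stub = swhgraph_pb2_grpc.TraversalServiceStub(channel)
    stats = stub.Stats(swhgraph_pb2.StatsRequest())
    print(
        f"{stats.num_nodes} nodes, {stats.num_edges} edges, "
        f"{stats.bits_per_edge:.3f} bits per edge"
    )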
+ def ClearField(self, field_name: typing_extensions.Literal["avg_locality",b"avg_locality","bits_per_edge",b"bits_per_edge","bits_per_node",b"bits_per_node","compression_ratio",b"compression_ratio","indegree_avg",b"indegree_avg","indegree_max",b"indegree_max","indegree_min",b"indegree_min","num_edges",b"num_edges","num_nodes",b"num_nodes","outdegree_avg",b"outdegree_avg","outdegree_max",b"outdegree_max","outdegree_min",b"outdegree_min"]) -> None: ... +global___StatsResponse = StatsResponse diff --git a/swh/graph/rpc/swhgraph_pb2_grpc.py b/swh/graph/rpc/swhgraph_pb2_grpc.py new file mode 100644 index 0000000..ff60903 --- /dev/null +++ b/swh/graph/rpc/swhgraph_pb2_grpc.py @@ -0,0 +1,303 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +from swh.graph.rpc import swhgraph_pb2 as swh_dot_graph_dot_rpc_dot_swhgraph__pb2 + + +class TraversalServiceStub(object): + """Graph traversal service + """ + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.GetNode = channel.unary_unary( + '/swh.graph.TraversalService/GetNode', + request_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.GetNodeRequest.SerializeToString, + response_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Node.FromString, + ) + self.Traverse = channel.unary_stream( + '/swh.graph.TraversalService/Traverse', + request_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.TraversalRequest.SerializeToString, + response_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Node.FromString, + ) + self.FindPathTo = channel.unary_unary( + '/swh.graph.TraversalService/FindPathTo', + request_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.FindPathToRequest.SerializeToString, + response_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Path.FromString, + ) + self.FindPathBetween = channel.unary_unary( + '/swh.graph.TraversalService/FindPathBetween', + request_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.FindPathBetweenRequest.SerializeToString, + response_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Path.FromString, + ) + self.CountNodes = channel.unary_unary( + '/swh.graph.TraversalService/CountNodes', + request_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.TraversalRequest.SerializeToString, + response_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.CountResponse.FromString, + ) + self.CountEdges = channel.unary_unary( + '/swh.graph.TraversalService/CountEdges', + request_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.TraversalRequest.SerializeToString, + response_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.CountResponse.FromString, + ) + self.Stats = channel.unary_unary( + '/swh.graph.TraversalService/Stats', + request_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.StatsRequest.SerializeToString, + response_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.StatsResponse.FromString, + ) + + +class TraversalServiceServicer(object): + """Graph traversal service + """ + + def GetNode(self, request, context): + """GetNode returns a single Node and its properties. 
+ """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def Traverse(self, request, context): + """Traverse performs a breadth-first graph traversal from a set of source + nodes, then streams the nodes it encounters (if they match a given + return filter), along with their properties. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def FindPathTo(self, request, context): + """FindPathTo searches for a shortest path between a set of source nodes + and a node that matches a specific *criteria*. + + It does so by performing a breadth-first search from the source node, + until any node that matches the given criteria is found, then follows + back its parents to return a shortest path from the source set to that + node. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def FindPathBetween(self, request, context): + """FindPathBetween searches for a shortest path between a set of source + nodes and a set of destination nodes. + + It does so by performing a *bidirectional breadth-first search*, i.e., + two parallel breadth-first searches, one from the source set ("src-BFS") + and one from the destination set ("dst-BFS"), until both searches find a + common node that joins their visited sets. This node is called the + "midpoint node". + The path returned is the path src -> ... -> midpoint -> ... -> dst, + which is always a shortest path between src and dst. + + The graph direction of both BFS can be configured separately. By + default, the dst-BFS will use the graph in the opposite direction than + the src-BFS (if direction = FORWARD, by default direction_reverse = + BACKWARD, and vice-versa). The default behavior is thus to search for + a shortest path between two nodes in a given direction. However, one + can also specify FORWARD or BACKWARD for *both* the src-BFS and the + dst-BFS. This will search for a common descendant or a common ancestor + between the two sets, respectively. These will be the midpoints of the + returned path. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def CountNodes(self, request, context): + """CountNodes does the same as Traverse, but only returns the number of + nodes accessed during the traversal. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def CountEdges(self, request, context): + """CountEdges does the same as Traverse, but only returns the number of + edges accessed during the traversal. + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + def Stats(self, request, context): + """Stats returns various statistics on the overall graph. 
+ """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_TraversalServiceServicer_to_server(servicer, server): + rpc_method_handlers = { + 'GetNode': grpc.unary_unary_rpc_method_handler( + servicer.GetNode, + request_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.GetNodeRequest.FromString, + response_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Node.SerializeToString, + ), + 'Traverse': grpc.unary_stream_rpc_method_handler( + servicer.Traverse, + request_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.TraversalRequest.FromString, + response_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Node.SerializeToString, + ), + 'FindPathTo': grpc.unary_unary_rpc_method_handler( + servicer.FindPathTo, + request_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.FindPathToRequest.FromString, + response_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Path.SerializeToString, + ), + 'FindPathBetween': grpc.unary_unary_rpc_method_handler( + servicer.FindPathBetween, + request_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.FindPathBetweenRequest.FromString, + response_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Path.SerializeToString, + ), + 'CountNodes': grpc.unary_unary_rpc_method_handler( + servicer.CountNodes, + request_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.TraversalRequest.FromString, + response_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.CountResponse.SerializeToString, + ), + 'CountEdges': grpc.unary_unary_rpc_method_handler( + servicer.CountEdges, + request_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.TraversalRequest.FromString, + response_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.CountResponse.SerializeToString, + ), + 'Stats': grpc.unary_unary_rpc_method_handler( + servicer.Stats, + request_deserializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.StatsRequest.FromString, + response_serializer=swh_dot_graph_dot_rpc_dot_swhgraph__pb2.StatsResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'swh.graph.TraversalService', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + + # This class is part of an EXPERIMENTAL API. 
+class TraversalService(object): + """Graph traversal service + """ + + @staticmethod + def GetNode(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/swh.graph.TraversalService/GetNode', + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.GetNodeRequest.SerializeToString, + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Node.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def Traverse(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_stream(request, target, '/swh.graph.TraversalService/Traverse', + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.TraversalRequest.SerializeToString, + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Node.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def FindPathTo(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/swh.graph.TraversalService/FindPathTo', + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.FindPathToRequest.SerializeToString, + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Path.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def FindPathBetween(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/swh.graph.TraversalService/FindPathBetween', + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.FindPathBetweenRequest.SerializeToString, + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.Path.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def CountNodes(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/swh.graph.TraversalService/CountNodes', + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.TraversalRequest.SerializeToString, + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.CountResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def CountEdges(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/swh.graph.TraversalService/CountEdges', + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.TraversalRequest.SerializeToString, + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.CountResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + + @staticmethod + def Stats(request, + target, + options=(), + channel_credentials=None, + 
call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/swh.graph.TraversalService/Stats', + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.StatsRequest.SerializeToString, + swh_dot_graph_dot_rpc_dot_swhgraph__pb2.StatsResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/swh/graph/rpc_server.py b/swh/graph/rpc_server.py new file mode 100644 index 0000000..e4b4f1e --- /dev/null +++ b/swh/graph/rpc_server.py @@ -0,0 +1,33 @@ +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +""" +A simple tool to start the swh-graph GRPC server in Java. +""" + +import subprocess + +import aiohttp.test_utils +import aiohttp.web + +from swh.graph.config import check_config + + +def spawn_java_rpc_server(config, port=None): + if port is None: + port = aiohttp.test_utils.unused_port() + config = check_config(config or {}) + cmd = [ + "java", + "-cp", + config["classpath"], + *config["java_tool_options"].split(), + "org.softwareheritage.graph.rpc.GraphServer", + "--port", + str(port), + config["graph"]["path"], + ] + server = subprocess.Popen(cmd) + return server, port diff --git a/swh/graph/server/__init__.py b/swh/graph/server/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/swh/graph/swhid.py b/swh/graph/swhid.py deleted file mode 100644 index 90db73f..0000000 --- a/swh/graph/swhid.py +++ /dev/null @@ -1,419 +0,0 @@ -# Copyright (C) 2019-2021 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from __future__ import annotations - -from collections.abc import MutableMapping -from enum import Enum -import mmap -from mmap import MAP_SHARED, PROT_READ, PROT_WRITE -import os -import struct -from typing import BinaryIO, Iterator, Tuple - -from swh.model.hashutil import hash_to_hex -from swh.model.swhids import ExtendedObjectType, ExtendedSWHID - -SWHID_BIN_FMT = "BB20s" # 2 unsigned chars + 20 bytes -INT_BIN_FMT = ">q" # big endian, 8-byte integer -SWHID_BIN_SIZE = 22 # in bytes -INT_BIN_SIZE = 8 # in bytes - - -class SwhidType(Enum): - """types of existing SWHIDs, used to serialize ExtendedSWHID type as a (char) - integer - - Note that the order does matter also for driving the binary search in - SWHID-indexed maps. Integer values also matter, for compatibility with the - Java layer. 
- - """ - - content = 0 - directory = 1 - origin = 2 - release = 3 - revision = 4 - snapshot = 5 - - @classmethod - def from_extended_object_type(cls, object_type: ExtendedObjectType) -> SwhidType: - return cls[object_type.name.lower()] - - def to_extended_object_type(self) -> ExtendedObjectType: - return ExtendedObjectType[SwhidType(self).name.upper()] - - -def str_to_bytes(swhid_str: str) -> bytes: - """Convert a SWHID to a byte sequence - - The binary format used to represent SWHIDs as 22-byte long byte sequences as - follows: - - - 1 byte for the namespace version represented as a C `unsigned char` - - 1 byte for the object type, as the int value of :class:`SwhidType` enums, - represented as a C `unsigned char` - - 20 bytes for the SHA1 digest as a byte sequence - - Args: - swhid: persistent identifier - - Returns: - bytes: byte sequence representation of swhid - - """ - swhid = ExtendedSWHID.from_string(swhid_str) - return struct.pack( - SWHID_BIN_FMT, - swhid.scheme_version, - SwhidType.from_extended_object_type(swhid.object_type).value, - swhid.object_id, - ) - - -def bytes_to_str(bytes: bytes) -> str: - """Inverse function of :func:`str_to_bytes` - - See :func:`str_to_bytes` for a description of the binary SWHID format. - - Args: - bytes: byte sequence representation of swhid - - Returns: - swhid: persistent identifier - - """ - (version, type, bin_digest) = struct.unpack(SWHID_BIN_FMT, bytes) - - # The following is equivalent to: - # return str(ExtendedSWHID( - # object_type=SwhidType(type).to_extended_object_type(), object_id=bin_digest - # ) - # but more efficient, because ExtendedSWHID.__init__ is extremely slow. - object_type = ExtendedObjectType[SwhidType(type).name.upper()] - return f"swh:1:{object_type.value}:{hash_to_hex(bin_digest)}" - - -class _OnDiskMap: - """mmap-ed on-disk sequence of fixed size records""" - - def __init__( - self, record_size: int, fname: str, mode: str = "rb", length: int = None - ): - """open an existing on-disk map - - Args: - record_size: size of each record in bytes - fname: path to the on-disk map - mode: file open mode, usually either 'rb' for read-only maps, 'wb' - for creating new maps, or 'rb+' for updating existing ones - (default: 'rb') - length: map size in number of logical records; used to initialize - writable maps at creation time. 
Must be given when mode is 'wb' - and the map doesn't exist on disk; ignored otherwise - - """ - os_modes = {"rb": os.O_RDONLY, "wb": os.O_RDWR | os.O_CREAT, "rb+": os.O_RDWR} - if mode not in os_modes: - raise ValueError("invalid file open mode: " + mode) - new_map = mode == "wb" - writable_map = mode in ["wb", "rb+"] - - self.record_size = record_size - self.fd = os.open(fname, os_modes[mode]) - if new_map: - if length is None: - raise ValueError("missing length when creating new map") - os.truncate(self.fd, length * self.record_size) - - self.size = os.path.getsize(fname) - (self.length, remainder) = divmod(self.size, record_size) - if remainder: - raise ValueError( - "map size {} is not a multiple of the record size {}".format( - self.size, record_size - ) - ) - - self.mm = mmap.mmap( - self.fd, - self.size, - prot=(PROT_READ | PROT_WRITE if writable_map else PROT_READ), - flags=MAP_SHARED, - ) - - def close(self) -> None: - """close the map - - shuts down both the mmap and the underlying file descriptor - - """ - if not self.mm.closed: - self.mm.close() - os.close(self.fd) - - def __len__(self) -> int: - return self.length - - def __delitem__(self, pos: int) -> None: - raise NotImplementedError("cannot delete records from fixed-size map") - - -class SwhidToNodeMap(_OnDiskMap, MutableMapping): - """memory mapped map from :ref:`SWHIDs ` to a - continuous range 0..N of (8-byte long) integers - - This is the converse mapping of :class:`NodeToSwhidMap`. - - The on-disk serialization format is a sequence of fixed length (30 bytes) - records with the following fields: - - - SWHID (22 bytes): binary SWHID representation as per :func:`str_to_bytes` - - long (8 bytes): big endian long integer - - The records are sorted lexicographically by SWHID type and checksum, where - type is the integer value of :class:`SwhidType`. SWHID lookup in the map is - performed via binary search. Hence a huge map with, say, 11 B entries, - will require ~30 disk seeks. - - Note that, due to fixed size + ordering, it is not possible to create these - maps by random writing. Hence, __setitem__ can be used only to *update* the - value associated to an existing key, rather than to add a missing item. To - create an entire map from scratch, you should do so *sequentially*, using - static method :meth:`write_record` (or, at your own risk, by hand via the - mmap :attr:`mm`). - - """ - - # record binary format: SWHID + a big endian 8-byte big endian integer - RECORD_BIN_FMT = ">" + SWHID_BIN_FMT + "q" - RECORD_SIZE = SWHID_BIN_SIZE + INT_BIN_SIZE - - def __init__(self, fname: str, mode: str = "rb", length: int = None): - """open an existing on-disk map - - Args: - fname: path to the on-disk map - mode: file open mode, usually either 'rb' for read-only maps, 'wb' - for creating new maps, or 'rb+' for updating existing ones - (default: 'rb') - length: map size in number of logical records; used to initialize - read-write maps at creation time. 
Must be given when mode is - 'wb'; ignored otherwise - - """ - super().__init__(self.RECORD_SIZE, fname, mode=mode, length=length) - - def _get_bin_record(self, pos: int) -> Tuple[bytes, bytes]: - """seek and return the (binary) record at a given (logical) position - - see :func:`_get_record` for an equivalent function with additional - deserialization - - Args: - pos: 0-based record number - - Returns: - a pair `(swhid, int)`, where swhid and int are bytes - - """ - rec_pos = pos * self.RECORD_SIZE - int_pos = rec_pos + SWHID_BIN_SIZE - - return (self.mm[rec_pos:int_pos], self.mm[int_pos : int_pos + INT_BIN_SIZE]) - - def _get_record(self, pos: int) -> Tuple[str, int]: - """seek and return the record at a given (logical) position - - moral equivalent of :func:`_get_bin_record`, with additional - deserialization to non-bytes types - - Args: - pos: 0-based record number - - Returns: - a pair `(swhid, int)`, where swhid is a string-based SWHID and int the - corresponding integer identifier - - """ - (swhid_bytes, int_bytes) = self._get_bin_record(pos) - return (bytes_to_str(swhid_bytes), struct.unpack(INT_BIN_FMT, int_bytes)[0]) - - @classmethod - def write_record(cls, f: BinaryIO, swhid: str, int: int) -> None: - """write a logical record to a file-like object - - Args: - f: file-like object to write the record to - swhid: textual SWHID - int: SWHID integer identifier - - """ - f.write(str_to_bytes(swhid)) - f.write(struct.pack(INT_BIN_FMT, int)) - - def _bisect_pos(self, swhid_str: str) -> int: - """bisect the position of the given identifier. If the identifier is - not found, the position of the swhid immediately after is returned. - - Args: - swhid_str: the swhid as a string - - Returns: - the logical record of the bisected position in the map - - """ - if not isinstance(swhid_str, str): - raise TypeError("SWHID must be a str, not {}".format(type(swhid_str))) - try: - target = str_to_bytes(swhid_str) # desired SWHID as bytes - except ValueError: - raise ValueError('invalid SWHID: "{}"'.format(swhid_str)) - - lo = 0 - hi = self.length - 1 - while lo < hi: - mid = (lo + hi) // 2 - (swhid, _value) = self._get_bin_record(mid) - if swhid < target: - lo = mid + 1 - else: - hi = mid - return lo - - def _find(self, swhid_str: str) -> Tuple[int, int]: - """lookup the integer identifier of a swhid and its position - - Args: - swhid_str: the swhid as a string - - Returns: - a pair `(swhid, pos)` with swhid integer identifier and its logical - record position in the map - - """ - pos = self._bisect_pos(swhid_str) - swhid_found, value = self._get_record(pos) - if swhid_found == swhid_str: - return (value, pos) - raise KeyError(swhid_str) - - def __getitem__(self, swhid_str: str) -> int: - """lookup the integer identifier of a SWHID - - Args: - swhid: the SWHID as a string - - Returns: - the integer identifier of swhid - - """ - return self._find(swhid_str)[0] # return element, ignore position - - def __setitem__(self, swhid_str: str, int: str) -> None: - (_swhid, pos) = self._find(swhid_str) # might raise KeyError and that's OK - - rec_pos = pos * self.RECORD_SIZE - int_pos = rec_pos + SWHID_BIN_SIZE - self.mm[rec_pos:int_pos] = str_to_bytes(swhid_str) - self.mm[int_pos : int_pos + INT_BIN_SIZE] = struct.pack(INT_BIN_FMT, int) - - def __iter__(self) -> Iterator[Tuple[str, int]]: - for pos in range(self.length): - yield self._get_record(pos) - - def iter_prefix(self, prefix: str): - swh, n, t, sha = prefix.split(":") - sha = sha.ljust(40, "0") - start_swhid = ":".join([swh, n, t, sha]) - start = 
self._bisect_pos(start_swhid) - for pos in range(start, self.length): - swhid, value = self._get_record(pos) - if not swhid.startswith(prefix): - break - yield swhid, value - - def iter_type(self, swhid_type: str) -> Iterator[Tuple[str, int]]: - prefix = "swh:1:{}:".format(swhid_type) - yield from self.iter_prefix(prefix) - - -class NodeToSwhidMap(_OnDiskMap, MutableMapping): - """memory mapped map from a continuous range of 0..N (8-byte long) integers to - :ref:`SWHIDs ` - - This is the converse mapping of :class:`SwhidToNodeMap`. - - The on-disk serialization format is a sequence of fixed length records (22 - bytes), each being the binary representation of a SWHID as per - :func:`str_to_bytes`. - - The records are sorted by long integer, so that integer lookup is possible - via fixed-offset seek. - - """ - - RECORD_BIN_FMT = SWHID_BIN_FMT - RECORD_SIZE = SWHID_BIN_SIZE - - def __init__(self, fname: str, mode: str = "rb", length: int = None): - """open an existing on-disk map - - Args: - fname: path to the on-disk map - mode: file open mode, usually either 'rb' for read-only maps, 'wb' - for creating new maps, or 'rb+' for updating existing ones - (default: 'rb') - size: map size in number of logical records; used to initialize - read-write maps at creation time. Must be given when mode is - 'wb'; ignored otherwise - length: passed to :class:`_OnDiskMap` - - """ - - super().__init__(self.RECORD_SIZE, fname, mode=mode, length=length) - - def _get_bin_record(self, pos: int) -> bytes: - """seek and return the (binary) SWHID at a given (logical) position - - Args: - pos: 0-based record number - - Returns: - SWHID as a byte sequence - - """ - rec_pos = pos * self.RECORD_SIZE - - return self.mm[rec_pos : rec_pos + self.RECORD_SIZE] - - @classmethod - def write_record(cls, f: BinaryIO, swhid: str) -> None: - """write a SWHID to a file-like object - - Args: - f: file-like object to write the record to - swhid: textual SWHID - - """ - f.write(str_to_bytes(swhid)) - - def __getitem__(self, pos: int) -> str: - orig_pos = pos - if pos < 0: - pos = len(self) + pos - if not (0 <= pos < len(self)): - raise IndexError(orig_pos) - - return bytes_to_str(self._get_bin_record(pos)) - - def __setitem__(self, pos: int, swhid: str) -> None: - rec_pos = pos * self.RECORD_SIZE - self.mm[rec_pos : rec_pos + self.RECORD_SIZE] = str_to_bytes(swhid) - - def __iter__(self) -> Iterator[Tuple[int, str]]: - for pos in range(self.length): - yield (pos, self[pos]) diff --git a/swh/graph/tests/conftest.py b/swh/graph/tests/conftest.py index fed877b..f66d2b3 100644 --- a/swh/graph/tests/conftest.py +++ b/swh/graph/tests/conftest.py @@ -1,59 +1,70 @@ # Copyright (C) 2019-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information -import csv import multiprocessing from pathlib import Path +import subprocess from aiohttp.test_utils import TestClient, TestServer, loop_context import pytest -from swh.graph.client import RemoteGraphClient -from swh.graph.naive_client import NaiveClient +from swh.graph.http_client import RemoteGraphClient +from swh.graph.http_naive_client import NaiveClient SWH_GRAPH_TESTS_ROOT = Path(__file__).parents[0] -TEST_GRAPH_PATH = SWH_GRAPH_TESTS_ROOT / "dataset/output/example" +TEST_GRAPH_PATH = SWH_GRAPH_TESTS_ROOT / "dataset/compressed/example" class GraphServerProcess(multiprocessing.Process): def __init__(self, q, *args, **kwargs): 
self.q = q super().__init__(*args, **kwargs) def run(self): # Lazy import to allow debian packaging - from swh.graph.backend import Backend - from swh.graph.server.app import make_app + from swh.graph.http_server import make_app try: - backend = Backend(graph_path=str(TEST_GRAPH_PATH)) + config = {"graph": {"path": TEST_GRAPH_PATH}} with loop_context() as loop: - app = make_app(backend=backend, debug=True) + app = make_app(config=config, debug=True) client = TestClient(TestServer(app), loop=loop) loop.run_until_complete(client.start_server()) url = client.make_url("/graph/") self.q.put(url) loop.run_forever() except Exception as e: self.q.put(e) @pytest.fixture(scope="module", params=["remote", "naive"]) def graph_client(request): if request.param == "remote": queue = multiprocessing.Queue() server = GraphServerProcess(queue) server.start() res = queue.get() if isinstance(res, Exception): raise res yield RemoteGraphClient(str(res)) server.terminate() else: - with open(SWH_GRAPH_TESTS_ROOT / "dataset/example.nodes.csv") as fd: - nodes = [node for (node,) in csv.reader(fd, delimiter=" ")] - with open(SWH_GRAPH_TESTS_ROOT / "dataset/example.edges.csv") as fd: - edges = list(csv.reader(fd, delimiter=" ")) - yield NaiveClient(nodes=nodes, edges=edges) + + def zstdcat(*files): + p = subprocess.run(["zstdcat", *files], stdout=subprocess.PIPE) + return p.stdout.decode() + + edges_dataset = SWH_GRAPH_TESTS_ROOT / "dataset/edges" + edge_files = edges_dataset.glob("*/*.edges.csv.zst") + node_files = edges_dataset.glob("*/*.nodes.csv.zst") + + nodes = set(zstdcat(*node_files).strip().split("\n")) + edge_lines = [line.split() for line in zstdcat(*edge_files).strip().split("\n")] + edges = [(src, dst) for src, dst, *_ in edge_lines] + for src, dst in edges: + nodes.add(src) + nodes.add(dst) + + yield NaiveClient(nodes=list(nodes), edges=edges) diff --git a/swh/graph/tests/dataset/.gitignore b/swh/graph/tests/dataset/.gitignore deleted file mode 100644 index 531c841..0000000 --- a/swh/graph/tests/dataset/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -docker/ -output/*-bv.* -output/stderr -output/stdout -output/compression.log diff --git a/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets b/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets new file mode 100644 index 0000000..fbb7a5a --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example-labelled.labeloffsets @@ -0,0 +1,2 @@ + +p) \ No newline at end of file diff --git a/swh/graph/tests/dataset/compressed/example-labelled.labels b/swh/graph/tests/dataset/compressed/example-labelled.labels new file mode 100644 index 0000000..1b876ec --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example-labelled.labels @@ -0,0 +1 @@ +D%B](P(i \ No newline at end of file diff --git a/swh/graph/tests/dataset/compressed/example-labelled.properties b/swh/graph/tests/dataset/compressed/example-labelled.properties new file mode 100644 index 0000000..4f4c55a --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example-labelled.properties @@ -0,0 +1,3 @@ +graphclass = it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph +labelspec = org.softwareheritage.graph.labels.SwhLabel(DirEntry,6) +underlyinggraph = example diff --git a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets new file mode 100644 index 0000000..7726435 --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labeloffsets @@ -0,0 
+1,2 @@ +B!B +(P \ No newline at end of file diff --git a/swh/graph/tests/dataset/compressed/example-transposed-labelled.labels b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labels new file mode 100644 index 0000000..9448e72 --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example-transposed-labelled.labels @@ -0,0 +1,2 @@ +P:RH +jPu \ No newline at end of file diff --git a/swh/graph/tests/dataset/compressed/example-transposed-labelled.properties b/swh/graph/tests/dataset/compressed/example-transposed-labelled.properties new file mode 100644 index 0000000..5ee584a --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example-transposed-labelled.properties @@ -0,0 +1,3 @@ +graphclass = it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph +labelspec = org.softwareheritage.graph.labels.SwhLabel(DirEntry,6) +underlyinggraph = example-transposed diff --git a/swh/graph/tests/dataset/compressed/example-transposed.graph b/swh/graph/tests/dataset/compressed/example-transposed.graph new file mode 100644 index 0000000..d8cbf2b --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example-transposed.graph @@ -0,0 +1 @@ +^t5z օzZ:] \ No newline at end of file diff --git a/swh/graph/tests/dataset/output/example-transposed.obl b/swh/graph/tests/dataset/compressed/example-transposed.obl similarity index 77% rename from swh/graph/tests/dataset/output/example-transposed.obl rename to swh/graph/tests/dataset/compressed/example-transposed.obl index 54f0ac8..7ad141b 100644 Binary files a/swh/graph/tests/dataset/output/example-transposed.obl and b/swh/graph/tests/dataset/compressed/example-transposed.obl differ diff --git a/swh/graph/tests/dataset/compressed/example-transposed.offsets b/swh/graph/tests/dataset/compressed/example-transposed.offsets new file mode 100644 index 0000000..b3044db --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example-transposed.offsets @@ -0,0 +1 @@ +) (P8&(R \ No newline at end of file diff --git a/swh/graph/tests/dataset/compressed/example-transposed.properties b/swh/graph/tests/dataset/compressed/example-transposed.properties new file mode 100644 index 0000000..96fcfba --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example-transposed.properties @@ -0,0 +1,35 @@ +#BVGraph properties +#Wed Mar 30 17:33:29 CEST 2022 +bitsforreferences=28 +avgbitsforintervals=0.762 +graphclass=it.unimi.dsi.big.webgraph.BVGraph +avgdist=0.429 +successoravggap=4.261 +residualexpstats=5,8,3,2,1 +arcs=23 +minintervallength=4 +bitsforoutdegrees=61 +residualavgloggap=2.076977934449935 +avgbitsforoutdegrees=2.905 +bitsforresiduals=85 +successoravgloggap=1.9987119736846723 +maxrefcount=3 +successorexpstats=7,9,4,2,1 +residualarcs=19 +avgbitsforresiduals=4.048 +avgbitsforblocks=0.19 +windowsize=7 +residualavggap=4.632 +copiedarcs=4 +avgbitsforreferences=1.333 +version=0 +compratio=1.53 +bitsperlink=8.435 +compressionflags= +nodes=21 +avgref=0.238 +zetak=3 +bitsforintervals=16 +intervalisedarcs=0 +bitspernode=9.238 +bitsforblocks=4 diff --git a/swh/graph/tests/dataset/compressed/example.edges.count.txt b/swh/graph/tests/dataset/compressed/example.edges.count.txt new file mode 100644 index 0000000..4099407 --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.edges.count.txt @@ -0,0 +1 @@ +23 diff --git a/swh/graph/tests/dataset/compressed/example.edges.stats.txt b/swh/graph/tests/dataset/compressed/example.edges.stats.txt new file mode 100644 index 0000000..c9b8ac7 --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.edges.stats.txt 
@@ -0,0 +1,8 @@ +dir:cnt 8 +dir:dir 3 +ori:snp 1 +rel:rev 2 +rev:dir 4 +rev:rev 3 +snp:rel 1 +snp:rev 1 diff --git a/swh/graph/tests/dataset/compressed/example.graph b/swh/graph/tests/dataset/compressed/example.graph new file mode 100644 index 0000000..d99357d --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.graph @@ -0,0 +1 @@ +}]望˚t]~[1tޗ@ \ No newline at end of file diff --git a/swh/graph/tests/dataset/output/example.indegree b/swh/graph/tests/dataset/compressed/example.indegree similarity index 100% rename from swh/graph/tests/dataset/output/example.indegree rename to swh/graph/tests/dataset/compressed/example.indegree diff --git a/swh/graph/tests/dataset/compressed/example.labels.count.txt b/swh/graph/tests/dataset/compressed/example.labels.count.txt new file mode 100644 index 0000000..45a4fb7 --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.labels.count.txt @@ -0,0 +1 @@ +8 diff --git a/swh/graph/tests/dataset/compressed/example.labels.csv.zst b/swh/graph/tests/dataset/compressed/example.labels.csv.zst new file mode 100644 index 0000000..1cc8931 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.labels.csv.zst differ diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray b/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray new file mode 100644 index 0000000..01451e0 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.labels.fcl.bytearray differ diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers b/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers new file mode 100644 index 0000000..755c4c7 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.labels.fcl.pointers differ diff --git a/swh/graph/tests/dataset/compressed/example.labels.fcl.properties b/swh/graph/tests/dataset/compressed/example.labels.fcl.properties new file mode 100644 index 0000000..deeac3a --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.labels.fcl.properties @@ -0,0 +1,2 @@ +n=8 +ratio=4 diff --git a/swh/graph/tests/dataset/compressed/example.labels.mph b/swh/graph/tests/dataset/compressed/example.labels.mph new file mode 100644 index 0000000..e417aec Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.labels.mph differ diff --git a/swh/graph/tests/dataset/output/example.mph b/swh/graph/tests/dataset/compressed/example.mph similarity index 80% copy from swh/graph/tests/dataset/output/example.mph copy to swh/graph/tests/dataset/compressed/example.mph index c6f9e19..f696b19 100644 Binary files a/swh/graph/tests/dataset/output/example.mph and b/swh/graph/tests/dataset/compressed/example.mph differ diff --git a/swh/graph/tests/dataset/compressed/example.node2swhid.bin b/swh/graph/tests/dataset/compressed/example.node2swhid.bin new file mode 100644 index 0000000..e86dae4 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.node2swhid.bin differ diff --git a/swh/graph/tests/dataset/output/example.node2type.map b/swh/graph/tests/dataset/compressed/example.node2type.map similarity index 90% rename from swh/graph/tests/dataset/output/example.node2type.map rename to swh/graph/tests/dataset/compressed/example.node2type.map index 6b91c37..1a5b7a7 100644 Binary files a/swh/graph/tests/dataset/output/example.node2type.map and b/swh/graph/tests/dataset/compressed/example.node2type.map differ diff --git a/swh/graph/tests/dataset/compressed/example.nodes.count.txt b/swh/graph/tests/dataset/compressed/example.nodes.count.txt 
new file mode 100644 index 0000000..aabe6ec --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.nodes.count.txt @@ -0,0 +1 @@ +21 diff --git a/swh/graph/tests/dataset/compressed/example.nodes.csv.zst b/swh/graph/tests/dataset/compressed/example.nodes.csv.zst new file mode 100644 index 0000000..0559f37 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.nodes.csv.zst differ diff --git a/swh/graph/tests/dataset/compressed/example.nodes.stats.txt b/swh/graph/tests/dataset/compressed/example.nodes.stats.txt new file mode 100644 index 0000000..097e698 --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.nodes.stats.txt @@ -0,0 +1,6 @@ +cnt 7 +dir 6 +ori 1 +rel 2 +rev 4 +snp 1 diff --git a/swh/graph/tests/dataset/output/example.obl b/swh/graph/tests/dataset/compressed/example.obl similarity index 77% rename from swh/graph/tests/dataset/output/example.obl rename to swh/graph/tests/dataset/compressed/example.obl index 1b4fd2e..8538d49 100644 Binary files a/swh/graph/tests/dataset/output/example.obl and b/swh/graph/tests/dataset/compressed/example.obl differ diff --git a/swh/graph/tests/dataset/compressed/example.offsets b/swh/graph/tests/dataset/compressed/example.offsets new file mode 100644 index 0000000..1249e27 --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.offsets @@ -0,0 +1,2 @@ +A!Bi +CB diff --git a/swh/graph/tests/dataset/compressed/example.order b/swh/graph/tests/dataset/compressed/example.order new file mode 100644 index 0000000..ff64db4 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.order differ diff --git a/swh/graph/tests/dataset/output/example.outdegree b/swh/graph/tests/dataset/compressed/example.outdegree similarity index 100% rename from swh/graph/tests/dataset/output/example.outdegree rename to swh/graph/tests/dataset/compressed/example.outdegree diff --git a/swh/graph/tests/dataset/compressed/example.persons.count.txt b/swh/graph/tests/dataset/compressed/example.persons.count.txt new file mode 100644 index 0000000..00750ed --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.persons.count.txt @@ -0,0 +1 @@ +3 diff --git a/swh/graph/tests/dataset/compressed/example.persons.csv.zst b/swh/graph/tests/dataset/compressed/example.persons.csv.zst new file mode 100644 index 0000000..0da9b20 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.persons.csv.zst differ diff --git a/swh/graph/tests/dataset/output/example.mph b/swh/graph/tests/dataset/compressed/example.persons.mph similarity index 73% rename from swh/graph/tests/dataset/output/example.mph rename to swh/graph/tests/dataset/compressed/example.persons.mph index c6f9e19..6787503 100644 Binary files a/swh/graph/tests/dataset/output/example.mph and b/swh/graph/tests/dataset/compressed/example.persons.mph differ diff --git a/swh/graph/tests/dataset/compressed/example.properties b/swh/graph/tests/dataset/compressed/example.properties new file mode 100644 index 0000000..11d426e --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.properties @@ -0,0 +1,35 @@ +#BVGraph properties +#Wed Mar 30 17:33:28 CEST 2022 +bitsforreferences=15 +avgbitsforintervals=0.667 +graphclass=it.unimi.dsi.big.webgraph.BVGraph +avgdist=0.048 +successoravggap=3.935 +residualexpstats=8,9,2,2,1 +arcs=23 +minintervallength=4 +bitsforoutdegrees=51 +residualavgloggap=1.8895225435666037 +avgbitsforoutdegrees=2.429 +bitsforresiduals=98 +successoravgloggap=1.8859500382836039 +maxrefcount=3 +successorexpstats=8,10,2,2,1 +residualarcs=22 
+avgbitsforresiduals=4.667 +avgbitsforblocks=0.048 +windowsize=7 +residualavggap=4.000 +copiedarcs=1 +avgbitsforreferences=0.714 +version=0 +compratio=1.412 +bitsperlink=7.783 +compressionflags= +nodes=21 +avgref=0.048 +zetak=3 +bitsforintervals=14 +intervalisedarcs=0 +bitspernode=8.524 +bitsforblocks=1 diff --git a/swh/graph/tests/dataset/compressed/example.property.author_id.bin b/swh/graph/tests/dataset/compressed/example.property.author_id.bin new file mode 100644 index 0000000..7072382 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.author_id.bin differ diff --git a/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin b/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin new file mode 100644 index 0000000..18ae5fa Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.author_timestamp.bin differ diff --git a/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin b/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin new file mode 100644 index 0000000..ab8222e Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.author_timestamp_offset.bin differ diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_id.bin b/swh/graph/tests/dataset/compressed/example.property.committer_id.bin new file mode 100644 index 0000000..693c904 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.committer_id.bin differ diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin new file mode 100644 index 0000000..4c00061 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp.bin differ diff --git a/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin new file mode 100644 index 0000000..9c4f149 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.committer_timestamp_offset.bin differ diff --git a/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin b/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin new file mode 100644 index 0000000..274f279 Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.content.is_skipped.bin differ diff --git a/swh/graph/tests/dataset/compressed/example.property.content.length.bin b/swh/graph/tests/dataset/compressed/example.property.content.length.bin new file mode 100644 index 0000000..4848e0e Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.content.length.bin differ diff --git a/swh/graph/tests/dataset/compressed/example.property.message.bin b/swh/graph/tests/dataset/compressed/example.property.message.bin new file mode 100644 index 0000000..5d50ccf --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.property.message.bin @@ -0,0 +1,7 @@ +VmVyc2lvbiAxLjA= +VmVyc2lvbiAyLjA= +SW5pdGlhbCBjb21taXQ= +QWRkIHBhcnNlcg== +QWRkIHRlc3Rz +UmVmYWN0b3IgY29kZWJhc2U= +aHR0cHM6Ly9leGFtcGxlLmNvbS9zd2gvZ3JhcGg= diff --git a/swh/graph/tests/dataset/compressed/example.property.message.offset.bin b/swh/graph/tests/dataset/compressed/example.property.message.offset.bin new file mode 100644 index 0000000..a452a83 Binary files /dev/null and 
b/swh/graph/tests/dataset/compressed/example.property.message.offset.bin differ diff --git a/swh/graph/tests/dataset/compressed/example.property.tag_name.bin b/swh/graph/tests/dataset/compressed/example.property.tag_name.bin new file mode 100644 index 0000000..ba37d43 --- /dev/null +++ b/swh/graph/tests/dataset/compressed/example.property.tag_name.bin @@ -0,0 +1,2 @@ +djEuMA== +djIuMA== diff --git a/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin b/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin new file mode 100644 index 0000000..f6f589d Binary files /dev/null and b/swh/graph/tests/dataset/compressed/example.property.tag_name.offset.bin differ diff --git a/swh/graph/tests/dataset/output/example.stats b/swh/graph/tests/dataset/compressed/example.stats similarity index 58% rename from swh/graph/tests/dataset/output/example.stats rename to swh/graph/tests/dataset/compressed/example.stats index a58d3e2..541f39a 100644 --- a/swh/graph/tests/dataset/output/example.stats +++ b/swh/graph/tests/dataset/compressed/example.stats @@ -1,20 +1,20 @@ nodes=21 arcs=23 loops=0 -successoravggap=7.765 -avglocality=3.783 +successoravggap=4.588 +avglocality=2.522 minoutdegree=0 maxoutdegree=3 minoutdegreenode=1 -maxoutdegreenode=0 +maxoutdegreenode=9 dangling=7 terminal=7 percdangling=33.333333333333336 avgoutdegree=1.0952380952380953 -successorlogdeltastats=11,7,1,3,1 -successoravglogdelta=0.911 +successorlogdeltastats=13,5,3,2 +successoravglogdelta=0.814 minindegree=0 maxindegree=3 -minindegreenode=17 -maxindegreenode=18 +minindegreenode=20 +maxindegreenode=17 avgindegree=1.0952380952380953 diff --git a/swh/graph/tests/dataset/edges/content/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/content/graph-all.edges.csv.zst new file mode 100644 index 0000000..e58c09d Binary files /dev/null and b/swh/graph/tests/dataset/edges/content/graph-all.edges.csv.zst differ diff --git a/swh/graph/tests/dataset/edges/content/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/content/graph-all.nodes.csv.zst new file mode 100644 index 0000000..779ad79 Binary files /dev/null and b/swh/graph/tests/dataset/edges/content/graph-all.nodes.csv.zst differ diff --git a/swh/graph/tests/dataset/edges/directory/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/directory/graph-all.edges.csv.zst new file mode 100644 index 0000000..3e96d86 Binary files /dev/null and b/swh/graph/tests/dataset/edges/directory/graph-all.edges.csv.zst differ diff --git a/swh/graph/tests/dataset/edges/directory/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/directory/graph-all.nodes.csv.zst new file mode 100644 index 0000000..57ad7ac Binary files /dev/null and b/swh/graph/tests/dataset/edges/directory/graph-all.nodes.csv.zst differ diff --git a/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst new file mode 100644 index 0000000..11bf2e2 Binary files /dev/null and b/swh/graph/tests/dataset/edges/origin/graph-all.edges.csv.zst differ diff --git a/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst new file mode 100644 index 0000000..850e058 Binary files /dev/null and b/swh/graph/tests/dataset/edges/origin/graph-all.nodes.csv.zst differ diff --git a/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst new file mode 100644 index 0000000..59b5b0e Binary files /dev/null and 
b/swh/graph/tests/dataset/edges/release/graph-all.edges.csv.zst differ diff --git a/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst new file mode 100644 index 0000000..11bfce7 Binary files /dev/null and b/swh/graph/tests/dataset/edges/release/graph-all.nodes.csv.zst differ diff --git a/swh/graph/tests/dataset/edges/revision/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/revision/graph-all.edges.csv.zst new file mode 100644 index 0000000..72f8cbb Binary files /dev/null and b/swh/graph/tests/dataset/edges/revision/graph-all.edges.csv.zst differ diff --git a/swh/graph/tests/dataset/edges/revision/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/revision/graph-all.nodes.csv.zst new file mode 100644 index 0000000..56acc8b Binary files /dev/null and b/swh/graph/tests/dataset/edges/revision/graph-all.nodes.csv.zst differ diff --git a/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst b/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst new file mode 100644 index 0000000..97db59f Binary files /dev/null and b/swh/graph/tests/dataset/edges/snapshot/graph-all.edges.csv.zst differ diff --git a/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst b/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst new file mode 100644 index 0000000..5cd8295 Binary files /dev/null and b/swh/graph/tests/dataset/edges/snapshot/graph-all.nodes.csv.zst differ diff --git a/swh/graph/tests/dataset/example.edges.csv b/swh/graph/tests/dataset/example.edges.csv deleted file mode 100644 index a91c083..0000000 --- a/swh/graph/tests/dataset/example.edges.csv +++ /dev/null @@ -1,23 +0,0 @@ -swh:1:dir:0000000000000000000000000000000000000002 swh:1:cnt:0000000000000000000000000000000000000001 -swh:1:rev:0000000000000000000000000000000000000003 swh:1:dir:0000000000000000000000000000000000000002 -swh:1:dir:0000000000000000000000000000000000000008 swh:1:cnt:0000000000000000000000000000000000000001 -swh:1:dir:0000000000000000000000000000000000000008 swh:1:dir:0000000000000000000000000000000000000006 -swh:1:dir:0000000000000000000000000000000000000006 swh:1:cnt:0000000000000000000000000000000000000004 -swh:1:dir:0000000000000000000000000000000000000006 swh:1:cnt:0000000000000000000000000000000000000005 -swh:1:dir:0000000000000000000000000000000000000008 swh:1:cnt:0000000000000000000000000000000000000007 -swh:1:rev:0000000000000000000000000000000000000009 swh:1:dir:0000000000000000000000000000000000000008 -swh:1:rel:0000000000000000000000000000000000000010 swh:1:rev:0000000000000000000000000000000000000009 -swh:1:rev:0000000000000000000000000000000000000009 swh:1:rev:0000000000000000000000000000000000000003 -swh:1:dir:0000000000000000000000000000000000000012 swh:1:cnt:0000000000000000000000000000000000000011 -swh:1:dir:0000000000000000000000000000000000000012 swh:1:dir:0000000000000000000000000000000000000008 -swh:1:rev:0000000000000000000000000000000000000013 swh:1:dir:0000000000000000000000000000000000000012 -swh:1:rev:0000000000000000000000000000000000000013 swh:1:rev:0000000000000000000000000000000000000009 -swh:1:dir:0000000000000000000000000000000000000017 swh:1:cnt:0000000000000000000000000000000000000014 -swh:1:dir:0000000000000000000000000000000000000017 swh:1:dir:0000000000000000000000000000000000000016 -swh:1:dir:0000000000000000000000000000000000000016 swh:1:cnt:0000000000000000000000000000000000000015 -swh:1:rev:0000000000000000000000000000000000000018 
swh:1:dir:0000000000000000000000000000000000000017 -swh:1:rev:0000000000000000000000000000000000000018 swh:1:rev:0000000000000000000000000000000000000013 -swh:1:rel:0000000000000000000000000000000000000019 swh:1:rev:0000000000000000000000000000000000000018 -swh:1:snp:0000000000000000000000000000000000000020 swh:1:rev:0000000000000000000000000000000000000009 -swh:1:snp:0000000000000000000000000000000000000020 swh:1:rel:0000000000000000000000000000000000000010 -swh:1:ori:0000000000000000000000000000000000000021 swh:1:snp:0000000000000000000000000000000000000020 diff --git a/swh/graph/tests/dataset/example.edges.csv.zst b/swh/graph/tests/dataset/example.edges.csv.zst deleted file mode 100644 index 41b40c1..0000000 Binary files a/swh/graph/tests/dataset/example.edges.csv.zst and /dev/null differ diff --git a/swh/graph/tests/dataset/example.nodes.csv b/swh/graph/tests/dataset/example.nodes.csv deleted file mode 100644 index 4105555..0000000 --- a/swh/graph/tests/dataset/example.nodes.csv +++ /dev/null @@ -1,21 +0,0 @@ -swh:1:cnt:0000000000000000000000000000000000000001 -swh:1:cnt:0000000000000000000000000000000000000004 -swh:1:cnt:0000000000000000000000000000000000000005 -swh:1:cnt:0000000000000000000000000000000000000007 -swh:1:cnt:0000000000000000000000000000000000000011 -swh:1:cnt:0000000000000000000000000000000000000014 -swh:1:cnt:0000000000000000000000000000000000000015 -swh:1:dir:0000000000000000000000000000000000000002 -swh:1:dir:0000000000000000000000000000000000000006 -swh:1:dir:0000000000000000000000000000000000000008 -swh:1:dir:0000000000000000000000000000000000000012 -swh:1:dir:0000000000000000000000000000000000000016 -swh:1:dir:0000000000000000000000000000000000000017 -swh:1:ori:0000000000000000000000000000000000000021 -swh:1:rel:0000000000000000000000000000000000000010 -swh:1:rel:0000000000000000000000000000000000000019 -swh:1:rev:0000000000000000000000000000000000000003 -swh:1:rev:0000000000000000000000000000000000000009 -swh:1:rev:0000000000000000000000000000000000000013 -swh:1:rev:0000000000000000000000000000000000000018 -swh:1:snp:0000000000000000000000000000000000000020 diff --git a/swh/graph/tests/dataset/example.nodes.csv.zst b/swh/graph/tests/dataset/example.nodes.csv.zst deleted file mode 100644 index 00cb5f4..0000000 Binary files a/swh/graph/tests/dataset/example.nodes.csv.zst and /dev/null differ diff --git a/swh/graph/tests/dataset/generate_dataset.py b/swh/graph/tests/dataset/generate_dataset.py new file mode 100755 index 0000000..c6abc00 --- /dev/null +++ b/swh/graph/tests/dataset/generate_dataset.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +# type: ignore + +import argparse +import datetime +import logging +from pathlib import Path +import shutil + +from swh.dataset.exporters.edges import GraphEdgesExporter +from swh.dataset.exporters.orc import ORCExporter +from swh.graph.webgraph import compress +from swh.model.model import ( + Content, + Directory, + DirectoryEntry, + ObjectType, + Origin, + OriginVisit, + OriginVisitStatus, + Person, + Release, + Revision, + RevisionType, + SkippedContent, + Snapshot, + SnapshotBranch, + TargetType, + Timestamp, + TimestampWithTimezone, +) + + +def h(id: int, width=40) -> bytes: + return bytes.fromhex(f"{id:0{width}}") + + +PERSONS = [ + Person(fullname=b"foo", name=b"foo", 
email=b""), + Person(fullname=b"bar", name=b"bar", email=b""), + Person(fullname=b"baz", name=b"baz", email=b""), +] + +TEST_DATASET = [ + Content(sha1_git=h(1), sha1=h(1), sha256=h(1, 64), blake2s256=h(1, 64), length=42), + Content(sha1_git=h(4), sha1=h(4), sha256=h(4, 64), blake2s256=h(4, 64), length=404), + Content( + sha1_git=h(5), sha1=h(5), sha256=h(5, 64), blake2s256=h(5, 64), length=1337 + ), + Content(sha1_git=h(7), sha1=h(7), sha256=h(7, 64), blake2s256=h(7, 64), length=666), + Content( + sha1_git=h(11), sha1=h(11), sha256=h(11, 64), blake2s256=h(11, 64), length=313 + ), + Content( + sha1_git=h(14), sha1=h(14), sha256=h(14, 64), blake2s256=h(14, 64), length=14 + ), + SkippedContent( + sha1_git=h(15), + sha1=h(15), + sha256=h(15, 64), + blake2s256=h(15, 64), + length=404, + status="absent", + reason="Not found", + ), + Directory( + id=h(2), + entries=( + DirectoryEntry( + name=b"README.md", + perms=0o100644, + type="file", + target=h(1), + ), + ), + ), + Directory( + id=h(6), + entries=( + DirectoryEntry( + name=b"README.md", + perms=0o100644, + type="file", + target=h(4), + ), + DirectoryEntry( + name=b"parser.c", + perms=0o100644, + type="file", + target=h(5), + ), + ), + ), + Directory( + id=h(8), + entries=( + DirectoryEntry( + name=b"README.md", + perms=0o100644, + type="file", + target=h(1), + ), + DirectoryEntry( + name=b"parser.c", + perms=0o100644, + type="file", + target=h(7), + ), + DirectoryEntry( + name=b"tests", + perms=0o100755, + type="dir", + target=h(6), + ), + ), + ), + Directory( + id=h(12), + entries=( + DirectoryEntry( + name=b"README.md", + perms=0o100644, + type="file", + target=h(11), + ), + DirectoryEntry( + name=b"oldproject", + perms=0o100755, + type="dir", + target=h(8), + ), + ), + ), + Directory( + id=h(16), + entries=( + DirectoryEntry( + name=b"TODO.txt", + perms=0o100644, + type="file", + target=h(15), + ), + ), + ), + Directory( + id=h(17), + entries=( + DirectoryEntry( + name=b"TODO.txt", + perms=0o100644, + type="file", + target=h(14), + ), + DirectoryEntry( + name=b"old", + perms=0o100755, + type="dir", + target=h(16), + ), + ), + ), + Revision( + id=h(3), + message=b"Initial commit", + date=TimestampWithTimezone( + timestamp=Timestamp( + seconds=1111122220, + microseconds=0, + ), + offset_bytes=b"+0200", + ), + committer=PERSONS[0], + author=PERSONS[0], + committer_date=TimestampWithTimezone( + timestamp=Timestamp( + seconds=1111122220, + microseconds=0, + ), + offset_bytes=b"+0200", + ), + type=RevisionType.GIT, + directory=h(2), + synthetic=False, + metadata=None, + parents=(), + ), + Revision( + id=h(9), + message=b"Add parser", + date=TimestampWithTimezone( + timestamp=Timestamp( + seconds=1111144440, + microseconds=0, + ), + offset_bytes=b"+0200", + ), + committer=PERSONS[1], + author=PERSONS[1], + committer_date=TimestampWithTimezone( + timestamp=Timestamp( + seconds=1111155550, + microseconds=0, + ), + offset_bytes=b"+0200", + ), + type=RevisionType.GIT, + directory=h(8), + synthetic=False, + metadata=None, + parents=(h(3),), + ), + Revision( + id=h(13), + message=b"Add tests", + date=TimestampWithTimezone( + timestamp=Timestamp( + seconds=1111166660, + microseconds=0, + ), + offset_bytes=b"+0200", + ), + committer=PERSONS[1], + author=PERSONS[0], + committer_date=TimestampWithTimezone( + timestamp=Timestamp( + seconds=1111166660, + microseconds=0, + ), + offset_bytes=b"+0200", + ), + type=RevisionType.GIT, + directory=h(12), + synthetic=False, + metadata=None, + parents=(h(9),), + ), + Revision( + id=h(18), + message=b"Refactor 
codebase", + date=TimestampWithTimezone( + timestamp=Timestamp( + seconds=1111177770, + microseconds=0, + ), + offset_bytes=b"+0000", + ), + committer=PERSONS[0], + author=PERSONS[2], + committer_date=TimestampWithTimezone( + timestamp=Timestamp( + seconds=1111177770, + microseconds=0, + ), + offset_bytes=b"+0000", + ), + type=RevisionType.GIT, + directory=h(17), + synthetic=False, + metadata=None, + parents=(h(13),), + ), + Release( + id=h(10), + name=b"v1.0", + date=TimestampWithTimezone( + timestamp=Timestamp( + seconds=1234567890, + microseconds=0, + ), + offset_bytes=b"+0200", + ), + author=PERSONS[0], + target_type=ObjectType.REVISION, + target=h(9), + message=b"Version 1.0", + synthetic=False, + ), + Release( + id=h(19), + name=b"v2.0", + date=None, + author=PERSONS[1], + target_type=ObjectType.REVISION, + target=h(18), + message=b"Version 2.0", + synthetic=False, + ), + Snapshot( + id=h(20), + branches={ + b"refs/heads/master": SnapshotBranch( + target=h(9), target_type=TargetType.REVISION + ), + b"refs/tags/v1.0": SnapshotBranch( + target=h(10), target_type=TargetType.RELEASE + ), + }, + ), + OriginVisit( + origin="https://example.com/swh/graph", + date=datetime.datetime( + 2013, 5, 7, 4, 20, 39, 369271, tzinfo=datetime.timezone.utc + ), + visit=1, + type="git", + ), + OriginVisitStatus( + origin="https://example.com/swh/graph", + date=datetime.datetime( + 2013, 5, 7, 4, 20, 41, 369271, tzinfo=datetime.timezone.utc + ), + visit=1, + type="git", + status="full", + snapshot=h(20), + metadata=None, + ), + Origin(url="https://example.com/swh/graph"), +] + + +def main(): + logging.basicConfig(level=logging.INFO) + + parser = argparse.ArgumentParser(description="Generate a test dataset") + parser.add_argument( + "--compress", + action="store_true", + default=False, + help="Also compress the dataset", + ) + parser.add_argument("output", help="output directory", nargs="?", default=".") + args = parser.parse_args() + + exporters = {"edges": GraphEdgesExporter, "orc": ORCExporter} + config = {"test_unique_file_id": "all"} + output_path = Path(args.output) + for name, exporter in exporters.items(): + if (output_path / name).exists(): + shutil.rmtree(output_path / name) + with exporter(config, output_path / name) as e: + for obj in TEST_DATASET: + e.process_object(obj.object_type, obj.to_dict()) + + if args.compress: + if (output_path / "compressed").exists(): + shutil.rmtree(output_path / "compressed") + compress("example", output_path / "orc", output_path / "compressed") + + +if __name__ == "__main__": + main() diff --git a/swh/graph/tests/dataset/generate_graph.sh b/swh/graph/tests/dataset/generate_graph.sh deleted file mode 100755 index 9621cad..0000000 --- a/swh/graph/tests/dataset/generate_graph.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# Clean previous run -rm -rf docker/ output -mkdir output - -# Build Docker work environment -toplevel_dir=`git rev-parse --show-toplevel` -mkdir -p docker -cp -r $toplevel_dir/docker/ . 
-docker build --tag swh-graph-test docker - -# Setup input for compression script -tr ' ' '\n' < example.edges.csv | sort -u > example.nodes.csv -zstd < example.nodes.csv > example.nodes.csv.zst -zstd < example.edges.csv > example.edges.csv.zst - -docker run \ - --user $(id -u):$(id -g) \ - --name swh-graph-test --rm --tty --interactive \ - --volume $(pwd):/input --volume $(pwd)/output:/output \ - swh-graph-test:latest \ - swh graph compress --graph /input/example --outdir /output diff --git a/swh/graph/tests/dataset/orc/content/content-all.orc b/swh/graph/tests/dataset/orc/content/content-all.orc new file mode 100644 index 0000000..b038074 Binary files /dev/null and b/swh/graph/tests/dataset/orc/content/content-all.orc differ diff --git a/swh/graph/tests/dataset/orc/directory/directory-all.orc b/swh/graph/tests/dataset/orc/directory/directory-all.orc new file mode 100644 index 0000000..2df504e Binary files /dev/null and b/swh/graph/tests/dataset/orc/directory/directory-all.orc differ diff --git a/swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc b/swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc new file mode 100644 index 0000000..1a3d9f4 Binary files /dev/null and b/swh/graph/tests/dataset/orc/directory_entry/directory_entry-all.orc differ diff --git a/swh/graph/tests/dataset/orc/origin/origin-all.orc b/swh/graph/tests/dataset/orc/origin/origin-all.orc new file mode 100644 index 0000000..cec803a Binary files /dev/null and b/swh/graph/tests/dataset/orc/origin/origin-all.orc differ diff --git a/swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc b/swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc new file mode 100644 index 0000000..c7965bb Binary files /dev/null and b/swh/graph/tests/dataset/orc/origin_visit/origin_visit-all.orc differ diff --git a/swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc b/swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc new file mode 100644 index 0000000..0a19cb1 Binary files /dev/null and b/swh/graph/tests/dataset/orc/origin_visit_status/origin_visit_status-all.orc differ diff --git a/swh/graph/tests/dataset/orc/release/release-all.orc b/swh/graph/tests/dataset/orc/release/release-all.orc new file mode 100644 index 0000000..888fa82 Binary files /dev/null and b/swh/graph/tests/dataset/orc/release/release-all.orc differ diff --git a/swh/graph/tests/dataset/orc/revision/revision-all.orc b/swh/graph/tests/dataset/orc/revision/revision-all.orc new file mode 100644 index 0000000..8c186d1 Binary files /dev/null and b/swh/graph/tests/dataset/orc/revision/revision-all.orc differ diff --git a/swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc b/swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc new file mode 100644 index 0000000..05a6b8d Binary files /dev/null and b/swh/graph/tests/dataset/orc/revision_extra_headers/revision_extra_headers-all.orc differ diff --git a/swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc b/swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc new file mode 100644 index 0000000..92f1748 Binary files /dev/null and b/swh/graph/tests/dataset/orc/revision_history/revision_history-all.orc differ diff --git a/swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc b/swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc new file mode 100644 index 0000000..ed19277 Binary files /dev/null and 
b/swh/graph/tests/dataset/orc/skipped_content/skipped_content-all.orc differ diff --git a/swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc b/swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc new file mode 100644 index 0000000..41bee79 Binary files /dev/null and b/swh/graph/tests/dataset/orc/snapshot/snapshot-all.orc differ diff --git a/swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc b/swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc new file mode 100644 index 0000000..c3a11b6 Binary files /dev/null and b/swh/graph/tests/dataset/orc/snapshot_branch/snapshot_branch-all.orc differ diff --git a/swh/graph/tests/dataset/output/example-transposed.graph b/swh/graph/tests/dataset/output/example-transposed.graph deleted file mode 100644 index ad5756e..0000000 --- a/swh/graph/tests/dataset/output/example-transposed.graph +++ /dev/null @@ -1 +0,0 @@ -z.hѮIt{ \ No newline at end of file diff --git a/swh/graph/tests/dataset/output/example-transposed.offsets b/swh/graph/tests/dataset/output/example-transposed.offsets deleted file mode 100644 index 92c2947..0000000 --- a/swh/graph/tests/dataset/output/example-transposed.offsets +++ /dev/null @@ -1,2 +0,0 @@ - - RqG4PTP( \ No newline at end of file diff --git a/swh/graph/tests/dataset/output/example-transposed.properties b/swh/graph/tests/dataset/output/example-transposed.properties deleted file mode 100644 index 512ce9d..0000000 --- a/swh/graph/tests/dataset/output/example-transposed.properties +++ /dev/null @@ -1,35 +0,0 @@ -#BVGraph properties -#Sat Dec 04 01:37:28 CET 2021 -bitsforreferences=31 -avgbitsforintervals=0.714 -graphclass=it.unimi.dsi.big.webgraph.BVGraph -avgdist=0.571 -successoravggap=6.478 -residualexpstats=6,6,2,2,2 -arcs=23 -minintervallength=4 -bitsforoutdegrees=61 -residualavgloggap=2.1534522798004265 -avgbitsforoutdegrees=2.905 -bitsforresiduals=85 -successoravgloggap=2.3226776741991215 -maxrefcount=3 -successorexpstats=7,6,4,3,3 -residualarcs=18 -avgbitsforresiduals=4.048 -avgbitsforblocks=0.238 -windowsize=7 -residualavggap=5.667 -copiedarcs=5 -avgbitsforreferences=1.476 -version=0 -compratio=1.554 -bitsperlink=8.565 -compressionflags= -nodes=21 -avgref=0.238 -zetak=3 -bitsforintervals=15 -intervalisedarcs=0 -bitspernode=9.381 -bitsforblocks=5 diff --git a/swh/graph/tests/dataset/output/example.graph b/swh/graph/tests/dataset/output/example.graph deleted file mode 100644 index 621b9b7..0000000 --- a/swh/graph/tests/dataset/output/example.graph +++ /dev/null @@ -1 +0,0 @@ -'t}UOGϹ]ް].dP}R \ No newline at end of file diff --git a/swh/graph/tests/dataset/output/example.node2swhid.bin b/swh/graph/tests/dataset/output/example.node2swhid.bin deleted file mode 100644 index 9cc50b2..0000000 Binary files a/swh/graph/tests/dataset/output/example.node2swhid.bin and /dev/null differ diff --git a/swh/graph/tests/dataset/output/example.offsets b/swh/graph/tests/dataset/output/example.offsets deleted file mode 100644 index 407e1a6..0000000 --- a/swh/graph/tests/dataset/output/example.offsets +++ /dev/null @@ -1 +0,0 @@ -BU!B diff --git a/swh/graph/tests/dataset/output/example.order b/swh/graph/tests/dataset/output/example.order deleted file mode 100644 index 2cb5540..0000000 Binary files a/swh/graph/tests/dataset/output/example.order and /dev/null differ diff --git a/swh/graph/tests/dataset/output/example.properties b/swh/graph/tests/dataset/output/example.properties deleted file mode 100644 index cb6975a..0000000 --- a/swh/graph/tests/dataset/output/example.properties +++ /dev/null @@ -1,35 +0,0 @@ 
-#BVGraph properties -#Sat Dec 04 01:37:26 CET 2021 -bitsforreferences=14 -avgbitsforintervals=0.667 -graphclass=it.unimi.dsi.big.webgraph.BVGraph -avgdist=0 -successoravggap=7.391 -residualexpstats=7,7,3,3,2,1 -arcs=23 -minintervallength=4 -bitsforoutdegrees=51 -residualavgloggap=2.32668281341601 -avgbitsforoutdegrees=2.429 -bitsforresiduals=111 -successoravgloggap=2.32668281341601 -maxrefcount=3 -successorexpstats=7,7,3,3,2,1 -residualarcs=23 -avgbitsforresiduals=5.286 -avgbitsforblocks=0 -windowsize=7 -residualavggap=7.391 -copiedarcs=0 -avgbitsforreferences=0.667 -version=0 -compratio=1.499 -bitsperlink=8.261 -compressionflags= -nodes=21 -avgref=0 -zetak=3 -bitsforintervals=14 -intervalisedarcs=0 -bitspernode=9.048 -bitsforblocks=0 diff --git a/swh/graph/tests/test_cli.py b/swh/graph/tests/test_cli.py index fc066b0..eceb164 100644 --- a/swh/graph/tests/test_cli.py +++ b/swh/graph/tests/test_cli.py @@ -1,56 +1,58 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from pathlib import Path from tempfile import TemporaryDirectory from typing import Dict from click.testing import CliRunner import yaml from swh.graph.cli import graph_cli_group DATA_DIR = Path(__file__).parents[0] / "dataset" def read_properties(properties_fname) -> Dict[str, str]: """read a Java .properties file""" with open(properties_fname) as f: keyvalues = ( line.split("=", maxsplit=1) for line in f if not line.strip().startswith("#") ) return dict((k.strip(), v.strip()) for (k, v) in keyvalues) def test_pipeline(): """run full compression pipeline""" # bare bone configuration, to allow testing the compression pipeline # with minimum RAM requirements on trivial graphs config = {"graph": {"compress": {"batch_size": 1000}}} runner = CliRunner() with TemporaryDirectory(suffix=".swh-graph-test") as tmpdir: config_path = Path(tmpdir, "config.yml") config_path.write_text(yaml.dump(config)) result = runner.invoke( graph_cli_group, [ "--config-file", config_path, "compress", - "--graph", - DATA_DIR / "example", - "--outdir", + "--input-dataset", + DATA_DIR / "orc", + "--output-directory", tmpdir, + "--graph-name", + "example", ], ) assert result.exit_code == 0, result properties = read_properties(Path(tmpdir) / "example.properties") assert int(properties["nodes"]) == 21 assert int(properties["arcs"]) == 23 diff --git a/swh/graph/tests/test_api_client.py b/swh/graph/tests/test_http_client.py similarity index 88% rename from swh/graph/tests/test_api_client.py rename to swh/graph/tests/test_http_client.py index 79b4d86..21021b3 100644 --- a/swh/graph/tests/test_api_client.py +++ b/swh/graph/tests/test_http_client.py @@ -1,379 +1,378 @@ +# Copyright (c) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import hashlib + import pytest from pytest import raises from swh.core.api import RemoteException -from swh.graph.client import GraphArgumentException +from swh.graph.http_client import GraphArgumentException + +TEST_ORIGIN_ID = "swh:1:ori:{}".format( + hashlib.sha1(b"https://example.com/swh/graph").hexdigest() +) def test_stats(graph_client): stats = graph_client.stats() - - assert set(stats.keys()) == {"counts", "ratios", "indegree", "outdegree"} - - assert 
set(stats["counts"].keys()) == {"nodes", "edges"} - assert set(stats["ratios"].keys()) == { - "compression", - "bits_per_node", - "bits_per_edge", - "avg_locality", - } - assert set(stats["indegree"].keys()) == {"min", "max", "avg"} - assert set(stats["outdegree"].keys()) == {"min", "max", "avg"} - - assert stats["counts"]["nodes"] == 21 - assert stats["counts"]["edges"] == 23 - assert isinstance(stats["ratios"]["compression"], float) - assert isinstance(stats["ratios"]["bits_per_node"], float) - assert isinstance(stats["ratios"]["bits_per_edge"], float) - assert isinstance(stats["ratios"]["avg_locality"], float) - assert stats["indegree"]["min"] == 0 - assert stats["indegree"]["max"] == 3 - assert isinstance(stats["indegree"]["avg"], float) - assert stats["outdegree"]["min"] == 0 - assert stats["outdegree"]["max"] == 3 - assert isinstance(stats["outdegree"]["avg"], float) + assert stats["num_nodes"] == 21 + assert stats["num_edges"] == 23 + assert isinstance(stats["compression_ratio"], float) + assert isinstance(stats["bits_per_node"], float) + assert isinstance(stats["bits_per_edge"], float) + assert isinstance(stats["avg_locality"], float) + assert stats["indegree_min"] == 0 + assert stats["indegree_max"] == 3 + assert isinstance(stats["indegree_avg"], float) + assert stats["outdegree_min"] == 0 + assert stats["outdegree_max"] == 3 + assert isinstance(stats["outdegree_avg"], float) def test_leaves(graph_client): - actual = list( - graph_client.leaves("swh:1:ori:0000000000000000000000000000000000000021") - ) + actual = list(graph_client.leaves(TEST_ORIGIN_ID)) expected = [ "swh:1:cnt:0000000000000000000000000000000000000001", "swh:1:cnt:0000000000000000000000000000000000000004", "swh:1:cnt:0000000000000000000000000000000000000005", "swh:1:cnt:0000000000000000000000000000000000000007", ] assert set(actual) == set(expected) def test_neighbors(graph_client): actual = list( graph_client.neighbors( "swh:1:rev:0000000000000000000000000000000000000009", direction="backward" ) ) expected = [ "swh:1:snp:0000000000000000000000000000000000000020", "swh:1:rel:0000000000000000000000000000000000000010", "swh:1:rev:0000000000000000000000000000000000000013", ] assert set(actual) == set(expected) def test_visit_nodes(graph_client): actual = list( graph_client.visit_nodes( "swh:1:rel:0000000000000000000000000000000000000010", edges="rel:rev,rev:rev", ) ) expected = [ "swh:1:rel:0000000000000000000000000000000000000010", "swh:1:rev:0000000000000000000000000000000000000009", "swh:1:rev:0000000000000000000000000000000000000003", ] assert set(actual) == set(expected) def test_visit_nodes_filtered(graph_client): actual = list( graph_client.visit_nodes( - "swh:1:rel:0000000000000000000000000000000000000010", return_types="dir", + "swh:1:rel:0000000000000000000000000000000000000010", + return_types="dir", ) ) expected = [ "swh:1:dir:0000000000000000000000000000000000000002", "swh:1:dir:0000000000000000000000000000000000000008", "swh:1:dir:0000000000000000000000000000000000000006", ] assert set(actual) == set(expected) def test_visit_nodes_filtered_star(graph_client): actual = list( graph_client.visit_nodes( - "swh:1:rel:0000000000000000000000000000000000000010", return_types="*", + "swh:1:rel:0000000000000000000000000000000000000010", + return_types="*", ) ) expected = [ "swh:1:rel:0000000000000000000000000000000000000010", "swh:1:rev:0000000000000000000000000000000000000009", "swh:1:rev:0000000000000000000000000000000000000003", "swh:1:dir:0000000000000000000000000000000000000002", 
"swh:1:cnt:0000000000000000000000000000000000000001", "swh:1:dir:0000000000000000000000000000000000000008", "swh:1:cnt:0000000000000000000000000000000000000007", "swh:1:dir:0000000000000000000000000000000000000006", "swh:1:cnt:0000000000000000000000000000000000000004", "swh:1:cnt:0000000000000000000000000000000000000005", ] assert set(actual) == set(expected) def test_visit_edges(graph_client): actual = list( graph_client.visit_edges( "swh:1:rel:0000000000000000000000000000000000000010", edges="rel:rev,rev:rev,rev:dir", ) ) expected = [ ( "swh:1:rel:0000000000000000000000000000000000000010", "swh:1:rev:0000000000000000000000000000000000000009", ), ( "swh:1:rev:0000000000000000000000000000000000000009", "swh:1:rev:0000000000000000000000000000000000000003", ), ( "swh:1:rev:0000000000000000000000000000000000000009", "swh:1:dir:0000000000000000000000000000000000000008", ), ( "swh:1:rev:0000000000000000000000000000000000000003", "swh:1:dir:0000000000000000000000000000000000000002", ), ] assert set(actual) == set(expected) def test_visit_edges_limited(graph_client): actual = list( graph_client.visit_edges( "swh:1:rel:0000000000000000000000000000000000000010", max_edges=4, edges="rel:rev,rev:rev,rev:dir", ) ) expected = [ ( "swh:1:rel:0000000000000000000000000000000000000010", "swh:1:rev:0000000000000000000000000000000000000009", ), ( "swh:1:rev:0000000000000000000000000000000000000009", "swh:1:rev:0000000000000000000000000000000000000003", ), ( "swh:1:rev:0000000000000000000000000000000000000009", "swh:1:dir:0000000000000000000000000000000000000008", ), ( "swh:1:rev:0000000000000000000000000000000000000003", "swh:1:dir:0000000000000000000000000000000000000002", ), ] # As there are four valid answers (up to reordering), we cannot check for # equality. Instead, we check the client returned all edges but one. 
assert set(actual).issubset(set(expected)) assert len(actual) == 3 def test_visit_edges_diamond_pattern(graph_client): actual = list( graph_client.visit_edges( - "swh:1:rev:0000000000000000000000000000000000000009", edges="*", + "swh:1:rev:0000000000000000000000000000000000000009", + edges="*", ) ) expected = [ ( "swh:1:rev:0000000000000000000000000000000000000009", "swh:1:rev:0000000000000000000000000000000000000003", ), ( "swh:1:rev:0000000000000000000000000000000000000009", "swh:1:dir:0000000000000000000000000000000000000008", ), ( "swh:1:rev:0000000000000000000000000000000000000003", "swh:1:dir:0000000000000000000000000000000000000002", ), ( "swh:1:dir:0000000000000000000000000000000000000002", "swh:1:cnt:0000000000000000000000000000000000000001", ), ( "swh:1:dir:0000000000000000000000000000000000000008", "swh:1:cnt:0000000000000000000000000000000000000001", ), ( "swh:1:dir:0000000000000000000000000000000000000008", "swh:1:cnt:0000000000000000000000000000000000000007", ), ( "swh:1:dir:0000000000000000000000000000000000000008", "swh:1:dir:0000000000000000000000000000000000000006", ), ( "swh:1:dir:0000000000000000000000000000000000000006", "swh:1:cnt:0000000000000000000000000000000000000004", ), ( "swh:1:dir:0000000000000000000000000000000000000006", "swh:1:cnt:0000000000000000000000000000000000000005", ), ] assert set(actual) == set(expected) @pytest.mark.skip(reason="currently disabled due to T1969") def test_walk(graph_client): args = ("swh:1:dir:0000000000000000000000000000000000000016", "rel") kwargs = { "edges": "dir:dir,dir:rev,rev:*", "direction": "backward", "traversal": "bfs", } actual = list(graph_client.walk(*args, **kwargs)) expected = [ "swh:1:dir:0000000000000000000000000000000000000016", "swh:1:dir:0000000000000000000000000000000000000017", "swh:1:rev:0000000000000000000000000000000000000018", "swh:1:rel:0000000000000000000000000000000000000019", ] assert set(actual) == set(expected) kwargs2 = kwargs.copy() kwargs2["limit"] = -1 actual = list(graph_client.walk(*args, **kwargs2)) expected = ["swh:1:rel:0000000000000000000000000000000000000019"] assert set(actual) == set(expected) kwargs2 = kwargs.copy() kwargs2["limit"] = 2 actual = list(graph_client.walk(*args, **kwargs2)) expected = [ "swh:1:dir:0000000000000000000000000000000000000016", "swh:1:dir:0000000000000000000000000000000000000017", ] assert set(actual) == set(expected) +@pytest.mark.skip(reason="Random walk is deprecated") def test_random_walk_dst_is_type(graph_client): """as the walk is random, we test a visit from a cnt node to a release reachable from every single path in the backward graph, and only check the final node of the path (i.e., the release) """ args = ("swh:1:cnt:0000000000000000000000000000000000000015", "rel") kwargs = {"direction": "backward"} expected_root = "swh:1:rel:0000000000000000000000000000000000000019" actual = list(graph_client.random_walk(*args, **kwargs)) assert len(actual) > 1 # no release directly links to a content assert actual[0] == args[0] assert actual[-1] == expected_root kwargs2 = kwargs.copy() kwargs2["limit"] = -1 actual = list(graph_client.random_walk(*args, **kwargs2)) assert actual == [expected_root] kwargs2["limit"] = -2 actual = list(graph_client.random_walk(*args, **kwargs2)) assert len(actual) == 2 assert actual[-1] == expected_root kwargs2["limit"] = 3 actual = list(graph_client.random_walk(*args, **kwargs2)) assert len(actual) == 3 +@pytest.mark.skip(reason="Random walk is deprecated") def test_random_walk_dst_is_node(graph_client): """Same as 
test_random_walk_dst_is_type, but we target the specific release node instead of a type """ args = ( "swh:1:cnt:0000000000000000000000000000000000000015", "swh:1:rel:0000000000000000000000000000000000000019", ) kwargs = {"direction": "backward"} expected_root = "swh:1:rel:0000000000000000000000000000000000000019" actual = list(graph_client.random_walk(*args, **kwargs)) assert len(actual) > 1 # no origin directly links to a content assert actual[0] == args[0] assert actual[-1] == expected_root kwargs2 = kwargs.copy() kwargs2["limit"] = -1 actual = list(graph_client.random_walk(*args, **kwargs2)) assert actual == [expected_root] kwargs2["limit"] = -2 actual = list(graph_client.random_walk(*args, **kwargs2)) assert len(actual) == 2 assert actual[-1] == expected_root kwargs2["limit"] = 3 actual = list(graph_client.random_walk(*args, **kwargs2)) assert len(actual) == 3 def test_count(graph_client): - actual = graph_client.count_leaves( - "swh:1:ori:0000000000000000000000000000000000000021" - ) + actual = graph_client.count_leaves(TEST_ORIGIN_ID) assert actual == 4 actual = graph_client.count_visit_nodes( "swh:1:rel:0000000000000000000000000000000000000010", edges="rel:rev,rev:rev" ) assert actual == 3 actual = graph_client.count_neighbors( "swh:1:rev:0000000000000000000000000000000000000009", direction="backward" ) assert actual == 3 def test_param_validation(graph_client): with raises(GraphArgumentException) as exc_info: # SWHID not found - list(graph_client.leaves("swh:1:ori:fff0000000000000000000000000000000000021")) + list(graph_client.leaves("swh:1:rel:00ffffffff000000000000000000000000000010")) if exc_info.value.response: assert exc_info.value.response.status_code == 404 with raises(GraphArgumentException) as exc_info: # malformed SWHID list( - graph_client.neighbors("swh:1:ori:fff000000zzzzzz0000000000000000000000021") + graph_client.neighbors("swh:1:rel:00ffffffff00000000zzzzzzz000000000000010") ) if exc_info.value.response: assert exc_info.value.response.status_code == 400 with raises(GraphArgumentException) as exc_info: # malformed edge specification list( graph_client.visit_nodes( "swh:1:dir:0000000000000000000000000000000000000016", edges="dir:notanodetype,dir:rev,rev:*", direction="backward", ) ) if exc_info.value.response: assert exc_info.value.response.status_code == 400 with raises(GraphArgumentException) as exc_info: # malformed direction list( graph_client.visit_nodes( "swh:1:dir:0000000000000000000000000000000000000016", edges="dir:dir,dir:rev,rev:*", direction="notadirection", ) ) if exc_info.value.response: assert exc_info.value.response.status_code == 400 @pytest.mark.skip(reason="currently disabled due to T1969") def test_param_validation_walk(graph_client): """test validation of walk-specific parameters only""" with raises(RemoteException) as exc_info: # malformed traversal order list( graph_client.walk( "swh:1:dir:0000000000000000000000000000000000000016", "rel", edges="dir:dir,dir:rev,rev:*", direction="backward", traversal="notatraversalorder", ) ) assert exc_info.value.response.status_code == 400 diff --git a/swh/graph/tests/test_swhid.py b/swh/graph/tests/test_swhid.py deleted file mode 100644 index 6053215..0000000 --- a/swh/graph/tests/test_swhid.py +++ /dev/null @@ -1,196 +0,0 @@ -# Copyright (C) 2019-2020 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from itertools import islice
-import os -import shutil -import tempfile -import unittest - -from swh.graph.swhid import NodeToSwhidMap, SwhidToNodeMap, bytes_to_str, str_to_bytes -from swh.model.swhids import SWHID_TYPES - - -class TestSwhidSerialization(unittest.TestCase): - - pairs = [ - ( - "swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2", - bytes.fromhex("01" + "00" + "94a9ed024d3859793618152ea559a168bbcbb5e2"), - ), - ( - "swh:1:dir:d198bc9d7a6bcf6db04f476d29314f157507d505", - bytes.fromhex("01" + "01" + "d198bc9d7a6bcf6db04f476d29314f157507d505"), - ), - ( - "swh:1:ori:b63a575fe3faab7692c9f38fb09d4bb45651bb0f", - bytes.fromhex("01" + "02" + "b63a575fe3faab7692c9f38fb09d4bb45651bb0f"), - ), - ( - "swh:1:rel:22ece559cc7cc2364edc5e5593d63ae8bd229f9f", - bytes.fromhex("01" + "03" + "22ece559cc7cc2364edc5e5593d63ae8bd229f9f"), - ), - ( - "swh:1:rev:309cf2674ee7a0749978cf8265ab91a60aea0f7d", - bytes.fromhex("01" + "04" + "309cf2674ee7a0749978cf8265ab91a60aea0f7d"), - ), - ( - "swh:1:snp:c7c108084bc0bf3d81436bf980b46e98bd338453", - bytes.fromhex("01" + "05" + "c7c108084bc0bf3d81436bf980b46e98bd338453"), - ), - ] - - def test_str_to_bytes(self): - for (swhid_str, swhid_bytes) in self.pairs: - self.assertEqual(str_to_bytes(swhid_str), swhid_bytes) - - def test_bytes_to_str(self): - for (swhid_str, swhid_bytes) in self.pairs: - self.assertEqual(bytes_to_str(swhid_bytes), swhid_str) - - def test_round_trip(self): - for (swhid_str, swhid_bytes) in self.pairs: - self.assertEqual(swhid_str, bytes_to_str(str_to_bytes(swhid_str))) - self.assertEqual(swhid_bytes, str_to_bytes(bytes_to_str(swhid_bytes))) - - -def gen_records(types=["cnt", "dir", "ori", "rel", "rev", "snp"], length=10000): - """generate sequential SWHID/int records, suitable for filling int<->swhid maps for - testing swh-graph on-disk binary databases - - Args: - types (list): list of SWHID types to be generated, specified as the - corresponding 3-letter component in SWHIDs - length (int): number of SWHIDs to generate *per type* - - Yields: - pairs (swhid, int) where swhid is a textual SWHID and int its sequential - integer identifier - - """ - pos = 0 - for t in sorted(types): - for i in range(0, length): - seq = format(pos, "x") # current position as hex string - swhid = "swh:1:{}:{}{}".format(t, "0" * (40 - len(seq)), seq) - yield (swhid, pos) - pos += 1 - - -# pairs SWHID/position in the sequence generated by :func:`gen_records` above -MAP_PAIRS = [ - ("swh:1:cnt:0000000000000000000000000000000000000000", 0), - ("swh:1:cnt:000000000000000000000000000000000000002a", 42), - ("swh:1:dir:0000000000000000000000000000000000002afc", 11004), - ("swh:1:ori:00000000000000000000000000000000000056ce", 22222), - ("swh:1:rel:0000000000000000000000000000000000008235", 33333), - ("swh:1:rev:000000000000000000000000000000000000ad9c", 44444), - ("swh:1:snp:000000000000000000000000000000000000ea5f", 59999), -] - - -class TestSwhidToNodeMap(unittest.TestCase): - @classmethod - def setUpClass(cls): - """create reasonably sized (~2 MB) SWHID->int map to test on-disk DB""" - cls.tmpdir = tempfile.mkdtemp(prefix="swh.graph.test.") - cls.fname = os.path.join(cls.tmpdir, "swhid2int.bin") - with open(cls.fname, "wb") as f: - for (swhid, i) in gen_records(length=10000): - SwhidToNodeMap.write_record(f, swhid, i) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdir) - - def setUp(self): - self.map = SwhidToNodeMap(self.fname) - - def tearDown(self): - self.map.close() - - def test_lookup(self): - for (swhid, pos) in MAP_PAIRS: - self.assertEqual(self.map[swhid], pos) 
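-    # (For instance: with 10,000 SWHIDs per type and types sorted
-    # alphabetically, the "dir" block spans positions 10000-19999;
-    # 11004 is 0x2afc in hex, hence the MAP_PAIRS entry
-    # ("swh:1:dir:0000000000000000000000000000000000002afc", 11004).)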
- - def test_missing(self): - with self.assertRaises(KeyError): - self.map["swh:1:ori:0101010100000000000000000000000000000000"], - with self.assertRaises(KeyError): - self.map["swh:1:cnt:0101010100000000000000000000000000000000"], - - def test_type_error(self): - with self.assertRaises(TypeError): - self.map[42] - with self.assertRaises(TypeError): - self.map[1.2] - - def test_update(self): - fname2 = self.fname + ".update" - shutil.copy(self.fname, fname2) # fresh map copy - map2 = SwhidToNodeMap(fname2, mode="rb+") - for (swhid, int) in islice(map2, 11): # update the first N items - new_int = int + 42 - map2[swhid] = new_int - self.assertEqual(map2[swhid], new_int) # check updated value - - os.unlink(fname2) # tmpdir will be cleaned even if we don't reach this - - def test_iter_type(self): - for t in SWHID_TYPES + ["ori"]: - first_20 = list(islice(self.map.iter_type(t), 20)) - k = first_20[0][1] - expected = [("swh:1:{}:{:040x}".format(t, i), i) for i in range(k, k + 20)] - assert first_20 == expected - - def test_iter_prefix(self): - for t in SWHID_TYPES + ["ori"]: - prefix = self.map.iter_prefix("swh:1:{}:00".format(t)) - first_20 = list(islice(prefix, 20)) - k = first_20[0][1] - expected = [("swh:1:{}:{:040x}".format(t, i), i) for i in range(k, k + 20)] - assert first_20 == expected - - -class TestNodeToSwhidMap(unittest.TestCase): - @classmethod - def setUpClass(cls): - """create reasonably sized (~1 MB) int->SWHID map to test on-disk DB""" - cls.tmpdir = tempfile.mkdtemp(prefix="swh.graph.test.") - cls.fname = os.path.join(cls.tmpdir, "int2swhid.bin") - with open(cls.fname, "wb") as f: - for (swhid, _i) in gen_records(length=10000): - NodeToSwhidMap.write_record(f, swhid) - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.tmpdir) - - def setUp(self): - self.map = NodeToSwhidMap(self.fname) - - def tearDown(self): - self.map.close() - - def test_lookup(self): - for (swhid, pos) in MAP_PAIRS: - self.assertEqual(self.map[pos], swhid) - - def test_out_of_bounds(self): - with self.assertRaises(IndexError): - self.map[1000000] - with self.assertRaises(IndexError): - self.map[-1000000] - - def test_update(self): - fname2 = self.fname + ".update" - shutil.copy(self.fname, fname2) # fresh map copy - map2 = NodeToSwhidMap(fname2, mode="rb+") - for (int, swhid) in islice(map2, 11): # update the first N items - new_swhid = swhid.replace(":0", ":f") # mangle first hex digit - map2[int] = new_swhid - self.assertEqual(map2[int], new_swhid) # check updated value - - os.unlink(fname2) # tmpdir will be cleaned even if we don't reach this diff --git a/swh/graph/webgraph.py b/swh/graph/webgraph.py index 24bb4b5..c188b83 100644 --- a/swh/graph/webgraph.py +++ b/swh/graph/webgraph.py @@ -1,280 +1,370 @@ # Copyright (C) 2019 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information """WebGraph driver """ from datetime import datetime from enum import Enum import logging import os from pathlib import Path import subprocess from typing import Dict, List, Set from swh.graph.config import check_config_compress +logger = logging.getLogger(__name__) + class CompressionStep(Enum): - MPH = 1 - BV = 2 - BFS = 3 - PERMUTE_BFS = 4 - TRANSPOSE_BFS = 5 - SIMPLIFY = 6 - LLP = 7 - PERMUTE_LLP = 8 - OBL = 9 - COMPOSE_ORDERS = 10 - STATS = 11 - TRANSPOSE = 12 - TRANSPOSE_OBL = 13 - MAPS = 14 - CLEAN_TMP = 15 + EXTRACT_NODES = 1 + MPH = 2 + BV = 3 
+ BFS = 4 + PERMUTE_BFS = 5 + TRANSPOSE_BFS = 6 + SIMPLIFY = 7 + LLP = 8 + PERMUTE_LLP = 9 + OBL = 10 + COMPOSE_ORDERS = 11 + STATS = 12 + TRANSPOSE = 13 + TRANSPOSE_OBL = 14 + MAPS = 15 + EXTRACT_PERSONS = 16 + MPH_PERSONS = 17 + NODE_PROPERTIES = 18 + MPH_LABELS = 19 + FCL_LABELS = 20 + EDGE_LABELS = 21 + EDGE_LABELS_OBL = 22 + EDGE_LABELS_TRANSPOSE_OBL = 23 + CLEAN_TMP = 24 def __str__(self): return self.name # full compression pipeline COMP_SEQ = list(CompressionStep) # Mapping from compression steps to shell commands implementing them. Commands # will be executed by the shell, so be careful with meta characters. They are # specified here as lists of tokens that will be joined together only for ease # of line splitting. In commands, {tokens} will be interpolated with # configuration values, see :func:`compress`. STEP_ARGV: Dict[CompressionStep, List[str]] = { + CompressionStep.EXTRACT_NODES: [ + "{java}", + "org.softwareheritage.graph.compress.ExtractNodes", + "--format", + "orc", + "--temp-dir", + "{tmp_dir}", + "{in_dir}", + "{out_dir}/{graph_name}", + ], CompressionStep.MPH: [ "{java}", "it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction", "--byte-array", "--temp-dir", "{tmp_dir}", + "--decompressor", + "com.github.luben.zstd.ZstdInputStream", "{out_dir}/{graph_name}.mph", - "<( zstdcat {in_dir}/{graph_name}.nodes.csv.zst )", + "{out_dir}/{graph_name}.nodes.csv.zst", ], - # use process substitution (and hence FIFO) above as MPH class load the - # entire file in memory when reading from stdin CompressionStep.BV: [ - "zstdcat", - "{in_dir}/{graph_name}.edges.csv.zst", - "|", - "cut -d' ' -f1,2", - "|", "{java}", - "it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph", - "--byte-array", + "org.softwareheritage.graph.compress.ScatteredArcsORCGraph", "--temp-dir", "{tmp_dir}", "--function", "{out_dir}/{graph_name}.mph", + "{in_dir}", "{out_dir}/{graph_name}-base", ], CompressionStep.BFS: [ "{java}", "it.unimi.dsi.law.big.graph.BFS", "{out_dir}/{graph_name}-base", "{out_dir}/{graph_name}-bfs.order", ], CompressionStep.PERMUTE_BFS: [ "{java}", "it.unimi.dsi.big.webgraph.Transform", "mapOffline", "{out_dir}/{graph_name}-base", "{out_dir}/{graph_name}-bfs", "{out_dir}/{graph_name}-bfs.order", "{batch_size}", "{tmp_dir}", ], CompressionStep.TRANSPOSE_BFS: [ "{java}", "it.unimi.dsi.big.webgraph.Transform", "transposeOffline", "{out_dir}/{graph_name}-bfs", "{out_dir}/{graph_name}-bfs-transposed", "{batch_size}", "{tmp_dir}", ], CompressionStep.SIMPLIFY: [ "{java}", "it.unimi.dsi.big.webgraph.Transform", "simplify", "{out_dir}/{graph_name}-bfs", "{out_dir}/{graph_name}-bfs-transposed", "{out_dir}/{graph_name}-bfs-simplified", ], CompressionStep.LLP: [ "{java}", "it.unimi.dsi.law.big.graph.LayeredLabelPropagation", "-g", "{llp_gammas}", "{out_dir}/{graph_name}-bfs-simplified", "{out_dir}/{graph_name}-llp.order", ], CompressionStep.PERMUTE_LLP: [ "{java}", "it.unimi.dsi.big.webgraph.Transform", "mapOffline", "{out_dir}/{graph_name}-bfs", "{out_dir}/{graph_name}", "{out_dir}/{graph_name}-llp.order", "{batch_size}", "{tmp_dir}", ], CompressionStep.OBL: [ "{java}", "it.unimi.dsi.big.webgraph.BVGraph", "--list", "{out_dir}/{graph_name}", ], CompressionStep.COMPOSE_ORDERS: [ "{java}", - "org.softwareheritage.graph.utils.ComposePermutations", + "org.softwareheritage.graph.compress.ComposePermutations", "{out_dir}/{graph_name}-bfs.order", "{out_dir}/{graph_name}-llp.order", "{out_dir}/{graph_name}.order", ], CompressionStep.STATS: [ "{java}", "it.unimi.dsi.big.webgraph.Stats", "{out_dir}/{graph_name}", 
], CompressionStep.TRANSPOSE: [ "{java}", "it.unimi.dsi.big.webgraph.Transform", "transposeOffline", "{out_dir}/{graph_name}", "{out_dir}/{graph_name}-transposed", "{batch_size}", "{tmp_dir}", ], CompressionStep.TRANSPOSE_OBL: [ "{java}", "it.unimi.dsi.big.webgraph.BVGraph", "--list", "{out_dir}/{graph_name}-transposed", ], CompressionStep.MAPS: [ - "zstdcat", - "{in_dir}/{graph_name}.nodes.csv.zst", - "|", "{java}", - "org.softwareheritage.graph.maps.NodeMapBuilder", + "org.softwareheritage.graph.compress.NodeMapBuilder", + "{out_dir}/{graph_name}", + "{tmp_dir}", + "< {out_dir}/{graph_name}.nodes.csv.zst", + ], + CompressionStep.EXTRACT_PERSONS: [ + "{java}", + "org.softwareheritage.graph.compress.ExtractPersons", + "--temp-dir", + "{tmp_dir}", + "{in_dir}", + "{out_dir}/{graph_name}", + ], + CompressionStep.MPH_PERSONS: [ + "{java}", + "it.unimi.dsi.sux4j.mph.GOVMinimalPerfectHashFunction", + "--byte-array", + "--decompressor", + "com.github.luben.zstd.ZstdInputStream", + "--temp-dir", + "{tmp_dir}", + "{out_dir}/{graph_name}.persons.mph", + "{out_dir}/{graph_name}.persons.csv.zst", + ], + CompressionStep.NODE_PROPERTIES: [ + "{java}", + "org.softwareheritage.graph.compress.WriteNodeProperties", + "{in_dir}", "{out_dir}/{graph_name}", + ], + CompressionStep.MPH_LABELS: [ + "{java}", + "it.unimi.dsi.sux4j.mph.LcpMonotoneMinimalPerfectHashFunction", + "--byte-array", + "--temp-dir", "{tmp_dir}", + "--decompressor", + "com.github.luben.zstd.ZstdInputStream", + "{out_dir}/{graph_name}.labels.mph", + "{out_dir}/{graph_name}.labels.csv.zst", + ], + CompressionStep.FCL_LABELS: [ + "{java}", + "it.unimi.dsi.big.util.MappedFrontCodedStringBigList", + "--decompressor", + "com.github.luben.zstd.ZstdInputStream", + "{out_dir}/{graph_name}.labels.fcl", + "< {out_dir}/{graph_name}.labels.csv.zst", + ], + CompressionStep.EDGE_LABELS: [ + "{java}", + "org.softwareheritage.graph.compress.LabelMapBuilder", + "--temp-dir", + "{tmp_dir}", + "{in_dir}", + "{out_dir}/{graph_name}", + ], + CompressionStep.EDGE_LABELS_OBL: [ + "{java}", + "it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph", + "--list", + "{out_dir}/{graph_name}-labelled", + ], + CompressionStep.EDGE_LABELS_TRANSPOSE_OBL: [ + "{java}", + "it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph", + "--list", + "{out_dir}/{graph_name}-transposed-labelled", ], CompressionStep.CLEAN_TMP: [ "rm", "-rf", "{out_dir}/{graph_name}-base.graph", "{out_dir}/{graph_name}-base.offsets", "{out_dir}/{graph_name}-base.properties", "{out_dir}/{graph_name}-bfs-simplified.graph", "{out_dir}/{graph_name}-bfs-simplified.offsets", "{out_dir}/{graph_name}-bfs-simplified.properties", "{out_dir}/{graph_name}-bfs-transposed.graph", "{out_dir}/{graph_name}-bfs-transposed.offsets", "{out_dir}/{graph_name}-bfs-transposed.properties", "{out_dir}/{graph_name}-bfs.graph", "{out_dir}/{graph_name}-bfs.offsets", "{out_dir}/{graph_name}-bfs.order", "{out_dir}/{graph_name}-bfs.properties", "{out_dir}/{graph_name}-llp.order", "{tmp_dir}", ], } def do_step(step, conf): - cmd = " ".join(STEP_ARGV[step]).format(**conf) + log_dir = Path(conf["out_dir"]) / "logs" + log_dir.mkdir(exist_ok=True) + + step_logger = logger.getChild(f"steps.{step.name.lower()}") + step_handler = logging.FileHandler( + log_dir + / ( + f"{conf['graph_name']}" + f"-{int(datetime.now().timestamp() * 1000)}" + f"-{str(step).lower()}.log" + ) + ) + step_logger.addHandler(step_handler) + + step_start_time = datetime.now() + step_logger.info("Starting compression step %s at %s", 


 def compress(
     graph_name: str,
     in_dir: Path,
     out_dir: Path,
     steps: Set[CompressionStep] = set(COMP_SEQ),
     conf: Dict[str, str] = {},
 ):
     """graph compression pipeline driver from nodes/edges files to compressed
     on-disk representation

     Args:
         graph_name: graph base name, relative to in_dir
         in_dir: input directory, where the uncompressed graph can be found
         out_dir: output directory, where the compressed graph will be stored
         steps: compression steps to run (default: all steps)
         conf: compression configuration, supporting the following keys (all
           are optional, so an empty configuration is fine and is the default)

           - batch_size: batch size for `WebGraph transformations `_;
             defaults to 1 billion
           - classpath: java classpath, defaults to swh-graph JAR only
           - java: command to run java VM, defaults to "java"
           - java_tool_options: value for JAVA_TOOL_OPTIONS environment
             variable; defaults to various settings for high memory machines
           - logback: path to a logback.xml configuration file; if not provided
             a temporary one will be created and used
           - max_ram: maximum RAM to use for compression; defaults to available
             virtual memory
           - tmp_dir: temporary directory, defaults to the "tmp" subdir of
             out_dir

     """
     if not steps:
         steps = set(COMP_SEQ)

     conf = check_config_compress(conf, graph_name, in_dir, out_dir)

     compression_start_time = datetime.now()
-    logging.info(f"starting compression at {compression_start_time}")
+    logger.info("Starting compression at %s", compression_start_time)
     seq_no = 0
     for step in COMP_SEQ:
         if step not in steps:
-            logging.debug(f"skipping compression step {step}")
+            logger.debug("Skipping compression step %s", step)
             continue
         seq_no += 1
-        step_start_time = datetime.now()
-        logging.info(
-            f"starting compression step {step} "
-            f"({seq_no}/{len(steps)}) at {step_start_time}"
-        )
+        logger.info("Running compression step %s (%s/%s)", step, seq_no, len(steps))
         do_step(step, conf)
-        step_end_time = datetime.now()
-        step_duration = step_end_time - step_start_time
-        logging.info(
-            f"completed compression step {step} "
-            f"({seq_no}/{len(steps)}) "
-            f"at {step_end_time} in {step_duration}"
-        )

     compression_end_time = datetime.now()
     compression_duration = compression_end_time - compression_start_time
-    logging.info(f"completed compression in {compression_duration}")
+    logger.info("Completed compression in %s", compression_duration)
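For reference, a typical end-to-end driver invocation might look as follows
(graph name and paths are hypothetical; all steps run when ``steps`` is left
at its default)::

    from pathlib import Path

    from swh.graph.webgraph import compress

    compress(
        graph_name="example",
        in_dir=Path("graph/"),  # where the uncompressed graph lives
        out_dir=Path("compressed/"),  # receives the graph files and logs/
        conf={"batch_size": "2000000000"},  # optional tuning, see docstring
    )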
diff --git a/tools/dir2graph b/tools/dir2graph
index 054ac61..b843af0 100755
--- a/tools/dir2graph
+++ b/tools/dir2graph
@@ -1,103 +1,106 @@
 #!/usr/bin/env python3

 # Copyright (C) 2021  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import sys
 from typing import Iterator, Tuple, Union

 import click

 from swh.model.from_disk import Content, Directory
 from swh.model.identifiers import CoreSWHID, ObjectType


 def swhid_of_node(obj: Union[Content, Directory]):
     return CoreSWHID(
-        object_type=ObjectType[obj.object_type.upper()], object_id=obj.hash,
+        object_type=ObjectType[obj.object_type.upper()],
+        object_id=obj.hash,
     )


-def walk_model(root: Directory,) -> Iterator[Tuple[CoreSWHID, Iterator[CoreSWHID]]]:
+def walk_model(
+    root: Directory,
+) -> Iterator[Tuple[CoreSWHID, Iterator[CoreSWHID]]]:
     """recursively visit a model.from_disk object

     Yield pairs (SWHID, neighbors) where SWHID is the identifier of a node and
     neighbors an iterator over SWHIDs of nodes directly reachable from it. So
     you can obtain all graph nodes by only looking at the first element of the
     pair, and edges by joining the first element with each of the neighbors.

     Note that no deduplication is applied, so both nodes and edges can be
     yielded multiple times if they do in fact appear multiple times in the
     graph.

     """

     def walk_neighbors(node):
         for child in node.values():
             yield swhid_of_node(child)

     to_visit = [root]
     while to_visit:
         node = to_visit.pop()
         swhid = swhid_of_node(node)
         yield (swhid, walk_neighbors(node))
         for child in node.values():
             to_visit.insert(0, child)


 @click.command()
 @click.argument(
     "directory",
     required=True,
     type=click.Path(exists=True, file_okay=False, dir_okay=True),
 )
 @click.option(
     "-n",
     "--nodes-output",
     type=click.Path(file_okay=True, dir_okay=False, writable=True),
     help="output file where to store nodes as SWHIDs."
     ' Use "-" for stdout.'
     " Default: output node SWHIDs to stdout.",
 )
 @click.option(
     "-e",
     "--edges-output",
     type=click.Path(file_okay=True, dir_okay=False, writable=True),
     help="output file where to store edges as SWHID pairs"
     " (if not given, edge SWHIDs will not be output)."
     ' Use "-" for stdout.'
     " Default: do not output edge SWHIDs.",
 )
 def main(directory, nodes_output, edges_output):
     """Recursively identifies the content of a directory.

     Outputs SWHID identifiers as both nodes (one SWHID per object) and edges
     (pairs of SWHIDs (parent, child) corresponding to the filesystem
     hierarchy).

     """
     nodes_file = sys.stdout
     edges_file = None
     if nodes_output:
         if nodes_output == "-":
             nodes_file = sys.stdout
         else:
             nodes_file = open(nodes_output, "w")
     if edges_output:
         if edges_output == "-":
             edges_file = sys.stdout
         else:
             edges_file = open(edges_output, "w")

     root = Directory.from_disk(path=directory.encode())
     for swhid, neighbors in walk_model(root):
         if nodes_file:
             nodes_file.write(f"{swhid}\n")
         if edges_file:
             for child_swhid in neighbors:
                 edges_file.write(f"{swhid} {child_swhid}\n")


 if __name__ == "__main__":
     main()
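Beyond the command-line interface, ``walk_model`` can be reused directly. A
small sketch, assuming it is appended to the script above so ``walk_model`` is
in scope (the directory path is hypothetical), that deduplicates nodes while
counting edge occurrences::

    from swh.model.from_disk import Directory

    root = Directory.from_disk(path=b"/srv/example-dir")
    nodes = set()
    n_edges = 0
    for swhid, neighbors in walk_model(root):
        nodes.add(swhid)  # walk_model may yield the same SWHID repeatedly
        for _child in neighbors:
            n_edges += 1  # duplicate edges are counted as-is
    print(f"{len(nodes)} unique nodes, {n_edges} edge occurrences")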
diff --git a/tox.ini b/tox.ini
index 96313db..9dfc4c5 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,77 +1,78 @@
 [tox]
 envlist=black,flake8,mypy,py3

 [testenv]
 extras =
   testing
 deps =
   pytest-cov
 whitelist_externals =
   mvn
   sh
 commands =
   sh -c 'if ! [ -d {envdir}/share/swh-graph ]; then mvn -f java/pom.xml compile assembly:single; mkdir {envdir}/share/swh-graph; cp java/target/*.jar {envdir}/share/swh-graph; fi'
   pytest --cov={envsitepackagesdir}/swh/graph \
          {envsitepackagesdir}/swh/graph \
          --doctest-modules \
          --cov-branch {posargs}

 [testenv:black]
 skip_install = true
 deps =
-  black==19.10b0
+  black==22.3.0
 commands =
   {envpython} -m black --check swh

 [testenv:flake8]
 skip_install = true
 deps =
-  flake8
+  flake8==4.0.1
+  flake8-bugbear==22.3.23
 commands =
   {envpython} -m flake8

 [testenv:mypy]
 extras =
   testing
 deps =
-  mypy==0.920
+  mypy==0.942
 commands =
   mypy swh

 # build documentation outside swh-environment using the current
 # git HEAD of swh-docs, is executed on CI for each diff to prevent
 # breaking doc build
 [testenv:sphinx]
 whitelist_externals = make
 usedevelop = true
 extras =
   testing
 deps =
   # fetch and install swh-docs in develop mode
   -e git+https://forge.softwareheritage.org/source/swh-docs#egg=swh.docs
 setenv =
   SWH_PACKAGE_DOC_TOX_BUILD = 1
   # turn warnings into errors
   SPHINXOPTS = -W
 commands =
   make -I ../.tox/sphinx/src/swh-docs/swh/ -C docs

 # build documentation only inside swh-environment using local state
 # of swh-docs package
 [testenv:sphinx-dev]
 whitelist_externals = make
 usedevelop = true
 extras =
   testing
 deps =
   # install swh-docs in develop mode
   -e ../swh-docs
 setenv =
   SWH_PACKAGE_DOC_TOX_BUILD = 1
   # turn warnings into errors
   SPHINXOPTS = -W
 commands =
   make -I ../.tox/sphinx-dev/src/swh-docs/swh/ -C docs