diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 0000000..ec16ee1
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,5 @@
+# Enable black
+4c3c6d839b642009ba1eeee4acf4a58f209580e6
+
+# python: Reformat code with black 22.3.0
+1efea9bb9035e1d04191f8cd25a3f7ff9ad6d8f3
diff --git a/.gitignore b/.gitignore
index 3c564c7..8416519 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,13 @@
*.pyc
*.sw?
*~
.coverage
.eggs/
__pycache__
*.egg-info/
build/
dist/
version.txt
.tox
.mypy_cache/
+compressed/logs
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7a64a7f..5bf56ae 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,59 +1,52 @@
repos:
-- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v2.4.0
- hooks:
- - id: trailing-whitespace
- - id: check-json
- - id: check-yaml
-
-- repo: https://gitlab.com/pycqa/flake8
- rev: 3.8.3
- hooks:
- - id: flake8
-
-- repo: https://github.com/codespell-project/codespell
- rev: v1.16.0
- hooks:
- - id: codespell
- args: ["-L te,wth,alledges"]
-
-- repo: local
- hooks:
- - id: mypy
- name: mypy
- entry: mypy
- args: [swh]
- pass_filenames: false
- language: system
- types: [python]
-
-- repo: https://github.com/PyCQA/isort
- rev: 5.5.2
- hooks:
- - id: isort
-
-- repo: https://github.com/python/black
- rev: 19.10b0
- hooks:
- - id: black
-
-- repo: local
- hooks:
- - id: java-coding-style
- name: java style
- entry: mvn
- args: ["-f", "java/pom.xml", "spotless:apply"]
- pass_filenames: false
- language: system
-
-# unfortunately, we are far from being able to enable this...
-# - repo: https://github.com/PyCQA/pydocstyle.git
-# rev: 4.0.0
-# hooks:
-# - id: pydocstyle
-# name: pydocstyle
-# description: pydocstyle is a static analysis tool for checking compliance with Python docstring conventions.
-# entry: pydocstyle --convention=google
-# language: python
-# types: [python]
-
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.1.0
+ hooks:
+ - id: trailing-whitespace
+ - id: check-json
+ - id: check-yaml
+
+ - repo: https://gitlab.com/pycqa/flake8
+ rev: 4.0.1
+ hooks:
+ - id: flake8
+ additional_dependencies: [flake8-bugbear==22.3.23]
+
+ - repo: https://github.com/codespell-project/codespell
+ rev: v2.1.0
+ hooks:
+ - id: codespell
+ name: Check source code spelling
+ args: ["-L te,wth,alledges,afterall"]
+ stages: [commit]
+
+ - repo: local
+ hooks:
+ - id: mypy
+ name: mypy
+ entry: mypy
+ args: [swh]
+ pass_filenames: false
+ language: system
+ types: [python]
+
+ - repo: https://github.com/PyCQA/isort
+ rev: 5.10.1
+ hooks:
+ - id: isort
+
+ - repo: https://github.com/python/black
+ rev: 22.3.0
+ hooks:
+ - id: black
+
+ - repo: local
+ hooks:
+ - id: java-coding-style
+ name: java style
+ entry: mvn
+ args: ["-f", "java/pom.xml", "spotless:apply"]
+ pass_filenames: false
+ language: system
+
+exclude: ^swh/graph/rpc/
diff --git a/Makefile.local b/Makefile.local
index 034d1c7..1181cea 100644
--- a/Makefile.local
+++ b/Makefile.local
@@ -1,14 +1,17 @@
POM_PATH=java/pom.xml
java:
mvn -f $(POM_PATH) compile assembly:single
java-doc:
mvn -f $(POM_PATH) javadoc:javadoc
java-%:
mvn -f $(POM_PATH) $*
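+# Regenerate the Python protobuf and gRPC bindings (plus mypy stubs) from the
+# .proto files under swh/graph/rpc/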
+protoc:
+ python -m grpc_tools.protoc -I. --python_out=. --mypy_out=. --grpc_python_out=. swh/graph/rpc/*.proto
+
clean-java: java-clean
.PHONY: java clean-java
diff --git a/PKG-INFO b/PKG-INFO
index 4839ff0..d6f1fb7 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,56 +1,52 @@
Metadata-Version: 2.1
Name: swh.graph
-Version: 0.5.2
+Version: 1.0.0
Summary: Software Heritage graph service
Home-page: https://forge.softwareheritage.org/diffusion/DGRPH
Author: Software Heritage developers
Author-email: swh-devel@inria.fr
-License: UNKNOWN
Project-URL: Bug Reports, https://forge.softwareheritage.org/maniphest
Project-URL: Funding, https://www.softwareheritage.org/donate
Project-URL: Source, https://forge.softwareheritage.org/source/swh-graph
Project-URL: Documentation, https://docs.softwareheritage.org/devel/swh-graph/
-Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Operating System :: OS Independent
Classifier: Development Status :: 3 - Alpha
Requires-Python: >=3.7
Description-Content-Type: text/x-rst
Provides-Extra: testing
License-File: LICENSE
License-File: AUTHORS
Software Heritage - graph service
=================================
Tooling and services, collectively known as ``swh-graph``, providing fast
access to the graph representation of the `Software Heritage
`_
`archive `_. The service is in-memory,
based on a compressed representation of the Software Heritage Merkle DAG.
Bibliography
------------
In addition to accompanying technical documentation, ``swh-graph`` is also
described in the following scientific paper. If you publish results based on
``swh-graph``, please acknowledge it by citing the paper as follows:
.. note::
Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli.
`Ultra-Large-Scale Repository Analysis via Graph Compression
`_. In proceedings of `SANER
2020 `_: The 27th IEEE International
Conference on Software Analysis, Evolution and Reengineering, pages
184-194. IEEE 2020.
Links: `preprint
`_,
`bibtex
`_.
-
-
diff --git a/docs/api.rst b/docs/api.rst
index 0b7f1a2..3face8d 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -1,407 +1,541 @@
.. _swh-graph-api:
-Graph RPC API
-=============
+Graph Querying HTTP API
+=======================
+
+The Graph Querying API is a high-level HTTP API intended to run common,
+relatively simple traversal queries on the compressed graph.
+
+The client/server architecture allows it to only load the graph in memory once
+then serve multiple different requests. However, it is limited in expressivity;
+more complex or resource-intensive queries should rather use the
+:ref:`Low-level Java API ` to run them as standalone
+programs.
Terminology
-----------
This API uses the following notions:
- **Node**: a node in the :ref:`Software Heritage graph `,
represented by a :ref:`SWHID `.
- **Node type**: the 3-letter specifier from the node SWHID (``cnt``, ``dir``,
``rel``, ``rev``, ``snp``, ``ori``), or ``*`` for all node types.
- **Edge type**: a pair ``src:dst`` where ``src`` and ``dst`` are either node
types, or ``*`` to denote all node types.
- **Edge restrictions**: a textual specification of which edges can be followed
during graph traversal. Either ``*`` to denote that all edges can be followed
or a comma separated list of edge types to allow following only those edges.
Note that when traversing the *backward* (i.e., transposed) graph, edge types
are reversed too. So, for instance, ``ori:snp`` makes sense when traversing
the forward graph, but useless (due to lack of matching edges in the graph)
when traversing the backward graph; conversely ``snp:ori`` is useful when
traversing the backward graph, but not in the forward one. For the same
reason ``dir:dir`` allows following edges from parent directories to
sub-directories when traversing the forward graph, but the same restriction
allows following edges from sub-directories to parent directories when
traversing the backward graph.
- **Node restrictions**: a textual specification of which type of nodes can be
returned after a request. Either ``*`` to denote that all types of nodes can
be returned or a comma separated list of node types to allow returning only
those node types.
Examples
~~~~~~~~
- ``swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2`` the SWHID of a node of
type content containing the full text of the GPL3 license.
- ``swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35`` the SWHID of a node of
type revision corresponding to the commit in Linux that merged the
'x86/urgent' branch on 31 December 2017.
- ``"dir:dir,dir:cnt"`` an edge restriction allowing edges from directory
  nodes to directory nodes, or from directory nodes to content nodes.
- ``"rev:rev,dir:*"`` an edge restriction allowing edges from revision nodes
  to revision nodes, or from directory nodes to nodes of any type.
- ``"*:rel"`` an edge restriction allowing all edges to release nodes.
- ``"cnt,snp"`` a node restriction limiting query results to content and
  snapshot nodes.
+Endpoints
+---------
+
Leaves
-------
+~~~~~~
.. http:get:: /graph/leaves/:src
Performs a graph traversal and returns the leaves of the subgraph rooted at
the specified source node.
:param string src: source node specified as a SWHID
:query string edges: edges types the traversal can follow; default to
``"*"``
:query string direction: direction in which graph edges will be followed;
can be either ``forward`` or ``backward``, default to ``forward``
:query integer max_edges: how many edges can be traversed during the visit;
default to 0 (not restricted)
:query string return_types: only return the nodes matching this type;
default to ``"*"``
:statuscode 200: success
:statuscode 400: invalid query string provided
:statuscode 404: starting node cannot be found
**Example:**
.. sourcecode:: http
GET /graph/leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323 HTTP/1.1
Content-Type: text/plain
Transfer-Encoding: chunked
.. sourcecode:: http
HTTP/1.1 200 OK
swh:1:cnt:540faad6b1e02e2db4f349a4845192db521ff2bd
swh:1:cnt:630585fc6d34e5e121139e2aee0a64e83dc9aae6
swh:1:cnt:f8634ced669f0a9155c8cab1b2621d57d778215e
swh:1:cnt:ba6daa801ad3ea587904b1abe9161dceedb2e0bd
...
Neighbors
----------
+~~~~~~~~~
.. http:get:: /graph/neighbors/:src
Returns a node's direct neighbors (nodes linked to it by exactly one edge) in the graph.
:param string src: source node specified as a SWHID
:query string edges: edges types allowed to be listed as neighbors; default
to ``"*"``
:query string direction: direction in which graph edges will be followed;
can be either ``forward`` or ``backward``, default to ``forward``
:query integer max_edges: how many edges can be traversed during the visit;
default to 0 (not restricted)
:query string return_types: only return the nodes matching this type;
default to ``"*"``
:statuscode 200: success
:statuscode 400: invalid query string provided
:statuscode 404: starting node cannot be found
**Example:**
.. sourcecode:: http
GET /graph/neighbors/swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35 HTTP/1.1
Content-Type: text/plain
Transfer-Encoding: chunked
.. sourcecode:: http
HTTP/1.1 200 OK
swh:1:rev:a31e58e129f73ab5b04016330b13ed51fde7a961
swh:1:dir:b5d2aa0746b70300ebbca82a8132af386cc5986d
swh:1:rev:52c90f2d32bfa7d6eccd66a56c44ace1f78fbadd
...
Walk
-----
+~~~~
..
.. http:get:: /graph/walk/:src/:dst
Performs a graph traversal and returns the first found path from source to
destination (final destination node included).
:param string src: starting node specified as a SWHID
:param string dst: destination node, either as a node SWHID or a node
type. The traversal will stop at the first node encountered matching
the desired destination.
:query string edges: edges types the traversal can follow; default to
``"*"``
:query string traversal: traversal algorithm; can be either ``dfs`` or
``bfs``, default to ``dfs``
:query string direction: direction in which graph edges will be followed;
can be either ``forward`` or ``backward``, default to ``forward``
:query string return_types: types of nodes we want to be displayed; default to ``"*"``
:statuscode 200: success
:statuscode 400: invalid query string provided
:statuscode 404: starting node cannot be found
**Example:**
.. sourcecode:: http
HTTP/1.1 200 OK
swh:1:rev:f39d7d78b70e0f39facb1e4fab77ad3df5c52a35
swh:1:rev:52c90f2d32bfa7d6eccd66a56c44ace1f78fbadd
swh:1:rev:cea92e843e40452c08ba313abc39f59efbb4c29c
swh:1:rev:8d517bdfb57154b8a11d7f1682ecc0f79abf8e02
...
.. http:get:: /graph/randomwalk/:src/:dst
Performs a graph *random* traversal, i.e., picking one random successor
node at each hop, from source to destination (final destination node
included).
:param string src: starting node specified as a SWHID
:param string dst: destination node, either as a node SWHID or a node type.
The traversal will stop at the first node encountered matching the
desired destination.
:query string edges: edges types the traversal can follow; default to
``"*"``
:query string direction: direction in which graph edges will be followed;
can be either ``forward`` or ``backward``, default to ``forward``
:query int limit: limit the number of nodes returned. You can use positive
numbers to get the first N results, or negative numbers to get the last
N results starting from the tail;
default to ``0``, meaning no limit.
:query integer max_edges: how many edges can be traversed during the visit;
default to 0 (not restricted)
:query string return_types: only return the nodes matching this type;
default to ``"*"``
:statuscode 200: success
:statuscode 400: invalid query string provided
:statuscode 404: starting node cannot be found
**Example:**
.. sourcecode:: http
GET /graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward HTTP/1.1
Content-Type: text/plain
Transfer-Encoding: chunked
.. sourcecode:: http
HTTP/1.1 200 OK
swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2
swh:1:dir:8de8a8823a0780524529c94464ee6ef60b98e2ed
swh:1:dir:7146ea6cbd5ffbfec58cc8df5e0552da45e69cb7
swh:1:rev:b12563e00026b48b817fd3532fc3df2db2a0f460
swh:1:rev:13e8ebe80fb878bade776131e738d5772aa0ad1b
swh:1:rev:cb39b849f167c70c1f86d4356f02d1285d49ee13
...
swh:1:rev:ff70949f336593d6c59b18e4989edf24d7f0f254
swh:1:snp:a511810642b7795e725033febdd82075064ed863
swh:1:ori:98aa0e71f5c789b12673717a97f6e9fa20aa1161
**Limit example:**
.. sourcecode:: http
GET /graph/randomwalk/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2/ori?direction=backward&limit=-2 HTTP/1.1
Content-Type: text/plain
Transfer-Encoding: chunked
.. sourcecode:: http
HTTP/1.1 200 OK
swh:1:ori:98aa0e71f5c789b12673717a97f6e9fa20aa1161
swh:1:snp:a511810642b7795e725033febdd82075064ed863
Visit
------
+~~~~~
.. http:get:: /graph/visit/nodes/:src
.. http:get:: /graph/visit/edges/:src
.. http:get:: /graph/visit/paths/:src
Performs a graph traversal and returns explored nodes, edges or paths (in
the order of the traversal).
:param string src: starting node specified as a SWHID
:query string edges: edges types the traversal can follow; default to
``"*"``
:query integer max_edges: how many edges can be traversed during the visit;
default to 0 (not restricted)
:query string return_types: only return the nodes matching this type;
default to ``"*"``
:statuscode 200: success
:statuscode 400: invalid query string provided
:statuscode 404: starting node cannot be found
**Example:**
.. sourcecode:: http
GET /graph/visit/nodes/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc HTTP/1.1
Content-Type: text/plain
Transfer-Encoding: chunked
.. sourcecode:: http
HTTP/1.1 200 OK
swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc
swh:1:rev:cfab784723a6c2d33468c9ed8a566fd5e2abd8c9
swh:1:rev:53e5df0e7a6b7bd4919074c081a173655c0da164
swh:1:rev:f85647f14b8243532283eff3e08f4ee96c35945f
swh:1:rev:fe5f9ef854715fc59b9ec22f9878f11498cfcdbf
swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb
swh:1:cnt:c8cece50beae7a954f4ea27e3ae7bf941dc6d0c0
swh:1:dir:a358d0cf89821227d4c00b0ced5e0a8b3756b5db
swh:1:cnt:cc407b7e24dd300d2e1a77d8f04af89b3f962a51
swh:1:cnt:701bd0a63e11b3390a547ce8515d28c6bab8a201
...
**Example:**
.. sourcecode:: http
GET /graph/visit/edges/swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc HTTP/1.1
Content-Type: text/plain
Transfer-Encoding: chunked
.. sourcecode:: http
HTTP/1.1 200 OK
swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:61f92a7db95f5a6d1fcb94d2b897ed3797584d7b
swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:00e81c89c29ff3e58745fdaf7abb68daa1389e85
swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:7596fdc31c9aa00aed281ccb026a74cabf2383bb
swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:ec7a2341ac3d9d8b571bbdfb90a089d4e54dea56
swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:1c5b5eac61eda2454034a43eb124ab490885ef3a
swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:4dfa88ca55e04e8afe05e8543ddddee32dde7236
swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:d56ae79e43ff1b37534370911c8a78ec7f38d437
swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:19ba5d6203a040a39ecc4a77b165d3f097c1e662
swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:9c56102eefea23c95405533e1de23da4b873ecc4
swh:1:snp:40f9f177b8ab0b7b3d70ee14bbc8b214e2b2dcfc swh:1:rev:3f54e816b46c2e179cd164e17fea93b3013a9db4
...
**Example:**
.. sourcecode:: http
GET /graph/visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb HTTP/1.1
Content-Type: application/x-ndjson
Transfer-Encoding: chunked
.. sourcecode:: http
HTTP/1.1 200 OK
["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:cnt:acfb7cabd63b368a03a9df87670ece1488c8bce0"]
["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:cnt:2a0837708151d76edf28fdbb90dc3eabc676cff3"]
["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:cnt:eaf025ad54b94b2fdda26af75594cfae3491ec75"]
...
["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb", "swh:1:dir:2ebd4b96fa5665ff74f2b27ae41aecdc43af4463", "swh:1:cnt:1d3b6575fb7bf2a147d228e78ffd77ea193c3639"]
...
Counting results
-----------------
+~~~~~~~~~~~~~~~~
The following method variants, with a trailing ``/count`` added, behave like
their counterparts described above but, instead of returning the results
themselves, return the *number* of results that would have been returned:
.. http:get:: /graph/leaves/count/:src
Return the number of :http:get:`/graph/leaves/:src` results
.. http:get:: /graph/neighbors/count/:src
Return the number of :http:get:`/graph/neighbors/:src` results
.. http:get:: /graph/visit/nodes/count/:src
Return the number of :http:get:`/graph/visit/nodes/:src` results
Stats
------
+~~~~~
.. http:get:: /graph/stats
Returns statistics on the compressed graph.
:statuscode 200: success
**Example**
.. sourcecode:: http
GET /graph/stats HTTP/1.1
Content-Type: application/json
.. sourcecode:: http
HTTP/1.1 200 OK
{
"counts": {
"nodes": 16222788,
"edges": 9907464
},
"ratios": {
"compression": 0.367,
"bits_per_node": 5.846,
"bits_per_edge": 9.573,
"avg_locality": 270.369
},
"indegree": {
"min": 0,
"max": 12382,
"avg": 0.6107127825377487
},
"outdegree": {
"min": 0,
"max": 1,
"avg": 0.6107127825377487
}
}
+
+
+Use-case examples
+-----------------
+
+This section showcases how to leverage the endpoints of the HTTP API described
+above for some common use-cases.
+
+
+Browsing
+~~~~~~~~
+
+The following use cases require traversing the *forward graph*.
+
+- **ls**: given a directory node, list (non recursively) all linked nodes of
+ type directory and content
+
+ Endpoint::
+
+ /graph/neighbors/:DIR_ID?edges=dir:cnt,dir:dir
+
+- **ls -R**: given a directory node, recursively list all linked nodes of type
+ directory and content
+
+ Endpoint::
+
+ /graph/visit/paths/:DIR_ID?edges=dir:cnt,dir:dir
+
+- **git log**: given a revision node, recursively list all linked nodes of type
+ revision
+
+ Endpoint::
+
+ /graph/visit/nodes/:REV_ID?edges=rev:rev
+
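+For instance, the *ls* query above can be issued with any HTTP client, such as
+``curl``. This is only a sketch: the server address is a placeholder for your
+own deployment, and the directory SWHID is taken from the small test dataset
+shipped with ``swh-graph``, in which that directory holds exactly two
+contents::
+
+ $ curl "http://localhost:5009/graph/neighbors/swh:1:dir:0000000000000000000000000000000000000006?edges=dir:cnt,dir:dir"
+ swh:1:cnt:0000000000000000000000000000000000000005
+ swh:1:cnt:0000000000000000000000000000000000000004
+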
+
+Vault
+~~~~~
+
+The following use cases require traversing the *forward graph*.
+
+- **tarball** (same as *ls -R* above)
+
+- **git bundle**: given a node, recursively list all linked nodes of any kind
+
+ Endpoint::
+
+ /graph/visit/nodes/:NODE_ID?edges=*
+
+
+Provenance
+~~~~~~~~~~
+
+The following use cases require traversing the *backward (transposed)
+graph*.
+
+- **commit provenance**: given a content or directory node, return *a* commit
+ whose directory (recursively) contains it
+
+ Endpoint::
+
+ /graph/walk/:NODE_ID/rev?direction=backward&edges=dir:dir,cnt:dir,dir:rev
+
+- **complete commit provenance**: given a content or directory node, return
+ *all* commits whose directory (recursively) contains it
+
+ Endpoint::
+
+ /graph/leaves/:NODE_ID?direction=backward&edges=dir:dir,cnt:dir,dir:rev
+
+- **origin provenance**: given a content, directory, or commit node, return
+ *an* origin that has at least one snapshot that (recursively) contains it
+
+ Endpoint::
+
+ /graph/walk/:NODE_ID/ori?direction=backward&edges=*
+
+- **complete origin provenance**: given a content, directory, or commit node,
+ return *all* origins that have at least one snapshot that (recursively)
+ contains it
+
+ Endpoint::
+
+ /graph/leaves/:NODE_ID?direction=backward&edges=*
+
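+For instance, the *complete commit provenance* query above can be issued as
+follows (a sketch: the server address is a placeholder, and the content SWHID
+is the GPL-3 license file used in the examples earlier on this page). Each
+line of the response is the SWHID of one revision whose root directory
+(recursively) contains that content::
+
+ $ curl "http://localhost:5009/graph/leaves/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2?direction=backward&edges=dir:dir,cnt:dir,dir:rev"
+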
+
+Provenance statistics
+~~~~~~~~~~~~~~~~~~~~~
+
+The following use cases require traversing the *backward (transposed)
+graph*.
+
+- **content popularity across commits**: count the number of commits (or
+ *commit popularity*) that link to a directory that (recursively) includes a
+ given content.
+
+ Endpoint::
+
+ /graph/leaves/count/:NODE_ID?direction=backward&edges=cnt:dir,dir:dir,dir:rev
+
+- **commit popularity across origins**: count the number of origins (or *origin
+ popularity*) that have a snapshot that (recursively) includes a given commit.
+
+ Endpoint::
+
+ /graph/leaves/count/:NODE_ID?direction=backward&edges=*
+
+The following use cases require traversing the *forward graph*.
+
+- **revision size** (as n. of contents) distribution: the number of contents
+ that are (recursively) reachable from a given revision.
+
+ Endpoint::
+
+ /graph/leaves/count/:NODE_ID?edges=*
+
+- **origin size** (as n. of revisions) distribution: count the number of
+ revisions that are (recursively) reachable from a given origin.
+
+ Endpoint::
+
+ /graph/leaves/count/:NODE_ID?edges=ori:snp,snp:rel,snp:rev,rel:rev,rev:rev
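+
+As a concrete sketch (placeholder server address, and the GPL-3 content SWHID
+used earlier), the *content popularity across commits* count above is obtained
+with a single request, whose response body is a single decimal number::
+
+ $ curl "http://localhost:5009/graph/leaves/count/swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2?direction=backward&edges=cnt:dir,dir:dir,dir:rev"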
diff --git a/docs/compression.rst b/docs/compression.rst
index edca8a7..bfd6c9e 100644
--- a/docs/compression.rst
+++ b/docs/compression.rst
@@ -1,125 +1,611 @@
.. _graph-compression:
+=================
Graph compression
=================
-The compression process is a pipeline implemented for the most part on top of
-the `WebGraph framework `_ and ecosystem
-libraries. The compression pipeline consists of the following steps:
+The compression pipeline is implemented on top of the `WebGraph framework
+`_. It takes an ORC Graph Dataset as an input,
+such as the ones found in the :ref:`Graph Dataset List `,
+and generates a compressed graph suitable for high intensity analyses on
+large servers.
-.. figure:: images/compression_steps.png
- :align: center
- :alt: Compression steps
- Compression steps
+Running the compression pipeline
+================================
-Each of these steps is briefly described below. For more details see the
-following paper:
+Dependencies
+------------
+
+To compress a graph, you will need to install the ``swh.graph`` tool as well as
+a recent JRE, as described in the :ref:`swh-graph-quickstart` page.
+
+You will also need the zstd_ compression tool::
+
+ $ sudo apt install zstd
+
+.. _zstd: https://facebook.github.io/zstd/
+
+
+Hardware Requirements
+---------------------
+
+The compression pipeline is even more demanding than the graph server in terms
+of hardware requirements, especially RAM. Notably, the BFS compression step
+loads a graph compressed in random order in memory, which is usually more than
+a TiB for the full graph. While it is possible to do this step with a memory
+mapping, our experiments show that this could take a very long time (several
+months) on hard drives.
+
+The LLP compression step requires 13 bytes of RAM per node, which could amount
+to storing hundreds of gigabytes in RAM in addition to loading the graph
+itself.
+
+Some steps also involve sorting the entire set of edges and their labels,
+using large on-disk buffer files that can sometimes reach the size of the
+input dataset itself.
+
+The machine we used to compress the entire graph (dataset version 2022-04-25)
+has the following hardware specs:
+
+- 2 TiB of RAM (DDR4 ECC 2400Mhz)
+- 64 vCPUs (Dual AMD EPYC 7302 16-Core)
+- 24 TiB of SSD (NVMe)
+
+The server we rented is from the
+`HGR-HCI-4 `_
+series from OVH.
+
+
+Input dataset
+-------------
+
+First, you need to retrieve a graph to compress, in ORC format. The :ref:`Graph
+Dataset List ` has a list of datasets made available by the
+Software Heritage archive, including "teaser" subdatasets which have a more
+manageable size and are thus very useful for prototyping with fewer hardware
+resources.
+
+The datasets can be retrieved from S3 or the annex, in a similar fashion to
+what is described in :ref:`swh-graph-retrieving-compressed`, by simply
+replacing "compressed" by "orc":
+
+.. code:: console
+
+ (venv) $ mkdir -p 2021-03-23-popular-3k-python/orc
+ (venv) $ cd 2021-03-23-popular-3k-python/
+ (venv) $ aws s3 cp --recursive s3://softwareheritage/graph/2021-03-23-popular-3k-python/orc/ orc
+
+Alternatively, any custom ORC dataset can be used as long as it respects
+:ref:`the schema ` of the Software Heritage Graph Dataset.
+
+**Note:** for testing purposes, a fake test dataset is available in the
+``swh-graph`` repository, with just a few dozen nodes. The ORC tables are
+available in ``swh-graph/swh/graph/tests/dataset/orc/``.
+
+
+Compression
+-----------
+
+You can compress your dataset by using the ``swh graph compress`` command. It
+will run all the various steps of the pipeline in the right order.
+
+.. code:: console
+
+
+ (venv) $ swh graph compress --input-dataset orc/ --outdir compressed/
+ [...]
+ (venv) $ ls compressed/
+ graph.edges.count.txt
+ graph.edges.stats.txt
+ graph.graph
+ graph.indegree
+ graph-labelled.labeloffsets
+ graph-labelled.labels
+ [...]
+ graph-transposed.obl
+ graph-transposed.offsets
+ graph-transposed.properties
-.. note::
- Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli.
- `Ultra-Large-Scale Repository Analysis via Graph Compression
- `_. In
- proceedings of `SANER 2020 `_: The 27th IEEE
- International Conference on Software Analysis, Evolution and
- Reengineering. IEEE 2020.
+(The purpose of each of these files is detailed in the
+:ref:`swh-graph-java-api` page.)
- Links: `preprint
- `_,
- `bibtex
- `_.
+For sufficiently large graphs, this command can take several weeks to
+complete. It is highly recommended to run it in a systemd service or in a
+tmux session.
-In order to practically perform graph compression, install the ``swh.graph``
-module and use the ``swh graph compress`` command line interface of the
-compression driver, that will conduct the various steps in the right order.
-See ``swh graph compress --help`` for usage details.
+It is also possible to run single steps or step ranges from the CLI:
+.. code:: bash
-1. MPH
+ swh graph compress -i orc/ -o compressed/ --steps mph-bfs
+
+See ``swh graph compress --help`` for syntax and usage details.
+
+
+Compression steps
+=================
+
+The compression pipeline consists of the following steps:
+
+.. figure:: images/compression_steps.png
+ :align: center
+ :alt: Compression steps
+ :scale: 20%
+
+ Compression steps
+
+Each of these steps is briefly described below. For more details see the
+original Software Heritage graph compression paper [SWHGraphCompression2020]_,
+as well as chapters 9 and 10 of Antoine Pietri's PhD thesis
+[PietriThesis2021]_.
+
+.. [SWHGraphCompression2020]
+ | Paolo Boldi, Antoine Pietri, Sebastiano Vigna, Stefano Zacchiroli.
+ | `Ultra-Large-Scale Repository Analysis via Graph Compression
+ `_.
+ | In proceedings of `SANER 2020 `_: The 27th
+ IEEE International Conference on Software Analysis, Evolution and
+ Reengineering. IEEE 2020.
+ | Links: `preprint
+ `_,
+ `bibtex
+ `_.
+
+
+
+.. [PietriThesis2021]
+ | Antoine Pietri
+ | `Organizing the graph of public software development for large-scale mining
+ `_.
+ | Doctoral dissertation. Inria, 2021.
+
+
+1. EXTRACT_NODES
+----------------
+
+This step reads a graph dataset and extracts all the unique node SWHIDs it
+contains, including the ones that are not stored as actual objects in the
+graph, but only *referred to* by the edges. Additionally, it extracts the set
+of all unique edge labels in the graph.
+
+**Rationale:** Because the graph can contain holes, loose objects and dangling
+objects, some nodes that are referred to as destinations in the edge
+relationships might not actually be stored in the graph itself. However, to
+compress the graph using a graph compression library, it is necessary to have a
+list of *all* the nodes in the graph, including the ones that are simply
+referred to by the edges but not actually stored as concrete objects.
+
+This step reads the entire graph dataset, and uses ``sort -u`` to extract the
+set of all the unique nodes and unique labels that will be needed as an input
+for the compression process. It also writes object count statistics in various
+files:
+
+- The set of nodes is written in ``graph.nodes.csv.zst``, as a zst-compressed
+ sorted list of SWHIDs, one per line.
+- The set of edge labels is written in ``graph.labels.csv.zst``, as a
+ zst-compressed sorted list of labels encoded in base64, one per line.
+- The number of unique nodes referred to in the graph is written in a text
+ file, ``graph.nodes.count.txt``
+- The number of unique edges referred to in the graph is written in a text
+ file, ``graph.edges.count.txt``
+- The number of unique edge labels is written in a text file,
+ ``graph.labels.count.txt``
+- Statistics on the number of nodes of each type are written in a text file,
+ ``graph.nodes.stats.txt``
+- Statistics on the number of edges of each type are written in a text file,
+ ``graph.edges.stats.txt``
+
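+Right after this step (and before the final cleanup step removes intermediate
+outputs), these files can be inspected for a quick sanity check. A sketch,
+assuming the ``compressed/`` output directory used above and the tiny test
+dataset; the SWHID and count shown are illustrative:
+
+.. code:: console
+
+ (venv) $ zstdcat compressed/graph.nodes.csv.zst | head -n 1
+ swh:1:cnt:0000000000000000000000000000000000000001
+ (venv) $ cat compressed/graph.nodes.count.txt
+ 21
+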
+
+2. MPH
------
-A node in the Software Heritage :ref:`data model ` is identified
-using its SWHID (see :ref:`persistent identifiers
-`). However, WebGraph internally uses integers to refer
-to node ids.
+As discussed in :ref:`swh-graph-java-basics`, a node in the Software Heritage
+:ref:`data model ` is identified by its SWHID (see :ref:`persistent
+identifiers `), but WebGraph internally uses integers
+to refer to node ids.
-Mapping between the strings and longs ids is needed before compressing the
-graph. From the `Sux4J `_ utility tool, we use the
+To create a mapping between integer node IDs and SWHIDs, we use the
`GOVMinimalPerfectHashFunction
`_
-class, mapping with no collisions N keys to N consecutive integers.
+class of the `Sux4J `_ library, which maps N keys to N
+consecutive integers.
+
+We run this function on the list of SWHIDs stored in the
+``graph.nodes.csv.zst`` file generated in the previous step.
+This allows us to generate a bijection from the set of all the *n* SWHIDs in the
+graph to the set of integers :math:`[0, n - 1]`.
-The step produces a ``.mph`` file (MPH stands for *Minimal Perfect
-Hash-function*) storing the hash function taking as input a string and returning
-a unique integer.
+The step produces a ``graph.mph`` file (MPH stands for *Minimal Perfect
+Hash-function*), containing a function which takes a SWHID (as a bytestring)
+and returns its associated node ID.
-2. BV compress
+3. BV compress
--------------
-This is the first actual compression step, building a compressed version of the
-input graph using WebGraph techniques presented in the framework paper. We use
-the `ScatteredArcsASCIIGraph
+This is the first actual compression step, where we build a compressed version
+of the input graph dataset.
+
+We use a ScatteredArcsORCGraph to load the dataset
+(implementation inspired by the `ScatteredArcsASCIIGraph
`_
-class, from WebGraph.
+class in WebGraph).
+This class wraps the ORC Graph dataset and exposes a *virtual* ImmutableGraph,
+whose nodes and edges can be iterated sequentially as if it was any other
+standard graph. To do so, it puts all the edges in batches and sorts them in an
+aggressively parallel fashion, then stores them as ``.bitstream`` files, and
+returns a `BatchGraph
+`
+created from these batches.
+
+Finally, it uses the ``BVGraph.store()`` method, which compresses the input
+graph as a `BVGraph
+`_,
+using the compression techniques described in the article *The WebGraph
+Framework I: Compression Techniques* cited above.
The resulting BV graph is stored as a set of files:
-- ``.graph``: the compressed graph in the BV format
-- ``.offsets``: offsets values to read the bit stream graph file
-- ``.obl``: offsets cache to load the graph faster
-- ``.properties``: entries used to correctly decode graph and offset files
+- ``graph-base.graph``: the compressed graph in the BV format
+- ``graph-base.offsets``: offsets values to read the bit stream graph file
+- ``graph-base.properties``: entries used to correctly decode graph and offset
+ files
-3. BFS
--------
+4. BFS
+------
+
+In [LLP]_, the paper authors empirically demonstrate that a high graph
+compression ratio can be achieved for the graph of the Web by ordering nodes
+such that vertices from the same host are close to each other.
-In the LLP paper, authors propose an empirical analysis linking node ordering
-and high compression ratio: it is important to use an ordering of nodes ids such
-that vertices from the same host are close to one another.
+In Software Heritage, there is no notion of "host" that can be used to generate
+these compression-friendly orderings, because the identifiers are just
+uniformly random cryptographic hashes. However, we can generate these orderings
+by running algorithms to inform us on which nodes are close to each other.
-Building on this insight, the previous compression results in the BV compress
-step are improved by re-ordering nodes ids using a BFS traversal order. We use
-the `BFS
+In this step, we run a BFS traversal on the entire graph to get a more
+compression-friendly ordering of nodes. We use the `BFS
`_
class from the `LAW `_ library.
-The resulting ordering is stored in the ``.order`` file, listing nodes ids in
-order of traversal.
+The resulting ordering is stored in a ``graph-bfs.order`` file, which contains
+all the node IDs in the order of traversal.
-4. Permute
-----------
+5. PERMUTE_BFS
+--------------
-Once the order is computed (BFS or another ordering technique), the final
-compressed graph is created based on the initial BV compress result, and using
-the new node order mapping. The permutation uses the `Transform
+Once the BFS order is computed, we permute the initial "base" graph using
+this new ordering. The permutation uses the `Transform
`_
class from WebGraph framework.
-The final compressed graph is only stored in the resulting ``.graph``,
-``.offsets``, ``.obl``, and ``.properties`` files.
+The BFS-compressed graph is stored in the files
+``graph-bfs.{graph,offsets,properties}``.
+6. TRANSPOSE_BFS
+----------------
-5. Stats
---------
+We transpose the BFS-compressed graph, using the `Transform
+`_
+class from WebGraph.
+This step is a prerequisite for LLP compression.
+
+7. SIMPLIFY
+-----------
+
+This step creates a loopless and symmetric version of the BFS-compressed graph,
+using the `Transform
+`_
+class from WebGraph.
+This step is a prerequisite for LLP compression.
+
+8. LLP
+------
+
+Better compression ratios can be achieved by using the Layered Label
+Propagation (LLP) algorithm to reorder nodes. This algorithm is described in
+[LLP]_.
+The LLP algorithm finds locality-preserving orders by clustering together nodes
+in close proximity. Similar to the BFS, this algorithm is particularly
+interesting for our use case as it is unsupervised, and does not rely on prior
+information on the clusters present in the graph. The idea behind the
+clustering algorithm is to randomly distribute communities to the nodes in the
+graph, then iteratively assign to each node the community most represented in
+its neighbors.
+
+.. [LLP] Paolo Boldi, Marco Rosa, Massimo Santini, Sebastiano Vigna.
+ *Layered label propagation: a multiresolution coordinate-free ordering for compressing social networks.*
+ WWW 2011: 587-596
+ DOI: https://doi.org/10.1145/1963405.1963488
+ preprint: https://arxiv.org/abs/1011.5425
+
+LLP is more costly than simple BFS-based compression in both time and memory.
+Even though the algorithm has a linear time complexity, it does multiple
+iterations on the graph and is significantly slower than the BFS which is just
+one single traversal. Moreover, keeping track of the communities requires a
+total of 13 bytes per node, which increases the RAM requirements.
+Because of these constraints, it is unrealistic to run the LLP algorithm on the
+uncompressed version of the graph; this is why we do an intermediate
+compression with the BFS ordering first, then compress the entire graph *again*
+with an even better ordering.
+
+The LLP algorithm takes a simplified (loopless, symmetric) graph as an input,
+which we already computed in the previous steps.
+
+The algorithm is also parameterized by a list of γ values, a "resolution" parameter
+which defines the shapes of the clustering it produces: either small, but
+denser pieces, or larger, but unavoidably sparser pieces. The algorithm then
+combines the different clusterings together to generate the output reordering.
+γ values are given to the algorithm in the form :math:`\frac{j}{2^k}`; by
+default, 12 different values of γ are used. However, the combination procedure
+is very slow, and using that many γ values could take several months in our
+case.
+We thus narrowed down a smaller set of γ values that empirically give good
+compression results, which are used by default in the pipeline. In general,
+smaller values of γ seem to generate better compression ratios. The effect of a
+given γ is that the density of the sparsest cluster is at least :math:`\frac{\gamma}{\gamma + 1}`, so large
+γ values imply small, more dense clusters. It is reasonable to assume that
+since the graph is very sparse to start with, such clusters are not that
+useful.
+
+The resulting ordering is stored in a ``graph-llp.order`` file.
+
+9. PERMUTE_LLP
+--------------
+
+Once the LLP order is computed, we permute the BFS-compressed graph using
+this new ordering. The LLP-compressed graph, which is our final compressed
+graph, is stored in the files ``graph.{graph,offsets,properties}``.
+
+10. OBL
+-------
+
+Cache the BVGraph offsets of the forward graph to make loading faster. The
+resulting offset big list is stored in the ``graph.obl`` file.
+
+11. COMPOSE_ORDERS
+------------------
+
+To be able to translate the initial MPH inputs to their resulting rank in the
+LLP-compressed graph, we need to use the two order permutations successively:
+the base → BFS permutation, then the BFS → LLP permutation.
+
+To make this less wasteful, we *compose* the two permutations into a single
+one. We use the `composePermutationsInPlace
+`_
+function of the dsiutils library. The resulting permutation is stored as a
+``graph.order`` file. Hashing a SWHID with the ``graph.mph`` function, then
+permuting the result using the ``graph.order`` permutation yields the integer
+node ID matching the input SWHID in the graph.
-Compute various statistics on the final compressed graph:
+12. STATS
+---------
-- ``.stats``: entries such as number of nodes, edges, avg/min/max degree,
+This step computes various statistics on the compressed graph:
+
+- ``.stats``: statistics such as number of nodes, edges, avg/min/max degree,
average locality, etc.
- ``.indegree``: graph indegree distribution
- ``.outdegree``: graph outdegree distribution
This step uses the `Stats
`_
class from WebGraph.
-6. Transpose
-------------
+13. TRANSPOSE
+-------------
-Create a transposed graph to allow backward traversal, using the `Transform
+Transpose the graph to allow backward traversal, using the `Transform
`_
-class from WebGraph.
+class from WebGraph. The resulting transposed graph is stored as the
+``graph-transposed.{graph,offsets,properties}`` files.
+
+
+14. TRANSPOSE_OBL
+-----------------
+
+Same as OBL, but for the transposed graph. The resulting offset big list is
+stored in the ``graph-transposed.obl`` file.
+
+
+15. MAPS
+--------
+
+This step generates the *node mappings* described in
+:ref:`swh-graph-java-node-mappings`. In particular, it generates:
+
+- ``graph.node2swhid.bin``: a compact binary representation of all the
+ SWHIDs in the graph, ordered by their rank in the graph file.
+- ``graph.node2type.bin``: a `LongBigArrayBitVector
+ `_
+ which stores the type of each node.
+
+It does so by reading all the SWHIDs in the ``graph.nodes.csv.zst`` file generated in the
+EXTRACT_NODES step, then getting their corresponding node IDs (using the
+``.mph`` and ``.order`` files), then sorting all the SWHIDs according to
+their node ID. It then writes these SWHIDs in order, in a compact but seekable
+binary format, which can be used to return the SWHID corresponding to any given
+node in O(1).
+
+
+16. EXTRACT_PERSONS
+-------------------
+
+This step reads the ORC graph dataset and extracts all the unique persons it
+contains. Here, "persons" are defined as the set of unique pairs of name +
+email, potentially pseudonymized, found either as revision authors, revision
+committers or release authors.
+
+The ExtractPersons class reads all the persons from revision and release
+tables, then uses ``sort -u`` to get a sorted list without any duplicates. The
+resulting sorted list of authors is stored in the ``graph.persons.csv.zst``
+file.
+
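+As with the node list, this file can be checked quickly from the command line.
+A sketch, assuming the same output directory as above; the number of lines is
+the number of distinct persons, i.e., the *n* used by the MPH_PERSONS step
+below:
+
+.. code:: console
+
+ (venv) $ zstdcat compressed/graph.persons.csv.zst | wc -l
+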
+
+17. MPH_PERSONS
+---------------
+
+This step computes a Minimal Perfect Hash function on the set of all the unique
+persons extracted in the EXTRACT_PERSONS step. Each individual person is mapped
+to a unique integer in :math:`[0, n-1]` where *n* is the total number of
+persons. The resulting function is serialized and stored in the
+``graph.persons.mph`` file.
+
+
+18. NODE_PROPERTIES
+-------------------
+
+This step generates the *node property files*, as described in
+:ref:`swh-graph-java-node-properties`.
+The nodes in the Software Heritage Graph each have associated *properties*
+(e.g., commit timestamps, authors, messages, ...). The values of these
+properties for each node in the graph are compressed and stored in files
+alongside the compressed graph.
+
+The WriteNodeProperties class reads all the properties from the ORC Graph
+Dataset, then serializes each of them in a representation suitable for
+efficient random access (e.g., large binary arrays) and stores them on disk.
+
+For persons (authors, committers etc), the MPH computed in the MPH_PERSONS step
+is used to store them as a single pseudonymized integer ID, which uniquely
+represents a full name + email.
+
+The results are stored in the following list of files:
+
+- ``graph.property.author_id.bin``
+- ``graph.property.author_timestamp.bin``
+- ``graph.property.author_timestamp_offset.bin``
+- ``graph.property.committer_id.bin``
+- ``graph.property.committer_timestamp.bin``
+- ``graph.property.committer_timestamp_offset.bin``
+- ``graph.property.content.is_skipped.bin``
+- ``graph.property.content.length.bin``
+- ``graph.property.message.bin``
+- ``graph.property.message.offset.bin``
+- ``graph.property.tag_name.bin``
+- ``graph.property.tag_name.offset.bin``
+
+
+19. MPH_LABELS
+--------------
+
+This step computes a **monotone** Minimal Perfect Hash function on the set of
+all the unique *arc label names* extracted in the EXTRACT_NODES step. Each
+individual arc label name (i.e., directory entry names and snapshot branch
+names) is monotonely mapped to a unique integer in :math:`[0, n-1]`, where *n*
+is the total number of unique arc label names, which corresponds to their
+**lexical rank** in the set of all arc labels.
+
+In other words, this MPH being monotone means that the hash of the *k*-th item
+in the sorted input list of arc labels will always be *k*.
+We use the `LcpMonotoneMinimalPerfectHashFunction
+`_
+of Sux4J to generate this function.
+
+The rationale for using a monotone function here is that it will allow us to
+quickly get back the arc label from its hash without having to store an
+additional permutation.
+The resulting MPH function is serialized and stored in the ``graph.labels.mph``
+file.
+
+
+20. FCL_LABELS
+--------------
+
+This step computes a *reverse-mapping* for arc labels, i.e., a way to
+efficiently get the arc label name from its hash computed with the monotone MPH
+of the MPH_LABELS step.
+
+Thanks to the MPH being monotone, this boils down to storing all the labels in
+lexicographic order in a string list format that allows O(1) access to its
+elements. For this purpose, we use the `MappedFrontCodedStringBigList
+`_
+class from the dsiutils library, using the ``graph.labels.csv.zst`` file as its
+input. It stores the label names in a compact way by using front-coding
+compression, which is particularly efficient here because the strings are
+already in lexicographic order. The resulting FCL files are stored as
+``graph.labels.fcl.*``, and they can be loaded using memory mapping.
+
+
+21. EDGE_LABELS
+---------------
+
+
+This step generates the *edge property files*, as described in
+:ref:`swh-graph-java-edge-properties`. These files allow us to get the *edge
+labels* as we iterate on the edges of the graph. The files essentially contain
+compressed sorted triplets of the form (source, destination, label), with
+additional offsets to allow random access.
+
+To generate these files, the LabelMapBuilder class starts by reading in
+parallel the labelled edges in the ORC dataset, which can be thought of as
+quadruplets containing the source SWHID, the destination SWHID, the label name
+and the entry permission if applicable:
+
+.. code-block:: text
+
+ swh:1:snp:4548a5… swh:1:rev:0d6834… cmVmcy9oZWFkcy9tYXN0ZXI=
+ swh:1:dir:05faa1… swh:1:cnt:a35136… dGVzdC5j 33188
+ swh:1:dir:05faa1… swh:1:dir:d0ff82… dGVzdA== 16384
+ ...
+
+Using the ``graph.mph`` and the ``graph.order`` files, we hash and permute the
+source and destination nodes. We also monotonically hash the labels using the
+``graph.labels.mph`` function to obtain the arc label identifiers. The
+permissions are normalized as one of the 6 possible values in the
+``DirEntry.Permission.Type`` enum, and are then stored in the 3 lowest bits of
+the label field.
+
+.. code-block:: text
+
+ 4421 14773 154
+ 1877 21441 1134
+ 1877 14143 1141
+ ...
+
+These hashed edges and their compact-form labels are then put in large batches
+sorted in an aggressively parallel fashion, which are then stored as
+``.bitstream`` files. These batch files are put in a heap structure to perform
+a merge sort on the fly on all the batches.
+
+Then, the LabelMapBuilder loads the graph and starts iterating on its edges. It
+synchronizes the stream of edges read from the graph with the stream of sorted
+edges and labels read from the bitstreams in the heap. At this point, it writes
+the labels to the following output files:
+
+- ``graph-labelled.properties``: a property file describing the graph, notably
+ containing the basename of the wrapped graph.
+- ``graph-labelled.labels``: the compressed labels
+- ``graph-labelled.labeloffsets``: the offsets used to access the labels in
+ random order.
+
+It then does the same with backward edge batches to get the transposed
+equivalent of these files:
+``graph-transposed-labelled.{properties,labels,labeloffsets}``.
+
+
+22. EDGE_LABELS_OBL
+-------------------
+
+Cache the label offsets of the forward labelled graph to make loading faster.
+The resulting label offset big list is stored in the
+``graph-labelled.labelobl`` file.
+
+
+23. EDGE_LABELS_TRANSPOSE_OBL
+-----------------------------
+
+Same as EDGE_LABELS_OBL, but for the transposed labelled graph.
+The resulting label offset big list is stored in the
+``graph-transposed-labelled.labelobl`` file.
+
+
+24. CLEAN_TMP
+-------------
+
+This step reclaims space by deleting the temporary directory, as well as all
+the intermediate outputs that are no longer necessary now that the final graph
+has been compressed (shown in gray in the step diagram).
diff --git a/docs/grpc-api.rst b/docs/grpc-api.rst
new file mode 100644
index 0000000..70c197d
--- /dev/null
+++ b/docs/grpc-api.rst
@@ -0,0 +1,556 @@
+.. _swh-graph-grpc-api:
+
+==================
+Using the GRPC API
+==================
+
+The GRPC API is the core API used to query the graph remotely. It uses the
+`GRPC framework `_ to provide high-performance graph
+traversal methods with server streaming.
+
+It is more expressive than the :ref:`HTTP API ` (which itself
+uses the GRPC API under the hood to serve queries). However, it can only be
+used internally or with a local setup, and is never exposed publicly.
+
+Its major features include: returning node and edge properties, performing BFS
+traversals, including traversals with more than one starting node, finding
+shortest paths, common ancestors, etc.
+
+Quickstart
+==========
+
+Starting the server
+-------------------
+
+The GRPC server is automatically started on port 50091 when the HTTP server
+is started with ``swh graph rpc-serve``. It can also be started directly with
+Java, instead of going through the Python layer, by using the fat-jar shipped
+with swh-graph:
+
+.. code-block:: console
+
+ $ java -cp swh-graph-XXX.jar org.softwareheritage.graph.rpc.GraphServer
+
+(See :ref:`swh-graph-java-api` and :ref:`swh-graph-memory` for more
+information on Java process options and JVM tuning.)
+
+Running queries
+---------------
+
+The `gRPC command line tool
+`_
+can be an easy way to query the GRPC API from the command line. It is
+invoked with the ``grpc_cli`` command. Of course, it is also possible to use
+a generated RPC client in any programming language supported by GRPC.
+
+All RPC methods are defined in the service ``swh.graph.TraversalService``.
+The available endpoints can be listed with ``ls``:
+
+.. code-block:: console
+
+ $ grpc_cli ls localhost:50091 swh.graph.TraversalService
+ Traverse
+ FindPathTo
+ FindPathBetween
+ CountNodes
+ CountEdges
+ Stats
+ GetNode
+
+An RPC method can be called with the ``call`` subcommand.
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.Stats ""
+ connecting to localhost:50091
+ num_nodes: 21
+ num_edges: 23
+ compression: 1.412
+ bits_per_node: 8.524
+ [...]
+ Rpc succeeded with OK status
+
+The ``--json_output`` flag can also be used to make the results easier to
+parse.
+
+.. code-block:: console
+
+ $ grpc_cli --json_output call localhost:50091 swh.graph.TraversalService.Stats ""
+ connecting to localhost:50091
+ {
+ "numNodes": "21",
+ "numEdges": "23",
+ [...]
+ }
+ Rpc succeeded with OK status
+
+
+**Note**: grpc_cli's outputs in this document are slightly modified for
+readability's sake.
+
+Simple queries
+==============
+
+For a full documentation of all the endpoints, as well as the request and
+response messages, see :ref:`swh-graph-grpc-api-protobuf`.
+
+Querying a single node
+----------------------
+
+The **GetNode** endpoint can be used to return information on a single
+node of the graph, including all its node properties, from its SWHID. Here
+are a few examples from the test graph:
+
+Content
+~~~~~~~
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \
+ 'swhid: "swh:1:cnt:0000000000000000000000000000000000000001"'
+
+.. code-block:: javascript
+
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000001"
+ cnt {
+ length: 42
+ is_skipped: false
+ }
+
+Revision
+~~~~~~~~
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \
+ 'swhid: "swh:1:rev:0000000000000000000000000000000000000009"'
+
+.. code-block:: javascript
+
+ swhid: "swh:1:rev:0000000000000000000000000000000000000009"
+ rev {
+ author: 2
+ author_date: 1111140840
+ author_date_offset: 120
+ committer: 2
+ committer_date: 1111151950
+ committer_date_offset: 120
+ message: "Add parser"
+ }
+
+Release
+~~~~~~~
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \
+ 'swhid: "swh:1:rel:0000000000000000000000000000000000000010"'
+
+.. code-block:: javascript
+
+ swhid: "swh:1:rel:0000000000000000000000000000000000000010"
+ rel {
+ author: 0
+ author_date: 1234564290
+ author_date_offset: 120
+ message: "Version 1.0"
+ }
+
+Origin
+~~~~~~
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \
+ 'swhid: "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054"'
+
+.. code-block:: javascript
+
+ swhid: "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054"
+ ori {
+ url: "https://example.com/swh/graph"
+ }
+
+
+Checking the presence of a node
+-------------------------------
+
+The **GetNode** endpoint can also be used to check whether a node exists in
+the graph: if it does not (or if the provided SWHID is malformed), the RPC
+returns the ``INVALID_ARGUMENT`` code along with a detailed error message.
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \
+ 'swhid: "swh:1:ori:ffffffffffffffffffffffffffffffffffffffff"'
+ Rpc failed with status code 3, error message: Unknown SWHID: swh:1:ori:ffffffffffffffffffffffffffffffffffffffff
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \
+ 'swhid: "invalidswhid"'
+ Rpc failed with status code 3, error message: malformed SWHID: invalidswhid
+
+
+Selecting returned fields with FieldMask
+----------------------------------------
+
+Many endpoints, including **GetNode**, contain a ``mask`` field of type
+`FieldMask
+`_,
+which can be used to select which fields should be returned in the response.
+
+This is particularly interesting for traversal queries that return a large
+number of nodes, because property access is quite costly from the compressed
+graph (at least compared to regular node access). It is therefore recommended
+that clients systematically use FieldMasks to only request the properties that
+they will consume.
+
+A FieldMask is represented as a set of "field paths" in dotted notation. For
+instance, ``paths: ["swhid", "rev.message"]`` will only request the swhid and
+the message of a given node. An empty mask will return an empty object.
+
+Example:
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \
+ 'swhid: "swh:1:rev:0000000000000000000000000000000000000009", mask: {paths: ["swhid"]}'
+ swhid: "swh:1:rev:0000000000000000000000000000000000000009"
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.GetNode \
+ 'swhid: "swh:1:rev:0000000000000000000000000000000000000009", mask: {paths: ["swhid", "rev.message", "rev.author"]}'
+ swhid: "swh:1:rev:0000000000000000000000000000000000000009"
+ rev {
+ author: 2
+ message: "Add parser"
+ }
+
+
+Getting statistics on the graph
+-------------------------------
+
+The **Stats** endpoint returns overall statistics on the entire compressed
+graph. Most notably, the total number of nodes and edges, as well as the
+range of indegrees and outdegrees, and some compression-related statistics.
+
+.. code-block:: console
+
+ $ grpc_cli --json_output call localhost:50091 swh.graph.TraversalService.Stats ""
+
+.. code-block:: json
+
+ {
+ "numNodes": "21",
+ "numEdges": "23",
+ "compression": 1.412,
+ "bitsPerNode": 8.524,
+ "bitsPerEdge": 7.783,
+ "avgLocality": 2.522,
+ "indegreeMax": "3",
+ "indegreeAvg": 1.0952380952380953,
+ "outdegreeMax": "3",
+ "outdegreeAvg": 1.0952380952380953
+ }
+
+
+Graph traversals
+================
+
+Breadth-first traversal
+-----------------------
+
+The **Traverse** endpoint performs a breadth-first traversal from a set of
+source nodes, and `streams
+`_ all
+the nodes it encounters on the way. All the node properties are stored in the
+result nodes. Additionally, the *edge properties* (e.g., directory entry names
+and permissions) are stored as a list in the ``successor`` field of each node.
+
+For instance, here we run a traversal from a directory that contains two
+contents:
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \
+ "src: 'swh:1:dir:0000000000000000000000000000000000000006'"
+
+We get the following stream of nodes: first, the source directory (including
+its properties, successor list and their labels), then the contents themselves
+and their respective properties.
+
+.. code-block:: javascript
+
+ swhid: "swh:1:dir:0000000000000000000000000000000000000006"
+ successor {
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000005"
+ label {
+ name: "parser.c"
+ permission: 33188
+ }
+ }
+ successor {
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000004"
+ label {
+ name: "README.md"
+ permission: 33188
+ }
+ }
+ num_successors: 2
+
+.. code-block:: javascript
+
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000005"
+ cnt {
+ length: 1337
+ is_skipped: false
+ }
+
+.. code-block:: javascript
+
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000004"
+ cnt {
+ length: 404
+ is_skipped: false
+ }
+
+Again, it is possible to use a FieldMask to restrict which fields get returned.
+For instance, if we only care about the SWHIDs:
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \
+ "src: 'swh:1:dir:0000000000000000000000000000000000000006', mask: {paths: ['swhid']}"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000006"
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000005"
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000004"
+
+
+Graph direction
+~~~~~~~~~~~~~~~
+
+For many purposes, especially that of finding the provenance of software
+artifacts, it is useful to query the backward (or transposed) graph instead,
+which is the same as the forward graph except all the edges are reversed.
+To achieve this, the ``direction`` field can be used to specify a direction
+from the ``GraphDirection`` enum (either ``FORWARD`` or ``BACKWARD``).
+
+This query returns all the nodes reachable from a given directory in the
+*backward* (or "transposed") graph:
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \
+ "src: 'swh:1:dir:0000000000000000000000000000000000000006', direction: BACKWARD, mask: {paths: ['swhid']}"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000006"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000008"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000012"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000009"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000013"
+ swhid: "swh:1:rel:0000000000000000000000000000000000000010"
+ swhid: "swh:1:snp:0000000000000000000000000000000000000020"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000018"
+ swhid: "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054"
+ swhid: "swh:1:rel:0000000000000000000000000000000000000019"
+
+
+Edge restrictions
+~~~~~~~~~~~~~~~~~
+
+To constrain the types of edges that can be followed during the graph
+traversal, it is possible to specify an edge restriction string in the
+``edges`` field. It is a comma-separated list of edge types that will be
+followed (e.g.
+``"rev:dir,dir:cnt"`` to only follow revision → directory and directory →
+content edges).
+By default (or when ``"*"`` is provided), all edges can be followed.
+
+This query traverses the parent revisions of a given revision only (i.e., it
+outputs the *commit log* from a given commit):
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \
+ "src: 'swh:1:rev:0000000000000000000000000000000000000018', edges: 'rev:rev', mask: {paths: ['swhid']}"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000018"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000013"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000009"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000003"
+
+
+Limiting the traversal
+~~~~~~~~~~~~~~~~~~~~~~
+
+To avoid using up too much memory or resources, a traversal can be limited
+in two different ways:
+
+- the ``max_depth`` attribute defines the maximum depth of the traversal.
+- the ``max_edges`` attribute defines the maximum number of edges that can be
+ fetched by the traversal.
+
+When these limits are reached, the traversal will simply stop. While these
+options have obvious use-cases for anti-abuse, they can also be semantically
+useful: for instance, specifying ``max_depth: 1`` will only return the
+*neighbors* of the source node.
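+
+For instance, this call (on the same example graph as above) limits the
+traversal starting from ``swh:1:dir:0000000000000000000000000000000000000006``
+to a depth of 1; the output, omitted here, contains at most the source
+directory and its direct successors:
+
+.. code-block:: console
+
+    $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \
+        "src: 'swh:1:dir:0000000000000000000000000000000000000006', max_depth: 1, mask: {paths: ['swhid']}"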
+
+
+Filtering returned nodes
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+In many cases, clients might not want to get all the traversed nodes in the
+response stream. With the ``return_nodes`` field (of type ``NodeFilter``), it
+is possible to specify various *criteria* for which nodes should be sent to the
+stream. By default, all nodes are returned.
+
+One common filter is to return only specific *node types*, which can be done
+with the ``types`` field of ``NodeFilter``. This field contains a node type
+restriction string (e.g. ``"dir,cnt,rev"``), and defaults to ``"*"`` (all
+types).
+For instance, to find the list of origins in which a given directory can be
+found:
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \
+ "src: 'swh:1:dir:0000000000000000000000000000000000000006', return_nodes: {types: 'ori'}, direction: BACKWARD, mask: {paths: ['swhid']}"
+ swhid: "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054"
+
+
+Traversal from multiple sources
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Traversals can have multiple starting nodes, when multiple source nodes are
+present in the ``src`` field. For instance, this BFS starts from two different
+directories, and explores the graph in parallel from these multiple starting
+points:
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.Traverse \
+ "src: ['swh:1:dir:0000000000000000000000000000000000000006', 'swh:1:dir:0000000000000000000000000000000000000017'], mask: {paths: ['swhid']}"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000006"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000017"
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000005"
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000004"
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000014"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000016"
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000015"
+
+
+Finding a path to a node matching a criterion
+---------------------------------------------
+
+The **FindPathTo** endpoint searches for a shortest path between a set of
+source nodes and any node that matches a specific *criterion*.
+It does so by performing a breadth-first search from the source set,
+until a node that matches the given criterion is found, then follows
+its parents back to return a shortest path from the source set to that
+node.
+
+The criterion can be specified in the ``target`` field of the
+``FindPathToRequest``, which is of type ``NodeFilter``.
+
+As an example, a common use-case for content provenance is to find the
+shortest path from a content to an origin in the transposed graph. This query
+can be run like this:
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.FindPathTo \
+ "src: 'swh:1:cnt:0000000000000000000000000000000000000001', target: {types: 'ori'}, direction: BACKWARD, mask: {paths: ['swhid']}"
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000001"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000008"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000009"
+ swhid: "swh:1:snp:0000000000000000000000000000000000000020"
+ swhid: "swh:1:ori:83404f995118bd25774f4ac14422a8f175e7a054"
+
+As soon as the request finds an origin, it stops and returns the path from the
+source set to this origin.
+
+As with the **Traverse** endpoint, it is possible to specify edge
+restrictions and graph directions, as well as multiple source nodes.
+
+
+Finding a path between two sets of nodes
+----------------------------------------
+
+The **FindPathBetween** endpoint searches for a shortest path between a set of
+source nodes and a set of destination nodes.
+
+It does so by performing a *bidirectional breadth-first search*, i.e.,
+two parallel breadth-first searches, one from the source set ("src-BFS")
+and one from the destination set ("dst-BFS"), until both searches find a
+common node that joins their visited sets. This node is called the
+"midpoint node".
+The path returned is the path src -> ... -> midpoint -> ... -> dst,
+which is always a shortest path between src and dst.
+
+The graph direction of each BFS can be configured separately. By
+default, the dst-BFS will use the graph in the opposite direction than
+the src-BFS (if direction = FORWARD, by default direction_reverse =
+BACKWARD, and vice-versa). The default behavior is thus to search for
+a shortest path between two nodes in a given direction. However, one
+can also specify FORWARD or BACKWARD for *both* the src-BFS and the
+dst-BFS. This will search for a common descendant or a common ancestor
+between the two sets, respectively. These will be the midpoints of the
+returned path.
+
+As with the **Traverse** endpoint, it is also possible to specify edge
+restrictions.
+
+**Example 1**: shortest path from a snapshot to a content (forward graph):
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.FindPathBetween \
+ "src: 'swh:1:snp:0000000000000000000000000000000000000020', dst: 'swh:1:cnt:0000000000000000000000000000000000000004', mask: {paths: ['swhid']}"
+ swhid: "swh:1:snp:0000000000000000000000000000000000000020"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000009"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000008"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000006"
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000004"
+
+**Example 2**: shortest path from a directory to a snapshot (backward graph):
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.FindPathBetween \
+ "src: 'swh:1:dir:0000000000000000000000000000000000000006', dst: 'swh:1:rel:0000000000000000000000000000000000000019', direction: BACKWARD, mask: {paths: ['swhid']}"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000006"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000008"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000012"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000013"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000018"
+ swhid: "swh:1:rel:0000000000000000000000000000000000000019"
+
+**Example 3**: common ancestor of two contents:
+
+.. code-block:: console
+
+ $ grpc_cli call localhost:50091 swh.graph.TraversalService.FindPathBetween \
+ "src: 'swh:1:cnt:0000000000000000000000000000000000000004', dst: 'swh:1:cnt:0000000000000000000000000000000000000015', direction: BACKWARD, direction_reverse: BACKWARD, mask: {paths: ['swhid']}"
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000004"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000006"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000008"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000012"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000013"
+ swhid: "swh:1:rev:0000000000000000000000000000000000000018"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000017"
+ swhid: "swh:1:dir:0000000000000000000000000000000000000016"
+ swhid: "swh:1:cnt:0000000000000000000000000000000000000015"
+ middle_node_index: 5
+
+Because ``middle_node_index = 5``, the common ancestor is
+``swh:1:rev:0000000000000000000000000000000000000018``.
+
+
+.. _swh-graph-grpc-api-protobuf:
+
+Protobuf API Reference
+======================
+
+The gRPC API is specified in a single self-documenting
+`protobuf <https://developers.google.com/protocol-buffers/>`_ file, which is
+available in the ``proto/swhgraph.proto`` file of the swh-graph repository:
+
+https://forge.softwareheritage.org/source/swh-graph/browse/master/proto/swhgraph.proto
+
+..
+ .. literalinclude:: swhgraph.proto
+ :language: protobuf
diff --git a/docs/images/Makefile b/docs/images/Makefile
index 01fbfa2..9cb29d6 100644
--- a/docs/images/Makefile
+++ b/docs/images/Makefile
@@ -1,13 +1,13 @@
all: compression_steps.png compression_steps.svg
%.png: %.dot
- dot -Gdpi=300 -Tpng $< -o $@
+ dot -Gdpi=150 -Tpng $< -o $@
%.svg: %.dot
dot -Tsvg $< -o $@
.PHONY: clean
clean:
rm -f compression_steps.png
rm -f compression_steps.svg
diff --git a/docs/images/compression_steps.dot b/docs/images/compression_steps.dot
index 7156f62..c1beb77 100644
--- a/docs/images/compression_steps.dot
+++ b/docs/images/compression_steps.dot
@@ -1,51 +1,111 @@
digraph "Compression steps" {
- // Horizontal graph
- rankdir=LR;
+ node [shape = none];
+
+ orc_dataset [label="ORC Graph\nDataset"];
+ nodes_csv [label="graph.nodes.csv.zst"];
+ labels_csv [label="graph.labels.csv.zst"];
+ graph_mph [label="graph.mph"];
subgraph {
- input_edges [label="swh.edges.csv.gz", fontsize=9, shape=none];
- input_nodes [label="swh.nodes.csv.gz", fontsize=9, shape=none];
- {rank=same; input_edges; input_nodes;}
+ node [fontcolor=darkgray];
+ graph_base [label="graph-base.graph"]
+ graph_bfs_order [label="graph-bfs.order"]
+ graph_bfs [label="graph-bfs.graph"]
+ graph_bfs_transposed [label="graph-bfs-transposed.graph"]
+ graph_bfs_simplified [label="graph-bfs-simplified.graph"]
+ graph_llp_order [label="graph-llp.order"]
}
- mph [label="MPH", shape=box];
- mph_out [label="swh.mph", fontsize=9, shape=none];
-
- bv_compress [label="BV compress", shape=box];
- bv_compress_out
- [label="swh-bv.graph\lswh-bv.offsets\lswh-bv.obl\lswh-bv.properties",
- fontsize=9, shape=none];
-
- bfs [label="BFS", shape=box];
- bfs_out [label="swh.order", fontsize=9, shape=none];
+ graph_llp [label="graph.graph"]
+ graph_llp_transposed [label="graph-transposed.graph"]
+ graph_order [label="graph.order"]
+ graph_obl [label="graph.obl"]
+ graph_transposed_obl [label="graph-transposed.obl"]
+ stats [label="graph.stats"]
+ swhidmap [label="graph.node2swhid.bin"]
+ typemap [label="graph.node2type.bin"]
+ persons_csv [label="graph.persons.csv.zst"];
+ persons_mph [label="graph.persons.mph"];
+ node_properties [label="graph.property.*"];
+ labels_mph [label="graph.labels.mph"];
+ labels_fcl [label="graph.labels.fcl"];
+ graph_labelled [label="graph-labelled.*"];
+ graph_transposed_labelled [label="graph-transposed-labelled.*"];
+ graph_labelled_obl [label="graph-labelled.labelobl"];
+    graph_transposed_labelled_obl [label="graph-transposed-labelled.labelobl"];
- permute [label="Permute", shape=box];
- permute_out
- [label="swh.graph\lswh.offsets\lswh.obl\lswh.properties",
- fontsize=9, shape=none];
-
- stats [label="Stats", shape=box];
- stats_out
- [label="swh.stats\lswh.indegree\lswh.outdegree",
- fontsize=9, shape=none];
+ subgraph {
+ node [shape=box, fontname="Courier New"];
+ EXTRACT_NODES;
+ MPH;
+ BV;
+ BFS;
+ PERMUTE_BFS;
+ TRANSPOSE_BFS;
+ SIMPLIFY;
+ LLP;
+ PERMUTE_LLP;
+ COMPOSE_ORDERS;
+ STATS;
+ TRANSPOSE;
+ OBL;
+ TRANSPOSE_OBL;
+ NODE_MAP;
+ EXTRACT_PERSONS;
+ MPH_PERSONS;
+ NODE_PROPERTIES;
+ MPH_LABELS;
+ FCL_LABELS;
+ EDGE_LABELS;
+ EDGE_LABELS_OBL;
+ EDGE_LABELS_TRANSPOSE_OBL;
+ }
- transpose [label="Transpose", shape=box];
- transpose_out
- [label="swh-transposed.graph\lswh-transposed.offsets\lswh-transposed.obl\lswh-transposed.properties",
- fontsize=9, shape=none];
- input_nodes -> mph;
- input_edges -> bv_compress;
- mph -> mph_out;
- mph_out -> bv_compress;
- bv_compress -> bv_compress_out;
- bv_compress_out-> bfs;
- bv_compress_out-> permute;
- bfs -> bfs_out;
- bfs_out -> permute;
- permute -> permute_out;
- permute_out -> stats;
- permute_out -> transpose;
- stats -> stats_out;
- transpose -> transpose_out;
+ orc_dataset -> EXTRACT_NODES;
+ EXTRACT_NODES -> nodes_csv;
+ EXTRACT_NODES -> labels_csv;
+ nodes_csv -> MPH -> graph_mph;
+ graph_mph -> BV;
+ orc_dataset -> BV -> graph_base;
+ graph_base -> BFS -> graph_bfs_order;
+ graph_bfs_order -> PERMUTE_BFS;
+ graph_base -> PERMUTE_BFS -> graph_bfs;
+ graph_bfs -> TRANSPOSE_BFS -> graph_bfs_transposed;
+ graph_bfs_transposed -> SIMPLIFY;
+ graph_bfs -> SIMPLIFY -> graph_bfs_simplified;
+ graph_bfs_simplified -> LLP -> graph_llp_order;
+ graph_llp_order -> PERMUTE_LLP;
+ graph_bfs -> PERMUTE_LLP -> graph_llp;
+ graph_bfs_order -> COMPOSE_ORDERS;
+ graph_llp_order -> COMPOSE_ORDERS -> graph_order;
+ graph_llp -> TRANSPOSE -> graph_llp_transposed;
+ graph_llp -> OBL -> graph_obl;
+ graph_llp_transposed -> TRANSPOSE_OBL -> graph_transposed_obl;
+ graph_llp -> STATS -> stats;
+ graph_llp -> NODE_MAP;
+ nodes_csv -> NODE_MAP;
+ graph_mph -> NODE_MAP;
+ graph_order -> NODE_MAP;
+ NODE_MAP -> swhidmap;
+ NODE_MAP -> typemap;
+ orc_dataset -> EXTRACT_PERSONS -> persons_csv;
+ persons_csv -> MPH_PERSONS -> persons_mph;
+ orc_dataset -> NODE_PROPERTIES;
+ persons_mph -> NODE_PROPERTIES;
+ graph_mph -> NODE_PROPERTIES;
+ graph_order -> NODE_PROPERTIES;
+ NODE_PROPERTIES -> node_properties;
+ labels_csv -> MPH_LABELS -> labels_mph;
+ labels_mph -> FCL_LABELS;
+ labels_csv -> FCL_LABELS -> labels_fcl;
+ orc_dataset -> EDGE_LABELS;
+ labels_mph -> EDGE_LABELS;
+ graph_llp -> EDGE_LABELS;
+ graph_mph -> EDGE_LABELS;
+ graph_order -> EDGE_LABELS;
+ EDGE_LABELS -> graph_labelled;
+ EDGE_LABELS -> graph_transposed_labelled;
+ graph_labelled -> EDGE_LABELS_OBL -> graph_labelled_obl;
+ graph_transposed_labelled -> EDGE_LABELS_TRANSPOSE_OBL -> graph_transposed_labelled_obl;
}
diff --git a/docs/index.rst b/docs/index.rst
index 9bf477d..07e1068 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,17 +1,19 @@
.. _swh-graph:
.. include:: README.rst
.. toctree::
:maxdepth: 1
:caption: Overview
quickstart
+ api
+ grpc-api
+ java-api
+ memory
compression
cli
- api
- use-cases
docker
git2graph
/apidoc/swh.graph
diff --git a/docs/java-api.rst b/docs/java-api.rst
new file mode 100644
index 0000000..982236a
--- /dev/null
+++ b/docs/java-api.rst
@@ -0,0 +1,744 @@
+.. _swh-graph-java-api:
+
+Using the Java API
+==================
+
+.. highlight:: java
+
+While the :ref:`HTTP API <swh-graph-api>` is useful for many common use-cases,
+it is often not sufficient to implement more complex algorithms. This section
+describes the low-level Java API that ``swh-graph`` provides on top of the
+WebGraph framework to manipulate the compressed graph of Software Heritage.
+
+A cursory understanding of the `WebGraph framework
+<https://webgraph.di.unimi.it/>`_ and its API is helpful to understand the
+notions detailed here.
+
+.. _swh-graph-java-basics:
+
+Basics
+------
+
+In the WebGraph framework, graphs are generally subclasses of
+``ImmutableGraph``,
+the abstract class providing the core API to manipulate and iterate on graphs.
+Under the hood, compressed graphs are stored as the ``BVGraph``
+class, which contains the actual codec used to compress and decompress
+adjacency lists.
+
+Graph **nodes** are mapped to a contiguous set of integers :math:`[0, n - 1]`
+where *n* is the total number of nodes in the graph.
+Each node has an associated *adjacency list*, i.e., the list of nodes reachable
+from that source node through an outgoing arc. This list represents the
+**edges** (or **arcs**) of the graph.
+
+**Note**: edges are always directed. Undirected graphs are internally stored as
+a pair of directed edges (src → dst, dst → src), and are called "symmetric"
+graphs.
+
+On disk, a simple BVGraph with the basename ``graph`` would be represented as
+the following set of files:
+
+- ``graph.graph``: contains the compressed adjacency lists of each node, which
+ can be decompressed by the BVGraph codec.
+- ``graph.properties``: contains metadata on the graph, such as the number of
+ nodes and arcs, as well as additional loading information needed by the
+ BVGraph codec.
+- ``graph.offsets``: a list of offsets of where the adjacency list of each node
+ is stored in the main graph file.
+- ``graph.obl``: optionally, an "offset big-list file" which can be used to
+ load graphs faster.
+
+An ImmutableGraph can be loaded using different *load methods*, each of which
+has different performance implications:
+
+- ``load()``: the entire graph is loaded in RAM and supports random access.
+- ``loadMapped()``: the graph is loaded by memory-mapping it from disk (see
+  ``mmap(2)``), at the cost of being potentially slower, especially when doing
+ random access on slow storage.
+- ``loadOffline()``: no data is actually loaded in memory, only sequential
+ iteration is possible.
+
+The following code loads a graph stored on disk under the ``compressed/graph``
+basename, using the memory-mapped loading mode, and stores it as a generic
+ImmutableGraph:
+
+.. code-block:: java
+
+ ImmutableGraph graph = ImmutableGraph.loadMapped("compressed/graph");
+
+Note that most of the time you will want to use the SWH-specific subclass
+**SwhUnidirectionalGraph** instead, which has the same API as an ImmutableGraph
+plus additional SWH-specific methods. It is described later in the
+:ref:`swh-graph-java-node-mappings` section.
+
+
+Running the code
+----------------
+
+To run a piece of Java code written using the Java API, you need to run it
+with all the dependencies in your classpath (the WebGraph libraries and the
+swh-graph library). The easiest way to do so is to use the *fat jar* shipped
+with the swh-graph package on PyPI, which bundles all the required
+dependencies.
+
+.. code-block:: console
+
+ $ java -cp venv/share/swh-graph/swh-graph-0.5.2.jar MyAlgo.java
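+
+For reference, here is a minimal sketch of what such a program could look like
+(the class name ``MyAlgo`` and the graph basename are placeholders, and we
+assume the "big" WebGraph variant ``it.unimi.dsi.big.webgraph`` used by
+swh-graph):
+
+.. code-block:: java
+
+    import it.unimi.dsi.big.webgraph.ImmutableGraph;
+
+    public class MyAlgo {
+        public static void main(String[] args) throws Exception {
+            // Memory-map the compressed graph and print its size.
+            ImmutableGraph graph = ImmutableGraph.loadMapped("compressed/graph");
+            System.out.println(graph.numNodes() + " nodes, " + graph.numArcs() + " arcs");
+        }
+    }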
+
+
+Note that for bigger graphs, the default heap size of the JVM is likely to be
+insufficient to load them entirely in memory. It is advised to increase the
+heap size with the ``-Xmx`` flag:
+
+.. code-block:: console
+
+ $ java -Xmx300G -cp venv/share/swh-graph/swh-graph-0.5.2.jar MyAlgo.java
+
+For more information on performance tuning and memory considerations, you
+should also read the :ref:`swh-graph-memory` page, in which we recommend
+additional JVM options for loading large graphs.
+
+
+Simple traversal
+----------------
+
+The ImmutableGraph class provides primitives to iterate and traverse graphs. It
+contains the following methods:
+
+- ``graph.numNodes()`` returns the number of nodes in the graph (*n*).
+- ``graph.numArcs()`` returns the number of arcs in the graph.
+
+And, given a node ID :math:`k \in [0, n - 1]`:
+
+- ``graph.successors(k)`` returns a LazyLongIterator on the nodes that are
+ *adjacent* to *k* (i.e., its outgoing *neighbors*).
+- ``graph.outdegree(k)`` returns the number of outgoing neighbors of *k*.
+
+
+Example: Average outdegree
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following code can be used to compute the average
+outdegree of a graph, which is a useful measure of its density:
+
+.. code-block:: java
+
+    public static double averageOutdegree(ImmutableGraph graph) {
+        return (double) graph.numArcs() / graph.numNodes();
+ }
+
+
+Example: Degree distributions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Using the ``outdegree()`` primitive, we can compute the outdegree distribution
+of the graph by iterating on all its nodes. The distribution will be returned
+as a map that associates to each degree *d* the number of nodes with outdegree
+*d*.
+
+.. code-block:: java
+
+    public static Map<Long, Long> outdegreeDistribution(ImmutableGraph graph) {
+        HashMap<Long, Long> distribution = new HashMap<>();
+ for (long k = 0; k < graph.numNodes(); ++k) {
+ distribution.merge(graph.outdegree(k), 1L, Long::sum);
+ }
+ return distribution;
+ }
+
+
+Example: Depth-First Traversal
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``successors`` primitive can be used to write a simple stack-based DFS
+traversal on the graph which starts from a given node and prints all the
+descendant nodes in its transitive closure:
+
+.. code-block:: java
+ :emphasize-lines: 10
+
+ public static void visitNodesDFS(ImmutableGraph graph, long srcNodeId) {
+        Stack<Long> stack = new Stack<>();
+        HashSet<Long> visited = new HashSet<>();
+ stack.push(srcNodeId);
+ visited.add(srcNodeId);
+
+ while (!stack.isEmpty()) {
+ long currentNodeId = stack.pop();
+ System.out.println(currentNodeId);
+
+ LazyLongIterator it = graph.successors(currentNodeId);
+ for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
+ if (!visited.contains(neighborNodeId)) {
+ stack.push(neighborNodeId);
+ visited.add(neighborNodeId);
+ }
+ }
+ }
+ }
+
+Example: Breadth-First Traversal
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Swapping the stack for a queue changes the traversal order from depth-first
+to breadth-first:
+
+.. code-block:: java
+ :emphasize-lines: 2
+
+ public static void visitNodesBFS(ImmutableGraph graph, long srcNodeId) {
+        Queue<Long> queue = new ArrayDeque<>();
+        HashSet<Long> visited = new HashSet<>();
+ queue.add(srcNodeId);
+ visited.add(srcNodeId);
+
+ while (!queue.isEmpty()) {
+ long currentNodeId = queue.poll();
+ System.out.println(currentNodeId);
+
+ LazyLongIterator it = graph.successors(currentNodeId);
+ for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
+ if (!visited.contains(neighborNodeId)) {
+ queue.add(neighborNodeId);
+ visited.add(neighborNodeId);
+ }
+ }
+ }
+ }
+
+
+.. _swh-graph-java-node-mappings:
+
+Node types and SWHIDs
+---------------------
+
+In the Software Heritage archive, nodes are not represented by a simple
+integer, but by a :ref:`SWHID <persistent-identifiers>`, which contains both
+the *type* of the node (revision, directory, blob...) and its unique
+identifier. We
+use **node mappings** which allow us to translate between SWHIDs and the
+compact node IDs used in the compressed graph.
+
+Most notably, we use a MPH (Minimal Perfect Hash) function implemented in the
+``GOVMinimalPerfectHashFunction``
+class of the Sux4J library, which maps N keys to N consecutive integers with no
+collisions.
+
+The following files are used to store the mappings between node IDs, SWHIDs,
+and node types:
+
+- ``graph.mph``: contains a serialized minimal perfect hash function computed
+ on the list of all the SWHIDs in the graph.
+- ``graph.order``: contains the permutation that associates with each output of
+  the MPH the node ID to which it corresponds.
+- ``graph.node2swhid.bin``: contains a compact binary representation of all the
+ SWHIDs in the graph, ordered by their rank in the graph file.
+- ``graph.node2type.bin``: contains a ``LongBigArrayBitVector``
+ which stores the type of each node.
+
+To use these mappings easily, we provide the class **SwhUnidirectionalGraph**,
+an ImmutableGraph which wraps the underlying graph and adds a few
+utility methods to obtain SWH-specific information on the graph.
+
+A SwhUnidirectionalGraph can be loaded in a similar way to any ImmutableGraph,
+as long as the mapping files listed above are present::
+
+ SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.load(basename);
+
+This class exposes the same graph primitives as an ImmutableGraph, but it
+additionally contains the following methods:
+
+- ``SWHID getSWHID(long nodeId)``: returns the SWHID associated with a given
+  node ID. This function does a lookup of the SWHID at offset *nodeId* in the
+  file ``graph.node2swhid.bin``.
+
+- ``long getNodeID(SWHID swhid)``: returns the node ID associated with a given
+ SWHID. It works by hashing the SWHID with the function stored in
+ ``graph.mph``, then permuting it using the permutation stored in
+ ``graph.order``. It does additional domain-checking by calling ``getSWHID()``
+ on its own result to check that the input SWHID was valid.
+
+- ``SwhType getNodeType(long nodeID)``: returns the type of a given node, as
+ an enum of all the different object types in the Software Heritage data
+  model. It does so by looking up the value at offset *nodeID* in the bit vector
+ stored in ``graph.node2type.bin``.
+
+
+Example: Find the target directory of a revision
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As an example, we use the methods mentioned above to perform the
+following task: "given a revision, return its target directory". To do so, we
+first look up the node ID of the given revision in the compressed graph. We
+iterate on the successors of that node, and return the SWHID of the first
+destination node that has the "directory" type.
+
+
+.. code-block:: java
+ :emphasize-lines: 2
+
+ public SWHID findDirectoryOfRevision(SwhUnidirectionalGraph graph, SWHID revSwhid) {
+        long src = graph.getNodeID(revSwhid);
+        assert graph.getNodeType(src) == SwhType.REV;
+        LazyLongIterator it = graph.successors(src);
+        for (long dst; (dst = it.nextLong()) != -1;) {
+            if (graph.getNodeType(dst) == SwhType.DIR) {
+                return graph.getSWHID(dst);
+            }
+        }
+        throw new RuntimeException("Revision has no target directory");
+ }
+
+.. _swh-graph-java-node-properties:
+
+Node properties
+---------------
+
+The Software Heritage Graph is a *property graph*, which means it has various
+properties associated with its nodes and edges (e.g., commit timestamps,
+authors, messages, ...). We compress these properties and store them in files
+alongside the compressed graph. This allows you to write traversal algorithms
+that depend on these properties.
+
+By default, properties are not assumed to be present and are not loaded when
+the graph itself is loaded. If you want to use a property, you need to
+explicitly load it first. As an example, this is how you load the "content
+length" property to get the length of a given blob::
+
+ SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.load(basename);
+ graph.loadContentLength();
+ long blobSize = graph.getContentLength(graph.getNodeID(swhid));
+
+The documentation of the SwhGraphProperties class (**TODO: link**) lists all
+the different properties, their types, and the methods used to load them and to get
+their value for a specific node.
+
+A few things of note:
+
+- A single loading call can load multiple properties at once; this is because
+ they are stored in the same file to be more space efficient.
+
+- Persons (authors, committers, etc.) are exported as a single pseudonymized
+  integer ID that uniquely represents a full name + email pair.
+
+- Timestamps are stored as a long integer (for the timestamp itself) and a
+ short integer (for the UTC offset).
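+
+For instance, building on the timestamp representation described above, here
+is a minimal sketch (assuming the committer timestamp property files are
+present, and with ``basename`` and ``revSwhid`` as placeholders) that prints
+the committer date of a given revision as a Unix timestamp::
+
+    SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.loadMapped(basename);
+    graph.loadCommitterTimestamps();
+    long revNode = graph.getNodeID(revSwhid);
+    Long ts = graph.getCommitterTimestamp(revNode);  // null if the timestamp is unknown
+    System.out.println(ts);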
+
+
+.. _swh-graph-java-edge-properties:
+
+Edge labels
+-----------
+
+While looking up graph properties on the *nodes* of the graph is relatively
+straightforward, doing so for labels on the *arcs* is comparatively more
+difficult. These include the names and permissions of directory entries, as
+well as the branch names of snapshots.
+
+The ``ArcLabelledImmutableGraph``
+class in WebGraph wraps an ImmutableGraph, but augments its iterators by making them
+*labelled iterators*, which essentially allow us to look up the label of the
+arcs while iterating on them.
+
+This labelled graph is stored in the following files:
+
+- ``graph-labelled.properties``: a property file describing the graph, notably
+ containing the basename of the wrapped graph.
+- ``graph-labelled.labels``: the compressed labels.
+- ``graph-labelled.labeloffsets``: the offsets used to access the labels in
+ random order.
+
+The SwhUnidirectionalGraph class contains *labelled* loading methods
+(``loadLabelled()``, ``loadLabelledMapped()``, ...). When these loading methods
+are used instead of the standard non-labelled ones, the graph is loaded as an
+ArcLabelledImmutableGraph instead of an ImmutableGraph. The following methods
+can then be used:
+
+- ``labelledSuccessors(k)`` returns a ``LabelledArcIterator``,
+ which is used in the same way as a LazyLongIterator except it also contains a
+ ``label()`` method to get the label of the currently traversed arc.
+- ``labelledNodeIterator()`` returns an ``ArcLabelledNodeIterator`` over all
+  the nodes in the graph, which replaces the LazyLongIterator of the
+  ``successors()`` function by a LabelledArcIterator similar to the above.
+
+
+Label format
+~~~~~~~~~~~~
+
+The labels of each arc are returned as a ``DirEntry[]`` array. They encode
+both the name of a directory entry and its permissions. For snapshot branches,
+only the "name" field is useful.
+
+Arc label names are encoded as an integer ID representing each unique
+entry/branch name present in the graph. To retrieve the actual name associated
+with a given label ID, one needs to load the reverse mapping similar to how you
+would do for a normal property::
+
+ SwhUnidirectionalGraph graph = SwhUnidirectionalGraph.loadLabelled(basename);
+ graph.loadLabelNames();
+
+The byte array representing the actual label name can then be loaded with::
+
+ byte[] name = graph.getLabelName(label.filenameId);
+
+
+Multiedges
+~~~~~~~~~~
+
+The Software Heritage graph is not a *simple graph*, where at most one edge
+can exist between two vertices, but a *multigraph*, where multiple edges can
+be incident
+to the same two vertices. Consider for instance the case of a single directory
+``test/`` containing twice the same file blob (e.g., the empty file), under two
+different names (e.g., ``ISSUES.txt`` and ``TODO.txt``, both completely empty).
+The simple graph view of this directory will represent it as a single edge
+``test`` → *empty file*, while the multigraph view will represent it as *two*
+edges between the same nodes.
+
+Due to the copy-list model of compression, WebGraph only stores simple graphs,
+and thus stores multiedges as single edges, to which we cannot associate
+a single label name (in our example, we need to associate both names
+``ISSUES.txt`` and ``TODO.txt``).
+To represent this possibility of having multiple file names for a single arc,
+in the case of multiple relationships between the same two nodes, each arc label is
+stored as an *array* of DirEntry, each record representing one relationship
+between two nodes.
+
+
+Example: Printing all the entries of a directory
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following code showcases how one can print all the entries (name,
+permission and target SWHID) of a given directory, using the labelled methods
+seen above.
+
+.. code-block:: java
+
+    public static void printEntries(SwhUnidirectionalGraph graph, long dirNode) {
+        LabelledArcIterator s = graph.labelledSuccessors(dirNode);
+        for (long dst; (dst = s.nextLong()) >= 0;) {
+            DirEntry[] labels = (DirEntry[]) s.label().get();
+            for (DirEntry label : labels) {
+                System.out.format(
+                    "%s %s %d\n",
+                    graph.getSWHID(dst),
+                    new String(graph.getLabelName(label.filenameId)),
+                    label.permission
+                );
+            }
+        }
+    }
+
+    // Usage: $PROGRAM <graph basename> <directory SWHID>
+ public static void main(String[] args) {
+ SwhUnidirectionalGraph g = SwhUnidirectionalGraph.loadLabelledMapped(args[0]);
+ g.loadLabelNames();
+ long dirNode = g.getNodeID(new SWHID(args[1]));
+ printEntries(g, dirNode);
+ }
+
+
+Transposed graph
+----------------
+
+Up until now, we have only looked at how to traverse the *forward* graph, i.e.,
+the directed graph whose edges are in the same direction as the Merkle DAG of
+the Software Heritage archive.
+For many purposes, especially that of finding the *provenance* of software
+artifacts, it is useful to query the *backward* (or *transposed*) graph
+instead, which is the same as the forward graph except all the edges are
+reversed.
+
+The transposed graph has its own set of files, counterparts to the files needed
+for the forward graph:
+
+- ``graph-transposed.graph``
+- ``graph-transposed.properties``
+- ``graph-transposed.offsets``
+- ``graph-transposed.obl``
+- ``graph-transposed-labelled.labels``
+- ``graph-transposed-labelled.labeloffsets``
+- ``graph-transposed-labelled.properties``
+
+However, because node IDs are the same in the forward and the backward graph,
+all the files that pertain to mappings between the node IDs and various
+properties (SWHIDs, property data, node permutations etc) remain the same.
+
+
+Example: Earliest revision containing a given blob
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following code loads all the committer timestamps of the revisions in the
+graph, then walks the *transposed* graph to return the earliest revision
+containing a given object.
+
+.. code-block:: java
+
+ public static long findEarliestRevisionContaining(SwhUnidirectionalGraph g, long src) {
+ long oldestRev = -1;
+ long oldestRevTs = Long.MAX_VALUE;
+
+        Stack<Long> stack = new Stack<>();
+        HashSet<Long> visited = new HashSet<>();
+ stack.push(src);
+ visited.add(src);
+ while (!stack.isEmpty()) {
+ long currentNodeId = stack.pop();
+            LazyLongIterator it = g.successors(currentNodeId);
+ for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
+ if (!visited.contains(neighborNodeId)) {
+ stack.push(neighborNodeId);
+ visited.add(neighborNodeId);
+ if (g.getNodeType(neighborNodeId) == SwhType.REV) {
+ Long ts = g.getCommitterTimestamp(neighborNodeId);
+ if (ts != null && ts < oldestRevTs) {
+ oldestRev = neighborNodeId;
+ oldestRevTs = ts;
+ }
+ }
+ }
+ }
+ }
+ return oldestRev;
+ }
+
+    // Usage: $PROGRAM <graph basename> <content SWHID>
+ public static void main(String[] args) {
+ // Load the backward (= transposed) graph as a SwhUnidirectionalGraph
+ SwhUnidirectionalGraph g = SwhUnidirectionalGraph.loadMapped(args[0] + "-transposed");
+ g.loadCommitterTimestamps();
+ long node = g.getNodeID(new SWHID(args[1]));
+ long oldestRev = findEarliestRevisionContaining(g, node);
+ System.out.println(g.getSWHID(oldestRev));
+ }
+
+
+
+
+Bidirectional Graph
+-------------------
+
+
+BidirectionalImmutableGraph
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+While ``graph-transposed`` can be loaded as a simple SwhUnidirectionalGraph and
+then manipulated just like the forward graph, it is often convenient to have
+*both* the forward and the backward graph in memory. Some traversal algorithms
+require first going down in the forward graph to select some nodes, then going
+up to find their provenance.
+
+To achieve that, we use the ``BidirectionalImmutableGraph``
+class from WebGraph, which stores both a graph and its transpose.
+This class provides the following methods to iterate on the **backward** graph,
+shown here with their counterparts for the forward graph:
+
+.. list-table::
+ :header-rows: 1
+
+ * - Forward graph operation
+ - Backward graph operation
+
+ * - ``outdegree(k)``
+ - ``indegree(k)``
+
+ * - ``successors(k)``
+ - ``predecessors(k)``
+
+In addition, the class offers a few convenience methods which are generally
+useful when you have both a graph and its transpose:
+
+- ``transpose()`` returns the transpose of the BidirectionalImmutableGraph by
+ inverting the references to the forward and the backward graphs. Successors
+ become predecessors, and vice-versa.
+- ``symmetrize()`` returns the symmetric (= undirected) version of the
+ bidirectional graph. It is implemented by a union between the forward and the
+ backward graph, which basically boils down to removing the directionality of
+ the edges (the successors of a node are also its predecessors).
+
+
+SwhBidirectionalGraph
+~~~~~~~~~~~~~~~~~~~~~
+
+Like for ImmutableGraph, we extend the BidirectionalImmutableGraph with
+SWH-specific methods, in the subclass ``SwhBidirectionalGraph``. Notably, it
+contains the method ``labelledPredecessors()``, the equivalent of
+``labelledSuccessors()`` but on the backward graph.
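+
+As a minimal sketch (``basename`` and ``swhid`` being placeholders), this is
+how one could load the graph in both directions and list the predecessors of a
+given node, e.g., all the objects that point to a given directory:
+
+.. code-block:: java
+
+    SwhBidirectionalGraph graph = SwhBidirectionalGraph.loadMapped(basename);
+    long node = graph.getNodeID(swhid);
+    LazyLongIterator it = graph.predecessors(node);
+    for (long pred; (pred = it.nextLong()) != -1;) {
+        System.out.println(graph.getSWHID(pred));
+    }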
+
+Because SwhUnidirectionalGraph inherits from ImmutableGraph, and
+SwhBidirectionalGraph inherits from BidirectionalImmutableGraph, we put the
+common behavior between the two classes in a SwhGraph interface, which can
+represent either an unidirectional or a bidirectional graph.
+
+To avoid loading the node properties twice (once for each direction), they
+are stored in a separate class called SwhGraphProperties. In a
+SwhBidirectionalGraph, the two SwhUnidirectionalGraph objects share their node
+properties in memory by storing references to the same SwhGraphProperties
+object.
+
+.. code-block:: text
+
+
+ ┌──────────────┐
+ │ImmutableGraph◄────────┐
+ └────▲─────────┘ │extends
+ │ │
+ │ ┌──────────┴────────────────┐
+ extends│ │BidirectionalImmutableGraph│
+ │ └────────────▲──────────────┘
+ │ │extends
+ ┌──────────────┴───────┐ ┌──────┴──────────────┐
+ │SwhUnidirectionalGraph│◄────┤SwhBidirectionalGraph│
+ └──┬──────────────┬────┘ └────────┬───────────┬┘
+ │ │ contains x2 │ │
+ │ │ │ │
+ │ implements│ │implements │
+ │ ┌▼──────────┐ │ │
+ │ │SwhGraph(I)◄────────┘ │
+ contains│ └───────────┘ │contains
+ │ │
+ │ ┌──────────────────┐ │
+ └────────────►SwhGraphProperties◄──────────────┘
+ └──────────────────┘
+
+
+Example: Find all the shared-commit forks of a given origin
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It is possible to define the *forks* of an origin as being the set of origins
+which share at least one revision with that origin.
+
+The following code loads the graph in both directions using a
+SwhBidirectionalGraph. Given an origin SWHID, it first walks the *forward*
+graph to find all its root revisions. It then walks the *backward* graph to
+find all the origins containing these root revisions, i.e., its *forks*.
+
+.. code-block:: java
+
+    public static void findSharedCommitForks(SwhBidirectionalGraph g, long srcOrigin) {
+        Stack<Long> forwardStack = new Stack<>();
+        HashSet<Long> forwardVisited = new HashSet<>();
+        Stack<Long> backwardStack = new Stack<>();
+        HashSet<Long> backwardVisited = new HashSet<>();
+
+ // First traversal (forward graph): find all the root revisions of the
+ // origin
+ forwardStack.push(srcOrigin);
+ forwardVisited.add(srcOrigin);
+ while (!forwardStack.isEmpty()) {
+ long curr = forwardStack.pop();
+            LazyLongIterator it = g.successors(curr);
+ boolean isRootRevision = true;
+ for (long succ; (succ = it.nextLong()) != -1;) {
+ SwhType nt = g.getNodeType(succ);
+ if (!forwardVisited.contains(succ)
+ && nt != SwhType.DIR && nt != SwhType.CNT) {
+ forwardStack.push(succ);
+ forwardVisited.add(succ);
+ isRootRevision = false;
+ }
+ }
+ if (g.getNodeType(curr) == SwhType.REV && isRootRevision) {
+ // Found a root revision, add it to the second stack
+ backwardStack.push(curr);
+ backwardVisited.add(curr);
+ }
+ }
+
+ // Second traversal (backward graph): find all the origins containing
+ // any of these root revisions and print them
+ while (!backwardStack.isEmpty()) {
+ long curr = backwardStack.pop();
+            LazyLongIterator it = g.predecessors(curr);
+ for (long succ; (succ = it.nextLong()) != -1;) {
+ SwhType nt = g.getNodeType(succ);
+ if (!backwardVisited.contains(succ)) {
+ backwardStack.push(succ);
+ backwardVisited.add(succ);
+ if (nt == SwhType.ORI) {
+ // Found an origin, print it.
+ System.out.println(g.getSWHID(succ));
+ }
+ }
+ }
+ }
+ }
+
+    // Usage: $PROGRAM <graph basename> <origin SWHID>
+ public static void main(String[] args) {
+ // Load both forward and backward graphs as a SwhBidirectionalGraph
+ SwhBidirectionalGraph g = SwhBidirectionalGraph.loadMapped(args[0]);
+ long node = g.getNodeID(new SWHID(args[1]));
+ findSharedCommitForks(g, node);
+ }
+
+
+Large-scale processing
+----------------------
+
+Multithreading
+~~~~~~~~~~~~~~
+
+ImmutableGraph is not thread-safe. When writing multithreaded algorithms,
+calling ``successors()`` on the same graph from multiple threads will return
+garbage.
+
+Instead, each thread should create its own "lightweight copy" of the graph by
+calling ``.copy()``. This will not actually copy the entire graph data, which
+will remain shared across threads, but it will create new instances of the
+iterators so that each thread can independently iterate on the graph data.
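+
+A minimal sketch of this pattern, submitting per-node tasks to a
+``java.util.concurrent`` thread pool (the helper name and the task body are
+placeholders):
+
+.. code-block:: java
+
+    public static void processInParallel(ImmutableGraph graph, long[] nodes, int numThreads)
+            throws InterruptedException {
+        ExecutorService executor = Executors.newFixedThreadPool(numThreads);
+        for (long node : nodes) {
+            executor.submit(() -> {
+                // Lightweight per-thread copy: the graph data stays shared,
+                // only the iterators are duplicated.
+                ImmutableGraph g = graph.copy();
+                LazyLongIterator it = g.successors(node);
+                for (long succ; (succ = it.nextLong()) != -1;) {
+                    // ... process succ ...
+                }
+            });
+        }
+        executor.shutdown();
+        executor.awaitTermination(1, TimeUnit.DAYS);
+    }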
+
+
+Data structures for large traversals
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When doing very large traversals, such as a BFS on the entire graph, the
+usual data structures (HashSet, Stack, ArrayDeque, etc.) will be quite
+inefficient. If you know you are going to traverse large parts of the graph,
+it's better to use more appropriate data structures, a lot of which can be
+found in the dsiutils library. In particular:
+
+- ``LongArrayBitVector``
+ is an efficient bit-vector implementation, which can be used to store the
+ nodes that have already been seen in the visit. Its memory footprint is too
+  big for small traversals, but it is very efficient when traversing the
+  full graph, as every node only takes a single bit.
+
+- ``ByteDiskQueue`` can
+ be used to efficiently store the queue of nodes to visit on disk, when it is
+ too large to fit in RAM.
+
+Other types in dsiutils and fastutil can save significant memory:
+``LongArrayList`` saves at least 8 bytes per entry over ``ArrayList``,
+and ``Long2LongOpenHashMap`` saves at least 16 bytes for every entry over
+``HashMap``. We strongly recommend reading the documentation of the
+unimi libraries and looking at the code for usage examples.
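+
+As an illustration, here is a sketch of a whole-graph BFS that uses a
+``LongArrayBitVector`` as its visited set and a fastutil ``LongArrayFIFOQueue``
+as its queue (for traversals whose queue does not fit in RAM, ``ByteDiskQueue``
+could be used instead):
+
+.. code-block:: java
+
+    public static void visitAllNodes(ImmutableGraph graph) {
+        long n = graph.numNodes();
+        LongArrayBitVector visited = LongArrayBitVector.ofLength(n);
+        LongArrayFIFOQueue queue = new LongArrayFIFOQueue();
+        for (long start = 0; start < n; start++) {
+            if (visited.getBoolean(start)) {
+                continue;
+            }
+            queue.enqueue(start);
+            visited.set(start);
+            while (!queue.isEmpty()) {
+                long node = queue.dequeueLong();
+                LazyLongIterator it = graph.successors(node);
+                for (long succ; (succ = it.nextLong()) != -1;) {
+                    if (!visited.getBoolean(succ)) {
+                        visited.set(succ);
+                        queue.enqueue(succ);
+                    }
+                }
+            }
+        }
+    }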
+
+
+BigArrays
+~~~~~~~~~
+
+When working with the Software Heritage graph, it is often necessary to store
+large arrays of values, with a size exceeding 2^32 items. Unfortunately,
+standard Java arrays do not support this.
+
+To circumvent this, WebGraph uses the BigArrays scheme from
+the fastutil library: "big arrays" are stored as arrays of arrays, supporting
+quadrillions of records.
+
+A BigArray ``long[][] a`` can be used with the following methods:
+
+- ``BigArrays.get(a, i)`` to get the value at index *i*
+- ``BigArrays.set(a, i, v)`` to set the value at index *i* to *v*.
+- ``BigArrays.length(a)`` to get the total length of the bigarray.
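+
+A minimal sketch of how these methods fit together (the size here is kept
+small so the example runs anywhere; real use-cases go beyond 2^32 elements):
+
+.. code-block:: java
+
+    import it.unimi.dsi.fastutil.BigArrays;
+    import it.unimi.dsi.fastutil.longs.LongBigArrays;
+
+    public class BigArraysExample {
+        public static void main(String[] args) {
+            long n = 1_000_000L;  // could just as well exceed Integer.MAX_VALUE
+            long[][] a = LongBigArrays.newBigArray(n);
+            BigArrays.set(a, n - 1, 42L);                 // a[n - 1] = 42
+            System.out.println(BigArrays.get(a, n - 1));  // 42
+            System.out.println(BigArrays.length(a));      // 1000000
+        }
+    }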
diff --git a/docs/memory.rst b/docs/memory.rst
new file mode 100644
index 0000000..f30f9c4
--- /dev/null
+++ b/docs/memory.rst
@@ -0,0 +1,130 @@
+.. _swh-graph-memory:
+
+Memory & Performance tuning
+===========================
+
+This page discusses various considerations related to memory usage and
+performance tuning when using the ``swh-graph`` library to load large
+compressed graphs.
+
+JVM options
+-----------
+
+In production, we tend to use very large servers which have enough RAM to load
+the entire graph in RAM. In these setups, the default JVM options are often
+suboptimal. We recommend starting the JVM with the following options, which
+tend to significantly improve performance::
+
+ java \
+ -ea \
+ -server \
+ -XX:PretenureSizeThreshold=512M \
+ -XX:MaxNewSize=4G \
+ -XX:+UseLargePages \
+ -XX:+UseTransparentHugePages \
+ -XX:+UseNUMA \
+ -XX:+UseTLAB \
+ -XX:+ResizeTLAB \
+
+These options are documented in the ``java(1)`` manual and in the Oracle
+documentation.
+
+
+Temporary directory
+-------------------
+
+Many of the graph algorithms (either for compression or traversal) tend to
+offload some of their run-time memory to disk. For instance, the BFS
+algorithm in the LAW library uses a temporary directory to write its queue of
+nodes to visit.
+
+Because these can be quite large and sometimes overflow the default ``/tmp``
+partition, it is advised to systematically specify a path to a local temporary
+directory with enough space to accommodate the needs of the Java programs. This
+can be done using the ``-Djava.io.tmpdir`` parameter on the Java CLI::
+
+ java -Djava.io.tmpdir=/srv/softwareheritage/ssd/tmp
+
+
+Memory mapping vs Direct loading
+--------------------------------
+
+The main dial you can use to manage your memory usage is to choose between
+memory-mapping and direct-loading the graph data. The different loading modes
+available when loading the graph are documented in :ref:`swh-graph-java-api`.
+
+Loading in mapped mode will not load any extra data in RAM, but will instead
+use the ``mmap(2)`` syscall to put the graph file located on disk in the
+virtual address space. The Linux kernel will then be free to arbitrarily cache
+the file, either partially or in its entirety, depending on the available
+memory space.
+
+In our experiments, memory-mapping a small graph from an SSD only incurs a
+relatively small slowdown (about 15-20%). However, when the graph is too big to
+fit in RAM, the kernel has to constantly invalidate pages to cache newly
+accessed sections, which incurs a very large performance penalty. A full
+traversal of a large graph that usually takes about 20 hours when loaded in
+main memory could take more than a year when mapped from a hard drive!
+
+When deciding what to direct-load and what to memory-map, here are a few rules
+of thumb:
+
+- If you don't need random access to the graph edges, you can consider using
+  the "offline" loading mode. The offsets won't be loaded, which will save
+ dozens of gigabytes of RAM.
+
+- If you only need to query some specific nodes or run trivial traversals,
+ memory-mapping the graph from a HDD should be a reasonable solution that
+ doesn't take an inordinate amount of time. It might be bad for your disks,
+ though.
+
+- If you are constrained in available RAM, memory-mapping the graph from an SSD
+ offers reasonable performance for reasonably complex algorithms.
+
+- If you have a heavy workload (i.e. running a full traversal of the entire
+ graph) and you can afford the RAM, direct loading will be orders of magnitude
+ faster than all the above options.
+
+
+Sharing mapped data across processes
+------------------------------------
+
+Often, multiple processes can be working on the same data (mappings or the
+graph itself), for instance when running different experiments on the same
+graph. This is problematic in terms of RAM usage when using direct memory
+loading, as the same data of potentially hundreds of gigabytes is loaded in
+memory twice.
+As we have seen, memory-mapping can be used to avoid storing redundant data in
+RAM, but comes at the cost of potentially slower I/O as the data is no longer
+guaranteed to be loaded in main memory and is reliant on kernel heuristics.
+
+To efficiently share data across two different compressed graph processes,
+another option is to copy graph data to a ``tmpfs`` not backed by a disk swap,
+which forces the kernel to load it entirely in RAM. Subsequent memory-mappings
+of the files stored in the tmpfs will simply map the data stored in RAM to
+virtual memory pages, and return a pointer to the in-memory structure.
+
+To do so, we create a directory in ``/dev/shm`` in which we **copy** all the
+files that we want to direct-load in RAM, and **symlink** all the others. Then,
+we load the graph using the memory-mapped loading mode, which makes it use the
+shared memory stored in the tmpfs under the hood.
+
+Here is a systemd service that can be used to perform this task automatically:
+
+.. code-block:: ini
+
+ [Unit]
+ Description=swh-graph memory sharing in tmpfs
+
+ [Service]
+ Type=oneshot
+ RemainAfterExit=yes
+ ExecStart=mkdir -p /dev/shm/swh-graph/default
+ ExecStart=sh -c "ln -s /.../compressed/* /dev/shm/swh-graph/default"
+ ExecStart=cp --remove-destination /.../compressed/graph.graph /dev/shm/swh-graph/default
+ ExecStart=cp --remove-destination /.../compressed/graph-transposed.graph /dev/shm/swh-graph/default
+ ExecStop=rm -rf /dev/shm/swh-graph/default
+
+ [Install]
+ WantedBy=multi-user.target
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
index 7ac51bd..425a547 100644
--- a/docs/quickstart.rst
+++ b/docs/quickstart.rst
@@ -1,174 +1,132 @@
+.. _swh-graph-quickstart:
+
Quickstart
==========
-This quick tutorial shows how to compress and browse a graph using ``swh.graph``.
-
-It does not cover the technical details behind the graph compression techniques
-(refer to :ref:`graph-compression`).
-
+This quick tutorial shows how to start the ``swh.graph`` service to query
+an existing compressed graph with the high-level HTTP API.
Dependencies
------------
In order to run the ``swh.graph`` tool, you will need Python (>= 3.7) and Java
-JRE, you do not need the JDK if you install the package from pypi, but may want
-to install it if you want to hack the code or install it from this git
-repository. To compress a graph, you will need zstd_ compression tools.
-
-It is highly recommended to install this package in a virtualenv.
-
-On a Debian stable (buster) system:
-
-.. code:: bash
-
- $ sudo apt install python3-virtualenv default-jre zstd
+JRE. On a Debian system:
+.. code:: console
-.. _zstd: https://facebook.github.io/zstd/
+ $ sudo apt install python3 python3-venv default-jre
-
-Install
--------
+Installing swh.graph
+--------------------
Create a virtualenv and activate it:
-.. code:: bash
+.. code:: console
- ~/tmp$ mkdir swh-graph-tests
- ~/tmp$ cd swh-graph-tests
- ~/t/swh-graph-tests$ virtualenv swhenv
- ~/t/swh-graph-tests$ . swhenv/bin/activate
+ $ python3 -m venv .venv
+ $ source .venv/bin/activate
Install the ``swh.graph`` python package:
-.. code:: bash
+.. code:: console
- (swhenv) ~/t/swh-graph-tests$ pip install swh.graph
+ (venv) $ pip install swh.graph
[...]
- (swhenv) ~/t/swh-graph-tests swh graph --help
+ (venv) $ swh graph --help
Usage: swh graph [OPTIONS] COMMAND [ARGS]...
Software Heritage graph tools.
Options:
-C, --config-file FILE YAML configuration file
-h, --help Show this message and exit.
Commands:
- api-client client for the graph RPC service
- cachemount Cache the mmapped files of the compressed graph in a tmpfs.
compress Compress a graph using WebGraph Input: a pair of files...
- map Manage swh-graph on-disk maps
rpc-serve run the graph RPC service
-Compression
------------
-
-Existing datasets
-^^^^^^^^^^^^^^^^^
-
-You can directly use compressed graph datasets provided by Software Heritage.
-Here is a small and realistic dataset (3.1GB):
-
- https://annex.softwareheritage.org/public/dataset/graph/latest/popular-3k-python/python3kcompress.tar
-
-.. code:: bash
- (swhenv) ~/t/swh-graph-tests$ curl -O https://annex.softwareheritage.org/public/dataset/graph/latest/popular-3k-python/python3kcompress.tar
- (swhenv) ~/t/swh-graph-tests$ tar xvf python3kcompress.tar
- (swhenv) ~/t/swh-graph-tests$ touch python3kcompress/*.obl # fix the mtime of cached offset files to allow faster loading
+.. _swh-graph-retrieving-compressed:
-Note: not for the faint heart, but the full dataset is available at:
+Retrieving a compressed graph
+-----------------------------
- https://annex.softwareheritage.org/public/dataset/graph/latest/compressed/
+Software Heritage provides a list of off-the-shelf datasets that can be used
+for various research or prototyping purposes. Most of them are available in
+*compressed* representation, i.e., in a format suitable to be loaded and
+queried by the ``swh-graph`` library.
-Own datasets
-^^^^^^^^^^^^
+All the publicly available datasets are documented on this page:
+https://docs.softwareheritage.org/devel/swh-dataset/graph/dataset.html
-A graph is described as both its adjacency list and the set of nodes
-identifiers in plain text format. Such graph example can be found in the
-``swh/graph/tests/dataset/`` folder.
+A good way of retrieving these datasets is to use the AWS S3 CLI.
-You can compress the example graph on the command line like this:
+Here is an example with the dataset ``2021-03-23-popular-3k-python``, which has
+a relatively reasonable size (~15 GiB including property data, with
+the compressed graph itself being less than 700 MiB):
-.. code:: bash
+.. code:: console
+ (venv) $ pip install awscli
+ [...]
+ (venv) $ mkdir -p 2021-03-23-popular-3k-python/compressed
+ (venv) $ cd 2021-03-23-popular-3k-python/
+ (venv) $ aws s3 cp --recursive s3://softwareheritage/graph/2021-03-23-popular-3k-python/compressed/ compressed
- (swhenv) ~/t/swh-graph-tests$ swh graph compress --graph swh/graph/tests/dataset/example --outdir output/
- [...]
+You can also retrieve larger graphs, but note that these graphs are generally
+intended to be loaded fully in RAM, and do not fit on ordinary desktop
+machines. The server we use in production to run the graph service has more
+than 700 GiB of RAM. These memory considerations are discussed in more detail
+in :ref:`swh-graph-memory`.
- (swhenv) ~/t/swh-graph-tests$ ls output/
- example-bv.properties example.mph example.obl example.outdegree example.swhid2node.bin example-transposed.offsets
- example.graph example.node2swhid.bin example.offsets example.properties example-transposed.graph example-transposed.properties
- example.indegree example.node2type.map example.order example.stats example-transposed.obl
+**Note:** for testing purposes, a fake test dataset is available in the
+``swh-graph`` repository, with just a few dozen nodes. Its basename is
+``swh-graph/swh/graph/tests/dataset/compressed/example``.
API server
----------
-To start a ``swh.graph`` API server of a compressed graph dataset, run:
+To start a ``swh.graph`` API server of a compressed graph dataset, you need to
+use the ``rpc-serve`` command with the basename of the graph, which is the path prefix
+of all the graph files (e.g., with the basename ``compressed/graph``, it will
+attempt to load the files located at
+``compressed/graph.{graph,properties,offsets,...}``).
-.. code:: bash
+In our example:
+
+.. code:: console
- (swhenv) ~/t/swh-graph-tests$ swh graph rpc-serve -g output/example
- Loading graph output/example ...
+ (venv) $ swh graph rpc-serve -g compressed/graph
+ Loading graph compressed/graph ...
Graph loaded.
======== Running on http://0.0.0.0:5009 ========
(Press CTRL+C to quit)
From there you can use this endpoint to query the compressed graph, for example
-with httpie_ (``sudo apt install``) from another terminal:
+with httpie_ (``sudo apt install httpie``):
.. _httpie: https://httpie.org
.. code:: bash
- ~/tmp$ http :5009/graph/visit/nodes/swh:1:rel:0000000000000000000000000000000000000010
- HTTP/1.1 200 OK
- Content-Type: text/plain
- Date: Tue, 15 Sep 2020 08:33:25 GMT
- Server: Python/3.8 aiohttp/3.6.2
- Transfer-Encoding: chunked
-
- swh:1:rel:0000000000000000000000000000000000000010
- swh:1:rev:0000000000000000000000000000000000000009
- swh:1:rev:0000000000000000000000000000000000000003
- swh:1:dir:0000000000000000000000000000000000000002
- swh:1:cnt:0000000000000000000000000000000000000001
- swh:1:dir:0000000000000000000000000000000000000008
- swh:1:dir:0000000000000000000000000000000000000006
- swh:1:cnt:0000000000000000000000000000000000000004
- swh:1:cnt:0000000000000000000000000000000000000005
- swh:1:cnt:0000000000000000000000000000000000000007
-
-
-Running the existing ``python3kcompress`` dataset:
-
-.. code:: bash
-
- (swhenv) ~/t/swh-graph-tests$ swh graph rpc-serve -g python3kcompress/python3k
- Loading graph python3kcompress/python3k ...
- Graph loaded.
- ======== Running on http://0.0.0.0:5009 ========
- (Press CTRL+C to quit)
-
-
~/tmp$ http :5009/graph/leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323
HTTP/1.1 200 OK
Content-Type: text/plain
Date: Tue, 15 Sep 2020 08:35:19 GMT
Server: Python/3.8 aiohttp/3.6.2
Transfer-Encoding: chunked
swh:1:cnt:33af56e02dd970873d8058154bf016ec73b35dfb
swh:1:cnt:b03b4ffd7189ae5457d8e1c2ee0490b1938fd79f
swh:1:cnt:74d127c2186f7f0e8b14a27249247085c49d548a
swh:1:cnt:c0139aa8e79b338e865a438326629fa22fa8f472
[...]
swh:1:cnt:a6b60e797063fef707bbaa4f90cfb4a2cbbddd4a
swh:1:cnt:cc0a1deca559c1dd2240c08156d31cde1d8ed406
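+
+Any HTTP client will do; as a rough curl equivalent of the query above (assuming
+the server still listens on the default local port shown earlier):
+
+.. code:: console
+
+   ~/tmp$ curl http://localhost:5009/graph/leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323
+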
-
-See the documentation of the :ref:`API ` for more details.
+See the documentation of the :ref:`API ` for more details on how
+to use the HTTP graph querying API.
diff --git a/docs/use-cases.rst b/docs/use-cases.rst
deleted file mode 100644
index ce01d8c..0000000
--- a/docs/use-cases.rst
+++ /dev/null
@@ -1,167 +0,0 @@
-=========
-Use cases
-=========
-
-
-This document lists use cases and benchmark scenarios for the Software Heritage
-graph service.
-
-
-Conventions
-===========
-
-- **Node identification**: in the following, nodes are always identified by
- their :ref:`SWHIDs `.
-
-
-Use cases
-=========
-
-
-Browsing
---------
-
-The following use cases require traversing the *forward graph*.
-
-- **ls**: given a directory node, list (non recursively) all linked nodes of
- type directory and content
-
- Implementation::
-
- /graph/neighbors/:DIR_ID?edges=dir:cnt,dir:dir
-
-- **ls -R**: given a directory node, recursively list all linked nodes of type
- directory and content
-
- Implementation::
-
- /graph/visit/paths/:DIR_ID?edges=dir:cnt,dir:dir
-
-- **git log**: given a revision node, recursively list all linked nodes of type
- revision
-
- Implementation::
-
- /graph/visit/nodes/:REV_ID?edges=rev:rev
-
-
-Vault
------
-
-The following use cases require traversing the *forward graph*.
-
-- **tarball** (same as *ls -R* above)
-
-- **git bundle**: given a node, recursively list all linked nodes of any kind
-
- Implementation::
-
- /graph/visit/nodes/:NODE_ID?edges=*
-
-
-Provenance
-----------
-
-The following use cases require traversing the *backward (transposed)
-graph*.
-
-- **commit provenance**: given a content or directory node, return *a* commit
- whose directory (recursively) contains it
-
- Implementation::
-
- /graph/walk/:NODE_ID/rev?direction=backward&edges=dir:dir,cnt:dir,dir:rev
-
-- **complete commit provenance**: given a content or directory node, return
- *all* commits whose directory (recursively) contains it
-
- Implementation::
-
- /graph/leaves/:NODE_ID?direction=backward&edges=dir:dir,cnt:dir,dir:rev
-
-- **origin provenance**: given a content, directory, or commit node, return
- *an* origin that has at least one snapshot that (recursively) contains it
-
- Implementation::
-
- /graph/walk/:NODE_ID/ori?direction=backward&edges=*
-
-- **complete origin provenance**: given a content, directory, or commit node,
- return *all* origins that have at least one snapshot that (recursively)
- contains it
-
- Implementation::
-
- /graph/leaves/:NODE_ID?direction=backward&edges=*
-
-- *SLOC tracking*: left as future work
-
-
-Provenance statistics
-~~~~~~~~~~~~~~~~~~~~~
-
-The following use cases require traversing the *backward (transposed)
-graph*.
-
-- **content popularity across commits**: for each content, count the number of
- commits (or *commit popularity*) that link to a directory that (recursively)
- includes it. Plot the distribution of content popularity across commits.
-
- Implementation: apply *complete commit provenance* to each content node,
- count the returned commits, aggregate.
-
-- **commit popularity across origins**: for each commit, count the number of
- origins (or *origin popularity*) that have a snapshot that (recursively)
- includes it. Plot the distribution of commit popularity across origins.
-
- Implementation: apply *complete origin provenance* to each commit node, count
- the returned origins, aggregate.
-
-- *SLOC popularity across contents*: left as future work
-
-The following use cases require traversing the *forward graph*.
-
-- **revision size** (as n. of contents) distribution: for each revision, count
- the number of contents that are (recursively) reachable from it. Plot the
- distribution of revision sizes.
-
-- **origin size** (as n. of revisions) distribution: for each origin, count the
- number of revisions that are (recursively) reachable from it. Plot the
- distribution of origin sizes.
-
-
-Benchmarks
-==========
-
-Notes on how to benchmark graph access:
-
-- separate pure-graph timings from timings related to use additional mappings
- (e.g., node types), no matter if the mappings are in-memory or on-disk
-
-- separate in-memory timings from on-disk timings; in particular, separate the
- timing of translating node identifiers between internal integers and SWHIDs
-
-- for each use case that requires a node as input, we will randomize the choice
- of the input node and repeat the experiment a suitable number of times; where
- possible we will aggregate results computing basic statistics (average,
- standard deviation), as well as normalize results w.r.t. the “size” of the
- chosen node (e.g., number of nodes/path length in the resulting visit)
-
-
-Basic benchmarks
-----------------
-
-- **Edge traversal**: given a node, retrieve the first node in its adjacency
- list.
-
- For reference: Apostolico, Drovandi in *Graph Compression by BFS* report
- times to retrieve the adjacency list of a node (and/or test if an edge exists
- between two nodes) in the 2-3 us range, for the largest graph in their
- experiments (22 M nodes, 600 M edges).
-
-
-Each use case is a benchmark
-----------------------------
-
-In addition to abstract benchmark, we will use each use case above as a
-scenario-based benchmark.
diff --git a/java/README.md b/java/README.md
index 623e98e..7276284 100644
--- a/java/README.md
+++ b/java/README.md
@@ -1,51 +1,49 @@
Graph service - Java backend
============================
Server side Java RPC API.
Build
-----
```bash
$ mvn compile assembly:single
```
Start RPC API
-------------
```bash
$ java -cp target/swh-graph-*.jar \
- org.softwareheritage.graph.server.App \
+ org.softwareheritage.graph.rpc.GraphServer \
```
-Default port is 5009 (use the `--port` option to change port number). If you
-need timings metadata send back to the client in addition to the result, use the
-`--timings` flag.
+The default port is 50091 (use the `--port` option to change the port number).
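+
+For instance, a sketch of starting the server on a non-default port (the graph
+basename argument is a placeholder here, and passing it positionally is an
+assumption; substitute your own compressed graph path):
+
+```bash
+$ java -cp target/swh-graph-*.jar \
+    org.softwareheritage.graph.rpc.GraphServer \
+    --port 50092 path/to/compressed/graph
+```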
Tests
-----
Unit tests rely on test data that are already available in the Git repository
(under `src/swh/graph/tests/dataset/`). You generally only need to run them
using Maven:
```bash
$ mvn test
```
In case you want to regenerate the test data:
```bash
# Graph compression
$ cd src/swh/graph/tests/dataset
$ ./generate_graph.sh
$ cd ../../../..
$ mvn compile assembly:single
# Dump mapping files
$ java -cp target/swh-graph-*.jar \
- org.softwareheritage.graph.maps.NodeMapBuilder \
+ org.softwareheritage.graph.compress.NodeMapBuilder \
src/swh/graph/tests/dataset/example.nodes.csv.gz \
src/swh/graph/tests/dataset/output/example
```
diff --git a/java/pom.xml b/java/pom.xml
index 26f0ade..405ce93 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -1,274 +1,403 @@
4.0.0org.softwareheritage.graphswh-graph${git.closest.tag.name}swh-graphhttps://forge.softwareheritage.org/source/swh-graph/UTF-811
+ 3.21.1
+ 1.47.0ch.qos.logbacklogback-classic1.2.3org.junit.jupiterjunit-jupiter-api5.7.0testorg.junit.jupiterjunit-jupiter-engine5.7.0test
-
- org.hamcrest
- hamcrest
- 2.2
- test
-
-
- io.javalin
- javalin
- 3.0.0
- org.slf4jslf4j-simple1.7.26
-
- com.fasterxml.jackson.core
- jackson-databind
- 2.13.0
- it.unimi.dsiwebgraph-big
- 3.6.6
+ 3.7.0it.unimi.dsifastutil
- 8.5.6
+ 8.5.8it.unimi.dsidsiutils
- 2.6.17
+ 2.7.2it.unimi.dsisux4j
- 5.2.3
+ 5.3.1it.unimi.dsilaw2.7.2org.apache.hadoophadoop-commonorg.umlgraphumlgraphorg.eclipse.jetty.aggregatejetty-allit.unimi.dimg4jit.unimi.dimg4j-bigcom.martiansoftwarejsap2.1
-
- net.sf.py4j
- py4j
- 0.10.9.3
- commons-codeccommons-codec1.15
+
+ com.github.luben
+ zstd-jni
+ 1.5.1-1
+
+
+ org.apache.orc
+ orc-core
+ 1.7.1
+
+
+ org.apache.hadoop
+ hadoop-common
+ 3.3.1
+
+
+ org.apache.hadoop
+ hadoop-client-runtime
+ 3.3.1
+
+
+ com.google.protobuf
+ protobuf-java
+ ${protobuf.version}
+
+
+ io.grpc
+ grpc-netty-shaded
+ ${grpc.version}
+
+
+ io.grpc
+ grpc-protobuf
+ ${grpc.version}
+
+
+ io.grpc
+ grpc-stub
+ ${grpc.version}
+
+
+ io.grpc
+ grpc-services
+ ${grpc.version}
+
+
+ io.grpc
+ grpc-testing
+ ${grpc.version}
+
+
+ javax.annotation
+ javax.annotation-api
+ 1.3.2
+
+
+ com.google.protobuf
+ protobuf-java-util
+ ${protobuf.version}
+ maven-clean-plugin3.1.0maven-resources-plugin3.0.2maven-compiler-plugin3.8.01111-verbose-Xlint:allmaven-surefire-plugin2.22.2maven-failsafe-plugin2.22.2maven-jar-plugin3.0.2maven-install-plugin2.5.2maven-deploy-plugin2.8.2maven-site-plugin3.7.1maven-project-info-reports-plugin3.0.0
+
+ maven-dependency-plugin
+ 3.1.2
+ maven-assembly-plugin3.3.0
- org.softwareheritage.graph.server.App
+ org.softwareheritage.graph.rpc.GraphServerjar-with-dependenciesfalsemake-assemblypackagesinglecom.diffplug.spotlessspotless-maven-plugin
- 2.4.1
+ 2.22.1*.md.gitignoretrue44.16.0.coding-style.xmlpl.project13.mavengit-commit-id-plugin3.0.1get-the-git-infosrevisioninitializetruetruetruetruev*git.closest.tag.name^vtrue
-
-
-
-
+
+ maven-source-plugin
+ 2.1.1
+
+
+ bundle-sources
+ package
+
+ jar-no-fork
+ test-jar-no-fork
+
+
+
+ org.apache.maven.pluginsmaven-javadoc-plugin
- 3.1.1
+ 3.3.1
+
+
+ resource-bundles
+ package
+
+
+ resource-bundle
+
+
+ test-resource-bundle
+
+
+ false
+
+
+
+ javadoc-jar
+ package
+
+ jar
+
+
+
+
+ true
+
+ it.unimi.dsi:webgraph-big:*
+
+
+ https://webgraph.di.unimi.it/docs-big/
+ https://dsiutils.di.unimi.it/docs/
+ https://fastutil.di.unimi.it/docs/
+ https://law.di.unimi.it/software/law-docs/
+
+
+
+ implSpec
+ a
+ Implementation Requirements:
+
+
+ implNote
+ a
+ Implementation Note:
+
+
+
+
+
+ org.xolstice.maven.plugins
+ protobuf-maven-plugin
+ 0.6.1
+
+ com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier}
+ grpc-java
+ io.grpc:protoc-gen-grpc-java:${grpc.version}:exe:${os.detected.classifier}
+
+
+
+
+ compile
+ compile-custom
+ test-compile
+ test-compile-custom
+
+
+
-
+
+
+ kr.motd.maven
+ os-maven-plugin
+ 1.6.2
+
+
+
diff --git a/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java b/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java
index 737148a..a91276f 100644
--- a/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java
+++ b/java/src/main/java/org/softwareheritage/graph/AllowedEdges.java
@@ -1,74 +1,98 @@
+/*
+ * Copyright (c) 2019-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import java.util.ArrayList;
/**
* Edge restriction based on node types, used when visiting the graph.
*
* Software Heritage
* graph contains multiple node types (contents, directories, revisions, ...) and restricting
* the traversal to specific node types is necessary for many querying operations:
* use cases.
*
* @author The Software Heritage developers
*/
public class AllowedEdges {
/**
* 2D boolean matrix storing access rights for all combination of src/dst node types (first
* dimension is source, second dimension is destination), when edge restriction is not enforced this
* array is set to null for early bypass.
*/
public boolean[][] restrictedTo;
/**
* Constructor.
*
* @param edgesFmt a formatted string describing allowed
* edges
*/
public AllowedEdges(String edgesFmt) {
- int nbNodeTypes = Node.Type.values().length;
+ int nbNodeTypes = SwhType.values().length;
this.restrictedTo = new boolean[nbNodeTypes][nbNodeTypes];
// Special values (null, empty, "*")
if (edgesFmt == null || edgesFmt.isEmpty()) {
return;
}
if (edgesFmt.equals("*")) {
// Allows for quick bypass (with simple null check) when no edge restriction
restrictedTo = null;
return;
}
// Format: "src1:dst1,src2:dst2,[...]"
String[] edgeTypes = edgesFmt.split(",");
for (String edgeType : edgeTypes) {
String[] nodeTypes = edgeType.split(":");
if (nodeTypes.length != 2) {
throw new IllegalArgumentException("Cannot parse edge type: " + edgeType);
}
- ArrayList srcTypes = Node.Type.parse(nodeTypes[0]);
- ArrayList dstTypes = Node.Type.parse(nodeTypes[1]);
- for (Node.Type srcType : srcTypes) {
- for (Node.Type dstType : dstTypes) {
+            ArrayList<SwhType> srcTypes = SwhType.parse(nodeTypes[0]);
+            ArrayList<SwhType> dstTypes = SwhType.parse(nodeTypes[1]);
+ for (SwhType srcType : srcTypes) {
+ for (SwhType dstType : dstTypes) {
restrictedTo[srcType.ordinal()][dstType.ordinal()] = true;
}
}
}
}
/**
* Checks if a given edge can be followed during graph traversal.
*
* @param srcType edge source type
* @param dstType edge destination type
* @return true if allowed and false otherwise
*/
- public boolean isAllowed(Node.Type srcType, Node.Type dstType) {
+ public boolean isAllowed(SwhType srcType, SwhType dstType) {
if (restrictedTo == null)
return true;
return restrictedTo[srcType.ordinal()][dstType.ordinal()];
}
+
+ /**
+ * Return a new AllowedEdges instance with reversed edge restrictions. e.g. "src1:dst1,src2:dst2"
+ * becomes "dst1:src1,dst2:src2"
+ *
+ * @return a new AllowedEdges instance with reversed edge restrictions
+ */
+ public AllowedEdges reverse() {
+ AllowedEdges reversed = new AllowedEdges(null);
+ reversed.restrictedTo = new boolean[restrictedTo.length][restrictedTo[0].length];
+ for (int i = 0; i < restrictedTo.length; i++) {
+ for (int j = 0; j < restrictedTo[0].length; j++) {
+ reversed.restrictedTo[i][j] = restrictedTo[j][i];
+ }
+ }
+ return reversed;
+ }
}
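A minimal usage sketch for the class above (illustrative only: the class name
`AllowedEdgesExample` and the chosen edge specification are invented, the
`AllowedEdges`/`SwhType` calls are the ones defined in this file):

```java
import org.softwareheritage.graph.AllowedEdges;
import org.softwareheritage.graph.SwhType;

public class AllowedEdgesExample {
    public static void main(String[] args) {
        // Allow only release->revision and revision->revision edges.
        AllowedEdges edges = new AllowedEdges("rel:rev,rev:rev");
        System.out.println(edges.isAllowed(SwhType.REL, SwhType.REV)); // true
        System.out.println(edges.isAllowed(SwhType.DIR, SwhType.CNT)); // false
        // reverse() swaps sources and destinations, e.g. to reuse a forward
        // specification when traversing the transposed graph.
        System.out.println(edges.reverse().isAllowed(SwhType.REV, SwhType.REL)); // true
    }
}
```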
diff --git a/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java b/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java
index 40f473a..d80cae4 100644
--- a/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java
+++ b/java/src/main/java/org/softwareheritage/graph/AllowedNodes.java
@@ -1,50 +1,57 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
/**
- * TODO
+ * Node type restriction, useful to implement filtering of returned nodes during traversal.
*
* @author The Software Heritage developers
*/
public class AllowedNodes {
public boolean[] restrictedTo;
/**
* Constructor.
*
* @param nodesFmt a formatted string describing allowed nodes
*/
public AllowedNodes(String nodesFmt) {
- int nbNodeTypes = Node.Type.values().length;
+ int nbNodeTypes = SwhType.values().length;
this.restrictedTo = new boolean[nbNodeTypes];
// Special values (null, empty, "*")
if (nodesFmt == null || nodesFmt.isEmpty()) {
return;
}
if (nodesFmt.equals("*")) {
// Allows for quick bypass (with simple null check) when no node restriction
restrictedTo = null;
return;
}
// Format: "nodeType1,nodeType2,[...]"
String[] nodeTypesStr = nodesFmt.split(",");
for (String nodeTypeStr : nodeTypesStr) {
- for (Node.Type nodeType : Node.Type.parse(nodeTypeStr)) {
- this.restrictedTo[Node.Type.toInt(nodeType)] = true;
+ for (SwhType nodeType : SwhType.parse(nodeTypeStr)) {
+ this.restrictedTo[SwhType.toInt(nodeType)] = true;
}
}
}
/**
* Checks if a given node type is allowed.
*
* @param nodeType node type to check
* @return true if allowed and false otherwise
*/
- public boolean isAllowed(Node.Type nodeType) {
+ public boolean isAllowed(SwhType nodeType) {
if (restrictedTo == null)
return true;
- return restrictedTo[Node.Type.toInt(nodeType)];
+ return restrictedTo[SwhType.toInt(nodeType)];
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java b/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java
deleted file mode 100644
index be19956..0000000
--- a/java/src/main/java/org/softwareheritage/graph/BidirectionalImmutableGraph.java
+++ /dev/null
@@ -1,128 +0,0 @@
-package org.softwareheritage.graph;
-
-import it.unimi.dsi.big.webgraph.ImmutableGraph;
-import it.unimi.dsi.big.webgraph.LazyLongIterator;
-import it.unimi.dsi.big.webgraph.Transform;
-import it.unimi.dsi.fastutil.longs.LongIterator;
-
-/**
- * A directed immutable graph which can be iterated in both directions (forward and backward). It
- * exposes the backward equivalents of the ImmutableGraph primitives (indegree() and
- * predecessors()). This is implemented by passing two graphs, one in the forward and one in the
- * backward direction.
- */
-public class BidirectionalImmutableGraph extends ImmutableGraph {
- private final ImmutableGraph forwardGraph;
- private final ImmutableGraph backwardGraph;
-
- /**
- * Creates a bidirectional immutable graph
- *
- * @param forwardGraph The graph in the forward direction
- * @param backwardGraph The graph in the backward direction
- */
- protected BidirectionalImmutableGraph(ImmutableGraph forwardGraph, ImmutableGraph backwardGraph) {
- this.forwardGraph = forwardGraph;
- this.backwardGraph = backwardGraph;
- }
-
- @Override
- public long numNodes() {
- assert forwardGraph.numNodes() == backwardGraph.numNodes();
- return this.forwardGraph.numNodes();
- }
-
- @Override
- public long numArcs() {
- assert forwardGraph.numArcs() == backwardGraph.numArcs();
- return this.forwardGraph.numArcs();
- }
-
- @Override
- public boolean randomAccess() {
- return this.forwardGraph.randomAccess() && this.backwardGraph.randomAccess();
- }
-
- @Override
- public boolean hasCopiableIterators() {
- return forwardGraph.hasCopiableIterators() && backwardGraph.hasCopiableIterators();
- }
-
- @Override
- public BidirectionalImmutableGraph copy() {
- return new BidirectionalImmutableGraph(this.forwardGraph.copy(), this.backwardGraph.copy());
- }
-
- /**
- * Returns the transposed version of the bidirectional graph. Successors become predecessors, and
- * vice-versa.
- */
- public BidirectionalImmutableGraph transpose() {
- return new BidirectionalImmutableGraph(backwardGraph, forwardGraph);
- }
-
- /**
- * Returns the symmetric version of the bidirectional graph. It returns the (lazy) union of the
- * forward graph and the backward graph. This is equivalent to removing the directionality of the
- * edges: the successors of a node are also its predecessors.
- *
- * @return a symmetric, undirected BidirectionalImmutableGraph.
- */
- public BidirectionalImmutableGraph symmetrize() {
- ImmutableGraph symmetric = Transform.union(forwardGraph, backwardGraph);
- return new BidirectionalImmutableGraph(symmetric, symmetric);
- }
-
- /**
- * Returns the simplified version of the bidirectional graph. Works like symmetrize(), but also
- * removes the loop edges.
- *
- * @return a simplified (loopless and symmetric) BidirectionalImmutableGraph
- */
- public BidirectionalImmutableGraph simplify() {
- ImmutableGraph simplified = Transform.simplify(forwardGraph, backwardGraph);
- return new BidirectionalImmutableGraph(simplified, simplified);
- }
-
- /** Returns the outdegree of a node */
- @Override
- public long outdegree(long l) {
- return forwardGraph.outdegree(l);
- }
-
- /** Returns the indegree of a node */
- public long indegree(long l) {
- return backwardGraph.outdegree(l);
- }
-
- /** Returns a lazy iterator over the successors of a given node. */
- @Override
- public LazyLongIterator successors(long nodeId) {
- return forwardGraph.successors(nodeId);
- }
-
- /** Returns a lazy iterator over the predecessors of a given node. */
- public LazyLongIterator predecessors(long nodeId) {
- return backwardGraph.successors(nodeId);
- }
-
- /** Returns a reference to an array containing the predecessors of a given node. */
- public long[][] predecessorBigArray(long x) {
- return backwardGraph.successorBigArray(x);
- }
-
- /** Returns an iterator enumerating the indegrees of the nodes of this graph. */
- public LongIterator indegrees() {
- return backwardGraph.outdegrees();
- }
-
- /** Returns the underlying ImmutableGraph in the forward direction. */
- public ImmutableGraph getForwardGraph() {
- return forwardGraph;
- }
-
- /** Returns the underlying ImmutableGraph in the backward direction. */
- public ImmutableGraph getBackwardGraph() {
- return backwardGraph;
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/Entry.java b/java/src/main/java/org/softwareheritage/graph/Entry.java
deleted file mode 100644
index a2d3f5a..0000000
--- a/java/src/main/java/org/softwareheritage/graph/Entry.java
+++ /dev/null
@@ -1,193 +0,0 @@
-package org.softwareheritage.graph;
-
-import java.io.*;
-import java.util.ArrayList;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.PropertyNamingStrategy;
-
-public class Entry {
- private Graph graph;
-
- public void load_graph(String graphBasename) throws IOException {
- System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = Graph.loadMapped(graphBasename);
- System.err.println("Graph loaded.");
- }
-
- public Graph get_graph() {
- return graph.copy();
- }
-
- public String stats() {
- try {
- Stats stats = new Stats(graph.getPath());
- ObjectMapper objectMapper = new ObjectMapper();
- objectMapper.setPropertyNamingStrategy(PropertyNamingStrategy.SNAKE_CASE);
- return objectMapper.writeValueAsString(stats);
- } catch (IOException e) {
- throw new RuntimeException("Cannot read stats: " + e);
- }
- }
-
- public void check_swhid(String src) {
- graph.getNodeId(new SWHID(src));
- }
-
- private int count_visitor(NodeCountVisitor f, long srcNodeId) {
- int[] count = {0};
- f.accept(srcNodeId, (node) -> {
- count[0]++;
- });
- return count[0];
- }
-
- public int count_leaves(String direction, String edgesFmt, String src, long maxEdges) {
- long srcNodeId = graph.getNodeId(new SWHID(src));
- Traversal t = new Traversal(graph.copy(), direction, edgesFmt, maxEdges);
- return count_visitor(t::leavesVisitor, srcNodeId);
- }
-
- public int count_neighbors(String direction, String edgesFmt, String src, long maxEdges) {
- long srcNodeId = graph.getNodeId(new SWHID(src));
- Traversal t = new Traversal(graph.copy(), direction, edgesFmt, maxEdges);
- return count_visitor(t::neighborsVisitor, srcNodeId);
- }
-
- public int count_visit_nodes(String direction, String edgesFmt, String src, long maxEdges) {
- long srcNodeId = graph.getNodeId(new SWHID(src));
- Traversal t = new Traversal(graph.copy(), direction, edgesFmt, maxEdges);
- return count_visitor(t::visitNodesVisitor, srcNodeId);
- }
-
- public QueryHandler get_handler(String clientFIFO) {
- return new QueryHandler(graph.copy(), clientFIFO);
- }
-
- private interface NodeCountVisitor {
- void accept(long nodeId, Traversal.NodeIdConsumer consumer);
- }
-
- public class QueryHandler {
- Graph graph;
- BufferedWriter out;
- String clientFIFO;
-
- public QueryHandler(Graph graph, String clientFIFO) {
- this.graph = graph;
- this.clientFIFO = clientFIFO;
- this.out = null;
- }
-
- public void writeNode(SWHID swhid) {
- try {
- out.write(swhid.toString() + "\n");
- } catch (IOException e) {
- throw new RuntimeException("Cannot write response to client: " + e);
- }
- }
-
- public void writeEdge(SWHID src, SWHID dst) {
- try {
- out.write(src.toString() + " " + dst.toString() + "\n");
- } catch (IOException e) {
- throw new RuntimeException("Cannot write response to client: " + e);
- }
- }
-
- public void open() {
- try {
- FileOutputStream file = new FileOutputStream(this.clientFIFO);
- this.out = new BufferedWriter(new OutputStreamWriter(file));
- } catch (IOException e) {
- throw new RuntimeException("Cannot open client FIFO: " + e);
- }
- }
-
- public void close() {
- try {
- out.close();
- } catch (IOException e) {
- throw new RuntimeException("Cannot write response to client: " + e);
- }
- }
-
- public void leaves(String direction, String edgesFmt, String src, long maxEdges, String returnTypes) {
- long srcNodeId = graph.getNodeId(new SWHID(src));
- open();
- Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges, returnTypes);
- for (Long nodeId : t.leaves(srcNodeId)) {
- writeNode(graph.getSWHID(nodeId));
- }
- close();
- }
-
- public void neighbors(String direction, String edgesFmt, String src, long maxEdges, String returnTypes) {
- long srcNodeId = graph.getNodeId(new SWHID(src));
- open();
- Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges, returnTypes);
- for (Long nodeId : t.neighbors(srcNodeId)) {
- writeNode(graph.getSWHID(nodeId));
- }
- close();
- }
-
- public void visit_nodes(String direction, String edgesFmt, String src, long maxEdges, String returnTypes) {
- long srcNodeId = graph.getNodeId(new SWHID(src));
- open();
- Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges, returnTypes);
- for (Long nodeId : t.visitNodes(srcNodeId)) {
- writeNode(graph.getSWHID(nodeId));
- }
- close();
- }
-
- public void visit_edges(String direction, String edgesFmt, String src, long maxEdges, String returnTypes) {
- long srcNodeId = graph.getNodeId(new SWHID(src));
- open();
- Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges);
- t.visitNodesVisitor(srcNodeId, null, (srcId, dstId) -> {
- writeEdge(graph.getSWHID(srcId), graph.getSWHID(dstId));
- });
- close();
- }
-
- public void walk(String direction, String edgesFmt, String algorithm, String src, String dst, long maxEdges,
- String returnTypes) {
- long srcNodeId = graph.getNodeId(new SWHID(src));
- open();
- ArrayList res;
- Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges, returnTypes);
- if (dst.matches("ori|snp|rel|rev|dir|cnt")) {
- Node.Type dstType = Node.Type.fromStr(dst);
- res = t.walk(srcNodeId, dstType, algorithm);
- } else {
- long dstNodeId = graph.getNodeId(new SWHID(dst));
- res = t.walk(srcNodeId, dstNodeId, algorithm);
- }
- for (Long nodeId : res) {
- writeNode(graph.getSWHID(nodeId));
- }
- close();
- }
-
- public void random_walk(String direction, String edgesFmt, int retries, String src, String dst, long maxEdges,
- String returnTypes) {
- long srcNodeId = graph.getNodeId(new SWHID(src));
- open();
- ArrayList res;
- Traversal t = new Traversal(graph, direction, edgesFmt, maxEdges, returnTypes);
- if (dst.matches("ori|snp|rel|rev|dir|cnt")) {
- Node.Type dstType = Node.Type.fromStr(dst);
- res = t.randomWalk(srcNodeId, dstType, retries);
- } else {
- long dstNodeId = graph.getNodeId(new SWHID(dst));
- res = t.randomWalk(srcNodeId, dstNodeId, retries);
- }
- for (Long nodeId : res) {
- writeNode(graph.getSWHID(nodeId));
- }
- close();
- }
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/Graph.java b/java/src/main/java/org/softwareheritage/graph/Graph.java
deleted file mode 100644
index 8d9acf1..0000000
--- a/java/src/main/java/org/softwareheritage/graph/Graph.java
+++ /dev/null
@@ -1,304 +0,0 @@
-package org.softwareheritage.graph;
-
-import it.unimi.dsi.big.webgraph.ImmutableGraph;
-import it.unimi.dsi.big.webgraph.LazyLongIterator;
-import it.unimi.dsi.logging.ProgressLogger;
-import org.softwareheritage.graph.maps.NodeIdMap;
-import org.softwareheritage.graph.maps.NodeTypesMap;
-
-import java.io.IOException;
-
-/**
- * Main class storing the compressed graph and node id mappings.
- *
- * The compressed graph is stored using the WebGraph
- * ecosystem. Additional mappings are necessary because Software Heritage uses string based persistent
- * identifiers (SWHID) while WebGraph uses integers internally. These two mappings (long id
- * ↔ SWHID) are used for the input (users refer to the graph using SWHID) and the output
- * (convert back to SWHID for users results). However, since graph traversal can be restricted
- * depending on the node type (see {@link AllowedEdges}), a long id → node type map is stored
- * as well to avoid a full SWHID lookup.
- *
- * @author The Software Heritage developers
- * @see org.softwareheritage.graph.AllowedEdges
- * @see org.softwareheritage.graph.maps.NodeIdMap
- * @see org.softwareheritage.graph.maps.NodeTypesMap
- */
-
-public class Graph extends ImmutableGraph {
- /**
- * Bidirectional graph containing two compressed {@link it.unimi.dsi.big.webgraph.BVGraph} one for
- * each direction
- */
- BidirectionalImmutableGraph graph;
-
- /** Path and basename of the compressed graph */
- String path;
- /** Mapping long id ↔ SWHIDs */
- NodeIdMap nodeIdMap;
- /** Mapping long id → node types */
- NodeTypesMap nodeTypesMap;
-
- /**
- * Constructor.
- *
- * @param path path and basename of the compressed graph to load
- */
-
- private Graph(String path) throws IOException {
- loadInternal(path, null, LoadMethod.MAPPED);
- }
-
- /**
- * Loading mechanisms
- */
-
- enum LoadMethod {
- MEMORY, MAPPED, OFFLINE,
- }
-
- protected Graph loadInternal(String path, ProgressLogger pl, LoadMethod method) throws IOException {
- this.path = path;
- ImmutableGraph direct = null;
- ImmutableGraph transposed = null;
- if (method == LoadMethod.MEMORY) {
- direct = ImmutableGraph.load(path, pl);
- transposed = ImmutableGraph.load(path + "-transposed", pl);
- } else if (method == LoadMethod.MAPPED) {
- direct = ImmutableGraph.load(path, pl);
- transposed = ImmutableGraph.loadMapped(path + "-transposed", pl);
- } else if (method == LoadMethod.OFFLINE) {
- direct = ImmutableGraph.loadOffline(path, pl);
- transposed = ImmutableGraph.loadOffline(path + "-transposed", pl);
- }
- this.graph = new BidirectionalImmutableGraph(direct, transposed);
- this.nodeTypesMap = new NodeTypesMap(path);
- this.nodeIdMap = new NodeIdMap(path, numNodes());
- return this;
- }
-
- protected Graph() {
- }
-
- public static Graph load(String path, ProgressLogger pl) throws IOException {
- return new Graph().loadInternal(path, pl, LoadMethod.MEMORY);
- }
-
- public static Graph loadMapped(String path, ProgressLogger pl) throws IOException {
- return new Graph().loadInternal(path, pl, LoadMethod.MAPPED);
- }
-
- public static Graph loadOffline(String path, ProgressLogger pl) throws IOException {
- return new Graph().loadInternal(path, null, LoadMethod.OFFLINE);
- }
-
- public static Graph load(String path) throws IOException {
- return new Graph().loadInternal(path, null, LoadMethod.MEMORY);
- }
-
- public static Graph loadMapped(String path) throws IOException {
- return new Graph().loadInternal(path, null, LoadMethod.MAPPED);
- }
-
- public static Graph loadOffline(String path) throws IOException {
- return new Graph().loadInternal(path, null, LoadMethod.OFFLINE);
- }
-
- /**
- * Constructor used for copy()
- */
- protected Graph(BidirectionalImmutableGraph graph, String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) {
- this.graph = graph;
- this.path = path;
- this.nodeIdMap = nodeIdMap;
- this.nodeTypesMap = nodeTypesMap;
- }
-
- /**
- * Return a flyweight copy of the graph.
- */
- @Override
- public Graph copy() {
- return new Graph(this.graph.copy(), this.path, this.nodeIdMap, this.nodeTypesMap);
- }
-
- @Override
- public boolean randomAccess() {
- return graph.randomAccess();
- }
-
- /**
- * Return a transposed version of the graph.
- */
- public Graph transpose() {
- return new Graph(this.graph.transpose(), this.path, this.nodeIdMap, this.nodeTypesMap);
- }
-
- /**
- * Return a symmetric version of the graph.
- */
- public Graph symmetrize() {
- return new Graph(this.graph.symmetrize(), this.path, this.nodeIdMap, this.nodeTypesMap);
- }
-
- /**
- * Cleans up graph resources after use.
- */
- public void cleanUp() throws IOException {
- nodeIdMap.close();
- }
-
- /**
- * Returns number of nodes in the graph.
- *
- * @return number of nodes in the graph
- */
- @Override
- public long numNodes() {
- return graph.numNodes();
- }
-
- /**
- * Returns number of edges in the graph.
- *
- * @return number of edges in the graph
- */
- @Override
- public long numArcs() {
- return graph.numArcs();
- }
-
- /**
- * Returns lazy iterator of successors of a node.
- *
- * @param nodeId node specified as a long id
- * @return lazy iterator of successors of the node, specified as a
- * WebGraph LazyLongIterator
- */
- @Override
- public LazyLongIterator successors(long nodeId) {
- return graph.successors(nodeId);
- }
-
- /**
- * Returns lazy iterator of successors of a node while following a specific set of edge types.
- *
- * @param nodeId node specified as a long id
- * @param allowedEdges the specification of which edges can be traversed
- * @return lazy iterator of successors of the node, specified as a
- * WebGraph LazyLongIterator
- */
- public LazyLongIterator successors(long nodeId, AllowedEdges allowedEdges) {
- if (allowedEdges.restrictedTo == null) {
- // All edges are allowed, bypass edge check
- return this.successors(nodeId);
- } else {
- LazyLongIterator allSuccessors = this.successors(nodeId);
- Graph thisGraph = this;
- return new LazyLongIterator() {
- @Override
- public long nextLong() {
- long neighbor;
- while ((neighbor = allSuccessors.nextLong()) != -1) {
- if (allowedEdges.isAllowed(thisGraph.getNodeType(nodeId), thisGraph.getNodeType(neighbor))) {
- return neighbor;
- }
- }
- return -1;
- }
-
- @Override
- public long skip(final long n) {
- long i;
- for (i = 0; i < n && nextLong() != -1; i++)
- ;
- return i;
- }
- };
- }
- }
-
- /**
- * Returns the outdegree of a node.
- *
- * @param nodeId node specified as a long id
- * @return outdegree of a node
- */
- @Override
- public long outdegree(long nodeId) {
- return graph.outdegree(nodeId);
- }
-
- /**
- * Returns lazy iterator of predecessors of a node.
- *
- * @param nodeId node specified as a long id
- * @return lazy iterator of predecessors of the node, specified as a
- * WebGraph LazyLongIterator
- */
- public LazyLongIterator predecessors(long nodeId) {
- return graph.predecessors(nodeId);
- }
-
- /**
- * Returns the indegree of a node.
- *
- * @param nodeId node specified as a long id
- * @return indegree of a node
- */
- public long indegree(long nodeId) {
- return graph.indegree(nodeId);
- }
-
- /**
- * Returns the underlying BidirectionalImmutableGraph.
- *
- * @return WebGraph ImmutableGraph
- */
- public ImmutableGraph getGraph() {
- return this.graph;
- }
-
- /**
- * Returns the graph full path.
- *
- * @return graph full path
- */
- public String getPath() {
- return path;
- }
-
- /**
- * Converts {@link SWHID} node to long.
- *
- * @param swhid node specified as a {@link SWHID}
- * @return internal long node id
- * @see SWHID
- */
- public long getNodeId(SWHID swhid) {
- return nodeIdMap.getNodeId(swhid);
- }
-
- /**
- * Converts long id node to {@link SWHID}.
- *
- * @param nodeId node specified as a long id
- * @return external SWHID
- * @see SWHID
- */
- public SWHID getSWHID(long nodeId) {
- return nodeIdMap.getSWHID(nodeId);
- }
-
- /**
- * Returns node type.
- *
- * @param nodeId node specified as a long id
- * @return corresponding node type
- * @see org.softwareheritage.graph.Node.Type
- */
- public Node.Type getNodeType(long nodeId) {
- return nodeTypesMap.getType(nodeId);
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/Node.java b/java/src/main/java/org/softwareheritage/graph/Node.java
deleted file mode 100644
index e4a61d3..0000000
--- a/java/src/main/java/org/softwareheritage/graph/Node.java
+++ /dev/null
@@ -1,116 +0,0 @@
-package org.softwareheritage.graph;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-/**
- * A node in the Software Heritage graph.
- *
- * @author The Software Heritage developers
- */
-
-public class Node {
- /**
- * Software Heritage graph node types, as described in the
- * data model.
- */
- public enum Type {
- /** Content node */
- CNT,
- /** Directory node */
- DIR,
- /** Origin node */
- ORI,
- /** Release node */
- REL,
- /** Revision node */
- REV,
- /** Snapshot node */
- SNP;
-
- /**
- * Converts integer to corresponding SWH node type.
- *
- * @param intType node type represented as an integer
- * @return the corresponding {@link Node.Type} value
- * @see org.softwareheritage.graph.Node.Type
- */
- public static Node.Type fromInt(int intType) {
- switch (intType) {
- case 0:
- return CNT;
- case 1:
- return DIR;
- case 2:
- return ORI;
- case 3:
- return REL;
- case 4:
- return REV;
- case 5:
- return SNP;
- }
- return null;
- }
-
- /**
- * Converts node types to the corresponding int value
- *
- * @param type node type as an enum
- * @return the corresponding int value
- */
- public static int toInt(Node.Type type) {
- switch (type) {
- case CNT:
- return 0;
- case DIR:
- return 1;
- case ORI:
- return 2;
- case REL:
- return 3;
- case REV:
- return 4;
- case SNP:
- return 5;
- }
- throw new IllegalArgumentException("Unknown node type: " + type);
- }
-
- /**
- * Converts string to corresponding SWH node type.
- *
- * @param strType node type represented as a string
- * @return the corresponding {@link Node.Type} value
- * @see org.softwareheritage.graph.Node.Type
- */
- public static Node.Type fromStr(String strType) {
- if (!strType.matches("cnt|dir|ori|rel|rev|snp")) {
- throw new IllegalArgumentException("Unknown node type: " + strType);
- }
- return Node.Type.valueOf(strType.toUpperCase());
- }
-
- /**
- * Parses SWH node type possible values from formatted string (see the
- * API syntax).
- *
- * @param strFmtType node types represented as a formatted string
- * @return a list containing the {@link Node.Type} values
- * @see org.softwareheritage.graph.Node.Type
- */
- public static ArrayList parse(String strFmtType) {
- ArrayList types = new ArrayList<>();
-
- if (strFmtType.equals("*")) {
- List nodeTypes = Arrays.asList(Node.Type.values());
- types.addAll(nodeTypes);
- } else {
- types.add(Node.Type.fromStr(strFmtType));
- }
-
- return types;
- }
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/NodesFiltering.java b/java/src/main/java/org/softwareheritage/graph/NodesFiltering.java
deleted file mode 100644
index 3f3e7a3..0000000
--- a/java/src/main/java/org/softwareheritage/graph/NodesFiltering.java
+++ /dev/null
@@ -1,107 +0,0 @@
-package org.softwareheritage.graph;
-
-import java.util.ArrayList;
-
-/**
- *
NodesFiltering
- *
- * class that manages the filtering of nodes that have been returned after a visit of the graph.
- * parameterized by a string that represents either no filtering (*) or a set of node types.
- *
- *
- *
- *
- *
graph/visit/nodes/swh:1:rel:0000000000000000000000000000000000000010 return_types==rev will
- * only return 'rev' nodes.
- *
- *
graph/visit/nodes/swh:1:rel:0000000000000000000000000000000000000010
- * return_types==rev,snp,cnt will only return 'rev' 'snp' 'cnt' nodes.
- *
- *
graph/visit/nodes/swh:1:rel:0000000000000000000000000000000000000010 return_types==* will
- * return all the nodes.
- *
- *
- * How to use NodesFiltering :
- *
- *
- * {@code
- * Long id1 = .... // graph.getNodeType(id1) == CNT
- * Long id2 = .... // graph.getNodeType(id2) == SNP
- * Long id3 = .... // graph.getNodeType(id3) == ORI
- * ArrayList nodeIds = nez ArrayList();
- * nodeIds.add(id1); nodeIds.add(id2); nodeIds.add(id3);
- *
- * NodeFiltering nds = new NodesFiltering("snp,ori"); // we allow only snp node types to be shown
- * System.out.println(nds.filterByNodeTypes(nodeIds,graph)); // will print id2, id3
- *
- * nds = NodesFiltering("*");
- * System.out.println(nds.filterByNodeTypes(nodeIds,graph)); // will print id1, id2 id3
- *
- * }
- *
- */
-
-public class NodesFiltering {
-
- boolean restricted;
- ArrayList allowedNodesTypes;
-
- /**
- * Default constructor, in order to handle the * case (all types of nodes are allowed to be
- * returned). allowedNodesTypes will contains [SNP,CNT....] all types of nodes.
- *
- */
- public NodesFiltering() {
- restricted = false;
- allowedNodesTypes = Node.Type.parse("*");
- }
-
- /**
- * Constructor
- *
- * @param strTypes a formatted string describing the types of nodes we want to allow to be shown.
- *
- * NodesFilterind("cnt,snp") will set allowedNodesTypes to [CNT,SNP]
- *
- */
- public NodesFiltering(String strTypes) {
- restricted = true;
- allowedNodesTypes = new ArrayList();
- String[] types = strTypes.split(",");
- for (String type : types) {
- allowedNodesTypes.add(Node.Type.fromStr(type));
- }
- }
-
- /**
- * Check if the type given in parameter is in the list of allowed types.
- *
- * @param typ the type of the node.
- */
- public boolean typeIsAllowed(Node.Type typ) {
- return this.allowedNodesTypes.contains(typ);
- }
-
- /**
- *
- * the function that filters the nodes returned, we browse the list of nodes found after a visit and
- * we create a new list with only the nodes that have a type that is contained in the list of
- * allowed types (allowedNodesTypes)
- *
- *
- * @param nodeIds the nodes founded during the visit
- * @param g the graph in order to find the types of nodes from their id in nodeIds
- * @return a new list with the id of node which have a type in allowedTypes
- *
- *
- */
- public ArrayList filterByNodeTypes(ArrayList nodeIds, Graph g) {
- ArrayList filteredNodes = new ArrayList();
- for (Long node : nodeIds) {
- if (this.typeIsAllowed(g.getNodeType(node))) {
- filteredNodes.add(node);
- }
- }
- return filteredNodes;
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/SWHID.java b/java/src/main/java/org/softwareheritage/graph/SWHID.java
index 16aff83..3ccb90a 100644
--- a/java/src/main/java/org/softwareheritage/graph/SWHID.java
+++ b/java/src/main/java/org/softwareheritage/graph/SWHID.java
@@ -1,118 +1,125 @@
+/*
+ * Copyright (c) 2019 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import com.fasterxml.jackson.annotation.JsonValue;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.binary.Hex;
/**
* A Software Heritage persistent identifier (SWHID), see persistent
* identifier documentation.
*
* @author The Software Heritage developers
*/
public class SWHID {
/** Fixed hash length of the SWHID */
public static final int HASH_LENGTH = 40;
/** Full SWHID as a string */
String swhid;
/** SWHID node type */
- Node.Type type;
+ SwhType type;
/**
* Constructor.
*
* @param swhid full SWHID as a string
*/
public SWHID(String swhid) {
this.swhid = swhid;
// SWHID format: 'swh:1:type:hash'
String[] parts = swhid.split(":");
if (parts.length != 4 || !parts[0].equals("swh") || !parts[1].equals("1")) {
throw new IllegalArgumentException("malformed SWHID: " + swhid);
}
- this.type = Node.Type.fromStr(parts[2]);
+ this.type = SwhType.fromStr(parts[2]);
if (!parts[3].matches("[0-9a-f]{" + HASH_LENGTH + "}")) {
throw new IllegalArgumentException("malformed SWHID: " + swhid);
}
}
/**
* Creates a SWHID from a compact binary representation.
*
* The binary format is specified in the Python module swh.graph.swhid:str_to_bytes .
*/
public static SWHID fromBytes(byte[] input) {
byte[] digest = new byte[20];
System.arraycopy(input, 2, digest, 0, digest.length);
- String swhidStr = String.format("swh:%d:%s:%s", input[0], Node.Type.fromInt(input[1]).toString().toLowerCase(),
+ String swhidStr = String.format("swh:%d:%s:%s", input[0], SwhType.fromInt(input[1]).toString().toLowerCase(),
Hex.encodeHexString(digest));
return new SWHID(swhidStr);
}
@Override
public boolean equals(Object otherObj) {
if (otherObj == this)
return true;
if (!(otherObj instanceof SWHID))
return false;
SWHID other = (SWHID) otherObj;
return swhid.equals(other.getSWHID());
}
@Override
public int hashCode() {
return swhid.hashCode();
}
@Override
public String toString() {
return swhid;
}
/**
* Converts SWHID to a compact binary representation.
*
* The binary format is specified in the Python module swh.graph.swhid:str_to_bytes .
*/
public byte[] toBytes() {
byte[] bytes = new byte[22];
byte[] digest;
bytes[0] = (byte) 1; // namespace version
- bytes[1] = (byte) Node.Type.toInt(this.type); // SWHID type
+ bytes[1] = (byte) SwhType.toInt(this.type); // SWHID type
try {
digest = Hex.decodeHex(this.swhid.substring(10)); // SHA1 hash
System.arraycopy(digest, 0, bytes, 2, digest.length);
} catch (DecoderException e) {
throw new IllegalArgumentException("invalid hex sequence in SWHID: " + this.swhid);
}
return bytes;
}
/**
* Returns full SWHID as a string.
*
* @return full SWHID string
*/
@JsonValue
public String getSWHID() {
return swhid;
}
/**
* Returns SWHID node type.
*
- * @return SWHID corresponding {@link Node.Type}
- * @see org.softwareheritage.graph.Node.Type
+ * @return SWHID corresponding {@link SwhType}
+ * @see SwhType
*/
- public Node.Type getType() {
+ public SwhType getType() {
return type;
}
}
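A small usage sketch for `SWHID` (illustrative; the hash below is made up, while
the constructor, `getType()`, `toBytes()` and `fromBytes()` are the methods
defined in this file):

```java
import org.softwareheritage.graph.SWHID;

public class SwhidExample {
    public static void main(String[] args) {
        // A SWHID is 'swh:1:<type>:<40 hex characters>'.
        SWHID swhid = new SWHID("swh:1:cnt:94a9ed024d3859793618152ea559a168bbcbb5e2");
        System.out.println(swhid.getType()); // CNT
        // toBytes()/fromBytes() round-trip through the 22-byte compact encoding.
        System.out.println(SWHID.fromBytes(swhid.toBytes()).equals(swhid)); // true
    }
}
```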
diff --git a/java/src/main/java/org/softwareheritage/graph/Stats.java b/java/src/main/java/org/softwareheritage/graph/Stats.java
deleted file mode 100644
index 1c1cb0f..0000000
--- a/java/src/main/java/org/softwareheritage/graph/Stats.java
+++ /dev/null
@@ -1,67 +0,0 @@
-package org.softwareheritage.graph;
-
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.util.Properties;
-
-/**
- * Statistics on the compressed graph.
- *
- * These statistics are not computed but directly read from
- * WebGraph generated .stats and .properties files.
- *
- * @author The Software Heritage developers
- */
-
-public class Stats {
- public Counts counts;
- public Ratios ratios;
- public Degree indegree;
- public Degree outdegree;
- /**
- * Constructor.
- *
- * @param graphPath path and basename of compressed graph
- */
- public Stats(String graphPath) throws IOException {
- Properties properties = new Properties();
- properties.load(new FileInputStream(graphPath + ".properties"));
- properties.load(new FileInputStream(graphPath + ".stats"));
-
- this.counts = new Counts();
- this.ratios = new Ratios();
- this.indegree = new Degree();
- this.outdegree = new Degree();
-
- this.counts.nodes = Long.parseLong(properties.getProperty("nodes"));
- this.counts.edges = Long.parseLong(properties.getProperty("arcs"));
- this.ratios.compression = Double.parseDouble(properties.getProperty("compratio"));
- this.ratios.bitsPerNode = Double.parseDouble(properties.getProperty("bitspernode"));
- this.ratios.bitsPerEdge = Double.parseDouble(properties.getProperty("bitsperlink"));
- this.ratios.avgLocality = Double.parseDouble(properties.getProperty("avglocality"));
- this.indegree.min = Long.parseLong(properties.getProperty("minindegree"));
- this.indegree.max = Long.parseLong(properties.getProperty("maxindegree"));
- this.indegree.avg = Double.parseDouble(properties.getProperty("avgindegree"));
- this.outdegree.min = Long.parseLong(properties.getProperty("minoutdegree"));
- this.outdegree.max = Long.parseLong(properties.getProperty("maxoutdegree"));
- this.outdegree.avg = Double.parseDouble(properties.getProperty("avgoutdegree"));
- }
-
- public static class Counts {
- public long nodes;
- public long edges;
- }
-
- public static class Ratios {
- public double compression;
- public double bitsPerNode;
- public double bitsPerEdge;
- public double avgLocality;
- }
-
- public static class Degree {
- public long min;
- public long max;
- public double avg;
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/Subgraph.java b/java/src/main/java/org/softwareheritage/graph/Subgraph.java
index 3e7e7fd..9cafc0b 100644
--- a/java/src/main/java/org/softwareheritage/graph/Subgraph.java
+++ b/java/src/main/java/org/softwareheritage/graph/Subgraph.java
@@ -1,224 +1,231 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.big.webgraph.NodeIterator;
import java.util.NoSuchElementException;
public class Subgraph extends ImmutableGraph {
- private final Graph underlyingGraph;
+ private final SwhBidirectionalGraph underlyingGraph;
public final AllowedNodes allowedNodeTypes;
private long nodeCount = -1;
/**
* Constructor.
*
*/
- public Subgraph(Graph underlyingGraph, AllowedNodes allowedNodeTypes) {
+ public Subgraph(SwhBidirectionalGraph underlyingGraph, AllowedNodes allowedNodeTypes) {
this.underlyingGraph = underlyingGraph.copy();
this.allowedNodeTypes = allowedNodeTypes;
}
/**
* Return a flyweight copy of the graph.
*/
@Override
public Subgraph copy() {
return new Subgraph(this.underlyingGraph.copy(), allowedNodeTypes);
}
@Override
public boolean randomAccess() {
return underlyingGraph.randomAccess();
}
/**
* Return a transposed version of the graph.
*/
public Subgraph transpose() {
return new Subgraph(underlyingGraph.transpose(), allowedNodeTypes);
}
/**
* Return a symmetric version of the graph.
*/
public Subgraph symmetrize() {
return new Subgraph(underlyingGraph.symmetrize(), allowedNodeTypes);
}
/**
* Returns number of nodes in the graph.
*
* @return number of nodes in the graph
*/
@Override
public long numNodes() {
if (nodeCount == -1) {
for (long i = 0; i < underlyingGraph.numNodes(); ++i) {
if (nodeExists(i))
++nodeCount;
}
}
return nodeCount;
}
/**
* Returns number of edges in the graph.
*
* @return number of edges in the graph
*/
@Override
public long numArcs() {
throw new UnsupportedOperationException("Cannot determine the number of arcs in a subgraph");
}
public long maxNodeNumber() {
return underlyingGraph.numNodes();
}
public boolean nodeExists(long node) {
return allowedNodeTypes.isAllowed(underlyingGraph.getNodeType(node));
}
/**
* Returns lazy iterator of successors of a node.
*
* @param nodeId node specified as a long id
* @return lazy iterator of successors of the node, specified as a
* WebGraph LazyLongIterator
*/
@Override
public LazyLongIterator successors(long nodeId) {
if (!nodeExists(nodeId)) {
throw new IllegalArgumentException("Node " + nodeId + " not in subgraph");
}
LazyLongIterator allSuccessors = underlyingGraph.successors(nodeId);
return new LazyLongIterator() {
@Override
public long nextLong() {
long neighbor;
while ((neighbor = allSuccessors.nextLong()) != -1) {
if (nodeExists(neighbor)) {
return neighbor;
}
}
return -1;
}
@Override
public long skip(final long n) {
long i;
for (i = 0; i < n && nextLong() != -1; i++)
;
return i;
}
};
}
/**
* Returns the outdegree of a node.
*
* @param nodeId node specified as a long id
* @return outdegree of a node
*/
@Override
public long outdegree(long nodeId) {
long deg = 0;
for (LazyLongIterator allSuccessors = successors(nodeId); allSuccessors.nextLong() != -1; ++deg)
;
return deg;
}
@Override
public NodeIterator nodeIterator() {
return new NodeIterator() {
final long n = numNodes();
long i = -1;
long done = 0;
@Override
public boolean hasNext() {
return done <= n;
}
@Override
public long nextLong() {
if (!hasNext())
throw new NoSuchElementException();
do {
++i;
if (i >= underlyingGraph.numNodes())
throw new NoSuchElementException();
} while (!nodeExists(i));
++done;
return i;
}
@Override
public long outdegree() {
return Subgraph.this.outdegree(i);
}
@Override
public LazyLongIterator successors() {
return Subgraph.this.successors(i);
}
};
}
/**
* Returns lazy iterator of predecessors of a node.
*
* @param nodeId node specified as a long id
* @return lazy iterator of predecessors of the node, specified as a
* WebGraph LazyLongIterator
*/
public LazyLongIterator predecessors(long nodeId) {
return this.transpose().successors(nodeId);
}
/**
* Returns the indegree of a node.
*
* @param nodeId node specified as a long id
* @return indegree of a node
*/
public long indegree(long nodeId) {
return this.transpose().outdegree(nodeId);
}
/**
* Converts {@link SWHID} node to long.
*
* @param swhid node specified as a {@link SWHID}
* @return internal long node id
* @see SWHID
*/
public long getNodeId(SWHID swhid) {
return underlyingGraph.getNodeId(swhid);
}
/**
* Converts long id node to {@link SWHID}.
*
* @param nodeId node specified as a long id
* @return external SWHID
* @see SWHID
*/
public SWHID getSWHID(long nodeId) {
return underlyingGraph.getSWHID(nodeId);
}
/**
* Returns node type.
*
* @param nodeId node specified as a long id
* @return corresponding node type
- * @see Node.Type
+ * @see SwhType
*/
- public Node.Type getNodeType(long nodeId) {
+ public SwhType getNodeType(long nodeId) {
return underlyingGraph.getNodeType(nodeId);
}
}
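A usage sketch of the subgraph view (illustrative: the graph basename and node
id are placeholders, while the constructors and traversal methods are the ones
shown above):

```java
import org.softwareheritage.graph.AllowedNodes;
import org.softwareheritage.graph.Subgraph;
import org.softwareheritage.graph.SwhBidirectionalGraph;

import it.unimi.dsi.big.webgraph.LazyLongIterator;

public class SubgraphExample {
    public static void main(String[] args) throws java.io.IOException {
        SwhBidirectionalGraph g = SwhBidirectionalGraph.loadMapped("compressed/graph");
        // Keep only directory and content nodes; successors of other types are skipped.
        Subgraph fsView = new Subgraph(g, new AllowedNodes("dir,cnt"));
        long node = 42; // placeholder internal node id
        if (fsView.nodeExists(node)) {
            LazyLongIterator it = fsView.successors(node);
            for (long succ; (succ = it.nextLong()) != -1;) {
                System.out.println(fsView.getSWHID(succ));
            }
        }
    }
}
```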
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java
new file mode 100644
index 0000000..04b2a8c
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/SwhBidirectionalGraph.java
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2021-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph;
+
+import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
+import it.unimi.dsi.big.webgraph.BidirectionalImmutableGraph;
+import it.unimi.dsi.logging.ProgressLogger;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Class representing the compressed Software Heritage graph in both directions (forward and
+ * backward).
+ *
+ * This class uses the {@link BidirectionalImmutableGraph} class internally to implement the
+ * backward equivalent of graph operations ({@link SwhBidirectionalGraph#indegree(long)},
+ * {@link SwhBidirectionalGraph#predecessors(long)}, etc.) by holding a reference to two
+ * {@link SwhUnidirectionalGraph} (a forward graph and a backward graph).
+ *
+ * Both graphs share their graph properties in memory by storing references to the same
+ * {@link SwhGraphProperties} object.
+ *
+ *
+ *
+ * @author The Software Heritage developers
+ * @see SwhUnidirectionalGraph
+ */
+
+public class SwhBidirectionalGraph extends BidirectionalImmutableGraph implements SwhGraph {
+ /** Property data of the graph (id/type mappings etc.) */
+ public final SwhGraphProperties properties;
+
+ private final SwhUnidirectionalGraph forwardGraph;
+ private final SwhUnidirectionalGraph backwardGraph;
+
+ public SwhBidirectionalGraph(SwhUnidirectionalGraph forwardGraph, SwhUnidirectionalGraph backwardGraph,
+ SwhGraphProperties properties) {
+ super(forwardGraph, backwardGraph);
+ this.forwardGraph = forwardGraph;
+ this.backwardGraph = backwardGraph;
+ this.properties = properties;
+ }
+
+ private SwhBidirectionalGraph(BidirectionalImmutableGraph graph, SwhGraphProperties properties) {
+ super(graph.forward, graph.backward);
+ this.forwardGraph = new SwhUnidirectionalGraph(graph.forward, properties);
+ this.backwardGraph = new SwhUnidirectionalGraph(graph.backward, properties);
+ this.properties = properties;
+ }
+
+ public static SwhBidirectionalGraph load(LoadMethod method, String path, InputStream is, ProgressLogger pl)
+ throws IOException {
+ SwhUnidirectionalGraph forward = SwhUnidirectionalGraph.loadGraphOnly(method, path, is, pl);
+ SwhUnidirectionalGraph backward = SwhUnidirectionalGraph.loadGraphOnly(method, path + "-transposed", is, pl);
+ SwhGraphProperties properties = SwhGraphProperties.load(path);
+ forward.setProperties(properties);
+ backward.setProperties(properties);
+ return new SwhBidirectionalGraph(forward, backward, properties);
+ }
+
+ public static SwhBidirectionalGraph loadLabelled(LoadMethod method, String path, InputStream is, ProgressLogger pl)
+ throws IOException {
+ SwhUnidirectionalGraph forward = SwhUnidirectionalGraph.loadLabelledGraphOnly(method, path, is, pl);
+ SwhUnidirectionalGraph backward = SwhUnidirectionalGraph.loadLabelledGraphOnly(method, path + "-transposed", is,
+ pl);
+ SwhGraphProperties properties = SwhGraphProperties.load(path);
+ forward.setProperties(properties);
+ backward.setProperties(properties);
+ return new SwhBidirectionalGraph(forward, backward, properties);
+ }
+
+ // loadXXX methods from ImmutableGraph
+ public static SwhBidirectionalGraph load(String path, ProgressLogger pl) throws IOException {
+ return load(LoadMethod.STANDARD, path, null, pl);
+ }
+ public static SwhBidirectionalGraph load(String path) throws IOException {
+ return load(LoadMethod.STANDARD, path, null, null);
+ }
+ public static SwhBidirectionalGraph loadMapped(String path, ProgressLogger pl) throws IOException {
+ return load(LoadMethod.MAPPED, path, null, pl);
+ }
+ public static SwhBidirectionalGraph loadMapped(String path) throws IOException {
+ return load(LoadMethod.MAPPED, path, null, null);
+ }
+ public static SwhBidirectionalGraph loadOffline(String path, ProgressLogger pl) throws IOException {
+ return load(LoadMethod.OFFLINE, path, null, pl);
+ }
+ public static SwhBidirectionalGraph loadOffline(String path) throws IOException {
+ return load(LoadMethod.OFFLINE, path, null, null);
+ }
+
+ // Labelled versions of the loadXXX methods from ImmutableGraph
+ public static SwhBidirectionalGraph loadLabelled(String path, ProgressLogger pl) throws IOException {
+ return loadLabelled(LoadMethod.STANDARD, path, null, pl);
+ }
+ public static SwhBidirectionalGraph loadLabelled(String path) throws IOException {
+ return loadLabelled(LoadMethod.STANDARD, path, null, null);
+ }
+ public static SwhBidirectionalGraph loadLabelledMapped(String path, ProgressLogger pl) throws IOException {
+ return loadLabelled(LoadMethod.MAPPED, path, null, pl);
+ }
+ public static SwhBidirectionalGraph loadLabelledMapped(String path) throws IOException {
+ return loadLabelled(LoadMethod.MAPPED, path, null, null);
+ }
+ public static SwhBidirectionalGraph loadLabelledOffline(String path, ProgressLogger pl) throws IOException {
+ return loadLabelled(LoadMethod.OFFLINE, path, null, pl);
+ }
+ public static SwhBidirectionalGraph loadLabelledOffline(String path) throws IOException {
+ return loadLabelled(LoadMethod.OFFLINE, path, null, null);
+ }
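+
+ // Minimal usage sketch, assuming a compressed graph with basename "graph" is available on
+ // disk (the SWHID below is a made-up placeholder):
+ //
+ // SwhBidirectionalGraph g = SwhBidirectionalGraph.loadMapped("graph");
+ // long node = g.getNodeId(new SWHID("swh:1:rev:0000000000000000000000000000000000000000"));
+ // long out = g.outdegree(node); // number of successors (forward direction)
+ // long in = g.indegree(node);   // number of predecessors (backward direction)
+ // g.close();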
+
+ @Override
+ public SwhBidirectionalGraph copy() {
+ return new SwhBidirectionalGraph(forwardGraph.copy(), backwardGraph.copy(), this.properties);
+ }
+
+ @Override
+ public SwhBidirectionalGraph transpose() {
+ return new SwhBidirectionalGraph(super.transpose(), this.properties);
+ }
+
+ @Override
+ public SwhBidirectionalGraph symmetrize() {
+ return new SwhBidirectionalGraph(super.symmetrize(), this.properties);
+ }
+
+ public SwhUnidirectionalGraph getForwardGraph() {
+ return this.forwardGraph;
+ }
+
+ public SwhUnidirectionalGraph getBackwardGraph() {
+ return this.backwardGraph;
+ }
+
+ /**
+ * Returns a *labelled* lazy iterator over the successors of a given node. The iteration terminates
+ * when -1 is returned.
+ */
+ public ArcLabelledNodeIterator.LabelledArcIterator labelledSuccessors(long x) {
+ return forwardGraph.labelledSuccessors(x);
+ }
+
+ /**
+ * Returns a *labelled* lazy iterator over the predecessors of a given node. The iteration
+ * terminates when -1 is returned.
+ */
+ public ArcLabelledNodeIterator.LabelledArcIterator labelledPredecessors(long x) {
+ return backwardGraph.labelledSuccessors(x);
+ }
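+
+ // Minimal sketch of a labelled traversal (g and nodeId are placeholders; labels must have
+ // been loaded with one of the loadLabelled*() methods):
+ //
+ // ArcLabelledNodeIterator.LabelledArcIterator it = g.labelledSuccessors(nodeId);
+ // for (long succ; (succ = it.nextLong()) != -1;) {
+ //     // it.label() carries the arc label, e.g. directory entry names on dir -> cnt arcs
+ // }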
+
+ public void close() throws IOException {
+ this.properties.close();
+ }
+
+ @Override
+ public SwhGraphProperties getProperties() {
+ return properties;
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
new file mode 100644
index 0000000..aee50cd
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraph.java
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2021-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph;
+
+import java.io.IOException;
+
+/**
+ * Common interface for SWH graph classes.
+ *
+ * This interface forwards all property loading/access methods to the SwhGraphProperties object
+ * returned by the getProperties() method of the implementing class. This allows API users to write
+ * graph.getNodeType() instead of graph.getProperties().getNodeType().
+ */
+public interface SwhGraph {
+ /**
+ * Cleans up graph resources after use.
+ */
+ void close() throws IOException;
+
+ /**
+ * Returns the SWH graph properties object of this graph.
+ *
+ * @return graph properties
+ */
+ SwhGraphProperties getProperties();
+
+ /** @see SwhGraphProperties#getPath() */
+ default String getPath() {
+ return getProperties().getPath();
+ }
+
+ /** @see SwhGraphProperties#getNodeId(SWHID) */
+ default long getNodeId(SWHID swhid) {
+ return getProperties().getNodeId(swhid);
+ }
+
+ /** @see SwhGraphProperties#getSWHID(long) */
+ default SWHID getSWHID(long nodeId) {
+ return getProperties().getSWHID(nodeId);
+ }
+
+ /** @see SwhGraphProperties#getNodeType(long) */
+ default SwhType getNodeType(long nodeId) {
+ return getProperties().getNodeType(nodeId);
+ }
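+
+ // Convenience sketch: thanks to these default methods, callers can query properties directly
+ // on the graph object (nodeId is a placeholder):
+ //
+ // SwhType type = graph.getNodeType(nodeId); // same as graph.getProperties().getNodeType(nodeId)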
+
+ /** @see SwhGraphProperties#loadContentLength() */
+ default void loadContentLength() throws IOException {
+ getProperties().loadContentLength();
+ }
+
+ /** @see SwhGraphProperties#getContentLength(long) */
+ default Long getContentLength(long nodeId) {
+ return getProperties().getContentLength(nodeId);
+ }
+
+ /** @see SwhGraphProperties#loadPersonIds() */
+ default void loadPersonIds() throws IOException {
+ getProperties().loadPersonIds();
+ }
+
+ /** @see SwhGraphProperties#getAuthorId(long) */
+ default Long getAuthorId(long nodeId) {
+ return getProperties().getAuthorId(nodeId);
+ }
+
+ /** @see SwhGraphProperties#getCommitterId(long) */
+ default Long getCommitterId(long nodeId) {
+ return getProperties().getCommitterId(nodeId);
+ }
+
+ /** @see SwhGraphProperties#loadContentIsSkipped() */
+ default void loadContentIsSkipped() throws IOException {
+ getProperties().loadContentIsSkipped();
+ }
+
+ /** @see SwhGraphProperties#isContentSkipped(long) */
+ default boolean isContentSkipped(long nodeId) {
+ return getProperties().isContentSkipped(nodeId);
+ }
+
+ /** @see SwhGraphProperties#loadAuthorTimestamps() */
+ default void loadAuthorTimestamps() throws IOException {
+ getProperties().loadAuthorTimestamps();
+ }
+
+ /** @see SwhGraphProperties#getAuthorTimestamp(long) */
+ default Long getAuthorTimestamp(long nodeId) {
+ return getProperties().getAuthorTimestamp(nodeId);
+ }
+
+ /** @see SwhGraphProperties#getAuthorTimestampOffset(long) */
+ default Short getAuthorTimestampOffset(long nodeId) {
+ return getProperties().getAuthorTimestampOffset(nodeId);
+ }
+
+ /** @see SwhGraphProperties#loadCommitterTimestamps() */
+ default void loadCommitterTimestamps() throws IOException {
+ getProperties().loadCommitterTimestamps();
+ }
+
+ /** @see SwhGraphProperties#getCommitterTimestamp(long) */
+ default Long getCommitterTimestamp(long nodeId) {
+ return getProperties().getCommitterTimestamp(nodeId);
+ }
+
+ /** @see SwhGraphProperties#getCommitterTimestampOffset(long) */
+ default Short getCommitterTimestampOffset(long nodeId) {
+ return getProperties().getCommitterTimestampOffset(nodeId);
+ }
+
+ /** @see SwhGraphProperties#loadMessages() */
+ default void loadMessages() throws IOException {
+ getProperties().loadMessages();
+ }
+
+ /** @see SwhGraphProperties#getMessage(long) */
+ default byte[] getMessage(long nodeId) {
+ return getProperties().getMessage(nodeId);
+ }
+
+ /** @see SwhGraphProperties#getUrl(long) */
+ default String getUrl(long nodeId) {
+ return getProperties().getUrl(nodeId);
+ }
+
+ /** @see SwhGraphProperties#loadTagNames() */
+ default void loadTagNames() throws IOException {
+ getProperties().loadTagNames();
+ }
+
+ /** @see SwhGraphProperties#getTagName(long) */
+ default byte[] getTagName(long nodeId) {
+ return getProperties().getTagName(nodeId);
+ }
+
+ /** @see SwhGraphProperties#loadLabelNames() */
+ default void loadLabelNames() throws IOException {
+ getProperties().loadLabelNames();
+ }
+
+ /** @see SwhGraphProperties#getLabelName(long) */
+ default byte[] getLabelName(long labelId) {
+ return getProperties().getLabelName(labelId);
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
new file mode 100644
index 0000000..3372947
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/SwhGraphProperties.java
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2021-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph;
+
+import it.unimi.dsi.big.util.MappedFrontCodedStringBigList;
+import it.unimi.dsi.bits.LongArrayBitVector;
+import it.unimi.dsi.fastutil.bytes.ByteBigList;
+import it.unimi.dsi.fastutil.bytes.ByteMappedBigList;
+import it.unimi.dsi.fastutil.ints.IntBigList;
+import it.unimi.dsi.fastutil.ints.IntMappedBigList;
+import it.unimi.dsi.fastutil.io.BinIO;
+import it.unimi.dsi.fastutil.longs.LongBigList;
+import it.unimi.dsi.fastutil.longs.LongMappedBigList;
+import it.unimi.dsi.fastutil.shorts.ShortBigList;
+import it.unimi.dsi.fastutil.shorts.ShortMappedBigList;
+import it.unimi.dsi.sux4j.util.EliasFanoLongBigList;
+import org.apache.commons.configuration2.ex.ConfigurationException;
+import org.softwareheritage.graph.maps.NodeIdMap;
+import org.softwareheritage.graph.maps.NodeTypesMap;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.Base64;
+
+/**
+ * This object contains SWH graph properties such as node labels.
+ *
+ * Some property mappings are necessary because Software Heritage uses string based persistent
+ * identifiers (SWHID) while WebGraph uses integers internally.
+ *
+ * The two node ID mappings (long id ↔ SWHID) are used for the input (users refer to the graph
+ * using SWHID) and the output (convert back to SWHID for the user's results).
+ *
+ * Since graph traversal can be restricted depending on the node type (see {@link AllowedEdges}), a
+ * long id → node type map is stored as well to avoid a full SWHID lookup.
+ *
+ * @see NodeIdMap
+ * @see NodeTypesMap
+ */
+public class SwhGraphProperties {
+ private final String path;
+
+ private final NodeIdMap nodeIdMap;
+ private final NodeTypesMap nodeTypesMap;
+ private LongBigList authorTimestamp;
+ private ShortBigList authorTimestampOffset;
+ private LongBigList committerTimestamp;
+ private ShortBigList committerTimestampOffset;
+ private LongBigList contentLength;
+ private LongArrayBitVector contentIsSkipped;
+ private IntBigList authorId;
+ private IntBigList committerId;
+ private ByteBigList messageBuffer;
+ private LongBigList messageOffsets;
+ private ByteBigList tagNameBuffer;
+ private LongBigList tagNameOffsets;
+ private MappedFrontCodedStringBigList edgeLabelNames;
+
+ protected SwhGraphProperties(String path, NodeIdMap nodeIdMap, NodeTypesMap nodeTypesMap) {
+ this.path = path;
+ this.nodeIdMap = nodeIdMap;
+ this.nodeTypesMap = nodeTypesMap;
+ }
+
+ public static SwhGraphProperties load(String path) throws IOException {
+ return new SwhGraphProperties(path, new NodeIdMap(path), new NodeTypesMap(path));
+ }
+
+ /**
+ * Cleans up resources after use.
+ */
+ public void close() throws IOException {
+ edgeLabelNames.close();
+ }
+
+ /** Return the basename of the compressed graph */
+ public String getPath() {
+ return path;
+ }
+
+ /**
+ * Converts {@link SWHID} node to long.
+ *
+ * @param swhid node specified as a {@link SWHID}
+ * @return internal long node id
+ * @see SWHID
+ */
+ public long getNodeId(SWHID swhid) {
+ return nodeIdMap.getNodeId(swhid);
+ }
+
+ /**
+ * Converts long id node to {@link SWHID}.
+ *
+ * @param nodeId node specified as a long id
+ * @return external SWHID
+ * @see SWHID
+ */
+ public SWHID getSWHID(long nodeId) {
+ return nodeIdMap.getSWHID(nodeId);
+ }
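+
+ // Round-trip sketch (the SWHID below is a made-up placeholder):
+ //
+ // long nodeId = properties.getNodeId(new SWHID("swh:1:cnt:0000000000000000000000000000000000000000"));
+ // SWHID swhid = properties.getSWHID(nodeId); // back to the textual identifier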
+
+ /**
+ * Returns node type.
+ *
+ * @param nodeId node specified as a long id
+ * @return corresponding node type
+ * @see SwhType
+ */
+ public SwhType getNodeType(long nodeId) {
+ return nodeTypesMap.getType(nodeId);
+ }
+
+ private static LongBigList loadMappedLongs(String path) throws IOException {
+ try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
+ return LongMappedBigList.map(raf.getChannel());
+ }
+ }
+
+ private static IntBigList loadMappedInts(String path) throws IOException {
+ try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
+ return IntMappedBigList.map(raf.getChannel());
+ }
+ }
+
+ private static ShortBigList loadMappedShorts(String path) throws IOException {
+ try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
+ return ShortMappedBigList.map(raf.getChannel());
+ }
+ }
+
+ private static ByteBigList loadMappedBytes(String path) throws IOException {
+ try (RandomAccessFile raf = new RandomAccessFile(path, "r")) {
+ return ByteMappedBigList.map(raf.getChannel());
+ }
+ }
+
+ private static LongBigList loadEFLongs(String path) throws IOException {
+ try {
+ return (EliasFanoLongBigList) BinIO.loadObject(path);
+ } catch (ClassNotFoundException e) {
+ throw new IOException(e);
+ }
+ }
+
+ private static byte[] getLine(ByteBigList byteArray, long start) {
+ long end = start;
+ while (end < byteArray.size64() && byteArray.getByte(end) != '\n') {
+ end++;
+ }
+ int length = (int) (end - start);
+ byte[] buffer = new byte[length];
+ byteArray.getElements(start, buffer, 0, length);
+ return buffer;
+ }
+
+ /** Load the sizes of the content nodes */
+ public void loadContentLength() throws IOException {
+ contentLength = loadMappedLongs(path + ".property.content.length.bin");
+ }
+
+ /** Get the size (in bytes) of the given content node */
+ public Long getContentLength(long nodeId) {
+ if (contentLength == null) {
+ throw new IllegalStateException("Content lengths not loaded");
+ }
+ long res = contentLength.getLong(nodeId);
+ return (res >= 0) ? res : null;
+ }
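+
+ // Usage sketch: property arrays are loaded lazily, so call the matching load*() method first
+ // (nodeId is a placeholder for a content node id):
+ //
+ // properties.loadContentLength();
+ // Long length = properties.getContentLength(nodeId); // null when the length is unknown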
+
+ /** Load the IDs of the persons (authors and committers) */
+ public void loadPersonIds() throws IOException {
+ authorId = loadMappedInts(path + ".property.author_id.bin");
+ committerId = loadMappedInts(path + ".property.committer_id.bin");
+ }
+
+ /** Get a unique integer ID representing the author of the given revision or release node */
+ public Long getAuthorId(long nodeId) {
+ if (authorId == null) {
+ throw new IllegalStateException("Author IDs not loaded");
+ }
+ long res = authorId.getInt(nodeId);
+ return (res >= 0) ? res : null;
+ }
+
+ /** Get a unique integer ID representing the committer of the given revision node */
+ public Long getCommitterId(long nodeId) {
+ if (committerId == null) {
+ throw new IllegalStateException("Committer IDs not loaded");
+ }
+ long res = committerId.getInt(nodeId);
+ return (res >= 0) ? res : null;
+ }
+
+ /**
+ * Loads a boolean array indicating whether the given content node was skipped during archive
+ * ingestion
+ */
+ public void loadContentIsSkipped() throws IOException {
+ try {
+ contentIsSkipped = (LongArrayBitVector) BinIO.loadObject(path + ".property.content.is_skipped.bin");
+ } catch (ClassNotFoundException e) {
+ throw new IOException(e);
+ }
+ }
+
+ /** Returns whether the given content node was skipped during archive ingestion */
+ public boolean isContentSkipped(long nodeId) {
+ if (contentIsSkipped == null) {
+ throw new IllegalStateException("Skipped content array not loaded");
+ }
+ return contentIsSkipped.getBoolean(nodeId);
+ }
+
+ /** Load the timestamps at which the releases and revisions were authored */
+ public void loadAuthorTimestamps() throws IOException {
+ authorTimestamp = loadMappedLongs(path + ".property.author_timestamp.bin");
+ authorTimestampOffset = loadMappedShorts(path + ".property.author_timestamp_offset.bin");
+ }
+
+ /** Return the timestamp at which the given revision or release was authored */
+ public Long getAuthorTimestamp(long nodeId) {
+ if (authorTimestamp == null) {
+ throw new IllegalStateException("Author timestamps not loaded");
+ }
+ long res = authorTimestamp.getLong(nodeId);
+ return (res > Long.MIN_VALUE) ? res : null;
+ }
+
+ /** Return the timestamp offset at which the given revision or release was authored */
+ public Short getAuthorTimestampOffset(long nodeId) {
+ if (authorTimestampOffset == null) {
+ throw new IllegalStateException("Author timestamp offsets not loaded");
+ }
+ short res = authorTimestampOffset.getShort(nodeId);
+ return (res > Short.MIN_VALUE) ? res : null;
+ }
+
+ /** Load the timestamps at which the releases and revisions were committed */
+ public void loadCommitterTimestamps() throws IOException {
+ committerTimestamp = loadMappedLongs(path + ".property.committer_timestamp.bin");
+ committerTimestampOffset = loadMappedShorts(path + ".property.committer_timestamp_offset.bin");
+ }
+
+ /** Return the timestamp at which the given revision was committed */
+ public Long getCommitterTimestamp(long nodeId) {
+ if (committerTimestamp == null) {
+ throw new IllegalStateException("Committer timestamps not loaded");
+ }
+ long res = committerTimestamp.getLong(nodeId);
+ return (res > Long.MIN_VALUE) ? res : null;
+ }
+
+ /** Return the timestamp offset at which the given revision was committed */
+ public Short getCommitterTimestampOffset(long nodeId) {
+ if (committerTimestampOffset == null) {
+ throw new IllegalStateException("Committer timestamp offsets not loaded");
+ }
+ short res = committerTimestampOffset.getShort(nodeId);
+ return (res > Short.MIN_VALUE) ? res : null;
+ }
+
+ /** Load the revision messages, the release messages and the origin URLs */
+ public void loadMessages() throws IOException {
+ messageBuffer = loadMappedBytes(path + ".property.message.bin");
+ messageOffsets = loadMappedLongs(path + ".property.message.offset.bin");
+ }
+
+ /** Get the message of the given revision or release node */
+ public byte[] getMessage(long nodeId) {
+ if (messageBuffer == null || messageOffsets == null) {
+ throw new IllegalStateException("Messages not loaded");
+ }
+ long startOffset = messageOffsets.getLong(nodeId);
+ if (startOffset == -1) {
+ return null;
+ }
+ return Base64.getDecoder().decode(getLine(messageBuffer, startOffset));
+ }
+
+ /** Get the URL of the given origin node */
+ public String getUrl(long nodeId) {
+ byte[] url = getMessage(nodeId);
+ return (url != null) ? new String(url) : null;
+ }
+
+ /** Load the release names */
+ public void loadTagNames() throws IOException {
+ tagNameBuffer = loadMappedBytes(path + ".property.tag_name.bin");
+ tagNameOffsets = loadMappedLongs(path + ".property.tag_name.offset.bin");
+ }
+
+ /** Get the name of the given release node */
+ public byte[] getTagName(long nodeId) {
+ if (tagNameBuffer == null || tagNameOffsets == null) {
+ throw new IllegalStateException("Tag names not loaded");
+ }
+ long startOffset = tagNameOffsets.getLong(nodeId);
+ if (startOffset == -1) {
+ return null;
+ }
+ return Base64.getDecoder().decode(getLine(tagNameBuffer, startOffset));
+ }
+
+ /** Load the arc label names (directory entry names and snapshot branch names) */
+ public void loadLabelNames() throws IOException {
+ try {
+ edgeLabelNames = MappedFrontCodedStringBigList.load(path + ".labels.fcl");
+ } catch (ConfigurationException e) {
+ throw new IOException(e);
+ }
+ }
+
+ /**
+ * Get the arc label name (either a directory entry name or snapshot branch name) associated with
+ * the given label ID
+ */
+ public byte[] getLabelName(long labelId) {
+ if (edgeLabelNames == null) {
+ throw new IllegalStateException("Label names not loaded");
+ }
+ return Base64.getDecoder().decode(edgeLabelNames.getArray(labelId));
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhPath.java b/java/src/main/java/org/softwareheritage/graph/SwhPath.java
deleted file mode 100644
index 8693e02..0000000
--- a/java/src/main/java/org/softwareheritage/graph/SwhPath.java
+++ /dev/null
@@ -1,122 +0,0 @@
-package org.softwareheritage.graph;
-
-import com.fasterxml.jackson.annotation.JsonValue;
-
-import java.util.ArrayList;
-
-/**
- * Wrapper class to store a list of {@link SWHID}.
- *
- * @author The Software Heritage developers
- * @see SWHID
- */
-
-public class SwhPath {
- /** Internal list of {@link SWHID} */
- ArrayList<SWHID> path;
-
- /**
- * Constructor.
- */
- public SwhPath() {
- this.path = new ArrayList<>();
- }
-
- /**
- * Constructor.
- *
- * @param swhids variable number of string SWHIDs to initialize this path with
- */
- public SwhPath(String... swhids) {
- this();
- for (String swhid : swhids) {
- add(new SWHID(swhid));
- }
- }
-
- /**
- * Constructor.
- *
- * @param swhids variable number of {@link SWHID} to initialize this path with
- * @see SWHID
- */
- public SwhPath(SWHID... swhids) {
- this();
- for (SWHID swhid : swhids) {
- add(swhid);
- }
- }
-
- /**
- * Returns this path as a list of {@link SWHID}.
- *
- * @return list of {@link SWHID} constituting the path
- * @see SWHID
- */
- @JsonValue
- public ArrayList<SWHID> getPath() {
- return path;
- }
-
- /**
- * Adds a {@link SWHID} to this path.
- *
- * @param swhid {@link SWHID} to add to this path
- * @see SWHID
- */
- public void add(SWHID swhid) {
- path.add(swhid);
- }
-
- /**
- * Returns the {@link SWHID} at the specified position in this path.
- *
- * @param index position of the {@link SWHID} to return
- * @return {@link SWHID} at the specified position
- * @see SWHID
- */
- public SWHID get(int index) {
- return path.get(index);
- }
-
- /**
- * Returns the number of elements in this path.
- *
- * @return number of elements in this path
- */
- public int size() {
- return path.size();
- }
-
- @Override
- public boolean equals(Object otherObj) {
- if (otherObj == this)
- return true;
- if (!(otherObj instanceof SwhPath))
- return false;
-
- SwhPath other = (SwhPath) otherObj;
- if (size() != other.size()) {
- return false;
- }
-
- for (int i = 0; i < size(); i++) {
- SWHID thisSWHID = get(i);
- SWHID otherSWHID = other.get(i);
- if (!thisSWHID.equals(otherSWHID)) {
- return false;
- }
- }
-
- return true;
- }
-
- @Override
- public String toString() {
- StringBuilder str = new StringBuilder();
- for (SWHID swhid : path) {
- str.append(swhid).append("/");
- }
- return str.toString();
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhType.java b/java/src/main/java/org/softwareheritage/graph/SwhType.java
new file mode 100644
index 0000000..5578837
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/SwhType.java
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Software Heritage graph node types, as described in the
+ * data model.
+ */
+public enum SwhType {
+ /** Content node */
+ CNT,
+ /** Directory node */
+ DIR,
+ /** Origin node */
+ ORI,
+ /** Release node */
+ REL,
+ /** Revision node */
+ REV,
+ /** Snapshot node */
+ SNP;
+
+ /**
+ * Converts integer to corresponding SWH node type.
+ *
+ * @param intType node type represented as an integer
+ * @return the corresponding {@link SwhType} value
+ * @see SwhType
+ */
+ public static SwhType fromInt(int intType) {
+ switch (intType) {
+ case 0:
+ return CNT;
+ case 1:
+ return DIR;
+ case 2:
+ return ORI;
+ case 3:
+ return REL;
+ case 4:
+ return REV;
+ case 5:
+ return SNP;
+ }
+ return null;
+ }
+
+ /**
+ * Converts node types to the corresponding int value
+ *
+ * @param type node type as an enum
+ * @return the corresponding int value
+ */
+ public static int toInt(SwhType type) {
+ switch (type) {
+ case CNT:
+ return 0;
+ case DIR:
+ return 1;
+ case ORI:
+ return 2;
+ case REL:
+ return 3;
+ case REV:
+ return 4;
+ case SNP:
+ return 5;
+ }
+ throw new IllegalArgumentException("Unknown node type: " + type);
+ }
+
+ /**
+ * Converts string to corresponding SWH node type.
+ *
+ * @param strType node type represented as a string
+ * @return the corresponding {@link SwhType} value
+ * @see SwhType
+ */
+ public static SwhType fromStr(String strType) {
+ if (!strType.matches("cnt|dir|ori|rel|rev|snp")) {
+ throw new IllegalArgumentException("Unknown node type: " + strType);
+ }
+ return SwhType.valueOf(strType.toUpperCase());
+ }
+
+ /**
+ * Converts byte array name to the int code of the corresponding SWH node type. Used for
+ * performance-critical deserialization.
+ *
+ * @param name node type represented as a byte array (e.g. b"cnt")
+ * @return the ordinal value of the corresponding {@link SwhType}
+ * @see SwhType
+ */
+ public static int byteNameToInt(byte[] name) {
+ if (Arrays.equals(name, "cnt".getBytes())) {
+ return 0;
+ } else if (Arrays.equals(name, "dir".getBytes())) {
+ return 1;
+ } else if (Arrays.equals(name, "ori".getBytes())) {
+ return 2;
+ } else if (Arrays.equals(name, "rel".getBytes())) {
+ return 3;
+ } else if (Arrays.equals(name, "rev".getBytes())) {
+ return 4;
+ } else if (Arrays.equals(name, "snp".getBytes())) {
+ return 5;
+ } else
+ return -1;
+ }
+
+ /**
+ * Parses SWH node type possible values from formatted string (see the
+ * API syntax).
+ *
+ * @param strFmtType node types represented as a formatted string
+ * @return a list containing the {@link SwhType} values
+ * @see SwhType
+ */
+ public static ArrayList<SwhType> parse(String strFmtType) {
+ ArrayList<SwhType> types = new ArrayList<>();
+
+ if (strFmtType.equals("*")) {
+ List<SwhType> nodeTypes = Arrays.asList(SwhType.values());
+ types.addAll(nodeTypes);
+ } else {
+ types.add(SwhType.fromStr(strFmtType));
+ }
+
+ return types;
+ }
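+
+ // Conversion sketch:
+ //
+ // SwhType t = SwhType.fromStr("rev");          // REV
+ // int code = SwhType.toInt(t);                 // 4
+ // ArrayList<SwhType> all = SwhType.parse("*"); // all six node types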
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java b/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java
new file mode 100644
index 0000000..3f865d0
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/SwhUnidirectionalGraph.java
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2019-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph;
+
+import it.unimi.dsi.big.webgraph.ImmutableGraph;
+import it.unimi.dsi.big.webgraph.LazyLongIterator;
+import it.unimi.dsi.big.webgraph.labelling.ArcLabelledNodeIterator;
+import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph;
+import it.unimi.dsi.logging.ProgressLogger;
+import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Class representing the compressed Software Heritage graph in a single direction.
+ *
+ * The compressed graph is stored using the WebGraph
+ * framework. This class contains an {@link ImmutableGraph} representing the graph itself, as well
+ * as a reference to the object containing the graph properties (e.g. node labels). Optionally,
+ * arc labels (properties stored on the graph edges) can also be loaded with the
+ * loadLabelled...() function family.
+ *
+ * @author The Software Heritage developers
+ * @see SwhGraphProperties
+ * @see SwhBidirectionalGraph
+ */
+
+public class SwhUnidirectionalGraph extends ImmutableGraph implements SwhGraph {
+ /** Underlying ImmutableGraph */
+ private final ImmutableGraph graph;
+
+ /** Labelled ImmutableGraph, null if labels are not loaded */
+ private ArcLabelledImmutableGraph labelledGraph;
+
+ /** Property data of the graph (id/type mappings etc.) */
+ public SwhGraphProperties properties;
+
+ public SwhUnidirectionalGraph(ImmutableGraph graph, SwhGraphProperties properties) {
+ this.graph = graph;
+ this.properties = properties;
+ }
+
+ protected SwhUnidirectionalGraph(ImmutableGraph graph, ArcLabelledImmutableGraph labelledGraph,
+ SwhGraphProperties properties) {
+ this.graph = graph;
+ this.labelledGraph = labelledGraph;
+ this.properties = properties;
+ }
+
+ /**
+ * Load the (unlabelled) graph only, without the SWH properties.
+ */
+ public static SwhUnidirectionalGraph loadGraphOnly(LoadMethod method, String path, InputStream is,
+ ProgressLogger pl) throws IOException {
+ return new SwhUnidirectionalGraph(ImmutableGraph.load(method, path, is, pl), null);
+ }
+
+ /**
+ * Load the labelled graph only, without the SWH properties.
+ */
+ public static SwhUnidirectionalGraph loadLabelledGraphOnly(LoadMethod method, String path, InputStream is,
+ ProgressLogger pl) throws IOException {
+ BitStreamArcLabelledImmutableGraph g = (BitStreamArcLabelledImmutableGraph) BitStreamArcLabelledImmutableGraph
+ .load(method, path + "-labelled", is, pl);
+ return new SwhUnidirectionalGraph(g.g, g, null);
+ }
+
+ /**
+ * Load the SWH properties of the graph from a given path.
+ */
+ public void loadProperties(String path) throws IOException {
+ properties = SwhGraphProperties.load(path);
+ }
+
+ /**
+ * Setter for the SWH graph properties.
+ *
+ * @param properties The {@link SwhGraphProperties} object containing the graph properties
+ */
+ public void setProperties(SwhGraphProperties properties) {
+ this.properties = properties;
+ }
+
+ /**
+ * Load the unlabelled graph and its SWH properties.
+ */
+ public static SwhUnidirectionalGraph load(LoadMethod method, String path, InputStream is, ProgressLogger pl)
+ throws IOException {
+ SwhUnidirectionalGraph g = loadGraphOnly(method, path, is, pl);
+ g.loadProperties(path);
+ return g;
+ }
+
+ /**
+ * Load the labelled graph and its SWH properties.
+ */
+ public static SwhUnidirectionalGraph loadLabelled(LoadMethod method, String path, InputStream is, ProgressLogger pl)
+ throws IOException {
+ SwhUnidirectionalGraph g = loadLabelledGraphOnly(method, path, is, pl);
+ g.loadProperties(path);
+ return g;
+ }
+
+ // loadXXX methods of ImmutableGraph
+ public static SwhUnidirectionalGraph load(String path, ProgressLogger pl) throws IOException {
+ return load(LoadMethod.STANDARD, path, null, pl);
+ }
+ public static SwhUnidirectionalGraph load(String path) throws IOException {
+ return load(LoadMethod.STANDARD, path, null, null);
+ }
+ public static SwhUnidirectionalGraph loadMapped(String path, ProgressLogger pl) throws IOException {
+ return load(LoadMethod.MAPPED, path, null, pl);
+ }
+ public static SwhUnidirectionalGraph loadMapped(String path) throws IOException {
+ return load(LoadMethod.MAPPED, path, null, null);
+ }
+ public static SwhUnidirectionalGraph loadOffline(String path, ProgressLogger pl) throws IOException {
+ return load(LoadMethod.OFFLINE, path, null, pl);
+ }
+ public static SwhUnidirectionalGraph loadOffline(String path) throws IOException {
+ return load(LoadMethod.OFFLINE, path, null, null);
+ }
+
+ // Labelled versions of the loadXXX methods from ImmutableGraph
+ public static SwhUnidirectionalGraph loadLabelled(String path, ProgressLogger pl) throws IOException {
+ return loadLabelled(LoadMethod.STANDARD, path, null, pl);
+ }
+ public static SwhUnidirectionalGraph loadLabelled(String path) throws IOException {
+ return loadLabelled(LoadMethod.STANDARD, path, null, null);
+ }
+ public static SwhUnidirectionalGraph loadLabelledMapped(String path, ProgressLogger pl) throws IOException {
+ return loadLabelled(LoadMethod.MAPPED, path, null, pl);
+ }
+ public static SwhUnidirectionalGraph loadLabelledMapped(String path) throws IOException {
+ return loadLabelled(LoadMethod.MAPPED, path, null, null);
+ }
+ public static SwhUnidirectionalGraph loadLabelledOffline(String path, ProgressLogger pl) throws IOException {
+ return loadLabelled(LoadMethod.OFFLINE, path, null, pl);
+ }
+ public static SwhUnidirectionalGraph loadLabelledOffline(String path) throws IOException {
+ return loadLabelled(LoadMethod.OFFLINE, path, null, null);
+ }
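+
+ // Minimal usage sketch, assuming a compressed graph with basename "graph" is available on
+ // disk (nodeId is a placeholder):
+ //
+ // SwhUnidirectionalGraph g = SwhUnidirectionalGraph.loadMapped("graph");
+ // LazyLongIterator it = g.successors(nodeId);
+ // for (long succ; (succ = it.nextLong()) != -1;) {
+ //     SwhType type = g.getNodeType(succ);
+ // }
+ // g.close();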
+
+ @Override
+ public SwhUnidirectionalGraph copy() {
+ return new SwhUnidirectionalGraph(this.graph.copy(),
+ this.labelledGraph != null ? this.labelledGraph.copy() : null, this.properties);
+ }
+
+ @Override
+ public boolean randomAccess() {
+ return graph.randomAccess();
+ }
+
+ public void close() throws IOException {
+ this.properties.close();
+ }
+
+ @Override
+ public long numNodes() {
+ return graph.numNodes();
+ }
+
+ @Override
+ public long numArcs() {
+ return graph.numArcs();
+ }
+
+ @Override
+ public LazyLongIterator successors(long nodeId) {
+ return graph.successors(nodeId);
+ }
+
+ /**
+ * Returns a labelled node iterator for scanning the graph sequentially, starting from the
+ * first node.
+ */
+ public ArcLabelledNodeIterator labelledNodeIterator() {
+ if (labelledGraph == null) {
+ throw new RuntimeException("Calling labelledNodeIterator() but labels were not loaded.");
+ }
+ return labelledGraph.nodeIterator();
+ }
+
+ /**
+ * Returns a labelled node iterator for scanning the graph sequentially, starting from a
+ * given node.
+ */
+ public ArcLabelledNodeIterator labelledNodeIterator(long from) {
+ if (labelledGraph == null) {
+ throw new RuntimeException("Calling labelledNodeIterator() but labels were not loaded.");
+ }
+ return labelledGraph.nodeIterator(from);
+ }
+
+ /**
+ * Returns a labelled lazy iterator over the successors of a given node. The iteration
+ * terminates when -1 is returned.
+ */
+ public ArcLabelledNodeIterator.LabelledArcIterator labelledSuccessors(long x) {
+ if (labelledGraph == null) {
+ throw new RuntimeException("Calling labelledNodeIterator() but labels were not loaded.");
+ }
+ return labelledGraph.successors(x);
+ }
+
+ @Override
+ public long outdegree(long nodeId) {
+ return graph.outdegree(nodeId);
+ }
+
+ @Override
+ public SwhGraphProperties getProperties() {
+ return properties;
+ }
+
+ public ImmutableGraph underlyingGraph() {
+ return graph;
+ }
+
+ public ArcLabelledImmutableGraph underlyingLabelledGraph() {
+ return labelledGraph;
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/Traversal.java b/java/src/main/java/org/softwareheritage/graph/Traversal.java
deleted file mode 100644
index 4c8c669..0000000
--- a/java/src/main/java/org/softwareheritage/graph/Traversal.java
+++ /dev/null
@@ -1,580 +0,0 @@
-package org.softwareheritage.graph;
-
-import java.util.ArrayDeque;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.Map;
-import java.util.Queue;
-import java.util.Random;
-import java.util.Stack;
-import java.util.function.Consumer;
-import java.util.function.LongConsumer;
-
-import org.softwareheritage.graph.server.Endpoint;
-
-import it.unimi.dsi.big.webgraph.LazyLongIterator;
-
-/**
- * Traversal algorithms on the compressed graph.
- *
- * Internal implementation of the traversal API endpoints. These methods only input/output internal
- * long ids, which are converted in the {@link Endpoint} higher-level class to {@link SWHID}.
- *
- * @author The Software Heritage developers
- * @see Endpoint
- */
-
-public class Traversal {
- /** Graph used in the traversal */
- Graph graph;
- /** Graph edge restriction */
- AllowedEdges edges;
-
- /** Hash set storing if we have visited a node */
- HashSet<Long> visited;
- /** Hash map storing the parent node id of each node visited during a traversal */
- Map<Long, Long> parentNode;
- /** Number of edges accessed during traversal */
- long nbEdgesAccessed;
-
- /** The anti-DoS limit on the number of edges traversed during a visit */
- long maxEdges;
- /** The set of node type restrictions used to filter the returned nodes */
- NodesFiltering ndsfilter;
-
- /** random number generator, for random walks */
- Random rng;
-
- /**
- * Constructor.
- *
- * @param graph graph used in the traversal
- * @param direction a string (either "forward" or "backward") specifying edge orientation
- * @param edgesFmt a formatted string describing allowed
- * edges
- */
-
- public Traversal(Graph graph, String direction, String edgesFmt) {
- this(graph, direction, edgesFmt, 0);
- }
-
- public Traversal(Graph graph, String direction, String edgesFmt, long maxEdges) {
- this(graph, direction, edgesFmt, maxEdges, "*");
- }
-
- public Traversal(Graph graph, String direction, String edgesFmt, long maxEdges, String returnTypes) {
- if (!direction.matches("forward|backward")) {
- throw new IllegalArgumentException("Unknown traversal direction: " + direction);
- }
-
- if (direction.equals("backward")) {
- this.graph = graph.transpose();
- } else {
- this.graph = graph;
- }
- this.edges = new AllowedEdges(edgesFmt);
-
- this.visited = new HashSet<>();
- this.parentNode = new HashMap<>();
- this.nbEdgesAccessed = 0;
- this.maxEdges = maxEdges;
- this.rng = new Random();
-
- if (returnTypes.equals("*")) {
- this.ndsfilter = new NodesFiltering();
- } else {
- this.ndsfilter = new NodesFiltering(returnTypes);
- }
- }
-
- /**
- * Returns number of accessed edges during traversal.
- *
- * @return number of edges accessed in last traversal
- */
- public long getNbEdgesAccessed() {
- return nbEdgesAccessed;
- }
-
- /**
- * Returns number of accessed nodes during traversal.
- *
- * @return number of nodes accessed in last traversal
- */
- public long getNbNodesAccessed() {
- return this.visited.size();
- }
-
- /**
- * Push version of {@link #leaves}: will fire passed callback for each leaf.
- */
- public void leavesVisitor(long srcNodeId, NodeIdConsumer cb) {
- Stack<Long> stack = new Stack<>();
- this.nbEdgesAccessed = 0;
-
- stack.push(srcNodeId);
- visited.add(srcNodeId);
-
- while (!stack.isEmpty()) {
- long currentNodeId = stack.pop();
-
- long neighborsCnt = 0;
- nbEdgesAccessed += graph.outdegree(currentNodeId);
- if (this.maxEdges > 0) {
- if (nbEdgesAccessed >= this.maxEdges) {
- break;
- }
- }
- LazyLongIterator it = graph.successors(currentNodeId, edges);
- for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
- neighborsCnt++;
- if (!visited.contains(neighborNodeId)) {
- stack.push(neighborNodeId);
- visited.add(neighborNodeId);
- }
- }
-
- if (neighborsCnt == 0) {
- cb.accept(currentNodeId);
- }
- }
- }
-
- /**
- * Returns the leaves of a subgraph rooted at the specified source node.
- *
- * @param srcNodeId source node
- * @return list of node ids corresponding to the leaves
- */
- public ArrayList<Long> leaves(long srcNodeId) {
- ArrayList<Long> nodeIds = new ArrayList<>();
- leavesVisitor(srcNodeId, nodeIds::add);
- if (ndsfilter.restricted) {
- return ndsfilter.filterByNodeTypes(nodeIds, graph);
- }
- return nodeIds;
- }
-
- /**
- * Push version of {@link #neighbors}: will fire passed callback on each neighbor.
- */
- public void neighborsVisitor(long srcNodeId, NodeIdConsumer cb) {
- this.nbEdgesAccessed = graph.outdegree(srcNodeId);
- if (this.maxEdges > 0) {
- if (nbEdgesAccessed >= this.maxEdges) {
- return;
- }
- }
- LazyLongIterator it = graph.successors(srcNodeId, edges);
- for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
- cb.accept(neighborNodeId);
- }
- }
-
- /**
- * Returns node direct neighbors (linked with exactly one edge).
- *
- * @param srcNodeId source node
- * @return list of node ids corresponding to the neighbors
- */
- public ArrayList<Long> neighbors(long srcNodeId) {
- ArrayList<Long> nodeIds = new ArrayList<>();
- neighborsVisitor(srcNodeId, nodeIds::add);
- if (ndsfilter.restricted) {
- return ndsfilter.filterByNodeTypes(nodeIds, graph);
- }
- return nodeIds;
- }
-
- /**
- * Push version of {@link #visitNodes}: will fire passed callback on each visited node.
- */
- public void visitNodesVisitor(long srcNodeId, NodeIdConsumer nodeCb, EdgeIdConsumer edgeCb) {
- Stack<Long> stack = new Stack<>();
- this.nbEdgesAccessed = 0;
-
- stack.push(srcNodeId);
- visited.add(srcNodeId);
-
- while (!stack.isEmpty()) {
- long currentNodeId = stack.pop();
- if (nodeCb != null) {
- nodeCb.accept(currentNodeId);
- }
- nbEdgesAccessed += graph.outdegree(currentNodeId);
- if (this.maxEdges > 0) {
- if (nbEdgesAccessed >= this.maxEdges) {
- break;
- }
- }
- LazyLongIterator it = graph.successors(currentNodeId, edges);
- for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
- if (edgeCb != null) {
- edgeCb.accept(currentNodeId, neighborNodeId);
- }
- if (!visited.contains(neighborNodeId)) {
- stack.push(neighborNodeId);
- visited.add(neighborNodeId);
- }
- }
- }
- }
-
- /** One-argument version to handle callbacks properly */
- public void visitNodesVisitor(long srcNodeId, NodeIdConsumer cb) {
- visitNodesVisitor(srcNodeId, cb, null);
- }
-
- /**
- * Performs a graph traversal and returns explored nodes.
- *
- * @param srcNodeId source node
- * @return list of explored node ids
- */
- public ArrayList<Long> visitNodes(long srcNodeId) {
- ArrayList<Long> nodeIds = new ArrayList<>();
- visitNodesVisitor(srcNodeId, nodeIds::add);
- if (ndsfilter.restricted) {
- return ndsfilter.filterByNodeTypes(nodeIds, graph);
- }
- return nodeIds;
- }
-
- /**
- * Push version of {@link #visitPaths}: will fire passed callback on each discovered (complete)
- * path.
- */
- public void visitPathsVisitor(long srcNodeId, PathConsumer cb) {
- Stack<Long> currentPath = new Stack<>();
- this.nbEdgesAccessed = 0;
- visitPathsInternalVisitor(srcNodeId, currentPath, cb);
- }
-
- /**
- * Performs a graph traversal and returns explored paths.
- *
- * @param srcNodeId source node
- * @return list of explored paths (represented as a list of node ids)
- */
- public ArrayList<ArrayList<Long>> visitPaths(long srcNodeId) {
- ArrayList<ArrayList<Long>> paths = new ArrayList<>();
- visitPathsVisitor(srcNodeId, paths::add);
- return paths;
- }
-
- private void visitPathsInternalVisitor(long currentNodeId, Stack<Long> currentPath, PathConsumer cb) {
- currentPath.push(currentNodeId);
-
- long visitedNeighbors = 0;
-
- nbEdgesAccessed += graph.outdegree(currentNodeId);
- if (this.maxEdges > 0) {
- if (nbEdgesAccessed >= this.maxEdges) {
- currentPath.pop();
- return;
- }
- }
- LazyLongIterator it = graph.successors(currentNodeId, edges);
- for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
- visitPathsInternalVisitor(neighborNodeId, currentPath, cb);
- visitedNeighbors++;
- }
-
- if (visitedNeighbors == 0) {
- ArrayList<Long> path = new ArrayList<>(currentPath);
- cb.accept(path);
- }
-
- currentPath.pop();
- }
-
- /**
- * Performs a graph traversal with backtracking, and returns the first found path from source to
- * destination.
- *
- * @param srcNodeId source node
- * @param dst destination (either a node or a node type)
- * @return found path as a list of node ids
- */
- public <T> ArrayList<Long> walk(long srcNodeId, T dst, String visitOrder) {
- long dstNodeId;
- if (visitOrder.equals("dfs")) {
- dstNodeId = walkInternalDFS(srcNodeId, dst);
- } else if (visitOrder.equals("bfs")) {
- dstNodeId = walkInternalBFS(srcNodeId, dst);
- } else {
- throw new IllegalArgumentException("Unknown visit order: " + visitOrder);
- }
-
- if (dstNodeId == -1) {
- throw new IllegalArgumentException("Cannot find destination: " + dst);
- }
-
- return backtracking(srcNodeId, dstNodeId);
- }
-
- /**
- * Performs a random walk (picking a random successor at each step) from source to destination.
- *
- * @param srcNodeId source node
- * @param dst destination (either a node or a node type)
- * @return found path as a list of node ids, or an empty path to indicate that no suitable path has
- * been found
- */
- public <T> ArrayList<Long> randomWalk(long srcNodeId, T dst) {
- return randomWalk(srcNodeId, dst, 0);
- }
-
- /**
- * Performs a stubborn random walk (picking a random successor at each step) from source to
- * destination. The walk is "stubborn" in the sense that it does not give up the first time it fails
- * to reach a satisfying target node, but retries up to a limited number of times.
- *
- * @param srcNodeId source node
- * @param dst destination (either a node or a node type)
- * @param retries number of times to retry; 0 means no retries (single walk)
- * @return found path as a list of node ids, or an empty path to indicate that no suitable path has
- * been found
- */
- public <T> ArrayList<Long> randomWalk(long srcNodeId, T dst, int retries) {
- long curNodeId = srcNodeId;
- ArrayList<Long> path = new ArrayList<>();
- this.nbEdgesAccessed = 0;
- boolean found;
-
- if (retries < 0) {
- throw new IllegalArgumentException("Negative number of retries given: " + retries);
- }
-
- while (true) {
- path.add(curNodeId);
- LazyLongIterator successors = graph.successors(curNodeId, edges);
- curNodeId = randomPick(successors);
- if (curNodeId < 0) {
- found = false;
- break;
- }
- if (isDstNode(curNodeId, dst)) {
- path.add(curNodeId);
- found = true;
- break;
- }
- }
-
- if (found) {
- if (ndsfilter.restricted) {
- return ndsfilter.filterByNodeTypes(path, graph);
- }
- return path;
- } else if (retries > 0) { // try again
- return randomWalk(srcNodeId, dst, retries - 1);
- } else { // not found and no retries left
- path.clear();
- return path;
- }
- }
-
- /**
- * Randomly choose an element from an iterator over Longs using reservoir sampling
- *
- * @param elements iterator over selection domain
- * @return randomly chosen element or -1 if no suitable element was found
- */
- private long randomPick(LazyLongIterator elements) {
- long curPick = -1;
- long seenCandidates = 0;
-
- for (long element; (element = elements.nextLong()) != -1;) {
- seenCandidates++;
- if (Math.round(rng.nextFloat() * (seenCandidates - 1)) == 0) {
- curPick = element;
- }
- }
-
- return curPick;
- }
-
- /**
- * Internal DFS function of {@link #walk}.
- *
- * @param srcNodeId source node
- * @param dst destination (either a node or a node type)
- * @return final destination node or -1 if no path found
- */
- private <T> long walkInternalDFS(long srcNodeId, T dst) {
- Stack<Long> stack = new Stack<>();
- this.nbEdgesAccessed = 0;
-
- stack.push(srcNodeId);
- visited.add(srcNodeId);
-
- while (!stack.isEmpty()) {
- long currentNodeId = stack.pop();
- if (isDstNode(currentNodeId, dst)) {
- return currentNodeId;
- }
-
- nbEdgesAccessed += graph.outdegree(currentNodeId);
- LazyLongIterator it = graph.successors(currentNodeId, edges);
- for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
- if (!visited.contains(neighborNodeId)) {
- stack.push(neighborNodeId);
- visited.add(neighborNodeId);
- parentNode.put(neighborNodeId, currentNodeId);
- }
- }
- }
-
- return -1;
- }
-
- /**
- * Internal BFS function of {@link #walk}.
- *
- * @param srcNodeId source node
- * @param dst destination (either a node or a node type)
- * @return final destination node or -1 if no path found
- */
- private <T> long walkInternalBFS(long srcNodeId, T dst) {
- Queue<Long> queue = new LinkedList<>();
- this.nbEdgesAccessed = 0;
-
- queue.add(srcNodeId);
- visited.add(srcNodeId);
-
- while (!queue.isEmpty()) {
- long currentNodeId = queue.poll();
- if (isDstNode(currentNodeId, dst)) {
- return currentNodeId;
- }
-
- nbEdgesAccessed += graph.outdegree(currentNodeId);
- LazyLongIterator it = graph.successors(currentNodeId, edges);
- for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
- if (!visited.contains(neighborNodeId)) {
- queue.add(neighborNodeId);
- visited.add(neighborNodeId);
- parentNode.put(neighborNodeId, currentNodeId);
- }
- }
- }
-
- return -1;
- }
-
- /**
- * Internal function of {@link #walk} to check if a node corresponds to the destination.
- *
- * @param nodeId current node
- * @param dst destination (either a node or a node type)
- * @return true if the node is a destination, or false otherwise
- */
- private <T> boolean isDstNode(long nodeId, T dst) {
- if (dst instanceof Long) {
- long dstNodeId = (Long) dst;
- return nodeId == dstNodeId;
- } else if (dst instanceof Node.Type) {
- Node.Type dstType = (Node.Type) dst;
- return graph.getNodeType(nodeId) == dstType;
- } else {
- return false;
- }
- }
-
- /**
- * Internal backtracking function of {@link #walk}.
- *
- * @param srcNodeId source node
- * @param dstNodeId destination node
- * @return the found path, as a list of node ids
- */
- private ArrayList<Long> backtracking(long srcNodeId, long dstNodeId) {
- ArrayList<Long> path = new ArrayList<>();
- long currentNodeId = dstNodeId;
- while (currentNodeId != srcNodeId) {
- path.add(currentNodeId);
- currentNodeId = parentNode.get(currentNodeId);
- }
- path.add(srcNodeId);
- Collections.reverse(path);
- return path;
- }
-
- /**
- * Find a common descendant between two given nodes using two parallel BFS
- *
- * @param lhsNode the first node
- * @param rhsNode the second node
- * @return the found path, as a list of node ids
- */
- public Long findCommonDescendant(long lhsNode, long rhsNode) {
- Queue<Long> lhsStack = new ArrayDeque<>();
- Queue<Long> rhsStack = new ArrayDeque<>();
- HashSet<Long> lhsVisited = new HashSet<>();
- HashSet<Long> rhsVisited = new HashSet<>();
- lhsStack.add(lhsNode);
- rhsStack.add(rhsNode);
- lhsVisited.add(lhsNode);
- rhsVisited.add(rhsNode);
-
- this.nbEdgesAccessed = 0;
- Long curNode;
-
- while (!lhsStack.isEmpty() || !rhsStack.isEmpty()) {
- if (!lhsStack.isEmpty()) {
- curNode = lhsStack.poll();
- nbEdgesAccessed += graph.outdegree(curNode);
- LazyLongIterator it = graph.successors(curNode, edges);
- for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
- if (!lhsVisited.contains(neighborNodeId)) {
- if (rhsVisited.contains(neighborNodeId))
- return neighborNodeId;
- lhsStack.add(neighborNodeId);
- lhsVisited.add(neighborNodeId);
- }
- }
- }
-
- if (!rhsStack.isEmpty()) {
- curNode = rhsStack.poll();
- nbEdgesAccessed += graph.outdegree(curNode);
- LazyLongIterator it = graph.successors(curNode, edges);
- for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
- if (!rhsVisited.contains(neighborNodeId)) {
- if (lhsVisited.contains(neighborNodeId))
- return neighborNodeId;
- rhsStack.add(neighborNodeId);
- rhsVisited.add(neighborNodeId);
- }
- }
- }
- }
-
- return null;
- }
-
- public interface NodeIdConsumer extends LongConsumer {
- /**
- * Callback for incrementally receiving node identifiers during a graph visit.
- */
- void accept(long nodeId);
- }
-
- public interface EdgeIdConsumer {
- /**
- * Callback for incrementally receiving edge identifiers during a graph visit.
- */
- void accept(long srcId, long dstId);
- }
-
- public interface PathConsumer extends Consumer<ArrayList<Long>> {
- /**
- * Callback for incrementally receiving node paths (made of node identifiers) during a graph visit.
- */
- void accept(ArrayList<Long> path);
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/algo/TopologicalTraversal.java b/java/src/main/java/org/softwareheritage/graph/algo/TopologicalTraversal.java
deleted file mode 100644
index bdbb60d..0000000
--- a/java/src/main/java/org/softwareheritage/graph/algo/TopologicalTraversal.java
+++ /dev/null
@@ -1,70 +0,0 @@
-package org.softwareheritage.graph.algo;
-
-import com.google.common.primitives.Longs;
-import it.unimi.dsi.big.webgraph.LazyLongIterator;
-import it.unimi.dsi.bits.LongArrayBitVector;
-import it.unimi.dsi.fastutil.Arrays;
-import it.unimi.dsi.fastutil.BigArrays;
-import it.unimi.dsi.fastutil.longs.LongBigArrays;
-import it.unimi.dsi.io.ByteDiskQueue;
-import it.unimi.dsi.logging.ProgressLogger;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Traversal;
-import org.softwareheritage.graph.experiments.forks.ForkCC;
-
-import java.io.File;
-import java.io.IOException;
-
-public class TopologicalTraversal {
- public static void run(final Graph graph, Traversal.NodeIdConsumer cb) throws IOException {
- final long[][] indegree = LongBigArrays.newBigArray(graph.numNodes());
- final ProgressLogger pl = new ProgressLogger();
-
- pl.itemsName = "nodes";
- pl.expectedUpdates = graph.numNodes();
-
- pl.start("Fetching indegrees...");
- long n = graph.numNodes();
- for (long i = 0; i < graph.numNodes(); ++i) {
- BigArrays.add(indegree, i, graph.indegree(i));
- }
- pl.done();
-
- LongArrayBitVector visited = LongArrayBitVector.ofLength(n);
-
- int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n);
- final File queueFile = File.createTempFile(ForkCC.class.getSimpleName(), "queue");
- final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
- final byte[] byteBuf = new byte[Long.BYTES];
-
- pl.start("Traversal in topological order...");
- for (long i = 0; i < graph.numNodes(); ++i) {
- if (visited.getBoolean(i) || BigArrays.get(indegree, i) != 0L) {
- continue;
- }
-
- queue.enqueue(Longs.toByteArray(i));
- visited.set(i);
-
- while (!queue.isEmpty()) {
- queue.dequeue(byteBuf);
- final long currentNode = Longs.fromByteArray(byteBuf);
-
- cb.accept(currentNode);
-
- final LazyLongIterator iterator = graph.successors(currentNode);
- long succ;
- while ((succ = iterator.nextLong()) != -1) {
- BigArrays.add(indegree, succ, -1L);
- if (visited.getBoolean(succ) || BigArrays.get(indegree, succ) != 0)
- continue;
- visited.set(succ);
- queue.enqueue(Longs.toByteArray(succ));
- }
-
- pl.update();
- }
- }
- pl.done();
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java b/java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java
deleted file mode 100644
index 9397de7..0000000
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/AccessEdge.java
+++ /dev/null
@@ -1,45 +0,0 @@
-package org.softwareheritage.graph.benchmark;
-
-import com.martiansoftware.jsap.JSAPException;
-import it.unimi.dsi.big.webgraph.LazyLongIterator;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.benchmark.utils.Statistics;
-import org.softwareheritage.graph.benchmark.utils.Timing;
-
-import java.io.IOException;
-import java.util.ArrayList;
-
-/**
- * Benchmark to time edge access time.
- *
- * @author The Software Heritage developers
- */
-
-public class AccessEdge {
- /**
- * Main entrypoint.
- *
- * @param args command line arguments
- */
- public static void main(String[] args) throws IOException, JSAPException {
- Benchmark bench = new Benchmark();
- bench.parseCommandLineArgs(args);
-
- Graph graph = Graph.loadMapped(bench.args.graphPath);
-
- long[] nodeIds = bench.args.random.generateNodeIds(graph, bench.args.nbNodes);
-
- ArrayList<Double> timings = new ArrayList<>();
- for (long nodeId : nodeIds) {
- long startTime = Timing.start();
- LazyLongIterator neighbors = graph.successors(nodeId);
- long firstNeighbor = neighbors.nextLong();
- double duration = Timing.stop(startTime);
- timings.add(duration);
- }
-
- System.out.println("Used " + bench.args.nbNodes + " random edges (results are in seconds):");
- Statistics stats = new Statistics(timings);
- stats.printAll();
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java b/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java
deleted file mode 100644
index 43aec2e..0000000
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/BFS.java
+++ /dev/null
@@ -1,107 +0,0 @@
-package org.softwareheritage.graph.benchmark;
-
-import com.google.common.primitives.Longs;
-import com.martiansoftware.jsap.*;
-import it.unimi.dsi.big.webgraph.ImmutableGraph;
-import it.unimi.dsi.big.webgraph.LazyLongIterator;
-import it.unimi.dsi.bits.LongArrayBitVector;
-import it.unimi.dsi.fastutil.Arrays;
-import it.unimi.dsi.io.ByteDiskQueue;
-import it.unimi.dsi.logging.ProgressLogger;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.softwareheritage.graph.Graph;
-
-import java.io.File;
-import java.io.IOException;
-
-public class BFS {
- private final static Logger LOGGER = LoggerFactory.getLogger(BFS.class);
- private final ImmutableGraph graph;
-
- public BFS(ImmutableGraph graph) {
- this.graph = graph;
- }
-
- private static JSAPResult parse_args(String[] args) {
- JSAPResult config = null;
- try {
- SimpleJSAP jsap = new SimpleJSAP(BFS.class.getName(), "",
- new Parameter[]{
- new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
- "graph", "Basename of the compressed graph"),
-
- new FlaggedOption("useTransposed", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, 'T',
- "transposed", "Use transposed graph (default: false)"),});
-
- config = jsap.parse(args);
- if (jsap.messagePrinted()) {
- System.exit(1);
- }
- } catch (JSAPException e) {
- e.printStackTrace();
- }
- return config;
- }
-
- public static void main(String[] args) throws IOException {
- JSAPResult config = parse_args(args);
- String graphPath = config.getString("graphPath");
- boolean useTransposed = config.getBoolean("useTransposed");
-
- System.err.println("Loading graph " + graphPath + " ...");
- Graph graph = Graph.loadMapped(graphPath);
- System.err.println("Graph loaded.");
-
- if (useTransposed)
- graph = graph.transpose();
-
- BFS bfs = new BFS(graph);
- bfs.bfsperm();
- }
-
- // Partly inlined from it.unimi.dsi.law.big.graph.BFS
- private void bfsperm() throws IOException {
- final long n = graph.numNodes();
- // Allow enough memory to behave like in-memory queue
- int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n);
-
- // Use a disk based queue to store BFS frontier
- final File queueFile = File.createTempFile(BFS.class.getSimpleName(), "queue");
- final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
- final byte[] byteBuf = new byte[Long.BYTES];
- // WARNING: no 64-bit version of this data-structure, but it can support
- // indices up to 2^37
- final LongArrayBitVector visited = LongArrayBitVector.ofLength(n);
- final ProgressLogger pl = new ProgressLogger(LOGGER);
- pl.expectedUpdates = n;
- pl.itemsName = "nodes";
- pl.start("Starting breadth-first visit...");
-
- for (long i = 0; i < n; i++) {
- if (visited.getBoolean(i))
- continue;
- queue.enqueue(Longs.toByteArray(i));
- visited.set(i);
-
- while (!queue.isEmpty()) {
- queue.dequeue(byteBuf);
- final long currentNode = Longs.fromByteArray(byteBuf);
-
- final LazyLongIterator iterator = graph.successors(currentNode);
- long succ;
- while ((succ = iterator.nextLong()) != -1) {
- if (!visited.getBoolean(succ)) {
- visited.set(succ);
- queue.enqueue(Longs.toByteArray(succ));
- }
- }
-
- pl.update();
- }
- }
-
- pl.done();
- queue.close();
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java
deleted file mode 100644
index 98dd854..0000000
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/Benchmark.java
+++ /dev/null
@@ -1,154 +0,0 @@
-package org.softwareheritage.graph.benchmark;
-
-import com.martiansoftware.jsap.*;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.SWHID;
-import org.softwareheritage.graph.benchmark.utils.Random;
-import org.softwareheritage.graph.benchmark.utils.Statistics;
-import org.softwareheritage.graph.server.Endpoint;
-
-import java.io.BufferedWriter;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.StringJoiner;
-import java.util.function.Function;
-
-/**
- * Benchmark common utility functions.
- *
- * @author The Software Heritage developers
- */
-
-public class Benchmark {
- /** CSV separator for log file */
- final String CSV_SEPARATOR = ";";
- /** Command line arguments */
- public Args args;
- /**
- * Constructor.
- */
- public Benchmark() {
- this.args = new Args();
- }
-
- /**
- * Parses benchmark command line arguments.
- *
- * @param args command line arguments
- */
- public void parseCommandLineArgs(String[] args) throws JSAPException {
- SimpleJSAP jsap = new SimpleJSAP(Benchmark.class.getName(),
- "Benchmark tool for Software Heritage use-cases scenarios.",
- new Parameter[]{
- new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
- JSAP.NOT_GREEDY, "The basename of the compressed graph."),
- new FlaggedOption("nbNodes", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'n',
- "nb-nodes", "Number of random nodes used to do the benchmark."),
- new FlaggedOption("logFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'l',
- "log-file", "File name to output CSV format benchmark log."),
- new FlaggedOption("seed", JSAP.LONG_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "seed",
- "Random generator seed."),});
-
- JSAPResult config = jsap.parse(args);
- if (jsap.messagePrinted()) {
- System.exit(1);
- }
-
- this.args.graphPath = config.getString("graphPath");
- this.args.nbNodes = config.getInt("nbNodes");
- this.args.logFile = config.getString("logFile");
- this.args.random = config.contains("seed") ? new Random(config.getLong("seed")) : new Random();
- }
-
- /**
- * Creates CSV file for log output.
- */
- public void createCSVLogFile() throws IOException {
- try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile))) {
- StringJoiner csvHeader = new StringJoiner(CSV_SEPARATOR);
- csvHeader.add("use case name").add("SWHID").add("number of edges accessed").add("traversal timing")
- .add("swhid2node timing").add("node2swhid timing");
- csvLog.write(csvHeader.toString() + "\n");
- }
- }
-
- /**
- * Times a specific endpoint and outputs individual datapoints along with aggregated statistics.
- *
- * @param useCaseName benchmark use-case name
- * @param graph compressed graph used in the benchmark
- * @param nodeIds node ids to use as starting point for the endpoint traversal
- * @param operation endpoint function to benchmark
- * @param dstFmt destination formatted string as described in the
- * API
- * @param algorithm traversal algorithm used in endpoint call (either "dfs" or "bfs")
- */
- public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds,
- Function<Endpoint.Input, Endpoint.Output> operation, String dstFmt, String algorithm) throws IOException {
- ArrayList<Double> timings = new ArrayList<>();
- ArrayList<Double> timingsNormalized = new ArrayList<>();
- ArrayList<Double> nbEdgesAccessed = new ArrayList<>();
-
- final boolean append = true;
- try (Writer csvLog = new BufferedWriter(new FileWriter(args.logFile, append))) {
- for (long nodeId : nodeIds) {
- SWHID swhid = graph.getSWHID(nodeId);
-
- Endpoint.Output output = (dstFmt == null)
- ? operation.apply(new Endpoint.Input(swhid))
- : operation.apply(new Endpoint.Input(swhid, dstFmt, algorithm));
-
- StringJoiner csvLine = new StringJoiner(CSV_SEPARATOR);
- csvLine.add(useCaseName).add(swhid.toString()).add(Long.toString(output.meta.nbEdgesAccessed))
- .add(Double.toString(output.meta.timings.traversal))
- .add(Double.toString(output.meta.timings.swhid2node))
- .add(Double.toString(output.meta.timings.node2swhid));
- csvLog.write(csvLine.toString() + "\n");
-
- timings.add(output.meta.timings.traversal);
- nbEdgesAccessed.add((double) output.meta.nbEdgesAccessed);
- if (output.meta.nbEdgesAccessed != 0) {
- timingsNormalized.add(output.meta.timings.traversal / output.meta.nbEdgesAccessed);
- }
- }
- }
-
- System.out.println("\n" + useCaseName + " use-case:");
-
- System.out.println("timings:");
- Statistics stats = new Statistics(timings);
- stats.printAll();
-
- System.out.println("timings normalized:");
- Statistics statsNormalized = new Statistics(timingsNormalized);
- statsNormalized.printAll();
-
- System.out.println("nb edges accessed:");
- Statistics statsNbEdgesAccessed = new Statistics(nbEdgesAccessed);
- statsNbEdgesAccessed.printAll();
- }
-
- /**
- * Same as {@link #timeEndpoint} but without destination or algorithm specified to endpoint call.
- */
- public void timeEndpoint(String useCaseName, Graph graph, long[] nodeIds,
- Function<Endpoint.Input, Endpoint.Output> operation) throws IOException {
- timeEndpoint(useCaseName, graph, nodeIds, operation, null, null);
- }
-
- /**
- * Input arguments.
- */
- public class Args {
- /** Basename of the compressed graph */
- public String graphPath;
- /** Number of random nodes to use for the benchmark */
- public int nbNodes;
- /** File name for CSV format benchmark log */
- public String logFile;
- /** Random generator */
- public Random random;
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java
deleted file mode 100644
index 6a0cf58..0000000
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/Browsing.java
+++ /dev/null
@@ -1,42 +0,0 @@
-package org.softwareheritage.graph.benchmark;
-
-import com.martiansoftware.jsap.JSAPException;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Node;
-import org.softwareheritage.graph.server.Endpoint;
-
-import java.io.IOException;
-
-/**
- * Benchmark Software Heritage
- * browsing
- * use-cases scenarios.
- *
- * @author The Software Heritage developers
- */
-
-public class Browsing {
- /**
- * Main entrypoint.
- *
- * @param args command line arguments
- */
- public static void main(String[] args) throws IOException, JSAPException {
- Benchmark bench = new Benchmark();
- bench.parseCommandLineArgs(args);
-
- Graph graph = Graph.loadMapped(bench.args.graphPath);
-
- long[] dirNodeIds = bench.args.random.generateNodeIdsOfType(graph, bench.args.nbNodes, Node.Type.DIR);
- long[] revNodeIds = bench.args.random.generateNodeIdsOfType(graph, bench.args.nbNodes, Node.Type.REV);
-
- Endpoint dirEndpoint = new Endpoint(graph, "forward", "dir:cnt,dir:dir");
- Endpoint revEndpoint = new Endpoint(graph, "forward", "rev:rev");
-
- System.out.println("Used " + bench.args.nbNodes + " random nodes (results are in seconds):");
- bench.createCSVLogFile();
- bench.timeEndpoint("ls", graph, dirNodeIds, dirEndpoint::neighbors);
- bench.timeEndpoint("ls -R", graph, dirNodeIds, dirEndpoint::visitPaths);
- bench.timeEndpoint("git log", graph, revNodeIds, revEndpoint::visitNodes);
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java
deleted file mode 100644
index 9b3c4c9..0000000
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/Provenance.java
+++ /dev/null
@@ -1,45 +0,0 @@
-package org.softwareheritage.graph.benchmark;
-
-import com.martiansoftware.jsap.JSAPException;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.server.Endpoint;
-
-import java.io.IOException;
-
-/**
- * Benchmark Software Heritage
- * provenance
- * use-cases scenarios.
- *
- * @author The Software Heritage developers
- */
-
-public class Provenance {
- /**
- * Main entrypoint.
- *
- * @param args command line arguments
- */
- public static void main(String[] args) throws IOException, JSAPException {
- Benchmark bench = new Benchmark();
- bench.parseCommandLineArgs(args);
-
- Graph graph = Graph.loadMapped(bench.args.graphPath);
-
- long[] nodeIds = bench.args.random.generateNodeIds(graph, bench.args.nbNodes);
-
- Endpoint commitProvenanceEndpoint = new Endpoint(graph, "backward", "dir:dir,cnt:dir,dir:rev");
- Endpoint originProvenanceEndpoint = new Endpoint(graph, "backward", "*");
-
- System.out.println("Used " + bench.args.nbNodes + " random nodes (results are in seconds):");
- bench.createCSVLogFile();
-
- bench.timeEndpoint("commit provenance (dfs)", graph, nodeIds, commitProvenanceEndpoint::walk, "rev", "dfs");
- bench.timeEndpoint("commit provenance (bfs)", graph, nodeIds, commitProvenanceEndpoint::walk, "rev", "bfs");
- bench.timeEndpoint("complete commit provenance", graph, nodeIds, commitProvenanceEndpoint::leaves);
-
- bench.timeEndpoint("origin provenance (dfs)", graph, nodeIds, originProvenanceEndpoint::walk, "ori", "dfs");
- bench.timeEndpoint("origin provenance (bfs)", graph, nodeIds, originProvenanceEndpoint::walk, "ori", "bfs");
- bench.timeEndpoint("complete origin provenance", graph, nodeIds, originProvenanceEndpoint::leaves);
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java b/java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java
deleted file mode 100644
index c0e19f6..0000000
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/Vault.java
+++ /dev/null
@@ -1,37 +0,0 @@
-package org.softwareheritage.graph.benchmark;
-
-import com.martiansoftware.jsap.JSAPException;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.server.Endpoint;
-
-import java.io.IOException;
-
-/**
- * Benchmark Software Heritage
- * vault use-case
- * scenario.
- *
- * @author The Software Heritage developers
- */
-
-public class Vault {
- /**
- * Main entrypoint.
- *
- * @param args command line arguments
- */
- public static void main(String[] args) throws IOException, JSAPException {
- Benchmark bench = new Benchmark();
- bench.parseCommandLineArgs(args);
-
- Graph graph = Graph.loadMapped(bench.args.graphPath);
-
- long[] nodeIds = bench.args.random.generateNodeIds(graph, bench.args.nbNodes);
-
- Endpoint endpoint = new Endpoint(graph, "forward", "*");
-
- System.out.println("Used " + bench.args.nbNodes + " random nodes (results are in seconds):");
- bench.createCSVLogFile();
- bench.timeEndpoint("git bundle", graph, nodeIds, endpoint::visitNodes);
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java
deleted file mode 100644
index ee4c530..0000000
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Random.java
+++ /dev/null
@@ -1,67 +0,0 @@
-package org.softwareheritage.graph.benchmark.utils;
-
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Node;
-
-import java.util.PrimitiveIterator;
-
-/**
- * Random related utility class.
- *
- * @author The Software Heritage developers
- */
-
-public class Random {
- /** Internal pseudorandom generator */
- java.util.Random random;
-
- /**
- * Constructor.
- */
- public Random() {
- this.random = new java.util.Random();
- }
-
- /**
- * Constructor.
- *
- * @param seed random generator seed
- */
- public Random(long seed) {
- this.random = new java.util.Random(seed);
- }
-
- /**
- * Generates random node ids.
- *
- * @param graph graph used to pick node ids
- * @param nbNodes number of node ids to generate
- * @return an array of random node ids
- */
- public long[] generateNodeIds(Graph graph, int nbNodes) {
- return random.longs(nbNodes, 0, graph.numNodes()).toArray();
- }
-
- /**
- * Generates random node ids with a specific type.
- *
- * @param graph graph used to pick node ids
- * @param nbNodes number of node ids to generate
- * @param expectedType specific node type to pick
- * @return an array of random node ids
- */
- public long[] generateNodeIdsOfType(Graph graph, int nbNodes, Node.Type expectedType) {
- PrimitiveIterator.OfLong nodes = random.longs(0, graph.numNodes()).iterator();
- long[] nodeIds = new long[nbNodes];
-
- long nextId;
- for (int i = 0; i < nbNodes; i++) {
- do {
- nextId = nodes.nextLong();
- } while (graph.getNodeType(nextId) != expectedType);
- nodeIds[i] = nextId;
- }
-
- return nodeIds;
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Statistics.java b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Statistics.java
deleted file mode 100644
index 96bdfd0..0000000
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Statistics.java
+++ /dev/null
@@ -1,104 +0,0 @@
-package org.softwareheritage.graph.benchmark.utils;
-
-import java.util.ArrayList;
-import java.util.Collections;
-
-/**
- * Compute various statistics on a list of values.
- *
- * @author The Software Heritage developers
- */
-
-public class Statistics {
- /** Input values */
- ArrayList<Double> values;
-
- /**
- * Constructor.
- *
- * @param values input values
- */
- public Statistics(ArrayList<Double> values) {
- this.values = values;
- }
-
- /**
- * Returns the minimum value.
- *
- * @return minimum value
- */
- public double getMin() {
- double min = Double.POSITIVE_INFINITY;
- for (double v : values) {
- min = Math.min(min, v);
- }
- return min;
- }
-
- /**
- * Returns the maximum value.
- *
- * @return maximum value
- */
- public double getMax() {
- double max = Double.NEGATIVE_INFINITY;
- for (double v : values) {
- max = Math.max(max, v);
- }
- return max;
- }
-
- /**
- * Computes the average.
- *
- * @return average value
- */
- public double getAverage() {
- double sum = 0;
- for (double v : values) {
- sum += v;
- }
- return sum / (double) values.size();
- }
-
- /**
- * Returns the median value.
- *
- * @return median value
- */
- public double getMedian() {
- Collections.sort(values);
- int length = values.size();
- if (length % 2 == 0) {
- return (values.get(length / 2) + values.get(length / 2 - 1)) / 2;
- } else {
- return values.get(length / 2);
- }
- }
-
- /**
- * Computes the standard deviation.
- *
- * @return standard deviation value
- */
- public double getStandardDeviation() {
- double average = getAverage();
- double variance = 0;
- for (double v : values) {
- variance += (v - average) * (v - average);
- }
- variance /= (double) values.size();
- return Math.sqrt(variance);
- }
-
- /**
- * Computes and prints all statistical values.
- */
- public void printAll() {
- System.out.println("min value: " + getMin());
- System.out.println("max value: " + getMax());
- System.out.println("average: " + getAverage());
- System.out.println("median: " + getMedian());
- System.out.println("standard deviation: " + getStandardDeviation());
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Timing.java b/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Timing.java
deleted file mode 100644
index de5de6c..0000000
--- a/java/src/main/java/org/softwareheritage/graph/benchmark/utils/Timing.java
+++ /dev/null
@@ -1,30 +0,0 @@
-package org.softwareheritage.graph.benchmark.utils;
-
-/**
- * Time measurement utility class.
- *
- * @author The Software Heritage developers
- */
-
-public class Timing {
- /**
- * Returns measurement starting timestamp.
- *
- * @return timestamp used for time measurement
- */
- public static long start() {
- return System.nanoTime();
- }
-
- /**
- * Ends timing measurement and returns total duration in seconds.
- *
- * @param startTime measurement starting timestamp
- * @return time in seconds elapsed since starting point
- */
- public static double stop(long startTime) {
- long endTime = System.nanoTime();
- double duration = (double) (endTime - startTime) / 1_000_000_000;
- return duration;
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java
new file mode 100644
index 0000000..1f12744
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/compress/CSVEdgeDataset.java
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.compress;
+
+import com.github.luben.zstd.ZstdInputStream;
+import it.unimi.dsi.fastutil.bytes.ByteArrays;
+import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
+/**
+ * A graph dataset in (zstd-compressed) CSV format.
+ *
+ * This format does not contain any properties apart from the SWHIDs of the nodes, and optionally
+ * the labels of the edges and the permissions of the directory entries.
+ *
+ * The structure of the dataset is as follows: one directory per object type, each containing:
+ *
+ *
+ *
+ * <ul>
+ * <li>a number of files *.nodes.csv.zst containing the SWHIDs of the objects stored in
+ * the graph, one per line.</li>
+ * <li>a number of files *.edges.csv.zst containing the edges of the graph, one per
+ * line. The format of each edge is as follows:
+ * SRC_SWHID DST_SWHID [BASE64_LABEL] [INT_PERMISSION].</li>
+ * </ul>
+ * SRC_SWHID DST_SWHID [BASE64_LABEL] [INT_PERMISSION].
+ *
+ *
+ */
+public class CSVEdgeDataset implements GraphDataset {
+ final static Logger logger = LoggerFactory.getLogger(CSVEdgeDataset.class);
+
+ final private File datasetDir;
+
+ public CSVEdgeDataset(String datasetPath) {
+ this(new File(datasetPath));
+ }
+
+ public CSVEdgeDataset(File datasetDir) {
+ if (!datasetDir.exists()) {
+ throw new IllegalArgumentException("Dataset " + datasetDir.getName() + " does not exist");
+ }
+ this.datasetDir = datasetDir;
+ }
+
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ File[] allTables = datasetDir.listFiles();
+ if (allTables == null) {
+ return;
+ }
+ for (File tableFile : allTables) {
+ File[] allCsvFiles = tableFile.listFiles();
+ if (allCsvFiles == null) {
+ continue;
+ }
+ for (File csvFile : allCsvFiles) {
+ if (csvFile.getName().endsWith(".edges.csv.zst")) {
+ readEdgesCsvZst(csvFile.getPath(), edgeCb);
+ } else if (csvFile.getName().endsWith(".nodes.csv.zst")) {
+ readNodesCsvZst(csvFile.getPath(), nodeCb);
+ }
+ }
+ }
+ }
+
+ public static void readEdgesCsvZst(String csvZstPath, GraphDataset.EdgeCallback cb) throws IOException {
+ InputStream csvInputStream = new ZstdInputStream(new BufferedInputStream(new FileInputStream(csvZstPath)));
+ readEdgesCsv(csvInputStream, cb);
+ }
+
+ public static void readEdgesCsv(InputStream csvInputStream, GraphDataset.EdgeCallback cb) throws IOException {
+ FastBufferedInputStream csvReader = new FastBufferedInputStream(csvInputStream);
+
+ Charset charset = StandardCharsets.US_ASCII;
+ byte[] array = new byte[1024];
+ for (long line = 0;; line++) {
+ int start = 0, len;
+ while ((len = csvReader.readLine(array, start, array.length - start,
+ FastBufferedInputStream.ALL_TERMINATORS)) == array.length - start) {
+ start += len;
+ array = ByteArrays.grow(array, array.length + 1);
+ }
+ if (len == -1)
+ break; // EOF
+ final int lineLength = start + len;
+
+ // Skip whitespace at the start of the line.
+ int offset = 0;
+ while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ')
+ offset++;
+ if (offset == lineLength) {
+ continue;
+ }
+ if (array[0] == '#')
+ continue;
+
+ // Scan source id.
+ start = offset;
+ while (offset < lineLength && (array[offset] < 0 || array[offset] > ' '))
+ offset++;
+ final byte[] ss = Arrays.copyOfRange(array, start, offset);
+
+ // Skip whitespace between identifiers.
+ while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ')
+ offset++;
+ if (offset == lineLength) {
+ logger.error("Error at line " + line + ": no target");
+ continue;
+ }
+
+ // Scan target ID
+ start = offset;
+ while (offset < lineLength && (array[offset] < 0 || array[offset] > ' '))
+ offset++;
+ final byte[] ts = Arrays.copyOfRange(array, start, offset);
+
+ // Skip whitespace between identifiers.
+ while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ')
+ offset++;
+ // Scan label
+ byte[] ls = null;
+ if (offset < lineLength) {
+ start = offset;
+ while (offset < lineLength && (array[offset] < 0 || array[offset] > ' '))
+ offset++;
+ ls = Arrays.copyOfRange(array, start, offset);
+ }
+
+ // Skip whitespace between identifiers.
+ while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ')
+ offset++;
+ // Scan permission
+ int permission = 0;
+ if (offset < lineLength) {
+ start = offset;
+ while (offset < lineLength && (array[offset] < 0 || array[offset] > ' '))
+ offset++;
+ permission = Integer.parseInt(new String(array, start, offset - start, charset));
+ }
+
+ cb.onEdge(ss, ts, ls, permission);
+ }
+ }
+
+ public static void readNodesCsvZst(String csvZstPath, GraphDataset.NodeCallback cb) throws IOException {
+ InputStream csvInputStream = new ZstdInputStream(new BufferedInputStream(new FileInputStream(csvZstPath)));
+ readNodesCsv(csvInputStream, cb);
+ }
+
+ public static void readNodesCsv(InputStream csvInputStream, GraphDataset.NodeCallback cb) throws IOException {
+ FastBufferedInputStream csvReader = new FastBufferedInputStream(csvInputStream);
+
+ byte[] array = new byte[1024];
+ for (long line = 0;; line++) {
+ int start = 0, len;
+ while ((len = csvReader.readLine(array, start, array.length - start,
+ FastBufferedInputStream.ALL_TERMINATORS)) == array.length - start) {
+ start += len;
+ array = ByteArrays.grow(array, array.length + 1);
+ }
+ if (len == -1)
+ break; // EOF
+ final int lineLength = start + len;
+
+ // Skip whitespace at the start of the line.
+ int offset = 0;
+ while (offset < lineLength && array[offset] >= 0 && array[offset] <= ' ')
+ offset++;
+ if (offset == lineLength) {
+ continue;
+ }
+ if (array[0] == '#')
+ continue;
+
+ // Scan source id.
+ start = offset;
+ while (offset < lineLength && (array[offset] < 0 || array[offset] > ' '))
+ offset++;
+ final byte[] ss = Arrays.copyOfRange(array, start, offset);
+
+ cb.onNode(ss);
+ }
+ }
+}
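The javadoc at the top of this file describes the on-disk layout and the whitespace-separated edge format (SRC_SWHID DST_SWHID [BASE64_LABEL] [INT_PERMISSION]). A minimal sketch of how the reader is meant to be driven through its callbacks, assuming a hypothetical dataset directory; the constructor and callback signatures are the ones defined in this file:

    import org.softwareheritage.graph.compress.CSVEdgeDataset;

    public class CsvDatasetStats {
        public static void main(String[] args) throws Exception {
            // Hypothetical path: a directory with per-type subdirectories of
            // *.nodes.csv.zst and *.edges.csv.zst files, as described above.
            CSVEdgeDataset dataset = new CSVEdgeDataset("/srv/example-dataset");
            long[] counts = new long[2]; // [0] = node lines, [1] = edge lines
            dataset.readEdges(
                    node -> counts[0]++,
                    (src, dst, label, permission) -> counts[1]++);
            System.out.println(counts[0] + " node lines, " + counts[1] + " edge lines");
        }
    }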
diff --git a/java/src/main/java/org/softwareheritage/graph/utils/ComposePermutations.java b/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java
similarity index 85%
rename from java/src/main/java/org/softwareheritage/graph/utils/ComposePermutations.java
rename to java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java
index 3e094e8..62d3460 100644
--- a/java/src/main/java/org/softwareheritage/graph/utils/ComposePermutations.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/ComposePermutations.java
@@ -1,51 +1,58 @@
-package org.softwareheritage.graph.utils;
+/*
+ * Copyright (c) 2021-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.compress;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.io.BinIO;
import java.io.File;
import java.io.IOException;
/**
* CLI program used to compose two on-disk permutations.
*
* It takes two on-disk permutations as parameters, p1 and p2, and writes on disk (p1 o p2) at the
- * given location. This is useful for multi-step compression (e.g. Unordered -> BFS -> LLP), as it
+ * given location. This is useful for multi-step compression (e.g., Unordered -> BFS -> LLP), as it
* can be used to merge all the intermediate permutations.
*/
public class ComposePermutations {
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{
new UnflaggedOption("firstPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED, "The first permutation"),
new UnflaggedOption("secondPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED,
"The second permutation"),
new UnflaggedOption("outputPermutation", JSAP.STRING_PARSER, JSAP.REQUIRED,
"The output permutation"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
public static void main(String[] args) throws IOException, ClassNotFoundException {
JSAPResult config = parse_args(args);
String firstPermFilename = config.getString("firstPermutation");
String secondPermFilename = config.getString("secondPermutation");
String outputPermFilename = config.getString("outputPermutation");
long[][] firstPerm = BinIO.loadLongsBig(new File(firstPermFilename));
long[][] secondPerm = BinIO.loadLongsBig(new File(secondPermFilename));
long[][] outputPerm = Util.composePermutationsInPlace(firstPerm, secondPerm);
BinIO.storeLongs(outputPerm, outputPermFilename);
}
}
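Since ComposePermutations only glues existing BinIO-serialized permutations together, a tiny end-to-end sketch may help; the file paths are hypothetical and the toy permutations stand in for the intermediate orderings (e.g., BFS and LLP) produced during compression:

    import it.unimi.dsi.fastutil.BigArrays;
    import it.unimi.dsi.fastutil.io.BinIO;
    import org.softwareheritage.graph.compress.ComposePermutations;

    public class ComposePermutationsDemo {
        public static void main(String[] args) throws Exception {
            // Two toy permutations over 4 nodes, stored on disk as big arrays of longs.
            BinIO.storeLongs(BigArrays.wrap(new long[]{2, 0, 3, 1}), "/tmp/p1.order");
            BinIO.storeLongs(BigArrays.wrap(new long[]{1, 2, 3, 0}), "/tmp/p2.order");

            // Writes the composed permutation (p1 o p2) to /tmp/p1p2.order, which is how
            // the multi-step compression pipeline merges intermediate orderings.
            ComposePermutations.main(new String[]{"/tmp/p1.order", "/tmp/p2.order", "/tmp/p1p2.order"});
        }
    }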
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java b/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java
new file mode 100644
index 0000000..9e2ad40
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/compress/ExtractNodes.java
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.compress;
+
+import com.github.luben.zstd.ZstdOutputStream;
+import com.martiansoftware.jsap.*;
+import it.unimi.dsi.logging.ProgressLogger;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.softwareheritage.graph.SwhType;
+import org.softwareheritage.graph.utils.Sort;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicLongArray;
+
+/**
+ * Read a graph dataset and extract all the unique node SWHIDs it contains, including the ones that
+ * are not stored as actual objects in the graph, but only referred to by the edges.
+ * Additionally, extract the set of all unique edge labels in the graph.
+ *
+ *
+ *
+ * <ul>
+ * <li>The set of nodes is written in ${outputBasename}.nodes.csv.zst, as a
+ * zst-compressed sorted list of SWHIDs, one per line.</li>
+ *
+ * <li>The set of edge labels is written in ${outputBasename}.labels.csv.zst, as a
+ * zst-compressed sorted list of labels encoded in base64, one per line.</li>
+ *
+ * <li>The number of unique nodes referred to in the graph is written in a text file,
+ * ${outputBasename}.nodes.count.txt</li>
+ *
+ * <li>The number of unique edges referred to in the graph is written in a text file,
+ * ${outputBasename}.edges.count.txt</li>
+ *
+ * <li>The number of unique edge labels is written in a text file,
+ * ${outputBasename}.labels.count.txt</li>
+ *
+ * <li>Statistics on the number of nodes of each type are written in a text file,
+ * ${outputBasename}.nodes.stats.txt</li>
+ *
+ * <li>Statistics on the number of edges of each type are written in a text file,
+ * ${outputBasename}.edges.stats.txt</li>
+ * </ul>
+ *
+ *
+ *
+ * Rationale: Because the graph can contain holes, loose objects and dangling
+ * objects, some nodes that are referred to as destinations in the edge relationships might not
+ * actually be stored in the graph itself. However, to compress the graph using a graph compression
+ * library, it is necessary to have a list of all the nodes in the graph, including the
+ * ones that are simply referred to by the edges but not actually stored as concrete objects.
+ *
+ *
+ *
+ * This class reads the entire graph dataset, and uses sort -u to extract the set of
+ * all the unique nodes and unique labels that will be needed as an input for the compression
+ * process.
+ *
+ */
+public class ExtractNodes {
+ private final static Logger logger = LoggerFactory.getLogger(ExtractNodes.class);
+
+ // Create one thread per processor.
+ final static int numThreads = Runtime.getRuntime().availableProcessors();
+
+ // Allocate up to 20% of maximum memory for sorting subprocesses.
+ final static long sortBufferSize = (long) (Runtime.getRuntime().maxMemory() * 0.2 / numThreads / 2);
+
+ private static JSAPResult parseArgs(String[] args) {
+ JSAPResult config = null;
+ try {
+ SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{
+ new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the edges dataset"),
+ new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
+ "Basename of the output files"),
+
+ new FlaggedOption("format", JSAP.STRING_PARSER, "orc", JSAP.NOT_REQUIRED, 'f', "format",
+ "Format of the input dataset (orc, csv)"),
+ new FlaggedOption("sortBufferSize", JSAP.STRING_PARSER, String.valueOf(sortBufferSize) + "b",
+ JSAP.NOT_REQUIRED, 'S', "sort-buffer-size",
+ "Size of the memory buffer used by each sort process"),
+ new FlaggedOption("sortTmpDir", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'T', "temp-dir",
+ "Path to the temporary directory used by sort")});
+
+ config = jsap.parse(args);
+ if (jsap.messagePrinted()) {
+ System.exit(1);
+ }
+ } catch (JSAPException e) {
+ System.err.println("Usage error: " + e.getMessage());
+ System.exit(1);
+ }
+ return config;
+ }
+
+ public static void main(String[] args) throws IOException, InterruptedException {
+ JSAPResult parsedArgs = parseArgs(args);
+ String datasetPath = parsedArgs.getString("dataset");
+ String outputBasename = parsedArgs.getString("outputBasename");
+
+ String datasetFormat = parsedArgs.getString("format");
+ String sortBufferSize = parsedArgs.getString("sortBufferSize");
+ String sortTmpPath = parsedArgs.getString("sortTmpDir", null);
+
+ File sortTmpDir = new File(sortTmpPath);
+ sortTmpDir.mkdirs();
+
+ // Open edge dataset
+ GraphDataset dataset;
+ if (datasetFormat.equals("orc")) {
+ dataset = new ORCGraphDataset(datasetPath);
+ } else if (datasetFormat.equals("csv")) {
+ dataset = new CSVEdgeDataset(datasetPath);
+ } else {
+ throw new IllegalArgumentException("Unknown dataset format: " + datasetFormat);
+ }
+
+ extractNodes(dataset, outputBasename, sortBufferSize, sortTmpDir);
+ }
+
+ public static void extractNodes(GraphDataset dataset, String outputBasename, String sortBufferSize, File sortTmpDir)
+ throws IOException, InterruptedException {
+ // Read the dataset and write the nodes and labels to the sorting processes
+ AtomicLong edgeCount = new AtomicLong(0);
+ AtomicLongArray edgeCountByType = new AtomicLongArray(SwhType.values().length * SwhType.values().length);
+
+ int numThreads = Runtime.getRuntime().availableProcessors();
+ ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads);
+
+ Process[] nodeSorters = new Process[numThreads];
+ File[] nodeBatchPaths = new File[numThreads];
+ Process[] labelSorters = new Process[numThreads];
+ File[] labelBatches = new File[numThreads];
+ long[] progressCounts = new long[numThreads];
+
+ AtomicInteger nextThreadId = new AtomicInteger(0);
+ ThreadLocal<Integer> threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement);
+
+ ProgressLogger pl = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
+ pl.itemsName = "edges";
+ pl.start("Reading node/edge files and writing sorted batches.");
+
+ GraphDataset.NodeCallback nodeCallback = (node) -> {
+ int threadId = threadLocalId.get();
+ if (nodeSorters[threadId] == null) {
+ nodeBatchPaths[threadId] = File.createTempFile("nodes", ".txt", sortTmpDir);
+ nodeSorters[threadId] = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(),
+ List.of("-o", nodeBatchPaths[threadId].getPath()));
+ }
+ OutputStream nodeOutputStream = nodeSorters[threadId].getOutputStream();
+ nodeOutputStream.write(node);
+ nodeOutputStream.write('\n');
+ };
+
+ GraphDataset.NodeCallback labelCallback = (label) -> {
+ int threadId = threadLocalId.get();
+ if (labelSorters[threadId] == null) {
+ labelBatches[threadId] = File.createTempFile("labels", ".txt", sortTmpDir);
+ labelSorters[threadId] = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(),
+ List.of("-o", labelBatches[threadId].getPath()));
+ }
+ OutputStream labelOutputStream = labelSorters[threadId].getOutputStream();
+ labelOutputStream.write(label);
+ labelOutputStream.write('\n');
+ };
+
+ try {
+ forkJoinPool.submit(() -> {
+ try {
+ dataset.readEdges((node) -> {
+ nodeCallback.onNode(node);
+ }, (src, dst, label, perm) -> {
+ nodeCallback.onNode(src);
+ nodeCallback.onNode(dst);
+
+ if (label != null) {
+ labelCallback.onNode(label);
+ }
+ edgeCount.incrementAndGet();
+ // Extract type of src and dst from their SWHID: swh:1:XXX
+ byte[] srcTypeBytes = Arrays.copyOfRange(src, 6, 6 + 3);
+ byte[] dstTypeBytes = Arrays.copyOfRange(dst, 6, 6 + 3);
+ int srcType = SwhType.byteNameToInt(srcTypeBytes);
+ int dstType = SwhType.byteNameToInt(dstTypeBytes);
+ if (srcType != -1 && dstType != -1) {
+ edgeCountByType.incrementAndGet(srcType * SwhType.values().length + dstType);
+ } else {
+ System.err.println("Invalid edge type: " + new String(srcTypeBytes) + " -> "
+ + new String(dstTypeBytes));
+ System.exit(1);
+ }
+
+ int threadId = threadLocalId.get();
+ if (++progressCounts[threadId] > 1000) {
+ synchronized (pl) {
+ pl.update(progressCounts[threadId]);
+ }
+ progressCounts[threadId] = 0;
+ }
+ });
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }).get();
+ } catch (ExecutionException e) {
+ throw new RuntimeException(e);
+ }
+
+ // Close all the sorters stdin
+ for (int i = 0; i < numThreads; i++) {
+ if (nodeSorters[i] != null) {
+ nodeSorters[i].getOutputStream().close();
+ }
+ if (labelSorters[i] != null) {
+ labelSorters[i].getOutputStream().close();
+ }
+ }
+
+ // Wait for sorting processes to finish
+ for (int i = 0; i < numThreads; i++) {
+ if (nodeSorters[i] != null) {
+ nodeSorters[i].waitFor();
+ }
+ if (labelSorters[i] != null) {
+ labelSorters[i].waitFor();
+ }
+ }
+ pl.done();
+
+ ArrayList<String> nodeSortMergerOptions = new ArrayList<>(List.of("-m"));
+ ArrayList<String> labelSortMergerOptions = new ArrayList<>(List.of("-m"));
+ for (int i = 0; i < numThreads; i++) {
+ if (nodeBatchPaths[i] != null) {
+ nodeSortMergerOptions.add(nodeBatchPaths[i].getPath());
+ }
+ if (labelBatches[i] != null) {
+ labelSortMergerOptions.add(labelBatches[i].getPath());
+ }
+ }
+
+ // Spawn node merge-sorting process
+ Process nodeSortMerger = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), nodeSortMergerOptions);
+ nodeSortMerger.getOutputStream().close();
+ OutputStream nodesFileOutputStream = new ZstdOutputStream(
+ new BufferedOutputStream(new FileOutputStream(outputBasename + ".nodes.csv.zst")));
+ NodesOutputThread nodesOutputThread = new NodesOutputThread(
+ new BufferedInputStream(nodeSortMerger.getInputStream()), nodesFileOutputStream);
+ nodesOutputThread.start();
+
+ // Spawn label merge-sorting process
+ Process labelSortMerger = Sort.spawnSort(sortBufferSize, sortTmpDir.getPath(), labelSortMergerOptions);
+ labelSortMerger.getOutputStream().close();
+ OutputStream labelsFileOutputStream = new ZstdOutputStream(
+ new BufferedOutputStream(new FileOutputStream(outputBasename + ".labels.csv.zst")));
+ LabelsOutputThread labelsOutputThread = new LabelsOutputThread(
+ new BufferedInputStream(labelSortMerger.getInputStream()), labelsFileOutputStream);
+ labelsOutputThread.start();
+
+ pl.logger().info("Waiting for merge-sort and writing output files...");
+ nodeSortMerger.waitFor();
+ labelSortMerger.waitFor();
+ nodesOutputThread.join();
+ labelsOutputThread.join();
+
+ long[][] edgeCountByTypeArray = new long[SwhType.values().length][SwhType.values().length];
+ for (int i = 0; i < edgeCountByTypeArray.length; i++) {
+ for (int j = 0; j < edgeCountByTypeArray[i].length; j++) {
+ edgeCountByTypeArray[i][j] = edgeCountByType.get(i * SwhType.values().length + j);
+ }
+ }
+
+ // Write node, edge and label counts/statistics
+ printEdgeCounts(outputBasename, edgeCount.get(), edgeCountByTypeArray);
+ printNodeCounts(outputBasename, nodesOutputThread.getNodeCount(), nodesOutputThread.getNodeTypeCounts());
+ printLabelCounts(outputBasename, labelsOutputThread.getLabelCount());
+
+ // Clean up sorted batches
+ for (int i = 0; i < numThreads; i++) {
+ if (nodeBatchPaths[i] != null) {
+ nodeBatchPaths[i].delete();
+ }
+ if (labelBatches[i] != null) {
+ labelBatches[i].delete();
+ }
+ }
+ }
+
+ private static void printEdgeCounts(String basename, long edgeCount, long[][] edgeTypeCounts) throws IOException {
+ PrintWriter nodeCountWriter = new PrintWriter(basename + ".edges.count.txt");
+ nodeCountWriter.println(edgeCount);
+ nodeCountWriter.close();
+
+ PrintWriter nodeTypesCountWriter = new PrintWriter(basename + ".edges.stats.txt");
+ TreeMap<String, Long> edgeTypeCountsMap = new TreeMap<>();
+ for (SwhType src : SwhType.values()) {
+ for (SwhType dst : SwhType.values()) {
+ long cnt = edgeTypeCounts[SwhType.toInt(src)][SwhType.toInt(dst)];
+ if (cnt > 0)
+ edgeTypeCountsMap.put(src.toString().toLowerCase() + ":" + dst.toString().toLowerCase(), cnt);
+ }
+ }
+ for (Map.Entry<String, Long> entry : edgeTypeCountsMap.entrySet()) {
+ nodeTypesCountWriter.println(entry.getKey() + " " + entry.getValue());
+ }
+ nodeTypesCountWriter.close();
+ }
+
+ private static void printNodeCounts(String basename, long nodeCount, long[] nodeTypeCounts) throws IOException {
+ PrintWriter nodeCountWriter = new PrintWriter(basename + ".nodes.count.txt");
+ nodeCountWriter.println(nodeCount);
+ nodeCountWriter.close();
+
+ PrintWriter nodeTypesCountWriter = new PrintWriter(basename + ".nodes.stats.txt");
+ TreeMap<String, Long> nodeTypeCountsMap = new TreeMap<>();
+ for (SwhType v : SwhType.values()) {
+ nodeTypeCountsMap.put(v.toString().toLowerCase(), nodeTypeCounts[SwhType.toInt(v)]);
+ }
+ for (Map.Entry<String, Long> entry : nodeTypeCountsMap.entrySet()) {
+ nodeTypesCountWriter.println(entry.getKey() + " " + entry.getValue());
+ }
+ nodeTypesCountWriter.close();
+ }
+
+ private static void printLabelCounts(String basename, long labelCount) throws IOException {
+ PrintWriter nodeCountWriter = new PrintWriter(basename + ".labels.count.txt");
+ nodeCountWriter.println(labelCount);
+ nodeCountWriter.close();
+ }
+
+ private static class NodesOutputThread extends Thread {
+ private final InputStream sortedNodesStream;
+ private final OutputStream nodesOutputStream;
+
+ private long nodeCount = 0;
+ private final long[] nodeTypeCounts = new long[SwhType.values().length];
+
+ NodesOutputThread(InputStream sortedNodesStream, OutputStream nodesOutputStream) {
+ this.sortedNodesStream = sortedNodesStream;
+ this.nodesOutputStream = nodesOutputStream;
+ }
+
+ @Override
+ public void run() {
+ BufferedReader reader = new BufferedReader(
+ new InputStreamReader(sortedNodesStream, StandardCharsets.UTF_8));
+ try {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ nodesOutputStream.write(line.getBytes(StandardCharsets.UTF_8));
+ nodesOutputStream.write('\n');
+ nodeCount++;
+ try {
+ SwhType nodeType = SwhType.fromStr(line.split(":")[2]);
+ nodeTypeCounts[SwhType.toInt(nodeType)]++;
+ } catch (ArrayIndexOutOfBoundsException e) {
+ System.err.println("Error parsing SWHID: " + line);
+ System.exit(1);
+ }
+ }
+ nodesOutputStream.close();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public long getNodeCount() {
+ return nodeCount;
+ }
+
+ public long[] getNodeTypeCounts() {
+ return nodeTypeCounts;
+ }
+ }
+
+ private static class LabelsOutputThread extends Thread {
+ private final InputStream sortedLabelsStream;
+ private final OutputStream labelsOutputStream;
+
+ private long labelCount = 0;
+
+ LabelsOutputThread(InputStream sortedLabelsStream, OutputStream labelsOutputStream) {
+ this.labelsOutputStream = labelsOutputStream;
+ this.sortedLabelsStream = sortedLabelsStream;
+ }
+
+ @Override
+ public void run() {
+ BufferedReader reader = new BufferedReader(
+ new InputStreamReader(sortedLabelsStream, StandardCharsets.UTF_8));
+ try {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ labelsOutputStream.write(line.getBytes(StandardCharsets.UTF_8));
+ labelsOutputStream.write('\n');
+ labelCount++;
+ }
+ labelsOutputStream.close();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public long getLabelCount() {
+ return labelCount;
+ }
+ }
+}
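A sketch of how this step slots into the pipeline, using the programmatic entry point rather than the CLI; all paths and the sort buffer size are hypothetical, and any GraphDataset implementation (ORC or CSV) can be passed in:

    import org.softwareheritage.graph.compress.CSVEdgeDataset;
    import org.softwareheritage.graph.compress.ExtractNodes;

    import java.io.File;

    public class ExtractNodesDemo {
        public static void main(String[] args) throws Exception {
            CSVEdgeDataset dataset = new CSVEdgeDataset("/srv/example-dataset");
            File sortTmpDir = new File("/tmp/extract-nodes-tmp");
            sortTmpDir.mkdirs();

            // Writes graph.nodes.csv.zst, graph.labels.csv.zst and the *.count.txt /
            // *.stats.txt files listed in the javadoc, next to the given basename.
            ExtractNodes.extractNodes(dataset, "/srv/compressed/graph", "256M", sortTmpDir);
        }
    }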
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java b/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java
new file mode 100644
index 0000000..fc5cc5b
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/compress/ExtractPersons.java
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.compress;
+
+import com.github.luben.zstd.ZstdOutputStream;
+import com.martiansoftware.jsap.*;
+import org.softwareheritage.graph.utils.Sort;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Read a graph dataset and extract all the unique authors it contains.
+ *
+ *
+ * This class reads the revision and release tables of the graph dataset, and uses
+ * sort -u to extract the set of all the unique persons (name + email, potentially
+ * pseudonymized) and store them in a file.
+ *
+ */
+public class ExtractPersons {
+ private static JSAPResult parseArgs(String[] args) {
+ JSAPResult config = null;
+ try {
+ SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{
+ new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC dataset"),
+ new UnflaggedOption("outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
+ "Basename of the output files"),
+
+ new FlaggedOption("sortBufferSize", JSAP.STRING_PARSER, "30%", JSAP.NOT_REQUIRED, 'S',
+ "sort-buffer-size", "Size of the memory buffer used by sort"),
+ new FlaggedOption("sortTmpDir", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'T', "temp-dir",
+ "Path to the temporary directory used by sort")});
+
+ config = jsap.parse(args);
+ if (jsap.messagePrinted()) {
+ System.exit(1);
+ }
+ } catch (JSAPException e) {
+ System.err.println("Usage error: " + e.getMessage());
+ System.exit(1);
+ }
+ return config;
+ }
+
+ private static void processAuthorColumn(ORCGraphDataset.SwhOrcTable table, String columnName, OutputStream stream)
+ throws IOException {
+ table.readBytes64Column(columnName, (swhid, personBase64) -> {
+ stream.write(personBase64);
+ stream.write('\n');
+ });
+ }
+
+ public static void main(String[] args) throws IOException, InterruptedException {
+ JSAPResult parsedArgs = parseArgs(args);
+ String datasetPath = parsedArgs.getString("dataset");
+ String outputBasename = parsedArgs.getString("outputBasename");
+
+ String sortBufferSize = parsedArgs.getString("sortBufferSize");
+ String sortTmpDir = parsedArgs.getString("sortTmpDir", null);
+
+ ORCGraphDataset dataset = new ORCGraphDataset(datasetPath);
+
+ extractPersons(dataset, outputBasename, sortBufferSize, sortTmpDir);
+ }
+
+ public static void extractPersons(ORCGraphDataset dataset, String outputBasename, String sortBufferSize,
+ String sortTmpDir) throws IOException, InterruptedException {
+ (new File(sortTmpDir)).mkdirs();
+
+ // Spawn person sorting process
+ Process personSort = Sort.spawnSort(sortBufferSize, sortTmpDir);
+ BufferedOutputStream personSortStdin = new BufferedOutputStream(personSort.getOutputStream());
+ BufferedInputStream personSortStdout = new BufferedInputStream(personSort.getInputStream());
+ OutputStream personsFileOutputStream = new ZstdOutputStream(
+ new BufferedOutputStream(new FileOutputStream(outputBasename + ".persons.csv.zst")));
+ PersonsOutputThread personsOutputThread = new PersonsOutputThread(personSortStdout, personsFileOutputStream);
+ personsOutputThread.start();
+
+ processAuthorColumn(dataset.getTable("release"), "author", personSortStdin);
+ processAuthorColumn(dataset.getTable("revision"), "author", personSortStdin);
+ processAuthorColumn(dataset.getTable("revision"), "committer", personSortStdin);
+
+ // Wait for sorting processes to finish
+ personSortStdin.close();
+ personSort.waitFor();
+ personsOutputThread.join();
+
+ // Write person count statistics
+ printPersonsCounts(outputBasename, personsOutputThread.getPersonCount());
+ }
+
+ private static void printPersonsCounts(String basename, long labelCount) throws IOException {
+ PrintWriter nodeCountWriter = new PrintWriter(basename + ".persons.count.txt");
+ nodeCountWriter.println(labelCount);
+ nodeCountWriter.close();
+ }
+
+ private static class PersonsOutputThread extends Thread {
+ private final InputStream sortedPersonsStream;
+ private final OutputStream personsOutputStream;
+
+ private long personCount = 0;
+
+ PersonsOutputThread(InputStream sortedNodesStream, OutputStream nodesOutputStream) {
+ this.sortedPersonsStream = sortedNodesStream;
+ this.personsOutputStream = nodesOutputStream;
+ }
+
+ @Override
+ public void run() {
+ BufferedReader reader = new BufferedReader(
+ new InputStreamReader(sortedPersonsStream, StandardCharsets.UTF_8));
+ try {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ personsOutputStream.write(line.getBytes(StandardCharsets.UTF_8));
+ personsOutputStream.write('\n');
+ personCount++;
+ }
+ personsOutputStream.close();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public long getPersonCount() {
+ return personCount;
+ }
+ }
+}
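Same shape as ExtractNodes: a short sketch of the programmatic call, with hypothetical paths and the default 30% sort buffer; the dataset must provide the ORC revision and release tables read above:

    import org.softwareheritage.graph.compress.ExtractPersons;
    import org.softwareheritage.graph.compress.ORCGraphDataset;

    public class ExtractPersonsDemo {
        public static void main(String[] args) throws Exception {
            ORCGraphDataset dataset = new ORCGraphDataset("/srv/example-dataset/orc");

            // Produces graph.persons.csv.zst (sorted, deduplicated base64-encoded persons)
            // and graph.persons.count.txt next to the given basename.
            ExtractPersons.extractPersons(dataset, "/srv/compressed/graph", "30%", "/tmp/extract-persons-tmp");
        }
    }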
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java
new file mode 100644
index 0000000..ae38cda
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/compress/GraphDataset.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.compress;
+
+import java.io.IOException;
+
+/**
+ * GraphDataset is a common interface to represent on-disk graph datasets in various formats,
+ * usually extracted from the SWH archive with the swh-dataset tool.
+ */
+public interface GraphDataset {
+ interface NodeCallback {
+ void onNode(byte[] node) throws IOException;
+ }
+
+ interface EdgeCallback {
+ void onEdge(byte[] src, byte[] dst, byte[] label, int permission) throws IOException;
+ }
+
+ /**
+ * Read the graph dataset and call the callback methods for each node and edge encountered.
+ *
+ *
+ *
+ * <ul>
+ * <li>The node callback is called for each object stored in the graph.</li>
+ *
+ * <li>The edge callback is called for each relationship (between two nodes) stored in the
+ * graph.</li>
+ * </ul>
+ *
+ *
+ *
+ * Note that because the graph can contain holes, loose objects and dangling objects, the edge
+ * callback may be called with parameters representing nodes that are not stored in the graph. This
+ * is because some nodes that are referred to as destinations in the dataset might not be present in
+ * the archive (e.g., a revision entry in a directory pointing to a revision that we have not
+ * crawled yet).
+ *
+ *
+ *
+ * In order to generate a complete set of all the nodes that are referred to in the graph
+ * dataset, see the {@link ExtractNodes} class.
+ *
+ *
+ * @param nodeCb callback for each node
+ * @param edgeCb callback for each edge
+ */
+ void readEdges(NodeCallback nodeCb, EdgeCallback edgeCb) throws IOException;
+
+ interface TimestampCallback {
+ void onTimestamp(byte[] swhid, long timestamp, short offset) throws IOException;
+ }
+
+ interface LongCallback {
+ void onLong(byte[] swhid, long value) throws IOException;
+ }
+
+ interface BytesCallback {
+ void onBytes(byte[] swhid, byte[] value) throws IOException;
+ }
+
+ interface HashedEdgeCallback {
+ void onHashedEdge(long src, long dst, long label, int permission) throws IOException;
+ }
+}
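Because the interface is callback-based, a toy in-memory implementation (with zero-filled placeholder SWHIDs) makes the contract concrete, in particular that an edge may reference a SWHID that was never reported through the node callback:

    import org.softwareheritage.graph.compress.GraphDataset;

    import java.io.IOException;
    import java.nio.charset.StandardCharsets;

    public class InMemoryGraphDataset implements GraphDataset {
        private static final byte[] DIR = "swh:1:dir:0000000000000000000000000000000000000001"
                .getBytes(StandardCharsets.US_ASCII);
        private static final byte[] CNT = "swh:1:cnt:0000000000000000000000000000000000000002"
                .getBytes(StandardCharsets.US_ASCII);

        @Override
        public void readEdges(NodeCallback nodeCb, EdgeCallback edgeCb) throws IOException {
            nodeCb.onNode(DIR);
            // CNT is deliberately never reported as a node: a dangling destination that
            // ExtractNodes has to recover from the edge callback alone.
            byte[] label = "ZXhhbXBsZS50eHQ=".getBytes(StandardCharsets.US_ASCII); // base64("example.txt")
            edgeCb.onEdge(DIR, CNT, label, 0100644);
        }
    }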
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java
new file mode 100644
index 0000000..31531ec
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/compress/LabelMapBuilder.java
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2020-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.compress;
+
+import com.martiansoftware.jsap.*;
+import it.unimi.dsi.big.webgraph.LazyLongIterator;
+import it.unimi.dsi.big.webgraph.labelling.ArcLabelledImmutableGraph;
+import it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph;
+import it.unimi.dsi.fastutil.Arrays;
+import it.unimi.dsi.fastutil.BigArrays;
+import it.unimi.dsi.fastutil.Size64;
+import it.unimi.dsi.fastutil.longs.LongBigArrays;
+import it.unimi.dsi.fastutil.longs.LongHeapSemiIndirectPriorityQueue;
+import it.unimi.dsi.fastutil.objects.Object2LongFunction;
+import it.unimi.dsi.fastutil.objects.ObjectArrayList;
+import it.unimi.dsi.io.InputBitStream;
+import it.unimi.dsi.io.OutputBitStream;
+import it.unimi.dsi.logging.ProgressLogger;
+import it.unimi.dsi.big.webgraph.ImmutableGraph;
+import it.unimi.dsi.big.webgraph.NodeIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.softwareheritage.graph.labels.DirEntry;
+import org.softwareheritage.graph.labels.SwhLabel;
+import org.softwareheritage.graph.maps.NodeIdMap;
+import org.softwareheritage.graph.utils.ForkJoinBigQuickSort2;
+import org.softwareheritage.graph.utils.ForkJoinQuickSort3;
+
+import java.io.*;
+import java.nio.file.Paths;
+import java.util.*;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.IntStream;
+
+public class LabelMapBuilder {
+ final static Logger logger = LoggerFactory.getLogger(LabelMapBuilder.class);
+
+ // Create one thread per processor.
+ final static int numThreads = Runtime.getRuntime().availableProcessors();
+ // Allocate up to 40% of maximum memory.
+ final static int DEFAULT_BATCH_SIZE = Math
+ .min((int) (Runtime.getRuntime().maxMemory() * 0.4 / (numThreads * 8 * 3)), Arrays.MAX_ARRAY_SIZE);
+
+ String orcDatasetPath;
+ String graphPath;
+ String outputGraphPath;
+ String tmpDir;
+ int batchSize;
+
+ long numNodes;
+ long numArcs;
+
+ NodeIdMap nodeIdMap;
+ Object2LongFunction<byte[]> filenameMph;
+ long numFilenames;
+ int totalLabelWidth;
+
+ public LabelMapBuilder(String orcDatasetPath, String graphPath, String outputGraphPath, int batchSize,
+ String tmpDir) throws IOException {
+ this.orcDatasetPath = orcDatasetPath;
+ this.graphPath = graphPath;
+ this.outputGraphPath = (outputGraphPath == null) ? graphPath : outputGraphPath;
+ this.batchSize = batchSize;
+ this.tmpDir = tmpDir;
+
+ ImmutableGraph graph = ImmutableGraph.loadOffline(graphPath);
+ this.numArcs = graph.numArcs();
+ this.numNodes = graph.numNodes();
+
+ this.nodeIdMap = new NodeIdMap(graphPath);
+
+ filenameMph = NodeIdMap.loadMph(graphPath + ".labels.mph");
+ numFilenames = getMPHSize(filenameMph);
+ totalLabelWidth = DirEntry.labelWidth(numFilenames);
+ }
+
+ private static JSAPResult parse_args(String[] args) {
+ JSAPResult config = null;
+ try {
+ SimpleJSAP jsap = new SimpleJSAP(LabelMapBuilder.class.getName(), "", new Parameter[]{
+ new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"),
+ new UnflaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.REQUIRED, "Basename of the output graph"),
+ new FlaggedOption("outputGraphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o',
+ "output-graph", "Basename of the output graph, same as --graph if not specified"),
+ new FlaggedOption("batchSize", JSAP.INTEGER_PARSER, String.valueOf(DEFAULT_BATCH_SIZE),
+ JSAP.NOT_REQUIRED, 'b', "batch-size", "Number of triplets held in memory in each batch"),
+ new FlaggedOption("tmpDir", JSAP.STRING_PARSER, "tmp", JSAP.NOT_REQUIRED, 'T', "temp-dir",
+ "Temporary directory path"),});
+
+ config = jsap.parse(args);
+ if (jsap.messagePrinted()) {
+ System.exit(1);
+ }
+ } catch (JSAPException e) {
+ e.printStackTrace();
+ }
+ return config;
+ }
+
+ public static void main(String[] args) throws IOException, InterruptedException {
+ JSAPResult config = parse_args(args);
+ String orcDataset = config.getString("dataset");
+ String graphPath = config.getString("graphPath");
+ String outputGraphPath = config.getString("outputGraphPath");
+ int batchSize = config.getInt("batchSize");
+ String tmpDir = config.getString("tmpDir");
+
+ LabelMapBuilder builder = new LabelMapBuilder(orcDataset, graphPath, outputGraphPath, batchSize, tmpDir);
+
+ builder.computeLabelMap();
+ }
+
+ static long getMPHSize(Object2LongFunction<?> mph) {
+ return (mph instanceof Size64) ? ((Size64) mph).size64() : mph.size();
+ }
+
+ void computeLabelMap() throws IOException {
+ File tempDirFile = new File(tmpDir);
+ ObjectArrayList<File> forwardBatches = new ObjectArrayList<>();
+ ObjectArrayList<File> backwardBatches = new ObjectArrayList<>();
+ genSortedBatches(forwardBatches, backwardBatches, tempDirFile);
+
+ BatchEdgeLabelLineIterator forwardBatchHeapIterator = new BatchEdgeLabelLineIterator(forwardBatches);
+ writeLabels(forwardBatchHeapIterator, graphPath, outputGraphPath);
+ for (File batch : forwardBatches) {
+ batch.delete();
+ }
+
+ BatchEdgeLabelLineIterator backwardBatchHeapIterator = new BatchEdgeLabelLineIterator(backwardBatches);
+ writeLabels(backwardBatchHeapIterator, graphPath + "-transposed", outputGraphPath + "-transposed");
+ for (File batch : backwardBatches) {
+ batch.delete();
+ }
+
+ logger.info("Done");
+ }
+
+ void genSortedBatches(ObjectArrayList<File> forwardBatches, ObjectArrayList<File> backwardBatches, File tempDirFile)
+ throws IOException {
+ logger.info("Initializing batch arrays.");
+ long[][] srcArrays = new long[numThreads][batchSize];
+ long[][] dstArrays = new long[numThreads][batchSize];
+ long[][] labelArrays = new long[numThreads][batchSize];
+ int[] indexes = new int[numThreads];
+ long[] progressCounts = new long[numThreads];
+
+ ProgressLogger plSortingBatches = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
+ plSortingBatches.itemsName = "edges";
+ plSortingBatches.expectedUpdates = this.numArcs;
+ plSortingBatches.start("Reading edges and writing sorted batches.");
+
+ AtomicInteger nextThreadId = new AtomicInteger(0);
+ ThreadLocal<Integer> threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement);
+
+ readHashedEdgeLabels((src, dst, label, perms) -> {
+ // System.err.println("0. Input " + src + " " + dst + " " + label + " " + perms);
+ int threadId = threadLocalId.get();
+ int idx = indexes[threadId]++;
+ srcArrays[threadId][idx] = src;
+ dstArrays[threadId][idx] = dst;
+ labelArrays[threadId][idx] = DirEntry.toEncoded(label, perms);
+ if (++progressCounts[threadId] > 1000) {
+ synchronized (plSortingBatches) {
+ plSortingBatches.update(progressCounts[threadId]);
+ }
+ progressCounts[threadId] = 0;
+ }
+
+ if (idx == batchSize - 1) {
+ processBidirectionalBatches(batchSize, srcArrays[threadId], dstArrays[threadId], labelArrays[threadId],
+ tempDirFile, forwardBatches, backwardBatches);
+ indexes[threadId] = 0;
+ }
+ });
+
+ IntStream.range(0, numThreads).parallel().forEach(t -> {
+ int idx = indexes[t];
+ if (idx > 0) {
+ try {
+ processBidirectionalBatches(idx, srcArrays[t], dstArrays[t], labelArrays[t], tempDirFile,
+ forwardBatches, backwardBatches);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ });
+
+ // Trigger the GC to free up the large arrays
+ for (int i = 0; i < numThreads; i++) {
+ srcArrays[i] = null;
+ dstArrays[i] = null;
+ labelArrays[i] = null;
+ }
+
+ logger.info("Created " + forwardBatches.size() + " forward batches and " + backwardBatches.size()
+ + " backward batches.");
+ }
+
+ void readHashedEdgeLabels(GraphDataset.HashedEdgeCallback cb) throws IOException {
+ ORCGraphDataset dataset = new ORCGraphDataset(orcDatasetPath);
+ ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads);
+ try {
+ forkJoinPool.submit(() -> {
+ try {
+ dataset.readEdges((node) -> {
+ }, (src, dst, label, perms) -> {
+ if (label == null) {
+ return;
+ }
+ long srcNode = nodeIdMap.getNodeId(src);
+ long dstNode = nodeIdMap.getNodeId(dst);
+ long labelId = filenameMph.getLong(label);
+ cb.onHashedEdge(srcNode, dstNode, labelId, perms);
+ });
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }).get();
+ } catch (InterruptedException | ExecutionException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ void processBidirectionalBatches(final int n, final long[] source, final long[] target, final long[] labels,
+ final File tempDir, final List<File> forwardBatches, final List<File> backwardBatches) throws IOException {
+ processBatch(n, source, target, labels, tempDir, forwardBatches);
+ processBatch(n, target, source, labels, tempDir, backwardBatches);
+ }
+
+ void processBatch(final int n, final long[] source, final long[] target, final long[] labels, final File tempDir,
+ final List<File> batches) throws IOException {
+ if (n == 0) {
+ return;
+ }
+ ForkJoinQuickSort3.parallelQuickSort(source, target, labels, 0, n);
+
+ final File batchFile = File.createTempFile("batch", ".bitstream", tempDir);
+ batchFile.deleteOnExit();
+ batches.add(batchFile);
+ final OutputBitStream batch = new OutputBitStream(batchFile);
+
+ // Compute unique triplets
+ int u = 1;
+ for (int i = n - 1; i-- != 0;) {
+ if (source[i] != source[i + 1] || target[i] != target[i + 1] || labels[i] != labels[i + 1]) {
+ u++;
+ }
+ }
+ batch.writeDelta(u);
+
+ // Write batch
+ long prevSource = source[0];
+ batch.writeLongDelta(prevSource);
+ batch.writeLongDelta(target[0]);
+ batch.writeLongDelta(labels[0]);
+ // System.err.println("1. Wrote " + prevSource + " " + target[0] + " " + labels[0]);
+
+ for (int i = 1; i < n; i++) {
+ if (source[i] != prevSource) {
+ // Default case, we write (source - prevsource, target, label)
+ batch.writeLongDelta(source[i] - prevSource);
+ batch.writeLongDelta(target[i]);
+ batch.writeLongDelta(labels[i]);
+ prevSource = source[i];
+ } else if (target[i] != target[i - 1] || labels[i] != labels[i - 1]) {
+ // Case where source is identical with prevsource, but target or label differ.
+ // We write (0, target - prevtarget, label)
+ batch.writeLongDelta(0);
+ batch.writeLongDelta(target[i] - target[i - 1]);
+ batch.writeLongDelta(labels[i]);
+ } else {
+ continue;
+ }
+ // System.err.println("1. Wrote " + source[i] + " " + target[i] + " " + labels[i]);
+ }
+ batch.close();
+ }
+
+ void writeLabels(EdgeLabelLineIterator mapLines, String graphBasename, String outputGraphBasename)
+ throws IOException {
+ // Loading the graph to iterate
+ ImmutableGraph graph = ImmutableGraph.loadMapped(graphBasename);
+
+ // Get the sorted output and write the labels and label offsets
+ ProgressLogger plLabels = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
+ plLabels.itemsName = "edges";
+ plLabels.expectedUpdates = this.numArcs;
+ plLabels.start("Writing the labels to the label file: " + outputGraphBasename + "-labelled.*");
+
+ OutputBitStream labels = new OutputBitStream(
+ new File(outputGraphBasename + "-labelled" + BitStreamArcLabelledImmutableGraph.LABELS_EXTENSION));
+ OutputBitStream offsets = new OutputBitStream(new File(
+ outputGraphBasename + "-labelled" + BitStreamArcLabelledImmutableGraph.LABEL_OFFSETS_EXTENSION));
+ offsets.writeGamma(0);
+
+ EdgeLabelLine line = new EdgeLabelLine(-1, -1, -1, -1);
+
+ NodeIterator it = graph.nodeIterator();
+ boolean started = false;
+
+ ArrayList<DirEntry> labelBuffer = new ArrayList<>(128);
+ while (it.hasNext()) {
+ long srcNode = it.nextLong();
+
+ long bits = 0;
+ LazyLongIterator s = it.successors();
+ long dstNode;
+ while ((dstNode = s.nextLong()) >= 0) {
+ while (line != null && line.srcNode <= srcNode && line.dstNode <= dstNode) {
+ if (line.srcNode == srcNode && line.dstNode == dstNode) {
+ labelBuffer.add(new DirEntry(line.filenameId, line.permission));
+ }
+
+ if (!mapLines.hasNext())
+ break;
+
+ line = mapLines.next();
+ if (!started) {
+ plLabels.start("Writing label map to file...");
+ started = true;
+ }
+ }
+
+ SwhLabel l = new SwhLabel("edgelabel", totalLabelWidth, labelBuffer.toArray(new DirEntry[0]));
+ labelBuffer.clear();
+ bits += l.toBitStream(labels, -1);
+ plLabels.lightUpdate();
+ }
+ offsets.writeLongGamma(bits);
+ }
+
+ labels.close();
+ offsets.close();
+ plLabels.done();
+
+ graph = null;
+
+ PrintWriter pw = new PrintWriter(new FileWriter(outputGraphBasename + "-labelled.properties"));
+ pw.println(ImmutableGraph.GRAPHCLASS_PROPERTY_KEY + " = " + BitStreamArcLabelledImmutableGraph.class.getName());
+ pw.println(BitStreamArcLabelledImmutableGraph.LABELSPEC_PROPERTY_KEY + " = " + SwhLabel.class.getName()
+ + "(DirEntry," + totalLabelWidth + ")");
+ pw.println(ArcLabelledImmutableGraph.UNDERLYINGGRAPH_PROPERTY_KEY + " = "
+ + Paths.get(outputGraphBasename).getFileName());
+ pw.close();
+ }
+
+ public static class EdgeLabelLine {
+ public long srcNode;
+ public long dstNode;
+ public long filenameId;
+ public int permission;
+
+ public EdgeLabelLine(long labelSrcNode, long labelDstNode, long labelFilenameId, int labelPermission) {
+ this.srcNode = labelSrcNode;
+ this.dstNode = labelDstNode;
+ this.filenameId = labelFilenameId;
+ this.permission = labelPermission;
+ }
+ }
+
+ public abstract static class EdgeLabelLineIterator implements Iterator<EdgeLabelLine> {
+ @Override
+ public abstract boolean hasNext();
+
+ @Override
+ public abstract EdgeLabelLine next();
+ }
+
+ public static class BatchEdgeLabelLineIterator extends EdgeLabelLineIterator {
+ private static final int STD_BUFFER_SIZE = 128 * 1024;
+
+ private final InputBitStream[] batchIbs;
+ private final int[] inputStreamLength;
+ private final long[] refArray;
+ private final LongHeapSemiIndirectPriorityQueue queue;
+ private final long[] prevTarget;
+
+ /** The last returned node (-1 if no node has been returned yet). */
+ private long lastNode;
+ private long[][] lastNodeSuccessors = LongBigArrays.EMPTY_BIG_ARRAY;
+ private long[][] lastNodeLabels = LongBigArrays.EMPTY_BIG_ARRAY;
+ private long lastNodeOutdegree;
+ private long lastNodeCurrentSuccessor;
+
+ public BatchEdgeLabelLineIterator(final List<File> batches) throws IOException {
+ this.batchIbs = new InputBitStream[batches.size()];
+ this.refArray = new long[batches.size()];
+ this.prevTarget = new long[batches.size()];
+ this.queue = new LongHeapSemiIndirectPriorityQueue(refArray);
+ this.inputStreamLength = new int[batches.size()];
+
+ for (int i = 0; i < batches.size(); i++) {
+ batchIbs[i] = new InputBitStream(batches.get(i), STD_BUFFER_SIZE);
+ this.inputStreamLength[i] = batchIbs[i].readDelta();
+ this.refArray[i] = batchIbs[i].readLongDelta();
+ queue.enqueue(i);
+ }
+
+ this.lastNode = -1;
+ this.lastNodeOutdegree = 0;
+ this.lastNodeCurrentSuccessor = 0;
+ }
+
+ public boolean hasNextNode() {
+ return !queue.isEmpty();
+ }
+
+ private void readNextNode() throws IOException {
+ assert hasNext();
+
+ int i;
+ lastNode++;
+ lastNodeOutdegree = 0;
+ lastNodeCurrentSuccessor = 0;
+
+ /*
+ * We extract elements from the queue as long as their target is equal to last. If during the
+ * process we exhaust a batch, we close it.
+ */
+ while (!queue.isEmpty() && refArray[i = queue.first()] == lastNode) {
+ lastNodeSuccessors = BigArrays.grow(lastNodeSuccessors, lastNodeOutdegree + 1);
+ lastNodeLabels = BigArrays.grow(lastNodeLabels, lastNodeOutdegree + 1);
+
+ long target = prevTarget[i] += batchIbs[i].readLongDelta();
+ long label = batchIbs[i].readLongDelta();
+ BigArrays.set(lastNodeSuccessors, lastNodeOutdegree, target);
+ BigArrays.set(lastNodeLabels, lastNodeOutdegree, label);
+
+ // System.err.println("2. Read " + lastNode + " " + target + " " + label);
+ if (--inputStreamLength[i] == 0) {
+ queue.dequeue();
+ batchIbs[i].close();
+ batchIbs[i] = null;
+ } else {
+ // We read a new source and update the queue.
+ final long sourceDelta = batchIbs[i].readLongDelta();
+ if (sourceDelta != 0) {
+ refArray[i] += sourceDelta;
+ prevTarget[i] = 0;
+ queue.changed();
+ }
+ }
+ lastNodeOutdegree++;
+ }
+
+ // Neither quicksort nor heaps are stable, so we reestablish order here.
+ // LongBigArrays.radixSort(lastNodeSuccessors, lastNodeLabels, 0, lastNodeOutdegree);
+ ForkJoinBigQuickSort2.parallelQuickSort(lastNodeSuccessors, lastNodeLabels, 0, lastNodeOutdegree);
+ }
+
+ @Override
+ public boolean hasNext() {
+ return lastNodeCurrentSuccessor < lastNodeOutdegree || hasNextNode();
+ }
+
+ @Override
+ public EdgeLabelLine next() {
+ if (lastNode == -1 || lastNodeCurrentSuccessor >= lastNodeOutdegree) {
+ try {
+ do {
+ readNextNode();
+ } while (hasNextNode() && lastNodeOutdegree == 0);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ long src = lastNode;
+ long dst = BigArrays.get(lastNodeSuccessors, lastNodeCurrentSuccessor);
+ long compressedLabel = BigArrays.get(lastNodeLabels, lastNodeCurrentSuccessor);
+ long labelName = DirEntry.labelNameFromEncoded(compressedLabel);
+ int permission = DirEntry.permissionFromEncoded(compressedLabel);
+ // System.err.println("3. Output (encoded): " + src + " " + dst + " " + compressedLabel);
+ // System.err.println("4. Output (decoded): " + src + " " + dst + " " + labelName + " " +
+ // permission);
+ lastNodeCurrentSuccessor++;
+ return new EdgeLabelLine(src, dst, labelName, permission);
+ }
+ }
+}
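For reference, here is a minimal, self-contained sketch of the gap/delta-coded triplet format that processBatch() writes and that BatchEdgeLabelLineIterator reads back. It only assumes the dsiutils OutputBitStream/InputBitStream classes already used above; the class name, file name and sample values are illustrative and are not part of the patch.

import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;

import java.io.File;
import java.io.IOException;

public class BatchFormatSketch {
    public static void main(String[] args) throws IOException {
        // Three already-sorted, de-duplicated (src, dst, label) triplets.
        long[][] rows = {{3, 10, 7}, {3, 12, 8}, {5, 4, 9}};
        File f = File.createTempFile("batch", ".bitstream");
        f.deleteOnExit();

        // Write: triplet count, then the first triplet in full, then gap-coded triplets.
        OutputBitStream out = new OutputBitStream(f);
        out.writeDelta(rows.length);
        long prevSrc = rows[0][0];
        out.writeLongDelta(prevSrc);
        out.writeLongDelta(rows[0][1]);
        out.writeLongDelta(rows[0][2]);
        for (int i = 1; i < rows.length; i++) {
            if (rows[i][0] != prevSrc) {
                // New source: (source gap, absolute target, label)
                out.writeLongDelta(rows[i][0] - prevSrc);
                out.writeLongDelta(rows[i][1]);
                prevSrc = rows[i][0];
            } else {
                // Same source: (0, target gap, label)
                out.writeLongDelta(0);
                out.writeLongDelta(rows[i][1] - rows[i - 1][1]);
            }
            out.writeLongDelta(rows[i][2]);
        }
        out.close();

        // Read the batch back, accumulating the gaps, the way the heap iterator does.
        InputBitStream in = new InputBitStream(f);
        int count = in.readDelta();
        long src = in.readLongDelta();
        long prevDst = 0;
        for (int i = 0; i < count; i++) {
            if (i > 0) {
                long srcGap = in.readLongDelta();
                if (srcGap != 0) {
                    src += srcGap;
                    prevDst = 0;
                }
            }
            long dst = prevDst += in.readLongDelta();
            long label = in.readLongDelta();
            System.out.println(src + " " + dst + " " + label); // prints 3 10 7 / 3 12 8 / 5 4 9
        }
        in.close();
    }
}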
diff --git a/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java b/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java
similarity index 90%
rename from java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java
rename to java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java
index 626c747..105a921 100644
--- a/java/src/main/java/org/softwareheritage/graph/maps/NodeMapBuilder.java
+++ b/java/src/main/java/org/softwareheritage/graph/compress/NodeMapBuilder.java
@@ -1,191 +1,201 @@
-package org.softwareheritage.graph.maps;
+/*
+ * Copyright (c) 2019-2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.compress;
+import com.github.luben.zstd.ZstdInputStream;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.softwareheritage.graph.Node;
import org.softwareheritage.graph.SWHID;
+import org.softwareheritage.graph.SwhType;
+import org.softwareheritage.graph.maps.NodeIdMap;
+import org.softwareheritage.graph.maps.NodeTypesMap;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;
import java.util.concurrent.TimeUnit;
/**
* Create maps needed at runtime by the graph service, in particular:
 *
 * <ul>
- * <li>SWHID → WebGraph long node id
- * <li>WebGraph long node id → SWHID (converse of the former)
+ * <li>WebGraph long node id → SWHID
 * <li>WebGraph long node id → SWH node type (enum)
 * </ul>
 *
* @author The Software Heritage developers
*/
public class NodeMapBuilder {
final static String SORT_BUFFER_SIZE = "40%";
final static Logger logger = LoggerFactory.getLogger(NodeMapBuilder.class);
/**
* Main entrypoint.
*
* @param args command line arguments
*/
public static void main(String[] args) throws IOException {
if (args.length != 2) {
logger.error("Usage: COMPRESSED_GRAPH_BASE_NAME TEMP_DIR < NODES_CSV");
System.exit(1);
}
String graphPath = args[0];
String tmpDir = args[1];
logger.info("starting maps generation...");
precomputeNodeIdMap(graphPath, tmpDir);
logger.info("maps generation completed");
}
/**
* Computes and dumps on disk mapping files.
*
* @param graphPath path of the compressed graph
*/
static void precomputeNodeIdMap(String graphPath, String tmpDir) throws IOException {
ProgressLogger plSWHID2Node = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
ProgressLogger plNode2SWHID = new ProgressLogger(logger, 10, TimeUnit.SECONDS);
- plSWHID2Node.itemsName = "Hashing swhid→node";
- plNode2SWHID.itemsName = "Building map node→swhid";
+ plSWHID2Node.itemsName = "nodes";
+ plNode2SWHID.itemsName = "nodes";
// first half of SWHID->node mapping: SWHID -> WebGraph MPH (long)
Object2LongFunction<byte[]> mphMap = NodeIdMap.loadMph(graphPath + ".mph");
long nbIds = (mphMap instanceof Size64) ? ((Size64) mphMap).size64() : mphMap.size();
plSWHID2Node.expectedUpdates = nbIds;
plNode2SWHID.expectedUpdates = nbIds;
// second half of SWHID->node mapping: WebGraph MPH (long) -> BFS order (long)
long[][] bfsMap = LongBigArrays.newBigArray(nbIds);
logger.info("loading BFS order file...");
long loaded = BinIO.loadLongs(graphPath + ".order", bfsMap);
logger.info("BFS order file loaded");
if (loaded != nbIds) {
logger.error("graph contains " + nbIds + " nodes, but read " + loaded);
System.exit(2);
}
/*
* Read on stdin a list of SWHIDs, hash them with MPH, then permute them according to the .order
* file
*/
- FastBufferedReader buffer = new FastBufferedReader(new InputStreamReader(System.in, StandardCharsets.US_ASCII));
+ FastBufferedReader buffer = new FastBufferedReader(
+ new InputStreamReader(new ZstdInputStream(new BufferedInputStream(System.in))));
LineIterator swhidIterator = new LineIterator(buffer);
/*
* The WebGraph node id -> SWHID mapping can be obtained from the SWHID->node one by numerically
* sorting on node id and sequentially writing obtained SWHIDs to a binary map. Delegates the
* sorting job to /usr/bin/sort via pipes
*/
ProcessBuilder processBuilder = new ProcessBuilder();
processBuilder.command("sort", "--numeric-sort", "--key", "2", "--buffer-size", SORT_BUFFER_SIZE,
"--temporary-directory", tmpDir);
Process sort = processBuilder.start();
BufferedOutputStream sort_stdin = new BufferedOutputStream(sort.getOutputStream());
BufferedInputStream sort_stdout = new BufferedInputStream(sort.getInputStream());
// for the binary format of nodeToSwhidMap, see Python module swh.graph.swhid:IntToSwhidMap
try (BufferedOutputStream nodeToSwhidMap = new BufferedOutputStream(
new FileOutputStream(graphPath + NodeIdMap.NODE_TO_SWHID))) {
/*
* background handler for sort output, it will be fed SWHID/node pairs, and will itself fill
* nodeToSwhidMap as soon as data from sort is ready.
*/
SortOutputHandler outputHandler = new SortOutputHandler(sort_stdout, nodeToSwhidMap, plNode2SWHID);
outputHandler.start();
/*
* Type map from WebGraph node ID to SWH type. Used at runtime by pure Java graph traversals to
* efficiently check edge restrictions.
*/
- final int nbBitsPerNodeType = (int) Math.ceil(Math.log(Node.Type.values().length) / Math.log(2));
+ final int nbBitsPerNodeType = (int) Math.ceil(Math.log(SwhType.values().length) / Math.log(2));
LongArrayBitVector nodeTypesBitVector = LongArrayBitVector.ofLength(nbBitsPerNodeType * nbIds);
LongBigList nodeTypesMap = nodeTypesBitVector.asLongBigList(nbBitsPerNodeType);
plSWHID2Node.start("Hashing SWHIDs to fill sort input");
for (long iNode = 0; iNode < nbIds && swhidIterator.hasNext(); iNode++) {
String swhidStr = swhidIterator.next().toString();
SWHID swhid = new SWHID(swhidStr);
long mphId = mphMap.getLong(swhidStr.getBytes(StandardCharsets.US_ASCII));
long nodeId = BigArrays.get(bfsMap, mphId);
sort_stdin.write((swhidStr + "\t" + nodeId + "\n").getBytes(StandardCharsets.US_ASCII));
nodeTypesMap.set(nodeId, swhid.getType().ordinal());
plSWHID2Node.lightUpdate();
}
plSWHID2Node.done();
sort_stdin.close();
// write type map
logger.info("storing type map");
BinIO.storeObject(nodeTypesMap, graphPath + NodeTypesMap.NODE_TO_TYPE);
logger.info("type map stored");
// wait for nodeToSwhidMap filling
try {
logger.info("waiting for node2swhid map...");
int sortExitCode = sort.waitFor();
if (sortExitCode != 0) {
logger.error("sort returned non-zero exit code: " + sortExitCode);
System.exit(2);
}
outputHandler.join();
} catch (InterruptedException e) {
logger.error("processing of sort output failed with: " + e);
System.exit(2);
}
}
}
private static class SortOutputHandler extends Thread {
private final Scanner input;
private final OutputStream output;
private final ProgressLogger pl;
SortOutputHandler(InputStream input, OutputStream output, ProgressLogger pl) {
this.input = new Scanner(input, StandardCharsets.US_ASCII);
this.output = output;
this.pl = pl;
}
public void run() {
boolean sortDone = false;
logger.info("node2swhid: waiting for sort output...");
while (input.hasNextLine()) {
if (!sortDone) {
sortDone = true;
this.pl.start("filling node2swhid map");
}
String line = input.nextLine(); // format: SWHID NODE_ID
SWHID swhid = new SWHID(line.split("\\t")[0]); // get SWHID
try {
output.write(swhid.toBytes());
} catch (IOException e) {
logger.error("writing to node->SWHID map failed with: " + e);
}
this.pl.lightUpdate();
}
this.pl.done();
}
}
}
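A quick note on the type map built above: with the six SWH node types (cnt, dir, ori, rel, rev, snp), nbBitsPerNodeType works out to ceil(log2 6) = 3, so each node's type costs 3 bits rather than a full byte or long. The following is a small sketch of the same bit-packing pattern in isolation, using the dsiutils/fastutil classes already imported by NodeMapBuilder; the node count and values are made up for illustration.

import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.longs.LongBigList;

public class TypeMapSketch {
    public static void main(String[] args) {
        int nbTypes = 6;                                                     // cnt, dir, ori, rel, rev, snp
        int bitsPerType = (int) Math.ceil(Math.log(nbTypes) / Math.log(2)); // = 3
        long nbNodes = 1_000;

        // A bit vector of nbNodes * bitsPerType bits, viewed as a list of 3-bit integers.
        LongArrayBitVector bv = LongArrayBitVector.ofLength(bitsPerType * nbNodes);
        LongBigList types = bv.asLongBigList(bitsPerType);

        types.set(0, 4);    // e.g. node 0 has type ordinal 4
        types.set(999, 2);  // e.g. node 999 has type ordinal 2
        System.out.println(types.getLong(0) + " " + types.getLong(999)); // prints 4 2
    }
}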
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java b/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java
new file mode 100644
index 0000000..d16b5ae
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/compress/ORCGraphDataset.java
@@ -0,0 +1,718 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.compress;
+
+import com.github.luben.zstd.ZstdOutputStream;
+import com.google.common.primitives.Bytes;
+import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
+import org.apache.orc.TypeDescription;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.*;
+import java.util.*;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.ForkJoinTask;
+
+/**
+ * A graph dataset in ORC format.
+ *
+ * This format of dataset is a full export of the graph, including all the edge and node properties.
+ *
+ * For convenience purposes, this class also provides a main method to print all the edges of the
+ * graph, so that the output can be piped to
+ * {@link it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph}.
+ *
+ * Reading edges from ORC files using this class is about 2.5 times slower than reading them
+ * directly from a plaintext format.
+ */
+public class ORCGraphDataset implements GraphDataset {
+ final static Logger logger = LoggerFactory.getLogger(ORCGraphDataset.class);
+
+ final static public int ORC_BATCH_SIZE = 16 * 1024;
+
+ private File datasetDir;
+
+ protected ORCGraphDataset() {
+ }
+
+ public ORCGraphDataset(String datasetPath) {
+ this(new File(datasetPath));
+ }
+
+ public ORCGraphDataset(File datasetDir) {
+ if (!datasetDir.exists()) {
+ throw new IllegalArgumentException("Dataset " + datasetDir.getName() + " does not exist");
+ }
+ this.datasetDir = datasetDir;
+ }
+
+ /**
+ * Return the given table as a {@link SwhOrcTable}. The return value can be down-casted to the type
+ * of the specific table it represents.
+ */
+ public SwhOrcTable getTable(String tableName) {
+ File tableDir = new File(datasetDir, tableName);
+ if (!tableDir.exists()) {
+ return null;
+ }
+ switch (tableName) {
+ case "skipped_content":
+ return new SkippedContentOrcTable(tableDir);
+ case "content":
+ return new ContentOrcTable(tableDir);
+ case "directory":
+ return new DirectoryOrcTable(tableDir);
+ case "directory_entry":
+ return new DirectoryEntryOrcTable(tableDir);
+ case "revision":
+ return new RevisionOrcTable(tableDir);
+ case "revision_history":
+ return new RevisionHistoryOrcTable(tableDir);
+ case "release":
+ return new ReleaseOrcTable(tableDir);
+ case "snapshot_branch":
+ return new SnapshotBranchOrcTable(tableDir);
+ case "snapshot":
+ return new SnapshotOrcTable(tableDir);
+ case "origin_visit_status":
+ return new OriginVisitStatusOrcTable(tableDir);
+ case "origin_visit":
+ return new OriginVisitOrcTable(tableDir);
+ case "origin":
+ return new OriginOrcTable(tableDir);
+ default :
+ return null;
+ }
+ }
+
+ /** Return all the tables in this dataset as a map of {@link SwhOrcTable}. */
+ public Map<String, SwhOrcTable> allTables() {
+ HashMap<String, SwhOrcTable> tables = new HashMap<>();
+ File[] tableDirs = datasetDir.listFiles();
+ if (tableDirs == null) {
+ return tables;
+ }
+ for (File tableDir : tableDirs) {
+ SwhOrcTable table = getTable(tableDir.getName());
+ if (table != null) {
+ tables.put(tableDir.getName(), table);
+ }
+ }
+ return tables;
+ }
+
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ Map<String, SwhOrcTable> tables = allTables();
+ for (SwhOrcTable table : tables.values()) {
+ table.readEdges(nodeCb, edgeCb);
+ }
+ }
+
+ /**
+ * A class representing an ORC table, stored on disk as a set of ORC files all in the same
+ * directory.
+ */
+ public static class ORCTable {
+ private final File tableDir;
+
+ public ORCTable(File tableDir) {
+ if (!tableDir.exists()) {
+ throw new IllegalArgumentException("Table " + tableDir.getName() + " does not exist");
+ }
+ this.tableDir = tableDir;
+ }
+
+ public static ORCTable load(File tableDir) {
+ return new ORCTable(tableDir);
+ }
+
+ /**
+ * Utility function for byte columns. Return as a byte array the value of the given row in the
+ * column vector.
+ */
+ public static byte[] getBytesRow(BytesColumnVector columnVector, int row) {
+ if (columnVector.isRepeating) {
+ row = 0;
+ }
+ if (columnVector.isNull[row]) {
+ return null;
+ }
+ return Arrays.copyOfRange(columnVector.vector[row], columnVector.start[row],
+ columnVector.start[row] + columnVector.length[row]);
+ }
+
+ /**
+ * Utility function for long columns. Return as a long the value of the given row in the column
+ * vector.
+ */
+ public static Long getLongRow(LongColumnVector columnVector, int row) {
+ if (columnVector.isRepeating) {
+ row = 0;
+ }
+ if (columnVector.isNull[row]) {
+ return null;
+ }
+ return columnVector.vector[row];
+ }
+
+ interface ReadOrcBatchHandler {
+ void accept(VectorizedRowBatch batch, Map<String, Integer> columnMap) throws IOException;
+ }
+
+ /**
+ * Read the table, calling the given handler for each new batch of rows. Optionally, if columns is
+ * not null, will only scan the columns present in this set instead of the entire table.
+ *
+ * If this method is called from within a ForkJoinPool, the ORC table will be read in parallel using
+ * that thread pool. Otherwise, the ORC files will be read sequentially.
+ */
+ public void readOrcTable(ReadOrcBatchHandler batchHandler, Set<String> columns) throws IOException {
+ File[] listing = tableDir.listFiles();
+ if (listing == null) {
+ throw new IOException("No files found in " + tableDir.getName());
+ }
+ ForkJoinPool forkJoinPool = ForkJoinTask.getPool();
+ if (forkJoinPool == null) {
+ // Sequential case
+ for (File file : listing) {
+ readOrcFile(file.getPath(), batchHandler, columns);
+ }
+ } else {
+ // Parallel case
+ ArrayList<File> listingArray = new ArrayList<>(Arrays.asList(listing));
+ listingArray.parallelStream().forEach(file -> {
+ try {
+ readOrcFile(file.getPath(), batchHandler, columns);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ });
+ }
+ }
+
+ private void readOrcFile(String path, ReadOrcBatchHandler batchHandler, Set<String> columns)
+ throws IOException {
+ try (Reader reader = OrcFile.createReader(new Path(path), OrcFile.readerOptions(new Configuration()))) {
+ TypeDescription schema = reader.getSchema();
+
+ Reader.Options options = reader.options();
+ if (columns != null) {
+ options.include(createColumnsToRead(schema, columns));
+ }
+ Map<String, Integer> columnMap = getColumnMap(schema);
+
+ try (RecordReader records = reader.rows(options)) {
+ VectorizedRowBatch batch = reader.getSchema().createRowBatch(ORC_BATCH_SIZE);
+ while (records.nextBatch(batch)) {
+ batchHandler.accept(batch, columnMap);
+ }
+ }
+ }
+ }
+
+ private static Map<String, Integer> getColumnMap(TypeDescription schema) {
+ Map<String, Integer> columnMap = new HashMap<>();
+ List<String> fieldNames = schema.getFieldNames();
+ for (int i = 0; i < fieldNames.size(); i++) {
+ columnMap.put(fieldNames.get(i), i);
+ }
+ return columnMap;
+ }
+
+ private static boolean[] createColumnsToRead(TypeDescription schema, Set<String> columns) {
+ boolean[] columnsToRead = new boolean[schema.getMaximumId() + 1];
+ List<String> fieldNames = schema.getFieldNames();
+ List<TypeDescription> columnTypes = schema.getChildren();
+ for (int i = 0; i < fieldNames.size(); i++) {
+ if (columns.contains(fieldNames.get(i))) {
+ logger.debug("Adding column " + fieldNames.get(i) + " with ID " + i + " to the read list");
+ TypeDescription type = columnTypes.get(i);
+ for (int id = type.getId(); id <= type.getMaximumId(); id++) {
+ columnsToRead[id] = true;
+ }
+ }
+ }
+ return columnsToRead;
+ }
+ }
+
+ /** Base class for SWH-specific ORC tables. */
+ public static class SwhOrcTable {
+ protected ORCTable orcTable;
+
+ protected static final byte[] cntPrefix = "swh:1:cnt:".getBytes();
+ protected static final byte[] dirPrefix = "swh:1:dir:".getBytes();
+ protected static final byte[] revPrefix = "swh:1:rev:".getBytes();
+ protected static final byte[] relPrefix = "swh:1:rel:".getBytes();
+ protected static final byte[] snpPrefix = "swh:1:snp:".getBytes();
+ protected static final byte[] oriPrefix = "swh:1:ori:".getBytes();
+
+ protected String getIdColumn() {
+ return "id";
+ }
+ protected byte[] getSwhidPrefix() {
+ throw new UnsupportedOperationException();
+ }
+ protected byte[] idToSwhid(byte[] id) {
+ return Bytes.concat(getSwhidPrefix(), id);
+ }
+
+ protected SwhOrcTable() {
+ }
+
+ public SwhOrcTable(File tableDir) {
+ orcTable = new ORCTable(tableDir);
+ }
+
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ // No nodes or edges to read in the table by default.
+ }
+
+ protected static byte[] urlToOriginId(byte[] url) {
+ return DigestUtils.sha1Hex(url).getBytes();
+ }
+
+ public void readIdColumn(NodeCallback cb) throws IOException {
+ orcTable.readOrcTable((batch, columnMap) -> {
+ BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())];
+
+ for (int row = 0; row < batch.size; row++) {
+ byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row));
+ cb.onNode(id);
+ }
+ }, Set.of(getIdColumn()));
+ }
+
+ public void readLongColumn(String longColumn, LongCallback cb) throws IOException {
+ orcTable.readOrcTable((batch, columnMap) -> {
+ BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())];
+ LongColumnVector dateVector = (LongColumnVector) batch.cols[columnMap.get(longColumn)];
+
+ for (int row = 0; row < batch.size; row++) {
+ byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row));
+ Long date = ORCTable.getLongRow(dateVector, row);
+ if (date != null) {
+ cb.onLong(id, date);
+ }
+ }
+ }, Set.of(getIdColumn(), longColumn));
+ }
+
+ public void readTimestampColumn(String dateColumn, String dateOffsetColumn, TimestampCallback cb)
+ throws IOException {
+ orcTable.readOrcTable((batch, columnMap) -> {
+ BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())];
+ TimestampColumnVector dateVector = (TimestampColumnVector) batch.cols[columnMap.get(dateColumn)];
+ LongColumnVector dateOffsetVector = (LongColumnVector) batch.cols[columnMap.get(dateOffsetColumn)];
+
+ for (int row = 0; row < batch.size; row++) {
+ byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row));
+ long date = dateVector.getTimestampAsLong(row); // rounded to seconds
+ Long dateOffset = ORCTable.getLongRow(dateOffsetVector, row);
+ if (dateOffset != null) {
+ cb.onTimestamp(id, date, dateOffset.shortValue());
+ }
+ }
+ }, Set.of(getIdColumn(), dateColumn, dateOffsetColumn));
+ }
+
+ public void readBytes64Column(String longColumn, BytesCallback cb) throws IOException {
+ orcTable.readOrcTable((batch, columnMap) -> {
+ BytesColumnVector idVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())];
+ BytesColumnVector valueVector = (BytesColumnVector) batch.cols[columnMap.get(longColumn)];
+
+ for (int row = 0; row < batch.size; row++) {
+ byte[] id = idToSwhid(ORCTable.getBytesRow(idVector, row));
+ byte[] value = Base64.getEncoder().encode(ORCTable.getBytesRow(valueVector, row));
+ cb.onBytes(id, value);
+ }
+ }, Set.of(getIdColumn(), longColumn));
+ }
+ }
+
+ public static class SkippedContentOrcTable extends SwhOrcTable {
+ public SkippedContentOrcTable(File tableDir) {
+ super(tableDir);
+ }
+
+ @Override
+ protected String getIdColumn() {
+ return "sha1_git";
+ }
+
+ @Override
+ protected byte[] getSwhidPrefix() {
+ return cntPrefix;
+ }
+
+ @Override
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ readIdColumn(nodeCb);
+ }
+ }
+
+ public static class ContentOrcTable extends SwhOrcTable {
+ public ContentOrcTable(File tableDir) {
+ super(tableDir);
+ }
+
+ @Override
+ protected String getIdColumn() {
+ return "sha1_git";
+ }
+
+ @Override
+ protected byte[] getSwhidPrefix() {
+ return cntPrefix;
+ }
+
+ @Override
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ readIdColumn(nodeCb);
+ }
+ }
+
+ public static class DirectoryOrcTable extends SwhOrcTable {
+ public DirectoryOrcTable(File tableDir) {
+ super(tableDir);
+ }
+
+ @Override
+ protected byte[] getSwhidPrefix() {
+ return dirPrefix;
+ }
+
+ @Override
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ readIdColumn(nodeCb);
+ }
+ }
+
+ public static class DirectoryEntryOrcTable extends SwhOrcTable {
+ public DirectoryEntryOrcTable(File tableDir) {
+ super(tableDir);
+ }
+
+ @Override
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ byte[] cntType = "file".getBytes();
+ byte[] dirType = "dir".getBytes();
+ byte[] revType = "rev".getBytes();
+
+ orcTable.readOrcTable((batch, columnMap) -> {
+ BytesColumnVector srcVector = (BytesColumnVector) batch.cols[columnMap.get("directory_id")];
+ BytesColumnVector dstVector = (BytesColumnVector) batch.cols[columnMap.get("target")];
+ BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("type")];
+ BytesColumnVector labelVector = (BytesColumnVector) batch.cols[columnMap.get("name")];
+ LongColumnVector permissionVector = (LongColumnVector) batch.cols[columnMap.get("perms")];
+
+ for (int row = 0; row < batch.size; row++) {
+ byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row);
+ byte[] targetPrefix;
+ if (Arrays.equals(targetType, cntType)) {
+ targetPrefix = cntPrefix;
+ } else if (Arrays.equals(targetType, dirType)) {
+ targetPrefix = dirPrefix;
+ } else if (Arrays.equals(targetType, revType)) {
+ targetPrefix = revPrefix;
+ } else {
+ continue;
+ }
+
+ byte[] src = Bytes.concat(dirPrefix, ORCTable.getBytesRow(srcVector, row));
+ byte[] dst = Bytes.concat(targetPrefix, ORCTable.getBytesRow(dstVector, row));
+ byte[] label = Base64.getEncoder().encode(ORCTable.getBytesRow(labelVector, row));
+ Long permission = ORCTable.getLongRow(permissionVector, row);
+ edgeCb.onEdge(src, dst, label, permission != null ? permission.intValue() : 0);
+ }
+ }, Set.of("directory_id", "target", "type", "name", "perms"));
+ }
+ }
+
+ public static class RevisionOrcTable extends SwhOrcTable {
+ public RevisionOrcTable(File tableDir) {
+ super(tableDir);
+ }
+
+ @Override
+ protected byte[] getSwhidPrefix() {
+ return revPrefix;
+ }
+
+ @Override
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ orcTable.readOrcTable((batch, columnMap) -> {
+ BytesColumnVector revisionIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")];
+ BytesColumnVector directoryIdVector = (BytesColumnVector) batch.cols[columnMap.get("directory")];
+ for (int row = 0; row < batch.size; row++) {
+ byte[] revisionId = Bytes.concat(revPrefix, ORCTable.getBytesRow(revisionIdVector, row));
+ byte[] directoryId = Bytes.concat(dirPrefix, ORCTable.getBytesRow(directoryIdVector, row));
+ nodeCb.onNode(revisionId);
+ edgeCb.onEdge(revisionId, directoryId, null, -1);
+ }
+ }, Set.of("id", "directory"));
+ }
+ }
+
+ public static class RevisionHistoryOrcTable extends SwhOrcTable {
+ public RevisionHistoryOrcTable(File tableDir) {
+ super(tableDir);
+ }
+
+ @Override
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ orcTable.readOrcTable((batch, columnMap) -> {
+ BytesColumnVector revisionIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")];
+ BytesColumnVector parentIdVector = (BytesColumnVector) batch.cols[columnMap.get("parent_id")];
+ for (int row = 0; row < batch.size; row++) {
+ byte[] parentId = Bytes.concat(revPrefix, ORCTable.getBytesRow(parentIdVector, row));
+ byte[] revisionId = Bytes.concat(revPrefix, ORCTable.getBytesRow(revisionIdVector, row));
+ edgeCb.onEdge(revisionId, parentId, null, -1);
+ }
+ }, Set.of("id", "parent_id"));
+ }
+ }
+
+ public static class ReleaseOrcTable extends SwhOrcTable {
+ public ReleaseOrcTable(File tableDir) {
+ super(tableDir);
+ }
+
+ @Override
+ protected byte[] getSwhidPrefix() {
+ return relPrefix;
+ }
+
+ @Override
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ byte[] cntType = "content".getBytes();
+ byte[] dirType = "directory".getBytes();
+ byte[] revType = "revision".getBytes();
+ byte[] relType = "release".getBytes();
+
+ orcTable.readOrcTable((batch, columnMap) -> {
+ BytesColumnVector releaseIdVector = (BytesColumnVector) batch.cols[columnMap.get("id")];
+ BytesColumnVector targetIdVector = (BytesColumnVector) batch.cols[columnMap.get("target")];
+ BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("target_type")];
+
+ for (int row = 0; row < batch.size; row++) {
+ byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row);
+
+ byte[] targetPrefix;
+ if (Arrays.equals(targetType, cntType)) {
+ targetPrefix = cntPrefix;
+ } else if (Arrays.equals(targetType, dirType)) {
+ targetPrefix = dirPrefix;
+ } else if (Arrays.equals(targetType, revType)) {
+ targetPrefix = revPrefix;
+ } else if (Arrays.equals(targetType, relType)) {
+ targetPrefix = relPrefix;
+ } else {
+ continue;
+ }
+
+ byte[] releaseId = Bytes.concat(relPrefix, ORCTable.getBytesRow(releaseIdVector, row));
+ byte[] targetId = Bytes.concat(targetPrefix, ORCTable.getBytesRow(targetIdVector, row));
+ nodeCb.onNode(releaseId);
+ edgeCb.onEdge(releaseId, targetId, null, -1);
+ }
+ }, Set.of("id", "target", "target_type"));
+ }
+ }
+
+ public static class SnapshotOrcTable extends SwhOrcTable {
+ public SnapshotOrcTable(File tableDir) {
+ super(tableDir);
+ }
+
+ @Override
+ protected byte[] getSwhidPrefix() {
+ return snpPrefix;
+ }
+
+ @Override
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ readIdColumn(nodeCb);
+ }
+ }
+
+ public static class SnapshotBranchOrcTable extends SwhOrcTable {
+ public SnapshotBranchOrcTable(File tableDir) {
+ super(tableDir);
+ }
+
+ @Override
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ byte[] cntType = "content".getBytes();
+ byte[] dirType = "directory".getBytes();
+ byte[] revType = "revision".getBytes();
+ byte[] relType = "release".getBytes();
+
+ orcTable.readOrcTable((batch, columnMap) -> {
+ BytesColumnVector snapshotIdVector = (BytesColumnVector) batch.cols[columnMap.get("snapshot_id")];
+ BytesColumnVector targetIdVector = (BytesColumnVector) batch.cols[columnMap.get("target")];
+ BytesColumnVector targetTypeVector = (BytesColumnVector) batch.cols[columnMap.get("target_type")];
+ BytesColumnVector branchNameVector = (BytesColumnVector) batch.cols[columnMap.get("name")];
+
+ for (int row = 0; row < batch.size; row++) {
+ byte[] targetType = ORCTable.getBytesRow(targetTypeVector, row);
+ byte[] targetPrefix;
+ if (Arrays.equals(targetType, cntType)) {
+ targetPrefix = cntPrefix;
+ } else if (Arrays.equals(targetType, dirType)) {
+ targetPrefix = dirPrefix;
+ } else if (Arrays.equals(targetType, revType)) {
+ targetPrefix = revPrefix;
+ } else if (Arrays.equals(targetType, relType)) {
+ targetPrefix = relPrefix;
+ } else {
+ continue;
+ }
+
+ byte[] snapshotId = Bytes.concat(snpPrefix, ORCTable.getBytesRow(snapshotIdVector, row));
+ byte[] targetId = Bytes.concat(targetPrefix, ORCTable.getBytesRow(targetIdVector, row));
+ byte[] branchName = Base64.getEncoder().encode(ORCTable.getBytesRow(branchNameVector, row));
+ nodeCb.onNode(snapshotId);
+ edgeCb.onEdge(snapshotId, targetId, branchName, -1);
+ }
+ }, Set.of("snapshot_id", "name", "target", "target_type"));
+ }
+ }
+
+ public static class OriginVisitStatusOrcTable extends SwhOrcTable {
+ public OriginVisitStatusOrcTable(File tableDir) {
+ super(tableDir);
+ }
+
+ @Override
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ orcTable.readOrcTable((batch, columnMap) -> {
+ BytesColumnVector originUrlVector = (BytesColumnVector) batch.cols[columnMap.get("origin")];
+ BytesColumnVector snapshotIdVector = (BytesColumnVector) batch.cols[columnMap.get("snapshot")];
+
+ for (int row = 0; row < batch.size; row++) {
+ byte[] originId = urlToOriginId(ORCTable.getBytesRow(originUrlVector, row));
+ byte[] snapshot_id = ORCTable.getBytesRow(snapshotIdVector, row);
+ if (snapshot_id == null || snapshot_id.length == 0) {
+ continue;
+ }
+ edgeCb.onEdge(Bytes.concat(oriPrefix, originId), Bytes.concat(snpPrefix, snapshot_id), null, -1);
+ }
+ }, Set.of("origin", "snapshot"));
+ }
+ }
+
+ public static class OriginVisitOrcTable extends SwhOrcTable {
+ public OriginVisitOrcTable(File tableDir) {
+ super(tableDir);
+ }
+ }
+
+ public static class OriginOrcTable extends SwhOrcTable {
+ public OriginOrcTable(File tableDir) {
+ super(tableDir);
+ }
+
+ @Override
+ protected byte[] getSwhidPrefix() {
+ return oriPrefix;
+ }
+
+ @Override
+ protected byte[] idToSwhid(byte[] id) {
+ return Bytes.concat(getSwhidPrefix(), urlToOriginId(id));
+ }
+
+ @Override
+ protected String getIdColumn() {
+ return "url";
+ }
+
+ @Override
+ public void readEdges(GraphDataset.NodeCallback nodeCb, GraphDataset.EdgeCallback edgeCb) throws IOException {
+ readIdColumn(nodeCb);
+ }
+
+ public void readURLs(BytesCallback cb) throws IOException {
+ orcTable.readOrcTable((batch, columnMap) -> {
+ BytesColumnVector urlVector = (BytesColumnVector) batch.cols[columnMap.get(getIdColumn())];
+
+ for (int row = 0; row < batch.size; row++) {
+ byte[] id = idToSwhid(ORCTable.getBytesRow(urlVector, row));
+ byte[] url = Base64.getEncoder().encode(ORCTable.getBytesRow(urlVector, row));
+ cb.onBytes(id, url);
+ }
+ }, Set.of(getIdColumn()));
+ }
+ }
+
+ /**
+ * Export an ORC graph to the CSV edge dataset format as two different files,
+ * nodes.csv.zst and edges.csv.zst.
+ */
+ public static void exportToCsvDataset(String orcDataset, String csvDatasetBasename) throws IOException {
+ ORCGraphDataset dataset = new ORCGraphDataset(orcDataset);
+ File nodesFile = new File(csvDatasetBasename + ".nodes.csv.zst");
+ File edgesFile = new File(csvDatasetBasename + ".edges.csv.zst");
+ FastBufferedOutputStream nodesOut = new FastBufferedOutputStream(
+ new ZstdOutputStream(new FileOutputStream(nodesFile)));
+ FastBufferedOutputStream edgesOut = new FastBufferedOutputStream(
+ new ZstdOutputStream(new FileOutputStream(edgesFile)));
+ dataset.readEdges((node) -> {
+ nodesOut.write(node);
+ nodesOut.write('\n');
+ }, (src, dst, label, perms) -> {
+ edgesOut.write(src);
+ edgesOut.write(' ');
+ edgesOut.write(dst);
+ if (label != null) {
+ edgesOut.write(' ');
+ edgesOut.write(label);
+ edgesOut.write(' ');
+ }
+ if (perms != -1) {
+ edgesOut.write(' ');
+ edgesOut.write(Long.toString(perms).getBytes());
+ }
+ edgesOut.write('\n');
+ });
+ }
+
+ /**
+ * Print all the edges of the graph to stdout. Can be piped to
+ * {@link it.unimi.dsi.big.webgraph.ScatteredArcsASCIIGraph} to import the graph dataset and convert
+ * it to a {@link it.unimi.dsi.big.webgraph.BVGraph}.
+ */
+ public static void printSimpleEdges(String orcDataset) throws IOException {
+ ORCGraphDataset dataset = new ORCGraphDataset(orcDataset);
+ FastBufferedOutputStream out = new FastBufferedOutputStream(System.out);
+ dataset.readEdges((node) -> {
+ }, (src, dst, label, perms) -> {
+ out.write(src);
+ out.write(' ');
+ out.write(dst);
+ out.write('\n');
+ });
+ out.flush();
+ }
+
+ public static void main(String[] args) throws IOException {
+ printSimpleEdges(args[0]);
+ }
+}
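To illustrate how this dataset class is meant to be consumed, here is a hypothetical caller that counts nodes and edges with readEdges(). Running it from inside a ForkJoinPool makes readOrcTable() read the ORC files of each table in parallel, as noted in its Javadoc above; the class name and the way the dataset path is obtained are illustrative only.

import org.softwareheritage.graph.compress.ORCGraphDataset;

import java.io.IOException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.atomic.AtomicLong;

public class CountOrcDataset {
    public static void main(String[] args) throws InterruptedException, ExecutionException {
        ORCGraphDataset dataset = new ORCGraphDataset(args[0]); // path to the ORC dataset directory
        AtomicLong nodes = new AtomicLong();
        AtomicLong edges = new AtomicLong();

        // Submitting the read to a ForkJoinPool enables the parallel code path of readOrcTable().
        ForkJoinPool pool = new ForkJoinPool(Runtime.getRuntime().availableProcessors());
        pool.submit(() -> {
            try {
                dataset.readEdges(
                        node -> nodes.incrementAndGet(),
                        (src, dst, label, perms) -> edges.incrementAndGet());
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }).get();

        System.out.println(nodes.get() + " nodes, " + edges.get() + " edges");
    }
}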
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java b/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java
new file mode 100644
index 0000000..9320d98
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/compress/ScatteredArcsORCGraph.java
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.compress;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.IntStream;
+
+import it.unimi.dsi.big.webgraph.BVGraph;
+import it.unimi.dsi.big.webgraph.ImmutableSequentialGraph;
+import it.unimi.dsi.big.webgraph.NodeIterator;
+import it.unimi.dsi.big.webgraph.Transform;
+import it.unimi.dsi.fastutil.Arrays;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.martiansoftware.jsap.FlaggedOption;
+import com.martiansoftware.jsap.JSAP;
+import com.martiansoftware.jsap.JSAPException;
+import com.martiansoftware.jsap.JSAPResult;
+import com.martiansoftware.jsap.Parameter;
+import com.martiansoftware.jsap.SimpleJSAP;
+import com.martiansoftware.jsap.UnflaggedOption;
+
+import it.unimi.dsi.fastutil.Size64;
+import it.unimi.dsi.fastutil.io.BinIO;
+import it.unimi.dsi.fastutil.objects.Object2LongFunction;
+import it.unimi.dsi.fastutil.objects.ObjectArrayList;
+import it.unimi.dsi.logging.ProgressLogger;
+
+public class ScatteredArcsORCGraph extends ImmutableSequentialGraph {
+ private static final Logger LOGGER = LoggerFactory.getLogger(ScatteredArcsORCGraph.class);
+
+ /** The default number of threads. */
+ public static final int DEFAULT_NUM_THREADS = Runtime.getRuntime().availableProcessors();
+
+ /** The default batch size. */
+ public static final int DEFAULT_BATCH_SIZE = Math
+ .min((int) (Runtime.getRuntime().maxMemory() * 0.4 / (DEFAULT_NUM_THREADS * 8 * 2)), Arrays.MAX_ARRAY_SIZE);
+
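For a sense of scale, the default above splits 40% of the maximum heap across two long arrays (8 bytes per entry) in each thread: with, say, a 32 GiB heap and 16 hardware threads, that is 0.4 × 32 GiB / (16 × 8 × 2) ≈ 53.7 million arcs per per-thread batch, capped at Arrays.MAX_ARRAY_SIZE. The heap and thread figures here are only an example; the actual value depends on the JVM's maxMemory() and availableProcessors().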
+ /** The batch graph used to return node iterators. */
+ private final Transform.BatchGraph batchGraph;
+
+ /**
+ * Creates a scattered-arcs ORC graph.
+ *
+ * @param dataset the Swh ORC Graph dataset
+ * @param function an explicitly provided function from the string representation of nodes to node
+ *            numbers (must not be null).
+ * @param n the number of nodes of the graph.
+ * @param numThreads the number of threads to use.
+ * @param batchSize the number of integers in a batch; two arrays of integers of this size will be
+ * allocated by each thread.
+ * @param tempDir a temporary directory for the batches, or null for
+ * {@link File#createTempFile(java.lang.String, java.lang.String)}'s choice.
+ * @param pl a progress logger, or null.
+ */
+ public ScatteredArcsORCGraph(final ORCGraphDataset dataset, final Object2LongFunction<byte[]> function,
+ final long n, final int numThreads, final int batchSize, final File tempDir, final ProgressLogger pl)
+ throws IOException {
+ final ObjectArrayList<File> batches = new ObjectArrayList<>();
+ ForkJoinPool forkJoinPool = new ForkJoinPool(numThreads);
+
+ long[][] srcArrays = new long[numThreads][batchSize];
+ long[][] dstArrays = new long[numThreads][batchSize];
+ int[] indexes = new int[numThreads];
+ long[] progressCounts = new long[numThreads];
+ AtomicInteger pairs = new AtomicInteger(0);
+
+ AtomicInteger nextThreadId = new AtomicInteger(0);
+ ThreadLocal<Integer> threadLocalId = ThreadLocal.withInitial(nextThreadId::getAndIncrement);
+
+ if (pl != null) {
+ pl.itemsName = "arcs";
+ pl.start("Creating sorted batches...");
+ }
+
+ try {
+ forkJoinPool.submit(() -> {
+ try {
+ dataset.readEdges((node) -> {
+ }, (src, dst, label, perms) -> {
+ long s = function.getLong(src);
+ long t = function.getLong(dst);
+
+ int threadId = threadLocalId.get();
+ int idx = indexes[threadId]++;
+ srcArrays[threadId][idx] = s;
+ dstArrays[threadId][idx] = t;
+
+ if (idx == batchSize - 1) {
+ pairs.addAndGet(Transform.processBatch(batchSize, srcArrays[threadId], dstArrays[threadId],
+ tempDir, batches));
+ indexes[threadId] = 0;
+ }
+
+ if (pl != null && ++progressCounts[threadId] > 1000) {
+ synchronized (pl) {
+ pl.update(progressCounts[threadId]);
+ }
+ progressCounts[threadId] = 0;
+ }
+
+ });
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }).get();
+ } catch (InterruptedException | ExecutionException e) {
+ throw new RuntimeException(e);
+ }
+
+ IntStream.range(0, numThreads).parallel().forEach(t -> {
+ int idx = indexes[t];
+ if (idx > 0) {
+ try {
+ pairs.addAndGet(Transform.processBatch(idx, srcArrays[t], dstArrays[t], tempDir, batches));
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ });
+
+ // Trigger the GC to free up the large arrays
+ for (int i = 0; i < numThreads; i++) {
+ srcArrays[i] = null;
+ dstArrays[i] = null;
+ }
+
+ if (pl != null) {
+ pl.done();
+ pl.logger().info("Created " + batches.size() + " batches.");
+ }
+
+ batchGraph = new Transform.BatchGraph(n, pairs.get(), batches);
+ }
+
+ @Override
+ public long numNodes() {
+ if (batchGraph == null)
+ throw new UnsupportedOperationException(
+ "The number of nodes is unknown (you need to generate all the batches first).");
+ return batchGraph.numNodes();
+ }
+
+ @Override
+ public long numArcs() {
+ if (batchGraph == null)
+ throw new UnsupportedOperationException(
+ "The number of arcs is unknown (you need to generate all the batches first).");
+ return batchGraph.numArcs();
+ }
+
+ @Override
+ public NodeIterator nodeIterator(final long from) {
+ return batchGraph.nodeIterator(from);
+ }
+
+ @Override
+ public boolean hasCopiableIterators() {
+ return batchGraph.hasCopiableIterators();
+ }
+
+ @Override
+ public ScatteredArcsORCGraph copy() {
+ return this;
+ }
+
+ @SuppressWarnings("unchecked")
+ public static void main(final String[] args)
+ throws IllegalArgumentException, SecurityException, IOException, JSAPException, ClassNotFoundException {
+ final SimpleJSAP jsap = new SimpleJSAP(ScatteredArcsORCGraph.class.getName(),
+ "Converts a scattered list of arcs from an ORC graph dataset into a BVGraph.",
+ new Parameter[]{
+ new FlaggedOption("logInterval", JSAP.LONG_PARSER,
+ Long.toString(ProgressLogger.DEFAULT_LOG_INTERVAL), JSAP.NOT_REQUIRED, 'l',
+ "log-interval", "The minimum time interval between activity logs in milliseconds."),
+ new FlaggedOption("numThreads", JSAP.INTSIZE_PARSER, Integer.toString(DEFAULT_NUM_THREADS),
+ JSAP.NOT_REQUIRED, 't', "threads", "The number of threads to use."),
+ new FlaggedOption("batchSize", JSAP.INTSIZE_PARSER, Integer.toString(DEFAULT_BATCH_SIZE),
+ JSAP.NOT_REQUIRED, 's', "batch-size", "The maximum size of a batch, in arcs."),
+ new FlaggedOption("tempDir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'T',
+ "temp-dir", "A directory for all temporary batch files."),
+ new FlaggedOption("function", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f',
+ "function",
+ "A serialised function from strings to longs that will be used to translate identifiers to node numbers."),
+ new FlaggedOption("comp", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 'c', "comp",
+ "A compression flag (may be specified several times).")
+ .setAllowMultipleDeclarations(true),
+ new FlaggedOption("windowSize", JSAP.INTEGER_PARSER,
+ String.valueOf(BVGraph.DEFAULT_WINDOW_SIZE), JSAP.NOT_REQUIRED, 'w', "window-size",
+ "Reference window size (0 to disable)."),
+ new FlaggedOption("maxRefCount", JSAP.INTEGER_PARSER,
+ String.valueOf(BVGraph.DEFAULT_MAX_REF_COUNT), JSAP.NOT_REQUIRED, 'm', "max-ref-count",
+ "Maximum number of backward references (-1 for ∞)."),
+ new FlaggedOption("minIntervalLength", JSAP.INTEGER_PARSER,
+ String.valueOf(BVGraph.DEFAULT_MIN_INTERVAL_LENGTH), JSAP.NOT_REQUIRED, 'i',
+ "min-interval-length", "Minimum length of an interval (0 to disable)."),
+ new FlaggedOption("zetaK", JSAP.INTEGER_PARSER, String.valueOf(BVGraph.DEFAULT_ZETA_K),
+ JSAP.NOT_REQUIRED, 'k', "zeta-k", "The k parameter for zeta-k codes."),
+ new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
+ JSAP.NOT_GREEDY, "The path to the ORC graph dataset."),
+ new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
+ JSAP.NOT_GREEDY, "The basename of the output graph"),});
+
+ final JSAPResult jsapResult = jsap.parse(args);
+ if (jsap.messagePrinted())
+ System.exit(1);
+
+ String basename = jsapResult.getString("basename");
+ String orcDatasetPath = jsapResult.getString("dataset");
+ ORCGraphDataset orcDataset = new ORCGraphDataset(orcDatasetPath);
+
+ int flags = 0;
+ for (final String compressionFlag : jsapResult.getStringArray("comp")) {
+ try {
+ flags |= BVGraph.class.getField(compressionFlag).getInt(BVGraph.class);
+ } catch (final Exception notFound) {
+ throw new JSAPException("Compression method " + compressionFlag + " unknown.");
+ }
+ }
+
+ final int windowSize = jsapResult.getInt("windowSize");
+ final int zetaK = jsapResult.getInt("zetaK");
+ int maxRefCount = jsapResult.getInt("maxRefCount");
+ if (maxRefCount == -1)
+ maxRefCount = Integer.MAX_VALUE;
+ final int minIntervalLength = jsapResult.getInt("minIntervalLength");
+
+ if (!jsapResult.userSpecified("function")) {
+ throw new IllegalArgumentException("Function must be specified.");
+ }
+ final Object2LongFunction<byte[]> function = (Object2LongFunction<byte[]>) BinIO
+ .loadObject(jsapResult.getString("function"));
+ long n = function instanceof Size64 ? ((Size64) function).size64() : function.size();
+
+ File tempDir = null;
+ if (jsapResult.userSpecified("tempDir")) {
+ tempDir = new File(jsapResult.getString("tempDir"));
+ }
+
+ final ProgressLogger pl = new ProgressLogger(LOGGER, jsapResult.getLong("logInterval"), TimeUnit.MILLISECONDS);
+ final int batchSize = jsapResult.getInt("batchSize");
+ final int numThreads = jsapResult.getInt("numThreads");
+ final ScatteredArcsORCGraph graph = new ScatteredArcsORCGraph(orcDataset, function, n, numThreads, batchSize,
+ tempDir, pl);
+ BVGraph.store(graph, basename, windowSize, maxRefCount, minIntervalLength, zetaK, flags, pl);
+ }
+}
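Once BVGraph.store() in main() has written the compressed graph, it can be read back with the standard big-WebGraph API. A minimal sketch follows; the class name is illustrative, and args[0] stands for the basename passed as the second positional argument above.

import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.big.webgraph.NodeIterator;

import java.io.IOException;

public class PrintArcs {
    public static void main(String[] args) throws IOException {
        // Memory-map the compressed graph produced by ScatteredArcsORCGraph + BVGraph.store().
        ImmutableGraph graph = ImmutableGraph.loadMapped(args[0]);
        System.out.println(graph.numNodes() + " nodes, " + graph.numArcs() + " arcs");

        // Stream every arc (source, destination) of the graph.
        NodeIterator it = graph.nodeIterator();
        while (it.hasNext()) {
            long src = it.nextLong();
            LazyLongIterator successors = it.successors();
            long dst;
            while ((dst = successors.nextLong()) >= 0) {
                System.out.println(src + " " + dst);
            }
        }
    }
}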
diff --git a/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
new file mode 100644
index 0000000..f06ba59
--- /dev/null
+++ b/java/src/main/java/org/softwareheritage/graph/compress/WriteNodeProperties.java
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2022 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
+package org.softwareheritage.graph.compress;
+
+import com.martiansoftware.jsap.*;
+import it.unimi.dsi.bits.LongArrayBitVector;
+import it.unimi.dsi.fastutil.BigArrays;
+import it.unimi.dsi.fastutil.ints.IntBigArrays;
+import it.unimi.dsi.fastutil.io.BinIO;
+import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
+import it.unimi.dsi.fastutil.longs.LongBigArrays;
+import it.unimi.dsi.fastutil.objects.Object2LongFunction;
+import it.unimi.dsi.fastutil.shorts.ShortBigArrays;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.softwareheritage.graph.maps.NodeIdMap;
+import org.softwareheritage.graph.compress.ORCGraphDataset.*;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.lang.reflect.InvocationTargetException;
+import java.util.*;
+import java.util.concurrent.atomic.AtomicLong;
+
+/**
+ * This class is used to extract the node properties from the graph dataset, and write them to a set
+ * of property files.
+ *
+ * Note: because the nodes are not sorted by type, we have an incentive to minimize the number of
+ * "holes" in offset arrays. This is why many unrelated properties are cobbled together in the same
+ * files (e.g. commit messages, tag messages and origin URLs are all in a "message" property file).
+ * Once we migrate to a TypedImmutableGraph as the underlying storage of the graph, we can split all
+ * the different properties in their own files.
+ */
+public class WriteNodeProperties {
+ final static Logger logger = LoggerFactory.getLogger(WriteNodeProperties.class);
+
+ private final ORCGraphDataset dataset;
+ private final String graphBasename;
+ private final NodeIdMap nodeIdMap;
+ private final long numNodes;
+
+ public WriteNodeProperties(String dataset, String graphBasename, NodeIdMap nodeIdMap) {
+ this.dataset = new ORCGraphDataset(dataset);
+ this.graphBasename = graphBasename;
+ this.nodeIdMap = nodeIdMap;
+ this.numNodes = nodeIdMap.size64();
+ }
+
+ public static String[] PROPERTY_WRITERS = new String[]{"timestamps", "content_length", "content_is_skipped",
+ "person_ids", "messages", "tag_names",};
+
+ private static JSAPResult parseArgs(String[] args) {
+ JSAPResult config = null;
+ try {
+ SimpleJSAP jsap = new SimpleJSAP(ComposePermutations.class.getName(), "", new Parameter[]{
+ new UnflaggedOption("dataset", JSAP.STRING_PARSER, JSAP.REQUIRED, "Path to the ORC graph dataset"),
+ new UnflaggedOption("graphBasename", JSAP.STRING_PARSER, JSAP.REQUIRED,
+ "Basename of the output graph"),
+ new FlaggedOption("properties", JSAP.STRING_PARSER, "*", JSAP.NOT_REQUIRED, 'p', "properties",
+ "Properties to write, comma separated (default: all). Possible choices: "
+ + String.join(",", PROPERTY_WRITERS)),});
+ config = jsap.parse(args);
+ if (jsap.messagePrinted()) {
+ System.exit(1);
+ }
+ } catch (JSAPException e) {
+ System.err.println("Usage error: " + e.getMessage());
+ System.exit(1);
+ }
+ return config;
+ }
+
+ public static void main(String[] argv) throws IOException, ClassNotFoundException, NoSuchMethodException,
+ InvocationTargetException, IllegalAccessException {
+ JSAPResult args = parseArgs(argv);
+ String dataset = args.getString("dataset");
+ String graphBasename = args.getString("graphBasename");
+ NodeIdMap nodeIdMap = new NodeIdMap(graphBasename);
+
+ Set<String> properties;
+ if (args.getString("properties").equals("*")) {
+ properties = Set.of(PROPERTY_WRITERS);
+ } else {
+ properties = new HashSet<>(Arrays.asList(args.getString("properties").split(",")));
+ }
+
+ WriteNodeProperties writer = new WriteNodeProperties(dataset, graphBasename, nodeIdMap);
+ if (properties.contains("timestamps")) {
+ writer.writeTimestamps();
+ }
+ if (properties.contains("content_length")) {
+ writer.writeContentLength();
+ }
+ if (properties.contains("content_is_skipped")) {
+ writer.writeContentIsSkipped();
+ }
+ if (properties.contains("person_ids")) {
+ writer.writePersonIds();
+ }
+ if (properties.contains("messages")) {
+ writer.writeMessages();
+ }
+ if (properties.contains("tag_names")) {
+ writer.writeTagNames();
+ }
+ }
+
+ public void writeContentLength() throws IOException {
+ logger.info("Writing content lengths");
+ long[][] valueArray = LongBigArrays.newBigArray(numNodes);
+ BigArrays.fill(valueArray, -1);
+
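+        // Lengths from both the "content" and "skipped_content" tables are merged into a single
+        // big array indexed by node id; nodes without a length keep the -1 filler value.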
+ for (String tableName : new String[]{"content", "skipped_content"}) {
+ SwhOrcTable table = dataset.getTable(tableName);
+ if (table == null) {
+ continue;
+ }
+ table.readLongColumn("length", (swhid, value) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ BigArrays.set(valueArray, id, value);
+ });
+ }
+
+ BinIO.storeLongs(valueArray, graphBasename + ".property.content.length.bin");
+ }
+
+ public void writeContentIsSkipped() throws IOException {
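+        // One bit per node, set for nodes that appear in the "skipped_content" table.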
+ LongArrayBitVector isSkippedBitVector = LongArrayBitVector.ofLength(numNodes);
+ SwhOrcTable table = dataset.getTable("skipped_content");
+ if (table != null) {
+ table.readIdColumn((swhid) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ isSkippedBitVector.set(id);
+ });
+ }
+ BinIO.storeObject(isSkippedBitVector, graphBasename + ".property.content.is_skipped.bin");
+ }
+
+ public void writeTimestamps() throws IOException {
+ logger.info("Writing author/committer timestamps for release + revision");
+ SwhOrcTable releaseTable = dataset.getTable("release");
+ SwhOrcTable revisionTable = dataset.getTable("revision");
+
+ long[][] timestampArray = LongBigArrays.newBigArray(numNodes);
+ short[][] timestampOffsetArray = ShortBigArrays.newBigArray(numNodes);
+
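+        // MIN_VALUE is used as the "no timestamp" filler for nodes that are not releases or
+        // revisions, or whose date is unset.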
+ // Author timestamps
+ BigArrays.fill(timestampArray, Long.MIN_VALUE);
+ BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE);
+ releaseTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ BigArrays.set(timestampArray, id, date);
+ BigArrays.set(timestampOffsetArray, id, dateOffset);
+ });
+ revisionTable.readTimestampColumn("date", "date_offset", (swhid, date, dateOffset) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ BigArrays.set(timestampArray, id, date);
+ BigArrays.set(timestampOffsetArray, id, dateOffset);
+ });
+ BinIO.storeLongs(timestampArray, graphBasename + ".property.author_timestamp.bin");
+ BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.author_timestamp_offset.bin");
+
+ // Committer timestamps
+ BigArrays.fill(timestampArray, Long.MIN_VALUE);
+ BigArrays.fill(timestampOffsetArray, Short.MIN_VALUE);
+ revisionTable.readTimestampColumn("committer_date", "committer_offset", (swhid, date, dateOffset) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ BigArrays.set(timestampArray, id, date);
+ BigArrays.set(timestampOffsetArray, id, dateOffset);
+ });
+ BinIO.storeLongs(timestampArray, graphBasename + ".property.committer_timestamp.bin");
+ BinIO.storeShorts(timestampOffsetArray, graphBasename + ".property.committer_timestamp_offset.bin");
+ }
+
+ public void writePersonIds() throws IOException {
+ logger.info("Writing author/committer IDs for release + revision");
+        Object2LongFunction<byte[]> personIdMap = NodeIdMap.loadMph(graphBasename + ".persons.mph");
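+        // Person ids come from a separate MPH built on the base64-encoded fullnames; the cast to
+        // int below assumes the number of distinct persons fits in 32 bits.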
+ SwhOrcTable releaseTable = dataset.getTable("release");
+ SwhOrcTable revisionTable = dataset.getTable("revision");
+
+ int[][] personArray = IntBigArrays.newBigArray(numNodes);
+
+ // Author IDs
+ BigArrays.fill(personArray, -1);
+ releaseTable.readBytes64Column("author", (swhid, personBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
+ });
+ revisionTable.readBytes64Column("author", (swhid, personBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
+ });
+ BinIO.storeInts(personArray, graphBasename + ".property.author_id.bin");
+
+ // Committer IDs
+ BigArrays.fill(personArray, -1);
+ revisionTable.readBytes64Column("committer", (swhid, personBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ BigArrays.set(personArray, id, (int) personIdMap.getLong(personBase64));
+ });
+ BinIO.storeInts(personArray, graphBasename + ".property.committer_id.bin");
+ }
+
+ public void writeMessages() throws IOException {
+ logger.info("Writing messages for release + revision, and URLs for origins");
+
+ long[][] messageOffsetArray = LongBigArrays.newBigArray(numNodes);
+ BigArrays.fill(messageOffsetArray, -1);
+
+ FastBufferedOutputStream messageStream = new FastBufferedOutputStream(
+ new FileOutputStream(graphBasename + ".property.message.bin"));
+ AtomicLong offset = new AtomicLong(0L);
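+        // Messages are stored base64-encoded, one per line, in a single flat file; the byte
+        // offset of each node's line is recorded in messageOffsetArray.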
+
+ SwhOrcTable releaseTable = dataset.getTable("release");
+ releaseTable.readBytes64Column("message", (swhid, messageBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ messageStream.write(messageBase64);
+ messageStream.write('\n');
+ BigArrays.set(messageOffsetArray, id, offset.longValue());
+ offset.addAndGet(messageBase64.length + 1);
+ });
+
+ SwhOrcTable revisionTable = dataset.getTable("revision");
+ revisionTable.readBytes64Column("message", (swhid, messageBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ messageStream.write(messageBase64);
+ messageStream.write('\n');
+ BigArrays.set(messageOffsetArray, id, offset.longValue());
+ offset.addAndGet(messageBase64.length + 1);
+ });
+
+ OriginOrcTable originTable = (OriginOrcTable) dataset.getTable("origin");
+ originTable.readURLs((swhid, messageBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ messageStream.write(messageBase64);
+ messageStream.write('\n');
+ BigArrays.set(messageOffsetArray, id, offset.longValue());
+ offset.addAndGet(messageBase64.length + 1);
+ });
+
+ // TODO: check which one is optimal in terms of memory/disk usage, EF vs mapped file
+ BinIO.storeLongs(messageOffsetArray, graphBasename + ".property.message.offset.bin");
+ // EliasFanoLongBigList messageOffsetEF = new
+ // EliasFanoLongBigList(LongBigArrayBigList.wrap(messageOffsetArray));
+ // BinIO.storeObject(messageOffsetEF, graphBasename + ".property.message.offset.bin");
+ messageStream.close();
+ }
+
+ public void writeTagNames() throws IOException {
+ logger.info("Writing tag names for release");
+
+ long[][] tagNameOffsetArray = LongBigArrays.newBigArray(numNodes);
+ BigArrays.fill(tagNameOffsetArray, -1);
+
+ FastBufferedOutputStream tagNameStream = new FastBufferedOutputStream(
+ new FileOutputStream(graphBasename + ".property.tag_name.bin"));
+ AtomicLong offset = new AtomicLong(0L);
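+        // Same layout as writeMessages(): base64-encoded tag names, newline-separated, plus a
+        // per-node offset array.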
+
+ SwhOrcTable releaseTable = dataset.getTable("release");
+ releaseTable.readBytes64Column("name", (swhid, tagNameBase64) -> {
+ long id = nodeIdMap.getNodeId(swhid);
+ tagNameStream.write(tagNameBase64);
+ tagNameStream.write('\n');
+ BigArrays.set(tagNameOffsetArray, id, offset.longValue());
+ offset.addAndGet(tagNameBase64.length + 1);
+ });
+
+ BinIO.storeLongs(tagNameOffsetArray, graphBasename + ".property.tag_name.offset.bin");
+ // EliasFanoLongBigList tagNameOffsetEF = new
+ // EliasFanoLongBigList(LongBigArrayBigList.wrap(tagNameOffsetArray));
+ // BinIO.storeObject(tagNameOffsetEF, graphBasename + ".property.tag_name.offset.bin");
+ tagNameStream.close();
+ }
+}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindCommonAncestor.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindCommonAncestor.java
deleted file mode 100644
index f36ce88..0000000
--- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindCommonAncestor.java
+++ /dev/null
@@ -1,62 +0,0 @@
-package org.softwareheritage.graph.experiments.forks;
-
-import com.martiansoftware.jsap.*;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Traversal;
-
-import java.io.IOException;
-import java.util.Scanner;
-
-public class FindCommonAncestor {
- private Graph graph;
-
- private void load_graph(String graphBasename) throws IOException {
- System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = Graph.loadMapped(graphBasename);
- System.err.println("Graph loaded.");
- }
-
- private static JSAPResult parse_args(String[] args) {
- JSAPResult config = null;
- try {
- SimpleJSAP jsap = new SimpleJSAP(FindCommonAncestor.class.getName(), "",
- new Parameter[]{
- new FlaggedOption("edgesFmt", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'e',
- "edges", "Edges constraints"),
- new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
- "graph", "Basename of the compressed graph"),});
-
- config = jsap.parse(args);
- if (jsap.messagePrinted()) {
- System.exit(1);
- }
- } catch (JSAPException e) {
- e.printStackTrace();
- }
- return config;
- }
-
- public static void main(String[] args) {
- JSAPResult config = parse_args(args);
-
- String graphPath = config.getString("graphPath");
- String edgesFmt = config.getString("edgesFmt");
-
- FindCommonAncestor fca = new FindCommonAncestor();
- try {
- fca.load_graph(graphPath);
- } catch (IOException e) {
- System.out.println("Could not load graph: " + e);
- System.exit(2);
- }
-
- Scanner input = new Scanner(System.in);
- while (input.hasNextLong()) {
- long lhsNode = input.nextLong();
- long rhsNode = input.nextLong();
-
- Traversal t = new Traversal(fca.graph.symmetrize(), "forward", edgesFmt);
- System.out.println(t.findCommonDescendant(lhsNode, rhsNode));
- }
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindPath.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindPath.java
deleted file mode 100644
index 2e5afd9..0000000
--- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/FindPath.java
+++ /dev/null
@@ -1,123 +0,0 @@
-package org.softwareheritage.graph.experiments.forks;
-
-import com.martiansoftware.jsap.*;
-import it.unimi.dsi.big.webgraph.LazyLongIterator;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Node;
-
-import java.io.IOException;
-import java.util.*;
-
-public class FindPath {
- private Graph graph;
- private Long emptySnapshot;
-
- private void load_graph(String graphBasename) throws IOException {
- System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = Graph.loadMapped(graphBasename).symmetrize();
- System.err.println("Graph loaded.");
- this.emptySnapshot = null;
- }
-
- private static JSAPResult parse_args(String[] args) {
- JSAPResult config = null;
- try {
- SimpleJSAP jsap = new SimpleJSAP(FindPath.class.getName(), "",
- new Parameter[]{new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
- 'g', "graph", "Basename of the compressed graph"),});
-
- config = jsap.parse(args);
- if (jsap.messagePrinted()) {
- System.exit(1);
- }
- } catch (JSAPException e) {
- e.printStackTrace();
- }
- return config;
- }
-
- private boolean nodeIsEmptySnapshot(Long node) {
- if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP
- && this.graph.outdegree(node) == 0) {
- System.err.println("Found empty snapshot: " + node);
- this.emptySnapshot = node;
- }
- return node.equals(this.emptySnapshot);
- }
-
- private Boolean shouldVisit(Long node) {
- Node.Type nt = this.graph.getNodeType(node);
- if (nt != Node.Type.REV && nt != Node.Type.REL && nt != Node.Type.SNP && nt != Node.Type.ORI) {
- return false;
- }
- if (this.nodeIsEmptySnapshot(node))
- return false;
- return true;
- }
-
- private ArrayList<Long> findPath(Long src, Long dst) {
- HashSet<Long> visited = new HashSet<>();
- Queue<Long> queue = new ArrayDeque<>();
- Map<Long, Long> parentNode = new HashMap<>();
-
- queue.add(src);
- visited.add(src);
-
- while (!queue.isEmpty()) {
- long currentNode = queue.poll();
-
- final LazyLongIterator iterator = graph.successors(currentNode);
- long succ;
- while ((succ = iterator.nextLong()) != -1) {
- if (!shouldVisit(succ) || visited.contains(succ))
- continue;
- visited.add(succ);
- queue.add(succ);
- parentNode.put(succ, currentNode);
-
- if (succ == dst) {
- ArrayList<Long> path = new ArrayList<>();
- long n = dst;
- while (n != src) {
- path.add(n);
- n = parentNode.get(n);
- }
- path.add(src);
- Collections.reverse(path);
- return path;
- }
- }
- }
- return null;
- }
-
- public static void main(String[] args) {
- JSAPResult config = parse_args(args);
-
- String graphPath = config.getString("graphPath");
-
- FindPath fpath = new FindPath();
- try {
- fpath.load_graph(graphPath);
- } catch (IOException e) {
- System.out.println("Could not load graph: " + e);
- System.exit(2);
- }
-
- Scanner input = new Scanner(System.in);
- while (input.hasNextLong()) {
- long lhsNode = input.nextLong();
- long rhsNode = input.nextLong();
-
- ArrayList<Long> path = fpath.findPath(lhsNode, rhsNode);
- if (path != null) {
- for (Long n : path) {
- System.out.format("%d ", n);
- }
- System.out.println();
- } else {
- System.out.println("null");
- }
- }
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java
index 714df2e..446b0e1 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCC.java
@@ -1,249 +1,256 @@
+/*
+ * Copyright (c) 2019 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.forks;
import com.google.common.primitives.Longs;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.io.ByteDiskQueue;
import it.unimi.dsi.logging.ProgressLogger;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Node;
+import org.softwareheritage.graph.SwhBidirectionalGraph;
+import org.softwareheritage.graph.SwhType;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class ForkCC {
public Boolean includeRootDir;
- private Graph graph;
+ private SwhBidirectionalGraph graph;
private Long emptySnapshot;
private LongArrayBitVector visited;
private LongArrayBitVector whitelist;
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(ForkCC.class.getName(), "",
new Parameter[]{
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
"graph", "Basename of the compressed graph"),
new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 't',
"whitelist", "Whitelist of origins"),
new FlaggedOption("includeRootDir", JSAP.BOOLEAN_PARSER, "false", JSAP.NOT_REQUIRED, 'R',
"includerootdir", "Include root directory (default: false)"),
new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o',
"outdir", "Directory where to put the results"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
private static void printDistribution(ArrayList<ArrayList<Long>> components) {
TreeMap<Long, Long> distribution = new TreeMap<>();
for (ArrayList<Long> component : components) {
distribution.merge((long) component.size(), 1L, Long::sum);
}
for (Map.Entry<Long, Long> entry : distribution.entrySet()) {
System.out.format("%d %d\n", entry.getKey(), entry.getValue());
}
}
private static void printLargestComponent(ArrayList<ArrayList<Long>> components) {
int indexLargest = 0;
for (int i = 1; i < components.size(); ++i) {
if (components.get(i).size() > components.get(indexLargest).size())
indexLargest = i;
}
ArrayList<Long> component = components.get(indexLargest);
for (Long node : component) {
System.out.println(node);
}
}
private void load_graph(String graphBasename) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = Graph.loadMapped(graphBasename).symmetrize();
+ this.graph = SwhBidirectionalGraph.loadMapped(graphBasename).symmetrize();
System.err.println("Graph loaded.");
this.emptySnapshot = null;
this.whitelist = null;
this.visited = null;
this.includeRootDir = null;
}
private boolean nodeIsEmptySnapshot(Long node) {
- if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP
+ if (this.emptySnapshot == null && this.graph.getNodeType(node) == SwhType.SNP
&& this.graph.outdegree(node) == 0) {
System.err.println("Found empty snapshot: " + node);
this.emptySnapshot = node;
}
return node.equals(this.emptySnapshot);
}
private Boolean shouldVisit(Long node) {
- Node.Type nt = this.graph.getNodeType(node);
- if (nt == Node.Type.CNT) {
+ SwhType nt = this.graph.getNodeType(node);
+ if (nt == SwhType.CNT) {
return false;
}
- if (nt == Node.Type.DIR && !includeRootDir)
+ if (nt == SwhType.DIR && !includeRootDir)
return false;
if (this.nodeIsEmptySnapshot(node))
return false;
if (visited.getBoolean(node))
return false;
return true;
}
private ArrayList<ArrayList<Long>> compute(ProgressLogger pl) throws IOException {
final long n = graph.numNodes();
// Allow enough memory to behave like in-memory queue
int bufferSize = (int) Math.min(Arrays.MAX_ARRAY_SIZE & ~0x7, 8L * n);
// Use a disk based queue to store BFS frontier
final File queueFile = File.createTempFile(ForkCC.class.getSimpleName(), "queue");
final ByteDiskQueue queue = ByteDiskQueue.createNew(queueFile, bufferSize, true);
final byte[] byteBuf = new byte[Long.BYTES];
// WARNING: no 64-bit version of this data-structure, but it can support
// indices up to 2^37
visited = LongArrayBitVector.ofLength(n);
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Starting connected components visit...");
ArrayList<ArrayList<Long>> components = new ArrayList<>();
for (long i = 0; i < n; i++) {
- if (!shouldVisit(i) || this.graph.getNodeType(i) == Node.Type.DIR)
+ if (!shouldVisit(i) || this.graph.getNodeType(i) == SwhType.DIR)
continue;
ArrayList<Long> component = new ArrayList<>();
queue.enqueue(Longs.toByteArray(i));
visited.set(i);
while (!queue.isEmpty()) {
queue.dequeue(byteBuf);
final long currentNode = Longs.fromByteArray(byteBuf);
- Node.Type cur_nt = this.graph.getNodeType(currentNode);
- if (cur_nt == Node.Type.ORI && (this.whitelist == null || this.whitelist.getBoolean(currentNode))) {
+ SwhType cur_nt = this.graph.getNodeType(currentNode);
+ if (cur_nt == SwhType.ORI && (this.whitelist == null || this.whitelist.getBoolean(currentNode))) {
// TODO: add a check that the origin has >=1 non-empty snapshot
component.add(currentNode);
}
final LazyLongIterator iterator = graph.successors(currentNode);
long succ;
while ((succ = iterator.nextLong()) != -1) {
if (!shouldVisit(succ))
continue;
- if (this.graph.getNodeType(succ) == Node.Type.DIR && cur_nt != Node.Type.REV)
+ if (this.graph.getNodeType(succ) == SwhType.DIR && cur_nt != SwhType.REV)
continue;
visited.set(succ);
queue.enqueue(Longs.toByteArray(succ));
}
pl.update();
}
if (component.size() > 0) {
components.add(component);
}
}
pl.done();
queue.close();
return components;
}
private static void printDistribution(ArrayList<ArrayList<Long>> components, Formatter out) {
TreeMap<Long, Long> distribution = new TreeMap<>();
for (ArrayList<Long> component : components) {
distribution.merge((long) component.size(), 1L, Long::sum);
}
for (Map.Entry<Long, Long> entry : distribution.entrySet()) {
out.format("%d %d\n", entry.getKey(), entry.getValue());
}
}
private static void printLargestComponent(ArrayList<ArrayList<Long>> components, Formatter out) {
int indexLargest = 0;
for (int i = 1; i < components.size(); ++i) {
if (components.get(i).size() > components.get(indexLargest).size())
indexLargest = i;
}
ArrayList<Long> component = components.get(indexLargest);
for (Long node : component) {
out.format("%d\n", node);
}
}
private static void printAllComponents(ArrayList<ArrayList<Long>> components, Formatter out) {
for (int i = 1; i < components.size(); ++i) {
ArrayList<Long> component = components.get(i);
for (Long node : component) {
out.format("%d ", node);
}
out.format("\n");
}
}
private void parseWhitelist(String path) {
System.err.println("Loading whitelist " + path + " ...");
this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes());
Scanner scanner;
try {
scanner = new Scanner(new File(path));
while (scanner.hasNextLong()) {
whitelist.set(scanner.nextLong());
}
System.err.println("Whitelist loaded.");
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
String whitelistPath = config.getString("whitelistPath");
boolean includeRootDir = config.getBoolean("includeRootDir");
String outdirPath = config.getString("outdir");
ForkCC forkCc = new ForkCC();
try {
forkCc.load_graph(graphPath);
forkCc.includeRootDir = includeRootDir;
} catch (IOException e) {
System.out.println("Could not load graph: " + e);
System.exit(2);
}
if (whitelistPath != null) {
forkCc.parseWhitelist(whitelistPath);
}
ProgressLogger logger = new ProgressLogger();
// noinspection ResultOfMethodCallIgnored
new File(outdirPath).mkdirs();
try {
ArrayList<ArrayList<Long>> components = forkCc.compute(logger);
printDistribution(components, new Formatter(outdirPath + "/distribution.txt"));
printLargestComponent(components, new Formatter(outdirPath + "/largest_clique.txt"));
printAllComponents(components, new Formatter(outdirPath + "/all_cliques.txt"));
} catch (IOException e) {
e.printStackTrace();
}
logger.done();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java
index 4d749bd..746d51e 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ForkCliques.java
@@ -1,223 +1,230 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.forks;
import ch.qos.logback.classic.Level;
import ch.qos.logback.classic.Logger;
import com.google.common.primitives.Longs;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.bits.LongArrayBitVector;
import it.unimi.dsi.logging.ProgressLogger;
import org.slf4j.LoggerFactory;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Node;
+import org.softwareheritage.graph.SwhBidirectionalGraph;
+import org.softwareheritage.graph.SwhType;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.*;
public class ForkCliques {
- private Graph graph;
+ private SwhBidirectionalGraph graph;
private LongArrayBitVector whitelist;
private void load_graph(String graphBasename) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = Graph.loadMapped(graphBasename);
+ this.graph = SwhBidirectionalGraph.loadMapped(graphBasename);
System.err.println("Graph loaded.");
this.whitelist = null;
}
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(ForkCliques.class.getName(), "",
new Parameter[]{
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
"graph", "Basename of the compressed graph"),
new FlaggedOption("whitelistPath", JSAP.STRING_PARSER, null, JSAP.NOT_REQUIRED, 't',
"whitelist", "Whitelist of origins"),
new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o',
"outdir", "Directory where to put the results"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
private ArrayList<Long> dfsAt(Long baseNode) {
ArrayList<Long> res = new ArrayList<>();
final Deque<Long> stack = new ArrayDeque<>();
HashSet<Long> seen = new HashSet<>();
stack.push(baseNode);
while (!stack.isEmpty()) {
final Long currentNode = stack.pop();
final LazyLongIterator iterator = this.graph.predecessors(currentNode);
long succ;
while ((succ = iterator.nextLong()) != -1) {
if (!seen.contains(succ)) {
- Node.Type nt = this.graph.getNodeType(succ);
- if (nt == Node.Type.DIR || nt == Node.Type.CNT)
+ SwhType nt = this.graph.getNodeType(succ);
+ if (nt == SwhType.DIR || nt == SwhType.CNT)
continue;
- if (nt == Node.Type.ORI && (this.whitelist == null || this.whitelist.getBoolean(succ))) {
+ if (nt == SwhType.ORI && (this.whitelist == null || this.whitelist.getBoolean(succ))) {
res.add(succ);
} else {
stack.push(succ);
seen.add(succ);
}
}
}
}
Collections.sort(res);
return res;
}
private boolean isBaseRevision(Long node) {
- if (this.graph.getNodeType(node) != Node.Type.REV)
+ if (this.graph.getNodeType(node) != SwhType.REV)
return false;
final LazyLongIterator iterator = this.graph.successors(node);
long succ;
while ((succ = iterator.nextLong()) != -1) {
- if (this.graph.getNodeType(succ) == Node.Type.REV)
+ if (this.graph.getNodeType(succ) == SwhType.REV)
return false;
}
return true;
}
static private String fingerprint(ArrayList<Long> cluster) {
MessageDigest digest;
try {
digest = MessageDigest.getInstance("SHA-256");
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
return null;
}
for (Long n : cluster)
digest.update(Longs.toByteArray(n));
return new String(digest.digest());
}
private ArrayList<ArrayList<Long>> compute(ProgressLogger pl) {
final long n = this.graph.numNodes();
HashSet<String> fingerprints = new HashSet<>();
ArrayList<ArrayList<Long>> clusters = new ArrayList<>();
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Starting topological sort...");
for (long i = 0; i < n; i++) {
if (isBaseRevision(i)) {
ArrayList<Long> currentCluster = dfsAt(i);
String clusterFp = fingerprint(currentCluster);
if (!fingerprints.contains(clusterFp)) {
fingerprints.add(clusterFp);
clusters.add(currentCluster);
}
}
pl.update();
}
pl.done();
return clusters;
}
private static void printDistribution(ArrayList<ArrayList<Long>> components, Formatter out) {
TreeMap<Long, Long> distribution = new TreeMap<>();
for (ArrayList<Long> component : components) {
distribution.merge((long) component.size(), 1L, Long::sum);
}
for (Map.Entry<Long, Long> entry : distribution.entrySet()) {
out.format("%d %d\n", entry.getKey(), entry.getValue());
}
}
private static void printLargestComponent(ArrayList<ArrayList<Long>> components, Formatter out) {
int indexLargest = 0;
for (int i = 1; i < components.size(); ++i) {
if (components.get(i).size() > components.get(indexLargest).size())
indexLargest = i;
}
ArrayList<Long> component = components.get(indexLargest);
for (Long node : component) {
out.format("%d\n", node);
}
}
private static void printAllComponents(ArrayList<ArrayList<Long>> components, Formatter out) {
for (int i = 1; i < components.size(); ++i) {
ArrayList<Long> component = components.get(i);
for (Long node : component) {
out.format("%d ", node);
}
out.format("\n");
}
}
private void parseWhitelist(String path) {
System.err.println("Loading whitelist " + path + " ...");
this.whitelist = LongArrayBitVector.ofLength(this.graph.numNodes());
Scanner scanner;
try {
scanner = new Scanner(new File(path));
while (scanner.hasNextLong()) {
whitelist.set(scanner.nextLong());
}
System.err.println("Whitelist loaded.");
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
String whitelistPath = config.getString("whitelistPath");
String outdirPath = config.getString("outdir");
ForkCliques forkCliques = new ForkCliques();
try {
forkCliques.load_graph(graphPath);
} catch (IOException e) {
System.out.println("Could not load graph: " + e);
System.exit(2);
}
if (whitelistPath != null) {
forkCliques.parseWhitelist(whitelistPath);
}
Logger rootLogger = (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(org.slf4j.Logger.ROOT_LOGGER_NAME);
rootLogger.setLevel(Level.DEBUG);
ProgressLogger logger = new ProgressLogger(rootLogger);
ArrayList<ArrayList<Long>> components = forkCliques.compute(logger);
// noinspection ResultOfMethodCallIgnored
new File(outdirPath).mkdirs();
try {
printDistribution(components, new Formatter(outdirPath + "/distribution.txt"));
printLargestComponent(components, new Formatter(outdirPath + "/largest_clique.txt"));
printAllComponents(components, new Formatter(outdirPath + "/all_cliques.txt"));
} catch (FileNotFoundException e) {
e.printStackTrace();
}
logger.done();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java
index 332a908..0ffb690 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/forks/ListEmptyOrigins.java
@@ -1,88 +1,95 @@
+/*
+ * Copyright (c) 2019 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.forks;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Node;
+import org.softwareheritage.graph.SwhBidirectionalGraph;
+import org.softwareheritage.graph.SwhType;
import java.io.IOException;
import java.util.ArrayList;
public class ListEmptyOrigins {
- private Graph graph;
+ private SwhBidirectionalGraph graph;
private Long emptySnapshot;
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(ListEmptyOrigins.class.getName(), "",
new Parameter[]{new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED,
'g', "graph", "Basename of the compressed graph"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
public static void main(String[] args) {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
ListEmptyOrigins leo = new ListEmptyOrigins();
try {
leo.load_graph(graphPath);
} catch (IOException e) {
System.out.println("Could not load graph: " + e);
System.exit(2);
}
ArrayList<Long> badlist = leo.compute(leo.graph);
for (Long bad : badlist) {
System.out.println(bad);
}
}
private void load_graph(String graphBasename) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = Graph.loadMapped(graphBasename);
+ this.graph = SwhBidirectionalGraph.loadMapped(graphBasename);
System.err.println("Graph loaded.");
this.emptySnapshot = null;
}
private boolean nodeIsEmptySnapshot(Long node) {
System.err.println(this.graph.getNodeType(node) + " " + this.graph.outdegree(node) + " " + node);
- if (this.emptySnapshot == null && this.graph.getNodeType(node) == Node.Type.SNP
+ if (this.emptySnapshot == null && this.graph.getNodeType(node) == SwhType.SNP
&& this.graph.outdegree(node) == 0) {
System.err.println("Found empty snapshot: " + node);
this.emptySnapshot = node;
}
return node.equals(this.emptySnapshot);
}
private ArrayList<Long> compute(ImmutableGraph graph) {
final long n = graph.numNodes();
ArrayList<Long> bad = new ArrayList<>();
for (long i = 0; i < n; i++) {
- Node.Type nt = this.graph.getNodeType(i);
- if (nt != Node.Type.ORI)
+ SwhType nt = this.graph.getNodeType(i);
+ if (nt != SwhType.ORI)
continue;
final LazyLongIterator iterator = graph.successors(i);
long succ;
boolean found = false;
while ((succ = iterator.nextLong()) != -1) {
if (this.graph.outdegree(succ) > 0) {
found = true;
}
}
if (!found)
bad.add(i);
}
return bad;
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/multiplicationfactor/GenDistribution.java b/java/src/main/java/org/softwareheritage/graph/experiments/multiplicationfactor/GenDistribution.java
deleted file mode 100644
index 89fd675..0000000
--- a/java/src/main/java/org/softwareheritage/graph/experiments/multiplicationfactor/GenDistribution.java
+++ /dev/null
@@ -1,130 +0,0 @@
-package org.softwareheritage.graph.experiments.multiplicationfactor;
-
-import com.martiansoftware.jsap.*;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Node;
-import org.softwareheritage.graph.Traversal;
-import org.softwareheritage.graph.benchmark.utils.Timing;
-
-import java.io.IOException;
-import java.util.Scanner;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-
-public class GenDistribution {
- private Graph graph;
-
- private static JSAPResult parse_args(String[] args) {
- JSAPResult config = null;
- try {
- SimpleJSAP jsap = new SimpleJSAP(GenDistribution.class.getName(), "",
- new Parameter[]{
- new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
- "graph", "Basename of the compressed graph"),
- new FlaggedOption("srcType", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 's',
- "srctype", "Source node type"),
- new FlaggedOption("dstType", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'd',
- "dsttype", "Destination node type"),
- new FlaggedOption("edgesFmt", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'e',
- "edges", "Edges constraints"),
-
- new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "128", JSAP.NOT_REQUIRED, 't',
- "numthreads", "Number of threads"),});
-
- config = jsap.parse(args);
- if (jsap.messagePrinted()) {
- System.exit(1);
- }
- } catch (JSAPException e) {
- e.printStackTrace();
- }
- return config;
- }
-
- public static void main(String[] args) {
- JSAPResult config = parse_args(args);
-
- String graphPath = config.getString("graphPath");
- Node.Type srcType = Node.Type.fromStr(config.getString("srcType"));
- Node.Type dstType = Node.Type.fromStr(config.getString("dstType"));
- String edgesFmt = config.getString("edgesFmt");
- int numThreads = config.getInt("numThreads");
-
- GenDistribution tp = new GenDistribution();
- try {
- tp.load_graph(graphPath);
- } catch (IOException e) {
- System.out.println("Could not load graph: " + e);
- System.exit(2);
- }
-
- final long END_OF_QUEUE = -1L;
-
- ArrayBlockingQueue<Long> queue = new ArrayBlockingQueue<>(numThreads);
- ExecutorService service = Executors.newFixedThreadPool(numThreads + 1);
-
- service.submit(() -> {
- try {
- Scanner input = new Scanner(System.in);
- while (input.hasNextLong()) {
- long node = input.nextLong();
- if (tp.graph.getNodeType(node) == srcType) {
- queue.put(node);
- }
- }
- } catch (InterruptedException e) {
- e.printStackTrace();
- } finally {
- for (int i = 0; i < numThreads; ++i) {
- try {
- queue.put(END_OF_QUEUE);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- }
- });
-
- for (int i = 0; i < numThreads; ++i) {
- service.submit(() -> {
- Graph thread_graph = tp.graph.copy();
- long startTime;
- double totalTime;
-
- while (true) {
- Long node = null;
- try {
- node = queue.take();
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- if (node == null || node == END_OF_QUEUE) {
- return;
- }
-
- Traversal t = new Traversal(thread_graph, "backward", edgesFmt);
- int[] count = {0};
-
- startTime = Timing.start();
- t.visitNodesVisitor(node, (curnode) -> {
- if (tp.graph.getNodeType(curnode) == dstType) {
- count[0]++;
- }
- });
- totalTime = Timing.stop(startTime);
- System.out.format("%d %d %d %d %f\n", node, count[0], t.getNbNodesAccessed(),
- t.getNbEdgesAccessed(), totalTime);
- }
- });
- }
-
- service.shutdown();
- }
-
- private void load_graph(String graphBasename) throws IOException {
- System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = Graph.loadMapped(graphBasename);
- System.err.println("Graph loaded.");
- }
-}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java
index ad7eadf..dd8d203 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/AveragePaths.java
@@ -1,188 +1,195 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.topology;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.Util;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.XoRoShiRo128PlusRandom;
import org.softwareheritage.graph.*;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
import java.util.concurrent.*;
public class AveragePaths {
- private final Graph graph;
+ private final SwhBidirectionalGraph graph;
private final Subgraph subgraph;
private final ConcurrentHashMap<Long, Long> result;
private final String outdir;
public AveragePaths(String graphBasename, String allowedNodes, String outdir) throws IOException {
System.err.println("Loading graph " + graphBasename + " ...");
- this.graph = Graph.loadMapped(graphBasename);
+ this.graph = SwhBidirectionalGraph.loadMapped(graphBasename);
this.subgraph = new Subgraph(this.graph, new AllowedNodes(allowedNodes));
this.outdir = outdir;
System.err.println("Graph loaded.");
result = new ConcurrentHashMap<>();
}
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(AveragePaths.class.getName(), "",
new Parameter[]{
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
"graph", "Basename of the compressed graph"),
new FlaggedOption("nodeTypes", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 's',
"nodetypes", "Node type constraints"),
new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o',
"outdir", "Directory where to put the results"),
new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "32", JSAP.NOT_REQUIRED, 't',
"numthreads", "Number of threads"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
private void run(int numThreads) throws InterruptedException {
final long END_OF_QUEUE = -1L;
ArrayBlockingQueue<Long> queue = new ArrayBlockingQueue<>(numThreads);
ExecutorService service = Executors.newFixedThreadPool(numThreads + 1);
service.submit(() -> {
try {
- Graph thread_graph = graph.copy();
+ SwhBidirectionalGraph thread_graph = graph.copy();
Subgraph thread_subgraph = subgraph.copy();
long[][] randomPerm = Util.identity(thread_graph.numNodes());
LongBigArrays.shuffle(randomPerm, new XoRoShiRo128PlusRandom());
long n = thread_graph.numNodes();
ProgressLogger pl = new ProgressLogger();
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Filling processor queue...");
for (long j = 0; j < n; ++j) {
long node = BigArrays.get(randomPerm, j);
if (thread_subgraph.nodeExists(node) && thread_subgraph.outdegree(node) == 0) {
queue.put(node);
}
if (j % 10000 == 0) {
printResult();
}
pl.update();
}
} catch (Exception e) {
e.printStackTrace();
} finally {
for (int i = 0; i < numThreads; ++i) {
try {
queue.put(END_OF_QUEUE);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
});
for (int i = 0; i < numThreads; ++i) {
service.submit(() -> {
try {
Subgraph thread_subgraph = subgraph.copy();
while (true) {
Long node = null;
try {
node = queue.take();
} catch (InterruptedException e) {
e.printStackTrace();
}
if (node == null || node == END_OF_QUEUE) {
return;
}
bfsAt(thread_subgraph, node);
}
} catch (Exception e) {
e.printStackTrace();
}
});
}
service.shutdown();
service.awaitTermination(365, TimeUnit.DAYS);
}
private void bfsAt(Subgraph graph, long srcNodeId) {
ArrayDeque<Long> queue = new ArrayDeque<>();
HashSet<Long> visited = new HashSet<>();
long FRONTIER_MARKER = -1;
queue.addLast(srcNodeId);
visited.add(srcNodeId);
long distance = 0;
queue.addLast(FRONTIER_MARKER);
while (!queue.isEmpty()) {
long currentNodeId = queue.removeFirst();
// System.err.println("curr: " + currentNodeId);
if (currentNodeId == FRONTIER_MARKER) {
if (queue.isEmpty()) // avoid infinite loops
break;
++distance;
queue.addLast(FRONTIER_MARKER);
continue;
}
if (graph.indegree(currentNodeId) == 0) {
result.merge(distance, 1L, Long::sum);
}
LazyLongIterator it = graph.predecessors(currentNodeId);
for (long neighborNodeId; (neighborNodeId = it.nextLong()) != -1;) {
if (!visited.contains(neighborNodeId)) {
queue.addLast(neighborNodeId);
visited.add(neighborNodeId);
}
}
}
}
public void printResult() throws IOException {
new File(outdir).mkdirs();
PrintWriter f = new PrintWriter(new FileWriter(outdir + "/distribution.txt"));
TreeMap<Long, Long> sortedDistribution = new TreeMap<>(result);
for (Map.Entry<Long, Long> entry : sortedDistribution.entrySet()) {
f.println(entry.getKey() + " " + entry.getValue());
}
f.close();
}
public static void main(String[] args) throws IOException, InterruptedException {
JSAPResult config = parse_args(args);
String graphPath = config.getString("graphPath");
String outdir = config.getString("outdir");
String allowedNodes = config.getString("nodeTypes");
int numThreads = config.getInt("numThreads");
AveragePaths tp = new AveragePaths(graphPath, allowedNodes, outdir);
tp.run(numThreads);
tp.printResult();
}
}
diff --git a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java
index 2e6fa0c..558aa39 100644
--- a/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java
+++ b/java/src/main/java/org/softwareheritage/graph/experiments/topology/ClusteringCoefficient.java
@@ -1,325 +1,331 @@
+/*
+ * Copyright (c) 2020 The Software Heritage developers
+ * See the AUTHORS file at the top-level directory of this distribution
+ * License: GNU General Public License version 3, or any later version
+ * See top-level LICENSE file for more information
+ */
+
package org.softwareheritage.graph.experiments.topology;
import com.martiansoftware.jsap.*;
import it.unimi.dsi.Util;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.big.webgraph.LazyLongIterator;
import it.unimi.dsi.fastutil.BigArrays;
import it.unimi.dsi.fastutil.longs.LongBigArrays;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.XoRoShiRo128PlusRandom;
-import org.softwareheritage.graph.Graph;
-import org.softwareheritage.graph.Node;
+import org.softwareheritage.graph.SwhBidirectionalGraph;
+import org.softwareheritage.graph.SwhType;
import java.io.*;
import java.util.*;
import java.util.concurrent.*;
public class ClusteringCoefficient {
- private final Graph graph;
+ private final SwhBidirectionalGraph graph;
private final String outdirPath;
private final ConcurrentHashMap<Long, Long> result_full;
private final ConcurrentHashMap<Long, Long> result_dircnt;
private final ConcurrentHashMap<Long, Long> result_rev;
private final ConcurrentHashMap<Long, Long> result_revrel;
private final ConcurrentHashMap<Long, Long> result_orisnp;
public ClusteringCoefficient(String graphBasename, String outdirPath) throws IOException {
this.outdirPath = outdirPath;
System.err.println("Loading graph " + graphBasename + " ...");
- Graph directedGraph = Graph.loadMapped(graphBasename);
+ SwhBidirectionalGraph directedGraph = SwhBidirectionalGraph.loadMapped(graphBasename);
this.graph = directedGraph.symmetrize();
System.err.println("Graph loaded.");
result_full = new ConcurrentHashMap<>();
result_dircnt = new ConcurrentHashMap<>();
result_rev = new ConcurrentHashMap<>();
result_revrel = new ConcurrentHashMap<>();
result_orisnp = new ConcurrentHashMap<>();
}
private static JSAPResult parse_args(String[] args) {
JSAPResult config = null;
try {
SimpleJSAP jsap = new SimpleJSAP(AveragePaths.class.getName(), "",
new Parameter[]{
new FlaggedOption("graphPath", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'g',
"graph", "Basename of the compressed graph"),
new FlaggedOption("outdir", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o',
"outdir", "Directory where to put the results"),
new FlaggedOption("numThreads", JSAP.INTEGER_PARSER, "32", JSAP.NOT_REQUIRED, 't',
"numthreads", "Number of threads"),});
config = jsap.parse(args);
if (jsap.messagePrinted()) {
System.exit(1);
}
} catch (JSAPException e) {
e.printStackTrace();
}
return config;
}
private void run(int numThreads) throws InterruptedException {
final long END_OF_QUEUE = -1L;
ArrayBlockingQueue<Long> queue = new ArrayBlockingQueue<>(numThreads);
ExecutorService service = Executors.newFixedThreadPool(numThreads + 1);
service.submit(() -> {
try {
- Graph thread_graph = graph.copy();
+ SwhBidirectionalGraph thread_graph = graph.copy();
long[][] randomPerm = Util.identity(thread_graph.numNodes());
LongBigArrays.shuffle(randomPerm, new XoRoShiRo128PlusRandom());
long n = thread_graph.numNodes();
ProgressLogger pl = new ProgressLogger();
pl.expectedUpdates = n;
pl.itemsName = "nodes";
pl.start("Filling processor queue...");
for (long j = 0; j < n; ++j) {
long node = BigArrays.get(randomPerm, j);
queue.put(node);
if (j % 10000 == 0) {
printResult();
}
pl.update();
}
} catch (Exception e) {
e.printStackTrace();
} finally {
for (int i = 0; i < numThreads; ++i) {
try {
queue.put(END_OF_QUEUE);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
});
for (int i = 0; i < numThreads; ++i) {
service.submit(() -> {
try {
- Graph thread_graph = graph.copy();
+ SwhBidirectionalGraph thread_graph = graph.copy();
while (true) {
Long node = null;
try {
node = queue.take();
} catch (InterruptedException e) {
e.printStackTrace();
}
if (node == null || node == END_OF_QUEUE) {
return;
}
computeAt(thread_graph, node);
}
} catch (Exception e) {
e.printStackTrace();
}
});
}
service.shutdown();
service.awaitTermination(365, TimeUnit.DAYS);
}
- private void computeAt(Graph graph, long node) {
+ private void computeAt(SwhBidirectionalGraph graph, long node) {
long d = graph.outdegree(node);
if (d < 2) {
return;
}
- Node.Type nodeType = graph.getNodeType(node);
+ SwhType nodeType = graph.getNodeType(node);
HashSet<Long> neighborhood = new HashSet<>();
long succ;
final LazyLongIterator iterator = graph.successors(node);
while ((succ = iterator.nextLong()) != -1) {
neighborhood.add(succ);
}
long triangles_full = 0;
long triangles_dircnt = 0;
long triangles_rev = 0;
long triangles_revrel = 0;
long triangles_orisnp = 0;
for (Long neighbor : neighborhood) {
- Node.Type neighborNodeType = graph.getNodeType(neighbor);
+ SwhType neighborNodeType = graph.getNodeType(neighbor);
final LazyLongIterator it = graph.successors(neighbor);
while ((succ = it.nextLong()) != -1) {
if (neighborhood.contains(succ)) {
- Node.Type succNodeType = graph.getNodeType(succ);
+ SwhType succNodeType = graph.getNodeType(succ);
triangles_full++;
- if ((nodeType == Node.Type.DIR || nodeType == Node.Type.CNT)
- && (neighborNodeType == Node.Type.DIR || neighborNodeType == Node.Type.CNT)
- && (succNodeType == Node.Type.DIR || succNodeType == Node.Type.CNT)) {
+ if ((nodeType == SwhType.DIR || nodeType == SwhType.CNT)
+ && (neighborNodeType == SwhType.DIR || neighborNodeType == SwhType.CNT)
+ && (succNodeType == SwhType.DIR || succNodeType == SwhType.CNT)) {
triangles_dircnt++;
- } else if ((nodeType == Node.Type.REV || nodeType == Node.Type.REL)
- && (neighborNodeType == Node.Type.REV || neighborNodeType == Node.Type.REL)
- && (succNodeType == Node.Type.REV || succNodeType == Node.Type.REL)) {
+ } else if ((nodeType == SwhType.REV || nodeType == SwhType.REL)
+ && (neighborNodeType == SwhType.REV || neighborNodeType == SwhType.REL)
+ && (succNodeType == SwhType.REV || succNodeType == SwhType.REL)) {
triangles_revrel++;
- if (nodeType == Node.Type.REV && neighborNodeType == Node.Type.REV
- && succNodeType == Node.Type.REV)
+ if (nodeType == SwhType.REV && neighborNodeType == SwhType.REV && succNodeType == SwhType.REV)
triangles_rev++;
- } else if ((nodeType == Node.Type.ORI || nodeType == Node.Type.SNP)
- && (neighborNodeType == Node.Type.ORI || neighborNodeType == Node.Type.SNP)
- && (succNodeType == Node.Type.ORI || succNodeType == Node.Type.SNP)) {
+ } else if ((nodeType == SwhType.ORI || nodeType == SwhType.SNP)
+ && (neighborNodeType == SwhType.ORI || neighborNodeType == SwhType.SNP)
+ && (succNodeType == SwhType.ORI || succNodeType == SwhType.SNP)) {
triangles_orisnp++;
}
}
}
}
result_full.merge(triangles_full, 1L, Long::sum);
result_dircnt.merge(triangles_dircnt, 1L, Long::sum);
result_rev.merge(triangles_rev, 1L, Long::sum);
result_revrel.merge(triangles_revrel, 1L, Long::sum);
result_orisnp.merge(triangles_orisnp, 1L, Long::sum);
}
public void printSortedDistribution(String distribPath, Map<Long, Long> distrib) throws IOException {
PrintWriter f = new PrintWriter(new FileWriter(distribPath));
TreeMap<Long, Long> sortedDistribution = new TreeMap<>(distrib);
for (Map.Entry