Page MenuHomeSoftware Heritage

D747.diff
No OneTemporary

D747.diff

diff --git a/docs/Makefile b/docs/Makefile
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -1 +1,3 @@
include ../../swh-docs/Makefile.sphinx
+-include Makefile.local
+
diff --git a/docs/Makefile.local b/docs/Makefile.local
new file mode 100644
--- /dev/null
+++ b/docs/Makefile.local
@@ -0,0 +1,17 @@
+# Local additions to the shared Sphinx makefile: hook the diagrams in
+# images/ into the documentation build and clean targets.
+sphinx/html: images
+sphinx/clean: clean-images
+assets: images
+
+# Use $(MAKE), not a literal "make", so that options such as -j and -n
+# (and the jobserver) are passed down to the sub-make.
+images:
+	$(MAKE) -C images/
+clean-images:
+	$(MAKE) -C images/ clean
+
+# Neither name is a file to be produced; "images" is moreover an existing
+# directory, so it would otherwise always look up to date.
+.PHONY: images clean-images
+
diff --git a/docs/images/.gitignore b/docs/images/.gitignore
new file mode 100644
--- /dev/null
+++ b/docs/images/.gitignore
@@ -0,0 +1 @@
+tasks-metadata-indexers.svg
diff --git a/docs/images/Makefile b/docs/images/Makefile
new file mode 100644
--- /dev/null
+++ b/docs/images/Makefile
@@ -0,0 +1,19 @@
+# Render every PlantUML source (*.uml) in this directory to an SVG diagram.
+
+UML_DIAGS_SRC := $(wildcard *.uml)
+UML_DIAGS := $(patsubst %.uml,%.svg,$(UML_DIAGS_SRC))
+
+# "all" and "clean" are commands, not files to be produced.
+.PHONY: all clean
+
+all: $(UML_DIAGS)
+
+# No output name is passed, so this relies on plantuml naming the result
+# after its input. DISPLAY is emptied for the call, presumably to keep
+# plantuml from trying to open a graphical display.
+%.svg: %.uml
+	DISPLAY="" plantuml -tsvg $<
+
+# Remove only the generated diagrams; $(RM) is make's predefined "rm -f".
+clean:
+	$(RM) $(UML_DIAGS)
diff --git a/docs/images/tasks-metadata-indexers.uml b/docs/images/tasks-metadata-indexers.uml
new file mode 100644
--- /dev/null
+++ b/docs/images/tasks-metadata-indexers.uml
@@ -0,0 +1,84 @@
+@startuml
+ participant LOADERS as "Loaders"
+ participant JOURNAL as "Journal"
+ participant SCHEDULER as "Scheduler"
+ participant IDX_ORIG_HEAD as "Origin-Head Indexer"
+ participant IDX_REV_META as "Revision Metadata Indexer"
+ participant IDX_CONT_META as "Content Metadata Indexer"
+ participant IDX_ORIG_META as "Origin Metadata Indexer"
+ participant IDX_STORAGE as "Indexer Storage"
+ participant STORAGE as "Graph Storage"
+ participant OBJ_STORAGE as "Object Storage"
+ ' The storages, the journal and the scheduler are activated up front and are never deactivated below.
+ activate OBJ_STORAGE
+ activate IDX_STORAGE
+ activate STORAGE
+ activate JOURNAL
+ activate SCHEDULER
+
+ activate LOADERS
+ ' Phase 1: a loader reports an added/revisited origin; the journal asks the scheduler to run the indexers on it.
+ LOADERS->>JOURNAL: Origin 42 was added/revisited
+ deactivate LOADERS
+
+ JOURNAL->>SCHEDULER: run indexers on origin 42
+ ' Phase 2: the Origin-Head Indexer looks up the origin's latest snapshot to find its HEAD revision.
+ SCHEDULER->>IDX_ORIG_HEAD: Find HEAD revision of 42
+ activate IDX_ORIG_HEAD
+
+ IDX_ORIG_HEAD->>STORAGE: snapshot_get_latest(origin=42)
+
+ STORAGE->>IDX_ORIG_HEAD: branches
+
+ IDX_ORIG_HEAD->>SCHEDULER: run Revision Metadata Indexer\non revision 42abcdef\n(head of origin 42)
+ deactivate IDX_ORIG_HEAD
+ ' Phase 3: the Revision Metadata Indexer fetches the revision, lists its directory and picks out metadata files.
+ SCHEDULER->>IDX_REV_META: Index revision 42abcdef\n(head of origin 42)
+ activate IDX_REV_META
+
+ IDX_REV_META->>STORAGE: revision_get(sha1=42abcdef)
+ STORAGE->>IDX_REV_META: {id: 42abcdef, message: "Commit message", directory: 456789ab, ...}
+
+ IDX_REV_META->>STORAGE: directory_ls(sha1=456789ab)
+ STORAGE->>IDX_REV_META: [{id: 1234cafe, name: "package.json", type: file, ...}, {id: cafe4321, name: "README", type: file, ...}, ...]
+
+ IDX_REV_META->>IDX_REV_META: package.json is a metadata file
+
+ IDX_REV_META->>IDX_STORAGE: content_metadata_get(sha1=1234cafe)
+ IDX_STORAGE->>IDX_REV_META: none / {author: "Jane Doe", ...}
+ ' The Content Metadata Indexer is only involved when the indexer storage has no metadata for the file yet.
+ alt If the storage answered "none"
+ IDX_REV_META->>IDX_CONT_META: Index file 1234cafe as an NPM metadata file
+ activate IDX_CONT_META
+
+ IDX_CONT_META->>OBJ_STORAGE: content_get 1234cafe
+
+ OBJ_STORAGE->>IDX_CONT_META: raw content is: '{"name": "FooPackage", "author": "Jane Doe"...'
+
+ IDX_CONT_META->>IDX_CONT_META: "Jane Doe" is the author
+
+ IDX_CONT_META->>IDX_STORAGE: content_metadata_add(sha1=1234cafe, {author: "Jane Doe", ...})
+ IDX_STORAGE->>IDX_CONT_META: ok
+
+ IDX_CONT_META->>IDX_REV_META: extracted: {author: "Jane Doe", ...}
+ deactivate IDX_CONT_META
+ end
+ ' The revision-level metadata is saved first; only then is the origin-level indexer scheduled.
+ IDX_REV_META->>IDX_STORAGE: revision_metadata_add(sha1=42abcdef, {author: "Jane Doe", ...})
+ IDX_STORAGE->>IDX_REV_META: ok
+
+ IDX_REV_META->>SCHEDULER: run Origin Metadata Indexer\non origin 42; the head is 42abcdef
+ deactivate IDX_REV_META
+ ' Phase 4: the Origin Metadata Indexer reads the head revision's metadata and records metadata for the origin.
+ SCHEDULER->>IDX_ORIG_META: Index origin 42; the head is 42abcdef
+ activate IDX_ORIG_META
+
+ IDX_ORIG_META->>IDX_STORAGE: revision_metadata_get(sha1=42abcdef)
+ IDX_STORAGE->>IDX_ORIG_META: {author: "Jane Doe", ...}
+
+ IDX_ORIG_META->>IDX_STORAGE: origin_metadata_add(id=42, {author: "Jane Doe", ...})
+ IDX_STORAGE->>IDX_ORIG_META: ok
+ deactivate IDX_ORIG_META
+
+
+@enduml
diff --git a/docs/index.rst b/docs/index.rst
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -13,6 +13,7 @@
README.md
dev-info.rst
+ metadata-workflow.rst
Reference Documentation
diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst
new file mode 100644
--- /dev/null
+++ b/docs/metadata-workflow.rst
@@ -0,0 +1,11 @@
+Metadata workflow
+=================
+
+Indexing :term:`intrinsic metadata` requires extracting information from the
+lowest levels of the :ref:`Merkle DAG <swh-merkle-dag>` (directories, files,
+and content blobs) and associating it with the highest ones (origins).
+In order to deduplicate the work between origins, we split this work between
+multiple indexers, which coordinate with each other and save their results
+at each step in the indexer storage.
+
+.. thumbnail:: images/tasks-metadata-indexers.svg

File Metadata

Mime Type
text/plain
Expires
Dec 20 2024, 3:11 AM (11 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3218890

Event Timeline