diff --git a/docs/Makefile b/docs/Makefile
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -1 +1,3 @@
 include ../../swh-docs/Makefile.sphinx
+-include Makefile.local
+
diff --git a/docs/Makefile.local b/docs/Makefile.local
new file mode 100644
--- /dev/null
+++ b/docs/Makefile.local
@@ -0,0 +1,17 @@
+# Local hooks for the docs build: generate the diagrams under images/ before
+# the targets that use them, and remove them on clean.
+# NOTE(review): sphinx/html, sphinx/clean and assets are presumably defined
+# by the shared Makefile.sphinx included from docs/Makefile; confirm.
+sphinx/html: images
+sphinx/clean: clean-images
+assets: images
+
+# Recurse with $(MAKE) rather than a literal "make" so that flags such as
+# -j and -n (and the jobserver) are passed on to the sub-make.
+images:
+	$(MAKE) -C images/
+clean-images:
+	$(MAKE) -C images/ clean
+
+.PHONY: images clean-images
+
diff --git a/docs/images/.gitignore b/docs/images/.gitignore
new file mode 100644
--- /dev/null
+++ b/docs/images/.gitignore
@@ -0,0 +1 @@
+tasks-metadata-indexers.svg
diff --git a/docs/images/Makefile b/docs/images/Makefile
new file mode 100644
--- /dev/null
+++ b/docs/images/Makefile
@@ -0,0 +1,22 @@
+# Render every PlantUML source (*.uml) in this directory to an SVG diagram.
+
+# Drop the output of a failed recipe so that a partial or error SVG is never
+# mistaken for an up-to-date target on the next run.
+.DELETE_ON_ERROR:
+
+# Simple (:=) assignment: glob once at parse time, not on every expansion.
+UML_DIAGS_SRC := $(wildcard *.uml)
+UML_DIAGS := $(patsubst %.uml,%.svg,$(UML_DIAGS_SRC))
+
+all: $(UML_DIAGS)
+
+# plantuml is expected to write <name>.svg next to <name>.uml. DISPLAY is
+# emptied for the command, presumably to force headless rendering.
+%.svg: %.uml
+	DISPLAY="" plantuml -tsvg $<
+
+# Remove only the generated diagrams; "-" keeps cleanup best-effort.
+clean:
+	-rm -f $(UML_DIAGS)
+
+.PHONY: all clean
diff --git a/docs/images/tasks-metadata-indexers.uml b/docs/images/tasks-metadata-indexers.uml
new file mode 100644
--- /dev/null
+++ b/docs/images/tasks-metadata-indexers.uml
@@ -0,0 +1,84 @@
+@startuml
+  participant LOADERS as "Loaders"
+  participant JOURNAL as "Journal"
+  participant SCHEDULER as "Scheduler"
+  participant IDX_ORIG_HEAD as "Origin-Head Indexer"
+  participant IDX_REV_META as "Revision Metadata Indexer"
+  participant IDX_CONT_META as "Content Metadata Indexer"
+  participant IDX_ORIG_META as "Origin Metadata Indexer"
+  participant IDX_STORAGE as "Indexer Storage"
+  participant STORAGE as "Graph Storage"
+  participant OBJ_STORAGE as "Object Storage"
+  ' Services activated here are never deactivated below: they stay active for the whole diagram.
+  activate OBJ_STORAGE
+  activate IDX_STORAGE
+  activate STORAGE
+  activate JOURNAL
+  activate SCHEDULER
+
+  activate LOADERS
+
+  LOADERS->>JOURNAL: Origin 42 was added/revisited
+  deactivate LOADERS
+
+  JOURNAL->>SCHEDULER: run indexers on origin 42
+  ' Step 1: the Origin-Head Indexer resolves the head revision of the origin.
+  SCHEDULER->>IDX_ORIG_HEAD: Find HEAD revision of 42
+  activate IDX_ORIG_HEAD
+
+  IDX_ORIG_HEAD->>STORAGE: snapshot_get_latest(origin=42)
+
+  STORAGE->>IDX_ORIG_HEAD: branches
+
+  IDX_ORIG_HEAD->>SCHEDULER: run Revision Metadata Indexer\non revision 42abcdef\n(head of origin 42)
+  deactivate IDX_ORIG_HEAD
+  ' Step 2: the Revision Metadata Indexer inspects the head revision's root directory.
+  SCHEDULER->>IDX_REV_META: Index revision 42abcdef\n(head of origin 42)
+  activate IDX_REV_META
+
+  IDX_REV_META->>STORAGE: revision_get(sha1=42abcdef)
+  STORAGE->>IDX_REV_META: {id: 42abcdef, message: "Commit message", directory: 456789ab, ...}
+
+  IDX_REV_META->>STORAGE: directory_ls(sha1=456789ab)
+  STORAGE->>IDX_REV_META: [{id: 1234cafe, name: "package.json", type: file, ...}, {id: cafe4321, name: "README", type: file, ...}, ...]
+
+  IDX_REV_META->>IDX_REV_META: package.json is a metadata file
+
+  IDX_REV_META->>IDX_STORAGE: content_metadata_get(sha1=1234cafe)
+  IDX_STORAGE->>IDX_REV_META: none / {author: "Jane Doe", ...}
+  ' Content indexing only happens when the indexer storage has no metadata for the file yet.
+  alt If the storage answered "none"
+    IDX_REV_META->>IDX_CONT_META: Index file 1234cafe as an NPM metadata file
+    activate IDX_CONT_META
+
+    IDX_CONT_META->>OBJ_STORAGE: content_get 1234cafe
+
+    OBJ_STORAGE->>IDX_CONT_META: raw content is: '{"name": "FooPackage", "author": "Jane Doe"...'
+
+    IDX_CONT_META->>IDX_CONT_META: "Jane Doe" is the author
+
+    IDX_CONT_META->>IDX_STORAGE: content_metadata_add(sha1=1234cafe, {author: "Jane Doe", ...})
+    IDX_STORAGE->>IDX_CONT_META: ok
+
+    IDX_CONT_META->>IDX_REV_META: extracted: {author: "Jane Doe", ...}
+    deactivate IDX_CONT_META
+  end
+
+  IDX_REV_META->>IDX_STORAGE: revision_metadata_add(sha1=42abcdef, {author: "Jane Doe", ...})
+  IDX_STORAGE->>IDX_REV_META: ok
+
+  IDX_REV_META->>SCHEDULER: run Origin Metadata Indexer\non origin 42; the head is 42abcdef
+  deactivate IDX_REV_META
+  ' Step 3: the Origin Metadata Indexer attaches the revision's metadata to the origin.
+  SCHEDULER->>IDX_ORIG_META: Index origin 42; the head is 42abcdef
+  activate IDX_ORIG_META
+
+  IDX_ORIG_META->>IDX_STORAGE: revision_metadata_get(sha1=42abcdef)
+  IDX_STORAGE->>IDX_ORIG_META: {author: "Jane Doe", ...}
+
+  IDX_ORIG_META->>IDX_STORAGE: origin_metadata_add(id=42, {author: "Jane Doe", ...})
+  IDX_STORAGE->>IDX_ORIG_META: ok
+  deactivate IDX_ORIG_META
+  
+
+@enduml
diff --git a/docs/index.rst b/docs/index.rst
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -13,6 +13,7 @@
 
    README.md
    dev-info.rst
+   metadata-workflow.rst
 
 
 Reference Documentation
diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst
new file mode 100644
--- /dev/null
+++ b/docs/metadata-workflow.rst
@@ -0,0 +1,64 @@
+Metadata workflow
+=================
+
+Intrinsic metadata
+------------------
+
+Indexing :term:`intrinsic metadata` requires extracting information from the
+lowest levels of the :ref:`Merkle DAG <swh-merkle-dag>` (directories, files,
+and content blobs) and associating it with the highest ones (origins).
+In order to deduplicate the work between origins, we split this work between
+multiple indexers, which coordinate with each other and save their results
+at each step in the indexer storage.
+
+.. thumbnail:: images/tasks-metadata-indexers.svg
+
+
+Origin-Head Indexer
+___________________
+
+First, the Origin-Head indexer gets called externally, with an origin as
+argument (or multiple origins, that are handled sequentially).
+For now, its tasks are scheduled manually via recurring Scheduler tasks; but
+in the near future, the :term:`journal` will be used to do that.
+
+It first looks up the last :term:`snapshot` and determines what the main
+branch of the origin is (the "Head branch") and what revision it points to
+(the "Head").
+Intrinsic metadata for that origin will be extracted from that revision.
+
+It schedules a Revision Metadata Indexer task for that revision, with a
+hint that the revision is the Head of that particular origin.
+
+
+Revision and Content Metadata Indexers
+______________________________________
+
+These two indexers do the hard part of the work. The Revision Metadata
+Indexer fetches the root directory associated with a revision, then extracts
+the metadata from that directory.
+
+To do so, it lists files in that directory, and looks for known names, such
+as ``codemeta.json``, ``package.json``, or ``pom.xml``. If there are any, it
+runs the Content Metadata Indexer on them, which in turn fetches their
+contents and runs them through extraction dictionaries/mappings.
+
+Their results are saved in a database (the indexer storage), associated with
+the content and revision hashes.
+
+If it received a hint that this revision is the head of an origin, the
+Revision Metadata Indexer then schedules the Origin Metadata Indexer
+to run on that origin.
+
+
+Origin Metadata Indexer
+_______________________
+
+The job of this indexer is very simple: it takes an origin identifier and
+a revision hash, and copies the metadata of that revision to a new table,
+to associate it with the origin.
+
+The reason for this is to be able to perform searches on metadata, and
+efficiently find out which origins matched the pattern.
+Running that search on the ``revision_metadata`` table would require
+a reverse lookup from revisions to origins, which is costly.