diff --git a/docs/Makefile b/docs/Makefile
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -1 +1,4 @@
 include ../../swh-docs/Makefile.sphinx
+# Optional local rules; the leading "-" makes a missing file a non-error.
+-include Makefile.local
+
diff --git a/docs/Makefile.local b/docs/Makefile.local
new file mode 100644
--- /dev/null
+++ b/docs/Makefile.local
@@ -0,0 +1,18 @@
+# Extra rules for this package's documentation build, read by docs/Makefile
+# through "-include Makefile.local".  They attach the diagram targets below
+# to sphinx/html, sphinx/clean and assets (presumably defined by the shared
+# Makefile.sphinx -- confirm there).
+sphinx/html: images
+sphinx/clean: clean-images
+assets: images
+
+# Both targets delegate to images/Makefile.  Use $(MAKE) rather than a
+# literal "make" so that -j (jobserver), -n and command-line variables are
+# passed down to the sub-make.
+images:
+	$(MAKE) -C images/
+clean-images:
+	$(MAKE) -C images/ clean
+
+.PHONY: images clean-images
+
diff --git a/docs/images/.gitignore b/docs/images/.gitignore
new file mode 100644
--- /dev/null
+++ b/docs/images/.gitignore
@@ -0,0 +1 @@
+tasks-metadata-indexers.svg
diff --git a/docs/images/Makefile b/docs/images/Makefile
new file mode 100644
--- /dev/null
+++ b/docs/images/Makefile
@@ -0,0 +1,18 @@
+# Renders each PlantUML source (*.uml) of this directory to an SVG file.
+
+UML_DIAGS_SRC := $(wildcard *.uml)
+UML_DIAGS := $(patsubst %.uml,%.svg,$(UML_DIAGS_SRC))
+
+all: $(UML_DIAGS)
+
+# DISPLAY is emptied for the plantuml call, presumably so that it runs
+# headless instead of trying to reach an X server.
+%.svg: %.uml
+	DISPLAY="" plantuml -tsvg $<
+
+# Removes the generated diagrams only.  Best-effort ("-"): a failed removal
+# does not abort the clean.
+clean:
+	-rm -f $(UML_DIAGS)
+
+.PHONY: all clean
diff --git a/docs/images/tasks-metadata-indexers.uml b/docs/images/tasks-metadata-indexers.uml
new file mode 100644
--- /dev/null
+++ b/docs/images/tasks-metadata-indexers.uml
@@ -0,0 +1,84 @@
+@startuml
+  participant LOADERS as "Loaders"
+  participant JOURNAL as "Journal"
+  participant SCHEDULER as "Scheduler"
+  participant IDX_ORIG_HEAD as "Origin-Head Indexer"
+  participant IDX_REV_META as "Revision Metadata Indexer"
+  participant IDX_CONT_META as "Content Metadata Indexer"
+  participant IDX_ORIG_META as "Origin Metadata Indexer"
+  participant IDX_STORAGE as "Indexer Storage"
+  participant STORAGE as "Graph Storage"
+  participant OBJ_STORAGE as "Object Storage"
+
+  activate OBJ_STORAGE
+  activate IDX_STORAGE
+  activate STORAGE
+  activate JOURNAL
+  activate SCHEDULER
+
+  activate LOADERS
+
+  LOADERS->>JOURNAL: Origin 42 was added/revisited
+  deactivate LOADERS
+
+  JOURNAL->>SCHEDULER: run indexers on origin 42
+
+  SCHEDULER->>IDX_ORIG_HEAD: Find HEAD revision of 42
+  activate IDX_ORIG_HEAD
+
+  IDX_ORIG_HEAD->>STORAGE: snapshot_get_latest(origin=42)
+
+  STORAGE->>IDX_ORIG_HEAD: branches
+
+  IDX_ORIG_HEAD->>SCHEDULER: run Revision Metadata Indexer\non revision 42abcdef\n(head of origin 42)
+  deactivate IDX_ORIG_HEAD
+
+  SCHEDULER->>IDX_REV_META: Index revision 42abcdef\n(head of origin 42)
+  activate IDX_REV_META
+
+  IDX_REV_META->>STORAGE: revision_get(sha1=42abcdef)
+  STORAGE->>IDX_REV_META: {id: 42abcdef, message: "Commit message", directory: 456789ab, ...}
+
+  IDX_REV_META->>STORAGE: directory_ls(sha1=456789ab)
+  STORAGE->>IDX_REV_META: [{id: 1234cafe, name: "package.json", type: file, ...}, {id: cafe4321, name: "README", type: file, ...}, ...]
+
+  IDX_REV_META->>IDX_REV_META: package.json is a metadata file
+
+  IDX_REV_META->>IDX_STORAGE: content_metadata_get(sha1=1234cafe)
+  IDX_STORAGE->>IDX_REV_META: none / {author: "Jane Doe", ...}
+
+  alt If the storage answered "none"
+    IDX_REV_META->>IDX_CONT_META: Index file 1234cafe as an NPM metadata file
+    activate IDX_CONT_META
+
+    IDX_CONT_META->>OBJ_STORAGE: content_get 1234cafe
+
+    OBJ_STORAGE->>IDX_CONT_META: raw content is: '{"name": "FooPackage", "author": "Jane Doe"...'
+
+    IDX_CONT_META->>IDX_CONT_META: "Jane Doe" is the author
+
+    IDX_CONT_META->>IDX_STORAGE: content_metadata_add(sha1=1234cafe, {author: "Jane Doe", ...})
+    IDX_STORAGE->>IDX_CONT_META: ok
+
+    IDX_CONT_META->>IDX_REV_META: extracted: {author: "Jane Doe", ...}
+    deactivate IDX_CONT_META
+  end
+
+  IDX_REV_META->>IDX_STORAGE: revision_metadata_add(sha1=42abcdef, {author: "Jane Doe", ...})
+  IDX_STORAGE->>IDX_REV_META: ok
+
+  IDX_REV_META->>SCHEDULER: run Origin Metadata Indexer\non origin 42; the head is 42abcdef
+  deactivate IDX_REV_META
+
+  SCHEDULER->>IDX_ORIG_META: Index origin 42; the head is 42abcdef
+  activate IDX_ORIG_META
+
+  IDX_ORIG_META->>IDX_STORAGE: revision_metadata_get(sha1=42abcdef)
+  IDX_STORAGE->>IDX_ORIG_META: {author: "Jane Doe", ...}
+
+  IDX_ORIG_META->>IDX_STORAGE: origin_metadata_add(id=42, {author: "Jane Doe", ...})
+  IDX_STORAGE->>IDX_ORIG_META: ok
+  deactivate IDX_ORIG_META
+
+
+@enduml
diff --git a/docs/index.rst b/docs/index.rst
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -13,6 +13,7 @@
 
    README.md
    dev-info.rst
+   metadata_workflow.rst
 
 
 Reference Documentation
diff --git a/docs/metadata_workflow.rst b/docs/metadata_workflow.rst
new file mode 100644
--- /dev/null
+++ b/docs/metadata_workflow.rst
@@ -0,0 +1,11 @@
+Metadata workflow
+=================
+
+Indexing metadata requires extracting information from the lowest levels
+of the :ref:`Merkle DAG <swh-merkle-dag>` (directories, files, and content
+blobs) and associating them to the highest ones (origins).
+In order to deduplicate the work between origins, we split this work between
+multiple indexers, which coordinate with each other and save their results
+at each step in the indexer storage.
+
+.. thumbnail:: images/tasks-metadata-indexers.svg