diff --git a/docs/Makefile b/docs/Makefile --- a/docs/Makefile +++ b/docs/Makefile @@ -1 +1,3 @@ include ../../swh-docs/Makefile.sphinx +-include Makefile.local + diff --git a/docs/Makefile.local b/docs/Makefile.local new file mode 100644 --- /dev/null +++ b/docs/Makefile.local @@ -0,0 +1,13 @@ +# Build and clean the diagrams together with the docs; sphinx/html, sphinx/clean and assets are presumably provided by the included Makefile.sphinx (confirm). +sphinx/html: images +sphinx/clean: clean-images +assets: images + +# Delegate to images/Makefile through $(MAKE) so that -j, -n and command-line variables reach the sub-make. +images: + $(MAKE) -C images/ +clean-images: + $(MAKE) -C images/ clean + +.PHONY: images clean-images + diff --git a/docs/images/.gitignore b/docs/images/.gitignore new file mode 100644 --- /dev/null +++ b/docs/images/.gitignore @@ -0,0 +1 @@ +tasks-metadata-indexers.svg diff --git a/docs/images/Makefile b/docs/images/Makefile new file mode 100644 --- /dev/null +++ b/docs/images/Makefile @@ -0,0 +1,16 @@ + +# Every PlantUML source (*.uml) in this directory is rendered to an SVG of the same base name. +UML_DIAGS_SRC := $(wildcard *.uml) +UML_DIAGS := $(patsubst %.uml,%.svg,$(UML_DIAGS_SRC)) + +all: $(UML_DIAGS) + +# DISPLAY is emptied for the plantuml call, presumably to keep it from trying to open a GUI (confirm). +%.svg: %.uml + DISPLAY="" plantuml -tsvg $< + +# Remove only the generated SVGs; the leading dash ignores any rm failure. +clean: + -rm -f $(UML_DIAGS) + +.PHONY: all clean diff --git a/docs/images/tasks-metadata-indexers.uml b/docs/images/tasks-metadata-indexers.uml new file mode 100644 --- /dev/null +++ b/docs/images/tasks-metadata-indexers.uml @@ -0,0 +1,84 @@ +@startuml + participant LOADERS as "Loaders" + participant JOURNAL as "Journal" + participant SCHEDULER as "Scheduler" + participant IDX_ORIG_HEAD as "Origin-Head Indexer" + participant IDX_REV_META as "Revision Metadata Indexer" + participant IDX_CONT_META as "Content Metadata Indexer" + participant IDX_ORIG_META as "Origin Metadata Indexer" + participant IDX_STORAGE as "Indexer Storage" + participant STORAGE as "Graph Storage" + participant OBJ_STORAGE as "Object Storage" + + activate OBJ_STORAGE + activate IDX_STORAGE + activate STORAGE + activate JOURNAL + activate SCHEDULER + + activate LOADERS + + LOADERS->>JOURNAL: Origin 42 was added/revisited + deactivate LOADERS + + JOURNAL->>SCHEDULER: run indexers on origin 42 + + SCHEDULER->>IDX_ORIG_HEAD: Find HEAD revision of 42 + activate IDX_ORIG_HEAD + + 
IDX_ORIG_HEAD->>STORAGE: snapshot_get_latest(origin=42) + + STORAGE->>IDX_ORIG_HEAD: branches + + IDX_ORIG_HEAD->>SCHEDULER: run Revision Metadata Indexer\non revision 42abcdef\n(head of origin 42) + deactivate IDX_ORIG_HEAD + + SCHEDULER->>IDX_REV_META: Index revision 42abcdef\n(head of origin 42) + activate IDX_REV_META + + IDX_REV_META->>STORAGE: revision_get(sha1=42abcdef) + STORAGE->>IDX_REV_META: {id: 42abcdef, message: "Commit message", directory: 456789ab, ...} + + IDX_REV_META->>STORAGE: directory_ls(sha1=456789ab) + STORAGE->>IDX_REV_META: [{id: 1234cafe, name: "package.json", type: file, ...}, {id: cafe4321, name: "README", type: file, ...}, ...] + + IDX_REV_META->>IDX_REV_META: package.json is a metadata file + + IDX_REV_META->>IDX_STORAGE: content_metadata_get(sha1=1234cafe) + IDX_STORAGE->>IDX_REV_META: none / {author: "Jane Doe", ...} + + alt If the storage answered "none" + IDX_REV_META->>IDX_CONT_META: Index file 1234cafe as an NPM metadata file + activate IDX_CONT_META + + IDX_CONT_META->>OBJ_STORAGE: content_get 1234cafe + + OBJ_STORAGE->>IDX_CONT_META: raw content is: '{"name": "FooPackage", "author": "Jane Doe"...' 
+ + IDX_CONT_META->>IDX_CONT_META: "Jane Doe" is the author + + IDX_CONT_META->>IDX_STORAGE: content_metadata_add(sha1=1234cafe, {author: "Jane Doe", ...}) + IDX_STORAGE->>IDX_CONT_META: ok + + IDX_CONT_META->>IDX_REV_META: extracted: {author: "Jane Doe", ...} + deactivate IDX_CONT_META + end + + IDX_REV_META->>IDX_STORAGE: revision_metadata_add(sha1=42abcdef, {author: "Jane Doe", ...}) + IDX_STORAGE->>IDX_REV_META: ok + + IDX_REV_META->>SCHEDULER: run Origin Metadata Indexer\non origin 42; the head is 42abcdef + deactivate IDX_REV_META + + SCHEDULER->>IDX_ORIG_META: Index origin 42; the head is 42abcdef + activate IDX_ORIG_META + + IDX_ORIG_META->>IDX_STORAGE: revision_metadata_get(sha1=42abcdef) + IDX_STORAGE->>IDX_ORIG_META: {author: "Jane Doe", ...} + + IDX_ORIG_META->>IDX_STORAGE: origin_metadata_add(id=42, {author: "Jane Doe", ...}) + IDX_STORAGE->>IDX_ORIG_META: ok + deactivate IDX_ORIG_META + + +@enduml diff --git a/docs/index.rst b/docs/index.rst --- a/docs/index.rst +++ b/docs/index.rst @@ -13,6 +13,7 @@ README.md dev-info.rst + metadata-workflow.rst Reference Documentation diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst new file mode 100644 --- /dev/null +++ b/docs/metadata-workflow.rst @@ -0,0 +1,64 @@ +Metadata workflow +================= + +Intrinsic metadata +------------------ + +Indexing :term:`intrinsic metadata` requires extracting information from the +lowest levels of the :ref:`Merkle DAG <swh-merkle-dag>` (directories, files, +and content blobs) and associating it with the highest ones (origins). +In order to deduplicate the work between origins, we split this work between +multiple indexers, which coordinate with each other and save their results +at each step in the indexer storage. + +.. thumbnail:: images/tasks-metadata-indexers.svg + + +Origin-Head Indexer +___________________ + +First, the Origin-Head indexer gets called externally, with an origin as +argument (or multiple origins, which are handled sequentially). 
+For now, its tasks are scheduled manually via recurring Scheduler tasks; but +in the near future, the :term:`journal` will be used to do that. + +It first looks up the last :term:`snapshot` and determines what the main +branch of the origin is (the "Head branch") and what revision it points to +(the "Head"). +Intrinsic metadata for that origin will be extracted from that revision. + +It schedules a Revision Metadata Indexer task for that revision, with a +hint that the revision is the Head of that particular origin. + + +Revision and Content Metadata Indexers +______________________________________ + +These two indexers do the hard part of the work. The Revision Metadata +Indexer fetches the root directory associated with a revision, then extracts +the metadata from that directory. + +To do so, it lists files in that directory, and looks for known names, such +as ``codemeta.json``, ``package.json``, or ``pom.xml``. If there are any, it +runs the Content Metadata Indexer on them, which in turn fetches their +contents and runs them through extraction dictionaries/mappings. + +Their results are saved in a database (the indexer storage), associated with +the content and revision hashes. + +If it received a hint that this revision is the head of an origin, the +Revision Metadata Indexer then schedules the Origin Metadata Indexer +to run on that origin. + + +Origin Metadata Indexer +_______________________ + +The job of this indexer is very simple: it takes an origin identifier and +a revision hash, and copies the metadata of the latter to a new table, to +associate it with the former. + +The reason for this is to be able to perform searches on metadata, and +efficiently find out which origins matched the pattern. +Running that search on the ``revision_metadata`` table would require +a reverse lookup from revisions to origins, which is costly.