diff --git a/docs/images/tasks-metadata-indexers.uml b/docs/images/tasks-metadata-indexers.uml --- a/docs/images/tasks-metadata-indexers.uml +++ b/docs/images/tasks-metadata-indexers.uml @@ -1,11 +1,10 @@ @startuml participant LOADERS as "Loaders" participant JOURNAL as "Journal" - participant SCHEDULER as "Scheduler" + participant IDX_ORIG_META as "Origin Metadata Indexer" participant IDX_ORIG_HEAD as "Origin-Head Indexer" - participant IDX_REV_META as "Revision Metadata Indexer" + participant IDX_DIR_META as "Directory Metadata Indexer" participant IDX_CONT_META as "Content Metadata Indexer" - participant IDX_ORIG_META as "Origin Metadata Indexer" participant IDX_STORAGE as "Indexer Storage" participant STORAGE as "Graph Storage" participant OBJ_STORAGE as "Object Storage" @@ -14,41 +13,41 @@ activate IDX_STORAGE activate STORAGE activate JOURNAL - activate SCHEDULER + activate IDX_ORIG_META activate LOADERS - LOADERS->>JOURNAL: Origin 42 was added/revisited + LOADERS->>JOURNAL: Origin http://example.org/repo.git\nwas added/revisited deactivate LOADERS - JOURNAL->>SCHEDULER: run indexers on origin 42 + JOURNAL->>IDX_ORIG_META: run indexers on origin\nhttp://example.org/repo.git - SCHEDULER->>IDX_ORIG_HEAD: Find HEAD revision of 42 + IDX_ORIG_META->>IDX_ORIG_HEAD: Find HEAD revision of\nhttp://example.org/repo.git activate IDX_ORIG_HEAD - IDX_ORIG_HEAD->>STORAGE: snapshot_get_latest(origin=42) + IDX_ORIG_HEAD->>STORAGE: snapshot_get_latest(origin="http://example.org/repo.git") STORAGE->>IDX_ORIG_HEAD: branches - IDX_ORIG_HEAD->>SCHEDULER: run Revision Metadata Indexer\non revision 42abcdef\n(head of origin 42) + IDX_ORIG_HEAD->>IDX_ORIG_META: run Revision Metadata Indexer\non revision 42abcdef (head of origin\nhttp://example.org/repo.git) deactivate IDX_ORIG_HEAD - SCHEDULER->>IDX_REV_META: Index revision 42abcdef\n(head of origin 42) - activate IDX_REV_META + IDX_ORIG_META->>STORAGE: revision_get(sha1=42abcdef) + STORAGE->>IDX_ORIG_META: {id: 42abcdef, message: 
"Commit message", directory: 456789ab, ...} - IDX_REV_META->>STORAGE: revision_get(sha1=42abcdef) - STORAGE->>IDX_REV_META: {id: 42abcdef, message: "Commit message", directory: 456789ab, ...} + IDX_ORIG_META->>IDX_DIR_META: Index directory 456789ab\n(head of origin http://example.org/repo.git) + activate IDX_DIR_META - IDX_REV_META->>STORAGE: directory_ls(sha1=456789ab) - STORAGE->>IDX_REV_META: [{id: 1234cafe, name: "package.json", type: file, ...}, {id: cafe4321, name: "README", type: file, ...}, ...] + IDX_DIR_META->>STORAGE: directory_ls(sha1=456789ab) + STORAGE->>IDX_DIR_META: [{id: 1234cafe, name: "package.json", type: file, ...}, {id: cafe4321, name: "README", type: file, ...}, ...] - IDX_REV_META->>IDX_REV_META: package.json is a metadata file + IDX_DIR_META->>IDX_DIR_META: package.json is a metadata file - IDX_REV_META->>IDX_STORAGE: content_metadata_get(sha1=1234cafe) - IDX_STORAGE->>IDX_REV_META: none / {author: "Jane Doe", ...} + IDX_DIR_META->>IDX_STORAGE: content_metadata_get(sha1=1234cafe) + IDX_STORAGE->>IDX_DIR_META: none / {author: "Jane Doe", ...} alt If the storage answered "none" - IDX_REV_META->>IDX_CONT_META: Index file 1234cafe as an NPM metadata file + IDX_DIR_META->>IDX_CONT_META: Index file 1234cafe as an NPM metadata file activate IDX_CONT_META IDX_CONT_META->>OBJ_STORAGE: content_get 1234cafe @@ -60,23 +59,17 @@ IDX_CONT_META->>IDX_STORAGE: content_metadata_add(sha1=1234cafe, {author: "Jane Doe", ...}) IDX_STORAGE->>IDX_CONT_META: ok - IDX_CONT_META->>IDX_REV_META: extracted: {author: "Jane Doe", ...} + IDX_CONT_META->>IDX_DIR_META: extracted: {author: "Jane Doe", ...} deactivate IDX_CONT_META - end - IDX_REV_META->>IDX_STORAGE: revision_metadata_add(sha1=42abcdef, {author: "Jane Doe", ...}) - IDX_STORAGE->>IDX_REV_META: ok - - IDX_REV_META->>SCHEDULER: run Origin Metadata Indexer\non origin 42; the head is 42abcdef - deactivate IDX_REV_META - - SCHEDULER->>IDX_ORIG_META: Index origin 42; the head is 42abcdef - activate IDX_ORIG_META + 
IDX_DIR_META->>IDX_STORAGE: directory_metadata_add(sha1=456789ab, {author: "Jane Doe", ...}) + IDX_STORAGE->>IDX_DIR_META: ok + end - IDX_ORIG_META->>IDX_STORAGE: revision_metadata_get(sha1=42abcdef) - IDX_STORAGE->>IDX_ORIG_META: {author: "Jane Doe", ...} + IDX_DIR_META->>IDX_ORIG_META: extracted: {author: "Jane Doe", ...} + deactivate IDX_DIR_META - IDX_ORIG_META->>IDX_STORAGE: origin_metadata_add(id=42, {author: "Jane Doe", ...}) + IDX_ORIG_META->>IDX_STORAGE: origin_metadata_add(id="http://example.org/repo.git", {author: "Jane Doe", ...}, from_directory=456789ab) IDX_STORAGE->>IDX_ORIG_META: ok deactivate IDX_ORIG_META diff --git a/docs/metadata-workflow.rst b/docs/metadata-workflow.rst --- a/docs/metadata-workflow.rst +++ b/docs/metadata-workflow.rst @@ -30,14 +30,14 @@ (the "Head"). Intrinsic metadata for that origin will be extracted from that revision. -It schedules a Revision Metadata Indexer task for that revision, with a -hint that the revision is the Head of that particular origin. +It schedules a Directory Metadata Indexer task for the root directory of +that revision. -Revision and Content Metadata Indexers -______________________________________ +Directory and Content Metadata Indexers +_______________________________________ -These two indexers do the hard part of the work. The Revision Metadata +These two indexers do the hard part of the work. The Directory Metadata Indexer fetches the root directory associated with a revision, then extracts the metadata from that directory. @@ -48,24 +48,21 @@ See below for details. Their results are saved in a database (the indexer storage), associated with -the content and revision hashes. - -If it received a hint that this revision is the head of an origin, the -Revision Metadata Indexer then schedules the Origin Metadata Indexer -to run on that origin. +the content and directory hashes. 
Origin Metadata Indexer _______________________ The job of this indexer is very simple: it takes an origin identifier and -a revision hash, and copies the metadata of the former to a new table, to -associate it with the latter. +uses the Origin-Head and Directory indexers to get metadata from the head +directory of an origin, and copies that directory's metadata to a new table, +to associate it with the origin. The reason for this is to be able to perform searches on metadata, and efficiently find out which origins matched the pattern. -Running that search on the ``revision_metadata`` table would require either -a reverse lookup from revisions to origins, which is costly. +Running that search on the ``directory_metadata`` table would require +a reverse lookup from directories to origins, which is costly. Translation from language-specific metadata to CodeMeta