diff --git a/sql/upgrades/137.sql b/sql/upgrades/137.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/137.sql @@ -0,0 +1,158 @@ +-- SWH DB schema upgrade +-- from_version: 136 +-- to_version: 137 +-- description: Add comment columns to all tables + +insert into dbversion(version, release, description) + values(137, now(), 'Work In Progress'); + +-- comment for columns of dbversion table +comment on table dbversion is "Details of current db version"; +comment on column dbversion.version is 'Sql schema version'; +comment on column dbversion.release is 'Version release date and time'; +comment on column dbversion.description is 'Release description'; + +-- comment for columns of content table +comment on table content is "Checksums of file content which is actually stored externally"; +comment on column content.sha1 is 'Content sha1 hash'; +comment on column content.sha1_git is 'Git object sha1 hash'; +comment on column content.sha256 is 'Content Sha256 hash'; +comment on column content.blake2s256 is 'Content blake2s hash'; +comment on column content.length is 'Content length'; +comment on column content.ctime is 'First seen time'; +comment on column content.status is 'Content status (absent, visible, hidden)'; +comment on column content.object_id is 'Content identifier'; + +-- comment for columns of skipped_content +comment on table skipped_content is "Content blobs observed but not ingested in + Software heritage archive"; +comment on column skipped_content.sha1 is 'Skipped content sha1 hash'; +comment on column skipped_content.sha1_git is 'Git object sha1 hash'; +comment on column skipped_content.sha256 is 'Skipped content sha256 hash'; +comment on column skipped_content.blake2s256 is 'Skipped content blake2s hash'; +comment on column skipped_content.length is 'Skipped content length'; +comment on column skipped_content.ctime is 'First seen time'; +comment on column skipped_content.status is 'Skipped content status (absent, visible, hidden)'; +comment on column skipped_content.reason is 'Reason for skipping'; +comment on column skipped_content.origin is 'Origin table identifier'; +comment on column skipped_content.object_id is 'Skipped content identifier'; + +-- comment for columns of fetch_history +comment on table fetch_history is "Log of all origin fetches"; +comment on column fetch_history.id is 'Identifier for fetch history'; +comment on column fetch_history.origin is 'Origin table identifier'; +comment on column fetch_history.date is ''; +comment on column fetch_history.status is 'True indicates successful fetch'; +comment on column fetch_history.result is 'Detailed return values, times etc'; +comment on column fetch_history.stdout is 'Fetch operation output'; +comment on column fetch_history.stderr is 'Fetch operation error'; +comment on column fetch_history.duration is 'Time taken to complete fetch, NULL if ongoing'; + +-- comment for columns of directory +comment on table directory is "Contents of a directory, synonymous to tree (git)"; +comment on column directory.id is "Git object sha1 hash"; +comment on column directory.dir_entries is "Sub-directories, reference directory_entry_dir"; +comment on column directory.file_entries is "Contained files, reference directory_entry_file"; +comment on column directory.rev_entries is "Mounted revisions, reference directory_entry_rev"; +comment on column directory.object_id is "Short object identifier"; + +-- comment for columns of directory_entry_dir +comment on table directory_entry_dir is "Directory entry for directory"; +comment on column directory_entry_dir.id is "Directory identifier"; +comment on column directory_entry_dir.target is "Target directory identifier"; +comment on column directory_entry_dir.name is "Path name, relative to containing directory"; +comment on column directory_entry_dir.perms is "Unix-like permissions"; + +-- comment for columns of directory_entry_file +comment on table directory_entry_file is "Directory entry for file"; +comment on column directory_entry_file.id is "File identifier"; +comment on column directory_entry_file.target is "Target file identifier"; +comment on column directory_entry_file.name is "Path name, relative to containing directory"; +comment on column directory_entry_file.perms is "Unix-like permissions"; + +-- comment for columns of directory_entry_rev +comment on table directory_entry_rev is "Directory entry for revision"; +comment on column directory_entry_dir.id is "Revision identifier"; +comment on column directory_entry_dir.target is "Target revision in identifier"; +comment on column directory_entry_dir.name is "Path name, relative to containing directory"; +comment on column directory_entry_dir.perms is "Unix-like permissions"; + +-- comment for columns of person +comment on table person is "Person referenced in code artifact release metadata"; +comment on column person.id is "Person identifier"; +comment on column person.name is "Name"; +comment on column person.email is "Email"; +comment on column person.fullname is "Full name"; + +-- comment for columns of revision +comment on table revision is "" +comment on column revision.id is "Git id of sha1 checksum"; +comment on column revision.date is ""; +comment on column revision.date_offset is ""; +comment on column revision.committer_date is ""; +comment on column revision.committer_date_offset is ""; +comment on column revision.type is ""; +comment on column revision.directory is ""; +comment on column revision.message is ""; +comment on column revision.author is ""; +comment on column revision.committer is ""; +comment on column revision.synthetic is ""; +comment on column revision.metadata is ""; +comment on column revision.object_id is ""; +comment on column revision.date_neg_utc_offset is ""; +comment on column revision.committer_date_neg_utc_offset is ""; + +-- comment for columns of revision_history +comment on table revision_history is ""; +comment on column revision_history.id is "Revision history git object sha1 checksum"; +comment on column revision_history.parent_id is "Parent git object identifier"; +comment on column revision_history.parent_rank is "Parent position in merge commits, 0-based"; + +-- comment for columns of snapshot +comment on table snapshot is "State of a software origin as crawled by Software Heritage"; +comment on column snapshot.object_id is "Internal object identifier"; +comment on column snapshot.id is "Intrinsic snapshot identifier"; + +-- comment for columns of snapshot_branch +comment on table snapshot_branch is "Associates branches with objects in Heritage Marke DAG"; +comment on column snapshot_branch.object_id is "Internal object identifier"; +comment on column snapshot_branch.name is "Branch name"; +comment on column snapshot_branch.target is "Target object identifier"; +comment on column snapshot_branch.target_type is "Target object type"; + +-- comment for columns of snapshot_branches +comment on table snapshot_branches is "Mapping between snapshot and their branches"; +comment on column snapshot_branches.snapshot_id is "Snapshot identifier"; +comment on column snapshot_branches.branch_id is "Branch identifier"; + +-- comment for columns of release +comment on table release is "Details of a software release, synonymous with + a tag (git) or version number (tarball)"; +comment on column release.id is "Release git identifier"; +comment on column release.target is "Target git identifier"; +comment on column release.date is "Date"; +comment on column release.date_offset is ""; +comment on column release.name is "Name"; +comment on column release.comment is "Comment"; +comment on column release.author is "Author"; +comment on column release.synthetic is "Indicates if created by Software Heritage"; +comment on column release.object_id is "Object identifier"; +comment on column release.target_type is "Object type (''content'', ''directory'', ..)"; +comment on column release.date_neg_utc_offset is ""; + +-- comment for columns of object_counts +comment on table object_counts is "Cache of object counts"; +comment on column object_counts.object_type is "Object type (''content'', ''directory'', ..)"; +comment on column object_counts.value is "Count of objects in the table"; +comment on column object_counts.last_update is "Last update for object count"; +comment on column object_counts.single_update is "standalone (true) or bucketed counts (false)"; + +-- comment for columns of object_counts_bucketed +comment on table object_counts_bucketed is "Bucketed count for objects ordered by type"; +comment on column object_counts_bucketed.line is ""; +comment on column object_counts_bucketed.object_type is "Object type (''content'', ''directory'', ..)"; +comment on column object_counts_bucketed.identifier is "Common identifier for bucketed objects"; +comment on column object_counts_bucketed.bucket_start is "Lower bound (inclusive) for the bucket"; +comment on column object_counts_bucketed.bucket_end is "Upper bound (exclusive) for the bucket"; +comment on column object_counts_bucketed.value is "Count of objects in the bucket"; +comment on column object_counts_bucketed.last_update is "Last update for the object count in this bucket"; \ No newline at end of file diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql --- a/swh/storage/sql/30-swh-schema.sql +++ b/swh/storage/sql/30-swh-schema.sql @@ -10,9 +10,14 @@ description text ); +comment on table dbversion is "Details of current db version"; +comment on column dbversion.version is 'Sql schema version'; +comment on column dbversion.release is 'Version release date and time'; +comment on column dbversion.description is 'Release description'; + -- latest schema version insert into dbversion(version, release, description) - values(133, now(), 'Work In Progress'); + values(137, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); @@ -50,6 +55,16 @@ object_id bigserial ); +comment on table content is "Checksums of file content which is actually stored externally"; +comment on column content.sha1 is 'Content sha1 hash'; +comment on column content.sha1_git is 'Git object sha1 hash'; +comment on column content.sha256 is 'Content Sha256 hash'; +comment on column content.blake2s256 is 'Content blake2s hash'; +comment on column content.length is 'Content length'; +comment on column content.ctime is 'First seen time'; +comment on column content.status is 'Content status (absent, visible, hidden)'; +comment on column content.object_id is 'Content identifier'; + -- An origin is a place, identified by an URL, where software source code -- artifacts can be found. We support different kinds of origins, e.g., git and @@ -65,6 +80,10 @@ url text not null ); +comment on column origin.id is 'Artifact origin id'; +comment on column origin.type is 'Type of origin'; +comment on column origin.url is 'Url of origin'; + -- Content blobs observed somewhere, but not ingested into the archive for -- whatever reason. This table is separate from the content table as we might @@ -87,6 +106,19 @@ object_id bigserial ); +comment on table skipped_content is "Content blobs observed but not ingested in + Software heritage archive"; +comment on column skipped_content.sha1 is 'Skipped content sha1 hash'; +comment on column skipped_content.sha1_git is 'Git object sha1 hash'; +comment on column skipped_content.sha256 is 'Skipped content sha256 hash'; +comment on column skipped_content.blake2s256 is 'Skipped content blake2s hash'; +comment on column skipped_content.length is 'Skipped content length'; +comment on column skipped_content.ctime is 'First seen time'; +comment on column skipped_content.status is 'Skipped content status (absent, visible, hidden)'; +comment on column skipped_content.reason is 'Reason for skipping'; +comment on column skipped_content.origin is 'Origin table identifier'; +comment on column skipped_content.object_id is 'Skipped content identifier'; + -- Log of all origin fetches (i.e., origin crawling) that have been done in the -- past, or are still ongoing. Similar to list_history, but for origins. @@ -102,6 +134,16 @@ duration interval -- fetch duration of NULL if still ongoing ); +comment on table fetch_history is "Log of all origin fetches"; +comment on column fetch_history.id is 'Identifier for fetch history'; +comment on column fetch_history.origin is 'Origin table identifier'; +comment on column fetch_history.date is ''; +comment on column fetch_history.status is 'True indicates successful fetch'; +comment on column fetch_history.result is 'Detailed return values, times etc'; +comment on column fetch_history.stdout is 'Fetch operation output'; +comment on column fetch_history.stderr is 'Fetch operation error'; +comment on column fetch_history.duration is 'Time taken to complete fetch, NULL if ongoing'; + -- A file-system directory. A directory is a list of directory entries (see -- tables: directory_entry_{dir,file}). @@ -123,6 +165,14 @@ object_id bigserial -- short object identifier ); +comment on table directory is "Contents of a directory, synonymous to tree (git)"; +comment on column directory.id is "Git object sha1 hash"; +comment on column directory.dir_entries is "Sub-directories, reference directory_entry_dir"; +comment on column directory.file_entries is "Contained files, reference directory_entry_file"; +comment on column directory.rev_entries is "Mounted revisions, reference directory_entry_rev"; +comment on column directory.object_id is "Short object identifier"; + + -- A directory entry pointing to a (sub-)directory. create table directory_entry_dir ( @@ -132,6 +182,13 @@ perms file_perms -- unix-like permissions ); +comment on table directory_entry_dir is "Directory entry for directory"; +comment on column directory_entry_dir.id is "Directory identifier"; +comment on column directory_entry_dir.target is "Target directory identifier"; +comment on column directory_entry_dir.name is "Path name, relative to containing directory"; +comment on column directory_entry_dir.perms is "Unix-like permissions"; + + -- A directory entry pointing to a file content. create table directory_entry_file ( @@ -141,6 +198,13 @@ perms file_perms -- unix-like permissions ); +comment on table directory_entry_file is "Directory entry for file"; +comment on column directory_entry_file.id is "File identifier"; +comment on column directory_entry_file.target is "Target file identifier"; +comment on column directory_entry_file.name is "Path name, relative to containing directory"; +comment on column directory_entry_file.perms is "Unix-like permissions"; + + -- A directory entry pointing to a revision. create table directory_entry_rev ( @@ -150,6 +214,12 @@ perms file_perms -- unix-like permissions ); +comment on table directory_entry_rev is "Directory entry for revision"; +comment on column directory_entry_dir.id is "Revision identifier"; +comment on column directory_entry_dir.target is "Target revision in identifier"; +comment on column directory_entry_dir.name is "Path name, relative to containing directory"; +comment on column directory_entry_dir.perms is "Unix-like permissions"; + -- A person referenced by some source code artifacts, e.g., a VCS revision or -- release metadata. @@ -162,6 +232,12 @@ -- will usually be of the form 'name ' ); +comment on table person is "Person referenced in code artifact release metadata"; +comment on column person.id is "Person identifier"; +comment on column person.name is "Name"; +comment on column person.email is "Email"; +comment on column person.fullname is "Full name"; + -- The state of a source code tree at a specific point in time. -- @@ -191,6 +267,24 @@ committer_date_neg_utc_offset boolean ); +comment on table revision is "" +comment on column revision.id is "Git id of sha1 checksum"; +comment on column revision.date is ""; +comment on column revision.date_offset is ""; +comment on column revision.committer_date is ""; +comment on column revision.committer_date_offset is ""; +comment on column revision.type is ""; +comment on column revision.directory is ""; +comment on column revision.message is ""; +comment on column revision.author is ""; +comment on column revision.committer is ""; +comment on column revision.synthetic is ""; +comment on column revision.metadata is ""; +comment on column revision.object_id is ""; +comment on column revision.date_neg_utc_offset is ""; +comment on column revision.committer_date_neg_utc_offset is ""; + + -- either this table or the sha1_git[] column on the revision table create table revision_history ( @@ -200,6 +294,11 @@ -- parent position in merge commits, 0-based ); +comment on table revision_history is ""; +comment on column revision_history.id is "Revision history git object sha1 checksum"; +comment on column revision_history.parent_id is "Parent git object identifier"; +comment on column revision_history.parent_rank is "Parent position in merge commits, 0-based"; + -- Crawling history of software origins visited by Software Heritage. Each -- visit is a 3-way mapping between a software origin, a timestamp, and a @@ -233,6 +332,11 @@ id sha1_git not null -- snapshot intrinsic identifier ); +comment on table snapshot is "State of a software origin as crawled by Software Heritage"; +comment on column snapshot.object_id is "Internal object identifier"; +comment on column snapshot.id is "Intrinsic snapshot identifier"; + + -- Each snapshot associate "branch" names to other objects in the Software -- Heritage Merkle DAG. This table describes branches as mappings between names -- and target typed objects. @@ -244,6 +348,13 @@ target_type snapshot_target -- target object type, e.g., "revision" ); +comment on table snapshot_branch is "Associates branches with objects in Heritage Marke DAG"; +comment on column snapshot_branch.object_id is "Internal object identifier"; +comment on column snapshot_branch.name is "Branch name"; +comment on column snapshot_branch.target is "Target object identifier"; +comment on column snapshot_branch.target_type is "Target object type"; + + -- Mapping between snapshots and their branches. create table snapshot_branches ( @@ -251,6 +362,10 @@ branch_id bigint not null -- branch identifier, ref. snapshot_branch.object_id ); +comment on table snapshot_branches is "Mapping between snapshot and their branches"; +comment on column snapshot_branches.snapshot_id is "Snapshot identifier"; +comment on column snapshot_branches.branch_id is "Branch identifier"; + -- A "memorable" point in time in the development history of a software -- project. @@ -273,6 +388,20 @@ date_neg_utc_offset boolean ); +comment on table release is "Details of a software release, synonymous with + a tag (git) or version number (tarball)"; +comment on column release.id is "Release git identifier"; +comment on column release.target is "Target git identifier"; +comment on column release.date is "Date"; +comment on column release.date_offset is ""; +comment on column release.name is "Name"; +comment on column release.comment is "Comment"; +comment on column release.author is "Author"; +comment on column release.synthetic is "Indicates if created by Software Heritage"; +comment on column release.object_id is "Object identifier"; +comment on column release.target_type is "Object type (''content'', ''directory'', ..)"; +comment on column release.date_neg_utc_offset is ""; + -- Tools create table tool @@ -336,6 +465,13 @@ single_update boolean -- whether we update this table standalone (true) or through bucketed counts (false) ); +comment on table object_counts is "Cache of object counts"; +comment on column object_counts.object_type is "Object type (''content'', ''directory'', ..)"; +comment on column object_counts.value is "Count of objects in the table"; +comment on column object_counts.last_update is "Last update for object count"; +comment on column object_counts.single_update is "standalone (true) or bucketed counts (false)"; + + create table object_counts_bucketed ( line serial not null, -- PK @@ -346,3 +482,13 @@ value bigint, -- count of objects in the bucket last_update timestamptz -- last update for the object count in this bucket ); + +comment on table object_counts_bucketed is "Bucketed count for objects ordered by type"; +comment on column object_counts_bucketed.line is ""; +comment on column object_counts_bucketed.object_type is "Object type (''content'', ''directory'', ..)"; +comment on column object_counts_bucketed.identifier is "Common identifier for bucketed objects"; +comment on column object_counts_bucketed.bucket_start is "Lower bound (inclusive) for the bucket"; +comment on column object_counts_bucketed.bucket_end is "Upper bound (exclusive) for the bucket"; +comment on column object_counts_bucketed.value is "Count of objects in the bucket"; +comment on column object_counts_bucketed.last_update is "Last update for the object count in this bucket"; +