diff --git a/sql/upgrades/137.sql b/sql/upgrades/137.sql new file mode 100644 --- /dev/null +++ b/sql/upgrades/137.sql @@ -0,0 +1,159 @@ +-- SWH DB schema upgrade +-- from_version: 136 +-- to_version: 137 +-- description: Add comment columns to all tables + +insert into dbversion(version, release, description) + values(137, now(), 'Work In Progress'); + +-- comment for columns of dbversion table +comment on table dbversion is 'Details of current db version'; +comment on column dbversion.version is 'Sql schema version'; +comment on column dbversion.release is 'Version deployment timestamp'; +comment on column dbversion.description is 'Release description'; + +-- comment for columns of content table +comment on table content is 'Checksums of file content which is actually stored externally'; +comment on column content.sha1 is 'Content sha1 hash'; +comment on column content.sha1_git is 'Git object sha1 hash'; +comment on column content.sha256 is 'Content Sha256 hash'; +comment on column content.blake2s256 is 'Content blake2s hash'; +comment on column content.length is 'Content length'; +comment on column content.ctime is 'First seen time'; +comment on column content.status is 'Content status (absent, visible, hidden)'; +comment on column content.object_id is 'Content identifier'; + +-- comment for columns of skipped_content +comment on table skipped_content is 'Content blobs observed but not ingested in + Software heritage archive'; +comment on column skipped_content.sha1 is 'Skipped content sha1 hash'; +comment on column skipped_content.sha1_git is 'Git object sha1 hash'; +comment on column skipped_content.sha256 is 'Skipped content sha256 hash'; +comment on column skipped_content.blake2s256 is 'Skipped content blake2s hash'; +comment on column skipped_content.length is 'Skipped content length'; +comment on column skipped_content.ctime is 'First seen time'; +comment on column skipped_content.status is 'Skipped content status (absent, visible, hidden)'; +comment on column skipped_content.reason is 'Reason for skipping'; +comment on column skipped_content.origin is 'Origin table identifier'; +comment on column skipped_content.object_id is 'Skipped content identifier'; + +-- comment for columns of fetch_history +comment on table fetch_history is 'Log of all origin fetches'; +comment on column fetch_history.id is 'Identifier for fetch history'; +comment on column fetch_history.origin is 'Origin table identifier'; +comment on column fetch_history.date is 'Fetch start time'; +comment on column fetch_history.status is 'True indicates successful fetch'; +comment on column fetch_history.result is 'Detailed return values, times etc'; +comment on column fetch_history.stdout is 'Fetch operation output'; +comment on column fetch_history.stderr is 'Fetch operation error'; +comment on column fetch_history.duration is 'Time taken to complete fetch, NULL if ongoing'; + +-- comment for columns of directory +comment on table directory is 'Contents of a directory, synonymous to tree (git)'; +comment on column directory.id is 'Git object sha1 hash'; +comment on column directory.dir_entries is 'Sub-directories, reference directory_entry_dir'; +comment on column directory.file_entries is 'Contained files, reference directory_entry_file'; +comment on column directory.rev_entries is 'Mounted revisions, reference directory_entry_rev'; +comment on column directory.object_id is 'Short object identifier'; + +-- comment for columns of directory_entry_dir +comment on table directory_entry_dir is 'Directory entry for directory'; +comment on column directory_entry_dir.id is 'Directory identifier'; +comment on column directory_entry_dir.target is 'Target directory identifier'; +comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; +comment on column directory_entry_dir.perms is 'Unix-like permissions'; + +-- comment for columns of directory_entry_file +comment on table directory_entry_file is 'Directory entry for file'; +comment on column directory_entry_file.id is 'File identifier'; +comment on column directory_entry_file.target is 'Target file identifier'; +comment on column directory_entry_file.name is 'Path name, relative to containing directory'; +comment on column directory_entry_file.perms is 'Unix-like permissions'; + +-- comment for columns of directory_entry_rev +comment on table directory_entry_rev is 'Directory entry for revision'; +comment on column directory_entry_dir.id is 'Revision identifier'; +comment on column directory_entry_dir.target is 'Target revision in identifier'; +comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; +comment on column directory_entry_dir.perms is 'Unix-like permissions'; + +-- comment for columns of person +comment on table person is 'Person referenced in code artifact release metadata'; +comment on column person.id is 'Person identifier'; +comment on column person.name is 'Name'; +comment on column person.email is 'Email'; +comment on column person.fullname is 'Full name (raw name)'; + +-- comment for columns of revision +comment on table revision is 'Revision represents the state of a source code tree at a + specific point in time'; +comment on column revision.id is 'Git id of sha1 checksum'; +comment on column revision.date is 'Timestamp when revision was authored'; +comment on column revision.date_offset is 'Authored timestamp offset from UTC'; +comment on column revision.committer_date is 'Timestamp when revision was committed'; +comment on column revision.committer_date_offset is 'Committed timestamp offset from UTC'; +comment on column revision.type is 'Possible revision types (''git'', ''tar'', ..)'; +comment on column revision.directory is 'Directory identifier'; +comment on column revision.message is 'Revision message'; +comment on column revision.author is 'Author identifier'; +comment on column revision.committer is 'Committer identifier'; +comment on column revision.synthetic is 'true iff revision has been created by Software Heritage'; +comment on column revision.metadata is 'extra metadata (tarball checksums, extra commit information, etc...)'; +comment on column revision.object_id is 'Object identifier'; +comment on column revision.date_neg_utc_offset is 'True indicates -0 UTC offset for author timestamp'; +comment on column revision.committer_date_neg_utc_offset is 'True indicates -0 UTC offset for committer timestamp'; + +-- comment for columns of revision_history +comment on table revision_history is 'Sequence of revision history with parent and position in history'; +comment on column revision_history.id is 'Revision history git object sha1 checksum'; +comment on column revision_history.parent_id is 'Parent revision git object identifier'; +comment on column revision_history.parent_rank is 'Parent position in merge commits, 0-based'; + +-- comment for columns of snapshot +comment on table snapshot is 'State of a software origin as crawled by Software Heritage'; +comment on column snapshot.object_id is 'Internal object identifier'; +comment on column snapshot.id is 'Intrinsic snapshot identifier'; + +-- comment for columns of snapshot_branch +comment on table snapshot_branch is 'Associates branches with objects in Heritage Merkle DAG'; +comment on column snapshot_branch.object_id is 'Internal object identifier'; +comment on column snapshot_branch.name is 'Branch name'; +comment on column snapshot_branch.target is 'Target object identifier'; +comment on column snapshot_branch.target_type is 'Target object type'; + +-- comment for columns of snapshot_branches +comment on table snapshot_branches is 'Mapping between snapshot and their branches'; +comment on column snapshot_branches.snapshot_id is 'Snapshot identifier'; +comment on column snapshot_branches.branch_id is 'Branch identifier'; + +-- comment for columns of release +comment on table release is 'Details of a software release, synonymous with + a tag (git) or version number (tarball)'; +comment on column release.id is 'Release git identifier'; +comment on column release.target is 'Target git identifier'; +comment on column release.date is 'Release timestamp'; +comment on column release.date_offset is 'Timestamp offset from UTC'; +comment on column release.name is 'Name'; +comment on column release.comment is 'Comment'; +comment on column release.author is 'Author'; +comment on column release.synthetic is 'Indicates if created by Software Heritage'; +comment on column release.object_id is 'Object identifier'; +comment on column release.target_type is 'Object type (''content'', ''directory'', ..)'; +comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp'; + +-- comment for columns of object_counts +comment on table object_counts is 'Cache of object counts'; +comment on column object_counts.object_type is 'Object type (''content'', ''directory'', ..)'; +comment on column object_counts.value is 'Count of objects in the table'; +comment on column object_counts.last_update is 'Last update for object count'; +comment on column object_counts.single_update is 'standalone (true) or bucketed counts (false)'; + +-- comment for columns of object_counts_bucketed +comment on table object_counts_bucketed is 'Bucketed count for objects ordered by type'; +comment on column object_counts_bucketed.line is 'Auto incremented idenitfier value'; +comment on column object_counts_bucketed.object_type is 'Object type (''content'', ''directory'', ..)'; +comment on column object_counts_bucketed.identifier is 'Common identifier for bucketed objects'; +comment on column object_counts_bucketed.bucket_start is 'Lower bound (inclusive) for the bucket'; +comment on column object_counts_bucketed.bucket_end is 'Upper bound (exclusive) for the bucket'; +comment on column object_counts_bucketed.value is 'Count of objects in the bucket'; +comment on column object_counts_bucketed.last_update is 'Last update for the object count in this bucket'; diff --git a/swh/storage/sql/30-swh-schema.sql b/swh/storage/sql/30-swh-schema.sql --- a/swh/storage/sql/30-swh-schema.sql +++ b/swh/storage/sql/30-swh-schema.sql @@ -10,9 +10,14 @@ description text ); +comment on table dbversion is 'Details of current db version'; +comment on column dbversion.version is 'Sql schema version'; +comment on column dbversion.release is 'Version deployment timestamp'; +comment on column dbversion.description is 'Release description'; + -- latest schema version insert into dbversion(version, release, description) - values(133, now(), 'Work In Progress'); + values(137, now(), 'Work In Progress'); -- a SHA1 checksum create domain sha1 as bytea check (length(value) = 20); @@ -50,6 +55,16 @@ object_id bigserial ); +comment on table content is 'Checksums of file content which is actually stored externally'; +comment on column content.sha1 is 'Content sha1 hash'; +comment on column content.sha1_git is 'Git object sha1 hash'; +comment on column content.sha256 is 'Content Sha256 hash'; +comment on column content.blake2s256 is 'Content blake2s hash'; +comment on column content.length is 'Content length'; +comment on column content.ctime is 'First seen time'; +comment on column content.status is 'Content status (absent, visible, hidden)'; +comment on column content.object_id is 'Content identifier'; + -- An origin is a place, identified by an URL, where software source code -- artifacts can be found. We support different kinds of origins, e.g., git and @@ -65,6 +80,10 @@ url text not null ); +comment on column origin.id is 'Artifact origin id'; +comment on column origin.type is 'Type of origin'; +comment on column origin.url is 'Url of origin'; + -- Content blobs observed somewhere, but not ingested into the archive for -- whatever reason. This table is separate from the content table as we might @@ -87,6 +106,19 @@ object_id bigserial ); +comment on table skipped_content is 'Content blobs observed but not ingested in + Software heritage archive'; +comment on column skipped_content.sha1 is 'Skipped content sha1 hash'; +comment on column skipped_content.sha1_git is 'Git object sha1 hash'; +comment on column skipped_content.sha256 is 'Skipped content sha256 hash'; +comment on column skipped_content.blake2s256 is 'Skipped content blake2s hash'; +comment on column skipped_content.length is 'Skipped content length'; +comment on column skipped_content.ctime is 'First seen time'; +comment on column skipped_content.status is 'Skipped content status (absent, visible, hidden)'; +comment on column skipped_content.reason is 'Reason for skipping'; +comment on column skipped_content.origin is 'Origin table identifier'; +comment on column skipped_content.object_id is 'Skipped content identifier'; + -- Log of all origin fetches (i.e., origin crawling) that have been done in the -- past, or are still ongoing. Similar to list_history, but for origins. @@ -102,6 +134,16 @@ duration interval -- fetch duration of NULL if still ongoing ); +comment on table fetch_history is 'Log of all origin fetches'; +comment on column fetch_history.id is 'Identifier for fetch history'; +comment on column fetch_history.origin is 'Origin table identifier'; +comment on column fetch_history.date is 'Fetch start time'; +comment on column fetch_history.status is 'True indicates successful fetch'; +comment on column fetch_history.result is 'Detailed return values, times etc'; +comment on column fetch_history.stdout is 'Fetch operation output'; +comment on column fetch_history.stderr is 'Fetch operation error'; +comment on column fetch_history.duration is 'Time taken to complete fetch, NULL if ongoing'; + -- A file-system directory. A directory is a list of directory entries (see -- tables: directory_entry_{dir,file}). @@ -123,6 +165,14 @@ object_id bigserial -- short object identifier ); +comment on table directory is 'Contents of a directory, synonymous to tree (git)'; +comment on column directory.id is 'Git object sha1 hash'; +comment on column directory.dir_entries is 'Sub-directories, reference directory_entry_dir'; +comment on column directory.file_entries is 'Contained files, reference directory_entry_file'; +comment on column directory.rev_entries is 'Mounted revisions, reference directory_entry_rev'; +comment on column directory.object_id is 'Short object identifier'; + + -- A directory entry pointing to a (sub-)directory. create table directory_entry_dir ( @@ -132,6 +182,13 @@ perms file_perms -- unix-like permissions ); +comment on table directory_entry_dir is 'Directory entry for directory'; +comment on column directory_entry_dir.id is 'Directory identifier'; +comment on column directory_entry_dir.target is 'Target directory identifier'; +comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; +comment on column directory_entry_dir.perms is 'Unix-like permissions'; + + -- A directory entry pointing to a file content. create table directory_entry_file ( @@ -141,6 +198,13 @@ perms file_perms -- unix-like permissions ); +comment on table directory_entry_file is 'Directory entry for file'; +comment on column directory_entry_file.id is 'File identifier'; +comment on column directory_entry_file.target is 'Target file identifier'; +comment on column directory_entry_file.name is 'Path name, relative to containing directory'; +comment on column directory_entry_file.perms is 'Unix-like permissions'; + + -- A directory entry pointing to a revision. create table directory_entry_rev ( @@ -150,6 +214,12 @@ perms file_perms -- unix-like permissions ); +comment on table directory_entry_rev is 'Directory entry for revision'; +comment on column directory_entry_dir.id is 'Revision identifier'; +comment on column directory_entry_dir.target is 'Target revision in identifier'; +comment on column directory_entry_dir.name is 'Path name, relative to containing directory'; +comment on column directory_entry_dir.perms is 'Unix-like permissions'; + -- A person referenced by some source code artifacts, e.g., a VCS revision or -- release metadata. @@ -162,6 +232,12 @@ -- will usually be of the form 'name ' ); +comment on table person is 'Person referenced in code artifact release metadata'; +comment on column person.id is 'Person identifier'; +comment on column person.name is 'Name'; +comment on column person.email is 'Email'; +comment on column person.fullname is 'Full name (raw name)'; + -- The state of a source code tree at a specific point in time. -- @@ -180,7 +256,7 @@ committer_date timestamptz, committer_date_offset smallint, type revision_type not null, - directory sha1_git, -- source code "root" directory + directory sha1_git, -- source code 'root' directory message bytea, author bigint, committer bigint, @@ -191,6 +267,25 @@ committer_date_neg_utc_offset boolean ); +comment on table revision is 'Revision represents the state of a source code tree at a + specific point in time'; +comment on column revision.id is 'Git id of sha1 checksum'; +comment on column revision.date is 'Timestamp when revision was authored'; +comment on column revision.date_offset is 'Authored timestamp offset from UTC'; +comment on column revision.committer_date is 'Timestamp when revision was committed'; +comment on column revision.committer_date_offset is 'Committed timestamp offset from UTC'; +comment on column revision.type is 'Possible revision types (''git'', ''tar'', ..)'; +comment on column revision.directory is 'Directory identifier'; +comment on column revision.message is 'Revision message'; +comment on column revision.author is 'Author identifier'; +comment on column revision.committer is 'Committer identifier'; +comment on column revision.synthetic is 'true iff revision has been created by Software Heritage'; +comment on column revision.metadata is 'extra metadata (tarball checksums, extra commit information, etc...)'; +comment on column revision.object_id is 'Object identifier'; +comment on column revision.date_neg_utc_offset is 'True indicates -0 UTC offset for author timestamp'; +comment on column revision.committer_date_neg_utc_offset is 'True indicates -0 UTC offset for committer timestamp'; + + -- either this table or the sha1_git[] column on the revision table create table revision_history ( @@ -200,6 +295,11 @@ -- parent position in merge commits, 0-based ); +comment on table revision_history is 'Sequence of revision history with parent and position in history'; +comment on column revision_history.id is 'Revision history git object sha1 checksum'; +comment on column revision_history.parent_id is 'Parent revision git object identifier'; +comment on column revision_history.parent_rank is 'Parent position in merge commits, 0-based'; + -- Crawling history of software origins visited by Software Heritage. Each -- visit is a 3-way mapping between a software origin, a timestamp, and a @@ -233,6 +333,11 @@ id sha1_git not null -- snapshot intrinsic identifier ); +comment on table snapshot is 'State of a software origin as crawled by Software Heritage'; +comment on column snapshot.object_id is 'Internal object identifier'; +comment on column snapshot.id is 'Intrinsic snapshot identifier'; + + -- Each snapshot associate "branch" names to other objects in the Software -- Heritage Merkle DAG. This table describes branches as mappings between names -- and target typed objects. @@ -244,6 +349,13 @@ target_type snapshot_target -- target object type, e.g., "revision" ); +comment on table snapshot_branch is 'Associates branches with objects in Heritage Merkle DAG'; +comment on column snapshot_branch.object_id is 'Internal object identifier'; +comment on column snapshot_branch.name is 'Branch name'; +comment on column snapshot_branch.target is 'Target object identifier'; +comment on column snapshot_branch.target_type is 'Target object type'; + + -- Mapping between snapshots and their branches. create table snapshot_branches ( @@ -251,6 +363,10 @@ branch_id bigint not null -- branch identifier, ref. snapshot_branch.object_id ); +comment on table snapshot_branches is 'Mapping between snapshot and their branches'; +comment on column snapshot_branches.snapshot_id is 'Snapshot identifier'; +comment on column snapshot_branches.branch_id is 'Branch identifier'; + -- A "memorable" point in time in the development history of a software -- project. @@ -273,6 +389,19 @@ date_neg_utc_offset boolean ); +comment on table release is 'Details of a software release, synonymous with + a tag (git) or version number (tarball)'; +comment on column release.id is 'Release git identifier'; +comment on column release.target is 'Target git identifier'; +comment on column release.date is 'Release timestamp'; +comment on column release.date_offset is 'Timestamp offset from UTC'; +comment on column release.name is 'Name'; +comment on column release.comment is 'Comment'; +comment on column release.author is 'Author'; +comment on column release.synthetic is 'Indicates if created by Software Heritage'; +comment on column release.object_id is 'Object identifier'; +comment on column release.target_type is 'Object type (''content'', ''directory'', ..)'; +comment on column release.date_neg_utc_offset is 'True indicates -0 UTC offset for release timestamp'; -- Tools create table tool @@ -336,6 +465,13 @@ single_update boolean -- whether we update this table standalone (true) or through bucketed counts (false) ); +comment on table object_counts is 'Cache of object counts'; +comment on column object_counts.object_type is 'Object type (''content'', ''directory'', ..)'; +comment on column object_counts.value is 'Count of objects in the table'; +comment on column object_counts.last_update is 'Last update for object count'; +comment on column object_counts.single_update is 'standalone (true) or bucketed counts (false)'; + + create table object_counts_bucketed ( line serial not null, -- PK @@ -346,3 +482,13 @@ value bigint, -- count of objects in the bucket last_update timestamptz -- last update for the object count in this bucket ); + +comment on table object_counts_bucketed is 'Bucketed count for objects ordered by type'; +comment on column object_counts_bucketed.line is 'Auto incremented idenitfier value'; +comment on column object_counts_bucketed.object_type is 'Object type (''content'', ''directory'', ..)'; +comment on column object_counts_bucketed.identifier is 'Common identifier for bucketed objects'; +comment on column object_counts_bucketed.bucket_start is 'Lower bound (inclusive) for the bucket'; +comment on column object_counts_bucketed.bucket_end is 'Upper bound (exclusive) for the bucket'; +comment on column object_counts_bucketed.value is 'Count of objects in the bucket'; +comment on column object_counts_bucketed.last_update is 'Last update for the object count in this bucket'; +