#+title: swh-git-loader - Specification (draft) #+author: swh team #+source: https://intranet.softwareheritage.org/index.php/Swh_git_loader The Software Heritage Git Loader is a tool and a library to walk a local Git repository and inject into the SWH dataset all contained files that weren't known before. * License This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. See the top-level LICENSE file for the full text of the GNU General Public License distributed along with this program. * Dependencies ** Runtime - python3 - python3-psycopg2 - python3-pygit2 ** Test - python3-nose * Requirements ** Functional - input: a Git bare repository available locally, on the filesystem - input (optional): a table mapping the SHA256 of individual files to the paths on the filesystem that contain the corresponding content (AKA, the file cache) - input (optional): a set of SHA1 of Git commits that have already been seen in the past (AKA, the Git commit cache) - output: an augmented SWH dataset, where all files present in all blobs referenced by any Git object have been added *** algo Sketch of the (naive) algorithm that the Git loader should execute #+begin_src pseudo for each ref in the repo for each commit referenced by the commit graph starting at that ref if we have a git commit cache and the commit is in there: stop treating the current commit sub-graph for each tree referenced by the commit for each blob referenced by the tree compute the SHA256 checksum of the blob lookup the checksum in the file cache if it is not there add the file to the dataset on the filesystem add the file to the 
file cache, pointing to the file path on the filesystem #+end_src ** Non-functional - implementation language, Python3 - coding guidelines: conform to PEP8 - Git access: via libgit2/pygit2 - cache: implemented as Postgres tables ** File-system storage Given a file with SHA256 of b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c It will be stored at STORAGE_ROOT/b5/bb/9d/80/14a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c * Configuration Create a configuration file in *~/.config/sgloader.ini*: #+begin_src ini [main] file_content_storage_dir = swh-git-loader/file-content-storage object_content_storage_dir = swh-git-loader/object-content-storage log_dir = swh-git-loader/log # http://initd.org/psycopg/docs/module.html#psycopg2.connect db_url = dbname=swhgitloader #+end_src * Run ** environment initialization #+begin_src sh export PYTHONPATH=`pwd`:$PYTHONPATH #+end_src ** Help #+begin_src sh bin/sgloader --help #+end_src ** Parse a repository from a clean slate Clean and initialize the model, then parse the Git repository: #+begin_src sh bin/sgloader cleandb bin/sgloader initdb bin/sgloader load /path/to/git/repo #+end_src For ease: #+begin_src sh make clean-and-run REPO_PATH=/path/to/git/repo #+end_src ** Parse an existing repository #+begin_src sh bin/sgloader load /path/to/git/repo #+end_src ** Clean data #+begin_src sh bin/sgloader cleandb #+end_src For ease: #+begin_src sh make cleandb #+end_src ** Init data #+begin_src sh bin/sgloader initdb #+end_src * IN-PROGRESS Improvements [11/16] - [X] Push on remote git repository - [X] Serialize blob's data and not blob's size. - [X] Logging in python? How to see the log? 
- [X] Replace sqlalchemy dao layer with psycopg2 - [X] Improve sgloader cli interface - [X] Serialize sha256 as bytes - [X] Update README.org - [X] Serialize sha1 as bytes - [X] Use sha1 instead of sha256 for file cache - [X] Use postgresql's bytea column for sha1 - [X] Improve git object dispatch (look up on repo object only if necessary) - [ ] Store git object on disk too - [ ] Make the compression for the file storage optional - [ ] Make the compression for the git object storage optional - [ ] Add computation folder with depth parametrized - [ ] Add functional test which adds new commits * Performance This is not a perf test per se. These are runs on a given machine. ** Spec cat /proc/cpuinfo: #+begin_src sh processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 61 model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz stepping : 4 microcode : 0x16 cpu MHz : 3100.195 cache size : 4096 KB physical id : 0 siblings : 4 core id : 0 cpu cores : 2 apicid : 0 initial apicid : 0 fpu : yes fpu_exception : yes cpuid level : 20 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap bogomips : 5187.99 clflush size : 64 cache_alignment : 64 address sizes : 39 bits physical, 48 bits virtual power management: processor : 1 vendor_id : GenuineIntel cpu family : 6 model : 61 model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz stepping : 4 microcode : 0x16 cpu MHz : 3099.992 cache size : 4096 KB physical id : 0 siblings : 4 core id : 0 cpu cores 
: 2 apicid : 1 initial apicid : 1 fpu : yes fpu_exception : yes cpuid level : 20 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap bogomips : 5187.99 clflush size : 64 cache_alignment : 64 address sizes : 39 bits physical, 48 bits virtual power management: processor : 2 vendor_id : GenuineIntel cpu family : 6 model : 61 model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz stepping : 4 microcode : 0x16 cpu MHz : 3099.992 cache size : 4096 KB physical id : 0 siblings : 4 core id : 1 cpu cores : 2 apicid : 2 initial apicid : 2 fpu : yes fpu_exception : yes cpuid level : 20 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap bogomips : 5187.99 clflush size : 64 cache_alignment : 64 address sizes : 39 bits physical, 48 bits virtual power management: processor : 3 vendor_id : GenuineIntel cpu family : 6 model : 61 model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz stepping : 4 microcode : 0x16 cpu 
MHz : 3100.093 cache size : 4096 KB physical id : 0 siblings : 4 core id : 1 cpu cores : 2 apicid : 3 initial apicid : 3 fpu : yes fpu_exception : yes cpuid level : 20 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap bogomips : 5187.99 clflush size : 64 cache_alignment : 64 address sizes : 39 bits physical, 48 bits virtual power management: #+end_src ** Expected results Given a specific repository https://github.com/ardumont/dot-files.git Here is the expected result for each run (as per comparison purposes): #+begin_src sh swhgitloader=> select count(*) from object_cache where type = 0; -- commit count ------- 1744 (1 row) swhgitloader=> select count(*) from object_cache where type = 1; -- tree count ------- 2839 (1 row) swhgitloader=> select count(*) from file_cache; count ------- 2958 (1 row) #+end_src ** sqlalchemy ORM framework. #+begin_src sh # tony at corellia in ~/work/inria/repo/swh-git-loader on git:master o [10:35:08] $ time make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files rm -rf ./log rm -rf ./dataset/ mkdir -p log dataset bin/sgloader -v cleandb bin/sgloader -v initdb bin/sgloader -v load ~/repo/perso/dot-files make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files 161.05s user 10.82s system 76% cpu 3:46.01 total #+end_src ** psycopg2 A simple db client. 
First implementation, with one open/close for each db access: #+begin_src sh # tony at corellia in ~/work/inria/repo/swh-git-loader on git:master x [17:38:56] $ time make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files rm -rf ./log rm -rf ./dataset/ mkdir -p log dataset bin/sgloader -v cleandb bin/sgloader -v initdb bin/sgloader -v load ~/repo/perso/dot-files make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files 85.82s user 23.53s system 19% cpu 9:16.00 total #+end_src With one opened connection during all the computation: #+begin_src sh # tony at corellia in ~/work/inria/repo/swh-git-loader on git:psycopg2-tryout x [18:02:27] $ time make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files rm -rf ./log rm -rf ./dataset/ mkdir -p log dataset bin/sgloader -v cleandb bin/sgloader -v initdb bin/sgloader -v load ~/repo/perso/dot-files make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files 39.45s user 8.02s system 50% cpu 1:34.08 total #+end_src Sanitize the algorithm (remove unneeded check, use the file cache, ...) 
: #+begin_src sh # tony at corellia in ~/work/inria/repo/swh-git-loader on git:psycopg2-tryout x [10:42:03] $ time make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files rm -rf ./log rm -rf ./dataset/ mkdir -p log dataset bin/sgloader -v cleandb bin/sgloader -v initdb bin/sgloader -v load ~/repo/perso/dot-files make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files 15.90s user 2.08s system 31% cpu 56.879 total #+end_src No need for byte decoding before serializing on disk: #+begin_src sh # tony at corellia in ~/work/inria/repo/swh-git-loader on git:master x [12:36:10] $ time make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files rm -rf ./log rm -rf ./dataset/ mkdir -p log dataset bin/sgloader -v cleandb bin/sgloader -v initdb bin/sgloader -v load ~/repo/perso/dot-files make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files 14.67s user 1.64s system 30% cpu 54.303 total #+end_src ** Sample |--------+----------------------------------------------| | repo | url | |--------+----------------------------------------------| | linux | https://github.com/torvalds/linux.git | | gcc | https://gcc.gnu.org/git/?p=gcc.git;a=summary | | pygit2 | https://github.com/libgit2/pygit2.git | |--------+----------------------------------------------| * Filemode investigation git - https://github.com/git/git/blob/398dd4bd039680ba98497fbedffa415a43583c16/vcs-svn/repo_tree.h#L6-L9: #+begin_src c #define REPO_MODE_DIR 0040000 #define REPO_MODE_BLB 0100644 #define REPO_MODE_EXE 0100755 #define REPO_MODE_LNK 0120000 #+end_src pygit2 - https://github.com/libgit2/pygit2/blob/d63c2d4fd7e45d99364b4d2ccc6a4dafc9b51705/src/pygit2.c#L211-L221: #+begin_src c ADD_CONSTANT_INT(m, GIT_OBJ_ANY) ADD_CONSTANT_INT(m, GIT_OBJ_COMMIT) ADD_CONSTANT_INT(m, GIT_OBJ_TREE) ADD_CONSTANT_INT(m, GIT_OBJ_BLOB) ADD_CONSTANT_INT(m, GIT_OBJ_TAG) /* Valid modes for index and tree entries. 
*/ ADD_CONSTANT_INT(m, GIT_FILEMODE_TREE) ADD_CONSTANT_INT(m, GIT_FILEMODE_BLOB) ADD_CONSTANT_INT(m, GIT_FILEMODE_BLOB_EXECUTABLE) ADD_CONSTANT_INT(m, GIT_FILEMODE_LINK) ADD_CONSTANT_INT(m, GIT_FILEMODE_COMMIT) #+end_src pygit2 - https://github.com/libgit2/pygit2/blob/c099655fc034c3be63017d0a3e112ea10928464a/src/tree.c#L52-L58: #+begin_src c PyDoc_STRVAR(TreeEntry_filemode__doc__, "Filemode."); PyObject * TreeEntry_filemode__get__(TreeEntry *self) { return PyLong_FromLong(git_tree_entry_filemode(self->entry)); } #+end_src pygit2 - https://github.com/libgit2/pygit2/blob/50a70086bfc72922b63a6e842582021a2bad0b24/src/utils.h#L49: #+begin_src c #define PyLong_FromLong PyInt_FromLong #+end_src From doc https://docs.python.org/2/c-api/int.html: #+begin_src txt PyObject* PyInt_FromLong(long ival) Return value: New reference. Create a new integer object with a value of ival. The current implementation keeps an array of integer objects for all integers between -5 and 256, when you create an int in that range you actually just get back a reference to the existing object. So it should be possible to change the value of 1. I suspect the behaviour of Python in this case is undefined. 
:-) #+end_src libgit2 - https://github.com/libgit2/libgit2/blob/623fbd93f1a7538df0c9a433df68f87bbd58b803/src/tree.c#L239-L241: #+begin_src c git_filemode_t git_tree_entry_filemode(const git_tree_entry *entry) { return normalize_filemode(entry->attr); } #+end_src libgit2 - https://github.com/libgit2/libgit2/blob/623fbd93f1a7538df0c9a433df68f87bbd58b803/src/tree.c#L31-L51: #+begin_src c GIT_INLINE(git_filemode_t) normalize_filemode(git_filemode_t filemode) { /* Tree bits set, but it's not a commit */ if (GIT_MODE_TYPE(filemode) == GIT_FILEMODE_TREE) return GIT_FILEMODE_TREE; /* If any of the x bits are set */ if (GIT_PERMS_IS_EXEC(filemode)) return GIT_FILEMODE_BLOB_EXECUTABLE; /* 16XXXX means commit */ if (GIT_MODE_TYPE(filemode) == GIT_FILEMODE_COMMIT) return GIT_FILEMODE_COMMIT; /* 12XXXX means commit */ if (GIT_MODE_TYPE(filemode) == GIT_FILEMODE_LINK) return GIT_FILEMODE_LINK; /* Otherwise, return a blob */ return GIT_FILEMODE_BLOB; } #+end_src libgit2 - https://github.com/libgit2/libgit2/blob/f85a9c2767b43f35904bf39858488a4b7bc304e8/src/common.h#L13-L18: #+begin_src c /** Declare a function as always inlined. */ #if defined(_MSC_VER) # define GIT_INLINE(type) static __inline type #else # define GIT_INLINE(type) static inline type #endif #+end_src libgit2 - https://github.com/libgit2/libgit2/blob/d24a5312d8ab6d3cdb259e450ec9f1e2e6f3399d/src/fileops.h#L243-L250: #+begin_src c #define GIT_PERMS_IS_EXEC(MODE) (((MODE) & 0111) != 0) #define GIT_PERMS_CANONICAL(MODE) (GIT_PERMS_IS_EXEC(MODE) ? 0755 : 0644) #define GIT_PERMS_FOR_WRITE(MODE) (GIT_PERMS_IS_EXEC(MODE) ? 
0777 : 0666) #define GIT_MODE_PERMS_MASK 0777 #define GIT_MODE_TYPE_MASK 0170000 #define GIT_MODE_TYPE(MODE) ((MODE) & GIT_MODE_TYPE_MASK) #define GIT_MODE_ISBLOB(MODE) (GIT_MODE_TYPE(MODE) == GIT_MODE_TYPE(GIT_FILEMODE_BLOB)) #+end_src libgit2 - https://github.com/libgit2/libgit2/blob/c5c5cdb106d012d132475d9156923857f8d302fc/include/git2/types.h#L204-L212: #+begin_src c /** Valid modes for index and tree entries. */ typedef enum { GIT_FILEMODE_UNREADABLE = 0000000, GIT_FILEMODE_TREE = 0040000, GIT_FILEMODE_BLOB = 0100644, GIT_FILEMODE_BLOB_EXECUTABLE = 0100755, GIT_FILEMODE_LINK = 0120000, GIT_FILEMODE_COMMIT = 0160000, } git_filemode_t; #+end_src