Page MenuHomeSoftware Heritage

No OneTemporary

diff --git a/Makefile b/Makefile
index 487cda3..f9e136e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,65 +1,68 @@
# Linter run by the `check` target.
FLAKE = flake8
# Folder holding the sgloader executable.
BINDIR = bin
# Folder whose *.py files are linted by `check`.
# NOTE(review): the python code is imported from swh.gitloader elsewhere in
# this change - confirm this folder still exists.
SRCDIR = sgloader
# Default repository handed to sgloader; override with `make ... REPO_PATH=...`.
REPO_PATH=$(HOME)/work/inria/repo/org-beamer-swh
# Extra flags placed right after the sgloader executable name.
# add -v for example
FLAG=
# Test runner and its options, used by the `test` target on TESTDIR.
NOSE = nosetests3
TESTFLAGS = -s
TESTDIR = ./tests
# Database names used by the *-db and test-*-db targets.
DB=swhgitloader2
DB_TEST=swhgitloader-test
# Install the system packages through apt-get (needs sudo).
deps:
sudo apt-get install -y python3 python3-pygit2 python3-psycopg2 python3-nose ipython3
# Create the log and storage folders under /tmp/swh-git-loader.
prepare:
mkdir -p /tmp/swh-git-loader/log /tmp/swh-git-loader/file-content-storage /tmp/swh-git-loader/object-object-storage
# Remove the folders created by `prepare`.
clean:
rm -rf /tmp/swh-git-loader/log /tmp/swh-git-loader/file-content-storage /tmp/swh-git-loader/object-object-storage
# Print the sgloader command line help (after resetting the /tmp folders).
help: clean prepare
PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) -h
cleandb: clean prepare
- PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) --actions cleandb
+ PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) cleandb
initdb: clean prepare
- PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) --actions initdb
+ PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) initdb
run: clean prepare
- PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) --actions initdb --load-repo $(REPO_PATH)
+ PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) initdb
+ PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) load $(REPO_PATH)
# Start from a clean slate: clean the DB first, then (re)initialize it and
# only then load the repository.  This keeps the order of the former
# `--actions cleandb,initdb` invocation.
clean-and-run: clean prepare
- PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) --actions cleandb,initdb --load-repo $(REPO_PATH)
+ PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) cleandb
+ PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) initdb
+ PYTHONPATH=`pwd` $(BINDIR)/sgloader $(FLAG) load $(REPO_PATH)
# Lint the sgloader executable and the python files of SRCDIR.
check:
$(FLAKE) $(BINDIR)/sgloader $(SRCDIR)/*.py
# Run profile-sgloader.py under cProfile.
# NOTE(review): when the script is absent the `[ -f ... ]` test fails, so the
# recipe exits non-zero - confirm this is intended.
profile:
[ -f profile-sgloader.py ] && python3 -m cProfile profile-sgloader.py
# Run the test suite found in TESTDIR.
test:
$(NOSE) $(TESTFLAGS) $(TESTDIR)
# Open a psql session on the test database.
test-connect-db:
psql -d $(DB_TEST)
# Drop the test database (as the postgres system user).
test-drop-db:
sudo su -l postgres -c "dropdb $(DB_TEST)"
# Create the test database, owned by the current user.
test-create-db:
sudo su -l postgres -c "createdb -O $(USER) $(DB_TEST)"
# Open a psql session on the main database.
connect-db:
psql -d $(DB)
# Drop the main database (as the postgres system user).
drop-db:
sudo su -l postgres -c "dropdb $(DB)"
# Create the main database, owned by the current user.
create-db:
sudo su -l postgres -c "createdb -O $(USER) $(DB)"
diff --git a/README.org b/README.org
index 0e9442b..3c6bc67 100644
--- a/README.org
+++ b/README.org
@@ -1,480 +1,487 @@
#+title: swh-git-loader - Specification (draft)
#+author: swh team
#+source: https://intranet.softwareheritage.org/index.php/Swh_git_loader
The Software Heritage Git Loader is a tool and a library to walk a local Git repository and inject into the SWH dataset all contained files that weren't known before.
* License
This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.
See top-level LICENSE file for the full text of the GNU General Public License
along with this program.
* Dependencies
** Runtime
- python3
- python3-psycopg2
- python3-pygit2
** Test
- python3-nose
* Requirements
** Functional
- input: a Git bare repository available locally, on the filesystem
- input (optional): a table mapping SHA256 of individual files to path on the filesystem that contain the corresponding content (AKA, the file cache)
- input (optional): a set of SHA1 of Git commits that have already been seen in the past (AKA, the Git commit cache)
- output: an augmented SWH dataset, where all files present in all blobs referenced by any Git object, have been added
*** algo
Sketch of the (naive) algorithm that the Git loader should execute
#+begin_src pseudo
for each ref in the repo
for each commit referenced by the commit graph starting at that ref
if we have a git commit cache and the commit is in there: stop treating the current commit sub-graph
for each tree referenced by the commit
for each blob referenced by the tree
compute the SHA256 checksum of the blob
lookup the checksum in the file cache
if it is not there
add the file to the dataset on the filesystem
add the file to the file cache, pointing to the file path on the filesystem
#+end_src
** Non-functional
- implementation language, Python3
- coding guidelines: conform to PEP8
- Git access: via libgit2/pygit
- cache: implemented as Postgres tables
** File-system storage
Given a file with SHA256 of b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c
It will be stored at STORAGE_ROOT/b5/bb/9d/80/14a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c
* Configuration
Create a configuration file in *~/.config/sgloader.ini*:
#+begin_src ini
[main]
file_content_storage_dir = /tmp/swh-git-loader/file-content-storage
object_content_storage_dir = /tmp/swh-git-loader/object-object-storage
log_dir = /tmp/swh-git-loader/log
# http://initd.org/psycopg/docs/module.html#psycopg2.connect
db_url = dbname=swhgitloader2
#+end_src
* Run
** environment initialization
#+begin_src sh
export PYTHONPATH=`pwd`:$PYTHONPATH
#+end_src
** Help
#+begin_src sh
bin/sgloader --help
#+end_src
** Parse a repository from a clean slate
Clean and initialize the model, then parse the Git repository:
#+begin_src sh
-bin/sgloader --actions cleandb,initdb --load-repo /path/to/repo
+bin/sgloader cleandb
+bin/sgloader initdb
+bin/sgloader load /path/to/git/repo
#+end_src
For ease:
#+begin_src sh
-make clean-and-run REPO_PATH=/path/to/repo
+make clean-and-run REPO_PATH=/path/to/git/repo
#+end_src
** Parse an existing repository
#+begin_src sh
-bin/sgloader --load-repo /path/to/repo
+bin/sgloader load /path/to/git/repo
#+end_src
** Clean data
#+begin_src sh
-bin/sgloader --actions cleandb
+bin/sgloader cleandb
#+end_src
For ease:
#+begin_src sh
make cleandb
#+end_src
** Init data
#+begin_src sh
-bin/sgloader --actions initdb
+bin/sgloader initdb
#+end_src
* IN-PROGRESS Improvements [11/12]
- [X] Push on remote git repository
- [X] Serialize blob's data and not blob's size.
- [X] Logging in python? How to see the log?
- [X] Replace sqlalchemy dao layer with psycopg2
- [X] Improve sgloader cli interface
- [X] Serialize sha256 as bytes
- [X] Update README.org
- [X] Serialize sha1 as bytes
- [X] Use sha1 instead of sha256 for file cache
- [X] Use postgresql's bytea column for sha1
- [X] Improve git object dispatch (look up on repo object only if necessary)
- [ ] Store git object on disk too
* Performance
This is not a performance test per se.
These are simply runs on a given machine.
** Spec
cat /proc/cpuinfo:
#+begin_src sh
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 61
model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz
stepping : 4
microcode : 0x16
cpu MHz : 3100.195
cache size : 4096 KB
physical id : 0
siblings : 4
core id : 0
cpu cores : 2
apicid : 0
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 20
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap
bogomips : 5187.99
clflush size : 64
cache_alignment : 64
address sizes : 39 bits physical, 48 bits virtual
power management:
processor : 1
vendor_id : GenuineIntel
cpu family : 6
model : 61
model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz
stepping : 4
microcode : 0x16
cpu MHz : 3099.992
cache size : 4096 KB
physical id : 0
siblings : 4
core id : 0
cpu cores : 2
apicid : 1
initial apicid : 1
fpu : yes
fpu_exception : yes
cpuid level : 20
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap
bogomips : 5187.99
clflush size : 64
cache_alignment : 64
address sizes : 39 bits physical, 48 bits virtual
power management:
processor : 2
vendor_id : GenuineIntel
cpu family : 6
model : 61
model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz
stepping : 4
microcode : 0x16
cpu MHz : 3099.992
cache size : 4096 KB
physical id : 0
siblings : 4
core id : 1
cpu cores : 2
apicid : 2
initial apicid : 2
fpu : yes
fpu_exception : yes
cpuid level : 20
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap
bogomips : 5187.99
clflush size : 64
cache_alignment : 64
address sizes : 39 bits physical, 48 bits virtual
power management:
processor : 3
vendor_id : GenuineIntel
cpu family : 6
model : 61
model name : Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz
stepping : 4
microcode : 0x16
cpu MHz : 3100.093
cache size : 4096 KB
physical id : 0
siblings : 4
core id : 1
cpu cores : 2
apicid : 3
initial apicid : 3
fpu : yes
fpu_exception : yes
cpuid level : 20
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap
bogomips : 5187.99
clflush size : 64
cache_alignment : 64
address sizes : 39 bits physical, 48 bits virtual
power management:
#+end_src
** Expected results
Given a specific repository https://github.com/ardumont/dot-files.git
Here is the expected result for each run (for comparison purposes):
#+begin_src sh
swhgitloader=> select count(*) from object_cache where type = 0; -- commit
count
-------
1744
(1 row)
swhgitloader=> select count(*) from object_cache where type = 1; -- tree
count
-------
2839
(1 row)
swhgitloader=> select count(*) from file_cache;
count
-------
2958
(1 row)
#+end_src
** sqlalchemy
ORM framework.
#+begin_src sh
# tony at corellia in ~/work/inria/repo/swh-git-loader on git:master o [10:35:08]
$ time make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files
rm -rf ./log
rm -rf ./dataset/
mkdir -p log dataset
bin/sgloader -v cleandb
-bin/sgloader -v --repo-path ~/repo/perso/dot-files initdb
+bin/sgloader -v initdb
+bin/sgloader -v load ~/repo/perso/dot-files
make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files 161.05s user 10.82s system 76% cpu 3:46.01 total
#+end_src
** psycopg2
A simple db client.
First implementation, with one open/close for each db access:
#+begin_src sh
# tony at corellia in ~/work/inria/repo/swh-git-loader on git:master x [17:38:56]
$ time make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files
rm -rf ./log
rm -rf ./dataset/
mkdir -p log dataset
bin/sgloader -v cleandb
-bin/sgloader -v --repo-path ~/repo/perso/dot-files initdb
+bin/sgloader -v initdb
+bin/sgloader -v load ~/repo/perso/dot-files
make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files 85.82s user 23.53s system 19% cpu 9:16.00 total
#+end_src
With one opened connection during all the computation:
#+begin_src sh
# tony at corellia in ~/work/inria/repo/swh-git-loader on git:psycopg2-tryout x [18:02:27]
$ time make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files
rm -rf ./log
rm -rf ./dataset/
mkdir -p log dataset
bin/sgloader -v cleandb
-bin/sgloader -v --repo-path ~/repo/perso/dot-files initdb
+bin/sgloader -v initdb
+bin/sgloader -v load ~/repo/perso/dot-files
make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files 39.45s user 8.02s system 50% cpu 1:34.08 total
#+end_src
Sanitize the algorithm (remove unneeded check, use the file cache, ...) :
#+begin_src sh
# tony at corellia in ~/work/inria/repo/swh-git-loader on git:psycopg2-tryout x [10:42:03]
$ time make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files
rm -rf ./log
rm -rf ./dataset/
mkdir -p log dataset
bin/sgloader -v cleandb
-bin/sgloader -v --repo-path ~/repo/perso/dot-files initdb
+bin/sgloader -v initdb
+bin/sgloader -v load ~/repo/perso/dot-files
make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files 15.90s user 2.08s system 31% cpu 56.879 total
#+end_src
No need for byte decoding before serializing on disk:
#+begin_src sh
# tony at corellia in ~/work/inria/repo/swh-git-loader on git:master x [12:36:10]
$ time make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files
rm -rf ./log
rm -rf ./dataset/
mkdir -p log dataset
bin/sgloader -v cleandb
-bin/sgloader -v --repo-path ~/repo/perso/dot-files initdb
+bin/sgloader -v initdb
+bin/sgloader -v load ~/repo/perso/dot-files
make cleandb run FLAG=-v REPO_PATH=~/repo/perso/dot-files 14.67s user 1.64s system 30% cpu 54.303 total
#+end_src
** Sample
|--------+----------------------------------------------|
| repo | url |
|--------+----------------------------------------------|
| linux | |
| gcc | https://gcc.gnu.org/git/?p=gcc.git;a=summary |
| pygit2 | |
|--------+----------------------------------------------|
* Filemode investigation
git - https://github.com/git/git/blob/398dd4bd039680ba98497fbedffa415a43583c16/vcs-svn/repo_tree.h#L6-L9:
#+begin_src c
#define REPO_MODE_DIR 0040000
#define REPO_MODE_BLB 0100644
#define REPO_MODE_EXE 0100755
#define REPO_MODE_LNK 0120000
#+end_src
pygit2 - https://github.com/libgit2/pygit2/blob/d63c2d4fd7e45d99364b4d2ccc6a4dafc9b51705/src/pygit2.c#L211-L221:
#+begin_src c
ADD_CONSTANT_INT(m, GIT_OBJ_ANY)
ADD_CONSTANT_INT(m, GIT_OBJ_COMMIT)
ADD_CONSTANT_INT(m, GIT_OBJ_TREE)
ADD_CONSTANT_INT(m, GIT_OBJ_BLOB)
ADD_CONSTANT_INT(m, GIT_OBJ_TAG)
/* Valid modes for index and tree entries. */
ADD_CONSTANT_INT(m, GIT_FILEMODE_TREE)
ADD_CONSTANT_INT(m, GIT_FILEMODE_BLOB)
ADD_CONSTANT_INT(m, GIT_FILEMODE_BLOB_EXECUTABLE)
ADD_CONSTANT_INT(m, GIT_FILEMODE_LINK)
ADD_CONSTANT_INT(m, GIT_FILEMODE_COMMIT)
#+end_src
pygit2 - https://github.com/libgit2/pygit2/blob/c099655fc034c3be63017d0a3e112ea10928464a/src/tree.c#L52-L58:
#+begin_src c
PyDoc_STRVAR(TreeEntry_filemode__doc__, "Filemode.");
PyObject *
TreeEntry_filemode__get__(TreeEntry *self)
{
return PyLong_FromLong(git_tree_entry_filemode(self->entry));
}
#+end_src
pygit2 - https://github.com/libgit2/pygit2/blob/50a70086bfc72922b63a6e842582021a2bad0b24/src/utils.h#L49:
#+begin_src c
#define PyLong_FromLong PyInt_FromLong
#+end_src
From doc https://docs.python.org/2/c-api/int.html:
#+begin_src txt
PyObject* PyInt_FromLong(long ival)
Return value: New reference.
Create a new integer object with a value of ival.
The current implementation keeps an array of integer objects for all integers between -5 and 256, when you
create an int in that range you actually just get back a reference to the existing object. So it should be
possible to change the value of 1. I suspect the behaviour of Python in this case is undefined. :-)
#+end_src
libgit2 - https://github.com/libgit2/libgit2/blob/623fbd93f1a7538df0c9a433df68f87bbd58b803/src/tree.c#L239-L241:
#+begin_src c
git_filemode_t git_tree_entry_filemode(const git_tree_entry *entry)
{
return normalize_filemode(entry->attr);
}
#+end_src
libgit2 - https://github.com/libgit2/libgit2/blob/623fbd93f1a7538df0c9a433df68f87bbd58b803/src/tree.c#L31-L51:
#+begin_src c
GIT_INLINE(git_filemode_t) normalize_filemode(git_filemode_t filemode)
{
/* Tree bits set, but it's not a commit */
if (GIT_MODE_TYPE(filemode) == GIT_FILEMODE_TREE)
return GIT_FILEMODE_TREE;
/* If any of the x bits are set */
if (GIT_PERMS_IS_EXEC(filemode))
return GIT_FILEMODE_BLOB_EXECUTABLE;
/* 16XXXX means commit */
if (GIT_MODE_TYPE(filemode) == GIT_FILEMODE_COMMIT)
return GIT_FILEMODE_COMMIT;
/* 12XXXX means commit */
if (GIT_MODE_TYPE(filemode) == GIT_FILEMODE_LINK)
return GIT_FILEMODE_LINK;
/* Otherwise, return a blob */
return GIT_FILEMODE_BLOB;
}
#+end_src
libgit2 - https://github.com/libgit2/libgit2/blob/f85a9c2767b43f35904bf39858488a4b7bc304e8/src/common.h#L13-L18:
#+begin_src c
/** Declare a function as always inlined. */
#if defined(_MSC_VER)
# define GIT_INLINE(type) static __inline type
#else
# define GIT_INLINE(type) static inline type
#endif
#+end_src
libgit2 - https://github.com/libgit2/libgit2/blob/d24a5312d8ab6d3cdb259e450ec9f1e2e6f3399d/src/fileops.h#L243-L250:
#+begin_src c
#define GIT_PERMS_IS_EXEC(MODE) (((MODE) & 0111) != 0)
#define GIT_PERMS_CANONICAL(MODE) (GIT_PERMS_IS_EXEC(MODE) ? 0755 : 0644)
#define GIT_PERMS_FOR_WRITE(MODE) (GIT_PERMS_IS_EXEC(MODE) ? 0777 : 0666)
#define GIT_MODE_PERMS_MASK 0777
#define GIT_MODE_TYPE_MASK 0170000
#define GIT_MODE_TYPE(MODE) ((MODE) & GIT_MODE_TYPE_MASK)
#define GIT_MODE_ISBLOB(MODE) (GIT_MODE_TYPE(MODE) == GIT_MODE_TYPE(GIT_FILEMODE_BLOB))
#+end_src
libgit2 - https://github.com/libgit2/libgit2/blob/c5c5cdb106d012d132475d9156923857f8d302fc/include/git2/types.h#L204-L212:
#+begin_src c
/** Valid modes for index and tree entries. */
typedef enum {
GIT_FILEMODE_UNREADABLE = 0000000,
GIT_FILEMODE_TREE = 0040000,
GIT_FILEMODE_BLOB = 0100644,
GIT_FILEMODE_BLOB_EXECUTABLE = 0100755,
GIT_FILEMODE_LINK = 0120000,
GIT_FILEMODE_COMMIT = 0160000,
} git_filemode_t;
#+end_src
diff --git a/bin/sgloader b/bin/sgloader
index ae69cab..311441d 100755
--- a/bin/sgloader
+++ b/bin/sgloader
@@ -1,90 +1,86 @@
#!/usr/bin/env python3
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>,
# Antoine R. Dumont <antoine.romain.dumont@gmail.com>
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import argparse
import configparser
import logging
import os
from swh.gitloader.loader import run
# Default configuration file
DEFAULT_CONF_FILE = '~/.config/sgloader.ini'
# default configuration (can be overridden by the DEFAULT_CONF_FILE)
DEFAULT_CONF = {
'file_content_storage_dir': '/tmp/swh-git-loader/file-content-storage',
'object_content_storage_dir': '/tmp/swh-git-loader/git-object-storage',
'log_dir': '/tmp/swh-git-loader/log',
# http://initd.org/psycopg/docs/module.html#psycopg2.connect
'db_url': 'dbname=swhgitloader'
}
def parse_args():
- """ Parse the configuration for the cli.
+ """Parse the configuration for the cli.
+
"""
+
cli = argparse.ArgumentParser(
description='Parse git repository objects to load them into DB.')
-
cli.add_argument('--verbose', '-v', action='store_true',
help='Verbosity level in log file.')
+ cli.add_argument('--config', '-c', help='configuration file path')
- cli.add_argument('--configuration-file', '-c', dest='conf_file',
- help='Configuration file path to load db access, ' +
- 'log file, etc...')
+ subcli = cli.add_subparsers(dest='action')
+ subcli.add_parser('initdb', help='initialize DB')
+ subcli.add_parser('cleandb', help='clean DB')
- cli.add_argument('--load-repo', '-l',
- dest='repo_path',
- help='Git repository path to load.')
-
- cli.add_argument('--actions', '-a', dest='actions', nargs='?',
- help='Comma separated action values in [initdb|cleandb]',
- default='initdb')
+ load_cli = subcli.add_parser('load', help='load Git repo into DB')
+ load_cli.add_argument('repository', help='Git repository path')
args = cli.parse_args()
+ if not args.action:
+ cli.error('no action given')
return args
def read_conf(args):
"""Read the user's configuration file.
args contains the repo to parse.
Transmit to the result.
- (No cli override.)
"""
+
config = configparser.ConfigParser(defaults=DEFAULT_CONF)
- conf_file = DEFAULT_CONF_FILE if args.conf_file is None else args.conf_file
+ conf_file = args.config or DEFAULT_CONF_FILE
config.read(os.path.expanduser(conf_file))
conf = config._sections['main']
- conf['repo_path'] = args.repo_path
- actions = args.actions
- conf['actions'] = actions if actions is None else actions.split(',')
+ # propagate CLI arguments to conf dictionary
+ conf['action'] = args.action
+ if 'repository' in args:
+ conf['repository'] = args.repository
return conf
if __name__ == '__main__':
args = parse_args()
conf = read_conf(args)
log_filename = os.path.join(conf['log_dir'], 'sgloader.log')
logging.basicConfig(filename=log_filename,
level=logging.DEBUG if args.verbose else logging.INFO)
- run(conf['actions'],
- conf['db_url'],
- conf['repo_path'],
- conf['file_content_storage_dir'],
- conf['object_content_storage_dir'])
+ run(conf)
diff --git a/swh/gitloader/loader.py b/swh/gitloader/loader.py
index 243dacb..ede053f 100644
--- a/swh/gitloader/loader.py
+++ b/swh/gitloader/loader.py
@@ -1,221 +1,220 @@
# Copyright (C) 2015 Stefano Zacchiroli <zack@upsilon.cc>,
# Antoine R. Dumont <antoine.romain.dumont@gmail.com>
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
import os
import pygit2
import hashlib
from swh import db_utils
from swh.gitloader import models
def load_repo(parent_repo_path):
    """Open the git repository discovered from `parent_repo_path`.

    The path is resolved with pygit2.discover_repository and the result is
    wrapped in a pygit2.Repository.

    """
    return pygit2.Repository(pygit2.discover_repository(parent_repo_path))
def commits_from(repo, commit):
    """Walk the commits reachable from `commit`, in topological order.

    """
    start_id = commit.id
    ordering = pygit2.GIT_SORT_TOPOLOGICAL
    return repo.walk(start_id, ordering)
def in_cache_objects(db_conn, sha, type):
    """Tell whether the object `sha` of the given `type` is already cached.

    """
    cached = models.find_object(db_conn, sha, type)
    return cached is not None
def add_object_in_cache(db_conn, sha, obj_type):
    """Record the object `sha` of type `obj_type` in the object cache.

    """
    message = 'Injecting object \'%s\' in cache' % sha
    logging.debug(message)
    models.add_object(db_conn, sha, obj_type)
def _hashkey_sha1(data):
"""Given some data, compute the hash ready object of such data.
Return the reference but not the computation.
"""
sha1 = hashlib.sha1()
sha1.update(data)
return sha1
def in_cache_blobs(db_conn, binhashkey):
    """Tell whether the binary hash `binhashkey` is known to the blob cache.

    """
    cached = models.find_blob(db_conn, binhashkey)
    return cached is not None
def add_blob_in_cache(db_conn, filepath, binhashkey):
    """Record in the blob cache that `binhashkey` is stored at `filepath`.

    """
    # models.add_blob expects the hash first, then the path
    models.add_blob(db_conn, binhashkey, filepath)
def write_blob_on_disk(blob, filepath):
    """Write the raw content of `blob` to `filepath`.

    Args:
        blob: object exposing its bytes through a `data` attribute
        filepath: destination file, opened in binary write mode (an existing
            file is overwritten)

    """
    # the context manager closes the file even when the write fails
    with open(filepath, 'wb') as f:
        f.write(blob.data)
def create_dir_from_hash(file_content_storage_dir, hash):
    """Create (if needed) the storage folder matching `hash` and return it.

    The folder is nested four levels deep under `file_content_storage_dir`,
    each level being named after the next two characters of `hash`.

    """
    # characters 0-2, 2-4, 4-6 and 6-8 of the hash, one folder level each
    levels = [hash[start:start + 2] for start in range(0, 8, 2)]
    target_dir = os.path.join(file_content_storage_dir, *levels)
    os.makedirs(target_dir, exist_ok=True)
    return target_dir
def add_blob_in_file_storage(db_conn, file_content_storage_dir, blob, hashkey):
    """Store `blob` on disk in the file content storage; return its path.

    The file is named after `hashkey` and lives in the folder derived from
    that same key.  `db_conn` is not used here.

    TODO: split in another module, file manipulation maybe?
    """
    filepath = os.path.join(
        create_dir_from_hash(file_content_storage_dir, hashkey),
        hashkey)

    logging.debug("Injecting blob '%s' in file content storage." % filepath)
    write_blob_on_disk(blob, filepath)

    return filepath
TYPE_TREE = 1
TYPE_COMMIT = 0
def parse_git_repo(db_conn,
                   repo_path,
                   file_content_storage_dir,
                   object_content_storage_dir):
    """Parse git repository `repo_path` and flush
    blobs on disk in `file_content_storage_dir`.

    Every reference of the repository is peeled and the history is walked
    from there.  Commits and trees are recorded in the object cache; blobs
    whose sha1 is not yet in the blob cache are written to the file content
    storage and then registered in that cache.

    Args:
        db_conn: connection handed to the cache lookup/insert helpers
        repo_path: path used to locate the git repository
        file_content_storage_dir: root folder of the blob storage
        object_content_storage_dir: accepted but not used by this function

    """
    def _store_blobs_from_tree(tree_ref, repo):
        """Given a tree, walk the tree and store the blobs in file content storage
        (if not already present).

        """
        if in_cache_objects(db_conn, tree_ref.hex, TYPE_TREE):
            logging.debug("Tree \'%s\' already visited, skip!" % tree_ref.hex)
            return

        # Add the tree in cache
        add_object_in_cache(db_conn, tree_ref.hex, TYPE_TREE)

        # Now walk the tree
        for tree_entry in tree_ref:
            filemode = tree_entry.filemode

            if (filemode == pygit2.GIT_FILEMODE_COMMIT):  # submodule!
                logging.warning("Submodule - Key \'%s\' not found!"
                                % tree_entry.id)
                # Skip only this entry: the tree is already flagged as
                # visited, so stopping the loop here would lose every
                # remaining entry of the tree for good.
                continue
            elif (filemode == pygit2.GIT_FILEMODE_TREE):  # Tree
                logging.debug("Tree \'%s\' -> walk!"
                              % tree_entry.id)
                _store_blobs_from_tree(repo[tree_entry.id], repo)
            else:
                blob_entry_ref = repo[tree_entry.id]
                hashkey = _hashkey_sha1(blob_entry_ref.data)
                binhashkey = hashkey.digest()

                # Remains only Blob
                if in_cache_blobs(db_conn, binhashkey):
                    logging.debug('Existing blob \'%s\' -> skip' %
                                  blob_entry_ref.hex)
                    continue

                logging.debug("New blob \'%s\' -> in file storage!" %
                              blob_entry_ref.hex)
                filepath = add_blob_in_file_storage(
                    db_conn,
                    file_content_storage_dir,
                    blob_entry_ref,
                    hashkey.hexdigest())

                # add the file to the file cache, pointing to the file
                # path on the filesystem
                add_blob_in_cache(db_conn, filepath, binhashkey)

    repo = load_repo(repo_path)
    all_refs = repo.listall_references()

    # for each ref in the repo
    for ref_name in all_refs:
        logging.debug("Parse reference \'%s\' " % ref_name)
        ref = repo.lookup_reference(ref_name)
        head_commit = ref.peel()

        # for each commit referenced by the commit graph starting at that ref
        for commit in commits_from(repo, head_commit):
            # if we have a git commit cache and the commit is in there:
            if in_cache_objects(db_conn, commit.hex, TYPE_COMMIT):
                # NOTE(review): this ends the whole walk for the reference,
                # not just the ancestors of `commit` - confirm that no
                # unseen commit can come after a cached one.
                break  # stop treating the current commit sub-graph
            else:
                add_object_in_cache(db_conn, commit.hex,
                                    TYPE_COMMIT)
                _store_blobs_from_tree(commit.tree, repo)
-def run(actions, db_url,
- repo_path=None,
- file_content_storage_dir=None,
- object_content_storage_dir=None):
- """Parse a given git repository.
-actions: CSV values amongst [initdb|cleandb]
-repo_path: Path to the git repository
-file_content_storage_dir: The folder where to store the raw blobs
-object_content_storage_dir: The folder where to store the remaining git objects
+def run(conf):
+ """loader driver, dispatching to the relevant action
+
+ used configuration keys:
+ - action: requested action
+ - repository: git repository path ('load' action only)
+ - file_content_storage_dir: path to file content storage
+ - object_content_storage_dir: path to git object content storage
"""
- db_conn = db_utils.db_connect(db_url)
-
- for action in actions:
- if action == 'cleandb':
- logging.info("Database cleanup!")
- models.cleandb(db_conn)
- elif action == 'initdb':
- logging.info("Database initialization!")
- models.initdb(db_conn)
- else:
- logging.warn("Unknown action '%s', skip!" % action)
-
- if repo_path is not None:
- logging.info("Parsing git repository \'%s\'" % repo_path)
+
+ db_conn = db_utils.db_connect(conf['db_url'])
+ action = conf['action']
+
+ if action == 'cleandb':
+ logging.info("Database cleanup!")
+ models.cleandb(db_conn)
+ elif action == 'initdb':
+ logging.info("Database initialization!")
+ models.initdb(db_conn)
+ elif action == 'load':
+ logging.info("Loading git repository %s" % conf['repository'])
parse_git_repo(db_conn,
- repo_path,
- file_content_storage_dir,
- object_content_storage_dir)
+ conf['repository'],
+ conf['file_content_storage_dir'],
+ conf['object_content_storage_dir'])
+ else:
+ logging.warn("Unknown action '%s', skip!" % action)
db_conn.close()

File Metadata

Mime Type
text/x-diff
Expires
Mon, Aug 25, 5:49 PM (1 w, 12 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3330370

Event Timeline