diff --git a/Makefile b/Makefile index ad6e258..02f9793 100644 --- a/Makefile +++ b/Makefile @@ -1,157 +1,154 @@ FLAKE = flake8 BINDIR = bin SRCDIR = swh REPO_PATH=../debsources # add -v for example FLAG= NOSE = nosetests3 TESTFLAGS = -s TESTDIR = ./swh/tests DB=softwareheritage-dev DB_TEST=$(DB)-test SWH_LOADER=$(BINDIR)/swh-git-loader SWH_DB_MANAGER=$(BINDIR)/swh-db-manager SWH_BACK=$(BINDIR)/swh-backend # could use cProfile PROFILE_TYPE=profile FOLLOW_LOG=-f deps: apt-get install -y \ python3 \ python3-pygit2 \ python3-psycopg2 \ python3-nose \ python3-flask \ python3-requests \ python3-retrying \ ipython3 clean: rm -rf /tmp/swh-git-loader/content-storage cleandb: clean PYTHONPATH=`pwd` $(SWH_DB_MANAGER) $(FLAG) cleandb run-remote: PYTHONPATH=`pwd` $(SWH_LOADER) $(FLAG) --config ./resources/remote-git-loader.ini load $(REPO_PATH) run-local: PYTHONPATH=`pwd` $(SWH_LOADER) $(FLAG) --config ./resources/local-git-loader.ini load $(REPO_PATH) run: # works with the default ~/.config/swh/git-loader.ini file PYTHONPATH=`pwd` $(SWH_LOADER) $(FLAG) load $(REPO_PATH) run-back: PYTHONPATH=`pwd` $(SWH_BACK) $(FLAG) check: $(FLAKE) $(BINDIR) $(SRCDIR) profile-run: PYTHONPATH=`pwd` python3 -m $(PROFILE_TYPE) -o ./scratch/swhgitloader.$(PROFILE_TYPE) ./scratch/profile-swhgitloader.py profile-stats: PYTHONPATH=`pwd` ./scratch/analyse-profile.py test-run-back: PYTHONPATH=`pwd` $(SWH_BACK) $(FLAG) --config ./resources/test/back.ini test: $(NOSE) $(TESTFLAGS) $(TESTDIR) test-http: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_http.py test-swhmap: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_swhmap.py test-remote-loader: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_remote_loader.py test-local-loader: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_local_loader.py test-api: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api*.py test-api-post-per-type: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_post_*.py test-api-content: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_content.py test-api-directory: $(NOSE) $(TESTFLAGS) 
$(TESTDIR)/test_api_directory.py test-api-revision: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_revision.py test-api-release: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_release.py test-api-occurrence: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_occurrence.py test-api-home: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_home.py test-api-origin: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_origin.py test-api-person: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_person.py test-api-pickle: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_pickle.py test-file: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_file.py connect-db: psql -d $(DB) create-db: cd ../swh-sql && make clean initdb drop-db: cd ../swh-sql && make clean dropdb test-connect-db: psql -d $(DB_TEST) test-create-db: cd ../swh-sql && make clean initdb DBNAME=$(DB_TEST) test-drop-db: cd ../swh-sql && make clean dropdb DBNAME=$(DB_TEST) check-meta: @echo "Repository: $(REPO_PATH)" @echo "Git metadata:" @$(BINDIR)/dir-git-repo-meta.sh $(REPO_PATH) @echo @echo "DB metadata:" @$(BINDIR)/db-git-repo-meta.sh $(DB) $(REPO_PATH) @echo -readme: - pandoc -f org -t markdown README.org > README - log-loader: tail $(FOLLOW_LOG) /tmp/swh-git-loader/log/sgloader.log log-back: tail $(FOLLOW_LOG) /tmp/swh-git-loader/log/back.log coverage: $(NOSE) --with-coverage $(SRCDIR) -v --cover-package=$(SRCDIR) diff --git a/README.org b/README.org deleted file mode 100644 index d16cb58..0000000 --- a/README.org +++ /dev/null @@ -1,190 +0,0 @@ -#+title: swh-git-loader - Specification (draft) -#+author: swh team -#+source: https://intranet.softwareheritage.org/index.php/Swh_git_loader - -The Software Heritage Git Loader is a tool and a library to walk a local Git repository and inject into the SWH dataset all contained files that weren't known before. 
- -* License - -This program is free software: you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software -Foundation, either version 3 of the License, or (at your option) any later -version. - -This program is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -PARTICULAR PURPOSE. See the GNU General Public License for more details. - -See top-level LICENSE file for the full text of the GNU General Public License -along with this program. - -* Dependencies - -** Runtime - -- python3 -- python3-psycopg2 -- python3-pygit2 - -** Test - -- python3-nose - -* Requirements - -- implementation language, Python3 -- coding guidelines: conform to PEP8 -- Git access: via libgit2/pygit -- cache: implemented as Postgres tables - -* Configuration - -swh-git-loader depends on some tools, here are the configuration files for those: -** swh-db-manager - -This is solely a tool in charge of db cleanup now. - -Create a configuration file in *~/.config/db-manager.ini* - -#+begin_src ini -[main] - -# Where to store the logs -log_dir = swh-git-loader/log - -# url access to db -db_url = dbname=swhgitloader -#+end_src - -See http://initd.org/psycopg/docs/module.html#psycopg2.connect for the db url's schema - -** swh-git-loader - -Create a configuration file in *~/.config/swh/git-loader.ini*: - -#+begin_src ini -[main] -# Where to store the logs -log_dir = /tmp/swh-git-loader/log - -# how to access the backend (remote or local) -backend-type = remote - -# backend-type remote: url access to api rest's backend -# backend-type local: configuration file to backend file .ini (cf. 
back.ini file) -backend = http://localhost:5000 -#+end_src - -Note: -- [[http://initd.org/psycopg/docs/module.html#psycopg2.connect][DB url DSL]] -- the configuration file can be changed in the CLI with the flag `-c ` or `--config-file ` -** swh-backend - -Backend api. -This - -Create a configuration file in *~/.config/swh/back.ini*: -#+begin_src ini -[main] - -# where to store blob on disk -content_storage_dir = /tmp/swh-git-loader/content-storage - -# Where to store the logs -log_dir = swh-git-loader/log - -# url access to db: dbname= (port= user= pass=) -db_url = dbname=swhgitloader - -# activate the compression for each vcs stored object -# storage_compression = true - -# compute folder's depth on disk aa/bb/cc/dd -# folder_depth = 2 - -# Debugger (for dev only) -debug = true - -# server port to listen to requests -port = 6000 -#+end_src -See http://initd.org/psycopg/docs/module.html#psycopg2.connect for the db url's schema - -* Run -** Environment initialization - -#+begin_src sh -export PYTHONPATH=`pwd`:$PYTHONPATH -#+end_src - -** Backend - -*** With initialization - -This depends on swh-sql repository, so: -#+begin_src sh -cd /path/to/swh-sql && make clean initdb DBNAME=softwareheritage-dev -#+end_src - -Using the Makefile eases: -#+begin_src sh -make drop-db create-db run-back FOLLOW_LOG=-f -#+end_src - -*** without initialization - -Running the backend. 
- -#+begin_src sh -./bin/swh-backend -v -#+end_src - -With makefile: - -#+begin_src sh -make run-back FOLLOW_LOG=-f -#+end_src - -** Help - -#+begin_src sh -bin/swh-git-loader --help -bin/swh-db-manager --help -#+end_src - -** Parse a repository from a clean slate - -Clean and initialize the model then parse the repository git: -#+begin_src sh -bin/swh-db-manager cleandb -bin/swh-git-loader load /path/to/git/repo -#+end_src - -For ease: -#+begin_src sh -time make cleandb run REPO_PATH=~/work/inria/repo/swh-git-cloner -#+end_src - -** Parse an existing repository - -#+begin_src sh -bin/swh-git-loader load /path/to/git/repo -#+end_src - -** Clean data - -This will truncate the relevant table in the schema -#+begin_src sh -bin/swh-db-manager cleandb -#+end_src - -For ease: -#+begin_src sh -make cleandb -#+end_src - -** Init data - -#+begin_src sh -make drop-db create-db -#+end_src diff --git a/TODO.org b/TODO.org deleted file mode 100644 index 5775d00..0000000 --- a/TODO.org +++ /dev/null @@ -1,116 +0,0 @@ -#+title: TODO -#+author: swh team - -* DONE swh implementation - poc 1 -CLOSED: [2015-07-22 Wed 12:20] - -- [X] Push on remote git repository -- [X] All git objects must be written in storage (at the moment only blobs) -- [X] Improve performance -- [X] Serialize blob's data and not blob's size. -- [X] Logging in python? How to see the log? 
-- [X] Replace sqlalchemy dao layer with psycopg2 -- [X] Improve sgloader cli interface -- [X] Serialize sha256 as bytes -- [X] Update README.org -- [X] Switch dao layer (from sqlalchemy to psycopg2) -- [X] Serialize sha1 as bytes -- [X] Use sha1 instead of sha256 for file cache -- [X] Improve architecture -- [X] Use postgresql's bytea column for sha1 -- [X] Improve git object dispatch (look up on repo object only if necessary) -- [X] Add functional test which adds new commits -- [X] Store git object on disk too -- [X] Make the compression for the file storage optional -- [X] Expose the flag to the swh-git-loader's configuration -- [X] Make the compression for the git object storage optional -- [X] Expose option flag for blob compression -- [X] Add computation folder with depth as parameter -- [X] Expose option flag for folder depth -- [X] Test coverage for at least primitive functions [2/2] - - [X] swh.file - - [X] swh.hash -- [X] Add git-sha1 function in swh.hash module -- [X] Separate the git repository parsing from the persistence (using backend api) -- [X] Enforce retrying disk writing policy -- [X] Use blob's git sha1 as key on disk -- [X] Enforce retrying policy on http requests -- [X] Share http connection throughout the git repository parsing - -* IN-PROGRESS swh implementation - poc 2 - -- [X] One content storage (2 were used, one for content, one for revision/directory) -- [X] Improve api backend to use the `real` schema -- [X] Adapt loader to speak the api backend the dummy way (~json) -- [X] Clean up some db-manager code + adapt makefile to use swh-sql -- [X] Improve protocol communication between loader and api backend (drop json) > pickle - -- [ ] Roberto: do not use specific language serialization stack (pickle) -> the backend api could not be that private after all. 
-- [ ] Roberto: add md5 checksum on contents to make sure there was no integrity loss during transport -- [ ] Fix multiple FIXMEs (`M-x rgrep RET FIXME RET *.py RET /path/to/swh-git-loader RET`) - - [ ] Use sha256 with sha1 for content filtering to backend - - [ ] Find the right times (atime, ctime, mtime) - - [ ] Determine if `swhmap` data structure is really useful or not -> could be a simple map right now... - - [ ] ... - -- [ ] Deal with sha1 collisions -- [ ] Ultimately, the swh-git-loader is not distributed right now. Determine if we need it to run with celery and then adapt code accordingly. - -* Global enhancement - -** Implementation details - -- Pointers on raw data instead of in-memory representation -> cf. read_raw() call in `swh/gitloader/git.py`... - -- Maybe loader's repository memory model on disk and then open a file handler and provides this to the http client when ready --> pickle (for the moment) knows how to deal with it --> flask can deal with it (content-type header is 'application/octet-stream') - -** Performance - -Of course, we'd need to measure the actual performance first to determine if we could improve on it. - -But some hints: -Use future computations. Can we send in // some data? --> POST contents' signatures --> POST directories' signatures --> POST revisions' signatures - -** Collision - -We need to deal with potential collisions. - -The backend checks for collisions and warns the loader if there is one. - -The loader deals with the collision message accordingly. - --> if everything is ok, when done, sends a message to queue to say it's ok to delete the repository --> if collisions have been detected, at the end of it all, sends a message to another error queue to say `WARNING collision, do not destroy this repository` - -* Discussion - -** How to stream blob's data - -Returned as raw data - -** Structure log - -This way they could serve for analysis by other mechanisms - -** Rules - -**** Don't lose data - -Multiple workers. -Same disks and db. 
- -**** Transaction - -Unit of transaction. -Check whether a commit exists; if not, write it on disk + on db. -If one disk fails, fail the transaction. - -**** ? - -**** Profiling -Look into the cursor implementation details.