diff --git a/Makefile b/Makefile
index a06e171..880addd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,157 +1,167 @@
 FLAKE = flake8
 BINDIR = bin
 SRCDIR = swh
 
 REPO_PATH=../debsources
 
 # add -v for example
 FLAG=
 
 NOSE = nosetests3
 TESTFLAGS = -s
 TESTDIR = ./swh/tests
 
 DB=softwareheritage-dev
 DB_TEST=$(DB)-test
 
 SWH_LOADER=$(BINDIR)/swh-git-loader
 SWH_DB_MANAGER=$(BINDIR)/swh-db-manager
 SWH_BACK=$(BINDIR)/swh-backend
 
 # could use cProfile
 PROFILE_TYPE=profile
 
 FOLLOW_LOG=-f
 
 # Adapt python-path to use other modules
 _PYPATH=`pwd`:`pwd`/../swh-core
 
+# None of the targets below produce a file of the same name.
+.PHONY: deps clean cleandb run-remote run-local run run-back check \
+        profile-run profile-stats test-run-back test test-remote-loader \
+        test-local-loader test-http test-swhrepo test-api \
+        test-api-post-per-type test-api-content test-api-directory \
+        test-api-revision test-api-release test-api-occurrence test-api-home \
+        test-api-origin test-api-person test-api-pickle test-file connect-db \
+        create-db drop-db test-connect-db test-create-db test-drop-db \
+        check-meta log-loader log-back coverage
+
 deps:
 	apt-get install -y \
 		python3 \
 		python3-pygit2 \
 		python3-psycopg2 \
 		python3-nose \
 		python3-flask \
 		python3-requests \
 		python3-retrying \
 		ipython3
 
 clean:
 	rm -rf /tmp/swh-git-loader/content-storage
 
 cleandb: clean
 	PYTHONPATH=$(_PYPATH) $(SWH_DB_MANAGER) $(FLAG) cleandb
 
 run-remote:
-	PYTHONPATH=`pwd`:`pwd`/../swh-core $(SWH_LOADER) $(FLAG) --config ./resources/remote-git-loader.ini load $(REPO_PATH)
+	PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) --config ./resources/remote-git-loader.ini load $(REPO_PATH)
 
 run-local:
 	PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) --config ./resources/local-git-loader.ini load $(REPO_PATH)
 
 run:
 # works with the default ~/.config/swh/git-loader.ini file
 	PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) load $(REPO_PATH)
 
 run-back:
 	PYTHONPATH=$(_PYPATH) $(SWH_BACK) $(FLAG)
 
 check:
 	$(FLAKE) $(BINDIR) $(SRCDIR)
 
 profile-run:
 	PYTHONPATH=$(_PYPATH) python3 -m $(PROFILE_TYPE) -o ./scratch/swhgitloader.$(PROFILE_TYPE) ./scratch/profile-swhgitloader.py
 
 profile-stats:
 	PYTHONPATH=$(_PYPATH) ./scratch/analyse-profile.py
 
 test-run-back:
 	PYTHONPATH=$(_PYPATH) $(SWH_BACK) $(FLAG) --config ./resources/test/back.ini
 
 test:
 	PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)
 
 test-remote-loader:
 	PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_remote_loader.py
 
 test-local-loader:
 	PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_local_loader.py
 
 test-http:
 	$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_http.py
 
-test-swhmap:
-	$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_swhmap.py
+test-swhrepo:
+	PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_swhrepo.py
 
test-api: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api*.py test-api-post-per-type: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_post_*.py test-api-content: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_content.py test-api-directory: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_directory.py test-api-revision: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_revision.py test-api-release: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_release.py test-api-occurrence: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_occurrence.py test-api-home: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_home.py test-api-origin: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_origin.py test-api-person: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_person.py test-api-pickle: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_pickle.py test-file: $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_file.py connect-db: psql -d $(DB) create-db: cd ../swh-sql && make clean initdb drop-db: cd ../swh-sql && make clean dropdb test-connect-db: psql -d $(DB_TEST) test-create-db: cd ../swh-sql && make clean initdb DBNAME=$(DB_TEST) test-drop-db: cd ../swh-sql && make clean dropdb DBNAME=$(DB_TEST) check-meta: @echo "Repository: $(REPO_PATH)" @echo "Git metadata:" @$(BINDIR)/dir-git-repo-meta.sh $(REPO_PATH) @echo @echo "DB metadata:" @$(BINDIR)/db-git-repo-meta.sh $(DB) $(REPO_PATH) @echo log-loader: tail $(FOLLOW_LOG) /tmp/swh-git-loader/log/sgloader.log log-back: tail $(FOLLOW_LOG) /tmp/swh-git-loader/log/back.log coverage: PYTHONPATH=$(_PYPATH) $(NOSE) --with-coverage $(SRCDIR) -v --cover-package=$(SRCDIR) diff --git a/swh/data/swhrepo.py b/swh/data/swhrepo.py index dbf4ed6..b82ebe7 100644 --- a/swh/data/swhrepo.py +++ b/swh/data/swhrepo.py @@ -1,92 +1,70 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information - -class SWHMap(): - """Data structure that ensures easy access to current 
keys. - FIXME: improve or remove altogether - """ - def __init__(self): - self.sha1s_hex = set() - self.sha1s_map = {} - - def add(self, sha1, obj): - """Add obj with type obj_type and sha1. - """ - self.sha1s_hex.add(sha1) - self.sha1s_map[sha1] = obj - - def keys(self): - return self.sha1s_hex - - def objects(self): - return self.sha1s_map - - class SWHRepo(): """Structure with: - - sha1s as list - - swh objects map (indexed by sha1) + - sha1s as list + - map indexed by sha1 """ def __init__(self): self.origin = {} self.releases = [] self.occurrences = [] - self.contents = SWHMap() - self.directories = SWHMap() - self.revisions = SWHMap() + self.contents = {} + self.directories = {} + self.revisions = {} self.persons = {} self.visited = set() def add_origin(self, origin): self.origin = origin def get_origin(self): return self.origin def add_release(self, release): self.releases.append(release) def get_releases(self): return self.releases def add_occurrence(self, occurrence): self.occurrences.append(occurrence) def get_occurrences(self): return self.occurrences def add_content(self, content_ref): sha1 = content_ref['sha1'] - self.contents.add(sha1, content_ref) + self.contents[sha1] = content_ref self.visited.add(sha1) def get_contents(self): return self.contents def add_directory(self, directory): sha1 = directory['sha1'] - self.directories.add(sha1, directory) + self.directories[sha1] = directory self.visited.add(sha1) def get_directories(self): return self.directories def add_revision(self, revision): sha1 = revision['sha1'] - self.revisions.add(sha1, revision) + self.revisions[sha1] = revision self.visited.add(sha1) def add_person(self, id, person): self.persons[id] = person def get_persons(self): - return list(self.persons.values()) + return self.persons.values() def already_visited(self, sha1): return sha1 in self.visited def get_revisions(self): return self.revisions diff --git a/swh/gitloader/local_store.py b/swh/gitloader/local_store.py index 
aa6294d..5080c62 100644 --- a/swh/gitloader/local_store.py +++ b/swh/gitloader/local_store.py @@ -1,87 +1,86 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.storage import store, db, service from swh.conf import reader # FIXME duplicated from bin/swh-backend... # Default configuration file DEFAULT_CONF_FILE = '~/.config/swh/back.ini' # default configuration DEFAULT_CONF = { 'content_storage_dir': ('string', '/tmp/swh-git-loader/content-storage'), 'log_dir': ('string', '/tmp/swh-git-loader/log'), 'db_url': ('string', 'dbname=softwareheritage-dev'), 'storage_compression': ('bool', None), 'folder_depth': ('int', 4), 'debug': ('bool', None), 'port': ('int', 5000) } def store_only_new(db_conn, conf, obj_type, obj): """Store object if not already present. """ obj.update({'type': obj_type}) if not store.find(db_conn, obj): store.add(db_conn, conf, obj) _obj_to_persist_fn = {store.Type.revision: service.add_revisions} def store_unknown_objects(db_conn, conf, obj_type, swhmap): """Load objects to the backend. """ sha1s = swhmap.keys() # have: filter unknown obj unknown_obj_sha1s = service.filter_unknowns_type(db_conn, obj_type, sha1s) if not unknown_obj_sha1s: return True # seen: now store in backend - obj_map = swhmap.objects() persist_fn = _obj_to_persist_fn.get(obj_type, service.add_objects) - obj_fulls = map(obj_map.get, unknown_obj_sha1s) + obj_fulls = map(swhmap.get, unknown_obj_sha1s) return persist_fn(db_conn, conf, obj_type, obj_fulls) def load_to_back(backend_setup_file, swhrepo): """Load to the backend the repository swhrepo. 
""" # Read the configuration file (no check yet) conf = reader.read(backend_setup_file or DEFAULT_CONF_FILE, DEFAULT_CONF) with db.connect(conf['db_url']) as db_conn: # First, store/retrieve the origin identifier # FIXME: should be done by the cloner worker (which is not yet plugged # on the right swh db ftm) service.add_origin(db_conn, swhrepo.get_origin()) # First reference all unknown persons service.add_persons(db_conn, conf, store.Type.person, swhrepo.get_persons()) res = store_unknown_objects(db_conn, conf, store.Type.content, swhrepo.get_contents()) if res: res = store_unknown_objects(db_conn, conf, store.Type.directory, swhrepo.get_directories()) if res: res = store_unknown_objects(db_conn, conf, store.Type.revision, swhrepo.get_revisions()) if res: # brutally send all remaining occurrences service.add_objects(db_conn, conf, store.Type.occurrence, swhrepo.get_occurrences()) # and releases (the idea here is that compared to existing # objects, the quantity is less) service.add_objects(db_conn, conf, store.Type.release, swhrepo.get_releases()) diff --git a/swh/gitloader/remote_store.py b/swh/gitloader/remote_store.py index 90ee744..640dedc 100644 --- a/swh/gitloader/remote_store.py +++ b/swh/gitloader/remote_store.py @@ -1,65 +1,63 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.storage import store from swh.client import http def store_unknown_objects(back_url, obj_type, swhmap): """Load objects to the backend. 
""" - sha1s = swhmap.keys() + sha1s = list(swhmap.keys()) # have: filter unknown obj unknown_obj_sha1s = http.post(back_url, obj_type, sha1s) if not unknown_obj_sha1s: return True - # seen: now create the data for the backend to store - obj_map = swhmap.objects() # store unknown objects - return http.put(back_url, obj_type, map(obj_map.get, unknown_obj_sha1s)) + return http.put(back_url, obj_type, map(swhmap.get, unknown_obj_sha1s)) -def load_to_back(back_url, swhrepo): - """Load to the back_url the repository swhrepo. +def load_to_back(back_url, swh_repo): + """Load to the back_url the repository swh_repo. """ # First, store/retrieve the origin identifier # FIXME: should be done by the cloner worker (which is not yet plugged on # the right swh db ftm) http.put(back_url, obj_type=store.Type.origin, - obj=swhrepo.get_origin()) + obj=swh_repo.get_origin()) http.put(back_url, obj_type=store.Type.person, - obj=swhrepo.get_persons()) + obj=list(swh_repo.get_persons())) # let the backend and api discuss what's really needed # - first this worker sends the checksums # - then the backend answers the checksums it does not know # - then the worker sends only what the backend does not know per # object type basis res = store_unknown_objects(back_url, store.Type.content, - swhrepo.get_contents()) + swh_repo.get_contents()) if res: res = store_unknown_objects(back_url, store.Type.directory, - swhrepo.get_directories()) + swh_repo.get_directories()) if res: res = store_unknown_objects(back_url, store.Type.revision, - swhrepo.get_revisions()) + swh_repo.get_revisions()) if res: # brutally send all remaining occurrences http.put(back_url, store.Type.occurrence, - swhrepo.get_occurrences()) + swh_repo.get_occurrences()) # and releases (the idea here is that compared to existing # other objects, the quantity is less) http.put(back_url, store.Type.release, - swhrepo.get_releases()) + swh_repo.get_releases()) # FIXME: deal with collision failures which should be raised by backend. 
diff --git a/swh/tests/test_swhmap.py b/swh/tests/test_swhmap.py
deleted file mode 100644
index 2864c69..0000000
--- a/swh/tests/test_swhmap.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright (C) 2015 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import unittest
-
-from nose.tools import istest
-
-from swh.data import swhrepo
-from test_utils import app_client
-
-
-class SWHMapTestCase(unittest.TestCase):
-    @istest
-    def new_swhmap(self):
-        # when
-        m = swhrepo.SWHMap()
-
-        # then
-        assert m.keys() == set()
-        assert m.objects() == {}
-
-    @istest
-    def add_first(self):
-        # given
-        m = swhrepo.SWHMap()
-
-        # when
-        m.add('some-sha1', {'sha1': 'some-sha1', 'type': 'something'})
-
-        # then
-
-        keys = m.keys()
-        assert len(keys) == 1
-        assert 'some-sha1' in keys
-        assert m.objects()['some-sha1'] == {'sha1': 'some-sha1', 'type': 'something'}
-
-    @istest
-    def add_second_time_can_update(self):
-        # given
-        m = swhrepo.SWHMap()
-        m.add('some-sha1', {'sha1': 'some-sha1', 'type': 'something'})
-
-        # when
-        m.add('some-sha1', {'sha1': 'some-sha1', 'type': 'something-else'})
-
-        # then
-        keys = m.keys()
-        assert len(keys) == 1
-        assert 'some-sha1' in keys
-        assert m.objects()['some-sha1'] == {'sha1': 'some-sha1', 'type': 'something-else'}
-
-
-class SWHRepoTestCase(unittest.TestCase):
-    def setUp(self):
-        self.app, db_url = app_client()
-
-    @istest
-    def new_swhrepo(self):
-        # when
-        r = swhrepo.SWHRepo()
-
-        r.add_origin({'url': 'foobar'})
-        r.add_content({'sha1': 'some-con-sha1'})
-        r.add_directory({'sha1': 'some-dir-sha1'})
-        r.add_revision({'sha1': 'some-rev-sha1'})
-        r.add_occurrence({'sha1': 'some-occ-sha1'})
-        r.add_release({'sha1': 'some-rel-sha1'})
-
-        # then
-        assert r.get_origin() == {'url': 'foobar'}
-        assert r.get_releases() == [{'sha1': 'some-rel-sha1'}]
-        assert r.get_occurrences() == [{'sha1': 'some-occ-sha1'}]
-
-        assert r.already_visited('some-con-sha1') is True
-        assert r.already_visited('some-dir-sha1') is True
-        assert r.already_visited('some-rev-sha1') is True
-        assert r.already_visited('some-occ-sha1') is False
-        assert r.already_visited('some-rel-sha1') is False
-
-        assert 'some-con-sha1' in r.get_contents().keys()
-        assert 'some-dir-sha1' in r.get_directories().keys()
-        assert 'some-rev-sha1' in r.get_revisions().keys()
diff --git a/swh/tests/test_swhrepo.py b/swh/tests/test_swhrepo.py
new file mode 100644
index 0000000..850fbf8
--- /dev/null
+++ b/swh/tests/test_swhrepo.py
@@ -0,0 +1,65 @@
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+
+from nose.tools import istest
+
+from swh.data import swhrepo
+from test_utils import app_client
+
+
+class SWHRepoTestCase(unittest.TestCase):
+    def setUp(self):
+        self.app, db_url = app_client()
+
+    @istest
+    def new_swhrepo(self):
+        # when
+        r = swhrepo.SWHRepo()
+
+        r.add_origin({'url': 'foobar'})
+
+        r.add_content({'sha1': 'some-con-sha1'})
+        r.add_content({'sha1': 'some-con-sha1-2', 'stuff': 'some-stuff'})
+        r.add_directory({'sha1': 'some-dir-sha1'})
+        r.add_directory({'sha1': 'some-dir-sha1-2'})
+        r.add_revision({'sha1': 'some-rev-sha1'})
+        r.add_revision({'sha1': 'some-rev-sha1-2'})
+        r.add_person('id0', {'name': 'the one'})
+        r.add_person('id1', {'name': 'another one'})
+
+        r.add_occurrence({'sha1': 'some-occ-sha1'})
+        r.add_release({'sha1': 'some-rel-sha1'})
+
+        # then
+        assert r.get_origin() == {'url': 'foobar'}
+        assert r.get_releases() == [{'sha1': 'some-rel-sha1'}]
+        assert r.get_occurrences() == [{'sha1': 'some-occ-sha1'}]
+
+        for sha in ['some-con-sha1', 'some-con-sha1-2',
+                    'some-dir-sha1', 'some-dir-sha1-2',
+                    'some-rev-sha1', 'some-rev-sha1-2']:
+            assert r.already_visited(sha) is True
+
+        assert r.already_visited('some-occ-sha1') is False
+        assert r.already_visited('some-rel-sha1') is False
+
+        assert r.get_contents() == {
+            'some-con-sha1': {'sha1': 'some-con-sha1'},
+            'some-con-sha1-2': {'sha1': 'some-con-sha1-2',
+                                'stuff': 'some-stuff'},
+        }
+        assert r.get_directories() == {
+            'some-dir-sha1': {'sha1': 'some-dir-sha1'},
+            'some-dir-sha1-2': {'sha1': 'some-dir-sha1-2'},
+        }
+        assert r.get_revisions() == {
+            'some-rev-sha1': {'sha1': 'some-rev-sha1'},
+            'some-rev-sha1-2': {'sha1': 'some-rev-sha1-2'},
+        }
+
+        # order-independent, and valid whether get_persons() is a list or view
+        persons = sorted(r.get_persons(), key=lambda p: p['name'])
+        assert persons == [{'name': 'another one'}, {'name': 'the one'}]