Page Menu | Home | Software Heritage

No One | Temporary

diff --git a/Makefile b/Makefile
index a06e171..880addd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,157 +1,157 @@
FLAKE = flake8
BINDIR = bin
SRCDIR = swh
REPO_PATH=../debsources
# add -v for example
FLAG=
NOSE = nosetests3
TESTFLAGS = -s
TESTDIR = ./swh/tests
DB=softwareheritage-dev
DB_TEST=$(DB)-test
SWH_LOADER=$(BINDIR)/swh-git-loader
SWH_DB_MANAGER=$(BINDIR)/swh-db-manager
SWH_BACK=$(BINDIR)/swh-backend
# could use cProfile
PROFILE_TYPE=profile
FOLLOW_LOG=-f
# Adapt python-path to use other modules
_PYPATH=`pwd`:`pwd`/../swh-core
deps:
apt-get install -y \
python3 \
python3-pygit2 \
python3-psycopg2 \
python3-nose \
python3-flask \
python3-requests \
python3-retrying \
ipython3
clean:
rm -rf /tmp/swh-git-loader/content-storage
cleandb: clean
PYTHONPATH=$(_PYPATH) $(SWH_DB_MANAGER) $(FLAG) cleandb
# Load $(REPO_PATH) with the remote loader configuration. Uses the shared
# $(_PYPATH) like the other loader targets, so overriding _PYPATH applies here too.
run-remote:
	PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) --config ./resources/remote-git-loader.ini load $(REPO_PATH)
run-local:
PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) --config ./resources/local-git-loader.ini load $(REPO_PATH)
run:
# works with the default ~/.config/swh/git-loader.ini file
PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) load $(REPO_PATH)
run-back:
PYTHONPATH=$(_PYPATH) $(SWH_BACK) $(FLAG)
check:
$(FLAKE) $(BINDIR) $(SRCDIR)
profile-run:
PYTHONPATH=$(_PYPATH) python3 -m $(PROFILE_TYPE) -o ./scratch/swhgitloader.$(PROFILE_TYPE) ./scratch/profile-swhgitloader.py
profile-stats:
PYTHONPATH=$(_PYPATH) ./scratch/analyse-profile.py
test-run-back:
PYTHONPATH=$(_PYPATH) $(SWH_BACK) $(FLAG) --config ./resources/test/back.ini
test:
PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)
test-remote-loader:
PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_remote_loader.py
test-local-loader:
PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_local_loader.py
test-http:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_http.py
-test-swhmap:
- $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_swhmap.py
+test-swhrepo:
+ $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_swhrepo.py
test-api:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api*.py
test-api-post-per-type:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_post_*.py
test-api-content:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_content.py
test-api-directory:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_directory.py
test-api-revision:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_revision.py
test-api-release:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_release.py
test-api-occurrence:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_occurrence.py
test-api-home:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_home.py
test-api-origin:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_origin.py
test-api-person:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_person.py
test-api-pickle:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_pickle.py
test-file:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_file.py
connect-db:
psql -d $(DB)
# Database lifecycle targets. The actual work is delegated to the Makefile in
# ../swh-sql; $(MAKE) (rather than a literal `make`) is used so that flags such
# as -n and -j, and the make binary in use, are propagated to the sub-make.
create-db:
	cd ../swh-sql && $(MAKE) clean initdb
drop-db:
	cd ../swh-sql && $(MAKE) clean dropdb
# Same operations against the test database $(DB_TEST).
test-connect-db:
	psql -d $(DB_TEST)
test-create-db:
	cd ../swh-sql && $(MAKE) clean initdb DBNAME=$(DB_TEST)
test-drop-db:
	cd ../swh-sql && $(MAKE) clean dropdb DBNAME=$(DB_TEST)
check-meta:
@echo "Repository: $(REPO_PATH)"
@echo "Git metadata:"
@$(BINDIR)/dir-git-repo-meta.sh $(REPO_PATH)
@echo
@echo "DB metadata:"
@$(BINDIR)/db-git-repo-meta.sh $(DB) $(REPO_PATH)
@echo
log-loader:
tail $(FOLLOW_LOG) /tmp/swh-git-loader/log/sgloader.log
log-back:
tail $(FOLLOW_LOG) /tmp/swh-git-loader/log/back.log
coverage:
PYTHONPATH=$(_PYPATH) $(NOSE) --with-coverage $(SRCDIR) -v --cover-package=$(SRCDIR)
diff --git a/swh/data/swhrepo.py b/swh/data/swhrepo.py
index dbf4ed6..b82ebe7 100644
--- a/swh/data/swhrepo.py
+++ b/swh/data/swhrepo.py
@@ -1,92 +1,70 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
-class SWHMap():
- """Data structure that ensures easy access to current keys.
- FIXME: improve or remove altogether
- """
- def __init__(self):
- self.sha1s_hex = set()
- self.sha1s_map = {}
-
- def add(self, sha1, obj):
- """Add obj with type obj_type and sha1.
- """
- self.sha1s_hex.add(sha1)
- self.sha1s_map[sha1] = obj
-
- def keys(self):
- return self.sha1s_hex
-
- def objects(self):
- return self.sha1s_map
-
-
class SWHRepo():
"""Structure with:
- - sha1s as list
- - swh objects map (indexed by sha1)
+ - sha1s as list
+ - map indexed by sha1
"""
def __init__(self):
self.origin = {}
self.releases = []
self.occurrences = []
- self.contents = SWHMap()
- self.directories = SWHMap()
- self.revisions = SWHMap()
+ self.contents = {}
+ self.directories = {}
+ self.revisions = {}
self.persons = {}
self.visited = set()
def add_origin(self, origin):
self.origin = origin
def get_origin(self):
return self.origin
def add_release(self, release):
self.releases.append(release)
def get_releases(self):
return self.releases
def add_occurrence(self, occurrence):
self.occurrences.append(occurrence)
def get_occurrences(self):
return self.occurrences
def add_content(self, content_ref):
sha1 = content_ref['sha1']
- self.contents.add(sha1, content_ref)
+ self.contents[sha1] = content_ref
self.visited.add(sha1)
def get_contents(self):
return self.contents
def add_directory(self, directory):
sha1 = directory['sha1']
- self.directories.add(sha1, directory)
+ self.directories[sha1] = directory
self.visited.add(sha1)
def get_directories(self):
return self.directories
def add_revision(self, revision):
sha1 = revision['sha1']
- self.revisions.add(sha1, revision)
+ self.revisions[sha1] = revision
self.visited.add(sha1)
def add_person(self, id, person):
self.persons[id] = person
def get_persons(self):
    """Return the persons registered so far, as a list.

    A real list (not a live ``dict`` view) is returned so that callers can
    index, serialize or re-iterate the result, and so that a later
    ``add_person`` call does not alter a result already handed out.
    """
    return list(self.persons.values())
def already_visited(self, sha1):
return sha1 in self.visited
def get_revisions(self):
return self.revisions
diff --git a/swh/gitloader/local_store.py b/swh/gitloader/local_store.py
index aa6294d..5080c62 100644
--- a/swh/gitloader/local_store.py
+++ b/swh/gitloader/local_store.py
@@ -1,87 +1,86 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.storage import store, db, service
from swh.conf import reader
# FIXME duplicated from bin/swh-backend...
# Default configuration file
DEFAULT_CONF_FILE = '~/.config/swh/back.ini'
# default configuration
DEFAULT_CONF = {
'content_storage_dir': ('string', '/tmp/swh-git-loader/content-storage'),
'log_dir': ('string', '/tmp/swh-git-loader/log'),
'db_url': ('string', 'dbname=softwareheritage-dev'),
'storage_compression': ('bool', None),
'folder_depth': ('int', 4),
'debug': ('bool', None),
'port': ('int', 5000)
}
def store_only_new(db_conn, conf, obj_type, obj):
"""Store object if not already present.
"""
obj.update({'type': obj_type})
if not store.find(db_conn, obj):
store.add(db_conn, conf, obj)
_obj_to_persist_fn = {store.Type.revision: service.add_revisions}
def store_unknown_objects(db_conn, conf, obj_type, swhmap):
"""Load objects to the backend.
"""
sha1s = swhmap.keys()
# have: filter unknown obj
unknown_obj_sha1s = service.filter_unknowns_type(db_conn, obj_type, sha1s)
if not unknown_obj_sha1s:
return True
# seen: now store in backend
- obj_map = swhmap.objects()
persist_fn = _obj_to_persist_fn.get(obj_type, service.add_objects)
- obj_fulls = map(obj_map.get, unknown_obj_sha1s)
+ obj_fulls = map(swhmap.get, unknown_obj_sha1s)
return persist_fn(db_conn, conf, obj_type, obj_fulls)
def load_to_back(backend_setup_file, swhrepo):
"""Load to the backend the repository swhrepo.
"""
# Read the configuration file (no check yet)
conf = reader.read(backend_setup_file or DEFAULT_CONF_FILE, DEFAULT_CONF)
with db.connect(conf['db_url']) as db_conn:
# First, store/retrieve the origin identifier
# FIXME: should be done by the cloner worker (which is not yet plugged
# on the right swh db ftm)
service.add_origin(db_conn, swhrepo.get_origin())
# First reference all unknown persons
service.add_persons(db_conn, conf, store.Type.person,
swhrepo.get_persons())
res = store_unknown_objects(db_conn, conf, store.Type.content,
swhrepo.get_contents())
if res:
res = store_unknown_objects(db_conn, conf, store.Type.directory,
swhrepo.get_directories())
if res:
res = store_unknown_objects(db_conn, conf, store.Type.revision,
swhrepo.get_revisions())
if res:
# brutally send all remaining occurrences
service.add_objects(db_conn, conf, store.Type.occurrence,
swhrepo.get_occurrences())
# and releases (the idea here is that compared to existing
# objects, the quantity is less)
service.add_objects(db_conn, conf, store.Type.release,
swhrepo.get_releases())
diff --git a/swh/gitloader/remote_store.py b/swh/gitloader/remote_store.py
index 90ee744..640dedc 100644
--- a/swh/gitloader/remote_store.py
+++ b/swh/gitloader/remote_store.py
@@ -1,65 +1,63 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.storage import store
from swh.client import http
def store_unknown_objects(back_url, obj_type, swhmap):
"""Load objects to the backend.
"""
- sha1s = swhmap.keys()
+ sha1s = list(swhmap.keys())
# have: filter unknown obj
unknown_obj_sha1s = http.post(back_url, obj_type, sha1s)
if not unknown_obj_sha1s:
return True
- # seen: now create the data for the backend to store
- obj_map = swhmap.objects()
# store unknown objects
- return http.put(back_url, obj_type, map(obj_map.get, unknown_obj_sha1s))
+ return http.put(back_url, obj_type, map(swhmap.get, unknown_obj_sha1s))
-def load_to_back(back_url, swhrepo):
- """Load to the back_url the repository swhrepo.
+def load_to_back(back_url, swh_repo):
+ """Load to the back_url the repository swh_repo.
"""
# First, store/retrieve the origin identifier
# FIXME: should be done by the cloner worker (which is not yet plugged on
# the right swh db ftm)
http.put(back_url,
obj_type=store.Type.origin,
- obj=swhrepo.get_origin())
+ obj=swh_repo.get_origin())
http.put(back_url,
obj_type=store.Type.person,
- obj=swhrepo.get_persons())
+ obj=list(swh_repo.get_persons()))
# let the backend and api discuss what's really needed
# - first this worker sends the checksums
# - then the backend answers the checksums it does not know
# - then the worker sends only what the backend does not know per
# object type basis
res = store_unknown_objects(back_url, store.Type.content,
- swhrepo.get_contents())
+ swh_repo.get_contents())
if res:
res = store_unknown_objects(back_url, store.Type.directory,
- swhrepo.get_directories())
+ swh_repo.get_directories())
if res:
res = store_unknown_objects(back_url, store.Type.revision,
- swhrepo.get_revisions())
+ swh_repo.get_revisions())
if res:
# brutally send all remaining occurrences
http.put(back_url,
store.Type.occurrence,
- swhrepo.get_occurrences())
+ swh_repo.get_occurrences())
# and releases (the idea here is that compared to existing
# other objects, the quantity is less)
http.put(back_url,
store.Type.release,
- swhrepo.get_releases())
+ swh_repo.get_releases())
# FIXME: deal with collision failures which should be raised by backend.
diff --git a/swh/tests/test_swhmap.py b/swh/tests/test_swhmap.py
deleted file mode 100644
index 2864c69..0000000
--- a/swh/tests/test_swhmap.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright (C) 2015 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import unittest
-
-from nose.tools import istest
-
-from swh.data import swhrepo
-from test_utils import app_client
-
-
-class SWHMapTestCase(unittest.TestCase):
- @istest
- def new_swhmap(self):
- # when
- m = swhrepo.SWHMap()
-
- # then
- assert m.keys() == set()
- assert m.objects() == {}
-
- @istest
- def add_first(self):
- # given
- m = swhrepo.SWHMap()
-
- # when
- m.add('some-sha1', {'sha1': 'some-sha1', 'type': 'something'})
-
- # then
-
- keys = m.keys()
- assert len(keys) == 1
- assert 'some-sha1' in keys
- assert m.objects()['some-sha1'] == {'sha1': 'some-sha1', 'type': 'something'}
-
- @istest
- def add_second_time_can_update(self):
- # given
- m = swhrepo.SWHMap()
- m.add('some-sha1', {'sha1': 'some-sha1', 'type': 'something'})
-
- # when
- m.add('some-sha1', {'sha1': 'some-sha1', 'type': 'something-else'})
-
- # then
- keys = m.keys()
- assert len(keys) == 1
- assert 'some-sha1' in keys
- assert m.objects()['some-sha1'] == {'sha1': 'some-sha1', 'type': 'something-else'}
-
-
-class SWHRepoTestCase(unittest.TestCase):
- def setUp(self):
- self.app, db_url = app_client()
-
- @istest
- def new_swhrepo(self):
- # when
- r = swhrepo.SWHRepo()
-
- r.add_origin({'url': 'foobar'})
- r.add_content({'sha1': 'some-con-sha1'})
- r.add_directory({'sha1': 'some-dir-sha1'})
- r.add_revision({'sha1': 'some-rev-sha1'})
- r.add_occurrence({'sha1': 'some-occ-sha1'})
- r.add_release({'sha1': 'some-rel-sha1'})
-
- # then
- assert r.get_origin() == {'url': 'foobar'}
- assert r.get_releases() == [{'sha1': 'some-rel-sha1'}]
- assert r.get_occurrences() == [{'sha1': 'some-occ-sha1'}]
-
- assert r.already_visited('some-con-sha1') is True
- assert r.already_visited('some-dir-sha1') is True
- assert r.already_visited('some-rev-sha1') is True
- assert r.already_visited('some-occ-sha1') is False
- assert r.already_visited('some-rel-sha1') is False
-
- assert 'some-con-sha1' in r.get_contents().keys()
- assert 'some-dir-sha1' in r.get_directories().keys()
- assert 'some-rev-sha1' in r.get_revisions().keys()
diff --git a/swh/tests/test_swhrepo.py b/swh/tests/test_swhrepo.py
new file mode 100644
index 0000000..850fbf8
--- /dev/null
+++ b/swh/tests/test_swhrepo.py
@@ -0,0 +1,57 @@
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+
+from nose.tools import istest
+
+from swh.data import swhrepo
+from test_utils import app_client
+
+
+class SWHRepoTestCase(unittest.TestCase):
+    """Check that SWHRepo hands back the objects that were added to it."""
+    def setUp(self):
+        self.app, db_url = app_client()
+
+    @istest
+    def new_swhrepo(self):
+        # when
+        r = swhrepo.SWHRepo()
+        r.add_origin({'url': 'foobar'})
+        r.add_content({'sha1': 'some-con-sha1'})
+        r.add_content({'sha1': 'some-con-sha1-2', 'stuff': 'some-stuff'})
+        r.add_directory({'sha1': 'some-dir-sha1'})
+        r.add_directory({'sha1': 'some-dir-sha1-2'})
+        r.add_revision({'sha1': 'some-rev-sha1'})
+        r.add_revision({'sha1': 'some-rev-sha1-2'})
+        r.add_person('id0', {'name': 'the one'})
+        r.add_person('id1', {'name': 'another one'})
+        r.add_occurrence({'sha1': 'some-occ-sha1'})
+        r.add_release({'sha1': 'some-rel-sha1'})
+
+        # then
+        assert r.get_origin() == {'url': 'foobar'}
+        assert r.get_releases() == [{'sha1': 'some-rel-sha1'}]
+        assert r.get_occurrences() == [{'sha1': 'some-occ-sha1'}]
+        for sha in ['some-con-sha1', 'some-con-sha1-2',
+                    'some-dir-sha1', 'some-dir-sha1-2',
+                    'some-rev-sha1', 'some-rev-sha1-2']:
+            assert r.already_visited(sha) is True
+        # occurrences and releases are not tracked as visited
+        assert r.already_visited('some-occ-sha1') is False
+        assert r.already_visited('some-rel-sha1') is False
+
+        assert r.get_contents() == {
+            'some-con-sha1': {'sha1': 'some-con-sha1'},
+            'some-con-sha1-2': {'sha1': 'some-con-sha1-2',
+                                'stuff': 'some-stuff'}}
+        assert r.get_directories() == {
+            'some-dir-sha1': {'sha1': 'some-dir-sha1'},
+            'some-dir-sha1-2': {'sha1': 'some-dir-sha1-2'}}
+        assert r.get_revisions() == {
+            'some-rev-sha1': {'sha1': 'some-rev-sha1'},
+            'some-rev-sha1-2': {'sha1': 'some-rev-sha1-2'}}
+        assert len(r.get_persons()) == 2

File Metadata

Mime Type
text/x-diff
Expires
Tue, Aug 19, 12:58 AM (3 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3213162

Event Timeline