Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9697804
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
18 KB
Subscribers
None
View Options
diff --git a/Makefile b/Makefile
index a06e171..880addd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,157 +1,157 @@
FLAKE = flake8
BINDIR = bin
SRCDIR = swh
REPO_PATH=../debsources
# add -v for example
FLAG=
NOSE = nosetests3
TESTFLAGS = -s
TESTDIR = ./swh/tests
DB=softwareheritage-dev
DB_TEST=$(DB)-test
SWH_LOADER=$(BINDIR)/swh-git-loader
SWH_DB_MANAGER=$(BINDIR)/swh-db-manager
SWH_BACK=$(BINDIR)/swh-backend
# could use cProfile
PROFILE_TYPE=profile
FOLLOW_LOG=-f
# Adapt python-path to use other modules
_PYPATH=`pwd`:`pwd`/../swh-core
deps:
apt-get install -y \
python3 \
python3-pygit2 \
python3-psycopg2 \
python3-nose \
python3-flask \
python3-requests \
python3-retrying \
ipython3
clean:
rm -rf /tmp/swh-git-loader/content-storage
cleandb: clean
PYTHONPATH=$(_PYPATH) $(SWH_DB_MANAGER) $(FLAG) cleandb
run-remote:
PYTHONPATH=`pwd`:`pwd`/../swh-core $(SWH_LOADER) $(FLAG) --config ./resources/remote-git-loader.ini load $(REPO_PATH)
run-local:
PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) --config ./resources/local-git-loader.ini load $(REPO_PATH)
run:
# works with the default ~/.config/swh/git-loader.ini file
PYTHONPATH=$(_PYPATH) $(SWH_LOADER) $(FLAG) load $(REPO_PATH)
run-back:
PYTHONPATH=$(_PYPATH) $(SWH_BACK) $(FLAG)
check:
$(FLAKE) $(BINDIR) $(SRCDIR)
profile-run:
PYTHONPATH=$(_PYPATH) python3 -m $(PROFILE_TYPE) -o ./scratch/swhgitloader.$(PROFILE_TYPE) ./scratch/profile-swhgitloader.py
profile-stats:
PYTHONPATH=$(_PYPATH) ./scratch/analyse-profile.py
test-run-back:
PYTHONPATH=$(_PYPATH) $(SWH_BACK) $(FLAG) --config ./resources/test/back.ini
test:
PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)
test-remote-loader:
PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_remote_loader.py
test-local-loader:
PYTHONPATH=$(_PYPATH) $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_local_loader.py
test-http:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_http.py
-test-swhmap:
- $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_swhmap.py
+test-swhrepo:
+ $(NOSE) $(TESTFLAGS) $(TESTDIR)/test_swhrepo.py
test-api:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api*.py
test-api-post-per-type:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_post_*.py
test-api-content:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_content.py
test-api-directory:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_directory.py
test-api-revision:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_revision.py
test-api-release:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_release.py
test-api-occurrence:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_occurrence.py
test-api-home:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_home.py
test-api-origin:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_origin.py
test-api-person:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_person.py
test-api-pickle:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_api_pickle.py
test-file:
$(NOSE) $(TESTFLAGS) $(TESTDIR)/test_file.py
connect-db:
psql -d $(DB)
create-db:
cd ../swh-sql && make clean initdb
drop-db:
cd ../swh-sql && make clean dropdb
test-connect-db:
psql -d $(DB_TEST)
test-create-db:
cd ../swh-sql && make clean initdb DBNAME=$(DB_TEST)
test-drop-db:
cd ../swh-sql && make clean dropdb DBNAME=$(DB_TEST)
check-meta:
@echo "Repository: $(REPO_PATH)"
@echo "Git metadata:"
@$(BINDIR)/dir-git-repo-meta.sh $(REPO_PATH)
@echo
@echo "DB metadata:"
@$(BINDIR)/db-git-repo-meta.sh $(DB) $(REPO_PATH)
@echo
log-loader:
tail $(FOLLOW_LOG) /tmp/swh-git-loader/log/sgloader.log
log-back:
tail $(FOLLOW_LOG) /tmp/swh-git-loader/log/back.log
coverage:
PYTHONPATH=$(_PYPATH) $(NOSE) --with-coverage $(SRCDIR) -v --cover-package=$(SRCDIR)
diff --git a/swh/data/swhrepo.py b/swh/data/swhrepo.py
index dbf4ed6..b82ebe7 100644
--- a/swh/data/swhrepo.py
+++ b/swh/data/swhrepo.py
@@ -1,92 +1,70 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
-
-class SWHMap():
- """Data structure that ensures easy access to current keys.
- FIXME: improve or remove altogether
- """
- def __init__(self):
- self.sha1s_hex = set()
- self.sha1s_map = {}
-
- def add(self, sha1, obj):
- """Add obj with type obj_type and sha1.
- """
- self.sha1s_hex.add(sha1)
- self.sha1s_map[sha1] = obj
-
- def keys(self):
- return self.sha1s_hex
-
- def objects(self):
- return self.sha1s_map
-
-
class SWHRepo():
"""Structure with:
- - sha1s as list
- - swh objects map (indexed by sha1)
+ - releases and occurrences as lists, visited sha1s as a set
+ - contents, directories and revisions as maps indexed by sha1
"""
def __init__(self):
self.origin = {}
self.releases = []
self.occurrences = []
- self.contents = SWHMap()
- self.directories = SWHMap()
- self.revisions = SWHMap()
+ self.contents = {}
+ self.directories = {}
+ self.revisions = {}
self.persons = {}
self.visited = set()
def add_origin(self, origin):
self.origin = origin
def get_origin(self):
return self.origin
def add_release(self, release):
self.releases.append(release)
def get_releases(self):
return self.releases
def add_occurrence(self, occurrence):
self.occurrences.append(occurrence)
def get_occurrences(self):
return self.occurrences
def add_content(self, content_ref):
sha1 = content_ref['sha1']
- self.contents.add(sha1, content_ref)
+ self.contents[sha1] = content_ref
self.visited.add(sha1)
def get_contents(self):
return self.contents
def add_directory(self, directory):
sha1 = directory['sha1']
- self.directories.add(sha1, directory)
+ self.directories[sha1] = directory
self.visited.add(sha1)
def get_directories(self):
return self.directories
def add_revision(self, revision):
sha1 = revision['sha1']
- self.revisions.add(sha1, revision)
+ self.revisions[sha1] = revision
self.visited.add(sha1)
def add_person(self, id, person):
self.persons[id] = person
def get_persons(self):
- return list(self.persons.values())
+ return self.persons.values()
def already_visited(self, sha1):
return sha1 in self.visited
def get_revisions(self):
return self.revisions
diff --git a/swh/gitloader/local_store.py b/swh/gitloader/local_store.py
index aa6294d..5080c62 100644
--- a/swh/gitloader/local_store.py
+++ b/swh/gitloader/local_store.py
@@ -1,87 +1,86 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.storage import store, db, service
from swh.conf import reader
# FIXME duplicated from bin/swh-backend...
# Default configuration file
DEFAULT_CONF_FILE = '~/.config/swh/back.ini'
# default configuration
DEFAULT_CONF = {
'content_storage_dir': ('string', '/tmp/swh-git-loader/content-storage'),
'log_dir': ('string', '/tmp/swh-git-loader/log'),
'db_url': ('string', 'dbname=softwareheritage-dev'),
'storage_compression': ('bool', None),
'folder_depth': ('int', 4),
'debug': ('bool', None),
'port': ('int', 5000)
}
def store_only_new(db_conn, conf, obj_type, obj):
"""Store object if not already present.
"""
obj.update({'type': obj_type})
if not store.find(db_conn, obj):
store.add(db_conn, conf, obj)
_obj_to_persist_fn = {store.Type.revision: service.add_revisions}
def store_unknown_objects(db_conn, conf, obj_type, swhmap):
"""Load objects to the backend.
"""
sha1s = swhmap.keys()
# have: filter unknown obj
unknown_obj_sha1s = service.filter_unknowns_type(db_conn, obj_type, sha1s)
if not unknown_obj_sha1s:
return True
# seen: now store in backend
- obj_map = swhmap.objects()
persist_fn = _obj_to_persist_fn.get(obj_type, service.add_objects)
- obj_fulls = map(obj_map.get, unknown_obj_sha1s)
+ obj_fulls = map(swhmap.get, unknown_obj_sha1s)
return persist_fn(db_conn, conf, obj_type, obj_fulls)
def load_to_back(backend_setup_file, swhrepo):
"""Load to the backend the repository swhrepo.
"""
# Read the configuration file (no check yet)
conf = reader.read(backend_setup_file or DEFAULT_CONF_FILE, DEFAULT_CONF)
with db.connect(conf['db_url']) as db_conn:
# First, store/retrieve the origin identifier
# FIXME: should be done by the cloner worker (which is not yet plugged
# on the right swh db ftm)
service.add_origin(db_conn, swhrepo.get_origin())
# First reference all unknown persons
service.add_persons(db_conn, conf, store.Type.person,
swhrepo.get_persons())
res = store_unknown_objects(db_conn, conf, store.Type.content,
swhrepo.get_contents())
if res:
res = store_unknown_objects(db_conn, conf, store.Type.directory,
swhrepo.get_directories())
if res:
res = store_unknown_objects(db_conn, conf, store.Type.revision,
swhrepo.get_revisions())
if res:
# brutally send all remaining occurrences
service.add_objects(db_conn, conf, store.Type.occurrence,
swhrepo.get_occurrences())
# and releases (the idea here is that compared to existing
# objects, the quantity is less)
service.add_objects(db_conn, conf, store.Type.release,
swhrepo.get_releases())
diff --git a/swh/gitloader/remote_store.py b/swh/gitloader/remote_store.py
index 90ee744..640dedc 100644
--- a/swh/gitloader/remote_store.py
+++ b/swh/gitloader/remote_store.py
@@ -1,65 +1,63 @@
# Copyright (C) 2015 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.storage import store
from swh.client import http
def store_unknown_objects(back_url, obj_type, swhmap):
"""Load objects to the backend.
"""
- sha1s = swhmap.keys()
+ sha1s = list(swhmap.keys())
# have: filter unknown obj
unknown_obj_sha1s = http.post(back_url, obj_type, sha1s)
if not unknown_obj_sha1s:
return True
- # seen: now create the data for the backend to store
- obj_map = swhmap.objects()
# store unknown objects
- return http.put(back_url, obj_type, map(obj_map.get, unknown_obj_sha1s))
+ return http.put(back_url, obj_type, map(swhmap.get, unknown_obj_sha1s))
-def load_to_back(back_url, swhrepo):
- """Load to the back_url the repository swhrepo.
+def load_to_back(back_url, swh_repo):
+ """Load the repository swh_repo to the backend at back_url.
"""
# First, store/retrieve the origin identifier
# FIXME: should be done by the cloner worker (which is not yet plugged on
# the right swh db ftm)
http.put(back_url,
obj_type=store.Type.origin,
- obj=swhrepo.get_origin())
+ obj=swh_repo.get_origin())
http.put(back_url,
obj_type=store.Type.person,
- obj=swhrepo.get_persons())
+ obj=list(swh_repo.get_persons()))
# let the backend and api discuss what's really needed
# - first this worker sends the checksums
# - then the backend answers the checksums it does not know
# - then the worker sends only what the backend does not know per
# object type basis
res = store_unknown_objects(back_url, store.Type.content,
- swhrepo.get_contents())
+ swh_repo.get_contents())
if res:
res = store_unknown_objects(back_url, store.Type.directory,
- swhrepo.get_directories())
+ swh_repo.get_directories())
if res:
res = store_unknown_objects(back_url, store.Type.revision,
- swhrepo.get_revisions())
+ swh_repo.get_revisions())
if res:
# brutally send all remaining occurrences
http.put(back_url,
store.Type.occurrence,
- swhrepo.get_occurrences())
+ swh_repo.get_occurrences())
# and releases (the idea here is that compared to existing
# other objects, the quantity is less)
http.put(back_url,
store.Type.release,
- swhrepo.get_releases())
+ swh_repo.get_releases())
# FIXME: deal with collision failures which should be raised by backend.
diff --git a/swh/tests/test_swhmap.py b/swh/tests/test_swhmap.py
deleted file mode 100644
index 2864c69..0000000
--- a/swh/tests/test_swhmap.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright (C) 2015 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
-
-import unittest
-
-from nose.tools import istest
-
-from swh.data import swhrepo
-from test_utils import app_client
-
-
-class SWHMapTestCase(unittest.TestCase):
- @istest
- def new_swhmap(self):
- # when
- m = swhrepo.SWHMap()
-
- # then
- assert m.keys() == set()
- assert m.objects() == {}
-
- @istest
- def add_first(self):
- # given
- m = swhrepo.SWHMap()
-
- # when
- m.add('some-sha1', {'sha1': 'some-sha1', 'type': 'something'})
-
- # then
-
- keys = m.keys()
- assert len(keys) == 1
- assert 'some-sha1' in keys
- assert m.objects()['some-sha1'] == {'sha1': 'some-sha1', 'type': 'something'}
-
- @istest
- def add_second_time_can_update(self):
- # given
- m = swhrepo.SWHMap()
- m.add('some-sha1', {'sha1': 'some-sha1', 'type': 'something'})
-
- # when
- m.add('some-sha1', {'sha1': 'some-sha1', 'type': 'something-else'})
-
- # then
- keys = m.keys()
- assert len(keys) == 1
- assert 'some-sha1' in keys
- assert m.objects()['some-sha1'] == {'sha1': 'some-sha1', 'type': 'something-else'}
-
-
-class SWHRepoTestCase(unittest.TestCase):
- def setUp(self):
- self.app, db_url = app_client()
-
- @istest
- def new_swhrepo(self):
- # when
- r = swhrepo.SWHRepo()
-
- r.add_origin({'url': 'foobar'})
- r.add_content({'sha1': 'some-con-sha1'})
- r.add_directory({'sha1': 'some-dir-sha1'})
- r.add_revision({'sha1': 'some-rev-sha1'})
- r.add_occurrence({'sha1': 'some-occ-sha1'})
- r.add_release({'sha1': 'some-rel-sha1'})
-
- # then
- assert r.get_origin() == {'url': 'foobar'}
- assert r.get_releases() == [{'sha1': 'some-rel-sha1'}]
- assert r.get_occurrences() == [{'sha1': 'some-occ-sha1'}]
-
- assert r.already_visited('some-con-sha1') is True
- assert r.already_visited('some-dir-sha1') is True
- assert r.already_visited('some-rev-sha1') is True
- assert r.already_visited('some-occ-sha1') is False
- assert r.already_visited('some-rel-sha1') is False
-
- assert 'some-con-sha1' in r.get_contents().keys()
- assert 'some-dir-sha1' in r.get_directories().keys()
- assert 'some-rev-sha1' in r.get_revisions().keys()
diff --git a/swh/tests/test_swhrepo.py b/swh/tests/test_swhrepo.py
new file mode 100644
index 0000000..850fbf8
--- /dev/null
+++ b/swh/tests/test_swhrepo.py
@@ -0,0 +1,57 @@
+# Copyright (C) 2015 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import unittest
+
+from nose.tools import istest
+
+from swh.data import swhrepo
+from test_utils import app_client
+
+
+class SWHRepoTestCase(unittest.TestCase):
+ def setUp(self):
+ self.app, db_url = app_client()
+
+ @istest
+ def new_swhrepo(self):
+ # when
+ r = swhrepo.SWHRepo()
+
+ r.add_origin({'url': 'foobar'})
+
+ r.add_content({'sha1': 'some-con-sha1'})
+ r.add_content({'sha1': 'some-con-sha1-2','stuff': 'some-stuff'})
+ r.add_directory({'sha1': 'some-dir-sha1'})
+ r.add_directory({'sha1': 'some-dir-sha1-2'})
+ r.add_revision({'sha1': 'some-rev-sha1'})
+ r.add_revision({'sha1': 'some-rev-sha1-2'})
+ r.add_person('id0', {'name': 'the one'})
+ r.add_person('id1', {'name': 'another one'})
+
+ r.add_occurrence({'sha1': 'some-occ-sha1'})
+ r.add_release({'sha1': 'some-rel-sha1'})
+
+ # then
+ assert r.get_origin() == {'url': 'foobar'}
+ assert r.get_releases() == [{'sha1': 'some-rel-sha1'}]
+ assert r.get_occurrences() == [{'sha1': 'some-occ-sha1'}]
+
+ for sha in ['some-con-sha1', 'some-con-sha1-2',
+ 'some-dir-sha1', 'some-dir-sha1-2',
+ 'some-rev-sha1', 'some-rev-sha1-2']:
+ assert r.already_visited(sha) is True
+
+ assert r.already_visited('some-occ-sha1') is False
+ assert r.already_visited('some-rel-sha1') is False
+
+ assert r.get_contents() == {'some-con-sha1': {'sha1': 'some-con-sha1'},
+ 'some-con-sha1-2': {'sha1': 'some-con-sha1-2','stuff': 'some-stuff'}}
+ assert r.get_directories() == {'some-dir-sha1': {'sha1': 'some-dir-sha1'},
+ 'some-dir-sha1-2': {'sha1': 'some-dir-sha1-2'}}
+ assert r.get_revisions() == {'some-rev-sha1': {'sha1': 'some-rev-sha1'},
+ 'some-rev-sha1-2': {'sha1': 'some-rev-sha1-2'}}
+
+ assert len(r.get_persons()) == 2
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Tue, Aug 19, 12:58 AM (3 w, 2 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3213162
Attached To
rDLDG Git loader
Event Timeline
Log In to Comment