diff --git a/swh/client/http.py b/swh/client/http.py index e8aaf89..f62c797 100755 --- a/swh/client/http.py +++ b/swh/client/http.py @@ -1,85 +1,86 @@ #!/usr/bin/env python3 # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import requests from retrying import retry from swh.retry import policy from swh.storage import store from swh.protocols import serial session_swh = requests.Session() def compute_simple_url(base_url, type): """Compute the api url. """ return '%s%s' % (base_url, type) @retry(retry_on_exception=policy.retry_if_connection_error, wrap_exception=True, stop_max_attempt_number=3) def execute(map_type_url, method_fn, base_url, obj_type, data, result_fn=lambda result: result.ok): """Execute a query to the backend. - map_type_url is a map of {type: url backend} - method_fn is swh_session.post or swh_session.put - base_url is the base url of the backend - obj_type is the nature of the data - data is the data to send to the backend - result_fn is a function which takes the response result and do something with it. The default function is to return if the server is ok or not. """ if not data: return data res = method_fn(compute_simple_url(base_url, map_type_url[obj_type]), data=serial.dumps(data), headers={'Content-Type': serial.MIMETYPE}) return result_fn(res) # url mapping for lookup url_lookup_per_type = { store.Type.origin: "/origins/" , store.Type.content: "/vcs/contents/" , store.Type.directory: "/vcs/directories/" , store.Type.revision: "/vcs/revisions/" } def post(base_url, obj_type, obj_sha1s): """Retrieve the objects of type type with sha1 sha1hex. """ return execute(url_lookup_per_type, session_swh.post, base_url, obj_type, obj_sha1s, result_fn=lambda res: serial.loads(res.content)) # url mapping for storage url_store_per_type = { store.Type.origin: "/origins/" , store.Type.content: "/vcs/contents/" , store.Type.directory: "/vcs/directories/" , store.Type.revision: "/vcs/revisions/" , store.Type.release: "/vcs/releases/" , store.Type.occurrence: "/vcs/occurrences/" + , store.Type.person: "/vcs/persons/" } def put(base_url, obj_type, obj): """Given an obj (map, simple object) of obj_type, PUT it in the backend. """ return execute(url_store_per_type, session_swh.put, base_url, obj_type, obj) diff --git a/swh/data/swhmap.py b/swh/data/swhmap.py index 4a6765f..dbf4ed6 100644 --- a/swh/data/swhmap.py +++ b/swh/data/swhmap.py @@ -1,92 +1,92 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information class SWHMap(): """Data structure that ensures easy access to current keys. FIXME: improve or remove altogether """ def __init__(self): self.sha1s_hex = set() self.sha1s_map = {} def add(self, sha1, obj): """Add obj with type obj_type and sha1. """ self.sha1s_hex.add(sha1) self.sha1s_map[sha1] = obj def keys(self): return self.sha1s_hex def objects(self): return self.sha1s_map class SWHRepo(): """Structure with: - sha1s as list - swh objects map (indexed by sha1) """ def __init__(self): self.origin = {} self.releases = [] self.occurrences = [] self.contents = SWHMap() self.directories = SWHMap() self.revisions = SWHMap() self.persons = {} self.visited = set() def add_origin(self, origin): self.origin = origin def get_origin(self): return self.origin def add_release(self, release): self.releases.append(release) def get_releases(self): return self.releases def add_occurrence(self, occurrence): self.occurrences.append(occurrence) def get_occurrences(self): return self.occurrences def add_content(self, content_ref): sha1 = content_ref['sha1'] self.contents.add(sha1, content_ref) self.visited.add(sha1) def get_contents(self): return self.contents def add_directory(self, directory): sha1 = directory['sha1'] self.directories.add(sha1, directory) self.visited.add(sha1) def get_directories(self): return self.directories def add_revision(self, revision): sha1 = revision['sha1'] self.revisions.add(sha1, revision) self.visited.add(sha1) def add_person(self, id, person): self.persons[id] = person def get_persons(self): - return self.persons.values() + return list(self.persons.values()) def already_visited(self, sha1): return sha1 in self.visited def get_revisions(self): return self.revisions diff --git a/swh/gitloader/remote_store.py b/swh/gitloader/remote_store.py index fb9c94b..90ee744 100644 --- a/swh/gitloader/remote_store.py +++ b/swh/gitloader/remote_store.py @@ -1,60 +1,65 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from swh.storage import store from swh.client import http def store_unknown_objects(back_url, obj_type, swhmap): """Load objects to the backend. """ sha1s = swhmap.keys() # have: filter unknown obj unknown_obj_sha1s = http.post(back_url, obj_type, sha1s) if not unknown_obj_sha1s: return True # seen: now create the data for the backend to store obj_map = swhmap.objects() # store unknown objects return http.put(back_url, obj_type, map(obj_map.get, unknown_obj_sha1s)) def load_to_back(back_url, swhrepo): """Load to the back_url the repository swhrepo. """ # First, store/retrieve the origin identifier # FIXME: should be done by the cloner worker (which is not yet plugged on # the right swh db ftm) http.put(back_url, obj_type=store.Type.origin, obj=swhrepo.get_origin()) + http.put(back_url, + obj_type=store.Type.person, + obj=swhrepo.get_persons()) + # let the backend and api discuss what's really needed # - first this worker sends the checksums # - then the backend answers the checksums it does not know # - then the worker sends only what the backend does not know per # object type basis res = store_unknown_objects(back_url, store.Type.content, swhrepo.get_contents()) + if res: res = store_unknown_objects(back_url, store.Type.directory, swhrepo.get_directories()) if res: res = store_unknown_objects(back_url, store.Type.revision, swhrepo.get_revisions()) if res: # brutally send all remaining occurrences http.put(back_url, store.Type.occurrence, swhrepo.get_occurrences()) # and releases (the idea here is that compared to existing # other objects, the quantity is less) http.put(back_url, store.Type.release, swhrepo.get_releases()) # FIXME: deal with collision failures which should be raised by backend. diff --git a/swh/tests/test_http.py b/swh/tests/test_http.py index 8140f15..b87c2d6 100644 --- a/swh/tests/test_http.py +++ b/swh/tests/test_http.py @@ -1,40 +1,41 @@ # Copyright (C) 2015 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information import unittest from nose.tools import istest from swh.client import http from swh.storage import store class TestHttp(unittest.TestCase): @istest def url(self): # when s = http.compute_simple_url('http://base-url', '/end') # then assert s == 'http://base-url/end' @istest def url_lookup_per_type(self): # then assert http.url_lookup_per_type == { store.Type.origin: "/origins/" , store.Type.content: "/vcs/contents/" , store.Type.directory: "/vcs/directories/" , store.Type.revision: "/vcs/revisions/" } @istest def url_store_per_type(self): # then assert http.url_store_per_type == { store.Type.origin: "/origins/" , store.Type.content: "/vcs/contents/" , store.Type.directory: "/vcs/directories/" , store.Type.revision: "/vcs/revisions/" , store.Type.release: "/vcs/releases/" , store.Type.occurrence: "/vcs/occurrences/" + , store.Type.person: "/vcs/persons/" }