# Copyright (C) 2015-2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import sys
import os
import requests

from . import utils


def _fetch_json_field(url, field, default=''):
    """Fetch `url` and return the value stored under `field` in its JSON body.

    This is a best-effort lookup: a network failure, a non-2xx status, a
    body that is not valid JSON, or a JSON document without `field` all
    yield `default`, so that one bad project does not abort the whole run.

    Args:
        url: URL to GET.
        field: key to read from the decoded JSON object.
        default: value returned when the field cannot be retrieved.

    Returns:
        The value of `field`, or `default` on any failure listed above.

    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.json()[field]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Deliberately narrower than a bare `except:` so that
        # KeyboardInterrupt/SystemExit still stop the script.
        return default


def main():
    """Read gs:// archive urls on stdin and print one summary line each.

    For every non-empty input line, prints
    ``<project>-<archive filename>.json <archive size> <repo type>``,
    where size and repo type are empty strings when they could not be
    fetched.

    """
    for archive in sys.stdin:
        archive_gs = archive.rstrip()
        if not archive_gs:
            # A blank line has no project component to work with.
            continue

        data = utils.transform(archive_gs)
        parent_dir = data['parent_dir']

        # The destination folder is still created here even though this
        # script only prints; nothing below writes into it.
        os.makedirs(parent_dir, exist_ok=True)

        project_name = os.path.basename(parent_dir)
        filename = project_name + '-' + data['filename'] + '.json'

        archive_size = _fetch_json_field(
            data['url_project_archive_meta'], 'size')
        repo_type = _fetch_json_field(
            data['url_project_meta'], 'repoType')

        print(filename, archive_size, repo_type)


if __name__ == '__main__':
    main()
# Copyright (C) 2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import unittest

from nose.tools import istest

from swh.fetcher.googlecode.utils import transform


class TestUtils(unittest.TestCase):
    """Unit tests for swh.fetcher.googlecode.utils."""

    @istest
    def transform(self):
        """transform() maps a gs:// archive url to its folder and urls."""
        # Inside the method body, `transform` resolves to the imported
        # module-level function, not to this method.
        url_gs = 'gs://google-code-archive-source/v2/apache-extras.org/abhishsinha-cassandra-jdbc-source/source-archive.zip'  # noqa

        actual_gs_transformed = transform(url_gs)

        # assertEqual: the `assertEquals` alias is deprecated and no
        # longer exists on recent Python versions.
        self.assertEqual(
            actual_gs_transformed, {
                'parent_dir':
                'v2/apache-extras.org/a/abhishsinha-cassandra-jdbc-source',

                'filename':
                'source-archive.zip',

                'url_project_archive_meta':
                'https://www.googleapis.com/storage/v1/b/google-code-archive-source/o/v2%2Fapache-extras.org%2Fabhishsinha-cassandra-jdbc-source%2Fsource-archive.zip',  # noqa

                'url_project_meta':
                'https://storage.googleapis.com/google-code-archive/v2/apache-extras.org/abhishsinha-cassandra-jdbc-source/project.json'  # noqa
            })
import os
from urllib.parse import quote


def compute_destination_folder(path):
    """Compute the destination folder in which to download a remote file.

    The folder is the parent directory of `path`, with an extra level
    named after the first character of the project name inserted just
    above the project directory, e.g. ``a/b/proj/file`` gives
    ``a/b/p/proj``.

    Args:
        path: path of the remote file; its parent directory is taken to
            be the project directory.

    Returns:
        The destination folder, as a string.

    Raises:
        IndexError: if `path` has no parent directory component, so the
            project name is empty.

    """
    parent_dir = os.path.dirname(path)
    project_name = os.path.basename(parent_dir)
    parent_ddir = os.path.dirname(parent_dir)
    return os.path.join(parent_ddir, project_name[0], project_name)


prefix_source_url_api = 'https://www.googleapis.com/storage/v1/b/google-code-archive-source/o'  # noqa
prefix_project_meta = 'https://storage.googleapis.com/google-code-archive'

_prefix_gs_source = 'gs://google-code-archive-source/'


def transform(url_gs):
    """Transform an input gs:// url into the information needed to fetch it.

    Args:
        url_gs: archive url of the form
            ``gs://google-code-archive-source/<project path>/<filename>``.

    Returns:
        Dict with the following keys:
        - parent_dir: destination folder (see compute_destination_folder)
        - filename: basename of the archive
        - url_project_archive_meta: url of the archive's metadata
        - url_project_meta: url of the project's ``project.json``

    Raises:
        IndexError: propagated from compute_destination_folder when the
            url has no project directory component.

    """
    # Only strip the bucket prefix where it belongs: at the start.
    if url_gs.startswith(_prefix_gs_source):
        object_path = url_gs[len(_prefix_gs_source):]
    else:
        object_path = url_gs

    filename = os.path.basename(object_path)
    project_path = os.path.dirname(object_path)

    # The object name is a single url path segment here, so every reserved
    # character (not only '/') must be percent-encoded.
    url_meta = '%s/%s' % (prefix_source_url_api, quote(object_path, safe=''))

    # Path-style url: keep '/' as the separator, encode everything else.
    url_project_meta = '%s/%s/project.json' % (
        prefix_project_meta, quote(project_path, safe='/'))

    return {
        'parent_dir': compute_destination_folder(object_path),
        'filename': filename,
        'url_project_archive_meta': url_meta,
        'url_project_meta': url_project_meta,
    }