diff --git a/swh/fetcher/googlecode/fetcher.py b/swh/fetcher/googlecode/fetcher.py
index 5849d03..6504e8f 100644
--- a/swh/fetcher/googlecode/fetcher.py
+++ b/swh/fetcher/googlecode/fetcher.py
@@ -1,196 +1,183 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import hashlib
 import logging
 import os
 import requests
 
 from swh.core import config, hashutil
 
-from .utils import transform
+from .utils import transform, load_meta
 from .hashutil import md5_hash, md5_from_b64
 
 
 class SWHGoogleArchiveFetcher(config.SWHConfig):
     """A Google Code archive fetcher.
 
     This fetcher will:
     - retrieve the archive metadata and write it to disk
     - download the archive and write it to disk
     - check that the size and checksums (md5, crc32c) match those
       described in the metadata
 
     """
     def __init__(self):
         self.log = logging.getLogger(
             'swh.fetcher.google.SWHGoogleArchiveFetcher')
 
         l = logging.getLogger('requests.packages.urllib3.connectionpool')
         l.setLevel(logging.WARN)
 
-    def load_meta(self, filepath):
-        """Try and load the metadata from the given filepath.
-        It is assumed that the code is called after checking the file
-        exists.
-
-        """
-        import json
-        try:
-            with open(filepath, 'r') as f:
-                return json.loads(f.read())
-        except:
-            return None
-
     def retrieve_source_meta(self, url_meta, filepath_meta):
         if os.path.exists(filepath_meta):
-            meta = self.load_meta(filepath_meta)
+            meta = load_meta(filepath_meta)
             # the metadata file on disk can be corrupted, so we try to
             # load it and, if that fails, fetch it again
             if meta:
                 return meta
 
         meta = {}
         try:
             r = requests.get(url_meta)
         except Exception as e:
             msg = 'Problem when fetching metadata %s.' % url_meta
             self.log.error(msg)
             raise ValueError(msg, e)
         else:
             meta = r.json()
             with open(filepath_meta, 'w') as f:
                 f.write(r.text)
 
         return meta
 
     def write_and_check(self, filepath, response, md5h):
         """Write the response's stream content to filepath, computing the
         md5 hash on the fly, and check that it matches md5h.
 
         Returns:
             True if the write succeeded and the computed md5 matches the
             content, False otherwise.
 
         """
         h = hashlib.md5()
         with open(filepath, 'wb') as f:
             for chunk in response.iter_content(hashutil.HASH_BLOCK_SIZE):
                 f.write(chunk)
                 h.update(chunk)
 
         return md5h == h.digest()
 
     def retrieve_source(self, archive_gs, meta, filepath):
         url = meta['mediaLink']
         if not os.path.exists(filepath):
             self.log.debug('Fetching %s\'s raw data.' % url)
             try:
                 r = requests.get(url, stream=True)
             except Exception as e:
                 msg = 'Problem when fetching archive %s from url %s.' % (
                     archive_gs, url)
                 self.log.error(msg)
                 raise ValueError(msg, e)
             else:
                 if not r.ok:
                     msg = 'Problem when fetching archive %s from url %s.' % (
                         archive_gs, url)
                     self.log.error(msg)
                     raise ValueError(msg)
 
                 return self.write_and_check(
                     filepath, r, md5_from_b64(meta['md5Hash']))
 
     def check_source_ok(self, meta, filepath, with_md5=False):
         expected_size = int(meta['size'])
         ok = True
 
         actual_size = os.path.getsize(filepath)
         if actual_size != expected_size:
             msg = 'Bad size. Expected: %s. Got: %s' % (
                 expected_size, actual_size)
             self.log.error(msg)
             ok = False
 
         if with_md5:
             expected_md5 = md5_from_b64(meta['md5Hash'])
             self.log.debug('Checking %s\'s raw data checksums and size.' %
                            filepath)
             # Last, check that the content matches the metadata
             with open(filepath, 'rb') as f:
                 md5_h = md5_hash(f)
             if md5_h != expected_md5:
                 msg = 'Bad md5 signature. Expected: %s. Got: %s' % (
                     expected_md5, md5_h)
                 self.log.error(msg)
                 ok = False
 
         return ok
 
     def process(self, archive_gs, destination_rootpath):
         self.log.info('Fetch %s\'s metadata' % archive_gs)
 
         # First, retrieve the archive gs's metadata
         data = transform(archive_gs)
         parent_dir = data['parent_dir']
         filename = data['filename']
         url_project_archive_meta = data['url_project_archive_meta']
         url_project_meta = data['url_project_meta']
 
         parent_dir = os.path.join(destination_rootpath, parent_dir)
         os.makedirs(parent_dir, exist_ok=True)
 
         project_name = os.path.basename(parent_dir)
         filename = project_name + '-' + filename
         filename_meta = filename + '.json'
 
         filepath = os.path.join(parent_dir, filename)
         filepath_meta = os.path.join(parent_dir, filename_meta)
         filepath_project_meta = os.path.join(parent_dir, 'project.json')
 
         meta = self.retrieve_source_meta(url_project_archive_meta,
                                          filepath_meta)
         if not meta:
             raise ValueError(
                 'Failed to download archive source metadata, stopping.')
 
         project_meta = self.retrieve_source_meta(url_project_meta,
                                                  filepath_project_meta)
         if not project_meta:
             raise ValueError('Failed to download project metadata, stopping.')
 
         filepath_corrupted = filepath + '.corrupted'
 
         # Second, check whether the file already exists on disk
         if os.path.exists(filepath):
             # it already exists, check that it is sound
             checks_ok = self.check_source_ok(meta, filepath, with_md5=True)
             if checks_ok:
                 # it is sound, we are done
                 self.log.info('Archive %s already fetched!' % archive_gs)
                 return
 
             self.log.error('Clean corrupted file %s' % filepath)
             os.remove(filepath)
             if os.path.exists(filepath_corrupted):
                 os.remove(filepath_corrupted)
 
         # the file does not exist (any more), retrieve it
         checks_ok = self.retrieve_source(archive_gs, meta, filepath)
 
         # Third, check the retrieved source
         if checks_ok and self.check_source_ok(meta, filepath):
             self.log.info('Archive %s fetched.' % archive_gs)
             return
 
         # Trouble: rename the corrupted file out of the way
         if os.path.exists(filepath):
             self.log.error('Rename corrupted file %s to %s' % (
                 os.path.basename(filepath),
                 os.path.basename(filepath_corrupted)))
             os.rename(filepath, filepath_corrupted)
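
Note on the checksum helpers: `md5_hash` and `md5_from_b64` come from the
local `.hashutil` module, which this diff does not touch. For review context,
here is a minimal sketch of what they are assumed to do. Google Cloud
Storage's `md5Hash` metadata field is the base64 encoding of the raw md5
digest, so it has to be decoded to bytes before comparing with hashlib's
digest(); the block size and signatures below are assumptions, not the actual
implementation.

# Sketch only: the real implementations live in
# swh/fetcher/googlecode/hashutil.py, which is not part of this diff.
import base64
import hashlib


def md5_from_b64(b64_md5):
    """Decode a GCS md5Hash metadata value (base64 of the raw md5
    digest) into digest bytes comparable with hashlib's digest()."""
    return base64.b64decode(b64_md5)


def md5_hash(f, block_size=65536):
    """Compute the raw md5 digest of an opened binary file, reading it
    block by block so large archives are not loaded into memory at once."""
    h = hashlib.md5()
    for chunk in iter(lambda: f.read(block_size), b''):
        h.update(chunk)
    return h.digest()
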
diff --git a/swh/fetcher/googlecode/utils.py b/swh/fetcher/googlecode/utils.py
index 26eca8b..9eacf23 100644
--- a/swh/fetcher/googlecode/utils.py
+++ b/swh/fetcher/googlecode/utils.py
@@ -1,48 +1,64 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import os
+import json
 
 
 def compute_destination_folder(path):
     """Given a path, compute the destination folder to which the remote
     files are downloaded.
 
     """
     parent_dir = os.path.dirname(path)
     project_name = os.path.basename(parent_dir)
     parent_ddir = os.path.dirname(parent_dir)
     return os.path.join(parent_ddir, project_name[0], project_name)
 
 
 prefix_source_url_api = 'https://www.googleapis.com/storage/v1/b/google-code-archive-source/o'  # noqa
 prefix_project_meta = 'https://storage.googleapis.com/google-code-archive'
 
 
 def transform(url_gs):
     """Transform an input gs:// url into a dictionary of information
     about the archive.
 
     Returns:
         Dict with the following keys:
         - parent_dir: destination folder
         - filename: the archive's filename
         - url_project_archive_meta: archive metadata url to fetch
         - url_project_meta: project metadata url to fetch
 
     """
     url_gs = url_gs.replace('gs://google-code-archive-source/', '')
     filename = os.path.basename(url_gs)
     project_name = os.path.dirname(url_gs)
     url_meta = '%s/%s' % (prefix_source_url_api,
                           url_gs.replace('/', '%2F'))
     url_project_meta = '%s/%s/project.json' % (prefix_project_meta,
                                                project_name)
 
     return {
         'parent_dir': compute_destination_folder(url_gs),
         'filename': filename,
         'url_project_archive_meta': url_meta,
         'url_project_meta': url_project_meta
     }
+
+
+def load_meta(filepath):
+    """Load the metadata from the given filepath (a json file).
+    It is assumed the caller has checked that the file exists.
+
+    Returns:
+        Dict of data, or None if any problem is encountered.
+
+    """
+    try:
+        with open(filepath, 'r') as f:
+            return json.load(f)
+    except (OSError, ValueError):
+        return None
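
For reference, tracing transform() on a hypothetical archive path (the
project name below is made up) gives the following, derived directly from
the code above:

from swh.fetcher.googlecode.utils import transform

data = transform(
    'gs://google-code-archive-source/v2/code.google.com/myproject/source-archive.zip')

# data == {
#     'parent_dir': 'v2/code.google.com/m/myproject',
#     'filename': 'source-archive.zip',
#     'url_project_archive_meta':
#         'https://www.googleapis.com/storage/v1/b/google-code-archive-source/o/'
#         'v2%2Fcode.google.com%2Fmyproject%2Fsource-archive.zip',
#     'url_project_meta':
#         'https://storage.googleapis.com/google-code-archive/'
#         'v2/code.google.com/myproject/project.json',
# }

Note that process() then prefixes the filename with the project name, so the
archive would land at
<destination_rootpath>/v2/code.google.com/m/myproject/myproject-source-archive.zip,
with its metadata next to it in myproject-source-archive.zip.json and
project.json.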