diff --git a/README b/README
index 16ed593..10c6f57 100644
--- a/README
+++ b/README
@@ -1,57 +1,81 @@
swh-fetcher-googlecode
======================

This fetcher:

- parses a gs:// url and transforms it according to the email's rule (see the email and the sketch below)
- derives the url of the file's metadata (mediaLink, length, crc32c, md5Hash, etc.)
- writes that metadata file to disk
- derives the actual content from the mediaLink entry (exactly the url described above)
- writes that content to disk
- checks that the content file's crc32c, md5 and length match the ones described in the metadata file
- flags the file as corrupted if they do not

```
Date: Fri, 8 Apr 2016 13:25:41 -0700
From: Chris Smith
To: Roberto Di Cosmo
Cc: Stefano Zacchiroli
Subject: Re: Archiving the sources from Google Code into Software Heritage
Message-ID:

You can get the list of all files stored in Google Cloud Storage, which
powers the Google Code Archive, here:

https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip
https://storage.googleapis.com/google-code-archive/google-code-archive-source.txt.zip
https://storage.googleapis.com/google-code-archive/google-code-archive-downloads.txt.zip

Just download and unzip the files. They contain all the Google Cloud
Storage object names in each bucket. From there you will need to just
download the actual files via a basic conversion. For example, with the
Google Cloud Storage URL
gs://google-code-archive/v2/code.google.com/hg4j/project.json, you can
get the file's contents by URL-escaping the string and adding it to
googleapis.com, e.g.
https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2%2Fcode.google.com%2Fhg4j%2Fproject.json?alt=media.
The "?alt=media" part gets the object's contents, not the metadata.

You probably only care about the google-code-archive-source bucket,
since that is where we keep tarballs of git, hg, and the new svn dumps.
But if you were interested in poking around the project metadata
(e.g. issues) the schema is here.

If you run into any trouble let me know. I'll be able to look into any
missing or corrupt repositories for the next couple months. After that
time we will shut down the Google Code DVCS backends, and only the
Google Code Archive snapshot will remain. (So you resurrecting these
projects might ferret out any problems with my data.)

Cheers,
-Chris
```

Note: this implements only what's described above, and only for the source archive.
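For reference, a minimal sketch of the conversion rule described in the email. The helper name here is hypothetical; the fetcher's actual `transform` helper lives in swh/fetcher/googlecode/utils.py and is not shown in this diff:

```
from urllib.parse import quote


def gs_to_urls(gs_url):
    """Map a gs://<bucket>/<name> url to the googleapis.com metadata url
    and the '?alt=media' content url described in the email above."""
    bucket, name = gs_url[len('gs://'):].split('/', 1)
    # The object name must be url-escaped, its '/' separators included.
    escaped_name = quote(name, safe='')
    url_meta = 'https://www.googleapis.com/storage/v1/b/%s/o/%s' % (
        bucket, escaped_name)
    return url_meta, url_meta + '?alt=media'


# >>> gs_to_urls('gs://google-code-archive/v2/code.google.com/hg4j/project.json')[1]
# 'https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2%2Fcode.google.com%2Fhg4j%2Fproject.json?alt=media'
```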
+
+
+# Metadata sample:
+
+```
+{
+    "kind": "storage#object",
+    "id": "google-code-archive-source/v2/code.google.com/hg4j/source-archive.zip/1455746620701000",
+    "selfLink": "https://www.googleapis.com/storage/v1/b/google-code-archive-source/o/v2%2Fcode.google.com%2Fhg4j%2Fsource-archive.zip",
+    "name": "v2/code.google.com/hg4j/source-archive.zip",
+    "bucket": "google-code-archive-source",
+    "generation": "1455746620701000",
+    "metageneration": "1",
+    "contentType": "application/octet-stream",
+    "timeCreated": "2016-02-17T22:03:40.698Z",
+    "updated": "2016-02-17T22:03:40.698Z",
+    "storageClass": "NEARLINE",
+    "size": "4655405",
+    "md5Hash": "FaIRjuSDe4v51H1+sRuggQ==",
+    "mediaLink": "https://www.googleapis.com/download/storage/v1/b/google-code-archive-source/o/v2%2Fcode.google.com%2Fhg4j%2Fsource-archive.zip?generation=1455746620701000&alt=media",
+    "crc32c": "PNKIqA==",
+    "etag": "CMjy1uHm/8oCEAE="
+}
+```
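Note that `md5Hash` and `crc32c` in this metadata are base64-encoded digests, not hex strings. A minimal sketch of the decoding step, assuming hex digests are what gets compared; the real `md5_from_b64`/`crc32c_from_b64` helpers live in swh/fetcher/googlecode/hashutil.py, outside this diff:

```
import base64


def digest_from_b64(b64_digest):
    """Decode a base64-encoded digest (md5Hash, crc32c) to a hex string."""
    return base64.b64decode(b64_digest).hex()


# >>> digest_from_b64('FaIRjuSDe4v51H1+sRuggQ==')
# '15a2118ee4837b8bf9d47d7eb11ba081'
```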
diff --git a/swh/fetcher/googlecode/loader.py b/swh/fetcher/googlecode/loader.py
index 7314781..aef93c0 100644
--- a/swh/fetcher/googlecode/loader.py
+++ b/swh/fetcher/googlecode/loader.py
@@ -1,181 +1,160 @@
# Copyright (C) 2015-2016  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

import json
import logging
import os

import requests

from swh.core import config, hashutil

from .utils import transform
from .hashutil import crc32c_hash, md5_hash, md5_from_b64, crc32c_from_b64

-# sample meta:
-# {
-#     "kind": "storage#object",  # noqa
-#     "id": "google-code-archive-source/v2/code.google.com/hg4j/source-archive.zip/1455746620701000",  # noqa
-#     "selfLink": "https://www.googleapis.com/storage/v1/b/google-code-archive-source/o/v2%2Fcode.google.com%2Fhg4j%2Fsource-archive.zip",  # noqa
-#     "name": "v2/code.google.com/hg4j/source-archive.zip",  # noqa
-#     "bucket": "google-code-archive-source",  # noqa
-#     "generation": "1455746620701000",  # noqa
-#     "metageneration": "1",  # noqa
-#     "contentType": "application/octet-stream",  # noqa
-#     "timeCreated": "2016-02-17T22:03:40.698Z",  # noqa
-#     "updated": "2016-02-17T22:03:40.698Z",  # noqa
-#     "storageClass": "NEARLINE",  # noqa
-#     "size": "4655405",  # noqa
-#     "md5Hash": "FaIRjuSDe4v51H1+sRuggQ==",  # noqa
-#     "mediaLink": "https://www.googleapis.com/download/storage/v1/b/google-code-archive-source/o/v2%2Fcode.google.com%2Fhg4j%2Fsource-archive.zip?generation=1455746620701000&alt=media",  # noqa
-#     "crc32c": "PNKIqA==",  # noqa
-#     "etag": "CMjy1uHm/8oCEAE="  # noqa
-# }
-
-

class SWHGoogleFetcher(config.SWHConfig):
    """A swh data fetcher loader.

    This fetcher will:
    - retrieve the archive's metadata and write it to disk.
    - download the archive and write it to disk.
    - check that the size and checksums (md5, crc32c) match those
      described in the metadata.

    """
    def __init__(self):
        self.log = logging.getLogger('swh.fetcher.google.SWHGoogleFetcher')
        l = logging.getLogger('requests.packages.urllib3.connectionpool')
        l.setLevel(logging.WARN)

    def retrieve_source_meta(self, url_meta, filepath_meta):
        # Reuse the metadata file if a previous run already wrote it.
        if os.path.exists(filepath_meta):
            with open(filepath_meta, 'r') as f:
                meta = json.loads(f.read())
        else:
            meta = {}
            try:
                r = requests.get(url_meta)
            except Exception as e:
                msg = 'Problem when fetching metadata %s.' % url_meta
                self.log.error(msg)
                raise ValueError(msg, e)
            else:
                if not r.ok:
                    msg = 'Problem when fetching metadata %s.' % url_meta
                    self.log.error(msg)
                    raise ValueError(msg)
                meta = r.json()
                with open(filepath_meta, 'w') as f:
                    f.write(r.text)

        return meta

    def retrieve_source(self, url, filepath):
        if not os.path.exists(filepath):
            self.log.debug('Fetching %s\' raw data.' % url)
            try:
                # Stream the download to avoid holding the whole archive
                # in memory.
                r = requests.get(url, stream=True)
            except Exception as e:
                msg = 'Problem when fetching file %s.' % url
                self.log.error(msg)
                raise ValueError(msg, e)
            else:
                if not r.ok:
                    msg = 'Problem when fetching file %s.' % url
                    self.log.error(msg)
                    raise ValueError(msg)
                else:
                    with open(filepath, 'wb') as f:
                        for chunk in r.iter_content(hashutil.HASH_BLOCK_SIZE):
                            f.write(chunk)

    def check_source(self, meta, filepath):
        """Check filepath's size, md5 and crc32c against meta.

        Returns True if at least one check failed, False otherwise.

        """
        self.log.debug('Checking %s\' raw data checksums and size.' %
                       filepath)

        expected = {
            'crc32c': crc32c_from_b64(meta['crc32c']),
            'md5': md5_from_b64(meta['md5Hash']),
            'size': int(meta['size'])
        }

        error = False

        actual_size = os.path.getsize(filepath)
        if actual_size != expected['size']:
            msg = 'Bad size. Expected: %s. Got: %s' % (
                expected['size'], actual_size)
            self.log.error(msg)
            error = True

        # Then check that the checksums match the metadata
        with open(filepath, 'rb') as f:
            md5_h = md5_hash(f)
            if md5_h != expected['md5']:
                msg = 'Bad md5 signature. Expected: %s. Got: %s' % (
                    expected['md5'], md5_h)
                self.log.error(msg)
                error = True

            f.seek(0)
            crc32c_h = crc32c_hash(f)
            if expected['crc32c'] != crc32c_h:
                msg = 'Bad crc32c signature. Expected: %s. Got: %s' % (
                    expected['crc32c'], crc32c_h)
                self.log.error(msg)
                error = True

        return error

    def process(self, archive_gs, destination_rootpath):
        self.log.info('Fetch %s\'s metadata' % archive_gs)

        # First, derive the paths and urls from the gs:// url and
        # retrieve the archive's metadata
        parent_dir, filename, url_meta, url_content = transform(archive_gs)

        parent_dir = os.path.join(destination_rootpath, parent_dir)
        os.makedirs(parent_dir, exist_ok=True)

        project_name = os.path.basename(parent_dir)
        filename = project_name + '-' + filename
        filename_meta = filename + '.json'

        filepath = os.path.join(parent_dir, filename)
        filepath_meta = os.path.join(parent_dir, filename_meta)

        meta = self.retrieve_source_meta(url_meta, filepath_meta)
        if not meta:
            raise ValueError('Failed to download metadata, stopping.')

        # Second, if the file already exists, check whether it is sound
        if os.path.exists(filepath):
            errors = self.check_source(meta, filepath)
            if errors:
                self.log.error('Clean corrupted file %s' % filepath)
                os.remove(filepath)
            else:
                # it's ok, we are done!
                self.log.info('Archive %s already fetched!' % archive_gs)
                return

        # The file does not exist (or was corrupted), retrieve it
        self.retrieve_source(meta['mediaLink'], filepath)

        # Third, check the retrieved source
        errors = self.check_source(meta, filepath)
        if errors:
            if os.path.exists(filepath):
                filepath_corrupted = filepath + '.corrupted'
                self.log.error('Rename corrupted file %s to %s' % (
                    os.path.basename(filepath),
                    os.path.basename(filepath_corrupted)))
                os.rename(filepath, filepath_corrupted)
        else:
            self.log.info('Archive %s fetched.' % archive_gs)
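For context, a hypothetical driver for the loader above, assuming the source bucket listing from the email has been downloaded and unzipped, with one gs:// url per line as expected by `transform`; the listing filename and destination directory are illustrative:

```
import logging

from swh.fetcher.googlecode.loader import SWHGoogleFetcher

fetcher = SWHGoogleFetcher()
with open('google-code-archive-source.txt') as listing:
    for line in listing:
        gs_url = line.strip()  # assumed: one gs:// url per line
        if not gs_url:
            continue
        try:
            fetcher.process(gs_url, '/srv/storage/google-code')
        except ValueError:
            # process() raises ValueError on fetch problems; keep going.
            logging.exception('Skipping %s' % gs_url)
```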