diff --git a/org/analysis-errors.org b/org/analysis-errors.org
new file mode 100644
index 0000000..cbd32f9
--- /dev/null
+++ b/org/analysis-errors.org
@@ -0,0 +1,71 @@
+#+title: Errors when fetching archives
+#+author: ardumont
+
+Unfortunately, the error log does not record the initial url; it gives only the filename or the computed url, so we need to fetch back those errors and recompute the initial url...
+
+* Logs
+
+Retrieve the errors from the swh-log db:
+#+BEGIN_SRC sh
+psql -c "select level, message from log where src_host='worker01.softwareheritage.org' and ts between '2016-04-10 00:00:00.00+01' and '2016-05-01 00:00:00.00+01' and level = 'error';" service=swh-log > in-errors
+#+END_SRC
+
+There are 2 kinds of errors: either corruption (mismatch on the declared size or md5) or download errors.
+
+* Corruption
+
+'Problem when fetching' errors are just download errors:
+#+BEGIN_SRC sh
+grep 'Problem when fetching' in-errors > INDEX-errors-when-fetching
+#+END_SRC
+
+We can also have corrupted files:
+#+BEGIN_SRC sh
+grep corrupted in-errors | awk '{print $11}' > INDEX-corrupted-ones
+#+END_SRC
+
+Those come in 2 kinds:
+- repo
+- source-archive files
+
+** repo
+
+#+BEGIN_SRC sh
+grep "repo" INDEX-corrupted-ones > INDEX-corrupted-repo
+#+END_SRC
+
+Recomputing the initial urls gives:
+#+BEGIN_SRC sh
+for f in $(cat INDEX-corrupted-repo); do
+    f=$(echo $f | sed 's/\(.*\)-repo.*/\1/')
+    grep $f downloaded-google-archive-source
+done >> INDEX-to-reschedule
+#+END_SRC
+
+Note: downloaded-google-archive-source is the file containing the list of initial urls.
+
+** source-archive
+
+#+BEGIN_SRC sh
+grep "source" INDEX-corrupted-ones > INDEX-corrupted-source
+#+END_SRC
+
+Recomputing the initial urls gives:
+#+BEGIN_SRC sh
+for f in $(cat INDEX-corrupted-source); do
+    f=$(echo $f | sed 's/\(.*\)-source-archive.*/\1/')
+    grep $f downloaded-google-archive-source
+done >> INDEX-to-reschedule-2
+#+END_SRC
+
+* Download errors
+
+#+BEGIN_SRC sh
+grep "Problem when fetching file" in-errors | grep -v "ValueError" | awk '{print $10}' > INDEX-errors-when-fetching
+#+END_SRC
+
+#+BEGIN_SRC sh
+for f in $(cat INDEX-errors-when-fetching | sed 's/.*%2F\(.*\)%2F.*/\1/'); do
+    grep $f downloaded-google-archive-source
+done >> INDEX-to-reschedule-3
+#+END_SRC
diff --git a/swh/fetcher/googlecode/loader.py b/swh/fetcher/googlecode/loader.py
index b49a69d..75bb8c3 100644
--- a/swh/fetcher/googlecode/loader.py
+++ b/swh/fetcher/googlecode/loader.py
@@ -1,196 +1,195 @@
 # Copyright (C) 2015-2016 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
 
 import hashlib
 import logging
 import os
 
 import requests
 
 from swh.core import config, hashutil
 
 from .utils import transform
 from .hashutil import md5_hash, md5_from_b64
 
 
 class SWHGoogleFetcher(config.SWHConfig):
     """A swh data fetcher loader.
 
     This fetcher will:
     - retrieve the archive metadata and write it to disk.
     - download the archive and write it to disk.
     - check that size and checksums (md5, crc32c) match those
       described in the metadata.
 
     """
     def __init__(self):
         self.log = logging.getLogger('swh.fetcher.google.SWHGoogleFetcher')
 
         l = logging.getLogger('requests.packages.urllib3.connectionpool')
         l.setLevel(logging.WARN)
 
     def load_meta(self, filepath):
         """Try and load the metadata from the given filepath.
 
         It is assumed that the code is called after checking the file
         exists.
 
""" import json try: with open(filepath, 'r') as f: return json.loads(f.read()) except: return None def retrieve_source_meta(self, url_meta, filepath_meta): if os.path.exists(filepath_meta): meta = self.load_meta(filepath_meta) if meta: # some meta could be corrupted, so we try to load them return meta # and if we fail, we try to fetch them again meta = {} try: r = requests.get(url_meta) except Exception as e: msg = 'Problem when fetching metadata %s.' % url_meta self.log.error(msg) raise ValueError(msg, e) else: meta = r.json() with open(filepath_meta, 'w') as f: f.write(r.text) return meta def write_and_check(self, filepath, response, md5h): """Write the response's stream content to filepath. Compute the hash at the same time and ensure the md5h is the same. Returns: True if everything is ok and the md5 computed match the content. False otherwise """ h = hashlib.md5() with open(filepath, 'wb') as f: for chunk in response.iter_content(hashutil.HASH_BLOCK_SIZE): f.write(chunk) h.update(chunk) return md5h == h.digest() def retrieve_source(self, archive_gs, meta, filepath): url = meta['mediaLink'] if not os.path.exists(filepath): self.log.debug('Fetching %s\' raw data.' % url) try: r = requests.get(url, stream=True) except Exception as e: msg = 'Problem when fetching archive %s from url %s.' % ( archive_gs, url) self.log.error(msg) raise ValueError(msg, e) else: if not r.ok: msg = 'Problem when fetching archive %s from url %s.' % ( archive_gs, url) self.log.error(msg) raise ValueError(msg) return self.write_and_check(filepath, r, md5_from_b64(meta['md5Hash'])) def check_source_ok(self, meta, filepath, with_md5=False): expected_size = int(meta['size']) ok = True actual_size = os.path.getsize(filepath) if actual_size != expected_size: msg = 'Bad size. Expected: %s. Got: %s' % ( expected_size, actual_size) self.log.error(msg) ok = False if with_md5: expected_md5 = md5_from_b64(meta['md5Hash']) self.log.debug('Checking %s\' raw data checksums and size.' % filepath) # Last, check the metadata are ok with open(filepath, 'rb') as f: md5_h = md5_hash(f) if md5_h != expected_md5: msg = 'Bad md5 signature. Expected: %s. Got: %s' % ( expected_md5, md5_h) self.log.error(msg) ok = False return ok def process(self, archive_gs, destination_rootpath): self.log.info('Fetch %s\'s metadata' % archive_gs) # First retrieve the archive gs's metadata data = transform(archive_gs) parent_dir = data['parent_dir'] filename = data['filename'] url_project_archive_meta = data['url_project_archive_meta'] url_project_meta = data['url_project_meta'] parent_dir = os.path.join(destination_rootpath, parent_dir) os.makedirs(parent_dir, exist_ok=True) project_name = os.path.basename(parent_dir) filename = project_name + '-' + filename filename_meta = filename + '.json' filepath = os.path.join(parent_dir, filename) filepath_meta = os.path.join(parent_dir, filename_meta) filepath_project_meta = os.path.join(parent_dir, 'project.json') meta = self.retrieve_source_meta(url_project_archive_meta, filepath_meta) if not meta: raise ValueError('Fail to download archive source metadata, stop.') project_meta = self.retrieve_source_meta(url_project_meta, filepath_project_meta) if not project_meta: raise ValueError('Fail to download project metadata, stop.') # check existence of the file if os.path.exists(filepath): # it already exists, check it's ok checks_ok = self.check_source_ok(meta, filepath, with_md5=True) if checks_ok: # it's ok, we are done self.log.info('Archive %s already fetched!' 
                               % archive_gs)
                 return
 
             self.log.error('Clean corrupted file %s' % filepath)
             os.remove(filepath)
 
         filepath_corrupted = filepath + '.corrupted'
         if os.path.exists(filepath_corrupted):
             os.remove(filepath_corrupted)
 
         # the file does not exist, we retrieve it
         checks_ok = self.retrieve_source(archive_gs, meta, filepath)
 
         # Third - Check the retrieved source
         if checks_ok and self.check_source_ok(meta, filepath):
             self.log.info('Archive %s fetched.' % archive_gs)
             return
 
         # Trouble, we rename the corrupted file
         if os.path.exists(filepath):
-            filepath_corrupted = filepath + '.corrupted'
             self.log.error('Rename corrupted file %s to %s' % (
                 os.path.basename(filepath),
                 os.path.basename(filepath_corrupted)))
             os.rename(filepath, filepath_corrupted)
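
The checksum helpers md5_hash and md5_from_b64 used by the loader come from .hashutil, which is not included in this diff. A minimal sketch of what they presumably do, assuming the Google Storage md5Hash metadata field is the base64-encoded raw MD5 digest (which is why write_and_check can compare it directly with hashlib's digest()); names and block size simply mirror the loader's usage:

#+BEGIN_SRC python
# Hypothetical sketch of the .hashutil helpers -- the real module is not
# shown in this diff.
import base64
import hashlib

from swh.core import hashutil


def md5_from_b64(b64_md5):
    """Decode the base64-encoded md5Hash field from the Google Storage
    metadata into raw digest bytes, comparable to hashlib's digest()."""
    return base64.b64decode(b64_md5)


def md5_hash(f):
    """Compute the md5 digest of an already opened binary file, reading
    it block by block to avoid loading whole archives in memory."""
    h = hashlib.md5()
    for chunk in iter(lambda: f.read(hashutil.HASH_BLOCK_SIZE), b''):
        h.update(chunk)
    return h.digest()
#+END_SRC

check_source_ok then only has to compare the two digests byte for byte.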