diff --git a/ardumont/sentry/analyse_hash_collision.py b/ardumont/sentry/analyse_hash_collision.py
index d9e0469..ab4ca55 100644
--- a/ardumont/sentry/analyse_hash_collision.py
+++ b/ardumont/sentry/analyse_hash_collision.py
@@ -1,156 +1,199 @@
 # Copyright (C) 2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 # use:
 # python -m analyse_hash_collision \
 #     --data-file hash-collisions.json \
 #     | jq . > summary-collisions.json

 import ast
 import json

 from collections import defaultdict
 from typing import Any, Dict, List, Tuple

 import click

 from swh.model.hashutil import hash_to_hex, DEFAULT_ALGORITHMS
+from swh.storage import get_storage as get_swhstorage
+
+
+storage = None
+
+
+def get_storage():
+    global storage
+    if not storage:
+        storage = get_swhstorage(
+            cls='remote', url='http://uffizi.internal.softwareheritage.org:5002')
+
+    return storage


 def import_data(f):
     return json.loads(open(f).read())


 def content_get_metadata(
         content_ids: List[bytes]) -> Dict[bytes, Dict[str, Any]]:
     """Retrieve contents from the storage

     """
-    from swh.storage import get_storage
-    storage = get_storage(
-        cls='remote', url='http://uffizi.internal.softwareheritage.org:5002')
-    contents = storage.content_get_metadata(content_ids)
+    contents = get_storage().content_get_metadata(content_ids)
     result = {}
     for hash_id, all_contents in contents.items():
         count = len(all_contents)
         if count > 1:
             click.echo(f'hash_id {hash_id} has multiple entries: {count}')
         # to ease comparison:
         # - take only 1 of the contents (most cases, I guess)
         # - drop the length
         hashes = all_contents[0]
         hashes.pop('length', None)
         result[hash_id] = hashes
     return result


-def content_hex_hashes(content: Dict[str, bytes]) -> Dict[str, str]:
-    """Convert bytes hashes into hex hashes. Also "enforce" the key order (not an
-    OrderedDict though but that seems enough for json dumps).
+def content_find(content: Dict[str, bytes]) -> Dict[str, bytes]:
+    """Retrieve content from the storage

     """
-    return {
-        algo: hash_to_hex(content[algo]) for algo in DEFAULT_ALGORITHMS
-    }
+    c = get_storage().content_find(content)
+    return c[0]
+
+
+def content_hex_hashes(
+        content: Dict[str, bytes], with_details=False) -> Dict[str, str]:
+    """Convert bytes hashes into hex hashes.
+ + """ + c = content.copy() + for algo in DEFAULT_ALGORITHMS: + c[algo] = hash_to_hex(content[algo]) + ctime = c.get('ctime') + if ctime: + c['ctime'] = ctime.isoformat() + return c + + +def content_equal(content0: Dict, content1: Dict) -> bool: + """Check if content are equals solely comparing their hashes + + """ + for algo in DEFAULT_ALGORITHMS: + if content0[algo] != content1[algo]: + return False + return True def compute_diff_hashes( content0: Dict[str, str], content1: Dict[str, str]) -> Tuple[ bool, Dict[str, str]]: """Compute the specific different between content """ falsy = False diff_hashes = {} for algo in DEFAULT_ALGORITHMS: hash0 = content0[algo] hash1 = content1[algo] if hash0 != hash1: diff_hashes[algo] = [hash0, hash1] # different length is a smell for falsy collisions falsy = len(hash0) != len(hash1) return falsy, diff_hashes @click.command() @click.option('--data-file', default='hash-collision-all-sentry-id-1438.json') def main(data_file): data = import_data(data_file) # how many collisions skipped due to incomplete message summary_skipped = 0 # how many collisions summary_count = defaultdict(int) # one hash ends up with multiple collisions detailed_collisions = defaultdict(list) count = 0 for entry_id, entry in data.items(): message = entry['message'] count += 1 if message.endswith('...'): # TOOD: Find a way to retrieve the full message # because it can't be parsed for now summary_skipped += 1 # incomplete message, skipping for now continue + date_created = entry['date-created'] msg: Tuple[str, bytes, Dict[str, bytes]] = ast.literal_eval(message) algo, hash_id, colliding_contents = msg # Asserting we only have sha1 collisions so far assert algo == 'sha1' summary_count[hash_id] += 1 # take only 1 content, on previous iteration, the list was multiple # occurences of the same hash - # TODO: ensure it remains true - detailed_collisions[hash_id] = colliding_contents[0] + assert len(colliding_contents) == 1 + sentry_content = colliding_contents[0] + sentry_content['date-reported-by-sentry'] = date_created + detailed_collisions[hash_id] = sentry_content # Retrieve the contents from storage to compare full_contents = content_get_metadata(list(summary_count.keys())) count_collisions = 0 count_falsy_collisions = 0 collisions = {} falsy_collisions = {} for hash_id, stored_content in full_contents.items(): - collision_content = content_hex_hashes(detailed_collisions[hash_id]) - stored_content = content_hex_hashes(stored_content) - - if collision_content != stored_content: - falsy, diff_hashes = compute_diff_hashes( - stored_content, collision_content) - hex_hash_id = hash_to_hex(hash_id) - if falsy: - count_falsy_collisions += 1 - falsy_collisions[hex_hash_id] = [ - ('stored-cnt', stored_content), - ('sentry-cnt', collision_content), - ('difference', diff_hashes) - ] - else: - count_collisions += 1 - collisions[hex_hash_id] = [ - ('stored-cnt', stored_content), - ('sentry-cnt', collision_content), - ('difference', diff_hashes) - ] + collision_content_hhashes = content_hex_hashes( + detailed_collisions[hash_id]) + stored_content_hhashes = content_hex_hashes(stored_content) + + if content_equal(collision_content_hhashes, stored_content_hhashes): + continue + + falsy, diff_hashes = compute_diff_hashes( + stored_content_hhashes, collision_content_hhashes) + + hex_hash_id = hash_to_hex(hash_id) + if falsy: + count_falsy_collisions += 1 + # we want the ctime + stored_content_hhashes = content_hex_hashes( + content_find(stored_content)) + + falsy_collisions[hex_hash_id] = [ + 
+                ('stored-cnt', stored_content_hhashes),
+                ('sentry-cnt', collision_content_hhashes),
+                ('difference', diff_hashes)
+            ]
+        else:
+            count_collisions += 1
+            collisions[hex_hash_id] = [
+                ('stored-cnt', stored_content_hhashes),
+                ('sentry-cnt', collision_content_hhashes),
+                ('difference', diff_hashes)
+            ]

     summary = {
-        'total-collisions-raises-in-sentry': count,
-        'total-collisions-on-sha1': count_collisions,
-        'total-falsy-collisions-on-sha1': count_falsy_collisions,
+        'total-collisions-raised-in-sentry': count,
+        'total-collisions': count_collisions,
+        'total-falsy-collisions': count_falsy_collisions,
         'detailed-collisions': collisions,
-        'detailed-falsy-collision': falsy_collisions,
+        'detailed-falsy-collisions': falsy_collisions,
     }

     click.echo(json.dumps(summary))


 if __name__ == '__main__':
     main()
diff --git a/ardumont/sentry/sentry.py b/ardumont/sentry/sentry.py
index febbb03..fc9675a 100644
--- a/ardumont/sentry/sentry.py
+++ b/ardumont/sentry/sentry.py
@@ -1,204 +1,205 @@
 # Copyright (C) 2020 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information

 import json
 import logging

 import click
 import requests

 from typing import Any, Dict, Optional, Iterable


 logger = logging.getLogger(__name__)


 SENTRY_URL = 'https://sentry.softwareheritage.org'
 ORGA_SLUG = 'swh'


 def url_api_project(base_url: str) -> str:
     return f'{base_url}/api/0/projects/'


 def url_api_token(base_url: str) -> str:
     return f'{base_url}/settings/account/api/auth-tokens/'


 def url_project_issues(base_url: str, project_slug: str,
                        short_id: Optional[str] = None) -> str:
     return f'{base_url}/api/0/projects/{ORGA_SLUG}/{project_slug}/issues/'


 def url_issue(base_url: str, issue_id: int) -> str:
     return f'{base_url}/api/0/issues/{issue_id}/'


 def url_issue_events(base_url: str, issue_id: int) -> str:
     return f'{base_url}/api/0/issues/{issue_id}/events/'


 @click.group()
 @click.option('-a', '--api-url', default=SENTRY_URL,
               help='sentry api to use')
 @click.option('-t', '--token', help='Api authentication token')
 @click.pass_context
 def main(ctx, api_url: str, token: str):
     """Allow sentry data manipulation through a click cli

     """
     api_token = url_api_token(api_url)
     if not token:
         raise ValueError(
             f'Missing api token, log in and generate one at {api_token}'
         )
     ctx.ensure_object(dict)
     ctx.obj['token'] = token
     ctx.obj['url'] = {
         'base': api_url,
         'project': url_api_project(api_url),
         'api-token': api_token,
     }


 def query(url, token: Optional[str] = None) -> Dict[str, Any]:
     """Query the sentry api url with the authentication token. This
     returns results one page at a time.

     """
     resp = requests.get(url, headers={
         'Authorization': f'Bearer {token}',
         'content-type': 'application/json'
     })
     if resp.ok:
         logger.debug('resp: %(resp)s', {'resp': resp})
         data = resp.json()
         if 'next' in resp.links:
             next_page = resp.links['next']['url']
         else:
             next_page = None
         return {'data': data, 'next': next_page}
     return {'data': None, 'next': None}


 def query_all(url, token: Optional[str] = None):
     """Query the api, resolving the pagination

     """
     while True:
         data = query(url, token=token)
         if not data['data']:
             break
         yield data['data']
         if not data['next']:
             break
         url = data['next']


 @main.command('project')
 @click.pass_context
 def list_projects(ctx: Dict) -> Dict[str, Any]:
     """List all projects. This returns a mapping from their slug to
     their {id, name}.
""" url = ctx.obj['url']['project'] token = ctx.obj['token'] page_projects = query_all(url, token=token) mappings = {} for projects in page_projects: for project in projects: mappings[project['slug']] = { 'id': project['id'], 'name': project['name'], } click.echo(json.dumps(mappings)) @main.command('issues') @click.option('--project-slug', '-p', required=1, help="Project's slug identifier") @click.pass_context def issues(ctx, project_slug): """List all projects's issues. This returns a mapping from their id to their summary. """ base_url = ctx.obj['url']['base'] token = ctx.obj['token'] url = url_project_issues(base_url, project_slug) data = query_all(url, token=token) mappings = {} for issues in data: for issue in issues: mappings[issue['id']] = { 'short-id': issue['shortId'], 'status': issue['status'], 'metadata': issue['metadata'], } click.echo(json.dumps(mappings)) @main.command('issue') @click.option('--issue-id', '-i', help='Issue id (not the short one listed in ui)') @click.pass_context def issue(ctx, issue_id): """Get detail about a specific issue by its id. """ base_url = ctx.obj['url']['base'] token = ctx.obj['token'] url = url_issue(base_url, issue_id) data = query(url, token=token) issue = data['data'] if data: summary_issue = { 'short-id': issue['shortId'], 'title': issue['title'], 'first-seen': issue['firstSeen'], 'last-seen': issue['lastSeen'], 'count': issue['count'], 'status': issue['status'], 'project': issue['project']['slug'], 'culprit': issue['culprit'], 'metadata': issue['metadata'], } click.echo(json.dumps(summary_issue)) @main.command('events') @click.option('--issue-id', '-i', help='Issue id (not the short one listed in ui)') @click.pass_context def events(ctx, issue_id): """Get detail about a specific issue by its id. """ base_url = ctx.obj['url']['base'] token = ctx.obj['token'] url = url_issue_events(base_url, issue_id) data = query_all(url, token=token) mappings = {} for events in data: for event in events: mappings[event['id']] = { + 'date-created': event['dateCreated'], 'culprit': event['culprit'], 'title': event['title'], 'message': event['message'], 'project-id': event['projectID'], 'group-id': event['groupID'], } click.echo(json.dumps(mappings)) if __name__ == '__main__': # logging.basicConfig(level=logging.DEBUG) main()