diff --git a/ardumont/cran/analysis.py b/ardumont/cran/analysis.py
new file mode 100644
index 0000000..17eee6e
--- /dev/null
+++ b/ardumont/cran/analysis.py
@@ -0,0 +1,158 @@
+# Copyright (C) 2019 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+
+import click
+import datetime
+import gzip
+import json
+import logging
+
+from collections import defaultdict
+from pprint import pprint
+
+logger = logging.getLogger(__name__)
+
+
+def validate_date_pattern(date_text, pattern):
+    """Validate that date_text matches the given pattern (strptime-like)
+
+    Example pattern: '%Y-%m-%d'
+    """
+    valid = True
+    try:
+        datetime.datetime.strptime(date_text, pattern)
+    except ValueError:
+        valid = False
+    return valid
+
+
+def date_field_pattern_repartition(data, date_field='Date'):
+    """Compute the repartition of known date patterns in the data set
+
+    """
+    status_date = defaultdict(int)
+    patterns = [
+        '%d %B %Y',  # e.g. '21 February 2012'
+        '%d %b %Y',
+        '%Y-%m-%d',
+        '%Y.%m.%d',
+        '%d.%m.%Y',
+        '%d.%m.%y',
+        '%d/%m/%Y',
+        '%Y-%d-%m',
+        '%Y/%m/%d',
+        '%Y-%m-%d %H:%M:%S',
+    ]
+    invalid_dates = []
+    for d in data:
+        valid = False
+        date = d.get(date_field)
+        if date is None:
+            status_date[None] += 1
+            continue
+        for pattern in patterns:
+            if validate_date_pattern(date, pattern):
+                status_date['valid'] += 1
+                status_date[pattern] += 1
+                valid = True
+                break  # count each date once, at the first matching pattern
+        if not valid:
+            status_date['invalid'] += 1
+            invalid_dates.append(date)
+
+    return status_date, invalid_dates
+
+
+def author_field_repartition(data):
+    """Compute the repartition of the 'Maintainer' and 'Author' fields
+
+    """
+    summary = defaultdict(int)
+
+    for d in data:
+        maintainer = d.get('Maintainer')
+        author = d.get('Author')
+        if maintainer is not None and author is not None:
+            summary['maintainer_and_author'] += 1
+        elif maintainer:
+            summary['maintainer'] += 1
+        elif author:
+            summary['author'] += 1
+        else:
+            summary['no_author_no_maintainer'] += 1
+
+    return summary
+
+
+def date_field_repartition(data):
+    """Compute the repartition of the 'Date' and 'Published' fields
+
+    """
+    summary = defaultdict(int)
+
+    for d in data:
+        date = d.get('Date')
+        published = d.get('Published')
+        if published is not None and date is not None:
+            summary['date_and_published'] += 1
+        elif date:
+            summary['date'] += 1
+        elif published:
+            summary['published'] += 1
+        else:
+            summary['no_date_no_published'] += 1
+
+    return summary
+
+
+def load_data(filepath):
+    """Load the data set from filepath (gzipped json file)
+
+    """
+    logger.debug('filepath: %s', filepath)
+    with gzip.open(filepath) as f:
+        data = json.loads(f.read())
+
+    logger.debug('len(data): %s', len(data))
+    return data
+
+
+@click.command()
+@click.option('--dataset', help='JSON data set as gzip', required=True,
+              default='list-all-packages.R.json.gz')
+@click.option('--with-pattern-date-repartition', is_flag=True, default=False)
+@click.option('--with-author-repartition', is_flag=True, default=False)
+@click.option('--with-date-repartition', is_flag=True, default=False)
+def main(dataset, with_pattern_date_repartition,
+         with_author_repartition, with_date_repartition):
+    data = load_data(dataset)
+
+    if with_pattern_date_repartition:
+        for field_date in ['Date', 'Published']:
+            summary, invalid_dates = date_field_pattern_repartition(
+                data, field_date)
+            logger.info("Summary for '%s' field", field_date)
+            pprint(summary)
+
+            logger.info("Unknown date format for '%s' field", field_date)
+            pprint(invalid_dates)
+
+    if with_author_repartition:
+        summary = author_field_repartition(data)
+        pprint(summary)
+
+    if with_date_repartition:
+        summary = date_field_repartition(data)
+        pprint(summary)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format='%(asctime)s %(process)d %(message)s'
+    )
+
+    main()
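
Reviewer note, not part of the patch: a minimal sketch of how the repartition helpers behave on a toy data set. The sample records below are invented for illustration, and the import assumes analysis.py is on the Python path (e.g. when run from ardumont/cran/):

    from analysis import author_field_repartition, date_field_pattern_repartition

    # Invented sample records mimicking CRAN package metadata.
    sample = [
        {'Date': '2012-02-21', 'Author': 'Jane Doe'},          # matches '%Y-%m-%d'
        {'Date': '21 February 2012', 'Maintainer': 'J. Doe'},  # matches '%d %B %Y'
        {'Date': 'Feb. 2012'},                                 # no known pattern
        {'Published': '2019-01-01'},                           # no 'Date' field at all
    ]

    status, invalid = date_field_pattern_repartition(sample)
    print(dict(status))
    # {'valid': 2, '%Y-%m-%d': 1, '%d %B %Y': 1, 'invalid': 1, None: 1}
    print(invalid)
    # ['Feb. 2012']

    print(dict(author_field_repartition(sample)))
    # {'author': 1, 'maintainer': 1, 'no_author_no_maintainer': 2}

With the `break` at the first matching pattern, each date contributes exactly once to 'valid'; the original `continue` would have counted a date once per pattern it happens to match.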
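
The command line can also be smoke-tested from Python without a shell, via click's built-in test runner; the dataset file name below is just the option's default and must exist locally for the run to succeed:

    from click.testing import CliRunner

    from analysis import main

    runner = CliRunner()
    result = runner.invoke(main, [
        '--dataset', 'list-all-packages.R.json.gz',  # the option's default
        '--with-author-repartition',
        '--with-date-repartition',
    ])
    print(result.exit_code)  # 0 on success
    print(result.output)     # the captured pprint'ed repartition summaries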