# Copyright (C) 2015  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
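
"""Load the objects of a local git repository and send them (contents,
directories, revisions, releases, occurrences) to the Software Heritage
storage.
"""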

import glob
import logging
import os
import subprocess
import time

import pygit2

from collections import defaultdict
from datetime import datetime
from pygit2 import GIT_REF_OID, Oid
from pygit2 import (GIT_OBJ_BLOB, GIT_OBJ_TREE, GIT_OBJ_COMMIT, GIT_OBJ_TAG,
                    GIT_SORT_TOPOLOGICAL)
from enum import Enum

from swh.core import hashutil
from swh.loader.git.data import swhrepo
# `storage.Type` constants are used below; this import path is an
# assumption about where the loader's own storage module lives
from swh.loader.git.storage import storage
from swh.storage.storage import Storage


class DirectoryTypeEntry(Enum):
    """Types of git objects.
    """
    file = 'file'
    directory = 'directory'


def date_format(d):
    """d is expected to be a datetime object.
    """
    return time.strftime("%a, %d %b %Y %H:%M:%S +0000", d.timetuple())


def now():
    """Cheat time values."""
    return date_format(datetime.utcnow())


def timestamp_to_string(timestamp):
    """Convert a timestamps to string.
    """
    return date_format(datetime.utcfromtimestamp(timestamp))


def list_objects_from_packfile_index(packfile_index):
    """List the objects indexed by this packfile, in packfile offset
    order.
    """
    with open(packfile_index, 'rb') as input_file, \
         subprocess.Popen(
             ['/usr/bin/git', 'show-index'],
             stdin=input_file,
             stdout=subprocess.PIPE,
         ) as process:

        data = []

        for line in process.stdout.readlines():
            # git show-index returns the line as:
            # <packfile offset> <object_id> (<object CRC>)
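            # e.g. (with illustrative values):
            # b'12 3c84a4dd10e42e2b39b4b4efc8b71ad25bba0b95 (79e429d2)'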
            line_components = line.split()
            offset = int(line_components[0])
            object_id = line_components[1]

            data.append((offset, object_id))

        yield from (Oid(hex=object_id.decode('ascii'))
                    for _, object_id in sorted(data))


def simple_list_objects(repo):
    """List the objects in a given repository. Watch out for duplicates!"""
    objects_dir = os.path.join(repo.path, 'objects')
    # Loose objects: the 40-hex-digit object id is split into a 2-character
    # directory name and a 38-character file name
    objects_glob = os.path.join(objects_dir, '[0-9a-f]' * 2, '[0-9a-f]' * 38)

    packfile_dir = os.path.join(objects_dir, 'pack')

    if os.path.isdir(packfile_dir):
        for packfile_index in os.listdir(packfile_dir):
            if not packfile_index.endswith('.idx'):
                # Not an index file
                continue
            packfile_index_path = os.path.join(packfile_dir, packfile_index)
            yield from list_objects_from_packfile_index(packfile_index_path)

    for object_file in glob.glob(objects_glob):
        # Rebuild the object id as the last two components of the path
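        # e.g. '.../objects/3c/84a4dd10e42e2b39b4b4efc8b71ad25bba0b95'
        # (illustrative) yields the Oid '3c84a4dd...'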
        yield Oid(hex=''.join(object_file.split(os.path.sep)[-2:]))


def list_objects(repo):
    """List the objects in a given repository, removing duplicates"""
    seen = set()
    for oid in simple_list_objects(repo):
        if oid not in seen:
            yield oid
            seen.add(oid)


def get_objects_per_object_type(repo):
    """Get all the (pygit2-parsed) objects from repo per object type"""
    objects_per_object_type = defaultdict(list)

    for object_id in list_objects(repo):
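        # obj.type is one of pygit2's GIT_OBJ_* constants (blob, tree,
        # commit or tag), which we use as the grouping key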
        obj = repo[object_id]
        objects_per_object_type[obj.type].append(object_id)

    return objects_per_object_type


HASH_ALGORITHMS = ['sha1', 'sha256']


def send_in_packets(repo, source_list, formatter, sender, packet_size,
                    extra_data=None):
    """Send objects from `source_list` through `sender`, in packets of at
    most `packet_size` objects. Each object is first formatted with
    `formatter(repo, object_id, **extra_data)`.

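    A hypothetical call (argument values illustrative):

        send_in_packets(repo, blob_ids, blob_to_content, send_contents, 100)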
    """

    if extra_data is None:
        extra_data = {}

    objects = []
    for obj_id in source_list:
        objects.append(formatter(repo, obj_id, **extra_data))
        if len(objects) >= packet_size:
            sender(objects)
            objects = []

    # send the final, partially-filled packet, if any
    if objects:
        sender(objects)


def send_contents(content_list):
    """Actually send properly formatted contents to the database"""
    logging.info("Sending %d contents" % len(content_list))
    s = Storage('dbname=softwareheritage-dev', '/tmp/swh-loader-git/test')

    s.content_add(content_list)
    logging.info("Done sending %d contents" % len(content_list))


def send_directories(directory_list):
    """Actually send properly formatted directories to the database"""
    logging.info("Sending %d directories" % len(directory_list))
    s = Storage('dbname=softwareheritage-dev', '/tmp/swh-loader-git/test')

    s.directory_add(directory_list)
    logging.info("Done sending %d directories" % len(directory_list))


def send_revisions(revision_list):
    """Actually send properly formatted revisions to the database"""
    logging.info("Sending %d revisions" % len(revision_list))
    # TODO: send revisions
    logging.info("Done sending %d revisions" % len(revision_list))


def send_releases(release_list):
    """Actually send properly formatted releases to the database"""
    logging.info("Sending %d releases" % len(release_list))
    # TODO: send releases
    logging.info("Done sending %d releases" % len(release_list))


def blob_to_content(repo, id):
    """Format a blob as a content"""
    blob = repo[id]
    data = blob.data
    hashes = hashutil.hashdata(data, HASH_ALGORITHMS)
    return {
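        # 'sha1_git' is the blob's git object id; 'sha1' and 'sha256' are
        # hashes of the raw content only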
        'sha1_git': id.raw,
        'sha1': hashes['sha1'],
        'sha256': hashes['sha256'],
        'data': data,
        'length': blob.size,
    }


def tree_to_directory(repo, id):
    """Format a tree as a directory"""
    ret = {
        'id': id.raw,
    }
    entries = []
    ret['entries'] = entries

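    # Map pygit2 tree entry types to swh directory entry types; 'commit'
    # entries are submodule references.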
    entry_type_map = {
        'tree': 'dir',
        'blob': 'file',
        'commit': 'rev',
    }

    for entry in repo[id]:
        entries.append({
            'type': entry_type_map[entry.type],
            'perms': entry.filemode,
            'name': entry.name,
            'target': entry.id.raw,
            'atime': None,
            'mtime': None,
            'ctime': None,
        })

    return ret


def commit_to_revision(repo, id):
    """Format a commit as a revision"""
    # TODO: format commits
    return {
        'id': id,
    }


def annotated_tag_to_release(repo, id):
    """Format an annotated tag as a release"""
    # TODO: format annotated tags
    return {
        'id': id,
    }


def unannotated_tag_to_release(repo, id):
    """Format an unannotated tag as a release"""
    # TODO: format unannotated tags
    return {
        'id': id,
    }


def bulk_send_blobs(repo, blob_dict):
    """Format blobs as swh contents and send them to the database in bulks
    of maximum `threshold` objects

    """
    # TODO: move to config file
    content_packet_size = 100000

    send_in_packets(repo, blob_dict, blob_to_content, send_contents,
                    content_packet_size)


def bulk_send_trees(repo, tree_dict):
    """Format trees as swh directories and send them to the database

    """
    # TODO: move to config file
    directory_packet_size = 25000

    send_in_packets(repo, tree_dict, tree_to_directory, send_directories,
                    directory_packet_size)


def bulk_send_commits(repo, commit_dict):
    """Format commits as swh revisions and send them to the database

    """
    # TODO: move to config file
    revision_packet_size = 10000

    send_in_packets(repo, commit_dict, commit_to_revision, send_revisions,
                    revision_packet_size)


def bulk_send_annotated_tags(repo, tag_dict):
    """Format annotated tags (pygit2.Tag objects) as swh releases and send
    them to the database

    """
    # TODO: move to config file
    release_packet_size = 10000

    send_in_packets(repo, tag_dict, annotated_tag_to_release, send_releases,
                    release_packet_size)


def bulk_send_unannotated_tags(repo, tag_dict, commit_dict):
    """Format unannotated tags (strings) as swh releases and send
    them to the database

    """
    # TODO: move to config file
    release_packet_size = 10000

    extra_data = {
        'commits': commit_dict,
    }
    send_in_packets(repo, tag_dict, unannotated_tag_to_release,
                    send_releases, release_packet_size, extra_data)


def parse_via_object_list(repo_path):
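    """Load the objects from the git repository at `repo_path` and send
    them to the database; return the objects grouped by object type, along
    with a count of objects per type.
    """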
    logging.info("Started loading %s" % repo_path)
    repo = pygit2.Repository(repo_path)
    objects_per_object_type = get_objects_per_object_type(repo)

    logging.info("Done listing the objects in %s: will load %d contents, "
                 "%d directories, %d revisions, %d releases" % (
                     repo_path,
                     len(objects_per_object_type[GIT_OBJ_BLOB]),
                     len(objects_per_object_type[GIT_OBJ_TREE]),
                     len(objects_per_object_type[GIT_OBJ_COMMIT]),
                     len(objects_per_object_type[GIT_OBJ_TAG])))

    #  bulk_send_blobs(repo, objects_per_object_type[GIT_OBJ_BLOB])
    bulk_send_trees(repo, objects_per_object_type[GIT_OBJ_TREE])
    bulk_send_commits(repo, objects_per_object_type[GIT_OBJ_COMMIT])
    bulk_send_annotated_tags(repo, objects_per_object_type[GIT_OBJ_TAG])
    # TODO: send unannotated tags
    bulk_send_unannotated_tags(repo, [],
                               objects_per_object_type[GIT_OBJ_COMMIT])

    return objects_per_object_type, {
        obj_type: len(obj_ids)
        for obj_type, obj_ids in objects_per_object_type.items()
    }


def parse(repo_path):
    """Given a repository path, parse and return a memory model of such
    repository."""
    def read_signature(signature):
        return '%s <%s>' % (signature.name, signature.email)

    def treewalk(repo, tree):
        """Walk a tree with the same implementation as `os.path`.
        Returns: tree, trees, blobs
        """
        trees, blobs, dir_entries = [], [], []
        for tree_entry in tree:
            if swh_repo.already_visited(tree_entry.hex):
                logging.debug('tree_entry %s already visited, skipped' % tree_entry.hex)
                continue

            obj = repo.get(tree_entry.oid)
            if obj is None:
                # the object is missing from the object database: this
                # happens for submodule commits; skip it
                logging.warning('skip submodule-commit %s' % tree_entry.hex)
                continue

            if obj.type == GIT_OBJ_TREE:
                logging.debug('found tree %s' % tree_entry.hex)
                nature = DirectoryTypeEntry.directory.value
                trees.append(tree_entry)
            else:
                logging.debug('found content %s' % tree_entry.hex)
                data = obj.data
                nature = DirectoryTypeEntry.file.value
                hashes = hashutil.hashdata(data, HASH_ALGORITHMS)
                blobs.append({'id': obj.hex,
                              'type': storage.Type.content,
                              'content-sha1': hashes['sha1'],
                              'content-sha256': hashes['sha256'],
                              'content': data,
                              'size': obj.size})

            dir_entries.append({'name': tree_entry.name,
                                'type': storage.Type.directory_entry,
                                'target-sha1': obj.hex,
                                'nature': nature,
                                'perms': tree_entry.filemode,
                                'atime': None,
                                'mtime': None,
                                'ctime': None,
                                'parent': tree.hex})

        yield tree, dir_entries, trees, blobs
        for tree_entry in trees:
            yield from treewalk(repo, repo[tree_entry.oid])

    def walk_tree(repo, swh_repo, rev):
        """Walk the rev revision's directories.
        """
        if swh_repo.already_visited(rev.hex):
            logging.debug('commit %s already visited, skipped' % rev.hex)
            return swh_repo

        for dir_root, dir_entries, _, contents_ref in treewalk(repo, rev.tree):
            for content_ref in contents_ref:
                swh_repo.add_content(content_ref)

            swh_repo.add_directory({'id': dir_root.hex,
                                    'type': storage.Type.directory,
                                    'entries': dir_entries})

        revision_parent_sha1s = list(map(str, rev.parent_ids))

        author = {'name': rev.author.name,
                  'email': rev.author.email,
                  'type': storage.Type.person}
        committer = {'name': rev.committer.name,
                     'email': rev.committer.email,
                     'type': storage.Type.person}

        swh_repo.add_revision({'id': rev.hex,
                               'type': storage.Type.revision,
                               'date': timestamp_to_string(rev.commit_time),
                               'directory': rev.tree.hex,
                               'message': rev.message,
                               'committer': committer,
                               'author': author,
                               'parent-sha1s': revision_parent_sha1s
        })

        swh_repo.add_person(read_signature(rev.author), author)
        swh_repo.add_person(read_signature(rev.committer), committer)

        return swh_repo

    def walk_revision_from(repo, swh_repo, head_rev):
        """Walk the rev history log from head_rev.
        - repo is the current repository
        - rev is the latest rev to start from.
        """
        for rev in repo.walk(head_rev.id, GIT_SORT_TOPOLOGICAL):
            swh_repo = walk_tree(repo, swh_repo, rev)

        return swh_repo

    repo = pygit2.Repository(repo_path)
    # memory model
    swh_repo = swhrepo.SWHRepo()
    # add origin
    origin = {'type': 'git',
              'url': 'file://' + repo.path}
    swh_repo.add_origin(origin)
    # add references and crawl them
    for ref_name in repo.listall_references():
        logging.info('walk reference %s' % ref_name)
        ref = repo.lookup_reference(ref_name)

        if ref.type == GIT_REF_OID:
            head_rev = repo[ref.target]
        else:
            # symbolic reference: peel it down to the underlying commit
            head_rev = ref.peel(GIT_OBJ_COMMIT)

        if isinstance(head_rev, pygit2.Tag):
            head_start = head_rev.get_object()
            tagger_sig = head_rev.tagger
            author = {'name': tagger_sig.name,
                      'email': tagger_sig.email,
                      'type': storage.Type.person}
            release = {'id': head_rev.hex,
                       'type': storage.Type.release,
                       'revision': head_rev.target.hex,
                       'name': ref_name,
                       # use the tagger's timestamp as the tag's date (the
                       # tagger's timezone offset is ignored here)
                       'date': timestamp_to_string(tagger_sig.time),
                       'author': author,
                       'comment': head_rev.message}

            swh_repo.add_release(release)
            swh_repo.add_person(read_signature(tagger_sig), author)
        else:
            swh_repo.add_occurrence({'id': head_rev.hex,
                                     'revision': head_rev.hex,
                                     'reference': ref_name,
                                     'url-origin': origin['url'],
                                     'type': storage.Type.occurrence})
            head_start = head_rev

        # crawl commits and trees
        walk_revision_from(repo, swh_repo, head_start)

    return swh_repo
