# swh/provenance/tests/data/generate_storage_from_git.py
# Copyright (C) 2021  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information

from datetime import datetime, timezone
import os
import re
from subprocess import check_output
from typing import Dict

import click

from swh.core.api.serializers import msgpack_dumps
from swh.loader.git.from_disk import GitLoaderFromDisk
from swh.model.hashutil import hash_to_bytes as h2b
from swh.provenance.tests.test_provenance_db import ts2dt
from swh.storage import get_storage
def load_git_repo(url, directory, storage): | def load_git_repo(url, directory, storage): | ||||
visit_date = datetime.now(tz=timezone.utc) | visit_date = datetime.now(tz=timezone.utc) | ||||
loader = GitLoaderFromDisk( | loader = GitLoaderFromDisk( | ||||
url=url, | url=url, | ||||
directory=directory, | directory=directory, | ||||
visit_date=visit_date, | visit_date=visit_date, | ||||
storage=storage, | storage=storage, | ||||
) | ) | ||||
return loader.load() | return loader.load() | ||||
def pop_key(d, k): | def pop_key(d, k): | ||||
d.pop(k) | d.pop(k) | ||||
return d | return d | ||||
def dump_file(hash, storage, cache): | |||||
if hash not in cache: | |||||
content = storage.content_find({"sha1_git": hash})[0] | |||||
cache[hash] = content | |||||
# we remove ctime to make the resulting data (eg. output msgpack file) | |||||
# independent from execution time | |||||
yield "content", pop_key(content.to_dict(), "ctime") | |||||
def dump_directory(hash, storage, cache): | |||||
if hash not in cache: | |||||
dircontent = list(storage.directory_ls(hash)) | |||||
cache[hash] = dircontent | |||||
yield "directory", {"id": hash, "entries": list(storage.directory_ls(hash))} | |||||
for direntry in dircontent: | |||||
if direntry["type"] == "dir": | |||||
yield from dump_directory(direntry["target"], storage, cache) | |||||
elif direntry["type"] == "file": | |||||
yield from dump_file(direntry["target"], storage, cache) | |||||
else: | |||||
raise ValueError("Unexpected directory entry type {direntry['type']}") | |||||
def dump_git_revision(hash, storage, cache): | |||||
if hash not in cache: | |||||
rev = storage.revision_get([hash])[0] | |||||
revd = { | |||||
"id": rev.id, | |||||
"date": ts2dt(rev.date.to_dict()), | |||||
"parents": rev.parents, | |||||
"directory": rev.directory, | |||||
} | |||||
revd = rev.to_dict() | |||||
cache[hash] = revd | |||||
for parent in rev.parents: | |||||
yield from dump_git_revision(parent, storage, cache) | |||||
yield from dump_directory(rev.directory, storage, cache) | |||||
yield "revision", cache[hash] | |||||
@click.command() | @click.command() | ||||
@click.option( | |||||
"-r", | |||||
"--head", | |||||
default="master", | |||||
help="head revision to start from", | |||||
) | |||||
@click.option("-o", "--output", default=None, help="output file") | @click.option("-o", "--output", default=None, help="output file") | ||||
@click.argument("git-repo") | @click.argument("git-repo", type=click.Path(exists=True, file_okay=False)) | ||||
def main(head, output, git_repo): | def main(output, git_repo): | ||||
"simple tool to generate the git_repo.msgpack dataset file used in some tests" | "simple tool to generate the git_repo.msgpack dataset file used in some tests" | ||||
sto = get_storage(cls="memory") | if output is None: | ||||
output = f"{git_repo}.msgpack" | |||||
with open(output, "wb") as outstream: | |||||
sto = get_storage( | |||||
cls="memory", journal_writer={"cls": "stream", "output_stream": outstream} | |||||
) | |||||
if git_repo.endswith("/"): | if git_repo.endswith("/"): | ||||
git_repo = git_repo[:-1] | git_repo = git_repo[:-1] | ||||
reponame = os.path.basename(git_repo) | reponame = os.path.basename(git_repo) | ||||
load_git_repo(f"https://{reponame}", git_repo, sto) | load_git_repo(f"https://{reponame}", git_repo, sto) | ||||
click.echo(f"Serialized the storage made from {reponame} in {output}") | |||||
if output is None: | |||||
output = f"{git_repo}.msgpack" | |||||
if not re.match("[0-9a-fA-F]{40}", head): | |||||
headhash = ( | |||||
check_output(["git", "-C", git_repo, "rev-parse", head]).decode().strip() | |||||
) | |||||
click.echo(f"Revision hash for {head} is {headhash}") | |||||
else: | |||||
headhash = head | |||||
cache: Dict[bytes, dict] = {} | |||||
outf = open(output, "wb") | |||||
outd = [] | |||||
for e in dump_git_revision(h2b(headhash), storage=sto, cache=cache): | |||||
outd.append(e) | |||||
outf.write(msgpack_dumps(outd)) | |||||
click.echo(f"Wrote {len(outd)} objects in {output}") | |||||
if __name__ == "__main__": | if __name__ == "__main__": | ||||
main() | main() |