diff --git a/bin/orc-merge b/bin/orc-merge new file mode 100755 index 0000000..e29d6bd --- /dev/null +++ b/bin/orc-merge @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +"""Merge multiple ORC files into a single one.""" + +import argparse + +import pyorc + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "-o", "--output", type=argparse.FileType(mode="wb"), required=True + ) + parser.add_argument("files", type=argparse.FileType(mode="rb"), nargs="+") + args = parser.parse_args() + + schema = str(pyorc.Reader(args.files[0]).schema) + + with pyorc.Writer(args.output, schema) as writer: + for i, f in enumerate(args.files): + reader = pyorc.Reader(f) + if str(reader.schema) != schema: + raise RuntimeError( + "Inconsistent ORC schemas.\n" + "\tFirst file schema: {}\n" + "\tFile #{} schema: {}".format(schema, i, str(reader.schema)) + ) + for line in reader: + writer.write(line) + + +if __name__ == "__main__": + main() diff --git a/bin/orc-print-contents b/bin/orc-print-contents new file mode 100755 index 0000000..02c5e35 --- /dev/null +++ b/bin/orc-print-contents @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2021 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +"""Print the contents of an ORC file.""" + +import argparse + +import pyorc + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("files", type=argparse.FileType(mode="rb"), nargs="+") + args = parser.parse_args() + + for orc_file in args.files: + reader = pyorc.Reader(orc_file) + for row in reader: + print(row)