Page MenuHomeSoftware Heritage

D2657.diff
No OneTemporary

D2657.diff

diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -39,13 +39,13 @@
# Full sample:
# https://forge.softwareheritage.org/diffusion/DCORE/browse/master/setup.py
setup(
- name='swh.<module-name>', # example: swh.loader.pypi
- description='Software Heritage <Module\'s intent>',
+ name='swh.scanner',
+ description='Software Heritage code scanner',
long_description=long_description,
long_description_content_type='text/markdown',
author='Software Heritage developers',
author_email='swh-devel@inria.fr',
- url='https://forge.softwareheritage.org/diffusion/<module-git-code>',
+ url='https://forge.softwareheritage.org/diffusion/DTSCN/',
packages=find_packages(), # packages's modules
install_requires=parse_requirements() + parse_requirements('swh'),
tests_require=parse_requirements('test'),
@@ -55,7 +55,7 @@
include_package_data=True,
entry_points='''
[swh.cli.subcommands]
- <cli-name>=swh.<module>.cli:cli
+ scanner=swh.scanner.cli:scanner
''',
classifiers=[
"Programming Language :: Python :: 3",
@@ -67,6 +67,6 @@
project_urls={
'Bug Reports': 'https://forge.softwareheritage.org/maniphest',
'Funding': 'https://www.softwareheritage.org/donate',
- 'Source': 'https://forge.softwareheritage.org/source/swh-<module>',
+ 'Source': 'https://forge.softwareheritage.org/source/swh-scanner',
},
)
diff --git a/swh/foo/bar.py b/swh/foo/bar.py
deleted file mode 100644
--- a/swh/foo/bar.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (C) 2019 The Software Heritage developers
-# See the AUTHORS file at the top-level directory of this distribution
-# License: GNU General Public License version 3, or any later version
-# See top-level LICENSE file for more information
diff --git a/swh/foo/cli.py b/swh/foo/cli.py
deleted file mode 100644
--- a/swh/foo/cli.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import click
-
-from swh.core.cli import CONTEXT_SETTINGS
-
-
-@click.group(name='foo', context_settings=CONTEXT_SETTINGS)
-@click.pass_context
-def cli(ctx):
- """Foo main command.
- """
-
-
-@cli.command()
-@click.option('--bar', help='Something')
-@click.pass_context
-def bar(ctx, bar):
- '''Do something.'''
- click.echo('bar')
diff --git a/swh/foo/py.typed b/swh/foo/py.typed
deleted file mode 100644
--- a/swh/foo/py.typed
+++ /dev/null
@@ -1 +0,0 @@
-# Marker file for PEP 561.
diff --git a/swh/foo/__init__.py b/swh/scanner/__init__.py
rename from swh/foo/__init__.py
rename to swh/scanner/__init__.py
diff --git a/swh/scanner/cli.py b/swh/scanner/cli.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/cli.py
@@ -0,0 +1,38 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import click
+
+from swh.core.cli import CONTEXT_SETTINGS
+from swh.scanner.scanner import run
+
+
+@click.group(name='scanner', context_settings=CONTEXT_SETTINGS)
+@click.pass_context
+def scanner(ctx):
+    """Software Heritage Scanner tools."""
+    # Nothing to do here: the group only namespaces its sub-commands.
+
+
+@scanner.command(name='scan')
+# Reject missing paths and plain files up front: the scanner walks
+# directories only.
+@click.argument('path', required=True,
+                type=click.Path(exists=True, file_okay=False))
+# NOTE(review): '-h' may collide with the help short flag if
+# CONTEXT_SETTINGS declares it in help_option_names -- confirm.
+@click.option('--host', '-h', default='localhost',
+              metavar='IP', show_default=True,
+              help="web api endpoint ip")
+@click.option('--port', '-p', default='5080',
+              metavar='PORT', show_default=True,
+              help="web api endpoint port")
+@click.pass_context
+def scan(ctx, path, host, port):
+    '''Scan the directory PATH and print the entries found in the archive.
+
+    The lookup is done through the web API reachable at the given host
+    and port.
+    '''
+    # `run` returns a set of (path, persistent identifier) pairs.
+    result = run(path, host, port)
+    click.echo(result)
+
+
+def main():
+    """Run the ``scanner`` command group.
+
+    The group is invoked with ``auto_envvar_prefix='SWH_SCANNER'``.
+    """
+    outcome = scanner(auto_envvar_prefix='SWH_SCANNER')
+    return outcome
+
+
+# Run the command group when this module is executed as a script.
+if __name__ == '__main__':
+ main()
diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py
new file mode 100644
--- /dev/null
+++ b/swh/scanner/scanner.py
@@ -0,0 +1,108 @@
+# Copyright (C) 2020 The Software Heritage developers
+# See the AUTHORS file at the top-level directory of this distribution
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import requests
+import os
+import json
+import itertools
+from pathlib import PosixPath
+
+from swh.model.cli import pid_of_file, pid_of_dir
+from swh.model.identifiers import (
+ parse_persistent_identifier,
+ DIRECTORY, CONTENT
+)
+
+
+def pids_discovery(pids, host, port):
+    """Ask the web API which of the given persistent identifiers are known.
+
+    Args:
+        pids list(str): A list of persistent identifiers
+        host(str): ip for the api request
+        port(str): port for the api request
+    Returns:
+        A dictionary with:
+        key(str): persistent identifier
+        value(dict):
+            value['known'] = True if pid is found
+            value['known'] = False if pid is not found
+    Raises:
+        requests.HTTPError: if the endpoint answers with an error status
+    """
+    endpoint = 'http://%s:%s/api/1/known/' % (host, port)
+    resp = requests.post(endpoint, json=pids)
+    # Fail loudly on 4xx/5xx instead of trying to decode an error body as
+    # if it were the expected pid mapping.
+    resp.raise_for_status()
+    return resp.json()
+
+
+def get_sub_paths(path):
+    """Find the persistent identifiers of the directories and files
+    directly under a given path.
+
+    Only the first level below ``path`` is visited. Nothing is yielded
+    when ``path`` cannot be walked (missing, unreadable or not a
+    directory). Entries that are neither a directory nor a regular file
+    are skipped, since no identifier is computed for them.
+
+    Args:
+        path(PosixPath): the entry root
+
+    Yields:
+        tuple(path, pid): pairs of path and the relative persistent
+        identifier
+    """
+    def pid_of(entry):
+        if entry.is_dir():
+            return pid_of_dir(bytes(entry))
+        elif entry.is_file():
+            return pid_of_file(bytes(entry))
+        return None
+
+    # os.walk() yields nothing for a root it cannot list; a bare next()
+    # would then raise StopIteration, which a generator turns into a
+    # RuntimeError (PEP 479).
+    top_level = next(os.walk(path), None)
+    if top_level is None:
+        return
+
+    dirpath, dnames, fnames = top_level
+    base = PosixPath(dirpath)
+    for node in itertools.chain(dnames, fnames):
+        entry = base.joinpath(node)
+        pid = pid_of(entry)
+        if pid is None:
+            # e.g. a dangling symlink: is_dir() and is_file() are both False
+            continue
+        yield (entry, pid)
+
+
+def parse_path(path, host, port):
+    """Tell, for each entry directly under the given path, whether it is
+    present in the archive.
+
+    Args:
+        path(PosixPath): The source path
+        host(str): ip for the api request
+        port(str): port for the api request
+    Yields:
+        tuple(path, pid, known): the entry path, its persistent
+        identifier and a boolean: True if the identifier was found,
+        False otherwise.
+    """
+    pid_by_path = {sub_path: pid for sub_path, pid in get_sub_paths(path)}
+    lookup = pids_discovery(list(pid_by_path.values()), host, port)
+
+    for sub_path in pid_by_path:
+        pid = pid_by_path[sub_path]
+        yield (sub_path, pid, lookup[pid]['known'])
+
+
+def run(root, host, port):
+    """Scan the given root
+
+    Entries reported as known are collected. A directory that is not
+    known is descended into instead, so that its own entries get checked;
+    files that are not known are left out of the result.
+
+    Args:
+        root: the path to scan
+        host(str): ip for the api request
+        port(str): port for the api request
+    Returns:
+        A set containing pairs of the path discovered and the
+        relative persistent identifier
+    """
+    known = set()
+    # Explicit stack rather than recursion: the result is the same set,
+    # and deeply nested trees cannot hit the interpreter recursion limit.
+    pending = [root]
+    while pending:
+        current = pending.pop()
+        for path, pid, found in parse_path(current, host, port):
+            obj_type = parse_persistent_identifier(pid).object_type
+
+            if found:
+                if obj_type in (CONTENT, DIRECTORY):
+                    known.add((str(path), pid))
+            elif obj_type == DIRECTORY:
+                # unknown directory: look at what it contains
+                pending.append(path)
+
+    return known
diff --git a/swh/foo/tests/__init__.py b/swh/scanner/tests/__init__.py
rename from swh/foo/tests/__init__.py
rename to swh/scanner/tests/__init__.py

File Metadata

Mime Type
text/plain
Expires
Thu, Jan 30, 9:44 AM (3 w, 3 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3223231

Event Timeline