diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py --- a/swh/scanner/scanner.py +++ b/swh/scanner/scanner.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information import asyncio +import os from typing import Any, Dict, Iterable import aiohttp @@ -70,6 +71,24 @@ raise Exception(f"policy '{policy}' not found") +# here is a list of common path patterns we should disregard +COMMON_EXCLUDE_PATTERNS = [ + b'.bzr', + b'.coverage', + b'*.egg-info', + b'.eggs', + b'.git', + b'.hg', + b'.mypy_cache', + b'__pycache__', + b'.svn', + b'.tox', +] +COMMON_EXCLUDE_PATTERNS.extend([ + b'*/' + p for p in COMMON_EXCLUDE_PATTERNS +]) + + def scan( config: Dict[str, Any], root_path: str, @@ -82,6 +101,7 @@ """Scan a source code project to discover files and directories already present in the archive""" converted_patterns = [pattern.encode() for pattern in exclude_patterns] + converted_patterns.extend(COMMON_EXCLUDE_PATTERNS) source_tree = model_of_dir(root_path.encode(), converted_patterns) nodes_data = MerkleNodeInfo()