diff --git a/requirements-swh.txt b/requirements-swh.txt --- a/requirements-swh.txt +++ b/requirements-swh.txt @@ -1,3 +1,3 @@ # Add here internal Software Heritage dependencies, one per line. swh.core -swh.model >= 0.0.64 +swh.model >= 0.3.8 diff --git a/swh/scanner/dashboard/dashboard.py b/swh/scanner/dashboard/dashboard.py --- a/swh/scanner/dashboard/dashboard.py +++ b/swh/scanner/dashboard/dashboard.py @@ -20,8 +20,7 @@ Generate the data_table from the path taken from the chart. For each file builds the html table rows showing the known status, a local link to - the file and the relative Software Heritage Persistent Identifier. - + the file and the relative SoftWare Heritage persistent IDentifier (SWHID). """ data = [] for file_info in source.getFilesFromDir(dir_path): diff --git a/swh/scanner/model.py b/swh/scanner/model.py --- a/swh/scanner/model.py +++ b/swh/scanner/model.py @@ -104,7 +104,7 @@ Returns: a dictionary containing a path as key and its known/unknown status and the - Software Heritage persistent identifier as values. + SWHID as values. """ return {str(self.path): {"swhid": self.swhid, "known": self.known,}} @@ -165,7 +165,7 @@ Yields: a dictionary containing a path with its known/unknown status and the - Software Heritage persistent identifier + SWHID """ for child_node in self.iterate(): yield child_node.attributes @@ -229,7 +229,7 @@ def count_contents(self) -> Tuple[int, int]: """Count how many contents are present inside a directory. - If a directory has a pid returns as it has all the contents. + If a directory has a SWHID returns as it has all the contents. Returns: A tuple with the total number of the contents and the number diff --git a/swh/scanner/scanner.py b/swh/scanner/scanner.py --- a/swh/scanner/scanner.py +++ b/swh/scanner/scanner.py @@ -15,55 +15,55 @@ from swh.model.from_disk import Directory, Content, accept_all_directories from swh.model.identifiers import ( - persistent_identifier, - parse_persistent_identifier, + swhid, + parse_swhid, DIRECTORY, CONTENT, ) -async def pids_discovery( - pids: List[str], session: aiohttp.ClientSession, api_url: str, +async def swhids_discovery( + swhids: List[str], session: aiohttp.ClientSession, api_url: str, ) -> Dict[str, Dict[str, bool]]: - """API Request to get information about the persistent identifiers given in - input. + """API Request to get information about the SoftWare Heritage persistent + IDentifiers (SWHIDs) given in input. Args: - pids: a list of persistent identifier + swhids: a list of SWHIDS api_url: url for the API request Returns: A dictionary with: - key: persistent identifier searched + key: SWHID searched value: - value['known'] = True if the pid is found - value['known'] = False if the pid is not found + value['known'] = True if the SWHID is found + value['known'] = False if the SWHID is not found """ endpoint = api_url + "known/" chunk_size = 1000 requests = [] - def get_chunk(pids): - for i in range(0, len(pids), chunk_size): - yield pids[i : i + chunk_size] + def get_chunk(swhids): + for i in range(0, len(swhids), chunk_size): + yield swhids[i : i + chunk_size] - async def make_request(pids): - async with session.post(endpoint, json=pids) as resp: + async def make_request(swhids): + async with session.post(endpoint, json=swhids) as resp: if resp.status != 200: error_response(resp.reason, resp.status, endpoint) return await resp.json() - if len(pids) > chunk_size: - for pids_chunk in get_chunk(pids): - requests.append(asyncio.create_task(make_request(pids_chunk))) + if len(swhids) > chunk_size: + for swhids_chunk in get_chunk(swhids): + requests.append(asyncio.create_task(make_request(swhids_chunk))) res = await asyncio.gather(*requests) # concatenate list of dictionaries return dict(itertools.chain.from_iterable(e.items() for e in res)) else: - return await make_request(pids) + return await make_request(swhids) def directory_filter(path_name: Union[str, bytes], exclude_patterns: Set[Any]) -> bool: @@ -86,18 +86,18 @@ def get_subpaths( path: PosixPath, exclude_patterns: Set[Any] ) -> Iterator[Tuple[PosixPath, str]]: - """Find the persistent identifier of the directories and files under a - given path. + """Find the SoftWare Heritage persistent IDentifier (SWHID) of + the directories and files under a given path. Args: path: the root path Yields: - pairs of: path, the relative persistent identifier + pairs of: path, the relative SWHID """ - def pid_of(path): + def swhid_of(path): if path.is_dir(): if exclude_patterns: @@ -111,15 +111,15 @@ path=bytes(path), dir_filter=dir_filter ).get_data() - return persistent_identifier(DIRECTORY, obj) + return swhid(DIRECTORY, obj) else: obj = Content.from_file(path=bytes(path)).get_data() - return persistent_identifier(CONTENT, obj) + return swhid(CONTENT, obj) dirpath, dnames, fnames = next(os.walk(path)) for node in itertools.chain(dnames, fnames): sub_path = PosixPath(dirpath).joinpath(node) - yield (sub_path, pid_of(sub_path)) + yield (sub_path, swhid_of(sub_path)) async def parse_path( @@ -137,15 +137,17 @@ Returns: a map containing tuples with: a subpath of the given path, - the pid of the subpath and the result of the api call + the SWHID of the subpath and the result of the api call """ parsed_paths = dict(get_subpaths(path, exclude_patterns)) - parsed_pids = await pids_discovery(list(parsed_paths.values()), session, api_url) + parsed_swhids = await swhids_discovery( + list(parsed_paths.values()), session, api_url + ) def unpack(tup): - subpath, pid = tup - return (subpath, pid, parsed_pids[pid]["known"]) + subpath, swhid = tup + return (subpath, swhid, parsed_swhids[swhid]["known"]) return map(unpack, parsed_paths.items()) @@ -164,15 +166,15 @@ """ async def _scan(root, session, api_url, source_tree, exclude_patterns): - for path, pid, known in await parse_path( + for path, obj_swhid, known in await parse_path( root, session, api_url, exclude_patterns ): - obj_type = parse_persistent_identifier(pid).object_type + obj_type = parse_swhid(obj_swhid).object_type if obj_type == CONTENT: - source_tree.addNode(path, pid, known) + source_tree.addNode(path, obj_swhid, known) elif obj_type == DIRECTORY and directory_filter(path, exclude_patterns): - source_tree.addNode(path, pid, known) + source_tree.addNode(path, obj_swhid, known) if not known: await _scan(path, session, api_url, source_tree, exclude_patterns) diff --git a/swh/scanner/tests/conftest.py b/swh/scanner/tests/conftest.py --- a/swh/scanner/tests/conftest.py +++ b/swh/scanner/tests/conftest.py @@ -10,7 +10,7 @@ from pathlib import PosixPath from aioresponses import aioresponses # type: ignore -from swh.model.cli import pid_of_file, pid_of_dir +from swh.model.cli import swhid_of_file, swhid_of_dir from swh.scanner.model import Tree from .flask_api import create_app @@ -68,12 +68,12 @@ filesample2.touch() avail_path = { - subdir: pid_of_dir(bytes(subdir)), - subsubdir: pid_of_dir(bytes(subsubdir)), - subdir2: pid_of_dir(bytes(subdir2)), - subfile: pid_of_file(bytes(subfile)), - filesample: pid_of_file(bytes(filesample)), - filesample2: pid_of_file(bytes(filesample2)), + subdir: swhid_of_dir(bytes(subdir)), + subsubdir: swhid_of_dir(bytes(subsubdir)), + subdir2: swhid_of_dir(bytes(subdir2)), + subfile: swhid_of_file(bytes(subfile)), + filesample: swhid_of_file(bytes(filesample)), + filesample2: swhid_of_file(bytes(filesample2)), } return { @@ -111,11 +111,11 @@ subsubdir_path = temp_folder["subsubdir"] known_paths = [filesample_path, filesample2_path, subsubdir_path] - for path, pid in temp_folder["paths"].items(): + for path, swhid in temp_folder["paths"].items(): if path in known_paths: - example_tree.addNode(path, pid, True) + example_tree.addNode(path, swhid, True) else: - example_tree.addNode(path, pid, False) + example_tree.addNode(path, swhid, False) return example_tree.getDirectoriesInfo(root) diff --git a/swh/scanner/tests/data.py b/swh/scanner/tests/data.py --- a/swh/scanner/tests/data.py +++ b/swh/scanner/tests/data.py @@ -9,7 +9,7 @@ "swh:1:dir:4b825dc642cb6eb9a060e54bf8d69288fbee4904": {"known": True}, } -# present pids inside /data/sample-folder +# present SWHIDs inside /data/sample-folder present_swhids = [ "swh:1:cnt:7c4c57ba9ff496ad179b8f65b1d286edbda34c9a", # quotes.md "swh:1:cnt:68769579c3eaadbe555379b9c3538e6628bae1eb", # some-binary diff --git a/swh/scanner/tests/flask_api.py b/swh/scanner/tests/flask_api.py --- a/swh/scanner/tests/flask_api.py +++ b/swh/scanner/tests/flask_api.py @@ -19,7 +19,7 @@ if len(swhids) > 900: raise LargePayloadExc( - "The maximum number of PIDs this endpoint " "can receive is 900" + "The maximum number of SWHIDs this endpoint can receive is 900" ) res = {swhid: {"known": False} for swhid in swhids} diff --git a/swh/scanner/tests/test_dashboard.py b/swh/scanner/tests/test_dashboard.py --- a/swh/scanner/tests/test_dashboard.py +++ b/swh/scanner/tests/test_dashboard.py @@ -11,8 +11,8 @@ def test_generate_table_body(example_tree, temp_folder): subdir_path = temp_folder["subdir"] - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True) generated_body = generate_table_body(subdir_path, example_tree) diff --git a/swh/scanner/tests/test_model.py b/swh/scanner/tests/test_model.py --- a/swh/scanner/tests/test_model.py +++ b/swh/scanner/tests/test_model.py @@ -11,8 +11,8 @@ def test_tree_add_node(example_tree, temp_folder): avail_paths = temp_folder["paths"].keys() - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, False) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, False) for path, node in example_tree.children.items(): assert path in avail_paths @@ -22,8 +22,8 @@ def test_to_json_no_one_present(example_tree, temp_folder): - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, False) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, False) result = example_tree.toDict() @@ -34,8 +34,8 @@ def test_get_json_tree_all_present(example_tree, temp_folder): - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True) result = example_tree.toDict() @@ -49,8 +49,8 @@ root = temp_folder["root"] filesample_path = temp_folder["filesample"] - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True if path == filesample_path else False) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True if path == filesample_path else False) result = example_tree.toDict() @@ -70,11 +70,11 @@ subdir_path = temp_folder["subdir"].relative_to(root_path) subsubdir_path = temp_folder["subsubdir"].relative_to(root_path) - for path, pid in temp_folder["paths"].items(): + for path, swhid in temp_folder["paths"].items(): if path == filesample_path or path == filesample2_path: - example_tree.addNode(path, pid, True) + example_tree.addNode(path, swhid, True) else: - example_tree.addNode(path, pid, False) + example_tree.addNode(path, swhid, False) directories = example_tree.getDirectoriesInfo(example_tree.path) @@ -85,24 +85,24 @@ def test_get_files_from_dir(example_tree, temp_folder): subdir_path = temp_folder["subdir"] - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True) files = example_tree.getFilesFromDir(subdir_path) assert len(files) == 2 def test_get_files_source_path(example_tree, temp_folder): - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True) files = example_tree.getFilesFromDir(example_tree.path) assert len(files) == 1 def test_get_files_from_dir_raise_exception(example_tree, temp_folder): - for path, pid in temp_folder["paths"].items(): - example_tree.addNode(path, pid, True) + for path, swhid in temp_folder["paths"].items(): + example_tree.addNode(path, swhid, True) with pytest.raises(InvalidDirectoryPath): example_tree.getFilesFromDir("test/") diff --git a/swh/scanner/tests/test_scanner.py b/swh/scanner/tests/test_scanner.py --- a/swh/scanner/tests/test_scanner.py +++ b/swh/scanner/tests/test_scanner.py @@ -9,7 +9,7 @@ from .data import correct_api_response, present_swhids, to_exclude_swhid -from swh.scanner.scanner import pids_discovery, get_subpaths, run +from swh.scanner.scanner import swhids_discovery, get_subpaths, run from swh.scanner.model import Tree from swh.scanner.cli import extract_regex_objs from swh.scanner.exceptions import APIError @@ -26,7 +26,7 @@ ) actual_result = event_loop.run_until_complete( - pids_discovery([], aiosession, "http://example.org/api/") + swhids_discovery([], aiosession, "http://example.org/api/") ) assert correct_api_response == actual_result @@ -37,7 +37,7 @@ with pytest.raises(APIError): event_loop.run_until_complete( - pids_discovery([], aiosession, "http://example.org/api/") + swhids_discovery([], aiosession, "http://example.org/api/") ) @@ -49,18 +49,18 @@ ] # /known/ is limited at 900 with pytest.raises(APIError): - event_loop.run_until_complete(pids_discovery(request, aiosession, api_url)) + event_loop.run_until_complete(swhids_discovery(request, aiosession, api_url)) def test_scanner_get_subpaths(temp_folder): root = temp_folder["root"] actual_result = [] - for subpath, pid in get_subpaths(root, tuple()): + for subpath, swhid in get_subpaths(root, tuple()): # also check if it's a symlink since pytest tmp_dir fixture create # also a symlink to each directory inside the tmp_dir path if subpath.is_dir() and not subpath.is_symlink(): - actual_result.append((subpath, pid)) + actual_result.append((subpath, swhid)) assert len(actual_result) == 2