diff --git a/swh/indexer/fossology_license.py b/swh/indexer/fossology_license.py
--- a/swh/indexer/fossology_license.py
+++ b/swh/indexer/fossology_license.py
@@ -18,6 +18,7 @@
 
     Returns:
         A dict with the following keys:
+
         - licenses ([str]): associated detected licenses to path
        - path (bytes): content filepath
 
@@ -80,6 +81,7 @@
 
        Returns:
            A dict with the following keys:
+
            - licenses ([str]): associated detected licenses to path
            - path (bytes): content filepath
 
@@ -95,10 +97,11 @@
 
        Returns:
            A dict, representing a content_license, with keys:
-            - id (bytes): content's identifier (sha1)
-            - license (bytes): license in bytes
-            - path (bytes): path
-            - indexer_configuration_id (int): tool used to compute the output
+
+              - id (bytes): content's identifier (sha1)
+              - license (bytes): license in bytes
+              - path (bytes): path
+              - indexer_configuration_id (int): tool used to compute the output
 
        """
        content_path = self.write_to_temp(
@@ -122,9 +125,11 @@
        Args:
            results ([dict]): list of content_license, dict with the
              following keys:
+
              - id (bytes): content's identifier (sha1)
              - license (bytes): license in bytes
              - path (bytes): path
+
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
              respectively update duplicates or ignore them
 
@@ -136,6 +141,7 @@
 class ContentFossologyLicenseIndexer(
        MixinFossologyLicenseIndexer, DiskIndexer, ContentIndexer):
    """Indexer in charge of:
+
    - filtering out content already indexed
    - reading content from objstorage per the content's id (sha1)
    - computing {license, encoding} from that content
@@ -159,9 +165,10 @@
    """FossologyLicense Range Indexer working on range of content identifiers.
 
    It:
+
    - filters out the non textual content
    - (optionally) filters out content already indexed (cf
-      :func:`indexed_contents_in_range`)
+      :meth:`.indexed_contents_in_range`)
    - reads content from objstorage per the content's id (sha1)
    - computes {mimetype, encoding} from that content
    - stores result in storage
diff --git a/swh/indexer/metadata_detector.py b/swh/indexer/metadata_detector.py
--- a/swh/indexer/metadata_detector.py
+++ b/swh/indexer/metadata_detector.py
@@ -11,12 +11,12 @@
 def detect_metadata(files):
    """
    Detects files potentially containing metadata
+
    Args:
-        - file_entries (list): list of files
+        file_entries (list): list of files
 
    Returns:
-        - empty list if nothing was found
-        - dictionary {mapping_filenames[name]:f['sha1']}
+        dictionary {mapping_filenames[name]:f['sha1']} (may be empty)
    """
    results = {}
    for (mapping_name, mapping) in MAPPINGS.items():
@@ -39,14 +39,16 @@
 def extract_minimal_metadata_dict(metadata_list):
    """
    Every item in the metadata_list is a dict of translated_metadata in the
-    CodeMeta vocabulary
-    we wish to extract a minimal set of terms and keep all values corresponding
-    to this term without duplication
+    CodeMeta vocabulary.
+
+    We wish to extract a minimal set of terms and keep all values corresponding
+    to this term without duplication.
+
    Args:
-        - metadata_list (list): list of dicts of translated_metadata
+        metadata_list (list): list of dicts of translated_metadata
 
    Returns:
-        - minimal_dict (dict): one dict with selected values of metadata
+        minimal_dict (dict): dict with selected values of metadata
    """
    minimal_dict = {}
    for document in metadata_list:
diff --git a/swh/indexer/metadata_dictionary.py b/swh/indexer/metadata_dictionary.py
--- a/swh/indexer/metadata_dictionary.py
+++ b/swh/indexer/metadata_dictionary.py
@@ -39,12 +39,12 @@
    def detect_metadata_files(self, files):
        """
        Detects files potentially containing metadata
+
        Args:
-            - file_entries (list): list of files
+            file_entries (list): list of files
 
        Returns:
-            - empty list if nothing was found
-            - list of sha1 otherwise
+            list: list of sha1 (possibly empty)
        """
        pass
 
@@ -88,11 +88,11 @@
        and translating with the appropriate mapping
 
        Args:
-            content_dict (dict)
+            content_dict (dict): content dict to translate
 
        Returns:
            dict: translated metadata in json-friendly form needed for
-            the indexer
+              the indexer
 
        """
        translated_metadata = {'@type': SCHEMA_URI + 'SoftwareSourceCode'}
@@ -128,11 +128,11 @@
        json data and translating with the appropriate mapping
 
        Args:
-            raw_content: bytes
+            raw_content (bytes): raw content to translate
 
        Returns:
            dict: translated metadata in json-friendly form needed for
-            the indexer
+              the indexer
 
        """
        try:
diff --git a/swh/indexer/mimetype.py b/swh/indexer/mimetype.py
--- a/swh/indexer/mimetype.py
+++ b/swh/indexer/mimetype.py
@@ -17,7 +17,7 @@
        raw_content (bytes): content's raw data
 
    Returns:
-        A dict with mimetype and encoding key and corresponding values
+        dict: mimetype and encoding key and corresponding values
        (as bytes).
 
    """
@@ -60,11 +60,11 @@
            data (bytes): raw content in bytes
 
        Returns:
-            A dict, representing a content_mimetype, with keys:
+            dict: content's mimetype; dict keys being
 
-            - id (bytes): content's identifier (sha1)
-            - mimetype (bytes): mimetype in bytes
-            - encoding (bytes): encoding in bytes
+            - **id** (bytes): content's identifier (sha1)
+            - **mimetype** (bytes): mimetype in bytes
+            - **encoding** (bytes): encoding in bytes
 
        """
        try:
@@ -84,15 +84,11 @@
        """Persist the results in storage.
 
        Args:
-            results ([dict]): list of content_mimetype, dict with the
-              following keys:
-
-              - id (bytes): content's identifier (sha1)
-              - mimetype (bytes): mimetype in bytes
-              - encoding (bytes): encoding in bytes
+            results ([dict]): list of content's mimetype dicts
+              (see :meth:`.index`)
 
            policy_update ([str]): either 'update-dups' or 'ignore-dups' to
-            respectively update duplicates or ignore them
+              respectively update duplicates or ignore them
 
        """
        self.idx_storage.content_mimetype_add(
@@ -103,14 +99,17 @@
    """Mimetype Indexer working on list of content identifiers.
 
    It:
-    - (optionally) filters out content already indexed (cf. :callable:`filter`)
+
+    - (optionally) filters out content already indexed (cf.
+      :meth:`.filter`)
    - reads content from objstorage per the content's id (sha1)
    - computes {mimetype, encoding} from that content
    - stores result in storage
 
    FIXME:
-    - 1. Rename redundant ContentMimetypeIndexer to MimetypeIndexer
-    - 2. Do we keep it afterwards? ~> i think this can be used with the journal
+
+    1. Rename redundant ContentMimetypeIndexer to MimetypeIndexer
+    2. Do we keep it afterwards? ~> i think this can be used with the journal
    """
 
    def filter(self, ids):
@@ -129,7 +128,9 @@
    """Mimetype Range Indexer working on range of content identifiers.
 
    It:
-    - (optionally) filters out content already indexed (cf :callable:`range`)
+
+    - (optionally) filters out content already indexed (cf
+      :meth:`.indexed_contents_in_range`)
    - reads content from objstorage per the content's id (sha1)
    - computes {mimetype, encoding} from that content
    - stores result in storage
@@ -138,15 +139,16 @@
    def indexed_contents_in_range(self, start, end):
        """Retrieve indexed content id within range [start, end].
 
-        Args
-            **start** (bytes): Starting bound from range identifier
-            **end** (bytes): End range identifier
+        Args:
+            start (bytes): Starting bound from range identifier
+            end (bytes): End range identifier
 
        Returns:
            a dict with keys:
-            - **ids** [bytes]: iterable of content ids within the range.
-            - **next** (Optional[bytes]): The next range of sha1 starts at
-              this sha1 if any
+
+            - **ids** [bytes]: iterable of content ids within the range.
+            - **next** (Optional[bytes]): The next range of sha1 starts at
+              this sha1 if any
 
        """
        return self.idx_storage.content_mimetype_get_range(
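The recurring change in these hunks is making the docstrings valid Sphinx/Napoleon input: a blank line is inserted before each bullet or numbered list (reST only renders a list that is separated from the preceding paragraph), stray "-" markers are dropped from Args:/Returns: entries, and references to methods of the same class are written as :meth:`.name` instead of the earlier :func:/:callable: forms. As a rough illustration of the layout the patch converges on (not part of the patch; the class and method names below are hypothetical):

    class ExampleIndexer:
        """Example indexer following the docstring conventions applied above.

        It:

        - (optionally) filters out content already indexed (cf.
          :meth:`.filter`)
        - reads content from objstorage per the content's id (sha1)
        """

        def index(self, id, data):
            """Compute a result for one content.

            Args:
                id (bytes): content's identifier (sha1)
                data (bytes): raw content in bytes

            Returns:
                dict: computed result; dict keys being

                - **id** (bytes): content's identifier (sha1)
                - **mimetype** (bytes): mimetype in bytes
                - **encoding** (bytes): encoding in bytes

            """
            # Hypothetical body; the real indexers compute their result from
            # the raw content rather than returning constants.
            return {'id': id, 'mimetype': b'text/plain', 'encoding': b'us-ascii'}

        def filter(self, ids):
            """Filter out ids already present in the index (hypothetical)."""
            return ids

Rendered with sphinx.ext.napoleon, the blank line before each list is what makes the items show up as a bullet list instead of being folded into the preceding sentence, and the leading dot in :meth:`.filter` tells Sphinx to resolve the reference relative to the current class.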