Changeset View
Standalone View
swh/lister/phabricator/lister.py
- This file was added.
# Copyright (C) 2019 the Software Heritage developers | |||||
ardumont: `-2019`
(same on other headers ;) | |||||
Done Inline ActionsAs @ardumont already pointed out, the headers should start with Copyright (C) 2019 (we are not yet in 2020) anlambert: As @ardumont already pointed out, the headers should start with `Copyright (C) 2019` (we are… | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
from swh.lister.core.indexing_lister import SWHIndexingHttpLister | |||||
from swh.lister.phabricator.models import PhabricatorModel | |||||
from collections import defaultdict | |||||
class PhabricatorLister(SWHIndexingHttpLister): | |||||
Done Inline ActionsAfter reading the diffusion.repository.search endpoint documentation, I would use the following query parameters here: &order=oldest&attachments[uris]=1&after=%s. The order=oldest will ensure that you get the list of repositories by order of creation (and thus repositories ids will be in ascending order which will facilitate incremental listing). The attachments[uris]=1 will add info regarding repository uris in the JSON reponse. Proceeding this way will ensure to get valid uris and there will be no need to reconstruct them as you did below (as there is no guarantee they will be valid ones). anlambert: After reading the [[ https://forge.softwareheritage.org/conduit/method/diffusion.repository. | |||||
PATH_TEMPLATE = '&order=oldest&attachments[uris]=1&after=%s' | |||||
Done Inline ActionsThis constructor must be moved after the class variable (those in upper cases) definition anlambert: This constructor must be moved after the class variable (those in upper cases) definition | |||||
MODEL = PhabricatorModel | |||||
Done Inline ActionsUse forge_url instead of FORGEURL for code style consistency anlambert: Use `forge_url` instead of `FORGEURL` for code style consistency | |||||
LISTER_NAME = 'phabricator' | |||||
def __init__(self, forge_url, api_token, override_config=None): | |||||
Done Inline Actionswe should check if forge_url contains a leading slash here and remove it if it is the case as requests to anlambert: we should check if `forge_url` contains a leading slash here and remove it if it is the case as… | |||||
if forge_url.endswith("/"): | |||||
Done Inline ActionsThis is more readable if forge_url.endswith('/'): anlambert: This is more readable
```lang=python
if forge_url.endswith('/'):
```
| |||||
forge_url = forge_url[:-1] | |||||
self.forge_url = forge_url | |||||
Done Inline ActionsHere you should process the data contained in the array repo['attachments']['uris']['uris'] and extract a repository uri that we will be able to load into the archive. You should filter the uri to extract according to the value of elt['fields']['uri']['raw'], elt being an element contained in the array mentionned above. For instance, if the value is equal to https://shortname or http://shortname you can extract the corresponding repository uri. We should have some kind of priority list in order to handle all possible uri schemes. I pasted below an example of data to process for a repository hosted on https://phabricator.kde.org $ curl https://phabricator.kde.org/api/diffusion.repository.search -d api.token=XXXX -d attachments[uris]=1 -d limit=1 | jq '.' { "result": { "data": [ { "id": 37, "type": "REPO", "phid": "PHID-REPO-rhzpjasqu2ynsqeip27z", "fields": { "name": "Krita", "vcs": "git", "callsign": null, "shortName": "krita", "status": "active", "isImporting": false, "spacePHID": "PHID-SPCE-6njcdzuokbeuy6ybq5sn", "dateCreated": 1444506956, "dateModified": 1482836504, "policy": { "view": "public", "edit": "PHID-PROJ-o4c424cwgljycukfodwg", "diffusion.push": "PHID-PROJ-c3schegkpd4o2sas53xu" } }, "attachments": { "uris": { "uris": [ { "id": "2109", "type": "RURI", "phid": "PHID-RURI-h6dogp2mzwffsqmi2fsz", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "ssh://git@git.kde.org/krita", "display": "ssh://git@git.kde.org/krita", "effective": "ssh://git@git.kde.org/krita", "normalized": "git.kde.org/krita" }, "io": { "raw": "none", "default": "none", "effective": "none" }, "display": { "raw": "never", "default": "never", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": null, "identifier": null }, "dateCreated": "1473675015", "dateModified": "1482835832" } }, { "id": "2108", "type": "RURI", "phid": "PHID-RURI-7lbwzakiwi5glfcmlzuc", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "https://anongit.kde.org/krita", "display": "https://anongit.kde.org/krita", "effective": "https://anongit.kde.org/krita", "normalized": "anongit.kde.org/krita" }, "io": { "raw": "none", "default": "none", "effective": "none" }, "display": { "raw": "always", "default": "never", "effective": "always" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": null, "identifier": null }, "dateCreated": "1473674991", "dateModified": "1482836504" } }, { "id": "348", "type": "RURI", "phid": "PHID-RURI-53tce2xqd65omsoh3jzz", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "git://anongit.kde.org/krita", "display": "git://anongit.kde.org/krita", "effective": "git://anongit.kde.org/krita", "normalized": "anongit.kde.org/krita" }, "io": { "raw": "observe", "default": "none", "effective": "observe" }, "display": { "raw": "always", "default": "never", "effective": "always" }, "credentialPHID": "PHID-CDTL-lbunomrcrjmhmpjhr6uc", "disabled": false, "builtin": { "protocol": null, "identifier": null }, "dateCreated": "1463470863", "dateModified": "1473674933" } }, { "id": "347", "type": "RURI", "phid": "PHID-RURI-ejo4gh5nh5vo3sx4tu4u", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "http://id", "display": "http://phabricator.kde.org/diffusion/37/krita.git", "effective": "http://phabricator.kde.org/diffusion/37/krita.git", "normalized": "phabricator.kde.org/diffusion/37" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "never", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "http", "identifier": "id" }, "dateCreated": "1463470863", "dateModified": "1482835833" } }, { "id": "346", "type": "RURI", "phid": "PHID-RURI-zdrqmmshjcuxktcaawr6", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "http://phabricator.kde.org/source/krita.git", "display": "http://phabricator.kde.org/source/krita.git", "effective": "http://phabricator.kde.org/source/krita.git", "normalized": "phabricator.kde.org/source/krita" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "always", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "http", "identifier": "shortname" }, "dateCreated": "1463470863", "dateModified": "1480800604" } }, { "id": "344", "type": "RURI", "phid": "PHID-RURI-5weymrwtiejwgs7sy74d", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "https://id", "display": "https://phabricator.kde.org/diffusion/37/krita.git", "effective": "https://phabricator.kde.org/diffusion/37/krita.git", "normalized": "phabricator.kde.org/diffusion/37" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "never", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "https", "identifier": "id" }, "dateCreated": "1463470863", "dateModified": "1482835834" } }, { "id": "343", "type": "RURI", "phid": "PHID-RURI-52a7t7f3dgamts45clos", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "https://phabricator.kde.org/source/krita.git", "display": "https://phabricator.kde.org/source/krita.git", "effective": "https://phabricator.kde.org/source/krita.git", "normalized": "phabricator.kde.org/source/krita" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "always", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "https", "identifier": "shortname" }, "dateCreated": "1463470863", "dateModified": "1480800621" } }, { "id": "341", "type": "RURI", "phid": "PHID-RURI-pzavdb5o73w35rpwbctu", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "ssh://id", "display": "ssh://code@code.kde.org/diffusion/37/krita.git", "effective": "ssh://code@code.kde.org/diffusion/37/krita.git", "normalized": "code.kde.org/diffusion/37" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "never", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "ssh", "identifier": "id" }, "dateCreated": "1463470863", "dateModified": "1482835834" } }, { "id": "340", "type": "RURI", "phid": "PHID-RURI-umlemztrfd75gpdhawfs", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "ssh://code@phabricator.kde.org/source/krita.git", "display": "ssh://code@code.kde.org/source/krita.git", "effective": "ssh://code@code.kde.org/source/krita.git", "normalized": "code.kde.org/source/krita" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "always", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "ssh", "identifier": "shortname" }, "dateCreated": "1463470863", "dateModified": "1480800626" } } ] } } } ], "maps": {}, "query": { "queryKey": null }, "cursor": { "limit": 1, "after": "37", "before": null, "order": null } }, "error_code": null, "error_info": null } anlambert: Here you should process the data contained in the array `repo['attachments']['uris']['uris']`… | |||||
api_endpoint = 'api/diffusion.repository.'\ | |||||
'search?api.token=%s' % api_token | |||||
anlambertUnsubmitted Done Inline ActionsJust a nitpick but for a long literal string, the following syntax is recommended in Python: api_endpoint = ('api/diffusion.repository.' 'search?api.token=%s') % api_token anlambert: Just a nitpick but for a long literal string, the following syntax is recommended in Python… | |||||
api_baseurl = '%s/%s' % (forge_url, api_endpoint) | |||||
Done Inline Actionsyou can remove that line anlambert: you can remove that line | |||||
super().__init__(api_baseurl=api_baseurl, | |||||
Done Inline ActionsHere base_url corresponds to the forge_url passed in the lister constructor. anlambert: Here `base_url` corresponds to the `forge_url` passed in the lister constructor.
So you should… | |||||
override_config=override_config) | |||||
def get_model_from_repo(self, repo): | |||||
url = get_repo_url(repo['attachments']['uris']['uris']) | |||||
if url is None: | |||||
Done Inline Actionsget_repo_url seems a better name here anlambert: `get_repo_url` seems a better name here | |||||
Done Inline ActionsI think this could be moved in a function rather than a method as we do not need any lister state here. Plus it will make the task of writing tests easier (they are missing by the way and must be added to validate the repository url extraction approach). anlambert: I think this could be moved in a function rather than a method as we do not need any lister… | |||||
return None | |||||
return { | |||||
'uid': self.forge_url + str(repo['id']), | |||||
Done Inline Actionslet's gain some line of code here by using a defaultdict # put this import at the of the file from collections import defaultdict processed_urls = defaultdict(dict) anlambert: let's gain some line of code here by using a `defaultdict`
```lang=python
# put this import at… | |||||
'indexable': repo['id'], | |||||
'name': repo['fields']['shortName'], | |||||
'full_name': repo['fields']['name'], | |||||
'html_url': url, | |||||
'origin_url': url, | |||||
'description': None, | |||||
'origin_type': repo['fields']['vcs'] | |||||
} | |||||
Done Inline Actionsreplace this by: processed_urls[protocol][identifier] = url anlambert: replace this by:
```lang=python
processed_urls[protocol][identifier] = url
``` | |||||
Done Inline ActionsI didn't do it this way because "attachments": { "uris": { "uris": [ { "id": "803", "type": "RURI", "phid": "PHID-RURI-gxga6bkytergmvce3hjb", "fields": { "repositoryPHID": "PHID-REPO-oyzmq4ek2ditbftyosxb", "uri": { "raw": "git@github.com:SoftwareHeritage/puppet-swh-site.git", "display": "git@github.com:SoftwareHeritage/puppet-swh-site.git", "effective": "git@github.com:SoftwareHeritage/puppet-swh-site.git", "normalized": "github.com/SoftwareHeritage/puppet-swh-site" }, "io": { "raw": "mirror", "default": "none", "effective": "mirror" }, "display": { "raw": "never", "default": "never", "effective": "never" }, "credentialPHID": "PHID-CDTL-hwgnkw6vk2nrh475bvkf", "disabled": false, "builtin": { "protocol": null, "identifier": null }, "dateCreated": "1486563621", "dateModified": "1486563621" } }, nahimilega: I didn't do it this way because
1)To reduce the memory consumption of the code as we only need… | |||||
Done Inline ActionsThe memory usage is negligible. anyway, you can do it like that: if protocol in ('http', 'https'): processed_urls[protocol][identifier] = url vlorentz: The memory usage is negligible.
anyway, you can do it like that:
```
if protocol in ('http'… | |||||
anlambertUnsubmitted Done Inline ActionsCould you add the following here: def request_headers(self): """(Override) Set requests headers to send when querying the Phabricator API """ return {'User-Agent': 'Software Heritage phabricator lister', 'Accept': 'application/json'} This enables to specify that Software Heritage is the API consumer. anlambert: Could you add the following here:
```lang=python
def request_headers(self):
"""… | |||||
def get_next_target_from_response(self, response): | |||||
body = response.json()['result']['cursor'] | |||||
if body['after'] != 'null': | |||||
Done Inline ActionsHere, I would proceed as follows:
{ '<protocol>': { '<identifier>': '<effective_uri>', ... }, ... }
{ "id": "492", "type": "RURI", "phid": "PHID-RURI-5tp5xevbqwx2phtbh4sr", "fields": { "repositoryPHID": "PHID-REPO-72x6isvol3qu4eoxhteu", "uri": { "raw": "https://callsign", "display": "https://forge.softwareheritage.org/diffusion/DSCH/swh-scheduler.git", "effective": "https://forge.softwareheritage.org/diffusion/DSCH/swh-scheduler.git", "normalized": "forge.softwareheritage.org/diffusion/DSCH" }, "io": { "raw": "read", "default": "readwrite", "effective": "read" }, "display": { "raw": "default", "default": "never", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "https", "identifier": "callsign" }, "dateCreated": "1465923367", "dateModified": "1465923367" }
anlambert: Here, I would proceed as follows:
1. Process the whole uris list in a first pass and store them… | |||||
return body['after'] | |||||
else: | |||||
return None | |||||
def transport_response_simplified(self, response): | |||||
repos = response.json() | |||||
if repos['result'] is None: | |||||
raise ValueError( | |||||
'Problem during information fetch: %s' % repos['error_code']) | |||||
repos = repos['result']['data'] | |||||
return [self.get_model_from_repo(repo) for repo in repos] | |||||
def filter_before_inject(self, models_list): | |||||
"""Overrides SWHIndexingLister.filter_before_inject | |||||
Done Inline ActionsYou should change that doc to: Return the index of the first repository hosted on the Phabricator instance anlambert: You should change that doc to:
```
Return the index of the first repository hosted on the… | |||||
Bounds query results by this Lister's set max_index. | |||||
Done Inline ActionsHere you should use imbricated loops to avoid code duplication: for protocol in ['https', 'http']: for identifier in ['shortname', 'callsign', 'id']: if protocol in processed_urls and identifier in processed_urls[protocol]: return processed_urls[protocol][identifier] return None They might exist cases where no repository url can be extracted from the input data. anlambert: Here you should use imbricated loops to avoid code duplication:
```lang=python
for protocol in… | |||||
""" | |||||
models_list = [m for m in models_list if m is not None] | |||||
return super().filter_before_inject(models_list) | |||||
def _bootstrap_repositories_listing(self): | |||||
""" | |||||
Method called when no min_bound value has been provided | |||||
to the lister. Its purpose is to: | |||||
Done Inline ActionsReturn url for a hosted repository from its uris attachments according to the following priority lists: * protocol: https > http * identifier: shortname > callsign > id anlambert: ```
Return url for a hosted repository from its uris attachments according to the following… | |||||
1. get the first repository data hosted on the Phabricator | |||||
instance | |||||
2. inject them into the lister database | |||||
Done Inline ActionsNo need to indent the doc here anlambert: No need to indent the doc here | |||||
3. return the first repository index to start the listing | |||||
after that value | |||||
Returns: | |||||
int: The first repository index | |||||
""" | |||||
params = '&order=oldest&limit=1' | |||||
response = self.safely_issue_request(params) | |||||
models_list = self.transport_response_simplified(response) | |||||
self.max_index = models_list[0]['indexable'] | |||||
models_list = self.filter_before_inject(models_list) | |||||
injected = self.inject_repo_data_into_db(models_list) | |||||
Done Inline ActionsI would rather write here: for protocol in ('https', 'http'): if url.startswith(protocol): processed_urls[protocol]['undefined'] = url break anlambert: I would rather write here:
```lang=python
for protocol in ('https', 'http'):
if url. | |||||
self.create_missing_origins_and_tasks(models_list, injected) | |||||
return self.max_index | |||||
Done Inline ActionsReplace None by undefined here anlambert: Replace `None` by `undefined` here | |||||
def run(self, min_bound=None, max_bound=None): | |||||
""" | |||||
(Override) Run the lister on the specified Phabricator instance | |||||
Args: | |||||
min_bound (int): Optional repository index to start the listing | |||||
after it | |||||
max_bound (int): Optional repository index to stop the listing | |||||
after it | |||||
""" | |||||
# initial call to the lister, we need to bootstrap it in that case | |||||
if min_bound is None: | |||||
min_bound = self._bootstrap_repositories_listing() | |||||
super().run(min_bound, max_bound) | |||||
def get_repo_url(attachments): | |||||
""" | |||||
Return url for a hosted repository from its uris attachments according | |||||
to the following priority lists: | |||||
* protocol: https > http | |||||
* identifier: shortname > callsign > id | |||||
anlambertUnsubmitted Done Inline ActionsYou should unindent this docstring for more consistency with the rest of the code anlambert: You should unindent this docstring for more consistency with the rest of the code | |||||
""" | |||||
processed_urls = defaultdict(dict) | |||||
for uri in attachments: | |||||
protocol = uri['fields']['builtin']['protocol'] | |||||
url = uri['fields']['uri']['effective'] | |||||
identifier = uri['fields']['builtin']['identifier'] | |||||
if protocol in ('http', 'https'): | |||||
processed_urls[protocol][identifier] = url | |||||
elif protocol is None: | |||||
for protocol in ('https', 'http'): | |||||
if url.startswith(protocol): | |||||
processed_urls[protocol]['undefined'] = url | |||||
break | |||||
for protocol in ['https', 'http']: | |||||
for identifier in ['shortname', 'callsign', 'id', 'undefined']: | |||||
if (protocol in processed_urls and | |||||
identifier in processed_urls[protocol]): | |||||
return processed_urls[protocol][identifier] | |||||
return None |
-2019
(same on other headers ;)