Changeset View
Standalone View
swh/lister/phabricator/lister.py
- This file was added.
# Copyright (C) 2019 the Software Heritage developers | |||||
ardumont: `-2019`
(same on other headers ;) | |||||
Done Inline ActionsAs @ardumont already pointed out, the headers should start with Copyright (C) 2019 (we are not yet in 2020) anlambert: As @ardumont already pointed out, the headers should start with `Copyright (C) 2019` (we are… | |||||
# License: GNU General Public License version 3, or any later version | |||||
# See top-level LICENSE file for more information | |||||
from swh.lister.core.indexing_lister import SWHIndexingHttpLister | |||||
from swh.lister.phabricator.models import PhabricatorModel | |||||
from collections import defaultdict | |||||
class PhabricatorLister(SWHIndexingHttpLister): | |||||
Done Inline ActionsAfter reading the diffusion.repository.search endpoint documentation, I would use the following query parameters here: &order=oldest&attachments[uris]=1&after=%s. The order=oldest will ensure that you get the list of repositories by order of creation (and thus repositories ids will be in ascending order which will facilitate incremental listing). The attachments[uris]=1 will add info regarding repository uris in the JSON reponse. Proceeding this way will ensure to get valid uris and there will be no need to reconstruct them as you did below (as there is no guarantee they will be valid ones). anlambert: After reading the [[ https://forge.softwareheritage.org/conduit/method/diffusion.repository. | |||||
def __init__(self, forge_url, api_token, override_config=None): | |||||
Done Inline ActionsThis constructor must be moved after the class variable (those in upper cases) definition anlambert: This constructor must be moved after the class variable (those in upper cases) definition | |||||
self.forge_url = forge_url | |||||
Done Inline ActionsUse forge_url instead of FORGEURL for code style consistency anlambert: Use `forge_url` instead of `FORGEURL` for code style consistency | |||||
api_endpoint = 'api/diffusion.repository.'\ | |||||
'search?api.token=%s' % api_token | |||||
api_baseurl = '%s/%s' % (forge_url, api_endpoint) | |||||
Done Inline Actionswe should check if forge_url contains a leading slash here and remove it if it is the case as requests to anlambert: we should check if `forge_url` contains a leading slash here and remove it if it is the case as… | |||||
super().__init__(api_baseurl=api_baseurl, | |||||
Done Inline ActionsThis is more readable if forge_url.endswith('/'): anlambert: This is more readable
```lang=python
if forge_url.endswith('/'):
```
| |||||
override_config=override_config) | |||||
PATH_TEMPLATE = '&order=oldest&attachments[uris]=1&after=%s' | |||||
Done Inline ActionsHere you should process the data contained in the array repo['attachments']['uris']['uris'] and extract a repository uri that we will be able to load into the archive. You should filter the uri to extract according to the value of elt['fields']['uri']['raw'], elt being an element contained in the array mentionned above. For instance, if the value is equal to https://shortname or http://shortname you can extract the corresponding repository uri. We should have some kind of priority list in order to handle all possible uri schemes. I pasted below an example of data to process for a repository hosted on https://phabricator.kde.org $ curl https://phabricator.kde.org/api/diffusion.repository.search -d api.token=XXXX -d attachments[uris]=1 -d limit=1 | jq '.' { "result": { "data": [ { "id": 37, "type": "REPO", "phid": "PHID-REPO-rhzpjasqu2ynsqeip27z", "fields": { "name": "Krita", "vcs": "git", "callsign": null, "shortName": "krita", "status": "active", "isImporting": false, "spacePHID": "PHID-SPCE-6njcdzuokbeuy6ybq5sn", "dateCreated": 1444506956, "dateModified": 1482836504, "policy": { "view": "public", "edit": "PHID-PROJ-o4c424cwgljycukfodwg", "diffusion.push": "PHID-PROJ-c3schegkpd4o2sas53xu" } }, "attachments": { "uris": { "uris": [ { "id": "2109", "type": "RURI", "phid": "PHID-RURI-h6dogp2mzwffsqmi2fsz", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "ssh://git@git.kde.org/krita", "display": "ssh://git@git.kde.org/krita", "effective": "ssh://git@git.kde.org/krita", "normalized": "git.kde.org/krita" }, "io": { "raw": "none", "default": "none", "effective": "none" }, "display": { "raw": "never", "default": "never", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": null, "identifier": null }, "dateCreated": "1473675015", "dateModified": "1482835832" } }, { "id": "2108", "type": "RURI", "phid": "PHID-RURI-7lbwzakiwi5glfcmlzuc", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "https://anongit.kde.org/krita", "display": "https://anongit.kde.org/krita", "effective": "https://anongit.kde.org/krita", "normalized": "anongit.kde.org/krita" }, "io": { "raw": "none", "default": "none", "effective": "none" }, "display": { "raw": "always", "default": "never", "effective": "always" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": null, "identifier": null }, "dateCreated": "1473674991", "dateModified": "1482836504" } }, { "id": "348", "type": "RURI", "phid": "PHID-RURI-53tce2xqd65omsoh3jzz", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "git://anongit.kde.org/krita", "display": "git://anongit.kde.org/krita", "effective": "git://anongit.kde.org/krita", "normalized": "anongit.kde.org/krita" }, "io": { "raw": "observe", "default": "none", "effective": "observe" }, "display": { "raw": "always", "default": "never", "effective": "always" }, "credentialPHID": "PHID-CDTL-lbunomrcrjmhmpjhr6uc", "disabled": false, "builtin": { "protocol": null, "identifier": null }, "dateCreated": "1463470863", "dateModified": "1473674933" } }, { "id": "347", "type": "RURI", "phid": "PHID-RURI-ejo4gh5nh5vo3sx4tu4u", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "http://id", "display": "http://phabricator.kde.org/diffusion/37/krita.git", "effective": "http://phabricator.kde.org/diffusion/37/krita.git", "normalized": "phabricator.kde.org/diffusion/37" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "never", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "http", "identifier": "id" }, "dateCreated": "1463470863", "dateModified": "1482835833" } }, { "id": "346", "type": "RURI", "phid": "PHID-RURI-zdrqmmshjcuxktcaawr6", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "http://phabricator.kde.org/source/krita.git", "display": "http://phabricator.kde.org/source/krita.git", "effective": "http://phabricator.kde.org/source/krita.git", "normalized": "phabricator.kde.org/source/krita" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "always", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "http", "identifier": "shortname" }, "dateCreated": "1463470863", "dateModified": "1480800604" } }, { "id": "344", "type": "RURI", "phid": "PHID-RURI-5weymrwtiejwgs7sy74d", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "https://id", "display": "https://phabricator.kde.org/diffusion/37/krita.git", "effective": "https://phabricator.kde.org/diffusion/37/krita.git", "normalized": "phabricator.kde.org/diffusion/37" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "never", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "https", "identifier": "id" }, "dateCreated": "1463470863", "dateModified": "1482835834" } }, { "id": "343", "type": "RURI", "phid": "PHID-RURI-52a7t7f3dgamts45clos", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "https://phabricator.kde.org/source/krita.git", "display": "https://phabricator.kde.org/source/krita.git", "effective": "https://phabricator.kde.org/source/krita.git", "normalized": "phabricator.kde.org/source/krita" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "always", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "https", "identifier": "shortname" }, "dateCreated": "1463470863", "dateModified": "1480800621" } }, { "id": "341", "type": "RURI", "phid": "PHID-RURI-pzavdb5o73w35rpwbctu", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "ssh://id", "display": "ssh://code@code.kde.org/diffusion/37/krita.git", "effective": "ssh://code@code.kde.org/diffusion/37/krita.git", "normalized": "code.kde.org/diffusion/37" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "never", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "ssh", "identifier": "id" }, "dateCreated": "1463470863", "dateModified": "1482835834" } }, { "id": "340", "type": "RURI", "phid": "PHID-RURI-umlemztrfd75gpdhawfs", "fields": { "repositoryPHID": "PHID-REPO-rhzpjasqu2ynsqeip27z", "uri": { "raw": "ssh://code@phabricator.kde.org/source/krita.git", "display": "ssh://code@code.kde.org/source/krita.git", "effective": "ssh://code@code.kde.org/source/krita.git", "normalized": "code.kde.org/source/krita" }, "io": { "raw": "default", "default": "read", "effective": "read" }, "display": { "raw": "never", "default": "always", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "ssh", "identifier": "shortname" }, "dateCreated": "1463470863", "dateModified": "1480800626" } } ] } } } ], "maps": {}, "query": { "queryKey": null }, "cursor": { "limit": 1, "after": "37", "before": null, "order": null } }, "error_code": null, "error_info": null } anlambert: Here you should process the data contained in the array `repo['attachments']['uris']['uris']`… | |||||
MODEL = PhabricatorModel | |||||
LISTER_NAME = 'phabricator' | |||||
Done Inline ActionsJust a nitpick but for a long literal string, the following syntax is recommended in Python: api_endpoint = ('api/diffusion.repository.' 'search?api.token=%s') % api_token anlambert: Just a nitpick but for a long literal string, the following syntax is recommended in Python… | |||||
Done Inline Actionsyou can remove that line anlambert: you can remove that line | |||||
def get_model_from_repo(self, repo): | |||||
Done Inline ActionsHere base_url corresponds to the forge_url passed in the lister constructor. anlambert: Here `base_url` corresponds to the `forge_url` passed in the lister constructor.
So you should… | |||||
url = get_repo_url(repo['attachments']['uris']['uris']) | |||||
if url is None: | |||||
return {} | |||||
return { | |||||
'uid': self.forge_url + str(repo['id']), | |||||
Done Inline Actionsget_repo_url seems a better name here anlambert: `get_repo_url` seems a better name here | |||||
Done Inline ActionsI think this could be moved in a function rather than a method as we do not need any lister state here. Plus it will make the task of writing tests easier (they are missing by the way and must be added to validate the repository url extraction approach). anlambert: I think this could be moved in a function rather than a method as we do not need any lister… | |||||
'indexable': repo['id'], | |||||
'name': repo['fields']['shortName'], | |||||
'full_name': repo['fields']['name'], | |||||
Done Inline Actionslet's gain some line of code here by using a defaultdict # put this import at the of the file from collections import defaultdict processed_urls = defaultdict(dict) anlambert: let's gain some line of code here by using a `defaultdict`
```lang=python
# put this import at… | |||||
'html_url': url, | |||||
'origin_url': url, | |||||
'description': None, | |||||
'origin_type': repo['fields']['vcs'] | |||||
} | |||||
def get_next_target_from_response(self, response): | |||||
body = response.json()['result']['cursor'] | |||||
Done Inline Actionsreplace this by: processed_urls[protocol][identifier] = url anlambert: replace this by:
```lang=python
processed_urls[protocol][identifier] = url
``` | |||||
Done Inline ActionsI didn't do it this way because "attachments": { "uris": { "uris": [ { "id": "803", "type": "RURI", "phid": "PHID-RURI-gxga6bkytergmvce3hjb", "fields": { "repositoryPHID": "PHID-REPO-oyzmq4ek2ditbftyosxb", "uri": { "raw": "git@github.com:SoftwareHeritage/puppet-swh-site.git", "display": "git@github.com:SoftwareHeritage/puppet-swh-site.git", "effective": "git@github.com:SoftwareHeritage/puppet-swh-site.git", "normalized": "github.com/SoftwareHeritage/puppet-swh-site" }, "io": { "raw": "mirror", "default": "none", "effective": "mirror" }, "display": { "raw": "never", "default": "never", "effective": "never" }, "credentialPHID": "PHID-CDTL-hwgnkw6vk2nrh475bvkf", "disabled": false, "builtin": { "protocol": null, "identifier": null }, "dateCreated": "1486563621", "dateModified": "1486563621" } }, nahimilega: I didn't do it this way because
1)To reduce the memory consumption of the code as we only need… | |||||
Done Inline ActionsThe memory usage is negligible. anyway, you can do it like that: if protocol in ('http', 'https'): processed_urls[protocol][identifier] = url vlorentz: The memory usage is negligible.
anyway, you can do it like that:
```
if protocol in ('http'… | |||||
if body['after'] != 'null': | |||||
Done Inline ActionsCould you add the following here: def request_headers(self): """(Override) Set requests headers to send when querying the Phabricator API """ return {'User-Agent': 'Software Heritage phabricator lister', 'Accept': 'application/json'} This enables to specify that Software Heritage is the API consumer. anlambert: Could you add the following here:
```lang=python
def request_headers(self):
"""… | |||||
return body['after'] | |||||
else: | |||||
return None | |||||
Done Inline ActionsHere, I would proceed as follows:
{ '<protocol>': { '<identifier>': '<effective_uri>', ... }, ... }
{ "id": "492", "type": "RURI", "phid": "PHID-RURI-5tp5xevbqwx2phtbh4sr", "fields": { "repositoryPHID": "PHID-REPO-72x6isvol3qu4eoxhteu", "uri": { "raw": "https://callsign", "display": "https://forge.softwareheritage.org/diffusion/DSCH/swh-scheduler.git", "effective": "https://forge.softwareheritage.org/diffusion/DSCH/swh-scheduler.git", "normalized": "forge.softwareheritage.org/diffusion/DSCH" }, "io": { "raw": "read", "default": "readwrite", "effective": "read" }, "display": { "raw": "default", "default": "never", "effective": "never" }, "credentialPHID": null, "disabled": false, "builtin": { "protocol": "https", "identifier": "callsign" }, "dateCreated": "1465923367", "dateModified": "1465923367" }
anlambert: Here, I would proceed as follows:
1. Process the whole uris list in a first pass and store them… | |||||
def transport_response_simplified(self, response): | |||||
repos = response.json() | |||||
if repos['result'] is None: | |||||
raise ValueError( | |||||
'Problem during information fetch: %s' % repos['error_code']) | |||||
repos = repos['result']['data'] | |||||
return [self.get_model_from_repo(repo) for repo in repos] | |||||
def first_repository_index(self): | |||||
""" | |||||
Return the index of the first repository hosted | |||||
on the Phabricator instance | |||||
""" | |||||
Done Inline ActionsYou should change that doc to: Return the index of the first repository hosted on the Phabricator instance anlambert: You should change that doc to:
```
Return the index of the first repository hosted on the… | |||||
params = '&order=oldest&limit=1' | |||||
Done Inline ActionsHere you should use imbricated loops to avoid code duplication: for protocol in ['https', 'http']: for identifier in ['shortname', 'callsign', 'id']: if protocol in processed_urls and identifier in processed_urls[protocol]: return processed_urls[protocol][identifier] return None They might exist cases where no repository url can be extracted from the input data. anlambert: Here you should use imbricated loops to avoid code duplication:
```lang=python
for protocol in… | |||||
response = self.safely_issue_request(params) | |||||
return response.json()['result']['data'][0]['id'] | |||||
def get_repo_url(attachments): | |||||
""" | |||||
Return url for a hosted repository from its uris attachments according | |||||
to the following priority lists: | |||||
* protocol: https > http | |||||
Done Inline ActionsReturn url for a hosted repository from its uris attachments according to the following priority lists: * protocol: https > http * identifier: shortname > callsign > id anlambert: ```
Return url for a hosted repository from its uris attachments according to the following… | |||||
* identifier: shortname > callsign > id | |||||
""" | |||||
processed_urls = defaultdict(dict) | |||||
for uri in attachments: | |||||
Done Inline ActionsNo need to indent the doc here anlambert: No need to indent the doc here | |||||
protocol = uri['fields']['builtin']['protocol'] | |||||
url = uri['fields']['uri']['effective'] | |||||
identifier = uri['fields']['builtin']['identifier'] | |||||
if protocol in ('http', 'https'): | |||||
processed_urls[protocol][identifier] = url | |||||
for protocol in ['https', 'http']: | |||||
for identifier in ['shortname', 'callsign', 'id']: | |||||
if (protocol in processed_urls and | |||||
identifier in processed_urls[protocol]): | |||||
return processed_urls[protocol][identifier] | |||||
return None | |||||
Done Inline ActionsI would rather write here: for protocol in ('https', 'http'): if url.startswith(protocol): processed_urls[protocol]['undefined'] = url break anlambert: I would rather write here:
```lang=python
for protocol in ('https', 'http'):
if url. | |||||
Done Inline ActionsReplace None by undefined here anlambert: Replace `None` by `undefined` here | |||||
Done Inline ActionsYou should unindent this docstring for more consistency with the rest of the code anlambert: You should unindent this docstring for more consistency with the rest of the code |
-2019
(same on other headers ;)