diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@
 setuptools
 xmltodict
 iso8601
+beautifulsoup4
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -12,4 +12,5 @@
         'swh.lister.npm.tasks',
         'swh.lister.pypi.tasks',
         'swh.lister.phabricator.tasks',
+        'swh.lister.maven_central.tasks',
     ]
diff --git a/swh/lister/maven_central/__init__.py b/swh/lister/maven_central/__init__.py
new file mode 100644
diff --git a/swh/lister/maven_central/lister.py b/swh/lister/maven_central/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/lister.py
@@ -0,0 +1,169 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from bs4 import BeautifulSoup
+
+from .models import Maven_CentralModel
+
+from swh.scheduler import utils
+from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.core.lister_transports import ListerOnePageApiTransport
+
+
+class Maven_CentralLister(ListerOnePageApiTransport, SimpleLister):
+    """List the packages published under the Maven Central index.
+
+    The directory listing rooted at ``PAGE`` is walked recursively and
+    every package found on the way is accumulated in ``_packages``.
+
+    """
+    MODEL = Maven_CentralModel
+    LISTER_NAME = 'maven_central'
+    PAGE = 'http://central.maven.org/maven2/'
+
+    def __init__(self, override_config=None):
+        ListerOnePageApiTransport.__init__(self)
+        SimpleLister.__init__(self, override_config=override_config)
+        # Per-instance state: a class-level list would be shared by (and
+        # keep growing across) every lister instance in the process.
+        self._packages = []
+        self._package_urls = set()
+
+    def task_dict(self, origin_type, origin_url, **kwargs):
+        """(Override) Return task format dict
+
+        This is overridden from the lister_base as more information is
+        needed for the ingestion task creation.
+
+        """
+        _type = 'origin-update-%s' % origin_type
+        _policy = 'recurring'
+        project_name = kwargs.get('name')
+        project_metadata_url = kwargs.get('html_url')
+        return utils.create_task_dict(
+            _type, _policy, project_name, origin_url,
+            project_metadata_url=project_metadata_url)
+
+    def list_packages(self, response):
+        """(Override) List the actual maven central origins from the response.
+
+        Args:
+            response: HTTP response holding the top-level index page
+
+        Returns:
+            list of dicts with the keys 'name', 'url' and 'metadata_url'
+
+        """
+        # Start from a clean slate so that running the same lister twice
+        # does not report the packages of the previous run again.
+        self._packages = []
+        self._package_urls = set()
+
+        # find_packages_recursively() overwrites self.PAGE for each request,
+        # so remember the root now instead of re-reading the clobbered value.
+        root = self.PAGE
+        # The first link is dropped by file_structure(); the last six links
+        # of the index are skipped as well.
+        # NOTE(review): presumably those six are not groups - confirm.
+        groups = file_structure(response)[:-6]
+        try:
+            for group in groups:
+                self.find_packages_recursively(group, root)
+        finally:
+            self.PAGE = root
+
+        return self._packages
+
+    def find_packages_recursively(self, package_name, url):
+        """Visit ``url + package_name`` and every directory below it,
+        recording the packages found in ``_packages``.
+
+        Args:
+            package_name (str): directory entry to visit, e.g. ``'junit/'``
+            url (str): URL of the directory holding ``package_name``
+
+        """
+        package_url = url + package_name
+        # NOTE(review): presumably the transport requests self.PAGE, which is
+        # why it is repointed before each request - confirm.
+        self.PAGE = package_url
+        response = self.safely_issue_request(1)
+        entries = file_structure(response)
+
+        if 'maven-metadata.xml' in entries:
+            self._add_package(
+                package_name.rstrip('/'), package_url,
+                package_url + 'maven-metadata.xml')
+            return None
+
+        # Some packages don't have a maven-metadata.xml file. To deal with
+        # those, a directory holding at least one plain file is taken as a
+        # sign that the package files were reached: the parent directory is
+        # then the package.
+        if any(not entry.endswith('/') for entry in entries):
+            self._add_package(special_name(url), url, None)
+            return None
+
+        for directory in entries:
+            self.find_packages_recursively(directory, package_url)
+
+        return None
+
+    def _add_package(self, name, url, metadata_url):
+        """Record a package once, however many times its url is reached."""
+        if url in self._package_urls:
+            return
+        self._package_urls.add(url)
+        self._packages.append({
+            'name': name,
+            'url': url,
+            'metadata_url': metadata_url,
+        })
+
+    def get_model_from_repo(self, repo):
+        """(Override) Transform from repository representation to model
+
+        """
+        return {
+            'uid': repo['name'],
+            'name': repo['name'],
+            'full_name': repo['name'],
+            'html_url': repo['metadata_url'],
+            'origin_url': repo['url'],
+            'origin_type': 'maven_central',
+            'description': None,
+        }
+
+    def transport_response_simplified(self, response):
+        """(Override) Transform response to list for model manipulation
+
+        """
+        return [self.get_model_from_repo(repo) for repo in response]
+
+
+def file_structure(response):
+    """List the files and folders linked from an HTML index page.
+
+    Args:
+        response: HTTP response whose ``text`` is the HTML page
+
+    Returns:
+        list of the link texts, the first link of the page left out
+
+    """
+    # NOTE(review): the 'lxml' parser needs the lxml package installed;
+    # confirm it is declared in requirements.txt.
+    soup = BeautifulSoup(response.text, features="lxml")
+    return [link.text for link in soup.find_all('a')][1:]
+
+
+def special_name(url):
+    """Construct the name of a package from its url, for those packages
+    which do not have a maven-metadata.xml file.
+
+    The name is the path component between the last two slashes, e.g.
+    ``'http://host/maven2/junit/'`` gives ``'junit'``.
+
+    """
+    return url.rsplit('/', 2)[-2]
diff --git a/swh/lister/maven_central/models.py b/swh/lister/maven_central/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/models.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from ..core.models import ModelBase
+
+
+class Maven_CentralModel(ModelBase):
+    """a Maven Central repository representation
+
+    """
+    __tablename__ = 'maven_central_repo'
+
+    uid = Column(String, primary_key=True)
diff --git a/swh/lister/maven_central/tasks.py b/swh/lister/maven_central/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tasks.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import Maven_CentralLister
+
+
+@app.task(name=__name__ + '.Maven_CentralListerTask')
+def maven_central_lister(**lister_args):
+    Maven_CentralLister(**lister_args).run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+    return 'OK'
diff --git a/swh/lister/maven_central/tests/__init__.py b/swh/lister/maven_central/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/maven_central/tests/conftest.py b/swh/lister/maven_central/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import *  # noqa
diff --git a/swh/lister/maven_central/tests/test_tasks.py b/swh/lister/maven_central/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tests/test_tasks.py
@@ -0,0 +1,27 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+    res = swh_app.send_task(
+        'swh.lister.maven_central.tasks.ping')
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == 'OK'
+
+
+@patch('swh.lister.maven_central.tasks.Maven_CentralLister')
+def test_lister(lister, swh_app, celery_session_worker):
+    # setup the mocked Maven_CentralLister
+    lister.return_value = lister
+    lister.run.return_value = None
+
+    res = swh_app.send_task(
+        'swh.lister.maven_central.tasks.Maven_CentralListerTask')
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.assert_called_once_with()
+    lister.db_last_index.assert_not_called()
+    lister.run.assert_called_once_with()