Page MenuHomeSoftware Heritage

D1497.id4906.diff
No OneTemporary

D1497.id4906.diff

diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@
setuptools
xmltodict
iso8601
+beautifulsoup4
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -12,4 +12,5 @@
'swh.lister.npm.tasks',
'swh.lister.pypi.tasks',
'swh.lister.phabricator.tasks',
+ 'swh.lister.maven_central.tasks',
]
diff --git a/swh/lister/maven_central/__init__.py b/swh/lister/maven_central/__init__.py
new file mode 100644
diff --git a/swh/lister/maven_central/lister.py b/swh/lister/maven_central/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/lister.py
@@ -0,0 +1,135 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from bs4 import BeautifulSoup
+
+from .models import Maven_CentralModel
+
+from swh.scheduler import utils
+from swh.lister.core.simple_lister import SimpleLister
+from swh.lister.core.lister_transports import ListerOnePageApiTransport
+
+
+class Maven_CentralLister(ListerOnePageApiTransport, SimpleLister):
+    """Lister that crawls the Maven Central HTML directory index.
+
+    Starting from ``PAGE``, every group directory is visited recursively
+    and one package dict (``name``, ``url``, ``metadata_url``) is
+    recorded per package found.
+    """
+    MODEL = Maven_CentralModel
+    LISTER_NAME = 'maven_central'
+    PAGE = 'http://central.maven.org/maven2/'
+
+    def __init__(self, override_config=None):
+        ListerOnePageApiTransport.__init__(self)
+        SimpleLister.__init__(self, override_config=override_config)
+        # Per-instance state: a class-level list would be shared by (and
+        # keep growing across) every lister created in the same process.
+        self._packages = []
+        self._seen_urls = set()
+
+    def task_dict(self, origin_type, origin_url, **kwargs):
+        """(Override) Return task format dict
+
+        This is overridden from the lister_base as more information is
+        needed for the ingestion task creation.
+
+        Args:
+            origin_type (str): suffix of the 'origin-update-%s' task type
+            origin_url (str): url of the origin
+            **kwargs: 'name' and 'html_url' are read; the latter is
+                forwarded as ``project_metadata_url``
+
+        Returns:
+            whatever ``utils.create_task_dict`` builds for a 'recurring'
+            task with those arguments
+
+        """
+        _type = 'origin-update-%s' % origin_type
+        _policy = 'recurring'
+        project_name = kwargs.get('name')
+        project_metadata_url = kwargs.get('html_url')
+        return utils.create_task_dict(
+            _type, _policy, project_name, origin_url,
+            project_metadata_url=project_metadata_url)
+
+    def list_packages(self, response):
+        """(Override) List the actual maven central origins from the response.
+
+        Args:
+            response: HTTP response holding the root HTML index
+
+        Returns:
+            list of package dicts with keys 'name', 'url', 'metadata_url'
+
+        """
+        # find_packages_recursively() overwrites self.PAGE for every
+        # request, so remember the root url now: every group must be
+        # resolved against it, not against the last page crawled.
+        root_url = self.PAGE
+        # Start from a clean slate so repeated runs do not accumulate.
+        self._packages = []
+        self._seen_urls = set()
+
+        # file_structure() already drops the first link. The last six
+        # links are skipped as well -- presumably entries of the root
+        # index that are not groups; TODO confirm against the live page.
+        groups = file_structure(response)[:-6]
+        try:
+            for group in groups:
+                self.find_packages_recursively(group, root_url)
+        finally:
+            self.PAGE = root_url
+
+        return self._packages
+
+    def find_packages_recursively(self, package_name, url):
+        """
+        Visits all the directories recursively and populate _packages variable
+        with all the packages present in different groups.
+
+        Args:
+            package_name (str): directory entry to visit, expected to end
+                with '/'
+            url (str): url of the directory containing ``package_name``
+        """
+        package_url = url + package_name
+        # NOTE(review): the request target is apparently taken from
+        # self.PAGE by the transport, hence the assignment -- confirm.
+        self.PAGE = package_url
+        response = self.safely_issue_request(1)
+        entries = file_structure(response)
+
+        if 'maven-metadata.xml' in entries:
+            self._add_package(
+                name=package_name[:-1],
+                url=package_url,
+                metadata_url=package_url + 'maven-metadata.xml')
+            return None
+
+        # Some packages don't have a maven-metadata.xml file. To deal with
+        # those cases, check if this directory holds a plain file: if so
+        # the package files have been reached and the parent directory is
+        # recorded as the package. endswith() also copes with empty text.
+        if any(not entry.endswith('/') for entry in entries):
+            self._add_package(
+                name=special_name(url), url=url, metadata_url=None)
+            return None
+
+        for directory in entries:
+            self.find_packages_recursively(directory, package_url)
+
+        return None
+
+    def _add_package(self, name, url, metadata_url):
+        """Record a package once, however many times its url is reached."""
+        # Each sibling directory of a metadata-less package leads back to
+        # the same parent url; without this guard it is listed repeatedly.
+        if url in self._seen_urls:
+            return
+        self._seen_urls.add(url)
+        self._packages.append({
+            'name': name,
+            'url': url,
+            'metadata_url': metadata_url,
+        })
+
+    def get_model_from_repo(self, repo):
+        """(Override) Transform from repository representation to model
+
+        """
+        return {
+            'uid': repo['name'],
+            'name': repo['name'],
+            'full_name': repo['name'],
+            'html_url': repo['metadata_url'],
+            'origin_url': repo['url'],
+            'origin_type': 'maven_central',
+            'description': None,
+        }
+
+    def transport_response_simplified(self, response):
+        """(Override) Transform response to list for model manipulation
+
+        """
+        return [self.get_model_from_repo(repo) for repo in response]
+
+
+def file_structure(response):
+    """Return the link texts of an HTML listing, minus the first link.
+
+    Args:
+        response: object whose ``text`` attribute holds the HTML page
+
+    Returns:
+        list of str: text of every ``<a>`` element after the first one
+        (the first is presumably the parent-directory link), in document
+        order
+    """
+    anchors = BeautifulSoup(response.text, features="lxml").find_all('a')
+    return [anchor.text for anchor in anchors][1:]
+
+
+def special_name(url):
+    """Return the path segment that ends at the last '/' of ``url``.
+
+    Used to name packages which do not have a maven-metadata.xml file:
+    for a directory url such as ``.../maven2/abbot/`` this is ``'abbot'``.
+
+    Args:
+        url (str): url containing at least one '/'
+
+    Returns:
+        str: the text between the second-to-last and the last '/', or
+        everything before the slash when ``url`` contains a single '/'
+
+    Raises:
+        IndexError: if ``url`` contains no '/' at all
+    """
+    # Splitting from the right at most twice leaves the wanted segment
+    # as the element just before the last one.
+    return url.rsplit('/', 2)[-2]
diff --git a/swh/lister/maven_central/models.py b/swh/lister/maven_central/models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/models.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String
+
+from ..core.models import ModelBase
+
+
+class Maven_CentralModel(ModelBase):
+ """Database model for one package listed from Maven Central.
+
+ """
+ # Table that stores the listed packages.
+ __tablename__ = 'maven_central_repo'
+
+ # Primary key; the lister fills it with the package name.
+ uid = Column(String, primary_key=True)
diff --git a/swh/lister/maven_central/tasks.py b/swh/lister/maven_central/tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tasks.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2018 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import Maven_CentralLister
+
+
+@app.task(name=__name__ + '.Maven_CentralListerTask')
+def maven_central_lister(**lister_args):
+ """Celery task: build a Maven_CentralLister from lister_args and run it."""
+ Maven_CentralLister(**lister_args).run()
+
+
+@app.task(name=__name__ + '.ping')
+def ping():
+ """Celery task that always returns 'OK'; the task tests call it."""
+ return 'OK'
diff --git a/swh/lister/maven_central/tests/__init__.py b/swh/lister/maven_central/tests/__init__.py
new file mode 100644
diff --git a/swh/lister/maven_central/tests/conftest.py b/swh/lister/maven_central/tests/conftest.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import * # noqa
diff --git a/swh/lister/maven_central/tests/test_tasks.py b/swh/lister/maven_central/tests/test_tasks.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/maven_central/tests/test_tasks.py
@@ -0,0 +1,27 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+    """The maven_central ``ping`` task is registered and returns 'OK'."""
+    # Target this package's task; the pypi one says nothing about it.
+    res = swh_app.send_task(
+        'swh.lister.maven_central.tasks.ping')
+    assert res
+    res.wait()
+    assert res.successful()
+    assert res.result == 'OK'
+
+
+@patch('swh.lister.maven_central.tasks.Maven_CentralLister')
+def test_lister(lister, swh_app, celery_session_worker):
+    """The lister task instantiates Maven_CentralLister and runs it once."""
+    # setup the mocked Maven_CentralLister (patched where the task module
+    # looks the name up, so the task never crawls the network)
+    lister.return_value = lister
+    lister.run.return_value = None
+
+    res = swh_app.send_task(
+        'swh.lister.maven_central.tasks.Maven_CentralListerTask')
+    assert res
+    res.wait()
+    assert res.successful()
+
+    lister.assert_called_once_with()
+    lister.db_last_index.assert_not_called()
+    lister.run.assert_called_once_with()

File Metadata

Mime Type
text/plain
Expires
Dec 21 2024, 8:06 AM (11 w, 4 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3219605

Event Timeline