diff --git a/etc/crontab b/etc/crontab
--- a/etc/crontab
+++ b/etc/crontab
@@ -1,5 +1,5 @@
 SHELL=/bin/bash
-GHLISTER_ROOT=/home/zack/src/swh-lister-github
+GHLISTER_ROOT=/home/ssushant/project_paris/swh-environment/swh-lister
 
 # m h dom mon dow command
 0 8 * * * PYTHONPATH=$GHLISTER_ROOT $GHLISTER_ROOT/bin/ghlister catchup >> ~/.cache/swh/lister-github/$(date +\%Y\%m\%d).log 2>&1
diff --git a/swh/lister/core/debian_lister_base.py b/swh/lister/core/debian_lister_base.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/core/debian_lister_base.py
@@ -0,0 +1,229 @@
+# Copyright (C) 2015-2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import abc
+import gzip
+# import logging
+import os
+from datetime import datetime
+
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+from swh.core import config
+# from swh.scheduler.backend import SchedulerBackend
+# from swh.storage import get_storage
+
+from .abstractattribute import AbstractAttribute
+
+
+class FetchError(RuntimeError):
+    """Error carrying the server response that triggered it."""
+
+    def __init__(self, response):
+        self.response = response
+
+    def __str__(self):
+        return repr(self.response)
+
+
+class SWHDebListerBase(abc.ABC, config.SWHConfig):
+    """Lister core base class for debian.
+
+    The core method in this class is ingest_data. Any subclasses should be
+    calling this method one or more times to fetch and ingest data from API
+    endpoints. See swh.lister.core.lister_base.SWHIndexingLister for
+    example usage.
+
+    This class cannot be instantiated. Any instantiable Lister descending
+    from SWHDebListerBase must provide at least the required overrides.
+    (see member docstrings for details):
+
+    Required Overrides:
+        MODEL
+        def transport_response_simplified
+    """
+
+    MODEL = AbstractAttribute('Subclass type (not instance)'
+                              ' of swh.lister.core.models.ModelBase'
+                              ' customized for a specific service.')
+
+    @abc.abstractmethod
+    def transport_response_simplified(self, response):
+        """Convert the fetched response into a list of model dicts.
+
+        Args:
+            response: value returned by get_src_address
+        Returns:
+            list of dicts mapping MODEL keys to values; each dict must
+            contain a 'package_version' entry
+        """
+        pass
+
+    DEFAULT_CONFIG = {
+        'storage': ('dict', {
+            'cls': 'remote',
+            'args': {
+                'url': 'http://localhost:5002/'
+            },
+        }),
+        'scheduling_db': ('str', 'dbname=softwareheritage-scheduler'),
+    }
+
+    @property
+    def CONFIG_BASE_FILENAME(self):  # noqa: N802
+        return 'lister-%s' % self.lister_name
+
+    @property
+    def ADDITIONAL_CONFIG(self):  # noqa: N802
+        return {
+            'lister_db_url':
+                ('str', 'postgresql:///lister-%s' % self.lister_name),
+            'credentials':
+                ('list[dict]', []),
+            'cache_responses':
+                ('bool', False),
+            'cache_dir':
+                ('str', '~/.cache/swh/lister/%s' % self.lister_name),
+        }
+
+    MAX_RETRIES = 7
+    CONN_SLEEP = 10
+
+    def __init__(self, lister_name=None, override_config=None):
+        """Set up the lister name, configuration and database session.
+
+        Args:
+            lister_name: name of the lister (e.g. 'debian'); required
+            override_config: optional dict whose entries take precedence
+                over the built-in defaults below
+        Raises:
+            NameError: if lister_name is None
+        """
+        if lister_name is None:
+            raise NameError("Every lister must be assigned a lister_name.")
+        self.lister_name = lister_name  # 'debian'
+
+        # TODO: go back to parse_config_file(CONFIG_BASE_FILENAME,
+        # ADDITIONAL_CONFIG) and set up self.storage / self.scheduler once
+        # the imports commented out at the top of this module are enabled.
+        # Until then, in-code defaults keep self.config available (it is
+        # read by save_response) and let override_config take effect.
+        self.config = {
+            'lister_db_url': 'postgresql+psycopg2:///test_lister',
+            'cache_responses': False,
+            'cache_dir': os.path.expanduser(
+                '~/.cache/swh/lister/%s' % self.lister_name),
+        }
+        if override_config:
+            self.config.update(override_config)
+
+        self.db_engine = create_engine(self.config['lister_db_url'])
+        self.MODEL.metadata.create_all(self.db_engine)
+        self.mk_session = sessionmaker(bind=self.db_engine)
+        self.db_session = self.mk_session()
+
+    # to be implemented
+    def get_src_address(self, identifier):
+        """Return identifier unchanged (fetching is not implemented yet)."""
+        response = identifier
+        return response
+
+    def db_query_equal(self, key, value):
+        """Look in the db for a row with key == value
+
+        Args:
+            key: column key to look at
+            value: value to look for in that column
+        Returns:
+            sqlalchemy.ext.declarative.declarative_base object
+                with the given key == value
+        """
+        if isinstance(key, str):
+            key = self.MODEL.__dict__[key]
+        return self.db_session.query(self.MODEL) \
+                              .filter(key == value).first()
+
+    def db_inject_repo(self, model_dict):
+        """Add/update a new repo to the db.
+
+        Args:
+            model_dict: dictionary mapping model keys to values
+        Returns:
+            new or updated sqlalchemy.ext.declarative.declarative_base
+                object associated with the injection
+        """
+        sql_repo = self.db_query_equal('package_version',
+                                       model_dict['package_version'])
+        if not sql_repo:
+            sql_repo = self.MODEL(**model_dict)
+            self.db_session.add(sql_repo)
+        else:
+            for k in model_dict:
+                setattr(sql_repo, k, model_dict[k])
+            sql_repo.last_seen = datetime.now()
+
+        return sql_repo
+
+    def inject_repo_data_into_db(self, models_list):
+        """Inject data into the db.
+
+        Args:
+            models_list: list of dicts mapping keys from the db model
+                for each repo to be injected
+        Returns:
+            dict of package_version:sql_repo pairs
+        """
+        injected_repos = {}
+        for m in models_list:
+            injected_repos[m['package_version']] = self.db_inject_repo(m)
+        return injected_repos
+
+    def ingest_data(self, identifier):
+        """The core data fetch sequence. Request server endpoint. Simplify and
+        filter response list of repositories. Inject repo information into
+        local db. Queueing loader tasks is not implemented yet.
+
+        Args:
+            identifier: Resource identifier.
+        Returns:
+            (response, injected) tuple: the fetched response and the dict
+            returned by inject_repo_data_into_db
+        """
+        # Request (partial?) list of repositories info
+        response = self.get_src_address(identifier)
+        models_list = self.transport_response_simplified(response)
+        # inject into local db
+        injected = self.inject_repo_data_into_db(models_list)
+        # TODO: queue workers
+
+        return response, injected
+
+    def save_response(self, response):
+        """Log the response from a server request to a cache dir.
+
+        The file is written under self.config['cache_dir'], which is
+        created if it does not exist yet. Subclasses calling this method
+        must provide transport_response_to_string(response); it is not
+        defined by this base class.
+
+        Args:
+            response: full server response
+        Returns:
+            nothing
+        """
+        def escape_url_path(p):
+            return p.replace('/', '__')
+
+        # The cache dir is not prepared in __init__, so make sure it exists.
+        os.makedirs(self.config['cache_dir'], exist_ok=True)
+        fname = os.path.join(
+            self.config['cache_dir'],
+            escape_url_path(response.request.path_url) + '.gz'
+        )
+        with gzip.open(fname, 'w') as f:
+            f.write(bytes(
+                self.transport_response_to_string(response),
+                'UTF-8'
+            ))
diff --git a/swh/lister/debian/__init__.py b/swh/lister/debian/__init__.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/debian/__init__.py
@@ -0,0 +1 @@
+# placeholder
diff --git a/swh/lister/debian/debian_models.py b/swh/lister/debian/debian_models.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/debian/debian_models.py
@@ -0,0 +1,97 @@
+from sqlalchemy import ForeignKey, Column, Integer, String
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import relationship
+
+SQLBase = declarative_base()
+
+
+class DistributionModelBase(SQLBase):
+    """A distribution: name, type and mirror URL."""
+    __tablename__ = 'distribution_repo'
+    uid = Column(Integer, primary_key=True)
+    name = Column(String)
+    dist_type = Column(String)
+    mirror_url = Column(String)
+
+    def __init__(self, uid=None, name=None, dist_type=None, mirror_url=None):
+        self.uid = uid
+
+        if name is not None:
+            self.name = name
+        if dist_type is not None:
+            self.dist_type = dist_type
+        if mirror_url is not None:
+            self.mirror_url = mirror_url
+
+
+class SuiteModelBase(SQLBase):
+    """A suite belonging to a distribution."""
+    __tablename__ = 'suite_repo'
+    uid = Column(Integer, primary_key=True)
+    distribution_id = Column(Integer, ForeignKey('distribution_repo.uid'))
+    name = Column(String)
+
+    distribution = relationship(DistributionModelBase, backref="suite_repo")
+
+    def __init__(self, uid=None, name=None, distribution_id=None):
+        self.uid = uid
+
+        if name is not None:
+            self.name = name
+        # Store the foreign key when the caller supplies one.
+        if distribution_id is not None:
+            self.distribution_id = distribution_id
+
+
+class CompModelBase(SQLBase):
+    """A component belonging to a suite."""
+    __tablename__ = 'comp_repo'
+    uid = Column(Integer, primary_key=True)
+    suite_id = Column(Integer, ForeignKey('suite_repo.uid'))
+    name = Column(String)
+
+    distribution_component = relationship(SuiteModelBase, backref="comp_repo")
+
+    def __init__(self, uid=None, name=None, suite_id=None):
+        self.uid = uid
+
+        if name is not None:
+            self.name = name
+        # Store the foreign key when the caller supplies one.
+        if suite_id is not None:
+            self.suite_id = suite_id
+
+
+class PackageModelBase(SQLBase):
+    """A source package belonging to a component.
+
+    NOTE(review): swh.lister.debian.models_package declares another
+    PackageModelBase that also maps to 'package_repo' with different
+    columns; confirm that both are never created in the same database.
+    """
+    __tablename__ = 'package_repo'
+    component = Column(Integer, ForeignKey('comp_repo.uid'))
+    name = Column(String)
+    version = Column(String)
+    directory = Column(String)
+    files = Column(String)
+    package_version = Column(String, primary_key=True)
+
+    package_component = relationship(CompModelBase, backref="package_repo")
+
+    def __init__(self, name=None, version=None, directory=None,
+                 component=None, files=None, package_version=None):
+
+        if name is not None:
+            self.name = name
+        if version is not None:
+            self.version = version
+        if directory is not None:
+            self.directory = directory
+        # Store the foreign key when the caller supplies one.
+        if component is not None:
+            self.component = component
+        if files is not None:
+            self.files = files
+        if package_version is not None:
+            self.package_version = package_version
diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/debian/lister.py
@@ -0,0 +1,70 @@
+# Copyright (C) 2017 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import gzip
+from urllib.request import urlopen
+
+from debian import deb822
+
+from swh.lister.core.debian_lister_base import SWHDebListerBase
+from swh.lister.debian.models_package import PackageModelBase
+
+
+class DebianLister(SWHDebListerBase):
+    """Lister for the source packages of a Debian-style archive."""
+    MODEL = PackageModelBase
+
+    def get_model_from_repo(self, repo, html_url):
+        """Build the model dict for one Sources paragraph.
+
+        Args:
+            repo: paragraph yielded by deb822.Sources.iter_paragraphs
+            html_url: base URL of the archive the paragraph came from
+        Returns:
+            dict mapping PackageModelBase keys to values
+        """
+        return {
+            'name': str(repo['Package']),
+            'version': str(repo['Version']),
+            'directory': str(repo['Directory']),
+            'files': ','.join(f['name'] for f in repo['Files']),
+            # 'component' : component,
+            'html_url': html_url,
+            # 'suite' : suite,
+            'package_version': str(repo['Package'] + "_" + repo['Version'])
+        }
+
+    def transport_response_simplified(self, response):
+        """Fetch and parse the Sources index of every listed component.
+
+        Args:
+            response: sources.list style line, i.e.
+                '<type> <archive url> <suite> <component> [...]'
+        Returns:
+            list of model dicts, one per source package found
+        Raises:
+            ValueError: if response has no archive url and suite
+        """
+        target_list = response.split()
+        if len(target_list) < 3:
+            raise ValueError(
+                'expected "<type> <url> <suite> [component ...]", got %r'
+                % response)
+        html_url = target_list[1]
+        suite = target_list[2]
+        # Tolerate an archive url given without its trailing slash.
+        initial = html_url.rstrip('/') + '/dists/' + suite + '/'
+
+        final_list = []
+        for component in target_list[3:]:
+            target_url = initial + component + '/source/Sources.gz'
+            # Close the HTTP response as well as the gzip wrapper.
+            with urlopen(target_url) as resp, \
+                    gzip.open(resp, 'rb') as data:
+                data_read = data.read()
+            final_list.extend(
+                self.get_model_from_repo(repo, html_url)
+                for repo in deb822.Sources.iter_paragraphs(data_read)
+            )
+        return final_list
diff --git a/swh/lister/debian/models_package.py b/swh/lister/debian/models_package.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/debian/models_package.py
@@ -0,0 +1,35 @@
+from datetime import datetime
+from sqlalchemy import Column, DateTime, String
+from sqlalchemy.ext.declarative import declarative_base
+
+SQLBase = declarative_base()
+
+
+class PackageModelBase(SQLBase):
+    """A listed source package, keyed by package_version."""
+    __tablename__ = 'package_repo'
+    name = Column(String)
+    version = Column(String)
+    directory = Column(String)
+    files = Column(String)
+    html_url = Column(String)
+    package_version = Column(String, primary_key=True)
+    last_seen = Column(DateTime, nullable=False)
+
+    def __init__(self, name=None, version=None, directory=None,
+                 files=None, html_url=None, package_version=None):
+
+        self.last_seen = datetime.now()
+
+        if name is not None:
+            self.name = name
+        if version is not None:
+            self.version = version
+        if directory is not None:
+            self.directory = directory
+        if files is not None:
+            self.files = files
+        if html_url is not None:
+            self.html_url = html_url
+        if package_version is not None:
+            self.package_version = package_version
diff --git a/swh/lister/debian/tests/test_db_lister.py b/swh/lister/debian/tests/test_db_lister.py
new file mode 100644
--- /dev/null
+++ b/swh/lister/debian/tests/test_db_lister.py
@@ -0,0 +1,20 @@
+from swh.lister.debian.lister import DebianLister
+
+
+class test(DebianLister):
+    """Manual smoke test; needs network access and the lister database."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def run(self):
+        response = "deb-src http://deb.debian.org/debian/ testing contrib"
+        resp, injected = self.ingest_data(response)
+        self.db_session.commit()
+        # print(injected)
+
+
+if __name__ == '__main__':
+    # Run only when executed as a script, so importing or collecting this
+    # module does not hit the network and the database.
+    test("debian").run()