Page Menu
Home
Software Heritage
Search
Configure Global Search
Log In
Files
F9345545
D209.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Subscribers
None
D209.diff
View Options
diff --git a/seirl/debian-tagreadmes/import.py b/seirl/debian-tagreadmes/import.py
new file mode 100644
--- /dev/null
+++ b/seirl/debian-tagreadmes/import.py
@@ -0,0 +1,207 @@
+import aiohttp
+import asyncio
+import concurrent.futures
+import gzip
+import hashlib
+import io
+import logging
+import subprocess
+import sys
+import tempfile
+import tqdm
+
+from debian.deb822 import Sources, Packages, Deb822
+from pathlib import Path
+from peewee import Model, TextField, CharField, ForeignKeyField
+from playhouse.fields import ManyToManyField
+from playhouse.sqlite_ext import SqliteExtDatabase
+
+
+MIRROR = 'http://ftp.fr.debian.org/debian'
+RELEASE = 'jessie'
+BASE_URL = '{mirror}/dists/{release}'.format(mirror=MIRROR,
+ release=RELEASE)
+PKG_LIST = {}
+SRC_LIST = {}
+
+README_FILES = {
+ 'readme',
+ 'readme.markdown',
+ 'readme.mdown',
+ 'readme.mkdn',
+ 'readme.md',
+ 'readme.textile',
+ 'readme.rdoc',
+ 'readme.org',
+ 'readme.creole',
+ 'readme.mediawiki',
+ 'readme.wiki',
+ 'readme.rst',
+ 'readme.asciidoc',
+ 'readme.adoc',
+ 'readme.asc',
+ 'readme.pod',
+ 'README',
+ 'README.markdown',
+ 'README.mdown',
+ 'README.mkdn',
+ 'README.md',
+ 'README.textile',
+ 'README.rdoc',
+ 'README.org',
+ 'README.creole',
+ 'README.mediawiki',
+ 'README.wiki',
+ 'README.rst',
+ 'README.asciidoc',
+ 'README.adoc',
+ 'README.asc',
+ 'README.pod'
+}
+
+
+db = SqliteExtDatabase('debian_readme.db')
+
+class BaseModel(Model):
+ class Meta:
+ database = db
+
+
+class Tag(BaseModel):
+ tag = TextField(unique=True)
+
+
+class ReadmeFile(BaseModel):
+ sha1 = CharField(unique=True)
+
+
+class ReadmeTag(BaseModel):
+ readme = ForeignKeyField(ReadmeFile)
+ tag = ForeignKeyField(Tag)
+
+ class Meta:
+ indexes = ((('readme', 'tag'), True),)
+
+
+class Package(BaseModel):
+ name = TextField(unique=True)
+
+ # this should be a manytomany...
+ #readme = ForeignKeyField(ReadmeFile, related_name='packages')
+
+
+db.connect()
+db.create_tables([Tag, ReadmeFile, ReadmeTag, Package], safe=True)
+
+
+def sha1file(path):
+ BUF_SIZE = 2 ** 22
+ sha1 = hashlib.sha1()
+ with path.open('rb') as f:
+ while True:
+ data = f.read(BUF_SIZE)
+ if not data:
+ break
+ sha1.update(data)
+ return sha1.digest()
+
+
+async def communicate(cmdline, data=None, **kwargs):
+ logging.debug('Running %s', ' '.join(cmdline))
+ proc = await asyncio.create_subprocess_exec(
+ *cmdline, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE, **kwargs)
+ stdout, stderr = await proc.communicate(data)
+ retcode = await proc.wait()
+ return retcode, stdout, stderr
+
+
+async def getdeb822(session, url, cls=Deb822):
+ res = {}
+ async with session.get(url) as resp:
+ content = await resp.read()
+ with gzip.GzipFile(fileobj=io.BytesIO(content)) as gf:
+ content = gf.read()
+ for src in cls.iter_paragraphs(content.decode()):
+ res[src['Package']] = src
+ return res
+
+
+
+async def pkg_list(session, group):
+ sources = '{url}/{group}/source/Sources.gz'.format(url=BASE_URL,
+ group=group)
+ packages = '{url}/{group}/binary-amd64/Packages.gz'.format(url=BASE_URL,
+ group=group)
+ SRC_LIST.update(await getdeb822(session, sources, cls=Sources))
+ PKG_LIST.update(await getdeb822(session, packages, cls=Packages))
+
+
+def handle_file(pkgname, f, tags):
+ tags = (t.strip() for t in tags.split(','))
+ tags_entries = [Tag.get_or_create(tag=tagname)[0] for tagname in tags]
+
+ sha1 = sha1file(f)
+ readme, _ = ReadmeFile.get_or_create(sha1=sha1)
+
+ for te in tags_entries:
+ rt, _ = ReadmeTag.get_or_create(tag=te, readme=readme)
+
+ package, _ = Package.get_or_create(name=pkgname)#, readme=readme)
+
+
+async def handle_package(package_name, sem):
+ pkg = PKG_LIST[package_name]
+
+ src = SRC_LIST[pkg['Source'].split()[0] if 'Source' in pkg else package_name]
+ dsc = next(x for x in src['Files'] if x['name'].endswith('.dsc'))['name']
+ directory = src['Directory']
+ url_dsc = '{mirror}/{directory}/{dsc}'.format(mirror=MIRROR,
+ directory=directory, dsc=dsc)
+ async with sem:
+ with tempfile.TemporaryDirectory(prefix='debianreadme',
+ dir='/srv/hdd/tmp') as tdir:
+ await communicate(['dget', url_dsc], cwd=tdir)
+ extract_root = Path(tdir)
+ source_folder = next((d for d in extract_root.iterdir()
+ if d.is_dir()), None)
+ if source_folder is not None:
+ for f in source_folder.iterdir():
+ if f.is_file() and f.name in README_FILES:
+ await asyncio.get_event_loop().run_in_executor(None,
+ handle_file, package_name, f, pkg['Tag'])
+
+
+async def handle_package_pg(package_name, sem, pbar):
+ await handle_package(package_name, sem)
+ pbar.update(1)
+
+
+async def main():
+ print('Loading packages… ', end='', flush=True)
+ with aiohttp.ClientSession() as session:
+ await pkg_list(session, 'main')
+ await pkg_list(session, 'contrib')
+ await pkg_list(session, 'non-free')
+ print('done.')
+
+ print('Filtering packages… ', end='', flush=True)
+ todo = [k for k, v in PKG_LIST.items()
+ if 'Tag' in v
+ and not Package.select().where(Package.name == k).exists()]
+ print('done.')
+
+ semaphore = asyncio.BoundedSemaphore(64)
+ pbar = tqdm.tqdm(total=len(todo))
+ tasks = [handle_package_pg(pkg, semaphore, pbar) for pkg in todo]
+ await asyncio.gather(*tasks)
+
+
+if __name__ == '__main__':
+ # One worker to have sequential database access outside the main loop
+ executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+
+ loop = asyncio.get_event_loop()
+ loop.set_default_executor(executor)
+ loop.run_until_complete(main())
+ loop.close()
diff --git a/seirl/debian-tagreadmes/requirements.txt b/seirl/debian-tagreadmes/requirements.txt
new file mode 100644
--- /dev/null
+++ b/seirl/debian-tagreadmes/requirements.txt
@@ -0,0 +1,4 @@
+aiohttp
+peewee
+python-debian
+tqdm
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jul 3, 3:24 PM (1 w, 35 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3222695
Attached To
D209: seirl: add debian-tagreadmes
Event Timeline
Log In to Comment