Page MenuHomeSoftware Heritage

D211.id699.diff
No OneTemporary

D211.id699.diff

diff --git a/seirl/debian-tagreadmes/import.py b/seirl/debian-tagreadmes/import.py
--- a/seirl/debian-tagreadmes/import.py
+++ b/seirl/debian-tagreadmes/import.py
@@ -18,6 +18,7 @@
MIRROR = 'http://ftp.fr.debian.org/debian'
+SOURCES = 'http://sources.debian.net'
RELEASE = 'jessie'
BASE_URL = '{mirror}/dists/{release}'.format(mirror=MIRROR,
release=RELEASE)
@@ -71,39 +72,26 @@
tag = TextField(unique=True)
-class ReadmeFile(BaseModel):
- sha1 = CharField(unique=True)
+class Package(BaseModel):
+ name = TextField(unique=True)
-class ReadmeTag(BaseModel):
- readme = ForeignKeyField(ReadmeFile)
+class PackageTag(BaseModel):
+ package = ForeignKeyField(Package)
tag = ForeignKeyField(Tag)
class Meta:
- indexes = ((('readme', 'tag'), True),)
+ indexes = ((('package', 'tag'), True),)
-class Package(BaseModel):
- name = TextField(unique=True)
-
- # this should be a manytomany...
- #readme = ForeignKeyField(ReadmeFile, related_name='packages')
+class Readme(BaseModel):
+ name = TextField()
+ sha256 = CharField(max_length=256)
+ package = ForeignKeyField(Package, related_name='readmes')
db.connect()
-db.create_tables([Tag, ReadmeFile, ReadmeTag, Package], safe=True)
-
-
-def sha1file(path):
- BUF_SIZE = 2 ** 22
- sha1 = hashlib.sha1()
- with path.open('rb') as f:
- while True:
- data = f.read(BUF_SIZE)
- if not data:
- break
- sha1.update(data)
- return sha1.digest()
+db.create_tables([Tag, Package, PackageTag, Readme], safe=True)
async def communicate(cmdline, data=None, **kwargs):
@@ -137,43 +125,51 @@
PKG_LIST.update(await getdeb822(session, packages, cls=Packages))
-def handle_file(pkgname, f, tags):
+def handle_file(pkgname, readme_name, sha256, tags):
tags = (t.strip() for t in tags.split(','))
tags_entries = [Tag.get_or_create(tag=tagname)[0] for tagname in tags]
- sha1 = sha1file(f)
- readme, _ = ReadmeFile.get_or_create(sha1=sha1)
-
+ package, _ = Package.get_or_create(name=pkgname)
for te in tags_entries:
- rt, _ = ReadmeTag.get_or_create(tag=te, readme=readme)
+ pt, _ = PackageTag.get_or_create(tag=te, package=package)
- package, _ = Package.get_or_create(name=pkgname)#, readme=readme)
+ readme, _ = Readme.get_or_create(name=readme_name, sha256=sha256, package=package)
-async def handle_package(package_name, sem):
+async def handle_package(package_name, session, sem):
pkg = PKG_LIST[package_name]
src = SRC_LIST[pkg['Source'].split()[0] if 'Source' in pkg else package_name]
- dsc = next(x for x in src['Files'] if x['name'].endswith('.dsc'))['name']
- directory = src['Directory']
- url_dsc = '{mirror}/{directory}/{dsc}'.format(mirror=MIRROR,
- directory=directory, dsc=dsc)
+ url_index = '{sources}/api/src/{package}/{version}/'.format(
+ sources=SOURCES, package=src['Package'], version=src['Version'])
async with sem:
- with tempfile.TemporaryDirectory(prefix='debianreadme',
- dir='/srv/hdd/tmp') as tdir:
- await communicate(['dget', url_dsc], cwd=tdir)
- extract_root = Path(tdir)
- source_folder = next((d for d in extract_root.iterdir()
- if d.is_dir()), None)
- if source_folder is not None:
- for f in source_folder.iterdir():
- if f.is_file() and f.name in README_FILES:
- await asyncio.get_event_loop().run_in_executor(None,
- handle_file, package_name, f, pkg['Tag'])
-
-
-async def handle_package_pg(package_name, sem, pbar):
- await handle_package(package_name, sem)
+ data = await session.get(url_index)
+ index = await data.json()
+
+ # Bug with non-ascii file paths
+ if 'error' in index and index['error'] == 500:
+ tqdm.tqdm.write('error 500 with package {}'.format(package_name))
+ return
+
+ for f in index['content']:
+ if f['type'] == 'file' and f['name'] in README_FILES:
+ url_content = '{url_index}{fname}/'.format(url_index=url_index,
+ fname=f['name'])
+ content = await (await session.get(url_content)).json()
+
+ # Broken symlink
+ if 'error' in content and content['error'] == 404:
+ tqdm.tqdm.write('broken symlink for {} in {}'.format(
+ f['name'], package_name))
+ continue
+
+ await asyncio.get_event_loop().run_in_executor(
+ None, handle_file, package_name, f['name'],
+ content['checksum'], pkg['Tag'])
+
+
+async def handle_package_pg(package_name, session, sem, pbar):
+ await handle_package(package_name, session, sem)
pbar.update(1)
@@ -191,10 +187,11 @@
and not Package.select().where(Package.name == k).exists()]
print('done.')
- semaphore = asyncio.BoundedSemaphore(64)
+ semaphore = asyncio.BoundedSemaphore(10)
pbar = tqdm.tqdm(total=len(todo))
- tasks = [handle_package_pg(pkg, semaphore, pbar) for pkg in todo]
- await asyncio.gather(*tasks)
+ with aiohttp.ClientSession() as session:
+ tasks = [handle_package_pg(pkg, session, semaphore, pbar) for pkg in todo]
+ await asyncio.gather(*tasks)
if __name__ == '__main__':

File Metadata

Mime Type
text/plain
Expires
Sun, Aug 17, 8:57 PM (17 h, 34 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3215676

Event Timeline