From c1697ab694fe549d7b6ff81a00737a2ad63e9461 Mon Sep 17 00:00:00 2001
From: Dan McGee <dan@archlinux.org>
Date: Wed, 10 Feb 2010 21:28:49 -0600
Subject: reporead: turn into a django-admin command

Rather than struggle with getting the environment set up, let's make this a
custom Django admin command and use the flexibility that gives us. This is
the initial rough cut of making it happen; further commits should clean up
some of the rough edges.

Signed-off-by: Dan McGee <dan@archlinux.org>
---
 devel/management/commands/__init__.py |   0
 devel/management/commands/reporead.py | 339 ++++++++++++++++++++++++++++++++++
 2 files changed, 339 insertions(+)
 create mode 100644 devel/management/commands/__init__.py
 create mode 100755 devel/management/commands/reporead.py

(limited to 'devel/management/commands')

diff --git a/devel/management/commands/__init__.py b/devel/management/commands/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py
new file mode 100755
index 00000000..b53e259c
--- /dev/null
+++ b/devel/management/commands/reporead.py
@@ -0,0 +1,339 @@
+# -*- coding: utf-8 -*-
+"""
+reporead command
+
+Parses a repo.db.tar.gz file and updates the Arch database with the relevant
+changes.
+
+Usage: ./manage.py reporead ARCH PATH
+ ARCH:  architecture to update, and can be one of: i686, x86_64
+ PATH:  full path to the repo.db.tar.gz file.
+
+Example:
+  ./manage.py reporead i686 /tmp/core.db.tar.gz
+"""
+
+# multi value blocks
+REPOVARS = ['arch', 'backup', 'base', 'builddate', 'conflicts', 'csize',
+            'deltas', 'depends', 'desc', 'filename', 'files', 'force', 
+            'groups', 'installdate', 'isize', 'license', 'md5sum', 
+            'name', 'optdepends', 'packager', 'provides', 'reason', 
+            'replaces', 'size', 'url', 'version']
+
+
+from django.core.management.base import BaseCommand
+from django.conf import settings
+from django.db import models, transaction
+from django.core import management
+
+import os
+import re
+import sys
+import tarfile
+import logging
+from datetime import datetime
+from optparse import make_option
+
+from cStringIO import StringIO
+from logging import WARNING,INFO,DEBUG
+
+from main.models import Arch, Package, Repo
+
+class SomethingFishyException(Exception):
+    '''Raised when the database looks like its going to wipe out a bunch of
+    packages.'''
+    pass
+
+logging.basicConfig(
+    level=WARNING,
+    format='%(asctime)s -> %(levelname)s: %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+    stream=sys.stderr)
+logger = logging.getLogger()
+
+class Command(BaseCommand):
+    option_list = BaseCommand.option_list
+
+    def handle(self, arch=None, file=None, **options):
+        logger.level = INFO
+        if arch == None or file == None:
+            usage()
+            return 0
+        file = os.path.normpath(file)
+        read_repo(arch, file)
+
+
+class Pkg(object):
+    """An interim 'container' object for holding Arch package data."""
+
+    def __init__(self, val):
+        selfdict = {}
+        squash = ['arch', 'builddate', 'csize', 'desc', 'filename',
+                  'installdate', 'isize', 'license', 'md5sum', 
+                  'packager', 'size', 'url']
+        
+        selfdict['name'] = val['name'][0]
+        selfdict['base'] = None
+        del val['name']
+        if 'desc' not in val:
+            logger.warning("Package %s has no description" % selfdict['name'])
+            val['desc'] = None
+        if 'url' not in val:
+            val['url'] = None
+        if 'license' not in val:
+            val['license'] = []
+        for x in val.keys():
+            if x in squash:
+                if val[x] == None or len(val[x]) == 0:
+                    logger.warning("Package %s has no %s" % (selfdict['name'],x))
+                    selfdict[x] = None
+                else:
+                    selfdict[x] = ', '.join(val[x])
+                    # make sure we don't have elements larger than the db char
+                    # fields
+                    if len(selfdict[x]) > 255:
+                        selfdict[x] = selfdict[x][:254]
+            elif x == 'base':
+                selfdict[x] = val[x][0]
+            elif x == 'force':
+                selfdict[x] = True
+            elif x == 'version':
+                version = val[x][0].rsplit('-')
+                selfdict['ver'] = version[0]
+                selfdict['rel'] = version[1]
+            elif x == 'reason':
+                selfdict[x] = int(val[x][0])
+            else:
+                selfdict[x] = val[x]
+        self.__dict__ = selfdict
+    
+    def __getattr__(self,name):
+        if name == 'force':
+            return False
+        else:
+            return None
+
+
+def usage():
+    """Print the usage of this application."""
+    print __doc__.strip()
+
+
+def populate_pkg(dbpkg, repopkg, timestamp=None):
+    if not timestamp: timestamp = datetime.now()
+    dbpkg.pkgbase = repopkg.base
+    dbpkg.pkgver = repopkg.ver
+    dbpkg.pkgrel = repopkg.rel
+    dbpkg.pkgdesc = repopkg.desc
+    dbpkg.license = repopkg.license
+    dbpkg.url = repopkg.url
+    dbpkg.needupdate = False
+    dbpkg.last_update = timestamp
+    dbpkg.save()
+    # files are not in the repo.db.tar.gz
+    #for x in repopkg.files:
+    #    dbpkg.packagefile_set.create(path=x)
+    dbpkg.packagedepend_set.all().delete()
+    if 'depends' in repopkg.__dict__:
+        for y in repopkg.depends:
+            # make sure we aren't adding self depends..
+            # yes *sigh* i have seen them in pkgbuilds
+            dpname,dpvcmp = re.match(r"([a-z0-9._+-]+)(.*)", y).groups()
+            if dpname == repopkg.name:
+                logger.warning('Package %s has a depend on itself' % repopkg.name)
+                continue
+            dbpkg.packagedepend_set.create(depname=dpname, depvcmp=dpvcmp)
+            logger.debug('Added %s as dep for pkg %s' % (dpname,repopkg.name))
+
+
+def db_update(archname, pkgs):
+    """
+    Parses a list and updates the Arch dev database accordingly.
+
+    Arguments:
+      pkgs -- A list of Pkg objects.
+    
+    """
+    logger.info('Updating Arch: %s' % archname)
+    repository = Repo.objects.get(name__iexact=pkgs[0].repo)
+    architecture = Arch.objects.get(name__iexact=archname)
+    dbpkgs = Package.objects.filter(arch=architecture, repo=repository)
+    # It makes sense to fully evaluate our DB query now because we will
+    # be using 99% of the objects in our "in both sets" loop. Force eval
+    # by calling list() on the QuerySet.
+    list(dbpkgs)
+    # This makes our inner loop where we find packages by name *way* more
+    # efficient by not having to go to the database for each package to
+    # SELECT them by name.
+    dbdict = dict([(pkg.pkgname, pkg) for pkg in dbpkgs])
+    now = datetime.now()
+
+    # go go set theory!
+    # thank you python for having a set class <3
+    logger.debug("Creating sets")
+    dbset = set([pkg.pkgname for pkg in dbpkgs])
+    syncset = set([pkg.name for pkg in pkgs])
+    logger.info("%d packages in current web DB" % len(dbset))
+    logger.info("%d packages in new updating db" % len(syncset))
+    # packages in syncdb and not in database (add to database)
+    logger.debug("Set theory: Packages in syncdb not in database")
+    in_sync_not_db = syncset - dbset
+    logger.info("%d packages in sync not db" % len(in_sync_not_db))
+
+    # Try to catch those random orphaning issues that make Eric so unhappy.
+    if len(dbset) > 20:
+        dbpercent = 100.0 * len(syncset) / len(dbset)
+    else:
+        # we don't have 20 packages in this repo/arch, so this check could
+        # produce a lot of false positives (or a div by zero). fake it
+        dbpercent = 100.0
+    logger.info("DB package ratio: %.1f%%" % dbpercent)
+    if dbpercent < 50.0 and repository.name.lower().find('testing') == -1:
+        logger.error(".db.tar.gz has %.1f%% the number of packages in the web database" % dbpercent)
+        raise SomethingFishyException(
+            'It looks like the syncdb is less than half the size of the web db. WTF?')
+
+    if dbpercent < 75.0:
+        logger.warning(".db.tar.gz has %.1f%% the number of packages in the web database." % dbpercent)
+    
+    for p in [x for x in pkgs if x.name in in_sync_not_db]:
+        logger.info("Adding package %s", p.name)
+        pkg = Package(pkgname = p.name, arch = architecture, repo = repository)
+        populate_pkg(pkg, p, timestamp=now)
+
+    # packages in database and not in syncdb (remove from database)
+    logger.debug("Set theory: Packages in database not in syncdb")
+    in_db_not_sync = dbset - syncset
+    for p in in_db_not_sync:
+        logger.info("Removing package %s from database", p)
+        Package.objects.get(
+            pkgname=p, arch=architecture, repo=repository).delete()
+
+    # packages in both database and in syncdb (update in database)
+    logger.debug("Set theory: Packages in database and syncdb")
+    pkg_in_both = syncset & dbset
+    for p in [x for x in pkgs if x.name in pkg_in_both]:
+        logger.debug("Looking for package updates")
+        dbp = dbdict[p.name]
+        if ''.join((p.ver,p.rel)) == ''.join((dbp.pkgver,dbp.pkgrel)):
+            continue
+        logger.info("Updating package %s in database", p.name)
+        pkg = Package.objects.get(
+            pkgname=p.name,arch=architecture, repo=repository)
+        populate_pkg(pkg, p, timestamp=now)
+
+    logger.info('Finished updating Arch: %s' % archname)
+
+
+def parse_inf(iofile):
+    """
+    Parses an Arch repo db information file, and returns variables as a list.
+
+    Arguments:
+     iofile -- A StringIO, FileType, or other object with readlines method.
+
+    """
+    store = {}
+    lines = iofile.readlines()
+    blockname = None
+    max = len(lines)
+    i = 0
+    while i < max:
+        line = lines[i].strip()
+        if len(line) > 0 and line[0] == '%' and line[1:-1].lower() in REPOVARS:
+            blockname = line[1:-1].lower()
+            logger.debug("Parsing package block %s",blockname)
+            store[blockname] = []
+            i += 1
+            while i < max and len(lines[i].strip()) > 0:
+                store[blockname].append(lines[i].strip())
+                i += 1
+            # here is where i would convert arrays to strings
+            # based on count and type, but i dont think it is needed now
+        i += 1
+
+    return store
+
+
+def parse_repo(repopath):
+    """
+    Parses an Arch repo db file, and returns a list of Pkg objects.
+
+    Arguments:
+     repopath -- The path of a repository db file.
+
+    """
+    logger.info("Starting repo parsing")
+    if not os.path.exists(repopath):
+        logger.error("Could not read file %s", repopath)
+    
+    logger.info("Reading repo tarfile %s", repopath)
+    filename = os.path.split(repopath)[1]
+    rindex = filename.rindex('.db.tar.gz')
+    reponame = filename[:rindex]
+    
+    repodb = tarfile.open(repopath,"r:gz")
+    ## assuming well formed tar, with dir first then files after
+    ## repo-add enforces this
+    logger.debug("Starting package parsing")
+    pkgs = []
+    tpkg = None
+    while True:
+        tarinfo = repodb.next()
+        if tarinfo == None or tarinfo.isdir():
+            if tpkg != None:
+                tpkg.reset()
+                data = parse_inf(tpkg)
+                p = Pkg(data)
+                p.repo = reponame
+                logger.debug("Done parsing package %s", p.name)
+                pkgs.append(p)
+            if tarinfo == None:
+                break
+            # set new tpkg
+            tpkg = StringIO()
+        if tarinfo.isreg():
+            if os.path.split(tarinfo.name)[1] in ('desc','depends'):
+                tpkg.write(repodb.extractfile(tarinfo).read())
+                tpkg.write('\n') # just in case 
+    repodb.close()
+    logger.info("Finished repo parsing")
+    return pkgs
+
+@transaction.commit_on_success
+def read_repo(arch, file):
+    """
+    Parses repo.db.tar.gz file and returns exit status.
+    """
+    # check if arch is valid
+    available_arches = [x.name for x in Arch.objects.all()]
+    if arch not in available_arches:
+        usage()
+        return 0
+    else:
+        primary_arch = arch
+
+    packages = parse_repo(file)
+    
+    # sort packages by arch -- to handle noarch stuff
+    packages_arches = {}
+    for arch in available_arches:
+        packages_arches[arch] = []
+    
+    for package in packages:
+        if package.arch in ('any', primary_arch):
+            packages_arches[package.arch].append(package)
+        else:
+            logger.warning("Package %s arch = %s" % (
+                package.name,package.arch))
+            #package.arch = primary_arch
+
+
+    logger.info('Starting database updates.')
+    for (arch, pkgs) in packages_arches.iteritems():
+        if len(pkgs) > 0:
+            db_update(arch,pkgs)
+    logger.info('Finished database updates.')
+    return 0
+
+# vim: set ts=4 sw=4 et:
-- 
cgit v1.2.3-54-g00ecf