From f6c41b273c8962718b303c6050c2fd8bcea533a8 Mon Sep 17 00:00:00 2001 From: Dan McGee Date: Wed, 23 Feb 2011 09:46:54 -0600 Subject: reporead performance improvements When importing over a million files, it makes sense to take the slightly faster route and call the PackageFile() constructor directly rather than going through the related manager's create method. We can also get huge performance improvements, especially with files databases, by using the 'io' rather than 'codecs' module. The 'io' module is implemented in C as of Python 2.7, which cuts a no-work import of extra.files.tar.gz (so measuring only the DB read speed) from ~30 seconds to ~5 seconds. Signed-off-by: Dan McGee --- devel/management/commands/reporead.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'devel') diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py index 72595c63..bda3bd61 100644 --- a/devel/management/commands/reporead.py +++ b/devel/management/commands/reporead.py @@ -27,9 +27,17 @@ from datetime import datetime from optparse import make_option +# New in 2.6, but fast (C implementation) in 2.7. We will use it over codecs if +# available. Eventually remove the codecs import completely. 
+io = None +try: + import io +except ImportError: + pass + from logging import ERROR, WARNING, INFO, DEBUG -from main.models import Arch, Package, PackageDepend, Repo +from main.models import Arch, Package, PackageDepend, PackageFile, Repo logging.basicConfig( level=WARNING, @@ -241,10 +249,13 @@ def populate_files(dbpkg, repopkg, force=False): dirname, filename = f.rsplit('/', 1) if filename == '': filename = None - dbpkg.packagefile_set.create( + # this is basically like calling dbpkg.packagefile_set.create(), + # but much faster as we can skip a lot of the repeated code paths + pkgfile = PackageFile(pkg=dbpkg, is_directory=(filename is None), directory=dirname + '/', filename=filename) + pkgfile.save() dbpkg.files_last_update = datetime.now() dbpkg.save() @@ -394,7 +405,11 @@ def parse_repo(repopath): if fname not in dbfiles: continue data_file = repodb.extractfile(tarinfo) - data_file = codecs.EncodedFile(data_file, 'utf-8') + if io is None: + data_file = codecs.EncodedFile(data_file, 'utf-8') + else: + data_file = io.TextIOWrapper(io.BytesIO(data_file.read()), + encoding='utf-8') try: data = parse_info(data_file) p = pkgs.setdefault(pkgid, Pkg(reponame)) -- cgit v1.2.3-54-g00ecf