author     Dan McGee <dan@archlinux.org>    2013-04-13 11:33:17 -0500
committer  Dan McGee <dan@archlinux.org>    2013-04-13 11:41:33 -0500
commit     2c24ee9100a9e60fec16055d6496caeda3a1d8e2 (patch)
tree       7d7ac7f0ab84cd22d57545fc408d9f41e9028cfc /mirrors
parent     06e1e857abfdf7f95661d337ce3c315bd51fb837 (diff)
Calculate average URL delay in the database
Rather than doing this in the Python code and needing 12,000+ rows returned from the database, we can do it in the database and get fewer than 300 rows back.

If I recall correctly, the reason this was not done originally was due to our usage of MySQL and some really bad date math/overflow stuff it did when the interval between last_sync and check_time was greater than about a week. Luckily, we have switched to using a more sane database.

Signed-off-by: Dan McGee <dan@archlinux.org>
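For illustration only (not part of the commit), here is a minimal sketch of how the new database-side aggregation is meant to be consumed, assuming the project's Django settings are loaded and a PostgreSQL backend where AVG over an interval expression comes back from the driver as a datetime.timedelta:

    # Hypothetical usage sketch; url_delays() is the helper added by this commit.
    from datetime import timedelta

    from django.utils.timezone import now

    from mirrors.utils import url_delays

    cutoff_time = now() - timedelta(hours=24)

    # Previously ~12,000 MirrorLog rows were fetched and averaged in Python;
    # now the database groups by url_id and returns one averaged interval per
    # URL (fewer than 300 rows), already shaped as {url_id: timedelta}.
    delays = url_delays(cutoff_time)

    for url_id, avg_delay in delays.items():
        print(url_id, avg_delay)  # avg_delay is a datetime.timedelta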
Diffstat (limited to 'mirrors')
-rw-r--r--    mirrors/utils.py    49
1 file changed, 30 insertions, 19 deletions
diff --git a/mirrors/utils.py b/mirrors/utils.py
index 3ab176b3..2721e20e 100644
--- a/mirrors/utils.py
+++ b/mirrors/utils.py
@@ -1,5 +1,6 @@
from datetime import timedelta
+from django.db import connection
from django.db.models import Avg, Count, Max, Min, StdDev
from django.utils.timezone import now
from django_countries.fields import Country
@@ -10,13 +11,12 @@ from .models import MirrorLog, MirrorProtocol, MirrorUrl
DEFAULT_CUTOFF = timedelta(hours=24)
-def annotate_url(url, delays):
+def annotate_url(url, delay):
     '''Given a MirrorURL object, add a few more attributes to it regarding
     status, including completion_pct, delay, and score.'''
     url.completion_pct = float(url.success_count) / url.check_count
-    if url.id in delays:
-        url_delays = delays[url.id]
-        url.delay = sum(url_delays, timedelta()) / len(url_delays)
+    if delay is not None:
+        url.delay = delay
         hours = url.delay.days * 24.0 + url.delay.seconds / 3600.0
         if url.completion_pct > 0:
@@ -30,6 +30,30 @@ def annotate_url(url, delays):
         url.score = None
+def url_delays(cutoff_time, mirror_id=None):
+    cursor = connection.cursor()
+    if mirror_id is None:
+        sql = """
+SELECT url_id, AVG(check_time - last_sync)
+FROM mirrors_mirrorlog
+WHERE is_success = %s AND check_time >= %s AND last_sync IS NOT NULL
+GROUP BY url_id
+"""
+        cursor.execute(sql, [True, cutoff_time])
+    else:
+        sql = """
+SELECT l.url_id, avg(check_time - last_sync)
+FROM mirrors_mirrorlog l
+JOIN mirrors_mirrorurl u ON u.id = l.url_id
+WHERE is_success = %s AND check_time >= %s AND last_sync IS NOT NULL
+AND mirror_id = %s
+GROUP BY url_id
+"""
+        cursor.execute(sql, [True, cutoff_time, mirror_id])
+
+    return {url_id: delay for url_id, delay in cursor.fetchall()}
+
+
@cache_function(123)
def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None):
     cutoff_time = now() - cutoff
@@ -55,20 +79,7 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None):
     urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter(
             id__in=valid_urls).order_by('mirror__id', 'url')
-
-    # The Django ORM makes it really hard to get actual average delay in the
-    # above query, so run a seperate query for it and we will process the
-    # results here.
-    times = MirrorLog.objects.values_list(
-            'url_id', 'check_time', 'last_sync').filter(
-            is_success=True, last_sync__isnull=False,
-            check_time__gte=cutoff_time)
-    if mirror_ids:
-        times = times.filter(url__mirror_id__in=mirror_ids)
-    delays = {}
-    for url_id, check_time, last_sync in times:
-        delay = check_time - last_sync
-        delays.setdefault(url_id, []).append(delay)
+    delays = url_delays(cutoff_time)
     if urls:
         url_data = dict((item['id'], item) for item in url_data)
@@ -97,7 +108,7 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None):
             # fake the standard deviation for local testing setups
             if vendor == 'sqlite':
                 setattr(url, 'duration_stddev', 0.0)
-            annotate_url(url, delays)
+            annotate_url(url, delays.get(url.id, None))
     return {
         'cutoff': cutoff,