diff options
author | Dan McGee <dan@archlinux.org> | 2012-11-16 15:39:56 -0600 |
---|---|---|
committer | Dan McGee <dan@archlinux.org> | 2012-11-16 16:37:52 -0600 |
commit | a2cfa7edbbed8edb1ad4d3391c6edb055c13de1b (patch) | |
tree | b703737a5f4afcceeea4691a0d8fb8129569807b | |
parent | 45d81a9578e846062550335495dbceb82f16a1a0 (diff) |
Optimize mirror status data fetching
Now that we have as many mirror URLs as we do, we can do a better job
fetching and aggregating this data. The prior method resulted in a
rather unwieldy query being pushed down to the database with a
horrendously long GROUP BY clause. Instead of trying to group by
everything at once so we can retrieve mirror URL info at the same time,
separate the two queries — one for getting URL performance data, one for
the qualitative data.
The impetus behind fixing this is the PostgreSQL slow query log in
production; this query currently appears more often than any other query
we run in the system.
Signed-off-by: Dan McGee <dan@archlinux.org>
-rw-r--r-- | mirrors/utils.py | 24 |
1 file changed, 16 insertions, 8 deletions
diff --git a/mirrors/utils.py b/mirrors/utils.py index 85e4ee93..07a7138f 100644 --- a/mirrors/utils.py +++ b/mirrors/utils.py @@ -33,23 +33,26 @@ def annotate_url(url, delays): @cache_function(123) def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): cutoff_time = now() - cutoff - # I swear, this actually has decent performance... - urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( + url_data = MirrorUrl.objects.values('id', 'mirror_id').filter( mirror__active=True, mirror__public=True, logs__check_time__gte=cutoff_time).annotate( check_count=Count('logs'), success_count=Count('logs__duration'), last_sync=Max('logs__last_sync'), last_check=Max('logs__check_time'), - duration_avg=Avg('logs__duration')).order_by( - 'mirror', 'url') - - if mirror_ids: - urls = urls.filter(mirror_id__in=mirror_ids) + duration_avg=Avg('logs__duration')) vendor = database_vendor(MirrorUrl) if vendor != 'sqlite': - urls = urls.annotate(duration_stddev=StdDev('logs__duration')) + url_data = url_data.annotate(duration_stddev=StdDev('logs__duration')) + + urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter( + mirror__active=True, mirror__public=True, + logs__check_time__gte=cutoff_time).order_by('mirror__id', 'url') + + if mirror_ids: + url_data = url_data.filter(mirror_id__in=mirror_ids) + urls = urls.filter(mirror_id__in=mirror_ids) # The Django ORM makes it really hard to get actual average delay in the # above query, so run a seperate query for it and we will process the @@ -66,6 +69,11 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_ids=None): delays.setdefault(url_id, []).append(delay) if urls: + url_data = dict((item['id'], item) for item in url_data) + for url in urls: + for k, v in url_data.get(url.id, {}).items(): + if k not in ('id', 'mirror_id'): + setattr(url, k, v) last_check = max([u.last_check for u in urls]) num_checks = max([u.check_count for u in urls]) check_info = 
MirrorLog.objects.filter(check_time__gte=cutoff_time) |