Diffstat (limited to 'mirrors/utils.py')
-rw-r--r-- | mirrors/utils.py | 247 |
1 files changed, 187 insertions, 60 deletions
diff --git a/mirrors/utils.py b/mirrors/utils.py
index 0463247a..7c2f5d17 100644
--- a/mirrors/utils.py
+++ b/mirrors/utils.py
@@ -1,64 +1,150 @@
-from django.db.models import Avg, Count, Max, Min, StdDev
-
-from main.utils import cache_function
-from .models import MirrorLog, MirrorProtocol, MirrorUrl
-
-import datetime
-
-default_cutoff = datetime.timedelta(hours=24)
-
-@cache_function(300)
-def get_mirror_statuses(cutoff=default_cutoff):
-    cutoff_time = datetime.datetime.utcnow() - cutoff
-    protocols = MirrorProtocol.objects.exclude(protocol__iexact='rsync')
-    # I swear, this actually has decent performance...
-    urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter(
-            mirror__active=True, mirror__public=True,
-            protocol__in=protocols,
-            logs__check_time__gte=cutoff_time).annotate(
-            check_count=Count('logs'),
-            success_count=Count('logs__duration'),
-            last_sync=Max('logs__last_sync'),
-            last_check=Max('logs__check_time'),
-            duration_avg=Avg('logs__duration'),
-            duration_stddev=StdDev('logs__duration')
-            ).order_by('-last_sync', '-duration_avg')
-
-    # The Django ORM makes it really hard to get actual average delay in the
-    # above query, so run a seperate query for it and we will process the
-    # results here.
-    times = MirrorLog.objects.filter(is_success=True, last_sync__isnull=False,
-            check_time__gte=cutoff_time)
-    delays = {}
-    for log in times:
-        d = log.check_time - log.last_sync
-        delays.setdefault(log.url_id, []).append(d)
+from datetime import timedelta
+
+from django.db import connection
+from django.db.models import Count, Max, Min
+from django.utils.dateparse import parse_datetime
+from django.utils.timezone import now
+
+from main.utils import cache_function, database_vendor
+from .models import MirrorLog, MirrorUrl
+
+
+DEFAULT_CUTOFF = timedelta(hours=24)
+
+
+def dictfetchall(cursor):
+    "Returns all rows from a cursor as a dict."
+    desc = cursor.description
+    return [
+        dict(zip([col[0] for col in desc], row))
+        for row in cursor.fetchall()
+    ]
+
+def status_data(cutoff_time, mirror_id=None):
+    if mirror_id is not None:
+        params = [cutoff_time, mirror_id]
+        mirror_where = 'AND u.mirror_id = %s'
+    else:
+        params = [cutoff_time]
+        mirror_where = ''
+
+    vendor = database_vendor(MirrorUrl)
+    if vendor == 'sqlite':
+        sql = """
+SELECT l.url_id, u.mirror_id,
+    COUNT(l.id) AS check_count,
+    COUNT(l.last_sync) AS success_count,
+    MAX(l.last_sync) AS last_sync,
+    MAX(l.check_time) AS last_check,
+    AVG(l.duration) AS duration_avg,
+    0.0 AS duration_stddev,
+    AVG(STRFTIME('%%s', check_time) - STRFTIME('%%s', last_sync)) AS delay
+FROM mirrors_mirrorlog l
+JOIN mirrors_mirrorurl u ON u.id = l.url_id
+WHERE l.check_time >= %s
+""" + mirror_where + """
+GROUP BY l.url_id, u.mirror_id
+"""
+    else:
+        sql = """
+SELECT l.url_id, u.mirror_id,
+    COUNT(l.id) AS check_count,
+    COUNT(l.last_sync) AS success_count,
+    MAX(l.last_sync) AS last_sync,
+    MAX(l.check_time) AS last_check,
+    AVG(l.duration) AS duration_avg,
+    STDDEV(l.duration) AS duration_stddev,
+    AVG(check_time - last_sync) AS delay
+FROM mirrors_mirrorlog l
+JOIN mirrors_mirrorurl u ON u.id = l.url_id
+WHERE l.check_time >= %s
+""" + mirror_where + """
+GROUP BY l.url_id, u.mirror_id
+"""
+
+    cursor = connection.cursor()
+    cursor.execute(sql, params)
+    url_data = dictfetchall(cursor)
+
+    # sqlite loves to return less than ideal types
+    if vendor == 'sqlite':
+        for item in url_data:
+            if item['delay'] is not None:
+                item['delay'] = timedelta(seconds=item['delay'])
+            if item['last_sync'] is not None:
+                item['last_sync'] = parse_datetime(item['last_sync'])
+            item['last_check'] = parse_datetime(item['last_check'])
+
+    return {item['url_id']: item for item in url_data}
+
+
+def annotate_url(url, url_data):
+    '''Given a MirrorURL object, add a few more attributes to it regarding
+    status, including completion_pct, delay, and score.'''
+    # set up some sane default values in case we are missing anything
+    url.success_count = 0
+    url.check_count = 0
+    url.completion_pct = None
+    url.duration_avg = None
+    url.duration_stddev = None
+    url.last_check = None
+    url.last_sync = None
+    url.delay = None
+    url.score = None
+    for k, v in url_data.items():
+        if k not in ('url_id', 'mirror_id'):
+            setattr(url, k, v)
+
+    if url.check_count > 0:
+        url.completion_pct = float(url.success_count) / url.check_count
+
+    if url.delay is not None:
+        hours = url.delay.days * 24.0 + url.delay.seconds / 3600.0
+
+        if url.completion_pct > 0.0:
+            divisor = url.completion_pct
+        else:
+            # arbitrary small value
+            divisor = 0.005
+        stddev = url.duration_stddev or 0.0
+        url.score = (hours + url.duration_avg + stddev) / divisor
+
+    return url
+
+
+@cache_function(178)
+def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_id=None, show_all=False):
+    cutoff_time = now() - cutoff
+
+    urls = MirrorUrl.objects.select_related(
+            'mirror', 'protocol').order_by('mirror__id', 'url')
+    if mirror_id:
+        urls = urls.filter(mirror_id=mirror_id)
+    if not show_all:
+        urls = urls.filter(active=True, mirror__active=True,
+                mirror__public=True)
 
     if urls:
-        last_check = max([u.last_check for u in urls])
-        num_checks = max([u.check_count for u in urls])
-        check_info = MirrorLog.objects.filter(
-                check_time__gte=cutoff_time).aggregate(
+        url_data = status_data(cutoff_time, mirror_id)
+        urls = [annotate_url(url, url_data.get(url.id, {})) for url in urls]
+        last_check = max([u.last_check for u in urls if u.last_check] or [None])
+        num_checks = max(u.check_count for u in urls)
+        check_info = MirrorLog.objects.filter(check_time__gte=cutoff_time)
+        if mirror_id:
+            check_info = check_info.filter(url__mirror_id=mirror_id)
+        check_info = check_info.aggregate(
                 mn=Min('check_time'), mx=Max('check_time'))
-        check_frequency = (check_info['mx'] - check_info['mn']) / num_checks
+        if num_checks > 1:
+            check_frequency = (check_info['mx'] - check_info['mn']) \
+                    / (num_checks - 1)
+        else:
+            check_frequency = None
     else:
+        urls = []
         last_check = None
         num_checks = 0
         check_frequency = None
 
-    for url in urls:
-        url.completion_pct = float(url.success_count) / num_checks
-        if url.id in delays:
-            url_delays = delays[url.id]
-            d = sum(url_delays, datetime.timedelta()) / len(url_delays)
-            url.delay = d
-            hours = d.days * 24.0 + d.seconds / 3600.0
-            url.score = hours + url.duration_avg + url.duration_stddev
-        else:
-            url.delay = None
-            url.score = None
-            url.completion = 0.0
-
     return {
         'cutoff': cutoff,
         'last_check': last_check,
@@ -67,16 +153,57 @@ def get_mirror_statuses(cutoff=default_cutoff):
         'urls': urls,
     }
 
-@cache_function(300)
-def get_mirror_errors(cutoff=default_cutoff):
-    cutoff_time = datetime.datetime.utcnow() - cutoff
+
+def get_mirror_errors(cutoff=DEFAULT_CUTOFF, mirror_id=None, show_all=False):
+    cutoff_time = now() - cutoff
     errors = MirrorLog.objects.filter(
             is_success=False, check_time__gte=cutoff_time,
-            url__mirror__active=True, url__mirror__public=True).values(
-            'url__url', 'url__protocol__protocol', 'url__mirror__country',
-            'error').annotate(
+            url__mirror__public=True).values('url__id', 'error').annotate(
             error_count=Count('error'), last_occurred=Max('check_time')
             ).order_by('-last_occurred', '-error_count')
-    return list(errors)
+
+    if mirror_id:
+        errors = errors.filter(url__mirror_id=mirror_id)
+    if not show_all:
+        errors = errors.filter(url__active=True, url__mirror__active=True,
+                url__mirror__public=True)
+
+    errors = list(errors)
+    to_fetch = [err['url__id'] for err in errors]
+    urls = MirrorUrl.objects.select_related(
+            'mirror', 'protocol').in_bulk(to_fetch)
+    for err in errors:
+        err['url'] = urls[err['url__id']]
+    return errors
+
+
+@cache_function(295)
+def get_mirror_url_for_download(cutoff=DEFAULT_CUTOFF):
+    '''Find a good mirror URL to use for package downloads. If we have mirror
+    status data available, it is used to determine a good choice by looking at
+    the last batch of status rows.'''
+    cutoff_time = now() - cutoff
+    log_data = MirrorLog.objects.filter(
+            check_time__gte=cutoff_time).aggregate(
+            Max('check_time'), Max('last_sync'))
+    if log_data['check_time__max'] is not None:
+        min_check_time = log_data['check_time__max'] - timedelta(minutes=5)
+        min_sync_time = log_data['last_sync__max'] - timedelta(minutes=20)
+        best_logs = MirrorLog.objects.select_related('url').filter(
+                is_success=True,
+                check_time__gte=min_check_time, last_sync__gte=min_sync_time,
+                url__active=True,
+                url__mirror__public=True, url__mirror__active=True,
+                url__protocol__default=True).order_by(
+                'duration')[:1]
+        if best_logs:
+            return best_logs[0].url
+
+    mirror_urls = MirrorUrl.objects.filter(active=True,
+            mirror__public=True, mirror__active=True,
+            protocol__default=True)[:1]
+    if not mirror_urls:
+        return None
+    return mirror_urls[0]
 
 # vim: set ts=4 sw=4 et:
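For reference, a minimal sketch of what the new dictfetchall() helper does: it zips the column names from cursor.description onto each fetched row, so despite its docstring the return value is a list of dicts, one per row. This standalone version uses the stdlib sqlite3 module in place of Django's connection; the table and rows are hypothetical.

import sqlite3


def dictfetchall(cursor):
    # same logic as the helper in the diff: map column names onto each row
    desc = cursor.description
    return [
        dict(zip([col[0] for col in desc], row))
        for row in cursor.fetchall()
    ]


conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE logs (url_id INTEGER, duration REAL)')
conn.executemany('INSERT INTO logs VALUES (?, ?)', [(1, 0.25), (1, 0.30)])
cur = conn.execute('SELECT url_id, COUNT(*) AS check_count, '
                   'AVG(duration) AS duration_avg FROM logs GROUP BY url_id')
print(dictfetchall(cur))
# [{'url_id': 1, 'check_count': 2, 'duration_avg': 0.275}]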
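The score computed in annotate_url() is (delay in hours + average duration + duration stddev) / completion percentage, so lower is better, and a mostly-failing mirror is penalized by its small divisor. A worked sketch of that formula with made-up numbers:

from datetime import timedelta

# hypothetical status values for one mirror URL
delay = timedelta(hours=1, minutes=30)  # avg of check_time - last_sync
duration_avg = 0.4                      # seconds
duration_stddev = 0.1
completion_pct = 0.95                   # success_count / check_count

hours = delay.days * 24.0 + delay.seconds / 3600.0  # 1.5
divisor = completion_pct if completion_pct > 0.0 else 0.005
score = (hours + duration_avg + duration_stddev) / divisor
print(round(score, 3))  # (1.5 + 0.4 + 0.1) / 0.95 = 2.105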
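The check_frequency change is a fencepost fix: n checks in a window span only n - 1 intervals, so the old division by num_checks understated the time between checks, and the new num_checks > 1 guard avoids dividing by zero when only one check exists. A small sketch with hypothetical check times:

from datetime import datetime

checks = [datetime(2013, 4, 1, h) for h in (0, 6, 12, 18)]  # 4 checks, 6h apart
span = max(checks) - min(checks)  # 18 hours
print(span / (len(checks) - 1))   # 6:00:00 -- the actual interval
print(span / len(checks))         # 4:30:00 -- the old underestimate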