diff options
Diffstat (limited to 'kernel/time')
33 files changed, 18728 insertions, 0 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig new file mode 100644 index 000000000..7ceb68656 --- /dev/null +++ b/kernel/time/Kconfig @@ -0,0 +1,199 @@ +# +# Timer subsystem related configuration options +# + +# Options selectable by arch Kconfig + +# Watchdog function for clocksources to detect instabilities +config CLOCKSOURCE_WATCHDOG + bool + +# Architecture has extra clocksource data +config ARCH_CLOCKSOURCE_DATA + bool + +# Clocksources require validation of the clocksource against the last +# cycle update - x86/TSC misfeature +config CLOCKSOURCE_VALIDATE_LAST_CYCLE + bool + +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL + bool + +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL_OLD + bool + +# Old style timekeeping +config ARCH_USES_GETTIMEOFFSET + bool + +# The generic clock events infrastructure +config GENERIC_CLOCKEVENTS + bool + +# Architecture can handle broadcast in a driver-agnostic way +config ARCH_HAS_TICK_BROADCAST + bool + +# Clockevents broadcasting infrastructure +config GENERIC_CLOCKEVENTS_BROADCAST + bool + depends on GENERIC_CLOCKEVENTS + +# Automatically adjust the min. reprogramming time for +# clock event device +config GENERIC_CLOCKEVENTS_MIN_ADJUST + bool + +# Generic update of CMOS clock +config GENERIC_CMOS_UPDATE + bool + +if GENERIC_CLOCKEVENTS +menu "Timers subsystem" + +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is +# only related to the tick functionality. Oneshot clockevent devices +# are supported independ of this. +config TICK_ONESHOT + bool + +config NO_HZ_COMMON + bool + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + +choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ + +config HZ_PERIODIC + bool "Periodic timer ticks (constant rate, no dynticks)" + help + This option keeps the tick running periodically at a constant + rate, even when the CPU doesn't need it. + +config NO_HZ_IDLE + bool "Idle dynticks system (tickless idle)" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select NO_HZ_COMMON + help + This option enables a tickless idle system: timer interrupts + will only trigger on an as-needed basis when the system is idle. + This is usually interesting for energy saving. + + Most of the time you want to say Y here. + +config NO_HZ_FULL + bool "Full dynticks system (tickless)" + # NO_HZ_COMMON dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_BFS + # We need at least one periodic CPU for timekeeping + depends on SMP + # RCU_USER_QS dependency + depends on HAVE_CONTEXT_TRACKING + # VIRT_CPU_ACCOUNTING_GEN dependency + depends on HAVE_VIRT_CPU_ACCOUNTING_GEN + select NO_HZ_COMMON + select RCU_USER_QS + select RCU_NOCB_CPU + select VIRT_CPU_ACCOUNTING_GEN + select IRQ_WORK + help + Adaptively try to shutdown the tick whenever possible, even when + the CPU is running tasks. Typically this requires running a single + task on the CPU. Chances for running tickless are maximized when + the task mostly runs in userspace and has few kernel activity. + + You need to fill up the nohz_full boot parameter with the + desired range of dynticks CPUs. + + This is implemented at the expense of some overhead in user <-> kernel + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + + Say N. + +endchoice + +config NO_HZ_FULL_ALL + bool "Full dynticks system on all CPUs by default (except CPU 0)" + depends on NO_HZ_FULL + help + If the user doesn't pass the nohz_full boot option to + define the range of full dynticks CPUs, consider that all + CPUs in the system are full dynticks by default. + Note the boot CPU will still be kept outside the range to + handle the timekeeping duty. + +config NO_HZ_FULL_SYSIDLE + bool "Detect full-system idle state for full dynticks system" + depends on NO_HZ_FULL + default n + help + At least one CPU must keep the scheduling-clock tick running for + timekeeping purposes whenever there is a non-idle CPU, where + "non-idle" also includes dynticks CPUs as long as they are + running non-idle tasks. Because the underlying adaptive-tick + support cannot distinguish between all CPUs being idle and + all CPUs each running a single task in dynticks mode, the + underlying support simply ensures that there is always a CPU + handling the scheduling-clock tick, whether or not all CPUs + are idle. This Kconfig option enables scalable detection of + the all-CPUs-idle state, thus allowing the scheduling-clock + tick to be disabled when all CPUs are idle. Note that scalable + detection of the all-CPUs-idle state means that larger systems + will be slower to declare the all-CPUs-idle state. + + Say Y if you would like to help debug all-CPUs-idle detection. + + Say N if you are unsure. + +config NO_HZ_FULL_SYSIDLE_SMALL + int "Number of CPUs above which large-system approach is used" + depends on NO_HZ_FULL_SYSIDLE + range 1 NR_CPUS + default 8 + help + The full-system idle detection mechanism takes a lazy approach + on large systems, as is required to attain decent scalability. + However, on smaller systems, scalability is not anywhere near as + large a concern as is energy efficiency. The sysidle subsystem + therefore uses a fast but non-scalable algorithm for small + systems and a lazier but scalable algorithm for large systems. + This Kconfig parameter defines the number of CPUs in the largest + system that will be considered to be "small". + + The default value will be fine in most cases. Battery-powered + systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger + numbers of CPUs, and (3) are suffering from battery-lifetime + problems due to long sysidle latencies might wish to experiment + with larger values for this Kconfig parameter. On the other + hand, they might be even better served by disabling NO_HZ_FULL + entirely, given that NO_HZ_FULL is intended for HPC and + real-time workloads that at present do not tend to be run on + battery-powered systems. + + Take the default if you are unsure. + +config NO_HZ + bool "Old Idle dynticks config" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + help + This is the old config entry that enables dynticks idle. + We keep it around for a little while to enforce backward + compatibility with older config files. + +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. + +endmenu +endif diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 000000000..01f031241 --- /dev/null +++ b/kernel/time/Makefile @@ -0,0 +1,31 @@ +obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o + +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o +ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) + obj-y += tick-broadcast.o + obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o +endif +obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o +obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o +obj-$(CONFIG_TIMER_STATS) += timer_stats.o +obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o +obj-$(CONFIG_TEST_UDELAY) += test_udelay.o + +$(obj)/time.o: $(obj)/timeconst.h + +quiet_cmd_hzfile = HZFILE $@ + cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ + +targets += hz.bc +$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE + $(call if_changed,hzfile) + +quiet_cmd_bc = BC $@ + cmd_bc = bc -q $(filter-out FORCE,$^) > $@ + +targets += timeconst.h +$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE + $(call if_changed,bc) + diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c new file mode 100644 index 000000000..1b001ed1e --- /dev/null +++ b/kernel/time/alarmtimer.c @@ -0,0 +1,873 @@ +/* + * Alarmtimer interface + * + * This interface provides a timer which is similarto hrtimers, + * but triggers a RTC alarm if the box is suspend. + * + * This interface is influenced by the Android RTC Alarm timer + * interface. + * + * Copyright (C) 2010 IBM Corperation + * + * Author: John Stultz <john.stultz@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/time.h> +#include <linux/hrtimer.h> +#include <linux/timerqueue.h> +#include <linux/rtc.h> +#include <linux/alarmtimer.h> +#include <linux/mutex.h> +#include <linux/platform_device.h> +#include <linux/posix-timers.h> +#include <linux/workqueue.h> +#include <linux/freezer.h> + +/** + * struct alarm_base - Alarm timer bases + * @lock: Lock for syncrhonized access to the base + * @timerqueue: Timerqueue head managing the list of events + * @timer: hrtimer used to schedule events while running + * @gettime: Function to read the time correlating to the base + * @base_clockid: clockid for the base + */ +static struct alarm_base { + spinlock_t lock; + struct timerqueue_head timerqueue; + ktime_t (*gettime)(void); + clockid_t base_clockid; +} alarm_bases[ALARM_NUMTYPE]; + +/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ +static ktime_t freezer_delta; +static DEFINE_SPINLOCK(freezer_delta_lock); + +static struct wakeup_source *ws; + +#ifdef CONFIG_RTC_CLASS +/* rtc timer and device for setting alarm wakeups at suspend */ +static struct rtc_timer rtctimer; +static struct rtc_device *rtcdev; +static DEFINE_SPINLOCK(rtcdev_lock); + +/** + * alarmtimer_get_rtcdev - Return selected rtcdevice + * + * This function returns the rtc device to use for wakealarms. + * If one has not already been chosen, it checks to see if a + * functional rtc device is available. + */ +struct rtc_device *alarmtimer_get_rtcdev(void) +{ + unsigned long flags; + struct rtc_device *ret; + + spin_lock_irqsave(&rtcdev_lock, flags); + ret = rtcdev; + spin_unlock_irqrestore(&rtcdev_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); + +static int alarmtimer_rtc_add_device(struct device *dev, + struct class_interface *class_intf) +{ + unsigned long flags; + struct rtc_device *rtc = to_rtc_device(dev); + + if (rtcdev) + return -EBUSY; + + if (!rtc->ops->set_alarm) + return -1; + if (!device_may_wakeup(rtc->dev.parent)) + return -1; + + spin_lock_irqsave(&rtcdev_lock, flags); + if (!rtcdev) { + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + } + spin_unlock_irqrestore(&rtcdev_lock, flags); + return 0; +} + +static inline void alarmtimer_rtc_timer_init(void) +{ + rtc_timer_init(&rtctimer, NULL, NULL); +} + +static struct class_interface alarmtimer_rtc_interface = { + .add_dev = &alarmtimer_rtc_add_device, +}; + +static int alarmtimer_rtc_interface_setup(void) +{ + alarmtimer_rtc_interface.class = rtc_class; + return class_interface_register(&alarmtimer_rtc_interface); +} +static void alarmtimer_rtc_interface_remove(void) +{ + class_interface_unregister(&alarmtimer_rtc_interface); +} +#else +struct rtc_device *alarmtimer_get_rtcdev(void) +{ + return NULL; +} +#define rtcdev (NULL) +static inline int alarmtimer_rtc_interface_setup(void) { return 0; } +static inline void alarmtimer_rtc_interface_remove(void) { } +static inline void alarmtimer_rtc_timer_init(void) { } +#endif + +/** + * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue + * @base: pointer to the base where the timer is being run + * @alarm: pointer to alarm being enqueued. + * + * Adds alarm to a alarm_base timerqueue + * + * Must hold base->lock when calling. + */ +static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) +{ + if (alarm->state & ALARMTIMER_STATE_ENQUEUED) + timerqueue_del(&base->timerqueue, &alarm->node); + + timerqueue_add(&base->timerqueue, &alarm->node); + alarm->state |= ALARMTIMER_STATE_ENQUEUED; +} + +/** + * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue + * @base: pointer to the base where the timer is running + * @alarm: pointer to alarm being removed + * + * Removes alarm to a alarm_base timerqueue + * + * Must hold base->lock when calling. + */ +static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm) +{ + if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) + return; + + timerqueue_del(&base->timerqueue, &alarm->node); + alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; +} + + +/** + * alarmtimer_fired - Handles alarm hrtimer being fired. + * @timer: pointer to hrtimer being run + * + * When a alarm timer fires, this runs through the timerqueue to + * see which alarms expired, and runs those. If there are more alarm + * timers queued for the future, we set the hrtimer to fire when + * when the next future alarm timer expires. + */ +static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) +{ + struct alarm *alarm = container_of(timer, struct alarm, timer); + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + int ret = HRTIMER_NORESTART; + int restart = ALARMTIMER_NORESTART; + + spin_lock_irqsave(&base->lock, flags); + alarmtimer_dequeue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); + + if (alarm->function) + restart = alarm->function(alarm, base->gettime()); + + spin_lock_irqsave(&base->lock, flags); + if (restart != ALARMTIMER_NORESTART) { + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + alarmtimer_enqueue(base, alarm); + ret = HRTIMER_RESTART; + } + spin_unlock_irqrestore(&base->lock, flags); + + return ret; + +} + +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + return ktime_sub(alarm->node.expires, base->gettime()); +} +EXPORT_SYMBOL_GPL(alarm_expires_remaining); + +#ifdef CONFIG_RTC_CLASS +/** + * alarmtimer_suspend - Suspend time callback + * @dev: unused + * @state: unused + * + * When we are going into suspend, we look through the bases + * to see which is the soonest timer to expire. We then + * set an rtc timer to fire that far into the future, which + * will wake us from suspend. + */ +static int alarmtimer_suspend(struct device *dev) +{ + struct rtc_time tm; + ktime_t min, now; + unsigned long flags; + struct rtc_device *rtc; + int i; + int ret; + + spin_lock_irqsave(&freezer_delta_lock, flags); + min = freezer_delta; + freezer_delta = ktime_set(0, 0); + spin_unlock_irqrestore(&freezer_delta_lock, flags); + + rtc = alarmtimer_get_rtcdev(); + /* If we have no rtcdev, just return */ + if (!rtc) + return 0; + + /* Find the soonest timer to expire*/ + for (i = 0; i < ALARM_NUMTYPE; i++) { + struct alarm_base *base = &alarm_bases[i]; + struct timerqueue_node *next; + ktime_t delta; + + spin_lock_irqsave(&base->lock, flags); + next = timerqueue_getnext(&base->timerqueue); + spin_unlock_irqrestore(&base->lock, flags); + if (!next) + continue; + delta = ktime_sub(next->expires, base->gettime()); + if (!min.tv64 || (delta.tv64 < min.tv64)) + min = delta; + } + if (min.tv64 == 0) + return 0; + + if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + return -EBUSY; + } + + /* Setup an rtc timer to fire that far in the future */ + rtc_timer_cancel(rtc, &rtctimer); + rtc_read_time(rtc, &tm); + now = rtc_tm_to_ktime(tm); + now = ktime_add(now, min); + + /* Set alarm, if in the past reject suspend briefly to handle */ + ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); + if (ret < 0) + __pm_wakeup_event(ws, MSEC_PER_SEC); + return ret; +} +#else +static int alarmtimer_suspend(struct device *dev) +{ + return 0; +} +#endif + +static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) +{ + ktime_t delta; + unsigned long flags; + struct alarm_base *base = &alarm_bases[type]; + + delta = ktime_sub(absexp, base->gettime()); + + spin_lock_irqsave(&freezer_delta_lock, flags); + if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) + freezer_delta = delta; + spin_unlock_irqrestore(&freezer_delta_lock, flags); +} + + +/** + * alarm_init - Initialize an alarm structure + * @alarm: ptr to alarm to be initialized + * @type: the type of the alarm + * @function: callback that is run when the alarm fires + */ +void alarm_init(struct alarm *alarm, enum alarmtimer_type type, + enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) +{ + timerqueue_init(&alarm->node); + hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, + HRTIMER_MODE_ABS); + alarm->timer.function = alarmtimer_fired; + alarm->function = function; + alarm->type = type; + alarm->state = ALARMTIMER_STATE_INACTIVE; +} +EXPORT_SYMBOL_GPL(alarm_init); + +/** + * alarm_start - Sets an absolute alarm to fire + * @alarm: ptr to alarm to set + * @start: time to run the alarm + */ +int alarm_start(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + int ret; + + spin_lock_irqsave(&base->lock, flags); + alarm->node.expires = start; + alarmtimer_enqueue(base, alarm); + ret = hrtimer_start(&alarm->timer, alarm->node.expires, + HRTIMER_MODE_ABS); + spin_unlock_irqrestore(&base->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(alarm_start); + +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +int alarm_start_relative(struct alarm *alarm, ktime_t start) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + start = ktime_add(start, base->gettime()); + return alarm_start(alarm, start); +} +EXPORT_SYMBOL_GPL(alarm_start_relative); + +void alarm_restart(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + hrtimer_set_expires(&alarm->timer, alarm->node.expires); + hrtimer_restart(&alarm->timer); + alarmtimer_enqueue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(alarm_restart); + +/** + * alarm_try_to_cancel - Tries to cancel an alarm timer + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not running, + * and -1 if the callback was running + */ +int alarm_try_to_cancel(struct alarm *alarm) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + unsigned long flags; + int ret; + + spin_lock_irqsave(&base->lock, flags); + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); + spin_unlock_irqrestore(&base->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(alarm_try_to_cancel); + + +/** + * alarm_cancel - Spins trying to cancel an alarm timer until it is done + * @alarm: ptr to alarm to be canceled + * + * Returns 1 if the timer was canceled, 0 if it was not active. + */ +int alarm_cancel(struct alarm *alarm) +{ + for (;;) { + int ret = alarm_try_to_cancel(alarm); + if (ret >= 0) + return ret; + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(alarm_cancel); + + +u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) +{ + u64 overrun = 1; + ktime_t delta; + + delta = ktime_sub(now, alarm->node.expires); + + if (delta.tv64 < 0) + return 0; + + if (unlikely(delta.tv64 >= interval.tv64)) { + s64 incr = ktime_to_ns(interval); + + overrun = ktime_divns(delta, incr); + + alarm->node.expires = ktime_add_ns(alarm->node.expires, + incr*overrun); + + if (alarm->node.expires.tv64 > now.tv64) + return overrun; + /* + * This (and the ktime_add() below) is the + * correction for exact: + */ + overrun++; + } + + alarm->node.expires = ktime_add(alarm->node.expires, interval); + return overrun; +} +EXPORT_SYMBOL_GPL(alarm_forward); + +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ + struct alarm_base *base = &alarm_bases[alarm->type]; + + return alarm_forward(alarm, base->gettime(), interval); +} +EXPORT_SYMBOL_GPL(alarm_forward_now); + + +/** + * clock2alarm - helper that converts from clockid to alarmtypes + * @clockid: clockid. + */ +static enum alarmtimer_type clock2alarm(clockid_t clockid) +{ + if (clockid == CLOCK_REALTIME_ALARM) + return ALARM_REALTIME; + if (clockid == CLOCK_BOOTTIME_ALARM) + return ALARM_BOOTTIME; + return -1; +} + +/** + * alarm_handle_timer - Callback for posix timers + * @alarm: alarm that fired + * + * Posix timer callback for expired alarm timers. + */ +static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, + ktime_t now) +{ + unsigned long flags; + struct k_itimer *ptr = container_of(alarm, struct k_itimer, + it.alarm.alarmtimer); + enum alarmtimer_restart result = ALARMTIMER_NORESTART; + + spin_lock_irqsave(&ptr->it_lock, flags); + if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { + if (posix_timer_event(ptr, 0) != 0) + ptr->it_overrun++; + } + + /* Re-add periodic timers */ + if (ptr->it.alarm.interval.tv64) { + ptr->it_overrun += alarm_forward(alarm, now, + ptr->it.alarm.interval); + result = ALARMTIMER_RESTART; + } + spin_unlock_irqrestore(&ptr->it_lock, flags); + + return result; +} + +/** + * alarm_clock_getres - posix getres interface + * @which_clock: clockid + * @tp: timespec to fill + * + * Returns the granularity of underlying alarm base clock + */ +static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) +{ + clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + return hrtimer_get_res(baseid, tp); +} + +/** + * alarm_clock_get - posix clock_get interface + * @which_clock: clockid + * @tp: timespec to fill. + * + * Provides the underlying alarm base time. + */ +static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) +{ + struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + *tp = ktime_to_timespec(base->gettime()); + return 0; +} + +/** + * alarm_timer_create - posix timer_create interface + * @new_timer: k_itimer pointer to manage + * + * Initializes the k_itimer structure. + */ +static int alarm_timer_create(struct k_itimer *new_timer) +{ + enum alarmtimer_type type; + struct alarm_base *base; + + if (!alarmtimer_get_rtcdev()) + return -ENOTSUPP; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + type = clock2alarm(new_timer->it_clock); + base = &alarm_bases[type]; + alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); + return 0; +} + +/** + * alarm_timer_get - posix timer_get interface + * @new_timer: k_itimer pointer + * @cur_setting: itimerspec data to fill + * + * Copies out the current itimerspec data + */ +static void alarm_timer_get(struct k_itimer *timr, + struct itimerspec *cur_setting) +{ + ktime_t relative_expiry_time = + alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); + + if (ktime_to_ns(relative_expiry_time) > 0) { + cur_setting->it_value = ktime_to_timespec(relative_expiry_time); + } else { + cur_setting->it_value.tv_sec = 0; + cur_setting->it_value.tv_nsec = 0; + } + + cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); +} + +/** + * alarm_timer_del - posix timer_del interface + * @timr: k_itimer pointer to be deleted + * + * Cancels any programmed alarms for the given timer. + */ +static int alarm_timer_del(struct k_itimer *timr) +{ + if (!rtcdev) + return -ENOTSUPP; + + if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) + return TIMER_RETRY; + + return 0; +} + +/** + * alarm_timer_set - posix timer_set interface + * @timr: k_itimer pointer to be deleted + * @flags: timer flags + * @new_setting: itimerspec to be used + * @old_setting: itimerspec being replaced + * + * Sets the timer to new_setting, and starts the timer. + */ +static int alarm_timer_set(struct k_itimer *timr, int flags, + struct itimerspec *new_setting, + struct itimerspec *old_setting) +{ + ktime_t exp; + + if (!rtcdev) + return -ENOTSUPP; + + if (flags & ~TIMER_ABSTIME) + return -EINVAL; + + if (old_setting) + alarm_timer_get(timr, old_setting); + + /* If the timer was already set, cancel it */ + if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) + return TIMER_RETRY; + + /* start the timer */ + timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); + exp = timespec_to_ktime(new_setting->it_value); + /* Convert (if necessary) to absolute time */ + if (flags != TIMER_ABSTIME) { + ktime_t now; + + now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); + exp = ktime_add(now, exp); + } + + alarm_start(&timr->it.alarm.alarmtimer, exp); + return 0; +} + +/** + * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep + * @alarm: ptr to alarm that fired + * + * Wakes up the task that set the alarmtimer + */ +static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, + ktime_t now) +{ + struct task_struct *task = (struct task_struct *)alarm->data; + + alarm->data = NULL; + if (task) + wake_up_process(task); + return ALARMTIMER_NORESTART; +} + +/** + * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation + * @alarm: ptr to alarmtimer + * @absexp: absolute expiration time + * + * Sets the alarm timer and sleeps until it is fired or interrupted. + */ +static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) +{ + alarm->data = (void *)current; + do { + set_current_state(TASK_INTERRUPTIBLE); + alarm_start(alarm, absexp); + if (likely(alarm->data)) + schedule(); + + alarm_cancel(alarm); + } while (alarm->data && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + + return (alarm->data == NULL); +} + + +/** + * update_rmtp - Update remaining timespec value + * @exp: expiration time + * @type: timer type + * @rmtp: user pointer to remaining timepsec value + * + * Helper function that fills in rmtp value with time between + * now and the exp value + */ +static int update_rmtp(ktime_t exp, enum alarmtimer_type type, + struct timespec __user *rmtp) +{ + struct timespec rmt; + ktime_t rem; + + rem = ktime_sub(exp, alarm_bases[type].gettime()); + + if (rem.tv64 <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + + return 1; + +} + +/** + * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep + * @restart: ptr to restart block + * + * Handles restarted clock_nanosleep calls + */ +static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) +{ + enum alarmtimer_type type = restart->nanosleep.clockid; + ktime_t exp; + struct timespec __user *rmtp; + struct alarm alarm; + int ret = 0; + + exp.tv64 = restart->nanosleep.expires; + alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + + if (alarmtimer_do_nsleep(&alarm, exp)) + goto out; + + if (freezing(current)) + alarmtimer_freezerset(exp, type); + + rmtp = restart->nanosleep.rmtp; + if (rmtp) { + ret = update_rmtp(exp, type, rmtp); + if (ret <= 0) + goto out; + } + + + /* The other values in restart are already filled in */ + ret = -ERESTART_RESTARTBLOCK; +out: + return ret; +} + +/** + * alarm_timer_nsleep - alarmtimer nanosleep + * @which_clock: clockid + * @flags: determins abstime or relative + * @tsreq: requested sleep time (abs or rel) + * @rmtp: remaining sleep time saved + * + * Handles clock_nanosleep calls against _ALARM clockids + */ +static int alarm_timer_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsreq, struct timespec __user *rmtp) +{ + enum alarmtimer_type type = clock2alarm(which_clock); + struct alarm alarm; + ktime_t exp; + int ret = 0; + struct restart_block *restart; + + if (!alarmtimer_get_rtcdev()) + return -ENOTSUPP; + + if (flags & ~TIMER_ABSTIME) + return -EINVAL; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + + exp = timespec_to_ktime(*tsreq); + /* Convert (if necessary) to absolute time */ + if (flags != TIMER_ABSTIME) { + ktime_t now = alarm_bases[type].gettime(); + exp = ktime_add(now, exp); + } + + if (alarmtimer_do_nsleep(&alarm, exp)) + goto out; + + if (freezing(current)) + alarmtimer_freezerset(exp, type); + + /* abs timers don't set remaining time or restart */ + if (flags == TIMER_ABSTIME) { + ret = -ERESTARTNOHAND; + goto out; + } + + if (rmtp) { + ret = update_rmtp(exp, type, rmtp); + if (ret <= 0) + goto out; + } + + restart = ¤t->restart_block; + restart->fn = alarm_timer_nsleep_restart; + restart->nanosleep.clockid = type; + restart->nanosleep.expires = exp.tv64; + restart->nanosleep.rmtp = rmtp; + ret = -ERESTART_RESTARTBLOCK; + +out: + return ret; +} + + +/* Suspend hook structures */ +static const struct dev_pm_ops alarmtimer_pm_ops = { + .suspend = alarmtimer_suspend, +}; + +static struct platform_driver alarmtimer_driver = { + .driver = { + .name = "alarmtimer", + .pm = &alarmtimer_pm_ops, + } +}; + +/** + * alarmtimer_init - Initialize alarm timer code + * + * This function initializes the alarm bases and registers + * the posix clock ids. + */ +static int __init alarmtimer_init(void) +{ + struct platform_device *pdev; + int error = 0; + int i; + struct k_clock alarm_clock = { + .clock_getres = alarm_clock_getres, + .clock_get = alarm_clock_get, + .timer_create = alarm_timer_create, + .timer_set = alarm_timer_set, + .timer_del = alarm_timer_del, + .timer_get = alarm_timer_get, + .nsleep = alarm_timer_nsleep, + }; + + alarmtimer_rtc_timer_init(); + + posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); + posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + + /* Initialize alarm bases */ + alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; + alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; + alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; + alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; + for (i = 0; i < ALARM_NUMTYPE; i++) { + timerqueue_init_head(&alarm_bases[i].timerqueue); + spin_lock_init(&alarm_bases[i].lock); + } + + error = alarmtimer_rtc_interface_setup(); + if (error) + return error; + + error = platform_driver_register(&alarmtimer_driver); + if (error) + goto out_if; + + pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); + if (IS_ERR(pdev)) { + error = PTR_ERR(pdev); + goto out_drv; + } + ws = wakeup_source_register("alarmtimer"); + return 0; + +out_drv: + platform_driver_unregister(&alarmtimer_driver); +out_if: + alarmtimer_rtc_interface_remove(); + return error; +} +device_initcall(alarmtimer_init); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c new file mode 100644 index 000000000..637a09461 --- /dev/null +++ b/kernel/time/clockevents.c @@ -0,0 +1,792 @@ +/* + * linux/kernel/time/clockevents.c + * + * This file contains functions which manage clock event devices. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ + +#include <linux/clockchips.h> +#include <linux/hrtimer.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/device.h> + +#include "tick-internal.h" + +/* The registered clock event devices */ +static LIST_HEAD(clockevent_devices); +static LIST_HEAD(clockevents_released); +/* Protection for the above */ +static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); + +struct ce_unbind { + struct clock_event_device *ce; + int res; +}; + +static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, + bool ismax) +{ + u64 clc = (u64) latch << evt->shift; + u64 rnd; + + if (unlikely(!evt->mult)) { + evt->mult = 1; + WARN_ON(1); + } + rnd = (u64) evt->mult - 1; + + /* + * Upper bound sanity check. If the backwards conversion is + * not equal latch, we know that the above shift overflowed. + */ + if ((clc >> evt->shift) != (u64)latch) + clc = ~0ULL; + + /* + * Scaled math oddities: + * + * For mult <= (1 << shift) we can safely add mult - 1 to + * prevent integer rounding loss. So the backwards conversion + * from nsec to device ticks will be correct. + * + * For mult > (1 << shift), i.e. device frequency is > 1GHz we + * need to be careful. Adding mult - 1 will result in a value + * which when converted back to device ticks can be larger + * than latch by up to (mult - 1) >> shift. For the min_delta + * calculation we still want to apply this in order to stay + * above the minimum device ticks limit. For the upper limit + * we would end up with a latch value larger than the upper + * limit of the device, so we omit the add to stay below the + * device upper boundary. + * + * Also omit the add if it would overflow the u64 boundary. + */ + if ((~0ULL - clc > rnd) && + (!ismax || evt->mult <= (1ULL << evt->shift))) + clc += rnd; + + do_div(clc, evt->mult); + + /* Deltas less than 1usec are pointless noise */ + return clc > 1000 ? clc : 1000; +} + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +{ + return cev_delta2ns(latch, evt, false); +} +EXPORT_SYMBOL_GPL(clockevent_delta2ns); + +static int __clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + /* Transition with legacy set_mode() callback */ + if (dev->set_mode) { + /* Legacy callback doesn't support new modes */ + if (state > CLOCK_EVT_STATE_ONESHOT) + return -ENOSYS; + /* + * 'clock_event_state' and 'clock_event_mode' have 1-to-1 + * mapping until *_ONESHOT, and so a simple cast will work. + */ + dev->set_mode((enum clock_event_mode)state, dev); + dev->mode = (enum clock_event_mode)state; + return 0; + } + + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* Transition with new state-specific callbacks */ + switch (state) { + case CLOCK_EVT_STATE_DETACHED: + /* The clockevent device is getting replaced. Shut it down. */ + + case CLOCK_EVT_STATE_SHUTDOWN: + return dev->set_state_shutdown(dev); + + case CLOCK_EVT_STATE_PERIODIC: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) + return -ENOSYS; + return dev->set_state_periodic(dev); + + case CLOCK_EVT_STATE_ONESHOT: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return -ENOSYS; + return dev->set_state_oneshot(dev); + + default: + return -ENOSYS; + } +} + +/** + * clockevents_set_state - set the operating state of a clock event device + * @dev: device to modify + * @state: new state + * + * Must be called with interrupts disabled ! + */ +void clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + if (dev->state != state) { + if (__clockevents_set_state(dev, state)) + return; + + dev->state = state; + + /* + * A nsec2cyc multiplicator of 0 is invalid and we'd crash + * on it, so fix it up and emit a warning: + */ + if (state == CLOCK_EVT_STATE_ONESHOT) { + if (unlikely(!dev->mult)) { + dev->mult = 1; + WARN_ON(1); + } + } + } +} + +/** + * clockevents_shutdown - shutdown the device and clear next_event + * @dev: device to shutdown + */ +void clockevents_shutdown(struct clock_event_device *dev) +{ + clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); + dev->next_event.tv64 = KTIME_MAX; +} + +/** + * clockevents_tick_resume - Resume the tick device before using it again + * @dev: device to resume + */ +int clockevents_tick_resume(struct clock_event_device *dev) +{ + int ret = 0; + + if (dev->set_mode) { + dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); + dev->mode = CLOCK_EVT_MODE_RESUME; + } else if (dev->tick_resume) { + ret = dev->tick_resume(dev); + } + + return ret; +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +/** + * clockevents_increase_min_delta - raise minimum delta of a clock event device + * @dev: device to increase the minimum delta + * + * Returns 0 on success, -ETIME when the minimum delta reached the limit. + */ +static int clockevents_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { + printk_deferred(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); + dev->next_event.tv64 = KTIME_MAX; + return -ETIME; + } + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk_deferred(KERN_WARNING + "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta; + int i; + + for (i = 0;;) { + delta = dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + if (dev->set_next_event((unsigned long) clc, dev) == 0) + return 0; + + if (++i > 2) { + /* + * We tried 3 times to program the device with the + * given min_delta_ns. Try to increase the minimum + * delta, if that fails as well get out of here. + */ + if (clockevents_increase_min_delta(dev)) + return -ETIME; + i = 0; + } + } +} + +#else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + +/** + * clockevents_program_min_delta - Set clock event device to the minimum delay. + * @dev: device to program + * + * Returns 0 on success, -ETIME when the retry loop failed. + */ +static int clockevents_program_min_delta(struct clock_event_device *dev) +{ + unsigned long long clc; + int64_t delta; + + delta = dev->min_delta_ns; + dev->next_event = ktime_add_ns(ktime_get(), delta); + + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + return 0; + + dev->retries++; + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + return dev->set_next_event((unsigned long) clc, dev); +} + +#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ + +/** + * clockevents_program_event - Reprogram the clock event device. + * @dev: device to program + * @expires: absolute expiry time (monotonic clock) + * @force: program minimum delay if expires can not be set + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, + bool force) +{ + unsigned long long clc; + int64_t delta; + int rc; + + if (unlikely(expires.tv64 < 0)) { + WARN_ON_ONCE(1); + return -ETIME; + } + + dev->next_event = expires; + + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + return 0; + + /* Shortcut for clockevent devices that can deal with ktime. */ + if (dev->features & CLOCK_EVT_FEAT_KTIME) + return dev->set_next_ktime(expires, dev); + + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); + if (delta <= 0) + return force ? clockevents_program_min_delta(dev) : -ETIME; + + delta = min(delta, (int64_t) dev->max_delta_ns); + delta = max(delta, (int64_t) dev->min_delta_ns); + + clc = ((unsigned long long) delta * dev->mult) >> dev->shift; + rc = dev->set_next_event((unsigned long) clc, dev); + + return (rc && force) ? clockevents_program_min_delta(dev) : rc; +} + +/* + * Called after a notify add to make devices available which were + * released from the notifier call. + */ +static void clockevents_notify_released(void) +{ + struct clock_event_device *dev; + + while (!list_empty(&clockevents_released)) { + dev = list_entry(clockevents_released.next, + struct clock_event_device, list); + list_del(&dev->list); + list_add(&dev->list, &clockevent_devices); + tick_check_new_device(dev); + } +} + +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ + struct clock_event_device *dev, *newdev = NULL; + + list_for_each_entry(dev, &clockevent_devices, list) { + if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) + continue; + + if (!tick_check_replacement(newdev, dev)) + continue; + + if (!try_module_get(dev->owner)) + continue; + + if (newdev) + module_put(newdev->owner); + newdev = dev; + } + if (newdev) { + tick_install_replacement(newdev); + list_del_init(&ced->list); + } + return newdev ? 0 : -EBUSY; +} + +/* + * Called with clockevents_mutex and clockevents_lock held + */ +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) +{ + /* Fast track. Device is unused */ + if (ced->state == CLOCK_EVT_STATE_DETACHED) { + list_del_init(&ced->list); + return 0; + } + + return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; +} + +/* + * SMP function call to unbind a device + */ +static void __clockevents_unbind(void *arg) +{ + struct ce_unbind *cu = arg; + int res; + + raw_spin_lock(&clockevents_lock); + res = __clockevents_try_unbind(cu->ce, smp_processor_id()); + if (res == -EAGAIN) + res = clockevents_replace(cu->ce); + cu->res = res; + raw_spin_unlock(&clockevents_lock); +} + +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ + struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + + smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); + return cu.res; +} + +/* + * Unbind a clockevents device. + */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ + int ret; + + mutex_lock(&clockevents_mutex); + ret = clockevents_unbind(ced, cpu); + mutex_unlock(&clockevents_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(clockevents_unbind_device); + +/* Sanity check of state transition callbacks */ +static int clockevents_sanity_check(struct clock_event_device *dev) +{ + /* Legacy set_mode() callback */ + if (dev->set_mode) { + /* We shouldn't be supporting new modes now */ + WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || + dev->set_state_shutdown || dev->tick_resume); + + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + return 0; + } + + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* New state-specific callbacks */ + if (!dev->set_state_shutdown) + return -EINVAL; + + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !dev->set_state_periodic) + return -EINVAL; + + if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && + !dev->set_state_oneshot) + return -EINVAL; + + return 0; +} + +/** + * clockevents_register_device - register a clock event device + * @dev: device to register + */ +void clockevents_register_device(struct clock_event_device *dev) +{ + unsigned long flags; + + BUG_ON(clockevents_sanity_check(dev)); + + /* Initialize state to DETACHED */ + dev->state = CLOCK_EVT_STATE_DETACHED; + + if (!dev->cpumask) { + WARN_ON(num_possible_cpus() > 1); + dev->cpumask = cpumask_of(smp_processor_id()); + } + + raw_spin_lock_irqsave(&clockevents_lock, flags); + + list_add(&dev->list, &clockevent_devices); + tick_check_new_device(dev); + clockevents_notify_released(); + + raw_spin_unlock_irqrestore(&clockevents_lock, flags); +} +EXPORT_SYMBOL_GPL(clockevents_register_device); + +void clockevents_config(struct clock_event_device *dev, u32 freq) +{ + u64 sec; + + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return; + + /* + * Calculate the maximum number of seconds we can sleep. Limit + * to 10 minutes for hardware which can program more than + * 32bit ticks so we still get reasonable conversion values. + */ + sec = dev->max_delta_ticks; + do_div(sec, freq); + if (!sec) + sec = 1; + else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) + sec = 600; + + clockevents_calc_mult_shift(dev, freq, sec); + dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); + dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); +} + +/** + * clockevents_config_and_register - Configure and register a clock event device + * @dev: device to register + * @freq: The clock frequency + * @min_delta: The minimum clock ticks to program in oneshot mode + * @max_delta: The maximum clock ticks to program in oneshot mode + * + * min/max_delta can be 0 for devices which do not support oneshot mode. + */ +void clockevents_config_and_register(struct clock_event_device *dev, + u32 freq, unsigned long min_delta, + unsigned long max_delta) +{ + dev->min_delta_ticks = min_delta; + dev->max_delta_ticks = max_delta; + clockevents_config(dev, freq); + clockevents_register_device(dev); +} +EXPORT_SYMBOL_GPL(clockevents_config_and_register); + +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + clockevents_config(dev, freq); + + if (dev->state == CLOCK_EVT_STATE_ONESHOT) + return clockevents_program_event(dev, dev->next_event, false); + + if (dev->state == CLOCK_EVT_STATE_PERIODIC) + return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); + + return 0; +} + +/** + * clockevents_update_freq - Update frequency and reprogram a clock event device. + * @dev: device to modify + * @freq: new device frequency + * + * Reconfigure and reprogram a clock event device in oneshot + * mode. Must be called on the cpu for which the device delivers per + * cpu timer events. If called for the broadcast device the core takes + * care of serialization. + * + * Returns 0 on success, -ETIME when the event is in the past. + */ +int clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = tick_broadcast_update_freq(dev, freq); + if (ret == -ENODEV) + ret = __clockevents_update_freq(dev, freq); + local_irq_restore(flags); + return ret; +} + +/* + * Noop handler when we shut down an event device + */ +void clockevents_handle_noop(struct clock_event_device *dev) +{ +} + +/** + * clockevents_exchange_device - release and request clock devices + * @old: device to release (can be NULL) + * @new: device to request (can be NULL) + * + * Called from various tick functions with clockevents_lock held and + * interrupts disabled. + */ +void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new) +{ + /* + * Caller releases a clock event device. We queue it into the + * released list and do a notify add later. + */ + if (old) { + module_put(old->owner); + clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); + list_del(&old->list); + list_add(&old->list, &clockevents_released); + } + + if (new) { + BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); + clockevents_shutdown(new); + } +} + +/** + * clockevents_suspend - suspend clock devices + */ +void clockevents_suspend(void) +{ + struct clock_event_device *dev; + + list_for_each_entry_reverse(dev, &clockevent_devices, list) + if (dev->suspend) + dev->suspend(dev); +} + +/** + * clockevents_resume - resume clock devices + */ +void clockevents_resume(void) +{ + struct clock_event_device *dev; + + list_for_each_entry(dev, &clockevent_devices, list) + if (dev->resume) + dev->resume(dev); +} + +#ifdef CONFIG_HOTPLUG_CPU +/** + * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu + */ +void tick_cleanup_dead_cpu(int cpu) +{ + struct clock_event_device *dev, *tmp; + unsigned long flags; + + raw_spin_lock_irqsave(&clockevents_lock, flags); + + tick_shutdown_broadcast_oneshot(cpu); + tick_shutdown_broadcast(cpu); + tick_shutdown(cpu); + /* + * Unregister the clock event devices which were + * released from the users in the notify chain. + */ + list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + list_del(&dev->list); + /* + * Now check whether the CPU has left unused per cpu devices + */ + list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { + if (cpumask_test_cpu(cpu, dev->cpumask) && + cpumask_weight(dev->cpumask) == 1 && + !tick_is_broadcast_device(dev)) { + BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); + list_del(&dev->list); + } + } + raw_spin_unlock_irqrestore(&clockevents_lock, flags); +} +#endif + +#ifdef CONFIG_SYSFS +struct bus_type clockevents_subsys = { + .name = "clockevents", + .dev_name = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t sysfs_show_current_tick_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct tick_device *td; + ssize_t count = 0; + + raw_spin_lock_irq(&clockevents_lock); + td = tick_get_tick_dev(dev); + if (td && td->evtdev) + count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); + raw_spin_unlock_irq(&clockevents_lock); + return count; +} +static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); + +/* We don't support the abomination of removable broadcast devices */ +static ssize_t sysfs_unbind_tick_dev(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + char name[CS_NAME_LEN]; + ssize_t ret = sysfs_get_uname(buf, name, count); + struct clock_event_device *ce; + + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clockevents_mutex); + raw_spin_lock_irq(&clockevents_lock); + list_for_each_entry(ce, &clockevent_devices, list) { + if (!strcmp(ce->name, name)) { + ret = __clockevents_try_unbind(ce, dev->id); + break; + } + } + raw_spin_unlock_irq(&clockevents_lock); + /* + * We hold clockevents_mutex, so ce can't go away + */ + if (ret == -EAGAIN) + ret = clockevents_unbind(ce, dev->id); + mutex_unlock(&clockevents_mutex); + return ret ? ret : count; +} +static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { + .init_name = "broadcast", + .id = 0, + .bus = &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return dev == &tick_bc_dev ? tick_get_broadcast_device() : + &per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ + int err = device_register(&tick_bc_dev); + + if (!err) + err = device_create_file(&tick_bc_dev, &dev_attr_current_device); + return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ + return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; } +#endif + +static int __init tick_init_sysfs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device *dev = &per_cpu(tick_percpu_dev, cpu); + int err; + + dev->id = cpu; + dev->bus = &clockevents_subsys; + err = device_register(dev); + if (!err) + err = device_create_file(dev, &dev_attr_current_device); + if (!err) + err = device_create_file(dev, &dev_attr_unbind_device); + if (err) + return err; + } + return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ + int err = subsys_system_register(&clockevents_subsys, NULL); + + if (!err) + err = tick_init_sysfs(); + return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c new file mode 100644 index 000000000..15facb1b9 --- /dev/null +++ b/kernel/time/clocksource.c @@ -0,0 +1,1020 @@ +/* + * linux/kernel/time/clocksource.c + * + * This file contains the functions which manage clocksource drivers. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * TODO WishList: + * o Allow clocksource drivers to be unregistered + */ + +#include <linux/device.h> +#include <linux/clocksource.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ +#include <linux/tick.h> +#include <linux/kthread.h> + +#include "tick-internal.h" +#include "timekeeping_internal.h" + +/** + * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks + * @mult: pointer to mult variable + * @shift: pointer to shift variable + * @from: frequency to convert from + * @to: frequency to convert to + * @maxsec: guaranteed runtime conversion range in seconds + * + * The function evaluates the shift/mult pair for the scaled math + * operations of clocksources and clockevents. + * + * @to and @from are frequency values in HZ. For clock sources @to is + * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock + * event @to is the counter frequency and @from is NSEC_PER_SEC. + * + * The @maxsec conversion range argument controls the time frame in + * seconds which must be covered by the runtime conversion with the + * calculated mult and shift factors. This guarantees that no 64bit + * overflow happens when the input value of the conversion is + * multiplied with the calculated mult factor. Larger ranges may + * reduce the conversion accuracy by chosing smaller mult and shift + * factors. + */ +void +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) +{ + u64 tmp; + u32 sft, sftacc= 32; + + /* + * Calculate the shift factor which is limiting the conversion + * range: + */ + tmp = ((u64)maxsec * from) >> 32; + while (tmp) { + tmp >>=1; + sftacc--; + } + + /* + * Find the conversion shift/mult pair which has the best + * accuracy and fits the maxsec conversion range: + */ + for (sft = 32; sft > 0; sft--) { + tmp = (u64) to << sft; + tmp += from / 2; + do_div(tmp, from); + if ((tmp >> sftacc) == 0) + break; + } + *mult = tmp; + *shift = sft; +} + +/*[Clocksource internal variables]--------- + * curr_clocksource: + * currently selected clocksource. + * clocksource_list: + * linked list with the registered clocksources + * clocksource_mutex: + * protects manipulations to curr_clocksource and the clocksource_list + * override_name: + * Name of the user-specified clocksource. + */ +static struct clocksource *curr_clocksource; +static LIST_HEAD(clocksource_list); +static DEFINE_MUTEX(clocksource_mutex); +static char override_name[CS_NAME_LEN]; +static int finished_booting; + +#ifdef CONFIG_CLOCKSOURCE_WATCHDOG +static void clocksource_watchdog_work(struct work_struct *work); +static void clocksource_select(void); + +static LIST_HEAD(watchdog_list); +static struct clocksource *watchdog; +static struct timer_list watchdog_timer; +static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); +static DEFINE_SPINLOCK(watchdog_lock); +static int watchdog_running; +static atomic_t watchdog_reset_pending; + +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); + +/* + * Interval: 0.5sec Threshold: 0.0625s + */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) + +static void clocksource_watchdog_work(struct work_struct *work) +{ + /* + * If kthread_run fails the next watchdog scan over the + * watchdog_list will find the unstable clock again. + */ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + +static void __clocksource_unstable(struct clocksource *cs) +{ + cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); + cs->flags |= CLOCK_SOURCE_UNSTABLE; + if (finished_booting) + schedule_work(&watchdog_work); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called instead of clocksource_change_rating from + * cpu hotplug code to avoid a deadlock between the clocksource mutex + * and the cpu hotplug mutex. It defers the update of the clocksource + * to the watchdog thread. + */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_watchdog(unsigned long data) +{ + struct clocksource *cs; + cycle_t csnow, wdnow, cslast, wdlast, delta; + int64_t wd_nsec, cs_nsec; + int next_cpu, reset_pending; + + spin_lock(&watchdog_lock); + if (!watchdog_running) + goto out; + + reset_pending = atomic_read(&watchdog_reset_pending); + + list_for_each_entry(cs, &watchdog_list, wd_list) { + + /* Clocksource already marked unstable? */ + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + if (finished_booting) + schedule_work(&watchdog_work); + continue; + } + + local_irq_disable(); + csnow = cs->read(cs); + wdnow = watchdog->read(watchdog); + local_irq_enable(); + + /* Clocksource initialized ? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || + atomic_read(&watchdog_reset_pending)) { + cs->flags |= CLOCK_SOURCE_WATCHDOG; + cs->wd_last = wdnow; + cs->cs_last = csnow; + continue; + } + + delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask); + wd_nsec = clocksource_cyc2ns(delta, watchdog->mult, + watchdog->shift); + + delta = clocksource_delta(csnow, cs->cs_last, cs->mask); + cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + wdlast = cs->wd_last; /* save these in case we print them */ + cslast = cs->cs_last; + cs->cs_last = csnow; + cs->wd_last = wdnow; + + if (atomic_read(&watchdog_reset_pending)) + continue; + + /* Check the deviation from the watchdog clocksource. */ + if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { + pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); + pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", + watchdog->name, wdnow, wdlast, watchdog->mask); + pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", + cs->name, csnow, cslast, cs->mask); + __clocksource_unstable(cs); + continue; + } + + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && + (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { + /* Mark it valid for high-res. */ + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + + /* + * clocksource_done_booting() will sort it if + * finished_booting is not set yet. + */ + if (!finished_booting) + continue; + + /* + * If this is not the current clocksource let + * the watchdog thread reselect it. Due to the + * change to high res this clocksource might + * be preferred now. If it is the current + * clocksource let the tick code know about + * that change. + */ + if (cs != curr_clocksource) { + cs->flags |= CLOCK_SOURCE_RESELECT; + schedule_work(&watchdog_work); + } else { + tick_clock_notify(); + } + } + } + + /* + * We only clear the watchdog_reset_pending, when we did a + * full cycle through all clocksources. + */ + if (reset_pending) + atomic_dec(&watchdog_reset_pending); + + /* + * Cycle through CPUs to check if the CPUs stay synchronized + * to each other. + */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); +out: + spin_unlock(&watchdog_lock); +} + +static inline void clocksource_start_watchdog(void) +{ + if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + return; + init_timer(&watchdog_timer); + watchdog_timer.function = clocksource_watchdog; + watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + watchdog_running = 1; +} + +static inline void clocksource_stop_watchdog(void) +{ + if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + return; + del_timer(&watchdog_timer); + watchdog_running = 0; +} + +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + +static void clocksource_resume_watchdog(void) +{ + atomic_inc(&watchdog_reset_pending); +} + +static void clocksource_enqueue_watchdog(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a clocksource to be watched. */ + list_add(&cs->wd_list, &watchdog_list); + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; + } else { + /* cs is a watchdog. */ + if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + /* Pick the best watchdog. */ + if (!watchdog || cs->rating > watchdog->rating) { + watchdog = cs; + /* Reset watchdog cycles */ + clocksource_reset_watchdog(); + } + } + /* Check if the watchdog timer needs to be started. */ + clocksource_start_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_dequeue_watchdog(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (cs != watchdog) { + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { + /* cs is a watched clocksource. */ + list_del_init(&cs->wd_list); + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + } + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static int __clocksource_watchdog_kthread(void) +{ + struct clocksource *cs, *tmp; + unsigned long flags; + LIST_HEAD(unstable); + int select = 0; + + spin_lock_irqsave(&watchdog_lock, flags); + list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { + if (cs->flags & CLOCK_SOURCE_UNSTABLE) { + list_del_init(&cs->wd_list); + list_add(&cs->wd_list, &unstable); + select = 1; + } + if (cs->flags & CLOCK_SOURCE_RESELECT) { + cs->flags &= ~CLOCK_SOURCE_RESELECT; + select = 1; + } + } + /* Check if the watchdog timer needs to be stopped. */ + clocksource_stop_watchdog(); + spin_unlock_irqrestore(&watchdog_lock, flags); + + /* Needs to be done outside of watchdog lock */ + list_for_each_entry_safe(cs, tmp, &unstable, wd_list) { + list_del_init(&cs->wd_list); + __clocksource_change_rating(cs, 0); + } + return select; +} + +static int clocksource_watchdog_kthread(void *data) +{ + mutex_lock(&clocksource_mutex); + if (__clocksource_watchdog_kthread()) + clocksource_select(); + mutex_unlock(&clocksource_mutex); + return 0; +} + +static bool clocksource_is_watchdog(struct clocksource *cs) +{ + return cs == watchdog; +} + +#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ + +static void clocksource_enqueue_watchdog(struct clocksource *cs) +{ + if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; +} + +static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } +static inline void clocksource_resume_watchdog(void) { } +static inline int __clocksource_watchdog_kthread(void) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } +void clocksource_mark_unstable(struct clocksource *cs) { } + +#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ + +/** + * clocksource_suspend - suspend the clocksource(s) + */ +void clocksource_suspend(void) +{ + struct clocksource *cs; + + list_for_each_entry_reverse(cs, &clocksource_list, list) + if (cs->suspend) + cs->suspend(cs); +} + +/** + * clocksource_resume - resume the clocksource(s) + */ +void clocksource_resume(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &clocksource_list, list) + if (cs->resume) + cs->resume(cs); + + clocksource_resume_watchdog(); +} + +/** + * clocksource_touch_watchdog - Update watchdog + * + * Update the watchdog after exception contexts such as kgdb so as not + * to incorrectly trip the watchdog. This might fail when the kernel + * was stopped in code which holds watchdog_lock. + */ +void clocksource_touch_watchdog(void) +{ + clocksource_resume_watchdog(); +} + +/** + * clocksource_max_adjustment- Returns max adjustment amount + * @cs: Pointer to clocksource + * + */ +static u32 clocksource_max_adjustment(struct clocksource *cs) +{ + u64 ret; + /* + * We won't try to correct for more than 11% adjustments (110,000 ppm), + */ + ret = (u64)cs->mult * 11; + do_div(ret,100); + return (u32)ret; +} + +/** + * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + * @maxadj: maximum adjustment value to mult (~11%) + * @mask: bitmask for two's complement subtraction of non 64 bit counters + * @max_cyc: maximum cycle value before potential overflow (does not include + * any safety margin) + * + * NOTE: This function includes a safety margin of 50%, in other words, we + * return half the number of nanoseconds the hardware counter can technically + * cover. This is done so that we can potentially detect problems caused by + * delayed timers or bad hardware, which might result in time intervals that + * are larger then what the math used can handle without overflows. + */ +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) +{ + u64 max_nsecs, max_cycles; + + /* + * Calculate the maximum number of cycles that we can pass to the + * cyc2ns() function without overflowing a 64-bit result. + */ + max_cycles = ULLONG_MAX; + do_div(max_cycles, mult+maxadj); + + /* + * The actual maximum number of cycles we can defer the clocksource is + * determined by the minimum of max_cycles and mask. + * Note: Here we subtract the maxadj to make sure we don't sleep for + * too long if there's a large negative adjustment. + */ + max_cycles = min(max_cycles, mask); + max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + + /* return the max_cycles value as well if requested */ + if (max_cyc) + *max_cyc = max_cycles; + + /* Return 50% of the actual maximum, so we can detect bad values */ + max_nsecs >>= 1; + + return max_nsecs; +} + +/** + * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles + * @cs: Pointer to clocksource to be updated + * + */ +static inline void clocksource_update_max_deferment(struct clocksource *cs) +{ + cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, + cs->maxadj, cs->mask, + &cs->max_cycles); +} + +#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET + +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) +{ + struct clocksource *cs; + + if (!finished_booting || list_empty(&clocksource_list)) + return NULL; + + /* + * We pick the clocksource with the highest rating. If oneshot + * mode is active, we pick the highres valid clocksource with + * the best rating. + */ + list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; + if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + continue; + return cs; + } + return NULL; +} + +static void __clocksource_select(bool skipcur) +{ + bool oneshot = tick_oneshot_mode_active(); + struct clocksource *best, *cs; + + /* Find the best suitable clocksource */ + best = clocksource_find_best(oneshot, skipcur); + if (!best) + return; + + /* Check for the override clocksource. */ + list_for_each_entry(cs, &clocksource_list, list) { + if (skipcur && cs == curr_clocksource) + continue; + if (strcmp(cs->name, override_name) != 0) + continue; + /* + * Check to make sure we don't switch to a non-highres + * capable clocksource if the tick code is in oneshot + * mode (highres or nohz) + */ + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { + /* Override clocksource cannot be used. */ + printk(KERN_WARNING "Override clocksource %s is not " + "HRT compatible. Cannot switch while in " + "HRT/NOHZ mode\n", cs->name); + override_name[0] = 0; + } else + /* Override clocksource can be used. */ + best = cs; + break; + } + + if (curr_clocksource != best && !timekeeping_notify(best)) { + pr_info("Switched to clocksource %s\n", best->name); + curr_clocksource = best; + } +} + +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ + return __clocksource_select(false); +} + +static void clocksource_select_fallback(void) +{ + return __clocksource_select(true); +} + +#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ + +static inline void clocksource_select(void) { } +static inline void clocksource_select_fallback(void) { } + +#endif + +/* + * clocksource_done_booting - Called near the end of core bootup + * + * Hack to avoid lots of clocksource churn at boot time. + * We use fs_initcall because we want this to start before + * device_initcall but after subsys_initcall. + */ +static int __init clocksource_done_booting(void) +{ + mutex_lock(&clocksource_mutex); + curr_clocksource = clocksource_default_clock(); + finished_booting = 1; + /* + * Run the watchdog first to eliminate unstable clock sources + */ + __clocksource_watchdog_kthread(); + clocksource_select(); + mutex_unlock(&clocksource_mutex); + return 0; +} +fs_initcall(clocksource_done_booting); + +/* + * Enqueue the clocksource sorted by rating + */ +static void clocksource_enqueue(struct clocksource *cs) +{ + struct list_head *entry = &clocksource_list; + struct clocksource *tmp; + + list_for_each_entry(tmp, &clocksource_list, list) + /* Keep track of the place, where to insert */ + if (tmp->rating >= cs->rating) + entry = &tmp->list; + list_add(&cs->list, entry); +} + +/** + * __clocksource_update_freq_scale - Used update clocksource with new freq + * @cs: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * This should only be called from the clocksource->enable() method. + * + * This *SHOULD NOT* be called directly! Please use the + * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper + * functions. + */ +void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + u64 sec; + + /* + * Default clocksources are *special* and self-define their mult/shift. + * But, you're not special, so you should specify a freq value. + */ + if (freq) { + /* + * Calc the maximum number of seconds which we can run before + * wrapping around. For clocksources which have a mask > 32-bit + * we need to limit the max sleep time to have a good + * conversion precision. 10 minutes is still a reasonable + * amount. That results in a shift value of 24 for a + * clocksource with mask >= 40-bit and f >= 4GHz. That maps to + * ~ 0.06ppm granularity for NTP. + */ + sec = cs->mask; + do_div(sec, freq); + do_div(sec, scale); + if (!sec) + sec = 1; + else if (sec > 600 && cs->mask > UINT_MAX) + sec = 600; + + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC / scale, sec * scale); + } + /* + * Ensure clocksources that have large 'mult' values don't overflow + * when adjusted. + */ + cs->maxadj = clocksource_max_adjustment(cs); + while (freq && ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult))) { + cs->mult >>= 1; + cs->shift--; + cs->maxadj = clocksource_max_adjustment(cs); + } + + /* + * Only warn for *special* clocksources that self-define + * their mult/shift values and don't specify a freq. + */ + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + + clocksource_update_max_deferment(cs); + + pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); +} +EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); + +/** + * __clocksource_register_scale - Used to install new clocksources + * @cs: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. + */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + + /* Initialize mult/shift and max_idle_ns */ + __clocksource_update_freq_scale(cs, scale, freq); + + /* Add clocksource to the clocksource list */ + mutex_lock(&clocksource_mutex); + clocksource_enqueue(cs); + clocksource_enqueue_watchdog(cs); + clocksource_select(); + mutex_unlock(&clocksource_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(__clocksource_register_scale); + +static void __clocksource_change_rating(struct clocksource *cs, int rating) +{ + list_del(&cs->list); + cs->rating = rating; + clocksource_enqueue(cs); +} + +/** + * clocksource_change_rating - Change the rating of a registered clocksource + * @cs: clocksource to be changed + * @rating: new rating + */ +void clocksource_change_rating(struct clocksource *cs, int rating) +{ + mutex_lock(&clocksource_mutex); + __clocksource_change_rating(cs, rating); + clocksource_select(); + mutex_unlock(&clocksource_mutex); +} +EXPORT_SYMBOL(clocksource_change_rating); + +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ + /* + * I really can't convince myself to support this on hardware + * designed by lobotomized monkeys. + */ + if (clocksource_is_watchdog(cs)) + return -EBUSY; + + if (cs == curr_clocksource) { + /* Select and try to install a replacement clock source */ + clocksource_select_fallback(); + if (curr_clocksource == cs) + return -EBUSY; + } + clocksource_dequeue_watchdog(cs); + list_del_init(&cs->list); + return 0; +} + +/** + * clocksource_unregister - remove a registered clocksource + * @cs: clocksource to be unregistered + */ +int clocksource_unregister(struct clocksource *cs) +{ + int ret = 0; + + mutex_lock(&clocksource_mutex); + if (!list_empty(&cs->list)) + ret = clocksource_unbind(cs); + mutex_unlock(&clocksource_mutex); + return ret; +} +EXPORT_SYMBOL(clocksource_unregister); + +#ifdef CONFIG_SYSFS +/** + * sysfs_show_current_clocksources - sysfs interface for current clocksource + * @dev: unused + * @attr: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing current clocksource. + */ +static ssize_t +sysfs_show_current_clocksources(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t count = 0; + + mutex_lock(&clocksource_mutex); + count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); + mutex_unlock(&clocksource_mutex); + + return count; +} + +ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) +{ + size_t ret = cnt; + + /* strings from sysfs write are not 0 terminated! */ + if (!cnt || cnt >= CS_NAME_LEN) + return -EINVAL; + + /* strip of \n: */ + if (buf[cnt-1] == '\n') + cnt--; + if (cnt > 0) + memcpy(dst, buf, cnt); + dst[cnt] = 0; + return ret; +} + +/** + * sysfs_override_clocksource - interface for manually overriding clocksource + * @dev: unused + * @attr: unused + * @buf: name of override clocksource + * @count: length of buffer + * + * Takes input from sysfs interface for manually overriding the default + * clocksource selection. + */ +static ssize_t sysfs_override_clocksource(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + mutex_lock(&clocksource_mutex); + + ret = sysfs_get_uname(buf, override_name, count); + if (ret >= 0) + clocksource_select(); + + mutex_unlock(&clocksource_mutex); + + return ret; +} + +/** + * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * @dev: unused + * @attr: unused + * @buf: unused + * @count: length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. + */ +static ssize_t sysfs_unbind_clocksource(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct clocksource *cs; + char name[CS_NAME_LEN]; + ssize_t ret; + + ret = sysfs_get_uname(buf, name, count); + if (ret < 0) + return ret; + + ret = -ENODEV; + mutex_lock(&clocksource_mutex); + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, name)) + continue; + ret = clocksource_unbind(cs); + break; + } + mutex_unlock(&clocksource_mutex); + + return ret ? ret : count; +} + +/** + * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * @dev: unused + * @attr: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing registered clocksources + */ +static ssize_t +sysfs_show_available_clocksources(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct clocksource *src; + ssize_t count = 0; + + mutex_lock(&clocksource_mutex); + list_for_each_entry(src, &clocksource_list, list) { + /* + * Don't show non-HRES clocksource if the tick code is + * in one shot mode (highres=on or nohz=on) + */ + if (!tick_oneshot_mode_active() || + (src->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + count += snprintf(buf + count, + max((ssize_t)PAGE_SIZE - count, (ssize_t)0), + "%s ", src->name); + } + mutex_unlock(&clocksource_mutex); + + count += snprintf(buf + count, + max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); + + return count; +} + +/* + * Sysfs setup bits: + */ +static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, + sysfs_override_clocksource); + +static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); + +static DEVICE_ATTR(available_clocksource, 0444, + sysfs_show_available_clocksources, NULL); + +static struct bus_type clocksource_subsys = { + .name = "clocksource", + .dev_name = "clocksource", +}; + +static struct device device_clocksource = { + .id = 0, + .bus = &clocksource_subsys, +}; + +static int __init init_clocksource_sysfs(void) +{ + int error = subsys_system_register(&clocksource_subsys, NULL); + + if (!error) + error = device_register(&device_clocksource); + if (!error) + error = device_create_file( + &device_clocksource, + &dev_attr_current_clocksource); + if (!error) + error = device_create_file(&device_clocksource, + &dev_attr_unbind_clocksource); + if (!error) + error = device_create_file( + &device_clocksource, + &dev_attr_available_clocksource); + return error; +} + +device_initcall(init_clocksource_sysfs); +#endif /* CONFIG_SYSFS */ + +/** + * boot_override_clocksource - boot clock override + * @str: override name + * + * Takes a clocksource= boot argument and uses it + * as the clocksource override name. + */ +static int __init boot_override_clocksource(char* str) +{ + mutex_lock(&clocksource_mutex); + if (str) + strlcpy(override_name, str, sizeof(override_name)); + mutex_unlock(&clocksource_mutex); + return 1; +} + +__setup("clocksource=", boot_override_clocksource); + +/** + * boot_override_clock - Compatibility layer for deprecated boot option + * @str: override name + * + * DEPRECATED! Takes a clock= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clock(char* str) +{ + if (!strcmp(str, "pmtmr")) { + printk("Warning: clock=pmtmr is deprecated. " + "Use clocksource=acpi_pm.\n"); + return boot_override_clocksource("acpi_pm"); + } + printk("Warning! clock= boot option is deprecated. " + "Use clocksource=xyz\n"); + return boot_override_clocksource(str); +} + +__setup("clock=", boot_override_clock); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c new file mode 100644 index 000000000..93ef7190b --- /dev/null +++ b/kernel/time/hrtimer.c @@ -0,0 +1,1852 @@ +/* + * linux/kernel/hrtimer.c + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * High-resolution kernel timers + * + * In contrast to the low-resolution timeout API implemented in + * kernel/timer.c, hrtimers provide finer resolution and accuracy + * depending on system configuration and capabilities. + * + * These timers are currently used for: + * - itimers + * - POSIX timers + * - nanosleep + * - precise in-kernel timing + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * Credits: + * based on kernel/timer.c + * + * Help, testing, suggestions, bugfixes, improvements were + * provided by: + * + * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel + * et. al. + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/cpu.h> +#include <linux/export.h> +#include <linux/percpu.h> +#include <linux/hrtimer.h> +#include <linux/notifier.h> +#include <linux/syscalls.h> +#include <linux/kallsyms.h> +#include <linux/interrupt.h> +#include <linux/tick.h> +#include <linux/seq_file.h> +#include <linux/err.h> +#include <linux/debugobjects.h> +#include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/sched/rt.h> +#include <linux/sched/deadline.h> +#include <linux/timer.h> +#include <linux/freezer.h> + +#include <asm/uaccess.h> + +#include <trace/events/timer.h> + +#include "tick-internal.h" + +/* + * The timer bases: + * + * There are more clockids then hrtimer bases. Thus, we index + * into the timer bases by the hrtimer_base_type enum. When trying + * to reach a base using a clockid, hrtimer_clockid_to_base() + * is used to convert from clockid to the proper hrtimer_base_type. + */ +DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = +{ + + .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), + .clock_base = + { + { + .index = HRTIMER_BASE_MONOTONIC, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, + .resolution = KTIME_LOW_RES, + }, + { + .index = HRTIMER_BASE_REALTIME, + .clockid = CLOCK_REALTIME, + .get_time = &ktime_get_real, + .resolution = KTIME_LOW_RES, + }, + { + .index = HRTIMER_BASE_BOOTTIME, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + .resolution = KTIME_LOW_RES, + }, + { + .index = HRTIMER_BASE_TAI, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + .resolution = KTIME_LOW_RES, + }, + } +}; + +static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, + [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, + [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_TAI] = HRTIMER_BASE_TAI, +}; + +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + return hrtimer_clock_to_base_table[clock_id]; +} + + +/* + * Get the coarse grained time at the softirq based on xtime and + * wall_to_monotonic. + */ +static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) +{ + ktime_t xtim, mono, boot, tai; + ktime_t off_real, off_boot, off_tai; + + mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); + boot = ktime_add(mono, off_boot); + xtim = ktime_add(mono, off_real); + tai = ktime_add(mono, off_tai); + + base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; + base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; + base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; +} + +/* + * Functions and macros which are different for UP/SMP systems are kept in a + * single place + */ +#ifdef CONFIG_SMP + +/* + * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on the lists/queues. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. + */ +static +struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) +{ + struct hrtimer_clock_base *base; + + for (;;) { + base = timer->base; + if (likely(base != NULL)) { + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); + if (likely(base == timer->base)) + return base; + /* The timer has migrated to another CPU: */ + raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); + } + cpu_relax(); + } +} + +/* + * With HIGHRES=y we do not migrate the timer when it is expiring + * before the next event on the target cpu because we cannot reprogram + * the target cpu hardware and we would cause it to fire late. + * + * Called with cpu_base->lock of target cpu held. + */ +static int +hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires; + + if (!new_base->cpu_base->hres_active) + return 0; + + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); + return expires.tv64 <= new_base->cpu_base->expires_next.tv64; +#else + return 0; +#endif +} + +/* + * Switch the timer base to the current CPU when possible. + */ +static inline struct hrtimer_clock_base * +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, + int pinned) +{ + struct hrtimer_clock_base *new_base; + struct hrtimer_cpu_base *new_cpu_base; + int this_cpu = smp_processor_id(); + int cpu = get_nohz_timer_target(pinned); + int basenum = base->index; + +again: + new_cpu_base = &per_cpu(hrtimer_bases, cpu); + new_base = &new_cpu_base->clock_base[basenum]; + + if (base != new_base) { + /* + * We are trying to move timer to new_base. + * However we can't change timer's base while it is running, + * so we keep it on the same CPU. No hassle vs. reprogramming + * the event source in the high resolution case. The softirq + * code will take care of this when the timer function has + * completed. There is no conflict as we hold the lock until + * the timer is enqueued. + */ + if (unlikely(hrtimer_callback_running(timer))) + return base; + + /* See the comment in lock_timer_base() */ + timer->base = NULL; + raw_spin_unlock(&base->cpu_base->lock); + raw_spin_lock(&new_base->cpu_base->lock); + + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { + cpu = this_cpu; + raw_spin_unlock(&new_base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); + timer->base = base; + goto again; + } + timer->base = new_base; + } else { + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { + cpu = this_cpu; + goto again; + } + } + return new_base; +} + +#else /* CONFIG_SMP */ + +static inline struct hrtimer_clock_base * +lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +{ + struct hrtimer_clock_base *base = timer->base; + + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); + + return base; +} + +# define switch_hrtimer_base(t, b, p) (b) + +#endif /* !CONFIG_SMP */ + +/* + * Functions for the union type storage format of ktime_t which are + * too large for inlining: + */ +#if BITS_PER_LONG < 64 +/* + * Divide a ktime value by a nanosecond value + */ +s64 __ktime_divns(const ktime_t kt, s64 div) +{ + int sft = 0; + s64 dclc; + u64 tmp; + + dclc = ktime_to_ns(kt); + tmp = dclc < 0 ? -dclc : dclc; + + /* Make sure the divisor is less than 2^32: */ + while (div >> 32) { + sft++; + div >>= 1; + } + tmp >>= sft; + do_div(tmp, (unsigned long) div); + return dclc < 0 ? -tmp : tmp; +} +EXPORT_SYMBOL_GPL(__ktime_divns); +#endif /* BITS_PER_LONG >= 64 */ + +/* + * Add two ktime values and do a safety check for overflow: + */ +ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) +{ + ktime_t res = ktime_add(lhs, rhs); + + /* + * We use KTIME_SEC_MAX here, the maximum timeout which we can + * return to user space in a timespec: + */ + if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64) + res = ktime_set(KTIME_SEC_MAX, 0); + + return res; +} + +EXPORT_SYMBOL_GPL(ktime_add_safe); + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr hrtimer_debug_descr; + +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_init(timer, &hrtimer_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) +{ + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + WARN_ON_ONCE(1); + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_free(timer, &hrtimer_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr hrtimer_debug_descr = { + .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, + .fixup_init = hrtimer_fixup_init, + .fixup_activate = hrtimer_fixup_activate, + .fixup_free = hrtimer_fixup_free, +}; + +static inline void debug_hrtimer_init(struct hrtimer *timer) +{ + debug_object_init(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_activate(struct hrtimer *timer) +{ + debug_object_activate(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) +{ + debug_object_deactivate(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_free(struct hrtimer *timer) +{ + debug_object_free(timer, &hrtimer_debug_descr); +} + +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode); + +void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_object_init_on_stack(timer, &hrtimer_debug_descr); + __hrtimer_init(timer, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); + +void destroy_hrtimer_on_stack(struct hrtimer *timer) +{ + debug_object_free(timer, &hrtimer_debug_descr); +} + +#else +static inline void debug_hrtimer_init(struct hrtimer *timer) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer) { } +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } +#endif + +static inline void +debug_init(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init(timer); + trace_hrtimer_init(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer) +{ + debug_hrtimer_activate(timer); + trace_hrtimer_start(timer); +} + +static inline void debug_deactivate(struct hrtimer *timer) +{ + debug_hrtimer_deactivate(timer); + trace_hrtimer_cancel(timer); +} + +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +{ + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; + int i; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct timerqueue_node *next; + struct hrtimer *timer; + + next = timerqueue_getnext(&base->active); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + } + /* + * clock_was_set() might have changed base->offset of any of + * the clock bases so the result might be negative. Fix it up + * to prevent a false positive in clockevents_program_event(). + */ + if (expires_next.tv64 < 0) + expires_next.tv64 = 0; + return expires_next; +} +#endif + +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer enabled ? + */ +static int hrtimer_hres_enabled __read_mostly = 1; + +/* + * Enable / Disable high resolution mode + */ +static int __init setup_hrtimer_hres(char *str) +{ + if (!strcmp(str, "off")) + hrtimer_hres_enabled = 0; + else if (!strcmp(str, "on")) + hrtimer_hres_enabled = 1; + else + return 0; + return 1; +} + +__setup("highres=", setup_hrtimer_hres); + +/* + * hrtimer_high_res_enabled - query, if the highres mode is enabled + */ +static inline int hrtimer_is_hres_enabled(void) +{ + return hrtimer_hres_enabled; +} + +/* + * Is the high resolution mode active ? + */ +static inline int hrtimer_hres_active(void) +{ + return __this_cpu_read(hrtimer_bases.hres_active); +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +{ + ktime_t expires_next = __hrtimer_get_next_event(cpu_base); + + if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) + return; + + cpu_base->expires_next.tv64 = expires_next.tv64; + + /* + * If a hang was detected in the last timer interrupt then we + * leave the hang delay active in the hardware. We want the + * system to make progress. That also prevents the following + * scenario: + * T1 expires 50ms from now + * T2 expires 5s from now + * + * T1 is removed, so this code is called and would reprogram + * the hardware to 5s from now. Any hrtimer_start after that + * will not reprogram the hardware due to hang_detected being + * set. So we'd effectivly block all timers until the T2 event + * fires. + */ + if (cpu_base->hang_detected) + return; + + if (cpu_base->expires_next.tv64 != KTIME_MAX) + tick_program_event(cpu_base->expires_next, 1); +} + +/* + * Shared reprogramming for clock_realtime and clock_monotonic + * + * When a timer is enqueued and expires earlier than the already enqueued + * timers, we have to check, whether it expires earlier than the timer for + * which the clock event device was armed. + * + * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming + * and no expiry check happens. The timer gets enqueued into the rbtree. The + * reprogramming and expiry check is done in the hrtimer_interrupt or in the + * softirq. + * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + int res; + + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); + + /* + * When the callback is running, we do not reprogram the clock event + * device. The timer callback is either running on a different CPU or + * the callback is executed in the hrtimer_interrupt context. The + * reprogramming is handled either by the softirq, which called the + * callback or at the end of the hrtimer_interrupt. + */ + if (hrtimer_callback_running(timer)) + return 0; + + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Nothing wrong + * about that, just avoid to call into the tick code, which + * has now objections against negative expiry values. + */ + if (expires.tv64 < 0) + return -ETIME; + + if (expires.tv64 >= cpu_base->expires_next.tv64) + return 0; + + /* + * When the target cpu of the timer is currently executing + * hrtimer_interrupt(), then we do not touch the clock event + * device. hrtimer_interrupt() will reevaluate all clock bases + * before reprogramming the device. + */ + if (cpu_base->in_hrtirq) + return 0; + + /* + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang detection. We want the system + * to make progress. + */ + if (cpu_base->hang_detected) + return 0; + + /* + * Clockevents returns -ETIME, when the event was in the past. + */ + res = tick_program_event(expires, 0); + if (!IS_ERR_VALUE(res)) + cpu_base->expires_next = expires; + return res; +} + +/* + * Initialize the high resolution related parts of cpu_base + */ +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) +{ + base->expires_next.tv64 = KTIME_MAX; + base->hres_active = 0; +} + +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; + + return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); +} + +/* + * Retrigger next event is called after clock was set + * + * Called with interrupts disabled via on_each_cpu() + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); + + if (!hrtimer_hres_active()) + return; + + raw_spin_lock(&base->lock); + hrtimer_update_base(base); + hrtimer_force_reprogram(base, 0); + raw_spin_unlock(&base->lock); +} + +/* + * Switch to high resolution mode + */ +static int hrtimer_switch_to_hres(void) +{ + int i, cpu = smp_processor_id(); + struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); + unsigned long flags; + + if (base->hres_active) + return 1; + + local_irq_save(flags); + + if (tick_init_highres()) { + local_irq_restore(flags); + printk(KERN_WARNING "Could not switch to high resolution " + "mode on CPU %d\n", cpu); + return 0; + } + base->hres_active = 1; + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + base->clock_base[i].resolution = KTIME_HIGH_RES; + + tick_setup_sched_timer(); + /* "Retrigger" the interrupt to get things going */ + retrigger_next_event(NULL); + local_irq_restore(flags); + return 1; +} + +static void clock_was_set_work(struct work_struct *work) +{ + clock_was_set(); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); + +/* + * Called from timekeeping and resume code to reprogramm the hrtimer + * interrupt device on all cpus. + */ +void clock_was_set_delayed(void) +{ + schedule_work(&hrtimer_work); +} + +#else + +static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline void +hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } +static inline int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return 0; +} +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } +static inline void retrigger_next_event(void *arg) { } + +#endif /* CONFIG_HIGH_RES_TIMERS */ + +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. + */ +void clock_was_set(void) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + /* Retrigger the CPU local events everywhere */ + on_each_cpu(retrigger_next_event, NULL, 1); +#endif + timerfd_clock_was_set(); +} + +/* + * During resume we might have to reprogram the high resolution timer + * interrupt on all online CPUs. However, all other CPUs will be + * stopped with IRQs interrupts disabled so the clock_was_set() call + * must be deferred. + */ +void hrtimers_resume(void) +{ + WARN_ONCE(!irqs_disabled(), + KERN_INFO "hrtimers_resume() called with IRQs enabled!"); + + /* Retrigger on the local CPU */ + retrigger_next_event(NULL); + /* And schedule a retrigger for all others */ + clock_was_set_delayed(); +} + +static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + if (timer->start_site) + return; + timer->start_site = __builtin_return_address(0); + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +#endif +} + +static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; +#endif +} + +static inline void timer_stats_account_hrtimer(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + if (likely(!timer_stats_active)) + return; + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, 0); +#endif +} + +/* + * Counterpart to lock_hrtimer_base above: + */ +static inline +void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +{ + raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); +} + +/** + * hrtimer_forward - forward the timer expiry + * @timer: hrtimer to forward + * @now: forward past this time + * @interval: the interval to forward + * + * Forward the timer expiry so it will expire in the future. + * Returns the number of overruns. + */ +u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) +{ + u64 orun = 1; + ktime_t delta; + + delta = ktime_sub(now, hrtimer_get_expires(timer)); + + if (delta.tv64 < 0) + return 0; + + if (interval.tv64 < timer->base->resolution.tv64) + interval.tv64 = timer->base->resolution.tv64; + + if (unlikely(delta.tv64 >= interval.tv64)) { + s64 incr = ktime_to_ns(interval); + + orun = ktime_divns(delta, incr); + hrtimer_add_expires_ns(timer, incr * orun); + if (hrtimer_get_expires_tv64(timer) > now.tv64) + return orun; + /* + * This (and the ktime_add() below) is the + * correction for exact: + */ + orun++; + } + hrtimer_add_expires(timer, interval); + + return orun; +} +EXPORT_SYMBOL_GPL(hrtimer_forward); + +/* + * enqueue_hrtimer - internal function to (re)start a timer + * + * The timer is inserted in expiry order. Insertion into the + * red black tree is O(log(n)). Must hold the base lock. + * + * Returns 1 when the new timer is the leftmost timer in the tree. + */ +static int enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + debug_activate(timer); + + timerqueue_add(&base->active, &timer->node); + base->cpu_base->active_bases |= 1 << base->index; + + /* + * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the + * state of a possibly running callback. + */ + timer->state |= HRTIMER_STATE_ENQUEUED; + + return (&timer->node == base->active.next); +} + +/* + * __remove_hrtimer - internal function to remove a timer + * + * Caller must hold the base lock. + * + * High resolution timer mode reprograms the clock event device when the + * timer is the one which expires next. The caller can disable this by setting + * reprogram to zero. This is useful, when the context does a reprogramming + * anyway (e.g. timer interrupt) + */ +static void __remove_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + unsigned long newstate, int reprogram) +{ + struct timerqueue_node *next_timer; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; + + next_timer = timerqueue_getnext(&base->active); + timerqueue_del(&base->active, &timer->node); + if (&timer->node == next_timer) { +#ifdef CONFIG_HIGH_RES_TIMERS + /* Reprogram the clock event device. if enabled */ + if (reprogram && hrtimer_hres_active()) { + ktime_t expires; + + expires = ktime_sub(hrtimer_get_expires(timer), + base->offset); + if (base->cpu_base->expires_next.tv64 == expires.tv64) + hrtimer_force_reprogram(base->cpu_base, 1); + } +#endif + } + if (!timerqueue_getnext(&base->active)) + base->cpu_base->active_bases &= ~(1 << base->index); +out: + timer->state = newstate; +} + +/* + * remove hrtimer, called with base lock held + */ +static inline int +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) +{ + if (hrtimer_is_queued(timer)) { + unsigned long state; + int reprogram; + + /* + * Remove the timer and force reprogramming when high + * resolution mode is active and the timer is on the current + * CPU. If we remove a timer on another CPU, reprogramming is + * skipped. The interrupt event on this CPU is fired and + * reprogramming happens in the interrupt handler. This is a + * rare case and less expensive than a smp call. + */ + debug_deactivate(timer); + timer_stats_hrtimer_clear_start_info(timer); + reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + /* + * We must preserve the CALLBACK state flag here, + * otherwise we could move the timer base in + * switch_hrtimer_base. + */ + state = timer->state & HRTIMER_STATE_CALLBACK; + __remove_hrtimer(timer, base, state, reprogram); + return 1; + } + return 0; +} + +int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode, + int wakeup) +{ + struct hrtimer_clock_base *base, *new_base; + unsigned long flags; + int ret, leftmost; + + base = lock_hrtimer_base(timer, &flags); + + /* Remove an active timer from the queue: */ + ret = remove_hrtimer(timer, base); + + if (mode & HRTIMER_MODE_REL) { + tim = ktime_add_safe(tim, base->get_time()); + /* + * CONFIG_TIME_LOW_RES is a temporary way for architectures + * to signal that they simply return xtime in + * do_gettimeoffset(). In this case we want to round up by + * resolution when starting a relative timer, to avoid short + * timeouts. This will go away with the GTOD framework. + */ +#ifdef CONFIG_TIME_LOW_RES + tim = ktime_add_safe(tim, base->resolution); +#endif + } + + hrtimer_set_expires_range_ns(timer, tim, delta_ns); + + /* Switch the timer base, if necessary: */ + new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); + + timer_stats_hrtimer_set_start_info(timer); + + leftmost = enqueue_hrtimer(timer, new_base); + + if (!leftmost) { + unlock_hrtimer_base(timer, &flags); + return ret; + } + + if (!hrtimer_is_hres_active(timer)) { + /* + * Kick to reschedule the next tick to handle the new timer + * on dynticks target. + */ + wake_up_nohz_cpu(new_base->cpu_base->cpu); + } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && + hrtimer_reprogram(timer, new_base)) { + /* + * Only allow reprogramming if the new base is on this CPU. + * (it might still be on another CPU if the timer was pending) + * + * XXX send_remote_softirq() ? + */ + if (wakeup) { + /* + * We need to drop cpu_base->lock to avoid a + * lock ordering issue vs. rq->lock. + */ + raw_spin_unlock(&new_base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + local_irq_restore(flags); + return ret; + } else { + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + } + + unlock_hrtimer_base(timer, &flags); + + return ret; +} +EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); + +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) +{ + return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); +} +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); + +/** + * hrtimer_start - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int +hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) +{ + return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); +} +EXPORT_SYMBOL_GPL(hrtimer_start); + + +/** + * hrtimer_try_to_cancel - try to deactivate a timer + * @timer: hrtimer to stop + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + * -1 when the timer is currently excuting the callback function and + * cannot be stopped + */ +int hrtimer_try_to_cancel(struct hrtimer *timer) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + int ret = -1; + + base = lock_hrtimer_base(timer, &flags); + + if (!hrtimer_callback_running(timer)) + ret = remove_hrtimer(timer, base); + + unlock_hrtimer_base(timer, &flags); + + return ret; + +} +EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); + +/** + * hrtimer_cancel - cancel a timer and wait for the handler to finish. + * @timer: the timer to be cancelled + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + */ +int hrtimer_cancel(struct hrtimer *timer) +{ + for (;;) { + int ret = hrtimer_try_to_cancel(timer); + + if (ret >= 0) + return ret; + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(hrtimer_cancel); + +/** + * hrtimer_get_remaining - get remaining time for the timer + * @timer: the timer to read + */ +ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +{ + unsigned long flags; + ktime_t rem; + + lock_hrtimer_base(timer, &flags); + rem = hrtimer_expires_remaining(timer); + unlock_hrtimer_base(timer, &flags); + + return rem; +} +EXPORT_SYMBOL_GPL(hrtimer_get_remaining); + +#ifdef CONFIG_NO_HZ_COMMON +/** + * hrtimer_get_next_event - get the time until next expiry event + * + * Returns the delta to the next expiry event or KTIME_MAX if no timer + * is pending. + */ +ktime_t hrtimer_get_next_event(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t mindelta = { .tv64 = KTIME_MAX }; + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (!hrtimer_hres_active()) + mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), + ktime_get()); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + if (mindelta.tv64 < 0) + mindelta.tv64 = 0; + return mindelta; +} +#endif + +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + struct hrtimer_cpu_base *cpu_base; + int base; + + memset(timer, 0, sizeof(struct hrtimer)); + + cpu_base = raw_cpu_ptr(&hrtimer_bases); + + if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) + clock_id = CLOCK_MONOTONIC; + + base = hrtimer_clockid_to_base(clock_id); + timer->base = &cpu_base->clock_base[base]; + timerqueue_init(&timer->node); + +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif +} + +/** + * hrtimer_init - initialize a timer to the given clock + * @timer: the timer to be initialized + * @clock_id: the clock to be used + * @mode: timer mode abs/rel + */ +void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_init(timer, clock_id, mode); + __hrtimer_init(timer, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init); + +/** + * hrtimer_get_res - get the timer resolution for a clock + * @which_clock: which clock to query + * @tp: pointer to timespec variable to store the resolution + * + * Store the resolution of the clock selected by @which_clock in the + * variable pointed to by @tp. + */ +int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) +{ + struct hrtimer_cpu_base *cpu_base; + int base = hrtimer_clockid_to_base(which_clock); + + cpu_base = raw_cpu_ptr(&hrtimer_bases); + *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); + + return 0; +} +EXPORT_SYMBOL_GPL(hrtimer_get_res); + +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) +{ + struct hrtimer_clock_base *base = timer->base; + struct hrtimer_cpu_base *cpu_base = base->cpu_base; + enum hrtimer_restart (*fn)(struct hrtimer *); + int restart; + + WARN_ON(!irqs_disabled()); + + debug_deactivate(timer); + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + fn = timer->function; + + /* + * Because we run timers from hardirq context, there is no chance + * they get migrated to another cpu, therefore its safe to unlock + * the timer base. + */ + raw_spin_unlock(&cpu_base->lock); + trace_hrtimer_expire_entry(timer, now); + restart = fn(timer); + trace_hrtimer_expire_exit(timer); + raw_spin_lock(&cpu_base->lock); + + /* + * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * we do not reprogramm the event hardware. Happens either in + * hrtimer_start_range_ns() or in hrtimer_interrupt() + */ + if (restart != HRTIMER_NORESTART) { + BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + enqueue_hrtimer(timer, base); + } + + WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + + timer->state &= ~HRTIMER_STATE_CALLBACK; +} + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next, now, entry_time, delta; + int i, retries = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + raw_spin_lock(&cpu_base->lock); + entry_time = now = hrtimer_update_base(cpu_base); +retry: + cpu_base->in_hrtirq = 1; + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next.tv64 = KTIME_MAX; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + struct hrtimer_clock_base *base; + struct timerqueue_node *node; + ktime_t basenow; + + if (!(cpu_base->active_bases & (1 << i))) + continue; + + base = cpu_base->clock_base + i; + basenow = ktime_add(now, base->offset); + + while ((node = timerqueue_getnext(&base->active))) { + struct hrtimer *timer; + + timer = container_of(node, struct hrtimer, node); + + /* + * The immediate goal for using the softexpires is + * minimizing wakeups, not running timers at the + * earliest interrupt after their soft expiration. + * This allows us to avoid using a Priority Search + * Tree, which can answer a stabbing querry for + * overlapping intervals and instead use the simple + * BST we already have. + * We don't add extra wakeups by delaying timers that + * are right-of a not yet expired timer, because that + * timer will have to trigger a wakeup anyway. + */ + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) + break; + + __run_hrtimer(timer, &basenow); + } + } + /* Reevaluate the clock bases for the next expiry */ + expires_next = __hrtimer_get_next_event(cpu_base); + /* + * Store the new expiry value so the migration code can verify + * against it. + */ + cpu_base->expires_next = expires_next; + cpu_base->in_hrtirq = 0; + raw_spin_unlock(&cpu_base->lock); + + /* Reprogramming necessary ? */ + if (expires_next.tv64 == KTIME_MAX || + !tick_program_event(expires_next, 0)) { + cpu_base->hang_detected = 0; + return; + } + + /* + * The next timer was already expired due to: + * - tracing + * - long lasting callbacks + * - being scheduled away when running in a VM + * + * We need to prevent that we loop forever in the hrtimer + * interrupt routine. We give it 3 attempts to avoid + * overreacting on some spurious event. + * + * Acquire base lock for updating the offsets and retrieving + * the current time. + */ + raw_spin_lock(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); + cpu_base->nr_retries++; + if (++retries < 3) + goto retry; + /* + * Give the system a chance to do something else than looping + * here. We stored the entry time, so we know exactly how long + * we spent here. We schedule the next event this amount of + * time away. + */ + cpu_base->nr_hangs++; + cpu_base->hang_detected = 1; + raw_spin_unlock(&cpu_base->lock); + delta = ktime_sub(now, entry_time); + if (delta.tv64 > cpu_base->max_hang_time.tv64) + cpu_base->max_hang_time = delta; + /* + * Limit it to a sensible value as we enforce a longer + * delay. Give the CPU at least 100ms to catch up. + */ + if (delta.tv64 > 100 * NSEC_PER_MSEC) + expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + else + expires_next = ktime_add(now, delta); + tick_program_event(expires_next, 1); + printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", + ktime_to_ns(delta)); +} + +/* + * local version of hrtimer_peek_ahead_timers() called with interrupts + * disabled. + */ +static void __hrtimer_peek_ahead_timers(void) +{ + struct tick_device *td; + + if (!hrtimer_hres_active()) + return; + + td = this_cpu_ptr(&tick_cpu_device); + if (td && td->evtdev) + hrtimer_interrupt(td->evtdev); +} + +/** + * hrtimer_peek_ahead_timers -- run soft-expired timers now + * + * hrtimer_peek_ahead_timers will peek at the timer queue of + * the current cpu and check if there are any timers for which + * the soft expires time has passed. If any such timers exist, + * they are run immediately and then removed from the timer queue. + * + */ +void hrtimer_peek_ahead_timers(void) +{ + unsigned long flags; + + local_irq_save(flags); + __hrtimer_peek_ahead_timers(); + local_irq_restore(flags); +} + +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_peek_ahead_timers(); +} + +#else /* CONFIG_HIGH_RES_TIMERS */ + +static inline void __hrtimer_peek_ahead_timers(void) { } + +#endif /* !CONFIG_HIGH_RES_TIMERS */ + +/* + * Called from timer softirq every jiffy, expire hrtimers: + * + * For HRT its the fall back code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. + */ +void hrtimer_run_pending(void) +{ + if (hrtimer_hres_active()) + return; + + /* + * This _is_ ugly: We have to check in the softirq context, + * whether we can switch to highres and / or nohz mode. The + * clocksource switch happens in the timer interrupt with + * xtime_lock held. Notification from there only sets the + * check bit in the tick_oneshot code, otherwise we might + * deadlock vs. xtime_lock. + */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + hrtimer_switch_to_hres(); +} + +/* + * Called from hardirq context every jiffy + */ +void hrtimer_run_queues(void) +{ + struct timerqueue_node *node; + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + struct hrtimer_clock_base *base; + int index, gettime = 1; + + if (hrtimer_hres_active()) + return; + + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { + base = &cpu_base->clock_base[index]; + if (!timerqueue_getnext(&base->active)) + continue; + + if (gettime) { + hrtimer_get_softirq_time(cpu_base); + gettime = 0; + } + + raw_spin_lock(&cpu_base->lock); + + while ((node = timerqueue_getnext(&base->active))) { + struct hrtimer *timer; + + timer = container_of(node, struct hrtimer, node); + if (base->softirq_time.tv64 <= + hrtimer_get_expires_tv64(timer)) + break; + + __run_hrtimer(timer, &base->softirq_time); + } + raw_spin_unlock(&cpu_base->lock); + } +} + +/* + * Sleep related functions: + */ +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) +{ + struct hrtimer_sleeper *t = + container_of(timer, struct hrtimer_sleeper, timer); + struct task_struct *task = t->task; + + t->task = NULL; + if (task) + wake_up_process(task); + + return HRTIMER_NORESTART; +} + +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) +{ + sl->timer.function = hrtimer_wakeup; + sl->task = task; +} +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); + +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) +{ + hrtimer_init_sleeper(t, current); + + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_start_expires(&t->timer, mode); + if (!hrtimer_active(&t->timer)) + t->task = NULL; + + if (likely(t->task)) + freezable_schedule(); + + hrtimer_cancel(&t->timer); + mode = HRTIMER_MODE_ABS; + + } while (t->task && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + + return t->task == NULL; +} + +static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) +{ + struct timespec rmt; + ktime_t rem; + + rem = hrtimer_expires_remaining(timer); + if (rem.tv64 <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + + return 1; +} + +long __sched hrtimer_nanosleep_restart(struct restart_block *restart) +{ + struct hrtimer_sleeper t; + struct timespec __user *rmtp; + int ret = 0; + + hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, + HRTIMER_MODE_ABS); + hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + + if (do_nanosleep(&t, HRTIMER_MODE_ABS)) + goto out; + + rmtp = restart->nanosleep.rmtp; + if (rmtp) { + ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + goto out; + } + + /* The other values in restart are already filled in */ + ret = -ERESTART_RESTARTBLOCK; +out: + destroy_hrtimer_on_stack(&t.timer); + return ret; +} + +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, + const enum hrtimer_mode mode, const clockid_t clockid) +{ + struct restart_block *restart; + struct hrtimer_sleeper t; + int ret = 0; + unsigned long slack; + + slack = current->timer_slack_ns; + if (dl_task(current) || rt_task(current)) + slack = 0; + + hrtimer_init_on_stack(&t.timer, clockid, mode); + hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); + if (do_nanosleep(&t, mode)) + goto out; + + /* Absolute timers do not update the rmtp value and restart: */ + if (mode == HRTIMER_MODE_ABS) { + ret = -ERESTARTNOHAND; + goto out; + } + + if (rmtp) { + ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + goto out; + } + + restart = ¤t->restart_block; + restart->fn = hrtimer_nanosleep_restart; + restart->nanosleep.clockid = t.timer.base->clockid; + restart->nanosleep.rmtp = rmtp; + restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + + ret = -ERESTART_RESTARTBLOCK; +out: + destroy_hrtimer_on_stack(&t.timer); + return ret; +} + +SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, + struct timespec __user *, rmtp) +{ + struct timespec tu; + + if (copy_from_user(&tu, rqtp, sizeof(tu))) + return -EFAULT; + + if (!timespec_valid(&tu)) + return -EINVAL; + + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); +} + +/* + * Functions related to boot-time initialization: + */ +static void init_hrtimers_cpu(int cpu) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + cpu_base->clock_base[i].cpu_base = cpu_base; + timerqueue_init_head(&cpu_base->clock_base[i].active); + } + + cpu_base->cpu = cpu; + hrtimer_init_hres(cpu_base); +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base) +{ + struct hrtimer *timer; + struct timerqueue_node *node; + + while ((node = timerqueue_getnext(&old_base->active))) { + timer = container_of(node, struct hrtimer, node); + BUG_ON(hrtimer_callback_running(timer)); + debug_deactivate(timer); + + /* + * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); + timer->base = new_base; + /* + * Enqueue the timers on the new cpu. This does not + * reprogram the event device in case the timer + * expires before the earliest on this CPU, but we run + * hrtimer_interrupt after we migrated everything to + * sort out already expired timers and reprogram the + * event device. + */ + enqueue_hrtimer(timer, new_base); + + /* Clear the migration state bit */ + timer->state &= ~HRTIMER_STATE_MIGRATE; + } +} + +static void migrate_hrtimers(int scpu) +{ + struct hrtimer_cpu_base *old_base, *new_base; + int i; + + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + local_irq_disable(); + old_base = &per_cpu(hrtimer_bases, scpu); + new_base = this_cpu_ptr(&hrtimer_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + raw_spin_lock(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i]); + } + + raw_spin_unlock(&old_base->lock); + raw_spin_unlock(&new_base->lock); + + /* Check, if we got expired work to do */ + __hrtimer_peek_ahead_timers(); + local_irq_enable(); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +static int hrtimer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int scpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + init_hrtimers_cpu(scpu); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + migrate_hrtimers(scpu); + break; +#endif + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block hrtimers_nb = { + .notifier_call = hrtimer_cpu_notify, +}; + +void __init hrtimers_init(void) +{ + hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); +#endif +} + +/** + * schedule_hrtimeout_range_clock - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME + */ +int __sched +schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode, int clock) +{ + struct hrtimer_sleeper t; + + /* + * Optimize when a zero timeout value is given. It does not + * matter whether this is an absolute or a relative time. + */ + if (expires && !expires->tv64) { + __set_current_state(TASK_RUNNING); + return 0; + } + + /* + * A NULL parameter means "infinite" + */ + if (!expires) { + schedule(); + return -EINTR; + } + + hrtimer_init_on_stack(&t.timer, clock, mode); + hrtimer_set_expires_range_ns(&t.timer, *expires, delta); + + hrtimer_init_sleeper(&t, current); + + hrtimer_start_expires(&t.timer, mode); + if (!hrtimer_active(&t.timer)) + t.task = NULL; + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + + __set_current_state(TASK_RUNNING); + + return !t.task ? 0 : -EINTR; +} + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); + +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout(ktime_t *expires, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range(expires, 0, mode); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout); diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c new file mode 100644 index 000000000..8d262b467 --- /dev/null +++ b/kernel/time/itimer.c @@ -0,0 +1,301 @@ +/* + * linux/kernel/itimer.c + * + * Copyright (C) 1992 Darren Senn + */ + +/* These are all the functions necessary to implement itimers */ + +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/syscalls.h> +#include <linux/time.h> +#include <linux/posix-timers.h> +#include <linux/hrtimer.h> +#include <trace/events/timer.h> + +#include <asm/uaccess.h> + +/** + * itimer_get_remtime - get remaining time for the timer + * + * @timer: the timer to read + * + * Returns the delta between the expiry time and now, which can be + * less than zero or 1usec for an pending expired timer + */ +static struct timeval itimer_get_remtime(struct hrtimer *timer) +{ + ktime_t rem = hrtimer_get_remaining(timer); + + /* + * Racy but safe: if the itimer expires after the above + * hrtimer_get_remtime() call but before this condition + * then we return 0 - which is correct. + */ + if (hrtimer_active(timer)) { + if (rem.tv64 <= 0) + rem.tv64 = NSEC_PER_USEC; + } else + rem.tv64 = 0; + + return ktime_to_timeval(rem); +} + +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *const value) +{ + cputime_t cval, cinterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (cval) { + struct task_cputime cputime; + cputime_t t; + + thread_group_cputimer(tsk, &cputime); + if (clock_id == CPUCLOCK_PROF) + t = cputime.utime + cputime.stime; + else + /* CPUCLOCK_VIRT */ + t = cputime.utime; + + if (cval < t) + /* about to fire */ + cval = cputime_one_jiffy; + else + cval = cval - t; + } + + spin_unlock_irq(&tsk->sighand->siglock); + + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); +} + +int do_getitimer(int which, struct itimerval *value) +{ + struct task_struct *tsk = current; + + switch (which) { + case ITIMER_REAL: + spin_lock_irq(&tsk->sighand->siglock); + value->it_value = itimer_get_remtime(&tsk->signal->real_timer); + value->it_interval = + ktime_to_timeval(tsk->signal->it_real_incr); + spin_unlock_irq(&tsk->sighand->siglock); + break; + case ITIMER_VIRTUAL: + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); + break; + case ITIMER_PROF: + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); + break; + default: + return(-EINVAL); + } + return 0; +} + +SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) +{ + int error = -EFAULT; + struct itimerval get_buffer; + + if (value) { + error = do_getitimer(which, &get_buffer); + if (!error && + copy_to_user(value, &get_buffer, sizeof(get_buffer))) + error = -EFAULT; + } + return error; +} + + +/* + * The timer is automagically restarted, when interval != 0 + */ +enum hrtimer_restart it_real_fn(struct hrtimer *timer) +{ + struct signal_struct *sig = + container_of(timer, struct signal_struct, real_timer); + + trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); + + return HRTIMER_NORESTART; +} + +static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) +{ + struct timespec ts; + s64 cpu_ns; + + cputime_to_timespec(ct, &ts); + cpu_ns = timespec_to_ns(&ts); + + return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; +} + +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + const struct itimerval *const value, + struct itimerval *const ovalue) +{ + cputime_t cval, nval, cinterval, ninterval; + s64 ns_ninterval, ns_nval; + u32 error, incr_error; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = timeval_to_cputime(&value->it_value); + ns_nval = timeval_to_ns(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + ns_ninterval = timeval_to_ns(&value->it_interval); + + error = cputime_sub_ns(nval, ns_nval); + incr_error = cputime_sub_ns(ninterval, ns_ninterval); + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (cval || nval) { + if (nval > 0) + nval += cputime_one_jiffy; + set_process_cpu_timer(tsk, clock_id, &nval, &cval); + } + it->expires = nval; + it->incr = ninterval; + it->error = error; + it->incr_error = incr_error; + trace_itimer_state(clock_id == CPUCLOCK_VIRT ? + ITIMER_VIRTUAL : ITIMER_PROF, value, nval); + + spin_unlock_irq(&tsk->sighand->siglock); + + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } +} + +/* + * Returns true if the timeval is in canonical form + */ +#define timeval_valid(t) \ + (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) + +int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +{ + struct task_struct *tsk = current; + struct hrtimer *timer; + ktime_t expires; + + /* + * Validate the timevals in value. + */ + if (!timeval_valid(&value->it_value) || + !timeval_valid(&value->it_interval)) + return -EINVAL; + + switch (which) { + case ITIMER_REAL: +again: + spin_lock_irq(&tsk->sighand->siglock); + timer = &tsk->signal->real_timer; + if (ovalue) { + ovalue->it_value = itimer_get_remtime(timer); + ovalue->it_interval + = ktime_to_timeval(tsk->signal->it_real_incr); + } + /* We are sharing ->siglock with it_real_fn() */ + if (hrtimer_try_to_cancel(timer) < 0) { + spin_unlock_irq(&tsk->sighand->siglock); + goto again; + } + expires = timeval_to_ktime(value->it_value); + if (expires.tv64 != 0) { + tsk->signal->it_real_incr = + timeval_to_ktime(value->it_interval); + hrtimer_start(timer, expires, HRTIMER_MODE_REL); + } else + tsk->signal->it_real_incr.tv64 = 0; + + trace_itimer_state(ITIMER_REAL, value, 0); + spin_unlock_irq(&tsk->sighand->siglock); + break; + case ITIMER_VIRTUAL: + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); + break; + case ITIMER_PROF: + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); + break; + default: + return -EINVAL; + } + return 0; +} + +/** + * alarm_setitimer - set alarm in seconds + * + * @seconds: number of seconds until alarm + * 0 disables the alarm + * + * Returns the remaining time in seconds of a pending timer or 0 when + * the timer is not active. + * + * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid + * negative timeval settings which would cause immediate expiry. + */ +unsigned int alarm_setitimer(unsigned int seconds) +{ + struct itimerval it_new, it_old; + +#if BITS_PER_LONG < 64 + if (seconds > INT_MAX) + seconds = INT_MAX; +#endif + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + + do_setitimer(ITIMER_REAL, &it_new, &it_old); + + /* + * We can't return 0 if we have an alarm pending ... And we'd + * better return too much than too little anyway + */ + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || + it_old.it_value.tv_usec >= 500000) + it_old.it_value.tv_sec++; + + return it_old.it_value.tv_sec; +} + +SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, + struct itimerval __user *, ovalue) +{ + struct itimerval set_buffer, get_buffer; + int error; + + if (value) { + if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) + return -EFAULT; + } else { + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); + } + + error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); + if (error || !ovalue) + return error; + + if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) + return -EFAULT; + return 0; +} diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 000000000..347fecf86 --- /dev/null +++ b/kernel/time/jiffies.c @@ -0,0 +1,136 @@ +/*********************************************************************** +* linux/kernel/time/jiffies.c +* +* This file contains the jiffies based clocksource. +* +* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +************************************************************************/ +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/init.h> + +#include "timekeeping.h" + +/* The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accuratly tick at the + * requested HZ value. It is also not recommended + * for "tick-less" systems. + */ +#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) + +/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the nominator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. NSEC_PER_JIFFY grows as + * HZ shrinks, so values greater than 8 overflow 32bits when + * HZ=100. + */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else +#define JIFFIES_SHIFT 8 +#endif + +static cycle_t jiffies_read(struct clocksource *cs) +{ + return (cycle_t) jiffies; +} + +static struct clocksource clocksource_jiffies = { + .name = "jiffies", + .rating = 1, /* lowest valid rating*/ + .read = jiffies_read, + .mask = 0xffffffff, /*32bits*/ + .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .max_cycles = 10, +}; + +__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); + +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&jiffies_lock); + ret = jiffies_64; + } while (read_seqretry(&jiffies_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + +static int __init init_jiffies_clocksource(void) +{ + return __clocksource_register(&clocksource_jiffies); +} + +core_initcall(init_jiffies_clocksource); + +struct clocksource * __init __weak clocksource_default_clock(void) +{ + return &clocksource_jiffies; +} + +struct clocksource refined_jiffies; + +int register_refined_jiffies(long cycles_per_second) +{ + u64 nsec_per_tick, shift_hz; + long cycles_per_tick; + + + + refined_jiffies = clocksource_jiffies; + refined_jiffies.name = "refined-jiffies"; + refined_jiffies.rating++; + + /* Calc cycles per tick */ + cycles_per_tick = (cycles_per_second + HZ/2)/HZ; + /* shift_hz stores hz<<8 for extra accuracy */ + shift_hz = (u64)cycles_per_second << 8; + shift_hz += cycles_per_tick/2; + do_div(shift_hz, cycles_per_tick); + /* Calculate nsec_per_tick using shift_hz */ + nsec_per_tick = (u64)NSEC_PER_SEC << 8; + nsec_per_tick += (u32)shift_hz/2; + do_div(nsec_per_tick, (u32)shift_hz); + + refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; + + __clocksource_register(&refined_jiffies); + return 0; +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c new file mode 100644 index 000000000..7a6810030 --- /dev/null +++ b/kernel/time/ntp.c @@ -0,0 +1,965 @@ +/* + * NTP state machine interfaces and logic. + * + * This code was mainly moved from kernel/timer.c and kernel/time.c + * Please see those files for relevant copyright info and historical + * changelogs. + */ +#include <linux/capability.h> +#include <linux/clocksource.h> +#include <linux/workqueue.h> +#include <linux/hrtimer.h> +#include <linux/jiffies.h> +#include <linux/math64.h> +#include <linux/timex.h> +#include <linux/time.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/rtc.h> + +#include "ntp_internal.h" + +/* + * NTP timekeeping variables: + * + * Note: All of the NTP state is protected by the timekeeping locks. + */ + + +/* USER_HZ period (usecs): */ +unsigned long tick_usec = TICK_USEC; + +/* SHIFTED_HZ period (nsecs): */ +unsigned long tick_nsec; + +static u64 tick_length; +static u64 tick_length_base; + +#define MAX_TICKADJ 500LL /* usecs */ +#define MAX_TICKADJ_SCALED \ + (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) + +/* + * phase-lock loop variables + */ + +/* + * clock synchronization status + * + * (TIME_ERROR prevents overwriting the CMOS clock) + */ +static int time_state = TIME_OK; + +/* clock status bits: */ +static int time_status = STA_UNSYNC; + +/* time adjustment (nsecs): */ +static s64 time_offset; + +/* pll time constant: */ +static long time_constant = 2; + +/* maximum error (usecs): */ +static long time_maxerror = NTP_PHASE_LIMIT; + +/* estimated error (usecs): */ +static long time_esterror = NTP_PHASE_LIMIT; + +/* frequency offset (scaled nsecs/secs): */ +static s64 time_freq; + +/* time at last adjustment (secs): */ +static long time_reftime; + +static long time_adjust; + +/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ +static s64 ntp_tick_adj; + +#ifdef CONFIG_NTP_PPS + +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available. They establish the engineering parameters of the clock + * discipline loop when controlled by the PPS signal. + */ +#define PPS_VALID 10 /* PPS signal watchdog max (s) */ +#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ +#define PPS_INTMIN 2 /* min freq interval (s) (shift) */ +#define PPS_INTMAX 8 /* max freq interval (s) (shift) */ +#define PPS_INTCOUNT 4 /* number of consecutive good intervals to + increase pps_shift or consecutive bad + intervals to decrease it */ +#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ + +static int pps_valid; /* signal watchdog counter */ +static long pps_tf[3]; /* phase median filter */ +static long pps_jitter; /* current jitter (ns) */ +static struct timespec pps_fbase; /* beginning of the last freq interval */ +static int pps_shift; /* current interval duration (s) (shift) */ +static int pps_intcnt; /* interval counter */ +static s64 pps_freq; /* frequency offset (scaled ns/s) */ +static long pps_stabil; /* current stability (scaled ns/s) */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt; /* calibration intervals */ +static long pps_jitcnt; /* jitter limit exceeded */ +static long pps_stbcnt; /* stability limit exceeded */ +static long pps_errcnt; /* calibration errors */ + + +/* PPS kernel consumer compensates the whole phase error immediately. + * Otherwise, reduce the offset by a fixed factor times the time constant. + */ +static inline s64 ntp_offset_chunk(s64 offset) +{ + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + return offset; + else + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) +{ + /* the PPS calibration interval may end + surprisingly early */ + pps_shift = PPS_INTMIN; + pps_intcnt = 0; +} + +/** + * pps_clear - Clears the PPS state variables + */ +static inline void pps_clear(void) +{ + pps_reset_freq_interval(); + pps_tf[0] = 0; + pps_tf[1] = 0; + pps_tf[2] = 0; + pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; + pps_freq = 0; +} + +/* Decrease pps_valid to indicate that another second has passed since + * the last PPS signal. When it reaches 0, indicate that PPS signal is + * missing. + */ +static inline void pps_dec_valid(void) +{ + if (pps_valid > 0) + pps_valid--; + else { + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + pps_clear(); + } +} + +static inline void pps_set_freq(s64 freq) +{ + pps_freq = freq; +} + +static inline int is_error_status(int status) +{ + return (status & (STA_UNSYNC|STA_CLOCKERR)) + /* PPS signal lost when either PPS time or + * PPS frequency synchronization requested + */ + || ((status & (STA_PPSFREQ|STA_PPSTIME)) + && !(status & STA_PPSSIGNAL)) + /* PPS jitter exceeded when + * PPS time synchronization requested */ + || ((status & (STA_PPSTIME|STA_PPSJITTER)) + == (STA_PPSTIME|STA_PPSJITTER)) + /* PPS wander exceeded or calibration error when + * PPS frequency synchronization requested + */ + || ((status & STA_PPSFREQ) + && (status & (STA_PPSWANDER|STA_PPSERROR))); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->jitter = pps_jitter; + if (!(time_status & STA_NANO)) + txc->jitter /= NSEC_PER_USEC; + txc->shift = pps_shift; + txc->stabil = pps_stabil; + txc->jitcnt = pps_jitcnt; + txc->calcnt = pps_calcnt; + txc->errcnt = pps_errcnt; + txc->stbcnt = pps_stbcnt; +} + +#else /* !CONFIG_NTP_PPS */ + +static inline s64 ntp_offset_chunk(s64 offset) +{ + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) {} +static inline void pps_clear(void) {} +static inline void pps_dec_valid(void) {} +static inline void pps_set_freq(s64 freq) {} + +static inline int is_error_status(int status) +{ + return status & (STA_UNSYNC|STA_CLOCKERR); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; +} + +#endif /* CONFIG_NTP_PPS */ + + +/** + * ntp_synced - Returns 1 if the NTP status is not UNSYNC + * + */ +static inline int ntp_synced(void) +{ + return !(time_status & STA_UNSYNC); +} + + +/* + * NTP methods: + */ + +/* + * Update (tick_length, tick_length_base, tick_nsec), based + * on (tick_usec, ntp_tick_adj, time_freq): + */ +static void ntp_update_frequency(void) +{ + u64 second_length; + u64 new_base; + + second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) + << NTP_SCALE_SHIFT; + + second_length += ntp_tick_adj; + second_length += time_freq; + + tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; + new_base = div_u64(second_length, NTP_INTERVAL_FREQ); + + /* + * Don't wait for the next second_overflow, apply + * the change to the tick length immediately: + */ + tick_length += new_base - tick_length_base; + tick_length_base = new_base; +} + +static inline s64 ntp_update_offset_fll(s64 offset64, long secs) +{ + time_status &= ~STA_MODE; + + if (secs < MINSEC) + return 0; + + if (!(time_status & STA_FLL) && (secs <= MAXSEC)) + return 0; + + time_status |= STA_MODE; + + return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); +} + +static void ntp_update_offset(long offset) +{ + s64 freq_adj; + s64 offset64; + long secs; + + if (!(time_status & STA_PLL)) + return; + + if (!(time_status & STA_NANO)) + offset *= NSEC_PER_USEC; + + /* + * Scale the phase adjustment and + * clamp to the operating range. + */ + offset = min(offset, MAXPHASE); + offset = max(offset, -MAXPHASE); + + /* + * Select how the frequency is to be controlled + * and in which mode (PLL or FLL). + */ + secs = get_seconds() - time_reftime; + if (unlikely(time_status & STA_FREQHOLD)) + secs = 0; + + time_reftime = get_seconds(); + + offset64 = offset; + freq_adj = ntp_update_offset_fll(offset64, secs); + + /* + * Clamp update interval to reduce PLL gain with low + * sampling rate (e.g. intermittent network connection) + * to avoid instability. + */ + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) + secs = 1 << (SHIFT_PLL + 1 + time_constant); + + freq_adj += (offset64 * secs) << + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + + freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); + + time_freq = max(freq_adj, -MAXFREQ_SCALED); + + time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); +} + +/** + * ntp_clear - Clears the NTP state variables + */ +void ntp_clear(void) +{ + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + + ntp_update_frequency(); + + tick_length = tick_length_base; + time_offset = 0; + + /* Clear PPS state variables */ + pps_clear(); +} + + +u64 ntp_tick_length(void) +{ + return tick_length; +} + + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + * Also handles leap second processing, and returns leap offset + */ +int second_overflow(unsigned long secs) +{ + s64 delta; + int leap = 0; + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. + */ + switch (time_state) { + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + case TIME_INS: + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if (secs % 86400 == 0) { + leap = -1; + time_state = TIME_OOP; + printk(KERN_NOTICE + "Clock: inserting leap second 23:59:60 UTC\n"); + } + break; + case TIME_DEL: + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if ((secs + 1) % 86400 == 0) { + leap = 1; + time_state = TIME_WAIT; + printk(KERN_NOTICE + "Clock: deleting leap second 23:59:59 UTC\n"); + } + break; + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + break; + } + + + /* Bump the maxerror field */ + time_maxerror += MAXFREQ / NSEC_PER_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; + } + + /* Compute the phase adjustment for the next second */ + tick_length = tick_length_base; + + delta = ntp_offset_chunk(time_offset); + time_offset -= delta; + tick_length += delta; + + /* Check PPS signal */ + pps_dec_valid(); + + if (!time_adjust) + goto out; + + if (time_adjust > MAX_TICKADJ) { + time_adjust -= MAX_TICKADJ; + tick_length += MAX_TICKADJ_SCALED; + goto out; + } + + if (time_adjust < -MAX_TICKADJ) { + time_adjust += MAX_TICKADJ; + tick_length -= MAX_TICKADJ_SCALED; + goto out; + } + + tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) + << NTP_SCALE_SHIFT; + time_adjust = 0; + +out: + return leap; +} + +#ifdef CONFIG_GENERIC_CMOS_UPDATE +int __weak update_persistent_clock64(struct timespec64 now64) +{ + struct timespec now; + + now = timespec64_to_timespec(now64); + return update_persistent_clock(now); +} +#endif + +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) +static void sync_cmos_clock(struct work_struct *work); + +static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); + +static void sync_cmos_clock(struct work_struct *work) +{ + struct timespec64 now; + struct timespec next; + int fail = 1; + + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be + * called as close as possible to 500 ms before the new second starts. + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. Thus, we adjust... + * We want the clock to be within a couple of ticks from the target. + */ + if (!ntp_synced()) { + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). + */ + return; + } + + getnstimeofday64(&now); + if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { + struct timespec64 adjust = now; + + fail = -ENODEV; + if (persistent_clock_is_local) + adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); +#ifdef CONFIG_GENERIC_CMOS_UPDATE + fail = update_persistent_clock64(adjust); +#endif + +#ifdef CONFIG_RTC_SYSTOHC + if (fail == -ENODEV) + fail = rtc_set_ntp_time(adjust); +#endif + } + + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); + if (next.tv_nsec <= 0) + next.tv_nsec += NSEC_PER_SEC; + + if (!fail || fail == -ENODEV) + next.tv_sec = 659; + else + next.tv_sec = 0; + + if (next.tv_nsec >= NSEC_PER_SEC) { + next.tv_sec++; + next.tv_nsec -= NSEC_PER_SEC; + } + queue_delayed_work(system_power_efficient_wq, + &sync_cmos_work, timespec_to_jiffies(&next)); +} + +void ntp_notify_cmos_timer(void) +{ + queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); +} + +#else +void ntp_notify_cmos_timer(void) { } +#endif + + +/* + * Propagate a new txc->status value into the NTP state: + */ +static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) +{ + if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { + time_state = TIME_OK; + time_status = STA_UNSYNC; + /* restart PPS frequency calibration */ + pps_reset_freq_interval(); + } + + /* + * If we turn on PLL adjustments then reset the + * reference time to current time. + */ + if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) + time_reftime = get_seconds(); + + /* only set allowed bits */ + time_status &= STA_RONLY; + time_status |= txc->status & ~STA_RONLY; +} + + +static inline void process_adjtimex_modes(struct timex *txc, + struct timespec64 *ts, + s32 *time_tai) +{ + if (txc->modes & ADJ_STATUS) + process_adj_status(txc, ts); + + if (txc->modes & ADJ_NANO) + time_status |= STA_NANO; + + if (txc->modes & ADJ_MICRO) + time_status &= ~STA_NANO; + + if (txc->modes & ADJ_FREQUENCY) { + time_freq = txc->freq * PPM_SCALE; + time_freq = min(time_freq, MAXFREQ_SCALED); + time_freq = max(time_freq, -MAXFREQ_SCALED); + /* update pps_freq */ + pps_set_freq(time_freq); + } + + if (txc->modes & ADJ_MAXERROR) + time_maxerror = txc->maxerror; + + if (txc->modes & ADJ_ESTERROR) + time_esterror = txc->esterror; + + if (txc->modes & ADJ_TIMECONST) { + time_constant = txc->constant; + if (!(time_status & STA_NANO)) + time_constant += 4; + time_constant = min(time_constant, (long)MAXTC); + time_constant = max(time_constant, 0l); + } + + if (txc->modes & ADJ_TAI && txc->constant > 0) + *time_tai = txc->constant; + + if (txc->modes & ADJ_OFFSET) + ntp_update_offset(txc->offset); + + if (txc->modes & ADJ_TICK) + tick_usec = txc->tick; + + if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) + ntp_update_frequency(); +} + + + +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex + */ +int ntp_validate_timex(struct timex *txc) +{ + if (txc->modes & ADJ_ADJTIME) { + /* singleshot must not be used with any other mode bits */ + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) + return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + /* + * if the quartz is off by more than 10% then + * something is VERY wrong! + */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + } + + if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) + return -EPERM; + + /* + * Check for potential multiplication overflows that can + * only happen on 64-bit systems: + */ + if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { + if (LLONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LLONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + + return 0; +} + + +/* + * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. + */ +int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) +{ + int result; + + if (txc->modes & ADJ_ADJTIME) { + long save_adjust = time_adjust; + + if (!(txc->modes & ADJ_OFFSET_READONLY)) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + ntp_update_frequency(); + } + txc->offset = save_adjust; + } else { + + /* If there are input parameters, then process them: */ + if (txc->modes) + process_adjtimex_modes(txc, ts, time_tai); + + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; + } + + result = time_state; /* mostly `TIME_OK' */ + /* check for errors */ + if (is_error_status(time_status)) + result = TIME_ERROR; + + txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->maxerror = time_maxerror; + txc->esterror = time_esterror; + txc->status = time_status; + txc->constant = time_constant; + txc->precision = 1; + txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; + txc->tick = tick_usec; + txc->tai = *time_tai; + + /* fill PPS status fields */ + pps_fill_timex(txc); + + txc->time.tv_sec = (time_t)ts->tv_sec; + txc->time.tv_usec = ts->tv_nsec; + if (!(time_status & STA_NANO)) + txc->time.tv_usec /= NSEC_PER_USEC; + + return result; +} + +#ifdef CONFIG_NTP_PPS + +/* actually struct pps_normtime is good old struct timespec, but it is + * semantically different (and it is the reason why it was invented): + * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ +struct pps_normtime { + __kernel_time_t sec; /* seconds */ + long nsec; /* nanoseconds */ +}; + +/* normalize the timestamp so that nsec is in the + ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +{ + struct pps_normtime norm = { + .sec = ts.tv_sec, + .nsec = ts.tv_nsec + }; + + if (norm.nsec > (NSEC_PER_SEC >> 1)) { + norm.nsec -= NSEC_PER_SEC; + norm.sec++; + } + + return norm; +} + +/* get current phase correction and jitter */ +static inline long pps_phase_filter_get(long *jitter) +{ + *jitter = pps_tf[0] - pps_tf[1]; + if (*jitter < 0) + *jitter = -*jitter; + + /* TODO: test various filters */ + return pps_tf[0]; +} + +/* add the sample to the phase filter */ +static inline void pps_phase_filter_add(long err) +{ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = err; +} + +/* decrease frequency calibration interval length. + * It is halved after four consecutive unstable intervals. + */ +static inline void pps_dec_freq_interval(void) +{ + if (--pps_intcnt <= -PPS_INTCOUNT) { + pps_intcnt = -PPS_INTCOUNT; + if (pps_shift > PPS_INTMIN) { + pps_shift--; + pps_intcnt = 0; + } + } +} + +/* increase frequency calibration interval length. + * It is doubled after four consecutive stable intervals. + */ +static inline void pps_inc_freq_interval(void) +{ + if (++pps_intcnt >= PPS_INTCOUNT) { + pps_intcnt = PPS_INTCOUNT; + if (pps_shift < PPS_INTMAX) { + pps_shift++; + pps_intcnt = 0; + } + } +} + +/* update clock frequency based on MONOTONIC_RAW clock PPS signal + * timestamps + * + * At the end of the calibration interval the difference between the + * first and last MONOTONIC_RAW clock timestamps divided by the length + * of the interval becomes the frequency update. If the interval was + * too long, the data are discarded. + * Returns the difference between old and new frequency values. + */ +static long hardpps_update_freq(struct pps_normtime freq_norm) +{ + long delta, delta_mod; + s64 ftemp; + + /* check if the frequency interval was too long */ + if (freq_norm.sec > (2 << pps_shift)) { + time_status |= STA_PPSERROR; + pps_errcnt++; + pps_dec_freq_interval(); + printk_deferred(KERN_ERR + "hardpps: PPSERROR: interval too long - %ld s\n", + freq_norm.sec); + return 0; + } + + /* here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold + * the interval is increased; otherwise it is decreased. + */ + ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, + freq_norm.sec); + delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); + pps_freq = ftemp; + if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { + printk_deferred(KERN_WARNING + "hardpps: PPSWANDER: change=%ld\n", delta); + time_status |= STA_PPSWANDER; + pps_stbcnt++; + pps_dec_freq_interval(); + } else { /* good sample */ + pps_inc_freq_interval(); + } + + /* the stability metric is calculated as the average of recent + * frequency changes, but is used only for performance + * monitoring + */ + delta_mod = delta; + if (delta_mod < 0) + delta_mod = -delta_mod; + pps_stabil += (div_s64(((s64)delta_mod) << + (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + + /* if enabled, the system clock frequency is updated */ + if ((time_status & STA_PPSFREQ) != 0 && + (time_status & STA_FREQHOLD) == 0) { + time_freq = pps_freq; + ntp_update_frequency(); + } + + return delta; +} + +/* correct REALTIME clock phase error against PPS signal */ +static void hardpps_update_phase(long error) +{ + long correction = -error; + long jitter; + + /* add the sample to the median filter */ + pps_phase_filter_add(correction); + correction = pps_phase_filter_get(&jitter); + + /* Nominal jitter is due to PPS signal noise. If it exceeds the + * threshold, the sample is discarded; otherwise, if so enabled, + * the time offset is updated. + */ + if (jitter > (pps_jitter << PPS_POPCORN)) { + printk_deferred(KERN_WARNING + "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); + time_status |= STA_PPSJITTER; + pps_jitcnt++; + } else if (time_status & STA_PPSTIME) { + /* correct the time using the phase offset */ + time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, + NTP_INTERVAL_FREQ); + /* cancel running adjtime() */ + time_adjust = 0; + } + /* update jitter */ + pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; +} + +/* + * __hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS signal arrival in order to + * discipline the CPU clock oscillator to the PPS signal. It takes two + * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former + * is used to correct clock phase error and the latter is used to + * correct the frequency. + * + * This code is based on David Mills's reference nanokernel + * implementation. It was mostly rewritten but keeps the same idea. + */ +void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + struct pps_normtime pts_norm, freq_norm; + + pts_norm = pps_normalize_ts(*phase_ts); + + /* clear the error bits, they will be set again if needed */ + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + + /* indicate signal presence */ + time_status |= STA_PPSSIGNAL; + pps_valid = PPS_VALID; + + /* when called for the first time, + * just start the frequency interval */ + if (unlikely(pps_fbase.tv_sec == 0)) { + pps_fbase = *raw_ts; + return; + } + + /* ok, now we have a base for frequency calculation */ + freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); + + /* check that the signal is in the range + * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ + if ((freq_norm.sec == 0) || + (freq_norm.nsec > MAXFREQ * freq_norm.sec) || + (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { + time_status |= STA_PPSJITTER; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); + return; + } + + /* signal is ok */ + + /* check if the current frequency interval is finished */ + if (freq_norm.sec >= (1 << pps_shift)) { + pps_calcnt++; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + hardpps_update_freq(freq_norm); + } + + hardpps_update_phase(pts_norm.nsec); + +} +#endif /* CONFIG_NTP_PPS */ + +static int __init ntp_tick_adj_setup(char *str) +{ + int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); + + if (rc) + return rc; + ntp_tick_adj <<= NTP_SCALE_SHIFT; + + return 1; +} + +__setup("ntp_tick_adj=", ntp_tick_adj_setup); + +void __init ntp_init(void) +{ + ntp_clear(); +} diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 000000000..bbd102ad9 --- /dev/null +++ b/kernel/time/ntp_internal.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_NTP_INTERNAL_H +#define _LINUX_NTP_INTERNAL_H + +extern void ntp_init(void); +extern void ntp_clear(void); +/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ +extern u64 ntp_tick_length(void); +extern int second_overflow(unsigned long secs); +extern int ntp_validate_timex(struct timex *); +extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); +extern void __hardpps(const struct timespec *, const struct timespec *); +#endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 000000000..ce033c7aa --- /dev/null +++ b/kernel/time/posix-clock.c @@ -0,0 +1,446 @@ +/* + * posix-clock.c - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/device.h> +#include <linux/export.h> +#include <linux/file.h> +#include <linux/posix-clock.h> +#include <linux/slab.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> + +static void delete_clock(struct kref *kref); + +/* + * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. + */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + + down_read(&clk->rwsem); + + if (!clk->zombie) + return clk; + + up_read(&clk->rwsem); + + return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ + up_read(&clk->rwsem); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -EINVAL; + + if (!clk) + return -ENODEV; + + if (clk->ops.read) + err = clk->ops.read(clk, fp->f_flags, buf, count); + + put_posix_clock(clk); + + return err; +} + +static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +{ + struct posix_clock *clk = get_posix_clock(fp); + int result = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.poll) + result = clk->ops.poll(clk, fp, wait); + + put_posix_clock(clk); + + return result; +} + +static int posix_clock_fasync(int fd, struct file *fp, int on) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.fasync) + err = clk->ops.fasync(clk, fd, fp, on); + + put_posix_clock(clk); + + return err; +} + +static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENODEV; + + if (!clk) + return -ENODEV; + + if (clk->ops.mmap) + err = clk->ops.mmap(clk, vma); + + put_posix_clock(clk); + + return err; +} + +static long posix_clock_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ + int err; + struct posix_clock *clk = + container_of(inode->i_cdev, struct posix_clock, cdev); + + down_read(&clk->rwsem); + + if (clk->zombie) { + err = -ENODEV; + goto out; + } + if (clk->ops.open) + err = clk->ops.open(clk, fp->f_mode); + else + err = 0; + + if (!err) { + kref_get(&clk->kref); + fp->private_data = clk; + } +out: + up_read(&clk->rwsem); + return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + int err = 0; + + if (clk->ops.release) + err = clk->ops.release(clk); + + kref_put(&clk->kref, delete_clock); + + fp->private_data = NULL; + + return err; +} + +static const struct file_operations posix_clock_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = posix_clock_read, + .poll = posix_clock_poll, + .unlocked_ioctl = posix_clock_ioctl, + .open = posix_clock_open, + .release = posix_clock_release, + .fasync = posix_clock_fasync, + .mmap = posix_clock_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, dev_t devid) +{ + int err; + + kref_init(&clk->kref); + init_rwsem(&clk->rwsem); + + cdev_init(&clk->cdev, &posix_clock_file_operations); + clk->cdev.owner = clk->ops.owner; + err = cdev_add(&clk->cdev, devid, 1); + + return err; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +static void delete_clock(struct kref *kref) +{ + struct posix_clock *clk = container_of(kref, struct posix_clock, kref); + + if (clk->release) + clk->release(clk); +} + +void posix_clock_unregister(struct posix_clock *clk) +{ + cdev_del(&clk->cdev); + + down_write(&clk->rwsem); + clk->zombie = true; + up_write(&clk->rwsem); + + kref_put(&clk->kref, delete_clock); +} +EXPORT_SYMBOL_GPL(posix_clock_unregister); + +struct posix_clock_desc { + struct file *fp; + struct posix_clock *clk; +}; + +static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) +{ + struct file *fp = fget(CLOCKID_TO_FD(id)); + int err = -EINVAL; + + if (!fp) + return err; + + if (fp->f_op->open != posix_clock_open || !fp->private_data) + goto out; + + cd->fp = fp; + cd->clk = get_posix_clock(fp); + + err = cd->clk ? 0 : -ENODEV; +out: + if (err) + fput(fp); + return err; +} + +static void put_clock_desc(struct posix_clock_desc *cd) +{ + put_posix_clock(cd->clk); + fput(cd->fp); +} + +static int pc_clock_adjtime(clockid_t id, struct timex *tx) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_adjtime) + err = cd.clk->ops.clock_adjtime(cd.clk, tx); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_gettime(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_gettime) + err = cd.clk->ops.clock_gettime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_getres(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_getres) + err = cd.clk->ops.clock_getres(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_settime(clockid_t id, const struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + + if (cd.clk->ops.clock_settime) + err = cd.clk->ops.clock_settime(cd.clk, ts); + else + err = -EOPNOTSUPP; +out: + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_create(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_create) + err = cd.clk->ops.timer_create(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_delete(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_delete) + err = cd.clk->ops.timer_delete(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + + if (get_clock_desc(id, &cd)) + return; + + if (cd.clk->ops.timer_gettime) + cd.clk->ops.timer_gettime(cd.clk, kit, ts); + + put_clock_desc(&cd); +} + +static int pc_timer_settime(struct k_itimer *kit, int flags, + struct itimerspec *ts, struct itimerspec *old) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_settime) + err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +struct k_clock clock_posix_dynamic = { + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, + .timer_create = pc_timer_create, + .timer_set = pc_timer_settime, + .timer_del = pc_timer_delete, + .timer_get = pc_timer_gettime, +}; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c new file mode 100644 index 000000000..0ac829b48 --- /dev/null +++ b/kernel/time/posix-cpu-timers.c @@ -0,0 +1,1475 @@ +/* + * Implement CPU time clocks for the POSIX clock interface. + */ + +#include <linux/sched.h> +#include <linux/posix-timers.h> +#include <linux/errno.h> +#include <linux/math64.h> +#include <asm/uaccess.h> +#include <linux/kernel_stat.h> +#include <trace/events/timer.h> +#include <linux/random.h> +#include <linux/tick.h> +#include <linux/workqueue.h> + +/* + * Called after updating RLIMIT_CPU to run cpu timer and update + * tsk->signal->cputime_expires expiration cache if necessary. Needs + * siglock protection since other code may update expiration cache as + * well. + */ +void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) +{ + cputime_t cputime = secs_to_cputime(rlim_new); + + spin_lock_irq(&task->sighand->siglock); + set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(&task->sighand->siglock); +} + +static int check_clock(const clockid_t which_clock) +{ + int error = 0; + struct task_struct *p; + const pid_t pid = CPUCLOCK_PID(which_clock); + + if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + if (pid == 0) + return 0; + + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? + same_thread_group(p, current) : has_group_leader_pid(p))) { + error = -EINVAL; + } + rcu_read_unlock(); + + return error; +} + +static inline unsigned long long +timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) +{ + unsigned long long ret; + + ret = 0; /* high half always zero when .cpu used */ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + } else { + ret = cputime_to_expires(timespec_to_cputime(tp)); + } + return ret; +} + +static void sample_to_timespec(const clockid_t which_clock, + unsigned long long expires, + struct timespec *tp) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) + *tp = ns_to_timespec(expires); + else + cputime_to_timespec((__force cputime_t)expires, tp); +} + +/* + * Update expiry time from increment, and increase overrun count, + * given the current clock sample. + */ +static void bump_cpu_timer(struct k_itimer *timer, + unsigned long long now) +{ + int i; + unsigned long long delta, incr; + + if (timer->it.cpu.incr == 0) + return; + + if (now < timer->it.cpu.expires) + return; + + incr = timer->it.cpu.incr; + delta = now + incr - timer->it.cpu.expires; + + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + + timer->it.cpu.expires += incr; + timer->it_overrun += 1 << i; + delta -= incr; + } +} + +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) + return 1; + return 0; +} + +static inline unsigned long long prof_ticks(struct task_struct *p) +{ + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + + return cputime_to_expires(utime + stime); +} +static inline unsigned long long virt_ticks(struct task_struct *p) +{ + cputime_t utime; + + task_cputime(p, &utime, NULL); + + return cputime_to_expires(utime); +} + +static int +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +{ + int error = check_clock(which_clock); + if (!error) { + tp->tv_sec = 0; + tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + /* + * If sched_clock is using a cycle counter, we + * don't have any idea of its true resolution + * exported, but it is much more than 1s/HZ. + */ + tp->tv_nsec = 1; + } + } + return error; +} + +static int +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +{ + /* + * You can never reset a CPU clock, but we check for other errors + * in the call before failing with EPERM. + */ + int error = check_clock(which_clock); + if (error == 0) { + error = -EPERM; + } + return error; +} + + +/* + * Sample a per-thread clock for the given task. + */ +static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, + unsigned long long *sample) +{ + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + *sample = prof_ticks(p); + break; + case CPUCLOCK_VIRT: + *sample = virt_ticks(p); + break; + case CPUCLOCK_SCHED: + *sample = task_sched_runtime(p); + break; + } + return 0; +} + +static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) +{ + if (b->utime > a->utime) + a->utime = b->utime; + + if (b->stime > a->stime) + a->stime = b->stime; + + if (b->sum_exec_runtime > a->sum_exec_runtime) + a->sum_exec_runtime = b->sum_exec_runtime; +} + +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct task_cputime sum; + unsigned long flags; + + if (!cputimer->running) { + /* + * The POSIX timer interface allows for absolute time expiry + * values through the TIMER_ABSTIME flag, therefore we have + * to synchronize the timer to the clock every time we start + * it. + */ + thread_group_cputime(tsk, &sum); + raw_spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 1; + update_gt_cputime(&cputimer->cputime, &sum); + } else + raw_spin_lock_irqsave(&cputimer->lock, flags); + *times = cputimer->cputime; + raw_spin_unlock_irqrestore(&cputimer->lock, flags); +} + +/* + * Sample a process (thread group) clock for the given group_leader task. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. + */ +static int cpu_clock_sample_group(const clockid_t which_clock, + struct task_struct *p, + unsigned long long *sample) +{ + struct task_cputime cputime; + + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + thread_group_cputime(p, &cputime); + *sample = cputime_to_expires(cputime.utime + cputime.stime); + break; + case CPUCLOCK_VIRT: + thread_group_cputime(p, &cputime); + *sample = cputime_to_expires(cputime.utime); + break; + case CPUCLOCK_SCHED: + thread_group_cputime(p, &cputime); + *sample = cputime.sum_exec_runtime; + break; + } + return 0; +} + +static int posix_cpu_clock_get_task(struct task_struct *tsk, + const clockid_t which_clock, + struct timespec *tp) +{ + int err = -EINVAL; + unsigned long long rtn; + + if (CPUCLOCK_PERTHREAD(which_clock)) { + if (same_thread_group(tsk, current)) + err = cpu_clock_sample(which_clock, tsk, &rtn); + } else { + if (tsk == current || thread_group_leader(tsk)) + err = cpu_clock_sample_group(which_clock, tsk, &rtn); + } + + if (!err) + sample_to_timespec(which_clock, rtn, tp); + + return err; +} + + +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +{ + const pid_t pid = CPUCLOCK_PID(which_clock); + int err = -EINVAL; + + if (pid == 0) { + /* + * Special case constant value for our own clocks. + * We don't have to do any lookup to find ourselves. + */ + err = posix_cpu_clock_get_task(current, which_clock, tp); + } else { + /* + * Find the given PID, and validate that the caller + * should be able to see it. + */ + struct task_struct *p; + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (p) + err = posix_cpu_clock_get_task(p, which_clock, tp); + rcu_read_unlock(); + } + + return err; +} + + +/* + * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. + * This is called from sys_timer_create() and do_cpu_nanosleep() with the + * new timer already all-zeros initialized. + */ +static int posix_cpu_timer_create(struct k_itimer *new_timer) +{ + int ret = 0; + const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); + struct task_struct *p; + + if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + INIT_LIST_HEAD(&new_timer->it.cpu.entry); + + rcu_read_lock(); + if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { + if (pid == 0) { + p = current; + } else { + p = find_task_by_vpid(pid); + if (p && !same_thread_group(p, current)) + p = NULL; + } + } else { + if (pid == 0) { + p = current->group_leader; + } else { + p = find_task_by_vpid(pid); + if (p && !has_group_leader_pid(p)) + p = NULL; + } + } + new_timer->it.cpu.task = p; + if (p) { + get_task_struct(p); + } else { + ret = -EINVAL; + } + rcu_read_unlock(); + + return ret; +} + +/* + * Clean up a CPU-clock timer that is about to be destroyed. + * This is called from timer deletion with the timer already locked. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ +static int posix_cpu_timer_del(struct k_itimer *timer) +{ + int ret = 0; + unsigned long flags; + struct sighand_struct *sighand; + struct task_struct *p = timer->it.cpu.task; + + WARN_ON_ONCE(p == NULL); + + /* + * Protect against sighand release/switch in exit/exec and process/ + * thread timer list entry concurrent read/writes. + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * We raced with the reaping of the task. + * The deletion should have cleared us off the list. + */ + WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); + } else { + if (timer->it.cpu.firing) + ret = TIMER_RETRY; + else + list_del(&timer->it.cpu.entry); + + unlock_task_sighand(p, &flags); + } + + if (!ret) + put_task_struct(p); + + return ret; +} + +static void cleanup_timers_list(struct list_head *head) +{ + struct cpu_timer_list *timer, *next; + + list_for_each_entry_safe(timer, next, head, entry) + list_del_init(&timer->entry); +} + +/* + * Clean out CPU timers still ticking when a thread exited. The task + * pointer is cleared, and the expiry time is replaced with the residual + * time for later timer_gettime calls to return. + * This must be called with the siglock held. + */ +static void cleanup_timers(struct list_head *head) +{ + cleanup_timers_list(head); + cleanup_timers_list(++head); + cleanup_timers_list(++head); +} + +/* + * These are both called with the siglock held, when the current thread + * is being reaped. When the final (leader) thread in the group is reaped, + * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit. + */ +void posix_cpu_timers_exit(struct task_struct *tsk) +{ + add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + cleanup_timers(tsk->cpu_timers); + +} +void posix_cpu_timers_exit_group(struct task_struct *tsk) +{ + cleanup_timers(tsk->signal->cpu_timers); +} + +static inline int expires_gt(cputime_t expires, cputime_t new_exp) +{ + return expires == 0 || expires > new_exp; +} + +/* + * Insert the timer on the appropriate list before any timers that + * expire later. This must be called with the sighand lock held. + */ +static void arm_timer(struct k_itimer *timer) +{ + struct task_struct *p = timer->it.cpu.task; + struct list_head *head, *listpos; + struct task_cputime *cputime_expires; + struct cpu_timer_list *const nt = &timer->it.cpu; + struct cpu_timer_list *next; + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + head = p->cpu_timers; + cputime_expires = &p->cputime_expires; + } else { + head = p->signal->cpu_timers; + cputime_expires = &p->signal->cputime_expires; + } + head += CPUCLOCK_WHICH(timer->it_clock); + + listpos = head; + list_for_each_entry(next, head, entry) { + if (nt->expires < next->expires) + break; + listpos = &next->entry; + } + list_add(&nt->entry, listpos); + + if (listpos == head) { + unsigned long long exp = nt->expires; + + /* + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. + */ + + switch (CPUCLOCK_WHICH(timer->it_clock)) { + case CPUCLOCK_PROF: + if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) + cputime_expires->prof_exp = expires_to_cputime(exp); + break; + case CPUCLOCK_VIRT: + if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) + cputime_expires->virt_exp = expires_to_cputime(exp); + break; + case CPUCLOCK_SCHED: + if (cputime_expires->sched_exp == 0 || + cputime_expires->sched_exp > exp) + cputime_expires->sched_exp = exp; + break; + } + } +} + +/* + * The timer is locked, fire it and arrange for its reload. + */ +static void cpu_timer_fire(struct k_itimer *timer) +{ + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + /* + * User don't want any signal. + */ + timer->it.cpu.expires = 0; + } else if (unlikely(timer->sigq == NULL)) { + /* + * This a special case for clock_nanosleep, + * not a normal timer from sys_timer_create. + */ + wake_up_process(timer->it_process); + timer->it.cpu.expires = 0; + } else if (timer->it.cpu.incr == 0) { + /* + * One-shot timer. Clear it as soon as it's fired. + */ + posix_timer_event(timer, 0); + timer->it.cpu.expires = 0; + } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { + /* + * The signal did not get queued because the signal + * was ignored, so we won't get any callback to + * reload the timer. But we need to keep it + * ticking in case the signal is deliverable next time. + */ + posix_cpu_timer_schedule(timer); + } +} + +/* + * Sample a process (thread group) timer for the given group_leader task. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. + */ +static int cpu_timer_sample_group(const clockid_t which_clock, + struct task_struct *p, + unsigned long long *sample) +{ + struct task_cputime cputime; + + thread_group_cputimer(p, &cputime); + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + *sample = cputime_to_expires(cputime.utime + cputime.stime); + break; + case CPUCLOCK_VIRT: + *sample = cputime_to_expires(cputime.utime); + break; + case CPUCLOCK_SCHED: + *sample = cputime.sum_exec_runtime; + break; + } + return 0; +} + +#ifdef CONFIG_NO_HZ_FULL +static void nohz_kick_work_fn(struct work_struct *work) +{ + tick_nohz_full_kick_all(); +} + +static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); + +/* + * We need the IPIs to be sent from sane process context. + * The posix cpu timers are always set with irqs disabled. + */ +static void posix_cpu_timer_kick_nohz(void) +{ + if (context_tracking_is_enabled()) + schedule_work(&nohz_kick_work); +} + +bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) +{ + if (!task_cputime_zero(&tsk->cputime_expires)) + return false; + + if (tsk->signal->cputimer.running) + return false; + + return true; +} +#else +static inline void posix_cpu_timer_kick_nohz(void) { } +#endif + +/* + * Guts of sys_timer_settime for CPU timers. + * This is called with the timer locked and interrupts disabled. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ +static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, + struct itimerspec *new, struct itimerspec *old) +{ + unsigned long flags; + struct sighand_struct *sighand; + struct task_struct *p = timer->it.cpu.task; + unsigned long long old_expires, new_expires, old_incr, val; + int ret; + + WARN_ON_ONCE(p == NULL); + + new_expires = timespec_to_sample(timer->it_clock, &new->it_value); + + /* + * Protect against sighand release/switch in exit/exec and p->cpu_timers + * and p->signal->cpu_timers read/write in arm_timer() + */ + sighand = lock_task_sighand(p, &flags); + /* + * If p has just been reaped, we can no + * longer get any information about it at all. + */ + if (unlikely(sighand == NULL)) { + return -ESRCH; + } + + /* + * Disarm any old timer after extracting its expiry time. + */ + WARN_ON_ONCE(!irqs_disabled()); + + ret = 0; + old_incr = timer->it.cpu.incr; + old_expires = timer->it.cpu.expires; + if (unlikely(timer->it.cpu.firing)) { + timer->it.cpu.firing = -1; + ret = TIMER_RETRY; + } else + list_del_init(&timer->it.cpu.entry); + + /* + * We need to sample the current value to convert the new + * value from to relative and absolute, and to convert the + * old value from absolute to relative. To set a process + * timer, we need a sample to balance the thread expiry + * times (in arm_timer). With an absolute time, we must + * check if it's already passed. In short, we need a sample. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &val); + } else { + cpu_timer_sample_group(timer->it_clock, p, &val); + } + + if (old) { + if (old_expires == 0) { + old->it_value.tv_sec = 0; + old->it_value.tv_nsec = 0; + } else { + /* + * Update the timer in case it has + * overrun already. If it has, + * we'll report it as having overrun + * and with the next reloaded timer + * already ticking, though we are + * swallowing that pending + * notification here to install the + * new setting. + */ + bump_cpu_timer(timer, val); + if (val < timer->it.cpu.expires) { + old_expires = timer->it.cpu.expires - val; + sample_to_timespec(timer->it_clock, + old_expires, + &old->it_value); + } else { + old->it_value.tv_nsec = 1; + old->it_value.tv_sec = 0; + } + } + } + + if (unlikely(ret)) { + /* + * We are colliding with the timer actually firing. + * Punt after filling in the timer's old value, and + * disable this firing since we are already reporting + * it as an overrun (thanks to bump_cpu_timer above). + */ + unlock_task_sighand(p, &flags); + goto out; + } + + if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { + new_expires += val; + } + + /* + * Install the new expiry time (or zero). + * For a timer with no notification action, we don't actually + * arm the timer (we'll just fake it for timer_gettime). + */ + timer->it.cpu.expires = new_expires; + if (new_expires != 0 && val < new_expires) { + arm_timer(timer); + } + + unlock_task_sighand(p, &flags); + /* + * Install the new reload setting, and + * set up the signal and overrun bookkeeping. + */ + timer->it.cpu.incr = timespec_to_sample(timer->it_clock, + &new->it_interval); + + /* + * This acts as a modification timestamp for the timer, + * so any automatic reload attempt will punt on seeing + * that we have reset the timer manually. + */ + timer->it_requeue_pending = (timer->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timer->it_overrun_last = 0; + timer->it_overrun = -1; + + if (new_expires != 0 && !(val < new_expires)) { + /* + * The designated time already passed, so we notify + * immediately, even if the thread never runs to + * accumulate more time on this clock. + */ + cpu_timer_fire(timer); + } + + ret = 0; + out: + if (old) { + sample_to_timespec(timer->it_clock, + old_incr, &old->it_interval); + } + if (!ret) + posix_cpu_timer_kick_nohz(); + return ret; +} + +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +{ + unsigned long long now; + struct task_struct *p = timer->it.cpu.task; + + WARN_ON_ONCE(p == NULL); + + /* + * Easy part: convert the reload time. + */ + sample_to_timespec(timer->it_clock, + timer->it.cpu.incr, &itp->it_interval); + + if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ + itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; + return; + } + + /* + * Sample the clock to take the difference with the expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + } else { + struct sighand_struct *sighand; + unsigned long flags; + + /* + * Protect against sighand release/switch in exit/exec and + * also make timer sampling safe if it ends up calling + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + * Call the timer disarmed, nothing else to do. + */ + timer->it.cpu.expires = 0; + sample_to_timespec(timer->it_clock, timer->it.cpu.expires, + &itp->it_value); + } else { + cpu_timer_sample_group(timer->it_clock, p, &now); + unlock_task_sighand(p, &flags); + } + } + + if (now < timer->it.cpu.expires) { + sample_to_timespec(timer->it_clock, + timer->it.cpu.expires - now, + &itp->it_value); + } else { + /* + * The timer should have expired already, but the firing + * hasn't taken place yet. Say it's just about to expire. + */ + itp->it_value.tv_nsec = 1; + itp->it_value.tv_sec = 0; + } +} + +static unsigned long long +check_timers_list(struct list_head *timers, + struct list_head *firing, + unsigned long long curr) +{ + int maxfire = 20; + + while (!list_empty(timers)) { + struct cpu_timer_list *t; + + t = list_first_entry(timers, struct cpu_timer_list, entry); + + if (!--maxfire || curr < t->expires) + return t->expires; + + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + return 0; +} + +/* + * Check for any per-thread CPU timers that have fired and move them off + * the tsk->cpu_timers[N] list onto the firing list. Here we update the + * tsk->it_*_expires values to reflect the remaining thread CPU timers. + */ +static void check_thread_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct list_head *timers = tsk->cpu_timers; + struct signal_struct *const sig = tsk->signal; + struct task_cputime *tsk_expires = &tsk->cputime_expires; + unsigned long long expires; + unsigned long soft; + + expires = check_timers_list(timers, firing, prof_ticks(tsk)); + tsk_expires->prof_exp = expires_to_cputime(expires); + + expires = check_timers_list(++timers, firing, virt_ticks(tsk)); + tsk_expires->virt_exp = expires_to_cputime(expires); + + tsk_expires->sched_exp = check_timers_list(++timers, firing, + tsk_seruntime(tsk)); + + /* + * Check for the special case thread timers. + */ + soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); + if (soft != RLIM_INFINITY) { + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); + + if (hard != RLIM_INFINITY && + tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. + */ + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } + if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + /* + * At the soft limit, send a SIGXCPU every second. + */ + if (soft < hard) { + soft += USEC_PER_SEC; + sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; + } + printk(KERN_INFO + "RT Watchdog Timeout: %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } +} + +static void stop_process_timers(struct signal_struct *sig) +{ + struct thread_group_cputimer *cputimer = &sig->cputimer; + unsigned long flags; + + raw_spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 0; + raw_spin_unlock_irqrestore(&cputimer->lock, flags); +} + +static u32 onecputick; + +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + unsigned long long *expires, + unsigned long long cur_time, int signo) +{ + if (!it->expires) + return; + + if (cur_time >= it->expires) { + if (it->incr) { + it->expires += it->incr; + it->error += it->incr_error; + if (it->error >= onecputick) { + it->expires -= cputime_one_jiffy; + it->error -= onecputick; + } + } else { + it->expires = 0; + } + + trace_itimer_expire(signo == SIGPROF ? + ITIMER_PROF : ITIMER_VIRTUAL, + tsk->signal->leader_pid, cur_time); + __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + } + + if (it->expires && (!*expires || it->expires < *expires)) { + *expires = it->expires; + } +} + +/* + * Check for any per-thread CPU timers that have fired and move them + * off the tsk->*_timers list onto the firing list. Per-thread timers + * have already been taken off. + */ +static void check_process_timers(struct task_struct *tsk, + struct list_head *firing) +{ + struct signal_struct *const sig = tsk->signal; + unsigned long long utime, ptime, virt_expires, prof_expires; + unsigned long long sum_sched_runtime, sched_expires; + struct list_head *timers = sig->cpu_timers; + struct task_cputime cputime; + unsigned long soft; + + /* + * Collect the current process totals. + */ + thread_group_cputimer(tsk, &cputime); + utime = cputime_to_expires(cputime.utime); + ptime = utime + cputime_to_expires(cputime.stime); + sum_sched_runtime = cputime.sum_exec_runtime; + + prof_expires = check_timers_list(timers, firing, ptime); + virt_expires = check_timers_list(++timers, firing, utime); + sched_expires = check_timers_list(++timers, firing, sum_sched_runtime); + + /* + * Check for the special case process timers. + */ + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, + SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, + SIGVTALRM); + soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + if (soft != RLIM_INFINITY) { + unsigned long psecs = cputime_to_secs(ptime); + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); + cputime_t x; + if (psecs >= hard) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. + */ + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } + if (psecs >= soft) { + /* + * At the soft limit, send a SIGXCPU every second. + */ + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + if (soft < hard) { + soft++; + sig->rlim[RLIMIT_CPU].rlim_cur = soft; + } + } + x = secs_to_cputime(soft); + if (!prof_expires || x < prof_expires) { + prof_expires = x; + } + } + + sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); + sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); + sig->cputime_expires.sched_exp = sched_expires; + if (task_cputime_zero(&sig->cputime_expires)) + stop_process_timers(sig); +} + +/* + * This is called from the signal code (via do_schedule_next_timer) + * when the last timer signal was delivered and we have to reload the timer. + */ +void posix_cpu_timer_schedule(struct k_itimer *timer) +{ + struct sighand_struct *sighand; + unsigned long flags; + struct task_struct *p = timer->it.cpu.task; + unsigned long long now; + + WARN_ON_ONCE(p == NULL); + + /* + * Fetch the current sample and update the timer's expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + if (unlikely(p->exit_state)) + goto out; + + /* Protect timer list r/w in arm_timer() */ + sighand = lock_task_sighand(p, &flags); + if (!sighand) + goto out; + } else { + /* + * Protect arm_timer() and timer sampling in case of call to + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + */ + timer->it.cpu.expires = 0; + goto out; + } else if (unlikely(p->exit_state) && thread_group_empty(p)) { + unlock_task_sighand(p, &flags); + /* Optimizations: if the process is dying, no need to rearm */ + goto out; + } + cpu_timer_sample_group(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + /* Leave the sighand locked for the call below. */ + } + + /* + * Now re-arm for the new expiry time. + */ + WARN_ON_ONCE(!irqs_disabled()); + arm_timer(timer); + unlock_task_sighand(p, &flags); + + /* Kick full dynticks CPUs in case they need to tick on the new timer */ + posix_cpu_timer_kick_nohz(); +out: + timer->it_overrun_last = timer->it_overrun; + timer->it_overrun = -1; + ++timer->it_requeue_pending; +} + +/** + * task_cputime_expired - Compare two task_cputime entities. + * + * @sample: The task_cputime structure to be checked for expiration. + * @expires: Expiration times, against which @sample will be checked. + * + * Checks @sample against @expires to see if any field of @sample has expired. + * Returns true if any field of the former is greater than the corresponding + * field of the latter if the latter field is set. Otherwise returns false. + */ +static inline int task_cputime_expired(const struct task_cputime *sample, + const struct task_cputime *expires) +{ + if (expires->utime && sample->utime >= expires->utime) + return 1; + if (expires->stime && sample->utime + sample->stime >= expires->stime) + return 1; + if (expires->sum_exec_runtime != 0 && + sample->sum_exec_runtime >= expires->sum_exec_runtime) + return 1; + return 0; +} + +/** + * fastpath_timer_check - POSIX CPU timers fast path. + * + * @tsk: The task (thread) being checked. + * + * Check the task and thread group timers. If both are zero (there are no + * timers set) return false. Otherwise snapshot the task and thread group + * timers and compare them with the corresponding expiration times. Return + * true if a timer has expired, else return false. + */ +static inline int fastpath_timer_check(struct task_struct *tsk) +{ + struct signal_struct *sig; + cputime_t utime, stime; + + task_cputime(tsk, &utime, &stime); + + if (!task_cputime_zero(&tsk->cputime_expires)) { + struct task_cputime task_sample = { + .utime = utime, + .stime = stime, + .sum_exec_runtime = tsk_seruntime(tsk) + }; + + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } + + sig = tsk->signal; + if (sig->cputimer.running) { + struct task_cputime group_sample; + + raw_spin_lock(&sig->cputimer.lock); + group_sample = sig->cputimer.cputime; + raw_spin_unlock(&sig->cputimer.lock); + + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + return 1; + } + + return 0; +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(struct task_struct *tsk) +{ + LIST_HEAD(firing); + struct k_itimer *timer, *next; + unsigned long flags; + + WARN_ON_ONCE(!irqs_disabled()); + + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) + return; + + if (!lock_task_sighand(tsk, &flags)) + return; + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. + */ + check_thread_timers(tsk, &firing); + /* + * If there are any active process wide timers (POSIX 1.b, itimers, + * RLIMIT_CPU) cputimer must be running. + */ + if (tsk->signal->cputimer.running) + check_process_timers(tsk, &firing); + + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + unlock_task_sighand(tsk, &flags); + + /* + * Now that all the timers on our list have the firing flag, + * no one will touch their list entries but us. We'll take + * each timer's lock before clearing its firing flag, so no + * timer call will interfere. + */ + list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { + int cpu_firing; + + spin_lock(&timer->it_lock); + list_del_init(&timer->it.cpu.entry); + cpu_firing = timer->it.cpu.firing; + timer->it.cpu.firing = 0; + /* + * The firing flag is -1 if we collided with a reset + * of the timer, which already reported this + * almost-firing as an overrun. So don't generate an event. + */ + if (likely(cpu_firing >= 0)) + cpu_timer_fire(timer); + spin_unlock(&timer->it_lock); + } +} + +/* + * Set one of the process-wide special case CPU timers or RLIMIT_CPU. + * The tsk->sighand->siglock must be held by the caller. + */ +void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, + cputime_t *newval, cputime_t *oldval) +{ + unsigned long long now; + + WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); + cpu_timer_sample_group(clock_idx, tsk, &now); + + if (oldval) { + /* + * We are setting itimer. The *oldval is absolute and we update + * it to be relative, *newval argument is relative and we update + * it to be absolute. + */ + if (*oldval) { + if (*oldval <= now) { + /* Just about to fire. */ + *oldval = cputime_one_jiffy; + } else { + *oldval -= now; + } + } + + if (!*newval) + goto out; + *newval += now; + } + + /* + * Update expiration cache if we are the earliest timer, or eventually + * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. + */ + switch (clock_idx) { + case CPUCLOCK_PROF: + if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) + tsk->signal->cputime_expires.prof_exp = *newval; + break; + case CPUCLOCK_VIRT: + if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) + tsk->signal->cputime_expires.virt_exp = *newval; + break; + } +out: + posix_cpu_timer_kick_nohz(); +} + +static int do_cpu_nanosleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct itimerspec *it) +{ + struct k_itimer timer; + int error; + + /* + * Set up a temporary timer and then wait for it to go off. + */ + memset(&timer, 0, sizeof timer); + spin_lock_init(&timer.it_lock); + timer.it_clock = which_clock; + timer.it_overrun = -1; + error = posix_cpu_timer_create(&timer); + timer.it_process = current; + if (!error) { + static struct itimerspec zero_it; + + memset(it, 0, sizeof *it); + it->it_value = *rqtp; + + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_set(&timer, flags, it, NULL); + if (error) { + spin_unlock_irq(&timer.it_lock); + return error; + } + + while (!signal_pending(current)) { + if (timer.it.cpu.expires == 0) { + /* + * Our timer fired and was reset, below + * deletion can not fail. + */ + posix_cpu_timer_del(&timer); + spin_unlock_irq(&timer.it_lock); + return 0; + } + + /* + * Block until cpu_timer_fire (or a signal) wakes us. + */ + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&timer.it_lock); + schedule(); + spin_lock_irq(&timer.it_lock); + } + + /* + * We were interrupted by a signal. + */ + sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); + error = posix_cpu_timer_set(&timer, 0, &zero_it, it); + if (!error) { + /* + * Timer is now unarmed, deletion can not fail. + */ + posix_cpu_timer_del(&timer); + } + spin_unlock_irq(&timer.it_lock); + + while (error == TIMER_RETRY) { + /* + * We need to handle case when timer was or is in the + * middle of firing. In other cases we already freed + * resources. + */ + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_del(&timer); + spin_unlock_irq(&timer.it_lock); + } + + if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { + /* + * It actually did fire already. + */ + return 0; + } + + error = -ERESTART_RESTARTBLOCK; + } + + return error; +} + +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + +static int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) +{ + struct restart_block *restart_block = ¤t->restart_block; + struct itimerspec it; + int error; + + /* + * Diagnose required errors first. + */ + if (CPUCLOCK_PERTHREAD(which_clock) && + (CPUCLOCK_PID(which_clock) == 0 || + CPUCLOCK_PID(which_clock) == current->pid)) + return -EINVAL; + + error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + + if (flags & TIMER_ABSTIME) + return -ERESTARTNOHAND; + /* + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + return -EFAULT; + + restart_block->fn = posix_cpu_nsleep_restart; + restart_block->nanosleep.clockid = which_clock; + restart_block->nanosleep.rmtp = rmtp; + restart_block->nanosleep.expires = timespec_to_ns(rqtp); + } + return error; +} + +static long posix_cpu_nsleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->nanosleep.clockid; + struct timespec t; + struct itimerspec it; + int error; + + t = ns_to_timespec(restart_block->nanosleep.expires); + + error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + struct timespec __user *rmtp = restart_block->nanosleep.rmtp; + /* + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + return -EFAULT; + + restart_block->nanosleep.expires = timespec_to_ns(&t); + } + return error; + +} + +#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) +#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) + +static int process_cpu_clock_getres(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_getres(PROCESS_CLOCK, tp); +} +static int process_cpu_clock_get(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_get(PROCESS_CLOCK, tp); +} +static int process_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = PROCESS_CLOCK; + return posix_cpu_timer_create(timer); +} +static int process_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, + struct timespec __user *rmtp) +{ + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); +} +static long process_cpu_nsleep_restart(struct restart_block *restart_block) +{ + return -EINVAL; +} +static int thread_cpu_clock_getres(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_getres(THREAD_CLOCK, tp); +} +static int thread_cpu_clock_get(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_get(THREAD_CLOCK, tp); +} +static int thread_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = THREAD_CLOCK; + return posix_cpu_timer_create(timer); +} + +struct k_clock clock_posix_cpu = { + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .nsleep_restart = posix_cpu_nsleep_restart, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, +}; + +static __init int init_posix_cpu_timers(void) +{ + struct k_clock process = { + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, + }; + struct k_clock thread = { + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, + }; + struct timespec ts; + + posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + + cputime_to_timespec(cputime_one_jiffy, &ts); + onecputick = ts.tv_nsec; + WARN_ON(ts.tv_sec != 0); + + return 0; +} +__initcall(init_posix_cpu_timers); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c new file mode 100644 index 000000000..31ea01f42 --- /dev/null +++ b/kernel/time/posix-timers.c @@ -0,0 +1,1124 @@ +/* + * linux/kernel/posix-timers.c + * + * + * 2002-10-15 Posix Clocks & timers + * by George Anzinger george@mvista.com + * + * Copyright (C) 2002 2003 by MontaVista Software. + * + * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. + * Copyright (C) 2004 Boris Hu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA + */ + +/* These are all the functions necessary to implement + * POSIX clocks & timers + */ +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/mutex.h> + +#include <asm/uaccess.h> +#include <linux/list.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/hash.h> +#include <linux/posix-clock.h> +#include <linux/posix-timers.h> +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/export.h> +#include <linux/hashtable.h> + +#include "timekeeping.h" + +/* + * Management arrays for POSIX timers. Timers are now kept in static hash table + * with 512 entries. + * Timer ids are allocated by local routine, which selects proper hash head by + * key, constructed from current->signal address and per signal struct counter. + * This keeps timer ids unique per process, but now they can intersect between + * processes. + */ + +/* + * Lets keep our timers in a slab cache :-) + */ +static struct kmem_cache *posix_timers_cache; + +static DEFINE_HASHTABLE(posix_timers_hashtable, 9); +static DEFINE_SPINLOCK(hash_lock); + +/* + * we assume that the new SIGEV_THREAD_ID shares no bits with the other + * SIGEV values. Here we put out an error if this assumption fails. + */ +#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) +#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" +#endif + +/* + * parisc wants ENOTSUP instead of EOPNOTSUPP + */ +#ifndef ENOTSUP +# define ENANOSLEEP_NOTSUP EOPNOTSUPP +#else +# define ENANOSLEEP_NOTSUP ENOTSUP +#endif + +/* + * The timer ID is turned into a timer address by idr_find(). + * Verifying a valid ID consists of: + * + * a) checking that idr_find() returns other than -1. + * b) checking that the timer id matches the one in the timer itself. + * c) that the timer owner is in the callers thread group. + */ + +/* + * CLOCKs: The POSIX standard calls for a couple of clocks and allows us + * to implement others. This structure defines the various + * clocks. + * + * RESOLUTION: Clock resolution is used to round up timer and interval + * times, NOT to report clock times, which are reported with as + * much resolution as the system can muster. In some cases this + * resolution may depend on the underlying clock hardware and + * may not be quantifiable until run time, and only then is the + * necessary code is written. The standard says we should say + * something about this issue in the documentation... + * + * FUNCTIONS: The CLOCKs structure defines possible functions to + * handle various clock functions. + * + * The standard POSIX timer management code assumes the + * following: 1.) The k_itimer struct (sched.h) is used for + * the timer. 2.) The list, it_lock, it_clock, it_id and + * it_pid fields are not modified by timer code. + * + * Permissions: It is assumed that the clock_settime() function defined + * for each clock will take care of permission checks. Some + * clocks may be set able by any user (i.e. local process + * clocks) others not. Currently the only set able clock we + * have is CLOCK_REALTIME and its high res counter part, both of + * which we beg off on and pass to do_sys_settimeofday(). + */ + +static struct k_clock posix_clocks[MAX_CLOCKS]; + +/* + * These ones are defined below. + */ +static int common_nsleep(const clockid_t, int flags, struct timespec *t, + struct timespec __user *rmtp); +static int common_timer_create(struct k_itimer *new_timer); +static void common_timer_get(struct k_itimer *, struct itimerspec *); +static int common_timer_set(struct k_itimer *, int, + struct itimerspec *, struct itimerspec *); +static int common_timer_del(struct k_itimer *timer); + +static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); + +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); + +#define lock_timer(tid, flags) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ + __timr; \ +}) + +static int hash(struct signal_struct *sig, unsigned int nr) +{ + return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); +} + +static struct k_itimer *__posix_timers_find(struct hlist_head *head, + struct signal_struct *sig, + timer_t id) +{ + struct k_itimer *timer; + + hlist_for_each_entry_rcu(timer, head, t_hash) { + if ((timer->it_signal == sig) && (timer->it_id == id)) + return timer; + } + return NULL; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + + return __posix_timers_find(head, sig, id); +} + +static int posix_timer_add(struct k_itimer *timer) +{ + struct signal_struct *sig = current->signal; + int first_free_id = sig->posix_timer_id; + struct hlist_head *head; + int ret = -ENOENT; + + do { + spin_lock(&hash_lock); + head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)]; + if (!__posix_timers_find(head, sig, sig->posix_timer_id)) { + hlist_add_head_rcu(&timer->t_hash, head); + ret = sig->posix_timer_id; + } + if (++sig->posix_timer_id < 0) + sig->posix_timer_id = 0; + if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT)) + /* Loop over all possible ids completed */ + ret = -EAGAIN; + spin_unlock(&hash_lock); + } while (ret == -ENOENT); + return ret; +} + +static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +{ + spin_unlock_irqrestore(&timr->it_lock, flags); +} + +/* Get clock_realtime */ +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_real_ts(tp); + return 0; +} + +/* Set clock_realtime */ +static int posix_clock_realtime_set(const clockid_t which_clock, + const struct timespec *tp) +{ + return do_sys_settimeofday(tp, NULL); +} + +static int posix_clock_realtime_adj(const clockid_t which_clock, + struct timex *t) +{ + return do_adjtimex(t); +} + +/* + * Get monotonic time for posix timers + */ +static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_ts(tp); + return 0; +} + +/* + * Get monotonic-raw time for posix timers + */ +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +{ + getrawmonotonic(tp); + return 0; +} + + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +{ + *tp = current_kernel_time(); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec *tp) +{ + *tp = get_monotonic_coarse(); + return 0; +} + +static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +{ + *tp = ktime_to_timespec(KTIME_LOW_RES); + return 0; +} + +static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +{ + get_monotonic_boottime(tp); + return 0; +} + +static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +{ + timekeeping_clocktai(tp); + return 0; +} + +/* + * Initialize everything, well, just everything in Posix clocks/timers ;) + */ +static __init int init_posix_timers(void) +{ + struct k_clock clock_realtime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; + struct k_clock clock_monotonic = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; + struct k_clock clock_monotonic_raw = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + }; + struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + }; + struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + }; + struct k_clock clock_tai = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; + struct k_clock clock_boottime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; + + posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); + posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); + posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); + posix_timers_register_clock(CLOCK_TAI, &clock_tai); + + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof (struct k_itimer), 0, SLAB_PANIC, + NULL); + return 0; +} + +__initcall(init_posix_timers); + +static void schedule_next_timer(struct k_itimer *timr) +{ + struct hrtimer *timer = &timr->it.real.timer; + + if (timr->it.real.interval.tv64 == 0) + return; + + timr->it_overrun += (unsigned int) hrtimer_forward(timer, + timer->base->get_time(), + timr->it.real.interval); + + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1; + ++timr->it_requeue_pending; + hrtimer_restart(timer); +} + +/* + * This function is exported for use by the signal deliver code. It is + * called just prior to the info block being released and passes that + * block to us. It's function is to update the overrun entry AND to + * restart the timer. It should only be called if the timer is to be + * restarted (i.e. we have flagged this in the sys_private entry of the + * info block). + * + * To protect against the timer going away while the interrupt is queued, + * we require that the it_requeue_pending flag be set. + */ +void do_schedule_next_timer(struct siginfo *info) +{ + struct k_itimer *timr; + unsigned long flags; + + timr = lock_timer(info->si_tid, &flags); + + if (timr && timr->it_requeue_pending == info->si_sys_private) { + if (timr->it_clock < 0) + posix_cpu_timer_schedule(timr); + else + schedule_next_timer(timr); + + info->si_overrun += timr->it_overrun_last; + } + + if (timr) + unlock_timer(timr, flags); +} + +int posix_timer_event(struct k_itimer *timr, int si_private) +{ + struct task_struct *task; + int shared, ret = -1; + /* + * FIXME: if ->sigq is queued we can race with + * dequeue_signal()->do_schedule_next_timer(). + * + * If dequeue_signal() sees the "right" value of + * si_sys_private it calls do_schedule_next_timer(). + * We re-queue ->sigq and drop ->it_lock(). + * do_schedule_next_timer() locks the timer + * and re-schedules it while ->sigq is pending. + * Not really bad, but not that we want. + */ + timr->sigq->info.si_sys_private = si_private; + + rcu_read_lock(); + task = pid_task(timr->it_pid, PIDTYPE_PID); + if (task) { + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, task, shared); + } + rcu_read_unlock(); + /* If we failed to send the signal the timer stops. */ + return ret > 0; +} +EXPORT_SYMBOL_GPL(posix_timer_event); + +/* + * This function gets called when a POSIX.1b interval timer expires. It + * is used as a callback from the kernel internal timer. The + * run_timer_list code ALWAYS calls with interrupts on. + + * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. + */ +static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) +{ + struct k_itimer *timr; + unsigned long flags; + int si_private = 0; + enum hrtimer_restart ret = HRTIMER_NORESTART; + + timr = container_of(timer, struct k_itimer, it.real.timer); + spin_lock_irqsave(&timr->it_lock, flags); + + if (timr->it.real.interval.tv64 != 0) + si_private = ++timr->it_requeue_pending; + + if (posix_timer_event(timr, si_private)) { + /* + * signal was not sent because of sig_ignor + * we will not get a call back to restart it AND + * it should be restarted. + */ + if (timr->it.real.interval.tv64 != 0) { + ktime_t now = hrtimer_cb_get_time(timer); + + /* + * FIXME: What we really want, is to stop this + * timer completely and restart it in case the + * SIG_IGN is removed. This is a non trivial + * change which involves sighand locking + * (sigh !), which we don't want to do late in + * the release cycle. + * + * For now we just let timers with an interval + * less than a jiffie expire every jiffie to + * avoid softirq starvation in case of SIG_IGN + * and a very small interval, which would put + * the timer right back on the softirq pending + * list. By moving now ahead of time we trick + * hrtimer_forward() to expire the timer + * later, while we still maintain the overrun + * accuracy, but have some inconsistency in + * the timer_gettime() case. This is at least + * better than a starved softirq. A more + * complex fix which solves also another related + * inconsistency is already in the pipeline. + */ +#ifdef CONFIG_HIGH_RES_TIMERS + { + ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ); + + if (timr->it.real.interval.tv64 < kj.tv64) + now = ktime_add(now, kj); + } +#endif + timr->it_overrun += (unsigned int) + hrtimer_forward(timer, now, + timr->it.real.interval); + ret = HRTIMER_RESTART; + ++timr->it_requeue_pending; + } + } + + unlock_timer(timr, flags); + return ret; +} + +static struct pid *good_sigevent(sigevent_t * event) +{ + struct task_struct *rtn = current->group_leader; + + if ((event->sigev_notify & SIGEV_THREAD_ID ) && + (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || + !same_thread_group(rtn, current) || + (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + return NULL; + + if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && + ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) + return NULL; + + return task_pid(rtn); +} + +void posix_timers_register_clock(const clockid_t clock_id, + struct k_clock *new_clock) +{ + if ((unsigned) clock_id >= MAX_CLOCKS) { + printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", + clock_id); + return; + } + + if (!new_clock->clock_get) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", + clock_id); + return; + } + if (!new_clock->clock_getres) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", + clock_id); + return; + } + + posix_clocks[clock_id] = *new_clock; +} +EXPORT_SYMBOL_GPL(posix_timers_register_clock); + +static struct k_itimer * alloc_posix_timer(void) +{ + struct k_itimer *tmr; + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + if (!tmr) + return tmr; + if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { + kmem_cache_free(posix_timers_cache, tmr); + return NULL; + } + memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); + return tmr; +} + +static void k_itimer_rcu_free(struct rcu_head *head) +{ + struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); + + kmem_cache_free(posix_timers_cache, tmr); +} + +#define IT_ID_SET 1 +#define IT_ID_NOT_SET 0 +static void release_posix_timer(struct k_itimer *tmr, int it_id_set) +{ + if (it_id_set) { + unsigned long flags; + spin_lock_irqsave(&hash_lock, flags); + hlist_del_rcu(&tmr->t_hash); + spin_unlock_irqrestore(&hash_lock, flags); + } + put_pid(tmr->it_pid); + sigqueue_free(tmr->sigq); + call_rcu(&tmr->it.rcu, k_itimer_rcu_free); +} + +static struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; + + if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + return NULL; + return &posix_clocks[id]; +} + +static int common_timer_create(struct k_itimer *new_timer) +{ + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + return 0; +} + +/* Create a POSIX.1b interval timer. */ + +SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, + struct sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct k_itimer *new_timer; + int error, new_timer_id; + sigevent_t event; + int it_id_set = IT_ID_NOT_SET; + + if (!kc) + return -EINVAL; + if (!kc->timer_create) + return -EOPNOTSUPP; + + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + new_timer_id = posix_timer_add(new_timer); + if (new_timer_id < 0) { + error = new_timer_id; + goto out; + } + + it_id_set = IT_ID_SET; + new_timer->it_id = (timer_t) new_timer_id; + new_timer->it_clock = which_clock; + new_timer->it_overrun = -1; + + if (timer_event_spec) { + if (copy_from_user(&event, timer_event_spec, sizeof (event))) { + error = -EFAULT; + goto out; + } + rcu_read_lock(); + new_timer->it_pid = get_pid(good_sigevent(&event)); + rcu_read_unlock(); + if (!new_timer->it_pid) { + error = -EINVAL; + goto out; + } + } else { + memset(&event.sigev_value, 0, sizeof(event.sigev_value)); + event.sigev_notify = SIGEV_SIGNAL; + event.sigev_signo = SIGALRM; + event.sigev_value.sival_int = new_timer->it_id; + new_timer->it_pid = get_pid(task_tgid(current)); + } + + new_timer->it_sigev_notify = event.sigev_notify; + new_timer->sigq->info.si_signo = event.sigev_signo; + new_timer->sigq->info.si_value = event.sigev_value; + new_timer->sigq->info.si_tid = new_timer->it_id; + new_timer->sigq->info.si_code = SI_TIMER; + + if (copy_to_user(created_timer_id, + &new_timer_id, sizeof (new_timer_id))) { + error = -EFAULT; + goto out; + } + + error = kc->timer_create(new_timer); + if (error) + goto out; + + spin_lock_irq(¤t->sighand->siglock); + new_timer->it_signal = current->signal; + list_add(&new_timer->list, ¤t->signal->posix_timers); + spin_unlock_irq(¤t->sighand->siglock); + + return 0; + /* + * In the case of the timer belonging to another task, after + * the task is unlocked, the timer is owned by the other task + * and may cease to exist at any time. Don't use or modify + * new_timer after the unlock call. + */ +out: + release_posix_timer(new_timer, it_id_set); + return error; +} + +/* + * Locking issues: We need to protect the result of the id look up until + * we get the timer locked down so it is not deleted under us. The + * removal is done under the idr spinlock so we use that here to bridge + * the find to the timer lock. To avoid a dead lock, the timer id MUST + * be release with out holding the timer lock. + */ +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +{ + struct k_itimer *timr; + + /* + * timer_t could be any type >= int and we want to make sure any + * @timer_id outside positive int range fails lookup. + */ + if ((unsigned long long)timer_id > INT_MAX) + return NULL; + + rcu_read_lock(); + timr = posix_timer_by_id(timer_id); + if (timr) { + spin_lock_irqsave(&timr->it_lock, *flags); + if (timr->it_signal == current->signal) { + rcu_read_unlock(); + return timr; + } + spin_unlock_irqrestore(&timr->it_lock, *flags); + } + rcu_read_unlock(); + + return NULL; +} + +/* + * Get the time remaining on a POSIX.1b interval timer. This function + * is ALWAYS called with spin_lock_irq on the timer, thus it must not + * mess with irq. + * + * We have a couple of messes to clean up here. First there is the case + * of a timer that has a requeue pending. These timers should appear to + * be in the timer list with an expiry as if we were to requeue them + * now. + * + * The second issue is the SIGEV_NONE timer which may be active but is + * not really ever put in the timer list (to save system resources). + * This timer may be expired, and if so, we will do it here. Otherwise + * it is the same as a requeue pending timer WRT to what we should + * report. + */ +static void +common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) +{ + ktime_t now, remaining, iv; + struct hrtimer *timer = &timr->it.real.timer; + + memset(cur_setting, 0, sizeof(struct itimerspec)); + + iv = timr->it.real.interval; + + /* interval timer ? */ + if (iv.tv64) + cur_setting->it_interval = ktime_to_timespec(iv); + else if (!hrtimer_active(timer) && + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + return; + + now = timer->base->get_time(); + + /* + * When a requeue is pending or this is a SIGEV_NONE + * timer move the expiry time forward by intervals, so + * expiry is > now. + */ + if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) + timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); + + remaining = ktime_sub(hrtimer_get_expires(timer), now); + /* Return 0 only, when the timer is expired and not pending */ + if (remaining.tv64 <= 0) { + /* + * A single shot SIGEV_NONE timer must return 0, when + * it is expired ! + */ + if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + cur_setting->it_value.tv_nsec = 1; + } else + cur_setting->it_value = ktime_to_timespec(remaining); +} + +/* Get the time remaining on a POSIX.1b interval timer. */ +SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct itimerspec __user *, setting) +{ + struct itimerspec cur_setting; + struct k_itimer *timr; + struct k_clock *kc; + unsigned long flags; + int ret = 0; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_get)) + ret = -EINVAL; + else + kc->timer_get(timr, &cur_setting); + + unlock_timer(timr, flags); + + if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + return -EFAULT; + + return ret; +} + +/* + * Get the number of overruns of a POSIX.1b interval timer. This is to + * be the overrun of the timer last delivered. At the same time we are + * accumulating overruns on the next timer. The overrun is frozen when + * the signal is delivered, either at the notify time (if the info block + * is not queued) or at the actual delivery time (as we are informed by + * the call back to do_schedule_next_timer(). So all we need to do is + * to pick up the frozen overrun. + */ +SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) +{ + struct k_itimer *timr; + int overrun; + unsigned long flags; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + overrun = timr->it_overrun_last; + unlock_timer(timr, flags); + + return overrun; +} + +/* Set a POSIX.1b interval timer. */ +/* timr->it_lock is taken. */ +static int +common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec *new_setting, struct itimerspec *old_setting) +{ + struct hrtimer *timer = &timr->it.real.timer; + enum hrtimer_mode mode; + + if (old_setting) + common_timer_get(timr, old_setting); + + /* disable the timer */ + timr->it.real.interval.tv64 = 0; + /* + * careful here. If smp we could be in the "fire" routine which will + * be spinning as we hold the lock. But this is ONLY an SMP issue. + */ + if (hrtimer_try_to_cancel(timer) < 0) + return TIMER_RETRY; + + timr->it_requeue_pending = (timr->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timr->it_overrun_last = 0; + + /* switch off the timer when it_value is zero */ + if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) + return 0; + + mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); + timr->it.real.timer.function = posix_timer_fn; + + hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); + + /* Convert interval */ + timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); + + /* SIGEV_NONE timers are not queued ! See common_timer_get */ + if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { + /* Setup correct expiry time for relative timers */ + if (mode == HRTIMER_MODE_REL) { + hrtimer_add_expires(timer, timer->base->get_time()); + } + return 0; + } + + hrtimer_start_expires(timer, mode); + return 0; +} + +/* Set a POSIX.1b interval timer */ +SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + const struct itimerspec __user *, new_setting, + struct itimerspec __user *, old_setting) +{ + struct k_itimer *timr; + struct itimerspec new_spec, old_spec; + int error = 0; + unsigned long flag; + struct itimerspec *rtn = old_setting ? &old_spec : NULL; + struct k_clock *kc; + + if (!new_setting) + return -EINVAL; + + if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + return -EFAULT; + + if (!timespec_valid(&new_spec.it_interval) || + !timespec_valid(&new_spec.it_value)) + return -EINVAL; +retry: + timr = lock_timer(timer_id, &flag); + if (!timr) + return -EINVAL; + + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_set)) + error = -EINVAL; + else + error = kc->timer_set(timr, flags, &new_spec, rtn); + + unlock_timer(timr, flag); + if (error == TIMER_RETRY) { + rtn = NULL; // We already got the old time... + goto retry; + } + + if (old_setting && !error && + copy_to_user(old_setting, &old_spec, sizeof (old_spec))) + error = -EFAULT; + + return error; +} + +static int common_timer_del(struct k_itimer *timer) +{ + timer->it.real.interval.tv64 = 0; + + if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) + return TIMER_RETRY; + return 0; +} + +static inline int timer_delete_hook(struct k_itimer *timer) +{ + struct k_clock *kc = clockid_to_kclock(timer->it_clock); + + if (WARN_ON_ONCE(!kc || !kc->timer_del)) + return -EINVAL; + return kc->timer_del(timer); +} + +/* Delete a POSIX.1b interval timer. */ +SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) +{ + struct k_itimer *timer; + unsigned long flags; + +retry_delete: + timer = lock_timer(timer_id, &flags); + if (!timer) + return -EINVAL; + + if (timer_delete_hook(timer) == TIMER_RETRY) { + unlock_timer(timer, flags); + goto retry_delete; + } + + spin_lock(¤t->sighand->siglock); + list_del(&timer->list); + spin_unlock(¤t->sighand->siglock); + /* + * This keeps any tasks waiting on the spin lock from thinking + * they got something (see the lock code above). + */ + timer->it_signal = NULL; + + unlock_timer(timer, flags); + release_posix_timer(timer, IT_ID_SET); + return 0; +} + +/* + * return timer owned by the process, used by exit_itimers + */ +static void itimer_delete(struct k_itimer *timer) +{ + unsigned long flags; + +retry_delete: + spin_lock_irqsave(&timer->it_lock, flags); + + if (timer_delete_hook(timer) == TIMER_RETRY) { + unlock_timer(timer, flags); + goto retry_delete; + } + list_del(&timer->list); + /* + * This keeps any tasks waiting on the spin lock from thinking + * they got something (see the lock code above). + */ + timer->it_signal = NULL; + + unlock_timer(timer, flags); + release_posix_timer(timer, IT_ID_SET); +} + +/* + * This is called by do_exit or de_thread, only when there are no more + * references to the shared signal_struct. + */ +void exit_itimers(struct signal_struct *sig) +{ + struct k_itimer *tmr; + + while (!list_empty(&sig->posix_timers)) { + tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); + itimer_delete(tmr); + } +} + +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct timespec __user *, tp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec new_tp; + + if (!kc || !kc->clock_set) + return -EINVAL; + + if (copy_from_user(&new_tp, tp, sizeof (*tp))) + return -EFAULT; + + return kc->clock_set(which_clock, &new_tp); +} + +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *,tp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec kernel_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_get(which_clock, &kernel_tp); + + if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + error = -EFAULT; + + return error; +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct timex __user *, utx) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + if (copy_from_user(&ktx, utx, sizeof(ktx))) + return -EFAULT; + + err = kc->clock_adj(which_clock, &ktx); + + if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) + return -EFAULT; + + return err; +} + +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct timespec __user *, tp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec rtn_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_getres(which_clock, &rtn_tp); + + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + error = -EFAULT; + + return error; +} + +/* + * nanosleep for monotonic and realtime clocks + */ +static int common_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsave, struct timespec __user *rmtp) +{ + return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct timespec __user *, rqtp, + struct timespec __user *, rmtp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec t; + + if (!kc) + return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; + + if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + return -EFAULT; + + if (!timespec_valid(&t)) + return -EINVAL; + + return kc->nsleep(which_clock, flags, &t, rmtp); +} + +/* + * This will restart clock_nanosleep. This is required only by + * compat_clock_nanosleep_restart for now. + */ +long clock_nanosleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->nanosleep.clockid; + struct k_clock *kc = clockid_to_kclock(which_clock); + + if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + return -EINVAL; + + return kc->nsleep_restart(restart_block); +} diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c new file mode 100644 index 000000000..a26036d37 --- /dev/null +++ b/kernel/time/sched_clock.c @@ -0,0 +1,303 @@ +/* + * sched_clock.c: Generic sched_clock() support, to extend low level + * hardware time counters to full 64-bit ns values. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/clocksource.h> +#include <linux/init.h> +#include <linux/jiffies.h> +#include <linux/ktime.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> +#include <linux/sched.h> +#include <linux/syscore_ops.h> +#include <linux/hrtimer.h> +#include <linux/sched_clock.h> +#include <linux/seqlock.h> +#include <linux/bitops.h> + +/** + * struct clock_read_data - data required to read from sched_clock() + * + * @epoch_ns: sched_clock() value at last update + * @epoch_cyc: Clock cycle value at last update. + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks. + * @read_sched_clock: Current clock source (or dummy source when suspended). + * @mult: Multipler for scaled math conversion. + * @shift: Shift value for scaled math conversion. + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=40 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. + */ +struct clock_read_data { + u64 epoch_ns; + u64 epoch_cyc; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); + u32 mult; + u32 shift; +}; + +/** + * struct clock_data - all data needed for sched_clock() (including + * registration of a new clock source) + * + * @seq: Sequence counter for protecting updates. The lowest + * bit is the index for @read_data. + * @read_data: Data required to read from sched_clock. + * @wrap_kt: Duration for which clock can run before wrapping. + * @rate: Tick rate of the registered clock. + * @actual_read_sched_clock: Registered hardware level clock read function. + * + * The ordering of this structure has been chosen to optimize cache + * performance. In particular 'seq' and 'read_data[0]' (combined) should fit + * into a single 64-byte cache line. + */ +struct clock_data { + seqcount_t seq; + struct clock_read_data read_data[2]; + ktime_t wrap_kt; + unsigned long rate; + + u64 (*actual_read_sched_clock)(void); +}; + +static struct hrtimer sched_clock_timer; +static int irqtime = -1; + +core_param(irqtime, irqtime, int, 0400); + +static u64 notrace jiffy_sched_clock_read(void) +{ + /* + * We don't need to use get_jiffies_64 on 32-bit arches here + * because we register with BITS_PER_LONG + */ + return (u64)(jiffies - INITIAL_JIFFIES); +} + +static struct clock_data cd ____cacheline_aligned = { + .read_data[0] = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, + .actual_read_sched_clock = jiffy_sched_clock_read, +}; + +static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +{ + return (cyc * mult) >> shift; +} + +unsigned long long notrace sched_clock(void) +{ + u64 cyc, res; + unsigned long seq; + struct clock_read_data *rd; + + do { + seq = raw_read_seqcount(&cd.seq); + rd = cd.read_data + (seq & 1); + + cyc = (rd->read_sched_clock() - rd->epoch_cyc) & + rd->sched_clock_mask; + res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); + } while (read_seqcount_retry(&cd.seq, seq)); + + return res; +} + +/* + * Updating the data required to read the clock. + * + * sched_clock() will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock() to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock() as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + */ +static void update_clock_read_data(struct clock_read_data *rd) +{ + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + /* steer readers towards the odd copy */ + raw_write_seqcount_latch(&cd.seq); + + /* now its safe for us to update the normal (even) copy */ + cd.read_data[0] = *rd; + + /* switch readers back to the even copy */ + raw_write_seqcount_latch(&cd.seq); +} + +/* + * Atomically update the sched_clock() epoch. + */ +static void update_sched_clock(void) +{ + u64 cyc; + u64 ns; + struct clock_read_data rd; + + rd = cd.read_data[0]; + + cyc = cd.actual_read_sched_clock(); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); + + rd.epoch_ns = ns; + rd.epoch_cyc = cyc; + + update_clock_read_data(&rd); +} + +static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) +{ + update_sched_clock(); + hrtimer_forward_now(hrt, cd.wrap_kt); + + return HRTIMER_RESTART; +} + +void __init +sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) +{ + u64 res, wrap, new_mask, new_epoch, cyc, ns; + u32 new_mult, new_shift; + unsigned long r; + char r_unit; + struct clock_read_data rd; + + if (cd.rate > rate) + return; + + WARN_ON(!irqs_disabled()); + + /* Calculate the mult/shift to convert counter ticks to ns. */ + clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + + new_mask = CLOCKSOURCE_MASK(bits); + cd.rate = rate; + + /* Calculate how many nanosecs until we risk wrapping */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); + cd.wrap_kt = ns_to_ktime(wrap); + + rd = cd.read_data[0]; + + /* Update epoch for new counter and update 'epoch_ns' from old counter*/ + new_epoch = read(); + cyc = cd.actual_read_sched_clock(); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); + cd.actual_read_sched_clock = read; + + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + + update_clock_read_data(&rd); + + r = rate; + if (r >= 4000000) { + r /= 1000000; + r_unit = 'M'; + } else { + if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else { + r_unit = ' '; + } + } + + /* Calculate the ns resolution of this counter */ + res = cyc_to_ns(1ULL, new_mult, new_shift); + + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", + bits, r, r_unit, res, wrap); + + /* Enable IRQ time accounting if we have a fast enough sched_clock() */ + if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) + enable_sched_clock_irqtime(); + + pr_debug("Registered %pF as sched_clock source\n", read); +} + +void __init sched_clock_postinit(void) +{ + /* + * If no sched_clock() function has been provided at that point, + * make it the final one one. + */ + if (cd.actual_read_sched_clock == jiffy_sched_clock_read) + sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); + + update_sched_clock(); + + /* + * Start the timer to keep sched_clock() properly updated and + * sets the initial epoch. + */ + hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + sched_clock_timer.function = sched_clock_poll; + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); +} + +/* + * Clock read function for use when the clock is suspended. + * + * This function makes it appear to sched_clock() as if the clock + * stopped counting at its last update. + * + * This function must only be called from the critical + * section in sched_clock(). It relies on the read_seqcount_retry() + * at the end of the critical section to be sure we observe the + * correct copy of 'epoch_cyc'. + */ +static u64 notrace suspended_sched_clock_read(void) +{ + unsigned long seq = raw_read_seqcount(&cd.seq); + + return cd.read_data[seq & 1].epoch_cyc; +} + +static int sched_clock_suspend(void) +{ + struct clock_read_data *rd = &cd.read_data[0]; + + update_sched_clock(); + hrtimer_cancel(&sched_clock_timer); + rd->read_sched_clock = suspended_sched_clock_read; + + return 0; +} + +static void sched_clock_resume(void) +{ + struct clock_read_data *rd = &cd.read_data[0]; + + rd->epoch_cyc = cd.actual_read_sched_clock(); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + rd->read_sched_clock = cd.actual_read_sched_clock; +} + +static struct syscore_ops sched_clock_ops = { + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, +}; + +static int __init sched_clock_syscore_init(void) +{ + register_syscore_ops(&sched_clock_ops); + + return 0; +} +device_initcall(sched_clock_syscore_init); diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c new file mode 100644 index 000000000..e622ba365 --- /dev/null +++ b/kernel/time/test_udelay.c @@ -0,0 +1,168 @@ +/* + * udelay() test kernel module + * + * Test is executed by writing and reading to /sys/kernel/debug/udelay_test + * Tests are configured by writing: USECS ITERATIONS + * Tests are executed by reading from the same file. + * Specifying usecs of 0 or negative values will run multiples tests. + * + * Copyright (C) 2014 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +#define DEFAULT_ITERATIONS 100 + +#define DEBUGFS_FILENAME "udelay_test" + +static DEFINE_MUTEX(udelay_test_lock); +static struct dentry *udelay_test_debugfs_file; +static int udelay_test_usecs; +static int udelay_test_iterations = DEFAULT_ITERATIONS; + +static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) +{ + int min = 0, max = 0, fail_count = 0; + uint64_t sum = 0; + uint64_t avg; + int i; + /* Allow udelay to be up to 0.5% fast */ + int allowed_error_ns = usecs * 5; + + for (i = 0; i < iters; ++i) { + struct timespec ts1, ts2; + int time_passed; + + ktime_get_ts(&ts1); + udelay(usecs); + ktime_get_ts(&ts2); + time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); + + if (i == 0 || time_passed < min) + min = time_passed; + if (i == 0 || time_passed > max) + max = time_passed; + if ((time_passed + allowed_error_ns) / 1000 < usecs) + ++fail_count; + WARN_ON(time_passed < 0); + sum += time_passed; + } + + avg = sum; + do_div(avg, iters); + seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d", + usecs, iters, usecs * 1000, + (usecs * 1000) - allowed_error_ns, min, avg, max); + if (fail_count) + seq_printf(s, " FAIL=%d", fail_count); + seq_puts(s, "\n"); + + return 0; +} + +static int udelay_test_show(struct seq_file *s, void *v) +{ + int usecs; + int iters; + int ret = 0; + + mutex_lock(&udelay_test_lock); + usecs = udelay_test_usecs; + iters = udelay_test_iterations; + mutex_unlock(&udelay_test_lock); + + if (usecs > 0 && iters > 0) { + return udelay_test_single(s, usecs, iters); + } else if (usecs == 0) { + struct timespec ts; + + ktime_get_ts(&ts); + seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", + loops_per_jiffy, ts.tv_sec, ts.tv_nsec); + seq_puts(s, "usage:\n"); + seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); + seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); + } + + return ret; +} + +static int udelay_test_open(struct inode *inode, struct file *file) +{ + return single_open(file, udelay_test_show, inode->i_private); +} + +static ssize_t udelay_test_write(struct file *file, const char __user *buf, + size_t count, loff_t *pos) +{ + char lbuf[32]; + int ret; + int usecs; + int iters; + + if (count >= sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + lbuf[count] = '\0'; + + ret = sscanf(lbuf, "%d %d", &usecs, &iters); + if (ret < 1) + return -EINVAL; + else if (ret < 2) + iters = DEFAULT_ITERATIONS; + + mutex_lock(&udelay_test_lock); + udelay_test_usecs = usecs; + udelay_test_iterations = iters; + mutex_unlock(&udelay_test_lock); + + return count; +} + +static const struct file_operations udelay_test_debugfs_ops = { + .owner = THIS_MODULE, + .open = udelay_test_open, + .read = seq_read, + .write = udelay_test_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init udelay_test_init(void) +{ + mutex_lock(&udelay_test_lock); + udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME, + S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops); + mutex_unlock(&udelay_test_lock); + + return 0; +} + +module_init(udelay_test_init); + +static void __exit udelay_test_exit(void) +{ + mutex_lock(&udelay_test_lock); + debugfs_remove(udelay_test_debugfs_file); + mutex_unlock(&udelay_test_lock); +} + +module_exit(udelay_test_exit); + +MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); +MODULE_LICENSE("GPL"); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c new file mode 100644 index 000000000..6aac4beed --- /dev/null +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -0,0 +1,113 @@ +/* + * linux/kernel/time/tick-broadcast-hrtimer.c + * This file emulates a local clock event device + * via a pseudo clock device. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/clockchips.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include "tick-internal.h" + +static struct hrtimer bctimer; + +static void bc_set_mode(enum clock_event_mode mode, + struct clock_event_device *bc) +{ + switch (mode) { + case CLOCK_EVT_MODE_SHUTDOWN: + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + break; + default: + break; + } +} + +/* + * This is called from the guts of the broadcast code when the cpu + * which is about to enter idle has the earliest broadcast timer event. + */ +static int bc_set_next(ktime_t expires, struct clock_event_device *bc) +{ + int bc_moved; + /* + * We try to cancel the timer first. If the callback is on + * flight on some other cpu then we let it handle it. If we + * were able to cancel the timer nothing can rearm it as we + * own broadcast_lock. + * + * However we can also be called from the event handler of + * ce_broadcast_hrtimer itself when it expires. We cannot + * restart the timer because we are in the callback, but we + * can set the expiry time and let the callback return + * HRTIMER_RESTART. + * + * Since we are in the idle loop at this point and because + * hrtimer_{start/cancel} functions call into tracing, + * calls to these functions must be bound within RCU_NONIDLE. + */ + RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? + !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : + 0); + if (bc_moved) { + /* Bind the "device" to the cpu */ + bc->bound_on = smp_processor_id(); + } else if (bc->bound_on == smp_processor_id()) { + hrtimer_set_expires(&bctimer, expires); + } + return 0; +} + +static struct clock_event_device ce_broadcast_hrtimer = { + .set_mode = bc_set_mode, + .set_next_ktime = bc_set_next, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_KTIME | + CLOCK_EVT_FEAT_HRTIMER, + .rating = 0, + .bound_on = -1, + .min_delta_ns = 1, + .max_delta_ns = KTIME_MAX, + .min_delta_ticks = 1, + .max_delta_ticks = ULONG_MAX, + .mult = 1, + .shift = 0, + .cpumask = cpu_all_mask, +}; + +static enum hrtimer_restart bc_handler(struct hrtimer *t) +{ + ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); + + if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) + return HRTIMER_NORESTART; + + return HRTIMER_RESTART; +} + +void tick_setup_hrtimer_broadcast(void) +{ + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + bctimer.function = bc_handler; + clockevents_register_device(&ce_broadcast_hrtimer); +} diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c new file mode 100644 index 000000000..7e8ca4f44 --- /dev/null +++ b/kernel/time/tick-broadcast.c @@ -0,0 +1,964 @@ +/* + * linux/kernel/time/tick-broadcast.c + * + * This file contains functions which emulate a local clock-event + * device via a broadcast event source. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include "tick-internal.h" + +/* + * Broadcast support for broken x86 hardware, where the local apic + * timer stops in C3 state. + */ + +static struct tick_device tick_broadcast_device; +static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tick_broadcast_on; +static cpumask_var_t tmpmask; +static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); +static int tick_broadcast_forced; + +#ifdef CONFIG_TICK_ONESHOT +static void tick_broadcast_clear_oneshot(int cpu); +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +#else +static inline void tick_broadcast_clear_oneshot(int cpu) { } +static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } +#endif + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_broadcast_device(void) +{ + return &tick_broadcast_device; +} + +struct cpumask *tick_get_broadcast_mask(void) +{ + return tick_broadcast_mask; +} + +/* + * Start the device in periodic mode + */ +static void tick_broadcast_start_periodic(struct clock_event_device *bc) +{ + if (bc) + tick_setup_periodic(bc, 1); +} + +/* + * Check, if the device can be utilized as broadcast device: + */ +static bool tick_check_broadcast_device(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_PERCPU) || + (newdev->features & CLOCK_EVT_FEAT_C3STOP)) + return false; + + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + + return !curdev || newdev->rating > curdev->rating; +} + +/* + * Conditionally install/replace broadcast device + */ +void tick_install_broadcast_device(struct clock_event_device *dev) +{ + struct clock_event_device *cur = tick_broadcast_device.evtdev; + + if (!tick_check_broadcast_device(cur, dev)) + return; + + if (!try_module_get(dev->owner)) + return; + + clockevents_exchange_device(cur, dev); + if (cur) + cur->event_handler = clockevents_handle_noop; + tick_broadcast_device.evtdev = dev; + if (!cpumask_empty(tick_broadcast_mask)) + tick_broadcast_start_periodic(dev); + /* + * Inform all cpus about this. We might be in a situation + * where we did not switch to oneshot mode because the per cpu + * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack + * of a oneshot capable broadcast device. Without that + * notification the systems stays stuck in periodic mode + * forever. + */ + if (dev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_clock_notify(); +} + +/* + * Check, if the device is the broadcast device + */ +int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return (dev && tick_broadcast_device.evtdev == dev); +} + +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) +{ + int ret = -ENODEV; + + if (tick_is_broadcast_device(dev)) { + raw_spin_lock(&tick_broadcast_lock); + ret = __clockevents_update_freq(dev, freq); + raw_spin_unlock(&tick_broadcast_lock); + } + return ret; +} + + +static void err_broadcast(const struct cpumask *mask) +{ + pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); +} + +static void tick_device_setup_broadcast_func(struct clock_event_device *dev) +{ + if (!dev->broadcast) + dev->broadcast = tick_broadcast; + if (!dev->broadcast) { + pr_warn_once("%s depends on broadcast, but no broadcast function available\n", + dev->name); + dev->broadcast = err_broadcast; + } +} + +/* + * Check, if the device is disfunctional and a place holder, which + * needs to be handled by the broadcast device. + */ +int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Devices might be registered with both periodic and oneshot + * mode disabled. This signals, that the device needs to be + * operated from the broadcast device and is a placeholder for + * the cpu local device. + */ + if (!tick_device_is_functional(dev)) { + dev->event_handler = tick_handle_periodic; + tick_device_setup_broadcast_func(dev); + cpumask_set_cpu(cpu, tick_broadcast_mask); + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + ret = 1; + } else { + /* + * Clear the broadcast bit for this cpu if the + * device is not power state affected. + */ + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + cpumask_clear_cpu(cpu, tick_broadcast_mask); + else + tick_device_setup_broadcast_func(dev); + + /* + * Clear the broadcast bit if the CPU is not in + * periodic broadcast on state. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_on)) + cpumask_clear_cpu(cpu, tick_broadcast_mask); + + switch (tick_broadcast_device.mode) { + case TICKDEV_MODE_ONESHOT: + /* + * If the system is in oneshot mode we can + * unconditionally clear the oneshot mask bit, + * because the CPU is running and therefore + * not in an idle state which causes the power + * state affected device to stop. Let the + * caller initialize the device. + */ + tick_broadcast_clear_oneshot(cpu); + ret = 0; + break; + + case TICKDEV_MODE_PERIODIC: + /* + * If the system is in periodic mode, check + * whether the broadcast device can be + * switched off now. + */ + if (cpumask_empty(tick_broadcast_mask) && bc) + clockevents_shutdown(bc); + /* + * If we kept the cpu in the broadcast mask, + * tell the caller to leave the per cpu device + * in shutdown state. The periodic interrupt + * is delivered by the broadcast device. + */ + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + break; + default: + /* Nothing to do */ + ret = 0; + break; + } + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +int tick_receive_broadcast(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + struct clock_event_device *evt = td->evtdev; + + if (!evt) + return -ENODEV; + + if (!evt->event_handler) + return -EINVAL; + + evt->event_handler(evt); + return 0; +} +#endif + +/* + * Broadcast the event to the cpus, which are set in the mask (mangled). + */ +static void tick_do_broadcast(struct cpumask *mask) +{ + int cpu = smp_processor_id(); + struct tick_device *td; + + /* + * Check, if the current cpu is in the mask + */ + if (cpumask_test_cpu(cpu, mask)) { + cpumask_clear_cpu(cpu, mask); + td = &per_cpu(tick_cpu_device, cpu); + td->evtdev->event_handler(td->evtdev); + } + + if (!cpumask_empty(mask)) { + /* + * It might be necessary to actually check whether the devices + * have different broadcast functions. For now, just use the + * one of the first device. This works as long as we have this + * misfeature only on x86 (lapic) + */ + td = &per_cpu(tick_cpu_device, cpumask_first(mask)); + td->evtdev->broadcast(mask); + } +} + +/* + * Periodic broadcast: + * - invoke the broadcast handlers + */ +static void tick_do_periodic_broadcast(void) +{ + cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); + tick_do_broadcast(tmpmask); +} + +/* + * Event handler for periodic broadcast ticks + */ +static void tick_handle_periodic_broadcast(struct clock_event_device *dev) +{ + ktime_t next; + + raw_spin_lock(&tick_broadcast_lock); + + tick_do_periodic_broadcast(); + + /* + * The device is in periodic mode. No reprogramming necessary: + */ + if (dev->state == CLOCK_EVT_STATE_PERIODIC) + goto unlock; + + /* + * Setup the next period for devices, which do not have + * periodic mode. We read dev->next_event first and add to it + * when the event already expired. clockevents_program_event() + * sets dev->next_event only when the event is really + * programmed to the device. + */ + for (next = dev->next_event; ;) { + next = ktime_add(next, tick_period); + + if (!clockevents_program_event(dev, next, false)) + goto unlock; + tick_do_periodic_broadcast(); + } +unlock: + raw_spin_unlock(&tick_broadcast_lock); +} + +/** + * tick_broadcast_control - Enable/disable or force broadcast mode + * @mode: The selected broadcast mode + * + * Called when the system enters a state where affected tick devices + * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. + */ +void tick_broadcast_control(enum tick_broadcast_mode mode) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + int cpu, bc_stopped; + + td = this_cpu_ptr(&tick_cpu_device); + dev = td->evtdev; + + /* + * Is the device not affected by the powerstate ? + */ + if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) + return; + + if (!tick_device_is_functional(dev)) + return; + + raw_spin_lock(&tick_broadcast_lock); + cpu = smp_processor_id(); + bc = tick_broadcast_device.evtdev; + bc_stopped = cpumask_empty(tick_broadcast_mask); + + switch (mode) { + case TICK_BROADCAST_FORCE: + tick_broadcast_forced = 1; + case TICK_BROADCAST_ON: + cpumask_set_cpu(cpu, tick_broadcast_on); + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) + clockevents_shutdown(dev); + } + break; + + case TICK_BROADCAST_OFF: + if (tick_broadcast_forced) + break; + cpumask_clear_cpu(cpu, tick_broadcast_on); + if (!tick_device_is_functional(dev)) + break; + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) + tick_setup_periodic(dev, 0); + } + break; + } + + if (cpumask_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_shutdown(bc); + } else if (bc_stopped) { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + } + raw_spin_unlock(&tick_broadcast_lock); +} +EXPORT_SYMBOL_GPL(tick_broadcast_control); + +/* + * Set the periodic handler depending on broadcast on/off + */ +void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + if (!broadcast) + dev->event_handler = tick_handle_periodic; + else + dev->event_handler = tick_handle_periodic_broadcast; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Remove a CPU from broadcasting + */ +void tick_shutdown_broadcast(unsigned int cpu) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); + + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + if (bc && cpumask_empty(tick_broadcast_mask)) + clockevents_shutdown(bc); + } + + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} +#endif + +void tick_suspend_broadcast(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + if (bc) + clockevents_shutdown(bc); + + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * This is called from tick_resume_local() on a resuming CPU. That's + * called from the core resume function, tick_unfreeze() and the magic XEN + * resume hackery. + * + * In none of these cases the broadcast device mode can change and the + * bit of the resuming CPU in the broadcast mask is safe as well. + */ +bool tick_resume_check_broadcast(void) +{ + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) + return false; + else + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); +} + +void tick_resume_broadcast(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + bc = tick_broadcast_device.evtdev; + + if (bc) { + clockevents_tick_resume(bc); + + switch (tick_broadcast_device.mode) { + case TICKDEV_MODE_PERIODIC: + if (!cpumask_empty(tick_broadcast_mask)) + tick_broadcast_start_periodic(bc); + break; + case TICKDEV_MODE_ONESHOT: + if (!cpumask_empty(tick_broadcast_mask)) + tick_resume_broadcast_oneshot(bc); + break; + } + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#ifdef CONFIG_TICK_ONESHOT + +static cpumask_var_t tick_broadcast_oneshot_mask; +static cpumask_var_t tick_broadcast_pending_mask; +static cpumask_var_t tick_broadcast_force_mask; + +/* + * Exposed for debugging: see timer_list.c + */ +struct cpumask *tick_get_broadcast_oneshot_mask(void) +{ + return tick_broadcast_oneshot_mask; +} + +/* + * Called before going idle with interrupts disabled. Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control(). The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +int tick_check_broadcast_expired(void) +{ + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +} + +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, + const struct cpumask *cpumask) +{ + if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) + return; + + if (cpumask_equal(bc->cpumask, cpumask)) + return; + + bc->cpumask = cpumask; + irq_set_affinity(bc->irq, bc->cpumask); +} + +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, + ktime_t expires, int force) +{ + int ret; + + if (bc->state != CLOCK_EVT_STATE_ONESHOT) + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + + ret = clockevents_program_event(bc, expires, force); + if (!ret) + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); + return ret; +} + +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) +{ + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); +} + +/* + * Called from irq_enter() when idle was interrupted to reenable the + * per cpu device. + */ +void tick_check_oneshot_broadcast_this_cpu(void) +{ + if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + /* + * We might be in the middle of switching over from + * periodic to oneshot. If the CPU has not yet + * switched over, leave the device alone. + */ + if (td->mode == TICKDEV_MODE_ONESHOT) { + clockevents_set_state(td->evtdev, + CLOCK_EVT_STATE_ONESHOT); + } + } +} + +/* + * Handle oneshot mode broadcasting + */ +static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) +{ + struct tick_device *td; + ktime_t now, next_event; + int cpu, next_cpu = 0; + + raw_spin_lock(&tick_broadcast_lock); +again: + dev->next_event.tv64 = KTIME_MAX; + next_event.tv64 = KTIME_MAX; + cpumask_clear(tmpmask); + now = ktime_get(); + /* Find all expired events */ + for_each_cpu(cpu, tick_broadcast_oneshot_mask) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tmpmask); + /* + * Mark the remote cpu in the pending mask, so + * it can avoid reprogramming the cpu local + * timer in tick_broadcast_oneshot_control(). + */ + cpumask_set_cpu(cpu, tick_broadcast_pending_mask); + } else if (td->evtdev->next_event.tv64 < next_event.tv64) { + next_event.tv64 = td->evtdev->next_event.tv64; + next_cpu = cpu; + } + } + + /* + * Remove the current cpu from the pending mask. The event is + * delivered immediately in tick_do_broadcast() ! + */ + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask); + + /* Take care of enforced broadcast requests */ + cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); + cpumask_clear(tick_broadcast_force_mask); + + /* + * Sanity check. Catch the case where we try to broadcast to + * offline cpus. + */ + if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) + cpumask_and(tmpmask, tmpmask, cpu_online_mask); + + /* + * Wakeup the cpus which have an expired event. + */ + tick_do_broadcast(tmpmask); + + /* + * Two reasons for reprogram: + * + * - The global event did not expire any CPU local + * events. This happens in dyntick mode, as the maximum PIT + * delta is quite small. + * + * - There are pending events on sleeping CPUs which were not + * in the event mask + */ + if (next_event.tv64 != KTIME_MAX) { + /* + * Rearm the broadcast device. If event expired, + * repeat the above + */ + if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) + goto again; + } + raw_spin_unlock(&tick_broadcast_lock); +} + +static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) +{ + if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return 0; + if (bc->next_event.tv64 == KTIME_MAX) + return 0; + return bc->bound_on == cpu ? -EBUSY : 0; +} + +static void broadcast_shutdown_local(struct clock_event_device *bc, + struct clock_event_device *dev) +{ + /* + * For hrtimer based broadcasting we cannot shutdown the cpu + * local device if our own event is the first one to expire or + * if we own the broadcast timer. + */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { + if (broadcast_needs_cpu(bc, smp_processor_id())) + return; + if (dev->next_event.tv64 < bc->next_event.tv64) + return; + } + clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); +} + +/** + * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode + * @state: The target state (enter/exit) + * + * The system enters/leaves a state, where affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. + */ +int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + int cpu, ret = 0; + ktime_t now; + + /* + * Periodic mode does not care about the enter/exit of power + * states + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + return 0; + + /* + * We are called with preemtion disabled from the depth of the + * idle code, so we can't be moved away. + */ + td = this_cpu_ptr(&tick_cpu_device); + dev = td->evtdev; + + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + raw_spin_lock(&tick_broadcast_lock); + bc = tick_broadcast_device.evtdev; + cpu = smp_processor_id(); + + if (state == TICK_BROADCAST_ENTER) { + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { + WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + broadcast_shutdown_local(bc, dev); + /* + * We only reprogram the broadcast timer if we + * did not mark ourself in the force mask and + * if the cpu local event is earlier than the + * broadcast event. If the current CPU is in + * the force mask, then we are going to be + * woken by the IPI right away. + */ + if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && + dev->next_event.tv64 < bc->next_event.tv64) + tick_broadcast_set_event(bc, cpu, dev->next_event, 1); + } + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we remove the + * CPU from the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + } else { + if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + /* + * The cpu which was handling the broadcast + * timer marked this cpu in the broadcast + * pending mask and fired the broadcast + * IPI. So we are going to handle the expired + * event anyway via the broadcast IPI + * handler. No need to reprogram the timer + * with an already expired event. + */ + if (cpumask_test_and_clear_cpu(cpu, + tick_broadcast_pending_mask)) + goto out; + + /* + * Bail out if there is no next event. + */ + if (dev->next_event.tv64 == KTIME_MAX) + goto out; + /* + * If the pending bit is not set, then we are + * either the CPU handling the broadcast + * interrupt or we got woken by something else. + * + * We are not longer in the broadcast mask, so + * if the cpu local expiry time is already + * reached, we would reprogram the cpu local + * timer with an already expired event. + * + * This can lead to a ping-pong when we return + * to idle and therefor rearm the broadcast + * timer before the cpu local timer was able + * to fire. This happens because the forced + * reprogramming makes sure that the event + * will happen in the future and depending on + * the min_delta setting this might be far + * enough out that the ping-pong starts. + * + * If the cpu local next_event has expired + * then we know that the broadcast timer + * next_event has expired as well and + * broadcast is about to be handled. So we + * avoid reprogramming and enforce that the + * broadcast handler, which did not run yet, + * will invoke the cpu local handler. + * + * We cannot call the handler directly from + * here, because we might be in a NOHZ phase + * and we did not go through the irq_enter() + * nohz fixups. + */ + now = ktime_get(); + if (dev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, tick_broadcast_force_mask); + goto out; + } + /* + * We got woken by something else. Reprogram + * the cpu local timer device. + */ + tick_program_event(dev->next_event, 1); + } + } +out: + raw_spin_unlock(&tick_broadcast_lock); + return ret; +} +EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); + +/* + * Reset the one shot broadcast for a cpu + * + * Called with tick_broadcast_lock held + */ +static void tick_broadcast_clear_oneshot(int cpu) +{ + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); +} + +static void tick_broadcast_init_next_event(struct cpumask *mask, + ktime_t expires) +{ + struct tick_device *td; + int cpu; + + for_each_cpu(cpu, mask) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev) + td->evtdev->next_event = expires; + } +} + +/** + * tick_broadcast_setup_oneshot - setup the broadcast device + */ +void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + int cpu = smp_processor_id(); + + /* Set it up only once ! */ + if (bc->event_handler != tick_handle_oneshot_broadcast) { + int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; + + bc->event_handler = tick_handle_oneshot_broadcast; + + /* + * We must be careful here. There might be other CPUs + * waiting for periodic broadcast. We need to set the + * oneshot_mask bits for those and program the + * broadcast device to fire. + */ + cpumask_copy(tmpmask, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tmpmask); + cpumask_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, tmpmask); + + if (was_periodic && !cpumask_empty(tmpmask)) { + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + tick_broadcast_init_next_event(tmpmask, + tick_next_period); + tick_broadcast_set_event(bc, cpu, tick_next_period, 1); + } else + bc->next_event.tv64 = KTIME_MAX; + } else { + /* + * The first cpu which switches to oneshot mode sets + * the bit for all other cpus which are in the general + * (periodic) broadcast mask. So the bit is set and + * would prevent the first broadcast enter after this + * to program the bc device. + */ + tick_broadcast_clear_oneshot(cpu); + } +} + +/* + * Select oneshot operating mode for the broadcast device + */ +void tick_broadcast_switch_to_oneshot(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; + bc = tick_broadcast_device.evtdev; + if (bc) + tick_broadcast_setup_oneshot(bc); + + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +#ifdef CONFIG_HOTPLUG_CPU +void hotplug_cpu__broadcast_tick_pull(int deadcpu) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + bc = tick_broadcast_device.evtdev; + + if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* This moves the broadcast assignment to this CPU: */ + clockevents_program_event(bc, bc->next_event, 1); + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * Remove a dead CPU from broadcasting + */ +void tick_shutdown_broadcast_oneshot(unsigned int cpu) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Clear the broadcast masks for the dead cpu, but do not stop + * the broadcast device! + */ + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); + cpumask_clear_cpu(cpu, tick_broadcast_force_mask); + + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} +#endif + +/* + * Check, whether the broadcast device is in one shot mode + */ +int tick_broadcast_oneshot_active(void) +{ + return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; +} + +/* + * Check whether the broadcast device supports oneshot. + */ +bool tick_broadcast_oneshot_available(void) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; +} + +#endif + +void __init tick_broadcast_init(void) +{ + zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT); + zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); +#ifdef CONFIG_TICK_ONESHOT + zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); + zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); +#endif +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c new file mode 100644 index 000000000..3ae6afa1e --- /dev/null +++ b/kernel/time/tick-common.c @@ -0,0 +1,498 @@ +/* + * linux/kernel/time/tick-common.c + * + * This file contains the base functions to manage periodic tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/module.h> + +#include <asm/irq_regs.h> + +#include "tick-internal.h" + +/* + * Tick devices + */ +DEFINE_PER_CPU(struct tick_device, tick_cpu_device); +/* + * Tick next event: keeps track of the tick time + */ +ktime_t tick_next_period; +ktime_t tick_period; + +/* + * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR + * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This + * variable has two functions: + * + * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the + * timekeeping lock all at once. Only the CPU which is assigned to do the + * update is handling it. + * + * 2) Hand off the duty in the NOHZ idle case by setting the value to + * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks + * at it will take over and keep the time keeping alive. The handover + * procedure also covers cpu hotplug. + */ +int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_device(int cpu) +{ + return &per_cpu(tick_cpu_device, cpu); +} + +/** + * tick_is_oneshot_available - check for a oneshot capable event device + */ +int tick_is_oneshot_available(void) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return 0; + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 1; + return tick_broadcast_oneshot_available(); +} + +/* + * Periodic tick + */ +static void tick_periodic(int cpu) +{ + if (tick_do_timer_cpu == cpu) { + write_seqlock(&jiffies_lock); + + /* Keep track of the next tick event */ + tick_next_period = ktime_add(tick_next_period, tick_period); + + do_timer(1); + write_sequnlock(&jiffies_lock); + update_wall_time(); + } + + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); +} + +/* + * Event handler for periodic ticks + */ +void tick_handle_periodic(struct clock_event_device *dev) +{ + int cpu = smp_processor_id(); + ktime_t next = dev->next_event; + + tick_periodic(cpu); + + if (dev->state != CLOCK_EVT_STATE_ONESHOT) + return; + for (;;) { + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + next = ktime_add(next, tick_period); + + if (!clockevents_program_event(dev, next, false)) + return; + /* + * Have to be careful here. If we're in oneshot mode, + * before we call tick_periodic() in a loop, we need + * to be sure we're using a real hardware clocksource. + * Otherwise we could get trapped in an infinite + * loop, as the tick_periodic() increments jiffies, + * which then will increment time, possibly causing + * the loop to trigger again and again. + */ + if (timekeeping_valid_for_hres()) + tick_periodic(cpu); + } +} + +/* + * Setup the device for a periodic tick + */ +void tick_setup_periodic(struct clock_event_device *dev, int broadcast) +{ + tick_set_periodic_handler(dev, broadcast); + + /* Broadcast setup ? */ + if (!tick_device_is_functional(dev)) + return; + + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !tick_broadcast_oneshot_active()) { + clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); + } else { + unsigned long seq; + ktime_t next; + + do { + seq = read_seqbegin(&jiffies_lock); + next = tick_next_period; + } while (read_seqretry(&jiffies_lock, seq)); + + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + + for (;;) { + if (!clockevents_program_event(dev, next, false)) + return; + next = ktime_add(next, tick_period); + } + } +} + +/* + * Setup the tick device + */ +static void tick_setup_device(struct tick_device *td, + struct clock_event_device *newdev, int cpu, + const struct cpumask *cpumask) +{ + ktime_t next_event; + void (*handler)(struct clock_event_device *) = NULL; + + /* + * First device setup ? + */ + if (!td->evtdev) { + /* + * If no cpu took the do_timer update, assign it to + * this cpu: + */ + if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { + if (!tick_nohz_full_cpu(cpu)) + tick_do_timer_cpu = cpu; + else + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + tick_next_period = ktime_get(); + tick_period = ktime_set(0, NSEC_PER_SEC / HZ); + } + + /* + * Startup in periodic mode first. + */ + td->mode = TICKDEV_MODE_PERIODIC; + } else { + handler = td->evtdev->event_handler; + next_event = td->evtdev->next_event; + td->evtdev->event_handler = clockevents_handle_noop; + } + + td->evtdev = newdev; + + /* + * When the device is not per cpu, pin the interrupt to the + * current cpu: + */ + if (!cpumask_equal(newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); + + /* + * When global broadcasting is active, check if the current + * device is registered as a placeholder for broadcast mode. + * This allows us to handle this x86 misfeature in a generic + * way. This function also returns !=0 when we keep the + * current active broadcast state for this CPU. + */ + if (tick_device_uses_broadcast(newdev, cpu)) + return; + + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(newdev, 0); + else + tick_setup_oneshot(newdev, handler, next_event); +} + +void tick_install_replacement(struct clock_event_device *newdev) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + int cpu = smp_processor_id(); + + clockevents_exchange_device(td->evtdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); +} + +static bool tick_check_percpu(struct clock_event_device *curdev, + struct clock_event_device *newdev, int cpu) +{ + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + return false; + if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) + return true; + /* Check if irq affinity can be set */ + if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) + return false; + /* Prefer an existing cpu local device */ + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) + return false; + return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + /* Prefer oneshot capable device */ + if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { + if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) + return false; + if (tick_oneshot_mode_active()) + return false; + } + + /* + * Use the higher rated one, but prefer a CPU local device with a lower + * rating than a non-CPU local device + */ + return !curdev || + newdev->rating > curdev->rating || + !cpumask_equal(curdev->cpumask, newdev->cpumask); +} + +/* + * Check whether the new device is a better fit than curdev. curdev + * can be NULL ! + */ +bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev) +{ + if (!tick_check_percpu(curdev, newdev, smp_processor_id())) + return false; + + return tick_check_preferred(curdev, newdev); +} + +/* + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled. + */ +void tick_check_new_device(struct clock_event_device *newdev) +{ + struct clock_event_device *curdev; + struct tick_device *td; + int cpu; + + cpu = smp_processor_id(); + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + goto out_bc; + + td = &per_cpu(tick_cpu_device, cpu); + curdev = td->evtdev; + + /* cpu local device ? */ + if (!tick_check_percpu(curdev, newdev, cpu)) + goto out_bc; + + /* Preference decision */ + if (!tick_check_preferred(curdev, newdev)) + goto out_bc; + + if (!try_module_get(newdev->owner)) + return; + + /* + * Replace the eventually existing device by the new + * device. If the current device is the broadcast device, do + * not give it back to the clockevents layer ! + */ + if (tick_is_broadcast_device(curdev)) { + clockevents_shutdown(curdev); + curdev = NULL; + } + clockevents_exchange_device(curdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); + return; + +out_bc: + /* + * Can the new device be used as a broadcast device ? + */ + tick_install_broadcast_device(newdev); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Transfer the do_timer job away from a dying cpu. + * + * Called with interrupts disabled. Not locking required. If + * tick_do_timer_cpu is owned by this cpu, nothing can change it. + */ +void tick_handover_do_timer(void) +{ + if (tick_do_timer_cpu == smp_processor_id()) { + int cpu = cpumask_first(cpu_online_mask); + + tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : + TICK_DO_TIMER_NONE; + } +} + +/* + * Shutdown an event device on a given cpu: + * + * This is called on a life CPU, when a CPU is dead. So we cannot + * access the hardware device itself. + * We just set the mode and remove it from the lists. + */ +void tick_shutdown(unsigned int cpu) +{ + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + struct clock_event_device *dev = td->evtdev; + + td->mode = TICKDEV_MODE_PERIODIC; + if (dev) { + /* + * Prevent that the clock events layer tries to call + * the set mode function! + */ + dev->state = CLOCK_EVT_STATE_DETACHED; + dev->mode = CLOCK_EVT_MODE_UNUSED; + clockevents_exchange_device(dev, NULL); + dev->event_handler = clockevents_handle_noop; + td->evtdev = NULL; + } +} +#endif + +/** + * tick_suspend_local - Suspend the local tick device + * + * Called from the local cpu for freeze with interrupts disabled. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_suspend_local(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + clockevents_shutdown(td->evtdev); +} + +/** + * tick_resume_local - Resume the local tick device + * + * Called from the local CPU for unfreeze or XEN resume magic. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_resume_local(void) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + bool broadcast = tick_resume_check_broadcast(); + + clockevents_tick_resume(td->evtdev); + if (!broadcast) { + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(td->evtdev, 0); + else + tick_resume_oneshot(); + } +} + +/** + * tick_suspend - Suspend the tick and the broadcast device + * + * Called from syscore_suspend() via timekeeping_suspend with only one + * CPU online and interrupts disabled or from tick_unfreeze() under + * tick_freeze_lock. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_suspend(void) +{ + tick_suspend_local(); + tick_suspend_broadcast(); +} + +/** + * tick_resume - Resume the tick and the broadcast device + * + * Called from syscore_resume() via timekeeping_resume with only one + * CPU online and interrupts disabled. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_resume(void) +{ + tick_resume_broadcast(); + tick_resume_local(); +} + +static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static unsigned int tick_freeze_depth; + +/** + * tick_freeze - Suspend the local tick and (possibly) timekeeping. + * + * Check if this is the last online CPU executing the function and if so, + * suspend timekeeping. Otherwise suspend the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_unfreeze(). + * Interrupts must not be enabled before the subsequent %tick_unfreeze(). + */ +void tick_freeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + tick_freeze_depth++; + if (tick_freeze_depth == num_online_cpus()) + timekeeping_suspend(); + else + tick_suspend_local(); + + raw_spin_unlock(&tick_freeze_lock); +} + +/** + * tick_unfreeze - Resume the local tick and (possibly) timekeeping. + * + * Check if this is the first CPU executing the function and if so, resume + * timekeeping. Otherwise resume the local tick. + * + * Call with interrupts disabled. Must be balanced with %tick_freeze(). + * Interrupts must not be enabled after the preceding %tick_freeze(). + */ +void tick_unfreeze(void) +{ + raw_spin_lock(&tick_freeze_lock); + + if (tick_freeze_depth == num_online_cpus()) + timekeeping_resume(); + else + tick_resume_local(); + + tick_freeze_depth--; + + raw_spin_unlock(&tick_freeze_lock); +} + +/** + * tick_init - initialize the tick control + */ +void __init tick_init(void) +{ + tick_broadcast_init(); + tick_nohz_init(); +} diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h new file mode 100644 index 000000000..b64fdd805 --- /dev/null +++ b/kernel/time/tick-internal.h @@ -0,0 +1,139 @@ +/* + * tick internal variable and functions used by low/high res code + */ +#include <linux/hrtimer.h> +#include <linux/tick.h> + +#include "timekeeping.h" +#include "tick-sched.h" + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + +# define TICK_DO_TIMER_NONE -1 +# define TICK_DO_TIMER_BOOT -2 + +DECLARE_PER_CPU(struct tick_device, tick_cpu_device); +extern ktime_t tick_next_period; +extern ktime_t tick_period; +extern int tick_do_timer_cpu __read_mostly; + +extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); +extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_shutdown(unsigned int cpu); +extern void tick_suspend(void); +extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, + struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev); +extern int tick_is_oneshot_available(void); +extern struct tick_device *tick_get_device(int cpu); + +extern int clockevents_tick_resume(struct clock_event_device *dev); +/* Check, if the device is functional or a dummy for broadcast */ +static inline int tick_device_is_functional(struct clock_event_device *dev) +{ + return !(dev->features & CLOCK_EVT_FEAT_DUMMY); +} + +extern void clockevents_shutdown(struct clock_event_device *dev); +extern void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new); +extern void clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state); +extern int clockevents_program_event(struct clock_event_device *dev, + ktime_t expires, bool force); +extern void clockevents_handle_noop(struct clock_event_device *dev); +extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); + +/* Broadcasting support */ +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); +extern void tick_install_broadcast_device(struct clock_event_device *dev); +extern int tick_is_broadcast_device(struct clock_event_device *dev); +extern void tick_shutdown_broadcast(unsigned int cpu); +extern void tick_suspend_broadcast(void); +extern void tick_resume_broadcast(void); +extern bool tick_resume_check_broadcast(void); +extern void tick_broadcast_init(void); +extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); +extern struct tick_device *tick_get_broadcast_device(void); +extern struct cpumask *tick_get_broadcast_mask(void); +# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } +static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } +static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } +static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } +static inline void tick_shutdown_broadcast(unsigned int cpu) { } +static inline void tick_suspend_broadcast(void) { } +static inline void tick_resume_broadcast(void) { } +static inline bool tick_resume_check_broadcast(void) { return false; } +static inline void tick_broadcast_init(void) { } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } + +/* Set the periodic handler in non broadcast mode */ +static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) +{ + dev->event_handler = tick_handle_periodic; +} +# endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ + +#else /* !GENERIC_CLOCKEVENTS: */ +static inline void tick_suspend(void) { } +static inline void tick_resume(void) { } +#endif /* !GENERIC_CLOCKEVENTS */ + +/* Oneshot related functions */ +#ifdef CONFIG_TICK_ONESHOT +extern void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt); +extern int tick_program_event(ktime_t expires, int force); +extern void tick_oneshot_notify(void); +extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); +extern void tick_resume_oneshot(void); +static inline bool tick_oneshot_possible(void) { return true; } +extern int tick_oneshot_mode_active(void); +extern void tick_clock_notify(void); +extern int tick_check_oneshot_change(int allow_nohz); +extern int tick_init_highres(void); +#else /* !CONFIG_TICK_ONESHOT: */ +static inline +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt) { BUG(); } +static inline void tick_resume_oneshot(void) { BUG(); } +static inline int tick_program_event(ktime_t expires, int force) { return 0; } +static inline void tick_oneshot_notify(void) { } +static inline bool tick_oneshot_possible(void) { return false; } +static inline int tick_oneshot_mode_active(void) { return 0; } +static inline void tick_clock_notify(void) { } +static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +#endif /* !CONFIG_TICK_ONESHOT */ + +/* Functions related to oneshot broadcasting */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) +extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); +extern void tick_broadcast_switch_to_oneshot(void); +extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); +extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast_this_cpu(void); +bool tick_broadcast_oneshot_available(void); +extern struct cpumask *tick_get_broadcast_oneshot_mask(void); +#else /* !(BROADCAST && ONESHOT): */ +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } +static inline void tick_broadcast_switch_to_oneshot(void) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } +static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast_this_cpu(void) { } +static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } +#endif /* !(BROADCAST && ONESHOT) */ + +/* NO_HZ_FULL internal */ +#ifdef CONFIG_NO_HZ_FULL +extern void tick_nohz_init(void); +# else +static inline void tick_nohz_init(void) { } +#endif diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c new file mode 100644 index 000000000..67a64b167 --- /dev/null +++ b/kernel/time/tick-oneshot.c @@ -0,0 +1,116 @@ +/* + * linux/kernel/time/tick-oneshot.c + * + * This file contains functions which manage high resolution tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> + +#include "tick-internal.h" + +/** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + + return clockevents_program_event(dev, expires, force); +} + +/** + * tick_resume_onshot - resume oneshot mode + */ +void tick_resume_oneshot(void) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_program_event(dev, ktime_get(), true); +} + +/** + * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + */ +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t next_event) +{ + newdev->event_handler = handler; + clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); + clockevents_program_event(newdev, next_event, true); +} + +/** + * tick_switch_to_oneshot - switch to oneshot mode + */ +int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + struct clock_event_device *dev = td->evtdev; + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || + !tick_device_is_functional(dev)) { + + printk(KERN_INFO "Clockevents: " + "could not switch to one-shot mode:"); + if (!dev) { + printk(" no tick device\n"); + } else { + if (!tick_device_is_functional(dev)) + printk(" %s is not functional.\n", dev->name); + else + printk(" %s does not support one-shot mode.\n", + dev->name); + } + return -EINVAL; + } + + td->mode = TICKDEV_MODE_ONESHOT; + dev->event_handler = handler; + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + tick_broadcast_switch_to_oneshot(); + return 0; +} + +/** + * tick_check_oneshot_mode - check whether the system is in oneshot mode + * + * returns 1 when either nohz or highres are enabled. otherwise 0. + */ +int tick_oneshot_mode_active(void) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; + local_irq_restore(flags); + + return ret; +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * tick_init_highres - switch to high resolution mode + * + * Called with interrupts disabled. + */ +int tick_init_highres(void) +{ + return tick_switch_to_oneshot(hrtimer_interrupt); +} +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c new file mode 100644 index 000000000..914259128 --- /dev/null +++ b/kernel/time/tick-sched.c @@ -0,0 +1,1250 @@ +/* + * linux/kernel/time/tick-sched.c + * + * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * No idle tick implementation for low and high resolution timers + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * Distribute under GPLv2. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/irq_work.h> +#include <linux/posix-timers.h> +#include <linux/perf_event.h> +#include <linux/context_tracking.h> + +#include <asm/irq_regs.h> + +#include "tick-internal.h" + +#include <trace/events/timer.h> + +/* + * Per cpu nohz control structure + */ +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); + +/* + * The time, when the last jiffy update happened. Protected by jiffies_lock. + */ +static ktime_t last_jiffies_update; + +struct tick_sched *tick_get_tick_sched(int cpu) +{ + return &per_cpu(tick_cpu_sched, cpu); +} + +/* + * Must be called with interrupts disabled ! + */ +static void tick_do_update_jiffies64(ktime_t now) +{ + unsigned long ticks = 0; + ktime_t delta; + + /* + * Do a quick check without holding jiffies_lock: + */ + delta = ktime_sub(now, last_jiffies_update); + if (delta.tv64 < tick_period.tv64) + return; + + /* Reevalute with jiffies_lock held */ + write_seqlock(&jiffies_lock); + + delta = ktime_sub(now, last_jiffies_update); + if (delta.tv64 >= tick_period.tv64) { + + delta = ktime_sub(delta, tick_period); + last_jiffies_update = ktime_add(last_jiffies_update, + tick_period); + + /* Slow path for long timeouts */ + if (unlikely(delta.tv64 >= tick_period.tv64)) { + s64 incr = ktime_to_ns(tick_period); + + ticks = ktime_divns(delta, incr); + + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); + } + do_timer(++ticks); + + /* Keep the tick_next_period variable up to date */ + tick_next_period = ktime_add(last_jiffies_update, tick_period); + } else { + write_sequnlock(&jiffies_lock); + return; + } + write_sequnlock(&jiffies_lock); + update_wall_time(); +} + +/* + * Initialize and return retrieve the jiffies update. + */ +static ktime_t tick_init_jiffy_update(void) +{ + ktime_t period; + + write_seqlock(&jiffies_lock); + /* Did we start the jiffies update yet ? */ + if (last_jiffies_update.tv64 == 0) + last_jiffies_update = tick_next_period; + period = last_jiffies_update; + write_sequnlock(&jiffies_lock); + return period; +} + + +static void tick_sched_do_timer(ktime_t now) +{ + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ_COMMON + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * jiffies_lock. + */ + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) + && !tick_nohz_full_cpu(cpu)) + tick_do_timer_cpu = cpu; +#endif + + /* Check, if the jiffies need an update */ + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); +} + +static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) +{ +#ifdef CONFIG_NO_HZ_COMMON + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on complete idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too much ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + if (is_idle_task(current)) + ts->idle_jiffies++; + } +#endif + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); +} + +#ifdef CONFIG_NO_HZ_FULL +cpumask_var_t tick_nohz_full_mask; +cpumask_var_t housekeeping_mask; +bool tick_nohz_full_running; + +static bool can_stop_full_tick(void) +{ + WARN_ON_ONCE(!irqs_disabled()); + + if (!sched_can_stop_tick()) { + trace_tick_stop(0, "more than 1 task in runqueue\n"); + return false; + } + + if (!posix_cpu_timers_can_stop_tick(current)) { + trace_tick_stop(0, "posix timers running\n"); + return false; + } + + if (!perf_event_can_stop_tick()) { + trace_tick_stop(0, "perf events running\n"); + return false; + } + + /* sched_clock_tick() needs us? */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + /* + * TODO: kick full dynticks CPUs when + * sched_clock_stable is set. + */ + if (!sched_clock_stable()) { + trace_tick_stop(0, "unstable sched clock\n"); + /* + * Don't allow the user to think they can get + * full NO_HZ with this machine. + */ + WARN_ONCE(tick_nohz_full_running, + "NO_HZ FULL will not work with unstable sched clock"); + return false; + } +#endif + + return true; +} + +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); + +/* + * Re-evaluate the need for the tick on the current CPU + * and restart it if necessary. + */ +void __tick_nohz_full_check(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (tick_nohz_full_cpu(smp_processor_id())) { + if (ts->tick_stopped && !is_idle_task(current)) { + if (!can_stop_full_tick()) + tick_nohz_restart_sched_tick(ts, ktime_get()); + } + } +} + +static void nohz_full_kick_work_func(struct irq_work *work) +{ + __tick_nohz_full_check(); +} + +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { + .func = nohz_full_kick_work_func, +}; + +/* + * Kick this CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), + * is NMI safe. + */ +void tick_nohz_full_kick(void) +{ + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); +} + +/* + * Kick the CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_cpu(int cpu) +{ + if (!tick_nohz_full_cpu(cpu)) + return; + + irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); +} + +static void nohz_full_kick_ipi(void *info) +{ + __tick_nohz_full_check(); +} + +/* + * Kick all full dynticks CPUs in order to force these to re-evaluate + * their dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_all(void) +{ + if (!tick_nohz_full_running) + return; + + preempt_disable(); + smp_call_function_many(tick_nohz_full_mask, + nohz_full_kick_ipi, NULL, false); + tick_nohz_full_kick(); + preempt_enable(); +} + +/* + * Re-evaluate the need for the tick as we switch the current task. + * It might need the tick due to per task/process properties: + * perf events, posix cpu timers, ... + */ +void __tick_nohz_task_switch(struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + + if (!tick_nohz_full_cpu(smp_processor_id())) + goto out; + + if (tick_nohz_tick_stopped() && !can_stop_full_tick()) + tick_nohz_full_kick(); + +out: + local_irq_restore(flags); +} + +/* Parse the boot-time nohz CPU list from the kernel parameters. */ +static int __init tick_nohz_full_setup(char *str) +{ + alloc_bootmem_cpumask_var(&tick_nohz_full_mask); + if (cpulist_parse(str, tick_nohz_full_mask) < 0) { + pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + free_bootmem_cpumask_var(tick_nohz_full_mask); + return 1; + } + tick_nohz_full_running = true; + + return 1; +} +__setup("nohz_full=", tick_nohz_full_setup); + +static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* + * If we handle the timekeeping duty for full dynticks CPUs, + * we can't safely shutdown that CPU. + */ + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + return NOTIFY_BAD; + break; + } + return NOTIFY_OK; +} + +static int tick_nohz_init_all(void) +{ + int err = -1; + +#ifdef CONFIG_NO_HZ_FULL_ALL + if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { + WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); + return err; + } + err = 0; + cpumask_setall(tick_nohz_full_mask); + tick_nohz_full_running = true; +#endif + return err; +} + +void __init tick_nohz_init(void) +{ + int cpu; + + if (!tick_nohz_full_running) { + if (tick_nohz_init_all() < 0) + return; + } + + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); + cpumask_clear(tick_nohz_full_mask); + tick_nohz_full_running = false; + return; + } + + /* + * Full dynticks uses irq work to drive the tick rescheduling on safe + * locking contexts. But then we need irq work to raise its own + * interrupts to avoid circular dependency on the tick + */ + if (!arch_irq_work_has_interrupt()) { + pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " + "support irq work self-IPIs\n"); + cpumask_clear(tick_nohz_full_mask); + cpumask_copy(housekeeping_mask, cpu_possible_mask); + tick_nohz_full_running = false; + return; + } + + cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } + + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); + + for_each_cpu(cpu, tick_nohz_full_mask) + context_tracking_cpu_set(cpu); + + cpu_notifier(tick_nohz_cpu_down_callback, 0); + pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", + cpumask_pr_args(tick_nohz_full_mask)); +} +#endif + +/* + * NOHZ - aka dynamic tick functionality + */ +#ifdef CONFIG_NO_HZ_COMMON +/* + * NO HZ enabled ? + */ +static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_active __read_mostly; +/* + * Enable / Disable tickless mode + */ +static int __init setup_tick_nohz(char *str) +{ + if (!strcmp(str, "off")) + tick_nohz_enabled = 0; + else if (!strcmp(str, "on")) + tick_nohz_enabled = 1; + else + return 0; + return 1; +} + +__setup("nohz=", setup_tick_nohz); + +int tick_nohz_tick_stopped(void) +{ + return __this_cpu_read(tick_cpu_sched.tick_stopped); +} + +/** + * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * + * Called from interrupt entry when the CPU was idle + * + * In case the sched_tick was stopped on this CPU, we have to check if jiffies + * must be updated. Otherwise an interrupt handler could use a stale jiffy + * value. We do this unconditionally on any cpu, as we don't know whether the + * cpu, which has the update task assigned is in a long sleep. + */ +static void tick_nohz_update_jiffies(ktime_t now) +{ + unsigned long flags; + + __this_cpu_write(tick_cpu_sched.idle_waketime, now); + + local_irq_save(flags); + tick_do_update_jiffies64(now); + local_irq_restore(flags); + + touch_softlockup_watchdog(); +} + +/* + * Updates the per cpu time idle statistics counters + */ +static void +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) +{ + ktime_t delta; + + if (ts->idle_active) { + delta = ktime_sub(now, ts->idle_entrytime); + if (nr_iowait_cpu(cpu) > 0) + ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + else + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + ts->idle_entrytime = now; + } + + if (last_update_time) + *last_update_time = ktime_to_us(now); + +} + +static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) +{ + update_ts_time_stats(smp_processor_id(), ts, now, NULL); + ts->idle_active = 0; + + sched_clock_idle_wakeup_event(0); +} + +static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +{ + ktime_t now = ktime_get(); + + ts->idle_entrytime = now; + ts->idle_active = 1; + sched_clock_idle_sleep_event(); + return now; +} + +/** + * get_cpu_idle_time_us - get the total idle time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. + * + * Return the cummulative idle time (since boot) for a given + * CPU, in microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ +u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now, idle; + + if (!tick_nohz_active) + return -1; + + now = ktime_get(); + if (last_update_time) { + update_ts_time_stats(cpu, ts, now, last_update_time); + idle = ts->idle_sleeptime; + } else { + if (ts->idle_active && !nr_iowait_cpu(cpu)) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); + + idle = ktime_add(ts->idle_sleeptime, delta); + } else { + idle = ts->idle_sleeptime; + } + } + + return ktime_to_us(idle); + +} +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); + +/** + * get_cpu_iowait_time_us - get the total iowait time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. + * + * Return the cummulative iowait time (since boot) for a given + * CPU, in microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ +u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now, iowait; + + if (!tick_nohz_active) + return -1; + + now = ktime_get(); + if (last_update_time) { + update_ts_time_stats(cpu, ts, now, last_update_time); + iowait = ts->iowait_sleeptime; + } else { + if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { + ktime_t delta = ktime_sub(now, ts->idle_entrytime); + + iowait = ktime_add(ts->iowait_sleeptime, delta); + } else { + iowait = ts->iowait_sleeptime; + } + } + + return ktime_to_us(iowait); +} +EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); + +static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, + ktime_t now, int cpu) +{ + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + ktime_t last_update, expires, ret = { .tv64 = 0 }; + unsigned long rcu_delta_jiffies; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + u64 time_delta; + + time_delta = timekeeping_max_deferment(); + + /* Read jiffies and the time when jiffies were updated last */ + do { + seq = read_seqbegin(&jiffies_lock); + last_update = last_jiffies_update; + last_jiffies = jiffies; + } while (read_seqretry(&jiffies_lock, seq)); + + if (rcu_needs_cpu(&rcu_delta_jiffies) || + arch_needs_cpu() || irq_work_needs_cpu()) { + next_jiffies = last_jiffies + 1; + delta_jiffies = 1; + } else { + /* Get the next timer wheel timer */ + next_jiffies = get_next_timer_interrupt(last_jiffies); + delta_jiffies = next_jiffies - last_jiffies; + if (rcu_delta_jiffies < delta_jiffies) { + next_jiffies = last_jiffies + rcu_delta_jiffies; + delta_jiffies = rcu_delta_jiffies; + } + } + + /* + * Do not stop the tick, if we are only one off (or less) + * or if the cpu is required for RCU: + */ + if (!ts->tick_stopped && delta_jiffies <= 1) + goto out; + + /* Schedule the tick, if we are at least one jiffie off */ + if ((long)delta_jiffies >= 1) { + + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. Keep track of the fact that it was the one + * which had the do_timer() duty last. If this cpu is + * the one which had the do_timer() duty last, we + * limit the sleep time to the timekeeping + * max_deferement value which we retrieved + * above. Otherwise we can sleep as long as we want. + */ + if (cpu == tick_do_timer_cpu) { + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + ts->do_timer_last = 1; + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + time_delta = KTIME_MAX; + ts->do_timer_last = 0; + } else if (!ts->do_timer_last) { + time_delta = KTIME_MAX; + } + +#ifdef CONFIG_NO_HZ_FULL + if (!ts->inidle) { + time_delta = min(time_delta, + scheduler_tick_max_deferment()); + } +#endif + + /* + * calculate the expiry time for the next timer wheel + * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals + * that there is no timer pending or at least extremely + * far into the future (12 days for HZ=1000). In this + * case we set the expiry to the end of time. + */ + if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { + /* + * Calculate the time delta for the next timer event. + * If the time delta exceeds the maximum time delta + * permitted by the current clocksource then adjust + * the time delta accordingly to ensure the + * clocksource does not wrap. + */ + time_delta = min_t(u64, time_delta, + tick_period.tv64 * delta_jiffies); + } + + if (time_delta < KTIME_MAX) + expires = ktime_add_ns(last_update, time_delta); + else + expires.tv64 = KTIME_MAX; + + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) + goto out; + + ret = expires; + + /* + * nohz_stop_sched_tick can be called several times before + * the nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + nohz_balance_enter_idle(cpu); + calc_load_enter_idle(); + + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); + ts->tick_stopped = 1; + trace_tick_stop(1, " "); + } + + /* + * If the expiration time == KTIME_MAX, then + * in this case we simply stop the tick timer. + */ + if (unlikely(expires.tv64 == KTIME_MAX)) { + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_cancel(&ts->sched_timer); + goto out; + } + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, expires, + HRTIMER_MODE_ABS_PINNED); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + goto out; + } else if (!tick_program_event(expires, 0)) + goto out; + /* + * We are past the event already. So we crossed a + * jiffie boundary. Update jiffies and raise the + * softirq. + */ + tick_do_update_jiffies64(ktime_get()); + } + raise_softirq_irqoff(TIMER_SOFTIRQ); +out: + ts->next_jiffies = next_jiffies; + ts->last_jiffies = last_jiffies; + ts->sleep_length = ktime_sub(dev->next_event, now); + + return ret; +} + +static void tick_nohz_full_stop_tick(struct tick_sched *ts) +{ +#ifdef CONFIG_NO_HZ_FULL + int cpu = smp_processor_id(); + + if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) + return; + + if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) + return; + + if (!can_stop_full_tick()) + return; + + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); +#endif +} + +static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) +{ + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. + */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + return false; + } + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { + ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; + return false; + } + + if (need_resched()) + return false; + + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + static int ratelimit; + + if (ratelimit < 10 && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { + pr_warn("NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); + ratelimit++; + } + return false; + } + + if (tick_nohz_full_enabled()) { + /* + * Keep the tick alive to guarantee timekeeping progression + * if there are full dynticks CPUs around + */ + if (tick_do_timer_cpu == cpu) + return false; + /* + * Boot safety: make sure the timekeeping duty has been + * assigned before entering dyntick-idle mode, + */ + if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) + return false; + } + + return true; +} + +static void __tick_nohz_idle_enter(struct tick_sched *ts) +{ + ktime_t now, expires; + int cpu = smp_processor_id(); + + now = tick_nohz_start_idle(ts); + + if (can_stop_idle_tick(cpu, ts)) { + int was_stopped = ts->tick_stopped; + + ts->idle_calls++; + + expires = tick_nohz_stop_sched_tick(ts, now, cpu); + if (expires.tv64 > 0LL) { + ts->idle_sleeps++; + ts->idle_expires = expires; + } + + if (!was_stopped && ts->tick_stopped) + ts->idle_jiffies = ts->last_jiffies; + } +} + +/** + * tick_nohz_idle_enter - stop the idle tick from the idle task + * + * When the next event is more than a tick into the future, stop the idle tick + * Called when we start the idle loop. + * + * The arch is responsible of calling: + * + * - rcu_idle_enter() after its last use of RCU before the CPU is put + * to sleep. + * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. + */ +void tick_nohz_idle_enter(void) +{ + struct tick_sched *ts; + + WARN_ON_ONCE(irqs_disabled()); + + /* + * Update the idle state in the scheduler domain hierarchy + * when tick_nohz_stop_sched_tick() is called from the idle loop. + * State will be updated to busy during the first busy tick after + * exiting idle. + */ + set_cpu_sd_state_idle(); + + local_irq_disable(); + + ts = this_cpu_ptr(&tick_cpu_sched); + ts->inidle = 1; + __tick_nohz_idle_enter(ts); + + local_irq_enable(); +} + +/** + * tick_nohz_irq_exit - update next tick event from interrupt exit + * + * When an interrupt fires while we are idle and it doesn't cause + * a reschedule, it may still add, modify or delete a timer, enqueue + * an RCU callback, etc... + * So we need to re-calculate and reprogram the next tick event. + */ +void tick_nohz_irq_exit(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (ts->inidle) + __tick_nohz_idle_enter(ts); + else + tick_nohz_full_stop_tick(ts); +} + +/** + * tick_nohz_get_sleep_length - return the length of the current sleep + * + * Called from power state control code with interrupts disabled + */ +ktime_t tick_nohz_get_sleep_length(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->sleep_length; +} + +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start_expires(&ts->sched_timer, + HRTIMER_MODE_ABS_PINNED); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event( + hrtimer_get_expires(&ts->sched_timer), 0)) + break; + } + /* Reread time and update jiffies */ + now = ktime_get(); + tick_do_update_jiffies64(now); + } +} + +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +{ + /* Update jiffies first */ + tick_do_update_jiffies64(now); + update_cpu_load_nohz(); + + calc_load_exit_idle(); + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); +} + +static void tick_nohz_account_idle_ticks(struct tick_sched *ts) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + unsigned long ticks; + + if (vtime_accounting_enabled()) + return; + /* + * We stopped the tick in idle. Update process times would miss the + * time we slept as update_process_times does only a 1 tick + * accounting. Enforce that this is accounted to idle ! + */ + ticks = jiffies - ts->idle_jiffies; + /* + * We might be one off. Do not randomly account a huge number of ticks! + */ + if (ticks && ticks < LONG_MAX) + account_idle_ticks(ticks); +#endif +} + +/** + * tick_nohz_idle_exit - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + * This also exit the RCU extended quiescent state. The CPU + * can use RCU again after this function is called. + */ +void tick_nohz_idle_exit(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + ktime_t now; + + local_irq_disable(); + + WARN_ON_ONCE(!ts->inidle); + + ts->inidle = 0; + + if (ts->idle_active || ts->tick_stopped) + now = ktime_get(); + + if (ts->idle_active) + tick_nohz_stop_idle(ts, now); + + if (ts->tick_stopped) { + tick_nohz_restart_sched_tick(ts, now); + tick_nohz_account_idle_ticks(ts); + } + + local_irq_enable(); +} + +static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) +{ + hrtimer_forward(&ts->sched_timer, now, tick_period); + return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0); +} + +/* + * The nohz low res interrupt handler + */ +static void tick_nohz_handler(struct clock_event_device *dev) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + dev->next_event.tv64 = KTIME_MAX; + + tick_sched_do_timer(now); + tick_sched_handle(ts, regs); + + /* No need to reprogram if we are running tickless */ + if (unlikely(ts->tick_stopped)) + return; + + while (tick_nohz_reprogram(ts, now)) { + now = ktime_get(); + tick_do_update_jiffies64(now); + } +} + +/** + * tick_nohz_switch_to_nohz - switch to nohz mode + */ +static void tick_nohz_switch_to_nohz(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + ktime_t next; + + if (!tick_nohz_enabled) + return; + + local_irq_disable(); + if (tick_switch_to_oneshot(tick_nohz_handler)) { + local_irq_enable(); + return; + } + tick_nohz_active = 1; + ts->nohz_mode = NOHZ_MODE_LOWRES; + + /* + * Recycle the hrtimer in ts, so we can share the + * hrtimer_forward with the highres code. + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + /* Get the next period */ + next = tick_init_jiffy_update(); + + for (;;) { + hrtimer_set_expires(&ts->sched_timer, next); + if (!tick_program_event(next, 0)) + break; + next = ktime_add(next, tick_period); + } + local_irq_enable(); +} + +/* + * When NOHZ is enabled and the tick is stopped, we need to kick the + * tick timer from irq_enter() so that the jiffies update is kept + * alive during long running softirqs. That's ugly as hell, but + * correctness is key even if we need to fix the offending softirq in + * the first place. + * + * Note, this is different to tick_nohz_restart. We just kick the + * timer and do not touch the other magic bits which need to be done + * when idle is left. + */ +static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) +{ +#if 0 + /* Switch back to 2.6.27 behaviour */ + ktime_t delta; + + /* + * Do not touch the tick device, when the next expiry is either + * already reached or less/equal than the tick period. + */ + delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); + if (delta.tv64 <= tick_period.tv64) + return; + + tick_nohz_restart(ts, now); +#endif +} + +static inline void tick_nohz_irq_enter(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + ktime_t now; + + if (!ts->idle_active && !ts->tick_stopped) + return; + now = ktime_get(); + if (ts->idle_active) + tick_nohz_stop_idle(ts, now); + if (ts->tick_stopped) { + tick_nohz_update_jiffies(now); + tick_nohz_kick_tick(ts, now); + } +} + +#else + +static inline void tick_nohz_switch_to_nohz(void) { } +static inline void tick_nohz_irq_enter(void) { } + +#endif /* CONFIG_NO_HZ_COMMON */ + +/* + * Called from irq_enter to notify about the possible interruption of idle() + */ +void tick_irq_enter(void) +{ + tick_check_oneshot_broadcast_this_cpu(); + tick_nohz_irq_enter(); +} + +/* + * High resolution timer specific code + */ +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * We rearm the timer until we get disabled by the idle code. + * Called with interrupts disabled. + */ +static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) +{ + struct tick_sched *ts = + container_of(timer, struct tick_sched, sched_timer); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + + tick_sched_do_timer(now); + + /* + * Do not call, when we are not in irq context and have + * no valid regs pointer + */ + if (regs) + tick_sched_handle(ts, regs); + + /* No need to reprogram if we are in idle or full dynticks mode */ + if (unlikely(ts->tick_stopped)) + return HRTIMER_NORESTART; + + hrtimer_forward(timer, now, tick_period); + + return HRTIMER_RESTART; +} + +static int sched_skew_tick; + +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); + +/** + * tick_setup_sched_timer - setup the tick emulation timer + */ +void tick_setup_sched_timer(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + ktime_t now = ktime_get(); + + /* + * Emulate tick processing via per-CPU hrtimers: + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ts->sched_timer.function = tick_sched_timer; + + /* Get the next period (per cpu) */ + hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + + /* Offset the tick to avert jiffies_lock contention. */ + if (sched_skew_tick) { + u64 offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= smp_processor_id(); + hrtimer_add_expires_ns(&ts->sched_timer, offset); + } + + for (;;) { + hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_start_expires(&ts->sched_timer, + HRTIMER_MODE_ABS_PINNED); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + now = ktime_get(); + } + +#ifdef CONFIG_NO_HZ_COMMON + if (tick_nohz_enabled) { + ts->nohz_mode = NOHZ_MODE_HIGHRES; + tick_nohz_active = 1; + } +#endif +} +#endif /* HIGH_RES_TIMERS */ + +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS +void tick_cancel_sched_timer(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + +# ifdef CONFIG_HIGH_RES_TIMERS + if (ts->sched_timer.base) + hrtimer_cancel(&ts->sched_timer); +# endif + + memset(ts, 0, sizeof(*ts)); +} +#endif + +/** + * Async notification about clocksource changes + */ +void tick_clock_notify(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); +} + +/* + * Async notification about clock event changes + */ +void tick_oneshot_notify(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + set_bit(0, &ts->check_clocks); +} + +/** + * Check, if a change happened, which makes oneshot possible. + * + * Called cyclic from the hrtimer softirq (driven by the timer + * softirq) allow_nohz signals, that we can switch into low-res nohz + * mode, because high resolution timers are disabled (either compile + * or runtime). + */ +int tick_check_oneshot_change(int allow_nohz) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + if (!test_and_clear_bit(0, &ts->check_clocks)) + return 0; + + if (ts->nohz_mode != NOHZ_MODE_INACTIVE) + return 0; + + if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) + return 0; + + if (!allow_nohz) + return 1; + + tick_nohz_switch_to_nohz(); + return 0; +} diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h new file mode 100644 index 000000000..28b5da3e1 --- /dev/null +++ b/kernel/time/tick-sched.h @@ -0,0 +1,74 @@ +#ifndef _TICK_SCHED_H +#define _TICK_SCHED_H + +#include <linux/hrtimer.h> + +enum tick_device_mode { + TICKDEV_MODE_PERIODIC, + TICKDEV_MODE_ONESHOT, +}; + +struct tick_device { + struct clock_event_device *evtdev; + enum tick_device_mode mode; +}; + +enum tick_nohz_mode { + NOHZ_MODE_INACTIVE, + NOHZ_MODE_LOWRES, + NOHZ_MODE_HIGHRES, +}; + +/** + * struct tick_sched - sched tick emulation and no idle tick control/stats + * @sched_timer: hrtimer to schedule the periodic tick in high + * resolution mode + * @last_tick: Store the last tick expiry time when the tick + * timer is modified for nohz sleeps. This is necessary + * to resume the tick timer operation in the timeline + * when the CPU returns from nohz sleep. + * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_jiffies: jiffies at the entry to idle for idle time accounting + * @idle_calls: Total number of idle calls + * @idle_sleeps: Number of idle calls, where the sched tick was stopped + * @idle_entrytime: Time when the idle call was entered + * @idle_waketime: Time when the idle was interrupted + * @idle_exittime: Time when the idle state was left + * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding + * @sleep_length: Duration of the current idle sleep + * @do_timer_lst: CPU was the last one doing do_timer before going idle + */ +struct tick_sched { + struct hrtimer sched_timer; + unsigned long check_clocks; + enum tick_nohz_mode nohz_mode; + ktime_t last_tick; + int inidle; + int tick_stopped; + unsigned long idle_jiffies; + unsigned long idle_calls; + unsigned long idle_sleeps; + int idle_active; + ktime_t idle_entrytime; + ktime_t idle_waketime; + ktime_t idle_exittime; + ktime_t idle_sleeptime; + ktime_t iowait_sleeptime; + ktime_t sleep_length; + unsigned long last_jiffies; + unsigned long next_jiffies; + ktime_t idle_expires; + int do_timer_last; +}; + +extern struct tick_sched *tick_get_tick_sched(int cpu); + +extern void tick_setup_sched_timer(void); +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS +extern void tick_cancel_sched_timer(int cpu); +#else +static inline void tick_cancel_sched_timer(int cpu) { } +#endif + +#endif diff --git a/kernel/time/time.c b/kernel/time/time.c new file mode 100644 index 000000000..2c85b7724 --- /dev/null +++ b/kernel/time/time.c @@ -0,0 +1,785 @@ +/* + * linux/kernel/time.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file contains the interface functions for the various + * time related system calls: time, stime, gettimeofday, settimeofday, + * adjtime + */ +/* + * Modification history kernel/time.c + * + * 1993-09-02 Philip Gladstone + * Created file with time related functions from sched/core.c and adjtimex() + * 1993-10-08 Torsten Duwe + * adjtime interface update and CMOS clock write code + * 1995-08-13 Torsten Duwe + * kernel PLL updated to 1994-12-13 specs (rfc-1589) + * 1999-01-16 Ulrich Windl + * Introduced error checking for many cases in adjtimex(). + * Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) + * (Even though the technical memorandum forbids it) + * 2004-07-14 Christoph Lameter + * Added getnstimeofday to allow the posix timer functions to return + * with nanosecond accuracy + */ + +#include <linux/export.h> +#include <linux/timex.h> +#include <linux/capability.h> +#include <linux/timekeeper_internal.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <linux/security.h> +#include <linux/fs.h> +#include <linux/math64.h> +#include <linux/ptrace.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> + +#include "timeconst.h" +#include "timekeeping.h" + +/* + * The timezone where the local system is located. Used as a default by some + * programs who obtain this value by using gettimeofday. + */ +struct timezone sys_tz; + +EXPORT_SYMBOL(sys_tz); + +#ifdef __ARCH_WANT_SYS_TIME + +/* + * sys_time() can be implemented in user-level using + * sys_gettimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ +SYSCALL_DEFINE1(time, time_t __user *, tloc) +{ + time_t i = get_seconds(); + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +/* + * sys_stime() can be implemented in user-level using + * sys_settimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ + +SYSCALL_DEFINE1(stime, time_t __user *, tptr) +{ + struct timespec tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime(&tv, NULL); + if (err) + return err; + + do_settimeofday(&tv); + return 0; +} + +#endif /* __ARCH_WANT_SYS_TIME */ + +SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + if (likely(tv != NULL)) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (copy_to_user(tv, &ktv, sizeof(ktv))) + return -EFAULT; + } + if (unlikely(tz != NULL)) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + return 0; +} + +/* + * Indicates if there is an offset between the system clock and the hardware + * clock/persistent clock/rtc. + */ +int persistent_clock_is_local; + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... + * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. + */ +static inline void warp_clock(void) +{ + if (sys_tz.tz_minuteswest != 0) { + struct timespec adjust; + + persistent_clock_is_local = 1; + adjust.tv_sec = sys_tz.tz_minuteswest * 60; + adjust.tv_nsec = 0; + timekeeping_inject_offset(&adjust); + } +} + +/* + * In case for some reason the CMOS clock has not already been running + * in UTC, but in some local time: The first time we set the timezone, + * we will warp the clock so that it is ticking UTC time instead of + * local time. Presumably, if someone is setting the timezone then we + * are running in an environment where the programs understand about + * timezones. This should be done at boot time in the /etc/rc script, + * as soon as possible, so that the clock can be set right. Otherwise, + * various programs will get confused when the clock gets warped. + */ + +int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) +{ + static int firsttime = 1; + int error = 0; + + if (tv && !timespec_valid(tv)) + return -EINVAL; + + error = security_settime(tv, tz); + if (error) + return error; + + if (tz) { + sys_tz = *tz; + update_vsyscall_tz(); + if (firsttime) { + firsttime = 0; + if (!tv) + warp_clock(); + } + } + if (tv) + return do_settimeofday(tv); + return 0; +} + +SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + struct timeval user_tv; + struct timespec new_ts; + struct timezone new_tz; + + if (tv) { + if (copy_from_user(&user_tv, tv, sizeof(*tv))) + return -EFAULT; + + if (!timeval_valid(&user_tv)) + return -EINVAL; + + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} + +SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) +{ + struct timex txc; /* Local copy of parameter */ + int ret; + + /* Copy the user data space into the kernel copy + * structure. But bear in mind that the structures + * may change + */ + if(copy_from_user(&txc, txc_p, sizeof(struct timex))) + return -EFAULT; + ret = do_adjtimex(&txc); + return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; +} + +/** + * current_fs_time - Return FS time + * @sb: Superblock. + * + * Return the current time truncated to the time granularity supported by + * the fs. + */ +struct timespec current_fs_time(struct super_block *sb) +{ + struct timespec now = current_kernel_time(); + return timespec_trunc(now, sb->s_time_gran); +} +EXPORT_SYMBOL(current_fs_time); + +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; +# else + return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; +# else + return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + +/** + * timespec_trunc - Truncate timespec to a granularity + * @t: Timespec + * @gran: Granularity in ns. + * + * Truncate a timespec to a granularity. gran must be smaller than a second. + * Always rounds down. + * + * This function should be only used for timestamps returned by + * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because + * it doesn't handle the better resolution of the latter. + */ +struct timespec timespec_trunc(struct timespec t, unsigned gran) +{ + /* + * Division is pretty slow so avoid it for common cases. + * Currently current_kernel_time() never returns better than + * jiffies resolution. Exploit that. + */ + if (gran <= jiffies_to_usecs(1) * 1000) { + /* nothing */ + } else if (gran == 1000000000) { + t.tv_nsec = 0; + } else { + t.tv_nsec -= t.tv_nsec % gran; + } + return t; +} +EXPORT_SYMBOL(timespec_trunc); + +/* + * mktime64 - Converts date to seconds. + * Converts Gregorian date to seconds since 1970-01-01 00:00:00. + * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 + * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. + * + * [For the Julian calendar (which was used in Russia before 1917, + * Britain & colonies before 1752, anywhere else before 1582, + * and is still in use by some communities) leave out the + * -year/100+year/400 terms, and add 10.] + * + * This algorithm was first published by Gauss (I think). + */ +time64_t mktime64(const unsigned int year0, const unsigned int mon0, + const unsigned int day, const unsigned int hour, + const unsigned int min, const unsigned int sec) +{ + unsigned int mon = mon0, year = year0; + + /* 1..12 -> 11,12,1..10 */ + if (0 >= (int) (mon -= 2)) { + mon += 12; /* Puts Feb last since it has leap day */ + year -= 1; + } + + return ((((time64_t) + (year/4 - year/100 + year/400 + 367*mon/12 + day) + + year*365 - 719499 + )*24 + hour /* now have hours */ + )*60 + min /* now have minutes */ + )*60 + sec; /* finally seconds */ +} +EXPORT_SYMBOL(mktime64); + +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec); + +/** + * ns_to_timespec - Convert nanoseconds to timespec + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec representation of the nsec parameter. + */ +struct timespec ns_to_timespec(const s64 nsec) +{ + struct timespec ts; + s32 rem; + + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec); + +/** + * ns_to_timeval - Convert nanoseconds to timeval + * @nsec: the nanoseconds value to be converted + * + * Returns the timeval representation of the nsec parameter. + */ +struct timeval ns_to_timeval(const s64 nsec) +{ + struct timespec ts = ns_to_timespec(nsec); + struct timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_timeval); + +#if BITS_PER_LONG == 32 +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec64); + +/** + * ns_to_timespec64 - Convert nanoseconds to timespec64 + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec64 representation of the nsec parameter. + */ +struct timespec64 ns_to_timespec64(const s64 nsec) +{ + struct timespec64 ts; + s32 rem; + + if (!nsec) + return (struct timespec64) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec64); +#endif +/* + * When we convert to jiffies then we interpret incoming values + * the following way: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor + * + * We must also be careful about 32-bit overflows. + */ +unsigned long msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + /* + * HZ is equal to or smaller than 1000, and 1000 is a nice + * round multiple of HZ, divide with the factor between them, + * but round upwards: + */ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + /* + * HZ is larger than 1000, and HZ is a nice round multiple of + * 1000 - simply multiply with the factor between them. + * + * But first make sure the multiplication result cannot + * overflow: + */ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return m * (HZ / MSEC_PER_SEC); +#else + /* + * Generic case - multiply, round and divide. But first + * check that if we are doing a net multiplication, that + * we wouldn't overflow: + */ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) + >> MSEC_TO_HZ_SHR32; +#endif +} +EXPORT_SYMBOL(msecs_to_jiffies); + +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return u * (HZ / USEC_PER_SEC); +#else + return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) + >> USEC_TO_HZ_SHR32; +#endif +} +EXPORT_SYMBOL(usecs_to_jiffies); + +/* + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * Note that due to the small error in the multiplier here, this + * rounding is incorrect for sufficiently large values of tv_nsec, but + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're + * OK. + * + * Rather, we just shift the bits off the right. + * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. + */ +static unsigned long +__timespec_to_jiffies(unsigned long sec, long nsec) +{ + nsec = nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} + +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); +} + +EXPORT_SYMBOL(timespec_to_jiffies); + +void +jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u32 rem; + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_nsec = rem; +} +EXPORT_SYMBOL(jiffies_to_timespec); + +/* + * We could use a similar algorithm to timespec_to_jiffies (with a + * different multiplier for usec instead of nsec). But this has a + * problem with rounding: we can't exactly add TICK_NSEC - 1 to the + * usec value, since it's not necessarily integral. + * + * We could instead round in the intermediate scaled representation + * (i.e. in units of 1/2^(large scale) jiffies) but that's also + * perilous: the scaling introduces a small positive error, which + * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 + * units to the intermediate before shifting) leads to accidental + * overflow and overestimates. + * + * At the cost of one additional multiplication by a constant, just + * use the timespec implementation. + */ +unsigned long +timeval_to_jiffies(const struct timeval *value) +{ + return __timespec_to_jiffies(value->tv_sec, + value->tv_usec * NSEC_PER_USEC); +} +EXPORT_SYMBOL(timeval_to_jiffies); + +void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u32 rem; + + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_usec = rem / NSEC_PER_USEC; +} +EXPORT_SYMBOL(jiffies_to_timeval); + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ +clock_t jiffies_to_clock_t(unsigned long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + return x * (USER_HZ / HZ); +# else + return x / (HZ / USER_HZ); +# endif +#else + return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); +#endif +} +EXPORT_SYMBOL(jiffies_to_clock_t); + +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + return div_u64((u64)x * HZ, USER_HZ); +#endif +} +EXPORT_SYMBOL(clock_t_to_jiffies); + +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + x = div_u64(x * USER_HZ, HZ); +# elif HZ > USER_HZ + x = div_u64(x, HZ / USER_HZ); +# else + /* Nothing to do */ +# endif +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} +EXPORT_SYMBOL(jiffies_64_to_clock_t); + +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + return div_u64(x, NSEC_PER_SEC / USER_HZ); +#elif (USER_HZ % 512) == 0 + return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... + */ + return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); +#endif +} + +/** + * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +u64 nsecs_to_jiffies64(u64 n) +{ +#if (NSEC_PER_SEC % HZ) == 0 + /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ + return div_u64(n, NSEC_PER_SEC / HZ); +#elif (HZ % 512) == 0 + /* overflow after 292 years if HZ = 1024 */ + return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * Generic case - optimized for cases where HZ is a multiple of 3. + * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. + */ + return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); +#endif +} +EXPORT_SYMBOL(nsecs_to_jiffies64); + +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) +{ + return (unsigned long)nsecs_to_jiffies64(n); +} +EXPORT_SYMBOL_GPL(nsecs_to_jiffies); + +/* + * Add two timespec values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0) + */ +struct timespec timespec_add_safe(const struct timespec lhs, + const struct timespec rhs) +{ + struct timespec res; + + set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) + res.tv_sec = TIME_T_MAX; + + return res; +} diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc new file mode 100644 index 000000000..511bdf2ca --- /dev/null +++ b/kernel/time/timeconst.bc @@ -0,0 +1,108 @@ +scale=0 + +define gcd(a,b) { + auto t; + while (b) { + t = b; + b = a % b; + a = t; + } + return a; +} + +/* Division by reciprocal multiplication. */ +define fmul(b,n,d) { + return (2^b*n+d-1)/d; +} + +/* Adjustment factor when a ceiling value is used. Use as: + (imul * n) + (fmulxx * n + fadjxx) >> xx) */ +define fadj(b,n,d) { + auto v; + d = d/gcd(n,d); + v = 2^b*(d-1)/d; + return v; +} + +/* Compute the appropriate mul/adj values as well as a shift count, + which brings the mul value into the range 2^b-1 <= x < 2^b. Such + a shift value will be correct in the signed integer range and off + by at most one in the upper half of the unsigned range. */ +define fmuls(b,n,d) { + auto s, m; + for (s = 0; 1; s++) { + m = fmul(s,n,d); + if (m >= 2^(b-1)) + return s; + } + return 0; +} + +define timeconst(hz) { + print "/* Automatically generated by kernel/timeconst.bc */\n" + print "/* Time conversion constants for HZ == ", hz, " */\n" + print "\n" + + print "#ifndef KERNEL_TIMECONST_H\n" + print "#define KERNEL_TIMECONST_H\n\n" + + print "#include <linux/param.h>\n" + print "#include <linux/types.h>\n\n" + + print "#if HZ != ", hz, "\n" + print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" + print "#endif\n\n" + + if (hz < 2) { + print "#error Totally bogus HZ value!\n" + } else { + s=fmuls(32,1000,hz) + obase=16 + print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n" + print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n" + obase=10 + print "#define HZ_TO_MSEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000) + obase=16 + print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n" + print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n" + obase=10 + print "#define MSEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000) + print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n" + print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n" + print "\n" + + s=fmuls(32,1000000,hz) + obase=16 + print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n" + print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n" + obase=10 + print "#define HZ_TO_USEC_SHR32\t", s, "\n" + + s=fmuls(32,hz,1000000) + obase=16 + print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n" + print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n" + obase=10 + print "#define USEC_TO_HZ_SHR32\t", s, "\n" + + obase=10 + cd=gcd(hz,1000000) + print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n" + print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" + print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" + print "\n" + + print "#endif /* KERNEL_TIMECONST_H */\n" + } + halt +} + +timeconst(hz) diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c new file mode 100644 index 000000000..86628e755 --- /dev/null +++ b/kernel/time/timeconv.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + * This file is part of the GNU C Library. + * Contributed by Paul Eggert (eggert@twinsun.com). + * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU C Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Converts the calendar time to broken-down time representation + * Based on code from glibc-2.6 + * + * 2009-7-14: + * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> + */ + +#include <linux/time.h> +#include <linux/module.h> + +/* + * Nonzero if YEAR is a leap year (every 4 years, + * except every 100th isn't, and every 400th is). + */ +static int __isleap(long year) +{ + return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); +} + +/* do a mathdiv for long type */ +static long math_div(long a, long b) +{ + return a / b - (a % b < 0); +} + +/* How many leap years between y1 and y2, y1 must less or equal to y2 */ +static long leaps_between(long y1, long y2) +{ + long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) + + math_div(y1 - 1, 400); + long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) + + math_div(y2 - 1, 400); + return leaps2 - leaps1; +} + +/* How many days come before each month (0-12). */ +static const unsigned short __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. */ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + long days, rem, y; + const unsigned short *ip; + + days = totalsecs / SECS_PER_DAY; + rem = totalsecs % SECS_PER_DAY; + rem += offset; + while (rem < 0) { + rem += SECS_PER_DAY; + --days; + } + while (rem >= SECS_PER_DAY) { + rem -= SECS_PER_DAY; + ++days; + } + + result->tm_hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + result->tm_min = rem / 60; + result->tm_sec = rem % 60; + + /* January 1, 1970 was a Thursday. */ + result->tm_wday = (4 + days) % 7; + if (result->tm_wday < 0) + result->tm_wday += 7; + + y = 1970; + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + /* Guess a corrected year, assuming 365 days per year. */ + long yg = y + math_div(days, 365); + + /* Adjust DAYS and Y to match the guessed year. */ + days -= (yg - y) * 365 + leaps_between(y, yg); + y = yg; + } + + result->tm_year = y - 1900; + + result->tm_yday = days; + + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < ip[y]; y--) + continue; + days -= ip[y]; + + result->tm_mon = y; + result->tm_mday = days + 1; +} +EXPORT_SYMBOL(time_to_tm); diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c new file mode 100644 index 000000000..4687b3104 --- /dev/null +++ b/kernel/time/timecounter.c @@ -0,0 +1,112 @@ +/* + * linux/kernel/time/timecounter.c + * + * based on code that migrated away from + * linux/kernel/time/clocksource.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/export.h> +#include <linux/timecounter.h> + +void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp) +{ + tc->cc = cc; + tc->cycle_last = cc->read(cc); + tc->nsec = start_tstamp; + tc->mask = (1ULL << cc->shift) - 1; + tc->frac = 0; +} +EXPORT_SYMBOL_GPL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc: Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. + */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ + cycle_t cycle_now, cycle_delta; + u64 ns_offset; + + /* read cycle counter: */ + cycle_now = tc->cc->read(tc->cc); + + /* calculate the delta since the last timecounter_read_delta(): */ + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + + /* convert to nanoseconds: */ + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta, + tc->mask, &tc->frac); + + /* update time stamp of timecounter_read_delta() call: */ + tc->cycle_last = cycle_now; + + return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ + u64 nsec; + + /* increment time by nanoseconds since last call */ + nsec = timecounter_read_delta(tc); + nsec += tc->nsec; + tc->nsec = nsec; + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_read); + +/* + * This is like cyclecounter_cyc2ns(), but it is used for computing a + * time previous to the time stored in the cycle counter. + */ +static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, + cycle_t cycles, u64 mask, u64 frac) +{ + u64 ns = (u64) cycles; + + ns = ((ns * cc->mult) - frac) >> cc->shift; + + return ns; +} + +u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp) +{ + u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec = tc->nsec, frac = tc->frac; + + /* + * Instead of always treating cycle_tstamp as more recent + * than tc->cycle_last, detect when it is too far in the + * future and treat it as old time stamp instead. + */ + if (delta > tc->cc->mask / 2) { + delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); + } else { + nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); + } + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_cyc2time); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c new file mode 100644 index 000000000..946acb721 --- /dev/null +++ b/kernel/time/timekeeping.c @@ -0,0 +1,2072 @@ +/* + * linux/kernel/time/timekeeping.c + * + * Kernel timekeeping code and accessor functions + * + * This code was moved from linux/kernel/timer.c. + * Please see that file for copyright and history logs. + * + */ + +#include <linux/timekeeper_internal.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/syscore_ops.h> +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/tick.h> +#include <linux/stop_machine.h> +#include <linux/pvclock_gtod.h> +#include <linux/compiler.h> + +#include "tick-internal.h" +#include "ntp_internal.h" +#include "timekeeping_internal.h" + +#define TK_CLEAR_NTP (1 << 0) +#define TK_MIRROR (1 << 1) +#define TK_CLOCK_WAS_SET (1 << 2) + +/* + * The most important data for readout fits into a single 64 byte + * cache line. + */ +static struct { + seqcount_t seq; + struct timekeeper timekeeper; +} tk_core ____cacheline_aligned; + +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static struct timekeeper shadow_timekeeper; + +/** + * struct tk_fast - NMI safe timekeeper + * @seq: Sequence counter for protecting updates. The lowest bit + * is the index for the tk_read_base array + * @base: tk_read_base array. Access is indexed by the lowest bit of + * @seq. + * + * See @update_fast_timekeeper() below. + */ +struct tk_fast { + seqcount_t seq; + struct tk_read_base base[2]; +}; + +static struct tk_fast tk_fast_mono ____cacheline_aligned; +static struct tk_fast tk_fast_raw ____cacheline_aligned; + +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + +static inline void tk_normalize_xtime(struct timekeeper *tk) +{ + while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { + tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; + tk->xtime_sec++; + } +} + +static inline struct timespec64 tk_xtime(struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + return ts; +} + +static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) +{ + tk->xtime_sec = ts->tv_sec; + tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; +} + +static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) +{ + tk->xtime_sec += ts->tv_sec; + tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_normalize_xtime(tk); +} + +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) +{ + struct timespec64 tmp; + + /* + * Verify consistency of: offset_real = -wall_to_monotonic + * before modifying anything + */ + set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, + -tk->wall_to_monotonic.tv_nsec); + WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64); + tk->wall_to_monotonic = wtm; + set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); + tk->offs_real = timespec64_to_ktime(tmp); + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); +} + +static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) +{ + tk->offs_boot = ktime_add(tk->offs_boot, delta); +} + +#ifdef CONFIG_DEBUG_TIMEKEEPING +#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ +/* + * These simple flag variables are managed + * without locks, which is racy, but ok since + * we don't really care about being super + * precise about how many events were seen, + * just that a problem was observed. + */ +static int timekeeping_underflow_seen; +static int timekeeping_overflow_seen; + +/* last_warning is only modified under the timekeeping lock */ +static long timekeeping_last_warning; + +static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +{ + + cycle_t max_cycles = tk->tkr_mono.clock->max_cycles; + const char *name = tk->tkr_mono.clock->name; + + if (offset > max_cycles) { + printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", + offset, name, max_cycles); + printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); + } else { + if (offset > (max_cycles >> 1)) { + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", + offset, name, max_cycles >> 1); + printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); + } + } + + if (timekeeping_underflow_seen) { + if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + timekeeping_last_warning = jiffies; + } + timekeeping_underflow_seen = 0; + } + + if (timekeeping_overflow_seen) { + if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + timekeeping_last_warning = jiffies; + } + timekeeping_overflow_seen = 0; + } +} + +static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) +{ + cycle_t now, last, mask, max, delta; + unsigned int seq; + + /* + * Since we're called holding a seqlock, the data may shift + * under us while we're doing the calculation. This can cause + * false positives, since we'd note a problem but throw the + * results away. So nest another seqlock here to atomically + * grab the points we are checking with. + */ + do { + seq = read_seqcount_begin(&tk_core.seq); + now = tkr->read(tkr->clock); + last = tkr->cycle_last; + mask = tkr->mask; + max = tkr->clock->max_cycles; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + delta = clocksource_delta(now, last, mask); + + /* + * Try to catch underflows by checking if we are seeing small + * mask-relative negative values. + */ + if (unlikely((~delta & mask) < (mask >> 3))) { + timekeeping_underflow_seen = 1; + delta = 0; + } + + /* Cap delta value to the max_cycles values to avoid mult overflows */ + if (unlikely(delta > max)) { + timekeeping_overflow_seen = 1; + delta = tkr->clock->max_cycles; + } + + return delta; +} +#else +static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +{ +} +static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) +{ + cycle_t cycle_now, delta; + + /* read clocksource */ + cycle_now = tkr->read(tkr->clock); + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + + return delta; +} +#endif + +/** + * tk_setup_internals - Set up internals to use clocksource clock. + * + * @tk: The target timekeeper to setup. + * @clock: Pointer to clocksource. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) +{ + cycle_t interval; + u64 tmp, ntpinterval; + struct clocksource *old_clock; + + old_clock = tk->tkr_mono.clock; + tk->tkr_mono.clock = clock; + tk->tkr_mono.read = clock->read; + tk->tkr_mono.mask = clock->mask; + tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); + + tk->tkr_raw.clock = clock; + tk->tkr_raw.read = clock->read; + tk->tkr_raw.mask = clock->mask; + tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; + + /* Do the ns -> cycle conversion first, using original mult */ + tmp = NTP_INTERVAL_LENGTH; + tmp <<= clock->shift; + ntpinterval = tmp; + tmp += clock->mult/2; + do_div(tmp, clock->mult); + if (tmp == 0) + tmp = 1; + + interval = (cycle_t) tmp; + tk->cycle_interval = interval; + + /* Go back from cycles -> shifted ns */ + tk->xtime_interval = (u64) interval * clock->mult; + tk->xtime_remainder = ntpinterval - tk->xtime_interval; + tk->raw_interval = + ((u64) interval * clock->mult) >> clock->shift; + + /* if changing clocks, convert xtime_nsec shift units */ + if (old_clock) { + int shift_change = clock->shift - old_clock->shift; + if (shift_change < 0) + tk->tkr_mono.xtime_nsec >>= -shift_change; + else + tk->tkr_mono.xtime_nsec <<= shift_change; + } + tk->tkr_raw.xtime_nsec = 0; + + tk->tkr_mono.shift = clock->shift; + tk->tkr_raw.shift = clock->shift; + + tk->ntp_error = 0; + tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + tk->ntp_tick = ntpinterval << tk->ntp_error_shift; + + /* + * The timekeeper keeps its own mult values for the currently + * active clocksource. These value will be adjusted via NTP + * to counteract clock drifting. + */ + tk->tkr_mono.mult = clock->mult; + tk->tkr_raw.mult = clock->mult; + tk->ntp_err_mult = 0; +} + +/* Timekeeper helper functions. */ + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +static u32 default_arch_gettimeoffset(void) { return 0; } +u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; +#else +static inline u32 arch_gettimeoffset(void) { return 0; } +#endif + +static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) +{ + cycle_t delta; + s64 nsec; + + delta = timekeeping_get_delta(tkr); + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + arch_gettimeoffset(); +} + +/** + * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. + * @tkr: Timekeeping readout base from which we take the update + * + * We want to use this from any context including NMI and tracing / + * instrumenting the timekeeping code itself. + * + * So we handle this differently than the other timekeeping accessor + * functions which retry when the sequence count has changed. The + * update side does: + * + * smp_wmb(); <- Ensure that the last base[1] update is visible + * tkf->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * update(tkf->base[0], tkr); + * smp_wmb(); <- Ensure that the base[0] update is visible + * tkf->seq++; + * smp_wmb(); <- Ensure that the seqcount update is visible + * update(tkf->base[1], tkr); + * + * The reader side does: + * + * do { + * seq = tkf->seq; + * smp_rmb(); + * idx = seq & 0x01; + * now = now(tkf->base[idx]); + * smp_rmb(); + * } while (seq != tkf->seq) + * + * As long as we update base[0] readers are forced off to + * base[1]. Once base[0] is updated readers are redirected to base[0] + * and the base[1] update takes place. + * + * So if a NMI hits the update of base[0] then it will use base[1] + * which is still consistent. In the worst case this can result is a + * slightly wrong timestamp (a few nanoseconds). See + * @ktime_get_mono_fast_ns. + */ +static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf) +{ + struct tk_read_base *base = tkf->base; + + /* Force readers off to base[1] */ + raw_write_seqcount_latch(&tkf->seq); + + /* Update base[0] */ + memcpy(base, tkr, sizeof(*base)); + + /* Force readers back to base[0] */ + raw_write_seqcount_latch(&tkf->seq); + + /* Update base[1] */ + memcpy(base + 1, base, sizeof(*base)); +} + +/** + * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic + * + * This timestamp is not guaranteed to be monotonic across an update. + * The timestamp is calculated by: + * + * now = base_mono + clock_delta * slope + * + * So if the update lowers the slope, readers who are forced to the + * not yet updated second array are still using the old steeper slope. + * + * tmono + * ^ + * | o n + * | o n + * | u + * | o + * |o + * |12345678---> reader order + * + * o = old slope + * u = update + * n = new slope + * + * So reader 6 will observe time going backwards versus reader 5. + * + * While other CPUs are likely to be able observe that, the only way + * for a CPU local observation is when an NMI hits in the middle of + * the update. Timestamps taken from that NMI context might be ahead + * of the following timestamps. Callers need to be aware of that and + * deal with it. + */ +static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); + } while (read_seqcount_retry(&tkf->seq, seq)); + + return now; +} + +u64 ktime_get_mono_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_mono); +} +EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); + +u64 ktime_get_raw_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); + +/* Suspend-time cycles value for halted fast timekeeper. */ +static cycle_t cycles_at_suspend; + +static cycle_t dummy_clock_read(struct clocksource *cs) +{ + return cycles_at_suspend; +} + +/** + * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. + * @tk: Timekeeper to snapshot. + * + * It generally is unsafe to access the clocksource after timekeeping has been + * suspended, so take a snapshot of the readout base of @tk and use it as the + * fast timekeeper's readout base while suspended. It will return the same + * number of cycles every time until timekeeping is resumed at which time the + * proper readout base for the fast timekeeper will be restored automatically. + */ +static void halt_fast_timekeeper(struct timekeeper *tk) +{ + static struct tk_read_base tkr_dummy; + struct tk_read_base *tkr = &tk->tkr_mono; + + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + cycles_at_suspend = tkr->read(tkr->clock); + tkr_dummy.read = dummy_clock_read; + update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); + + tkr = &tk->tkr_raw; + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + tkr_dummy.read = dummy_clock_read; + update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); +} + +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD + +static inline void update_vsyscall(struct timekeeper *tk) +{ + struct timespec xt, wm; + + xt = timespec64_to_timespec(tk_xtime(tk)); + wm = timespec64_to_timespec(tk->wall_to_monotonic); + update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, + tk->tkr_mono.cycle_last); +} + +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ + s64 remainder; + + /* + * Store only full nanoseconds into xtime_nsec after rounding + * it up and add the remainder to the error difference. + * XXX - This is necessary to avoid small 1ns inconsistnecies caused + * by truncating the remainder in vsyscalls. However, it causes + * additional work to be done in timekeeping_adjust(). Once + * the vsyscall implementations are converted to use xtime_nsec + * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD + * users are removed, this can be killed. + */ + remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); + tk->tkr_mono.xtime_nsec -= remainder; + tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; + tk->ntp_error += remainder << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; +} +#else +#define old_vsyscall_fixup(tk) +#endif + +static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); + +static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) +{ + raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); +} + +/** + * pvclock_gtod_register_notifier - register a pvclock timedata update listener + */ +int pvclock_gtod_register_notifier(struct notifier_block *nb) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); + update_pvclock_gtod(tk, true); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); + +/** + * pvclock_gtod_unregister_notifier - unregister a pvclock + * timedata update listener + */ +int pvclock_gtod_unregister_notifier(struct notifier_block *nb) +{ + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); + +/* + * Update the ktime_t based scalar nsec members of the timekeeper + */ +static inline void tk_update_ktime_data(struct timekeeper *tk) +{ + u64 seconds; + u32 nsec; + + /* + * The xtime based monotonic readout is: + * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); + * The ktime based monotonic readout is: + * nsec = base_mono + now(); + * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + */ + seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); + nsec = (u32) tk->wall_to_monotonic.tv_nsec; + tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); + + /* Update the monotonic raw base */ + tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); + + /* + * The sum of the nanoseconds portions of xtime and + * wall_to_monotonic can be greater/equal one second. Take + * this into account before updating tk->ktime_sec. + */ + nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + if (nsec >= NSEC_PER_SEC) + seconds++; + tk->ktime_sec = seconds; +} + +/* must hold timekeeper_lock */ +static void timekeeping_update(struct timekeeper *tk, unsigned int action) +{ + if (action & TK_CLEAR_NTP) { + tk->ntp_error = 0; + ntp_clear(); + } + + tk_update_ktime_data(tk); + + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + + if (action & TK_MIRROR) + memcpy(&shadow_timekeeper, &tk_core.timekeeper, + sizeof(tk_core.timekeeper)); + + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); +} + +/** + * timekeeping_forward_now - update clock to the current time + * + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. + */ +static void timekeeping_forward_now(struct timekeeper *tk) +{ + struct clocksource *clock = tk->tkr_mono.clock; + cycle_t cycle_now, delta; + s64 nsec; + + cycle_now = tk->tkr_mono.read(clock); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; + + tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; + + /* If arch requires, add in get_arch_timeoffset() */ + tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; + + tk_normalize_xtime(tk); + + nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); + timespec64_add_ns(&tk->raw_time, nsec); +} + +/** + * __getnstimeofday64 - Returns the time of day in a timespec64. + * @ts: pointer to the timespec to be set + * + * Updates the time of day in the timespec. + * Returns 0 on success, or -ve when suspended (timespec will be undefined). + */ +int __getnstimeofday64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + s64 nsecs = 0; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ts->tv_sec = tk->xtime_sec; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); + + /* + * Do not bail out early, in case there were callers still using + * the value, even in the face of the WARN_ON. + */ + if (unlikely(timekeeping_suspended)) + return -EAGAIN; + return 0; +} +EXPORT_SYMBOL(__getnstimeofday64); + +/** + * getnstimeofday64 - Returns the time of day in a timespec64. + * @ts: pointer to the timespec64 to be set + * + * Returns the time of day in a timespec64 (WARN if suspended). + */ +void getnstimeofday64(struct timespec64 *ts) +{ + WARN_ON(__getnstimeofday64(ts)); +} +EXPORT_SYMBOL(getnstimeofday64); + +ktime_t ktime_get(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get); + +static ktime_t *offsets[TK_OFFS_MAX] = { + [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, + [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, + [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, +}; + +ktime_t ktime_get_with_offset(enum tk_offsets offs) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base, *offset = offsets[offs]; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = timekeeping_get_ns(&tk->tkr_mono); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); + +} +EXPORT_SYMBOL_GPL(ktime_get_with_offset); + +/** + * ktime_mono_to_any() - convert mononotic time to any other time + * @tmono: time to convert. + * @offs: which offset to use + */ +ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) +{ + ktime_t *offset = offsets[offs]; + unsigned long seq; + ktime_t tconv; + + do { + seq = read_seqcount_begin(&tk_core.seq); + tconv = ktime_add(tmono, *offset); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return tconv; +} +EXPORT_SYMBOL_GPL(ktime_mono_to_any); + +/** + * ktime_get_raw - Returns the raw monotonic time in ktime_t format + */ +ktime_t ktime_get_raw(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + s64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + base = tk->tkr_raw.base; + nsecs = timekeeping_get_ns(&tk->tkr_raw); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_raw); + +/** + * ktime_get_ts64 - get the monotonic clock in timespec64 format + * @ts: pointer to timespec variable + * + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec64 format in the variable pointed to by @ts. + */ +void ktime_get_ts64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 tomono; + s64 nsec; + unsigned int seq; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->xtime_sec; + nsec = timekeeping_get_ns(&tk->tkr_mono); + tomono = tk->wall_to_monotonic; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsec + tomono.tv_nsec); +} +EXPORT_SYMBOL_GPL(ktime_get_ts64); + +/** + * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC + * + * Returns the seconds portion of CLOCK_MONOTONIC with a single non + * serialized read. tk->ktime_sec is of type 'unsigned long' so this + * works on both 32 and 64 bit systems. On 32 bit systems the readout + * covers ~136 years of uptime which should be enough to prevent + * premature wrap arounds. + */ +time64_t ktime_get_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + WARN_ON(timekeeping_suspended); + return tk->ktime_sec; +} +EXPORT_SYMBOL_GPL(ktime_get_seconds); + +/** + * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME + * + * Returns the wall clock seconds since 1970. This replaces the + * get_seconds() interface which is not y2038 safe on 32bit systems. + * + * For 64bit systems the fast access to tk->xtime_sec is preserved. On + * 32bit systems the access must be protected with the sequence + * counter to provide "atomic" access to the 64bit tk->xtime_sec + * value. + */ +time64_t ktime_get_real_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + time64_t seconds; + unsigned int seq; + + if (IS_ENABLED(CONFIG_64BIT)) + return tk->xtime_sec; + + do { + seq = read_seqcount_begin(&tk_core.seq); + seconds = tk->xtime_sec; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return seconds; +} +EXPORT_SYMBOL_GPL(ktime_get_real_seconds); + +#ifdef CONFIG_NTP_PPS + +/** + * getnstime_raw_and_real - get day and raw monotonic time in timespec format + * @ts_raw: pointer to the timespec to be set to raw monotonic time + * @ts_real: pointer to the timespec to be set to the time of day + * + * This function reads both the time of day and raw monotonic time at the + * same time atomically and stores the resulting timestamps in timespec + * format. + */ +void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + s64 nsecs_raw, nsecs_real; + + WARN_ON_ONCE(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + + *ts_raw = timespec64_to_timespec(tk->raw_time); + ts_real->tv_sec = tk->xtime_sec; + ts_real->tv_nsec = 0; + + nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); + nsecs_real = timekeeping_get_ns(&tk->tkr_mono); + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + timespec_add_ns(ts_raw, nsecs_raw); + timespec_add_ns(ts_real, nsecs_real); +} +EXPORT_SYMBOL(getnstime_raw_and_real); + +#endif /* CONFIG_NTP_PPS */ + +/** + * do_gettimeofday - Returns the time of day in a timeval + * @tv: pointer to the timeval to be set + * + * NOTE: Users should be converted to using getnstimeofday() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec64 now; + + getnstimeofday64(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} +EXPORT_SYMBOL(do_gettimeofday); + +/** + * do_settimeofday64 - Sets the time of day. + * @ts: pointer to the timespec64 variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday64(const struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 ts_delta, xt; + unsigned long flags; + + if (!timespec64_valid_strict(ts)) + return -EINVAL; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + timekeeping_forward_now(tk); + + xt = tk_xtime(tk); + ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; + ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; + + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); + + tk_set_xtime(tk, ts); + + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} +EXPORT_SYMBOL(do_settimeofday64); + +/** + * timekeeping_inject_offset - Adds or subtracts from the current time. + * @tv: pointer to the timespec variable containing the offset + * + * Adds or subtracts an offset value from the current time. + */ +int timekeeping_inject_offset(struct timespec *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + struct timespec64 ts64, tmp; + int ret = 0; + + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + ts64 = timespec_to_timespec64(*ts); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + timekeeping_forward_now(tk); + + /* Make sure the proposed value is valid */ + tmp = timespec64_add(tk_xtime(tk), ts64); + if (!timespec64_valid_strict(&tmp)) { + ret = -EINVAL; + goto error; + } + + tk_xtime_add(tk, &ts64); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64)); + +error: /* even if we error out, we forwarded the time, so call update */ + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return ret; +} +EXPORT_SYMBOL(timekeeping_inject_offset); + + +/** + * timekeeping_get_tai_offset - Returns current TAI offset from UTC + * + */ +s32 timekeeping_get_tai_offset(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + s32 ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + ret = tk->tai_offset; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ret; +} + +/** + * __timekeeping_set_tai_offset - Lock free worker function + * + */ +static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +{ + tk->tai_offset = tai_offset; + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); +} + +/** + * timekeeping_set_tai_offset - Sets the current TAI offset from UTC + * + */ +void timekeeping_set_tai_offset(s32 tai_offset) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + __timekeeping_set_tai_offset(tk, tai_offset); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + clock_was_set(); +} + +/** + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource + */ +static int change_clocksource(void *data) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *new, *old; + unsigned long flags; + + new = (struct clocksource *) data; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + timekeeping_forward_now(tk); + /* + * If the cs is in module, get a module reference. Succeeds + * for built-in code (owner == NULL) as well. + */ + if (try_module_get(new->owner)) { + if (!new->enable || new->enable(new) == 0) { + old = tk->tkr_mono.clock; + tk_setup_internals(tk, new); + if (old->disable) + old->disable(old); + module_put(old->owner); + } else { + module_put(new->owner); + } + } + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + return 0; +} + +/** + * timekeeping_notify - Install a new clock source + * @clock: pointer to the clock source + * + * This function is called from clocksource.c after a new, better clock + * source has been registered. The caller holds the clocksource_mutex. + */ +int timekeeping_notify(struct clocksource *clock) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + if (tk->tkr_mono.clock == clock) + return 0; + stop_machine(change_clocksource, clock, NULL); + tick_clock_notify(); + return tk->tkr_mono.clock == clock ? 0 : -1; +} + +/** + * getrawmonotonic64 - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec64 to be set + * + * Returns the raw monotonic time (completely un-modified by ntp) + */ +void getrawmonotonic64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 ts64; + unsigned long seq; + s64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + nsecs = timekeeping_get_ns(&tk->tkr_raw); + ts64 = tk->raw_time; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + timespec64_add_ns(&ts64, nsecs); + *ts = ts64; +} +EXPORT_SYMBOL(getrawmonotonic64); + + +/** + * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres + */ +int timekeeping_valid_for_hres(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + int ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ret; +} + +/** + * timekeeping_max_deferment - Returns max time the clocksource can be deferred + */ +u64 timekeeping_max_deferment(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + u64 ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + ret = tk->tkr_mono.clock->max_idle_ns; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ret; +} + +/** + * read_persistent_clock - Return time from the persistent clock. + * + * Weak dummy function for arches that do not yet support it. + * Reads the time from the battery backed persistent clock. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +void __weak read_persistent_clock(struct timespec *ts) +{ + ts->tv_sec = 0; + ts->tv_nsec = 0; +} + +void __weak read_persistent_clock64(struct timespec64 *ts64) +{ + struct timespec ts; + + read_persistent_clock(&ts); + *ts64 = timespec_to_timespec64(ts); +} + +/** + * read_boot_clock - Return time of the system start. + * + * Weak dummy function for arches that do not yet support it. + * Function to read the exact time the system has been started. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. + */ +void __weak read_boot_clock(struct timespec *ts) +{ + ts->tv_sec = 0; + ts->tv_nsec = 0; +} + +void __weak read_boot_clock64(struct timespec64 *ts64) +{ + struct timespec ts; + + read_boot_clock(&ts); + *ts64 = timespec_to_timespec64(ts); +} + +/* Flag for if timekeeping_resume() has injected sleeptime */ +static bool sleeptime_injected; + +/* Flag for if there is a persistent clock on this platform */ +static bool persistent_clock_exists; + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *clock; + unsigned long flags; + struct timespec64 now, boot, tmp; + + read_persistent_clock64(&now); + if (!timespec64_valid_strict(&now)) { + pr_warn("WARNING: Persistent clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + now.tv_sec = 0; + now.tv_nsec = 0; + } else if (now.tv_sec || now.tv_nsec) + persistent_clock_exists = true; + + read_boot_clock64(&boot); + if (!timespec64_valid_strict(&boot)) { + pr_warn("WARNING: Boot clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + boot.tv_sec = 0; + boot.tv_nsec = 0; + } + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + ntp_init(); + + clock = clocksource_default_clock(); + if (clock->enable) + clock->enable(clock); + tk_setup_internals(tk, clock); + + tk_set_xtime(tk, &now); + tk->raw_time.tv_sec = 0; + tk->raw_time.tv_nsec = 0; + if (boot.tv_sec == 0 && boot.tv_nsec == 0) + boot = tk_xtime(tk); + + set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); + tk_set_wall_to_mono(tk, tmp); + + timekeeping_update(tk, TK_MIRROR); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} + +/* time in seconds when suspend began for persistent clock */ +static struct timespec64 timekeeping_suspend_time; + +/** + * __timekeeping_inject_sleeptime - Internal function to add sleep interval + * @delta: pointer to a timespec delta value + * + * Takes a timespec offset measuring a suspend interval and properly + * adds the sleep offset to the timekeeping variables. + */ +static void __timekeeping_inject_sleeptime(struct timekeeper *tk, + struct timespec64 *delta) +{ + if (!timespec64_valid_strict(delta)) { + printk_deferred(KERN_WARNING + "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); + return; + } + tk_xtime_add(tk, delta); + tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); + tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); + tk_debug_account_sleep_time(delta); +} + +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) +/** + * We have three kinds of time sources to use for sleep time + * injection, the preference order is: + * 1) non-stop clocksource + * 2) persistent clock (ie: RTC accessible when irqs are off) + * 3) RTC + * + * 1) and 2) are used by timekeeping, 3) by RTC subsystem. + * If system has neither 1) nor 2), 3) will be used finally. + * + * + * If timekeeping has injected sleeptime via either 1) or 2), + * 3) becomes needless, so in this case we don't need to call + * rtc_resume(), and this is what timekeeping_rtc_skipresume() + * means. + */ +bool timekeeping_rtc_skipresume(void) +{ + return sleeptime_injected; +} + +/** + * 1) can be determined whether to use or not only when doing + * timekeeping_resume() which is invoked after rtc_suspend(), + * so we can't skip rtc_suspend() surely if system has 1). + * + * But if system has 2), 2) will definitely be used, so in this + * case we don't need to call rtc_suspend(), and this is what + * timekeeping_rtc_skipsuspend() means. + */ +bool timekeeping_rtc_skipsuspend(void) +{ + return persistent_clock_exists; +} + +/** + * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values + * @delta: pointer to a timespec64 delta value + * + * This hook is for architectures that cannot support read_persistent_clock64 + * because their RTC/persistent clock is only accessible when irqs are enabled. + * and also don't have an effective nonstop clocksource. + * + * This function should only be called by rtc_resume(), and allows + * a suspend offset to be injected into the timekeeping values. + */ +void timekeeping_inject_sleeptime64(struct timespec64 *delta) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + timekeeping_forward_now(tk); + + __timekeeping_inject_sleeptime(tk, delta); + + timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); +} +#endif + +/** + * timekeeping_resume - Resumes the generic timekeeping subsystem. + */ +void timekeeping_resume(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *clock = tk->tkr_mono.clock; + unsigned long flags; + struct timespec64 ts_new, ts_delta; + cycle_t cycle_now, cycle_delta; + + sleeptime_injected = false; + read_persistent_clock64(&ts_new); + + clockevents_resume(); + clocksource_resume(); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + /* + * After system resumes, we need to calculate the suspended time and + * compensate it for the OS time. There are 3 sources that could be + * used: Nonstop clocksource during suspend, persistent clock and rtc + * device. + * + * One specific platform may have 1 or 2 or all of them, and the + * preference will be: + * suspend-nonstop clocksource -> persistent clock -> rtc + * The less preferred source will only be tried if there is no better + * usable source. The rtc part is handled separately in rtc core code. + */ + cycle_now = tk->tkr_mono.read(clock); + if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && + cycle_now > tk->tkr_mono.cycle_last) { + u64 num, max = ULLONG_MAX; + u32 mult = clock->mult; + u32 shift = clock->shift; + s64 nsec = 0; + + cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, + tk->tkr_mono.mask); + + /* + * "cycle_delta * mutl" may cause 64 bits overflow, if the + * suspended time is too long. In that case we need do the + * 64 bits math carefully + */ + do_div(max, mult); + if (cycle_delta > max) { + num = div64_u64(cycle_delta, max); + nsec = (((u64) max * mult) >> shift) * num; + cycle_delta -= num * max; + } + nsec += ((u64) cycle_delta * mult) >> shift; + + ts_delta = ns_to_timespec64(nsec); + sleeptime_injected = true; + } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { + ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); + sleeptime_injected = true; + } + + if (sleeptime_injected) + __timekeeping_inject_sleeptime(tk, &ts_delta); + + /* Re-base the last cycle value */ + tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; + + tk->ntp_error = 0; + timekeeping_suspended = 0; + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + touch_softlockup_watchdog(); + + tick_resume(); + hrtimers_resume(); +} + +int timekeeping_suspend(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + struct timespec64 delta, delta_delta; + static struct timespec64 old_delta; + + read_persistent_clock64(&timekeeping_suspend_time); + + /* + * On some systems the persistent_clock can not be detected at + * timekeeping_init by its return value, so if we see a valid + * value returned, update the persistent_clock_exists flag. + */ + if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) + persistent_clock_exists = true; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + timekeeping_forward_now(tk); + timekeeping_suspended = 1; + + if (persistent_clock_exists) { + /* + * To avoid drift caused by repeated suspend/resumes, + * which each can add ~1 second drift error, + * try to compensate so the difference in system time + * and persistent_clock time stays close to constant. + */ + delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta_delta = timespec64_sub(delta, old_delta); + if (abs(delta_delta.tv_sec) >= 2) { + /* + * if delta_delta is too large, assume time correction + * has occurred and set old_delta to the current delta. + */ + old_delta = delta; + } else { + /* Otherwise try to adjust old_system to compensate */ + timekeeping_suspend_time = + timespec64_add(timekeeping_suspend_time, delta_delta); + } + } + + timekeeping_update(tk, TK_MIRROR); + halt_fast_timekeeper(tk); + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + tick_suspend(); + clocksource_suspend(); + clockevents_suspend(); + + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_resume, + .suspend = timekeeping_suspend, +}; + +static int __init timekeeping_init_ops(void) +{ + register_syscore_ops(&timekeeping_syscore_ops); + return 0; +} +device_initcall(timekeeping_init_ops); + +/* + * Apply a multiplier adjustment to the timekeeper + */ +static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, + s64 offset, + bool negative, + int adj_scale) +{ + s64 interval = tk->cycle_interval; + s32 mult_adj = 1; + + if (negative) { + mult_adj = -mult_adj; + interval = -interval; + offset = -offset; + } + mult_adj <<= adj_scale; + interval <<= adj_scale; + offset <<= adj_scale; + + /* + * So the following can be confusing. + * + * To keep things simple, lets assume mult_adj == 1 for now. + * + * When mult_adj != 1, remember that the interval and offset values + * have been appropriately scaled so the math is the same. + * + * The basic idea here is that we're increasing the multiplier + * by one, this causes the xtime_interval to be incremented by + * one cycle_interval. This is because: + * xtime_interval = cycle_interval * mult + * So if mult is being incremented by one: + * xtime_interval = cycle_interval * (mult + 1) + * Its the same as: + * xtime_interval = (cycle_interval * mult) + cycle_interval + * Which can be shortened to: + * xtime_interval += cycle_interval + * + * So offset stores the non-accumulated cycles. Thus the current + * time (in shifted nanoseconds) is: + * now = (offset * adj) + xtime_nsec + * Now, even though we're adjusting the clock frequency, we have + * to keep time consistent. In other words, we can't jump back + * in time, and we also want to avoid jumping forward in time. + * + * So given the same offset value, we need the time to be the same + * both before and after the freq adjustment. + * now = (offset * adj_1) + xtime_nsec_1 + * now = (offset * adj_2) + xtime_nsec_2 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_2) + xtime_nsec_2 + * And we know: + * adj_2 = adj_1 + 1 + * So: + * (offset * adj_1) + xtime_nsec_1 = + * (offset * (adj_1+1)) + xtime_nsec_2 + * (offset * adj_1) + xtime_nsec_1 = + * (offset * adj_1) + offset + xtime_nsec_2 + * Canceling the sides: + * xtime_nsec_1 = offset + xtime_nsec_2 + * Which gives us: + * xtime_nsec_2 = xtime_nsec_1 - offset + * Which simplfies to: + * xtime_nsec -= offset + * + * XXX - TODO: Doc ntp_error calculation. + */ + if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { + /* NTP adjustment caused clocksource mult overflow */ + WARN_ON_ONCE(1); + return; + } + + tk->tkr_mono.mult += mult_adj; + tk->xtime_interval += interval; + tk->tkr_mono.xtime_nsec -= offset; + tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; +} + +/* + * Calculate the multiplier adjustment needed to match the frequency + * specified by NTP + */ +static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, + s64 offset) +{ + s64 interval = tk->cycle_interval; + s64 xinterval = tk->xtime_interval; + s64 tick_error; + bool negative; + u32 adj; + + /* Remove any current error adj from freq calculation */ + if (tk->ntp_err_mult) + xinterval -= tk->cycle_interval; + + tk->ntp_tick = ntp_tick_length(); + + /* Calculate current error per tick */ + tick_error = ntp_tick_length() >> tk->ntp_error_shift; + tick_error -= (xinterval + tk->xtime_remainder); + + /* Don't worry about correcting it if its small */ + if (likely((tick_error >= 0) && (tick_error <= interval))) + return; + + /* preserve the direction of correction */ + negative = (tick_error < 0); + + /* Sort out the magnitude of the correction */ + tick_error = abs(tick_error); + for (adj = 0; tick_error > interval; adj++) + tick_error >>= 1; + + /* scale the corrections */ + timekeeping_apply_adjustment(tk, offset, negative, adj); +} + +/* + * Adjust the timekeeper's multiplier to the correct frequency + * and also to reduce the accumulated error value. + */ +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) +{ + /* Correct for the current frequency error */ + timekeeping_freqadjust(tk, offset); + + /* Next make a small adjustment to fix any cumulative error */ + if (!tk->ntp_err_mult && (tk->ntp_error > 0)) { + tk->ntp_err_mult = 1; + timekeeping_apply_adjustment(tk, offset, 0, 0); + } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) { + /* Undo any existing error adjustment */ + timekeeping_apply_adjustment(tk, offset, 1, 0); + tk->ntp_err_mult = 0; + } + + if (unlikely(tk->tkr_mono.clock->maxadj && + (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) + > tk->tkr_mono.clock->maxadj))) { + printk_once(KERN_WARNING + "Adjusting %s more than 11%% (%ld vs %ld)\n", + tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, + (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); + } + + /* + * It may be possible that when we entered this function, xtime_nsec + * was very small. Further, if we're slightly speeding the clocksource + * in the code above, its possible the required corrective factor to + * xtime_nsec could cause it to underflow. + * + * Now, since we already accumulated the second, cannot simply roll + * the accumulated second back, since the NTP subsystem has been + * notified via second_overflow. So instead we push xtime_nsec forward + * by the amount we underflowed, and add that amount into the error. + * + * We'll correct this error next time through this function, when + * xtime_nsec is not as small. + */ + if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { + s64 neg = -(s64)tk->tkr_mono.xtime_nsec; + tk->tkr_mono.xtime_nsec = 0; + tk->ntp_error += neg << tk->ntp_error_shift; + } +} + +/** + * accumulate_nsecs_to_secs - Accumulates nsecs into secs + * + * Helper function that accumulates a the nsecs greater then a second + * from the xtime_nsec field to the xtime_secs field. + * It also calls into the NTP code to handle leapsecond processing. + * + */ +static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) +{ + u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; + unsigned int clock_set = 0; + + while (tk->tkr_mono.xtime_nsec >= nsecps) { + int leap; + + tk->tkr_mono.xtime_nsec -= nsecps; + tk->xtime_sec++; + + /* Figure out if its a leap sec and apply if needed */ + leap = second_overflow(tk->xtime_sec); + if (unlikely(leap)) { + struct timespec64 ts; + + tk->xtime_sec += leap; + + ts.tv_sec = leap; + ts.tv_nsec = 0; + tk_set_wall_to_mono(tk, + timespec64_sub(tk->wall_to_monotonic, ts)); + + __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + + clock_set = TK_CLOCK_WAS_SET; + } + } + return clock_set; +} + +/** + * logarithmic_accumulation - shifted accumulation of cycles + * + * This functions accumulates a shifted interval of cycles into + * into a shifted interval nanoseconds. Allows for O(log) accumulation + * loop. + * + * Returns the unconsumed cycles. + */ +static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, + u32 shift, + unsigned int *clock_set) +{ + cycle_t interval = tk->cycle_interval << shift; + u64 raw_nsecs; + + /* If the offset is smaller then a shifted interval, do nothing */ + if (offset < interval) + return offset; + + /* Accumulate one shifted interval */ + offset -= interval; + tk->tkr_mono.cycle_last += interval; + tk->tkr_raw.cycle_last += interval; + + tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; + *clock_set |= accumulate_nsecs_to_secs(tk); + + /* Accumulate raw time */ + raw_nsecs = (u64)tk->raw_interval << shift; + raw_nsecs += tk->raw_time.tv_nsec; + if (raw_nsecs >= NSEC_PER_SEC) { + u64 raw_secs = raw_nsecs; + raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); + tk->raw_time.tv_sec += raw_secs; + } + tk->raw_time.tv_nsec = raw_nsecs; + + /* Accumulate error between NTP and clock interval */ + tk->ntp_error += tk->ntp_tick << shift; + tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << + (tk->ntp_error_shift + shift); + + return offset; +} + +/** + * update_wall_time - Uses the current clocksource to increment the wall time + * + */ +void update_wall_time(void) +{ + struct timekeeper *real_tk = &tk_core.timekeeper; + struct timekeeper *tk = &shadow_timekeeper; + cycle_t offset; + int shift = 0, maxshift; + unsigned int clock_set = 0; + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + goto out; + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET + offset = real_tk->cycle_interval; +#else + offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), + tk->tkr_mono.cycle_last, tk->tkr_mono.mask); +#endif + + /* Check if there's really nothing to do */ + if (offset < real_tk->cycle_interval) + goto out; + + /* Do some additional sanity checking */ + timekeeping_check_update(real_tk, offset); + + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller than the offset. We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. + */ + shift = ilog2(offset) - ilog2(tk->cycle_interval); + shift = max(0, shift); + /* Bound shift to one less than what overflows tick_length */ + maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + shift = min(shift, maxshift); + while (offset >= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift, + &clock_set); + if (offset < tk->cycle_interval<<shift) + shift--; + } + + /* correct the clock when NTP error is too big */ + timekeeping_adjust(tk, offset); + + /* + * XXX This can be killed once everyone converts + * to the new update_vsyscall. + */ + old_vsyscall_fixup(tk); + + /* + * Finally, make sure that after the rounding + * xtime_nsec isn't larger than NSEC_PER_SEC + */ + clock_set |= accumulate_nsecs_to_secs(tk); + + write_seqcount_begin(&tk_core.seq); + /* + * Update the real timekeeper. + * + * We could avoid this memcpy by switching pointers, but that + * requires changes to all other timekeeper usage sites as + * well, i.e. move the timekeeper pointer getter into the + * spinlocked/seqcount protected sections. And we trade this + * memcpy under the tk_core.seq against one before we start + * updating. + */ + memcpy(real_tk, tk, sizeof(*tk)); + timekeeping_update(real_tk, clock_set); + write_seqcount_end(&tk_core.seq); +out: + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (clock_set) + /* Have to call _delayed version, since in irq context*/ + clock_was_set_delayed(); +} + +/** + * getboottime64 - Return the real time of system boot. + * @ts: pointer to the timespec64 to be set + * + * Returns the wall-time of boot in a timespec64. + * + * This is based on the wall_to_monotonic offset and the total suspend + * time. Calls to settimeofday will affect the value returned (which + * basically means that however wrong your real time clock is at boot time, + * you get the right time here). + */ +void getboottime64(struct timespec64 *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); + + *ts = ktime_to_timespec64(t); +} +EXPORT_SYMBOL_GPL(getboottime64); + +unsigned long get_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return tk->xtime_sec; +} +EXPORT_SYMBOL(get_seconds); + +struct timespec __current_kernel_time(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return timespec64_to_timespec(tk_xtime(tk)); +} + +struct timespec current_kernel_time(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 now; + unsigned long seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + now = tk_xtime(tk); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return timespec64_to_timespec(now); +} +EXPORT_SYMBOL(current_kernel_time); + +struct timespec64 get_monotonic_coarse64(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct timespec64 now, mono; + unsigned long seq; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + now = tk_xtime(tk); + mono = tk->wall_to_monotonic; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec, + now.tv_nsec + mono.tv_nsec); + + return now; +} + +/* + * Must hold jiffies_lock + */ +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + calc_global_load(ticks); +} + +/** + * ktime_get_update_offsets_tick - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * @offs_tai: pointer to storage for monotonic -> clock tai offset + * + * Returns monotonic time at last tick and various offsets + */ +ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + base = tk->tkr_mono.base; + nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets_now - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * @offs_tai: pointer to storage for monotonic -> clock tai offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interrupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + ktime_t base; + u64 nsecs; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); + + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return ktime_add_ns(base, nsecs); +} +#endif + +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + */ +int do_adjtimex(struct timex *txc) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long flags; + struct timespec64 ts; + s32 orig_tai, tai; + int ret; + + /* Validate the data before disabling interrupts */ + ret = ntp_validate_timex(txc); + if (ret) + return ret; + + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + ret = timekeeping_inject_offset(&delta); + if (ret) + return ret; + } + + getnstimeofday64(&ts); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + orig_tai = tai = tk->tai_offset; + ret = __do_adjtimex(txc, &ts, &tai); + + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tk, tai); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); + } + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + + if (tai != orig_tai) + clock_was_set(); + + ntp_notify_cmos_timer(); + + return ret; +} + +#ifdef CONFIG_NTP_PPS +/** + * hardpps() - Accessor function to NTP __hardpps function + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); + + __hardpps(phase_ts, raw_ts); + + write_seqcount_end(&tk_core.seq); + raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} +EXPORT_SYMBOL(hardpps); +#endif + +/** + * xtime_update() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * Must be called with interrupts disabled. + */ +void xtime_update(unsigned long ticks) +{ + write_seqlock(&jiffies_lock); + do_timer(ticks); + write_sequnlock(&jiffies_lock); + update_wall_time(); +} diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h new file mode 100644 index 000000000..ead8794b9 --- /dev/null +++ b/kernel/time/timekeeping.h @@ -0,0 +1,29 @@ +#ifndef _KERNEL_TIME_TIMEKEEPING_H +#define _KERNEL_TIME_TIMEKEEPING_H +/* + * Internal interfaces for kernel/time/ + */ +extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); +extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); + +extern int timekeeping_valid_for_hres(void); +extern u64 timekeeping_max_deferment(void); +extern int timekeeping_inject_offset(struct timespec *ts); +extern s32 timekeeping_get_tai_offset(void); +extern void timekeeping_set_tai_offset(s32 tai_offset); +extern void timekeeping_clocktai(struct timespec *ts); +extern int timekeeping_suspend(void); +extern void timekeeping_resume(void); + +extern void do_timer(unsigned long ticks); +extern void update_wall_time(void); + +extern seqlock_t jiffies_lock; + +#define CS_NAME_LEN 32 + +#endif diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 000000000..f6bd65236 --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,74 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/debugfs.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/time.h> + +#include "timekeeping_internal.h" + +static unsigned int sleep_time_bin[32] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ + unsigned int bin; + seq_puts(s, " time (secs) count\n"); + seq_puts(s, "------------------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (sleep_time_bin[bin] == 0) + continue; + seq_printf(s, "%10u - %-10u %4u\n", + bin ? 1 << (bin - 1) : 0, 1 << bin, + sleep_time_bin[bin]); + } + return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ + return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { + .open = tk_debug_sleep_time_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); + if (!d) { + pr_err("Failed to create sleep_time debug file\n"); + return -ENOMEM; + } + + return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(struct timespec64 *t) +{ + sleep_time_bin[fls(t->tv_sec)]++; +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 000000000..4ea005a7f --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,29 @@ +#ifndef _TIMEKEEPING_INTERNAL_H +#define _TIMEKEEPING_INTERNAL_H +/* + * timekeeping debug functions + */ +#include <linux/clocksource.h> +#include <linux/time.h> + +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(struct timespec64 *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ + cycle_t ret = (now - last) & mask; + + return (s64) ret > 0 ? ret : 0; +} +#else +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ + return (now - last) & mask; +} +#endif + +#endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c new file mode 100644 index 000000000..2ece3aa50 --- /dev/null +++ b/kernel/time/timer.c @@ -0,0 +1,1720 @@ +/* + * linux/kernel/timer.c + * + * Kernel internal timers + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. + * + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + * Copyright (C) 1998 Andrea Arcangeli + * 1999-03-10 Improved NTP compatibility by Ulrich Windl + * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love + * 2000-10-05 Implemented scalable SMP per-CPU timer handling. + * Copyright (C) 2000, 2001, 2002 Ingo Molnar + * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar + */ + +#include <linux/kernel_stat.h> +#include <linux/export.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pid_namespace.h> +#include <linux/notifier.h> +#include <linux/thread_info.h> +#include <linux/time.h> +#include <linux/jiffies.h> +#include <linux/posix-timers.h> +#include <linux/cpu.h> +#include <linux/syscalls.h> +#include <linux/delay.h> +#include <linux/tick.h> +#include <linux/kallsyms.h> +#include <linux/irq_work.h> +#include <linux/sched.h> +#include <linux/sched/sysctl.h> +#include <linux/slab.h> +#include <linux/compat.h> + +#include <asm/uaccess.h> +#include <asm/unistd.h> +#include <asm/div64.h> +#include <asm/timex.h> +#include <asm/io.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/timer.h> + +__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + +/* + * per-CPU timer vector definitions: + */ +#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) +#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) +#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) + +struct tvec { + struct list_head vec[TVN_SIZE]; +}; + +struct tvec_root { + struct list_head vec[TVR_SIZE]; +}; + +struct tvec_base { + spinlock_t lock; + struct timer_list *running_timer; + unsigned long timer_jiffies; + unsigned long next_timer; + unsigned long active_timers; + unsigned long all_timers; + int cpu; + struct tvec_root tv1; + struct tvec tv2; + struct tvec tv3; + struct tvec tv4; + struct tvec tv5; +} ____cacheline_aligned; + +/* + * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've + * made NULL special, hint: lock_timer_base()) and we cannot get a compile time + * pointer to per-cpu entries because we don't know where we'll map the section, + * even for the boot cpu. + * + * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the + * rest of them. + */ +struct tvec_base boot_tvec_bases; +EXPORT_SYMBOL(boot_tvec_bases); + +static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; + +/* Functions below help us manage 'deferrable' flag */ +static inline unsigned int tbase_get_deferrable(struct tvec_base *base) +{ + return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); +} + +static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) +{ + return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); +} + +static inline struct tvec_base *tbase_get_base(struct tvec_base *base) +{ + return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); +} + +static inline void +timer_set_base(struct timer_list *timer, struct tvec_base *new_base) +{ + unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; + + timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); +} + +static unsigned long round_jiffies_common(unsigned long j, int cpu, + bool force_up) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. + */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. + * But never round down if @force_up is set. + */ + if (rem < HZ/4 && !force_up) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + /* + * Make sure j is still in the future. Otherwise return the + * unmodified value. + */ + return time_is_after_jiffies(j) ? j : original; +} + +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, false); +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, false) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), false); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + +/** + * __round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, true); +} +EXPORT_SYMBOL_GPL(__round_jiffies_up); + +/** + * __round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, true) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); + +/** + * round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * This is the same as round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), true); +} +EXPORT_SYMBOL_GPL(round_jiffies_up); + +/** + * round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * This is the same as round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up_relative(unsigned long j) +{ + return __round_jiffies_up_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_up_relative); + +/** + * set_timer_slack - set the allowed slack for a timer + * @timer: the timer to be modified + * @slack_hz: the amount of time (in jiffies) allowed for rounding + * + * Set the amount of time, in jiffies, that a certain timer has + * in terms of slack. By setting this value, the timer subsystem + * will schedule the actual timer somewhere between + * the time mod_timer() asks for, and that time plus the slack. + * + * By setting the slack to -1, a percentage of the delay is used + * instead. + */ +void set_timer_slack(struct timer_list *timer, int slack_hz) +{ + timer->slack = slack_hz; +} +EXPORT_SYMBOL_GPL(set_timer_slack); + +/* + * If the list is empty, catch up ->timer_jiffies to the current time. + * The caller must hold the tvec_base lock. Returns true if the list + * was empty and therefore ->timer_jiffies was updated. + */ +static bool catchup_timer_jiffies(struct tvec_base *base) +{ + if (!base->all_timers) { + base->timer_jiffies = jiffies; + return true; + } + return false; +} + +static void +__internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + unsigned long expires = timer->expires; + unsigned long idx = expires - base->timer_jiffies; + struct list_head *vec; + + if (idx < TVR_SIZE) { + int i = expires & TVR_MASK; + vec = base->tv1.vec + i; + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { + int i = (expires >> TVR_BITS) & TVN_MASK; + vec = base->tv2.vec + i; + } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; + vec = base->tv3.vec + i; + } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; + vec = base->tv4.vec + i; + } else if ((signed long) idx < 0) { + /* + * Can happen if you add a timer with expires == jiffies, + * or you set a timer to go off in the past + */ + vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + } else { + int i; + /* If the timeout is larger than MAX_TVAL (on 64-bit + * architectures or with CONFIG_BASE_SMALL=1) then we + * use the maximum timeout. + */ + if (idx > MAX_TVAL) { + idx = MAX_TVAL; + expires = idx + base->timer_jiffies; + } + i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; + vec = base->tv5.vec + i; + } + /* + * Timers are FIFO: + */ + list_add_tail(&timer->entry, vec); +} + +static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + (void)catchup_timer_jiffies(base); + __internal_add_timer(base, timer); + /* + * Update base->active_timers and base->next_timer + */ + if (!tbase_get_deferrable(timer->base)) { + if (!base->active_timers++ || + time_before(timer->expires, base->next_timer)) + base->next_timer = timer->expires; + } + base->all_timers++; + + /* + * Check whether the other CPU is in dynticks mode and needs + * to be triggered to reevaluate the timer wheel. + * We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to stop its tick can not + * evaluate the timer wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. + */ + if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); +} + +#ifdef CONFIG_TIMER_STATS +void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} + +static void timer_stats_account_timer(struct timer_list *timer) +{ + unsigned int flag = 0; + + if (likely(!timer->start_site)) + return; + if (unlikely(tbase_get_deferrable(timer->base))) + flag |= TIMER_STATS_FLAG_DEFERRABLE; + + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, flag); +} + +#else +static void timer_stats_account_timer(struct timer_list *timer) {} +#endif + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr timer_debug_descr; + +static void *timer_debug_hint(void *addr) +{ + return ((struct timer_list *) addr)->function; +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int timer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_init(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +/* Stub timer callback for improperly used timers. */ +static void stub_timer(unsigned long data) +{ + WARN_ON(1); +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int timer_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. The timer was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + if (timer->entry.next == NULL && + timer->entry.prev == TIMER_ENTRY_STATIC) { + debug_object_init(timer, &timer_debug_descr); + debug_object_activate(timer, &timer_debug_descr); + return 0; + } else { + setup_timer(timer, stub_timer, 0); + return 1; + } + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int timer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_free(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_assert_init is called when: + * - an untracked/uninit-ed object is found + */ +static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_NOTAVAILABLE: + if (timer->entry.prev == TIMER_ENTRY_STATIC) { + /* + * This is not really a fixup. The timer was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + debug_object_init(timer, &timer_debug_descr); + return 0; + } else { + setup_timer(timer, stub_timer, 0); + return 1; + } + default: + return 0; + } +} + +static struct debug_obj_descr timer_debug_descr = { + .name = "timer_list", + .debug_hint = timer_debug_hint, + .fixup_init = timer_fixup_init, + .fixup_activate = timer_fixup_activate, + .fixup_free = timer_fixup_free, + .fixup_assert_init = timer_fixup_assert_init, +}; + +static inline void debug_timer_init(struct timer_list *timer) +{ + debug_object_init(timer, &timer_debug_descr); +} + +static inline void debug_timer_activate(struct timer_list *timer) +{ + debug_object_activate(timer, &timer_debug_descr); +} + +static inline void debug_timer_deactivate(struct timer_list *timer) +{ + debug_object_deactivate(timer, &timer_debug_descr); +} + +static inline void debug_timer_free(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} + +static inline void debug_timer_assert_init(struct timer_list *timer) +{ + debug_object_assert_init(timer, &timer_debug_descr); +} + +static void do_init_timer(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key); + +void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) +{ + debug_object_init_on_stack(timer, &timer_debug_descr); + do_init_timer(timer, flags, name, key); +} +EXPORT_SYMBOL_GPL(init_timer_on_stack_key); + +void destroy_timer_on_stack(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_timer_on_stack); + +#else +static inline void debug_timer_init(struct timer_list *timer) { } +static inline void debug_timer_activate(struct timer_list *timer) { } +static inline void debug_timer_deactivate(struct timer_list *timer) { } +static inline void debug_timer_assert_init(struct timer_list *timer) { } +#endif + +static inline void debug_init(struct timer_list *timer) +{ + debug_timer_init(timer); + trace_timer_init(timer); +} + +static inline void +debug_activate(struct timer_list *timer, unsigned long expires) +{ + debug_timer_activate(timer); + trace_timer_start(timer, expires); +} + +static inline void debug_deactivate(struct timer_list *timer) +{ + debug_timer_deactivate(timer); + trace_timer_cancel(timer); +} + +static inline void debug_assert_init(struct timer_list *timer) +{ + debug_timer_assert_init(timer); +} + +static void do_init_timer(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) +{ + struct tvec_base *base = raw_cpu_read(tvec_bases); + + timer->entry.next = NULL; + timer->base = (void *)((unsigned long)base | flags); + timer->slack = -1; +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif + lockdep_init_map(&timer->lockdep_map, name, key, 0); +} + +/** + * init_timer_key - initialize a timer + * @timer: the timer to be initialized + * @flags: timer flags + * @name: name of the timer + * @key: lockdep class key of the fake lock used for tracking timer + * sync lock dependencies + * + * init_timer_key() must be done to a timer prior calling *any* of the + * other timer functions. + */ +void init_timer_key(struct timer_list *timer, unsigned int flags, + const char *name, struct lock_class_key *key) +{ + debug_init(timer); + do_init_timer(timer, flags, name, key); +} +EXPORT_SYMBOL(init_timer_key); + +static inline void detach_timer(struct timer_list *timer, bool clear_pending) +{ + struct list_head *entry = &timer->entry; + + debug_deactivate(timer); + + __list_del(entry->prev, entry->next); + if (clear_pending) + entry->next = NULL; + entry->prev = LIST_POISON2; +} + +static inline void +detach_expired_timer(struct timer_list *timer, struct tvec_base *base) +{ + detach_timer(timer, true); + if (!tbase_get_deferrable(timer->base)) + base->active_timers--; + base->all_timers--; + (void)catchup_timer_jiffies(base); +} + +static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, + bool clear_pending) +{ + if (!timer_pending(timer)) + return 0; + + detach_timer(timer, clear_pending); + if (!tbase_get_deferrable(timer->base)) { + base->active_timers--; + if (timer->expires == base->next_timer) + base->next_timer = base->timer_jiffies; + } + base->all_timers--; + (void)catchup_timer_jiffies(base); + return 1; +} + +/* + * We are using hashed locking: holding per_cpu(tvec_bases).lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on ->tvX lists. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. + */ +static struct tvec_base *lock_timer_base(struct timer_list *timer, + unsigned long *flags) + __acquires(timer->base->lock) +{ + struct tvec_base *base; + + for (;;) { + struct tvec_base *prelock_base = timer->base; + base = tbase_get_base(prelock_base); + if (likely(base != NULL)) { + spin_lock_irqsave(&base->lock, *flags); + if (likely(prelock_base == timer->base)) + return base; + /* The timer has migrated to another CPU */ + spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +} + +static inline int +__mod_timer(struct timer_list *timer, unsigned long expires, + bool pending_only, int pinned) +{ + struct tvec_base *base, *new_base; + unsigned long flags; + int ret = 0 , cpu; + + timer_stats_timer_set_start_info(timer); + BUG_ON(!timer->function); + + base = lock_timer_base(timer, &flags); + + ret = detach_if_pending(timer, base, false); + if (!ret && pending_only) + goto out_unlock; + + debug_activate(timer, expires); + + cpu = get_nohz_timer_target(pinned); + new_base = per_cpu(tvec_bases, cpu); + + if (base != new_base) { + /* + * We are trying to schedule the timer on the local CPU. + * However we can't change timer's base while it is running, + * otherwise del_timer_sync() can't detect that the timer's + * handler yet has not finished. This also guarantees that + * the timer is serialized wrt itself. + */ + if (likely(base->running_timer != timer)) { + /* See the comment in lock_timer_base() */ + timer_set_base(timer, NULL); + spin_unlock(&base->lock); + base = new_base; + spin_lock(&base->lock); + timer_set_base(timer, base); + } + } + + timer->expires = expires; + internal_add_timer(base, timer); + +out_unlock: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + +/** + * mod_timer_pending - modify a pending timer's timeout + * @timer: the pending timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer_pending() is the same for pending timers as mod_timer(), + * but will not re-activate and modify already deleted timers. + * + * It is useful for unserialized use of timers. + */ +int mod_timer_pending(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); +} +EXPORT_SYMBOL(mod_timer_pending); + +/* + * Decide where to put the timer while taking the slack into account + * + * Algorithm: + * 1) calculate the maximum (absolute) time + * 2) calculate the highest bit where the expires and new max are different + * 3) use this bit to make a mask + * 4) use the bitmask to round down the maximum time, so that all last + * bits are zeros + */ +static inline +unsigned long apply_slack(struct timer_list *timer, unsigned long expires) +{ + unsigned long expires_limit, mask; + int bit; + + if (timer->slack >= 0) { + expires_limit = expires + timer->slack; + } else { + long delta = expires - jiffies; + + if (delta < 256) + return expires; + + expires_limit = expires + delta / 256; + } + mask = expires ^ expires_limit; + if (mask == 0) + return expires; + + bit = find_last_bit(&mask, BITS_PER_LONG); + + mask = (1UL << bit) - 1; + + expires_limit = expires_limit & ~(mask); + + return expires_limit; +} + +/** + * mod_timer - modify a timer's timeout + * @timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer() is a more efficient way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * + * mod_timer(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + * + * Note that if there are multiple unserialized concurrent users of the + * same timer, then mod_timer() is the only safe way to modify the timeout, + * since add_timer() cannot modify an already running timer. + * + * The function returns whether it has modified a pending timer or not. + * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an + * active timer returns 1.) + */ +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + expires = apply_slack(timer, expires); + + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer_pending(timer) && timer->expires == expires) + return 1; + + return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); +} +EXPORT_SYMBOL(mod_timer); + +/** + * mod_timer_pinned - modify a timer's timeout + * @timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer_pinned() is a way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * and to ensure that the timer is scheduled on the current CPU. + * + * Note that this does not prevent the timer from being migrated + * when the current CPU goes offline. If this is a problem for + * you, use CPU-hotplug notifiers to handle it correctly, for + * example, cancelling the timer when the corresponding CPU goes + * offline. + * + * mod_timer_pinned(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + */ +int mod_timer_pinned(struct timer_list *timer, unsigned long expires) +{ + if (timer->expires == expires && timer_pending(timer)) + return 1; + + return __mod_timer(timer, expires, false, TIMER_PINNED); +} +EXPORT_SYMBOL(mod_timer_pinned); + +/** + * add_timer - start a timer + * @timer: the timer to be added + * + * The kernel will do a ->function(->data) callback from the + * timer interrupt at the ->expires point in the future. The + * current time is 'jiffies'. + * + * The timer's ->expires, ->function (and if the handler uses it, ->data) + * fields must be set prior calling this function. + * + * Timers with an ->expires field in the past will be executed in the next + * timer tick. + */ +void add_timer(struct timer_list *timer) +{ + BUG_ON(timer_pending(timer)); + mod_timer(timer, timer->expires); +} +EXPORT_SYMBOL(add_timer); + +/** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + struct tvec_base *base = per_cpu(tvec_bases, cpu); + unsigned long flags; + + timer_stats_timer_set_start_info(timer); + BUG_ON(timer_pending(timer) || !timer->function); + spin_lock_irqsave(&base->lock, flags); + timer_set_base(timer, base); + debug_activate(timer, timer->expires); + internal_add_timer(base, timer); + spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(add_timer_on); + +/** + * del_timer - deactive a timer. + * @timer: the timer to be deactivated + * + * del_timer() deactivates a timer - this works on both active and inactive + * timers. + * + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + */ +int del_timer(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = 0; + + debug_assert_init(timer); + + timer_stats_timer_clear_start_info(timer); + if (timer_pending(timer)) { + base = lock_timer_base(timer, &flags); + ret = detach_if_pending(timer, base, true); + spin_unlock_irqrestore(&base->lock, flags); + } + + return ret; +} +EXPORT_SYMBOL(del_timer); + +/** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: timer do del + * + * This function tries to deactivate a timer. Upon successful (ret >= 0) + * exit the timer is not queued and the handler is not running on any CPU. + */ +int try_to_del_timer_sync(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = -1; + + debug_assert_init(timer); + + base = lock_timer_base(timer, &flags); + + if (base->running_timer != timer) { + timer_stats_timer_clear_start_info(timer); + ret = detach_if_pending(timer, base, true); + } + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} +EXPORT_SYMBOL(try_to_del_timer_sync); + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); + +/** + * del_timer_sync - deactivate a timer and wait for the handler to finish. + * @timer: the timer to be deactivated + * + * This function only differs from del_timer() on SMP: besides deactivating + * the timer it also makes sure the handler has finished executing on other + * CPUs. + * + * Synchronization rules: Callers must prevent restarting of the timer, + * otherwise this function is meaningless. It must not be called from + * interrupt contexts unless the timer is an irqsafe one. The caller must + * not hold locks which would prevent completion of the timer's + * handler. The timer's handler must not call add_timer_on(). Upon exit the + * timer is not queued and the handler is not running on any CPU. + * + * Note: For !irqsafe timers, you must not hold locks that are held in + * interrupt context while calling this function. Even if the lock has + * nothing to do with the timer in question. Here's why: + * + * CPU0 CPU1 + * ---- ---- + * <SOFTIRQ> + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); + * <IRQ> + * spin_lock(somelock); + * del_timer_sync(mytimer); + * while (base->running_timer == mytimer); + * + * Now del_timer_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but + * it has interrupted the softirq that CPU0 is waiting to finish. + * + * The function returns whether it has deactivated a pending timer or not. + */ +int del_timer_sync(struct timer_list *timer) +{ +#ifdef CONFIG_LOCKDEP + unsigned long flags; + + /* + * If lockdep gives a backtrace here, please reference + * the synchronization rules above. + */ + local_irq_save(flags); + lock_map_acquire(&timer->lockdep_map); + lock_map_release(&timer->lockdep_map); + local_irq_restore(flags); +#endif + /* + * don't use it in hardirq context, because it + * could lead to deadlock. + */ + WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); + for (;;) { + int ret = try_to_del_timer_sync(timer); + if (ret >= 0) + return ret; + cpu_relax(); + } +} +EXPORT_SYMBOL(del_timer_sync); +#endif + +static int cascade(struct tvec_base *base, struct tvec *tv, int index) +{ + /* cascade all the timers from tv up one level */ + struct timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init(tv->vec + index, &tv_list); + + /* + * We are removing _all_ timers from the list, so we + * don't have to detach them individually. + */ + list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + BUG_ON(tbase_get_base(timer->base) != base); + /* No accounting, while moving them */ + __internal_add_timer(base, timer); + } + + return index; +} + +static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), + unsigned long data) +{ + int count = preempt_count(); + +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the timer from inside the + * function that is called from it, this we need to take into + * account for lockdep too. To avoid bogus "held lock freed" + * warnings as well as problems when looking into + * timer->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map; + + lockdep_copy_map(&lockdep_map, &timer->lockdep_map); +#endif + /* + * Couple the lock chain with the lock chain at + * del_timer_sync() by acquiring the lock_map around the fn() + * call here and in del_timer_sync(). + */ + lock_map_acquire(&lockdep_map); + + trace_timer_expire_entry(timer); + fn(data); + trace_timer_expire_exit(timer); + + lock_map_release(&lockdep_map); + + if (count != preempt_count()) { + WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + fn, count, preempt_count()); + /* + * Restore the preempt count. That gives us a decent + * chance to survive and extract information. If the + * callback kept a lock held, bad luck, but not worse + * than the BUG() we had. + */ + preempt_count_set(count); + } +} + +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) + +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + * + * This function cascades all vectors and executes all expired timer + * vectors. + */ +static inline void __run_timers(struct tvec_base *base) +{ + struct timer_list *timer; + + spin_lock_irq(&base->lock); + if (catchup_timer_jiffies(base)) { + spin_unlock_irq(&base->lock); + return; + } + while (time_after_eq(jiffies, base->timer_jiffies)) { + struct list_head work_list; + struct list_head *head = &work_list; + int index = base->timer_jiffies & TVR_MASK; + + /* + * Cascade timers: + */ + if (!index && + (!cascade(base, &base->tv2, INDEX(0))) && + (!cascade(base, &base->tv3, INDEX(1))) && + !cascade(base, &base->tv4, INDEX(2))) + cascade(base, &base->tv5, INDEX(3)); + ++base->timer_jiffies; + list_replace_init(base->tv1.vec + index, head); + while (!list_empty(head)) { + void (*fn)(unsigned long); + unsigned long data; + bool irqsafe; + + timer = list_first_entry(head, struct timer_list,entry); + fn = timer->function; + data = timer->data; + irqsafe = tbase_get_irqsafe(timer->base); + + timer_stats_account_timer(timer); + + base->running_timer = timer; + detach_expired_timer(timer, base); + + if (irqsafe) { + spin_unlock(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock(&base->lock); + } else { + spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock_irq(&base->lock); + } + } + } + base->running_timer = NULL; + spin_unlock_irq(&base->lock); +} + +#ifdef CONFIG_NO_HZ_COMMON +/* + * Find out when the next timer event is due to happen. This + * is used on S/390 to stop all activity when a CPU is idle. + * This function needs to be called with interrupts disabled. + */ +static unsigned long __next_timer_interrupt(struct tvec_base *base) +{ + unsigned long timer_jiffies = base->timer_jiffies; + unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; + int index, slot, array, found = 0; + struct timer_list *nte; + struct tvec *varray[4]; + + /* Look for timer events in tv1. */ + index = slot = timer_jiffies & TVR_MASK; + do { + list_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + + found = 1; + expires = nte->expires; + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + goto cascade; + return expires; + } + slot = (slot + 1) & TVR_MASK; + } while (slot != index); + +cascade: + /* Calculate the next cascade event */ + if (index) + timer_jiffies += TVR_SIZE - index; + timer_jiffies >>= TVR_BITS; + + /* Check tv2-tv5. */ + varray[0] = &base->tv2; + varray[1] = &base->tv3; + varray[2] = &base->tv4; + varray[3] = &base->tv5; + + for (array = 0; array < 4; array++) { + struct tvec *varp = varray[array]; + + index = slot = timer_jiffies & TVN_MASK; + do { + list_for_each_entry(nte, varp->vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + + found = 1; + if (time_before(nte->expires, expires)) + expires = nte->expires; + } + /* + * Do we still search for the first timer or are + * we looking up the cascade buckets ? + */ + if (found) { + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + break; + return expires; + } + slot = (slot + 1) & TVN_MASK; + } while (slot != index); + + if (index) + timer_jiffies += TVN_SIZE - index; + timer_jiffies >>= TVN_BITS; + } + return expires; +} + +/* + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +static unsigned long cmp_next_hrtimer_event(unsigned long now, + unsigned long expires) +{ + ktime_t hr_delta = hrtimer_get_next_event(); + struct timespec tsdelta; + unsigned long delta; + + if (hr_delta.tv64 == KTIME_MAX) + return expires; + + /* + * Expired timer available, let it expire in the next tick + */ + if (hr_delta.tv64 <= 0) + return now + 1; + + tsdelta = ktime_to_timespec(hr_delta); + delta = timespec_to_jiffies(&tsdelta); + + /* + * Limit the delta to the max value, which is checked in + * tick_nohz_stop_sched_tick(): + */ + if (delta > NEXT_TIMER_MAX_DELTA) + delta = NEXT_TIMER_MAX_DELTA; + + /* + * Take rounding errors in to account and make sure, that it + * expires in the next tick. Otherwise we go into an endless + * ping pong due to tick_nohz_stop_sched_tick() retriggering + * the timer softirq + */ + if (delta < 1) + delta = 1; + now += delta; + if (time_before(now, expires)) + return now; + return expires; +} + +/** + * get_next_timer_interrupt - return the jiffy of the next pending timer + * @now: current time (in jiffies) + */ +unsigned long get_next_timer_interrupt(unsigned long now) +{ + struct tvec_base *base = __this_cpu_read(tvec_bases); + unsigned long expires = now + NEXT_TIMER_MAX_DELTA; + + /* + * Pretend that there is no timer pending if the cpu is offline. + * Possible pending timers will be migrated later to an active cpu. + */ + if (cpu_is_offline(smp_processor_id())) + return expires; + + spin_lock(&base->lock); + if (base->active_timers) { + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; + } + spin_unlock(&base->lock); + + if (time_before_eq(expires, now)) + return now; + + return cmp_next_hrtimer_event(now, expires); +} +#endif + +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); + run_local_timers(); + rcu_check_callbacks(user_tick); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_tick(); +#endif + scheduler_tick(); + run_posix_cpu_timers(p); +} + +/* + * This function runs timers and the timer-tq in bottom half context. + */ +static void run_timer_softirq(struct softirq_action *h) +{ + struct tvec_base *base = __this_cpu_read(tvec_bases); + + hrtimer_run_pending(); + + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); +} + +/* + * Called by the local, per-CPU timer interrupt on SMP. + */ +void run_local_timers(void) +{ + hrtimer_run_queues(); + raise_softirq(TIMER_SOFTIRQ); +} + +#ifdef __ARCH_WANT_SYS_ALARM + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +SYSCALL_DEFINE1(alarm, unsigned int, seconds) +{ + return alarm_setitimer(seconds); +} + +#endif + +static void process_timeout(unsigned long __data) +{ + wake_up_process((struct task_struct *)__data); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * in jiffies will be returned, or 0 if the timer expired in time + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * In all cases the return value is guaranteed to be non-negative. + */ +signed long __sched schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx\n", timeout); + dump_stack(); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); + __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); + schedule(); + del_singleshot_timer_sync(&timer); + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} +EXPORT_SYMBOL(schedule_timeout); + +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally. + */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_killable(signed long timeout) +{ + __set_current_state(TASK_KILLABLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_killable); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + +#ifdef CONFIG_HOTPLUG_CPU +static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) +{ + struct timer_list *timer; + + while (!list_empty(head)) { + timer = list_first_entry(head, struct timer_list, entry); + /* We ignore the accounting on the dying cpu */ + detach_timer(timer, false); + timer_set_base(timer, new_base); + internal_add_timer(new_base, timer); + } +} + +static void migrate_timers(int cpu) +{ + struct tvec_base *old_base; + struct tvec_base *new_base; + int i; + + BUG_ON(cpu_online(cpu)); + old_base = per_cpu(tvec_bases, cpu); + new_base = get_cpu_var(tvec_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + BUG_ON(old_base->running_timer); + + for (i = 0; i < TVR_SIZE; i++) + migrate_timer_list(new_base, old_base->tv1.vec + i); + for (i = 0; i < TVN_SIZE; i++) { + migrate_timer_list(new_base, old_base->tv2.vec + i); + migrate_timer_list(new_base, old_base->tv3.vec + i); + migrate_timer_list(new_base, old_base->tv4.vec + i); + migrate_timer_list(new_base, old_base->tv5.vec + i); + } + + old_base->active_timers = 0; + old_base->all_timers = 0; + + spin_unlock(&old_base->lock); + spin_unlock_irq(&new_base->lock); + put_cpu_var(tvec_bases); +} + +static int timer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + migrate_timers((long)hcpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static inline void timer_register_cpu_notifier(void) +{ + cpu_notifier(timer_cpu_notify, 0); +} +#else +static inline void timer_register_cpu_notifier(void) { } +#endif /* CONFIG_HOTPLUG_CPU */ + +static void __init init_timer_cpu(struct tvec_base *base, int cpu) +{ + int j; + + BUG_ON(base != tbase_get_base(base)); + + base->cpu = cpu; + per_cpu(tvec_bases, cpu) = base; + spin_lock_init(&base->lock); + + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + + base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; +} + +static void __init init_timer_cpus(void) +{ + struct tvec_base *base; + int local_cpu = smp_processor_id(); + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == local_cpu) + base = &boot_tvec_bases; +#ifdef CONFIG_SMP + else + base = per_cpu_ptr(&__tvec_bases, cpu); +#endif + + init_timer_cpu(base, cpu); + } +} + +void __init init_timers(void) +{ + /* ensure there are enough low bits for flags in timer->base pointer */ + BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); + + init_timer_cpus(); + init_timer_stats(); + timer_register_cpu_notifier(); + open_softirq(TIMER_SOFTIRQ, run_timer_softirq); +} + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +} + +EXPORT_SYMBOL(msleep); + +/** + * msleep_interruptible - sleep waiting for signals + * @msecs: Time in milliseconds to sleep for + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); + return jiffies_to_msecs(timeout); +} + +EXPORT_SYMBOL(msleep_interruptible); + +static int __sched do_usleep_range(unsigned long min, unsigned long max) +{ + ktime_t kmin; + unsigned long delta; + + kmin = ktime_set(0, min * NSEC_PER_USEC); + delta = (max - min) * NSEC_PER_USEC; + return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); +} + +/** + * usleep_range - Drop in replacement for udelay where wakeup is flexible + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + */ +void usleep_range(unsigned long min, unsigned long max) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + do_usleep_range(min, max); +} +EXPORT_SYMBOL(usleep_range); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c new file mode 100644 index 000000000..e878c2e0b --- /dev/null +++ b/kernel/time/timer_list.c @@ -0,0 +1,396 @@ +/* + * kernel/time/timer_list.c + * + * List pending timers + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> + +#include <asm/uaccess.h> + +#include "tick-internal.h" + +struct timer_list_iter { + int cpu; + bool second_pass; + u64 now; +}; + +typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +/* + * This allows printing both to /proc/timer_list and + * to the console (on SysRq-Q): + */ +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ + } while (0) + +static void print_name_offset(struct seq_file *m, void *sym) +{ + char symname[KSYM_NAME_LEN]; + + if (lookup_symbol_name((unsigned long)sym, symname) < 0) + SEQ_printf(m, "<%pK>", sym); + else + SEQ_printf(m, "%s", symname); +} + +static void +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, + int idx, u64 now) +{ +#ifdef CONFIG_TIMER_STATS + char tmp[TASK_COMM_LEN + 1]; +#endif + SEQ_printf(m, " #%d: ", idx); + print_name_offset(m, taddr); + SEQ_printf(m, ", "); + print_name_offset(m, timer->function); + SEQ_printf(m, ", S:%02lx", timer->state); +#ifdef CONFIG_TIMER_STATS + SEQ_printf(m, ", "); + print_name_offset(m, timer->start_site); + memcpy(tmp, timer->start_comm, TASK_COMM_LEN); + tmp[TASK_COMM_LEN] = 0; + SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); +#endif + SEQ_printf(m, "\n"); + SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", + (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), + (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)), + (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now), + (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); +} + +static void +print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, + u64 now) +{ + struct hrtimer *timer, tmp; + unsigned long next = 0, i; + struct timerqueue_node *curr; + unsigned long flags; + +next_one: + i = 0; + raw_spin_lock_irqsave(&base->cpu_base->lock, flags); + + curr = timerqueue_getnext(&base->active); + /* + * Crude but we have to do this O(N*N) thing, because + * we have to unlock the base when printing: + */ + while (curr && i < next) { + curr = timerqueue_iterate_next(curr); + i++; + } + + if (curr) { + + timer = container_of(curr, struct hrtimer, node); + tmp = *timer; + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); + + print_timer(m, timer, &tmp, i, now); + next++; + goto next_one; + } + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); +} + +static void +print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) +{ + SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .index: %d\n", + base->index); + SEQ_printf(m, " .resolution: %Lu nsecs\n", + (unsigned long long)ktime_to_ns(base->resolution)); + SEQ_printf(m, " .get_time: "); + print_name_offset(m, base->get_time); + SEQ_printf(m, "\n"); +#ifdef CONFIG_HIGH_RES_TIMERS + SEQ_printf(m, " .offset: %Lu nsecs\n", + (unsigned long long) ktime_to_ns(base->offset)); +#endif + SEQ_printf(m, "active timers:\n"); + print_active_timers(m, base, now); +} + +static void print_cpu(struct seq_file *m, int cpu, u64 now) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + SEQ_printf(m, "cpu: %d\n", cpu); + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + SEQ_printf(m, " clock %d:\n", i); + print_base(m, cpu_base->clock_base + i, now); + } +#define P(x) \ + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(cpu_base->x)) +#define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(cpu_base->x))) + +#ifdef CONFIG_HIGH_RES_TIMERS + P_ns(expires_next); + P(hres_active); + P(nr_events); + P(nr_retries); + P(nr_hangs); + P_ns(max_hang_time); +#endif +#undef P +#undef P_ns + +#ifdef CONFIG_TICK_ONESHOT +# define P(x) \ + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(ts->x)) +# define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(ts->x))) + { + struct tick_sched *ts = tick_get_tick_sched(cpu); + P(nohz_mode); + P_ns(last_tick); + P(tick_stopped); + P(idle_jiffies); + P(idle_calls); + P(idle_sleeps); + P_ns(idle_entrytime); + P_ns(idle_waketime); + P_ns(idle_exittime); + P_ns(idle_sleeptime); + P_ns(iowait_sleeptime); + P(last_jiffies); + P(next_jiffies); + P_ns(idle_expires); + SEQ_printf(m, "jiffies: %Lu\n", + (unsigned long long)jiffies); + } +#endif + +#undef P +#undef P_ns + SEQ_printf(m, "\n"); +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +static void +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) +{ + struct clock_event_device *dev = td->evtdev; + + SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); + if (cpu < 0) + SEQ_printf(m, "Broadcast device\n"); + else + SEQ_printf(m, "Per CPU device: %d\n", cpu); + + SEQ_printf(m, "Clock Event Device: "); + if (!dev) { + SEQ_printf(m, "<NULL>\n"); + return; + } + SEQ_printf(m, "%s\n", dev->name); + SEQ_printf(m, " max_delta_ns: %llu\n", + (unsigned long long) dev->max_delta_ns); + SEQ_printf(m, " min_delta_ns: %llu\n", + (unsigned long long) dev->min_delta_ns); + SEQ_printf(m, " mult: %u\n", dev->mult); + SEQ_printf(m, " shift: %u\n", dev->shift); + SEQ_printf(m, " mode: %d\n", dev->mode); + SEQ_printf(m, " next_event: %Ld nsecs\n", + (unsigned long long) ktime_to_ns(dev->next_event)); + + SEQ_printf(m, " set_next_event: "); + print_name_offset(m, dev->set_next_event); + SEQ_printf(m, "\n"); + + if (dev->set_mode) { + SEQ_printf(m, " set_mode: "); + print_name_offset(m, dev->set_mode); + SEQ_printf(m, "\n"); + } else { + if (dev->set_state_shutdown) { + SEQ_printf(m, " shutdown: "); + print_name_offset(m, dev->set_state_shutdown); + SEQ_printf(m, "\n"); + } + + if (dev->set_state_periodic) { + SEQ_printf(m, " periodic: "); + print_name_offset(m, dev->set_state_periodic); + SEQ_printf(m, "\n"); + } + + if (dev->set_state_oneshot) { + SEQ_printf(m, " oneshot: "); + print_name_offset(m, dev->set_state_oneshot); + SEQ_printf(m, "\n"); + } + + if (dev->tick_resume) { + SEQ_printf(m, " resume: "); + print_name_offset(m, dev->tick_resume); + SEQ_printf(m, "\n"); + } + } + + SEQ_printf(m, " event_handler: "); + print_name_offset(m, dev->event_handler); + SEQ_printf(m, "\n"); + SEQ_printf(m, " retries: %lu\n", dev->retries); + SEQ_printf(m, "\n"); +} + +static void timer_list_show_tickdevices_header(struct seq_file *m) +{ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + print_tickdevice(m, tick_get_broadcast_device(), -1); + SEQ_printf(m, "tick_broadcast_mask: %08lx\n", + cpumask_bits(tick_get_broadcast_mask())[0]); +#ifdef CONFIG_TICK_ONESHOT + SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", + cpumask_bits(tick_get_broadcast_oneshot_mask())[0]); +#endif + SEQ_printf(m, "\n"); +#endif +} +#endif + +static inline void timer_list_header(struct seq_file *m, u64 now) +{ + SEQ_printf(m, "Timer List Version: v0.7\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + SEQ_printf(m, "\n"); +} + +static int timer_list_show(struct seq_file *m, void *v) +{ + struct timer_list_iter *iter = v; + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, iter->now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + +void sysrq_timer_list_show(void) +{ + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + timer_list_header(NULL, now); + + for_each_online_cpu(cpu) + print_cpu(NULL, cpu, now); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS + timer_list_show_tickdevices_header(NULL); + for_each_online_cpu(cpu) + print_tickdevice(NULL, tick_get_device(cpu), cpu); +#endif + return; +} + +static void *move_iter(struct timer_list_iter *iter, loff_t offset) +{ + for (; offset; offset--) { + iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); + if (iter->cpu >= nr_cpu_ids) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (!iter->second_pass) { + iter->cpu = -1; + iter->second_pass = true; + } else + return NULL; +#else + return NULL; +#endif + } + } + return iter; +} + +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + + if (!*offset) + iter->now = ktime_to_ns(ktime_get()); + iter->cpu = -1; + iter->second_pass = false; + return move_iter(iter, *offset); +} + +static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) +{ + struct timer_list_iter *iter = file->private; + ++*offset; + return move_iter(iter, 1); +} + +static void timer_list_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations timer_list_sops = { + .start = timer_list_start, + .next = timer_list_next, + .stop = timer_list_stop, + .show = timer_list_show, +}; + +static int timer_list_open(struct inode *inode, struct file *filp) +{ + return seq_open_private(filp, &timer_list_sops, + sizeof(struct timer_list_iter)); +} + +static const struct file_operations timer_list_fops = { + .open = timer_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int __init init_timer_list_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = proc_create("timer_list", 0444, NULL, &timer_list_fops); + if (!pe) + return -ENOMEM; + return 0; +} +__initcall(init_timer_list_procfs); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c new file mode 100644 index 000000000..1fb08f213 --- /dev/null +++ b/kernel/time/timer_stats.c @@ -0,0 +1,425 @@ +/* + * kernel/time/timer_stats.c + * + * Collect timer usage statistics. + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * timer_stats is based on timer_top, a similar functionality which was part of + * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the + * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based + * on dynamic allocation of the statistics entries and linear search based + * lookup combined with a global lock, rather than the static array, hash + * and per-CPU locking which is used by timer_stats. It was written for the + * pre hrtimer kernel code and therefore did not take hrtimers into account. + * Nevertheless it provided the base for the timer_stats implementation and + * was a helpful source of inspiration. Kudos to Daniel and the Nokia folks + * for this effort. + * + * timer_top.c is + * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus + * Written by Daniel Petrini <d.pensator@gmail.com> + * timer_top.c was released under the GNU General Public License version 2 + * + * We export the addresses and counting of timer functions being called, + * the pid and cmdline from the owner process if applicable. + * + * Start/stop data collection: + * # echo [1|0] >/proc/timer_stats + * + * Display the information collected so far: + * # cat /proc/timer_stats + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> + +#include <asm/uaccess.h> + +/* + * This is our basic unit of interest: a timer expiry event identified + * by the timer, its start/expire functions and the PID of the task that + * started the timer. We count the number of times an event happens: + */ +struct entry { + /* + * Hash list: + */ + struct entry *next; + + /* + * Hash keys: + */ + void *timer; + void *start_func; + void *expire_func; + pid_t pid; + + /* + * Number of timeout events: + */ + unsigned long count; + unsigned int timer_flag; + + /* + * We save the command-line string to preserve + * this information past task exit: + */ + char comm[TASK_COMM_LEN + 1]; + +} ____cacheline_aligned_in_smp; + +/* + * Spinlock protecting the tables - not taken during lookup: + */ +static DEFINE_RAW_SPINLOCK(table_lock); + +/* + * Per-CPU lookup locks for fast hash lookup: + */ +static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock); + +/* + * Mutex to serialize state changes with show-stats activities: + */ +static DEFINE_MUTEX(show_mutex); + +/* + * Collection status, active/inactive: + */ +int __read_mostly timer_stats_active; + +/* + * Beginning/end timestamps of measurement: + */ +static ktime_t time_start, time_stop; + +/* + * tstat entry structs only get allocated while collection is + * active and never freed during that time - this simplifies + * things quite a bit. + * + * They get freed when a new collection period is started. + */ +#define MAX_ENTRIES_BITS 10 +#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) + +static unsigned long nr_entries; +static struct entry entries[MAX_ENTRIES]; + +static atomic_t overflow_count; + +/* + * The entries are in a hash-table, for fast lookup: + */ +#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) +#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) +#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) + +#define __tstat_hashfn(entry) \ + (((unsigned long)(entry)->timer ^ \ + (unsigned long)(entry)->start_func ^ \ + (unsigned long)(entry)->expire_func ^ \ + (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) + +#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) + +static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; + +static void reset_entries(void) +{ + nr_entries = 0; + memset(entries, 0, sizeof(entries)); + memset(tstat_hash_table, 0, sizeof(tstat_hash_table)); + atomic_set(&overflow_count, 0); +} + +static struct entry *alloc_entry(void) +{ + if (nr_entries >= MAX_ENTRIES) + return NULL; + + return entries + nr_entries++; +} + +static int match_entries(struct entry *entry1, struct entry *entry2) +{ + return entry1->timer == entry2->timer && + entry1->start_func == entry2->start_func && + entry1->expire_func == entry2->expire_func && + entry1->pid == entry2->pid; +} + +/* + * Look up whether an entry matching this item is present + * in the hash already. Must be called with irqs off and the + * lookup lock held: + */ +static struct entry *tstat_lookup(struct entry *entry, char *comm) +{ + struct entry **head, *curr, *prev; + + head = tstat_hashentry(entry); + curr = *head; + + /* + * The fastpath is when the entry is already hashed, + * we do this with the lookup lock held, but with the + * table lock not held: + */ + while (curr) { + if (match_entries(curr, entry)) + return curr; + + curr = curr->next; + } + /* + * Slowpath: allocate, set up and link a new hash entry: + */ + prev = NULL; + curr = *head; + + raw_spin_lock(&table_lock); + /* + * Make sure we have not raced with another CPU: + */ + while (curr) { + if (match_entries(curr, entry)) + goto out_unlock; + + prev = curr; + curr = curr->next; + } + + curr = alloc_entry(); + if (curr) { + *curr = *entry; + curr->count = 0; + curr->next = NULL; + memcpy(curr->comm, comm, TASK_COMM_LEN); + + smp_mb(); /* Ensure that curr is initialized before insert */ + + if (prev) + prev->next = curr; + else + *head = curr; + } + out_unlock: + raw_spin_unlock(&table_lock); + + return curr; +} + +/** + * timer_stats_update_stats - Update the statistics for a timer. + * @timer: pointer to either a timer_list or a hrtimer + * @pid: the pid of the task which set up the timer + * @startf: pointer to the function which did the timer setup + * @timerf: pointer to the timer callback function of the timer + * @comm: name of the process which set up the timer + * + * When the timer is already registered, then the event counter is + * incremented. Otherwise the timer is registered in a free slot. + */ +void timer_stats_update_stats(void *timer, pid_t pid, void *startf, + void *timerf, char *comm, + unsigned int timer_flag) +{ + /* + * It doesn't matter which lock we take: + */ + raw_spinlock_t *lock; + struct entry *entry, input; + unsigned long flags; + + if (likely(!timer_stats_active)) + return; + + lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id()); + + input.timer = timer; + input.start_func = startf; + input.expire_func = timerf; + input.pid = pid; + input.timer_flag = timer_flag; + + raw_spin_lock_irqsave(lock, flags); + if (!timer_stats_active) + goto out_unlock; + + entry = tstat_lookup(&input, comm); + if (likely(entry)) + entry->count++; + else + atomic_inc(&overflow_count); + + out_unlock: + raw_spin_unlock_irqrestore(lock, flags); +} + +static void print_name_offset(struct seq_file *m, unsigned long addr) +{ + char symname[KSYM_NAME_LEN]; + + if (lookup_symbol_name(addr, symname) < 0) + seq_printf(m, "<%p>", (void *)addr); + else + seq_printf(m, "%s", symname); +} + +static int tstats_show(struct seq_file *m, void *v) +{ + struct timespec period; + struct entry *entry; + unsigned long ms; + long events = 0; + ktime_t time; + int i; + + mutex_lock(&show_mutex); + /* + * If still active then calculate up to now: + */ + if (timer_stats_active) + time_stop = ktime_get(); + + time = ktime_sub(time_stop, time_start); + + period = ktime_to_timespec(time); + ms = period.tv_nsec / 1000000; + + seq_puts(m, "Timer Stats Version: v0.3\n"); + seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); + if (atomic_read(&overflow_count)) + seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); + seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); + + for (i = 0; i < nr_entries; i++) { + entry = entries + i; + if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { + seq_printf(m, "%4luD, %5d %-16s ", + entry->count, entry->pid, entry->comm); + } else { + seq_printf(m, " %4lu, %5d %-16s ", + entry->count, entry->pid, entry->comm); + } + + print_name_offset(m, (unsigned long)entry->start_func); + seq_puts(m, " ("); + print_name_offset(m, (unsigned long)entry->expire_func); + seq_puts(m, ")\n"); + + events += entry->count; + } + + ms += period.tv_sec * 1000; + if (!ms) + ms = 1; + + if (events && period.tv_sec) + seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", + events, events * 1000 / ms, + (events * 1000000 / ms) % 1000); + else + seq_printf(m, "%ld total events\n", events); + + mutex_unlock(&show_mutex); + + return 0; +} + +/* + * After a state change, make sure all concurrent lookup/update + * activities have stopped: + */ +static void sync_access(void) +{ + unsigned long flags; + int cpu; + + for_each_online_cpu(cpu) { + raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu); + + raw_spin_lock_irqsave(lock, flags); + /* nothing */ + raw_spin_unlock_irqrestore(lock, flags); + } +} + +static ssize_t tstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + char ctl[2]; + + if (count != 2 || *offs) + return -EINVAL; + + if (copy_from_user(ctl, buf, count)) + return -EFAULT; + + mutex_lock(&show_mutex); + switch (ctl[0]) { + case '0': + if (timer_stats_active) { + timer_stats_active = 0; + time_stop = ktime_get(); + sync_access(); + } + break; + case '1': + if (!timer_stats_active) { + reset_entries(); + time_start = ktime_get(); + smp_mb(); + timer_stats_active = 1; + } + break; + default: + count = -EINVAL; + } + mutex_unlock(&show_mutex); + + return count; +} + +static int tstats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, tstats_show, NULL); +} + +static const struct file_operations tstats_fops = { + .open = tstats_open, + .read = seq_read, + .write = tstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +void __init init_timer_stats(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu)); +} + +static int __init init_tstats_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = proc_create("timer_stats", 0644, NULL, &tstats_fops); + if (!pe) + return -ENOMEM; + return 0; +} +__initcall(init_tstats_procfs); |