From 7304611fc3842e40556158122f75935f17c6365f Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Sat, 6 Nov 2010 19:22:42 -0400 Subject: [PATCH] cpufreq: interactive: Changes to interactive governor Changes include: * May scale up to intermediate speeds after scaling down, rather than scale to max speed and then only scale down until max speed needed. * Tweaked thresholds at which max speed requested (previously CPU must have been 100% busy since idle exit timer started, now will go max if at least 85% busy) and default minimum sample time raised to 80ms. Tweaking based on UI tests, still in progress. * SMP fixes. * Fixed attempted multiple delete of sysfs group on governor stop. Set a just-in-case-CPU-goes-busy-again timer even if nr_running == 0 at timer function run time, but cancel if that CPU goes idle (and don't re-arm timer if that CPU is currently idle). * Re-evaluate speed if a CPU goes idle while above min speed (and no timer currently set) in case the platform requires all CPUs to be at the same speed. * Realtime workqueues disappeared upstream, convert speed up workqueue to a realtime task. Average scheduling latency measured significantly less than WQ_HIGHPRI. * Timers are not deferrable, must wake CPU from idle, since we now re-evaluate speed for idle CPUs. * CPU load is computed from higher of short-term load since idle exit vs. long-term load since last frequency change, to avoid dropping speed during temporary dips in load on long-term-busy CPU. * Avoid 1 CPU starting new idle exit load eval interval in a race with timer running on another CPU. * New fugly debugging printfs should be reworked or go away eventually. 
Change-Id: I606b5c1850637c35a7814309df12362d5c044825 via: https://review.source.android.com//#change,15809 --- Documentation/cpu-freq/governors.txt | 33 +- drivers/cpufreq/Kconfig | 12 +- drivers/cpufreq/cpufreq_interactive.c | 606 ++++++++++++++++++++------ 3 files changed, 489 insertions(+), 162 deletions(-) diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt index d155c054..be679e4c 100644 --- a/Documentation/cpu-freq/governors.txt +++ b/Documentation/cpu-freq/governors.txt @@ -187,23 +187,32 @@ default value of '20' it means that if the CPU usage needs to be below 2.6 Interactive --------------- -The CPUfreq governor "interactive" is designed for low latency, +The CPUfreq governor "interactive" is designed for latency-sensitive, interactive workloads. This governor sets the CPU speed depending on -usage, similar to "ondemand" and "conservative" governors. However -there is no polling, or 'sample_rate' required to scale the CPU up. +usage, similar to "ondemand" and "conservative" governors. However, +the governor is more aggressive about scaling the CPU speed up in +response to CPU-intensive activity. -Sampling CPU load every X ms can lead to under powering the CPU -for X ms, leading to dropped framerate, stuttering UI etc.. - -Scaling the CPU up is done when coming out of idle, and like "ondemand" -scaling up will always go to MAX, then step down based off of cpu load. +Sampling the CPU load every X ms can lead to under-powering the CPU +for X ms, leading to dropped frames, stuttering UI, etc. Instead of +sampling the cpu at a specified rate, the interactive governor will +check whether to scale the cpu frequency up soon after coming out of +idle. When the cpu comes out of idle, a timer is configured to fire +within 1-2 ticks. If the cpu is very busy between exiting idle and +when the timer fires then we assume the cpu is underpowered and ramp +to MAX speed. 
+ +If the cpu was not sufficiently busy to immediately ramp to MAX speed, +then the governor evaluates the cpu load since the last speed adjustment, +choosing the highest value between that longer-term load or the +short-term load since idle exit to determine the cpu speed to ramp to. There is only one tuneable value for this governor: -min_sample_time: The ammount of time the CPU must spend (in uS) -at the current frequency before scaling DOWN. This is done to -more accurately determine the cpu workload and the best speed for that -workload. The default is 50ms. +min_sample_time: The minimum amount of time to spend at the current +frequency before ramping down. This is to ensure that the governor has +seen enough historic cpu load data to determine the appropriate +workload. Default is 80000 uS. 3. The Governor Interface in the CPUfreq Core diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index a883b8a3..701913f4 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -122,9 +122,10 @@ config CPU_FREQ_DEFAULT_GOV_INTERACTIVE bool "interactive" select CPU_FREQ_GOV_INTERACTIVE help - Use the 'interactive' governor as default. This gets full cpu frequency - scaling for workloads that are latency sensitive, typically interactive - workloads. + Use the CPUFreq governor 'interactive' as default. This allows + you to get a full dynamic cpu frequency capable system by simply + loading your cpufreq low-level hardware driver, using the + 'interactive' governor for latency-sensitive workloads. endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -185,9 +186,8 @@ config CPU_FREQ_GOV_ONDEMAND config CPU_FREQ_GOV_INTERACTIVE tristate "'interactive' cpufreq governor" help - 'interactive' - This driver adds a dynamic cpufreq policy governor. - Designed for low latency burst workloads. Sclaing is done when - coming out idle instead of polling. 
+ 'interactive' - This driver adds a dynamic cpufreq policy governor + designed for latency-sensitive workloads config CPU_FREQ_GOV_CONSERVATIVE tristate "'conservative' cpufreq governor" diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 38e76c87..5ebfa475 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -24,37 +24,132 @@ #include #include #include +#include #include static void (*pm_idle_old)(void); static atomic_t active_count = ATOMIC_INIT(0); -static DEFINE_PER_CPU(struct timer_list, cpu_timer); +struct cpufreq_interactive_cpuinfo { + struct timer_list cpu_timer; + int timer_idlecancel; + u64 time_in_idle; + u64 idle_exit_time; + u64 timer_run_time; + int idling; + u64 freq_change_time; + u64 freq_change_time_in_idle; + struct cpufreq_policy *policy; + struct cpufreq_frequency_table *freq_table; + unsigned int target_freq; + int governor_enabled; +}; -static DEFINE_PER_CPU(u64, time_in_idle); -static DEFINE_PER_CPU(u64, idle_exit_time); - -static struct cpufreq_policy *policy; -static unsigned int target_freq; +static DEFINE_PER_CPU(struct cpufreq_interactive_cpuinfo, cpuinfo); /* Workqueues handle frequency scaling */ -static struct workqueue_struct *up_wq; +static struct task_struct *up_task; static struct workqueue_struct *down_wq; -static struct work_struct freq_scale_work; - -static u64 freq_change_time; -static u64 freq_change_time_in_idle; - -static cpumask_t work_cpumask; +static struct work_struct freq_scale_down_work; +static cpumask_t up_cpumask; +static cpumask_t down_cpumask; /* - * The minimum ammount of time to spend at a frequency before we can ramp down, - * default is 50ms. + * The minimum amount of time to spend at a frequency before we can ramp down. 
*/ -#define DEFAULT_MIN_SAMPLE_TIME 50000; +#define DEFAULT_MIN_SAMPLE_TIME 80000; static unsigned long min_sample_time; +#define LOAD_SCALE_MAX 85 + +#define DEBUG 0 +#define BUFSZ 128 + +#if DEBUG +#include + +struct dbgln { + int cpu; + unsigned long jiffy; + unsigned long run; + char buf[BUFSZ]; +}; + +#define NDBGLNS 256 + +static struct dbgln dbgbuf[NDBGLNS]; +static int dbgbufs; +static int dbgbufe; +static struct proc_dir_entry *dbg_proc; +static spinlock_t dbgpr_lock; + +static u64 up_request_time; +static unsigned int up_max_latency; + +static void dbgpr(char *fmt, ...) +{ + va_list args; + int n; + unsigned long flags; + + spin_lock_irqsave(&dbgpr_lock, flags); + n = dbgbufe; + va_start(args, fmt); + vsnprintf(dbgbuf[n].buf, BUFSZ, fmt, args); + va_end(args); + dbgbuf[n].cpu = smp_processor_id(); + dbgbuf[n].run = nr_running(); + dbgbuf[n].jiffy = jiffies; + + if (++dbgbufe >= NDBGLNS) + dbgbufe = 0; + + if (dbgbufe == dbgbufs) + if (++dbgbufs >= NDBGLNS) + dbgbufs = 0; + + spin_unlock_irqrestore(&dbgpr_lock, flags); +} + +static void dbgdump(void) +{ + int i, j; + unsigned long flags; + static struct dbgln prbuf[NDBGLNS]; + + spin_lock_irqsave(&dbgpr_lock, flags); + i = dbgbufs; + j = dbgbufe; + memcpy(prbuf, dbgbuf, sizeof(dbgbuf)); + dbgbufs = 0; + dbgbufe = 0; + spin_unlock_irqrestore(&dbgpr_lock, flags); + + while (i != j) + { + printk("%lu %d %lu %s", + prbuf[i].jiffy, prbuf[i].cpu, prbuf[i].run, + prbuf[i].buf); + if (++i == NDBGLNS) + i = 0; + } +} + +static int dbg_proc_read(char *buffer, char **start, off_t offset, + int count, int *peof, void *dat) +{ + printk("max up_task latency=%uus\n", up_max_latency); + dbgdump(); + *peof = 1; + return 0; +} + + +#else +#define dbgpr(...) 
do {} while (0) +#endif + static int cpufreq_governor_interactive(struct cpufreq_policy *policy, unsigned int event); @@ -70,142 +165,329 @@ struct cpufreq_governor cpufreq_gov_interactive = { static void cpufreq_interactive_timer(unsigned long data) { - u64 delta_idle; - u64 update_time; - u64 *cpu_time_in_idle; - u64 *cpu_idle_exit_time; - struct timer_list *t; - - u64 now_idle = get_cpu_idle_time_us(data, - &update_time); - - - cpu_time_in_idle = &per_cpu(time_in_idle, data); - cpu_idle_exit_time = &per_cpu(idle_exit_time, data); - - if (update_time == *cpu_idle_exit_time) - return; - - delta_idle = cputime64_sub(now_idle, *cpu_time_in_idle); - - /* Scale up if there were no idle cycles since coming out of idle */ - if (delta_idle == 0) { - if (policy->cur == policy->max) - return; - - if (nr_running() < 1) - return; - - target_freq = policy->max; - cpumask_set_cpu(data, &work_cpumask); - queue_work(up_wq, &freq_scale_work); - return; - } + unsigned int delta_idle; + unsigned int delta_time; + int cpu_load; + int load_since_change; + u64 time_in_idle; + u64 idle_exit_time; + struct cpufreq_interactive_cpuinfo *pcpu = + &per_cpu(cpuinfo, data); + u64 now_idle; + unsigned int new_freq; + unsigned int index; /* - * There is a window where if the cpu utlization can go from low to high - * between the timer expiring, delta_idle will be > 0 and the cpu will - * be 100% busy, preventing idle from running, and this timer from - * firing. So setup another timer to fire to check cpu utlization. - * Do not setup the timer if there is no scheduled work. + * Once pcpu->timer_run_time is updated to >= pcpu->idle_exit_time, + * this lets idle exit know the current idle time sample has + * been processed, and idle exit can generate a new sample and + * re-arm the timer. This prevents a concurrent idle + * exit on that CPU from writing a new set of info at the same time + * the timer function runs (the timer function can't use that info + * until more time passes). 
*/ - t = &per_cpu(cpu_timer, data); - if (!timer_pending(t) && nr_running() > 0) { - *cpu_time_in_idle = get_cpu_idle_time_us( - data, cpu_idle_exit_time); - mod_timer(t, jiffies + 2); + time_in_idle = pcpu->time_in_idle; + idle_exit_time = pcpu->idle_exit_time; + now_idle = get_cpu_idle_time_us(data, &pcpu->timer_run_time); + smp_wmb(); + + /* If we raced with cancelling a timer, skip. */ + if (!idle_exit_time) { + dbgpr("timer %d: no valid idle exit sample\n", (int) data); + goto exit; } - if (policy->cur == policy->min) - return; +#if DEBUG + if ((int) jiffies - (int) pcpu->cpu_timer.expires >= 10) + dbgpr("timer %d: late by %d ticks\n", + (int) data, jiffies - pcpu->cpu_timer.expires); +#endif + + delta_idle = (unsigned int) cputime64_sub(now_idle, time_in_idle); + delta_time = (unsigned int) cputime64_sub(pcpu->timer_run_time, + idle_exit_time); + + /* + * If timer ran less than 1ms after short-term sample started, retry. + */ + if (delta_time < 1000) { + dbgpr("timer %d: time delta %u too short exit=%llu now=%llu\n", (int) data, + delta_time, idle_exit_time, pcpu->timer_run_time); + goto rearm; + } + + if (delta_idle > delta_time) + cpu_load = 0; + else + cpu_load = 100 * (delta_time - delta_idle) / delta_time; + + delta_idle = (unsigned int) cputime64_sub(now_idle, + pcpu->freq_change_time_in_idle); + delta_time = (unsigned int) cputime64_sub(pcpu->timer_run_time, + pcpu->freq_change_time); + + if (delta_idle > delta_time) + load_since_change = 0; + else + load_since_change = + 100 * (delta_time - delta_idle) / delta_time; + + /* + * Choose greater of short-term load (since last idle timer + * started or timer function re-armed itself) or long-term load + * (since last frequency change). 
+ */ + if (load_since_change > cpu_load) + cpu_load = load_since_change; + + if (cpu_load >= LOAD_SCALE_MAX) + new_freq = pcpu->policy->max; + else + new_freq = pcpu->policy->max * cpu_load / 100; + + if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table, + new_freq, CPUFREQ_RELATION_H, + &index)) { + dbgpr("timer %d: cpufreq_frequency_table_target error\n", (int) data); + goto rearm; + } + + new_freq = pcpu->freq_table[index].frequency; + + if (pcpu->target_freq == new_freq) + { + dbgpr("timer %d: load=%d, already at %d\n", (int) data, cpu_load, new_freq); + goto rearm_if_notmax; + } /* * Do not scale down unless we have been at this frequency for the * minimum sample time. */ - if (cputime64_sub(update_time, freq_change_time) < min_sample_time) - return; - - target_freq = policy->min; - cpumask_set_cpu(data, &work_cpumask); - queue_work(down_wq, &freq_scale_work); -} - -static void cpufreq_idle(void) -{ - struct timer_list *t; - u64 *cpu_time_in_idle; - u64 *cpu_idle_exit_time; - - pm_idle_old(); - - if (!cpumask_test_cpu(smp_processor_id(), policy->cpus)) - return; - - /* Timer to fire in 1-2 ticks, jiffie aligned. */ - t = &per_cpu(cpu_timer, smp_processor_id()); - cpu_idle_exit_time = &per_cpu(idle_exit_time, smp_processor_id()); - cpu_time_in_idle = &per_cpu(time_in_idle, smp_processor_id()); - - if (timer_pending(t) == 0) { - *cpu_time_in_idle = get_cpu_idle_time_us( - smp_processor_id(), cpu_idle_exit_time); - mod_timer(t, jiffies + 2); + if (new_freq < pcpu->target_freq) { + if (cputime64_sub(pcpu->timer_run_time, pcpu->freq_change_time) < + min_sample_time) { + dbgpr("timer %d: load=%d cur=%d tgt=%d not yet\n", (int) data, cpu_load, pcpu->target_freq, new_freq); + goto rearm; + } } -} -/* - * Choose the cpu frequency based off the load. For now choose the minimum - * frequency that will satisfy the load, which is not always the lower power. 
- */ -static unsigned int cpufreq_interactive_calc_freq(unsigned int cpu) -{ - unsigned int delta_time; - unsigned int idle_time; - unsigned int cpu_load; - u64 current_wall_time; - u64 current_idle_time;; + dbgpr("timer %d: load=%d cur=%d tgt=%d queue\n", (int) data, cpu_load, pcpu->target_freq, new_freq); - current_idle_time = get_cpu_idle_time_us(cpu, ¤t_wall_time); + if (new_freq < pcpu->target_freq) { + pcpu->target_freq = new_freq; + cpumask_set_cpu(data, &down_cpumask); + queue_work(down_wq, &freq_scale_down_work); + } else { + pcpu->target_freq = new_freq; +#if DEBUG + up_request_time = ktime_to_us(ktime_get()); +#endif + cpumask_set_cpu(data, &up_cpumask); + wake_up_process(up_task); + } - idle_time = (unsigned int) current_idle_time - freq_change_time_in_idle; - delta_time = (unsigned int) current_wall_time - freq_change_time; +rearm_if_notmax: + /* + * Already set max speed and don't see a need to change that, + * wait until next idle to re-evaluate, don't need timer. + */ + if (pcpu->target_freq == pcpu->policy->max) + goto exit; - if (delta_time == 0) - return policy->cur; +rearm: + if (!timer_pending(&pcpu->cpu_timer)) { + /* + * If already at min: if that CPU is idle, don't set timer. + * Else cancel the timer if that CPU goes idle. We don't + * need to re-evaluate speed until the next idle exit. 
+ */ + if (pcpu->target_freq == pcpu->policy->min) { + smp_rmb(); - cpu_load = 100 * (delta_time - idle_time) / delta_time; - - return policy->cur * cpu_load / 100; -} - - -/* We use the same work function to sale up and down */ -static void cpufreq_interactive_freq_change_time_work(struct work_struct *work) -{ - unsigned int cpu; - cpumask_t *tmp_mask = &work_cpumask; - for_each_cpu(cpu, tmp_mask) { - if (target_freq == policy->max) { - if (nr_running() == 1) { - cpumask_clear_cpu(cpu, &work_cpumask); - return; + if (pcpu->idling) { + dbgpr("timer %d: cpu idle, don't re-arm\n", (int) data); + goto exit; } - __cpufreq_driver_target(policy, target_freq, - CPUFREQ_RELATION_H); - } else { - target_freq = cpufreq_interactive_calc_freq(cpu); - __cpufreq_driver_target(policy, target_freq, - CPUFREQ_RELATION_L); + pcpu->timer_idlecancel = 1; } - freq_change_time_in_idle = get_cpu_idle_time_us(cpu, - &freq_change_time); - cpumask_clear_cpu(cpu, &work_cpumask); + pcpu->time_in_idle = get_cpu_idle_time_us( + data, &pcpu->idle_exit_time); + mod_timer(&pcpu->cpu_timer, jiffies + 2); + dbgpr("timer %d: set timer for %lu exit=%llu\n", (int) data, pcpu->cpu_timer.expires, pcpu->idle_exit_time); } +exit: + return; +} +static void cpufreq_interactive_idle(void) +{ + struct cpufreq_interactive_cpuinfo *pcpu = + &per_cpu(cpuinfo, smp_processor_id()); + int pending; + + if (!pcpu->governor_enabled) { + pm_idle_old(); + return; + } + + pcpu->idling = 1; + smp_wmb(); + pending = timer_pending(&pcpu->cpu_timer); + + if (pcpu->target_freq != pcpu->policy->min) { +#ifdef CONFIG_SMP + /* + * Entering idle while not at lowest speed. On some + * platforms this can hold the other CPU(s) at that speed + * even though the CPU is idle. Set a timer to re-evaluate + * speed so this idle CPU doesn't hold the other CPUs above + * min indefinitely. This should probably be a quirk of + * the CPUFreq driver. 
+ */ + if (!pending) { + pcpu->time_in_idle = get_cpu_idle_time_us( + smp_processor_id(), &pcpu->idle_exit_time); + pcpu->timer_idlecancel = 0; + mod_timer(&pcpu->cpu_timer, jiffies + 2); + dbgpr("idle: enter at %d, set timer for %lu exit=%llu\n", + pcpu->target_freq, pcpu->cpu_timer.expires, + pcpu->idle_exit_time); + } +#endif + } else { + /* + * If at min speed and entering idle after load has + * already been evaluated, and a timer has been set just in + * case the CPU suddenly goes busy, cancel that timer. The + * CPU didn't go busy; we'll recheck things upon idle exit. + */ + if (pending && pcpu->timer_idlecancel) { + dbgpr("idle: cancel timer for %lu\n", pcpu->cpu_timer.expires); + del_timer(&pcpu->cpu_timer); + /* + * Ensure last timer run time is after current idle + * sample start time, so next idle exit will always + * start a new idle sampling period. + */ + pcpu->idle_exit_time = 0; + pcpu->timer_idlecancel = 0; + } + } + + pm_idle_old(); + pcpu->idling = 0; + smp_wmb(); + + /* + * Arm the timer for 1-2 ticks later if not already, and if the timer + * function has already processed the previous load sampling + * interval. (If the timer is not pending but has not processed + * the previous interval, it is probably racing with us on another + * CPU. Let it compute load based on the previous sample and then + * re-arm the timer for another interval when it's done, rather + * than updating the interval start time to be "now", which doesn't + * give the timer function enough time to make a decision on this + * run.) 
+ */ + if (timer_pending(&pcpu->cpu_timer) == 0 && + pcpu->timer_run_time >= pcpu->idle_exit_time) { + pcpu->time_in_idle = + get_cpu_idle_time_us(smp_processor_id(), + &pcpu->idle_exit_time); + pcpu->timer_idlecancel = 0; + mod_timer(&pcpu->cpu_timer, jiffies + 2); + dbgpr("idle: exit, set timer for %lu exit=%llu\n", pcpu->cpu_timer.expires, pcpu->idle_exit_time); +#if DEBUG + } else if (timer_pending(&pcpu->cpu_timer) == 0 && + pcpu->timer_run_time < pcpu->idle_exit_time) { + dbgpr("idle: timer not run yet: exit=%llu tmrrun=%llu\n", + pcpu->idle_exit_time, pcpu->timer_run_time); +#endif + } + +} + +static int cpufreq_interactive_up_task(void *data) +{ + unsigned int cpu; + cpumask_t tmp_mask; + struct cpufreq_interactive_cpuinfo *pcpu; + +#if DEBUG + u64 now; + u64 then; + unsigned int lat; +#endif + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (cpumask_empty(&up_cpumask)) + schedule(); + + set_current_state(TASK_RUNNING); + + if (kthread_should_stop()) + break; +#if DEBUG + then = up_request_time; + now = ktime_to_us(ktime_get()); + + if (now > then) { + lat = ktime_to_us(ktime_get()) - then; + + if (lat > up_max_latency) + up_max_latency = lat; + } +#endif + + tmp_mask = up_cpumask; + + for_each_cpu(cpu, &tmp_mask) { + cpumask_clear_cpu(cpu, &up_cpumask); + pcpu = &per_cpu(cpuinfo, cpu); + + if (nr_running() == 1) { + dbgpr("up %d: tgt=%d nothing else running\n", cpu, + pcpu->target_freq); + } + + __cpufreq_driver_target(pcpu->policy, + pcpu->target_freq, + CPUFREQ_RELATION_H); + pcpu->freq_change_time_in_idle = + get_cpu_idle_time_us(cpu, + &pcpu->freq_change_time); + dbgpr("up %d: set tgt=%d (actual=%d)\n", cpu, pcpu->target_freq, pcpu->policy->cur); + } + } + + return 0; +} + +static void cpufreq_interactive_freq_down(struct work_struct *work) +{ + unsigned int cpu; + cpumask_t tmp_mask = down_cpumask; + struct cpufreq_interactive_cpuinfo *pcpu; + + for_each_cpu(cpu, &tmp_mask) { + cpumask_clear_cpu(cpu, &down_cpumask); + pcpu = 
&per_cpu(cpuinfo, cpu); + + __cpufreq_driver_target(pcpu->policy, + pcpu->target_freq, + CPUFREQ_RELATION_H); + + pcpu->freq_change_time_in_idle = + get_cpu_idle_time_us(cpu, + &pcpu->freq_change_time); + dbgpr("down %d: set tgt=%d (actual=%d)\n", cpu, pcpu->target_freq, pcpu->policy->cur); + } } static ssize_t show_min_sample_time(struct kobject *kobj, @@ -237,11 +519,21 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *new_policy, unsigned int event) { int rc; + struct cpufreq_interactive_cpuinfo *pcpu = + &per_cpu(cpuinfo, new_policy->cpu); + switch (event) { case CPUFREQ_GOV_START: if (!cpu_online(new_policy->cpu)) return -EINVAL; + pcpu->policy = new_policy; + pcpu->freq_table = cpufreq_frequency_get_table(new_policy->cpu); + pcpu->target_freq = new_policy->cur; + pcpu->freq_change_time_in_idle = + get_cpu_idle_time_us(new_policy->cpu, + &pcpu->freq_change_time); + pcpu->governor_enabled = 1; /* * Do not register the idle hook and create sysfs * entries if we have already done so. 
@@ -255,20 +547,21 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *new_policy, return rc; pm_idle_old = pm_idle; - pm_idle = cpufreq_idle; - policy = new_policy; + pm_idle = cpufreq_interactive_idle; break; case CPUFREQ_GOV_STOP: - if (atomic_dec_return(&active_count) > 1) + pcpu->governor_enabled = 0; + + if (atomic_dec_return(&active_count) > 0) return 0; sysfs_remove_group(cpufreq_global_kobject, &interactive_attr_group); pm_idle = pm_idle_old; - del_timer(&per_cpu(cpu_timer, new_policy->cpu)); - break; + del_timer(&pcpu->cpu_timer); + break; case CPUFREQ_GOV_LIMITS: if (new_policy->max < new_policy->cur) @@ -285,28 +578,52 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *new_policy, static int __init cpufreq_interactive_init(void) { unsigned int i; - struct timer_list *t; + struct cpufreq_interactive_cpuinfo *pcpu; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + min_sample_time = DEFAULT_MIN_SAMPLE_TIME; /* Initalize per-cpu timers */ for_each_possible_cpu(i) { - t = &per_cpu(cpu_timer, i); - init_timer_deferrable(t); - t->function = cpufreq_interactive_timer; - t->data = i; + pcpu = &per_cpu(cpuinfo, i); + init_timer(&pcpu->cpu_timer); + pcpu->cpu_timer.function = cpufreq_interactive_timer; + pcpu->cpu_timer.data = i; } - /* Scale up is high priority */ - up_wq = create_rt_workqueue("kinteractive_up"); + up_task = kthread_create(cpufreq_interactive_up_task, NULL, + "kinteractiveup"); + if (IS_ERR(up_task)) + return PTR_ERR(up_task); + + sched_setscheduler_nocheck(up_task, SCHED_FIFO, ¶m); + get_task_struct(up_task); + + /* No rescuer thread, bind to CPU queuing the work for possibly + warm cache (probably doesn't matter much). */ down_wq = create_workqueue("knteractive_down"); - INIT_WORK(&freq_scale_work, cpufreq_interactive_freq_change_time_work); + if (! 
down_wq) + goto err_freeuptask; + + INIT_WORK(&freq_scale_down_work, + cpufreq_interactive_freq_down); + +#if DEBUG + spin_lock_init(&dbgpr_lock); + dbg_proc = create_proc_entry("igov", S_IWUSR | S_IRUGO, NULL); + dbg_proc->read_proc = dbg_proc_read; +#endif return cpufreq_register_governor(&cpufreq_gov_interactive); + +err_freeuptask: + put_task_struct(up_task); + return -ENOMEM; } #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE -pure_initcall(cpufreq_interactive_init); +fs_initcall(cpufreq_interactive_init); #else module_init(cpufreq_interactive_init); #endif @@ -314,7 +631,8 @@ module_init(cpufreq_interactive_init); static void __exit cpufreq_interactive_exit(void) { cpufreq_unregister_governor(&cpufreq_gov_interactive); - destroy_workqueue(up_wq); + kthread_stop(up_task); + put_task_struct(up_task); destroy_workqueue(down_wq); }