
From: Hu Zhaodong <huzhaodong@huawei.com>

ohos inclusion
category: feature
issue: #I4TNS2
CVE: NA

Signed-off-by: Hu Zhaodong <huzhaodong@huawei.com>

-------------------------------------------

RT task detects capacity during CPU selection

When CONFIG_SCHED_RT_CAS is enabled, find_lowest_rq() first tries
find_cas_cpu(): it walks the sched groups of the asymmetric-capacity
domain, skips groups whose CPUs cannot fit the task's uclamp-boosted
utilization, and picks the least-utilized, non-isolated CPU of the
lowest-priority mask in the smallest-capacity group that still fits,
retrying on the remaining CPUs before falling back to the existing
cpupri result. The behaviour can be toggled at runtime through the
sched_enable_rt_cas sysctl (default: enabled).

Signed-off-by: gaochao <gaochao49@huawei.com>
---
 include/linux/sched/sysctl.h     |   3 +
 include/trace/events/eas_sched.h |  76 +++++++++++++
 include/trace/events/sched.h     |   4 +
 init/Kconfig                     |   7 ++
 kernel/sched/fair.c              |   8 ++
 kernel/sched/rt.c                | 177 +++++++++++++++++++++++++++++++
 kernel/sched/sched.h             |  17 +++
 kernel/sched/topology.c          |  14 +++
 kernel/sysctl.c                  |   9 ++
 9 files changed, 315 insertions(+)
 create mode 100644 include/trace/events/eas_sched.h

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index acec3b1fd469..a08551ebd23d 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -41,6 +41,9 @@ sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table,
 				int write, void __user *buffer, size_t *length,
 				loff_t *ppos);
 #endif
+#ifdef CONFIG_SCHED_RT_CAS
+extern unsigned int sysctl_sched_enable_rt_cas;
+#endif
 #ifdef CONFIG_SCHED_RT_ACTIVE_LB
 extern unsigned int sysctl_sched_enable_rt_active_lb;
 #endif
diff --git a/include/trace/events/eas_sched.h b/include/trace/events/eas_sched.h
new file mode 100644
index 000000000000..bd24c9ef5b6e
--- /dev/null
+++ b/include/trace/events/eas_sched.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_SCHED_RT_CAS
+TRACE_EVENT(sched_find_cas_cpu_each,
+
+	TP_PROTO(struct task_struct *task, int cpu, int target_cpu,
+		int isolated, int idle, unsigned long task_util,
+		unsigned long cpu_util, int cpu_cap),
+
+	TP_ARGS(task, cpu, target_cpu, isolated, idle, task_util, cpu_util, cpu_cap),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(int, prio)
+		__field(int, cpu)
+		__field(int, target_cpu)
+		__field(int, isolated)
+		__field(unsigned long, idle)
+		__field(unsigned long, task_util)
+		__field(unsigned long, cpu_util)
+		__field(unsigned long, cpu_cap)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+		__entry->pid = task->pid;
+		__entry->prio = task->prio;
+		__entry->cpu = cpu;
+		__entry->target_cpu = target_cpu;
+		__entry->isolated = isolated;
+		__entry->idle = idle;
+		__entry->task_util = task_util;
+		__entry->cpu_util = cpu_util;
+		__entry->cpu_cap = cpu_cap;
+	),
+
+	TP_printk("comm=%s pid=%d prio=%d cpu=%d target_cpu=%d isolated=%d idle=%lu task_util=%lu cpu_util=%lu cpu_cap=%lu",
+		__entry->comm, __entry->pid, __entry->prio,
+		__entry->cpu, __entry->target_cpu, __entry->isolated,
+		__entry->idle, __entry->task_util,
+		__entry->cpu_util, __entry->cpu_cap)
+);
+
+TRACE_EVENT(sched_find_cas_cpu,
+
+	TP_PROTO(struct task_struct *task, struct cpumask *lowest_mask,
+		unsigned long tutil, int prev_cpu, int target_cpu),
+
+	TP_ARGS(task, lowest_mask, tutil, prev_cpu, target_cpu),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(unsigned int, prio)
+		__bitmask(lowest, num_possible_cpus())
+		__field(unsigned long, tutil)
+		__field(int, prev_cpu)
+		__field(int, target_cpu)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+		__entry->pid = task->pid;
+		__entry->prio = task->prio;
+		__assign_bitmask(lowest, cpumask_bits(lowest_mask), num_possible_cpus());
+		__entry->tutil = tutil;
+		__entry->prev_cpu = prev_cpu;
+		__entry->target_cpu = target_cpu;
+	),
+
+	TP_printk("comm=%s pid=%d prio=%u lowest_mask=%s tutil=%lu prev=%d target=%d",
+		__entry->comm, __entry->pid, __entry->prio,
+		__get_bitmask(lowest), __entry->tutil,
+		__entry->prev_cpu, __entry->target_cpu)
+);
+#endif /* CONFIG_SCHED_RT_CAS */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 27b6ed3c9e58..dd5fff2bb1b2 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -10,6 +10,10 @@
 #include <linux/tracepoint.h>
 #include <linux/binfmts.h>
 
+#ifdef CONFIG_SCHED_RT_CAS
+#include "eas_sched.h"
+#endif
+
 /*
  * Tracepoint for calling kthread_stop, performed to end a kthread:
  */
diff --git a/init/Kconfig b/init/Kconfig
index 57554d795040..ded631516e22 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -866,6 +866,13 @@ config SCHED_EAS
 	help
 	  Check and migrate the CFS process to a more suitable CPU in the tick.
 
+config SCHED_RT_CAS
+	bool "rt-cas optimization"
+	depends on SCHED_EAS
+	default n
+	help
+	  RT task detects capacity during CPU selection
+
 config SCHED_RT_ACTIVE_LB
 	bool "RT Capacity Aware Misfit Task"
 	depends on SCHED_EAS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9789a385fecd..945bd3fb2478 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3953,14 +3953,22 @@ static inline unsigned long task_util_est(struct task_struct *p)
 }
 
 #ifdef CONFIG_UCLAMP_TASK
+#ifdef CONFIG_SCHED_RT_CAS
+unsigned long uclamp_task_util(struct task_struct *p)
+#else
 static inline unsigned long uclamp_task_util(struct task_struct *p)
+#endif
 {
 	return clamp(task_util_est(p),
 		     uclamp_eff_value(p, UCLAMP_MIN),
 		     uclamp_eff_value(p, UCLAMP_MAX));
 }
 #else
+#ifdef CONFIG_SCHED_RT_CAS
+unsigned long uclamp_task_util(struct task_struct *p)
+#else
 static inline unsigned long uclamp_task_util(struct task_struct *p)
+#endif
 {
 	return task_util_est(p);
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9adcbf0e0bee..5926d6f6efbb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -17,6 +17,10 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
 struct rt_bandwidth def_rt_bandwidth;
 
+#ifdef CONFIG_SCHED_RT_CAS
+unsigned int sysctl_sched_enable_rt_cas = 1;
+#endif
+
 #ifdef CONFIG_SCHED_RT_ACTIVE_LB
 unsigned int sysctl_sched_enable_rt_active_lb = 1;
 #endif
@@ -1709,6 +1713,170 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
 	return NULL;
 }
 
+#ifdef CONFIG_SCHED_RT_CAS
+int find_cas_cpu(struct sched_domain *sd,
+		 struct task_struct *task, struct cpumask *lowest_mask)
+{
+	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+	struct sched_group *sg = NULL;
+	struct sched_group *sg_target = NULL;
+	struct sched_group *sg_backup = NULL;
+	struct cpumask search_cpu, backup_search_cpu;
+	int cpu = -1;
+	int target_cpu = -1;
+	unsigned long cpu_capacity;
+	unsigned long boosted_tutil = uclamp_task_util(task);
+	unsigned long target_capacity = ULONG_MAX;
+	unsigned long util;
+	unsigned long target_cpu_util = ULONG_MAX;
+	int prev_cpu = task_cpu(task);
+#ifdef CONFIG_SCHED_RTG
+	struct cpumask *rtg_target = NULL;
+#endif
+	bool boosted = uclamp_boosted(task);
+
+	if (!sysctl_sched_enable_rt_cas)
+		return -1;
+
+	rcu_read_lock();
+
+#ifdef CONFIG_SCHED_RTG
+	rtg_target = find_rtg_target(task);
+#endif
+
+	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, 0));
+	if (!sd) {
+		rcu_read_unlock();
+		return -1;
+	}
+
+	sg = sd->groups;
+	do {
+		if (!cpumask_intersects(lowest_mask, sched_group_span(sg)))
+			continue;
+
+		if (boosted) {
+			if (cpumask_test_cpu(rd->max_cap_orig_cpu,
+					     sched_group_span(sg))) {
+				sg_target = sg;
+				break;
+			}
+		}
+
+		cpu = group_first_cpu(sg);
+#ifdef CONFIG_SCHED_RTG
+		/* honor the rtg tasks */
+		if (rtg_target) {
+			if (cpumask_test_cpu(cpu, rtg_target)) {
+				sg_target = sg;
+				break;
+			}
+
+			/* active LB or big_task favor cpus with more capacity */
+			if (task->state == TASK_RUNNING || boosted) {
+				if (capacity_orig_of(cpu) >
+				    capacity_orig_of(cpumask_any(rtg_target))) {
+					sg_target = sg;
+					break;
+				}
+
+				sg_backup = sg;
+				continue;
+			}
+		}
+#endif
+		/*
+		 * 1. add margin to support task migration
+		 * 2. if task_util is higher than all cpus, make sure the
+		 * sg_backup with the most powerful cpus is selected
+		 */
+		if (!rt_task_fits_capacity(task, cpu)) {
+			sg_backup = sg;
+			continue;
+		}
+
+		/* support task boost */
+		cpu_capacity = capacity_orig_of(cpu);
+		if (boosted_tutil > cpu_capacity) {
+			sg_backup = sg;
+			continue;
+		}
+
+		/* sg_target: select the sg with smaller capacity */
+		if (cpu_capacity < target_capacity) {
+			target_capacity = cpu_capacity;
+			sg_target = sg;
+		}
+	} while (sg = sg->next, sg != sd->groups);
+
+	if (!sg_target)
+		sg_target = sg_backup;
+
+	if (sg_target) {
+		cpumask_and(&search_cpu, lowest_mask, sched_group_span(sg_target));
+		cpumask_copy(&backup_search_cpu, lowest_mask);
+		cpumask_andnot(&backup_search_cpu, &backup_search_cpu, &search_cpu);
+	} else {
+		cpumask_copy(&search_cpu, lowest_mask);
+		cpumask_clear(&backup_search_cpu);
+	}
+
+retry:
+	cpu = cpumask_first(&search_cpu);
+	do {
+		trace_sched_find_cas_cpu_each(task, cpu, target_cpu,
+					      cpu_isolated(cpu),
+					      idle_cpu(cpu), boosted_tutil, cpu_util(cpu),
+					      capacity_orig_of(cpu));
+
+		if (cpu_isolated(cpu))
+			continue;
+
+		if (!cpumask_test_cpu(cpu, task->cpus_ptr))
+			continue;
+
+		/* find best cpu with smallest max_capacity */
+		if (target_cpu != -1 &&
+		    capacity_orig_of(cpu) > capacity_orig_of(target_cpu))
+			continue;
+
+		util = cpu_util(cpu);
+
+		/* Find the least loaded CPU */
+		if (util > target_cpu_util)
+			continue;
+
+		/*
+		 * If the previous CPU has the same load, keep it as
+		 * target_cpu
+		 */
+		if (target_cpu_util == util && target_cpu == prev_cpu)
+			continue;
+
+		/*
+		 * If candidate CPU is the previous CPU, select it.
+		 * If all above conditions are same, select the least
+		 * cumulative window demand CPU.
+		 */
+		target_cpu_util = util;
+		target_cpu = cpu;
+	} while ((cpu = cpumask_next(cpu, &search_cpu)) < nr_cpu_ids);
+
+	if (target_cpu != -1 && cpumask_test_cpu(target_cpu, lowest_mask)) {
+		goto done;
+	} else if (!cpumask_empty(&backup_search_cpu)) {
+		cpumask_copy(&search_cpu, &backup_search_cpu);
+		cpumask_clear(&backup_search_cpu);
+		goto retry;
+	}
+
+done:
+	trace_sched_find_cas_cpu(task, lowest_mask, boosted_tutil, prev_cpu, target_cpu);
+	rcu_read_unlock();
+	return target_cpu;
+}
+#endif
+
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
 static int find_lowest_rq(struct task_struct *task)
@@ -1718,6 +1886,9 @@ static int find_lowest_rq(struct task_struct *task)
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
 	int ret;
+#ifdef CONFIG_SCHED_RT_CAS
+	int cas_cpu;
+#endif
 
 	/* Make sure the mask is initialized first */
 	if (unlikely(!lowest_mask))
@@ -1744,6 +1915,12 @@ static int find_lowest_rq(struct task_struct *task)
 	if (!ret)
 		return -1; /* No targets found */
 
+#ifdef CONFIG_SCHED_RT_CAS
+	cas_cpu = find_cas_cpu(sd, task, lowest_mask);
+	if (cas_cpu != -1)
+		return cas_cpu;
+#endif
+
 	/*
 	 * At this point we have built a mask of CPUs representing the
 	 * lowest priority tasks in the system.  Now we want to elect
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 09ad491bed45..e4c65d96185e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -87,6 +87,10 @@
 struct rq;
 struct cpuidle_state;
 
+#ifdef CONFIG_SCHED_RT_CAS
+extern unsigned long uclamp_task_util(struct task_struct *p);
+#endif
+
 #ifdef CONFIG_SCHED_WALT
 extern unsigned int sched_ravg_window;
 extern unsigned int walt_cpu_util_freq_divisor;
@@ -893,6 +897,9 @@ struct root_domain {
 	 * CPUs of the rd. Protected by RCU.
 	 */
 	struct perf_domain __rcu *pd;
+#ifdef CONFIG_SCHED_RT_CAS
+	int max_cap_orig_cpu;
+#endif
 };
 
 extern void init_defrootdomain(void);
@@ -2582,6 +2589,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
 	return clamp(util, min_util, max_util);
 }
 
+static inline bool uclamp_boosted(struct task_struct *p)
+{
+	return uclamp_eff_value(p, UCLAMP_MIN) > 0;
+}
+
 /*
  * When uclamp is compiled in, the aggregation at rq level is 'turned off'
  * by default in the fast path and only gets turned on once userspace performs
@@ -2602,6 +2614,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
 	return util;
 }
 
+static inline bool uclamp_boosted(struct task_struct *p)
+{
+	return false;
+}
+
 static inline bool uclamp_is_used(void)
 {
 	return false;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b30b62f0d683..9191e5daaa3c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -522,6 +522,10 @@ static int init_rootdomain(struct root_domain *rd)
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
+
+#ifdef CONFIG_SCHED_RT_CAS
+	rd->max_cap_orig_cpu = -1;
+#endif
 	return 0;
 
 free_cpudl:
@@ -2121,9 +2125,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+#ifdef CONFIG_SCHED_RT_CAS
+		int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+#endif
+
 		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
 
+#ifdef CONFIG_SCHED_RT_CAS
+		if (max_cpu < 0 || arch_scale_cpu_capacity(i) >
+		    arch_scale_cpu_capacity(max_cpu))
+			WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
+#endif
+
 		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
 		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
 			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d5fef7aba276..e34d6937594c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1659,6 +1659,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SCHED_RT_CAS
+	{
+		.procname	= "sched_enable_rt_cas",
+		.data		= &sysctl_sched_enable_rt_cas,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
#ifdef CONFIG_SCHED_RT_ACTIVE_LB
 	{
 		.procname	= "sched_enable_rt_active_lb",
-- 
2.25.1
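
For readers who want the selection policy without the scheduler plumbing, here is a
minimal standalone userspace sketch. It is illustrative only, not part of the patch and
not kernel code: it collapses find_cas_cpu()'s two-stage search (pick the smallest-capacity
sched group that still fits the task's uclamp-boosted utilization, then the least-utilized
CPU of the lowest mask inside it) into a single loop, and it leaves out the backup mask,
CPU isolation, RTG handling, and tracing. All CPU numbers, capacities, and utilization
values below are invented for illustration.

/*
 * Illustrative sketch only -- NOT kernel code and not part of the patch.
 * Mirrors the policy of find_cas_cpu(): prefer the smallest-capacity CPU
 * that still fits the task, break capacity ties by lowest utilization.
 */
#include <stdio.h>

struct cpu_info {
	int id;
	unsigned long capacity;	/* stands in for capacity_orig_of(cpu) */
	unsigned long util;	/* stands in for cpu_util(cpu) */
	int allowed;		/* stands in for membership in lowest_mask */
};

/* Pick a CPU for a task whose boosted utilization is task_util. */
static int pick_cas_cpu(const struct cpu_info *cpus, int nr, unsigned long task_util)
{
	unsigned long best_cap = (unsigned long)-1;
	unsigned long best_util = (unsigned long)-1;
	int target = -1;
	int i;

	for (i = 0; i < nr; i++) {
		const struct cpu_info *c = &cpus[i];

		if (!c->allowed)
			continue;
		if (c->capacity < task_util)	/* task does not fit here */
			continue;
		if (c->capacity < best_cap ||
		    (c->capacity == best_cap && c->util < best_util)) {
			best_cap = c->capacity;
			best_util = c->util;
			target = c->id;
		}
	}
	return target;	/* -1 means: fall back to the generic cpupri choice */
}

int main(void)
{
	/* Two little CPUs (capacity 512) and two big CPUs (capacity 1024). */
	const struct cpu_info cpus[] = {
		{ 0,  512, 100, 1 },
		{ 1,  512, 400, 1 },
		{ 2, 1024,  50, 1 },
		{ 3, 1024,  10, 0 },	/* not in lowest_mask */
	};

	/* A small RT task fits on a little CPU; CPU 0 is the least loaded. */
	printf("util=200 -> cpu %d\n", pick_cas_cpu(cpus, 4, 200));
	/* A heavily boosted task only fits on a big CPU; CPU 2 is chosen. */
	printf("util=800 -> cpu %d\n", pick_cas_cpu(cpus, 4, 800));
	return 0;
}

In the kernel, the same decision is gated by the sysctl added above: writing 0 to
/proc/sys/kernel/sched_enable_rt_cas makes find_cas_cpu() return -1, so find_lowest_rq()
falls back to the plain cpupri result.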