From: Hu Zhaodong
ohos inclusion
category: feature
issue: #I4TNS2
CVE: NA
Signed-off-by: Hu Zhaodong
-------------------------------------------
sched: rt: make CPU selection for RT tasks capacity-aware
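
When CONFIG_SCHED_RT_CAS is enabled, find_lowest_rq() runs an extra
capacity-aware pass, find_cas_cpu(), over the lowest_mask built by
cpupri_find() before falling back to the existing domain walk:

- The sched groups of the asymmetric-capacity domain are scanned and
  the lowest-capacity group whose CPUs can fit the task's clamped
  utilization is chosen. Groups that do not fit are remembered as a
  backup, while boosted tasks (and, with CONFIG_SCHED_RTG, related
  thread group tasks) are steered towards higher-capacity groups.
- Within the chosen group, the least-loaded CPU that is not isolated
  and is allowed by the task's affinity is picked, preferring smaller
  original capacity and the task's previous CPU on a tie. If no
  candidate is found there, the remaining CPUs of lowest_mask are
  searched as a backup.

To support this, the root domain now records max_cap_orig_cpu (the
CPU with the largest original capacity), uclamp_task_util() is made
non-static, and two trace events (sched_find_cas_cpu_each and
sched_find_cas_cpu) are added for debugging. The feature defaults to
enabled (sysctl_sched_enable_rt_cas = 1) and can be toggled at
runtime via /proc/sys/kernel/sched_enable_rt_cas.
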
Signed-off-by: gaochao
---
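Note for testers: with CONFIG_SCHED_RT_CAS=y, the knob and the new
trace events can be exercised roughly as below (assuming procfs is
mounted at /proc and tracefs at /sys/kernel/tracing; the events show
up under the sched group because eas_sched.h is pulled in from
trace/events/sched.h):

    # toggle the capacity-aware pass at runtime (enabled by default)
    echo 0 > /proc/sys/kernel/sched_enable_rt_cas
    echo 1 > /proc/sys/kernel/sched_enable_rt_cas

    # enable the new trace events and watch the selection decisions
    echo 1 > /sys/kernel/tracing/events/sched/sched_find_cas_cpu/enable
    echo 1 > /sys/kernel/tracing/events/sched/sched_find_cas_cpu_each/enable
    cat /sys/kernel/tracing/trace_pipe
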
include/linux/sched/sysctl.h | 3 +
include/trace/events/eas_sched.h | 76 +++++++++++++
include/trace/events/sched.h | 4 +
init/Kconfig | 7 ++
kernel/sched/fair.c | 8 ++
kernel/sched/rt.c | 177 +++++++++++++++++++++++++++++++
kernel/sched/sched.h | 17 +++
kernel/sched/topology.c | 14 +++
kernel/sysctl.c | 9 ++
9 files changed, 315 insertions(+)
create mode 100644 include/trace/events/eas_sched.h
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index acec3b1fd469..a08551ebd23d 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -41,6 +41,9 @@ sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table,
int write, void __user *buffer, size_t *length, loff_t *ppos);
#endif
+#ifdef CONFIG_SCHED_RT_CAS
+extern unsigned int sysctl_sched_enable_rt_cas;
+#endif
#ifdef CONFIG_SCHED_RT_ACTIVE_LB
extern unsigned int sysctl_sched_enable_rt_active_lb;
#endif
diff --git a/include/trace/events/eas_sched.h b/include/trace/events/eas_sched.h
new file mode 100644
index 000000000000..bd24c9ef5b6e
--- /dev/null
+++ b/include/trace/events/eas_sched.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_SCHED_RT_CAS
+TRACE_EVENT(sched_find_cas_cpu_each,
+
+ TP_PROTO(struct task_struct *task, int cpu, int target_cpu,
+ int isolated, int idle, unsigned long task_util,
+ unsigned long cpu_util, unsigned long cpu_cap),
+
+ TP_ARGS(task, cpu, target_cpu, isolated, idle, task_util, cpu_util, cpu_cap),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, prio)
+ __field(int, cpu)
+ __field(int, target_cpu)
+ __field(int, isolated)
+ __field(int, idle)
+ __field(unsigned long, task_util)
+ __field(unsigned long, cpu_util)
+ __field(unsigned long, cpu_cap)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+ __entry->pid = task->pid;
+ __entry->prio = task->prio;
+ __entry->cpu = cpu;
+ __entry->target_cpu = target_cpu;
+ __entry->isolated = isolated;
+ __entry->idle = idle;
+ __entry->task_util = task_util;
+ __entry->cpu_util = cpu_util;
+ __entry->cpu_cap = cpu_cap;
+ ),
+
+ TP_printk("comm=%s pid=%d prio=%d cpu=%d target_cpu=%d isolated=%d idle=%d task_util=%lu cpu_util=%lu cpu_cap=%lu",
+ __entry->comm, __entry->pid, __entry->prio,
+ __entry->cpu, __entry->target_cpu, __entry->isolated,
+ __entry->idle, __entry->task_util,
+ __entry->cpu_util, __entry->cpu_cap)
+);
+
+TRACE_EVENT(sched_find_cas_cpu,
+
+ TP_PROTO(struct task_struct *task, struct cpumask *lowest_mask,
+ unsigned long tutil, int prev_cpu, int target_cpu),
+
+ TP_ARGS(task, lowest_mask, tutil, prev_cpu, target_cpu),
+
+ TP_STRUCT__entry(
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, prio)
+ __bitmask(lowest, num_possible_cpus())
+ __field(unsigned long, tutil)
+ __field(int, prev_cpu)
+ __field(int, target_cpu)
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+ __entry->pid = task->pid;
+ __entry->prio = task->prio;
+ __assign_bitmask(lowest, cpumask_bits(lowest_mask), num_possible_cpus());
+ __entry->tutil = tutil;
+ __entry->prev_cpu = prev_cpu;
+ __entry->target_cpu = target_cpu;
+ ),
+
+ TP_printk("comm=%s pid=%d prio=%d lowest_mask=%s tutil=%lu perfer_idle=%u prev=%d target=%d ",
+ __entry->comm, __entry->pid, __entry->prio,
+ __get_bitmask(lowest), __entry->tutil,
+ __entry->prev_cpu, __entry->target_cpu)
+);
+#endif /* CONFIG_SCHED_RT_CAS */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 27b6ed3c9e58..dd5fff2bb1b2 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -10,6 +10,10 @@
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
+#ifdef CONFIG_SCHED_RT_CAS
+#include "eas_sched.h"
+#endif
+
/*
* Tracepoint for calling kthread_stop, performed to end a kthread:
*/
diff --git a/init/Kconfig b/init/Kconfig
index 57554d795040..ded631516e22 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -866,6 +866,13 @@ config SCHED_EAS
help
Check and migrate the CFS process to a more suitable CPU in the tick.
+config SCHED_RT_CAS
+ bool "rt-cas optimization"
+ depends on SCHED_EAS
+ default n
+ help
+ Take CPU capacity into account when selecting a CPU for an RT task.
+
config SCHED_RT_ACTIVE_LB
bool "RT Capacity Aware Misfit Task"
depends on SCHED_EAS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9789a385fecd..945bd3fb2478 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3953,14 +3953,22 @@ static inline unsigned long task_util_est(struct task_struct *p)
}
#ifdef CONFIG_UCLAMP_TASK
+#ifdef CONFIG_SCHED_RT_CAS
+unsigned long uclamp_task_util(struct task_struct *p)
+#else
static inline unsigned long uclamp_task_util(struct task_struct *p)
+#endif
{
return clamp(task_util_est(p),
uclamp_eff_value(p, UCLAMP_MIN),
uclamp_eff_value(p, UCLAMP_MAX));
}
#else
+#ifdef CONFIG_SCHED_RT_CAS
+unsigned long uclamp_task_util(struct task_struct *p)
+#else
static inline unsigned long uclamp_task_util(struct task_struct *p)
+#endif
{
return task_util_est(p);
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9adcbf0e0bee..5926d6f6efbb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -17,6 +17,10 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
struct rt_bandwidth def_rt_bandwidth;
+#ifdef CONFIG_SCHED_RT_CAS
+unsigned int sysctl_sched_enable_rt_cas = 1;
+#endif
+
#ifdef CONFIG_SCHED_RT_ACTIVE_LB
unsigned int sysctl_sched_enable_rt_active_lb = 1;
#endif
@@ -1709,6 +1713,170 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
return NULL;
}
+#ifdef CONFIG_SCHED_RT_CAS
+int find_cas_cpu(struct sched_domain *sd,
+ struct task_struct *task, struct cpumask *lowest_mask)
+{
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ struct sched_group *sg = NULL;
+ struct sched_group *sg_target = NULL;
+ struct sched_group *sg_backup = NULL;
+ struct cpumask search_cpu, backup_search_cpu;
+ int cpu = -1;
+ int target_cpu = -1;
+ unsigned long cpu_capacity;
+ unsigned long boosted_tutil = uclamp_task_util(task);
+ unsigned long target_capacity = ULONG_MAX;
+ unsigned long util;
+ unsigned long target_cpu_util = ULONG_MAX;
+ int prev_cpu = task_cpu(task);
+#ifdef CONFIG_SCHED_RTG
+ struct cpumask *rtg_target = NULL;
+#endif
+ bool boosted = uclamp_boosted(task);
+
+ if (!sysctl_sched_enable_rt_cas)
+ return -1;
+
+ rcu_read_lock();
+
+#ifdef CONFIG_SCHED_RTG
+ rtg_target = find_rtg_target(task);
+#endif
+
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, 0));
+ if (!sd) {
+ rcu_read_unlock();
+ return -1;
+ }
+
+ sg = sd->groups;
+ do {
+ if (!cpumask_intersects(lowest_mask, sched_group_span(sg)))
+ continue;
+
+ if (boosted) {
+ if (cpumask_test_cpu(rd->max_cap_orig_cpu,
+ sched_group_span(sg))) {
+ sg_target = sg;
+ break;
+ }
+ }
+
+ cpu = group_first_cpu(sg);
+#ifdef CONFIG_SCHED_RTG
+ /* honor the rtg tasks */
+ if (rtg_target) {
+ if (cpumask_test_cpu(cpu, rtg_target)) {
+ sg_target = sg;
+ break;
+ }
+
+ /* active LB (task still running) or boosted tasks favor CPUs with more capacity */
+ if (task->state == TASK_RUNNING || boosted) {
+ if (capacity_orig_of(cpu) >
+ capacity_orig_of(cpumask_any(rtg_target))) {
+ sg_target = sg;
+ break;
+ }
+
+ sg_backup = sg;
+ continue;
+ }
+ }
+#endif
+ /*
+ * 1. add margin to support task migration
+ * 2. if task_util is higher than the capacity of all CPUs, make
+ * sure the sg_backup with the most powerful CPUs is selected
+ */
+ if (!rt_task_fits_capacity(task, cpu)) {
+ sg_backup = sg;
+ continue;
+ }
+
+ /* support task boost */
+ cpu_capacity = capacity_orig_of(cpu);
+ if (boosted_tutil > cpu_capacity) {
+ sg_backup = sg;
+ continue;
+ }
+
+ /* sg_target: select the sg with smaller capacity */
+ if (cpu_capacity < target_capacity) {
+ target_capacity = cpu_capacity;
+ sg_target = sg;
+ }
+ } while (sg = sg->next, sg != sd->groups);
+
+ if (!sg_target)
+ sg_target = sg_backup;
+
+ if (sg_target) {
+ cpumask_and(&search_cpu, lowest_mask, sched_group_span(sg_target));
+ cpumask_copy(&backup_search_cpu, lowest_mask);
+ cpumask_andnot(&backup_search_cpu, &backup_search_cpu, &search_cpu);
+ } else {
+ cpumask_copy(&search_cpu, lowest_mask);
+ cpumask_clear(&backup_search_cpu);
+ }
+
+retry:
+ cpu = cpumask_first(&search_cpu);
+ do {
+ trace_sched_find_cas_cpu_each(task, cpu, target_cpu,
+ cpu_isolated(cpu),
+ idle_cpu(cpu), boosted_tutil, cpu_util(cpu),
+ capacity_orig_of(cpu));
+
+ if (cpu_isolated(cpu))
+ continue;
+
+ if (!cpumask_test_cpu(cpu, task->cpus_ptr))
+ continue;
+
+ /* find best cpu with smallest max_capacity */
+ if (target_cpu != -1 &&
+ capacity_orig_of(cpu) > capacity_orig_of(target_cpu))
+ continue;
+
+ util = cpu_util(cpu);
+
+ /* Find the least loaded CPU */
+ if (util > target_cpu_util)
+ continue;
+
+ /*
+ * If the previous CPU has the same load, keep it as
+ * target_cpu
+ */
+ if (target_cpu_util == util && target_cpu == prev_cpu)
+ continue;
+
+ /*
+ * Otherwise this CPU has no higher load than the current
+ * candidate and does not lose the tie to the previous CPU,
+ * so take it as the new target_cpu.
+ */
+ target_cpu_util = util;
+ target_cpu = cpu;
+ } while ((cpu = cpumask_next(cpu, &search_cpu)) < nr_cpu_ids);
+
+ if (target_cpu != -1 && cpumask_test_cpu(target_cpu, lowest_mask)) {
+ goto done;
+ } else if (!cpumask_empty(&backup_search_cpu)) {
+ cpumask_copy(&search_cpu, &backup_search_cpu);
+ cpumask_clear(&backup_search_cpu);
+ goto retry;
+ }
+
+done:
+ trace_sched_find_cas_cpu(task, lowest_mask, boosted_tutil, prev_cpu, target_cpu);
+ rcu_read_unlock();
+ return target_cpu;
+}
+#endif
+
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
static int find_lowest_rq(struct task_struct *task)
@@ -1718,6 +1886,9 @@ static int find_lowest_rq(struct task_struct *task)
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
int ret;
+#ifdef CONFIG_SCHED_RT_CAS
+ int cas_cpu;
+#endif
/* Make sure the mask is initialized first */
if (unlikely(!lowest_mask))
@@ -1744,6 +1915,12 @@ static int find_lowest_rq(struct task_struct *task)
if (!ret)
return -1; /* No targets found */
+#ifdef CONFIG_SCHED_RT_CAS
+ cas_cpu = find_cas_cpu(sd, task, lowest_mask);
+ if (cas_cpu != -1)
+ return cas_cpu;
+#endif
+
/*
* At this point we have built a mask of CPUs representing the
* lowest priority tasks in the system. Now we want to elect
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 09ad491bed45..e4c65d96185e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -87,6 +87,10 @@
struct rq;
struct cpuidle_state;
+#ifdef CONFIG_SCHED_RT_CAS
+extern unsigned long uclamp_task_util(struct task_struct *p);
+#endif
+
#ifdef CONFIG_SCHED_WALT
extern unsigned int sched_ravg_window;
extern unsigned int walt_cpu_util_freq_divisor;
@@ -893,6 +897,9 @@ struct root_domain {
* CPUs of the rd. Protected by RCU.
*/
struct perf_domain __rcu *pd;
+#ifdef CONFIG_SCHED_RT_CAS
+ int max_cap_orig_cpu;
+#endif
};
extern void init_defrootdomain(void);
@@ -2582,6 +2589,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
return clamp(util, min_util, max_util);
}
+static inline bool uclamp_boosted(struct task_struct *p)
+{
+ return uclamp_eff_value(p, UCLAMP_MIN) > 0;
+}
+
/*
* When uclamp is compiled in, the aggregation at rq level is 'turned off'
* by default in the fast path and only gets turned on once userspace performs
@@ -2602,6 +2614,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
return util;
}
+static inline bool uclamp_boosted(struct task_struct *p)
+{
+ return false;
+}
+
static inline bool uclamp_is_used(void)
{
return false;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b30b62f0d683..9191e5daaa3c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -522,6 +522,10 @@ static int init_rootdomain(struct root_domain *rd)
if (cpupri_init(&rd->cpupri) != 0)
goto free_cpudl;
+
+#ifdef CONFIG_SCHED_RT_CAS
+ rd->max_cap_orig_cpu = -1;
+#endif
return 0;
free_cpudl:
@@ -2121,9 +2125,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
+#ifdef CONFIG_SCHED_RT_CAS
+ int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+#endif
+
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
+#ifdef CONFIG_SCHED_RT_CAS
+ if (max_cpu < 0 || arch_scale_cpu_capacity(i) >
+ arch_scale_cpu_capacity(max_cpu))
+ WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
+#endif
+
/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d5fef7aba276..e34d6937594c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1659,6 +1659,15 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SCHED_RT_CAS
+ {
+ .procname = "sched_enable_rt_cas",
+ .data = &sysctl_sched_enable_rt_cas,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
#ifdef CONFIG_SCHED_RT_ACTIVE_LB
{
.procname = "sched_enable_rt_active_lb",
--
2.25.1