
From: Hu Zhaodong <huzhaodong@huawei.com>

ohos inclusion
category: feature
issue: #I4TNS2
CVE: NA

Signed-off-by: Hu Zhaodong <huzhaodong@huawei.com>

-------------------------------------------

RT task detects capacity during CPU selection

When CONFIG_SCHED_RT_CAS is enabled, find_lowest_rq() first tries
find_cas_cpu(): it walks the sched groups of the asymmetric-capacity
domain, skips groups whose CPUs cannot fit the task's uclamp-boosted
utilization, and picks the least-utilized, non-isolated CPU of the
lowest-priority mask in the smallest-capacity group that still fits,
retrying on the remaining CPUs before falling back to the existing
cpupri result. The behaviour can be toggled at runtime through the
sched_enable_rt_cas sysctl (default: enabled).

Signed-off-by: gaochao <gaochao49@huawei.com>
---
 include/linux/sched/sysctl.h     |   3 +
 include/trace/events/eas_sched.h |  76 +++++++++++++
 include/trace/events/sched.h     |   4 +
 init/Kconfig                     |   7 ++
 kernel/sched/fair.c              |   8 ++
 kernel/sched/rt.c                | 177 +++++++++++++++++++++++++++++++
 kernel/sched/sched.h             |  17 +++
 kernel/sched/topology.c          |  14 +++
 kernel/sysctl.c                  |   9 ++
 9 files changed, 315 insertions(+)
 create mode 100644 include/trace/events/eas_sched.h

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index acec3b1fd469..a08551ebd23d 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -41,6 +41,9 @@ sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table,
 				int write, void __user *buffer, size_t *length,
 				loff_t *ppos);
 #endif
+#ifdef CONFIG_SCHED_RT_CAS
+extern unsigned int sysctl_sched_enable_rt_cas;
+#endif
 #ifdef CONFIG_SCHED_RT_ACTIVE_LB
 extern unsigned int sysctl_sched_enable_rt_active_lb;
 #endif
diff --git a/include/trace/events/eas_sched.h b/include/trace/events/eas_sched.h
new file mode 100644
index 000000000000..bd24c9ef5b6e
--- /dev/null
+++ b/include/trace/events/eas_sched.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_SCHED_RT_CAS
+TRACE_EVENT(sched_find_cas_cpu_each,
+
+	TP_PROTO(struct task_struct *task, int cpu, int target_cpu,
+		int isolated, int idle, unsigned long task_util,
+		unsigned long cpu_util, int cpu_cap),
+
+	TP_ARGS(task, cpu, target_cpu, isolated, idle, task_util, cpu_util, cpu_cap),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(int, prio)
+		__field(int, cpu)
+		__field(int, target_cpu)
+		__field(int, isolated)
+		__field(unsigned long, idle)
+		__field(unsigned long, task_util)
+		__field(unsigned long, cpu_util)
+		__field(unsigned long, cpu_cap)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+		__entry->pid = task->pid;
+		__entry->prio = task->prio;
+		__entry->cpu = cpu;
+		__entry->target_cpu = target_cpu;
+		__entry->isolated = isolated;
+		__entry->idle = idle;
+		__entry->task_util = task_util;
+		__entry->cpu_util = cpu_util;
+		__entry->cpu_cap = cpu_cap;
+	),
+
+	TP_printk("comm=%s pid=%d prio=%d cpu=%d target_cpu=%d isolated=%d idle=%lu task_util=%lu cpu_util=%lu cpu_cap=%lu",
+		__entry->comm, __entry->pid, __entry->prio,
+		__entry->cpu, __entry->target_cpu, __entry->isolated,
+		__entry->idle, __entry->task_util,
+		__entry->cpu_util, __entry->cpu_cap)
+);
+
+TRACE_EVENT(sched_find_cas_cpu,
+
+	TP_PROTO(struct task_struct *task, struct cpumask *lowest_mask,
+		unsigned long tutil, int prev_cpu, int target_cpu),
+
+	TP_ARGS(task, lowest_mask, tutil, prev_cpu, target_cpu),
+
+	TP_STRUCT__entry(
+		__array(char, comm, TASK_COMM_LEN)
+		__field(pid_t, pid)
+		__field(unsigned int, prio)
+		__bitmask(lowest, num_possible_cpus())
+		__field(unsigned long, tutil)
+		__field(int, prev_cpu)
+		__field(int, target_cpu)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+		__entry->pid = task->pid;
+		__entry->prio = task->prio;
+		__assign_bitmask(lowest, cpumask_bits(lowest_mask), num_possible_cpus());
+		__entry->tutil = tutil;
+		__entry->prev_cpu = prev_cpu;
+		__entry->target_cpu = target_cpu;
+	),
+
+	TP_printk("comm=%s pid=%d prio=%u lowest_mask=%s tutil=%lu prev=%d target=%d",
+		__entry->comm, __entry->pid, __entry->prio,
+		__get_bitmask(lowest), __entry->tutil,
+		__entry->prev_cpu, __entry->target_cpu)
+);
+#endif /* CONFIG_SCHED_RT_CAS */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 27b6ed3c9e58..dd5fff2bb1b2 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -10,6 +10,10 @@
 #include <linux/tracepoint.h>
 #include <linux/binfmts.h>
 
+#ifdef CONFIG_SCHED_RT_CAS
+#include "eas_sched.h"
+#endif
+
 /*
  * Tracepoint for calling kthread_stop, performed to end a kthread:
  */
diff --git a/init/Kconfig b/init/Kconfig
index 57554d795040..ded631516e22 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -866,6 +866,13 @@ config SCHED_EAS
 	help
 	  Check and migrate the CFS process to a more suitable CPU in the tick.
 
+config SCHED_RT_CAS
+	bool "rt-cas optimization"
+	depends on SCHED_EAS
+	default n
+	help
+	  RT task detects capacity during CPU selection
+
 config SCHED_RT_ACTIVE_LB
 	bool "RT Capacity Aware Misfit Task"
 	depends on SCHED_EAS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9789a385fecd..945bd3fb2478 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3953,14 +3953,22 @@ static inline unsigned long task_util_est(struct task_struct *p)
 }
 
 #ifdef CONFIG_UCLAMP_TASK
+#ifdef CONFIG_SCHED_RT_CAS
+unsigned long uclamp_task_util(struct task_struct *p)
+#else
 static inline unsigned long uclamp_task_util(struct task_struct *p)
+#endif
 {
 	return clamp(task_util_est(p),
 		     uclamp_eff_value(p, UCLAMP_MIN),
 		     uclamp_eff_value(p, UCLAMP_MAX));
 }
 #else
+#ifdef CONFIG_SCHED_RT_CAS
+unsigned long uclamp_task_util(struct task_struct *p)
+#else
 static inline unsigned long uclamp_task_util(struct task_struct *p)
+#endif
 {
 	return task_util_est(p);
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9adcbf0e0bee..5926d6f6efbb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -17,6 +17,10 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
 struct rt_bandwidth def_rt_bandwidth;
 
+#ifdef CONFIG_SCHED_RT_CAS
+unsigned int sysctl_sched_enable_rt_cas = 1;
+#endif
+
 #ifdef CONFIG_SCHED_RT_ACTIVE_LB
 unsigned int sysctl_sched_enable_rt_active_lb = 1;
 #endif
@@ -1709,6 +1713,170 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
 	return NULL;
 }
 
+#ifdef CONFIG_SCHED_RT_CAS
+int find_cas_cpu(struct sched_domain *sd,
+		 struct task_struct *task, struct cpumask *lowest_mask)
+{
+	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+	struct sched_group *sg = NULL;
+	struct sched_group *sg_target = NULL;
+	struct sched_group *sg_backup = NULL;
+	struct cpumask search_cpu, backup_search_cpu;
+	int cpu = -1;
+	int target_cpu = -1;
+	unsigned long cpu_capacity;
+	unsigned long boosted_tutil = uclamp_task_util(task);
+	unsigned long target_capacity = ULONG_MAX;
+	unsigned long util;
+	unsigned long target_cpu_util = ULONG_MAX;
+	int prev_cpu = task_cpu(task);
+#ifdef CONFIG_SCHED_RTG
+	struct cpumask *rtg_target = NULL;
+#endif
+	bool boosted = uclamp_boosted(task);
+
+	if (!sysctl_sched_enable_rt_cas)
+		return -1;
+
+	rcu_read_lock();
+
+#ifdef CONFIG_SCHED_RTG
+	rtg_target = find_rtg_target(task);
+#endif
+
+	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, 0));
+	if (!sd) {
+		rcu_read_unlock();
+		return -1;
+	}
+
+	sg = sd->groups;
+	do {
+		if (!cpumask_intersects(lowest_mask, sched_group_span(sg)))
+			continue;
+
+		if (boosted) {
+			if (cpumask_test_cpu(rd->max_cap_orig_cpu,
+					     sched_group_span(sg))) {
+				sg_target = sg;
+				break;
+			}
+		}
+
+		cpu = group_first_cpu(sg);
+#ifdef CONFIG_SCHED_RTG
+		/* honor the rtg tasks */
+		if (rtg_target) {
+			if (cpumask_test_cpu(cpu, rtg_target)) {
+				sg_target = sg;
+				break;
+			}
+
+			/* active LB or big_task favor cpus with more capacity */
+			if (task->state == TASK_RUNNING || boosted) {
+				if (capacity_orig_of(cpu) >
+				    capacity_orig_of(cpumask_any(rtg_target))) {
+					sg_target = sg;
+					break;
+				}
+
+				sg_backup = sg;
+				continue;
+			}
+		}
+#endif
+		/*
+		 * 1. add margin to support task migration
+		 * 2. if task_util is higher than all cpus, make sure the
+		 * sg_backup with the most powerful cpus is selected
+		 */
+		if (!rt_task_fits_capacity(task, cpu)) {
+			sg_backup = sg;
+			continue;
+		}
+
+		/* support task boost */
+		cpu_capacity = capacity_orig_of(cpu);
+		if (boosted_tutil > cpu_capacity) {
+			sg_backup = sg;
+			continue;
+		}
+
+		/* sg_target: select the sg with smaller capacity */
+		if (cpu_capacity < target_capacity) {
+			target_capacity = cpu_capacity;
+			sg_target = sg;
+		}
+	} while (sg = sg->next, sg != sd->groups);
+
+	if (!sg_target)
+		sg_target = sg_backup;
+
+	if (sg_target) {
+		cpumask_and(&search_cpu, lowest_mask, sched_group_span(sg_target));
+		cpumask_copy(&backup_search_cpu, lowest_mask);
+		cpumask_andnot(&backup_search_cpu, &backup_search_cpu, &search_cpu);
+	} else {
+		cpumask_copy(&search_cpu, lowest_mask);
+		cpumask_clear(&backup_search_cpu);
+	}
+
+retry:
+	cpu = cpumask_first(&search_cpu);
+	do {
+		trace_sched_find_cas_cpu_each(task, cpu, target_cpu,
+					      cpu_isolated(cpu),
+					      idle_cpu(cpu), boosted_tutil, cpu_util(cpu),
+					      capacity_orig_of(cpu));
+
+		if (cpu_isolated(cpu))
+			continue;
+
+		if (!cpumask_test_cpu(cpu, task->cpus_ptr))
+			continue;
+
+		/* find best cpu with smallest max_capacity */
+		if (target_cpu != -1 &&
+		    capacity_orig_of(cpu) > capacity_orig_of(target_cpu))
+			continue;
+
+		util = cpu_util(cpu);
+
+		/* Find the least loaded CPU */
+		if (util > target_cpu_util)
+			continue;
+
+		/*
+		 * If the previous CPU has the same load, keep it as
+		 * target_cpu
+		 */
+		if (target_cpu_util == util && target_cpu == prev_cpu)
+			continue;
+
+		/*
+		 * If candidate CPU is the previous CPU, select it.
+		 * If all above conditions are same, select the least
+		 * cumulative window demand CPU.
+		 */
+		target_cpu_util = util;
+		target_cpu = cpu;
+	} while ((cpu = cpumask_next(cpu, &search_cpu)) < nr_cpu_ids);
+
+	if (target_cpu != -1 && cpumask_test_cpu(target_cpu, lowest_mask)) {
+		goto done;
+	} else if (!cpumask_empty(&backup_search_cpu)) {
+		cpumask_copy(&search_cpu, &backup_search_cpu);
+		cpumask_clear(&backup_search_cpu);
+		goto retry;
+	}
+
+done:
+	trace_sched_find_cas_cpu(task, lowest_mask, boosted_tutil, prev_cpu, target_cpu);
+	rcu_read_unlock();
+	return target_cpu;
+}
+#endif
+
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
 static int find_lowest_rq(struct task_struct *task)
@@ -1718,6 +1886,9 @@ static int find_lowest_rq(struct task_struct *task)
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
 	int ret;
+#ifdef CONFIG_SCHED_RT_CAS
+	int cas_cpu;
+#endif
 
 	/* Make sure the mask is initialized first */
 	if (unlikely(!lowest_mask))
@@ -1744,6 +1915,12 @@ static int find_lowest_rq(struct task_struct *task)
 	if (!ret)
 		return -1; /* No targets found */
 
+#ifdef CONFIG_SCHED_RT_CAS
+	cas_cpu = find_cas_cpu(sd, task, lowest_mask);
+	if (cas_cpu != -1)
+		return cas_cpu;
+#endif
+
 	/*
 	 * At this point we have built a mask of CPUs representing the
 	 * lowest priority tasks in the system.  Now we want to elect
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 09ad491bed45..e4c65d96185e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -87,6 +87,10 @@
 struct rq;
 struct cpuidle_state;
 
+#ifdef CONFIG_SCHED_RT_CAS
+extern unsigned long uclamp_task_util(struct task_struct *p);
+#endif
+
 #ifdef CONFIG_SCHED_WALT
 extern unsigned int sched_ravg_window;
 extern unsigned int walt_cpu_util_freq_divisor;
@@ -893,6 +897,9 @@ struct root_domain {
 	 * CPUs of the rd. Protected by RCU.
 	 */
 	struct perf_domain __rcu *pd;
+#ifdef CONFIG_SCHED_RT_CAS
+	int max_cap_orig_cpu;
+#endif
 };
 
 extern void init_defrootdomain(void);
@@ -2582,6 +2589,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
 	return clamp(util, min_util, max_util);
 }
 
+static inline bool uclamp_boosted(struct task_struct *p)
+{
+	return uclamp_eff_value(p, UCLAMP_MIN) > 0;
+}
+
 /*
  * When uclamp is compiled in, the aggregation at rq level is 'turned off'
  * by default in the fast path and only gets turned on once userspace performs
@@ -2602,6 +2614,11 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
 	return util;
 }
 
+static inline bool uclamp_boosted(struct task_struct *p)
+{
+	return false;
+}
+
 static inline bool uclamp_is_used(void)
 {
 	return false;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b30b62f0d683..9191e5daaa3c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -522,6 +522,10 @@ static int init_rootdomain(struct root_domain *rd)
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
+
+#ifdef CONFIG_SCHED_RT_CAS
+	rd->max_cap_orig_cpu = -1;
+#endif
 	return 0;
 
 free_cpudl:
@@ -2121,9 +2125,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+#ifdef CONFIG_SCHED_RT_CAS
+		int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+#endif
+
 		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
 
+#ifdef CONFIG_SCHED_RT_CAS
+		if (max_cpu < 0 || arch_scale_cpu_capacity(i) >
+		    arch_scale_cpu_capacity(max_cpu))
+			WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
+#endif
+
 		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
 		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
 			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d5fef7aba276..e34d6937594c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1659,6 +1659,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SCHED_RT_CAS
+	{
+		.procname	= "sched_enable_rt_cas",
+		.data		= &sysctl_sched_enable_rt_cas,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
#ifdef CONFIG_SCHED_RT_ACTIVE_LB
 	{
 		.procname	= "sched_enable_rt_active_lb",
-- 
2.25.1
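
For readers who want the selection policy without the scheduler plumbing, here is a
minimal standalone userspace sketch. It is illustrative only, not part of the patch and
not kernel code: it collapses find_cas_cpu()'s two-stage search (pick the smallest-capacity
sched group that still fits the task's uclamp-boosted utilization, then the least-utilized
CPU of the lowest mask inside it) into a single loop, and it leaves out the backup mask,
CPU isolation, RTG handling, and tracing. All CPU numbers, capacities, and utilization
values below are invented for illustration.

/*
 * Illustrative sketch only -- NOT kernel code and not part of the patch.
 * Mirrors the policy of find_cas_cpu(): prefer the smallest-capacity CPU
 * that still fits the task, break capacity ties by lowest utilization.
 */
#include <stdio.h>

struct cpu_info {
	int id;
	unsigned long capacity;	/* stands in for capacity_orig_of(cpu) */
	unsigned long util;	/* stands in for cpu_util(cpu) */
	int allowed;		/* stands in for membership in lowest_mask */
};

/* Pick a CPU for a task whose boosted utilization is task_util. */
static int pick_cas_cpu(const struct cpu_info *cpus, int nr, unsigned long task_util)
{
	unsigned long best_cap = (unsigned long)-1;
	unsigned long best_util = (unsigned long)-1;
	int target = -1;
	int i;

	for (i = 0; i < nr; i++) {
		const struct cpu_info *c = &cpus[i];

		if (!c->allowed)
			continue;
		if (c->capacity < task_util)	/* task does not fit here */
			continue;
		if (c->capacity < best_cap ||
		    (c->capacity == best_cap && c->util < best_util)) {
			best_cap = c->capacity;
			best_util = c->util;
			target = c->id;
		}
	}
	return target;	/* -1 means: fall back to the generic cpupri choice */
}

int main(void)
{
	/* Two little CPUs (capacity 512) and two big CPUs (capacity 1024). */
	const struct cpu_info cpus[] = {
		{ 0,  512, 100, 1 },
		{ 1,  512, 400, 1 },
		{ 2, 1024,  50, 1 },
		{ 3, 1024,  10, 0 },	/* not in lowest_mask */
	};

	/* A small RT task fits on a little CPU; CPU 0 is the least loaded. */
	printf("util=200 -> cpu %d\n", pick_cas_cpu(cpus, 4, 200));
	/* A heavily boosted task only fits on a big CPU; CPU 2 is chosen. */
	printf("util=800 -> cpu %d\n", pick_cas_cpu(cpus, 4, 800));
	return 0;
}

In the kernel, the same decision is gated by the sysctl added above: writing 0 to
/proc/sys/kernel/sched_enable_rt_cas makes find_cas_cpu() return -1, so find_lowest_rq()
falls back to the plain cpupri result.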