sched/psi: Fix avgs_work re-arm in psi_avgs_work()
[ Upstream commit 2fcd7bbae90a6d844da8660a9d27079281dfbba2 ]
Pavan reported a problem that PSI avgs_work idle shutoff is not
working at all, because the PSI_NONIDLE condition would be observed in
psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
only the kworker running avgs_work is active on the CPU.
Although commit 1b69ac6b40 ("psi: fix aggregation idle shut-off")
avoided the ping-pong wake problem when the worker sleeps, psi_avgs_work()
will still always re-arm the avgs_work, so shutoff is not working.
This patch changes to use PSI_STATE_RESCHEDULE to flag whether to
re-arm avgs_work in get_recent_times(). For the current CPU, we re-arm
avgs_work only when (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0),
for other CPUs we can just check PSI_NONIDLE delta. The new flag
is only used in psi_avgs_work(), so we check in get_recent_times()
that current_work() is avgs_work.
One potential problem is that the brief period of non-idle time
incurred between the aggregation run and the kworker's dequeue will
be stranded in the per-cpu buckets until avgs_work runs next time.
The buckets can hold 4s worth of time, and future activity will wake
the avgs_work with a 2s delay, giving us 2s worth of data we can leave
behind when shutting off the avgs_work. If the kworker runs other work
after avgs_work is shut off and doesn't generate any scheduler activity
for 2s, this may be a problem.
Reported-by: Pavan Kondeti <quic_pkondeti@quicinc.com>
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Chengming Zhou <zhouchengming@bytedance.com>
Link: https://lore.kernel.org/r/20221014110551.22695-1-zhouchengming@bytedance.com
Stable-dep-of: aff037078eca ("sched/psi: use kernfs polling functions for PSI trigger polling")
Signed-off-by: Sasha Levin <sashal@kernel.org>
This commit is contained in:
parent
45f739e8fb
commit
7d8bba4da1
2 changed files with 30 additions and 3 deletions
|
|
@ -72,6 +72,9 @@ enum psi_states {
|
||||||
/* Use one bit in the state mask to track TSK_ONCPU */
|
/* Use one bit in the state mask to track TSK_ONCPU */
|
||||||
#define PSI_ONCPU (1 << NR_PSI_STATES)
|
#define PSI_ONCPU (1 << NR_PSI_STATES)
|
||||||
|
|
||||||
|
/* Flag whether to re-arm avgs_work, see details in get_recent_times() */
|
||||||
|
#define PSI_STATE_RESCHEDULE (1 << (NR_PSI_STATES + 1))
|
||||||
|
|
||||||
enum psi_aggregators {
|
enum psi_aggregators {
|
||||||
PSI_AVGS = 0,
|
PSI_AVGS = 0,
|
||||||
PSI_POLL,
|
PSI_POLL,
|
||||||
|
|
|
||||||
|
|
@ -243,6 +243,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
|
||||||
u32 *pchanged_states)
|
u32 *pchanged_states)
|
||||||
{
|
{
|
||||||
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
|
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
|
||||||
|
int current_cpu = raw_smp_processor_id();
|
||||||
|
unsigned int tasks[NR_PSI_TASK_COUNTS];
|
||||||
u64 now, state_start;
|
u64 now, state_start;
|
||||||
enum psi_states s;
|
enum psi_states s;
|
||||||
unsigned int seq;
|
unsigned int seq;
|
||||||
|
|
@ -257,6 +259,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
|
||||||
memcpy(times, groupc->times, sizeof(groupc->times));
|
memcpy(times, groupc->times, sizeof(groupc->times));
|
||||||
state_mask = groupc->state_mask;
|
state_mask = groupc->state_mask;
|
||||||
state_start = groupc->state_start;
|
state_start = groupc->state_start;
|
||||||
|
if (cpu == current_cpu)
|
||||||
|
memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
|
||||||
} while (read_seqcount_retry(&groupc->seq, seq));
|
} while (read_seqcount_retry(&groupc->seq, seq));
|
||||||
|
|
||||||
/* Calculate state time deltas against the previous snapshot */
|
/* Calculate state time deltas against the previous snapshot */
|
||||||
|
|
@ -281,6 +285,28 @@ static void get_recent_times(struct psi_group *group, int cpu,
|
||||||
if (delta)
|
if (delta)
|
||||||
*pchanged_states |= (1 << s);
|
*pchanged_states |= (1 << s);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* When collect_percpu_times() from the avgs_work, we don't want to
|
||||||
|
* re-arm avgs_work when all CPUs are IDLE. But the current CPU running
|
||||||
|
* this avgs_work is never IDLE, cause avgs_work can't be shut off.
|
||||||
|
* So for the current CPU, we need to re-arm avgs_work only when
|
||||||
|
* (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
|
||||||
|
* we can just check PSI_NONIDLE delta.
|
||||||
|
*/
|
||||||
|
if (current_work() == &group->avgs_work.work) {
|
||||||
|
bool reschedule;
|
||||||
|
|
||||||
|
if (cpu == current_cpu)
|
||||||
|
reschedule = tasks[NR_RUNNING] +
|
||||||
|
tasks[NR_IOWAIT] +
|
||||||
|
tasks[NR_MEMSTALL] > 1;
|
||||||
|
else
|
||||||
|
reschedule = *pchanged_states & (1 << PSI_NONIDLE);
|
||||||
|
|
||||||
|
if (reschedule)
|
||||||
|
*pchanged_states |= PSI_STATE_RESCHEDULE;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void calc_avgs(unsigned long avg[3], int missed_periods,
|
static void calc_avgs(unsigned long avg[3], int missed_periods,
|
||||||
|
|
@ -416,7 +442,6 @@ static void psi_avgs_work(struct work_struct *work)
|
||||||
struct delayed_work *dwork;
|
struct delayed_work *dwork;
|
||||||
struct psi_group *group;
|
struct psi_group *group;
|
||||||
u32 changed_states;
|
u32 changed_states;
|
||||||
bool nonidle;
|
|
||||||
u64 now;
|
u64 now;
|
||||||
|
|
||||||
dwork = to_delayed_work(work);
|
dwork = to_delayed_work(work);
|
||||||
|
|
@ -427,7 +452,6 @@ static void psi_avgs_work(struct work_struct *work)
|
||||||
now = sched_clock();
|
now = sched_clock();
|
||||||
|
|
||||||
collect_percpu_times(group, PSI_AVGS, &changed_states);
|
collect_percpu_times(group, PSI_AVGS, &changed_states);
|
||||||
nonidle = changed_states & (1 << PSI_NONIDLE);
|
|
||||||
/*
|
/*
|
||||||
* If there is task activity, periodically fold the per-cpu
|
* If there is task activity, periodically fold the per-cpu
|
||||||
* times and feed samples into the running averages. If things
|
* times and feed samples into the running averages. If things
|
||||||
|
|
@ -438,7 +462,7 @@ static void psi_avgs_work(struct work_struct *work)
|
||||||
if (now >= group->avg_next_update)
|
if (now >= group->avg_next_update)
|
||||||
group->avg_next_update = update_averages(group, now);
|
group->avg_next_update = update_averages(group, now);
|
||||||
|
|
||||||
if (nonidle) {
|
if (changed_states & PSI_STATE_RESCHEDULE) {
|
||||||
schedule_delayed_work(dwork, nsecs_to_jiffies(
|
schedule_delayed_work(dwork, nsecs_to_jiffies(
|
||||||
group->avg_next_update - now) + 1);
|
group->avg_next_update - now) + 1);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue