From 336e51283ae376835b30b9edfcddd8e7b7615798 Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Thu, 26 May 2022 17:21:19 +0530
Subject: cpufreq: Optimize cpufreq_show_cpus()

Instead of specially adding a space for each CPU, except the first one,
lets add space for each of them and remove it at the end.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 2cad42774164..e24aa5d4bca5 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -843,12 +843,14 @@ ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf)
 	unsigned int cpu;
 
 	for_each_cpu(cpu, mask) {
-		if (i)
-			i += scnprintf(&buf[i], (PAGE_SIZE - i - 2), " ");
-		i += scnprintf(&buf[i], (PAGE_SIZE - i - 2), "%u", cpu);
+		i += scnprintf(&buf[i], (PAGE_SIZE - i - 2), "%u ", cpu);
 		if (i >= (PAGE_SIZE - 5))
 			break;
 	}
+
+	/* Remove the extra space at the end */
+	i--;
+
 	i += sprintf(&buf[i], "\n");
 	return i;
 }
-- 
cgit v1.2.3


From 9ab9b9d3fb9231cdcfda8e0fb3d9c24a2f95ed26 Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Thu, 26 May 2022 17:21:21 +0530
Subject: cpufreq: Drop unnecessary cpus locking from store()

This change was introduced long back by commit 4f750c930822 ("cpufreq:
Synchronize the cpufreq store_*() routines with CPU hotplug").

Since then, both cpufreq and hotplug core have been reworked and have
much better locking in place. The race mentioned in commit 4f750c930822
isn't possible anymore.

Drop the unnecessary locking.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index e24aa5d4bca5..73432360e6e9 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -973,21 +973,10 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
 	if (!fattr->store)
 		return -EIO;
 
-	/*
-	 * cpus_read_trylock() is used here to work around a circular lock
-	 * dependency problem with respect to the cpufreq_register_driver().
-	 */
-	if (!cpus_read_trylock())
-		return -EBUSY;
-
-	if (cpu_online(policy->cpu)) {
-		down_write(&policy->rwsem);
-		if (likely(!policy_is_inactive(policy)))
-			ret = fattr->store(policy, buf, count);
-		up_write(&policy->rwsem);
-	}
-
-	cpus_read_unlock();
+	down_write(&policy->rwsem);
+	if (likely(!policy_is_inactive(policy)))
+		ret = fattr->store(policy, buf, count);
+	up_write(&policy->rwsem);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 0da11bf0cab9029db8b85e48d962ff05c00a4faa Mon Sep 17 00:00:00 2001
From: Eiichi Tsukata
Date: Fri, 27 May 2022 00:53:45 +0000
Subject: cpuidle: haltpoll: Add trace points for guest_halt_poll_ns
 grow/shrink

Add trace points as are implemented in KVM host halt polling.
This helps tune guest halt polling params.

Signed-off-by: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
Acked-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/governors/haltpoll.c |  3 +++
 include/trace/events/power.h         | 29 +++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'drivers')

diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c
index cb2a96eafc02..1dff3a52917d 100644
--- a/drivers/cpuidle/governors/haltpoll.c
+++ b/drivers/cpuidle/governors/haltpoll.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/kvm_para.h>
+#include <trace/events/power.h>
 
 static unsigned int guest_halt_poll_ns __read_mostly = 200000;
 module_param(guest_halt_poll_ns, uint, 0644);
@@ -90,6 +91,7 @@ static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns)
 		if (val > guest_halt_poll_ns)
 			val = guest_halt_poll_ns;
 
+		trace_guest_halt_poll_ns_grow(val, dev->poll_limit_ns);
 		dev->poll_limit_ns = val;
 	} else if (block_ns > guest_halt_poll_ns &&
 		   guest_halt_poll_allow_shrink) {
@@ -100,6 +102,7 @@ static void adjust_poll_limit(struct cpuidle_device *dev, u64 block_ns)
 			val = 0;
 		else
 			val /= shrink;
+		trace_guest_halt_poll_ns_shrink(val, dev->poll_limit_ns);
 		dev->poll_limit_ns = val;
 	}
 }
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index af5018aa9517..c708521e4ed5 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -500,6 +500,35 @@ DEFINE_EVENT(dev_pm_qos_request, dev_pm_qos_remove_request,
 
 	TP_ARGS(name, type, new_value)
 );
+
+TRACE_EVENT(guest_halt_poll_ns,
+
+	TP_PROTO(bool grow, unsigned int new, unsigned int old),
+
+	TP_ARGS(grow, new, old),
+
+	TP_STRUCT__entry(
+		__field(bool, grow)
+		__field(unsigned int, new)
+		__field(unsigned int, old)
+	),
+
+	TP_fast_assign(
+		__entry->grow   = grow;
+		__entry->new    = new;
+		__entry->old    = old;
+	),
+
+	TP_printk("halt_poll_ns %u (%s %u)",
+		__entry->new,
+		__entry->grow ? "grow" : "shrink",
+		__entry->old)
+);
+
+#define trace_guest_halt_poll_ns_grow(new, old) \
+	trace_guest_halt_poll_ns(true, new, old)
+#define trace_guest_halt_poll_ns_shrink(new, old) \
+	trace_guest_halt_poll_ns(false, new, old)
 #endif /* _TRACE_POWER_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From d6f895965592d5cd2771860639219964df85d555 Mon Sep 17 00:00:00 2001
From: Tony W Wang-oc
Date: Thu, 23 Jun 2022 09:21:26 +0800
Subject: cpufreq: ACPI: Add Zhaoxin/Centaur turbo boost control interface
 support

Recent Zhaoxin/Centaur CPUs support X86_FEATURE_IDA and the turbo boost
can be dynamically enabled or disabled through MSR 0x1a0[38] in the same
way as Intel. So add turbo boost control support for these CPUs too.

Signed-off-by: Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/acpi-cpufreq.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers')

diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 3d514b82d055..1bb2b90ebb21 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -78,6 +78,8 @@ static bool boost_state(unsigned int cpu)
 
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
+	case X86_VENDOR_CENTAUR:
+	case X86_VENDOR_ZHAOXIN:
 		rdmsr_on_cpu(cpu, MSR_IA32_MISC_ENABLE, &lo, &hi);
 		msr = lo | ((u64)hi << 32);
 		return !(msr & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
@@ -97,6 +99,8 @@ static int boost_set_msr(bool enable)
 
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
+	case X86_VENDOR_CENTAUR:
+	case X86_VENDOR_ZHAOXIN:
 		msr_addr = MSR_IA32_MISC_ENABLE;
 		msr_mask = MSR_IA32_MISC_ENABLE_TURBO_DISABLE;
 		break;
-- 
cgit v1.2.3


From 09d3154a6f0f0bb5b604832095804780f3684b96 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas
Date: Mon, 6 Jun 2022 22:51:58 -0500
Subject: PM: wakeup: Unify device_init_wakeup() for PM_SLEEP and !PM_SLEEP

Previously the CONFIG_PM_SLEEP and !CONFIG_PM_SLEEP device_init_wakeup()
implementations differed in confusing ways:

  - The PM_SLEEP version checked for a NULL device pointer and returned
    -EINVAL, while the !PM_SLEEP version did not and would simply
    dereference a NULL pointer.

  - When called with "false", the !PM_SLEEP version cleared "capable" and
    "enable" in the opposite order of the PM_SLEEP version.  That was
    harmless because for !PM_SLEEP they're simple assignments, but it's
    unnecessary confusion.

Use a simplified version of the PM_SLEEP implementation for both cases.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/wakeup.c | 30 ------------------------------
 include/linux/pm_wakeup.h   | 31 +++++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 38 deletions(-)

(limited to 'drivers')

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 11a4ffe91367..e3befa2c1b66 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -500,36 +500,6 @@ void device_set_wakeup_capable(struct device *dev, bool capable)
 }
 EXPORT_SYMBOL_GPL(device_set_wakeup_capable);
 
-/**
- * device_init_wakeup - Device wakeup initialization.
- * @dev: Device to handle.
- * @enable: Whether or not to enable @dev as a wakeup device.
- *
- * By default, most devices should leave wakeup disabled.  The exceptions are
- * devices that everyone expects to be wakeup sources: keyboards, power buttons,
- * possibly network interfaces, etc.  Also, devices that don't generate their
- * own wakeup requests but merely forward requests from one bus to another
- * (like PCI bridges) should have wakeup enabled by default.
- */
-int device_init_wakeup(struct device *dev, bool enable)
-{
-	int ret = 0;
-
-	if (!dev)
-		return -EINVAL;
-
-	if (enable) {
-		device_set_wakeup_capable(dev, true);
-		ret = device_wakeup_enable(dev);
-	} else {
-		device_wakeup_disable(dev);
-		device_set_wakeup_capable(dev, false);
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(device_init_wakeup);
-
 /**
  * device_set_wakeup_enable - Enable or disable a device to wake up the system.
  * @dev: Device to handle.
diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 196a157456aa..77f4849e3418 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -109,7 +109,6 @@ extern struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws);
 extern int device_wakeup_enable(struct device *dev);
 extern int device_wakeup_disable(struct device *dev);
 extern void device_set_wakeup_capable(struct device *dev, bool capable);
-extern int device_init_wakeup(struct device *dev, bool val);
 extern int device_set_wakeup_enable(struct device *dev, bool enable);
 extern void __pm_stay_awake(struct wakeup_source *ws);
 extern void pm_stay_awake(struct device *dev);
@@ -167,13 +166,6 @@ static inline int device_set_wakeup_enable(struct device *dev, bool enable)
 	return 0;
 }
 
-static inline int device_init_wakeup(struct device *dev, bool val)
-{
-	device_set_wakeup_capable(dev, val);
-	device_set_wakeup_enable(dev, val);
-	return 0;
-}
-
 static inline bool device_may_wakeup(struct device *dev)
 {
 	return dev->power.can_wakeup && dev->power.should_wakeup;
@@ -217,4 +209,27 @@ static inline void pm_wakeup_hard_event(struct device *dev)
 	return pm_wakeup_dev_event(dev, 0, true);
 }
 
+/**
+ * device_init_wakeup - Device wakeup initialization.
+ * @dev: Device to handle.
+ * @enable: Whether or not to enable @dev as a wakeup device.
+ *
+ * By default, most devices should leave wakeup disabled.  The exceptions are
+ * devices that everyone expects to be wakeup sources: keyboards, power buttons,
+ * possibly network interfaces, etc.  Also, devices that don't generate their
+ * own wakeup requests but merely forward requests from one bus to another
+ * (like PCI bridges) should have wakeup enabled by default.
+ */
+static inline int device_init_wakeup(struct device *dev, bool enable)
+{
+	if (enable) {
+		device_set_wakeup_capable(dev, true);
+		return device_wakeup_enable(dev);
+	} else {
+		device_wakeup_disable(dev);
+		device_set_wakeup_capable(dev, false);
+		return 0;
+	}
+}
+
 #endif /* _LINUX_PM_WAKEUP_H */
-- 
cgit v1.2.3


From 2755714656d0f2f41adfe231f3865e72da2cbe39 Mon Sep 17 00:00:00 2001
From: George D Sworo
Date: Wed, 1 Jun 2022 12:22:16 -0700
Subject: powercap: intel_rapl: Add support for RAPTORLAKE_P

Add RAPTORLAKE_P to the list of supported processor models in the Intel
RAPL power capping driver.

Signed-off-by: George D Sworo <george.d.sworo@intel.com>
Acked-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Sumeet Pawnikar <sumeet.r.pawnikar@intel.com>
[ rjw: Minor changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_common.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index a9c99d9e8b42..21d624f9f5fb 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -1109,6 +1109,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&rapl_defaults_core),
 	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N,		&rapl_defaults_core),
 	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,		&rapl_defaults_core),
+	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,        &rapl_defaults_core),
 	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&rapl_defaults_spr_server),
 	X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD,		&rapl_defaults_core),
 
-- 
cgit v1.2.3


From c46a0d5ae4f93800d2d90f3a3290e58a2f4b6bdf Mon Sep 17 00:00:00 2001
From: Ulf Hansson
Date: Wed, 8 Jun 2022 17:31:20 +0200
Subject: PM: runtime: Extend support for wakeirq for force_suspend|resume

A driver that makes use of pm_runtime_force_suspend|resume() to support
system suspend/resume, currently needs to manage the wakeirq support
itself. To avoid the boilerplate code in the driver's system suspend/resume
callbacks in particular, let's extend pm_runtime_force_suspend|resume() to
deal with the wakeirq.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'drivers')

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 949907e2e242..997be3ac20a7 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1862,10 +1862,13 @@ int pm_runtime_force_suspend(struct device *dev)
 
 	callback = RPM_GET_CALLBACK(dev, runtime_suspend);
 
+	dev_pm_enable_wake_irq_check(dev, true);
 	ret = callback ? callback(dev) : 0;
 	if (ret)
 		goto err;
 
+	dev_pm_enable_wake_irq_complete(dev);
+
 	/*
 	 * If the device can stay in suspend after the system-wide transition
 	 * to the working state that will follow, drop the children counter of
@@ -1882,6 +1885,7 @@ int pm_runtime_force_suspend(struct device *dev)
 	return 0;
 
 err:
+	dev_pm_disable_wake_irq_check(dev, true);
 	pm_runtime_enable(dev);
 	return ret;
 }
@@ -1915,9 +1919,11 @@ int pm_runtime_force_resume(struct device *dev)
 
 	callback = RPM_GET_CALLBACK(dev, runtime_resume);
 
+	dev_pm_disable_wake_irq_check(dev, false);
 	ret = callback ? callback(dev) : 0;
 	if (ret) {
 		pm_runtime_set_suspended(dev);
+		dev_pm_enable_wake_irq_check(dev, false);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 37101d3c719386040ded735a5ec06974f1d94d1f Mon Sep 17 00:00:00 2001
From: Hsin-Yi Wang
Date: Wed, 6 Jul 2022 01:16:49 +0800
Subject: PM: domains: Ensure genpd_debugfs_dir exists before remove

Both genpd_debug_add() and genpd_debug_remove() may be called
indirectly by other drivers while genpd_debugfs_dir is not yet
set. For example, drivers can call pm_genpd_init() in probe or
pm_genpd_init() in probe fail/cleanup path:

pm_genpd_init()
 --> genpd_debug_add()

pm_genpd_remove()
 --> genpd_remove()
   --> genpd_debug_remove()

At this time, genpd_debug_init() may not yet be called.

genpd_debug_add() checks that if genpd_debugfs_dir is NULL, it
will return directly. Make sure this is also checked
in pm_genpd_remove(), otherwise components under debugfs root
which has the same name as other components under pm_genpd may
be accidentally removed, since NULL represents debugfs root.

Fixes: 718072ceb211 ("PM: domains: create debugfs nodes when adding power domains")
Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'drivers')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 739e52cd4aba..55a10e6d4e2a 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -222,6 +222,9 @@ static void genpd_debug_remove(struct generic_pm_domain *genpd)
 {
 	struct dentry *d;
 
+	if (!genpd_debugfs_dir)
+		return;
+
 	d = debugfs_lookup(genpd->name, genpd_debugfs_dir);
 	debugfs_remove(d);
 }
-- 
cgit v1.2.3


From 86d231b1db1be0666c2fc607f5025d63d7afc5f3 Mon Sep 17 00:00:00 2001
From: Johnson Wang
Date: Tue, 7 Jun 2022 22:05:56 +0800
Subject: PM / devfreq: mediatek: Introduce MediaTek CCI devfreq driver

We introduce a devfreq driver for the MediaTek Cache Coherent Interconnect
(CCI) used by some MediaTek SoCs.

In this driver, we use the passive devfreq driver to get target frequencies
and adjust voltages accordingly. In MT8183 and MT8186, the MediaTek CCI
is supplied by the same regulators with the little core CPUs.

Signed-off-by: Jia-Wei Chang <jia-wei.chang@mediatek.com>
Signed-off-by: Johnson Wang <johnson.wang@mediatek.com>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Tested-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/devfreq/Kconfig           |  10 +
 drivers/devfreq/Makefile          |   1 +
 drivers/devfreq/mtk-cci-devfreq.c | 440 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 451 insertions(+)
 create mode 100644 drivers/devfreq/mtk-cci-devfreq.c

(limited to 'drivers')

diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig
index 87eb2b837e68..9754d8b31621 100644
--- a/drivers/devfreq/Kconfig
+++ b/drivers/devfreq/Kconfig
@@ -120,6 +120,16 @@ config ARM_TEGRA_DEVFREQ
 	  It reads ACTMON counters of memory controllers and adjusts the
 	  operating frequencies and voltages with OPP support.
 
+config ARM_MEDIATEK_CCI_DEVFREQ
+	tristate "MEDIATEK CCI DEVFREQ Driver"
+	depends on ARM_MEDIATEK_CPUFREQ || COMPILE_TEST
+	select DEVFREQ_GOV_PASSIVE
+	help
+	  This adds a devfreq driver for MediaTek Cache Coherent Interconnect
+	  which is shared the same regulators with the cpu cluster. It can track
+	  buck voltages and update a proper CCI frequency. Use the notification
+	  to get the regulator status.
+
 config ARM_RK3399_DMC_DEVFREQ
 	tristate "ARM RK3399 DMC DEVFREQ Driver"
 	depends on (ARCH_ROCKCHIP && HAVE_ARM_SMCCC) || \
diff --git a/drivers/devfreq/Makefile b/drivers/devfreq/Makefile
index 0b6be92a25d9..bf40d04928d0 100644
--- a/drivers/devfreq/Makefile
+++ b/drivers/devfreq/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_DEVFREQ_GOV_PASSIVE)	+= governor_passive.o
 obj-$(CONFIG_ARM_EXYNOS_BUS_DEVFREQ)	+= exynos-bus.o
 obj-$(CONFIG_ARM_IMX_BUS_DEVFREQ)	+= imx-bus.o
 obj-$(CONFIG_ARM_IMX8M_DDRC_DEVFREQ)	+= imx8m-ddrc.o
+obj-$(CONFIG_ARM_MEDIATEK_CCI_DEVFREQ)	+= mtk-cci-devfreq.o
 obj-$(CONFIG_ARM_RK3399_DMC_DEVFREQ)	+= rk3399_dmc.o
 obj-$(CONFIG_ARM_SUN8I_A33_MBUS_DEVFREQ)	+= sun8i-a33-mbus.o
 obj-$(CONFIG_ARM_TEGRA_DEVFREQ)		+= tegra30-devfreq.o
diff --git a/drivers/devfreq/mtk-cci-devfreq.c b/drivers/devfreq/mtk-cci-devfreq.c
new file mode 100644
index 000000000000..71abb3fbd042
--- /dev/null
+++ b/drivers/devfreq/mtk-cci-devfreq.c
@@ -0,0 +1,440 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022 MediaTek Inc.
+ */
+
+#include <linux/clk.h>
+#include <linux/devfreq.h>
+#include <linux/minmax.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/pm_opp.h>
+#include <linux/regulator/consumer.h>
+
+struct mtk_ccifreq_platform_data {
+	int min_volt_shift;
+	int max_volt_shift;
+	int proc_max_volt;
+	int sram_min_volt;
+	int sram_max_volt;
+};
+
+struct mtk_ccifreq_drv {
+	struct device *dev;
+	struct devfreq *devfreq;
+	struct regulator *proc_reg;
+	struct regulator *sram_reg;
+	struct clk *cci_clk;
+	struct clk *inter_clk;
+	int inter_voltage;
+	unsigned long pre_freq;
+	/* Avoid race condition for regulators between notify and policy */
+	struct mutex reg_lock;
+	struct notifier_block opp_nb;
+	const struct mtk_ccifreq_platform_data *soc_data;
+	int vtrack_max;
+};
+
+static int mtk_ccifreq_set_voltage(struct mtk_ccifreq_drv *drv, int new_voltage)
+{
+	const struct mtk_ccifreq_platform_data *soc_data = drv->soc_data;
+	struct device *dev = drv->dev;
+	int pre_voltage, pre_vsram, new_vsram, vsram, voltage, ret;
+	int retry_max = drv->vtrack_max;
+
+	if (!drv->sram_reg) {
+		ret = regulator_set_voltage(drv->proc_reg, new_voltage,
+					    drv->soc_data->proc_max_volt);
+		return ret;
+	}
+
+	pre_voltage = regulator_get_voltage(drv->proc_reg);
+	if (pre_voltage < 0) {
+		dev_err(dev, "invalid vproc value: %d\n", pre_voltage);
+		return pre_voltage;
+	}
+
+	pre_vsram = regulator_get_voltage(drv->sram_reg);
+	if (pre_vsram < 0) {
+		dev_err(dev, "invalid vsram value: %d\n", pre_vsram);
+		return pre_vsram;
+	}
+
+	new_vsram = clamp(new_voltage + soc_data->min_volt_shift,
+			  soc_data->sram_min_volt, soc_data->sram_max_volt);
+
+	do {
+		if (pre_voltage <= new_voltage) {
+			vsram = clamp(pre_voltage + soc_data->max_volt_shift,
+				      soc_data->sram_min_volt, new_vsram);
+			ret = regulator_set_voltage(drv->sram_reg, vsram,
+						    soc_data->sram_max_volt);
+			if (ret)
+				return ret;
+
+			if (vsram == soc_data->sram_max_volt ||
+			    new_vsram == soc_data->sram_min_volt)
+				voltage = new_voltage;
+			else
+				voltage = vsram - soc_data->min_volt_shift;
+
+			ret = regulator_set_voltage(drv->proc_reg, voltage,
+						    soc_data->proc_max_volt);
+			if (ret) {
+				regulator_set_voltage(drv->sram_reg, pre_vsram,
+						      soc_data->sram_max_volt);
+				return ret;
+			}
+		} else if (pre_voltage > new_voltage) {
+			voltage = max(new_voltage,
+				      pre_vsram - soc_data->max_volt_shift);
+			ret = regulator_set_voltage(drv->proc_reg, voltage,
+						    soc_data->proc_max_volt);
+			if (ret)
+				return ret;
+
+			if (voltage == new_voltage)
+				vsram = new_vsram;
+			else
+				vsram = max(new_vsram,
+					    voltage + soc_data->min_volt_shift);
+
+			ret = regulator_set_voltage(drv->sram_reg, vsram,
+						    soc_data->sram_max_volt);
+			if (ret) {
+				regulator_set_voltage(drv->proc_reg, pre_voltage,
+						      soc_data->proc_max_volt);
+				return ret;
+			}
+		}
+
+		pre_voltage = voltage;
+		pre_vsram = vsram;
+
+		if (--retry_max < 0) {
+			dev_err(dev,
+				"over loop count, failed to set voltage\n");
+			return -EINVAL;
+		}
+	} while (voltage != new_voltage || vsram != new_vsram);
+
+	return 0;
+}
+
+static int mtk_ccifreq_target(struct device *dev, unsigned long *freq,
+			      u32 flags)
+{
+	struct mtk_ccifreq_drv *drv = dev_get_drvdata(dev);
+	struct clk *cci_pll = clk_get_parent(drv->cci_clk);
+	struct dev_pm_opp *opp;
+	unsigned long opp_rate;
+	int voltage, pre_voltage, inter_voltage, target_voltage, ret;
+
+	if (!drv)
+		return -EINVAL;
+
+	if (drv->pre_freq == *freq)
+		return 0;
+
+	inter_voltage = drv->inter_voltage;
+
+	opp_rate = *freq;
+	opp = devfreq_recommended_opp(dev, &opp_rate, 1);
+	if (IS_ERR(opp)) {
+		dev_err(dev, "failed to find opp for freq: %ld\n", opp_rate);
+		return PTR_ERR(opp);
+	}
+
+	mutex_lock(&drv->reg_lock);
+
+	voltage = dev_pm_opp_get_voltage(opp);
+	dev_pm_opp_put(opp);
+
+	pre_voltage = regulator_get_voltage(drv->proc_reg);
+	if (pre_voltage < 0) {
+		dev_err(dev, "invalid vproc value: %d\n", pre_voltage);
+		ret = pre_voltage;
+		goto out_unlock;
+	}
+
+	/* scale up: set voltage first then freq. */
+	target_voltage = max(inter_voltage, voltage);
+	if (pre_voltage <= target_voltage) {
+		ret = mtk_ccifreq_set_voltage(drv, target_voltage);
+		if (ret) {
+			dev_err(dev, "failed to scale up voltage\n");
+			goto out_restore_voltage;
+		}
+	}
+
+	/* switch the cci clock to intermediate clock source. */
+	ret = clk_set_parent(drv->cci_clk, drv->inter_clk);
+	if (ret) {
+		dev_err(dev, "failed to re-parent cci clock\n");
+		goto out_restore_voltage;
+	}
+
+	/* set the original clock to target rate. */
+	ret = clk_set_rate(cci_pll, *freq);
+	if (ret) {
+		dev_err(dev, "failed to set cci pll rate: %d\n", ret);
+		clk_set_parent(drv->cci_clk, cci_pll);
+		goto out_restore_voltage;
+	}
+
+	/* switch the cci clock back to the original clock source. */
+	ret = clk_set_parent(drv->cci_clk, cci_pll);
+	if (ret) {
+		dev_err(dev, "failed to re-parent cci clock\n");
+		mtk_ccifreq_set_voltage(drv, inter_voltage);
+		goto out_unlock;
+	}
+
+	/*
+	 * If the new voltage is lower than the intermediate voltage or the
+	 * original voltage, scale down to the new voltage.
+	 */
+	if (voltage < inter_voltage || voltage < pre_voltage) {
+		ret = mtk_ccifreq_set_voltage(drv, voltage);
+		if (ret) {
+			dev_err(dev, "failed to scale down voltage\n");
+			goto out_unlock;
+		}
+	}
+
+	drv->pre_freq = *freq;
+	mutex_unlock(&drv->reg_lock);
+
+	return 0;
+
+out_restore_voltage:
+	mtk_ccifreq_set_voltage(drv, pre_voltage);
+
+out_unlock:
+	mutex_unlock(&drv->reg_lock);
+	return ret;
+}
+
+static int mtk_ccifreq_opp_notifier(struct notifier_block *nb,
+				    unsigned long event, void *data)
+{
+	struct dev_pm_opp *opp = data;
+	struct mtk_ccifreq_drv *drv;
+	unsigned long freq, volt;
+
+	drv = container_of(nb, struct mtk_ccifreq_drv, opp_nb);
+
+	if (event == OPP_EVENT_ADJUST_VOLTAGE) {
+		freq = dev_pm_opp_get_freq(opp);
+
+		mutex_lock(&drv->reg_lock);
+		/* current opp item is changed */
+		if (freq == drv->pre_freq) {
+			volt = dev_pm_opp_get_voltage(opp);
+			mtk_ccifreq_set_voltage(drv, volt);
+		}
+		mutex_unlock(&drv->reg_lock);
+	}
+
+	return 0;
+}
+
+static struct devfreq_dev_profile mtk_ccifreq_profile = {
+	.target = mtk_ccifreq_target,
+};
+
+static int mtk_ccifreq_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct mtk_ccifreq_drv *drv;
+	struct devfreq_passive_data *passive_data;
+	struct dev_pm_opp *opp;
+	unsigned long rate, opp_volt;
+	int ret;
+
+	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
+	if (!drv)
+		return -ENOMEM;
+
+	drv->dev = dev;
+	drv->soc_data = (const struct mtk_ccifreq_platform_data *)
+				of_device_get_match_data(&pdev->dev);
+	mutex_init(&drv->reg_lock);
+	platform_set_drvdata(pdev, drv);
+
+	drv->cci_clk = devm_clk_get(dev, "cci");
+	if (IS_ERR(drv->cci_clk)) {
+		ret = PTR_ERR(drv->cci_clk);
+		return dev_err_probe(dev, ret, "failed to get cci clk\n");
+	}
+
+	drv->inter_clk = devm_clk_get(dev, "intermediate");
+	if (IS_ERR(drv->inter_clk)) {
+		ret = PTR_ERR(drv->inter_clk);
+		return dev_err_probe(dev, ret,
+				     "failed to get intermediate clk\n");
+	}
+
+	drv->proc_reg = devm_regulator_get_optional(dev, "proc");
+	if (IS_ERR(drv->proc_reg)) {
+		ret = PTR_ERR(drv->proc_reg);
+		return dev_err_probe(dev, ret,
+				     "failed to get proc regulator\n");
+	}
+
+	ret = regulator_enable(drv->proc_reg);
+	if (ret) {
+		dev_err(dev, "failed to enable proc regulator\n");
+		return ret;
+	}
+
+	drv->sram_reg = devm_regulator_get_optional(dev, "sram");
+	if (IS_ERR(drv->sram_reg))
+		drv->sram_reg = NULL;
+	else {
+		ret = regulator_enable(drv->sram_reg);
+		if (ret) {
+			dev_err(dev, "failed to enable sram regulator\n");
+			goto out_free_resources;
+		}
+	}
+
+	/*
+	 * We assume min voltage is 0 and tracking target voltage using
+	 * min_volt_shift for each iteration.
+	 * The retry_max is 3 times of expected iteration count.
+	 */
+	drv->vtrack_max = 3 * DIV_ROUND_UP(max(drv->soc_data->sram_max_volt,
+					       drv->soc_data->proc_max_volt),
+					   drv->soc_data->min_volt_shift);
+
+	ret = clk_prepare_enable(drv->cci_clk);
+	if (ret)
+		goto out_free_resources;
+
+	ret = dev_pm_opp_of_add_table(dev);
+	if (ret) {
+		dev_err(dev, "failed to add opp table: %d\n", ret);
+		goto out_disable_cci_clk;
+	}
+
+	rate = clk_get_rate(drv->inter_clk);
+	opp = dev_pm_opp_find_freq_ceil(dev, &rate);
+	if (IS_ERR(opp)) {
+		ret = PTR_ERR(opp);
+		dev_err(dev, "failed to get intermediate opp: %d\n", ret);
+		goto out_remove_opp_table;
+	}
+	drv->inter_voltage = dev_pm_opp_get_voltage(opp);
+	dev_pm_opp_put(opp);
+
+	rate = U32_MAX;
+	opp = dev_pm_opp_find_freq_floor(drv->dev, &rate);
+	if (IS_ERR(opp)) {
+		dev_err(dev, "failed to get opp\n");
+		ret = PTR_ERR(opp);
+		goto out_remove_opp_table;
+	}
+
+	opp_volt = dev_pm_opp_get_voltage(opp);
+	dev_pm_opp_put(opp);
+	ret = mtk_ccifreq_set_voltage(drv, opp_volt);
+	if (ret) {
+		dev_err(dev, "failed to scale to highest voltage %lu in proc_reg\n",
+			opp_volt);
+		goto out_remove_opp_table;
+	}
+
+	passive_data = devm_kzalloc(dev, sizeof(*passive_data), GFP_KERNEL);
+	if (!passive_data) {
+		ret = -ENOMEM;
+		goto out_remove_opp_table;
+	}
+
+	passive_data->parent_type = CPUFREQ_PARENT_DEV;
+	drv->devfreq = devm_devfreq_add_device(dev, &mtk_ccifreq_profile,
+					       DEVFREQ_GOV_PASSIVE,
+					       passive_data);
+	if (IS_ERR(drv->devfreq)) {
+		ret = -EPROBE_DEFER;
+		dev_err(dev, "failed to add devfreq device: %ld\n",
+			PTR_ERR(drv->devfreq));
+		goto out_remove_opp_table;
+	}
+
+	drv->opp_nb.notifier_call = mtk_ccifreq_opp_notifier;
+	ret = dev_pm_opp_register_notifier(dev, &drv->opp_nb);
+	if (ret) {
+		dev_err(dev, "failed to register opp notifier: %d\n", ret);
+		goto out_remove_opp_table;
+	}
+	return 0;
+
+out_remove_opp_table:
+	dev_pm_opp_of_remove_table(dev);
+
+out_disable_cci_clk:
+	clk_disable_unprepare(drv->cci_clk);
+
+out_free_resources:
+	if (regulator_is_enabled(drv->proc_reg))
+		regulator_disable(drv->proc_reg);
+	if (drv->sram_reg && regulator_is_enabled(drv->sram_reg))
+		regulator_disable(drv->sram_reg);
+
+	return ret;
+}
+
+static int mtk_ccifreq_remove(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct mtk_ccifreq_drv *drv;
+
+	drv = platform_get_drvdata(pdev);
+
+	dev_pm_opp_unregister_notifier(dev, &drv->opp_nb);
+	dev_pm_opp_of_remove_table(dev);
+	clk_disable_unprepare(drv->cci_clk);
+	regulator_disable(drv->proc_reg);
+	if (drv->sram_reg)
+		regulator_disable(drv->sram_reg);
+
+	return 0;
+}
+
+static const struct mtk_ccifreq_platform_data mt8183_platform_data = {
+	.min_volt_shift = 100000,
+	.max_volt_shift = 200000,
+	.proc_max_volt = 1150000,
+};
+
+static const struct mtk_ccifreq_platform_data mt8186_platform_data = {
+	.min_volt_shift = 100000,
+	.max_volt_shift = 250000,
+	.proc_max_volt = 1118750,
+	.sram_min_volt = 850000,
+	.sram_max_volt = 1118750,
+};
+
+static const struct of_device_id mtk_ccifreq_machines[] = {
+	{ .compatible = "mediatek,mt8183-cci", .data = &mt8183_platform_data },
+	{ .compatible = "mediatek,mt8186-cci", .data = &mt8186_platform_data },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, mtk_ccifreq_machines);
+
+static struct platform_driver mtk_ccifreq_platdrv = {
+	.probe	= mtk_ccifreq_probe,
+	.remove	= mtk_ccifreq_remove,
+	.driver = {
+		.name = "mtk-ccifreq",
+		.of_match_table = mtk_ccifreq_machines,
+	},
+};
+module_platform_driver(mtk_ccifreq_platdrv);
+
+MODULE_DESCRIPTION("MediaTek CCI devfreq driver");
+MODULE_AUTHOR("Jia-Wei Chang <jia-wei.chang@mediatek.com>");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From c9deb748683851737c5e4df623c3283c185cd8f0 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab
Date: Tue, 28 Jun 2022 10:46:12 +0100
Subject: PM / devfreq: shut up kernel-doc warnings

There are 4 warnings there:
	drivers/devfreq/devfreq.c:707: warning: Function parameter or member 'val' not described in 'qos_min_notifier_call'
	drivers/devfreq/devfreq.c:707: warning: Function parameter or member 'ptr' not described in 'qos_min_notifier_call'
	drivers/devfreq/devfreq.c:717: warning: Function parameter or member 'val' not described in 'qos_max_notifier_call'
	drivers/devfreq/devfreq.c:717: warning: Function parameter or member 'ptr' not described in 'qos_max_notifier_call'

It turns that neither val nor ptr are actually used on those
function, so document as such.

Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/devfreq/devfreq.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers')

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index 9602141bb8ec..63347a5ae599 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -696,6 +696,8 @@ static int qos_notifier_call(struct devfreq *devfreq)
 /**
  * qos_min_notifier_call() - Callback for QoS min_freq changes.
  * @nb:		Should be devfreq->nb_min
+ * @val:	not used
+ * @ptr:	not used
  */
 static int qos_min_notifier_call(struct notifier_block *nb,
 					 unsigned long val, void *ptr)
@@ -706,6 +708,8 @@ static int qos_min_notifier_call(struct notifier_block *nb,
 /**
  * qos_max_notifier_call() - Callback for QoS max_freq changes.
  * @nb:		Should be devfreq->nb_max
+ * @val:	not used
+ * @ptr:	not used
  */
 static int qos_max_notifier_call(struct notifier_block *nb,
 					 unsigned long val, void *ptr)
-- 
cgit v1.2.3


From 2472934e36b51bbdf0a7cdd5e351910d9002c6ac Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Thu, 23 Jun 2022 19:00:54 +0100
Subject: PM / devfreq: imx-bus: use NULL to pass a null pointer rather than
 zero

The 3rd argument to the function of_get_property is a pointer and it is
being passed using 0. Use NULL instead.

Cleans up sparse warning:
warning: Using plain integer as NULL pointer

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/devfreq/imx-bus.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/devfreq/imx-bus.c b/drivers/devfreq/imx-bus.c
index f3f6e25053ed..f87067fc574d 100644
--- a/drivers/devfreq/imx-bus.c
+++ b/drivers/devfreq/imx-bus.c
@@ -59,7 +59,7 @@ static int imx_bus_init_icc(struct device *dev)
 	struct imx_bus *priv = dev_get_drvdata(dev);
 	const char *icc_driver_name;
 
-	if (!of_get_property(dev->of_node, "#interconnect-cells", 0))
+	if (!of_get_property(dev->of_node, "#interconnect-cells", NULL))
 		return 0;
 	if (!IS_ENABLED(CONFIG_INTERCONNECT_IMX)) {
 		dev_warn(dev, "imx interconnect drivers disabled\n");
-- 
cgit v1.2.3


From 53f853d55e312a639eaa910154248ce06eb754ac Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko
Date: Thu, 30 Jun 2022 10:57:51 +0300
Subject: PM / devfreq: tegra30: Add error message for
 devm_devfreq_add_device()

It's difficult to notice that driver failed to probe by looking at KMSG
when devm_devfreq_add_device() fails. Add explicit error message for this
case.

Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/devfreq/tegra30-devfreq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/devfreq/tegra30-devfreq.c b/drivers/devfreq/tegra30-devfreq.c
index 65ecf17a36f4..585a95fe2bd6 100644
--- a/drivers/devfreq/tegra30-devfreq.c
+++ b/drivers/devfreq/tegra30-devfreq.c
@@ -922,8 +922,10 @@ static int tegra_devfreq_probe(struct platform_device *pdev)
 
 	devfreq = devm_devfreq_add_device(&pdev->dev, &tegra_devfreq_profile,
 					  "tegra_actmon", NULL);
-	if (IS_ERR(devfreq))
+	if (IS_ERR(devfreq)) {
+		dev_err(&pdev->dev, "Failed to add device: %pe\n", devfreq);
 		return PTR_ERR(devfreq);
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From ae6ccaa650380d243cf43d31c864c5ced2fd4612 Mon Sep 17 00:00:00 2001
From: Lukasz Luba
Date: Thu, 7 Jul 2022 08:15:52 +0100
Subject: PM: EM: convert power field to micro-Watts precision and align
 drivers

The milli-Watts precision causes rounding errors while calculating
efficiency cost for each OPP. This is especially visible in the 'simple'
Energy Model (EM), where the power for each OPP is provided from OPP
framework. This can cause some OPPs to be marked inefficient, while
using micro-Watts precision that might not happen.

Update all EM users which access 'power' field and assume the value is
in milli-Watts.

Solve also an issue with potential overflow in calculation of energy
estimation on 32bit machine. It's needed now since the power value
(thus the 'cost' as well) are higher.

Example calculation which shows the rounding error and impact:

power = 'dyn-power-coeff' * volt_mV * volt_mV * freq_MHz

power_a_uW = (100 * 600mW * 600mW * 500MHz) / 10^6 = 18000
power_a_mW = (100 * 600mW * 600mW * 500MHz) / 10^9 = 18

power_b_uW = (100 * 605mW * 605mW * 600MHz) / 10^6 = 21961
power_b_mW = (100 * 605mW * 605mW * 600MHz) / 10^9 = 21

max_freq = 2000MHz

cost_a_mW = 18 * 2000MHz/500MHz = 72
cost_a_uW = 18000 * 2000MHz/500MHz = 72000

cost_b_mW = 21 * 2000MHz/600MHz = 70 // <- artificially better
cost_b_uW = 21961 * 2000MHz/600MHz = 73203

The 'cost_b_mW' (which is based on old milli-Watts) is misleadingly
better that the 'cost_b_uW' (this patch uses micro-Watts) and such
would have impact on the 'inefficient OPPs' information in the Cpufreq
framework. This patch set removes the rounding issue.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/mediatek-cpufreq-hw.c |  7 +++--
 drivers/cpufreq/scmi-cpufreq.c        |  6 ++++
 drivers/opp/of.c                      | 15 +++++-----
 drivers/powercap/dtpm_cpu.c           |  5 ++--
 drivers/thermal/cpufreq_cooling.c     | 13 +++++++--
 drivers/thermal/devfreq_cooling.c     | 19 +++++++++---
 include/linux/energy_model.h          | 54 ++++++++++++++++++++++++-----------
 kernel/power/energy_model.c           | 24 ++++++++++------
 8 files changed, 100 insertions(+), 43 deletions(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c
index 813cccbfe934..f0e0a35c7f21 100644
--- a/drivers/cpufreq/mediatek-cpufreq-hw.c
+++ b/drivers/cpufreq/mediatek-cpufreq-hw.c
@@ -51,7 +51,7 @@ static const u16 cpufreq_mtk_offsets[REG_ARRAY_SIZE] = {
 };
 
 static int __maybe_unused
-mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *mW,
+mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *uW,
 			  unsigned long *KHz)
 {
 	struct mtk_cpufreq_data *data;
@@ -71,8 +71,9 @@ mtk_cpufreq_get_cpu_power(struct device *cpu_dev, unsigned long *mW,
 	i--;
 
 	*KHz = data->table[i].frequency;
-	*mW = readl_relaxed(data->reg_bases[REG_EM_POWER_TBL] +
-			    i * LUT_ROW_SIZE) / 1000;
+	/* Provide micro-Watts value to the Energy Model */
+	*uW = readl_relaxed(data->reg_bases[REG_EM_POWER_TBL] +
+			    i * LUT_ROW_SIZE);
 
 	return 0;
 }
diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index 6d2a4cf46db7..bfd35583d653 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/scmi_protocol.h>
 #include <linux/types.h>
+#include <linux/units.h>
 
 struct scmi_data {
 	int domain_id;
@@ -99,6 +100,7 @@ static int __maybe_unused
 scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power,
 		   unsigned long *KHz)
 {
+	bool power_scale_mw = perf_ops->power_scale_mw_get(ph);
 	unsigned long Hz;
 	int ret, domain;
 
@@ -112,6 +114,10 @@ scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power,
 	if (ret)
 		return ret;
 
+	/* Provide bigger resolution power to the Energy Model */
+	if (power_scale_mw)
+		*power *= MICROWATT_PER_MILLIWATT;
+
 	/* The EM framework specifies the frequency in KHz. */
 	*KHz = Hz / 1000;
 
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 30394929d700..eb89c9a75985 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1443,12 +1443,12 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node);
  * It provides the power used by @dev at @kHz if it is the frequency of an
  * existing OPP, or at the frequency of the first OPP above @kHz otherwise
  * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled
- * frequency and @mW to the associated power.
+ * frequency and @uW to the associated power.
  *
  * Returns 0 on success or a proper -EINVAL value in case of error.
  */
 static int __maybe_unused
-_get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz)
+_get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz)
 {
 	struct dev_pm_opp *opp;
 	unsigned long opp_freq, opp_power;
@@ -1465,7 +1465,7 @@ _get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz)
 		return -EINVAL;
 
 	*kHz = opp_freq / 1000;
-	*mW = opp_power / 1000;
+	*uW = opp_power;
 
 	return 0;
 }
@@ -1475,14 +1475,14 @@ _get_dt_power(struct device *dev, unsigned long *mW, unsigned long *kHz)
  * This computes the power estimated by @dev at @kHz if it is the frequency
  * of an existing OPP, or at the frequency of the first OPP above @kHz otherwise
  * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled
- * frequency and @mW to the associated power. The power is estimated as
+ * frequency and @uW to the associated power. The power is estimated as
  * P = C * V^2 * f with C being the device's capacitance and V and f
  * respectively the voltage and frequency of the OPP.
  *
  * Returns -EINVAL if the power calculation failed because of missing
  * parameters, 0 otherwise.
  */
-static int __maybe_unused _get_power(struct device *dev, unsigned long *mW,
+static int __maybe_unused _get_power(struct device *dev, unsigned long *uW,
 				     unsigned long *kHz)
 {
 	struct dev_pm_opp *opp;
@@ -1512,9 +1512,10 @@ static int __maybe_unused _get_power(struct device *dev, unsigned long *mW,
 		return -EINVAL;
 
 	tmp = (u64)cap * mV * mV * (Hz / 1000000);
-	do_div(tmp, 1000000000);
+	/* Provide power in micro-Watts */
+	do_div(tmp, 1000000);
 
-	*mW = (unsigned long)tmp;
+	*uW = (unsigned long)tmp;
 	*kHz = Hz / 1000;
 
 	return 0;
diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c
index f5eced0842b3..61c5ff80bd30 100644
--- a/drivers/powercap/dtpm_cpu.c
+++ b/drivers/powercap/dtpm_cpu.c
@@ -53,7 +53,7 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
 
 	for (i = 0; i < pd->nr_perf_states; i++) {
 
-		power = pd->table[i].power * MICROWATT_PER_MILLIWATT * nr_cpus;
+		power = pd->table[i].power * nr_cpus;
 
 		if (power > power_limit)
 			break;
@@ -63,8 +63,7 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
 
 	freq_qos_update_request(&dtpm_cpu->qos_req, freq);
 
-	power_limit = pd->table[i - 1].power *
-		MICROWATT_PER_MILLIWATT * nr_cpus;
+	power_limit = pd->table[i - 1].power * nr_cpus;
 
 	return power_limit;
 }
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index b8151d95a806..dc19e7c80751 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -21,6 +21,7 @@
 #include <linux/pm_qos.h>
 #include <linux/slab.h>
 #include <linux/thermal.h>
+#include <linux/units.h>
 
 #include <trace/events/thermal.h>
 
@@ -101,6 +102,7 @@ static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
 static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
 			     u32 freq)
 {
+	unsigned long power_mw;
 	int i;
 
 	for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
@@ -108,16 +110,23 @@ static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
 			break;
 	}
 
-	return cpufreq_cdev->em->table[i + 1].power;
+	power_mw = cpufreq_cdev->em->table[i + 1].power;
+	power_mw /= MICROWATT_PER_MILLIWATT;
+
+	return power_mw;
 }
 
 static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
 			     u32 power)
 {
+	unsigned long em_power_mw;
 	int i;
 
 	for (i = cpufreq_cdev->max_level; i > 0; i--) {
-		if (power >= cpufreq_cdev->em->table[i].power)
+		/* Convert EM power to milli-Watts to make safe comparison */
+		em_power_mw = cpufreq_cdev->em->table[i].power;
+		em_power_mw /= MICROWATT_PER_MILLIWATT;
+		if (power >= em_power_mw)
 			break;
 	}
 
diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
index 8c76f9655e57..8d1260f65061 100644
--- a/drivers/thermal/devfreq_cooling.c
+++ b/drivers/thermal/devfreq_cooling.c
@@ -200,7 +200,11 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
 		res = dfc->power_ops->get_real_power(df, power, freq, voltage);
 		if (!res) {
 			state = dfc->capped_state;
+
+			/* Convert EM power into milli-Watts first */
 			dfc->res_util = dfc->em_pd->table[state].power;
+			dfc->res_util /= MICROWATT_PER_MILLIWATT;
+
 			dfc->res_util *= SCALE_ERROR_MITIGATION;
 
 			if (*power > 1)
@@ -218,8 +222,10 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd
 
 		_normalize_load(&status);
 
-		/* Scale power for utilization */
+		/* Convert EM power into milli-Watts first */
 		*power = dfc->em_pd->table[perf_idx].power;
+		*power /= MICROWATT_PER_MILLIWATT;
+		/* Scale power for utilization */
 		*power *= status.busy_time;
 		*power >>= 10;
 	}
@@ -244,6 +250,7 @@ static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev,
 
 	perf_idx = dfc->max_state - state;
 	*power = dfc->em_pd->table[perf_idx].power;
+	*power /= MICROWATT_PER_MILLIWATT;
 
 	return 0;
 }
@@ -254,7 +261,7 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
 	struct devfreq_cooling_device *dfc = cdev->devdata;
 	struct devfreq *df = dfc->devfreq;
 	struct devfreq_dev_status status;
-	unsigned long freq;
+	unsigned long freq, em_power_mw;
 	s32 est_power;
 	int i;
 
@@ -279,9 +286,13 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev,
 	 * Find the first cooling state that is within the power
 	 * budget. The EM power table is sorted ascending.
 	 */
-	for (i = dfc->max_state; i > 0; i--)
-		if (est_power >= dfc->em_pd->table[i].power)
+	for (i = dfc->max_state; i > 0; i--) {
+		/* Convert EM power to milli-Watts to make safe comparison */
+		em_power_mw = dfc->em_pd->table[i].power;
+		em_power_mw /= MICROWATT_PER_MILLIWATT;
+		if (est_power >= em_power_mw)
 			break;
+	}
 
 	*state = dfc->max_state - i;
 	dfc->capped_state = *state;
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 8419bffb4398..b9caa01dfac4 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -62,7 +62,7 @@ struct em_perf_domain {
 /*
  *  em_perf_domain flags:
  *
- *  EM_PERF_DOMAIN_MILLIWATTS: The power values are in milli-Watts or some
+ *  EM_PERF_DOMAIN_MICROWATTS: The power values are in micro-Watts or some
  *  other scale.
  *
  *  EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating
@@ -71,7 +71,7 @@ struct em_perf_domain {
  *  EM_PERF_DOMAIN_ARTIFICIAL: The power values are artificial and might be
  *  created by platform missing real power information
  */
-#define EM_PERF_DOMAIN_MILLIWATTS BIT(0)
+#define EM_PERF_DOMAIN_MICROWATTS BIT(0)
 #define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1)
 #define EM_PERF_DOMAIN_ARTIFICIAL BIT(2)
 
@@ -79,22 +79,44 @@ struct em_perf_domain {
 #define em_is_artificial(em) ((em)->flags & EM_PERF_DOMAIN_ARTIFICIAL)
 
 #ifdef CONFIG_ENERGY_MODEL
-#define EM_MAX_POWER 0xFFFF
+/*
+ * The max power value in micro-Watts. The limit of 64 Watts is set as
+ * a safety net to not overflow multiplications on 32bit platforms. The
+ * 32bit value limit for total Perf Domain power implies a limit of
+ * maximum CPUs in such domain to 64.
+ */
+#define EM_MAX_POWER (64000000) /* 64 Watts */
+
+/*
+ * To avoid possible energy estimation overflow on 32bit machines add
+ * limits to number of CPUs in the Perf. Domain.
+ * We are safe on 64bit machine, thus some big number.
+ */
+#ifdef CONFIG_64BIT
+#define EM_MAX_NUM_CPUS 4096
+#else
+#define EM_MAX_NUM_CPUS 16
+#endif
 
 /*
- * Increase resolution of energy estimation calculations for 64-bit
- * architectures. The extra resolution improves decision made by EAS for the
- * task placement when two Performance Domains might provide similar energy
- * estimation values (w/o better resolution the values could be equal).
+ * To avoid an overflow on 32bit machines while calculating the energy
+ * use a different order in the operation. First divide by the 'cpu_scale'
+ * which would reduce big value stored in the 'cost' field, then multiply by
+ * the 'sum_util'. This would allow to handle existing platforms, which have
+ * e.g. power ~1.3 Watt at max freq, so the 'cost' value > 1mln micro-Watts.
+ * In such scenario, where there are 4 CPUs in the Perf. Domain the 'sum_util'
+ * could be 4096, then multiplication: 'cost' * 'sum_util'  would overflow.
+ * This reordering of operations has some limitations, we lose small
+ * precision in the estimation (comparing to 64bit platform w/o reordering).
  *
- * We increase resolution only if we have enough bits to allow this increased
- * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
- * are pretty high and the returns do not justify the increased costs.
+ * We are safe on 64bit machine.
  */
 #ifdef CONFIG_64BIT
-#define em_scale_power(p) ((p) * 1000)
+#define em_estimate_energy(cost, sum_util, scale_cpu) \
+	(((cost) * (sum_util)) / (scale_cpu))
 #else
-#define em_scale_power(p) (p)
+#define em_estimate_energy(cost, sum_util, scale_cpu) \
+	(((cost) / (scale_cpu)) * (sum_util))
 #endif
 
 struct em_data_callback {
@@ -112,7 +134,7 @@ struct em_data_callback {
 	 * and frequency.
 	 *
 	 * In case of CPUs, the power is the one of a single CPU in the domain,
-	 * expressed in milli-Watts or an abstract scale. It is expected to
+	 * expressed in micro-Watts or an abstract scale. It is expected to
 	 * fit in the [0, EM_MAX_POWER] range.
 	 *
 	 * Return 0 on success.
@@ -148,7 +170,7 @@ struct em_perf_domain *em_cpu_get(int cpu);
 struct em_perf_domain *em_pd_get(struct device *dev);
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				struct em_data_callback *cb, cpumask_t *span,
-				bool milliwatts);
+				bool microwatts);
 void em_dev_unregister_perf_domain(struct device *dev);
 
 /**
@@ -273,7 +295,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 *   pd_nrg = ------------------------                       (4)
 	 *                  scale_cpu
 	 */
-	return ps->cost * sum_util / scale_cpu;
+	return em_estimate_energy(ps->cost, sum_util, scale_cpu);
 }
 
 /**
@@ -297,7 +319,7 @@ struct em_data_callback {};
 static inline
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				struct em_data_callback *cb, cpumask_t *span,
-				bool milliwatts)
+				bool microwatts)
 {
 	return -EINVAL;
 }
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 6c373f2960e7..f82111837b8d 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -145,7 +145,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
 
 		/*
 		 * The power returned by active_state() is expected to be
-		 * positive and to fit into 16 bits.
+		 * positive and be in range.
 		 */
 		if (!power || power > EM_MAX_POWER) {
 			dev_err(dev, "EM: invalid power: %lu\n",
@@ -170,7 +170,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
 				goto free_ps_table;
 			}
 		} else {
-			power_res = em_scale_power(table[i].power);
+			power_res = table[i].power;
 			cost = div64_u64(fmax * power_res, table[i].frequency);
 		}
 
@@ -201,9 +201,17 @@ static int em_create_pd(struct device *dev, int nr_states,
 {
 	struct em_perf_domain *pd;
 	struct device *cpu_dev;
-	int cpu, ret;
+	int cpu, ret, num_cpus;
 
 	if (_is_cpu_device(dev)) {
+		num_cpus = cpumask_weight(cpus);
+
+		/* Prevent max possible energy calculation to not overflow */
+		if (num_cpus > EM_MAX_NUM_CPUS) {
+			dev_err(dev, "EM: too many CPUs, overflow possible\n");
+			return -EINVAL;
+		}
+
 		pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
 		if (!pd)
 			return -ENOMEM;
@@ -314,13 +322,13 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
  * @cpus	: Pointer to cpumask_t, which in case of a CPU device is
  *		obligatory. It can be taken from i.e. 'policy->cpus'. For other
  *		type of devices this should be set to NULL.
- * @milliwatts	: Flag indicating that the power values are in milliWatts or
+ * @microwatts	: Flag indicating that the power values are in micro-Watts or
  *		in some other scale. It must be set properly.
  *
  * Create Energy Model tables for a performance domain using the callbacks
  * defined in cb.
  *
- * The @milliwatts is important to set with correct value. Some kernel
+ * The @microwatts is important to set with correct value. Some kernel
  * sub-systems might rely on this flag and check if all devices in the EM are
  * using the same scale.
  *
@@ -331,7 +339,7 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
  */
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				struct em_data_callback *cb, cpumask_t *cpus,
-				bool milliwatts)
+				bool microwatts)
 {
 	unsigned long cap, prev_cap = 0;
 	unsigned long flags = 0;
@@ -381,8 +389,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 		}
 	}
 
-	if (milliwatts)
-		flags |= EM_PERF_DOMAIN_MILLIWATTS;
+	if (microwatts)
+		flags |= EM_PERF_DOMAIN_MICROWATTS;
 	else if (cb->get_cost)
 		flags |= EM_PERF_DOMAIN_ARTIFICIAL;
 
-- 
cgit v1.2.3


From 5e0fd2026cdd474a85b3135c312912321e60f47a Mon Sep 17 00:00:00 2001
From: Lukasz Luba
Date: Thu, 7 Jul 2022 08:15:54 +0100
Subject: firmware: arm_scmi: Get detailed power scale from perf

In SCMI v3.1 the power scale can be in micro-Watts. The upper layers, e.g.
cpufreq and EM should handle received power values properly (upscale when
needed). Thus, provide an interface which allows to check what is the
scale for power values. The old interface allowed to distinguish between
bogo-Watts and milli-Watts only (which was good for older SCMI spec).

Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/firmware/arm_scmi/perf.c | 18 +++++++++++-------
 include/linux/scmi_protocol.h    |  8 +++++++-
 2 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'drivers')

diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index bbb0331801ff..92414e53f908 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -170,8 +170,7 @@ struct perf_dom_info {
 struct scmi_perf_info {
 	u32 version;
 	int num_domains;
-	bool power_scale_mw;
-	bool power_scale_uw;
+	enum scmi_power_scale power_scale;
 	u64 stats_addr;
 	u32 stats_size;
 	struct perf_dom_info *dom_info;
@@ -201,9 +200,13 @@ static int scmi_perf_attributes_get(const struct scmi_protocol_handle *ph,
 		u16 flags = le16_to_cpu(attr->flags);
 
 		pi->num_domains = le16_to_cpu(attr->num_domains);
-		pi->power_scale_mw = POWER_SCALE_IN_MILLIWATT(flags);
+
+		if (POWER_SCALE_IN_MILLIWATT(flags))
+			pi->power_scale = SCMI_POWER_MILLIWATTS;
 		if (PROTOCOL_REV_MAJOR(pi->version) >= 0x3)
-			pi->power_scale_uw = POWER_SCALE_IN_MICROWATT(flags);
+			if (POWER_SCALE_IN_MICROWATT(flags))
+				pi->power_scale = SCMI_POWER_MICROWATTS;
+
 		pi->stats_addr = le32_to_cpu(attr->stats_addr_low) |
 				(u64)le32_to_cpu(attr->stats_addr_high) << 32;
 		pi->stats_size = le32_to_cpu(attr->stats_size);
@@ -792,11 +795,12 @@ static bool scmi_fast_switch_possible(const struct scmi_protocol_handle *ph,
 	return dom->fc_info && dom->fc_info->level_set_addr;
 }
 
-static bool scmi_power_scale_mw_get(const struct scmi_protocol_handle *ph)
+static enum scmi_power_scale
+scmi_power_scale_get(const struct scmi_protocol_handle *ph)
 {
 	struct scmi_perf_info *pi = ph->get_priv(ph);
 
-	return pi->power_scale_mw;
+	return pi->power_scale;
 }
 
 static const struct scmi_perf_proto_ops perf_proto_ops = {
@@ -811,7 +815,7 @@ static const struct scmi_perf_proto_ops perf_proto_ops = {
 	.freq_get = scmi_dvfs_freq_get,
 	.est_power_get = scmi_dvfs_est_power_get,
 	.fast_switch_possible = scmi_fast_switch_possible,
-	.power_scale_mw_get = scmi_power_scale_mw_get,
+	.power_scale_get = scmi_power_scale_get,
 };
 
 static int scmi_perf_set_notify_enabled(const struct scmi_protocol_handle *ph,
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 704111f63993..a0a246310ba1 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -60,6 +60,12 @@ struct scmi_clock_info {
 	};
 };
 
+enum scmi_power_scale {
+	SCMI_POWER_BOGOWATTS,
+	SCMI_POWER_MILLIWATTS,
+	SCMI_POWER_MICROWATTS
+};
+
 struct scmi_handle;
 struct scmi_device;
 struct scmi_protocol_handle;
@@ -135,7 +141,7 @@ struct scmi_perf_proto_ops {
 			     unsigned long *rate, unsigned long *power);
 	bool (*fast_switch_possible)(const struct scmi_protocol_handle *ph,
 				     struct device *dev);
-	bool (*power_scale_mw_get)(const struct scmi_protocol_handle *ph);
+	enum scmi_power_scale (*power_scale_get)(const struct scmi_protocol_handle *ph);
 };
 
 /**
-- 
cgit v1.2.3


From f3ac888fc5fbdeeec1e084327de06a2765542d56 Mon Sep 17 00:00:00 2001
From: Lukasz Luba
Date: Thu, 7 Jul 2022 08:15:55 +0100
Subject: cpufreq: scmi: Support the power scale in micro-Watts in SCMI v3.1

The SCMI v3.1 adds support for power values in micro-Watts. They are not
always in milli-Watts anymore (ignoring the bogo-Watts). Thus, the power
must be converted conditionally before sending to Energy Model. Add the
logic which handles the needed checks and conversions.

Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/scmi-cpufreq.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index bfd35583d653..513a071845c2 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -100,7 +100,7 @@ static int __maybe_unused
 scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power,
 		   unsigned long *KHz)
 {
-	bool power_scale_mw = perf_ops->power_scale_mw_get(ph);
+	enum scmi_power_scale power_scale = perf_ops->power_scale_get(ph);
 	unsigned long Hz;
 	int ret, domain;
 
@@ -114,8 +114,8 @@ scmi_get_cpu_power(struct device *cpu_dev, unsigned long *power,
 	if (ret)
 		return ret;
 
-	/* Provide bigger resolution power to the Energy Model */
-	if (power_scale_mw)
+	/* Convert the power to uW if it is mW (ignore bogoW) */
+	if (power_scale == SCMI_POWER_MILLIWATTS)
 		*power *= MICROWATT_PER_MILLIWATT;
 
 	/* The EM framework specifies the frequency in KHz. */
@@ -255,8 +255,9 @@ static int scmi_cpufreq_exit(struct cpufreq_policy *policy)
 static void scmi_cpufreq_register_em(struct cpufreq_policy *policy)
 {
 	struct em_data_callback em_cb = EM_DATA_CB(scmi_get_cpu_power);
-	bool power_scale_mw = perf_ops->power_scale_mw_get(ph);
+	enum scmi_power_scale power_scale = perf_ops->power_scale_get(ph);
 	struct scmi_data *priv = policy->driver_data;
+	bool em_power_scale = false;
 
 	/*
 	 * This callback will be called for each policy, but we don't need to
@@ -268,9 +269,13 @@ static void scmi_cpufreq_register_em(struct cpufreq_policy *policy)
 	if (!priv->nr_opp)
 		return;
 
+	if (power_scale == SCMI_POWER_MILLIWATTS
+	    || power_scale == SCMI_POWER_MICROWATTS)
+		em_power_scale = true;
+
 	em_dev_register_perf_domain(get_cpu_device(policy->cpu), priv->nr_opp,
 				    &em_cb, priv->opp_shared_cpus,
-				    power_scale_mw);
+				    em_power_scale);
 }
 
 static struct cpufreq_driver scmi_cpufreq_driver = {
-- 
cgit v1.2.3


From a2f6a7ac60e234810a386aaa6aad106927c39c4f Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Thu, 7 Jul 2022 15:28:31 +0530
Subject: cpufreq: Warn users while freeing active policy

With the new design in place, the show() and store() callbacks check if
the policy is active or not before proceeding any further to avoid
potential races. And in order to guarantee that cpufreq_policy_free()
must be called after clearing the policy->cpus mask, i.e. by marking the
policy inactive.

In order to avoid introducing a bug around this later, print a warning
message if we end up freeing an active policy.

Also update cpufreq_online() a bit to make sure we clear the cpus mask
for each error case before calling cpufreq_policy_free().

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 73432360e6e9..954eef26685f 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1273,6 +1273,13 @@ static void cpufreq_policy_free(struct cpufreq_policy *policy)
 	unsigned long flags;
 	int cpu;
 
+	/*
+	 * The callers must ensure the policy is inactive by now, to avoid any
+	 * races with show()/store() callbacks.
+	 */
+	if (unlikely(!policy_is_inactive(policy)))
+		pr_warn("%s: Freeing active policy\n", __func__);
+
 	/* Remove policy from list */
 	write_lock_irqsave(&cpufreq_driver_lock, flags);
 	list_del(&policy->policy_list);
@@ -1527,8 +1534,6 @@ out_destroy_policy:
 	for_each_cpu(j, policy->real_cpus)
 		remove_cpu_dev_symlink(policy, j, get_cpu_device(j));
 
-	cpumask_clear(policy->cpus);
-
 out_offline_policy:
 	if (cpufreq_driver->offline)
 		cpufreq_driver->offline(policy);
@@ -1538,6 +1543,7 @@ out_exit_policy:
 		cpufreq_driver->exit(policy);
 
 out_free_policy:
+	cpumask_clear(policy->cpus);
 	up_write(&policy->rwsem);
 
 	cpufreq_policy_free(policy);
-- 
cgit v1.2.3


From 9d822ccf337299d08cf5121929013f3ac21932c9 Mon Sep 17 00:00:00 2001
From: Randy Dunlap
Date: Thu, 14 Jul 2022 18:59:03 -0700
Subject: cpufreq: loongson2: fix Kconfig "its" grammar

Use the possessive "its" instead of the contraction "it's"
where appropriate.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index c3038cdc6865..2a84fc63371e 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -268,7 +268,7 @@ config LOONGSON2_CPUFREQ
 	  This option adds a CPUFreq driver for loongson processors which
 	  support software configurable cpu frequency.
 
-	  Loongson2F and it's successors support this feature.
+	  Loongson2F and its successors support this feature.
 
 	  If in doubt, say N.
 
-- 
cgit v1.2.3


From 3e5c04f97c87f5061a37edad11a7b37c3fee2466 Mon Sep 17 00:00:00 2001
From: Zhao Liu
Date: Fri, 22 Jul 2022 10:50:24 +0800
Subject: cpufreq: ondemand: Use cpumask_var_t for on-stack cpu mask

A cpumask structure on the stack can cause a warning with
CONFIG_NR_CPUS=8192 (e.g. Ubuntu 22.04 uses this):

drivers/cpufreq/cpufreq_ondemand.c: In function 'od_set_powersave_bias':
drivers/cpufreq/cpufreq_ondemand.c:449:1: warning: the frame size of
	1032 bytes is larger than 1024 bytes [-Wframe-larger-than=]
  449 | }
      | ^

CONFIG_CPUMASK_OFFSTACK=y is enabled by default for most distros, and
hence we can work around the warning by using cpumask_var_t.

Signed-off-by: Zhao Liu <zhao1.liu@linux.intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_ondemand.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index e8fbf970ff07..c52d19d67557 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -416,10 +416,13 @@ static struct dbs_governor od_dbs_gov = {
 static void od_set_powersave_bias(unsigned int powersave_bias)
 {
 	unsigned int cpu;
-	cpumask_t done;
+	cpumask_var_t done;
+
+	if (!alloc_cpumask_var(&done, GFP_KERNEL))
+		return;
 
 	default_powersave_bias = powersave_bias;
-	cpumask_clear(&done);
+	cpumask_clear(done);
 
 	cpus_read_lock();
 	for_each_online_cpu(cpu) {
@@ -428,7 +431,7 @@ static void od_set_powersave_bias(unsigned int powersave_bias)
 		struct dbs_data *dbs_data;
 		struct od_dbs_tuners *od_tuners;
 
-		if (cpumask_test_cpu(cpu, &done))
+		if (cpumask_test_cpu(cpu, done))
 			continue;
 
 		policy = cpufreq_cpu_get_raw(cpu);
@@ -439,13 +442,15 @@ static void od_set_powersave_bias(unsigned int powersave_bias)
 		if (!policy_dbs)
 			continue;
 
-		cpumask_or(&done, &done, policy->cpus);
+		cpumask_or(done, done, policy->cpus);
 
 		dbs_data = policy_dbs->dbs_data;
 		od_tuners = dbs_data->tuners;
 		od_tuners->powersave_bias = default_powersave_bias;
 	}
 	cpus_read_unlock();
+
+	free_cpumask_var(done);
 }
 
 void od_register_powersave_bias_handler(unsigned int (*f)
-- 
cgit v1.2.3


From 1548fac47a114b42063def551eb152a536ed9697 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy
Date: Sat, 16 Jul 2022 09:26:55 +0300
Subject: intel_idle: make SPR C1 and C1E be independent

This patch partially reverts the changes made by the following commit:

da0e58c038e6 intel_idle: add 'preferred_cstates' module argument

As that commit describes, on early Sapphire Rapids Xeon platforms the C1 and
C1E states were mutually exclusive, so that users could only have either C1 and
C6, or C1E and C6.

However, Intel firmware engineers managed to remove this limitation and make C1
and C1E to be completely independent, just like on previous Xeon platforms.

Therefore, this patch:
 * Removes commentary describing the old, and now non-existing SPR C1E
   limitation.
 * Marks SPR C1E as available by default.
 * Removes the 'preferred_cstates' parameter handling for SPR. Both C1 and
   C1E will be available regardless of 'preferred_cstates' value.

We expect that all SPR systems are shipping with new firmware, which includes
the C1/C1E improvement.

Cc: v5.18+ <stable@vger.kernel.org> # v5.18+
Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/idle/intel_idle.c | 24 +-----------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

(limited to 'drivers')

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 424ef470223d..ba2b485a03ed 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -879,16 +879,6 @@ static struct cpuidle_state adl_l_cstates[] __initdata = {
 		.enter = NULL }
 };
 
-/*
- * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice
- * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in
- * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1
- * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then
- * both C1 and C1E requests end up with C1, so there is effectively no C1E.
- *
- * By default we enable C1 and disable C1E by marking it with
- * 'CPUIDLE_FLAG_UNUSABLE'.
- */
 static struct cpuidle_state spr_cstates[] __initdata = {
 	{
 		.name = "C1",
@@ -901,8 +891,7 @@ static struct cpuidle_state spr_cstates[] __initdata = {
 	{
 		.name = "C1E",
 		.desc = "MWAIT 0x01",
-		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE |
-					   CPUIDLE_FLAG_UNUSABLE,
+		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
 		.exit_latency = 2,
 		.target_residency = 4,
 		.enter = &intel_idle,
@@ -1724,17 +1713,6 @@ static void __init spr_idle_state_table_update(void)
 {
 	unsigned long long msr;
 
-	/* Check if user prefers C1E over C1. */
-	if ((preferred_states_mask & BIT(2)) &&
-	    !(preferred_states_mask & BIT(1))) {
-		/* Disable C1 and enable C1E. */
-		spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE;
-		spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE;
-
-		/* Enable C1E using the "C1E promotion" bit. */
-		c1e_promotion = C1E_PROMOTION_ENABLE;
-	}
-
 	/*
 	 * By default, the C6 state assumes the worst-case scenario of package
 	 * C6. However, if PC6 is disabled, we update the numbers to match
-- 
cgit v1.2.3


From b08b95cf30f53b674bdef510d4cfd0623199b036 Mon Sep 17 00:00:00 2001
From: Sumeet Pawnikar
Date: Tue, 26 Jul 2022 18:32:29 +0530
Subject: powercap: RAPL: Add Power Limit4 support for Alder Lake-N and Raptor
 Lake-P

Add Alder Lake-N and Raptor Lake-P to the list of processor models
for which Power Limit4 is supported by the Intel RAPL driver.

Signed-off-by: Sumeet Pawnikar <sumeet.r.pawnikar@intel.com>
Reviewed-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_msr.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers')

diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c
index 9d23984d8931..bc6adda58883 100644
--- a/drivers/powercap/intel_rapl_msr.c
+++ b/drivers/powercap/intel_rapl_msr.c
@@ -140,7 +140,9 @@ static const struct x86_cpu_id pl4_support_ids[] = {
 	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_TIGERLAKE_L, X86_FEATURE_ANY },
 	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ALDERLAKE, X86_FEATURE_ANY },
 	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ALDERLAKE_L, X86_FEATURE_ANY },
+	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_ALDERLAKE_N, X86_FEATURE_ANY },
 	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_RAPTORLAKE, X86_FEATURE_ANY },
+	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_RAPTORLAKE_P, X86_FEATURE_ANY },
 	{}
 };
 
-- 
cgit v1.2.3