-rw-r--r--Documentation/ABI/testing/sysfs-power45
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt22
-rw-r--r--Documentation/cpu-freq/cpufreq-stats.txt6
-rw-r--r--Documentation/cpu-freq/intel-pstate.txt54
-rw-r--r--Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt78
-rw-r--r--Documentation/devicetree/bindings/opp/opp.txt27
-rw-r--r--Documentation/devicetree/bindings/power/domain-idle-state.txt33
-rw-r--r--Documentation/devicetree/bindings/power/power_domain.txt43
-rw-r--r--Documentation/power/devices.txt14
-rw-r--r--Documentation/power/states.txt62
-rw-r--r--MAINTAINERS12
-rw-r--r--arch/arm/mach-imx/gpc.c17
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S9
-rw-r--r--arch/x86/power/hibernate_64.c94
-rw-r--r--drivers/acpi/processor_perflib.c55
-rw-r--r--drivers/acpi/sleep.c8
-rw-r--r--drivers/base/power/domain.c363
-rw-r--r--drivers/base/power/main.c2
-rw-r--r--drivers/base/power/opp/core.c521
-rw-r--r--drivers/base/power/opp/debugfs.c52
-rw-r--r--drivers/base/power/opp/of.c111
-rw-r--r--drivers/base/power/opp/opp.h23
-rw-r--r--drivers/base/power/power.h19
-rw-r--r--drivers/base/power/qos.c6
-rw-r--r--drivers/base/power/runtime.c62
-rw-r--r--drivers/base/power/sysfs.c6
-rw-r--r--drivers/base/power/wakeirq.c76
-rw-r--r--drivers/base/power/wakeup.c6
-rw-r--r--drivers/cpufreq/Kconfig.arm29
-rw-r--r--drivers/cpufreq/Makefile2
-rw-r--r--drivers/cpufreq/acpi-cpufreq.c117
-rw-r--r--drivers/cpufreq/brcmstb-avs-cpufreq.c1057
-rw-r--r--drivers/cpufreq/cppc_cpufreq.c7
-rw-r--r--drivers/cpufreq/cpufreq-dt-platdev.c15
-rw-r--r--drivers/cpufreq/cpufreq-dt.c12
-rw-r--r--drivers/cpufreq/cpufreq.c25
-rw-r--r--drivers/cpufreq/cpufreq_conservative.c46
-rw-r--r--drivers/cpufreq/cpufreq_governor.c30
-rw-r--r--drivers/cpufreq/cpufreq_governor.h5
-rw-r--r--drivers/cpufreq/cpufreq_ondemand.c17
-rw-r--r--drivers/cpufreq/cpufreq_stats.c22
-rw-r--r--drivers/cpufreq/integrator-cpufreq.c239
-rw-r--r--drivers/cpufreq/intel_pstate.c826
-rw-r--r--drivers/cpufreq/powernv-cpufreq.c65
-rw-r--r--drivers/cpuidle/cpuidle-powernv.c2
-rw-r--r--drivers/cpuidle/cpuidle.c19
-rw-r--r--drivers/cpuidle/dt_idle_states.c6
-rw-r--r--drivers/cpuidle/governor.c4
-rw-r--r--drivers/cpuidle/governors/ladder.c2
-rw-r--r--drivers/cpuidle/governors/menu.c2
-rw-r--r--drivers/cpuidle/sysfs.c4
-rw-r--r--drivers/devfreq/devfreq.c2
-rw-r--r--drivers/devfreq/event/exynos-nocp.c1
-rw-r--r--drivers/devfreq/event/exynos-ppmu.c6
-rw-r--r--drivers/devfreq/event/rockchip-dfi.c1
-rw-r--r--drivers/devfreq/exynos-bus.c29
-rw-r--r--drivers/devfreq/rk3399_dmc.c15
-rw-r--r--drivers/idle/intel_idle.c154
-rw-r--r--drivers/net/ethernet/smsc/smsc911x.c6
-rw-r--r--drivers/power/avs/rockchip-io-domain.c2
-rw-r--r--drivers/powercap/intel_rapl.c389
-rw-r--r--drivers/thermal/intel_powerclamp.c359
-rw-r--r--include/acpi/processor.h3
-rw-r--r--include/linux/cpu.h2
-rw-r--r--include/linux/cpufreq.h6
-rw-r--r--include/linux/cpuidle.h9
-rw-r--r--include/linux/pm_domain.h28
-rw-r--r--include/linux/pm_opp.h72
-rw-r--r--include/linux/pm_runtime.h11
-rw-r--r--include/linux/sched.h3
-rw-r--r--include/linux/suspend.h2
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/power/main.c88
-rw-r--r--kernel/power/power.h6
-rw-r--r--kernel/power/suspend.c69
-rw-r--r--kernel/sched/core.c1
-rw-r--r--kernel/sched/cpufreq_schedutil.c119
-rw-r--r--kernel/sched/idle.c175
-rw-r--r--mm/kasan/kasan.c9
79 files changed, 4399 insertions, 1549 deletions
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index 50b368d490b5..f523e5a3ac33 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -7,30 +7,35 @@ Description:
subsystem.
What: /sys/power/state
-Date: May 2014
+Date: November 2016
Contact: Rafael J. Wysocki <rjw@rjwysocki.net>
Description:
The /sys/power/state file controls system sleep states.
Reading from this file returns the available sleep state
- labels, which may be "mem", "standby", "freeze" and "disk"
- (hibernation). The meanings of the first three labels depend on
- the relative_sleep_states command line argument as follows:
- 1) relative_sleep_states = 1
- "mem", "standby", "freeze" represent non-hibernation sleep
- states from the deepest ("mem", always present) to the
- shallowest ("freeze"). "standby" and "freeze" may or may
- not be present depending on the capabilities of the
- platform. "freeze" can only be present if "standby" is
- present.
- 2) relative_sleep_states = 0 (default)
- "mem" - "suspend-to-RAM", present if supported.
- "standby" - "power-on suspend", present if supported.
- "freeze" - "suspend-to-idle", always present.
-
- Writing to this file one of these strings causes the system to
- transition into the corresponding state, if available. See
- Documentation/power/states.txt for a description of what
- "suspend-to-RAM", "power-on suspend" and "suspend-to-idle" mean.
+ labels, which may be "mem" (suspend), "standby" (power-on
+ suspend), "freeze" (suspend-to-idle) and "disk" (hibernation).
+
+ Writing one of the above strings to this file causes the system
+ to transition into the corresponding state, if available.
+
+ See Documentation/power/states.txt for more information.
+
+What: /sys/power/mem_sleep
+Date: November 2016
+Contact: Rafael J. Wysocki <rjw@rjwysocki.net>
+Description:
+ The /sys/power/mem_sleep file controls the operating mode of
+ system suspend. Reading from it returns the available modes
+ as "s2idle" (always present), "shallow" and "deep" (present if
+ supported). The mode that will be used on subsequent attempts
+ to suspend the system (by writing "mem" to the /sys/power/state
+ file described above) is enclosed in square brackets.
+
+ Writing one of the above strings to this file causes the mode
+ represented by it to be used on subsequent attempts to suspend
+ the system.
+
+ See Documentation/power/states.txt for more information.
What: /sys/power/disk
Date: September 2006
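For illustration, a minimal userspace sketch (hypothetical code, not part of this series) that selects the suspend mode through the new /sys/power/mem_sleep file and then triggers suspend through /sys/power/state:

/*
 * Minimal sketch: pick Suspend-To-RAM as the mode behind "mem" (if the
 * platform supports it), then suspend.  Error handling is abbreviated.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* Select the suspend mode behind "mem" (skipped if unsupported). */
	if (write_str("/sys/power/mem_sleep", "deep"))
		perror("mem_sleep");
	/* Trigger the suspend itself. */
	if (write_str("/sys/power/state", "mem"))
		perror("state");
	return 0;
}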
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 62d68b2056de..be2d6d0a03a4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1560,6 +1560,12 @@
disable
Do not enable intel_pstate as the default
scaling driver for the supported processors
+ passive
+ Use intel_pstate as a scaling driver, but configure it
+ to work with generic cpufreq governors (instead of
+ enabling its internal governor). This mode cannot be
+ used along with the hardware-managed P-states (HWP)
+ feature.
force
Enable intel_pstate on systems that prohibit it by default
in favor of acpi-cpufreq. Forcing the intel_pstate driver
@@ -1580,6 +1586,9 @@
Description Table, specifies preferred power management
profile as "Enterprise Server" or "Performance Server",
then this feature is turned on by default.
+ per_cpu_perf_limits
+			Allow per-logical-CPU P-State performance control limits using
+			the cpufreq sysfs interface
intremap= [X86-64, Intel-IOMMU]
on enable Interrupt Remapping (default)
@@ -2122,6 +2131,12 @@
memory contents and reserves bad memory
regions that are detected.
+ mem_sleep_default= [SUSPEND] Default system suspend mode:
+ s2idle - Suspend-To-Idle
+ shallow - Power-On Suspend or equivalent (if supported)
+ deep - Suspend-To-RAM or equivalent (if supported)
+ See Documentation/power/states.txt.
+
meye.*= [HW] Set MotionEye Camera parameters
See Documentation/video4linux/meye.txt.
@@ -3475,13 +3490,6 @@
[KNL, SMP] Set scheduler's default relax_domain_level.
See Documentation/cgroup-v1/cpusets.txt.
- relative_sleep_states=
- [SUSPEND] Use sleep state labeling where the deepest
- state available other than hibernation is always "mem".
- Format: { "0" | "1" }
- 0 -- Traditional sleep state labels.
- 1 -- Relative sleep state labels.
-
reserve= [KNL,BUGS] Force the kernel to ignore some iomem area
reservetop= [X86-32]
diff --git a/Documentation/cpu-freq/cpufreq-stats.txt b/Documentation/cpu-freq/cpufreq-stats.txt
index 8d9773f23550..3c355f6ad834 100644
--- a/Documentation/cpu-freq/cpufreq-stats.txt
+++ b/Documentation/cpu-freq/cpufreq-stats.txt
@@ -44,11 +44,17 @@ the stats driver insertion.
total 0
drwxr-xr-x 2 root root 0 May 14 16:06 .
drwxr-xr-x 3 root root 0 May 14 15:58 ..
+--w------- 1 root root 4096 May 14 16:06 reset
-r--r--r-- 1 root root 4096 May 14 16:06 time_in_state
-r--r--r-- 1 root root 4096 May 14 16:06 total_trans
-r--r--r-- 1 root root 4096 May 14 16:06 trans_table
--------------------------------------------------------------------------------
+- reset
+Write-only attribute that can be used to reset the stat counters. This can be
+useful for evaluating system behaviour under different governors without the
+need for a reboot.
+
- time_in_state
This gives the amount of time spent in each of the frequencies supported by
this CPU. The cat output will have a "<frequency> <time>" pair on each line, which
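As an aside, a short userspace sketch of the new attribute (hypothetical; it assumes stats support is enabled and that writing any value, e.g. "1", performs the reset):

/* Sketch: clear cpu0's cpufreq statistics, then dump time_in_state. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define STATS "/sys/devices/system/cpu/cpu0/cpufreq/stats/"

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	fd = open(STATS "reset", O_WRONLY);
	if (fd >= 0) {
		write(fd, "1", 1);	/* assumed: the value itself is ignored */
		close(fd);
	}

	fd = open(STATS "time_in_state", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);	/* "<frequency> <time>" pairs */
		}
		close(fd);
	}
	return 0;
}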
diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt
index e6bd1e6512a5..1953994ef5e6 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -48,7 +48,7 @@ In addition to the frequency-controlling interfaces provided by the cpufreq
core, the driver provides its own sysfs files to control the P-State selection.
These files have been added to /sys/devices/system/cpu/intel_pstate/.
Any changes made to these files are applicable to all CPUs (even in a
-multi-package system).
+multi-package system; refer to the "Per-CPU limits" section below).
max_perf_pct: Limits the maximum P-State that will be requested by
the driver. It states it as a percentage of the available performance. The
@@ -120,13 +120,57 @@ frequency is fictional for Intel Core processors. Even if the scaling
driver selects a single P-State, the actual frequency the processor
will run at is selected by the processor itself.
+Per-CPU limits
+
+The kernel command line option "intel_pstate=per_cpu_perf_limits" forces
+the intel_pstate driver to use per-CPU performance limits. When it is set,
+the sysfs control interface described above is subject to limitations.
+- The following controls are not available for either read or write:
+ /sys/devices/system/cpu/intel_pstate/max_perf_pct
+ /sys/devices/system/cpu/intel_pstate/min_perf_pct
+- The following controls can be used to set performance limits, as far as the
+architecture of the processor permits:
+ /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq
+ /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq
+ /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+- Users can still observe the turbo percent and the number of P-States from
+ /sys/devices/system/cpu/intel_pstate/turbo_pct
+ /sys/devices/system/cpu/intel_pstate/num_pstates
+- Users can read and write the system-wide turbo status via
+	/sys/devices/system/cpu/intel_pstate/no_turbo
+
+Support of energy performance hints
+It is possible to provide hints to the HWP algorithms in the processor
+to be more performance centric to more energy centric. When the driver
+is using HWP, two additional cpufreq sysfs attributes are presented for
+each logical CPU.
+These attributes are:
+ - energy_performance_available_preferences
+ - energy_performance_preference
+
+To get the list of supported hints:
+$ cat energy_performance_available_preferences
+ default performance balance_performance balance_power power
+
+The current preference can be read or changed via the cpufreq sysfs
+attribute "energy_performance_preference". Reading from this attribute
+will display the current effective setting. Users can write any of the
+valid preference strings to this attribute, and can always restore the
+power-on default by writing "default".
+
+Since threads can migrate between CPUs, it is possible that the new CPU
+has a different energy performance preference than the previous one. To
+avoid such issues, either pin threads to specific CPUs or set the same
+energy performance preference value on all CPUs.
+
Tuning Intel P-State driver
-When HWP mode is not used, debugfs files have also been added to allow the
-tuning of the internal governor algorithm. These files are located at
-/sys/kernel/debug/pstate_snb/. The algorithm uses a PID (Proportional
-Integral Derivative) controller. The PID tunable parameters are:
+When performance can be tuned using a PID (Proportional Integral
+Derivative) controller, debugfs files are provided for adjusting its
+parameters. They are presented under:
+/sys/kernel/debug/pstate_snb/
+The PID tunable parameters are:
deadband
d_gain_pct
i_gain_pct
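To make the energy/performance hint flow above concrete, a hedged userspace sketch (assumes HWP is active and the attributes exist for cpu0):

/* Sketch: list the available HWP preferences for cpu0, then pick one. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define CPUFREQ "/sys/devices/system/cpu/cpu0/cpufreq/"

int main(void)
{
	char buf[256];
	ssize_t n;
	int fd;

	fd = open(CPUFREQ "energy_performance_available_preferences", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("available: %s", buf);
		}
		close(fd);
	}

	/* Bias cpu0 towards saving energy; writing "default" restores it. */
	fd = open(CPUFREQ "energy_performance_preference", O_WRONLY);
	if (fd >= 0) {
		write(fd, "balance_power", strlen("balance_power"));
		close(fd);
	}
	return 0;
}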
diff --git a/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt b/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
new file mode 100644
index 000000000000..af2385795d78
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
@@ -0,0 +1,78 @@
+Broadcom AVS mail box and interrupt register bindings
+=====================================================
+
+A total of three DT nodes are required. One node (brcm,avs-cpu-data-mem)
+references the mailbox register used to communicate with the AVS CPU[1]. The
+second node (brcm,avs-cpu-l2-intr) is required to trigger an interrupt on
+the AVS CPU. The interrupt tells the AVS CPU that it needs to process a
+command sent to it by a driver. Interrupting the AVS CPU is mandatory for
+commands to be processed.
+
+The interface also requires a reference to the AVS host interrupt controller,
+so a driver can react to interrupts generated by the AVS CPU whenever a command
+has been processed. See [2] for more information on the brcm,l2-intc node.
+
+[1] The AVS CPU is an independent co-processor that runs proprietary
+firmware. On some SoCs, this firmware supports DFS and DVFS in addition to
+Adaptive Voltage Scaling.
+
+[2] Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.txt
+
+
+Node brcm,avs-cpu-data-mem
+--------------------------
+
+Required properties:
+- compatible: must include: brcm,avs-cpu-data-mem and
+ should include: one of brcm,bcm7271-avs-cpu-data-mem or
+ brcm,bcm7268-avs-cpu-data-mem
+- reg: Specifies base physical address and size of the registers.
+- interrupts: The interrupt that the AVS CPU will use to interrupt the host
+ when a command has completed.
+- interrupt-parent: The interrupt controller the above interrupt is routed
+ through.
+- interrupt-names: The name of the interrupt used to interrupt the host.
+
+Optional properties:
+- None
+
+Node brcm,avs-cpu-l2-intr
+-------------------------
+
+Required properties:
+- compatible: must include: brcm,avs-cpu-l2-intr and
+ should include: one of brcm,bcm7271-avs-cpu-l2-intr or
+ brcm,bcm7268-avs-cpu-l2-intr
+- reg: Specifies base physical address and size of the registers.
+
+Optional properties:
+- None
+
+
+Example
+=======
+
+ avs_host_l2_intc: interrupt-controller@f04d1200 {
+ #interrupt-cells = <1>;
+ compatible = "brcm,l2-intc";
+ interrupt-parent = <&intc>;
+ reg = <0xf04d1200 0x48>;
+ interrupt-controller;
+ interrupts = <0x0 0x19 0x0>;
+ interrupt-names = "avs";
+ };
+
+ avs-cpu-data-mem@f04c4000 {
+ compatible = "brcm,bcm7271-avs-cpu-data-mem",
+ "brcm,avs-cpu-data-mem";
+ reg = <0xf04c4000 0x60>;
+ interrupts = <0x1a>;
+ interrupt-parent = <&avs_host_l2_intc>;
+ interrupt-names = "sw_intr";
+ };
+
+ avs-cpu-l2-intr@f04d1100 {
+ compatible = "brcm,bcm7271-avs-cpu-l2-intr",
+ "brcm,avs-cpu-l2-intr";
+ reg = <0xf04d1100 0x10>;
+ };
diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
index ee91cbdd95ee..9f5ca4457b5f 100644
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -86,8 +86,14 @@ Optional properties:
Single entry is for target voltage and three entries are for <target min max>
voltages.
- Entries for multiple regulators must be present in the same order as
- regulators are specified in device's DT node.
+ Entries for multiple regulators shall be provided in the same field,
+ separated by angle brackets <>. The OPP binding doesn't provide any means to
+ relate the values to their power supplies or to the order in which the
+ supplies need to be configured; that is left to the implementation-specific
+ binding.
+
+ Entries for all regulators shall be of the same size, i.e. either all use a
+ single value or triplets.
- opp-microvolt-<name>: Named opp-microvolt property. This is exactly similar to
the above opp-microvolt property, but allows multiple voltage ranges to be
@@ -104,10 +110,13 @@ Optional properties:
Should only be set if opp-microvolt is set for the OPP.
- Entries for multiple regulators must be present in the same order as
- regulators are specified in device's DT node. If this property isn't required
- for few regulators, then this should be marked as zero for them. If it isn't
- required for any regulator, then this property need not be present.
+ Entries for multiple regulators shall be provided in the same field,
+ separated by angle brackets <>. If current values aren't required for a
+ regulator, the entry shall be filled with 0. If current values aren't
+ required for any of the regulators, this field is not required at all. The
+ OPP binding doesn't provide any means to relate the values to their power
+ supplies or to the order in which the supplies need to be configured; that
+ is left to the implementation-specific binding.
- opp-microamp-<name>: Named opp-microamp property. Similar to
opp-microvolt-<name> property, but for microamp instead.
@@ -386,10 +395,12 @@ Example 4: Handling multiple regulators
/ {
cpus {
cpu@0 {
- compatible = "arm,cortex-a7";
+ compatible = "vendor,cpu-type";
...
- cpu-supply = <&cpu_supply0>, <&cpu_supply1>, <&cpu_supply2>;
+ vcc0-supply = <&cpu_supply0>;
+ vcc1-supply = <&cpu_supply1>;
+ vcc2-supply = <&cpu_supply2>;
operating-points-v2 = <&cpu0_opp_table>;
};
};
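On the kernel side, the supply order that the binding deliberately leaves open is fixed by the driver when it registers the regulators with the OPP core. A hedged sketch matching the vcc0/vcc1/vcc2 example above, using the dev_pm_opp_set_regulators() helper added alongside these changes (the function name foo_cpufreq_init and the exact call signature are assumptions here):

/*
 * Sketch: fix the order in which the OPP core interprets the
 * opp-microvolt/opp-microamp entries.  The names must match the
 * *-supply properties of the device node.
 */
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/pm_opp.h>

static int foo_cpufreq_init(struct device *cpu_dev)
{
	static const char * const names[] = { "vcc0", "vcc1", "vcc2" };
	struct opp_table *opp_table;

	opp_table = dev_pm_opp_set_regulators(cpu_dev, names,
					      ARRAY_SIZE(names));
	if (IS_ERR(opp_table))
		return PTR_ERR(opp_table);

	return 0;
}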
diff --git a/Documentation/devicetree/bindings/power/domain-idle-state.txt b/Documentation/devicetree/bindings/power/domain-idle-state.txt
new file mode 100644
index 000000000000..eefc7ed22ca2
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/domain-idle-state.txt
@@ -0,0 +1,33 @@
+PM Domain Idle State Node:
+
+A domain idle state node represents the state parameters that will be used to
+select the state when there are no active components in the domain.
+
+The state node has the following parameters -
+
+- compatible:
+ Usage: Required
+ Value type: <string>
+ Definition: Must be "domain-idle-state".
+
+- entry-latency-us
+ Usage: Required
+ Value type: <prop-encoded-array>
+ Definition: u32 value representing worst case latency in
+ microseconds required to enter the idle state.
+ The exit-latency-us duration may be guaranteed
+ only after entry-latency-us has passed.
+
+- exit-latency-us
+ Usage: Required
+ Value type: <prop-encoded-array>
+ Definition: u32 value representing worst case latency
+ in microseconds required to exit the idle state.
+
+- min-residency-us
+ Usage: Required
+ Value type: <prop-encoded-array>
+ Definition: u32 value representing minimum residency duration
+ in microseconds after which the idle state will yield
+ power benefits after overcoming the overhead of entering
+ the idle state.
diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt
index 025b5e7df61c..723e1ad937da 100644
--- a/Documentation/devicetree/bindings/power/power_domain.txt
+++ b/Documentation/devicetree/bindings/power/power_domain.txt
@@ -29,6 +29,15 @@ Optional properties:
specified by this binding. More details about power domain specifier are
available in the next section.
+- domain-idle-states : A list of phandles to idle-state nodes that shall be
+ used as generic domain power states. The idle state definitions are
+ compatible with domain-idle-state specified in [1].
+ The domain-idle-state property reflects the idle state of this PM domain and
+ not the idle states of the devices or sub-domains in the PM domain. Devices
+ and sub-domains have their own idle-states independent of the parent
+ domain's idle states. In the absence of this property, the domain would be
+ considered as capable of being powered-on or powered-off.
+
Example:
power: power-controller@12340000 {
@@ -59,6 +68,38 @@ The nodes above define two power controllers: 'parent' and 'child'.
Domains created by the 'child' power controller are subdomains of '0' power
domain provided by the 'parent' power controller.
+Example 3:
+ parent: power-controller@12340000 {
+ compatible = "foo,power-controller";
+ reg = <0x12340000 0x1000>;
+ #power-domain-cells = <0>;
+ domain-idle-states = <&DOMAIN_RET>, <&DOMAIN_PWR_DN>;
+ };
+
+ child: power-controller@12341000 {
+ compatible = "foo,power-controller";
+ reg = <0x12341000 0x1000>;
+ power-domains = <&parent 0>;
+ #power-domain-cells = <0>;
+ domain-idle-states = <&DOMAIN_PWR_DN>;
+ };
+
+ DOMAIN_RET: state@0 {
+ compatible = "domain-idle-state";
+ reg = <0x0>;
+ entry-latency-us = <1000>;
+ exit-latency-us = <2000>;
+ min-residency-us = <10000>;
+ };
+
+ DOMAIN_PWR_DN: state@1 {
+ compatible = "domain-idle-state";
+ reg = <0x1>;
+ entry-latency-us = <5000>;
+ exit-latency-us = <8000>;
+ min-residency-us = <7000>;
+ };
+
==PM domain consumers==
Required properties:
@@ -76,3 +117,5 @@ Example:
The node above defines a typical PM domain consumer device, which is located
inside a PM domain with index 0 of a power controller represented by a node
with the label "power".
+
+[1]. Documentation/devicetree/bindings/power/domain-idle-state.txt
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index 8ba6625fdd63..73ddea39a9ce 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -607,7 +607,9 @@ individually. Instead, a set of devices sharing a power resource can be put
into a low-power state together at the same time by turning off the shared
power resource. Of course, they also need to be put into the full-power state
together, by turning the shared power resource on. A set of devices with this
-property is often referred to as a power domain.
+property is often referred to as a power domain. A power domain may also be
+nested inside another power domain. The nested domain is referred to as the
+sub-domain of the parent domain.
Support for power domains is provided through the pm_domain field of struct
device. This field is a pointer to an object of type struct dev_pm_domain,
@@ -629,6 +631,16 @@ support for power domains into subsystem-level callbacks, for example by
modifying the platform bus type. Other platforms need not implement it or take
it into account in any way.
+Devices may be defined as IRQ-safe, which indicates to the PM core that their
+runtime PM callbacks may be invoked with interrupts disabled (see
+Documentation/power/runtime_pm.txt for more information). If an IRQ-safe
+device belongs to a PM domain, the runtime PM of the domain will be
+disallowed, unless the domain itself is defined as IRQ-safe. However, it
+makes sense to define a PM domain as IRQ-safe only if all the devices in it
+are IRQ-safe. Moreover, if an IRQ-safe domain has a parent domain, the
+runtime PM of the parent is only allowed if the parent itself is IRQ-safe
+too, with the additional restriction that all child domains of an IRQ-safe
+parent must also be IRQ-safe.
Device Low Power (suspend) States
---------------------------------
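A hedged kernel-side sketch of the IRQ-safe rules above, built around the GENPD_FLAG_IRQ_SAFE flag this series introduces (domain and function names are illustrative, API as of this series):

/*
 * Sketch: an IRQ-safe PM domain.  Its lock becomes a spinlock rather
 * than a mutex, so its devices may use runtime PM with IRQs disabled;
 * every device added to it must itself be marked IRQ-safe.
 */
#include <linux/pm_domain.h>
#include <linux/pm_runtime.h>

static struct generic_pm_domain foo_pd = {
	.name	= "foo",
	.flags	= GENPD_FLAG_IRQ_SAFE,
};

static int foo_probe(struct device *dev)
{
	int ret;

	ret = pm_genpd_init(&foo_pd, NULL, true);
	if (ret)
		return ret;

	pm_runtime_irq_safe(dev);	/* callbacks may run with IRQs off */
	return pm_genpd_add_device(&foo_pd, dev);
}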
diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt
index 50f3ef9177c1..8a39ce45d8a0 100644
--- a/Documentation/power/states.txt
+++ b/Documentation/power/states.txt
@@ -8,25 +8,43 @@ for each state.
The states are represented by strings that can be read or written to the
/sys/power/state file. Those strings may be "mem", "standby", "freeze" and
-"disk", where the last one always represents hibernation (Suspend-To-Disk) and
-the meaning of the remaining ones depends on the relative_sleep_states command
-line argument.
-
-For relative_sleep_states=1, the strings "mem", "standby" and "freeze" label the
-available non-hibernation sleep states from the deepest to the shallowest,
-respectively. In that case, "mem" is always present in /sys/power/state,
-because there is at least one non-hibernation sleep state in every system. If
-the given system supports two non-hibernation sleep states, "standby" is present
-in /sys/power/state in addition to "mem". If the system supports three
-non-hibernation sleep states, "freeze" will be present in /sys/power/state in
-addition to "mem" and "standby".
-
-For relative_sleep_states=0, which is the default, the following descriptions
-apply.
-
-state: Suspend-To-Idle
+"disk", where the last three always represent Power-On Suspend (if supported),
+Suspend-To-Idle and hibernation (Suspend-To-Disk), respectively.
+
+The meaning of the "mem" string is controlled by the /sys/power/mem_sleep file.
+It contains strings representing the available modes of system suspend that may
+be triggered by writing "mem" to /sys/power/state. These modes are "s2idle"
+(Suspend-To-Idle), "shallow" (Power-On Suspend) and "deep" (Suspend-To-RAM).
+The "s2idle" mode is always available, while the other ones are only available
+if supported by the platform (if not supported, the strings representing them
+are not present in /sys/power/mem_sleep). The string representing the suspend
+mode to be used subsequently is enclosed in square brackets. Writing one of
+the other strings present in /sys/power/mem_sleep to it changes the suspend
+mode to be used subsequently to the one represented by that string.
+
+Consequently, there are two ways to cause the system to go into the
+Suspend-To-Idle sleep state. The first one is to write "freeze" directly to
+/sys/power/state. The second one is to write "s2idle" to /sys/power/mem_sleep
+and then to write "mem" to /sys/power/state. Similarly, there are two ways
+to cause the system to go into the Power-On Suspend sleep state (the strings to
+write to the control files in that case are "standby" or "shallow" and "mem",
+respectively) if that state is supported by the platform. In turn, there is
+only one way to cause the system to go into the Suspend-To-RAM state (write
+"deep" into /sys/power/mem_sleep and "mem" into /sys/power/state).
+
+The default suspend mode (i.e. the one to be used without writing anything into
+/sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or
+"s2idle", but it can be overridden by the value of the "mem_sleep_default"
+parameter in the kernel command line. On some ACPI-based systems, depending on
+the information in the FADT, the default may be "s2idle" even if Suspend-To-RAM
+is supported.
+
+The properties of all of the sleep states are described below.
+
+
+State: Suspend-To-Idle
ACPI state: S0
-Label: "freeze"
+Label: "s2idle" ("freeze")
This state is a generic, pure software, light-weight, system sleep state.
It allows more energy to be saved relative to runtime idle by freezing user
@@ -35,13 +53,13 @@ lower-power than available at run time), such that the processors can
spend more time in their idle states.
This state can be used for platforms without Power-On Suspend/Suspend-to-RAM
-support, or it can be used in addition to Suspend-to-RAM (memory sleep)
-to provide reduced resume latency. It is always supported.
+support, or it can be used in addition to Suspend-to-RAM to provide reduced
+resume latency. It is always supported.
State: Standby / Power-On Suspend
ACPI State: S1
-Label: "standby"
+Label: "shallow" ("standby")
This state, if supported, offers moderate, though real, power savings, while
providing a relatively low-latency transition back to a working system. No
@@ -58,7 +76,7 @@ state.
State: Suspend-to-RAM
ACPI State: S3
-Label: "mem"
+Label: "deep"
This state, if supported, offers significant power savings as everything in the
system is put into a low-power state, except for memory, which should be placed
diff --git a/MAINTAINERS b/MAINTAINERS
index 3d7d66cdb44c..34ef63763566 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2764,6 +2764,14 @@ L: bcm-kernel-feedback-list@broadcom.com
S: Maintained
F: drivers/mtd/nand/brcmnand/
+BROADCOM STB AVS CPUFREQ DRIVER
+M: Markus Mayer <mmayer@broadcom.com>
+M: bcm-kernel-feedback-list@broadcom.com
+L: linux-pm@vger.kernel.org
+S: Maintained
+F: Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
+F: drivers/cpufreq/brcmstb*
+
BROADCOM SPECIFIC AMBA DRIVER (BCMA)
M: Rafał Miłecki <zajec5@gmail.com>
L: linux-wireless@vger.kernel.org
@@ -3356,6 +3364,7 @@ L: linux-pm@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
T: git git://git.linaro.org/people/vireshk/linux.git (For ARM Updates)
+B: https://bugzilla.kernel.org
F: Documentation/cpu-freq/
F: drivers/cpufreq/
F: include/linux/cpufreq.h
@@ -3395,6 +3404,7 @@ M: Daniel Lezcano <daniel.lezcano@linaro.org>
L: linux-pm@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
+B: https://bugzilla.kernel.org
F: drivers/cpuidle/*
F: include/linux/cpuidle.h
@@ -6362,9 +6372,11 @@ S: Maintained
F: drivers/platform/x86/intel-vbtn.c
INTEL IDLE DRIVER
+M: Jacob Pan <jacob.jun.pan@linux.intel.com>
M: Len Brown <lenb@kernel.org>
L: linux-pm@vger.kernel.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git
+B: https://bugzilla.kernel.org
S: Supported
F: drivers/idle/intel_idle.c
diff --git a/arch/arm/mach-imx/gpc.c b/arch/arm/mach-imx/gpc.c
index b54db47f6f32..1dc2a34b9dbd 100644
--- a/arch/arm/mach-imx/gpc.c
+++ b/arch/arm/mach-imx/gpc.c
@@ -380,13 +380,6 @@ static struct pu_domain imx6q_pu_domain = {
.name = "PU",
.power_off = imx6q_pm_pu_power_off,
.power_on = imx6q_pm_pu_power_on,
- .states = {
- [0] = {
- .power_off_latency_ns = 25000,
- .power_on_latency_ns = 2000000,
- },
- },
- .state_count = 1,
},
};
@@ -430,6 +423,16 @@ static int imx_gpc_genpd_init(struct device *dev, struct regulator *pu_reg)
if (!IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS))
return 0;
+ imx6q_pu_domain.base.states = devm_kzalloc(dev,
+ sizeof(*imx6q_pu_domain.base.states),
+ GFP_KERNEL);
+ if (!imx6q_pu_domain.base.states)
+ return -ENOMEM;
+
+ imx6q_pu_domain.base.states[0].power_off_latency_ns = 25000;
+ imx6q_pu_domain.base.states[0].power_on_latency_ns = 2000000;
+ imx6q_pu_domain.base.state_count = 1;
+
for (i = 0; i < ARRAY_SIZE(imx_gpc_domains); i++)
pm_genpd_init(imx_gpc_domains[i], NULL, false);
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 169963f471bb..50b8ed0317a3 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel)
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
+#ifdef CONFIG_KASAN
+ /*
+ * The suspend path may have poisoned some areas deeper in the stack,
+ * which we now need to unpoison.
+ */
+ movq %rsp, %rdi
+ call kasan_unpoison_task_stack_below
+#endif
+
xorl %eax, %eax
addq $8, %rsp
FRAME_END
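The helper called here is added by the mm/kasan/kasan.c change in this series; a hedged reconstruction of its shape (not verbatim):

/*
 * Sketch: unpoison the current task's stack below the given watermark
 * (the stack pointer at the call site), so poison left behind by the
 * deeper suspend call chain cannot trigger false positives on resume.
 */
#include <linux/kasan.h>
#include <linux/sched.h>

asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
{
	void *base = task_stack_page(current);

	kasan_unpoison_shadow(base, watermark - base);
}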
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 9634557a5444..ded2e8272382 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -11,6 +11,10 @@
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/suspend.h>
+#include <linux/scatterlist.h>
+#include <linux/kdebug.h>
+
+#include <crypto/hash.h>
#include <asm/init.h>
#include <asm/proto.h>
@@ -177,14 +181,86 @@ int pfn_is_nosave(unsigned long pfn)
return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
}
+#define MD5_DIGEST_SIZE 16
+
struct restore_data_record {
unsigned long jump_address;
unsigned long jump_address_phys;
unsigned long cr3;
unsigned long magic;
+ u8 e820_digest[MD5_DIGEST_SIZE];
};
-#define RESTORE_MAGIC 0x123456789ABCDEF0UL
+#define RESTORE_MAGIC 0x23456789ABCDEF01UL
+
+#if IS_BUILTIN(CONFIG_CRYPTO_MD5)
+/**
+ * get_e820_md5 - calculate md5 according to given e820 map
+ *
+ * @map: the e820 map to be calculated
+ * @buf: the md5 result to be stored to
+ */
+static int get_e820_md5(struct e820map *map, void *buf)
+{
+ struct scatterlist sg;
+ struct crypto_ahash *tfm;
+ int size;
+ int ret = 0;
+
+ tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ return -ENOMEM;
+
+ {
+ AHASH_REQUEST_ON_STACK(req, tfm);
+ size = offsetof(struct e820map, map)
+ + sizeof(struct e820entry) * map->nr_map;
+ ahash_request_set_tfm(req, tfm);
+ sg_init_one(&sg, (u8 *)map, size);
+ ahash_request_set_callback(req, 0, NULL, NULL);
+ ahash_request_set_crypt(req, &sg, buf, size);
+
+ if (crypto_ahash_digest(req))
+ ret = -EINVAL;
+ ahash_request_zero(req);
+ }
+ crypto_free_ahash(tfm);
+
+ return ret;
+}
+
+static void hibernation_e820_save(void *buf)
+{
+ get_e820_md5(e820_saved, buf);
+}
+
+static bool hibernation_e820_mismatch(void *buf)
+{
+ int ret;
+ u8 result[MD5_DIGEST_SIZE];
+
+ memset(result, 0, MD5_DIGEST_SIZE);
+ /* If there is no digest in suspend kernel, let it go. */
+ if (!memcmp(result, buf, MD5_DIGEST_SIZE))
+ return false;
+
+ ret = get_e820_md5(e820_saved, result);
+ if (ret)
+ return true;
+
+ return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false;
+}
+#else
+static void hibernation_e820_save(void *buf)
+{
+}
+
+static bool hibernation_e820_mismatch(void *buf)
+{
+ /* If md5 is not builtin for restore kernel, let it go. */
+ return false;
+}
+#endif
/**
* arch_hibernation_header_save - populate the architecture specific part
@@ -201,6 +277,9 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
rdr->jump_address_phys = __pa_symbol(&restore_registers);
rdr->cr3 = restore_cr3;
rdr->magic = RESTORE_MAGIC;
+
+ hibernation_e820_save(rdr->e820_digest);
+
return 0;
}
@@ -216,5 +295,16 @@ int arch_hibernation_header_restore(void *addr)
restore_jump_address = rdr->jump_address;
jump_address_phys = rdr->jump_address_phys;
restore_cr3 = rdr->cr3;
- return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL;
+
+ if (rdr->magic != RESTORE_MAGIC) {
+ pr_crit("Unrecognized hibernate image header format!\n");
+ return -EINVAL;
+ }
+
+ if (hibernation_e820_mismatch(rdr->e820_digest)) {
+ pr_crit("Hibernate inconsistent memory map detected!\n");
+ return -ENODEV;
+ }
+
+ return 0;
}
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index bb01dea39fdc..f0b4a981b8d3 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -157,7 +157,7 @@ static void acpi_processor_ppc_ost(acpi_handle handle, int status)
status, NULL);
}
-int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
+void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
{
int ret;
@@ -168,7 +168,7 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
*/
if (event_flag)
acpi_processor_ppc_ost(pr->handle, 1);
- return 0;
+ return;
}
ret = acpi_processor_get_platform_limit(pr);
@@ -182,10 +182,8 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
else
acpi_processor_ppc_ost(pr->handle, 0);
}
- if (ret < 0)
- return (ret);
- else
- return cpufreq_update_policy(pr->id);
+ if (ret >= 0)
+ cpufreq_update_policy(pr->id);
}
int acpi_processor_get_bios_limit(int cpu, unsigned int *limit)
@@ -465,11 +463,33 @@ int acpi_processor_get_performance_info(struct acpi_processor *pr)
return result;
}
EXPORT_SYMBOL_GPL(acpi_processor_get_performance_info);
-int acpi_processor_notify_smm(struct module *calling_module)
+
+int acpi_processor_pstate_control(void)
{
acpi_status status;
- static int is_done = 0;
+ if (!acpi_gbl_FADT.smi_command || !acpi_gbl_FADT.pstate_control)
+ return 0;
+
+ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+ "Writing pstate_control [0x%x] to smi_command [0x%x]\n",
+ acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
+
+ status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
+ (u32)acpi_gbl_FADT.pstate_control, 8);
+ if (ACPI_SUCCESS(status))
+ return 1;
+
+ ACPI_EXCEPTION((AE_INFO, status,
+ "Failed to write pstate_control [0x%x] to smi_command [0x%x]",
+ acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
+ return -EIO;
+}
+
+int acpi_processor_notify_smm(struct module *calling_module)
+{
+ static int is_done = 0;
+ int result;
if (!(acpi_processor_ppc_status & PPC_REGISTERED))
return -EBUSY;
@@ -492,26 +512,15 @@ int acpi_processor_notify_smm(struct module *calling_module)
is_done = -EIO;
- /* Can't write pstate_control to smi_command if either value is zero */
- if ((!acpi_gbl_FADT.smi_command) || (!acpi_gbl_FADT.pstate_control)) {
+ result = acpi_processor_pstate_control();
+ if (!result) {
ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No SMI port or pstate_control\n"));
module_put(calling_module);
return 0;
}
-
- ACPI_DEBUG_PRINT((ACPI_DB_INFO,
- "Writing pstate_control [0x%x] to smi_command [0x%x]\n",
- acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
-
- status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
- (u32) acpi_gbl_FADT.pstate_control, 8);
- if (ACPI_FAILURE(status)) {
- ACPI_EXCEPTION((AE_INFO, status,
- "Failed to write pstate_control [0x%x] to "
- "smi_command [0x%x]", acpi_gbl_FADT.pstate_control,
- acpi_gbl_FADT.smi_command));
+ if (result < 0) {
module_put(calling_module);
- return status;
+ return result;
}
/* Success. If there's no _PPC, we need to fear nothing, so
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 54abb26b7366..9b6cebe227a0 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -674,6 +674,14 @@ static void acpi_sleep_suspend_setup(void)
if (acpi_sleep_state_supported(i))
sleep_states[i] = 1;
+ /*
+ * Use suspend-to-idle by default if ACPI_FADT_LOW_POWER_S0 is set and
+ * the default suspend mode was not selected from the command line.
+ */
+ if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0 &&
+ mem_sleep_default > PM_SUSPEND_MEM)
+ mem_sleep_default = PM_SUSPEND_FREEZE;
+
suspend_set_ops(old_suspend_ordering ?
&acpi_suspend_ops_old : &acpi_suspend_ops);
freeze_set_ops(&acpi_freeze_ops);
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index e023066e4215..5711708532db 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -39,6 +39,105 @@
static LIST_HEAD(gpd_list);
static DEFINE_MUTEX(gpd_list_lock);
+struct genpd_lock_ops {
+ void (*lock)(struct generic_pm_domain *genpd);
+ void (*lock_nested)(struct generic_pm_domain *genpd, int depth);
+ int (*lock_interruptible)(struct generic_pm_domain *genpd);
+ void (*unlock)(struct generic_pm_domain *genpd);
+};
+
+static void genpd_lock_mtx(struct generic_pm_domain *genpd)
+{
+ mutex_lock(&genpd->mlock);
+}
+
+static void genpd_lock_nested_mtx(struct generic_pm_domain *genpd,
+ int depth)
+{
+ mutex_lock_nested(&genpd->mlock, depth);
+}
+
+static int genpd_lock_interruptible_mtx(struct generic_pm_domain *genpd)
+{
+ return mutex_lock_interruptible(&genpd->mlock);
+}
+
+static void genpd_unlock_mtx(struct generic_pm_domain *genpd)
+{
+ return mutex_unlock(&genpd->mlock);
+}
+
+static const struct genpd_lock_ops genpd_mtx_ops = {
+ .lock = genpd_lock_mtx,
+ .lock_nested = genpd_lock_nested_mtx,
+ .lock_interruptible = genpd_lock_interruptible_mtx,
+ .unlock = genpd_unlock_mtx,
+};
+
+static void genpd_lock_spin(struct generic_pm_domain *genpd)
+ __acquires(&genpd->slock)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&genpd->slock, flags);
+ genpd->lock_flags = flags;
+}
+
+static void genpd_lock_nested_spin(struct generic_pm_domain *genpd,
+ int depth)
+ __acquires(&genpd->slock)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&genpd->slock, flags, depth);
+ genpd->lock_flags = flags;
+}
+
+static int genpd_lock_interruptible_spin(struct generic_pm_domain *genpd)
+ __acquires(&genpd->slock)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&genpd->slock, flags);
+ genpd->lock_flags = flags;
+ return 0;
+}
+
+static void genpd_unlock_spin(struct generic_pm_domain *genpd)
+ __releases(&genpd->slock)
+{
+ spin_unlock_irqrestore(&genpd->slock, genpd->lock_flags);
+}
+
+static const struct genpd_lock_ops genpd_spin_ops = {
+ .lock = genpd_lock_spin,
+ .lock_nested = genpd_lock_nested_spin,
+ .lock_interruptible = genpd_lock_interruptible_spin,
+ .unlock = genpd_unlock_spin,
+};
+
+#define genpd_lock(p) p->lock_ops->lock(p)
+#define genpd_lock_nested(p, d) p->lock_ops->lock_nested(p, d)
+#define genpd_lock_interruptible(p) p->lock_ops->lock_interruptible(p)
+#define genpd_unlock(p) p->lock_ops->unlock(p)
+
+#define genpd_is_irq_safe(genpd) (genpd->flags & GENPD_FLAG_IRQ_SAFE)
+
+static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev,
+ struct generic_pm_domain *genpd)
+{
+ bool ret;
+
+ ret = pm_runtime_is_irq_safe(dev) && !genpd_is_irq_safe(genpd);
+
+ /* Warn once for each IRQ safe dev in no sleep domain */
+ if (ret)
+ dev_warn_once(dev, "PM domain %s will not be powered off\n",
+ genpd->name);
+
+ return ret;
+}
+
/*
* Get the generic PM domain for a particular struct device.
* This validates the struct device pointer, the PM domain pointer,
@@ -200,9 +299,9 @@ static int genpd_poweron(struct generic_pm_domain *genpd, unsigned int depth)
genpd_sd_counter_inc(master);
- mutex_lock_nested(&master->lock, depth + 1);
+ genpd_lock_nested(master, depth + 1);
ret = genpd_poweron(master, depth + 1);
- mutex_unlock(&master->lock);
+ genpd_unlock(master);
if (ret) {
genpd_sd_counter_dec(master);
@@ -255,9 +354,9 @@ static int genpd_dev_pm_qos_notifier(struct notifier_block *nb,
spin_unlock_irq(&dev->power.lock);
if (!IS_ERR(genpd)) {
- mutex_lock(&genpd->lock);
+ genpd_lock(genpd);
genpd->max_off_time_changed = true;
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
}
dev = dev->parent;
@@ -303,7 +402,12 @@ static int genpd_poweroff(struct generic_pm_domain *genpd, bool is_async)
if (stat > PM_QOS_FLAGS_NONE)
return -EBUSY;
- if (!pm_runtime_suspended(pdd->dev) || pdd->dev->power.irq_safe)
+ /*
+ * Do not allow the PM domain to be powered off when an IRQ-safe
+ * device is part of a non-IRQ-safe domain.
+ */
+ if (!pm_runtime_suspended(pdd->dev) ||
+ irq_safe_dev_in_no_sleep_domain(pdd->dev, genpd))
not_suspended++;
}
@@ -354,9 +458,9 @@ static void genpd_power_off_work_fn(struct work_struct *work)
genpd = container_of(work, struct generic_pm_domain, power_off_work);
- mutex_lock(&genpd->lock);
+ genpd_lock(genpd);
genpd_poweroff(genpd, true);
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
}
/**
@@ -466,15 +570,15 @@ static int genpd_runtime_suspend(struct device *dev)
}
/*
- * If power.irq_safe is set, this routine will be run with interrupts
- * off, so it can't use mutexes.
+ * If power.irq_safe is set, this routine may be run with
+ * IRQs disabled, so suspend only if the PM domain also is irq_safe.
*/
- if (dev->power.irq_safe)
+ if (irq_safe_dev_in_no_sleep_domain(dev, genpd))
return 0;
- mutex_lock(&genpd->lock);
+ genpd_lock(genpd);
genpd_poweroff(genpd, false);
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
return 0;
}
@@ -503,15 +607,18 @@ static int genpd_runtime_resume(struct device *dev)
if (IS_ERR(genpd))
return -EINVAL;
- /* If power.irq_safe, the PM domain is never powered off. */
- if (dev->power.irq_safe) {
+ /*
+ * As we don't power off a non-IRQ-safe domain that holds
+ * an IRQ-safe device, we don't need to restore power to it.
+ */
+ if (irq_safe_dev_in_no_sleep_domain(dev, genpd)) {
timed = false;
goto out;
}
- mutex_lock(&genpd->lock);
+ genpd_lock(genpd);
ret = genpd_poweron(genpd, 0);
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
if (ret)
return ret;
@@ -546,10 +653,11 @@ static int genpd_runtime_resume(struct device *dev)
err_stop:
genpd_stop_dev(genpd, dev);
err_poweroff:
- if (!dev->power.irq_safe) {
- mutex_lock(&genpd->lock);
+ if (!pm_runtime_is_irq_safe(dev) ||
+ (pm_runtime_is_irq_safe(dev) && genpd_is_irq_safe(genpd))) {
+ genpd_lock(genpd);
genpd_poweroff(genpd, 0);
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
}
return ret;
@@ -732,20 +840,20 @@ static int pm_genpd_prepare(struct device *dev)
if (resume_needed(dev, genpd))
pm_runtime_resume(dev);
- mutex_lock(&genpd->lock);
+ genpd_lock(genpd);
if (genpd->prepared_count++ == 0)
genpd->suspended_count = 0;
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
ret = pm_generic_prepare(dev);
if (ret) {
- mutex_lock(&genpd->lock);
+ genpd_lock(genpd);
genpd->prepared_count--;
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
}
return ret;
@@ -936,13 +1044,13 @@ static void pm_genpd_complete(struct device *dev)
pm_generic_complete(dev);
- mutex_lock(&genpd->lock);
+ genpd_lock(genpd);
genpd->prepared_count--;
if (!genpd->prepared_count)
genpd_queue_power_off_work(genpd);
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
}
/**
@@ -1071,7 +1179,7 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
if (IS_ERR(gpd_data))
return PTR_ERR(gpd_data);
- mutex_lock(&genpd->lock);
+ genpd_lock(genpd);
if (genpd->prepared_count > 0) {
ret = -EAGAIN;
@@ -1088,7 +1196,7 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
list_add_tail(&gpd_data->base.list_node, &genpd->dev_list);
out:
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
if (ret)
genpd_free_dev_data(dev, gpd_data);
@@ -1130,7 +1238,7 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
gpd_data = to_gpd_data(pdd);
dev_pm_qos_remove_notifier(dev, &gpd_data->nb);
- mutex_lock(&genpd->lock);
+ genpd_lock(genpd);
if (genpd->prepared_count > 0) {
ret = -EAGAIN;
@@ -1145,14 +1253,14 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
list_del_init(&pdd->list_node);
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
genpd_free_dev_data(dev, gpd_data);
return 0;
out:
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
dev_pm_qos_add_notifier(dev, &gpd_data->nb);
return ret;
@@ -1183,12 +1291,23 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
|| genpd == subdomain)
return -EINVAL;
+ /*
+ * If the domain can be powered on/off in an IRQ safe
+ * context, ensure that the subdomain can also be
+ * powered on/off in that context.
+ */
+ if (!genpd_is_irq_safe(genpd) && genpd_is_irq_safe(subdomain)) {
+ WARN(1, "Parent %s of subdomain %s must be IRQ safe\n",
+ genpd->name, subdomain->name);
+ return -EINVAL;
+ }
+
link = kzalloc(sizeof(*link), GFP_KERNEL);
if (!link)
return -ENOMEM;
- mutex_lock(&subdomain->lock);
- mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+ genpd_lock(subdomain);
+ genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING);
if (genpd->status == GPD_STATE_POWER_OFF
&& subdomain->status != GPD_STATE_POWER_OFF) {
@@ -1211,8 +1330,8 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
genpd_sd_counter_inc(genpd);
out:
- mutex_unlock(&genpd->lock);
- mutex_unlock(&subdomain->lock);
+ genpd_unlock(genpd);
+ genpd_unlock(subdomain);
if (ret)
kfree(link);
return ret;
@@ -1250,8 +1369,8 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(subdomain))
return -EINVAL;
- mutex_lock(&subdomain->lock);
- mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+ genpd_lock(subdomain);
+ genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING);
if (!list_empty(&subdomain->master_links) || subdomain->device_count) {
pr_warn("%s: unable to remove subdomain %s\n", genpd->name,
@@ -1275,13 +1394,39 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
}
out:
- mutex_unlock(&genpd->lock);
- mutex_unlock(&subdomain->lock);
+ genpd_unlock(genpd);
+ genpd_unlock(subdomain);
return ret;
}
EXPORT_SYMBOL_GPL(pm_genpd_remove_subdomain);
+static int genpd_set_default_power_state(struct generic_pm_domain *genpd)
+{
+ struct genpd_power_state *state;
+
+ state = kzalloc(sizeof(*state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+
+ genpd->states = state;
+ genpd->state_count = 1;
+ genpd->free = state;
+
+ return 0;
+}
+
+static void genpd_lock_init(struct generic_pm_domain *genpd)
+{
+ if (genpd->flags & GENPD_FLAG_IRQ_SAFE) {
+ spin_lock_init(&genpd->slock);
+ genpd->lock_ops = &genpd_spin_ops;
+ } else {
+ mutex_init(&genpd->mlock);
+ genpd->lock_ops = &genpd_mtx_ops;
+ }
+}
+
/**
* pm_genpd_init - Initialize a generic I/O PM domain object.
* @genpd: PM domain object to initialize.
@@ -1293,13 +1438,15 @@ EXPORT_SYMBOL_GPL(pm_genpd_remove_subdomain);
int pm_genpd_init(struct generic_pm_domain *genpd,
struct dev_power_governor *gov, bool is_off)
{
+ int ret;
+
if (IS_ERR_OR_NULL(genpd))
return -EINVAL;
INIT_LIST_HEAD(&genpd->master_links);
INIT_LIST_HEAD(&genpd->slave_links);
INIT_LIST_HEAD(&genpd->dev_list);
- mutex_init(&genpd->lock);
+ genpd_lock_init(genpd);
genpd->gov = gov;
INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn);
atomic_set(&genpd->sd_count, 0);
@@ -1325,19 +1472,12 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
genpd->dev_ops.start = pm_clk_resume;
}
- if (genpd->state_idx >= GENPD_MAX_NUM_STATES) {
- pr_warn("Initial state index out of bounds.\n");
- genpd->state_idx = GENPD_MAX_NUM_STATES - 1;
- }
-
- if (genpd->state_count > GENPD_MAX_NUM_STATES) {
- pr_warn("Limiting states to %d\n", GENPD_MAX_NUM_STATES);
- genpd->state_count = GENPD_MAX_NUM_STATES;
- }
-
/* Use only one "off" state if there were no states declared */
- if (genpd->state_count == 0)
- genpd->state_count = 1;
+ if (genpd->state_count == 0) {
+ ret = genpd_set_default_power_state(genpd);
+ if (ret)
+ return ret;
+ }
mutex_lock(&gpd_list_lock);
list_add(&genpd->gpd_list_node, &gpd_list);
@@ -1354,16 +1494,16 @@ static int genpd_remove(struct generic_pm_domain *genpd)
if (IS_ERR_OR_NULL(genpd))
return -EINVAL;
- mutex_lock(&genpd->lock);
+ genpd_lock(genpd);
if (genpd->has_provider) {
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
pr_err("Provider present, unable to remove %s\n", genpd->name);
return -EBUSY;
}
if (!list_empty(&genpd->master_links) || genpd->device_count) {
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
pr_err("%s: unable to remove %s\n", __func__, genpd->name);
return -EBUSY;
}
@@ -1375,8 +1515,9 @@ static int genpd_remove(struct generic_pm_domain *genpd)
}
list_del(&genpd->gpd_list_node);
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
cancel_work_sync(&genpd->power_off_work);
+ kfree(genpd->free);
pr_debug("%s: removed %s\n", __func__, genpd->name);
return 0;
@@ -1890,21 +2031,117 @@ int genpd_dev_pm_attach(struct device *dev)
mutex_unlock(&gpd_list_lock);
if (ret < 0) {
- dev_err(dev, "failed to add to PM domain %s: %d",
- pd->name, ret);
+ if (ret != -EPROBE_DEFER)
+ dev_err(dev, "failed to add to PM domain %s: %d",
+ pd->name, ret);
goto out;
}
dev->pm_domain->detach = genpd_dev_pm_detach;
dev->pm_domain->sync = genpd_dev_pm_sync;
- mutex_lock(&pd->lock);
+ genpd_lock(pd);
ret = genpd_poweron(pd, 0);
- mutex_unlock(&pd->lock);
+ genpd_unlock(pd);
out:
return ret ? -EPROBE_DEFER : 0;
}
EXPORT_SYMBOL_GPL(genpd_dev_pm_attach);
+
+static const struct of_device_id idle_state_match[] = {
+ { .compatible = "domain-idle-state", },
+ { }
+};
+
+static int genpd_parse_state(struct genpd_power_state *genpd_state,
+ struct device_node *state_node)
+{
+ int err;
+ u32 residency;
+ u32 entry_latency, exit_latency;
+ const struct of_device_id *match_id;
+
+ match_id = of_match_node(idle_state_match, state_node);
+ if (!match_id)
+ return -EINVAL;
+
+ err = of_property_read_u32(state_node, "entry-latency-us",
+ &entry_latency);
+ if (err) {
+ pr_debug(" * %s missing entry-latency-us property\n",
+ state_node->full_name);
+ return -EINVAL;
+ }
+
+ err = of_property_read_u32(state_node, "exit-latency-us",
+ &exit_latency);
+ if (err) {
+ pr_debug(" * %s missing exit-latency-us property\n",
+ state_node->full_name);
+ return -EINVAL;
+ }
+
+ err = of_property_read_u32(state_node, "min-residency-us", &residency);
+ if (!err)
+ genpd_state->residency_ns = 1000 * residency;
+
+ genpd_state->power_on_latency_ns = 1000 * exit_latency;
+ genpd_state->power_off_latency_ns = 1000 * entry_latency;
+ genpd_state->fwnode = &state_node->fwnode;
+
+ return 0;
+}
+
+/**
+ * of_genpd_parse_idle_states: Return array of idle states for the genpd.
+ *
+ * @dn: The genpd device node
+ * @states: The pointer to which the state array will be saved.
+ * @n: The count of elements in the array returned from this function.
+ *
+ * Returns the device states parsed from the OF node. The memory for the states
+ * is allocated by this function, and it is the caller's responsibility to
+ * free it after use.
+ */
+int of_genpd_parse_idle_states(struct device_node *dn,
+ struct genpd_power_state **states, int *n)
+{
+ struct genpd_power_state *st;
+ struct device_node *np;
+ int i = 0;
+ int err, ret;
+ int count;
+ struct of_phandle_iterator it;
+
+ count = of_count_phandle_with_args(dn, "domain-idle-states", NULL);
+ if (count <= 0)
+ return -EINVAL;
+
+ st = kcalloc(count, sizeof(*st), GFP_KERNEL);
+ if (!st)
+ return -ENOMEM;
+
+ /* Loop over the phandles until all the requested entries are found */
+ of_for_each_phandle(&it, err, dn, "domain-idle-states", NULL, 0) {
+ np = it.node;
+ ret = genpd_parse_state(&st[i++], np);
+ if (ret) {
+ pr_err("Parsing idle state node %s failed with err %d\n",
+ np->full_name, ret);
+ of_node_put(np);
+ kfree(st);
+ return ret;
+ }
+ }
+
+ *n = count;
+ *states = st;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(of_genpd_parse_idle_states);
+
#endif /* CONFIG_PM_GENERIC_DOMAINS_OF */
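A hedged usage sketch for the new helper (provider names are illustrative, error handling abbreviated):

/* Sketch: a genpd provider picking up its idle states from DT. */
#include <linux/platform_device.h>
#include <linux/pm_domain.h>

static struct generic_pm_domain foo_pd = {
	.name = "foo",
};

static int foo_pd_probe(struct platform_device *pdev)
{
	struct genpd_power_state *states;
	int ret, state_count;

	ret = of_genpd_parse_idle_states(pdev->dev.of_node,
					 &states, &state_count);
	if (ret)
		return ret;

	/* The provider owns the array and must free it on removal. */
	foo_pd.states = states;
	foo_pd.state_count = state_count;

	return pm_genpd_init(&foo_pd, NULL, false);
}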
@@ -1958,7 +2195,7 @@ static int pm_genpd_summary_one(struct seq_file *s,
char state[16];
int ret;
- ret = mutex_lock_interruptible(&genpd->lock);
+ ret = genpd_lock_interruptible(genpd);
if (ret)
return -ERESTARTSYS;
@@ -1984,7 +2221,9 @@ static int pm_genpd_summary_one(struct seq_file *s,
}
list_for_each_entry(pm_data, &genpd->dev_list, list_node) {
- kobj_path = kobject_get_path(&pm_data->dev->kobj, GFP_KERNEL);
+ kobj_path = kobject_get_path(&pm_data->dev->kobj,
+ genpd_is_irq_safe(genpd) ?
+ GFP_ATOMIC : GFP_KERNEL);
if (kobj_path == NULL)
continue;
@@ -1995,7 +2234,7 @@ static int pm_genpd_summary_one(struct seq_file *s,
seq_puts(s, "\n");
exit:
- mutex_unlock(&genpd->lock);
+ genpd_unlock(genpd);
return 0;
}
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 2932a5bd892f..eb474c882ebe 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1460,10 +1460,10 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
dpm_watchdog_clear(&wd);
Complete:
- complete_all(&dev->power.completion);
if (error)
async_error = error;
+ complete_all(&dev->power.completion);
TRACE_SUSPEND(error);
return error;
}
diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 4c7c6da7a989..35ff06283738 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -93,6 +93,8 @@ struct opp_table *_find_opp_table(struct device *dev)
* Return: voltage in micro volt corresponding to the opp, else
* return 0
*
+ * This is useful only for devices with a single power supply.
+ *
* Locking: This function must be called under rcu_read_lock(). opp is a rcu
* protected pointer. This means that opp which could have been fetched by
* opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
@@ -112,7 +114,7 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
if (IS_ERR_OR_NULL(tmp_opp))
pr_err("%s: Invalid parameters\n", __func__);
else
- v = tmp_opp->u_volt;
+ v = tmp_opp->supplies[0].u_volt;
return v;
}
@@ -210,6 +212,24 @@ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
}
EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
+static int _get_regulator_count(struct device *dev)
+{
+ struct opp_table *opp_table;
+ int count;
+
+ rcu_read_lock();
+
+ opp_table = _find_opp_table(dev);
+ if (!IS_ERR(opp_table))
+ count = opp_table->regulator_count;
+ else
+ count = 0;
+
+ rcu_read_unlock();
+
+ return count;
+}
+
/**
* dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds
* @dev: device for which we do this operation
@@ -222,34 +242,51 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
{
struct opp_table *opp_table;
struct dev_pm_opp *opp;
- struct regulator *reg;
+ struct regulator *reg, **regulators;
unsigned long latency_ns = 0;
- unsigned long min_uV = ~0, max_uV = 0;
- int ret;
+ int ret, i, count;
+ struct {
+ unsigned long min;
+ unsigned long max;
+ } *uV;
+
+ count = _get_regulator_count(dev);
+
+ /* Regulator may not be required for the device */
+ if (!count)
+ return 0;
+
+ regulators = kmalloc_array(count, sizeof(*regulators), GFP_KERNEL);
+ if (!regulators)
+ return 0;
+
+ uV = kmalloc_array(count, sizeof(*uV), GFP_KERNEL);
+ if (!uV)
+ goto free_regulators;
rcu_read_lock();
opp_table = _find_opp_table(dev);
if (IS_ERR(opp_table)) {
rcu_read_unlock();
- return 0;
+ goto free_uV;
}
- reg = opp_table->regulator;
- if (IS_ERR(reg)) {
- /* Regulator may not be required for device */
- rcu_read_unlock();
- return 0;
- }
+ memcpy(regulators, opp_table->regulators, count * sizeof(*regulators));
- list_for_each_entry_rcu(opp, &opp_table->opp_list, node) {
- if (!opp->available)
- continue;
+ for (i = 0; i < count; i++) {
+ uV[i].min = ~0;
+ uV[i].max = 0;
+
+ list_for_each_entry_rcu(opp, &opp_table->opp_list, node) {
+ if (!opp->available)
+ continue;
- if (opp->u_volt_min < min_uV)
- min_uV = opp->u_volt_min;
- if (opp->u_volt_max > max_uV)
- max_uV = opp->u_volt_max;
+ if (opp->supplies[i].u_volt_min < uV[i].min)
+ uV[i].min = opp->supplies[i].u_volt_min;
+ if (opp->supplies[i].u_volt_max > uV[i].max)
+ uV[i].max = opp->supplies[i].u_volt_max;
+ }
}
rcu_read_unlock();
@@ -258,9 +295,16 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
* The caller needs to ensure that opp_table (and hence the regulator)
* isn't freed, while we are executing this routine.
*/
- ret = regulator_set_voltage_time(reg, min_uV, max_uV);
- if (ret > 0)
- latency_ns = ret * 1000;
+	for (i = 0; i < count; i++) {
+		reg = regulators[i];
+ ret = regulator_set_voltage_time(reg, uV[i].min, uV[i].max);
+ if (ret > 0)
+ latency_ns += ret * 1000;
+ }
+
+free_uV:
+ kfree(uV);
+free_regulators:
+ kfree(regulators);
return latency_ns;
}
@@ -542,8 +586,7 @@ unlock:
}
static int _set_opp_voltage(struct device *dev, struct regulator *reg,
- unsigned long u_volt, unsigned long u_volt_min,
- unsigned long u_volt_max)
+ struct dev_pm_opp_supply *supply)
{
int ret;
@@ -554,14 +597,78 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg,
return 0;
}
- dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__, u_volt_min,
- u_volt, u_volt_max);
+ dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__,
+ supply->u_volt_min, supply->u_volt, supply->u_volt_max);
- ret = regulator_set_voltage_triplet(reg, u_volt_min, u_volt,
- u_volt_max);
+ ret = regulator_set_voltage_triplet(reg, supply->u_volt_min,
+ supply->u_volt, supply->u_volt_max);
if (ret)
dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n",
- __func__, u_volt_min, u_volt, u_volt_max, ret);
+ __func__, supply->u_volt_min, supply->u_volt,
+ supply->u_volt_max, ret);
+
+ return ret;
+}
+
+static inline int
+_generic_set_opp_clk_only(struct device *dev, struct clk *clk,
+ unsigned long old_freq, unsigned long freq)
+{
+ int ret;
+
+ ret = clk_set_rate(clk, freq);
+ if (ret) {
+ dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+ ret);
+ }
+
+ return ret;
+}
+
+static int _generic_set_opp(struct dev_pm_set_opp_data *data)
+{
+ struct dev_pm_opp_supply *old_supply = data->old_opp.supplies;
+ struct dev_pm_opp_supply *new_supply = data->new_opp.supplies;
+ unsigned long old_freq = data->old_opp.rate, freq = data->new_opp.rate;
+ struct regulator *reg = data->regulators[0];
+	struct device *dev = data->dev;
+ int ret;
+
+	/* This function only supports a single regulator per device */
+ if (WARN_ON(data->regulator_count > 1)) {
+ dev_err(dev, "multiple regulators are not supported\n");
+ return -EINVAL;
+ }
+
+ /* Scaling up? Scale voltage before frequency */
+ if (freq > old_freq) {
+ ret = _set_opp_voltage(dev, reg, new_supply);
+ if (ret)
+ goto restore_voltage;
+ }
+
+ /* Change frequency */
+ ret = _generic_set_opp_clk_only(dev, data->clk, old_freq, freq);
+ if (ret)
+ goto restore_voltage;
+
+ /* Scaling down? Scale voltage after frequency */
+ if (freq < old_freq) {
+ ret = _set_opp_voltage(dev, reg, new_supply);
+ if (ret)
+ goto restore_freq;
+ }
+
+ return 0;
+
+restore_freq:
+ if (_generic_set_opp_clk_only(dev, data->clk, freq, old_freq))
+ dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
+ __func__, old_freq);
+restore_voltage:
+ /* This shouldn't harm even if the voltages weren't updated earlier */
+ if (old_supply->u_volt)
+ _set_opp_voltage(dev, reg, old_supply);
return ret;
}
@@ -579,12 +686,13 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg,
int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
{
struct opp_table *opp_table;
+ unsigned long freq, old_freq;
+ int (*set_opp)(struct dev_pm_set_opp_data *data);
struct dev_pm_opp *old_opp, *opp;
- struct regulator *reg;
+ struct regulator **regulators;
+ struct dev_pm_set_opp_data *data;
struct clk *clk;
- unsigned long freq, old_freq;
- unsigned long u_volt, u_volt_min, u_volt_max;
- int ret;
+ int ret, size;
if (unlikely(!target_freq)) {
dev_err(dev, "%s: Invalid target frequency %lu\n", __func__,
@@ -633,55 +741,41 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
return ret;
}
- u_volt = opp->u_volt;
- u_volt_min = opp->u_volt_min;
- u_volt_max = opp->u_volt_max;
+ dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__,
+ old_freq, freq);
- reg = opp_table->regulator;
+ regulators = opp_table->regulators;
- rcu_read_unlock();
-
- /* Scaling up? Scale voltage before frequency */
- if (freq > old_freq) {
- ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min,
- u_volt_max);
- if (ret)
- goto restore_voltage;
- }
-
- /* Change frequency */
-
- dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n",
- __func__, old_freq, freq);
-
- ret = clk_set_rate(clk, freq);
- if (ret) {
- dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
- ret);
- goto restore_voltage;
+ /* Only frequency scaling */
+ if (!regulators) {
+ rcu_read_unlock();
+ return _generic_set_opp_clk_only(dev, clk, old_freq, freq);
}
- /* Scaling down? Scale voltage after frequency */
- if (freq < old_freq) {
- ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min,
- u_volt_max);
- if (ret)
- goto restore_freq;
- }
+ if (opp_table->set_opp)
+ set_opp = opp_table->set_opp;
+ else
+ set_opp = _generic_set_opp;
+
+ data = opp_table->set_opp_data;
+ data->regulators = regulators;
+ data->regulator_count = opp_table->regulator_count;
+ data->clk = clk;
+ data->dev = dev;
+
+ data->old_opp.rate = old_freq;
+ size = sizeof(*opp->supplies) * opp_table->regulator_count;
+ if (IS_ERR(old_opp))
+ memset(data->old_opp.supplies, 0, size);
+ else
+ memcpy(data->old_opp.supplies, old_opp->supplies, size);
- return 0;
+ data->new_opp.rate = freq;
+ memcpy(data->new_opp.supplies, opp->supplies, size);
-restore_freq:
- if (clk_set_rate(clk, old_freq))
- dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
- __func__, old_freq);
-restore_voltage:
- /* This shouldn't harm even if the voltages weren't updated earlier */
- if (!IS_ERR(old_opp))
- _set_opp_voltage(dev, reg, old_opp->u_volt,
- old_opp->u_volt_min, old_opp->u_volt_max);
+ rcu_read_unlock();
- return ret;
+ return set_opp(data);
}
EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
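
/*
 * Illustrative sketch, not part of this patch: a minimal consumer of
 * dev_pm_opp_set_rate(), e.g. a cpufreq ->target_index() callback. The
 * "foo" name is hypothetical; frequencies in policy->freq_table are in
 * kHz while the OPP core expects Hz.
 */
static int foo_target_index(struct cpufreq_policy *policy, unsigned int index)
{
	struct device *cpu_dev = get_cpu_device(policy->cpu);
	unsigned long freq_hz = policy->freq_table[index].frequency * 1000UL;

	/* The OPP core picks the matching OPP and scales clock and supplies. */
	return dev_pm_opp_set_rate(cpu_dev, freq_hz);
}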
@@ -764,9 +858,6 @@ static struct opp_table *_add_opp_table(struct device *dev)
_of_init_opp_table(opp_table, dev);
- /* Set regulator to a non-NULL error value */
- opp_table->regulator = ERR_PTR(-ENXIO);
-
/* Find clk for the device */
opp_table->clk = clk_get(dev, NULL);
if (IS_ERR(opp_table->clk)) {
@@ -815,7 +906,10 @@ static void _remove_opp_table(struct opp_table *opp_table)
if (opp_table->prop_name)
return;
- if (!IS_ERR(opp_table->regulator))
+ if (opp_table->regulators)
+ return;
+
+ if (opp_table->set_opp)
return;
/* Release clk */
@@ -924,34 +1018,50 @@ struct dev_pm_opp *_allocate_opp(struct device *dev,
struct opp_table **opp_table)
{
struct dev_pm_opp *opp;
+ int count, supply_size;
+ struct opp_table *table;
- /* allocate new OPP node */
- opp = kzalloc(sizeof(*opp), GFP_KERNEL);
- if (!opp)
+ table = _add_opp_table(dev);
+ if (!table)
return NULL;
- INIT_LIST_HEAD(&opp->node);
+ /* Allocate space for at least one supply */
+ count = table->regulator_count ? table->regulator_count : 1;
+ supply_size = sizeof(*opp->supplies) * count;
- *opp_table = _add_opp_table(dev);
- if (!*opp_table) {
- kfree(opp);
+ /* allocate new OPP node and supplies structures */
+ opp = kzalloc(sizeof(*opp) + supply_size, GFP_KERNEL);
+ if (!opp) {
+		_remove_opp_table(table);
return NULL;
}
+ /* Put the supplies at the end of the OPP structure as an empty array */
+ opp->supplies = (struct dev_pm_opp_supply *)(opp + 1);
+ INIT_LIST_HEAD(&opp->node);
+
+ *opp_table = table;
+
return opp;
}
static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
struct opp_table *opp_table)
{
- struct regulator *reg = opp_table->regulator;
-
- if (!IS_ERR(reg) &&
- !regulator_is_supported_voltage(reg, opp->u_volt_min,
- opp->u_volt_max)) {
- pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
- __func__, opp->u_volt_min, opp->u_volt_max);
- return false;
+ struct regulator *reg;
+ int i;
+
+ for (i = 0; i < opp_table->regulator_count; i++) {
+ reg = opp_table->regulators[i];
+
+ if (!regulator_is_supported_voltage(reg,
+ opp->supplies[i].u_volt_min,
+ opp->supplies[i].u_volt_max)) {
+ pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
+ __func__, opp->supplies[i].u_volt_min,
+ opp->supplies[i].u_volt_max);
+ return false;
+ }
}
return true;
@@ -983,11 +1093,13 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
/* Duplicate OPPs */
dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
- __func__, opp->rate, opp->u_volt, opp->available,
- new_opp->rate, new_opp->u_volt, new_opp->available);
+ __func__, opp->rate, opp->supplies[0].u_volt,
+ opp->available, new_opp->rate,
+ new_opp->supplies[0].u_volt, new_opp->available);
- return opp->available && new_opp->u_volt == opp->u_volt ?
- 0 : -EEXIST;
+	/* Should we compare voltages for all regulators here? */
+ return opp->available &&
+ new_opp->supplies[0].u_volt == opp->supplies[0].u_volt ? 0 : -EEXIST;
}
new_opp->opp_table = opp_table;
@@ -1054,9 +1166,9 @@ int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt,
/* populate the opp table */
new_opp->rate = freq;
tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
- new_opp->u_volt = u_volt;
- new_opp->u_volt_min = u_volt - tol;
- new_opp->u_volt_max = u_volt + tol;
+ new_opp->supplies[0].u_volt = u_volt;
+ new_opp->supplies[0].u_volt_min = u_volt - tol;
+ new_opp->supplies[0].u_volt_max = u_volt + tol;
new_opp->available = true;
new_opp->dynamic = dynamic;
@@ -1300,13 +1412,47 @@ unlock:
}
EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
+static int _allocate_set_opp_data(struct opp_table *opp_table)
+{
+ struct dev_pm_set_opp_data *data;
+ int len, count = opp_table->regulator_count;
+
+ if (WARN_ON(!count))
+ return -EINVAL;
+
+ /* space for set_opp_data */
+ len = sizeof(*data);
+
+ /* space for old_opp.supplies and new_opp.supplies */
+ len += 2 * sizeof(struct dev_pm_opp_supply) * count;
+
+ data = kzalloc(len, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->old_opp.supplies = (void *)(data + 1);
+ data->new_opp.supplies = data->old_opp.supplies + count;
+
+ opp_table->set_opp_data = data;
+
+ return 0;
+}
+
+static void _free_set_opp_data(struct opp_table *opp_table)
+{
+ kfree(opp_table->set_opp_data);
+ opp_table->set_opp_data = NULL;
+}
+
/**
- * dev_pm_opp_set_regulator() - Set regulator name for the device
+ * dev_pm_opp_set_regulators() - Set regulator names for the device
* @dev: Device for which regulator name is being set.
- * @name: Name of the regulator.
+ * @names: Array of pointers to the names of the regulator.
+ * @count: Number of regulators.
*
* In order to support OPP switching, OPP layer needs to know the name of the
- * device's regulator, as the core would be required to switch voltages as well.
+ * device's regulators, as the core would be required to switch voltages as
+ * well.
*
* This must be called before any OPPs are initialized for the device.
*
@@ -1316,11 +1462,13 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
* that this function is *NOT* called under RCU protection or in contexts where
* mutex cannot be locked.
*/
-int dev_pm_opp_set_regulator(struct device *dev, const char *name)
+struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
+ const char * const names[],
+ unsigned int count)
{
struct opp_table *opp_table;
struct regulator *reg;
- int ret;
+ int ret, i;
mutex_lock(&opp_table_lock);
@@ -1336,22 +1484,146 @@ int dev_pm_opp_set_regulator(struct device *dev, const char *name)
goto err;
}
- /* Already have a regulator set */
- if (WARN_ON(!IS_ERR(opp_table->regulator))) {
+ /* Already have regulators set */
+ if (opp_table->regulators) {
ret = -EBUSY;
goto err;
}
- /* Allocate the regulator */
- reg = regulator_get_optional(dev, name);
- if (IS_ERR(reg)) {
- ret = PTR_ERR(reg);
- if (ret != -EPROBE_DEFER)
- dev_err(dev, "%s: no regulator (%s) found: %d\n",
- __func__, name, ret);
+
+ opp_table->regulators = kmalloc_array(count,
+ sizeof(*opp_table->regulators),
+ GFP_KERNEL);
+ if (!opp_table->regulators) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ for (i = 0; i < count; i++) {
+ reg = regulator_get_optional(dev, names[i]);
+ if (IS_ERR(reg)) {
+ ret = PTR_ERR(reg);
+ if (ret != -EPROBE_DEFER)
+ dev_err(dev, "%s: no regulator (%s) found: %d\n",
+ __func__, names[i], ret);
+ goto free_regulators;
+ }
+
+ opp_table->regulators[i] = reg;
+ }
+
+ opp_table->regulator_count = count;
+
+ /* Allocate block only once to pass to set_opp() routines */
+ ret = _allocate_set_opp_data(opp_table);
+ if (ret)
+ goto free_regulators;
+
+ mutex_unlock(&opp_table_lock);
+ return opp_table;
+
+free_regulators:
+ while (i != 0)
+ regulator_put(opp_table->regulators[--i]);
+
+ kfree(opp_table->regulators);
+ opp_table->regulators = NULL;
+ opp_table->regulator_count = 0;
+err:
+ _remove_opp_table(opp_table);
+unlock:
+ mutex_unlock(&opp_table_lock);
+
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators);
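
/*
 * Illustrative sketch, not part of this patch: a driver registering two
 * supplies before any OPPs are added. The "foo" name and the supply names
 * "vdd-core" and "vdd-mem" are hypothetical.
 */
static int foo_register_supplies(struct device *dev)
{
	static const char * const names[] = { "vdd-core", "vdd-mem" };
	struct opp_table *opp_table;

	opp_table = dev_pm_opp_set_regulators(dev, names, ARRAY_SIZE(names));
	if (IS_ERR(opp_table))
		return PTR_ERR(opp_table);

	/* OPPs (static or from DT) may only be added from here on. */
	return 0;
}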
+
+/**
+ * dev_pm_opp_put_regulators() - Releases resources blocked for regulator
+ * @opp_table: OPP table returned from dev_pm_opp_set_regulators().
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_put_regulators(struct opp_table *opp_table)
+{
+ int i;
+
+ mutex_lock(&opp_table_lock);
+
+ if (!opp_table->regulators) {
+ pr_err("%s: Doesn't have regulators set\n", __func__);
+ goto unlock;
+ }
+
+ /* Make sure there are no concurrent readers while updating opp_table */
+ WARN_ON(!list_empty(&opp_table->opp_list));
+
+ for (i = opp_table->regulator_count - 1; i >= 0; i--)
+ regulator_put(opp_table->regulators[i]);
+
+ _free_set_opp_data(opp_table);
+
+ kfree(opp_table->regulators);
+ opp_table->regulators = NULL;
+ opp_table->regulator_count = 0;
+
+ /* Try freeing opp_table if this was the last blocking resource */
+ _remove_opp_table(opp_table);
+
+unlock:
+ mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
+
+/**
+ * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
+ * @dev: Device for which the helper is getting registered.
+ * @set_opp: Custom set OPP helper.
+ *
+ * This is useful for platforms with complex power-supply requirements (such
+ * as multiple regulators per device), where the generic OPP set-rate helper
+ * isn't sufficient.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_register_set_opp_helper(struct device *dev,
+ int (*set_opp)(struct dev_pm_set_opp_data *data))
+{
+ struct opp_table *opp_table;
+ int ret;
+
+ if (!set_opp)
+ return -EINVAL;
+
+ mutex_lock(&opp_table_lock);
+
+ opp_table = _add_opp_table(dev);
+ if (!opp_table) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ /* This should be called before OPPs are initialized */
+ if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+ ret = -EBUSY;
goto err;
}
- opp_table->regulator = reg;
+ /* Already have custom set_opp helper */
+ if (WARN_ON(opp_table->set_opp)) {
+ ret = -EBUSY;
+ goto err;
+ }
+
+ opp_table->set_opp = set_opp;
mutex_unlock(&opp_table_lock);
return 0;
@@ -1363,11 +1635,12 @@ unlock:
return ret;
}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator);
+EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper);
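
/*
 * Illustrative sketch, not part of this patch: a platform specific set_opp
 * helper for a device with multiple supplies, to be registered with
 * dev_pm_opp_register_set_opp_helper(). The "foo" names are hypothetical
 * and restore-on-failure handling is omitted for brevity.
 */
static int foo_set_supplies(struct dev_pm_set_opp_data *data)
{
	struct dev_pm_opp_supply *s = data->new_opp.supplies;
	int i, ret;

	for (i = 0; i < data->regulator_count; i++) {
		ret = regulator_set_voltage_triplet(data->regulators[i],
						    s[i].u_volt_min, s[i].u_volt,
						    s[i].u_volt_max);
		if (ret)
			return ret;
	}

	return 0;
}

static int foo_set_opp(struct dev_pm_set_opp_data *data)
{
	bool scaling_up = data->new_opp.rate > data->old_opp.rate;
	int ret;

	/* Same ordering rule as _generic_set_opp(): voltage first when scaling up */
	if (scaling_up) {
		ret = foo_set_supplies(data);
		if (ret)
			return ret;
	}

	ret = clk_set_rate(data->clk, data->new_opp.rate);
	if (ret)
		return ret;

	if (!scaling_up)
		ret = foo_set_supplies(data);

	return ret;
}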
/**
- * dev_pm_opp_put_regulator() - Releases resources blocked for regulator
- * @dev: Device for which regulator was set.
+ * dev_pm_opp_register_put_opp_helper() - Releases resources blocked for
+ * set_opp helper
+ * @dev: Device for which custom set_opp helper has to be cleared.
*
* Locking: The internal opp_table and opp structures are RCU protected.
* Hence this function internally uses RCU updater strategy with mutex locks
@@ -1375,7 +1648,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator);
* that this function is *NOT* called under RCU protection or in contexts where
* mutex cannot be locked.
*/
-void dev_pm_opp_put_regulator(struct device *dev)
+void dev_pm_opp_register_put_opp_helper(struct device *dev)
{
struct opp_table *opp_table;
@@ -1389,16 +1662,16 @@ void dev_pm_opp_put_regulator(struct device *dev)
goto unlock;
}
- if (IS_ERR(opp_table->regulator)) {
- dev_err(dev, "%s: Doesn't have regulator set\n", __func__);
+ if (!opp_table->set_opp) {
+ dev_err(dev, "%s: Doesn't have custom set_opp helper set\n",
+ __func__);
goto unlock;
}
/* Make sure there are no concurrent readers while updating opp_table */
WARN_ON(!list_empty(&opp_table->opp_list));
- regulator_put(opp_table->regulator);
- opp_table->regulator = ERR_PTR(-ENXIO);
+ opp_table->set_opp = NULL;
/* Try freeing opp_table if this was the last blocking resource */
_remove_opp_table(opp_table);
@@ -1406,7 +1679,7 @@ void dev_pm_opp_put_regulator(struct device *dev)
unlock:
mutex_unlock(&opp_table_lock);
}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulator);
+EXPORT_SYMBOL_GPL(dev_pm_opp_register_put_opp_helper);
/**
* dev_pm_opp_add() - Add an OPP table from a table definitions
diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c
index ef1ae6b52042..95f433db4ac7 100644
--- a/drivers/base/power/opp/debugfs.c
+++ b/drivers/base/power/opp/debugfs.c
@@ -15,6 +15,7 @@
#include <linux/err.h>
#include <linux/init.h>
#include <linux/limits.h>
+#include <linux/slab.h>
#include "opp.h"
@@ -34,6 +35,46 @@ void opp_debug_remove_one(struct dev_pm_opp *opp)
debugfs_remove_recursive(opp->dentry);
}
+static bool opp_debug_create_supplies(struct dev_pm_opp *opp,
+ struct opp_table *opp_table,
+ struct dentry *pdentry)
+{
+ struct dentry *d;
+ int i = 0;
+ char *name;
+
+ /* Always create at least supply-0 directory */
+ do {
+ name = kasprintf(GFP_KERNEL, "supply-%d", i);
+
+		/* Create per-supply directory */
+ d = debugfs_create_dir(name, pdentry);
+
+ kfree(name);
+
+ if (!d)
+ return false;
+
+ if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d,
+ &opp->supplies[i].u_volt))
+ return false;
+
+ if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d,
+ &opp->supplies[i].u_volt_min))
+ return false;
+
+ if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d,
+ &opp->supplies[i].u_volt_max))
+ return false;
+
+ if (!debugfs_create_ulong("u_amp", S_IRUGO, d,
+ &opp->supplies[i].u_amp))
+ return false;
+ } while (++i < opp_table->regulator_count);
+
+ return true;
+}
+
int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
{
struct dentry *pdentry = opp_table->dentry;
@@ -63,16 +104,7 @@ int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
return -ENOMEM;
- if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, &opp->u_volt))
- return -ENOMEM;
-
- if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, &opp->u_volt_min))
- return -ENOMEM;
-
- if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, &opp->u_volt_max))
- return -ENOMEM;
-
- if (!debugfs_create_ulong("u_amp", S_IRUGO, d, &opp->u_amp))
+ if (!opp_debug_create_supplies(opp, opp_table, d))
return -ENOMEM;
if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
diff --git a/drivers/base/power/opp/of.c b/drivers/base/power/opp/of.c
index 5552211e6fcd..3f7d2591b173 100644
--- a/drivers/base/power/opp/of.c
+++ b/drivers/base/power/opp/of.c
@@ -17,6 +17,7 @@
#include <linux/errno.h>
#include <linux/device.h>
#include <linux/of.h>
+#include <linux/slab.h>
#include <linux/export.h>
#include "opp.h"
@@ -101,16 +102,16 @@ static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table,
return true;
}
-/* TODO: Support multiple regulators */
static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
struct opp_table *opp_table)
{
- u32 microvolt[3] = {0};
- u32 val;
- int count, ret;
+ u32 *microvolt, *microamp = NULL;
+ int supplies, vcount, icount, ret, i, j;
struct property *prop = NULL;
char name[NAME_MAX];
+ supplies = opp_table->regulator_count ? opp_table->regulator_count : 1;
+
/* Search for "opp-microvolt-<name>" */
if (opp_table->prop_name) {
snprintf(name, sizeof(name), "opp-microvolt-%s",
@@ -128,34 +129,29 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
return 0;
}
- count = of_property_count_u32_elems(opp->np, name);
- if (count < 0) {
+ vcount = of_property_count_u32_elems(opp->np, name);
+ if (vcount < 0) {
dev_err(dev, "%s: Invalid %s property (%d)\n",
- __func__, name, count);
- return count;
+ __func__, name, vcount);
+ return vcount;
}
- /* There can be one or three elements here */
- if (count != 1 && count != 3) {
- dev_err(dev, "%s: Invalid number of elements in %s property (%d)\n",
- __func__, name, count);
+ /* There can be one or three elements per supply */
+ if (vcount != supplies && vcount != supplies * 3) {
+ dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
+ __func__, name, vcount, supplies);
return -EINVAL;
}
- ret = of_property_read_u32_array(opp->np, name, microvolt, count);
+ microvolt = kmalloc_array(vcount, sizeof(*microvolt), GFP_KERNEL);
+ if (!microvolt)
+ return -ENOMEM;
+
+ ret = of_property_read_u32_array(opp->np, name, microvolt, vcount);
if (ret) {
dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret);
- return -EINVAL;
- }
-
- opp->u_volt = microvolt[0];
-
- if (count == 1) {
- opp->u_volt_min = opp->u_volt;
- opp->u_volt_max = opp->u_volt;
- } else {
- opp->u_volt_min = microvolt[1];
- opp->u_volt_max = microvolt[2];
+ ret = -EINVAL;
+ goto free_microvolt;
}
/* Search for "opp-microamp-<name>" */
@@ -172,10 +168,59 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
prop = of_find_property(opp->np, name, NULL);
}
- if (prop && !of_property_read_u32(opp->np, name, &val))
- opp->u_amp = val;
+ if (prop) {
+ icount = of_property_count_u32_elems(opp->np, name);
+ if (icount < 0) {
+ dev_err(dev, "%s: Invalid %s property (%d)\n", __func__,
+ name, icount);
+ ret = icount;
+ goto free_microvolt;
+ }
- return 0;
+ if (icount != supplies) {
+ dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
+ __func__, name, icount, supplies);
+ ret = -EINVAL;
+ goto free_microvolt;
+ }
+
+ microamp = kmalloc_array(icount, sizeof(*microamp), GFP_KERNEL);
+ if (!microamp) {
+			ret = -ENOMEM;
+ goto free_microvolt;
+ }
+
+ ret = of_property_read_u32_array(opp->np, name, microamp,
+ icount);
+ if (ret) {
+ dev_err(dev, "%s: error parsing %s: %d\n", __func__,
+ name, ret);
+ ret = -EINVAL;
+ goto free_microamp;
+ }
+ }
+
+ for (i = 0, j = 0; i < supplies; i++) {
+ opp->supplies[i].u_volt = microvolt[j++];
+
+ if (vcount == supplies) {
+ opp->supplies[i].u_volt_min = opp->supplies[i].u_volt;
+ opp->supplies[i].u_volt_max = opp->supplies[i].u_volt;
+ } else {
+ opp->supplies[i].u_volt_min = microvolt[j++];
+ opp->supplies[i].u_volt_max = microvolt[j++];
+ }
+
+ if (microamp)
+ opp->supplies[i].u_amp = microamp[i];
+ }
+
+free_microamp:
+ kfree(microamp);
+free_microvolt:
+ kfree(microvolt);
+
+ return ret;
}
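
/*
 * Illustrative DT snippet, not part of this patch: an OPP entry for a
 * device with two supplies, matching the parsing above. opp-microvolt
 * carries one <target min max> triplet per supply and opp-microamp one
 * value per supply; all numbers are made up.
 *
 *	opp@1000000000 {
 *		opp-hz = /bits/ 64 <1000000000>;
 *		opp-microvolt = <975000 970000 985000>,
 *				<965000 960000 975000>;
 *		opp-microamp = <70000>, <70000>;
 *	};
 */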
/**
@@ -198,7 +243,7 @@ void dev_pm_opp_of_remove_table(struct device *dev)
EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
/* Returns opp descriptor node for a device, caller must do of_node_put() */
-struct device_node *_of_get_opp_desc_node(struct device *dev)
+static struct device_node *_of_get_opp_desc_node(struct device *dev)
{
/*
* TODO: Support for multiple OPP tables.
@@ -303,9 +348,9 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
mutex_unlock(&opp_table_lock);
pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
- __func__, new_opp->turbo, new_opp->rate, new_opp->u_volt,
- new_opp->u_volt_min, new_opp->u_volt_max,
- new_opp->clock_latency_ns);
+ __func__, new_opp->turbo, new_opp->rate,
+ new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min,
+ new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns);
/*
* Notify the changes in the availability of the operable
@@ -562,7 +607,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
/* Get OPP descriptor node */
np = _of_get_opp_desc_node(cpu_dev);
if (!np) {
- dev_dbg(cpu_dev, "%s: Couldn't find cpu_dev node.\n", __func__);
+ dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__);
return -ENOENT;
}
@@ -587,7 +632,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
/* Get OPP descriptor node */
tmp_np = _of_get_opp_desc_node(tcpu_dev);
if (!tmp_np) {
- dev_err(tcpu_dev, "%s: Couldn't find tcpu_dev node.\n",
+ dev_err(tcpu_dev, "%s: Couldn't find opp node.\n",
__func__);
ret = -ENOENT;
goto put_cpu_node;
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index fabd5ca1a083..af9f2b849a66 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -61,10 +61,7 @@ extern struct list_head opp_tables;
* @turbo: true if turbo (boost) OPP
* @suspend: true if suspend OPP
* @rate: Frequency in hertz
- * @u_volt: Target voltage in microvolts corresponding to this OPP
- * @u_volt_min: Minimum voltage in microvolts corresponding to this OPP
- * @u_volt_max: Maximum voltage in microvolts corresponding to this OPP
- * @u_amp: Maximum current drawn by the device in microamperes
+ * @supplies: Power supplies voltage/current values
* @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
* frequency from any other OPP's frequency.
* @opp_table: points back to the opp_table struct this opp belongs to
@@ -83,10 +80,8 @@ struct dev_pm_opp {
bool suspend;
unsigned long rate;
- unsigned long u_volt;
- unsigned long u_volt_min;
- unsigned long u_volt_max;
- unsigned long u_amp;
+ struct dev_pm_opp_supply *supplies;
+
unsigned long clock_latency_ns;
struct opp_table *opp_table;
@@ -144,7 +139,10 @@ enum opp_table_access {
* @supported_hw_count: Number of elements in supported_hw array.
* @prop_name: A name to postfix to many DT properties, while parsing them.
* @clk: Device's clock handle
- * @regulator: Supply regulator
+ * @regulators: Supply regulators
+ * @regulator_count: Number of power supply regulators
+ * @set_opp: Platform specific set_opp callback
+ * @set_opp_data: Data to be passed to set_opp callback
* @dentry: debugfs dentry pointer of the real device directory (not links).
* @dentry_name: Name of the real dentry.
*
@@ -179,7 +177,11 @@ struct opp_table {
unsigned int supported_hw_count;
const char *prop_name;
struct clk *clk;
- struct regulator *regulator;
+ struct regulator **regulators;
+ unsigned int regulator_count;
+
+ int (*set_opp)(struct dev_pm_set_opp_data *data);
+ struct dev_pm_set_opp_data *set_opp_data;
#ifdef CONFIG_DEBUG_FS
struct dentry *dentry;
@@ -190,7 +192,6 @@ struct opp_table {
/* Routines internal to opp core */
struct opp_table *_find_opp_table(struct device *dev);
struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
-struct device_node *_of_get_opp_desc_node(struct device *dev);
void _dev_pm_opp_remove_table(struct device *dev, bool remove_all);
struct dev_pm_opp *_allocate_opp(struct device *dev, struct opp_table **opp_table);
int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table);
diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h
index 50e30e7b059d..a84332aefc2d 100644
--- a/drivers/base/power/power.h
+++ b/drivers/base/power/power.h
@@ -21,14 +21,22 @@ extern void pm_runtime_init(struct device *dev);
extern void pm_runtime_reinit(struct device *dev);
extern void pm_runtime_remove(struct device *dev);
+#define WAKE_IRQ_DEDICATED_ALLOCATED BIT(0)
+#define WAKE_IRQ_DEDICATED_MANAGED BIT(1)
+#define WAKE_IRQ_DEDICATED_MASK (WAKE_IRQ_DEDICATED_ALLOCATED | \
+ WAKE_IRQ_DEDICATED_MANAGED)
+
struct wake_irq {
struct device *dev;
+ unsigned int status;
int irq;
- bool dedicated_irq:1;
};
extern void dev_pm_arm_wake_irq(struct wake_irq *wirq);
extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq);
+extern void dev_pm_enable_wake_irq_check(struct device *dev,
+ bool can_change_status);
+extern void dev_pm_disable_wake_irq_check(struct device *dev);
#ifdef CONFIG_PM_SLEEP
@@ -104,6 +112,15 @@ static inline void dev_pm_disarm_wake_irq(struct wake_irq *wirq)
{
}
+static inline void dev_pm_enable_wake_irq_check(struct device *dev,
+ bool can_change_status)
+{
+}
+
+static inline void dev_pm_disable_wake_irq_check(struct device *dev)
+{
+}
+
#endif
#ifdef CONFIG_PM_SLEEP
diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 7f3646e459cb..58fcc758334e 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -856,7 +856,10 @@ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val)
struct dev_pm_qos_request *req;
if (val < 0) {
- ret = -EINVAL;
+ if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT)
+ ret = 0;
+ else
+ ret = -EINVAL;
goto out;
}
req = kzalloc(sizeof(*req), GFP_KERNEL);
@@ -883,6 +886,7 @@ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val)
mutex_unlock(&dev_pm_qos_mtx);
return ret;
}
+EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance);
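
/*
 * Illustrative sketch, not part of this patch: with the change above, a
 * caller can now pass the special negative "no constraint" value directly.
 * The "foo" name is hypothetical.
 */
static int foo_drop_latency_constraint(struct device *dev)
{
	return dev_pm_qos_update_user_latency_tolerance(dev,
			PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT);
}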
/**
* dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 82a081ea4317..26856d050037 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -241,7 +241,8 @@ static int rpm_check_suspend_allowed(struct device *dev)
retval = -EACCES;
else if (atomic_read(&dev->power.usage_count) > 0)
retval = -EAGAIN;
- else if (!pm_children_suspended(dev))
+ else if (!dev->power.ignore_children &&
+ atomic_read(&dev->power.child_count))
retval = -EBUSY;
/* Pending resume requests take precedence over suspends. */
@@ -515,7 +516,7 @@ static int rpm_suspend(struct device *dev, int rpmflags)
callback = RPM_GET_CALLBACK(dev, runtime_suspend);
- dev_pm_enable_wake_irq(dev);
+ dev_pm_enable_wake_irq_check(dev, true);
retval = rpm_callback(callback, dev);
if (retval)
goto fail;
@@ -554,7 +555,7 @@ static int rpm_suspend(struct device *dev, int rpmflags)
return retval;
fail:
- dev_pm_disable_wake_irq(dev);
+ dev_pm_disable_wake_irq_check(dev);
__update_runtime_status(dev, RPM_ACTIVE);
dev->power.deferred_resume = false;
wake_up_all(&dev->power.wait_queue);
@@ -712,8 +713,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
spin_lock(&parent->power.lock);
/*
- * We can resume if the parent's runtime PM is disabled or it
- * is set to ignore children.
+		 * Resume the parent if it has runtime PM enabled and has not
+		 * been set to ignore its children.
*/
if (!parent->power.disable_depth
&& !parent->power.ignore_children) {
@@ -737,12 +738,12 @@ static int rpm_resume(struct device *dev, int rpmflags)
callback = RPM_GET_CALLBACK(dev, runtime_resume);
- dev_pm_disable_wake_irq(dev);
+ dev_pm_disable_wake_irq_check(dev);
retval = rpm_callback(callback, dev);
if (retval) {
__update_runtime_status(dev, RPM_SUSPENDED);
pm_runtime_cancel_pending(dev);
- dev_pm_enable_wake_irq(dev);
+ dev_pm_enable_wake_irq_check(dev, false);
} else {
no_callback:
__update_runtime_status(dev, RPM_ACTIVE);
@@ -1027,7 +1028,17 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status)
goto out_set;
if (status == RPM_SUSPENDED) {
- /* It always is possible to set the status to 'suspended'. */
+ /*
+ * It is invalid to suspend a device with an active child,
+ * unless it has been set to ignore its children.
+ */
+ if (!dev->power.ignore_children &&
+ atomic_read(&dev->power.child_count)) {
+ dev_err(dev, "runtime PM trying to suspend device but active child\n");
+ error = -EBUSY;
+ goto out;
+ }
+
if (parent) {
atomic_add_unless(&parent->power.child_count, -1, 0);
notify_parent = !parent->power.ignore_children;
@@ -1478,6 +1489,16 @@ int pm_runtime_force_suspend(struct device *dev)
if (ret)
goto err;
+ /*
+	 * Increase the runtime PM usage count for the device's parent, in case
+	 * the device was in use when system suspend was invoked. This informs
+	 * pm_runtime_force_resume() to resume the parent immediately, which is
+	 * needed to be able to resume its children, when the resume isn't
+	 * deferred to be managed via runtime PM.
+ */
+ if (dev->parent && atomic_read(&dev->power.usage_count) > 1)
+ pm_runtime_get_noresume(dev->parent);
+
pm_runtime_set_suspended(dev);
return 0;
err:
@@ -1487,16 +1508,20 @@ err:
EXPORT_SYMBOL_GPL(pm_runtime_force_suspend);
/**
- * pm_runtime_force_resume - Force a device into resume state.
+ * pm_runtime_force_resume - Force a device into resume state if needed.
* @dev: Device to resume.
*
 * Prior to invoking this function we expect the user to have brought the device
* into low power state by a call to pm_runtime_force_suspend(). Here we reverse
- * those actions and brings the device into full power. We update the runtime PM
- * status and re-enables runtime PM.
+ * those actions and bring the device into full power, if it is expected to be
+ * used on system resume. To distinguish that, we check whether the runtime PM
+ * usage count is greater than 1 (the PM core increases the usage count in the
+ * system PM prepare phase), as that indicates a real user (such as a subsystem,
+ * driver, userspace, etc.) is using it. If that is the case, the device is
+ * expected to be used on system resume as well, so then we resume it. In the
+ * other case, we defer the resume to be managed via runtime PM.
*
- * Typically this function may be invoked from a system resume callback to make
- * sure the device is put into full power state.
+ * Typically this function may be invoked from a system resume callback.
*/
int pm_runtime_force_resume(struct device *dev)
{
@@ -1513,6 +1538,17 @@ int pm_runtime_force_resume(struct device *dev)
if (!pm_runtime_status_suspended(dev))
goto out;
+ /*
+ * Decrease the parent's runtime PM usage count, if we increased it
+ * during system suspend in pm_runtime_force_suspend().
+ */
+ if (atomic_read(&dev->power.usage_count) > 1) {
+ if (dev->parent)
+ pm_runtime_put_noidle(dev->parent);
+ } else {
+ goto out;
+ }
+
ret = pm_runtime_set_active(dev);
if (ret)
goto out;
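
/*
 * Illustrative sketch, not part of this patch: a driver reusing its runtime
 * PM callbacks for system sleep via the force helpers documented above, so
 * an unused device stays suspended across system sleep. The "foo" callbacks
 * are hypothetical.
 */
static const struct dev_pm_ops foo_pm_ops = {
	SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
				pm_runtime_force_resume)
	SET_RUNTIME_PM_OPS(foo_runtime_suspend, foo_runtime_resume, NULL)
};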
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index a7b46798c81d..33b4b902741a 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -263,7 +263,11 @@ static ssize_t pm_qos_latency_tolerance_store(struct device *dev,
s32 value;
int ret;
- if (kstrtos32(buf, 0, &value)) {
+ if (kstrtos32(buf, 0, &value) == 0) {
+ /* Users can't write negative values directly */
+ if (value < 0)
+ return -EINVAL;
+ } else {
if (!strcmp(buf, "auto") || !strcmp(buf, "auto\n"))
value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT;
else if (!strcmp(buf, "any") || !strcmp(buf, "any\n"))
diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c
index 0d77cd6fd8d1..404d94c6c8bc 100644
--- a/drivers/base/power/wakeirq.c
+++ b/drivers/base/power/wakeirq.c
@@ -110,8 +110,10 @@ void dev_pm_clear_wake_irq(struct device *dev)
dev->power.wakeirq = NULL;
spin_unlock_irqrestore(&dev->power.lock, flags);
- if (wirq->dedicated_irq)
+ if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED) {
free_irq(wirq->irq, wirq);
+ wirq->status &= ~WAKE_IRQ_DEDICATED_MASK;
+ }
kfree(wirq);
}
EXPORT_SYMBOL_GPL(dev_pm_clear_wake_irq);
@@ -179,7 +181,6 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq)
wirq->dev = dev;
wirq->irq = irq;
- wirq->dedicated_irq = true;
irq_set_status_flags(irq, IRQ_NOAUTOEN);
/*
@@ -195,6 +196,8 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq)
if (err)
goto err_free_irq;
+ wirq->status = WAKE_IRQ_DEDICATED_ALLOCATED;
+
return err;
err_free_irq:
@@ -210,9 +213,9 @@ EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq);
* dev_pm_enable_wake_irq - Enable device wake-up interrupt
* @dev: Device
*
- * Called from the bus code or the device driver for
- * runtime_suspend() to enable the wake-up interrupt while
- * the device is running.
+ * Optionally called from the bus code or the device driver for
+ * runtime_resume() to override the PM runtime core managed wake-up
+ * interrupt handling to enable the wake-up interrupt.
*
 * Note that for runtime_suspend() the wake-up interrupts
* should be unconditionally enabled unlike for suspend()
@@ -222,7 +225,7 @@ void dev_pm_enable_wake_irq(struct device *dev)
{
struct wake_irq *wirq = dev->power.wakeirq;
- if (wirq && wirq->dedicated_irq)
+ if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED))
enable_irq(wirq->irq);
}
EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq);
@@ -231,20 +234,73 @@ EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq);
* dev_pm_disable_wake_irq - Disable device wake-up interrupt
* @dev: Device
*
- * Called from the bus code or the device driver for
- * runtime_resume() to disable the wake-up interrupt while
- * the device is running.
+ * Optionally called from the bus code or the device driver for
+ * runtime_suspend() to override the PM runtime core managed wake-up
+ * interrupt handling to disable the wake-up interrupt.
*/
void dev_pm_disable_wake_irq(struct device *dev)
{
struct wake_irq *wirq = dev->power.wakeirq;
- if (wirq && wirq->dedicated_irq)
+ if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED))
disable_irq_nosync(wirq->irq);
}
EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq);
/**
+ * dev_pm_enable_wake_irq_check - Checks and enables wake-up interrupt
+ * @dev: Device
+ * @can_change_status: Can change wake-up interrupt status
+ *
+ * Enables wakeirq conditionally. We need to enable the wake-up interrupt
+ * lazily on the first rpm_suspend(). This is needed as the consumer device
+ * starts in RPM_SUSPENDED state, and the first pm_runtime_get() would
+ * otherwise try to disable an already disabled wakeirq. The wake-up interrupt
+ * starts disabled with IRQ_NOAUTOEN set.
+ *
+ * Should only be called from the rpm_suspend() and rpm_resume() paths.
+ * Caller must hold &dev->power.lock to change wirq->status.
+ */
+void dev_pm_enable_wake_irq_check(struct device *dev,
+ bool can_change_status)
+{
+ struct wake_irq *wirq = dev->power.wakeirq;
+
+	if (!wirq || !(wirq->status & WAKE_IRQ_DEDICATED_MASK))
+ return;
+
+ if (likely(wirq->status & WAKE_IRQ_DEDICATED_MANAGED)) {
+ goto enable;
+ } else if (can_change_status) {
+ wirq->status |= WAKE_IRQ_DEDICATED_MANAGED;
+ goto enable;
+ }
+
+ return;
+
+enable:
+ enable_irq(wirq->irq);
+}
+
+/**
+ * dev_pm_disable_wake_irq_check - Checks and disables wake-up interrupt
+ * @dev: Device
+ *
+ * Disables wake-up interrupt conditionally based on status.
+ * Should only be called from the rpm_suspend() and rpm_resume() paths.
+ */
+void dev_pm_disable_wake_irq_check(struct device *dev)
+{
+ struct wake_irq *wirq = dev->power.wakeirq;
+
+	if (!wirq || !(wirq->status & WAKE_IRQ_DEDICATED_MASK))
+ return;
+
+ if (wirq->status & WAKE_IRQ_DEDICATED_MANAGED)
+ disable_irq_nosync(wirq->irq);
+}
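
/*
 * Illustrative sketch, not part of this patch: a driver handing a dedicated
 * wake-up interrupt to the PM core at probe time; the core then manages it
 * through the *_check() helpers above. The "foo" name and the IRQ index are
 * hypothetical.
 */
static int foo_probe(struct platform_device *pdev)
{
	int irq = platform_get_irq(pdev, 1);

	if (irq < 0)
		return irq;

	device_init_wakeup(&pdev->dev, true);
	return dev_pm_set_dedicated_wake_irq(&pdev->dev, irq);
}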
+
+/**
* dev_pm_arm_wake_irq - Arm device wake-up
* @wirq: Device wake-up interrupt
*
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 62e4de2aa8d1..bf9ba26981a5 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -811,7 +811,7 @@ void pm_print_active_wakeup_sources(void)
rcu_read_lock();
list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
if (ws->active) {
- pr_info("active wakeup source: %s\n", ws->name);
+ pr_debug("active wakeup source: %s\n", ws->name);
active = 1;
} else if (!active &&
(!last_activity_ws ||
@@ -822,7 +822,7 @@ void pm_print_active_wakeup_sources(void)
}
if (!active && last_activity_ws)
- pr_info("last active wakeup source: %s\n",
+ pr_debug("last active wakeup source: %s\n",
last_activity_ws->name);
rcu_read_unlock();
}
@@ -905,7 +905,7 @@ bool pm_get_wakeup_count(unsigned int *count, bool block)
split_counters(&cnt, &inpr);
if (inpr == 0 || signal_pending(current))
break;
-
+ pm_print_active_wakeup_sources();
schedule();
}
finish_wait(&wakeup_count_wait_queue, &wait);
diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index d89b8afe23b6..920c469f3953 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -12,6 +12,27 @@ config ARM_BIG_LITTLE_CPUFREQ
help
This enables the Generic CPUfreq driver for ARM big.LITTLE platforms.
+config ARM_BRCMSTB_AVS_CPUFREQ
+ tristate "Broadcom STB AVS CPUfreq driver"
+ depends on ARCH_BRCMSTB || COMPILE_TEST
+ default y
+ help
+ Some Broadcom STB SoCs use a co-processor running proprietary firmware
+ ("AVS") to handle voltage and frequency scaling. This driver provides
+	  a standard CPUfreq interface to the firmware.
+
+	  Say Y if you have a Broadcom SoC with AVS support for DFS or DVFS.
+
+config ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
+ bool "Broadcom STB AVS CPUfreq driver sysfs debug capability"
+ depends on ARM_BRCMSTB_AVS_CPUFREQ
+ help
+	  Enabling this option turns on debug support via debugfs under
+	  /sys/kernel/debug/brcmstb-avs-cpufreq. It is possible to read all and
+	  write some AVS mailbox registers through debugfs entries.
+
+ If in doubt, say N.
+
config ARM_DT_BL_CPUFREQ
tristate "Generic probing via DT for ARM big LITTLE CPUfreq driver"
depends on ARM_BIG_LITTLE_CPUFREQ && OF
@@ -60,14 +81,6 @@ config ARM_IMX6Q_CPUFREQ
If in doubt, say N.
-config ARM_INTEGRATOR
- tristate "CPUfreq driver for ARM Integrator CPUs"
- depends on ARCH_INTEGRATOR
- default y
- help
- This enables the CPUfreq driver for ARM Integrator CPUs.
- If in doubt, say Y.
-
config ARM_KIRKWOOD_CPUFREQ
def_bool MACH_KIRKWOOD
help
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 0a9b6a093646..1e46c3918e7a 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -51,12 +51,12 @@ obj-$(CONFIG_ARM_BIG_LITTLE_CPUFREQ) += arm_big_little.o
# LITTLE drivers, so that it is probed last.
obj-$(CONFIG_ARM_DT_BL_CPUFREQ) += arm_big_little_dt.o
+obj-$(CONFIG_ARM_BRCMSTB_AVS_CPUFREQ) += brcmstb-avs-cpufreq.o
obj-$(CONFIG_ARCH_DAVINCI) += davinci-cpufreq.o
obj-$(CONFIG_UX500_SOC_DB8500) += dbx500-cpufreq.o
obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ) += exynos5440-cpufreq.o
obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ) += highbank-cpufreq.o
obj-$(CONFIG_ARM_IMX6Q_CPUFREQ) += imx6q-cpufreq.o
-obj-$(CONFIG_ARM_INTEGRATOR) += integrator-cpufreq.o
obj-$(CONFIG_ARM_KIRKWOOD_CPUFREQ) += kirkwood-cpufreq.o
obj-$(CONFIG_ARM_MT8173_CPUFREQ) += mt8173-cpufreq.o
obj-$(CONFIG_ARM_OMAP2PLUS_CPUFREQ) += omap-cpufreq.o
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 297e9128fe9f..3a98702b7445 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -84,7 +84,6 @@ static inline struct acpi_processor_performance *to_perf_data(struct acpi_cpufre
static struct cpufreq_driver acpi_cpufreq_driver;
static unsigned int acpi_pstate_strict;
-static struct msr __percpu *msrs;
static bool boost_state(unsigned int cpu)
{
@@ -104,11 +103,10 @@ static bool boost_state(unsigned int cpu)
return false;
}
-static void boost_set_msrs(bool enable, const struct cpumask *cpumask)
+static int boost_set_msr(bool enable)
{
- u32 cpu;
u32 msr_addr;
- u64 msr_mask;
+ u64 msr_mask, val;
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
@@ -120,26 +118,31 @@ static void boost_set_msrs(bool enable, const struct cpumask *cpumask)
msr_mask = MSR_K7_HWCR_CPB_DIS;
break;
default:
- return;
+ return -EINVAL;
}
- rdmsr_on_cpus(cpumask, msr_addr, msrs);
+ rdmsrl(msr_addr, val);
- for_each_cpu(cpu, cpumask) {
- struct msr *reg = per_cpu_ptr(msrs, cpu);
- if (enable)
- reg->q &= ~msr_mask;
- else
- reg->q |= msr_mask;
- }
+ if (enable)
+ val &= ~msr_mask;
+ else
+ val |= msr_mask;
+
+ wrmsrl(msr_addr, val);
+ return 0;
+}
+
+static void boost_set_msr_each(void *p_en)
+{
+ bool enable = (bool) p_en;
- wrmsr_on_cpus(cpumask, msr_addr, msrs);
+ boost_set_msr(enable);
}
static int set_boost(int val)
{
get_online_cpus();
- boost_set_msrs(val, cpu_online_mask);
+ on_each_cpu(boost_set_msr_each, (void *)(long)val, 1);
put_online_cpus();
pr_debug("Core Boosting %sabled.\n", val ? "en" : "dis");
@@ -536,46 +539,24 @@ static void free_acpi_perf_data(void)
free_percpu(acpi_perf_data);
}
-static int boost_notify(struct notifier_block *nb, unsigned long action,
- void *hcpu)
+static int cpufreq_boost_online(unsigned int cpu)
{
- unsigned cpu = (long)hcpu;
- const struct cpumask *cpumask;
-
- cpumask = get_cpu_mask(cpu);
+ /*
+ * On the CPU_UP path we simply keep the boost-disable flag
+ * in sync with the current global state.
+ */
+ return boost_set_msr(acpi_cpufreq_driver.boost_enabled);
+}
+static int cpufreq_boost_down_prep(unsigned int cpu)
+{
/*
* Clear the boost-disable bit on the CPU_DOWN path so that
- * this cpu cannot block the remaining ones from boosting. On
- * the CPU_UP path we simply keep the boost-disable flag in
- * sync with the current global state.
+ * this cpu cannot block the remaining ones from boosting.
*/
-
- switch (action) {
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask);
- break;
-
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- boost_set_msrs(1, cpumask);
- break;
-
- default:
- break;
- }
-
- return NOTIFY_OK;
+ return boost_set_msr(1);
}
-
-static struct notifier_block boost_nb = {
- .notifier_call = boost_notify,
-};
-
/*
* acpi_cpufreq_early_init - initialize ACPI P-States library
*
@@ -922,37 +903,35 @@ static struct cpufreq_driver acpi_cpufreq_driver = {
.attr = acpi_cpufreq_attr,
};
+static enum cpuhp_state acpi_cpufreq_online;
+
static void __init acpi_cpufreq_boost_init(void)
{
- if (boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA)) {
- msrs = msrs_alloc();
-
- if (!msrs)
- return;
-
- acpi_cpufreq_driver.set_boost = set_boost;
- acpi_cpufreq_driver.boost_enabled = boost_state(0);
-
- cpu_notifier_register_begin();
+ int ret;
- /* Force all MSRs to the same value */
- boost_set_msrs(acpi_cpufreq_driver.boost_enabled,
- cpu_online_mask);
+ if (!(boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA)))
+ return;
- __register_cpu_notifier(&boost_nb);
+ acpi_cpufreq_driver.set_boost = set_boost;
+ acpi_cpufreq_driver.boost_enabled = boost_state(0);
- cpu_notifier_register_done();
+ /*
+	 * This calls the online callback on all online CPUs and forces all
+ * MSRs to the same value.
+ */
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cpufreq/acpi:online",
+ cpufreq_boost_online, cpufreq_boost_down_prep);
+ if (ret < 0) {
+ pr_err("acpi_cpufreq: failed to register hotplug callbacks\n");
+ return;
}
+ acpi_cpufreq_online = ret;
}
static void acpi_cpufreq_boost_exit(void)
{
- if (msrs) {
- unregister_cpu_notifier(&boost_nb);
-
- msrs_free(msrs);
- msrs = NULL;
- }
+ if (acpi_cpufreq_online >= 0)
+ cpuhp_remove_state_nocalls(acpi_cpufreq_online);
}
static int __init acpi_cpufreq_init(void)
diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c
new file mode 100644
index 000000000000..4fda623e55bb
--- /dev/null
+++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c
@@ -0,0 +1,1057 @@
+/*
+ * CPU frequency scaling for Broadcom SoCs with AVS firmware that
+ * supports DVS or DVFS
+ *
+ * Copyright (c) 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/*
+ * "AVS" is the name of a firmware developed at Broadcom. It derives
+ * its name from the technique called "Adaptive Voltage Scaling".
+ * Adaptive voltage scaling was the original purpose of this firmware.
+ * The AVS firmware still supports "AVS mode", where all it does is
+ * adaptive voltage scaling. However, on some newer Broadcom SoCs, the
+ * AVS Firmware, despite its unchanged name, also supports DFS mode and
+ * DVFS mode.
+ *
+ * In the context of this document and the related driver, "AVS" by
+ * itself always means the Broadcom firmware and never refers to the
+ * technique called "Adaptive Voltage Scaling".
+ *
+ * The Broadcom STB AVS CPUfreq driver provides voltage and frequency
+ * scaling on Broadcom SoCs using AVS firmware with support for DFS and
+ * DVFS. The AVS firmware is running on its own co-processor. The
+ * driver supports both uniprocessor (UP) and symmetric multiprocessor
+ * (SMP) systems which share clock and voltage across all CPUs.
+ *
+ * Actual voltage and frequency scaling is done solely by the AVS
+ * firmware. This driver does not change frequency or voltage itself.
+ * It provides a standard CPUfreq interface to the rest of the kernel
+ * and to userland. It interfaces with the AVS firmware to effect the
+ * requested changes and to report back the current system status in a
+ * way that is expected by existing tools.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/semaphore.h>
+
+#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#endif
+
+/* Max number of arguments AVS calls take */
+#define AVS_MAX_CMD_ARGS 4
+/*
+ * This macro is used to generate AVS parameter register offsets. For
+ * x >= AVS_MAX_CMD_ARGS, it returns 0 to protect against accidental memory
+ * access outside of the parameter range. (Offset 0 is the first parameter.)
+ */
+#define AVS_PARAM_MULT(x) ((x) < AVS_MAX_CMD_ARGS ? (x) : 0)
+
+/* AVS Mailbox Register offsets */
+#define AVS_MBOX_COMMAND 0x00
+#define AVS_MBOX_STATUS 0x04
+#define AVS_MBOX_VOLTAGE0 0x08
+#define AVS_MBOX_TEMP0 0x0c
+#define AVS_MBOX_PV0 0x10
+#define AVS_MBOX_MV0 0x14
+#define AVS_MBOX_PARAM(x) (0x18 + AVS_PARAM_MULT(x) * sizeof(u32))
+#define AVS_MBOX_REVISION 0x28
+#define AVS_MBOX_PSTATE 0x2c
+#define AVS_MBOX_HEARTBEAT 0x30
+#define AVS_MBOX_MAGIC 0x34
+#define AVS_MBOX_SIGMA_HVT 0x38
+#define AVS_MBOX_SIGMA_SVT 0x3c
+#define AVS_MBOX_VOLTAGE1 0x40
+#define AVS_MBOX_TEMP1 0x44
+#define AVS_MBOX_PV1 0x48
+#define AVS_MBOX_MV1 0x4c
+#define AVS_MBOX_FREQUENCY 0x50
+
+/* AVS Commands */
+#define AVS_CMD_AVAILABLE 0x00
+#define AVS_CMD_DISABLE 0x10
+#define AVS_CMD_ENABLE 0x11
+#define AVS_CMD_S2_ENTER 0x12
+#define AVS_CMD_S2_EXIT 0x13
+#define AVS_CMD_BBM_ENTER 0x14
+#define AVS_CMD_BBM_EXIT 0x15
+#define AVS_CMD_S3_ENTER 0x16
+#define AVS_CMD_S3_EXIT 0x17
+#define AVS_CMD_BALANCE 0x18
+/* PMAP and P-STATE commands */
+#define AVS_CMD_GET_PMAP 0x30
+#define AVS_CMD_SET_PMAP 0x31
+#define AVS_CMD_GET_PSTATE 0x40
+#define AVS_CMD_SET_PSTATE 0x41
+
+/* Different modes AVS supports (for GET_PMAP/SET_PMAP) */
+#define AVS_MODE_AVS 0x0
+#define AVS_MODE_DFS 0x1
+#define AVS_MODE_DVS 0x2
+#define AVS_MODE_DVFS 0x3
+
+/*
+ * PMAP parameter p1
+ * unused:31-24, mdiv_p0:23-16, unused:15-14, pdiv:13-10 , ndiv_int:9-0
+ */
+#define NDIV_INT_SHIFT 0
+#define NDIV_INT_MASK 0x3ff
+#define PDIV_SHIFT 10
+#define PDIV_MASK 0xf
+#define MDIV_P0_SHIFT 16
+#define MDIV_P0_MASK 0xff
+/*
+ * PMAP parameter p2
+ * mdiv_p4:31-24, mdiv_p3:23-16, mdiv_p2:15:8, mdiv_p1:7:0
+ */
+#define MDIV_P1_SHIFT 0
+#define MDIV_P1_MASK 0xff
+#define MDIV_P2_SHIFT 8
+#define MDIV_P2_MASK 0xff
+#define MDIV_P3_SHIFT 16
+#define MDIV_P3_MASK 0xff
+#define MDIV_P4_SHIFT 24
+#define MDIV_P4_MASK 0xff
+
+/* Different P-STATES AVS supports (for GET_PSTATE/SET_PSTATE) */
+#define AVS_PSTATE_P0 0x0
+#define AVS_PSTATE_P1 0x1
+#define AVS_PSTATE_P2 0x2
+#define AVS_PSTATE_P3 0x3
+#define AVS_PSTATE_P4 0x4
+#define AVS_PSTATE_MAX AVS_PSTATE_P4
+
+/* CPU L2 Interrupt Controller Registers */
+#define AVS_CPU_L2_SET0 0x04
+#define AVS_CPU_L2_INT_MASK BIT(31)
+
+/* AVS Command Status Values */
+#define AVS_STATUS_CLEAR 0x00
+/* Command/notification accepted */
+#define AVS_STATUS_SUCCESS 0xf0
+/* Command/notification rejected */
+#define AVS_STATUS_FAILURE 0xff
+/* Invalid command/notification (unknown) */
+#define AVS_STATUS_INVALID 0xf1
+/* Non-AVS modes are not supported */
+#define AVS_STATUS_NO_SUPP 0xf2
+/* Cannot set P-State until P-Map supplied */
+#define AVS_STATUS_NO_MAP 0xf3
+/* Cannot change P-Map after initial P-Map set */
+#define AVS_STATUS_MAP_SET 0xf4
+/* Max AVS status; higher numbers are used for debugging */
+#define AVS_STATUS_MAX 0xff
+
+/* Other AVS related constants */
+#define AVS_LOOP_LIMIT 10000
+#define AVS_TIMEOUT 300 /* in ms; expected completion is < 10ms */
+#define AVS_FIRMWARE_MAGIC 0xa11600d1
+
+#define BRCM_AVS_CPUFREQ_PREFIX "brcmstb-avs"
+#define BRCM_AVS_CPUFREQ_NAME BRCM_AVS_CPUFREQ_PREFIX "-cpufreq"
+#define BRCM_AVS_CPU_DATA "brcm,avs-cpu-data-mem"
+#define BRCM_AVS_CPU_INTR "brcm,avs-cpu-l2-intr"
+#define BRCM_AVS_HOST_INTR "sw_intr"
+
+struct pmap {
+ unsigned int mode;
+ unsigned int p1;
+ unsigned int p2;
+ unsigned int state;
+};
+
+struct private_data {
+ void __iomem *base;
+ void __iomem *avs_intr_base;
+ struct device *dev;
+#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
+ struct dentry *debugfs;
+#endif
+ struct completion done;
+ struct semaphore sem;
+ struct pmap pmap;
+};
+
+#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
+
+enum debugfs_format {
+ DEBUGFS_NORMAL,
+ DEBUGFS_FLOAT,
+ DEBUGFS_REV,
+};
+
+struct debugfs_data {
+ struct debugfs_entry *entry;
+ struct private_data *priv;
+};
+
+struct debugfs_entry {
+ char *name;
+ u32 offset;
+ fmode_t mode;
+ enum debugfs_format format;
+};
+
+#define DEBUGFS_ENTRY(name, mode, format) { \
+ #name, AVS_MBOX_##name, mode, format \
+}
+
+/*
+ * These are used for debugfs only. Otherwise we use AVS_MBOX_PARAM() directly.
+ */
+#define AVS_MBOX_PARAM1 AVS_MBOX_PARAM(0)
+#define AVS_MBOX_PARAM2 AVS_MBOX_PARAM(1)
+#define AVS_MBOX_PARAM3 AVS_MBOX_PARAM(2)
+#define AVS_MBOX_PARAM4 AVS_MBOX_PARAM(3)
+
+/*
+ * This table stores the name, access permissions and offset for each hardware
+ * register and is used to generate debugfs entries.
+ */
+static struct debugfs_entry debugfs_entries[] = {
+ DEBUGFS_ENTRY(COMMAND, S_IWUSR, DEBUGFS_NORMAL),
+ DEBUGFS_ENTRY(STATUS, S_IWUSR, DEBUGFS_NORMAL),
+ DEBUGFS_ENTRY(VOLTAGE0, 0, DEBUGFS_FLOAT),
+ DEBUGFS_ENTRY(TEMP0, 0, DEBUGFS_FLOAT),
+ DEBUGFS_ENTRY(PV0, 0, DEBUGFS_FLOAT),
+ DEBUGFS_ENTRY(MV0, 0, DEBUGFS_FLOAT),
+ DEBUGFS_ENTRY(PARAM1, S_IWUSR, DEBUGFS_NORMAL),
+ DEBUGFS_ENTRY(PARAM2, S_IWUSR, DEBUGFS_NORMAL),
+ DEBUGFS_ENTRY(PARAM3, S_IWUSR, DEBUGFS_NORMAL),
+ DEBUGFS_ENTRY(PARAM4, S_IWUSR, DEBUGFS_NORMAL),
+ DEBUGFS_ENTRY(REVISION, 0, DEBUGFS_REV),
+ DEBUGFS_ENTRY(PSTATE, 0, DEBUGFS_NORMAL),
+ DEBUGFS_ENTRY(HEARTBEAT, 0, DEBUGFS_NORMAL),
+ DEBUGFS_ENTRY(MAGIC, S_IWUSR, DEBUGFS_NORMAL),
+ DEBUGFS_ENTRY(SIGMA_HVT, 0, DEBUGFS_NORMAL),
+ DEBUGFS_ENTRY(SIGMA_SVT, 0, DEBUGFS_NORMAL),
+ DEBUGFS_ENTRY(VOLTAGE1, 0, DEBUGFS_FLOAT),
+ DEBUGFS_ENTRY(TEMP1, 0, DEBUGFS_FLOAT),
+ DEBUGFS_ENTRY(PV1, 0, DEBUGFS_FLOAT),
+ DEBUGFS_ENTRY(MV1, 0, DEBUGFS_FLOAT),
+ DEBUGFS_ENTRY(FREQUENCY, 0, DEBUGFS_NORMAL),
+};
+
+static int brcm_avs_target_index(struct cpufreq_policy *, unsigned int);
+
+static char *__strtolower(char *s)
+{
+ char *p;
+
+ for (p = s; *p; p++)
+ *p = tolower(*p);
+
+ return s;
+}
+
+#endif /* CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG */
+
+static void __iomem *__map_region(const char *name)
+{
+ struct device_node *np;
+ void __iomem *ptr;
+
+ np = of_find_compatible_node(NULL, NULL, name);
+ if (!np)
+ return NULL;
+
+ ptr = of_iomap(np, 0);
+ of_node_put(np);
+
+ return ptr;
+}
+
+static int __issue_avs_command(struct private_data *priv, int cmd, bool is_send,
+ u32 args[])
+{
+ unsigned long time_left = msecs_to_jiffies(AVS_TIMEOUT);
+ void __iomem *base = priv->base;
+ unsigned int i;
+ int ret;
+ u32 val;
+
+ ret = down_interruptible(&priv->sem);
+ if (ret)
+ return ret;
+
+ /*
+ * Make sure no other command is currently running: cmd is 0 if AVS
+ * co-processor is idle. Due to the guard above, we should almost never
+ * have to wait here.
+ */
+ for (i = 0, val = 1; val != 0 && i < AVS_LOOP_LIMIT; i++)
+ val = readl(base + AVS_MBOX_COMMAND);
+
+ /* Give the caller a chance to retry if AVS is busy. */
+ if (i == AVS_LOOP_LIMIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ /* Clear status before we begin. */
+ writel(AVS_STATUS_CLEAR, base + AVS_MBOX_STATUS);
+
+	/* If the command takes arguments, send them. */
+ if (args && is_send) {
+ for (i = 0; i < AVS_MAX_CMD_ARGS; i++)
+ writel(args[i], base + AVS_MBOX_PARAM(i));
+ }
+
+ /* Protect from spurious interrupts. */
+ reinit_completion(&priv->done);
+
+ /* Now issue the command & tell firmware to wake up to process it. */
+ writel(cmd, base + AVS_MBOX_COMMAND);
+ writel(AVS_CPU_L2_INT_MASK, priv->avs_intr_base + AVS_CPU_L2_SET0);
+
+ /* Wait for AVS co-processor to finish processing the command. */
+ time_left = wait_for_completion_timeout(&priv->done, time_left);
+
+ /*
+ * If the AVS status is not in the expected range, it means AVS didn't
+ * complete our command in time, and we return an error. Also, if there
+ * is no "time left", we timed out waiting for the interrupt.
+ */
+ val = readl(base + AVS_MBOX_STATUS);
+ if (time_left == 0 || val == 0 || val > AVS_STATUS_MAX) {
+ dev_err(priv->dev, "AVS command %#x didn't complete in time\n",
+ cmd);
+ dev_err(priv->dev, " Time left: %u ms, AVS status: %#x\n",
+ jiffies_to_msecs(time_left), val);
+ ret = -ETIMEDOUT;
+ goto out;
+ }
+
+	/* If the command returns arguments, read them back. */
+ if (args && !is_send) {
+ for (i = 0; i < AVS_MAX_CMD_ARGS; i++)
+ args[i] = readl(base + AVS_MBOX_PARAM(i));
+ }
+
+ /* Clear status to tell AVS co-processor we are done. */
+ writel(AVS_STATUS_CLEAR, base + AVS_MBOX_STATUS);
+
+	/* Convert firmware error codes to errno values as far as possible. */
+ switch (val) {
+ case AVS_STATUS_INVALID:
+ ret = -EINVAL;
+ break;
+ case AVS_STATUS_NO_SUPP:
+ ret = -ENOTSUPP;
+ break;
+ case AVS_STATUS_NO_MAP:
+ ret = -ENOENT;
+ break;
+ case AVS_STATUS_MAP_SET:
+ ret = -EEXIST;
+ break;
+ case AVS_STATUS_FAILURE:
+ ret = -EIO;
+ break;
+ }
+
+out:
+ up(&priv->sem);
+
+ return ret;
+}
+
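+/*
+ * Illustrative sketch only (not part of the original driver): since
+ * __issue_avs_command() returns -EAGAIN when the mailbox is still busy,
+ * a caller could retry a bounded number of times before giving up. The
+ * function name and retry parameters below are made up for illustration.
+ */
+#if 0	/* example only */
+static int example_issue_with_retry(struct private_data *priv, int cmd)
+{
+	int i, ret;
+
+	for (i = 0; i < 3; i++) {
+		ret = __issue_avs_command(priv, cmd, false, NULL);
+		if (ret != -EAGAIN)
+			break;
+		usleep_range(1000, 2000);
+	}
+
+	return ret;
+}
+#endif
+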
+static irqreturn_t irq_handler(int irq, void *data)
+{
+ struct private_data *priv = data;
+
+ /* AVS command completed execution. Wake up __issue_avs_command(). */
+ complete(&priv->done);
+
+ return IRQ_HANDLED;
+}
+
+static char *brcm_avs_mode_to_string(unsigned int mode)
+{
+ switch (mode) {
+ case AVS_MODE_AVS:
+ return "AVS";
+ case AVS_MODE_DFS:
+ return "DFS";
+ case AVS_MODE_DVS:
+ return "DVS";
+ case AVS_MODE_DVFS:
+ return "DVFS";
+ }
+	return "unknown";
+}
+
+static void brcm_avs_parse_p1(u32 p1, unsigned int *mdiv_p0, unsigned int *pdiv,
+ unsigned int *ndiv)
+{
+ *mdiv_p0 = (p1 >> MDIV_P0_SHIFT) & MDIV_P0_MASK;
+ *pdiv = (p1 >> PDIV_SHIFT) & PDIV_MASK;
+ *ndiv = (p1 >> NDIV_INT_SHIFT) & NDIV_INT_MASK;
+}
+
+static void brcm_avs_parse_p2(u32 p2, unsigned int *mdiv_p1,
+ unsigned int *mdiv_p2, unsigned int *mdiv_p3,
+ unsigned int *mdiv_p4)
+{
+ *mdiv_p4 = (p2 >> MDIV_P4_SHIFT) & MDIV_P4_MASK;
+ *mdiv_p3 = (p2 >> MDIV_P3_SHIFT) & MDIV_P3_MASK;
+ *mdiv_p2 = (p2 >> MDIV_P2_SHIFT) & MDIV_P2_MASK;
+ *mdiv_p1 = (p2 >> MDIV_P1_SHIFT) & MDIV_P1_MASK;
+}
+
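+/*
+ * Illustrative sketch only, under an assumption: if the PLL follows the
+ * usual integer relationship fout = ref * ndiv / (pdiv * mdiv), the
+ * parsed dividers could be combined as below. The formula, the function
+ * and the "ref_khz" parameter are assumptions made for illustration and
+ * are not taken from this driver or its firmware documentation.
+ */
+#if 0	/* example only */
+static unsigned int example_pstate_khz(u32 p1, u32 p2, unsigned int pstate,
+				       unsigned int ref_khz)
+{
+	unsigned int mdiv[5], pdiv, ndiv;
+
+	brcm_avs_parse_p1(p1, &mdiv[0], &pdiv, &ndiv);
+	brcm_avs_parse_p2(p2, &mdiv[1], &mdiv[2], &mdiv[3], &mdiv[4]);
+
+	return ref_khz * ndiv / (pdiv * mdiv[pstate]);	/* pstate: 0..4 */
+}
+#endif
+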
+static int brcm_avs_get_pmap(struct private_data *priv, struct pmap *pmap)
+{
+ u32 args[AVS_MAX_CMD_ARGS];
+ int ret;
+
+ ret = __issue_avs_command(priv, AVS_CMD_GET_PMAP, false, args);
+ if (ret || !pmap)
+ return ret;
+
+ pmap->mode = args[0];
+ pmap->p1 = args[1];
+ pmap->p2 = args[2];
+ pmap->state = args[3];
+
+ return 0;
+}
+
+static int brcm_avs_set_pmap(struct private_data *priv, struct pmap *pmap)
+{
+ u32 args[AVS_MAX_CMD_ARGS];
+
+ args[0] = pmap->mode;
+ args[1] = pmap->p1;
+ args[2] = pmap->p2;
+ args[3] = pmap->state;
+
+ return __issue_avs_command(priv, AVS_CMD_SET_PMAP, true, args);
+}
+
+static int brcm_avs_get_pstate(struct private_data *priv, unsigned int *pstate)
+{
+ u32 args[AVS_MAX_CMD_ARGS];
+ int ret;
+
+ ret = __issue_avs_command(priv, AVS_CMD_GET_PSTATE, false, args);
+ if (ret)
+ return ret;
+ *pstate = args[0];
+
+ return 0;
+}
+
+static int brcm_avs_set_pstate(struct private_data *priv, unsigned int pstate)
+{
+ u32 args[AVS_MAX_CMD_ARGS];
+
+ args[0] = pstate;
+
+ return __issue_avs_command(priv, AVS_CMD_SET_PSTATE, true, args);
+}
+
+static unsigned long brcm_avs_get_voltage(void __iomem *base)
+{
+ return readl(base + AVS_MBOX_VOLTAGE1);
+}
+
+static unsigned long brcm_avs_get_frequency(void __iomem *base)
+{
+ return readl(base + AVS_MBOX_FREQUENCY) * 1000; /* in kHz */
+}
+
+/*
+ * We determine which frequencies are supported by cycling through all P-states
+ * and reading back what frequency we are running at for each P-state.
+ */
+static struct cpufreq_frequency_table *
+brcm_avs_get_freq_table(struct device *dev, struct private_data *priv)
+{
+ struct cpufreq_frequency_table *table;
+ unsigned int pstate;
+ int i, ret;
+
+ /* Remember P-state for later */
+ ret = brcm_avs_get_pstate(priv, &pstate);
+ if (ret)
+ return ERR_PTR(ret);
+
+	/*
+	 * One entry per P-state plus one CPUFREQ_TABLE_END sentinel; the
+	 * sentinel is written at index AVS_PSTATE_MAX + 1 below.
+	 */
+	table = devm_kzalloc(dev, (AVS_PSTATE_MAX + 2) * sizeof(*table),
+			     GFP_KERNEL);
+ if (!table)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = AVS_PSTATE_P0; i <= AVS_PSTATE_MAX; i++) {
+ ret = brcm_avs_set_pstate(priv, i);
+ if (ret)
+ return ERR_PTR(ret);
+ table[i].frequency = brcm_avs_get_frequency(priv->base);
+ table[i].driver_data = i;
+ }
+ table[i].frequency = CPUFREQ_TABLE_END;
+
+ /* Restore P-state */
+ ret = brcm_avs_set_pstate(priv, pstate);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return table;
+}
+
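+/*
+ * Illustrative sketch only: the table built above is a standard cpufreq
+ * frequency table terminated by CPUFREQ_TABLE_END, so it can be walked
+ * with the generic cpufreq_for_each_entry() iterator, e.g. to dump the
+ * discovered P-state/frequency pairs. The function below is made up for
+ * illustration.
+ */
+#if 0	/* example only */
+static void example_dump_freq_table(struct device *dev,
+				    struct cpufreq_frequency_table *table)
+{
+	struct cpufreq_frequency_table *pos;
+
+	cpufreq_for_each_entry(pos, table)
+		dev_dbg(dev, "P%u: %u kHz\n", pos->driver_data,
+			pos->frequency);
+}
+#endif
+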
+#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
+
+#define MANT(x) (unsigned int)(abs((x)) / 1000)
+#define FRAC(x) (unsigned int)(abs((x)) - abs((x)) / 1000 * 1000)
+
+static int brcm_avs_debug_show(struct seq_file *s, void *data)
+{
+ struct debugfs_data *dbgfs = s->private;
+ void __iomem *base;
+ u32 val, offset;
+
+ if (!dbgfs) {
+ seq_puts(s, "No device pointer\n");
+ return 0;
+ }
+
+ base = dbgfs->priv->base;
+ offset = dbgfs->entry->offset;
+ val = readl(base + offset);
+ switch (dbgfs->entry->format) {
+ case DEBUGFS_NORMAL:
+ seq_printf(s, "%u\n", val);
+ break;
+ case DEBUGFS_FLOAT:
+		seq_printf(s, "%u.%03u\n", MANT(val), FRAC(val));
+ break;
+ case DEBUGFS_REV:
+ seq_printf(s, "%c.%c.%c.%c\n", (val >> 24 & 0xff),
+ (val >> 16 & 0xff), (val >> 8 & 0xff),
+ val & 0xff);
+ break;
+ }
+ seq_printf(s, "0x%08x\n", val);
+
+ return 0;
+}
+
+#undef MANT
+#undef FRAC
+
+static ssize_t brcm_avs_seq_write(struct file *file, const char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct seq_file *s = file->private_data;
+ struct debugfs_data *dbgfs = s->private;
+ struct private_data *priv = dbgfs->priv;
+ void __iomem *base, *avs_intr_base;
+ bool use_issue_command = false;
+ unsigned long val, offset;
+ char str[128];
+ int ret;
+ char *str_ptr = str;
+
+ if (size >= sizeof(str))
+ return -E2BIG;
+
+ memset(str, 0, sizeof(str));
+	/* copy_from_user() returns the number of bytes not copied. */
+	if (copy_from_user(str, buf, size))
+		return -EFAULT;
+
+ base = priv->base;
+ avs_intr_base = priv->avs_intr_base;
+ offset = dbgfs->entry->offset;
+ /*
+ * Special case writing to "command" entry only: if the string starts
+ * with a 'c', we use the driver's __issue_avs_command() function.
+ * Otherwise, we perform a raw write. This should allow testing of raw
+ * access as well as using the higher level function. (Raw access
+ * doesn't clear the firmware return status after issuing the command.)
+ */
+ if (str_ptr[0] == 'c' && offset == AVS_MBOX_COMMAND) {
+ use_issue_command = true;
+ str_ptr++;
+ }
+ if (kstrtoul(str_ptr, 0, &val) != 0)
+ return -EINVAL;
+
+ /*
+ * Setting the P-state is a special case. We need to update the CPU
+ * frequency we report.
+ */
+	if (val == AVS_CMD_SET_PSTATE) {
+		struct cpufreq_policy *policy;
+		unsigned int pstate;
+
+		policy = cpufreq_cpu_get(smp_processor_id());
+		if (!policy)
+			return -ENODEV;
+
+		/* Read back the P-state we are about to set. */
+		pstate = readl(base + AVS_MBOX_PARAM(0));
+		if (use_issue_command) {
+			ret = brcm_avs_target_index(policy, pstate);
+			cpufreq_cpu_put(policy);
+			return ret ? ret : size;
+		}
+		policy->cur = policy->freq_table[pstate].frequency;
+		cpufreq_cpu_put(policy);
+	}
+
+ if (use_issue_command) {
+ ret = __issue_avs_command(priv, val, false, NULL);
+ } else {
+		/* Locking here is not perfect, but this path is for debugging only. */
+ ret = down_interruptible(&priv->sem);
+ if (ret)
+ return ret;
+
+ writel(val, base + offset);
+ /* We have to wake up the firmware to process a command. */
+ if (offset == AVS_MBOX_COMMAND)
+ writel(AVS_CPU_L2_INT_MASK,
+ avs_intr_base + AVS_CPU_L2_SET0);
+ up(&priv->sem);
+ }
+
+ return ret ? ret : size;
+}
+
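+/*
+ * Usage sketch for the write handler above (the command number is
+ * illustrative, not a documented AVS command): with debugfs mounted at
+ * /sys/kernel/debug, writing a plain value performs a raw mailbox write,
+ * while prefixing it with 'c' routes it through __issue_avs_command():
+ *
+ *   echo 0x42  > /sys/kernel/debug/brcmstb-avs-cpufreq/command   # raw
+ *   echo c0x42 > /sys/kernel/debug/brcmstb-avs-cpufreq/command   # cooked
+ */
+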
+static struct debugfs_entry *__find_debugfs_entry(const char *name)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(debugfs_entries); i++)
+ if (strcasecmp(debugfs_entries[i].name, name) == 0)
+ return &debugfs_entries[i];
+
+ return NULL;
+}
+
+static int brcm_avs_debug_open(struct inode *inode, struct file *file)
+{
+ struct debugfs_data *data;
+ fmode_t fmode;
+ int ret;
+
+ /*
+ * seq_open(), which is called by single_open(), clears "write" access.
+ * We need write access to some files, so we preserve our access mode
+ * and restore it.
+ */
+ fmode = file->f_mode;
+ /*
+ * Check access permissions even for root. We don't want to be writing
+ * to read-only registers. Access for regular users has already been
+ * checked by the VFS layer.
+ */
+ if ((fmode & FMODE_WRITER) && !(inode->i_mode & S_IWUSR))
+ return -EACCES;
+
+ data = kmalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ /*
+ * We use the same file system operations for all our debug files. To
+ * produce specific output, we look up the file name upon opening a
+ * debugfs entry and map it to a memory offset. This offset is then used
+ * in the generic "show" function to read a specific register.
+ */
+ data->entry = __find_debugfs_entry(file->f_path.dentry->d_iname);
+ data->priv = inode->i_private;
+
+ ret = single_open(file, brcm_avs_debug_show, data);
+ if (ret)
+ kfree(data);
+ file->f_mode = fmode;
+
+ return ret;
+}
+
+static int brcm_avs_debug_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq_priv = file->private_data;
+ struct debugfs_data *data = seq_priv->private;
+
+ kfree(data);
+ return single_release(inode, file);
+}
+
+static const struct file_operations brcm_avs_debug_ops = {
+ .open = brcm_avs_debug_open,
+ .read = seq_read,
+ .write = brcm_avs_seq_write,
+ .llseek = seq_lseek,
+ .release = brcm_avs_debug_release,
+};
+
+static void brcm_avs_cpufreq_debug_init(struct platform_device *pdev)
+{
+ struct private_data *priv = platform_get_drvdata(pdev);
+ struct dentry *dir;
+ int i;
+
+ if (!priv)
+ return;
+
+ dir = debugfs_create_dir(BRCM_AVS_CPUFREQ_NAME, NULL);
+ if (IS_ERR_OR_NULL(dir))
+ return;
+ priv->debugfs = dir;
+
+ for (i = 0; i < ARRAY_SIZE(debugfs_entries); i++) {
+ /*
+ * The DEBUGFS_ENTRY macro generates uppercase strings. We
+ * convert them to lowercase before creating the debugfs
+ * entries.
+ */
+ char *entry = __strtolower(debugfs_entries[i].name);
+ fmode_t mode = debugfs_entries[i].mode;
+
+ if (!debugfs_create_file(entry, S_IFREG | S_IRUGO | mode,
+ dir, priv, &brcm_avs_debug_ops)) {
+ priv->debugfs = NULL;
+ debugfs_remove_recursive(dir);
+ break;
+ }
+ }
+}
+
+static void brcm_avs_cpufreq_debug_exit(struct platform_device *pdev)
+{
+ struct private_data *priv = platform_get_drvdata(pdev);
+
+ if (priv && priv->debugfs) {
+ debugfs_remove_recursive(priv->debugfs);
+ priv->debugfs = NULL;
+ }
+}
+
+#else
+
+static void brcm_avs_cpufreq_debug_init(struct platform_device *pdev) {}
+static void brcm_avs_cpufreq_debug_exit(struct platform_device *pdev) {}
+
+#endif /* CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG */
+
+/*
+ * To ensure the right firmware is running we need to
+ *  - check that the MAGIC value matches what we expect, and
+ *  - check that brcm_avs_get_pmap() doesn't return -ENOTSUPP or -EINVAL.
+ * We need to set up our interrupt handling before calling
+ * brcm_avs_get_pmap()!
+ */
+static bool brcm_avs_is_firmware_loaded(struct private_data *priv)
+{
+ u32 magic;
+ int rc;
+
+ rc = brcm_avs_get_pmap(priv, NULL);
+ magic = readl(priv->base + AVS_MBOX_MAGIC);
+
+ return (magic == AVS_FIRMWARE_MAGIC) && (rc != -ENOTSUPP) &&
+ (rc != -EINVAL);
+}
+
+static unsigned int brcm_avs_cpufreq_get(unsigned int cpu)
+{
+	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+	struct private_data *priv;
+
+	if (!policy)
+		return 0;
+
+	priv = policy->driver_data;
+	cpufreq_cpu_put(policy);
+
+	return brcm_avs_get_frequency(priv->base);
+}
+
+static int brcm_avs_target_index(struct cpufreq_policy *policy,
+ unsigned int index)
+{
+ return brcm_avs_set_pstate(policy->driver_data,
+ policy->freq_table[index].driver_data);
+}
+
+static int brcm_avs_suspend(struct cpufreq_policy *policy)
+{
+ struct private_data *priv = policy->driver_data;
+
+ return brcm_avs_get_pmap(priv, &priv->pmap);
+}
+
+static int brcm_avs_resume(struct cpufreq_policy *policy)
+{
+ struct private_data *priv = policy->driver_data;
+ int ret;
+
+ ret = brcm_avs_set_pmap(priv, &priv->pmap);
+ if (ret == -EEXIST) {
+ struct platform_device *pdev = cpufreq_get_driver_data();
+ struct device *dev = &pdev->dev;
+
+ dev_warn(dev, "PMAP was already set\n");
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/*
+ * All initialization code that we only want to execute once goes here. Setup
+ * code that can be re-tried on every core (if it failed before) can go into
+ * brcm_avs_cpufreq_init().
+ */
+static int brcm_avs_prepare_init(struct platform_device *pdev)
+{
+ struct private_data *priv;
+ struct device *dev;
+ int host_irq, ret;
+
+ dev = &pdev->dev;
+ priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->dev = dev;
+ sema_init(&priv->sem, 1);
+ init_completion(&priv->done);
+ platform_set_drvdata(pdev, priv);
+
+ priv->base = __map_region(BRCM_AVS_CPU_DATA);
+ if (!priv->base) {
+ dev_err(dev, "Couldn't find property %s in device tree.\n",
+ BRCM_AVS_CPU_DATA);
+ return -ENOENT;
+ }
+
+ priv->avs_intr_base = __map_region(BRCM_AVS_CPU_INTR);
+ if (!priv->avs_intr_base) {
+ dev_err(dev, "Couldn't find property %s in device tree.\n",
+ BRCM_AVS_CPU_INTR);
+ ret = -ENOENT;
+ goto unmap_base;
+ }
+
+ host_irq = platform_get_irq_byname(pdev, BRCM_AVS_HOST_INTR);
+ if (host_irq < 0) {
+ dev_err(dev, "Couldn't find interrupt %s -- %d\n",
+ BRCM_AVS_HOST_INTR, host_irq);
+ ret = host_irq;
+ goto unmap_intr_base;
+ }
+
+ ret = devm_request_irq(dev, host_irq, irq_handler, IRQF_TRIGGER_RISING,
+ BRCM_AVS_HOST_INTR, priv);
+ if (ret) {
+ dev_err(dev, "IRQ request failed: %s (%d) -- %d\n",
+ BRCM_AVS_HOST_INTR, host_irq, ret);
+ goto unmap_intr_base;
+ }
+
+ if (brcm_avs_is_firmware_loaded(priv))
+ return 0;
+
+ dev_err(dev, "AVS firmware is not loaded or doesn't support DVFS\n");
+ ret = -ENODEV;
+
+unmap_intr_base:
+ iounmap(priv->avs_intr_base);
+unmap_base:
+ iounmap(priv->base);
+ platform_set_drvdata(pdev, NULL);
+
+ return ret;
+}
+
+static int brcm_avs_cpufreq_init(struct cpufreq_policy *policy)
+{
+ struct cpufreq_frequency_table *freq_table;
+ struct platform_device *pdev;
+ struct private_data *priv;
+ struct device *dev;
+ int ret;
+
+ pdev = cpufreq_get_driver_data();
+ priv = platform_get_drvdata(pdev);
+ policy->driver_data = priv;
+ dev = &pdev->dev;
+
+ freq_table = brcm_avs_get_freq_table(dev, priv);
+ if (IS_ERR(freq_table)) {
+ ret = PTR_ERR(freq_table);
+ dev_err(dev, "Couldn't determine frequency table (%d).\n", ret);
+ return ret;
+ }
+
+ ret = cpufreq_table_validate_and_show(policy, freq_table);
+ if (ret) {
+ dev_err(dev, "invalid frequency table: %d\n", ret);
+ return ret;
+ }
+
+ /* All cores share the same clock and thus the same policy. */
+ cpumask_setall(policy->cpus);
+
+ ret = __issue_avs_command(priv, AVS_CMD_ENABLE, false, NULL);
+ if (!ret) {
+ unsigned int pstate;
+
+ ret = brcm_avs_get_pstate(priv, &pstate);
+ if (!ret) {
+ policy->cur = freq_table[pstate].frequency;
+ dev_info(dev, "registered\n");
+ return 0;
+ }
+ }
+
+ dev_err(dev, "couldn't initialize driver (%d)\n", ret);
+
+ return ret;
+}
+
+static ssize_t show_brcm_avs_pstate(struct cpufreq_policy *policy, char *buf)
+{
+ struct private_data *priv = policy->driver_data;
+ unsigned int pstate;
+
+ if (brcm_avs_get_pstate(priv, &pstate))
+ return sprintf(buf, "<unknown>\n");
+
+ return sprintf(buf, "%u\n", pstate);
+}
+
+static ssize_t show_brcm_avs_mode(struct cpufreq_policy *policy, char *buf)
+{
+ struct private_data *priv = policy->driver_data;
+ struct pmap pmap;
+
+ if (brcm_avs_get_pmap(priv, &pmap))
+ return sprintf(buf, "<unknown>\n");
+
+ return sprintf(buf, "%s %u\n", brcm_avs_mode_to_string(pmap.mode),
+ pmap.mode);
+}
+
+static ssize_t show_brcm_avs_pmap(struct cpufreq_policy *policy, char *buf)
+{
+ unsigned int mdiv_p0, mdiv_p1, mdiv_p2, mdiv_p3, mdiv_p4;
+ struct private_data *priv = policy->driver_data;
+ unsigned int ndiv, pdiv;
+ struct pmap pmap;
+
+ if (brcm_avs_get_pmap(priv, &pmap))
+ return sprintf(buf, "<unknown>\n");
+
+ brcm_avs_parse_p1(pmap.p1, &mdiv_p0, &pdiv, &ndiv);
+ brcm_avs_parse_p2(pmap.p2, &mdiv_p1, &mdiv_p2, &mdiv_p3, &mdiv_p4);
+
+ return sprintf(buf, "0x%08x 0x%08x %u %u %u %u %u %u %u\n",
+ pmap.p1, pmap.p2, ndiv, pdiv, mdiv_p0, mdiv_p1, mdiv_p2,
+ mdiv_p3, mdiv_p4);
+}
+
+static ssize_t show_brcm_avs_voltage(struct cpufreq_policy *policy, char *buf)
+{
+ struct private_data *priv = policy->driver_data;
+
+ return sprintf(buf, "0x%08lx\n", brcm_avs_get_voltage(priv->base));
+}
+
+static ssize_t show_brcm_avs_frequency(struct cpufreq_policy *policy, char *buf)
+{
+ struct private_data *priv = policy->driver_data;
+
+ return sprintf(buf, "0x%08lx\n", brcm_avs_get_frequency(priv->base));
+}
+
+cpufreq_freq_attr_ro(brcm_avs_pstate);
+cpufreq_freq_attr_ro(brcm_avs_mode);
+cpufreq_freq_attr_ro(brcm_avs_pmap);
+cpufreq_freq_attr_ro(brcm_avs_voltage);
+cpufreq_freq_attr_ro(brcm_avs_frequency);
+
+static struct freq_attr *brcm_avs_cpufreq_attr[] = {
+ &cpufreq_freq_attr_scaling_available_freqs,
+ &brcm_avs_pstate,
+ &brcm_avs_mode,
+ &brcm_avs_pmap,
+ &brcm_avs_voltage,
+ &brcm_avs_frequency,
+ NULL
+};
+
+static struct cpufreq_driver brcm_avs_driver = {
+ .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+ .verify = cpufreq_generic_frequency_table_verify,
+ .target_index = brcm_avs_target_index,
+ .get = brcm_avs_cpufreq_get,
+ .suspend = brcm_avs_suspend,
+ .resume = brcm_avs_resume,
+ .init = brcm_avs_cpufreq_init,
+ .attr = brcm_avs_cpufreq_attr,
+ .name = BRCM_AVS_CPUFREQ_PREFIX,
+};
+
+static int brcm_avs_cpufreq_probe(struct platform_device *pdev)
+{
+ int ret;
+
+ ret = brcm_avs_prepare_init(pdev);
+ if (ret)
+ return ret;
+
+ brcm_avs_driver.driver_data = pdev;
+ ret = cpufreq_register_driver(&brcm_avs_driver);
+ if (!ret)
+ brcm_avs_cpufreq_debug_init(pdev);
+
+ return ret;
+}
+
+static int brcm_avs_cpufreq_remove(struct platform_device *pdev)
+{
+ struct private_data *priv;
+ int ret;
+
+ ret = cpufreq_unregister_driver(&brcm_avs_driver);
+ if (ret)
+ return ret;
+
+ brcm_avs_cpufreq_debug_exit(pdev);
+
+ priv = platform_get_drvdata(pdev);
+ iounmap(priv->base);
+ iounmap(priv->avs_intr_base);
+ platform_set_drvdata(pdev, NULL);
+
+ return 0;
+}
+
+static const struct of_device_id brcm_avs_cpufreq_match[] = {
+ { .compatible = BRCM_AVS_CPU_DATA },
+ { }
+};
+MODULE_DEVICE_TABLE(of, brcm_avs_cpufreq_match);
+
+static struct platform_driver brcm_avs_cpufreq_platdrv = {
+ .driver = {
+ .name = BRCM_AVS_CPUFREQ_NAME,
+ .of_match_table = brcm_avs_cpufreq_match,
+ },
+ .probe = brcm_avs_cpufreq_probe,
+ .remove = brcm_avs_cpufreq_remove,
+};
+module_platform_driver(brcm_avs_cpufreq_platdrv);
+
+MODULE_AUTHOR("Markus Mayer <mmayer@broadcom.com>");
+MODULE_DESCRIPTION("CPUfreq driver for Broadcom STB AVS");
+MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 4852d9efe74e..e82bb3c30b92 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -247,3 +247,10 @@ MODULE_DESCRIPTION("CPUFreq driver based on the ACPI CPPC v5.0+ spec");
MODULE_LICENSE("GPL");
late_initcall(cppc_cpufreq_init);
+
+static const struct acpi_device_id cppc_acpi_ids[] = {
+ {ACPI_PROCESSOR_DEVICE_HID, },
+ {}
+};
+
+MODULE_DEVICE_TABLE(acpi, cppc_acpi_ids);
diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index 71267626456b..bc97b6a4b1cf 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -26,6 +26,9 @@ static const struct of_device_id machines[] __initconst = {
{ .compatible = "allwinner,sun8i-a83t", },
{ .compatible = "allwinner,sun8i-h3", },
+ { .compatible = "arm,integrator-ap", },
+ { .compatible = "arm,integrator-cp", },
+
{ .compatible = "hisilicon,hi6220", },
{ .compatible = "fsl,imx27", },
@@ -34,6 +37,8 @@ static const struct of_device_id machines[] __initconst = {
{ .compatible = "fsl,imx7d", },
{ .compatible = "marvell,berlin", },
+ { .compatible = "marvell,pxa250", },
+ { .compatible = "marvell,pxa270", },
{ .compatible = "samsung,exynos3250", },
{ .compatible = "samsung,exynos4210", },
@@ -50,6 +55,8 @@ static const struct of_device_id machines[] __initconst = {
{ .compatible = "renesas,r7s72100", },
{ .compatible = "renesas,r8a73a4", },
{ .compatible = "renesas,r8a7740", },
+ { .compatible = "renesas,r8a7743", },
+ { .compatible = "renesas,r8a7745", },
{ .compatible = "renesas,r8a7778", },
{ .compatible = "renesas,r8a7779", },
{ .compatible = "renesas,r8a7790", },
@@ -72,6 +79,12 @@ static const struct of_device_id machines[] __initconst = {
{ .compatible = "sigma,tango4" },
+ { .compatible = "socionext,uniphier-pro5", },
+ { .compatible = "socionext,uniphier-pxs2", },
+ { .compatible = "socionext,uniphier-ld6b", },
+ { .compatible = "socionext,uniphier-ld11", },
+ { .compatible = "socionext,uniphier-ld20", },
+
{ .compatible = "ti,am33xx", },
{ .compatible = "ti,dra7", },
{ .compatible = "ti,omap2", },
@@ -81,6 +94,8 @@ static const struct of_device_id machines[] __initconst = {
{ .compatible = "xlnx,zynq-7000", },
+ { .compatible = "zte,zx296718", },
+
{ }
};
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 5c07ae05d69a..269013311e79 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -28,6 +28,7 @@
#include "cpufreq-dt.h"
struct private_data {
+ struct opp_table *opp_table;
struct device *cpu_dev;
struct thermal_cooling_device *cdev;
const char *reg_name;
@@ -143,6 +144,7 @@ static int resources_available(void)
static int cpufreq_init(struct cpufreq_policy *policy)
{
struct cpufreq_frequency_table *freq_table;
+ struct opp_table *opp_table = NULL;
struct private_data *priv;
struct device *cpu_dev;
struct clk *cpu_clk;
@@ -186,8 +188,9 @@ static int cpufreq_init(struct cpufreq_policy *policy)
*/
name = find_supply_name(cpu_dev);
if (name) {
- ret = dev_pm_opp_set_regulator(cpu_dev, name);
- if (ret) {
+ opp_table = dev_pm_opp_set_regulators(cpu_dev, &name, 1);
+ if (IS_ERR(opp_table)) {
+ ret = PTR_ERR(opp_table);
dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n",
policy->cpu, ret);
goto out_put_clk;
@@ -237,6 +240,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
}
priv->reg_name = name;
+ priv->opp_table = opp_table;
ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
if (ret) {
@@ -285,7 +289,7 @@ out_free_priv:
out_free_opp:
dev_pm_opp_of_cpumask_remove_table(policy->cpus);
if (name)
- dev_pm_opp_put_regulator(cpu_dev);
+ dev_pm_opp_put_regulators(opp_table);
out_put_clk:
clk_put(cpu_clk);
@@ -300,7 +304,7 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
if (priv->reg_name)
- dev_pm_opp_put_regulator(priv->cpu_dev);
+ dev_pm_opp_put_regulators(priv->opp_table);
clk_put(policy->clk);
kfree(priv);
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 6e6c1fb60fbc..cc475eff90b3 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1526,7 +1526,10 @@ unsigned int cpufreq_get(unsigned int cpu)
if (policy) {
down_read(&policy->rwsem);
- ret_freq = __cpufreq_get(policy);
+
+ if (!policy_is_inactive(policy))
+ ret_freq = __cpufreq_get(policy);
+
up_read(&policy->rwsem);
cpufreq_cpu_put(policy);
@@ -2254,17 +2257,19 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
* Useful for policy notifiers which have different necessities
* at different times.
*/
-int cpufreq_update_policy(unsigned int cpu)
+void cpufreq_update_policy(unsigned int cpu)
{
struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
struct cpufreq_policy new_policy;
- int ret;
if (!policy)
- return -ENODEV;
+ return;
down_write(&policy->rwsem);
+ if (policy_is_inactive(policy))
+ goto unlock;
+
pr_debug("updating policy for CPU %u\n", cpu);
memcpy(&new_policy, policy, sizeof(*policy));
new_policy.min = policy->user_policy.min;
@@ -2275,24 +2280,20 @@ int cpufreq_update_policy(unsigned int cpu)
* -> ask driver for current freq and notify governors about a change
*/
if (cpufreq_driver->get && !cpufreq_driver->setpolicy) {
- if (cpufreq_suspended) {
- ret = -EAGAIN;
+ if (cpufreq_suspended)
goto unlock;
- }
+
new_policy.cur = cpufreq_update_current_freq(policy);
- if (WARN_ON(!new_policy.cur)) {
- ret = -EIO;
+ if (WARN_ON(!new_policy.cur))
goto unlock;
- }
}
- ret = cpufreq_set_policy(policy, &new_policy);
+ cpufreq_set_policy(policy, &new_policy);
unlock:
up_write(&policy->rwsem);
cpufreq_cpu_put(policy);
- return ret;
}
EXPORT_SYMBOL(cpufreq_update_policy);
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 13475890d792..992f7c20760f 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -37,16 +37,16 @@ struct cs_dbs_tuners {
#define DEF_SAMPLING_DOWN_FACTOR (1)
#define MAX_SAMPLING_DOWN_FACTOR (10)
-static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners,
- struct cpufreq_policy *policy)
+static inline unsigned int get_freq_step(struct cs_dbs_tuners *cs_tuners,
+ struct cpufreq_policy *policy)
{
- unsigned int freq_target = (cs_tuners->freq_step * policy->max) / 100;
+ unsigned int freq_step = (cs_tuners->freq_step * policy->max) / 100;
/* max freq cannot be less than 100. But who knows... */
- if (unlikely(freq_target == 0))
- freq_target = DEF_FREQUENCY_STEP;
+ if (unlikely(freq_step == 0))
+ freq_step = DEF_FREQUENCY_STEP;
- return freq_target;
+ return freq_step;
}
/*
@@ -55,10 +55,10 @@ static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners,
* sampling_down_factor, we check, if current idle time is more than 80%
* (default), then we try to decrease frequency
*
- * Any frequency increase takes it to the maximum frequency. Frequency reduction
- * happens at minimum steps of 5% (default) of maximum frequency
+ * Frequency updates happen at minimum steps of 5% (default) of maximum
+ * frequency
*/
-static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
+static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
{
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
@@ -66,6 +66,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
struct dbs_data *dbs_data = policy_dbs->dbs_data;
struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
unsigned int load = dbs_update(policy);
+ unsigned int freq_step;
/*
* break out if we 'cannot' reduce the speed as the user might
@@ -82,6 +83,23 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
if (requested_freq > policy->max || requested_freq < policy->min)
requested_freq = policy->cur;
+ freq_step = get_freq_step(cs_tuners, policy);
+
+ /*
+ * Decrease requested_freq one freq_step for each idle period that
+ * we didn't update the frequency.
+ */
+ if (policy_dbs->idle_periods < UINT_MAX) {
+ unsigned int freq_steps = policy_dbs->idle_periods * freq_step;
+
+ if (requested_freq > freq_steps)
+ requested_freq -= freq_steps;
+ else
+ requested_freq = policy->min;
+
+ policy_dbs->idle_periods = UINT_MAX;
+ }
+
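+	/*
+	 * Worked example: with freq_step at its 5% default and a policy
+	 * maximum of 2 GHz, freq_step is 100 MHz, so four idle periods
+	 * reduce requested_freq by 4 * 100 MHz = 400 MHz, clamped at
+	 * policy->min.
+	 */
+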
/* Check for frequency increase */
if (load > dbs_data->up_threshold) {
dbs_info->down_skip = 0;
@@ -90,7 +108,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
if (requested_freq == policy->max)
goto out;
- requested_freq += get_freq_target(cs_tuners, policy);
+ requested_freq += freq_step;
if (requested_freq > policy->max)
requested_freq = policy->max;
@@ -106,16 +124,14 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
/* Check for frequency decrease */
if (load < cs_tuners->down_threshold) {
- unsigned int freq_target;
/*
* if we cannot reduce the frequency anymore, break out early
*/
if (requested_freq == policy->min)
goto out;
- freq_target = get_freq_target(cs_tuners, policy);
- if (requested_freq > freq_target)
- requested_freq -= freq_target;
+ if (requested_freq > freq_step)
+ requested_freq -= freq_step;
else
requested_freq = policy->min;
@@ -305,7 +321,7 @@ static void cs_start(struct cpufreq_policy *policy)
static struct dbs_governor cs_governor = {
.gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("conservative"),
.kobj_type = { .default_attrs = cs_attributes },
- .gov_dbs_timer = cs_dbs_timer,
+ .gov_dbs_update = cs_dbs_update,
.alloc = cs_alloc,
.free = cs_free,
.init = cs_init,
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 642dd0f183a8..0196467280bd 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -61,7 +61,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
* entries can't be freed concurrently.
*/
list_for_each_entry(policy_dbs, &attr_set->policy_list, list) {
- mutex_lock(&policy_dbs->timer_mutex);
+ mutex_lock(&policy_dbs->update_mutex);
/*
* On 32-bit architectures this may race with the
* sample_delay_ns read in dbs_update_util_handler(), but that
@@ -76,7 +76,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
* taken, so it shouldn't be significant.
*/
gov_update_sample_delay(policy_dbs, 0);
- mutex_unlock(&policy_dbs->timer_mutex);
+ mutex_unlock(&policy_dbs->update_mutex);
}
return count;
@@ -117,7 +117,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct dbs_data *dbs_data = policy_dbs->dbs_data;
unsigned int ignore_nice = dbs_data->ignore_nice_load;
- unsigned int max_load = 0;
+ unsigned int max_load = 0, idle_periods = UINT_MAX;
unsigned int sampling_rate, io_busy, j;
/*
@@ -215,9 +215,19 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
j_cdbs->prev_load = load;
}
+ if (time_elapsed > 2 * sampling_rate) {
+ unsigned int periods = time_elapsed / sampling_rate;
+
+ if (periods < idle_periods)
+ idle_periods = periods;
+ }
+
if (load > max_load)
max_load = load;
}
+
+ policy_dbs->idle_periods = idle_periods;
+
return max_load;
}
EXPORT_SYMBOL_GPL(dbs_update);
@@ -236,9 +246,9 @@ static void dbs_work_handler(struct work_struct *work)
* Make sure cpufreq_governor_limits() isn't evaluating load or the
* ondemand governor isn't updating the sampling rate in parallel.
*/
- mutex_lock(&policy_dbs->timer_mutex);
- gov_update_sample_delay(policy_dbs, gov->gov_dbs_timer(policy));
- mutex_unlock(&policy_dbs->timer_mutex);
+ mutex_lock(&policy_dbs->update_mutex);
+ gov_update_sample_delay(policy_dbs, gov->gov_dbs_update(policy));
+ mutex_unlock(&policy_dbs->update_mutex);
/* Allow the utilization update handler to queue up more work. */
atomic_set(&policy_dbs->work_count, 0);
@@ -348,7 +358,7 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli
return NULL;
policy_dbs->policy = policy;
- mutex_init(&policy_dbs->timer_mutex);
+ mutex_init(&policy_dbs->update_mutex);
atomic_set(&policy_dbs->work_count, 0);
init_irq_work(&policy_dbs->irq_work, dbs_irq_work);
INIT_WORK(&policy_dbs->work, dbs_work_handler);
@@ -367,7 +377,7 @@ static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs,
{
int j;
- mutex_destroy(&policy_dbs->timer_mutex);
+ mutex_destroy(&policy_dbs->update_mutex);
for_each_cpu(j, policy_dbs->policy->related_cpus) {
struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
@@ -547,10 +557,10 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy)
{
struct policy_dbs_info *policy_dbs = policy->governor_data;
- mutex_lock(&policy_dbs->timer_mutex);
+ mutex_lock(&policy_dbs->update_mutex);
cpufreq_policy_apply_limits(policy);
gov_update_sample_delay(policy_dbs, 0);
- mutex_unlock(&policy_dbs->timer_mutex);
+ mutex_unlock(&policy_dbs->update_mutex);
}
EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_limits);
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index ef1037e9c92b..f5717ca070cc 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -85,7 +85,7 @@ struct policy_dbs_info {
* Per policy mutex that serializes load evaluation from limit-change
* and work-handler.
*/
- struct mutex timer_mutex;
+ struct mutex update_mutex;
u64 last_sample_time;
s64 sample_delay_ns;
@@ -97,6 +97,7 @@ struct policy_dbs_info {
struct list_head list;
/* Multiplier for increasing sample delay temporarily. */
unsigned int rate_mult;
+ unsigned int idle_periods; /* For conservative */
/* Status indicators */
bool is_shared; /* This object is used by multiple CPUs */
bool work_in_progress; /* Work is being queued up or in progress */
@@ -135,7 +136,7 @@ struct dbs_governor {
*/
struct dbs_data *gdbs_data;
- unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy);
+ unsigned int (*gov_dbs_update)(struct cpufreq_policy *policy);
struct policy_dbs_info *(*alloc)(void);
void (*free)(struct policy_dbs_info *policy_dbs);
int (*init)(struct dbs_data *dbs_data);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 3a1f49f5f4c6..4a017e895296 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -25,7 +25,7 @@
#define MAX_SAMPLING_DOWN_FACTOR (100000)
#define MICRO_FREQUENCY_UP_THRESHOLD (95)
#define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000)
-#define MIN_FREQUENCY_UP_THRESHOLD (11)
+#define MIN_FREQUENCY_UP_THRESHOLD (1)
#define MAX_FREQUENCY_UP_THRESHOLD (100)
static struct od_ops od_ops;
@@ -169,7 +169,7 @@ static void od_update(struct cpufreq_policy *policy)
}
}
-static unsigned int od_dbs_timer(struct cpufreq_policy *policy)
+static unsigned int od_dbs_update(struct cpufreq_policy *policy)
{
struct policy_dbs_info *policy_dbs = policy->governor_data;
struct dbs_data *dbs_data = policy_dbs->dbs_data;
@@ -191,7 +191,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy)
od_update(policy);
if (dbs_info->freq_lo) {
- /* Setup timer for SUB_SAMPLE */
+ /* Setup SUB_SAMPLE */
dbs_info->sample_type = OD_SUB_SAMPLE;
return dbs_info->freq_hi_delay_us;
}
@@ -255,11 +255,11 @@ static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
list_for_each_entry(policy_dbs, &attr_set->policy_list, list) {
/*
* Doing this without locking might lead to using different
- * rate_mult values in od_update() and od_dbs_timer().
+ * rate_mult values in od_update() and od_dbs_update().
*/
- mutex_lock(&policy_dbs->timer_mutex);
+ mutex_lock(&policy_dbs->update_mutex);
policy_dbs->rate_mult = 1;
- mutex_unlock(&policy_dbs->timer_mutex);
+ mutex_unlock(&policy_dbs->update_mutex);
}
return count;
@@ -374,8 +374,7 @@ static int od_init(struct dbs_data *dbs_data)
dbs_data->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
/*
* In nohz/micro accounting case we set the minimum frequency
- * not depending on HZ, but fixed (very low). The deferred
- * timer might skip some samples if idle/sleeping as needed.
+ * not depending on HZ, but fixed (very low).
*/
dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
} else {
@@ -415,7 +414,7 @@ static struct od_ops od_ops = {
static struct dbs_governor od_dbs_gov = {
.gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("ondemand"),
.kobj_type = { .default_attrs = od_attributes },
- .gov_dbs_timer = od_dbs_timer,
+ .gov_dbs_update = od_dbs_update,
.alloc = od_alloc,
.free = od_free,
.init = od_init,
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 06d3abdffd3a..ac284e66839c 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -41,6 +41,18 @@ static int cpufreq_stats_update(struct cpufreq_stats *stats)
return 0;
}
+static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
+{
+ unsigned int count = stats->max_state;
+
+ memset(stats->time_in_state, 0, count * sizeof(u64));
+#ifdef CONFIG_CPU_FREQ_STAT_DETAILS
+ memset(stats->trans_table, 0, count * count * sizeof(int));
+#endif
+ stats->last_time = get_jiffies_64();
+ stats->total_trans = 0;
+}
+
static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf)
{
return sprintf(buf, "%d\n", policy->stats->total_trans);
@@ -64,6 +76,14 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
return len;
}
+static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf,
+ size_t count)
+{
+ /* We don't care what is written to the attribute. */
+ cpufreq_stats_clear_table(policy->stats);
+ return count;
+}
+
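+/*
+ * Usage sketch (illustrative): any write clears the statistics, e.g.
+ *
+ *   echo 1 > /sys/devices/system/cpu/cpu0/cpufreq/stats/reset
+ *
+ * after which time_in_state, total_trans and (if enabled) trans_table
+ * count from zero again.
+ */
+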
#ifdef CONFIG_CPU_FREQ_STAT_DETAILS
static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
{
@@ -113,10 +133,12 @@ cpufreq_freq_attr_ro(trans_table);
cpufreq_freq_attr_ro(total_trans);
cpufreq_freq_attr_ro(time_in_state);
+cpufreq_freq_attr_wo(reset);
static struct attribute *default_attrs[] = {
&total_trans.attr,
&time_in_state.attr,
+ &reset.attr,
#ifdef CONFIG_CPU_FREQ_STAT_DETAILS
&trans_table.attr,
#endif
diff --git a/drivers/cpufreq/integrator-cpufreq.c b/drivers/cpufreq/integrator-cpufreq.c
deleted file mode 100644
index 79e3ff2771a6..000000000000
--- a/drivers/cpufreq/integrator-cpufreq.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * Copyright (C) 2001-2002 Deep Blue Solutions Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * CPU support functions
- */
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/cpufreq.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/io.h>
-#include <linux/platform_device.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-
-#include <asm/mach-types.h>
-#include <asm/hardware/icst.h>
-
-static void __iomem *cm_base;
-/* The cpufreq driver only use the OSC register */
-#define INTEGRATOR_HDR_OSC_OFFSET 0x08
-#define INTEGRATOR_HDR_LOCK_OFFSET 0x14
-
-static struct cpufreq_driver integrator_driver;
-
-static const struct icst_params lclk_params = {
- .ref = 24000000,
- .vco_max = ICST525_VCO_MAX_5V,
- .vco_min = ICST525_VCO_MIN,
- .vd_min = 8,
- .vd_max = 132,
- .rd_min = 24,
- .rd_max = 24,
- .s2div = icst525_s2div,
- .idx2s = icst525_idx2s,
-};
-
-static const struct icst_params cclk_params = {
- .ref = 24000000,
- .vco_max = ICST525_VCO_MAX_5V,
- .vco_min = ICST525_VCO_MIN,
- .vd_min = 12,
- .vd_max = 160,
- .rd_min = 24,
- .rd_max = 24,
- .s2div = icst525_s2div,
- .idx2s = icst525_idx2s,
-};
-
-/*
- * Validate the speed policy.
- */
-static int integrator_verify_policy(struct cpufreq_policy *policy)
-{
- struct icst_vco vco;
-
- cpufreq_verify_within_cpu_limits(policy);
-
- vco = icst_hz_to_vco(&cclk_params, policy->max * 1000);
- policy->max = icst_hz(&cclk_params, vco) / 1000;
-
- vco = icst_hz_to_vco(&cclk_params, policy->min * 1000);
- policy->min = icst_hz(&cclk_params, vco) / 1000;
-
- cpufreq_verify_within_cpu_limits(policy);
- return 0;
-}
-
-
-static int integrator_set_target(struct cpufreq_policy *policy,
- unsigned int target_freq,
- unsigned int relation)
-{
- cpumask_t cpus_allowed;
- int cpu = policy->cpu;
- struct icst_vco vco;
- struct cpufreq_freqs freqs;
- u_int cm_osc;
-
- /*
- * Save this threads cpus_allowed mask.
- */
- cpus_allowed = current->cpus_allowed;
-
- /*
- * Bind to the specified CPU. When this call returns,
- * we should be running on the right CPU.
- */
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
- BUG_ON(cpu != smp_processor_id());
-
- /* get current setting */
- cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET);
-
- if (machine_is_integrator())
- vco.s = (cm_osc >> 8) & 7;
- else if (machine_is_cintegrator())
- vco.s = 1;
- vco.v = cm_osc & 255;
- vco.r = 22;
- freqs.old = icst_hz(&cclk_params, vco) / 1000;
-
- /* icst_hz_to_vco rounds down -- so we need the next
- * larger freq in case of CPUFREQ_RELATION_L.
- */
- if (relation == CPUFREQ_RELATION_L)
- target_freq += 999;
- if (target_freq > policy->max)
- target_freq = policy->max;
- vco = icst_hz_to_vco(&cclk_params, target_freq * 1000);
- freqs.new = icst_hz(&cclk_params, vco) / 1000;
-
- if (freqs.old == freqs.new) {
- set_cpus_allowed_ptr(current, &cpus_allowed);
- return 0;
- }
-
- cpufreq_freq_transition_begin(policy, &freqs);
-
- cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET);
-
- if (machine_is_integrator()) {
- cm_osc &= 0xfffff800;
- cm_osc |= vco.s << 8;
- } else if (machine_is_cintegrator()) {
- cm_osc &= 0xffffff00;
- }
- cm_osc |= vco.v;
-
- __raw_writel(0xa05f, cm_base + INTEGRATOR_HDR_LOCK_OFFSET);
- __raw_writel(cm_osc, cm_base + INTEGRATOR_HDR_OSC_OFFSET);
- __raw_writel(0, cm_base + INTEGRATOR_HDR_LOCK_OFFSET);
-
- /*
- * Restore the CPUs allowed mask.
- */
- set_cpus_allowed_ptr(current, &cpus_allowed);
-
- cpufreq_freq_transition_end(policy, &freqs, 0);
-
- return 0;
-}
-
-static unsigned int integrator_get(unsigned int cpu)
-{
- cpumask_t cpus_allowed;
- unsigned int current_freq;
- u_int cm_osc;
- struct icst_vco vco;
-
- cpus_allowed = current->cpus_allowed;
-
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
- BUG_ON(cpu != smp_processor_id());
-
- /* detect memory etc. */
- cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET);
-
- if (machine_is_integrator())
- vco.s = (cm_osc >> 8) & 7;
- else
- vco.s = 1;
- vco.v = cm_osc & 255;
- vco.r = 22;
-
- current_freq = icst_hz(&cclk_params, vco) / 1000; /* current freq */
-
- set_cpus_allowed_ptr(current, &cpus_allowed);
-
- return current_freq;
-}
-
-static int integrator_cpufreq_init(struct cpufreq_policy *policy)
-{
-
- /* set default policy and cpuinfo */
- policy->max = policy->cpuinfo.max_freq = 160000;
- policy->min = policy->cpuinfo.min_freq = 12000;
- policy->cpuinfo.transition_latency = 1000000; /* 1 ms, assumed */
-
- return 0;
-}
-
-static struct cpufreq_driver integrator_driver = {
- .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK,
- .verify = integrator_verify_policy,
- .target = integrator_set_target,
- .get = integrator_get,
- .init = integrator_cpufreq_init,
- .name = "integrator",
-};
-
-static int __init integrator_cpufreq_probe(struct platform_device *pdev)
-{
- struct resource *res;
-
- res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- if (!res)
- return -ENODEV;
-
- cm_base = devm_ioremap(&pdev->dev, res->start, resource_size(res));
- if (!cm_base)
- return -ENODEV;
-
- return cpufreq_register_driver(&integrator_driver);
-}
-
-static int __exit integrator_cpufreq_remove(struct platform_device *pdev)
-{
- return cpufreq_unregister_driver(&integrator_driver);
-}
-
-static const struct of_device_id integrator_cpufreq_match[] = {
- { .compatible = "arm,core-module-integrator"},
- { },
-};
-
-MODULE_DEVICE_TABLE(of, integrator_cpufreq_match);
-
-static struct platform_driver integrator_cpufreq_driver = {
- .driver = {
- .name = "integrator-cpufreq",
- .of_match_table = integrator_cpufreq_match,
- },
- .remove = __exit_p(integrator_cpufreq_remove),
-};
-
-module_platform_driver_probe(integrator_cpufreq_driver,
- integrator_cpufreq_probe);
-
-MODULE_AUTHOR("Russell M. King");
-MODULE_DESCRIPTION("cpufreq driver for ARM Integrator CPUs");
-MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index e8dc42fc0915..6acbd4af632e 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -37,6 +37,8 @@
#include <asm/cpufeature.h>
#include <asm/intel-family.h>
+#define INTEL_CPUFREQ_TRANSITION_LATENCY 20000
+
#define ATOM_RATIOS 0x66a
#define ATOM_VIDS 0x66b
#define ATOM_TURBO_RATIOS 0x66c
@@ -53,6 +55,8 @@
#define EXT_BITS 6
#define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
+#define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS)
+#define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS)
static inline int32_t mul_fp(int32_t x, int32_t y)
{
@@ -123,6 +127,8 @@ struct sample {
* @scaling: Scaling factor to convert frequency to cpufreq
* frequency units
* @turbo_pstate: Max Turbo P state possible for this platform
+ * @max_freq: @max_pstate frequency in cpufreq units
+ * @turbo_freq: @turbo_pstate frequency in cpufreq units
*
* Stores the per cpu model P state limits and current P state.
*/
@@ -133,6 +139,8 @@ struct pstate_data {
int max_pstate_physical;
int scaling;
int turbo_pstate;
+ unsigned int max_freq;
+ unsigned int turbo_freq;
};
/**
@@ -178,6 +186,48 @@ struct _pid {
};
/**
+ * struct perf_limits - Store user and policy limits
+ * @no_turbo: User requested turbo state from intel_pstate sysfs
+ * @turbo_disabled: Platform turbo status either from msr
+ * MSR_IA32_MISC_ENABLE or when maximum available pstate
+ * matches the maximum turbo pstate
+ * @max_perf_pct:	Effective maximum performance limit in percentage; this
+ *			is the minimum of the limit enforced by the cpufreq
+ *			policy and the user-set limit via intel_pstate sysfs
+ * @min_perf_pct:	Effective minimum performance limit in percentage; this
+ *			is the maximum of the limit enforced by the cpufreq
+ *			policy and the user-set limit via intel_pstate sysfs
+ * @max_perf:		A scaled value between 0 and 255 corresponding to
+ *			max_perf_pct, used to limit the maximum P-state
+ * @min_perf:		A scaled value between 0 and 255 corresponding to
+ *			min_perf_pct, used to limit the minimum P-state
+ * @max_policy_pct: The maximum performance in percentage enforced by
+ * cpufreq setpolicy interface
+ * @max_sysfs_pct: The maximum performance in percentage enforced by
+ * intel pstate sysfs interface, unused when per cpu
+ * controls are enforced
+ * @min_policy_pct: The minimum performance in percentage enforced by
+ * cpufreq setpolicy interface
+ * @min_sysfs_pct: The minimum performance in percentage enforced by
+ * intel pstate sysfs interface, unused when per cpu
+ * controls are enforced
+ *
+ * Storage for user and policy defined limits.
+ */
+struct perf_limits {
+ int no_turbo;
+ int turbo_disabled;
+ int max_perf_pct;
+ int min_perf_pct;
+ int32_t max_perf;
+ int32_t min_perf;
+ int max_policy_pct;
+ int max_sysfs_pct;
+ int min_policy_pct;
+ int min_sysfs_pct;
+};
+
+/**
* struct cpudata - Per CPU instance data storage
* @cpu: CPU number for this instance data
* @policy: CPUFreq policy value
@@ -195,8 +245,19 @@ struct _pid {
* @prev_cummulative_iowait: IO Wait time difference from last and
* current sample
* @sample: Storage for storing last Sample data
+ * @perf_limits:	Pointer to the perf_limits unique to this CPU;
+ *			not all fields in the structure are applicable
+ *			when per-CPU controls are enforced
* @acpi_perf_data: Stores ACPI perf information read from _PSS
* @valid_pss_table: Set to true for valid ACPI _PSS entries found
+ * @epp_powersave: Last saved HWP energy performance preference
+ * (EPP) or energy performance bias (EPB),
+ * when policy switched to performance
+ * @epp_policy: Last saved policy used to set EPP/EPB
+ * @epp_default: Power on default HWP energy performance
+ * preference/bias
+ * @epp_saved: Saved EPP/EPB during system suspend or CPU offline
+ * operation
*
* This structure stores per CPU instance data for all CPUs.
*/
@@ -218,11 +279,16 @@ struct cpudata {
u64 prev_tsc;
u64 prev_cummulative_iowait;
struct sample sample;
+ struct perf_limits *perf_limits;
#ifdef CONFIG_ACPI
struct acpi_processor_performance acpi_perf_data;
bool valid_pss_table;
#endif
unsigned int iowait_boost;
+ s16 epp_powersave;
+ s16 epp_policy;
+ s16 epp_default;
+ s16 epp_saved;
};
static struct cpudata **all_cpu_data;
@@ -236,7 +302,6 @@ static struct cpudata **all_cpu_data;
* @p_gain_pct: PID proportional gain
* @i_gain_pct: PID integral gain
* @d_gain_pct: PID derivative gain
- * @boost_iowait: Whether or not to use iowait boosting.
*
* Stores per CPU model static PID configuration data.
*/
@@ -248,7 +313,6 @@ struct pstate_adjust_policy {
int p_gain_pct;
int d_gain_pct;
int i_gain_pct;
- bool boost_iowait;
};
/**
@@ -292,58 +356,19 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu);
static struct pstate_adjust_policy pid_params __read_mostly;
static struct pstate_funcs pstate_funcs __read_mostly;
static int hwp_active __read_mostly;
+static bool per_cpu_limits __read_mostly;
#ifdef CONFIG_ACPI
static bool acpi_ppc;
#endif
-/**
- * struct perf_limits - Store user and policy limits
- * @no_turbo: User requested turbo state from intel_pstate sysfs
- * @turbo_disabled: Platform turbo status either from msr
- * MSR_IA32_MISC_ENABLE or when maximum available pstate
- * matches the maximum turbo pstate
- * @max_perf_pct: Effective maximum performance limit in percentage, this
- * is minimum of either limits enforced by cpufreq policy
- * or limits from user set limits via intel_pstate sysfs
- * @min_perf_pct: Effective minimum performance limit in percentage, this
- * is maximum of either limits enforced by cpufreq policy
- * or limits from user set limits via intel_pstate sysfs
- * @max_perf: This is a scaled value between 0 to 255 for max_perf_pct
- * This value is used to limit max pstate
- * @min_perf: This is a scaled value between 0 to 255 for min_perf_pct
- * This value is used to limit min pstate
- * @max_policy_pct: The maximum performance in percentage enforced by
- * cpufreq setpolicy interface
- * @max_sysfs_pct: The maximum performance in percentage enforced by
- * intel pstate sysfs interface
- * @min_policy_pct: The minimum performance in percentage enforced by
- * cpufreq setpolicy interface
- * @min_sysfs_pct: The minimum performance in percentage enforced by
- * intel pstate sysfs interface
- *
- * Storage for user and policy defined limits.
- */
-struct perf_limits {
- int no_turbo;
- int turbo_disabled;
- int max_perf_pct;
- int min_perf_pct;
- int32_t max_perf;
- int32_t min_perf;
- int max_policy_pct;
- int max_sysfs_pct;
- int min_policy_pct;
- int min_sysfs_pct;
-};
-
static struct perf_limits performance_limits = {
.no_turbo = 0,
.turbo_disabled = 0,
.max_perf_pct = 100,
- .max_perf = int_tofp(1),
+ .max_perf = int_ext_tofp(1),
.min_perf_pct = 100,
- .min_perf = int_tofp(1),
+ .min_perf = int_ext_tofp(1),
.max_policy_pct = 100,
.max_sysfs_pct = 100,
.min_policy_pct = 0,
@@ -354,7 +379,7 @@ static struct perf_limits powersave_limits = {
.no_turbo = 0,
.turbo_disabled = 0,
.max_perf_pct = 100,
- .max_perf = int_tofp(1),
+ .max_perf = int_ext_tofp(1),
.min_perf_pct = 0,
.min_perf = 0,
.max_policy_pct = 100,
@@ -369,6 +394,8 @@ static struct perf_limits *limits = &performance_limits;
static struct perf_limits *limits = &powersave_limits;
#endif
+static DEFINE_MUTEX(intel_pstate_limits_lock);
+
#ifdef CONFIG_ACPI
static bool intel_pstate_get_ppc_enable_status(void)
@@ -513,11 +540,11 @@ static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
}
#else
-static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
+static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
{
}
-static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
+static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
{
}
#endif
@@ -613,24 +640,252 @@ static inline void update_turbo_state(void)
cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
}
+static s16 intel_pstate_get_epb(struct cpudata *cpu_data)
+{
+ u64 epb;
+ int ret;
+
+ if (!static_cpu_has(X86_FEATURE_EPB))
+ return -ENXIO;
+
+ ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ if (ret)
+ return (s16)ret;
+
+ return (s16)(epb & 0x0f);
+}
+
+static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
+{
+ s16 epp;
+
+ if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
+		/*
+		 * When hwp_req_data is 0, the caller didn't read
+		 * MSR_HWP_REQUEST, so read it here to get the EPP.
+		 */
+ if (!hwp_req_data) {
+ epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST,
+ &hwp_req_data);
+ if (epp)
+ return epp;
+ }
+ epp = (hwp_req_data >> 24) & 0xff;
+ } else {
+ /* When there is no EPP present, HWP uses EPB settings */
+ epp = intel_pstate_get_epb(cpu_data);
+ }
+
+ return epp;
+}
+
+static int intel_pstate_set_epb(int cpu, s16 pref)
+{
+ u64 epb;
+ int ret;
+
+ if (!static_cpu_has(X86_FEATURE_EPB))
+ return -ENXIO;
+
+ ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ if (ret)
+ return ret;
+
+ epb = (epb & ~0x0f) | pref;
+ wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb);
+
+ return 0;
+}
+
+/*
+ * EPP/EPB display strings corresponding to EPP index in the
+ * energy_perf_strings[]
+ * index String
+ *-------------------------------------
+ * 0 default
+ * 1 performance
+ * 2 balance_performance
+ * 3 balance_power
+ * 4 power
+ */
+static const char * const energy_perf_strings[] = {
+ "default",
+ "performance",
+ "balance_performance",
+ "balance_power",
+ "power",
+ NULL
+};
+
+static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data)
+{
+ s16 epp;
+ int index = -EINVAL;
+
+ epp = intel_pstate_get_epp(cpu_data, 0);
+ if (epp < 0)
+ return epp;
+
+ if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
+ /*
+ * Range:
+ * 0x00-0x3F : Performance
+ * 0x40-0x7F : Balance performance
+ * 0x80-0xBF : Balance power
+ * 0xC0-0xFF : Power
+		 * The EPP is an 8-bit value, but our ranges restrict which
+		 * values can be set; effectively, only the top two bits are
+		 * used here.
+ */
+ index = (epp >> 6) + 1;
+ } else if (static_cpu_has(X86_FEATURE_EPB)) {
+ /*
+ * Range:
+ * 0x00-0x03 : Performance
+ * 0x04-0x07 : Balance performance
+ * 0x08-0x0B : Balance power
+ * 0x0C-0x0F : Power
+		 * The EPB is a 4-bit value, but our ranges restrict which
+		 * values can be set; effectively, only the top two bits are
+		 * used here.
+ */
+ index = (epp >> 2) + 1;
+ }
+
+ return index;
+}
+
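+/*
+ * Worked example for the mapping above: with HWP_EPP, a raw EPP of 0x80
+ * yields index (0x80 >> 6) + 1 = 3, i.e. "balance_power" in
+ * energy_perf_strings[]; with EPB only, a raw EPB of 0x06 yields
+ * (0x06 >> 2) + 1 = 2, i.e. "balance_performance".
+ */
+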
+static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data,
+ int pref_index)
+{
+ int epp = -EINVAL;
+ int ret;
+
+ if (!pref_index)
+ epp = cpu_data->epp_default;
+
+ mutex_lock(&intel_pstate_limits_lock);
+
+ if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
+ u64 value;
+
+ ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, &value);
+ if (ret)
+ goto return_pref;
+
+ value &= ~GENMASK_ULL(31, 24);
+
+		/*
+		 * If epp is not the default, convert the index into
+		 * energy_perf_strings to an EPP value by shifting it left
+		 * by 6 bits, so that only the top two bits of the EPP are
+		 * used. The resulting EPP then needs to be shifted left by
+		 * 24 bits into the EPP position in MSR_HWP_REQUEST.
+		 */
+ if (epp == -EINVAL)
+ epp = (pref_index - 1) << 6;
+
+ value |= (u64)epp << 24;
+ ret = wrmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, value);
+ } else {
+ if (epp == -EINVAL)
+ epp = (pref_index - 1) << 2;
+ ret = intel_pstate_set_epb(cpu_data->cpu, epp);
+ }
+return_pref:
+ mutex_unlock(&intel_pstate_limits_lock);
+
+ return ret;
+}
+
+static ssize_t show_energy_performance_available_preferences(
+ struct cpufreq_policy *policy, char *buf)
+{
+ int i = 0;
+ int ret = 0;
+
+ while (energy_perf_strings[i] != NULL)
+ ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
+
+ ret += sprintf(&buf[ret], "\n");
+
+ return ret;
+}
+
+cpufreq_freq_attr_ro(energy_performance_available_preferences);
+
+static ssize_t store_energy_performance_preference(
+ struct cpufreq_policy *policy, const char *buf, size_t count)
+{
+ struct cpudata *cpu_data = all_cpu_data[policy->cpu];
+ char str_preference[21];
+ int ret, i = 0;
+
+ ret = sscanf(buf, "%20s", str_preference);
+ if (ret != 1)
+ return -EINVAL;
+
+ while (energy_perf_strings[i] != NULL) {
+ if (!strcmp(str_preference, energy_perf_strings[i])) {
+ intel_pstate_set_energy_pref_index(cpu_data, i);
+ return count;
+ }
+ ++i;
+ }
+
+ return -EINVAL;
+}
+
+static ssize_t show_energy_performance_preference(
+ struct cpufreq_policy *policy, char *buf)
+{
+ struct cpudata *cpu_data = all_cpu_data[policy->cpu];
+ int preference;
+
+ preference = intel_pstate_get_energy_pref_index(cpu_data);
+ if (preference < 0)
+ return preference;
+
+ return sprintf(buf, "%s\n", energy_perf_strings[preference]);
+}
+
+cpufreq_freq_attr_rw(energy_performance_preference);
+
+static struct freq_attr *hwp_cpufreq_attrs[] = {
+ &energy_performance_preference,
+ &energy_performance_available_preferences,
+ NULL,
+};
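
For reference, the cpufreq_freq_attr_ro()/_rw() lines above bind the
show_/store_ helpers to sysfs attributes of the same name; paraphrased
from include/linux/cpufreq.h (check the exact definitions in the tree
you build against):

#define cpufreq_freq_attr_ro(_name)		\
static struct freq_attr _name =			\
__ATTR(_name, 0444, show_##_name, NULL)

#define cpufreq_freq_attr_rw(_name)		\
static struct freq_attr _name =			\
__ATTR(_name, 0644, show_##_name, store_##_name)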
+
static void intel_pstate_hwp_set(const struct cpumask *cpumask)
{
int min, hw_min, max, hw_max, cpu, range, adj_range;
+ struct perf_limits *perf_limits = limits;
u64 value, cap;
for_each_cpu(cpu, cpumask) {
+ int max_perf_pct, min_perf_pct;
+ struct cpudata *cpu_data = all_cpu_data[cpu];
+ s16 epp;
+
+ if (per_cpu_limits)
+ perf_limits = all_cpu_data[cpu]->perf_limits;
+
rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap);
hw_min = HWP_LOWEST_PERF(cap);
hw_max = HWP_HIGHEST_PERF(cap);
range = hw_max - hw_min;
+ max_perf_pct = perf_limits->max_perf_pct;
+ min_perf_pct = perf_limits->min_perf_pct;
+
rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
- adj_range = limits->min_perf_pct * range / 100;
+ adj_range = min_perf_pct * range / 100;
min = hw_min + adj_range;
value &= ~HWP_MIN_PERF(~0L);
value |= HWP_MIN_PERF(min);
- adj_range = limits->max_perf_pct * range / 100;
+ adj_range = max_perf_pct * range / 100;
max = hw_min + adj_range;
if (limits->no_turbo) {
hw_max = HWP_GUARANTEED_PERF(cap);
@@ -640,6 +895,53 @@ static void intel_pstate_hwp_set(const struct cpumask *cpumask)
value &= ~HWP_MAX_PERF(~0L);
value |= HWP_MAX_PERF(max);
+
+ if (cpu_data->epp_policy == cpu_data->policy)
+ goto skip_epp;
+
+ cpu_data->epp_policy = cpu_data->policy;
+
+ if (cpu_data->epp_saved >= 0) {
+ epp = cpu_data->epp_saved;
+ cpu_data->epp_saved = -EINVAL;
+ goto update_epp;
+ }
+
+ if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
+ epp = intel_pstate_get_epp(cpu_data, value);
+ cpu_data->epp_powersave = epp;
+			/* If the EPP read failed, don't try to write */
+ if (epp < 0)
+ goto skip_epp;
+
+ epp = 0;
+ } else {
+			/* Skip setting the EPP when the saved value is invalid */
+ if (cpu_data->epp_powersave < 0)
+ goto skip_epp;
+
+ /*
+			 * There is no need to restore the EPP when it is
+			 * not zero; a non-zero value means one of:
+			 * - the policy has not changed,
+			 * - the user has changed it manually, or
+			 * - reading the EPB failed.
+ */
+ epp = intel_pstate_get_epp(cpu_data, value);
+ if (epp)
+ goto skip_epp;
+
+ epp = cpu_data->epp_powersave;
+ }
+update_epp:
+ if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
+ value &= ~GENMASK_ULL(31, 24);
+ value |= (u64)epp << 24;
+ } else {
+ intel_pstate_set_epb(cpu, epp);
+ }
+skip_epp:
wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
}
}
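
The EPP handling above is easiest to see as a small state machine; a toy
model with the kernel types stubbed out (-1 stands in for -EINVAL, and
the struct is a simplification of struct cpudata):

#include <stdio.h>

struct cpu_model { int epp_policy, epp_saved, epp_powersave, epp; };

/* policy: 1 = performance, 0 = powersave */
static void hwp_set_epp(struct cpu_model *c, int policy)
{
	if (c->epp_policy == policy)
		return;				/* skip_epp */
	c->epp_policy = policy;

	if (c->epp_saved >= 0) {		/* restore after resume */
		c->epp = c->epp_saved;
		c->epp_saved = -1;
	} else if (policy) {			/* entering performance */
		c->epp_powersave = c->epp;
		c->epp = 0;
	} else if (c->epp_powersave >= 0 && c->epp == 0) {
		c->epp = c->epp_powersave;	/* leaving performance */
	}
}

int main(void)
{
	struct cpu_model c = { .epp_policy = -1, .epp_saved = -1,
			       .epp_powersave = -1, .epp = 0x80 };

	hwp_set_epp(&c, 1);	/* performance: EPP forced to 0 */
	hwp_set_epp(&c, 0);	/* powersave: 0x80 restored */
	printf("epp=0x%02x\n", c.epp);
	return 0;
}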
@@ -652,6 +954,28 @@ static int intel_pstate_hwp_set_policy(struct cpufreq_policy *policy)
return 0;
}
+static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy)
+{
+ struct cpudata *cpu_data = all_cpu_data[policy->cpu];
+
+ if (!hwp_active)
+ return 0;
+
+ cpu_data->epp_saved = intel_pstate_get_epp(cpu_data, 0);
+
+ return 0;
+}
+
+static int intel_pstate_resume(struct cpufreq_policy *policy)
+{
+ if (!hwp_active)
+ return 0;
+
+ all_cpu_data[policy->cpu]->epp_policy = 0;
+
+ return intel_pstate_hwp_set_policy(policy);
+}
+
static void intel_pstate_hwp_set_online_cpus(void)
{
get_online_cpus();
@@ -694,8 +1018,10 @@ static void __init intel_pstate_debug_expose_params(void)
struct dentry *debugfs_parent;
int i = 0;
- if (hwp_active)
+ if (hwp_active ||
+ pstate_funcs.get_target_pstate == get_target_pstate_use_cpu_load)
return;
+
debugfs_parent = debugfs_create_dir("pstate_snb", NULL);
if (IS_ERR_OR_NULL(debugfs_parent))
return;
@@ -768,9 +1094,12 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
if (ret != 1)
return -EINVAL;
+ mutex_lock(&intel_pstate_limits_lock);
+
update_turbo_state();
if (limits->turbo_disabled) {
pr_warn("Turbo disabled by BIOS or unavailable on processor\n");
+ mutex_unlock(&intel_pstate_limits_lock);
return -EPERM;
}
@@ -779,6 +1108,8 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
if (hwp_active)
intel_pstate_hwp_set_online_cpus();
+ mutex_unlock(&intel_pstate_limits_lock);
+
return count;
}
@@ -792,6 +1123,8 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
if (ret != 1)
return -EINVAL;
+ mutex_lock(&intel_pstate_limits_lock);
+
limits->max_sysfs_pct = clamp_t(int, input, 0 , 100);
limits->max_perf_pct = min(limits->max_policy_pct,
limits->max_sysfs_pct);
@@ -799,10 +1132,13 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
limits->max_perf_pct);
limits->max_perf_pct = max(limits->min_perf_pct,
limits->max_perf_pct);
- limits->max_perf = div_fp(limits->max_perf_pct, 100);
+ limits->max_perf = div_ext_fp(limits->max_perf_pct, 100);
if (hwp_active)
intel_pstate_hwp_set_online_cpus();
+
+ mutex_unlock(&intel_pstate_limits_lock);
+
return count;
}
@@ -816,6 +1152,8 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
if (ret != 1)
return -EINVAL;
+ mutex_lock(&intel_pstate_limits_lock);
+
limits->min_sysfs_pct = clamp_t(int, input, 0 , 100);
limits->min_perf_pct = max(limits->min_policy_pct,
limits->min_sysfs_pct);
@@ -823,10 +1161,13 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
limits->min_perf_pct);
limits->min_perf_pct = min(limits->max_perf_pct,
limits->min_perf_pct);
- limits->min_perf = div_fp(limits->min_perf_pct, 100);
+ limits->min_perf = div_ext_fp(limits->min_perf_pct, 100);
if (hwp_active)
intel_pstate_hwp_set_online_cpus();
+
+ mutex_unlock(&intel_pstate_limits_lock);
+
return count;
}
@@ -841,8 +1182,6 @@ define_one_global_ro(num_pstates);
static struct attribute *intel_pstate_attributes[] = {
&no_turbo.attr,
- &max_perf_pct.attr,
- &min_perf_pct.attr,
&turbo_pct.attr,
&num_pstates.attr,
NULL
@@ -859,9 +1198,26 @@ static void __init intel_pstate_sysfs_expose_params(void)
intel_pstate_kobject = kobject_create_and_add("intel_pstate",
&cpu_subsys.dev_root->kobj);
- BUG_ON(!intel_pstate_kobject);
+ if (WARN_ON(!intel_pstate_kobject))
+ return;
+
rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group);
- BUG_ON(rc);
+ if (WARN_ON(rc))
+ return;
+
+ /*
+	 * If per-CPU limits are enforced, there are no global limits, so
+	 * return without creating the max/min_perf_pct attributes.
+ */
+ if (per_cpu_limits)
+ return;
+
+ rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr);
+ WARN_ON(rc);
+
+ rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr);
+ WARN_ON(rc);
}
/************************** sysfs end ************************/
@@ -872,6 +1228,9 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata)
wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
+ cpudata->epp_policy = 0;
+ if (cpudata->epp_default == -EINVAL)
+ cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
}
static int atom_get_min_pstate(void)
@@ -1099,7 +1458,6 @@ static const struct cpu_defaults silvermont_params = {
.p_gain_pct = 14,
.d_gain_pct = 0,
.i_gain_pct = 4,
- .boost_iowait = true,
},
.funcs = {
.get_max = atom_get_max_pstate,
@@ -1121,7 +1479,6 @@ static const struct cpu_defaults airmont_params = {
.p_gain_pct = 14,
.d_gain_pct = 0,
.i_gain_pct = 4,
- .boost_iowait = true,
},
.funcs = {
.get_max = atom_get_max_pstate,
@@ -1163,7 +1520,6 @@ static const struct cpu_defaults bxt_params = {
.p_gain_pct = 14,
.d_gain_pct = 0,
.i_gain_pct = 4,
- .boost_iowait = true,
},
.funcs = {
.get_max = core_get_max_pstate,
@@ -1181,20 +1537,24 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
int max_perf = cpu->pstate.turbo_pstate;
int max_perf_adj;
int min_perf;
+ struct perf_limits *perf_limits = limits;
if (limits->no_turbo || limits->turbo_disabled)
max_perf = cpu->pstate.max_pstate;
+ if (per_cpu_limits)
+ perf_limits = cpu->perf_limits;
+
/*
* performance can be limited by user through sysfs, by cpufreq
* policy, or by cpu specific default values determined through
* experimentation.
*/
- max_perf_adj = fp_toint(max_perf * limits->max_perf);
+ max_perf_adj = fp_ext_toint(max_perf * perf_limits->max_perf);
*max = clamp_t(int, max_perf_adj,
cpu->pstate.min_pstate, cpu->pstate.turbo_pstate);
- min_perf = fp_toint(max_perf * limits->min_perf);
+ min_perf = fp_ext_toint(max_perf * perf_limits->min_perf);
*min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
}
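
min_perf/max_perf are stored as extended fixed-point fractions; a sketch
of the arithmetic, assuming 8 fractional bits for the base format plus 6
extra for the extended one (the values the driver uses around this
change; re-check FRAC_BITS/EXT_BITS in the tree at hand):

#include <stdint.h>
#include <stdio.h>

#define FRAC_BITS	8
#define EXT_BITS	6
#define EXT_FRAC_BITS	(FRAC_BITS + EXT_BITS)

static int64_t div_ext_fp(int64_t x, int64_t y) { return (x << EXT_FRAC_BITS) / y; }
static int fp_ext_toint(int64_t x) { return x >> EXT_FRAC_BITS; }

int main(void)
{
	int turbo_pstate = 34;			/* hypothetical */
	int64_t max_perf = div_ext_fp(87, 100);	/* max_perf_pct = 87 */

	/* mirrors: max_perf_adj = fp_ext_toint(max_perf * perf_limits->max_perf) */
	printf("adjusted max pstate = %d\n", fp_ext_toint(turbo_pstate * max_perf));
	return 0;
}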
@@ -1232,6 +1592,8 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical();
cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
cpu->pstate.scaling = pstate_funcs.get_scaling();
+ cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling;
+ cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
if (pstate_funcs.get_vid)
pstate_funcs.get_vid(cpu);
@@ -1370,15 +1732,19 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled);
}
-static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
+static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
{
int max_perf, min_perf;
- update_turbo_state();
-
intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
pstate = clamp_t(int, pstate, min_perf, max_perf);
trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
+ return pstate;
+}
+
+static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
+{
+ pstate = intel_pstate_prepare_request(cpu, pstate);
if (pstate == cpu->pstate.current_pstate)
return;
@@ -1396,6 +1762,8 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
target_pstate = cpu->policy == CPUFREQ_POLICY_PERFORMANCE ?
cpu->pstate.turbo_pstate : pstate_funcs.get_target_pstate(cpu);
+ update_turbo_state();
+
intel_pstate_update_pstate(cpu, target_pstate);
sample = &cpu->sample;
@@ -1416,7 +1784,7 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time,
struct cpudata *cpu = container_of(data, struct cpudata, update_util);
u64 delta_ns;
- if (pid_params.boost_iowait) {
+ if (pstate_funcs.get_target_pstate == get_target_pstate_use_cpu_load) {
if (flags & SCHED_CPUFREQ_IOWAIT) {
cpu->iowait_boost = int_tofp(1);
} else if (cpu->iowait_boost) {
@@ -1462,6 +1830,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_params),
ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_params),
ICPU(INTEL_FAM6_XEON_PHI_KNL, knl_params),
+ ICPU(INTEL_FAM6_XEON_PHI_KNM, knl_params),
ICPU(INTEL_FAM6_ATOM_GOLDMONT, bxt_params),
{}
};
@@ -1478,11 +1847,26 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
{
struct cpudata *cpu;
- if (!all_cpu_data[cpunum])
- all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata),
- GFP_KERNEL);
- if (!all_cpu_data[cpunum])
- return -ENOMEM;
+ cpu = all_cpu_data[cpunum];
+
+ if (!cpu) {
+ unsigned int size = sizeof(struct cpudata);
+
+ if (per_cpu_limits)
+ size += sizeof(struct perf_limits);
+
+ cpu = kzalloc(size, GFP_KERNEL);
+ if (!cpu)
+ return -ENOMEM;
+
+ all_cpu_data[cpunum] = cpu;
+ if (per_cpu_limits)
+ cpu->perf_limits = (struct perf_limits *)(cpu + 1);
+
+ cpu->epp_default = -EINVAL;
+ cpu->epp_powersave = -EINVAL;
+ cpu->epp_saved = -EINVAL;
+ }
cpu = all_cpu_data[cpunum];
@@ -1541,18 +1925,57 @@ static void intel_pstate_set_performance_limits(struct perf_limits *limits)
limits->no_turbo = 0;
limits->turbo_disabled = 0;
limits->max_perf_pct = 100;
- limits->max_perf = int_tofp(1);
+ limits->max_perf = int_ext_tofp(1);
limits->min_perf_pct = 100;
- limits->min_perf = int_tofp(1);
+ limits->min_perf = int_ext_tofp(1);
limits->max_policy_pct = 100;
limits->max_sysfs_pct = 100;
limits->min_policy_pct = 0;
limits->min_sysfs_pct = 0;
}
+static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
+ struct perf_limits *limits)
+{
+
+ limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100,
+ policy->cpuinfo.max_freq);
+ limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0, 100);
+ if (policy->max == policy->min) {
+ limits->min_policy_pct = limits->max_policy_pct;
+ } else {
+ limits->min_policy_pct = DIV_ROUND_UP(policy->min * 100,
+ policy->cpuinfo.max_freq);
+ limits->min_policy_pct = clamp_t(int, limits->min_policy_pct,
+ 0, 100);
+ }
+
+ /* Normalize user input to [min_policy_pct, max_policy_pct] */
+ limits->min_perf_pct = max(limits->min_policy_pct,
+ limits->min_sysfs_pct);
+ limits->min_perf_pct = min(limits->max_policy_pct,
+ limits->min_perf_pct);
+ limits->max_perf_pct = min(limits->max_policy_pct,
+ limits->max_sysfs_pct);
+ limits->max_perf_pct = max(limits->min_policy_pct,
+ limits->max_perf_pct);
+
+ /* Make sure min_perf_pct <= max_perf_pct */
+ limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
+
+ limits->min_perf = div_ext_fp(limits->min_perf_pct, 100);
+ limits->max_perf = div_ext_fp(limits->max_perf_pct, 100);
+ limits->max_perf = round_up(limits->max_perf, EXT_FRAC_BITS);
+ limits->min_perf = round_up(limits->min_perf, EXT_FRAC_BITS);
+
+ pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu,
+ limits->max_perf_pct, limits->min_perf_pct);
+}
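
A worked example of the policy-percentage math above (the kHz values
are made up):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static int clamp_pct(int v) { return v < 0 ? 0 : v > 100 ? 100 : v; }

int main(void)
{
	unsigned int cpuinfo_max = 3200000;
	unsigned int policy_max = 2400000, policy_min = 800000;
	int max_pct = clamp_pct(DIV_ROUND_UP(policy_max * 100, cpuinfo_max));
	int min_pct = clamp_pct(DIV_ROUND_UP(policy_min * 100, cpuinfo_max));

	/* prints "policy window: 25%..75%" */
	printf("policy window: %d%%..%d%%\n", min_pct, max_pct);
	return 0;
}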
+
static int intel_pstate_set_policy(struct cpufreq_policy *policy)
{
struct cpudata *cpu;
+ struct perf_limits *perf_limits = NULL;
if (!policy->cpuinfo.max_freq)
return -ENODEV;
@@ -1570,41 +1993,31 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
policy->max = policy->cpuinfo.max_freq;
}
- if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) {
- limits = &performance_limits;
+ if (per_cpu_limits)
+ perf_limits = cpu->perf_limits;
+
+ mutex_lock(&intel_pstate_limits_lock);
+
+ if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
+ if (!perf_limits) {
+ limits = &performance_limits;
+ perf_limits = limits;
+ }
if (policy->max >= policy->cpuinfo.max_freq) {
pr_debug("set performance\n");
- intel_pstate_set_performance_limits(limits);
+ intel_pstate_set_performance_limits(perf_limits);
goto out;
}
} else {
pr_debug("set powersave\n");
- limits = &powersave_limits;
- }
-
- limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq;
- limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0 , 100);
- limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100,
- policy->cpuinfo.max_freq);
- limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0 , 100);
-
- /* Normalize user input to [min_policy_pct, max_policy_pct] */
- limits->min_perf_pct = max(limits->min_policy_pct,
- limits->min_sysfs_pct);
- limits->min_perf_pct = min(limits->max_policy_pct,
- limits->min_perf_pct);
- limits->max_perf_pct = min(limits->max_policy_pct,
- limits->max_sysfs_pct);
- limits->max_perf_pct = max(limits->min_policy_pct,
- limits->max_perf_pct);
-
- /* Make sure min_perf_pct <= max_perf_pct */
- limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
+ if (!perf_limits) {
+ limits = &powersave_limits;
+ perf_limits = limits;
+ }
- limits->min_perf = div_fp(limits->min_perf_pct, 100);
- limits->max_perf = div_fp(limits->max_perf_pct, 100);
- limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
+ }
+ intel_pstate_update_perf_limits(policy, perf_limits);
out:
if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) {
/*
@@ -1619,6 +2032,8 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
intel_pstate_hwp_set_policy(policy);
+ mutex_unlock(&intel_pstate_limits_lock);
+
return 0;
}
@@ -1633,22 +2048,32 @@ static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
return 0;
}
+static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
+{
+ intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
+}
+
static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
{
- int cpu_num = policy->cpu;
- struct cpudata *cpu = all_cpu_data[cpu_num];
+ pr_debug("CPU %d exiting\n", policy->cpu);
- pr_debug("CPU %d exiting\n", cpu_num);
+ intel_pstate_clear_update_util_hook(policy->cpu);
+ if (hwp_active)
+ intel_pstate_hwp_save_state(policy);
+ else
+ intel_cpufreq_stop_cpu(policy);
+}
- intel_pstate_clear_update_util_hook(cpu_num);
+static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
+{
+ intel_pstate_exit_perf_limits(policy);
- if (hwp_active)
- return;
+ policy->fast_switch_possible = false;
- intel_pstate_set_min_pstate(cpu);
+ return 0;
}
-static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
+static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
{
struct cpudata *cpu;
int rc;
@@ -1659,10 +2084,13 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
cpu = all_cpu_data[policy->cpu];
- if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100)
- policy->policy = CPUFREQ_POLICY_PERFORMANCE;
- else
- policy->policy = CPUFREQ_POLICY_POWERSAVE;
+ /*
+	 * We need sane values in cpu->perf_limits, so inherit from the global
+	 * perf_limits, which are seeded during boot with values based on
+	 * CONFIG_CPU_FREQ_DEFAULT_GOV_*.
+ */
+ if (per_cpu_limits)
+ memcpy(cpu->perf_limits, limits, sizeof(struct perf_limits));
policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
@@ -1675,24 +2103,35 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
policy->cpuinfo.max_freq *= cpu->pstate.scaling;
intel_pstate_init_acpi_perf_limits(policy);
- policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
cpumask_set_cpu(policy->cpu, policy->cpus);
+ policy->fast_switch_possible = true;
+
return 0;
}
-static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
+static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
{
- intel_pstate_exit_perf_limits(policy);
+ int ret = __intel_pstate_cpu_init(policy);
+
+ if (ret)
+ return ret;
+
+ policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+ if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100)
+ policy->policy = CPUFREQ_POLICY_PERFORMANCE;
+ else
+ policy->policy = CPUFREQ_POLICY_POWERSAVE;
return 0;
}
-static struct cpufreq_driver intel_pstate_driver = {
+static struct cpufreq_driver intel_pstate = {
.flags = CPUFREQ_CONST_LOOPS,
.verify = intel_pstate_verify_policy,
.setpolicy = intel_pstate_set_policy,
- .resume = intel_pstate_hwp_set_policy,
+ .suspend = intel_pstate_hwp_save_state,
+ .resume = intel_pstate_resume,
.get = intel_pstate_get,
.init = intel_pstate_cpu_init,
.exit = intel_pstate_cpu_exit,
@@ -1700,6 +2139,118 @@ static struct cpufreq_driver intel_pstate_driver = {
.name = "intel_pstate",
};
+static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy)
+{
+ struct cpudata *cpu = all_cpu_data[policy->cpu];
+ struct perf_limits *perf_limits = limits;
+
+ update_turbo_state();
+ policy->cpuinfo.max_freq = limits->turbo_disabled ?
+ cpu->pstate.max_freq : cpu->pstate.turbo_freq;
+
+ cpufreq_verify_within_cpu_limits(policy);
+
+ if (per_cpu_limits)
+ perf_limits = cpu->perf_limits;
+
+ intel_pstate_update_perf_limits(policy, perf_limits);
+
+ return 0;
+}
+
+static unsigned int intel_cpufreq_turbo_update(struct cpudata *cpu,
+ struct cpufreq_policy *policy,
+ unsigned int target_freq)
+{
+ unsigned int max_freq;
+
+ update_turbo_state();
+
+ max_freq = limits->no_turbo || limits->turbo_disabled ?
+ cpu->pstate.max_freq : cpu->pstate.turbo_freq;
+ policy->cpuinfo.max_freq = max_freq;
+ if (policy->max > max_freq)
+ policy->max = max_freq;
+
+ if (target_freq > max_freq)
+ target_freq = max_freq;
+
+ return target_freq;
+}
+
+static int intel_cpufreq_target(struct cpufreq_policy *policy,
+ unsigned int target_freq,
+ unsigned int relation)
+{
+ struct cpudata *cpu = all_cpu_data[policy->cpu];
+ struct cpufreq_freqs freqs;
+ int target_pstate;
+
+ freqs.old = policy->cur;
+ freqs.new = intel_cpufreq_turbo_update(cpu, policy, target_freq);
+
+ cpufreq_freq_transition_begin(policy, &freqs);
+ switch (relation) {
+ case CPUFREQ_RELATION_L:
+ target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
+ break;
+ case CPUFREQ_RELATION_H:
+ target_pstate = freqs.new / cpu->pstate.scaling;
+ break;
+ default:
+ target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
+ break;
+ }
+ target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
+ if (target_pstate != cpu->pstate.current_pstate) {
+ cpu->pstate.current_pstate = target_pstate;
+ wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL,
+ pstate_funcs.get_val(cpu, target_pstate));
+ }
+ cpufreq_freq_transition_end(policy, &freqs, false);
+
+ return 0;
+}
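
The three rounding modes above differ only in how a frequency that falls
between two P-states is resolved; a standalone illustration (scaling and
target frequency are sample numbers):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)	(((n) + (d) / 2) / (d))

int main(void)
{
	unsigned int scaling = 100000;	/* kHz per P-state step */
	unsigned int freq = 1550000;	/* requested kHz, between steps 15 and 16 */

	printf("RELATION_L (at or above): %u\n", DIV_ROUND_UP(freq, scaling));
	printf("RELATION_H (at or below): %u\n", freq / scaling);
	printf("default    (closest):     %u\n", DIV_ROUND_CLOSEST(freq, scaling));
	return 0;
}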
+
+static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
+ unsigned int target_freq)
+{
+ struct cpudata *cpu = all_cpu_data[policy->cpu];
+ int target_pstate;
+
+ target_freq = intel_cpufreq_turbo_update(cpu, policy, target_freq);
+ target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
+ intel_pstate_update_pstate(cpu, target_pstate);
+ return target_freq;
+}
+
+static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+ int ret = __intel_pstate_cpu_init(policy);
+
+ if (ret)
+ return ret;
+
+ policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
+ /* This reflects the intel_pstate_get_cpu_pstates() setting. */
+ policy->cur = policy->cpuinfo.min_freq;
+
+ return 0;
+}
+
+static struct cpufreq_driver intel_cpufreq = {
+ .flags = CPUFREQ_CONST_LOOPS,
+ .verify = intel_cpufreq_verify_policy,
+ .target = intel_cpufreq_target,
+ .fast_switch = intel_cpufreq_fast_switch,
+ .init = intel_cpufreq_cpu_init,
+ .exit = intel_pstate_cpu_exit,
+ .stop_cpu = intel_cpufreq_stop_cpu,
+ .name = "intel_cpufreq",
+};
+
+static struct cpufreq_driver *intel_pstate_driver = &intel_pstate;
+
static int no_load __initdata;
static int no_hwp __initdata;
static int hwp_only __initdata;
@@ -1726,6 +2277,19 @@ static void __init copy_pid_params(struct pstate_adjust_policy *policy)
pid_params.setpoint = policy->setpoint;
}
+#ifdef CONFIG_ACPI
+static void intel_pstate_use_acpi_profile(void)
+{
+ if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
+ pstate_funcs.get_target_pstate =
+ get_target_pstate_use_cpu_load;
+}
+#else
+static void intel_pstate_use_acpi_profile(void)
+{
+}
+#endif
+
static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
{
pstate_funcs.get_max = funcs->get_max;
@@ -1737,6 +2301,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
pstate_funcs.get_vid = funcs->get_vid;
pstate_funcs.get_target_pstate = funcs->get_target_pstate;
+ intel_pstate_use_acpi_profile();
}
#ifdef CONFIG_ACPI
@@ -1850,9 +2415,20 @@ static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
return false;
}
+
+static void intel_pstate_request_control_from_smm(void)
+{
+ /*
+ * It may be unsafe to request P-states control from SMM if _PPC support
+ * has not been enabled.
+ */
+ if (acpi_ppc)
+ acpi_processor_pstate_control();
+}
#else /* CONFIG_ACPI not enabled */
static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
+static inline void intel_pstate_request_control_from_smm(void) {}
#endif /* CONFIG_ACPI */
static const struct x86_cpu_id hwp_support_ids[] __initconst = {
@@ -1872,6 +2448,7 @@ static int __init intel_pstate_init(void)
if (x86_match_cpu(hwp_support_ids) && !no_hwp) {
copy_cpu_funcs(&core_params.funcs);
hwp_active++;
+ intel_pstate.attr = hwp_cpufreq_attrs;
goto hwp_cpu_matched;
}
@@ -1904,7 +2481,9 @@ hwp_cpu_matched:
if (!hwp_active && hwp_only)
goto out;
- rc = cpufreq_register_driver(&intel_pstate_driver);
+ intel_pstate_request_control_from_smm();
+
+ rc = cpufreq_register_driver(intel_pstate_driver);
if (rc)
goto out;
@@ -1919,7 +2498,9 @@ out:
get_online_cpus();
for_each_online_cpu(cpu) {
if (all_cpu_data[cpu]) {
- intel_pstate_clear_update_util_hook(cpu);
+ if (intel_pstate_driver == &intel_pstate)
+ intel_pstate_clear_update_util_hook(cpu);
+
kfree(all_cpu_data[cpu]);
}
}
@@ -1935,8 +2516,13 @@ static int __init intel_pstate_setup(char *str)
if (!str)
return -EINVAL;
- if (!strcmp(str, "disable"))
+ if (!strcmp(str, "disable")) {
no_load = 1;
+ } else if (!strcmp(str, "passive")) {
+ pr_info("Passive mode enabled\n");
+ intel_pstate_driver = &intel_cpufreq;
+ no_hwp = 1;
+ }
if (!strcmp(str, "no_hwp")) {
pr_info("HWP disabled\n");
no_hwp = 1;
@@ -1945,6 +2531,8 @@ static int __init intel_pstate_setup(char *str)
force_load = 1;
if (!strcmp(str, "hwp_only"))
hwp_only = 1;
+ if (!strcmp(str, "per_cpu_perf_limits"))
+ per_cpu_limits = true;
#ifdef CONFIG_ACPI
if (!strcmp(str, "support_acpi_ppc"))
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index d3ffde806629..37671b545880 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -42,6 +42,10 @@
#define PMSR_PSAFE_ENABLE (1UL << 30)
#define PMSR_SPR_EM_DISABLE (1UL << 31)
#define PMSR_MAX(x) ((x >> 32) & 0xFF)
+#define LPSTATE_SHIFT 48
+#define GPSTATE_SHIFT 56
+#define GET_LPSTATE(x) (((x) >> LPSTATE_SHIFT) & 0xFF)
+#define GET_GPSTATE(x) (((x) >> GPSTATE_SHIFT) & 0xFF)
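
A quick standalone check of the two extractors just added (the PMCR
image is a fabricated sample; P-states on this platform are signed,
hence the s8 casts in the timer handler below):

#include <stdint.h>
#include <stdio.h>

#define LPSTATE_SHIFT	48
#define GPSTATE_SHIFT	56
#define GET_LPSTATE(x)	(((x) >> LPSTATE_SHIFT) & 0xFF)
#define GET_GPSTATE(x)	(((x) >> GPSTATE_SHIFT) & 0xFF)

int main(void)
{
	uint64_t pmcr = 0xf2f5000000000000ULL;	/* fabricated register image */

	/* prints "global -14 local -11" */
	printf("global %d local %d\n",
	       (int8_t)GET_GPSTATE(pmcr), (int8_t)GET_LPSTATE(pmcr));
	return 0;
}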
#define MAX_RAMP_DOWN_TIME 5120
/*
@@ -592,7 +596,8 @@ void gpstate_timer_handler(unsigned long data)
{
struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
struct global_pstate_info *gpstates = policy->driver_data;
- int gpstate_idx;
+ int gpstate_idx, lpstate_idx;
+ unsigned long val;
unsigned int time_diff = jiffies_to_msecs(jiffies)
- gpstates->last_sampled_time;
struct powernv_smp_call_data freq_data;
@@ -600,21 +605,37 @@ void gpstate_timer_handler(unsigned long data)
if (!spin_trylock(&gpstates->gpstate_lock))
return;
+ /*
+	 * If the PMCR was last updated via fast_switch(), then
+	 * gpstate->last_lpstate_idx may hold a stale value.
+	 * Hence, read the PMCR to get correct data.
+ */
+ val = get_pmspr(SPRN_PMCR);
+ freq_data.gpstate_id = (s8)GET_GPSTATE(val);
+ freq_data.pstate_id = (s8)GET_LPSTATE(val);
+ if (freq_data.gpstate_id == freq_data.pstate_id) {
+ reset_gpstates(policy);
+ spin_unlock(&gpstates->gpstate_lock);
+ return;
+ }
+
gpstates->last_sampled_time += time_diff;
gpstates->elapsed_time += time_diff;
- freq_data.pstate_id = idx_to_pstate(gpstates->last_lpstate_idx);
- if ((gpstates->last_gpstate_idx == gpstates->last_lpstate_idx) ||
- (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
+ if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
gpstate_idx = pstate_to_idx(freq_data.pstate_id);
+ lpstate_idx = gpstate_idx;
reset_gpstates(policy);
gpstates->highest_lpstate_idx = gpstate_idx;
} else {
+ lpstate_idx = pstate_to_idx(freq_data.pstate_id);
gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
gpstates->highest_lpstate_idx,
- gpstates->last_lpstate_idx);
+ lpstate_idx);
}
-
+ freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
+ gpstates->last_gpstate_idx = gpstate_idx;
+ gpstates->last_lpstate_idx = lpstate_idx;
/*
* If local pstate is equal to global pstate, rampdown is over
* So timer is not required to be queued.
@@ -622,10 +643,6 @@ void gpstate_timer_handler(unsigned long data)
if (gpstate_idx != gpstates->last_lpstate_idx)
queue_gpstate_timer(gpstates);
- freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
- gpstates->last_gpstate_idx = pstate_to_idx(freq_data.gpstate_id);
- gpstates->last_lpstate_idx = pstate_to_idx(freq_data.pstate_id);
-
spin_unlock(&gpstates->gpstate_lock);
/* Timer may get migrated to a different cpu on cpu hot unplug */
@@ -647,8 +664,14 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
if (unlikely(rebooting) && new_index != get_nominal_index())
return 0;
- if (!throttled)
+ if (!throttled) {
+		/*
+		 * We don't want to be preempted while checking
+		 * whether the CPU frequency has been throttled.
+		 */
+ preempt_disable();
powernv_cpufreq_throttle_check(NULL);
+ preempt_enable();
+ }
cur_msec = jiffies_to_msecs(get_jiffies_64());
@@ -752,9 +775,12 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
spin_lock_init(&gpstates->gpstate_lock);
ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
- if (ret < 0)
+ if (ret < 0) {
kfree(policy->driver_data);
+ return ret;
+ }
+ policy->fast_switch_possible = true;
return ret;
}
@@ -897,6 +923,20 @@ static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
del_timer_sync(&gpstates->timer);
}
+static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
+ unsigned int target_freq)
+{
+ int index;
+ struct powernv_smp_call_data freq_data;
+
+ index = cpufreq_table_find_index_dl(policy, target_freq);
+ freq_data.pstate_id = powernv_freqs[index].driver_data;
+ freq_data.gpstate_id = powernv_freqs[index].driver_data;
+ set_pstate(&freq_data);
+
+ return powernv_freqs[index].frequency;
+}
+
static struct cpufreq_driver powernv_cpufreq_driver = {
.name = "powernv-cpufreq",
.flags = CPUFREQ_CONST_LOOPS,
@@ -904,6 +944,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = {
.exit = powernv_cpufreq_cpu_exit,
.verify = cpufreq_generic_frequency_table_verify,
.target_index = powernv_cpufreq_target_index,
+ .fast_switch = powernv_fast_switch,
.get = powernv_cpufreq_get,
.stop_cpu = powernv_cpufreq_stop_cpu,
.attr = powernv_cpu_freq_attr,
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 7fe442ca38f4..0835a37a5f3a 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -22,7 +22,7 @@
#define POWERNV_THRESHOLD_LATENCY_NS 200000
-struct cpuidle_driver powernv_idle_driver = {
+static struct cpuidle_driver powernv_idle_driver = {
.name = "powernv_idle",
.owner = THIS_MODULE,
};
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index c73207abb5a4..62810ff3b00f 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -97,7 +97,23 @@ static int find_deepest_state(struct cpuidle_driver *drv,
return ret;
}
-#ifdef CONFIG_SUSPEND
+/**
+ * cpuidle_use_deepest_state - Set/clear governor override flag.
+ * @enable: New value of the flag.
+ *
+ * If @enable is set, the current CPU will use the deepest available
+ * idle state going forward, overriding the governors.
+ */
+void cpuidle_use_deepest_state(bool enable)
+{
+ struct cpuidle_device *dev;
+
+ preempt_disable();
+ dev = cpuidle_get_device();
+ dev->use_deepest_state = enable;
+ preempt_enable();
+}
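
A sketch of the intended calling pattern (the concrete user in this
series is the idle-injection path; the function name here is
illustrative only):

static void forced_idle_window(void)
{
	cpuidle_use_deepest_state(true);
	/* ... run the idle loop on this CPU for the injection period ... */
	cpuidle_use_deepest_state(false);
}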
+
/**
* cpuidle_find_deepest_state - Find the deepest available idle state.
* @drv: cpuidle driver for the given CPU.
@@ -109,6 +125,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
return find_deepest_state(drv, dev, UINT_MAX, 0, false);
}
+#ifdef CONFIG_SUSPEND
static void enter_freeze_proper(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index)
{
diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c
index a5c111b67f37..ffca4fc0061d 100644
--- a/drivers/cpuidle/dt_idle_states.c
+++ b/drivers/cpuidle/dt_idle_states.c
@@ -38,6 +38,12 @@ static int init_state_node(struct cpuidle_state *idle_state,
* state enter function.
*/
idle_state->enter = match_id->data;
+ /*
+	 * Since this is not a "coupled" state, it's safe to assume that
+	 * interrupts won't be enabled when it exits, allowing the tick to be
+	 * frozen safely. So enter() can also serve as the enter_freeze()
+	 * callback.
+ */
+ idle_state->enter_freeze = match_id->data;
err = of_property_read_u32(state_node, "wakeup-latency-us",
&idle_state->exit_latency);
diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index fb9f511cca23..4e78263e34a4 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -9,7 +9,6 @@
*/
#include <linux/mutex.h>
-#include <linux/module.h>
#include <linux/cpuidle.h>
#include "cpuidle.h"
@@ -53,14 +52,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov)
if (cpuidle_curr_governor) {
list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
cpuidle_disable_device(dev);
- module_put(cpuidle_curr_governor->owner);
}
cpuidle_curr_governor = gov;
if (gov) {
- if (!try_module_get(cpuidle_curr_governor->owner))
- return -EINVAL;
list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
cpuidle_enable_device(dev);
cpuidle_install_idle_handler();
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 63bd5a403e22..fe8f08948fcb 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -15,7 +15,6 @@
#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/pm_qos.h>
-#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/tick.h>
@@ -177,7 +176,6 @@ static struct cpuidle_governor ladder_governor = {
.enable = ladder_enable_device,
.select = ladder_select_state,
.reflect = ladder_reflect,
- .owner = THIS_MODULE,
};
/**
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 03d38c291de6..d9b5b9398a0f 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -19,7 +19,6 @@
#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/math64.h>
-#include <linux/module.h>
/*
* Please note when changing the tuning values:
@@ -484,7 +483,6 @@ static struct cpuidle_governor menu_governor = {
.enable = menu_enable_device,
.select = menu_select,
.reflect = menu_reflect,
- .owner = THIS_MODULE,
};
/**
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 832a2c3f01ff..c5adc8c9ac43 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -403,8 +403,10 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device)
/* state statistics */
for (i = 0; i < drv->state_count; i++) {
kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL);
- if (!kobj)
+ if (!kobj) {
+ ret = -ENOMEM;
goto error_state;
+ }
kobj->state = &drv->states[i];
kobj->state_usage = &device->states_usage[i];
init_completion(&kobj->kobj_unregister);
diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index bf3ea7603a58..a324801d6a66 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -850,7 +850,7 @@ err_out:
EXPORT_SYMBOL(devfreq_add_governor);
/**
- * devfreq_remove_device() - Remove devfreq feature from a device.
+ * devfreq_remove_governor() - Remove a devfreq governor.
* @governor: the devfreq governor to be removed
*/
int devfreq_remove_governor(struct devfreq_governor *governor)
diff --git a/drivers/devfreq/event/exynos-nocp.c b/drivers/devfreq/event/exynos-nocp.c
index 49e712aca0c1..5c3e7b11e8a6 100644
--- a/drivers/devfreq/event/exynos-nocp.c
+++ b/drivers/devfreq/event/exynos-nocp.c
@@ -190,6 +190,7 @@ static const struct of_device_id exynos_nocp_id_match[] = {
{ .compatible = "samsung,exynos5420-nocp", },
{ /* sentinel */ },
};
+MODULE_DEVICE_TABLE(of, exynos_nocp_id_match);
static struct regmap_config exynos_nocp_regmap_config = {
.reg_bits = 32,
diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c
index f55cf0eb2a66..107eb91a9415 100644
--- a/drivers/devfreq/event/exynos-ppmu.c
+++ b/drivers/devfreq/event/exynos-ppmu.c
@@ -15,7 +15,6 @@
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/mutex.h>
#include <linux/of_address.h>
#include <linux/platform_device.h>
#include <linux/suspend.h>
@@ -34,7 +33,6 @@ struct exynos_ppmu {
unsigned int num_events;
struct device *dev;
- struct mutex lock;
struct exynos_ppmu_data ppmu;
};
@@ -90,8 +88,6 @@ struct __exynos_ppmu_events {
PPMU_EVENT(d1-cpu),
PPMU_EVENT(d1-general),
PPMU_EVENT(d1-rt),
-
- { /* sentinel */ },
};
static int exynos_ppmu_find_ppmu_id(struct devfreq_event_dev *edev)
@@ -351,6 +347,7 @@ static const struct of_device_id exynos_ppmu_id_match[] = {
},
{ /* sentinel */ },
};
+MODULE_DEVICE_TABLE(of, exynos_ppmu_id_match);
static struct devfreq_event_ops *exynos_bus_get_ops(struct device_node *np)
{
@@ -463,7 +460,6 @@ static int exynos_ppmu_probe(struct platform_device *pdev)
if (!info)
return -ENOMEM;
- mutex_init(&info->lock);
info->dev = &pdev->dev;
/* Parse dt data to get resource */
diff --git a/drivers/devfreq/event/rockchip-dfi.c b/drivers/devfreq/event/rockchip-dfi.c
index 43fcc5a7f515..22b113363ffc 100644
--- a/drivers/devfreq/event/rockchip-dfi.c
+++ b/drivers/devfreq/event/rockchip-dfi.c
@@ -188,6 +188,7 @@ static const struct of_device_id rockchip_dfi_id_match[] = {
{ .compatible = "rockchip,rk3399-dfi" },
{ },
};
+MODULE_DEVICE_TABLE(of, rockchip_dfi_id_match);
static int rockchip_dfi_probe(struct platform_device *pdev)
{
diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c
index 29866f7e6d7e..a8ed7792ece2 100644
--- a/drivers/devfreq/exynos-bus.c
+++ b/drivers/devfreq/exynos-bus.c
@@ -35,7 +35,7 @@ struct exynos_bus {
unsigned int edev_count;
struct mutex lock;
- struct dev_pm_opp *curr_opp;
+ unsigned long curr_freq;
struct regulator *regulator;
struct clk *clk;
@@ -99,7 +99,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags)
{
struct exynos_bus *bus = dev_get_drvdata(dev);
struct dev_pm_opp *new_opp;
- unsigned long old_freq, new_freq, old_volt, new_volt, tol;
+ unsigned long old_freq, new_freq, new_volt, tol;
int ret = 0;
/* Get new opp-bus instance according to new bus clock */
@@ -113,8 +113,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags)
new_freq = dev_pm_opp_get_freq(new_opp);
new_volt = dev_pm_opp_get_voltage(new_opp);
- old_freq = dev_pm_opp_get_freq(bus->curr_opp);
- old_volt = dev_pm_opp_get_voltage(bus->curr_opp);
+ old_freq = bus->curr_freq;
rcu_read_unlock();
if (old_freq == new_freq)
@@ -146,7 +145,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags)
goto out;
}
}
- bus->curr_opp = new_opp;
+ bus->curr_freq = new_freq;
dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n",
old_freq/1000, new_freq/1000);
@@ -163,9 +162,7 @@ static int exynos_bus_get_dev_status(struct device *dev,
struct devfreq_event_data edata;
int ret;
- rcu_read_lock();
- stat->current_frequency = dev_pm_opp_get_freq(bus->curr_opp);
- rcu_read_unlock();
+ stat->current_frequency = bus->curr_freq;
ret = exynos_bus_get_event(bus, &edata);
if (ret < 0) {
@@ -226,7 +223,7 @@ static int exynos_bus_passive_target(struct device *dev, unsigned long *freq,
}
new_freq = dev_pm_opp_get_freq(new_opp);
- old_freq = dev_pm_opp_get_freq(bus->curr_opp);
+ old_freq = bus->curr_freq;
rcu_read_unlock();
if (old_freq == new_freq)
@@ -242,7 +239,7 @@ static int exynos_bus_passive_target(struct device *dev, unsigned long *freq,
}
*freq = new_freq;
- bus->curr_opp = new_opp;
+ bus->curr_freq = new_freq;
dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n",
old_freq/1000, new_freq/1000);
@@ -335,6 +332,7 @@ static int exynos_bus_parse_of(struct device_node *np,
struct exynos_bus *bus)
{
struct device *dev = bus->dev;
+ struct dev_pm_opp *opp;
unsigned long rate;
int ret;
@@ -352,22 +350,23 @@ static int exynos_bus_parse_of(struct device_node *np,
}
/* Get the freq and voltage from OPP table to scale the bus freq */
- rcu_read_lock();
ret = dev_pm_opp_of_add_table(dev);
if (ret < 0) {
dev_err(dev, "failed to get OPP table\n");
- rcu_read_unlock();
goto err_clk;
}
rate = clk_get_rate(bus->clk);
- bus->curr_opp = devfreq_recommended_opp(dev, &rate, 0);
- if (IS_ERR(bus->curr_opp)) {
+
+ rcu_read_lock();
+ opp = devfreq_recommended_opp(dev, &rate, 0);
+ if (IS_ERR(opp)) {
dev_err(dev, "failed to find dev_pm_opp\n");
rcu_read_unlock();
- ret = PTR_ERR(bus->curr_opp);
+ ret = PTR_ERR(opp);
goto err_opp;
}
+ bus->curr_freq = dev_pm_opp_get_freq(opp);
rcu_read_unlock();
return 0;
diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c
index e24b73d66659..27d2f349b53c 100644
--- a/drivers/devfreq/rk3399_dmc.c
+++ b/drivers/devfreq/rk3399_dmc.c
@@ -80,7 +80,6 @@ struct rk3399_dmcfreq {
struct regulator *vdd_center;
unsigned long rate, target_rate;
unsigned long volt, target_volt;
- struct dev_pm_opp *curr_opp;
};
static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
@@ -102,9 +101,6 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
target_rate = dev_pm_opp_get_freq(opp);
target_volt = dev_pm_opp_get_voltage(opp);
- dmcfreq->rate = dev_pm_opp_get_freq(dmcfreq->curr_opp);
- dmcfreq->volt = dev_pm_opp_get_voltage(dmcfreq->curr_opp);
-
rcu_read_unlock();
if (dmcfreq->rate == target_rate)
@@ -165,7 +161,9 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
if (err)
dev_err(dev, "Cannot to set vol %lu uV\n", target_volt);
- dmcfreq->curr_opp = opp;
+ dmcfreq->rate = target_rate;
+ dmcfreq->volt = target_volt;
+
out:
mutex_unlock(&dmcfreq->lock);
return err;
@@ -414,7 +412,6 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
*/
if (dev_pm_opp_of_add_table(dev)) {
dev_err(dev, "Invalid operating-points in device tree.\n");
- rcu_read_unlock();
return -EINVAL;
}
@@ -431,12 +428,13 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
rcu_read_unlock();
return PTR_ERR(opp);
}
+ data->rate = dev_pm_opp_get_freq(opp);
+ data->volt = dev_pm_opp_get_voltage(opp);
rcu_read_unlock();
- data->curr_opp = opp;
rk3399_devfreq_dmc_profile.initial_freq = data->rate;
- data->devfreq = devfreq_add_device(dev,
+ data->devfreq = devm_devfreq_add_device(dev,
&rk3399_devfreq_dmc_profile,
"simple_ondemand",
&data->ondemand_data);
@@ -454,6 +452,7 @@ static const struct of_device_id rk3399dmc_devfreq_of_match[] = {
{ .compatible = "rockchip,rk3399-dmc" },
{ },
};
+MODULE_DEVICE_TABLE(of, rk3399dmc_devfreq_of_match);
static struct platform_driver rk3399_dmcfreq_driver = {
.probe = rk3399_dmcfreq_probe,
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 4466a2f969d7..7d8ea3d5fda6 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -98,8 +98,6 @@ static int intel_idle(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index);
static void intel_idle_freeze(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index);
-static int intel_idle_cpu_init(int cpu);
-
static struct cpuidle_state *cpuidle_state_table;
/*
@@ -724,6 +722,50 @@ static struct cpuidle_state atom_cstates[] = {
{
.enter = NULL }
};
+static struct cpuidle_state tangier_cstates[] = {
+ {
+ .name = "C1-TNG",
+ .desc = "MWAIT 0x00",
+ .flags = MWAIT2flg(0x00),
+ .exit_latency = 1,
+ .target_residency = 4,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze, },
+ {
+ .name = "C4-TNG",
+ .desc = "MWAIT 0x30",
+ .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 100,
+ .target_residency = 400,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze, },
+ {
+ .name = "C6-TNG",
+ .desc = "MWAIT 0x52",
+ .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 140,
+ .target_residency = 560,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze, },
+ {
+ .name = "C7-TNG",
+ .desc = "MWAIT 0x60",
+ .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 1200,
+ .target_residency = 4000,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze, },
+ {
+ .name = "C9-TNG",
+ .desc = "MWAIT 0x64",
+ .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 10000,
+ .target_residency = 20000,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze, },
+ {
+ .enter = NULL }
+};
static struct cpuidle_state avn_cstates[] = {
{
.name = "C1-AVN",
@@ -907,51 +949,15 @@ static void intel_idle_freeze(struct cpuidle_device *dev,
mwait_idle_with_hints(eax, ecx);
}
-static void __setup_broadcast_timer(void *arg)
+static void __setup_broadcast_timer(bool on)
{
- unsigned long on = (unsigned long)arg;
-
if (on)
tick_broadcast_enable();
else
tick_broadcast_disable();
}
-static int cpu_hotplug_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
-{
- int hotcpu = (unsigned long)hcpu;
- struct cpuidle_device *dev;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
-
- if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
- smp_call_function_single(hotcpu, __setup_broadcast_timer,
- (void *)true, 1);
-
- /*
- * Some systems can hotplug a cpu at runtime after
- * the kernel has booted, we have to initialize the
- * driver in this case
- */
- dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu);
- if (dev->registered)
- break;
-
- if (intel_idle_cpu_init(hotcpu))
- return NOTIFY_BAD;
-
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_hotplug_notifier = {
- .notifier_call = cpu_hotplug_notify,
-};
-
-static void auto_demotion_disable(void *dummy)
+static void auto_demotion_disable(void)
{
unsigned long long msr_bits;
@@ -959,7 +965,7 @@ static void auto_demotion_disable(void *dummy)
msr_bits &= ~(icpu->auto_demotion_disable_flags);
wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
}
-static void c1e_promotion_disable(void *dummy)
+static void c1e_promotion_disable(void)
{
unsigned long long msr_bits;
@@ -978,6 +984,10 @@ static const struct idle_cpu idle_cpu_atom = {
.state_table = atom_cstates,
};
+static const struct idle_cpu idle_cpu_tangier = {
+ .state_table = tangier_cstates,
+};
+
static const struct idle_cpu idle_cpu_lincroft = {
.state_table = atom_cstates,
.auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE,
@@ -1066,6 +1076,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb),
ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom),
ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt),
+ ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier),
ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht),
ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb),
ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt),
@@ -1084,6 +1095,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, idle_cpu_skl),
ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx),
ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl),
+ ICPU(INTEL_FAM6_XEON_PHI_KNM, idle_cpu_knl),
ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt),
ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv),
{}
@@ -1373,12 +1385,11 @@ static void __init intel_idle_cpuidle_driver_init(void)
* allocate, initialize, register cpuidle_devices
* @cpu: cpu/core to initialize
*/
-static int intel_idle_cpu_init(int cpu)
+static int intel_idle_cpu_init(unsigned int cpu)
{
struct cpuidle_device *dev;
dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
-
dev->cpu = cpu;
if (cpuidle_register_device(dev)) {
@@ -1387,17 +1398,36 @@ static int intel_idle_cpu_init(int cpu)
}
if (icpu->auto_demotion_disable_flags)
- smp_call_function_single(cpu, auto_demotion_disable, NULL, 1);
+ auto_demotion_disable();
if (icpu->disable_promotion_to_c1e)
- smp_call_function_single(cpu, c1e_promotion_disable, NULL, 1);
+ c1e_promotion_disable();
+
+ return 0;
+}
+
+static int intel_idle_cpu_online(unsigned int cpu)
+{
+ struct cpuidle_device *dev;
+
+ if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
+ __setup_broadcast_timer(true);
+
+ /*
+	 * Some systems can hotplug a CPU at runtime after the
+	 * kernel has booted, so we have to initialize the driver
+	 * in this case.
+ */
+ dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
+ if (!dev->registered)
+ return intel_idle_cpu_init(cpu);
return 0;
}
static int __init intel_idle_init(void)
{
- int retval, i;
+ int retval;
/* Do not load intel_idle at all for now if idle= is passed */
if (boot_option_idle_override != IDLE_NO_OVERRIDE)
@@ -1417,35 +1447,29 @@ static int __init intel_idle_init(void)
struct cpuidle_driver *drv = cpuidle_get_driver();
printk(KERN_DEBUG PREFIX "intel_idle yielding to %s",
drv ? drv->name : "none");
- free_percpu(intel_idle_cpuidle_devices);
- return retval;
+ goto init_driver_fail;
}
- cpu_notifier_register_begin();
-
- for_each_online_cpu(i) {
- retval = intel_idle_cpu_init(i);
- if (retval) {
- intel_idle_cpuidle_devices_uninit();
- cpu_notifier_register_done();
- cpuidle_unregister_driver(&intel_idle_driver);
- free_percpu(intel_idle_cpuidle_devices);
- return retval;
- }
- }
- __register_cpu_notifier(&cpu_hotplug_notifier);
-
if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */
lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
- else
- on_each_cpu(__setup_broadcast_timer, (void *)true, 1);
- cpu_notifier_register_done();
+ retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
+ intel_idle_cpu_online, NULL);
+ if (retval < 0)
+ goto hp_setup_fail;
pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n",
lapic_timer_reliable_states);
return 0;
+
+hp_setup_fail:
+ intel_idle_cpuidle_devices_uninit();
+ cpuidle_unregister_driver(&intel_idle_driver);
+init_driver_fail:
+ free_percpu(intel_idle_cpuidle_devices);
+ return retval;
}
device_initcall(intel_idle_init);
diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c
index c48fc0c4abd9..fa5ca0992be6 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -2585,6 +2585,9 @@ static int smsc911x_suspend(struct device *dev)
PMT_CTRL_PM_MODE_D1_ | PMT_CTRL_WOL_EN_ |
PMT_CTRL_ED_EN_ | PMT_CTRL_PME_EN_);
+ pm_runtime_disable(dev);
+ pm_runtime_set_suspended(dev);
+
return 0;
}
@@ -2594,6 +2597,9 @@ static int smsc911x_resume(struct device *dev)
struct smsc911x_data *pdata = netdev_priv(ndev);
unsigned int to = 100;
+ pm_runtime_enable(dev);
+ pm_runtime_resume(dev);
+
/* Note 3.11 from the datasheet:
* "When the LAN9220 is in a power saving state, a write of any
* data to the BYTE_TEST register will wake-up the device."
diff --git a/drivers/power/avs/rockchip-io-domain.c b/drivers/power/avs/rockchip-io-domain.c
index 01b6d3f9b8fb..56bce1908be2 100644
--- a/drivers/power/avs/rockchip-io-domain.c
+++ b/drivers/power/avs/rockchip-io-domain.c
@@ -143,7 +143,7 @@ static int rockchip_iodomain_notify(struct notifier_block *nb,
if (ret && event == REGULATOR_EVENT_PRE_VOLTAGE_CHANGE)
return NOTIFY_BAD;
- dev_info(supply->iod->dev, "Setting to %d done\n", uV);
+ dev_dbg(supply->iod->dev, "Setting to %d done\n", uV);
return NOTIFY_OK;
}
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 243b233ff31b..9a25110c4a46 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -189,14 +189,13 @@ struct rapl_package {
unsigned int time_unit;
struct rapl_domain *domains; /* array of domains, sized at runtime */
struct powercap_zone *power_zone; /* keep track of parent zone */
- int nr_cpus; /* active cpus on the package, topology info is lost during
- * cpu hotplug. so we have to track ourselves.
- */
unsigned long power_limit_irq; /* keep track of package power limit
* notify interrupt enable status.
*/
struct list_head plist;
int lead_cpu; /* one active cpu per package for access */
+ /* Track active cpus */
+ struct cpumask cpumask;
};
struct rapl_defaults {
@@ -275,18 +274,6 @@ static struct rapl_package *find_package_by_id(int id)
return NULL;
}
-/* caller must hold cpu hotplug lock */
-static void rapl_cleanup_data(void)
-{
- struct rapl_package *p, *tmp;
-
- list_for_each_entry_safe(p, tmp, &rapl_packages, plist) {
- kfree(p->domains);
- list_del(&p->plist);
- kfree(p);
- }
-}
-
static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw)
{
struct rapl_domain *rd;
@@ -442,6 +429,7 @@ static int contraint_to_pl(struct rapl_domain *rd, int cid)
return i;
}
}
+ pr_err("Cannot find matching power limit for constraint %d\n", cid);
return -EINVAL;
}
@@ -457,6 +445,10 @@ static int set_power_limit(struct powercap_zone *power_zone, int cid,
get_online_cpus();
rd = power_zone_to_rapl_domain(power_zone);
id = contraint_to_pl(rd, cid);
+ if (id < 0) {
+ ret = id;
+ goto set_exit;
+ }
rp = rd->rp;
@@ -496,6 +488,11 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
get_online_cpus();
rd = power_zone_to_rapl_domain(power_zone);
id = contraint_to_pl(rd, cid);
+ if (id < 0) {
+ ret = id;
+ goto get_exit;
+ }
+
switch (rd->rpl[id].prim_id) {
case PL1_ENABLE:
prim = POWER_LIMIT1;
@@ -512,6 +509,7 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
else
*data = val;
+get_exit:
put_online_cpus();
return ret;
@@ -527,6 +525,10 @@ static int set_time_window(struct powercap_zone *power_zone, int cid,
get_online_cpus();
rd = power_zone_to_rapl_domain(power_zone);
id = contraint_to_pl(rd, cid);
+ if (id < 0) {
+ ret = id;
+ goto set_time_exit;
+ }
switch (rd->rpl[id].prim_id) {
case PL1_ENABLE:
@@ -538,6 +540,8 @@ static int set_time_window(struct powercap_zone *power_zone, int cid,
default:
ret = -EINVAL;
}
+
+set_time_exit:
put_online_cpus();
return ret;
}
@@ -552,6 +556,10 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data)
get_online_cpus();
rd = power_zone_to_rapl_domain(power_zone);
id = contraint_to_pl(rd, cid);
+ if (id < 0) {
+ ret = id;
+ goto get_time_exit;
+ }
switch (rd->rpl[id].prim_id) {
case PL1_ENABLE:
@@ -566,6 +574,8 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data)
}
if (!ret)
*data = val;
+
+get_time_exit:
put_online_cpus();
return ret;
@@ -707,7 +717,7 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
case ENERGY_UNIT:
scale = ENERGY_UNIT_SCALE;
/* per domain unit takes precedence */
- if (rd && rd->domain_energy_unit)
+ if (rd->domain_energy_unit)
units = rd->domain_energy_unit;
else
units = rp->energy_unit;
@@ -976,10 +986,20 @@ static void package_power_limit_irq_save(struct rapl_package *rp)
smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
}
-static void power_limit_irq_restore_cpu(void *info)
+/*
+ * Restore per package power limit interrupt enable state. Called from cpu
+ * hotplug code on package removal.
+ */
+static void package_power_limit_irq_restore(struct rapl_package *rp)
{
- u32 l, h = 0;
- struct rapl_package *rp = (struct rapl_package *)info;
+ u32 l, h;
+
+ if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
+ return;
+
+ /* irq enable state not saved, nothing to restore */
+ if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
+ return;
rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
@@ -991,19 +1011,6 @@ static void power_limit_irq_restore_cpu(void *info)
wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}
-/* restore per package power limit interrupt enable state */
-static void package_power_limit_irq_restore(struct rapl_package *rp)
-{
- if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
- return;
-
- /* irq enable state not saved, nothing to restore */
- if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
- return;
-
- smp_call_function_single(rp->lead_cpu, power_limit_irq_restore_cpu, rp, 1);
-}
-
static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
{
int nr_powerlimit = find_nr_power_limit(rd);
@@ -1160,84 +1167,49 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core),
RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server),
+ RAPL_CPU(INTEL_FAM6_XEON_PHI_KNM, rapl_defaults_hsw_server),
{}
};
MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
-/* read once for all raw primitive data for all packages, domains */
-static void rapl_update_domain_data(void)
+/* Read all raw primitive data for the package's domains in one pass */
+static void rapl_update_domain_data(struct rapl_package *rp)
{
int dmn, prim;
u64 val;
- struct rapl_package *rp;
- list_for_each_entry(rp, &rapl_packages, plist) {
- for (dmn = 0; dmn < rp->nr_domains; dmn++) {
- pr_debug("update package %d domain %s data\n", rp->id,
- rp->domains[dmn].name);
- /* exclude non-raw primitives */
- for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++)
- if (!rapl_read_data_raw(&rp->domains[dmn], prim,
- rpi[prim].unit,
- &val))
- rp->domains[dmn].rdd.primitives[prim] =
- val;
+ for (dmn = 0; dmn < rp->nr_domains; dmn++) {
+ pr_debug("update package %d domain %s data\n", rp->id,
+ rp->domains[dmn].name);
+ /* exclude non-raw primitives */
+ for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
+ if (!rapl_read_data_raw(&rp->domains[dmn], prim,
+ rpi[prim].unit, &val))
+ rp->domains[dmn].rdd.primitives[prim] = val;
}
}
}
-static int rapl_unregister_powercap(void)
+static void rapl_unregister_powercap(void)
{
- struct rapl_package *rp;
- struct rapl_domain *rd, *rd_package = NULL;
-
- /* unregister all active rapl packages from the powercap layer,
- * hotplug lock held
- */
- list_for_each_entry(rp, &rapl_packages, plist) {
- package_power_limit_irq_restore(rp);
-
- for (rd = rp->domains; rd < rp->domains + rp->nr_domains;
- rd++) {
- pr_debug("remove package, undo power limit on %d: %s\n",
- rp->id, rd->name);
- rapl_write_data_raw(rd, PL1_ENABLE, 0);
- rapl_write_data_raw(rd, PL1_CLAMP, 0);
- if (find_nr_power_limit(rd) > 1) {
- rapl_write_data_raw(rd, PL2_ENABLE, 0);
- rapl_write_data_raw(rd, PL2_CLAMP, 0);
- }
- if (rd->id == RAPL_DOMAIN_PACKAGE) {
- rd_package = rd;
- continue;
- }
- powercap_unregister_zone(control_type, &rd->power_zone);
- }
- /* do the package zone last */
- if (rd_package)
- powercap_unregister_zone(control_type,
- &rd_package->power_zone);
- }
-
if (platform_rapl_domain) {
powercap_unregister_zone(control_type,
&platform_rapl_domain->power_zone);
kfree(platform_rapl_domain);
}
-
powercap_unregister_control_type(control_type);
-
- return 0;
}
static int rapl_package_register_powercap(struct rapl_package *rp)
{
struct rapl_domain *rd;
- int ret = 0;
char dev_name[17]; /* max domain name = 7 + 1 + 8 for int + 1 for null*/
struct powercap_zone *power_zone = NULL;
- int nr_pl;
+ int nr_pl, ret;
+
+ /* Update the domain data of the new package */
+ rapl_update_domain_data(rp);
/* first we register package domain as the parent zone*/
for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
@@ -1257,8 +1229,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
if (IS_ERR(power_zone)) {
pr_debug("failed to register package, %d\n",
rp->id);
- ret = PTR_ERR(power_zone);
- goto exit_package;
+ return PTR_ERR(power_zone);
}
/* track parent zone in per package/socket data */
rp->power_zone = power_zone;
@@ -1268,8 +1239,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
}
if (!power_zone) {
pr_err("no package domain found, unknown topology!\n");
- ret = -ENODEV;
- goto exit_package;
+ return -ENODEV;
}
/* now register domains as children of the socket/package*/
for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
@@ -1290,11 +1260,11 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
goto err_cleanup;
}
}
+ return 0;
-exit_package:
- return ret;
err_cleanup:
- /* clean up previously initialized domains within the package if we
+ /*
+ * Clean up previously initialized domains within the package if we
* failed after the first domain setup.
*/
while (--rd >= rp->domains) {
@@ -1305,7 +1275,7 @@ err_cleanup:
return ret;
}
-static int rapl_register_psys(void)
+static int __init rapl_register_psys(void)
{
struct rapl_domain *rd;
struct powercap_zone *power_zone;
@@ -1346,40 +1316,14 @@ static int rapl_register_psys(void)
return 0;
}
-static int rapl_register_powercap(void)
+static int __init rapl_register_powercap(void)
{
- struct rapl_domain *rd;
- struct rapl_package *rp;
- int ret = 0;
-
control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
if (IS_ERR(control_type)) {
pr_debug("failed to register powercap control_type.\n");
return PTR_ERR(control_type);
}
- /* read the initial data */
- rapl_update_domain_data();
- list_for_each_entry(rp, &rapl_packages, plist)
- if (rapl_package_register_powercap(rp))
- goto err_cleanup_package;
-
- /* Don't bail out if PSys is not supported */
- rapl_register_psys();
-
- return ret;
-
-err_cleanup_package:
- /* clean up previously initialized packages */
- list_for_each_entry_continue_reverse(rp, &rapl_packages, plist) {
- for (rd = rp->domains; rd < rp->domains + rp->nr_domains;
- rd++) {
- pr_debug("unregister zone/package %d, %s domain\n",
- rp->id, rd->name);
- powercap_unregister_zone(control_type, &rd->power_zone);
- }
- }
-
- return ret;
+ return 0;
}
static int rapl_check_domain(int cpu, int domain)
@@ -1452,9 +1396,8 @@ static void rapl_detect_powerlimit(struct rapl_domain *rd)
*/
static int rapl_detect_domains(struct rapl_package *rp, int cpu)
{
- int i;
- int ret = 0;
struct rapl_domain *rd;
+ int i;
for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
/* use physical package id to read counters */
@@ -1466,84 +1409,20 @@ static int rapl_detect_domains(struct rapl_package *rp, int cpu)
rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
if (!rp->nr_domains) {
pr_debug("no valid rapl domains found in package %d\n", rp->id);
- ret = -ENODEV;
- goto done;
+ return -ENODEV;
}
pr_debug("found %d domains on package %d\n", rp->nr_domains, rp->id);
rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
GFP_KERNEL);
- if (!rp->domains) {
- ret = -ENOMEM;
- goto done;
- }
+ if (!rp->domains)
+ return -ENOMEM;
+
rapl_init_domains(rp);
for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
rapl_detect_powerlimit(rd);
-
-
-done:
- return ret;
-}
-
-static bool is_package_new(int package)
-{
- struct rapl_package *rp;
-
- /* caller prevents cpu hotplug, there will be no new packages added
- * or deleted while traversing the package list, no need for locking.
- */
- list_for_each_entry(rp, &rapl_packages, plist)
- if (package == rp->id)
- return false;
-
- return true;
-}
-
-/* RAPL interface can be made of a two-level hierarchy: package level and domain
- * level. We first detect the number of packages then domains of each package.
- * We have to consider the possiblity of CPU online/offline due to hotplug and
- * other scenarios.
- */
-static int rapl_detect_topology(void)
-{
- int i;
- int phy_package_id;
- struct rapl_package *new_package, *rp;
-
- for_each_online_cpu(i) {
- phy_package_id = topology_physical_package_id(i);
- if (is_package_new(phy_package_id)) {
- new_package = kzalloc(sizeof(*rp), GFP_KERNEL);
- if (!new_package) {
- rapl_cleanup_data();
- return -ENOMEM;
- }
- /* add the new package to the list */
- new_package->id = phy_package_id;
- new_package->nr_cpus = 1;
- /* use the first active cpu of the package to access */
- new_package->lead_cpu = i;
- /* check if the package contains valid domains */
- if (rapl_detect_domains(new_package, i) ||
- rapl_defaults->check_unit(new_package, i)) {
- kfree(new_package->domains);
- kfree(new_package);
- /* free up the packages already initialized */
- rapl_cleanup_data();
- return -ENODEV;
- }
- INIT_LIST_HEAD(&new_package->plist);
- list_add(&new_package->plist, &rapl_packages);
- } else {
- rp = find_package_by_id(phy_package_id);
- if (rp)
- ++rp->nr_cpus;
- }
- }
-
return 0;
}
@@ -1552,12 +1431,21 @@ static void rapl_remove_package(struct rapl_package *rp)
{
struct rapl_domain *rd, *rd_package = NULL;
+ package_power_limit_irq_restore(rp);
+
for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
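+ /* Disable and unclamp each domain's power limits before its zone goes away */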
+ rapl_write_data_raw(rd, PL1_ENABLE, 0);
+ rapl_write_data_raw(rd, PL1_CLAMP, 0);
+ if (find_nr_power_limit(rd) > 1) {
+ rapl_write_data_raw(rd, PL2_ENABLE, 0);
+ rapl_write_data_raw(rd, PL2_CLAMP, 0);
+ }
if (rd->id == RAPL_DOMAIN_PACKAGE) {
rd_package = rd;
continue;
}
- pr_debug("remove package %d, %s domain\n", rp->id, rd->name);
+ pr_debug("remove package, undo power limit on %d: %s\n",
+ rp->id, rd->name);
powercap_unregister_zone(control_type, &rd->power_zone);
}
/* do parent zone last */
@@ -1567,20 +1455,17 @@ static void rapl_remove_package(struct rapl_package *rp)
}
/* called from CPU hotplug notifier, hotplug lock held */
-static int rapl_add_package(int cpu)
+static struct rapl_package *rapl_add_package(int cpu, int pkgid)
{
- int ret = 0;
- int phy_package_id;
struct rapl_package *rp;
+ int ret;
- phy_package_id = topology_physical_package_id(cpu);
rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
if (!rp)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
/* add the new package to the list */
- rp->id = phy_package_id;
- rp->nr_cpus = 1;
+ rp->id = pkgid;
rp->lead_cpu = cpu;
/* check if the package contains valid domains */
@@ -1589,17 +1474,17 @@ static int rapl_add_package(int cpu)
ret = -ENODEV;
goto err_free_package;
}
- if (!rapl_package_register_powercap(rp)) {
+ ret = rapl_package_register_powercap(rp);
+ if (!ret) {
INIT_LIST_HEAD(&rp->plist);
list_add(&rp->plist, &rapl_packages);
- return ret;
+ return rp;
}
err_free_package:
kfree(rp->domains);
kfree(rp);
-
- return ret;
+ return ERR_PTR(ret);
}
/* Handles CPU hotplug on multi-socket systems.
@@ -1609,55 +1494,46 @@ err_free_package:
* associated domains. Cooling devices are handled accordingly at
* per-domain level.
*/
-static int rapl_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int rapl_cpu_online(unsigned int cpu)
{
- unsigned long cpu = (unsigned long)hcpu;
- int phy_package_id;
+ int pkgid = topology_physical_package_id(cpu);
struct rapl_package *rp;
- int lead_cpu;
- phy_package_id = topology_physical_package_id(cpu);
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- rp = find_package_by_id(phy_package_id);
- if (rp)
- ++rp->nr_cpus;
- else
- rapl_add_package(cpu);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- rp = find_package_by_id(phy_package_id);
- if (!rp)
- break;
- if (--rp->nr_cpus == 0)
- rapl_remove_package(rp);
- else if (cpu == rp->lead_cpu) {
- /* choose another active cpu in the package */
- lead_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu);
- if (lead_cpu < nr_cpu_ids)
- rp->lead_cpu = lead_cpu;
- else /* should never go here */
- pr_err("no active cpu available for package %d\n",
- phy_package_id);
- }
+ rp = find_package_by_id(pkgid);
+ if (!rp) {
+ rp = rapl_add_package(cpu, pkgid);
+ if (IS_ERR(rp))
+ return PTR_ERR(rp);
}
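+ /* Record this CPU as an online member of the package */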
+ cpumask_set_cpu(cpu, &rp->cpumask);
+ return 0;
+}
+
+static int rapl_cpu_down_prep(unsigned int cpu)
+{
+ int pkgid = topology_physical_package_id(cpu);
+ struct rapl_package *rp;
+ int lead_cpu;
+
+ rp = find_package_by_id(pkgid);
+ if (!rp)
+ return 0;
- return NOTIFY_OK;
+ cpumask_clear_cpu(cpu, &rp->cpumask);
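+ /*
+ * Remove the package when its last CPU goes offline; otherwise
+ * migrate the lead role to another online CPU if needed.
+ */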
+ lead_cpu = cpumask_first(&rp->cpumask);
+ if (lead_cpu >= nr_cpu_ids)
+ rapl_remove_package(rp);
+ else if (rp->lead_cpu == cpu)
+ rp->lead_cpu = lead_cpu;
+ return 0;
}
-static struct notifier_block rapl_cpu_notifier = {
- .notifier_call = rapl_cpu_callback,
-};
+static enum cpuhp_state pcap_rapl_online;
static int __init rapl_init(void)
{
- int ret = 0;
const struct x86_cpu_id *id;
+ int ret;
id = x86_match_cpu(rapl_ids);
if (!id) {
@@ -1669,36 +1545,29 @@ static int __init rapl_init(void)
rapl_defaults = (struct rapl_defaults *)id->driver_data;
- cpu_notifier_register_begin();
-
- /* prevent CPU hotplug during detection */
- get_online_cpus();
- ret = rapl_detect_topology();
+ ret = rapl_register_powercap();
if (ret)
- goto done;
+ return ret;
- if (rapl_register_powercap()) {
- rapl_cleanup_data();
- ret = -ENODEV;
- goto done;
- }
- __register_hotcpu_notifier(&rapl_cpu_notifier);
-done:
- put_online_cpus();
- cpu_notifier_register_done();
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online",
+ rapl_cpu_online, rapl_cpu_down_prep);
+ if (ret < 0)
+ goto err_unreg;
+ pcap_rapl_online = ret;
+
+ /* Don't bail out if PSys is not supported */
+ rapl_register_psys();
+ return 0;
+err_unreg:
+ rapl_unregister_powercap();
return ret;
}
static void __exit rapl_exit(void)
{
- cpu_notifier_register_begin();
- get_online_cpus();
- __unregister_hotcpu_notifier(&rapl_cpu_notifier);
+ cpuhp_remove_state(pcap_rapl_online);
rapl_unregister_powercap();
- rapl_cleanup_data();
- put_online_cpus();
- cpu_notifier_register_done();
}
module_init(rapl_init);
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 350cb5e22ff3..df64692e9e64 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -43,7 +43,6 @@
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
-#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
@@ -85,11 +84,26 @@ static unsigned int control_cpu; /* The cpu assigned to collect stat and update
*/
static bool clamping;
+static const struct sched_param sparam = {
+ .sched_priority = MAX_USER_RT_PRIO / 2,
+};
+struct powerclamp_worker_data {
+ struct kthread_worker *worker;
+ struct kthread_work balancing_work;
+ struct kthread_delayed_work idle_injection_work;
+ unsigned int cpu;
+ unsigned int count;
+ unsigned int guard;
+ unsigned int window_size_now;
+ unsigned int target_ratio;
+ unsigned int duration_jiffies;
+ bool clamping;
+};
-static struct task_struct * __percpu *powerclamp_thread;
+static struct powerclamp_worker_data * __percpu worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu
- * clamping thread
+ * clamping kthread worker
*/
static unsigned int duration;
@@ -261,11 +275,6 @@ static u64 pkg_state_counter(void)
return count;
}
-static void noop_timer(unsigned long foo)
-{
- /* empty... just the fact that we get the interrupt wakes us up */
-}
-
static unsigned int get_compensation(int ratio)
{
unsigned int comp = 0;
@@ -367,103 +376,79 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
return set_target_ratio + guard <= current_ratio;
}
-static int clamp_thread(void *arg)
+static void clamp_balancing_func(struct kthread_work *work)
{
- int cpunr = (unsigned long)arg;
- DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
- static const struct sched_param param = {
- .sched_priority = MAX_USER_RT_PRIO/2,
- };
- unsigned int count = 0;
- unsigned int target_ratio;
+ struct powerclamp_worker_data *w_data;
+ int sleeptime;
+ unsigned long target_jiffies;
+ unsigned int compensated_ratio;
+ int interval; /* jiffies to sleep for each attempt */
- set_bit(cpunr, cpu_clamping_mask);
- set_freezable();
- init_timer_on_stack(&wakeup_timer);
- sched_setscheduler(current, SCHED_FIFO, &param);
-
- while (true == clamping && !kthread_should_stop() &&
- cpu_online(cpunr)) {
- int sleeptime;
- unsigned long target_jiffies;
- unsigned int guard;
- unsigned int compensated_ratio;
- int interval; /* jiffies to sleep for each attempt */
- unsigned int duration_jiffies = msecs_to_jiffies(duration);
- unsigned int window_size_now;
-
- try_to_freeze();
- /*
- * make sure user selected ratio does not take effect until
- * the next round. adjust target_ratio if user has changed
- * target such that we can converge quickly.
- */
- target_ratio = set_target_ratio;
- guard = 1 + target_ratio/20;
- window_size_now = window_size;
- count++;
-
- /*
- * systems may have different ability to enter package level
- * c-states, thus we need to compensate the injected idle ratio
- * to achieve the actual target reported by the HW.
- */
- compensated_ratio = target_ratio +
- get_compensation(target_ratio);
- if (compensated_ratio <= 0)
- compensated_ratio = 1;
- interval = duration_jiffies * 100 / compensated_ratio;
-
- /* align idle time */
- target_jiffies = roundup(jiffies, interval);
- sleeptime = target_jiffies - jiffies;
- if (sleeptime <= 0)
- sleeptime = 1;
- schedule_timeout_interruptible(sleeptime);
- /*
- * only elected controlling cpu can collect stats and update
- * control parameters.
- */
- if (cpunr == control_cpu && !(count%window_size_now)) {
- should_skip =
- powerclamp_adjust_controls(target_ratio,
- guard, window_size_now);
- smp_mb();
- }
+ w_data = container_of(work, struct powerclamp_worker_data,
+ balancing_work);
- if (should_skip)
- continue;
-
- target_jiffies = jiffies + duration_jiffies;
- mod_timer(&wakeup_timer, target_jiffies);
- if (unlikely(local_softirq_pending()))
- continue;
- /*
- * stop tick sched during idle time, interrupts are still
- * allowed. thus jiffies are updated properly.
- */
- preempt_disable();
- /* mwait until target jiffies is reached */
- while (time_before(jiffies, target_jiffies)) {
- unsigned long ecx = 1;
- unsigned long eax = target_mwait;
-
- /*
- * REVISIT: may call enter_idle() to notify drivers who
- * can save power during cpu idle. same for exit_idle()
- */
- local_touch_nmi();
- stop_critical_timings();
- mwait_idle_with_hints(eax, ecx);
- start_critical_timings();
- atomic_inc(&idle_wakeup_counter);
- }
- preempt_enable();
+ /*
+ * make sure user selected ratio does not take effect until
+ * the next round. adjust target_ratio if user has changed
+ * target such that we can converge quickly.
+ */
+ w_data->target_ratio = READ_ONCE(set_target_ratio);
+ w_data->guard = 1 + w_data->target_ratio / 20;
+ w_data->window_size_now = window_size;
+ w_data->duration_jiffies = msecs_to_jiffies(duration);
+ w_data->count++;
+
+ /*
+ * systems may have different ability to enter package level
+ * c-states, thus we need to compensate the injected idle ratio
+ * to achieve the actual target reported by the HW.
+ */
+ compensated_ratio = w_data->target_ratio +
+ get_compensation(w_data->target_ratio);
+ if (compensated_ratio <= 0)
+ compensated_ratio = 1;
+ interval = w_data->duration_jiffies * 100 / compensated_ratio;
+
+ /* align idle time */
+ target_jiffies = roundup(jiffies, interval);
+ sleeptime = target_jiffies - jiffies;
+ if (sleeptime <= 0)
+ sleeptime = 1;
+
+ if (clamping && w_data->clamping && cpu_online(w_data->cpu))
+ kthread_queue_delayed_work(w_data->worker,
+ &w_data->idle_injection_work,
+ sleeptime);
+}
+
+static void clamp_idle_injection_func(struct kthread_work *work)
+{
+ struct powerclamp_worker_data *w_data;
+
+ w_data = container_of(work, struct powerclamp_worker_data,
+ idle_injection_work.work);
+
+ /*
+ * only elected controlling cpu can collect stats and update
+ * control parameters.
+ */
+ if (w_data->cpu == control_cpu &&
+ !(w_data->count % w_data->window_size_now)) {
+ should_skip =
+ powerclamp_adjust_controls(w_data->target_ratio,
+ w_data->guard,
+ w_data->window_size_now);
+ smp_mb();
}
- del_timer_sync(&wakeup_timer);
- clear_bit(cpunr, cpu_clamping_mask);
- return 0;
+ if (should_skip)
+ goto balance;
+
+ play_idle(jiffies_to_msecs(w_data->duration_jiffies));
+
+balance:
+ if (clamping && w_data->clamping && cpu_online(w_data->cpu))
+ kthread_queue_work(w_data->worker, &w_data->balancing_work);
}
/*
@@ -507,10 +492,60 @@ static void poll_pkg_cstate(struct work_struct *dummy)
schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}
+static void start_power_clamp_worker(unsigned long cpu)
+{
+ struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
+ struct kthread_worker *worker;
+
+ worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu);
+ if (IS_ERR(worker))
+ return;
+
+ w_data->worker = worker;
+ w_data->count = 0;
+ w_data->cpu = cpu;
+ w_data->clamping = true;
+ set_bit(cpu, cpu_clamping_mask);
+ sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
+ kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
+ kthread_init_delayed_work(&w_data->idle_injection_work,
+ clamp_idle_injection_func);
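+ /* Kick off the first balancing pass; it re-queues the idle injection work */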
+ kthread_queue_work(w_data->worker, &w_data->balancing_work);
+}
+
+static void stop_power_clamp_worker(unsigned long cpu)
+{
+ struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
+
+ if (!w_data->worker)
+ return;
+
+ w_data->clamping = false;
+ /*
+ * Make sure that all work items queued after this point see
+ * clamping disabled. A counterpart barrier is not needed because
+ * there is an implicit memory barrier when the queued work
+ * is processed.
+ */
+ smp_wmb();
+ kthread_cancel_work_sync(&w_data->balancing_work);
+ kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
+ /*
+ * The balancing work might still be queued here because the
+ * handling of the "clamping" variable and the cancel/queue
+ * operations are not synchronized via a lock. That is not a
+ * big deal: the balancing work is fast and kthread_destroy_worker()
+ * will wait for it.
+ */
+ clear_bit(w_data->cpu, cpu_clamping_mask);
+ kthread_destroy_worker(w_data->worker);
+
+ w_data->worker = NULL;
+}
+
static int start_power_clamp(void)
{
unsigned long cpu;
- struct task_struct *thread;
set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
/* prevent cpu hotplug */
@@ -524,22 +559,9 @@ static int start_power_clamp(void)
clamping = true;
schedule_delayed_work(&poll_pkg_cstate_work, 0);
- /* start one thread per online cpu */
+ /* start one kthread worker per online cpu */
for_each_online_cpu(cpu) {
- struct task_struct **p =
- per_cpu_ptr(powerclamp_thread, cpu);
-
- thread = kthread_create_on_node(clamp_thread,
- (void *) cpu,
- cpu_to_node(cpu),
- "kidle_inject/%ld", cpu);
- /* bind to cpu here */
- if (likely(!IS_ERR(thread))) {
- kthread_bind(thread, cpu);
- wake_up_process(thread);
- *p = thread;
- }
-
+ start_power_clamp_worker(cpu);
}
put_online_cpus();
@@ -549,71 +571,49 @@ static int start_power_clamp(void)
static void end_power_clamp(void)
{
int i;
- struct task_struct *thread;
- clamping = false;
/*
- * make clamping visible to other cpus and give per cpu clamping threads
- * sometime to exit, or gets killed later.
+ * Block requeuing in all the kthread workers. They will flush and
+ * stop faster.
*/
- smp_mb();
- msleep(20);
+ clamping = false;
if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
- pr_debug("clamping thread for cpu %d alive, kill\n", i);
- thread = *per_cpu_ptr(powerclamp_thread, i);
- kthread_stop(thread);
+ pr_debug("clamping worker for cpu %d alive, destroy\n",
+ i);
+ stop_power_clamp_worker(i);
}
}
}
-static int powerclamp_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int powerclamp_cpu_online(unsigned int cpu)
{
- unsigned long cpu = (unsigned long)hcpu;
- struct task_struct *thread;
- struct task_struct **percpu_thread =
- per_cpu_ptr(powerclamp_thread, cpu);
-
- if (false == clamping)
- goto exit_ok;
-
- switch (action) {
- case CPU_ONLINE:
- thread = kthread_create_on_node(clamp_thread,
- (void *) cpu,
- cpu_to_node(cpu),
- "kidle_inject/%lu", cpu);
- if (likely(!IS_ERR(thread))) {
- kthread_bind(thread, cpu);
- wake_up_process(thread);
- *percpu_thread = thread;
- }
- /* prefer BSP as controlling CPU */
- if (cpu == 0) {
- control_cpu = 0;
- smp_mb();
- }
- break;
- case CPU_DEAD:
- if (test_bit(cpu, cpu_clamping_mask)) {
- pr_err("cpu %lu dead but powerclamping thread is not\n",
- cpu);
- kthread_stop(*percpu_thread);
- }
- if (cpu == control_cpu) {
- control_cpu = smp_processor_id();
- smp_mb();
- }
+ if (!clamping)
+ return 0;
+ start_power_clamp_worker(cpu);
+ /* prefer BSP as controlling CPU */
+ if (cpu == 0) {
+ control_cpu = 0;
+ smp_mb();
}
-
-exit_ok:
- return NOTIFY_OK;
+ return 0;
}
-static struct notifier_block powerclamp_cpu_notifier = {
- .notifier_call = powerclamp_cpu_callback,
-};
+static int powerclamp_cpu_predown(unsigned int cpu)
+{
+ if (!clamping)
+ return 0;
+
+ stop_power_clamp_worker(cpu);
+ if (cpu != control_cpu)
+ return 0;
+
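+ /* This CPU was the controlling one; hand the role to another online CPU */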
+ control_cpu = cpumask_first(cpu_online_mask);
+ if (control_cpu == cpu)
+ control_cpu = cpumask_next(cpu, cpu_online_mask);
+ smp_mb();
+ return 0;
+}
static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
unsigned long *state)
@@ -741,6 +741,8 @@ file_error:
debugfs_remove_recursive(debug_dir);
}
+static enum cpuhp_state hp_state;
+
static int __init powerclamp_init(void)
{
int retval;
@@ -758,10 +760,17 @@ static int __init powerclamp_init(void)
/* set default limit, maybe adjusted during runtime based on feedback */
window_size = 2;
- register_hotcpu_notifier(&powerclamp_cpu_notifier);
+ retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "thermal/intel_powerclamp:online",
+ powerclamp_cpu_online,
+ powerclamp_cpu_predown);
+ if (retval < 0)
+ goto exit_free;
+
+ hp_state = retval;
- powerclamp_thread = alloc_percpu(struct task_struct *);
- if (!powerclamp_thread) {
+ worker_data = alloc_percpu(struct powerclamp_worker_data);
+ if (!worker_data) {
retval = -ENOMEM;
goto exit_unregister;
}
@@ -781,9 +790,9 @@ static int __init powerclamp_init(void)
return 0;
exit_free_thread:
- free_percpu(powerclamp_thread);
+ free_percpu(worker_data);
exit_unregister:
- unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
+ cpuhp_remove_state_nocalls(hp_state);
exit_free:
kfree(cpu_clamping_mask);
return retval;
@@ -792,9 +801,9 @@ module_init(powerclamp_init);
static void __exit powerclamp_exit(void)
{
- unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
end_power_clamp();
- free_percpu(powerclamp_thread);
+ cpuhp_remove_state_nocalls(hp_state);
+ free_percpu(worker_data);
thermal_cooling_device_unregister(cooling_dev);
kfree(cpu_clamping_mask);
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index f3db11c24654..c1ba00fc4888 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -249,6 +249,7 @@ extern int acpi_processor_register_performance(struct acpi_processor_performance
*performance, unsigned int cpu);
extern void acpi_processor_unregister_performance(unsigned int cpu);
+int acpi_processor_pstate_control(void);
/* note: this locks both the calling module and the processor module
if a _PPC object exists, rmmod is disallowed then */
int acpi_processor_notify_smm(struct module *calling_module);
@@ -294,7 +295,7 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx
#ifdef CONFIG_CPU_FREQ
void acpi_processor_ppc_init(void);
void acpi_processor_ppc_exit(void);
-int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag);
+void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag);
extern int acpi_processor_get_bios_limit(int cpu, unsigned int *limit);
#else
static inline void acpi_processor_ppc_init(void)
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index e571128ad99a..09807c2ce328 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -238,6 +238,8 @@ void arch_cpu_idle_dead(void);
int cpu_report_state(int cpu);
int cpu_check_up_prepare(int cpu);
void cpu_set_state_online(int cpu);
+void play_idle(unsigned long duration_ms);
+
#ifdef CONFIG_HOTPLUG_CPU
bool cpu_wait_death(unsigned int cpu, int seconds);
bool cpu_report_death(void);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 32dc0cbd51ca..7e05c5e4e45c 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -175,7 +175,7 @@ void disable_cpufreq(void);
u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
-int cpufreq_update_policy(unsigned int cpu);
+void cpufreq_update_policy(unsigned int cpu);
bool have_governor_per_policy(void);
struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
@@ -234,6 +234,10 @@ __ATTR(_name, _perm, show_##_name, NULL)
static struct freq_attr _name = \
__ATTR(_name, 0644, show_##_name, store_##_name)
+#define cpufreq_freq_attr_wo(_name) \
+static struct freq_attr _name = \
+__ATTR(_name, 0200, NULL, store_##_name)
+
struct global_attr {
struct attribute attr;
ssize_t (*show)(struct kobject *kobj,
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb31373c3478..da346f2817a8 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -74,6 +74,7 @@ struct cpuidle_driver_kobj;
struct cpuidle_device {
unsigned int registered:1;
unsigned int enabled:1;
+ unsigned int use_deepest_state:1;
unsigned int cpu;
int last_residency;
@@ -192,11 +193,12 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; }
#endif
-#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND)
+#ifdef CONFIG_CPU_IDLE
extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
struct cpuidle_device *dev);
extern int cpuidle_enter_freeze(struct cpuidle_driver *drv,
struct cpuidle_device *dev);
+extern void cpuidle_use_deepest_state(bool enable);
#else
static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
struct cpuidle_device *dev)
@@ -204,6 +206,9 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv,
struct cpuidle_device *dev)
{return -ENODEV; }
+static inline void cpuidle_use_deepest_state(bool enable)
+{
+}
#endif
/* kernel/sched/idle.c */
@@ -235,8 +240,6 @@ struct cpuidle_governor {
int (*select) (struct cpuidle_driver *drv,
struct cpuidle_device *dev);
void (*reflect) (struct cpuidle_device *dev, int index);
-
- struct module *owner;
};
#ifdef CONFIG_CPU_IDLE
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index a09fe5c009c8..81ece61075df 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -15,11 +15,11 @@
#include <linux/err.h>
#include <linux/of.h>
#include <linux/notifier.h>
+#include <linux/spinlock.h>
/* Defines used for the flags field in the struct generic_pm_domain */
#define GENPD_FLAG_PM_CLK (1U << 0) /* PM domain uses PM clk */
-
-#define GENPD_MAX_NUM_STATES 8 /* Number of possible low power states */
+#define GENPD_FLAG_IRQ_SAFE (1U << 1) /* PM domain operates in atomic */
enum gpd_status {
GPD_STATE_ACTIVE = 0, /* PM domain is active */
@@ -40,15 +40,18 @@ struct gpd_dev_ops {
struct genpd_power_state {
s64 power_off_latency_ns;
s64 power_on_latency_ns;
+ s64 residency_ns;
+ struct fwnode_handle *fwnode;
};
+struct genpd_lock_ops;
+
struct generic_pm_domain {
struct dev_pm_domain domain; /* PM domain operations */
struct list_head gpd_list_node; /* Node in the global PM domains list */
struct list_head master_links; /* Links with PM domain as a master */
struct list_head slave_links; /* Links with PM domain as a slave */
struct list_head dev_list; /* List of devices */
- struct mutex lock;
struct dev_power_governor *gov;
struct work_struct power_off_work;
struct fwnode_handle *provider; /* Identity of the domain provider */
@@ -70,9 +73,18 @@ struct generic_pm_domain {
void (*detach_dev)(struct generic_pm_domain *domain,
struct device *dev);
unsigned int flags; /* Bit field of configs for genpd */
- struct genpd_power_state states[GENPD_MAX_NUM_STATES];
+ struct genpd_power_state *states;
unsigned int state_count; /* number of states */
unsigned int state_idx; /* state that genpd will go to when off */
+ void *free; /* Free the state that was allocated for default */
+ const struct genpd_lock_ops *lock_ops;
+ union {
+ struct mutex mlock;
+ struct {
+ spinlock_t slock;
+ unsigned long lock_flags;
+ };
+ };
};
@@ -205,6 +217,8 @@ extern int of_genpd_add_device(struct of_phandle_args *args,
extern int of_genpd_add_subdomain(struct of_phandle_args *parent,
struct of_phandle_args *new_subdomain);
extern struct generic_pm_domain *of_genpd_remove_last(struct device_node *np);
+extern int of_genpd_parse_idle_states(struct device_node *dn,
+ struct genpd_power_state **states, int *n);
int genpd_dev_pm_attach(struct device *dev);
#else /* !CONFIG_PM_GENERIC_DOMAINS_OF */
@@ -234,6 +248,12 @@ static inline int of_genpd_add_subdomain(struct of_phandle_args *parent,
return -ENODEV;
}
+static inline int of_genpd_parse_idle_states(struct device_node *dn,
+ struct genpd_power_state **states, int *n)
+{
+ return -ENODEV;
+}
+
static inline int genpd_dev_pm_attach(struct device *dev)
{
return -ENODEV;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index bca26157f5b6..0edd88f93904 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -17,13 +17,65 @@
#include <linux/err.h>
#include <linux/notifier.h>
+struct clk;
+struct regulator;
struct dev_pm_opp;
struct device;
+struct opp_table;
enum dev_pm_opp_event {
OPP_EVENT_ADD, OPP_EVENT_REMOVE, OPP_EVENT_ENABLE, OPP_EVENT_DISABLE,
};
+/**
+ * struct dev_pm_opp_supply - Power supply voltage/current values
+ * @u_volt: Target voltage in microvolts corresponding to this OPP
+ * @u_volt_min: Minimum voltage in microvolts corresponding to this OPP
+ * @u_volt_max: Maximum voltage in microvolts corresponding to this OPP
+ * @u_amp: Maximum current drawn by the device in microamperes
+ *
+ * This structure stores the voltage/current values for a single power supply.
+ */
+struct dev_pm_opp_supply {
+ unsigned long u_volt;
+ unsigned long u_volt_min;
+ unsigned long u_volt_max;
+ unsigned long u_amp;
+};
+
+/**
+ * struct dev_pm_opp_info - OPP freq/voltage/current values
+ * @rate: Target clk rate in hz
+ * @supplies: Array of voltage/current values for all power supplies
+ *
+ * This structure stores the freq/voltage/current values for a single OPP.
+ */
+struct dev_pm_opp_info {
+ unsigned long rate;
+ struct dev_pm_opp_supply *supplies;
+};
+
+/**
+ * struct dev_pm_set_opp_data - Set OPP data
+ * @old_opp: Old OPP info
+ * @new_opp: New OPP info
+ * @regulators: Array of regulator pointers
+ * @regulator_count: Number of regulators
+ * @clk: Pointer to clk
+ * @dev: Pointer to the struct device
+ *
+ * This structure contains all information required for setting an OPP.
+ */
+struct dev_pm_set_opp_data {
+ struct dev_pm_opp_info old_opp;
+ struct dev_pm_opp_info new_opp;
+
+ struct regulator **regulators;
+ unsigned int regulator_count;
+ struct clk *clk;
+ struct device *dev;
+};
+
#if defined(CONFIG_PM_OPP)
unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp);
@@ -62,8 +114,10 @@ int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
void dev_pm_opp_put_supported_hw(struct device *dev);
int dev_pm_opp_set_prop_name(struct device *dev, const char *name);
void dev_pm_opp_put_prop_name(struct device *dev);
-int dev_pm_opp_set_regulator(struct device *dev, const char *name);
-void dev_pm_opp_put_regulator(struct device *dev);
+struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
+void dev_pm_opp_put_regulators(struct opp_table *opp_table);
+int dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
+void dev_pm_opp_register_put_opp_helper(struct device *dev);
int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask);
int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask);
@@ -163,6 +217,14 @@ static inline int dev_pm_opp_set_supported_hw(struct device *dev,
static inline void dev_pm_opp_put_supported_hw(struct device *dev) {}
+static inline int dev_pm_opp_register_set_opp_helper(struct device *dev,
+ int (*set_opp)(struct dev_pm_set_opp_data *data))
+{
+ return -ENOTSUPP;
+}
+
+static inline void dev_pm_opp_register_put_opp_helper(struct device *dev) {}
+
static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
{
return -ENOTSUPP;
@@ -170,12 +232,12 @@ static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
static inline void dev_pm_opp_put_prop_name(struct device *dev) {}
-static inline int dev_pm_opp_set_regulator(struct device *dev, const char *name)
+static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count)
{
- return -ENOTSUPP;
+ return ERR_PTR(-ENOTSUPP);
}
-static inline void dev_pm_opp_put_regulator(struct device *dev) {}
+static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {}
static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
{
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 2e14d2667b6c..4957fc185ea9 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -61,12 +61,6 @@ static inline void pm_suspend_ignore_children(struct device *dev, bool enable)
dev->power.ignore_children = enable;
}
-static inline bool pm_children_suspended(struct device *dev)
-{
- return dev->power.ignore_children
- || !atomic_read(&dev->power.child_count);
-}
-
static inline void pm_runtime_get_noresume(struct device *dev)
{
atomic_inc(&dev->power.usage_count);
@@ -162,7 +156,6 @@ static inline void pm_runtime_allow(struct device *dev) {}
static inline void pm_runtime_forbid(struct device *dev) {}
static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {}
-static inline bool pm_children_suspended(struct device *dev) { return false; }
static inline void pm_runtime_get_noresume(struct device *dev) {}
static inline void pm_runtime_put_noidle(struct device *dev) {}
static inline bool device_run_wake(struct device *dev) { return false; }
@@ -265,9 +258,9 @@ static inline int pm_runtime_set_active(struct device *dev)
return __pm_runtime_set_status(dev, RPM_ACTIVE);
}
-static inline void pm_runtime_set_suspended(struct device *dev)
+static inline int pm_runtime_set_suspended(struct device *dev)
{
- __pm_runtime_set_status(dev, RPM_SUSPENDED);
+ return __pm_runtime_set_status(dev, RPM_SUSPENDED);
}
static inline void pm_runtime_disable(struct device *dev)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0e90f2973719..5ccbbfe41345 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2287,6 +2287,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
/*
* Per process flags
*/
+#define PF_IDLE 0x00000002 /* I am an IDLE thread */
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
@@ -2648,7 +2649,7 @@ extern struct task_struct *idle_task(int cpu);
*/
static inline bool is_idle_task(const struct task_struct *p)
{
- return p->pid == 0;
+ return !!(p->flags & PF_IDLE);
}
extern struct task_struct *curr_task(int cpu);
extern void ia64_set_curr_task(int cpu, struct task_struct *p);
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index d9718378a8be..0c729c3c8549 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -194,6 +194,8 @@ struct platform_freeze_ops {
};
#ifdef CONFIG_SUSPEND
+extern suspend_state_t mem_sleep_default;
+
/**
* suspend_set_ops - set platform dependent suspend operations
* @ops: The new suspend operations to set.
diff --git a/kernel/fork.c b/kernel/fork.c
index 7377f414f3ce..a439ac429669 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1544,7 +1544,7 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 281a697fd458..d401c21136d1 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -78,6 +78,78 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(pm_async);
+#ifdef CONFIG_SUSPEND
+static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ char *s = buf;
+ suspend_state_t i;
+
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ if (mem_sleep_states[i]) {
+ const char *label = mem_sleep_states[i];
+
+ if (mem_sleep_current == i)
+ s += sprintf(s, "[%s] ", label);
+ else
+ s += sprintf(s, "%s ", label);
+ }
+
+ /* Convert the last space to a newline if needed. */
+ if (s != buf)
+ *(s-1) = '\n';
+
+ return (s - buf);
+}
+
+static suspend_state_t decode_suspend_state(const char *buf, size_t n)
+{
+ suspend_state_t state;
+ char *p;
+ int len;
+
+ p = memchr(buf, '\n', n);
+ len = p ? p - buf : n;
+
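+ /* Match the newline-trimmed input against the advertised state labels */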
+ for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
+ const char *label = mem_sleep_states[state];
+
+ if (label && len == strlen(label) && !strncmp(buf, label, len))
+ return state;
+ }
+
+ return PM_SUSPEND_ON;
+}
+
+static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ suspend_state_t state;
+ int error;
+
+ error = pm_autosleep_lock();
+ if (error)
+ return error;
+
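+ /* Reject writes while autosleep is active to avoid racing a state change */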
+ if (pm_autosleep_state() > PM_SUSPEND_ON) {
+ error = -EBUSY;
+ goto out;
+ }
+
+ state = decode_suspend_state(buf, n);
+ if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON)
+ mem_sleep_current = state;
+ else
+ error = -EINVAL;
+
+ out:
+ pm_autosleep_unlock();
+ return error ? error : n;
+}
+
+power_attr(mem_sleep);
+#endif /* CONFIG_SUSPEND */
+
#ifdef CONFIG_PM_DEBUG
int pm_test_level = TEST_NONE;
@@ -368,12 +440,16 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
}
state = decode_state(buf, n);
- if (state < PM_SUSPEND_MAX)
+ if (state < PM_SUSPEND_MAX) {
+ if (state == PM_SUSPEND_MEM)
+ state = mem_sleep_current;
+
error = pm_suspend(state);
- else if (state == PM_SUSPEND_MAX)
+ } else if (state == PM_SUSPEND_MAX) {
error = hibernate();
- else
+ } else {
error = -EINVAL;
+ }
out:
pm_autosleep_unlock();
@@ -485,6 +561,9 @@ static ssize_t autosleep_store(struct kobject *kobj,
&& strcmp(buf, "off") && strcmp(buf, "off\n"))
return -EINVAL;
+ if (state == PM_SUSPEND_MEM)
+ state = mem_sleep_current;
+
error = pm_autosleep_set_state(state);
return error ? error : n;
}
@@ -602,6 +681,9 @@ static struct attribute * g[] = {
#ifdef CONFIG_PM_SLEEP
&pm_async_attr.attr,
&wakeup_count_attr.attr,
+#ifdef CONFIG_SUSPEND
+ &mem_sleep_attr.attr,
+#endif
#ifdef CONFIG_PM_AUTOSLEEP
&autosleep_attr.attr,
#endif
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 56d1d0dedf76..1dfa0da827d3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -189,11 +189,15 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
#ifdef CONFIG_SUSPEND
/* kernel/power/suspend.c */
-extern const char *pm_labels[];
+extern const char * const pm_labels[];
extern const char *pm_states[];
+extern const char *mem_sleep_states[];
+extern suspend_state_t mem_sleep_current;
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
+#define mem_sleep_current PM_SUSPEND_ON
+
static inline int suspend_devices_and_enter(suspend_state_t state)
{
return -ENOSYS;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6ccb08f57fcb..f67ceb7768b8 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -32,8 +32,21 @@
#include "power.h"
-const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
+const char * const pm_labels[] = {
+ [PM_SUSPEND_FREEZE] = "freeze",
+ [PM_SUSPEND_STANDBY] = "standby",
+ [PM_SUSPEND_MEM] = "mem",
+};
const char *pm_states[PM_SUSPEND_MAX];
+static const char * const mem_sleep_labels[] = {
+ [PM_SUSPEND_FREEZE] = "s2idle",
+ [PM_SUSPEND_STANDBY] = "shallow",
+ [PM_SUSPEND_MEM] = "deep",
+};
+const char *mem_sleep_states[PM_SUSPEND_MAX];
+
+suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE;
+suspend_state_t mem_sleep_default = PM_SUSPEND_MAX;
unsigned int pm_suspend_global_flags;
EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
@@ -110,30 +123,32 @@ static bool valid_state(suspend_state_t state)
return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
}
-/*
- * If this is set, the "mem" label always corresponds to the deepest sleep state
- * available, the "standby" label corresponds to the second deepest sleep state
- * available (if any), and the "freeze" label corresponds to the remaining
- * available sleep state (if there is one).
- */
-static bool relative_states;
-
void __init pm_states_init(void)
{
+ /* "mem" and "freeze" are always present in /sys/power/state. */
+ pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM];
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE];
/*
- * freeze state should be supported even without any suspend_ops,
- * initialize pm_states accordingly here
+ * Suspend-to-idle should be supported even without any suspend_ops,
+ * initialize mem_sleep_states[] accordingly here.
*/
- pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
+ mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE];
}
-static int __init sleep_states_setup(char *str)
+static int __init mem_sleep_default_setup(char *str)
{
- relative_states = !strncmp(str, "1", 1);
+ suspend_state_t state;
+
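+ /* Map the mem_sleep_default= parameter onto a known sleep-state label */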
+ for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++)
+ if (mem_sleep_labels[state] &&
+ !strcmp(str, mem_sleep_labels[state])) {
+ mem_sleep_default = state;
+ break;
+ }
+
return 1;
}
-
-__setup("relative_sleep_states=", sleep_states_setup);
+__setup("mem_sleep_default=", mem_sleep_default_setup);
/**
* suspend_set_ops - Set the global suspend method table.
@@ -141,21 +156,21 @@ __setup("relative_sleep_states=", sleep_states_setup);
*/
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
- suspend_state_t i;
- int j = 0;
-
lock_system_sleep();
suspend_ops = ops;
- for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
- if (valid_state(i)) {
- pm_states[i] = pm_labels[j++];
- } else if (!relative_states) {
- pm_states[i] = NULL;
- j++;
- }
- pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
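+ /* Advertise only the sleep states the platform hooks actually support */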
+ if (valid_state(PM_SUSPEND_STANDBY)) {
+ mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY];
+ pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY];
+ if (mem_sleep_default == PM_SUSPEND_STANDBY)
+ mem_sleep_current = PM_SUSPEND_STANDBY;
+ }
+ if (valid_state(PM_SUSPEND_MEM)) {
+ mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
+ if (mem_sleep_default >= PM_SUSPEND_MEM)
+ mem_sleep_current = PM_SUSPEND_MEM;
+ }
unlock_system_sleep();
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d18804491d9f..966556ebdbb3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5280,6 +5280,7 @@ void init_idle(struct task_struct *idle, int cpu)
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
+ idle->flags |= PF_IDLE;
kasan_unpoison_task_stack(idle);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 69e06898997d..fd4659313640 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,11 +12,14 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cpufreq.h>
+#include <linux/kthread.h>
#include <linux/slab.h>
#include <trace/events/power.h>
#include "sched.h"
+#define SUGOV_KTHREAD_PRIORITY 50
+
struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
@@ -35,8 +38,10 @@ struct sugov_policy {
/* The next fields are only needed if fast switch cannot be used. */
struct irq_work irq_work;
- struct work_struct work;
+ struct kthread_work work;
struct mutex work_lock;
+ struct kthread_worker worker;
+ struct task_struct *thread;
bool work_in_progress;
bool need_freq_update;
@@ -291,7 +296,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
raw_spin_unlock(&sg_policy->update_lock);
}
-static void sugov_work(struct work_struct *work)
+static void sugov_work(struct kthread_work *work)
{
struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
@@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work)
struct sugov_policy *sg_policy;
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
- schedule_work_on(smp_processor_id(), &sg_policy->work);
+
+ /*
+ * For RT and deadline tasks, the schedutil governor shoots the
+ * frequency to maximum. Special care must be taken to ensure that this
+ * kthread doesn't result in the same behavior.
+ *
+ * This is (mostly) guaranteed by the work_in_progress flag. The flag is
+ * updated only at the end of the sugov_work() function and before that
+ * the schedutil governor rejects all other frequency scaling requests.
+ *
+ * There is a very rare case though, where the RT thread yields right
+ * after the work_in_progress flag is cleared. The effects of that are
+ * neglected for now.
+ */
+ kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}
/************************** sysfs interface ************************/
@@ -371,19 +390,64 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
return NULL;
sg_policy->policy = policy;
- init_irq_work(&sg_policy->irq_work, sugov_irq_work);
- INIT_WORK(&sg_policy->work, sugov_work);
- mutex_init(&sg_policy->work_lock);
raw_spin_lock_init(&sg_policy->update_lock);
return sg_policy;
}
static void sugov_policy_free(struct sugov_policy *sg_policy)
{
- mutex_destroy(&sg_policy->work_lock);
kfree(sg_policy);
}
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+ struct task_struct *thread;
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+ struct cpufreq_policy *policy = sg_policy->policy;
+ int ret;
+
+ /* kthread only required for slow path */
+ if (policy->fast_switch_enabled)
+ return 0;
+
+ kthread_init_work(&sg_policy->work, sugov_work);
+ kthread_init_worker(&sg_policy->worker);
+ thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+ "sugov:%d",
+ cpumask_first(policy->related_cpus));
+ if (IS_ERR(thread)) {
+ pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ return PTR_ERR(thread);
+ }
+
+ ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+ if (ret) {
+ kthread_stop(thread);
+ pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ return ret;
+ }
+
+ sg_policy->thread = thread;
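+ /* Keep the worker on the policy's CPUs so slow-path updates run locally */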
+ kthread_bind_mask(thread, policy->related_cpus);
+ init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+ mutex_init(&sg_policy->work_lock);
+
+ wake_up_process(thread);
+
+ return 0;
+}
+
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+ /* kthread only required for slow path */
+ if (sg_policy->policy->fast_switch_enabled)
+ return;
+
+ kthread_flush_worker(&sg_policy->worker);
+ kthread_stop(sg_policy->thread);
+ mutex_destroy(&sg_policy->work_lock);
+}
+
static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
struct sugov_tunables *tunables;
@@ -416,16 +480,24 @@ static int sugov_init(struct cpufreq_policy *policy)
if (policy->governor_data)
return -EBUSY;
+ cpufreq_enable_fast_switch(policy);
+
sg_policy = sugov_policy_alloc(policy);
- if (!sg_policy)
- return -ENOMEM;
+ if (!sg_policy) {
+ ret = -ENOMEM;
+ goto disable_fast_switch;
+ }
+
+ ret = sugov_kthread_create(sg_policy);
+ if (ret)
+ goto free_sg_policy;
mutex_lock(&global_tunables_lock);
if (global_tunables) {
if (WARN_ON(have_governor_per_policy())) {
ret = -EINVAL;
- goto free_sg_policy;
+ goto stop_kthread;
}
policy->governor_data = sg_policy;
sg_policy->tunables = global_tunables;
@@ -437,7 +509,7 @@ static int sugov_init(struct cpufreq_policy *policy)
tunables = sugov_tunables_alloc(sg_policy);
if (!tunables) {
ret = -ENOMEM;
- goto free_sg_policy;
+ goto stop_kthread;
}
tunables->rate_limit_us = LATENCY_MULTIPLIER;
@@ -454,20 +526,25 @@ static int sugov_init(struct cpufreq_policy *policy)
if (ret)
goto fail;
- out:
+out:
mutex_unlock(&global_tunables_lock);
-
- cpufreq_enable_fast_switch(policy);
return 0;
- fail:
+fail:
policy->governor_data = NULL;
sugov_tunables_free(tunables);
- free_sg_policy:
+stop_kthread:
+ sugov_kthread_stop(sg_policy);
+
+free_sg_policy:
mutex_unlock(&global_tunables_lock);
sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+ cpufreq_disable_fast_switch(policy);
+
pr_err("initialization failed (error %d)\n", ret);
return ret;
}
@@ -478,8 +555,6 @@ static void sugov_exit(struct cpufreq_policy *policy)
struct sugov_tunables *tunables = sg_policy->tunables;
unsigned int count;
- cpufreq_disable_fast_switch(policy);
-
mutex_lock(&global_tunables_lock);
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
@@ -489,7 +564,9 @@ static void sugov_exit(struct cpufreq_policy *policy)
mutex_unlock(&global_tunables_lock);
+ sugov_kthread_stop(sg_policy);
sugov_policy_free(sg_policy);
+ cpufreq_disable_fast_switch(policy);
}
static int sugov_start(struct cpufreq_policy *policy)
@@ -535,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy)
synchronize_sched();
- irq_work_sync(&sg_policy->irq_work);
- cancel_work_sync(&sg_policy->work);
+ if (!policy->fast_switch_enabled) {
+ irq_work_sync(&sg_policy->irq_work);
+ kthread_cancel_work_sync(&sg_policy->work);
+ }
}
static void sugov_limits(struct cpufreq_policy *policy)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1d8718d5300d..6a4bae0a649d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -164,11 +164,14 @@ static void cpuidle_idle_call(void)
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
*/
- if (idle_should_freeze()) {
- entered_state = cpuidle_enter_freeze(drv, dev);
- if (entered_state > 0) {
- local_irq_enable();
- goto exit_idle;
+
+ if (idle_should_freeze() || dev->use_deepest_state) {
+ if (idle_should_freeze()) {
+ entered_state = cpuidle_enter_freeze(drv, dev);
+ if (entered_state > 0) {
+ local_irq_enable();
+ goto exit_idle;
+ }
}
next_state = cpuidle_find_deepest_state(drv, dev);
@@ -202,76 +205,65 @@ exit_idle:
*
* Called with polling cleared.
*/
-static void cpu_idle_loop(void)
+static void do_idle(void)
{
- int cpu = smp_processor_id();
-
- while (1) {
- /*
- * If the arch has a polling bit, we maintain an invariant:
- *
- * Our polling bit is clear if we're not scheduled (i.e. if
- * rq->curr != rq->idle). This means that, if rq->idle has
- * the polling bit set, then setting need_resched is
- * guaranteed to cause the cpu to reschedule.
- */
-
- __current_set_polling();
- quiet_vmstat();
- tick_nohz_idle_enter();
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+ * rq->idle). This means that, if rq->idle has the polling bit set,
+ * then setting need_resched is guaranteed to cause the CPU to
+ * reschedule.
+ */
- while (!need_resched()) {
- check_pgt_cache();
- rmb();
+ __current_set_polling();
+ tick_nohz_idle_enter();
- if (cpu_is_offline(cpu)) {
- cpuhp_report_idle_dead();
- arch_cpu_idle_dead();
- }
+ while (!need_resched()) {
+ check_pgt_cache();
+ rmb();
- local_irq_disable();
- arch_cpu_idle_enter();
-
- /*
- * In poll mode we reenable interrupts and spin.
- *
- * Also if we detected in the wakeup from idle
- * path that the tick broadcast device expired
- * for us, we don't want to go deep idle as we
- * know that the IPI is going to arrive right
- * away
- */
- if (cpu_idle_force_poll || tick_check_broadcast_expired())
- cpu_idle_poll();
- else
- cpuidle_idle_call();
-
- arch_cpu_idle_exit();
+ if (cpu_is_offline(smp_processor_id())) {
+ cpuhp_report_idle_dead();
+ arch_cpu_idle_dead();
}
- /*
- * Since we fell out of the loop above, we know
- * TIF_NEED_RESCHED must be set, propagate it into
- * PREEMPT_NEED_RESCHED.
- *
- * This is required because for polling idle loops we will
- * not have had an IPI to fold the state for us.
- */
- preempt_set_need_resched();
- tick_nohz_idle_exit();
- __current_clr_polling();
+ local_irq_disable();
+ arch_cpu_idle_enter();
/*
- * We promise to call sched_ttwu_pending and reschedule
- * if need_resched is set while polling is set. That
- * means that clearing polling needs to be visible
- * before doing these things.
+ * In poll mode we reenable interrupts and spin. Also if we
+ * detected in the wakeup from idle path that the tick
+ * broadcast device expired for us, we don't want to go deep
+ * idle as we know that the IPI is going to arrive right away.
*/
- smp_mb__after_atomic();
-
- sched_ttwu_pending();
- schedule_preempt_disabled();
+ if (cpu_idle_force_poll || tick_check_broadcast_expired())
+ cpu_idle_poll();
+ else
+ cpuidle_idle_call();
+ arch_cpu_idle_exit();
}
+
+ /*
+ * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
+ * be set, propagate it into PREEMPT_NEED_RESCHED.
+ *
+ * This is required because for polling idle loops we will not have had
+ * an IPI to fold the state for us.
+ */
+ preempt_set_need_resched();
+ tick_nohz_idle_exit();
+ __current_clr_polling();
+
+ /*
+ * We promise to call sched_ttwu_pending() and reschedule if
+ * need_resched() is set while polling is set. That means that clearing
+ * polling needs to be visible before doing these things.
+ */
+ smp_mb__after_atomic();
+
+ sched_ttwu_pending();
+ schedule_preempt_disabled();
}
bool cpu_in_idle(unsigned long pc)
@@ -280,6 +272,56 @@ bool cpu_in_idle(unsigned long pc)
pc < (unsigned long)__cpuidle_text_end;
}
+struct idle_timer {
+ struct hrtimer timer;
+ int done;
+};
+
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
+{
+ struct idle_timer *it = container_of(timer, struct idle_timer, timer);
+
+ WRITE_ONCE(it->done, 1);
+ set_tsk_need_resched(current);
+
+ return HRTIMER_NORESTART;
+}
+
+void play_idle(unsigned long duration_ms)
+{
+ struct idle_timer it;
+
+ /*
+ * Only FIFO tasks can disable the tick since they don't need the forced
+ * preemption.
+ */
+ WARN_ON_ONCE(current->policy != SCHED_FIFO);
+ WARN_ON_ONCE(current->nr_cpus_allowed != 1);
+ WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+ WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
+ WARN_ON_ONCE(!duration_ms);
+
+ rcu_sleep_check();
+ preempt_disable();
+ current->flags |= PF_IDLE;
+ cpuidle_use_deepest_state(true);
+
+ it.done = 0;
+ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ it.timer.function = idle_inject_timer_fn;
+ hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
+
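+ /* Spin in the idle loop until the pinned timer flags the period as done */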
+ while (!READ_ONCE(it.done))
+ do_idle();
+
+ cpuidle_use_deepest_state(false);
+ current->flags &= ~PF_IDLE;
+
+ preempt_fold_need_resched();
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(play_idle);
+
void cpu_startup_entry(enum cpuhp_state state)
{
/*
@@ -299,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state)
#endif
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
- cpu_idle_loop();
+ while (1)
+ do_idle();
}
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 0e9505f66ec1..b2a0cff2bb35 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -80,7 +80,14 @@ void kasan_unpoison_task_stack(struct task_struct *task)
/* Unpoison the stack for the current task beyond a watermark sp value. */
asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
{
- __kasan_unpoison_stack(current, watermark);
+ /*
+ * Calculate the task stack base address. Avoid using 'current'
+ * because this function is called by early resume code which hasn't
+ * yet set up the percpu register (%gs).
+ */
+ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
+
+ kasan_unpoison_shadow(base, watermark - base);
}
/*