habanalabs/gaudi2: new API to control engine cores running mode

The current flow of halting the engine cores is implemented by command buffers built by the user space and sent towards the Driver. This current flow is broken since the user space does not know when the cores actually halt as sending a workload is async op. Therefore the application can not free the memory that is mapped to the engine cores. This new API allows the user space to control the running mode. The API call is sync (returns after the cores are set to the requested mode). Signed-off-by: Tal Cohen <talcohen@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
author: Tal Cohen 2022-07-07 18:42:47 +0300
committer: Oded Gabbay 2022-09-18 13:29:51 +0300
commit: 194e515c79462f1ad09ebcc9e01a3acb84a98d82 (patch)
tree: 507354dadb1169d90633792bf2a20f10d18c206a /drivers/misc/habanalabs
parent: 07056f58e43319902cd1072c00df2846b31e14b8 (diff)
4 files changed, 117 insertions, 7 deletions
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 304e4f3b0e7e..cf4118515678 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -13,7 +13,7 @@
 
 #define HL_CS_FLAGS_TYPE_MASK	(HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \
 			HL_CS_FLAGS_COLLECTIVE_WAIT | HL_CS_FLAGS_RESERVE_SIGNALS_ONLY | \
-			HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
+			HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY | HL_CS_FLAGS_ENGINE_CORE_COMMAND)
 
 
 #define MAX_TS_ITER_NUM 10
@@ -1244,6 +1244,8 @@ static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
 		return CS_RESERVE_SIGNALS;
 	else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
 		return CS_UNRESERVE_SIGNALS;
+	else if (cs_type_flags & HL_CS_FLAGS_ENGINE_CORE_COMMAND)
+		return CS_TYPE_ENGINE_CORE;
 	else
 		return CS_TYPE_DEFAULT;
 }
@@ -2355,6 +2357,41 @@ out:
 	return rc;
 }
 
+static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores,
+						u32 num_engine_cores, u32 core_command)
+{
+	int rc;
+	struct hl_device *hdev = hpriv->hdev;
+	void __user *engine_cores_arr;
+	u32 *cores;
+
+	if (!num_engine_cores || num_engine_cores > hdev->asic_prop.num_engine_cores) {
+		dev_err(hdev->dev, "Number of engine cores %d is invalid\n", num_engine_cores);
+		return -EINVAL;
+	}
+
+	if (core_command != HL_ENGINE_CORE_RUN && core_command != HL_ENGINE_CORE_HALT) {
+		dev_err(hdev->dev, "Engine core command is invalid\n");
+		return -EINVAL;
+	}
+
+	engine_cores_arr = (void __user *) (uintptr_t) engine_cores;
+	cores = kmalloc_array(num_engine_cores, sizeof(u32), GFP_KERNEL);
+	if (!cores)
+		return -ENOMEM;
+
+	if (copy_from_user(cores, engine_cores_arr, num_engine_cores * sizeof(u32))) {
+		dev_err(hdev->dev, "Failed to copy core-ids array from user\n");
+		kfree(cores);
+		return -EFAULT;
+	}
+
+	rc = hdev->asic_funcs->set_engine_cores(hdev, cores, num_engine_cores, core_command);
+	kfree(cores);
+
+	return rc;
+}
+
 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 {
 	union hl_cs_args *args = data;
@@ -2407,6 +2444,10 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 		rc = cs_ioctl_unreserve_signals(hpriv,
 					args->in.encaps_sig_handle_id);
 		break;
+	case CS_TYPE_ENGINE_CORE:
+		rc = cs_ioctl_engine_cores(hpriv, args->in.engine_cores,
+				args->in.num_engine_cores, args->in.core_command);
+		break;
 	default:
 		rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
 						args->in.cs_flags,
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 8d9e96c6092a..ae3f5832fe58 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -345,7 +345,8 @@ enum hl_cs_type {
 	CS_TYPE_WAIT,
 	CS_TYPE_COLLECTIVE_WAIT,
 	CS_RESERVE_SIGNALS,
-	CS_UNRESERVE_SIGNALS
+	CS_UNRESERVE_SIGNALS,
+	CS_TYPE_ENGINE_CORE
 };
 
 /*
@@ -617,6 +618,7 @@ struct hl_hints_range {
  *                                      which the property supports_user_set_page_size is true
  *                                      (i.e. the DRAM supports multiple page sizes), otherwise
  *                                      it will shall  be equal to dram_page_size.
+ * @num_engine_cores: number of engine cpu cores
  * @collective_first_sob: first sync object available for collective use
  * @collective_first_mon: first monitor available for collective use
  * @sync_stream_first_sob: first sync object available for sync stream use
@@ -737,6 +739,7 @@ struct asic_fixed_properties {
 	u32				faulty_dram_cluster_map;
 	u32				xbar_edge_enabled_mask;
 	u32				device_mem_alloc_default_page_size;
+	u32				num_engine_cores;
 	u16				collective_first_sob;
 	u16				collective_first_mon;
 	u16				sync_stream_first_sob;
@@ -1511,6 +1514,7 @@ struct engines_data {
  * @check_if_razwi_happened: check if there was a razwi due to RR violation.
  * @access_dev_mem: access device memory
  * @set_dram_bar_base: set the base of the DRAM BAR
+ * @set_engine_cores: set a config command to enigne cores
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -1645,6 +1649,8 @@ struct hl_asic_funcs {
 	int (*access_dev_mem)(struct hl_device *hdev, enum pci_region region_type,
 				u64 addr, u64 *val, enum debugfs_access_type acc_type);
 	u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr);
+	int (*set_engine_cores)(struct hl_device *hdev, u32 *core_ids,
+					u32 num_cores, u32 core_command);
 };
 
 
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 9ccde0258eca..676419961f86 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -1989,6 +1989,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
 		prop->pmmu_huge.end_addr = VA_HOST_SPACE_HPAGE_END;
 	}
 
+	prop->num_engine_cores = CPU_ID_MAX;
 	prop->cfg_size = CFG_SIZE;
 	prop->max_asid = MAX_ASID;
 	prop->num_of_events = GAUDI2_EVENT_SIZE;
@@ -3751,14 +3752,16 @@ static void gaudi2_stop_dec(struct hl_device *hdev)
 	gaudi2_stop_pcie_dec(hdev);
 }
 
-static void gaudi2_halt_arc(struct hl_device *hdev, u32 cpu_id)
+static void gaudi2_set_arc_running_mode(struct hl_device *hdev, u32 cpu_id, u32 run_mode)
 {
 	u32 reg_base, reg_val;
 
 	reg_base = gaudi2_arc_blocks_bases[cpu_id];
+	if (run_mode == HL_ENGINE_CORE_RUN)
+		reg_val = FIELD_PREP(ARC_FARM_ARC0_AUX_RUN_HALT_REQ_RUN_REQ_MASK, 1);
+	else
+		reg_val = FIELD_PREP(ARC_FARM_ARC0_AUX_RUN_HALT_REQ_HALT_REQ_MASK, 1);
 
-	/* Halt ARC */
-	reg_val = FIELD_PREP(ARC_FARM_ARC0_AUX_RUN_HALT_REQ_HALT_REQ_MASK, 1);
 	WREG32(reg_base + ARC_HALT_REQ_OFFSET, reg_val);
 }
 
@@ -3768,10 +3771,37 @@ static void gaudi2_halt_arcs(struct hl_device *hdev)
 
 	for (arc_id = CPU_ID_SCHED_ARC0; arc_id < CPU_ID_MAX; arc_id++) {
 		if (gaudi2_is_arc_enabled(hdev, arc_id))
-			gaudi2_halt_arc(hdev, arc_id);
+			gaudi2_set_arc_running_mode(hdev, arc_id, HL_ENGINE_CORE_HALT);
 	}
 }
 
+static int gaudi2_verify_arc_running_mode(struct hl_device *hdev, u32 cpu_id, u32 run_mode)
+{
+	int rc;
+	u32 reg_base, val, ack_mask, timeout_usec = 100000;
+
+	if (hdev->pldm)
+		timeout_usec *= 100;
+
+	reg_base = gaudi2_arc_blocks_bases[cpu_id];
+	if (run_mode == HL_ENGINE_CORE_RUN)
+		ack_mask = ARC_FARM_ARC0_AUX_RUN_HALT_ACK_RUN_ACK_MASK;
+	else
+		ack_mask = ARC_FARM_ARC0_AUX_RUN_HALT_ACK_HALT_ACK_MASK;
+
+	rc = hl_poll_timeout(hdev, reg_base + ARC_HALT_ACK_OFFSET,
+				val, ((val & ack_mask) == ack_mask),
+				1000, timeout_usec);
+
+	if (!rc) {
+		/* Clear */
+		val = FIELD_PREP(ARC_FARM_ARC0_AUX_RUN_HALT_REQ_RUN_REQ_MASK, 0);
+		WREG32(reg_base + ARC_HALT_REQ_OFFSET, val);
+	}
+
+	return rc;
+}
+
 static void gaudi2_reset_arcs(struct hl_device *hdev)
 {
 	struct gaudi2_device *gaudi2 = hdev->asic_specific;
@@ -3796,8 +3826,39 @@ static void gaudi2_nic_qmans_manual_flush(struct hl_device *hdev)
 
 	queue_id = GAUDI2_QUEUE_ID_NIC_0_0;
 
-	for (i = 0 ; i < NIC_NUMBER_OF_ENGINES ; i++, queue_id += NUM_OF_PQ_PER_QMAN)
+	for (i = 0 ; i < NIC_NUMBER_OF_ENGINES ; i++, queue_id += NUM_OF_PQ_PER_QMAN) {
+		if (!(hdev->nic_ports_mask & BIT(i)))
+			continue;
+
 		gaudi2_qman_manual_flush_common(hdev, queue_id);
+	}
+}
+
+static int gaudi2_set_engine_cores(struct hl_device *hdev, u32 *core_ids,
+					u32 num_cores, u32 core_command)
+{
+	int i, rc;
+
+
+	for (i = 0 ; i < num_cores ; i++) {
+		if (gaudi2_is_arc_enabled(hdev, core_ids[i]))
+			gaudi2_set_arc_running_mode(hdev, core_ids[i], core_command);
+	}
+
+	for (i = 0 ; i < num_cores ; i++) {
+		if (gaudi2_is_arc_enabled(hdev, core_ids[i])) {
+			rc = gaudi2_verify_arc_running_mode(hdev, core_ids[i], core_command);
+
+			if (rc) {
+				dev_err(hdev->dev, "failed to %s arc: %d\n",
+					(core_command == HL_ENGINE_CORE_HALT) ?
+					"HALT" : "RUN", core_ids[i]);
+				return -1;
+			}
+		}
+	}
+
+	return 0;
 }
 
 static void gaudi2_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset)
@@ -9968,6 +10029,7 @@ static const struct hl_asic_funcs gaudi2_funcs = {
 	.mmu_get_real_page_size = gaudi2_mmu_get_real_page_size,
 	.access_dev_mem = hl_access_dev_mem,
 	.set_dram_bar_base = gaudi2_set_hbm_bar_base,
+	.set_engine_cores = gaudi2_set_engine_cores,
 };
 
 void gaudi2_set_asic_funcs(struct hl_device *hdev)
diff --git a/drivers/misc/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/misc/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
index d0e2c68a639f..bfda4223bdc8 100644
--- a/drivers/misc/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
+++ b/drivers/misc/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
@@ -239,6 +239,7 @@
 #define SFT_IF_RTR_OFFSET	(mmSFT0_HBW_RTR_IF1_RTR_H3_BASE - mmSFT0_HBW_RTR_IF0_RTR_H3_BASE)
 
 #define ARC_HALT_REQ_OFFSET	(mmARC_FARM_ARC0_AUX_RUN_HALT_REQ - mmARC_FARM_ARC0_AUX_BASE)
+#define ARC_HALT_ACK_OFFSET	(mmARC_FARM_ARC0_AUX_RUN_HALT_ACK - mmARC_FARM_ARC0_AUX_BASE)
 
 #define ARC_REGION_CFG_OFFSET(region) \
 	(mmARC_FARM_ARC0_AUX_ARC_REGION_CFG_0 + (region * 4) - mmARC_FARM_ARC0_AUX_BASE)
author	Tal Cohen	2022-07-07 18:42:47 +0300
committer	Oded Gabbay	2022-09-18 13:29:51 +0300
commit	194e515c79462f1ad09ebcc9e01a3acb84a98d82 (patch)
tree	507354dadb1169d90633792bf2a20f10d18c206a /drivers/misc/habanalabs
parent	07056f58e43319902cd1072c00df2846b31e14b8 (diff)