drm/msm: Wait for MMU devcoredump when waiting for GMU
author    Connor Abbott <cwabbott0@gmail.com>
Fri, 18 Jul 2025 13:50:17 +0000 (09:50 -0400)
committer Rob Clark <robin.clark@oss.qualcomm.com>
Mon, 17 Nov 2025 17:43:58 +0000 (09:43 -0800)
If there is a flood of faults then the MMU can become saturated while it
waits for the kernel to process the first fault and resume it, leaving
the GMU blocked behind it. This is mainly a problem when the kernel reads
the state of the GPU for a devcoredump, because that takes a while. If we
time out waiting for the GMU, check whether this has happened and retry
once the devcoredump is finished.

Signed-off-by: Connor Abbott <cwabbott0@gmail.com>
Patchwork: https://patchwork.freedesktop.org/patch/664685/
Signed-off-by: Rob Clark <robin.clark@oss.qualcomm.com>
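
The fix leans on three properties of the kernel's struct completion:
completion_done() is a non-blocking check, wait_for_completion() blocks until
complete_all() fires, and reinit_completion() re-arms a completion for reuse.
As a minimal sketch of those semantics in userspace C, assuming pthreads
(struct completion_model and the model_* helpers are hypothetical names, not
kernel API):

/* completion_model.c - build with: cc -pthread completion_model.c
 * Hypothetical userspace stand-in for the kernel's struct completion;
 * not kernel API, just a model of the semantics the patch relies on. */
#include <pthread.h>
#include <stdbool.h>

struct completion_model {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        bool            done;   /* latched by complete_all() until reinit */
};

static void model_init(struct completion_model *c)
{
        pthread_mutex_init(&c->lock, NULL);
        pthread_cond_init(&c->cond, NULL);
        c->done = false;
}

/* reinit_completion(): re-arm before handling a new fault */
static void model_reinit(struct completion_model *c)
{
        pthread_mutex_lock(&c->lock);
        c->done = false;
        pthread_mutex_unlock(&c->lock);
}

/* complete_all(): wake every waiter; later waiters pass straight through */
static void model_complete_all(struct completion_model *c)
{
        pthread_mutex_lock(&c->lock);
        c->done = true;
        pthread_cond_broadcast(&c->cond);
        pthread_mutex_unlock(&c->lock);
}

/* completion_done(): non-blocking "has it fired?" check */
static bool model_done(struct completion_model *c)
{
        pthread_mutex_lock(&c->lock);
        bool done = c->done;
        pthread_mutex_unlock(&c->lock);
        return done;
}

/* wait_for_completion(): block until complete_all() runs */
static void model_wait(struct completion_model *c)
{
        pthread_mutex_lock(&c->lock);
        while (!c->done)
                pthread_cond_wait(&c->cond, &c->lock);
        pthread_mutex_unlock(&c->lock);
}

int main(void)
{
        struct completion_model done;

        model_init(&done);
        model_complete_all(&done);  /* start out "done", see adreno_gpu_init() */
        model_wait(&done);          /* returns immediately: nothing pending */
        model_reinit(&done);        /* armed: a waiter would now block */
        return model_done(&done) ? 1 : 0;
}
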
drivers/gpu/drm/msm/adreno/a6xx_gmu.c
drivers/gpu/drm/msm/adreno/a6xx_hfi.c
drivers/gpu/drm/msm/adreno/adreno_gpu.c
drivers/gpu/drm/msm/adreno/adreno_gpu.h

index d9ffe9e93ad9d9fd00e3046d23b3d694330c91c8..72cd4fe0905c4c7bd2908d8c586b1169e9220a18 100644 (file)
@@ -382,9 +382,23 @@ int a6xx_gmu_set_oob(struct a6xx_gmu *gmu, enum a6xx_gmu_oob_state state)
        /* Trigger the requested OOB operation */
        gmu_write(gmu, REG_A6XX_GMU_HOST2GMU_INTR_SET, 1 << request);
 
-       /* Wait for the acknowledge interrupt */
-       ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
-               val & (1 << ack), 100, 10000);
+       do {
+               /* Wait for the acknowledge interrupt */
+               ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
+                       val & (1 << ack), 100, 10000);
+
+               if (!ret)
+                       break;
+
+               if (completion_done(&a6xx_gpu->base.fault_coredump_done))
+                       break;
+
+               /* We may time out because the GMU is temporarily wedged from
+                * pending faults from the GPU and we are taking a devcoredump.
+                * Wait until the MMU is resumed and try again.
+                */
+               wait_for_completion(&a6xx_gpu->base.fault_coredump_done);
+       } while (true);
 
        if (ret)
                DRM_DEV_ERROR(gmu->dev,
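
For reference, the shape of the new wait loop outside the driver context:
poll, and if the poll times out while a devcoredump is in flight, park on the
completion and poll again. A self-contained sketch with stubbed helpers;
poll_for_ack() and the coredump_done flag are hypothetical stand-ins for
gmu_poll_timeout() and fault_coredump_done, not driver API:

/* retry_sketch.c - build with: cc retry_sketch.c */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int polls;
static bool coredump_done;              /* completion state: dump in flight */

static int poll_for_ack(void)           /* gmu_poll_timeout() stand-in */
{
        return ++polls < 2 ? -ETIMEDOUT : 0;    /* time out once, then ack */
}

static bool completion_done(void)       { return coredump_done; }

static void wait_for_completion(void)   /* pretend the devcoredump finished */
{
        coredump_done = true;
}

int main(void)
{
        int ret;

        do {
                ret = poll_for_ack();
                if (!ret)
                        break;          /* got the ack */

                if (completion_done())
                        break;          /* genuine timeout: no dump in flight */

                /* GMU stalled behind the blocked MMU during a devcoredump:
                 * wait for the capture to finish, then poll again. */
                wait_for_completion();
        } while (true);

        printf("ack after %d poll(s), ret=%d\n", polls, ret);
        return ret ? 1 : 0;
}

Once completion_done() reports true the loop stops retrying, so a timeout
with no devcoredump pending still falls through to the DRM_DEV_ERROR path.
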
index 550de6ad68effacaea09751891c2528464bdfcc5..206eb204cea1e6cb1405eb225ac7ff53602e5d19 100644 (file)
@@ -105,10 +105,25 @@ static int a6xx_hfi_wait_for_msg_interrupt(struct a6xx_gmu *gmu, u32 id, u32 seq
 {
        int ret;
        u32 val;
+       struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu);
+
+       do {
+               /* Wait for a response */
+               ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
+                       val & A6XX_GMU_GMU2HOST_INTR_INFO_MSGQ, 100, 1000000);
+
+               if (!ret)
+                       break;
 
-       /* Wait for a response */
-       ret = gmu_poll_timeout(gmu, REG_A6XX_GMU_GMU2HOST_INTR_INFO, val,
-               val & A6XX_GMU_GMU2HOST_INTR_INFO_MSGQ, 100, 1000000);
+               if (completion_done(&a6xx_gpu->base.fault_coredump_done))
+                       break;
+
+               /* We may time out because the GMU is temporarily wedged from
+                * pending faults from the GPU and we are taking a devcoredump.
+                * Wait until the MMU is resumed and try again.
+                */
+               wait_for_completion(&a6xx_gpu->base.fault_coredump_done);
+       } while (true);
 
        if (ret) {
                DRM_DEV_ERROR(gmu->dev,
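
Both wait sites need the a6xx_gpu that embeds the gmu, which container_of()
recovers from the member pointer. A freestanding illustration of the idiom
with simplified stand-in types; the kernel's real macro adds type checking on
top of this pointer arithmetic:

/* container_of_sketch.c - build with: cc container_of_sketch.c */
#include <assert.h>
#include <stddef.h>

/* Simplified restatement of the kernel macro: step back from a member
 * pointer to the struct that embeds it. */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct gmu { int dummy; };                 /* stand-in for a6xx_gmu */
struct gpu { int id; struct gmu gmu; };    /* stand-in for a6xx_gpu */

int main(void)
{
        struct gpu g = { .id = 630 };
        struct gmu *m = &g.gmu;

        /* Recover the enclosing gpu from the embedded gmu member. */
        struct gpu *back = container_of(m, struct gpu, gmu);

        assert(back == &g && back->id == 630);
        return 0;
}
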
index 71400d8999c9717ad477990085f4e5fd68140b5d..1c80909e63cab428b703a5da59933cdd9f73975a 100644 (file)
@@ -284,6 +284,7 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
                         struct adreno_smmu_fault_info *info, const char *block,
                         u32 scratch[4])
 {
+       struct adreno_gpu *adreno_gpu = container_of(gpu, struct adreno_gpu, base);
        struct msm_drm_private *priv = gpu->dev->dev_private;
        struct msm_mmu *mmu = to_msm_vm(gpu->vm)->mmu;
        const char *type = "UNKNOWN";
@@ -336,6 +337,11 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
                /* Turn off the hangcheck timer to keep it from bothering us */
                timer_delete(&gpu->hangcheck_timer);
 
+               /* Let any concurrent GMU transactions know that the MMU may be
+                * blocked for a while and they should wait on us.
+                */
+               reinit_completion(&adreno_gpu->fault_coredump_done);
+
                fault_info.ttbr0 = info->ttbr0;
                fault_info.iova  = iova;
                fault_info.flags = flags;
@@ -343,6 +349,8 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
                fault_info.block = block;
 
                msm_gpu_fault_crashstate_capture(gpu, &fault_info);
+
+               complete_all(&adreno_gpu->fault_coredump_done);
        }
 
        return 0;
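
The fault-handler side of the handshake is: re-arm the completion before
capturing crash state, release every waiter after. A two-thread pthread
sketch of that ordering; all names here are illustrative, not driver API:

/* protocol_sketch.c - build with: cc -pthread protocol_sketch.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool dump_done = true;   /* starts "completed", as in adreno_gpu_init() */

static void *fault_handler(void *arg)
{
        (void)arg;

        pthread_mutex_lock(&lock);
        dump_done = false;      /* reinit_completion() */
        pthread_mutex_unlock(&lock);

        usleep(100 * 1000);     /* pretend to capture a devcoredump */

        pthread_mutex_lock(&lock);
        dump_done = true;       /* complete_all() */
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, fault_handler, NULL);

        usleep(10 * 1000);      /* ...a GMU transaction times out here... */

        pthread_mutex_lock(&lock);
        while (!dump_done)      /* wait_for_completion() */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);

        puts("coredump finished, retry the GMU poll");
        pthread_join(t, NULL);
        return 0;
}

The ordering is the invariant: reinit_completion() must precede the capture
so that a GMU transaction timing out mid-dump observes a not-yet-done
completion and waits, instead of erroring out.
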
@@ -1223,6 +1231,9 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
        if (ret)
                return ret;
 
+       init_completion(&adreno_gpu->fault_coredump_done);
+       complete_all(&adreno_gpu->fault_coredump_done);
+
        pm_runtime_set_autosuspend_delay(dev,
                adreno_gpu->info->inactive_period);
        pm_runtime_use_autosuspend(dev);
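
Arming the completion at init time and immediately completing it leaves it in
the done state until the first fault: completion_done() returns true and
wait_for_completion() returns at once whenever no coredump is in flight, and
only reinit_completion() in the fault path re-arms it. The main() in the
sketch after the commit message models this start-completed state.
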
index 25ee6b277fe2db5950a057a92d33244d76de299c..4acb03dcbc609b72dfb95d40d5d247e9ca07bfb2 100644 (file)
@@ -180,6 +180,8 @@ struct adreno_gpu {
        uint16_t speedbin;
        const struct adreno_gpu_funcs *funcs;
 
+       struct completion fault_coredump_done;
+
        /* interesting register offsets to dump: */
        const unsigned int *registers;