From b32045d73bb4333a2cebc5d3c005807adb03ab58 Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Fri, 14 Nov 2025 20:56:39 +0000 Subject: drm/xe: Fix freq kobject leak on sysfs_create_files failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure gt->freq is released when sysfs_create_files() fails in xe_gt_freq_init(). Without this, the kobject would leak. Add kobject_put() before returning the error. Fixes: fdc81c43f0c1 ("drm/xe: use devm_add_action_or_reset() helper") Signed-off-by: Shuicheng Lin Reviewed-by: Alex Zuo Reviewed-by: Xin Wang Link: https://patch.msgid.link/20251114205638.2184529-2-shuicheng.lin@intel.com Signed-off-by: Matt Roper (cherry picked from commit 251be5fb4982ebb0f5a81b62d975bd770f3ad5c2) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_freq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_freq.c b/drivers/gpu/drm/xe/xe_gt_freq.c index 849ea6c86e8e..ce3c7810469f 100644 --- a/drivers/gpu/drm/xe/xe_gt_freq.c +++ b/drivers/gpu/drm/xe/xe_gt_freq.c @@ -293,8 +293,10 @@ int xe_gt_freq_init(struct xe_gt *gt) return -ENOMEM; err = sysfs_create_files(gt->freq, freq_attrs); - if (err) + if (err) { + kobject_put(gt->freq); return err; + } err = devm_add_action_or_reset(xe->drm.dev, freq_fini, gt->freq); if (err) -- cgit v1.2.3 From c88a0731ed95f9705deb127a7f1927fa59aa742b Mon Sep 17 00:00:00 2001 From: Vinay Belgaumkar Date: Fri, 28 Nov 2025 21:25:48 -0800 Subject: drm/xe: Apply Wa_14020316580 in xe_gt_idle_enable_pg() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wa_14020316580 was getting clobbered by power gating init code later in the driver load sequence. Move the Wa so that it applies correctly. Fixes: 7cd05ef89c9d ("drm/xe/xe2hpm: Add initial set of workarounds") Suggested-by: Matt Roper Signed-off-by: Vinay Belgaumkar Reviewed-by: Riana Tauro Reviewed-by: Matt Roper Link: https://patch.msgid.link/20251129052548.70766-1-vinay.belgaumkar@intel.com Signed-off-by: Matt Roper (cherry picked from commit 8b5502145351bde87f522df082b9e41356898ba3) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_idle.c | 8 ++++++++ drivers/gpu/drm/xe/xe_wa.c | 8 -------- drivers/gpu/drm/xe/xe_wa_oob.rules | 1 + 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_idle.c b/drivers/gpu/drm/xe/xe_gt_idle.c index bdc9d9877ec4..3e3d1d52f630 100644 --- a/drivers/gpu/drm/xe/xe_gt_idle.c +++ b/drivers/gpu/drm/xe/xe_gt_idle.c @@ -5,6 +5,7 @@ #include +#include #include "xe_force_wake.h" #include "xe_device.h" #include "xe_gt.h" @@ -16,6 +17,7 @@ #include "xe_mmio.h" #include "xe_pm.h" #include "xe_sriov.h" +#include "xe_wa.h" /** * DOC: Xe GT Idle @@ -145,6 +147,12 @@ void xe_gt_idle_enable_pg(struct xe_gt *gt) xe_mmio_write32(mmio, RENDER_POWERGATE_IDLE_HYSTERESIS, 25); } + if (XE_GT_WA(gt, 14020316580)) + gtidle->powergate_enable &= ~(VDN_HCP_POWERGATE_ENABLE(0) | + VDN_MFXVDENC_POWERGATE_ENABLE(0) | + VDN_HCP_POWERGATE_ENABLE(2) | + VDN_MFXVDENC_POWERGATE_ENABLE(2)); + xe_mmio_write32(mmio, POWERGATE_ENABLE, gtidle->powergate_enable); xe_force_wake_put(gt_to_fw(gt), fw_ref); } diff --git a/drivers/gpu/drm/xe/xe_wa.c b/drivers/gpu/drm/xe/xe_wa.c index 3764abca3d4f..e32dd2fde6f1 100644 --- a/drivers/gpu/drm/xe/xe_wa.c +++ b/drivers/gpu/drm/xe/xe_wa.c @@ -270,14 +270,6 @@ static const struct xe_rtp_entry_sr gt_was[] = { XE_RTP_ACTIONS(SET(VDBOX_CGCTL3F1C(0), MFXPIPE_CLKGATE_DIS)), XE_RTP_ENTRY_FLAG(FOREACH_ENGINE), }, - { XE_RTP_NAME("14020316580"), - XE_RTP_RULES(MEDIA_VERSION(1301)), - XE_RTP_ACTIONS(CLR(POWERGATE_ENABLE, - VDN_HCP_POWERGATE_ENABLE(0) | - VDN_MFXVDENC_POWERGATE_ENABLE(0) | - VDN_HCP_POWERGATE_ENABLE(2) | - VDN_MFXVDENC_POWERGATE_ENABLE(2))), - }, { XE_RTP_NAME("14019449301"), XE_RTP_RULES(MEDIA_VERSION(1301), ENGINE_CLASS(VIDEO_DECODE)), XE_RTP_ACTIONS(SET(VDBOX_CGCTL3F08(0), CG3DDISHRS_CLKGATE_DIS)), diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules b/drivers/gpu/drm/xe/xe_wa_oob.rules index fb38eb3d6e9a..7ca7258eb5d8 100644 --- a/drivers/gpu/drm/xe/xe_wa_oob.rules +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules @@ -76,3 +76,4 @@ 15015404425_disable PLATFORM(PANTHERLAKE), MEDIA_STEP(B0, FOREVER) 16026007364 MEDIA_VERSION(3000) +14020316580 MEDIA_VERSION(1301) -- cgit v1.2.3 From 224a6ac0808d0f58e51df2f923332adcb80fd930 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 2 Dec 2025 17:18:09 -0800 Subject: drm/xe: Do not reference loop variable directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not reference the loop variable job after the loop has exited. Instead, save the job from the last iteration of the loop. Fixes: 3d98a7164da6 ("drm/xe/vf: Start re-emission from first unsignaled job during VF migration") Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202511291102.jnnKP6IB-lkp@intel.com/ Signed-off-by: Matthew Brost Reviewed-by: Dnyaneshwar Bhadane Link: https://patch.msgid.link/20251203011809.968893-1-matthew.brost@intel.com (cherry picked from commit 76ce2313709f13a6adbcaa1a43a8539c8f509f6a) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc_submit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index ed7be50b2f72..c0819377ce6e 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -2253,10 +2253,11 @@ static void guc_exec_queue_unpause_prepare(struct xe_guc *guc, struct xe_exec_queue *q) { struct xe_gpu_scheduler *sched = &q->guc->sched; - struct xe_sched_job *job = NULL; + struct xe_sched_job *job = NULL, *__job; bool restore_replay = false; - list_for_each_entry(job, &sched->base.pending_list, drm.list) { + list_for_each_entry(__job, &sched->base.pending_list, drm.list) { + job = __job; restore_replay |= job->restore_replay; if (restore_replay) { xe_gt_dbg(guc_to_gt(guc), "Replay JOB - guc_id=%d, seqno=%d", -- cgit v1.2.3 From 9acc3295813b9b846791fd3eab0a78a3144af560 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Dec 2025 10:46:58 +0100 Subject: drm/xe: fix drm_gpusvm_init() arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Xe driver fails to build when CONFIG_DRM_XE_GPUSVM is disabled but CONFIG_DRM_GPUSVM is turned on, due to the clash of two commits: In file included from drivers/gpu/drm/xe/xe_vm_madvise.c:8: drivers/gpu/drm/xe/xe_svm.h: In function 'xe_svm_init': include/linux/stddef.h:8:14: error: passing argument 5 of 'drm_gpusvm_init' makes integer from pointer without a cast [-Wint-conversion] drivers/gpu/drm/xe/xe_svm.h:217:38: note: in expansion of macro 'NULL' 217 | NULL, NULL, 0, 0, 0, NULL, NULL, 0); | ^~~~ In file included from drivers/gpu/drm/xe/xe_bo_types.h:11, from drivers/gpu/drm/xe/xe_bo.h:11, from drivers/gpu/drm/xe/xe_vm_madvise.c:11: include/drm/drm_gpusvm.h:254:35: note: expected 'long unsigned int' but argument is of type 'void *' 254 | unsigned long mm_start, unsigned long mm_range, | ~~~~~~~~~~~~~~^~~~~~~~ In file included from drivers/gpu/drm/xe/xe_vm_madvise.c:14: drivers/gpu/drm/xe/xe_svm.h:216:16: error: too many arguments to function 'drm_gpusvm_init'; expected 10, have 11 216 | return drm_gpusvm_init(&vm->svm.gpusvm, "Xe SVM (simple)", &vm->xe->drm, | ^~~~~~~~~~~~~~~ 217 | NULL, NULL, 0, 0, 0, NULL, NULL, 0); | ~ include/drm/drm_gpusvm.h:251:5: note: declared here Adapt the caller to the new argument list by removing the extraneous NULL argument. Fixes: 9e9787414882 ("drm/xe/userptr: replace xe_hmm with gpusvm") Fixes: 10aa5c806030 ("drm/gpusvm, drm/xe: Fix userptr to not allow device private pages") Signed-off-by: Arnd Bergmann Reviewed-by: Thomas Hellström Signed-off-by: Thomas Hellström Link: https://patch.msgid.link/20251204094704.1030933-1-arnd@kernel.org (cherry picked from commit 29bce9c8b41d5c378263a927acb9a9074d0e7a0e) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_svm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h index 0955d2ac8d74..fa757dd07954 100644 --- a/drivers/gpu/drm/xe/xe_svm.h +++ b/drivers/gpu/drm/xe/xe_svm.h @@ -214,7 +214,7 @@ int xe_svm_init(struct xe_vm *vm) { #if IS_ENABLED(CONFIG_DRM_GPUSVM) return drm_gpusvm_init(&vm->svm.gpusvm, "Xe SVM (simple)", &vm->xe->drm, - NULL, NULL, 0, 0, 0, NULL, NULL, 0); + NULL, 0, 0, 0, NULL, NULL, 0); #else return 0; #endif -- cgit v1.2.3 From 17d52ab2a6ec8b91bbfc577d397d42d0776ef01f Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Wed, 3 Dec 2025 18:03:55 +0530 Subject: drm/xe/throttle: Skip reason prefix while emitting array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The newly introduced "reasons" attribute already signifies possible reasons for throttling and makes the prefix in individual attribute names redundant while emitting them as an array. Skip the prefix. Fixes: 83ccde67a3f7 ("drm/xe/gt_throttle: Avoid TOCTOU when monitoring reasons") Signed-off-by: Raag Jadav Reviewed-by: Sk Anirban Link: https://patch.msgid.link/20251203123355.571606-1-raag.jadav@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit b64a14334ef3ebbcf70d11bc67d0934bdc0e390d) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_throttle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_throttle.c b/drivers/gpu/drm/xe/xe_gt_throttle.c index 82c5fbcdfbe3..01477fc7b37b 100644 --- a/drivers/gpu/drm/xe/xe_gt_throttle.c +++ b/drivers/gpu/drm/xe/xe_gt_throttle.c @@ -140,7 +140,7 @@ static ssize_t reasons_show(struct kobject *kobj, struct throttle_attribute *other_ta = kobj_attribute_to_throttle(kattr); if (other_ta->mask != U32_MAX && reasons & other_ta->mask) - ret += sysfs_emit_at(buff, ret, "%s ", (*pother)->name); + ret += sysfs_emit_at(buff, ret, "%s ", (*pother)->name + strlen("reason_")); } if (drm_WARN_ONCE(&xe->drm, !ret, "Unknown reason: %#x\n", reasons)) -- cgit v1.2.3 From 61e6b711c30fc1ca690502f824c067caaf7d1a34 Mon Sep 17 00:00:00 2001 From: Tomasz Lis Date: Thu, 4 Dec 2025 21:08:20 +0100 Subject: drm/xe/vf: Stop waiting for ring space on VF post migration recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If wait for ring space started just before migration, it can delay the recovery process, by waiting without bailout path for up to 2 seconds. Two second wait for recovery is not acceptable, and if the ring was completely filled even without the migration temporarily stopping execution, then such a wait will result in up to a thousand new jobs (assuming constant flow) being added while the wait is happening. While this will not cause data corruption, it will lead to warning messages getting logged due to reset being scheduled on a GT under recovery. Also several seconds of unresponsiveness, as the backlog of jobs gets progressively executed. Add a bailout condition, to make sure the recovery starts without much delay. The recovery is expected to finish in about 100 ms when under moderate stress, so the condition verification period needs to be below that - settling at 64 ms. The theoretical max time which the recovery can take depends on how many requests can be emitted to engine rings and be pending execution. While stress testing, it was possible to reach 10k pending requests on rings when a platform with two GTs was used. This resulted in max recovery time of 5 seconds. But in real life situations, it is very unlikely that the amount of pending requests will ever exceed 100, and for that the recovery time will be around 50 ms - well within our claimed limit of 100ms. Fixes: a4dae94aad6a ("drm/xe/vf: Wakeup in GuC backend on VF post migration recovery") Signed-off-by: Tomasz Lis Reviewed-by: Matthew Brost Signed-off-by: Michal Wajdeczko Link: https://patch.msgid.link/20251204200820.2206168-1-tomasz.lis@intel.com (cherry picked from commit a00e305fba02a915cf2745bf6ef3f55537e65d57) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc_submit.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index c0819377ce6e..311cd047911a 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -722,21 +722,23 @@ static int wq_wait_for_space(struct xe_exec_queue *q, u32 wqi_size) struct xe_guc *guc = exec_queue_to_guc(q); struct xe_device *xe = guc_to_xe(guc); struct iosys_map map = xe_lrc_parallel_map(q->lrc[0]); - unsigned int sleep_period_ms = 1; + unsigned int sleep_period_ms = 1, sleep_total_ms = 0; #define AVAILABLE_SPACE \ CIRC_SPACE(q->guc->wqi_tail, q->guc->wqi_head, WQ_SIZE) if (wqi_size > AVAILABLE_SPACE && !vf_recovery(guc)) { try_again: q->guc->wqi_head = parallel_read(xe, map, wq_desc.head); - if (wqi_size > AVAILABLE_SPACE) { - if (sleep_period_ms == 1024) { + if (wqi_size > AVAILABLE_SPACE && !vf_recovery(guc)) { + if (sleep_total_ms > 2000) { xe_gt_reset_async(q->gt); return -ENODEV; } msleep(sleep_period_ms); - sleep_period_ms <<= 1; + sleep_total_ms += sleep_period_ms; + if (sleep_period_ms < 64) + sleep_period_ms <<= 1; goto try_again; } } -- cgit v1.2.3 From 17445af7dcc7d645b6fb8951fd10c8b72cc7f23f Mon Sep 17 00:00:00 2001 From: Junxiao Chang Date: Fri, 7 Nov 2025 11:31:52 +0800 Subject: drm/me/gsc: mei interrupt top half should be in irq disabled context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MEI GSC interrupt comes from i915 or xe driver. It has top half and bottom half. Top half is called from i915/xe interrupt handler. It should be in irq disabled context. With RT kernel(PREEMPT_RT enabled), by default IRQ handler is in threaded IRQ. MEI GSC top half might be in threaded IRQ context. generic_handle_irq_safe API could be called from either IRQ or process context, it disables local IRQ then calls MEI GSC interrupt top half. This change fixes B580 GPU boot issue with RT enabled. Fixes: e02cea83d32d ("drm/xe/gsc: add Battlemage support") Tested-by: Baoli Zhang Signed-off-by: Junxiao Chang Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Matthew Brost Link: https://patch.msgid.link/20251107033152.834960-1-junxiao.chang@intel.com Signed-off-by: Maarten Lankhorst (cherry picked from commit 3efadf028783a49ab2941294187c8b6dd86bf7da) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_heci_gsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_heci_gsc.c b/drivers/gpu/drm/xe/xe_heci_gsc.c index 2b3d49dd394c..495cdd4f948d 100644 --- a/drivers/gpu/drm/xe/xe_heci_gsc.c +++ b/drivers/gpu/drm/xe/xe_heci_gsc.c @@ -223,7 +223,7 @@ void xe_heci_gsc_irq_handler(struct xe_device *xe, u32 iir) if (xe->heci_gsc.irq < 0) return; - ret = generic_handle_irq(xe->heci_gsc.irq); + ret = generic_handle_irq_safe(xe->heci_gsc.irq); if (ret) drm_err_ratelimited(&xe->drm, "error handling GSC irq: %d\n", ret); } @@ -243,7 +243,7 @@ void xe_heci_csc_irq_handler(struct xe_device *xe, u32 iir) if (xe->heci_gsc.irq < 0) return; - ret = generic_handle_irq(xe->heci_gsc.irq); + ret = generic_handle_irq_safe(xe->heci_gsc.irq); if (ret) drm_err_ratelimited(&xe->drm, "error handling GSC irq: %d\n", ret); } -- cgit v1.2.3 From 449bcd5d45eb4ce26740f11f8601082fe734bed2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 9 Dec 2025 21:49:20 +0100 Subject: drm/xe/bo: Don't include the CCS metadata in the dma-buf sg-table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Xe bos are allocated with extra backing-store for the CCS metadata. It's never been the intention to share the CCS metadata when exporting such bos as dma-buf. Don't include it in the dma-buf sg-table. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Rodrigo Vivi Cc: Matthew Brost Cc: Maarten Lankhorst Cc: # v6.8+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Reviewed-by: Karol Wachowski Link: https://patch.msgid.link/20251209204920.224374-1-thomas.hellstrom@linux.intel.com (cherry picked from commit a4ebfb9d95d78a12512b435a698ee6886d712571) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_dma_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index 54e42960daad..7c74a31d4486 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -124,7 +124,7 @@ static struct sg_table *xe_dma_buf_map(struct dma_buf_attachment *attach, case XE_PL_TT: sgt = drm_prime_pages_to_sg(obj->dev, bo->ttm.ttm->pages, - bo->ttm.ttm->num_pages); + obj->size >> PAGE_SHIFT); if (IS_ERR(sgt)) return sgt; -- cgit v1.2.3 From c770467d28bd61391f2d2b17feadafd58af731ab Mon Sep 17 00:00:00 2001 From: Satyanarayana K V P Date: Wed, 10 Dec 2025 05:25:48 +0000 Subject: drm/xe/vf: Fix queuing of recovery work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure VF migration recovery work is only queued when no recovery is already queued and teardown is not in progress. Fixes: b47c0c07c350 ("drm/xe/vf: Teardown VF post migration worker on driver unload") Signed-off-by: Satyanarayana K V P Cc: Michal Wajdeczko Cc: Matthew Brost Cc: Tomasz Lis Reviewed-by: Michal Wajdeczko Reviewed-by: Matthew Brost Signed-off-by: Michal Wajdeczko Link: https://patch.msgid.link/20251210052546.622809-5-satyanarayana.k.v.p@intel.com (cherry picked from commit 8d8cf42b03f149dcb545b547906306f3b474565e) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_sriov_vf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c index 4c73a077d314..033eae2d03d3 100644 --- a/drivers/gpu/drm/xe/xe_gt_sriov_vf.c +++ b/drivers/gpu/drm/xe/xe_gt_sriov_vf.c @@ -733,7 +733,7 @@ static void vf_start_migration_recovery(struct xe_gt *gt) spin_lock(>->sriov.vf.migration.lock); - if (!gt->sriov.vf.migration.recovery_queued || + if (!gt->sriov.vf.migration.recovery_queued && !gt->sriov.vf.migration.recovery_teardown) { gt->sriov.vf.migration.recovery_queued = true; WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, true); -- cgit v1.2.3 From eafb6f62093f756535a7be1fc4559374a511e460 Mon Sep 17 00:00:00 2001 From: Jagmeet Randhawa Date: Fri, 12 Dec 2025 05:21:46 +0800 Subject: drm/xe: Increase TDF timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are some corner cases where flushing transient data may take slightly longer than the 150us timeout we currently allow. Update the driver to use a 300us timeout instead based on the latest guidance from the hardware team. An update to the bspec to formally document this is expected to arrive soon. Fixes: c01c6066e6fa ("drm/xe/device: implement transient flush") Signed-off-by: Jagmeet Randhawa Reviewed-by: Jonathan Cavitt Reviewed-by: Matt Roper Link: https://patch.msgid.link/0201b1d6ec64d3651fcbff1ea21026efa915126a.1765487866.git.jagmeet.randhawa@intel.com Signed-off-by: Matt Roper (cherry picked from commit d69d3636f5f7a84bae7cd43473b3701ad9b7d544) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index c7d373c70f0f..cf29e259861f 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -1056,7 +1056,7 @@ static void tdf_request_sync(struct xe_device *xe) * transient and need to be flushed.. */ if (xe_mmio_wait32(>->mmio, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0, - 150, NULL, false)) + 300, NULL, false)) xe_gt_err_once(gt, "TD flush timeout\n"); xe_force_wake_put(gt_to_fw(gt), fw_ref); -- cgit v1.2.3 From eed5b815fa49c17d513202f54e980eb91955d3ed Mon Sep 17 00:00:00 2001 From: Jan Maslak Date: Wed, 10 Dec 2025 15:56:18 +0100 Subject: drm/xe: Restore engine registers before restarting schedulers after GT reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During GT reset recovery in do_gt_restart(), xe_uc_start() was called before xe_reg_sr_apply_mmio() restored engine-specific registers. This created a race window where the scheduler could run jobs before hardware state was fully restored. This caused failures in eudebug tests (xe_exec_sip_eudebug@breakpoint- waitsip-*) where TD_CTL register (containing TD_CTL_GLOBAL_DEBUG_ENABLE) wasn't restored before jobs started executing. Breakpoints would fail to trigger SIP entry because the debug enable bit wasn't set yet. Fix by moving xe_uc_start() after all MMIO register restoration, including engine registers and CCS mode configuration, ensuring all hardware state is fully restored before any jobs can be scheduled. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Jan Maslak Reviewed-by: Jonathan Cavitt Reviewed-by: Matthew Brost Signed-off-by: Matthew Brost Link: https://patch.msgid.link/20251210145618.169625-2-jan.maslak@intel.com (cherry picked from commit 825aed0328588b2837636c1c5a0c48795d724617) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index dbb5e7a9bc6a..cdce210e36f2 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -797,9 +797,6 @@ static int do_gt_restart(struct xe_gt *gt) xe_gt_sriov_pf_init_hw(gt); xe_mocs_init(gt); - err = xe_uc_start(>->uc); - if (err) - return err; for_each_hw_engine(hwe, gt, id) xe_reg_sr_apply_mmio(&hwe->reg_sr, gt); @@ -807,6 +804,10 @@ static int do_gt_restart(struct xe_gt *gt) /* Get CCS mode in sync between sw/hw */ xe_gt_apply_ccs_mode(gt); + err = xe_uc_start(>->uc); + if (err) + return err; + /* Restore GT freq to expected values */ xe_gt_sanitize_freq(gt); -- cgit v1.2.3 From 8e461304009135270e9ccf2d7e2dfe29daec9b60 Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Fri, 5 Dec 2025 23:47:17 +0000 Subject: drm/xe: Limit num_syncs to prevent oversized allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The exec and vm_bind ioctl allow userspace to specify an arbitrary num_syncs value. Without bounds checking, a very large num_syncs can force an excessively large allocation, leading to kernel warnings from the page allocator as below. Introduce DRM_XE_MAX_SYNCS (set to 1024) and reject any request exceeding this limit. " ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1217 at mm/page_alloc.c:5124 __alloc_frozen_pages_noprof+0x2f8/0x2180 mm/page_alloc.c:5124 ... Call Trace: alloc_pages_mpol+0xe4/0x330 mm/mempolicy.c:2416 ___kmalloc_large_node+0xd8/0x110 mm/slub.c:4317 __kmalloc_large_node_noprof+0x18/0xe0 mm/slub.c:4348 __do_kmalloc_node mm/slub.c:4364 [inline] __kmalloc_noprof+0x3d4/0x4b0 mm/slub.c:4388 kmalloc_noprof include/linux/slab.h:909 [inline] kmalloc_array_noprof include/linux/slab.h:948 [inline] xe_exec_ioctl+0xa47/0x1e70 drivers/gpu/drm/xe/xe_exec.c:158 drm_ioctl_kernel+0x1f1/0x3e0 drivers/gpu/drm/drm_ioctl.c:797 drm_ioctl+0x5e7/0xc50 drivers/gpu/drm/drm_ioctl.c:894 xe_drm_ioctl+0x10b/0x170 drivers/gpu/drm/xe/xe_device.c:224 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:598 [inline] __se_sys_ioctl fs/ioctl.c:584 [inline] __x64_sys_ioctl+0x18b/0x210 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xbb/0x380 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f ... " v2: Add "Reported-by" and Cc stable kernels. v3: Change XE_MAX_SYNCS from 64 to 1024. (Matt & Ashutosh) v4: s/XE_MAX_SYNCS/DRM_XE_MAX_SYNCS/ (Matt) v5: Do the check at the top of the exec func. (Matt) Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Reported-by: Koen Koning Reported-by: Peter Senna Tschudin Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6450 Cc: # v6.12+ Cc: Matthew Brost Cc: Michal Mrozek Cc: Carl Zhang Cc: José Roberto de Souza Cc: Lionel Landwerlin Cc: Ivan Briano Cc: Thomas Hellström Cc: Ashutosh Dixit Signed-off-by: Shuicheng Lin Reviewed-by: Matthew Brost Signed-off-by: Matthew Brost Link: https://patch.msgid.link/20251205234715.2476561-5-shuicheng.lin@intel.com (cherry picked from commit b07bac9bd708ec468cd1b8a5fe70ae2ac9b0a11c) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_exec.c | 3 ++- drivers/gpu/drm/xe/xe_vm.c | 3 +++ include/uapi/drm/xe_drm.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index 4d81210e41f5..fd9480031750 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -132,7 +132,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file) if (XE_IOCTL_DBG(xe, args->extensions) || XE_IOCTL_DBG(xe, args->pad[0] || args->pad[1] || args->pad[2]) || - XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1])) + XE_IOCTL_DBG(xe, args->reserved[0] || args->reserved[1]) || + XE_IOCTL_DBG(xe, args->num_syncs > DRM_XE_MAX_SYNCS)) return -EINVAL; q = xe_exec_queue_lookup(xef, args->exec_queue_id); diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index 7cac646bdf1c..c93155c6c627 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -3324,6 +3324,9 @@ static int vm_bind_ioctl_check_args(struct xe_device *xe, struct xe_vm *vm, if (XE_IOCTL_DBG(xe, args->extensions)) return -EINVAL; + if (XE_IOCTL_DBG(xe, args->num_syncs > DRM_XE_MAX_SYNCS)) + return -EINVAL; + if (args->num_binds > 1) { u64 __user *bind_user = u64_to_user_ptr(args->vector_of_binds); diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 47853659a705..f64dc0eff0e6 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -1463,6 +1463,7 @@ struct drm_xe_exec { /** @exec_queue_id: Exec queue ID for the batch buffer */ __u32 exec_queue_id; +#define DRM_XE_MAX_SYNCS 1024 /** @num_syncs: Amount of struct drm_xe_sync in array. */ __u32 num_syncs; -- cgit v1.2.3 From f8dd66bfb4e184c71bd26418a00546ebe7f5c17a Mon Sep 17 00:00:00 2001 From: Shuicheng Lin Date: Fri, 5 Dec 2025 23:47:18 +0000 Subject: drm/xe/oa: Limit num_syncs to prevent oversized allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OA open parameters did not validate num_syncs, allowing userspace to pass arbitrarily large values, potentially leading to excessive allocations. Add check to ensure that num_syncs does not exceed DRM_XE_MAX_SYNCS, returning -EINVAL when the limit is violated. v2: use XE_IOCTL_DBG() and drop duplicated check. (Ashutosh) Fixes: c8507a25cebd ("drm/xe/oa/uapi: Define and parse OA sync properties") Cc: Matthew Brost Cc: Ashutosh Dixit Signed-off-by: Shuicheng Lin Reviewed-by: Ashutosh Dixit Signed-off-by: Matthew Brost Link: https://patch.msgid.link/20251205234715.2476561-6-shuicheng.lin@intel.com (cherry picked from commit e057b2d2b8d815df3858a87dffafa2af37e5945b) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_oa.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index 890c363282ae..1dd8ebeb41d0 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -1254,6 +1254,9 @@ static int xe_oa_set_no_preempt(struct xe_oa *oa, u64 value, static int xe_oa_set_prop_num_syncs(struct xe_oa *oa, u64 value, struct xe_oa_open_param *param) { + if (XE_IOCTL_DBG(oa->xe, value > DRM_XE_MAX_SYNCS)) + return -EINVAL; + param->num_syncs = value; return 0; } -- cgit v1.2.3 From 6f0f404bd289d79a260b634c5b3f4d330b13472c Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 12 Dec 2025 10:28:41 -0800 Subject: drm/xe: Adjust long-running workload timeslices to reasonable values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A 10ms timeslice for long-running workloads is far too long and causes significant jitter in benchmarks when the system is shared. Adjust the value to 5ms for preempt-fencing VMs, as the resume step there is quite costly as memory is moved around, and set it to zero for pagefault VMs, since switching back to pagefault mode after dma-fence mode is relatively fast. Also change min_run_period_ms to 'unsiged int' type rather than 's64' as only positive values make sense. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: stable@vger.kernel.org Signed-off-by: Matthew Brost Reviewed-by: Thomas Hellström Link: https://patch.msgid.link/20251212182847.1683222-2-matthew.brost@intel.com (cherry picked from commit 33a5abd9a68394aa67f9618b20eee65ee8702ff4) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_vm.c | 5 ++++- drivers/gpu/drm/xe/xe_vm_types.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index c93155c6c627..79ab6c512d3e 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -1508,7 +1508,10 @@ struct xe_vm *xe_vm_create(struct xe_device *xe, u32 flags, struct xe_file *xef) INIT_WORK(&vm->destroy_work, vm_destroy_work_func); INIT_LIST_HEAD(&vm->preempt.exec_queues); - vm->preempt.min_run_period_ms = 10; /* FIXME: Wire up to uAPI */ + if (flags & XE_VM_FLAG_FAULT_MODE) + vm->preempt.min_run_period_ms = 0; + else + vm->preempt.min_run_period_ms = 5; for_each_tile(tile, xe, id) xe_range_fence_tree_init(&vm->rftree[id]); diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index ccd6cc090309..2168ef052499 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -263,7 +263,7 @@ struct xe_vm { * @min_run_period_ms: The minimum run period before preempting * an engine again */ - s64 min_run_period_ms; + unsigned int min_run_period_ms; /** @exec_queues: list of exec queues attached to this VM */ struct list_head exec_queues; /** @num_exec_queues: number exec queues attached to this VM */ -- cgit v1.2.3 From 256edb267a9d0b5aef70e408e9fba4f930f9926e Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Fri, 5 Dec 2025 13:26:13 -0800 Subject: drm/xe/oa: Always set OAG_OAGLBCTXCTRL_COUNTER_RESUME MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reports can be written out to the OA buffer using ways other than periodic sampling. These include mmio trigger and context switches. To support these use cases, when periodic sampling is not enabled, OAG_OAGLBCTXCTRL_COUNTER_RESUME must be set. Fixes: 1db9a9dc90ae ("drm/xe/oa: OA stream initialization (OAG)") Signed-off-by: Ashutosh Dixit Reviewed-by: Umesh Nerlige Ramappa Link: https://patch.msgid.link/20251205212613.826224-4-ashutosh.dixit@intel.com (cherry picked from commit 88d98e74adf3e20f678bb89581a5c3149fdbdeaa) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_oa.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index 1dd8ebeb41d0..8f3da6885e6c 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -1105,11 +1105,12 @@ static int xe_oa_enable_metric_set(struct xe_oa_stream *stream) oag_buf_size_select(stream) | oag_configure_mmio_trigger(stream, true)); - xe_mmio_write32(mmio, __oa_regs(stream)->oa_ctx_ctrl, stream->periodic ? - (OAG_OAGLBCTXCTRL_COUNTER_RESUME | + xe_mmio_write32(mmio, __oa_regs(stream)->oa_ctx_ctrl, + OAG_OAGLBCTXCTRL_COUNTER_RESUME | + (stream->periodic ? OAG_OAGLBCTXCTRL_TIMER_ENABLE | REG_FIELD_PREP(OAG_OAGLBCTXCTRL_TIMER_PERIOD_MASK, - stream->period_exponent)) : 0); + stream->period_exponent) : 0)); /* * Initialize Super Queue Internal Cnt Register -- cgit v1.2.3 From eb192bedf5908e63347c4923c5a1d58f9baef158 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Dec 2025 14:39:19 +0300 Subject: drm/xe/xe_sriov_vfio: Fix return value in xe_sriov_vfio_migration_supported() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xe_sriov_vfio_migration_supported() function is type bool so returning -EPERM means returning true. Return false instead. Fixes: bd45d46ffc8f ("drm/xe/pf: Export helpers for VFIO") Signed-off-by: Dan Carpenter Reviewed-by: Michal Wajdeczko Link: https://patch.msgid.link/aTLEZ4g-FD-iMQ2V@stanley.mountain Signed-off-by: Michał Winiarski (cherry picked from commit 0a2404c8f6a3a120f79c57ef8a3302c8e8bc34d9) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_sriov_vfio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_sriov_vfio.c b/drivers/gpu/drm/xe/xe_sriov_vfio.c index e9a7615bb5c5..3da81af97b8b 100644 --- a/drivers/gpu/drm/xe/xe_sriov_vfio.c +++ b/drivers/gpu/drm/xe/xe_sriov_vfio.c @@ -21,7 +21,7 @@ EXPORT_SYMBOL_FOR_MODULES(xe_sriov_vfio_get_pf, "xe-vfio-pci"); bool xe_sriov_vfio_migration_supported(struct xe_device *xe) { if (!IS_SRIOV_PF(xe)) - return -EPERM; + return false; return xe_sriov_pf_migration_supported(xe); } -- cgit v1.2.3 From 3595114bc31d1eb5e1996164c901485c1ffac6f7 Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Thu, 11 Dec 2025 22:18:49 -0800 Subject: drm/xe/oa: Disallow 0 OA property values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An OA property value of 0 is invalid and will cause a NPD. Reported-by: Peter Senna Tschudin Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6452 Fixes: cc4e6994d5a2 ("drm/xe/oa: Move functions up so they can be reused for config ioctl") Cc: stable@vger.kernel.org Signed-off-by: Ashutosh Dixit Reviewed-by: Harish Chegondi Link: https://patch.msgid.link/20251212061850.1565459-3-ashutosh.dixit@intel.com (cherry picked from commit 7a100e6ddcc47c1f6ba7a19402de86ce24790621) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_oa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c index 8f3da6885e6c..f8bb28ab8124 100644 --- a/drivers/gpu/drm/xe/xe_oa.c +++ b/drivers/gpu/drm/xe/xe_oa.c @@ -1347,7 +1347,7 @@ static int xe_oa_user_ext_set_property(struct xe_oa *oa, enum xe_oa_user_extn_fr ARRAY_SIZE(xe_oa_set_property_funcs_config)); if (XE_IOCTL_DBG(oa->xe, ext.property >= ARRAY_SIZE(xe_oa_set_property_funcs_open)) || - XE_IOCTL_DBG(oa->xe, ext.pad)) + XE_IOCTL_DBG(oa->xe, !ext.property) || XE_IOCTL_DBG(oa->xe, ext.pad)) return -EINVAL; idx = array_index_nospec(ext.property, ARRAY_SIZE(xe_oa_set_property_funcs_open)); -- cgit v1.2.3 From 3767ca4166ad42fa9e34269efeaf9f15995cd92d Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Thu, 11 Dec 2025 22:18:50 -0800 Subject: drm/xe/eustall: Disallow 0 EU stall property values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An EU stall property value of 0 is invalid and will cause a NPD. Reported-by: Peter Senna Tschudin Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/6453 Fixes: 1537ec85ebd7 ("drm/xe/uapi: Introduce API for EU stall sampling") Cc: stable@vger.kernel.org Signed-off-by: Ashutosh Dixit Reviewed-by: Harish Chegondi Link: https://patch.msgid.link/20251212061850.1565459-4-ashutosh.dixit@intel.com (cherry picked from commit 5bf763e908bf795da4ad538d21c1ec41f8021f76) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_eu_stall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_eu_stall.c b/drivers/gpu/drm/xe/xe_eu_stall.c index 97dfb7945b7a..a5c36a317a70 100644 --- a/drivers/gpu/drm/xe/xe_eu_stall.c +++ b/drivers/gpu/drm/xe/xe_eu_stall.c @@ -315,7 +315,7 @@ static int xe_eu_stall_user_ext_set_property(struct xe_device *xe, u64 extension return -EFAULT; if (XE_IOCTL_DBG(xe, ext.property >= ARRAY_SIZE(xe_set_eu_stall_property_funcs)) || - XE_IOCTL_DBG(xe, ext.pad)) + XE_IOCTL_DBG(xe, !ext.property) || XE_IOCTL_DBG(xe, ext.pad)) return -EINVAL; idx = array_index_nospec(ext.property, ARRAY_SIZE(xe_set_eu_stall_property_funcs)); -- cgit v1.2.3 From fe3ccd24138fd391ae8e32289d492c85f67770fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Wed, 17 Dec 2025 10:34:41 +0100 Subject: drm/xe: Drop preempt-fences when destroying imported dma-bufs. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When imported dma-bufs are destroyed, TTM is not fully individualizing the dma-resv, but it *is* copying the fences that need to be waited for before declaring idle. So in the case where the bo->resv != bo->_resv we can still drop the preempt-fences, but make sure we do that on bo->_resv which contains the fence-pointer copy. In the case where the copying fails, bo->_resv will typically not contain any fences pointers at all, so there will be nothing to drop. In that case, TTM would have ensured all fences that would have been copied are signaled, including any remaining preempt fences. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Fixes: fa0af721bd1f ("drm/ttm: test private resv obj on release/destroy") Cc: Matthew Brost Cc: # v6.16+ Signed-off-by: Thomas Hellström Tested-by: Matthew Brost Reviewed-by: Matthew Brost Link: https://patch.msgid.link/20251217093441.5073-1-thomas.hellstrom@linux.intel.com (cherry picked from commit 425fe550fb513b567bd6d01f397d274092a9c274) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_bo.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index b0bd31d14bb9..bf4ee976b680 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -1527,7 +1527,7 @@ static bool xe_ttm_bo_lock_in_destructor(struct ttm_buffer_object *ttm_bo) * always succeed here, as long as we hold the lru lock. */ spin_lock(&ttm_bo->bdev->lru_lock); - locked = dma_resv_trylock(ttm_bo->base.resv); + locked = dma_resv_trylock(&ttm_bo->base._resv); spin_unlock(&ttm_bo->bdev->lru_lock); xe_assert(xe, locked); @@ -1547,13 +1547,6 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) bo = ttm_to_xe_bo(ttm_bo); xe_assert(xe_bo_device(bo), !(bo->created && kref_read(&ttm_bo->base.refcount))); - /* - * Corner case where TTM fails to allocate memory and this BOs resv - * still points the VMs resv - */ - if (ttm_bo->base.resv != &ttm_bo->base._resv) - return; - if (!xe_ttm_bo_lock_in_destructor(ttm_bo)) return; @@ -1563,14 +1556,14 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) * TODO: Don't do this for external bos once we scrub them after * unbind. */ - dma_resv_for_each_fence(&cursor, ttm_bo->base.resv, + dma_resv_for_each_fence(&cursor, &ttm_bo->base._resv, DMA_RESV_USAGE_BOOKKEEP, fence) { if (xe_fence_is_xe_preempt(fence) && !dma_fence_is_signaled(fence)) { if (!replacement) replacement = dma_fence_get_stub(); - dma_resv_replace_fences(ttm_bo->base.resv, + dma_resv_replace_fences(&ttm_bo->base._resv, fence->context, replacement, DMA_RESV_USAGE_BOOKKEEP); @@ -1578,7 +1571,7 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) } dma_fence_put(replacement); - dma_resv_unlock(ttm_bo->base.resv); + dma_resv_unlock(&ttm_bo->base._resv); } static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo) -- cgit v1.2.3 From 80f9c601d9c4d26f00356c0a9c461650e7089273 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 12 Dec 2025 10:28:42 -0800 Subject: drm/xe: Use usleep_range for accurate long-running workload timeslicing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit msleep is not very accurate in terms of how long it actually sleeps, whereas usleep_range is precise. Replace the timeslice sleep for long-running workloads with the more accurate usleep_range to avoid jitter if the sleep period is less than 20ms. Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: stable@vger.kernel.org Signed-off-by: Matthew Brost Reviewed-by: Thomas Hellström Link: https://patch.msgid.link/20251212182847.1683222-3-matthew.brost@intel.com (cherry picked from commit ca415c4d4c17ad676a2c8981e1fcc432221dce79) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc_submit.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 311cd047911a..f6ba2b0f074d 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -717,6 +717,24 @@ static bool vf_recovery(struct xe_guc *guc) return xe_gt_recovery_pending(guc_to_gt(guc)); } +static inline void relaxed_ms_sleep(unsigned int delay_ms) +{ + unsigned long min_us, max_us; + + if (!delay_ms) + return; + + if (delay_ms > 20) { + msleep(delay_ms); + return; + } + + min_us = mul_u32_u32(delay_ms, 1000); + max_us = min_us + 500; + + usleep_range(min_us, max_us); +} + static int wq_wait_for_space(struct xe_exec_queue *q, u32 wqi_size) { struct xe_guc *guc = exec_queue_to_guc(q); @@ -1587,7 +1605,7 @@ static void __guc_exec_queue_process_msg_suspend(struct xe_sched_msg *msg) since_resume_ms; if (wait_ms > 0 && q->guc->resume_time) - msleep(wait_ms); + relaxed_ms_sleep(wait_ms); set_exec_queue_suspended(q); disable_scheduling(q, false); -- cgit v1.2.3