Skip to content

Commit 280ffdb

Browse files
ickle authored and rodrigovivi committed
drm/i915/gt: Free stale request on destroying the virtual engine
Since preempt-to-busy, we may unsubmit a request while it is still on the HW and completes asynchronously. That means it may be retired and in the process destroy the virtual engine (as the user has closed their context), but that engine may still be holding onto the unsubmitted compelted request. Therefore we need to potentially cleanup the old request on destroying the virtual engine. We also have to keep the virtual_engine alive until after the sibling's execlists_dequeue() have finished peeking into the virtual engines, for which we serialise with RCU. v2: Be paranoid and flush the tasklet as well. v3: And flush the tasklet before the engines, as the tasklet may re-attach an rb_node after our removal from the siblings. Fixes: 6d06779 ("drm/i915: Load balancing across a virtual engine") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20201123113717.20500-4-chris@chris-wilson.co.uk (cherry picked from commit 46eecfc) Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
1 parent 2e6ce83 commit 280ffdb

1 file changed

Lines changed: 53 additions & 7 deletions

File tree

drivers/gpu/drm/i915/gt/intel_lrc.c

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@
182182
struct virtual_engine {
183183
struct intel_engine_cs base;
184184
struct intel_context context;
185+
struct rcu_work rcu;
185186

186187
/*
187188
* We allow only a single request through the virtual engine at a time
@@ -5425,33 +5426,57 @@ static struct list_head *virtual_queue(struct virtual_engine *ve)
54255426
return &ve->base.execlists.default_priolist.requests[0];
54265427
}
54275428

5428-
static void virtual_context_destroy(struct kref *kref)
5429+
static void rcu_virtual_context_destroy(struct work_struct *wrk)
54295430
{
54305431
struct virtual_engine *ve =
5431-
container_of(kref, typeof(*ve), context.ref);
5432+
container_of(wrk, typeof(*ve), rcu.work);
54325433
unsigned int n;
54335434

5434-
GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5435-
GEM_BUG_ON(ve->request);
54365435
GEM_BUG_ON(ve->context.inflight);
54375436

5437+
/* Preempt-to-busy may leave a stale request behind. */
5438+
if (unlikely(ve->request)) {
5439+
struct i915_request *old;
5440+
5441+
spin_lock_irq(&ve->base.active.lock);
5442+
5443+
old = fetch_and_zero(&ve->request);
5444+
if (old) {
5445+
GEM_BUG_ON(!i915_request_completed(old));
5446+
__i915_request_submit(old);
5447+
i915_request_put(old);
5448+
}
5449+
5450+
spin_unlock_irq(&ve->base.active.lock);
5451+
}
5452+
5453+
/*
5454+
* Flush the tasklet in case it is still running on another core.
5455+
*
5456+
* This needs to be done before we remove ourselves from the siblings'
5457+
* rbtrees as in the case it is running in parallel, it may reinsert
5458+
* the rb_node into a sibling.
5459+
*/
5460+
tasklet_kill(&ve->base.execlists.tasklet);
5461+
5462+
/* Decouple ourselves from the siblings, no more access allowed. */
54385463
for (n = 0; n < ve->num_siblings; n++) {
54395464
struct intel_engine_cs *sibling = ve->siblings[n];
54405465
struct rb_node *node = &ve->nodes[sibling->id].rb;
5441-
unsigned long flags;
54425466

54435467
if (RB_EMPTY_NODE(node))
54445468
continue;
54455469

5446-
spin_lock_irqsave(&sibling->active.lock, flags);
5470+
spin_lock_irq(&sibling->active.lock);
54475471

54485472
/* Detachment is lazily performed in the execlists tasklet */
54495473
if (!RB_EMPTY_NODE(node))
54505474
rb_erase_cached(node, &sibling->execlists.virtual);
54515475

5452-
spin_unlock_irqrestore(&sibling->active.lock, flags);
5476+
spin_unlock_irq(&sibling->active.lock);
54535477
}
54545478
GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5479+
GEM_BUG_ON(!list_empty(virtual_queue(ve)));
54555480

54565481
if (ve->context.state)
54575482
__execlists_context_fini(&ve->context);
@@ -5464,6 +5489,27 @@ static void virtual_context_destroy(struct kref *kref)
54645489
kfree(ve);
54655490
}
54665491

5492+
static void virtual_context_destroy(struct kref *kref)
5493+
{
5494+
struct virtual_engine *ve =
5495+
container_of(kref, typeof(*ve), context.ref);
5496+
5497+
GEM_BUG_ON(!list_empty(&ve->context.signals));
5498+
5499+
/*
5500+
* When destroying the virtual engine, we have to be aware that
5501+
* it may still be in use from an hardirq/softirq context causing
5502+
* the resubmission of a completed request (background completion
5503+
* due to preempt-to-busy). Before we can free the engine, we need
5504+
* to flush the submission code and tasklets that are still potentially
5505+
* accessing the engine. Flushing the tasklets requires process context,
5506+
* and since we can guard the resubmit onto the engine with an RCU read
5507+
* lock, we can delegate the free of the engine to an RCU worker.
5508+
*/
5509+
INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy);
5510+
queue_rcu_work(system_wq, &ve->rcu);
5511+
}
5512+
54675513
static void virtual_engine_initial_hint(struct virtual_engine *ve)
54685514
{
54695515
int swp;

0 commit comments

Comments
 (0)