Skip to content

Commit 52793d6

Browse files
James SmartChristoph Hellwig
authored andcommitted
nvme-fc: fix io timeout to abort I/O
Currently, an I/O timeout unconditionally invokes nvme_fc_error_recovery() which checks for LIVE or CONNECTING state. If live, the routine resets the controller which initiates a reconnect - which is valid. If CONNECTING, err_work is scheduled. Err_work then calls the terminate_io routine, which also checks for CONNECTING and noops any further action on outstanding I/O. The result is nothing happened to the timed out io. As such, if the command was dropped on the wire, it will never timeout / complete, and the connect process will hang. Change the behavior of the io timeout routine to unconditionally abort the I/O. I/O completion handling will note that an io failed due to an abort and will terminate the connection / association as needed. If the abort was unable to happen, continue with a call to nvme_fc_error_recovery(). To ensure something different happens in nvme_fc_error_recovery() rework it so at it will abort all I/Os on the association to force a failure. As I/O aborts now may occur outside of delete_association, counting for completion must be wary and only count those aborted during delete_association when TERMIO is set on the controller. Signed-off-by: James Smart <james.smart@broadcom.com> Signed-off-by: Christoph Hellwig <hch@lst.de>
1 parent 150dfb6 commit 52793d6

1 file changed

Lines changed: 69 additions & 39 deletions

File tree

  • drivers/nvme/host

drivers/nvme/host/fc.c

Lines changed: 69 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1837,8 +1837,10 @@ __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
18371837
opstate = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
18381838
if (opstate != FCPOP_STATE_ACTIVE)
18391839
atomic_set(&op->state, opstate);
1840-
else if (test_bit(FCCTRL_TERMIO, &ctrl->flags))
1840+
else if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) {
1841+
op->flags |= FCOP_FLAGS_TERMIO;
18411842
ctrl->iocnt++;
1843+
}
18421844
spin_unlock_irqrestore(&ctrl->lock, flags);
18431845

18441846
if (opstate != FCPOP_STATE_ACTIVE)
@@ -1874,7 +1876,8 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
18741876

18751877
if (opstate == FCPOP_STATE_ABORTED) {
18761878
spin_lock_irqsave(&ctrl->lock, flags);
1877-
if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) {
1879+
if (test_bit(FCCTRL_TERMIO, &ctrl->flags) &&
1880+
op->flags & FCOP_FLAGS_TERMIO) {
18781881
if (!--ctrl->iocnt)
18791882
wake_up(&ctrl->ioabort_wait);
18801883
}
@@ -2446,15 +2449,20 @@ nvme_fc_timeout(struct request *rq, bool reserved)
24462449
{
24472450
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
24482451
struct nvme_fc_ctrl *ctrl = op->ctrl;
2452+
struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
2453+
struct nvme_command *sqe = &cmdiu->sqe;
24492454

24502455
/*
2451-
* we can't individually ABTS an io without affecting the queue,
2452-
* thus killing the queue, and thus the association.
2453-
* So resolve by performing a controller reset, which will stop
2454-
* the host/io stack, terminate the association on the link,
2455-
* and recreate an association on the link.
2456+
* Attempt to abort the offending command. Command completion
2457+
* will detect the aborted io and will fail the connection.
24562458
*/
2457-
nvme_fc_error_recovery(ctrl, "io timeout error");
2459+
dev_info(ctrl->ctrl.device,
2460+
"NVME-FC{%d.%d}: io timeout: opcode %d fctype %d w10/11: "
2461+
"x%08x/x%08x\n",
2462+
ctrl->cnum, op->queue->qnum, sqe->common.opcode,
2463+
sqe->connect.fctype, sqe->common.cdw10, sqe->common.cdw11);
2464+
if (__nvme_fc_abort_op(ctrl, op))
2465+
nvme_fc_error_recovery(ctrl, "io timeout abort failed");
24582466

24592467
/*
24602468
* the io abort has been initiated. Have the reset timer
@@ -2726,6 +2734,7 @@ nvme_fc_complete_rq(struct request *rq)
27262734
struct nvme_fc_ctrl *ctrl = op->ctrl;
27272735

27282736
atomic_set(&op->state, FCPOP_STATE_IDLE);
2737+
op->flags &= ~FCOP_FLAGS_TERMIO;
27292738

27302739
nvme_fc_unmap_data(ctrl, rq, op);
27312740
nvme_complete_rq(rq);
@@ -3090,26 +3099,19 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
30903099
return ret;
30913100
}
30923101

3102+
30933103
/*
3094-
* This routine stops operation of the controller on the host side.
3095-
* On the host os stack side: Admin and IO queues are stopped,
3096-
* outstanding ios on them terminated via FC ABTS.
3097-
* On the link side: the association is terminated.
3104+
* This routine runs through all outstanding commands on the association
3105+
* and aborts them. This routine is typically be called by the
3106+
* delete_association routine. It is also called due to an error during
3107+
* reconnect. In that scenario, it is most likely a command that initializes
3108+
* the controller, including fabric Connect commands on io queues, that
3109+
* may have timed out or failed thus the io must be killed for the connect
3110+
* thread to see the error.
30983111
*/
30993112
static void
3100-
nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
3113+
__nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
31013114
{
3102-
struct nvmefc_ls_rcv_op *disls = NULL;
3103-
unsigned long flags;
3104-
3105-
if (!test_and_clear_bit(ASSOC_ACTIVE, &ctrl->flags))
3106-
return;
3107-
3108-
spin_lock_irqsave(&ctrl->lock, flags);
3109-
set_bit(FCCTRL_TERMIO, &ctrl->flags);
3110-
ctrl->iocnt = 0;
3111-
spin_unlock_irqrestore(&ctrl->lock, flags);
3112-
31133115
/*
31143116
* If io queues are present, stop them and terminate all outstanding
31153117
* ios on them. As FC allocates FC exchange for each io, the
@@ -3127,6 +3129,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
31273129
blk_mq_tagset_busy_iter(&ctrl->tag_set,
31283130
nvme_fc_terminate_exchange, &ctrl->ctrl);
31293131
blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
3132+
if (start_queues)
3133+
nvme_start_queues(&ctrl->ctrl);
31303134
}
31313135

31323136
/*
@@ -3143,13 +3147,34 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
31433147

31443148
/*
31453149
* clean up the admin queue. Same thing as above.
3146-
* use blk_mq_tagset_busy_itr() and the transport routine to
3147-
* terminate the exchanges.
31483150
*/
31493151
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
31503152
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
31513153
nvme_fc_terminate_exchange, &ctrl->ctrl);
31523154
blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
3155+
}
3156+
3157+
/*
3158+
* This routine stops operation of the controller on the host side.
3159+
* On the host os stack side: Admin and IO queues are stopped,
3160+
* outstanding ios on them terminated via FC ABTS.
3161+
* On the link side: the association is terminated.
3162+
*/
3163+
static void
3164+
nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
3165+
{
3166+
struct nvmefc_ls_rcv_op *disls = NULL;
3167+
unsigned long flags;
3168+
3169+
if (!test_and_clear_bit(ASSOC_ACTIVE, &ctrl->flags))
3170+
return;
3171+
3172+
spin_lock_irqsave(&ctrl->lock, flags);
3173+
set_bit(FCCTRL_TERMIO, &ctrl->flags);
3174+
ctrl->iocnt = 0;
3175+
spin_unlock_irqrestore(&ctrl->lock, flags);
3176+
3177+
__nvme_fc_abort_outstanding_ios(ctrl, false);
31533178

31543179
/* kill the aens as they are a separate path */
31553180
nvme_fc_abort_aen_ops(ctrl);
@@ -3263,22 +3288,27 @@ static void
32633288
__nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl)
32643289
{
32653290
/*
3266-
* if state is connecting - the error occurred as part of a
3267-
* reconnect attempt. The create_association error paths will
3268-
* clean up any outstanding io.
3269-
*
3270-
* if it's a different state - ensure all pending io is
3271-
* terminated. Given this can delay while waiting for the
3272-
* aborted io to return, we recheck adapter state below
3273-
* before changing state.
3291+
* if state is CONNECTING - the error occurred as part of a
3292+
* reconnect attempt. Abort any ios on the association and
3293+
* let the create_association error paths resolve things.
32743294
*/
3275-
if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
3276-
nvme_stop_keep_alive(&ctrl->ctrl);
3277-
3278-
/* will block will waiting for io to terminate */
3279-
nvme_fc_delete_association(ctrl);
3295+
if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) {
3296+
__nvme_fc_abort_outstanding_ios(ctrl, true);
3297+
return;
32803298
}
32813299

3300+
/*
3301+
* For any other state, kill the association. As this routine
3302+
* is a common io abort routine for resetting and such, after
3303+
* the association is terminated, ensure that the state is set
3304+
* to CONNECTING.
3305+
*/
3306+
3307+
nvme_stop_keep_alive(&ctrl->ctrl);
3308+
3309+
/* will block will waiting for io to terminate */
3310+
nvme_fc_delete_association(ctrl);
3311+
32823312
if (ctrl->ctrl.state != NVME_CTRL_CONNECTING &&
32833313
!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
32843314
dev_err(ctrl->ctrl.device,

0 commit comments

Comments
 (0)