@@ -1837,8 +1837,10 @@ __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
18371837 opstate = atomic_xchg (& op -> state , FCPOP_STATE_ABORTED );
18381838 if (opstate != FCPOP_STATE_ACTIVE )
18391839 atomic_set (& op -> state , opstate );
1840- else if (test_bit (FCCTRL_TERMIO , & ctrl -> flags ))
1840+ else if (test_bit (FCCTRL_TERMIO , & ctrl -> flags )) {
1841+ op -> flags |= FCOP_FLAGS_TERMIO ;
18411842 ctrl -> iocnt ++ ;
1843+ }
18421844 spin_unlock_irqrestore (& ctrl -> lock , flags );
18431845
18441846 if (opstate != FCPOP_STATE_ACTIVE )
@@ -1874,7 +1876,8 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
18741876
18751877 if (opstate == FCPOP_STATE_ABORTED ) {
18761878 spin_lock_irqsave (& ctrl -> lock , flags );
1877- if (test_bit (FCCTRL_TERMIO , & ctrl -> flags )) {
1879+ if (test_bit (FCCTRL_TERMIO , & ctrl -> flags ) &&
1880+ op -> flags & FCOP_FLAGS_TERMIO ) {
18781881 if (!-- ctrl -> iocnt )
18791882 wake_up (& ctrl -> ioabort_wait );
18801883 }
@@ -2446,15 +2449,20 @@ nvme_fc_timeout(struct request *rq, bool reserved)
24462449{
24472450 struct nvme_fc_fcp_op * op = blk_mq_rq_to_pdu (rq );
24482451 struct nvme_fc_ctrl * ctrl = op -> ctrl ;
2452+ struct nvme_fc_cmd_iu * cmdiu = & op -> cmd_iu ;
2453+ struct nvme_command * sqe = & cmdiu -> sqe ;
24492454
24502455 /*
2451- * we can't individually ABTS an io without affecting the queue,
2452- * thus killing the queue, and thus the association.
2453- * So resolve by performing a controller reset, which will stop
2454- * the host/io stack, terminate the association on the link,
2455- * and recreate an association on the link.
2456+ * Attempt to abort the offending command. Command completion
2457+ * will detect the aborted io and will fail the connection.
24562458 */
2457- nvme_fc_error_recovery (ctrl , "io timeout error" );
2459+ dev_info (ctrl -> ctrl .device ,
2460+ "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d w10/11: "
2461+ "x%08x/x%08x\n" ,
2462+ ctrl -> cnum , op -> queue -> qnum , sqe -> common .opcode ,
2463+ sqe -> connect .fctype , sqe -> common .cdw10 , sqe -> common .cdw11 );
2464+ if (__nvme_fc_abort_op (ctrl , op ))
2465+ nvme_fc_error_recovery (ctrl , "io timeout abort failed" );
24582466
24592467 /*
24602468 * the io abort has been initiated. Have the reset timer
@@ -2726,6 +2734,7 @@ nvme_fc_complete_rq(struct request *rq)
27262734 struct nvme_fc_ctrl * ctrl = op -> ctrl ;
27272735
27282736 atomic_set (& op -> state , FCPOP_STATE_IDLE );
2737+ op -> flags &= ~FCOP_FLAGS_TERMIO ;
27292738
27302739 nvme_fc_unmap_data (ctrl , rq , op );
27312740 nvme_complete_rq (rq );
@@ -3090,26 +3099,19 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
30903099 return ret ;
30913100}
30923101
3102+
30933103/*
3094- * This routine stops operation of the controller on the host side.
3095- * On the host os stack side: Admin and IO queues are stopped,
3096- * outstanding ios on them terminated via FC ABTS.
3097- * On the link side: the association is terminated.
3104+ * This routine runs through all outstanding commands on the association
3105+ * and aborts them. This routine is typically be called by the
3106+ * delete_association routine. It is also called due to an error during
3107+ * reconnect. In that scenario, it is most likely a command that initializes
3108+ * the controller, including fabric Connect commands on io queues, that
3109+ * may have timed out or failed thus the io must be killed for the connect
3110+ * thread to see the error.
30983111 */
30993112static void
3100- nvme_fc_delete_association (struct nvme_fc_ctrl * ctrl )
3113+ __nvme_fc_abort_outstanding_ios (struct nvme_fc_ctrl * ctrl , bool start_queues )
31013114{
3102- struct nvmefc_ls_rcv_op * disls = NULL ;
3103- unsigned long flags ;
3104-
3105- if (!test_and_clear_bit (ASSOC_ACTIVE , & ctrl -> flags ))
3106- return ;
3107-
3108- spin_lock_irqsave (& ctrl -> lock , flags );
3109- set_bit (FCCTRL_TERMIO , & ctrl -> flags );
3110- ctrl -> iocnt = 0 ;
3111- spin_unlock_irqrestore (& ctrl -> lock , flags );
3112-
31133115 /*
31143116 * If io queues are present, stop them and terminate all outstanding
31153117 * ios on them. As FC allocates FC exchange for each io, the
@@ -3127,6 +3129,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
31273129 blk_mq_tagset_busy_iter (& ctrl -> tag_set ,
31283130 nvme_fc_terminate_exchange , & ctrl -> ctrl );
31293131 blk_mq_tagset_wait_completed_request (& ctrl -> tag_set );
3132+ if (start_queues )
3133+ nvme_start_queues (& ctrl -> ctrl );
31303134 }
31313135
31323136 /*
@@ -3143,13 +3147,34 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
31433147
31443148 /*
31453149 * clean up the admin queue. Same thing as above.
3146- * use blk_mq_tagset_busy_itr() and the transport routine to
3147- * terminate the exchanges.
31483150 */
31493151 blk_mq_quiesce_queue (ctrl -> ctrl .admin_q );
31503152 blk_mq_tagset_busy_iter (& ctrl -> admin_tag_set ,
31513153 nvme_fc_terminate_exchange , & ctrl -> ctrl );
31523154 blk_mq_tagset_wait_completed_request (& ctrl -> admin_tag_set );
3155+ }
3156+
3157+ /*
3158+ * This routine stops operation of the controller on the host side.
3159+ * On the host os stack side: Admin and IO queues are stopped,
3160+ * outstanding ios on them terminated via FC ABTS.
3161+ * On the link side: the association is terminated.
3162+ */
3163+ static void
3164+ nvme_fc_delete_association (struct nvme_fc_ctrl * ctrl )
3165+ {
3166+ struct nvmefc_ls_rcv_op * disls = NULL ;
3167+ unsigned long flags ;
3168+
3169+ if (!test_and_clear_bit (ASSOC_ACTIVE , & ctrl -> flags ))
3170+ return ;
3171+
3172+ spin_lock_irqsave (& ctrl -> lock , flags );
3173+ set_bit (FCCTRL_TERMIO , & ctrl -> flags );
3174+ ctrl -> iocnt = 0 ;
3175+ spin_unlock_irqrestore (& ctrl -> lock , flags );
3176+
3177+ __nvme_fc_abort_outstanding_ios (ctrl , false);
31533178
31543179 /* kill the aens as they are a separate path */
31553180 nvme_fc_abort_aen_ops (ctrl );
@@ -3263,22 +3288,27 @@ static void
32633288__nvme_fc_terminate_io (struct nvme_fc_ctrl * ctrl )
32643289{
32653290 /*
3266- * if state is connecting - the error occurred as part of a
3267- * reconnect attempt. The create_association error paths will
3268- * clean up any outstanding io.
3269- *
3270- * if it's a different state - ensure all pending io is
3271- * terminated. Given this can delay while waiting for the
3272- * aborted io to return, we recheck adapter state below
3273- * before changing state.
3291+ * if state is CONNECTING - the error occurred as part of a
3292+ * reconnect attempt. Abort any ios on the association and
3293+ * let the create_association error paths resolve things.
32743294 */
3275- if (ctrl -> ctrl .state != NVME_CTRL_CONNECTING ) {
3276- nvme_stop_keep_alive (& ctrl -> ctrl );
3277-
3278- /* will block will waiting for io to terminate */
3279- nvme_fc_delete_association (ctrl );
3295+ if (ctrl -> ctrl .state == NVME_CTRL_CONNECTING ) {
3296+ __nvme_fc_abort_outstanding_ios (ctrl , true);
3297+ return ;
32803298 }
32813299
3300+ /*
3301+ * For any other state, kill the association. As this routine
3302+ * is a common io abort routine for resetting and such, after
3303+ * the association is terminated, ensure that the state is set
3304+ * to CONNECTING.
3305+ */
3306+
3307+ nvme_stop_keep_alive (& ctrl -> ctrl );
3308+
3309+ /* will block will waiting for io to terminate */
3310+ nvme_fc_delete_association (ctrl );
3311+
32823312 if (ctrl -> ctrl .state != NVME_CTRL_CONNECTING &&
32833313 !nvme_change_ctrl_state (& ctrl -> ctrl , NVME_CTRL_CONNECTING ))
32843314 dev_err (ctrl -> ctrl .device ,
0 commit comments