@@ -26,6 +26,10 @@ enum nvme_fc_queue_flags {
2626};
2727
2828#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */
29+ #define NVME_FC_DEFAULT_RECONNECT_TMO 2 /* delay between reconnects
30+ * when connected and a
31+ * connection failure.
32+ */
2933
3034struct nvme_fc_queue {
3135 struct nvme_fc_ctrl * ctrl ;
@@ -1837,8 +1841,10 @@ __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
18371841 opstate = atomic_xchg (& op -> state , FCPOP_STATE_ABORTED );
18381842 if (opstate != FCPOP_STATE_ACTIVE )
18391843 atomic_set (& op -> state , opstate );
1840- else if (test_bit (FCCTRL_TERMIO , & ctrl -> flags ))
1844+ else if (test_bit (FCCTRL_TERMIO , & ctrl -> flags )) {
1845+ op -> flags |= FCOP_FLAGS_TERMIO ;
18411846 ctrl -> iocnt ++ ;
1847+ }
18421848 spin_unlock_irqrestore (& ctrl -> lock , flags );
18431849
18441850 if (opstate != FCPOP_STATE_ACTIVE )
@@ -1874,7 +1880,8 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
18741880
18751881 if (opstate == FCPOP_STATE_ABORTED ) {
18761882 spin_lock_irqsave (& ctrl -> lock , flags );
1877- if (test_bit (FCCTRL_TERMIO , & ctrl -> flags )) {
1883+ if (test_bit (FCCTRL_TERMIO , & ctrl -> flags ) &&
1884+ op -> flags & FCOP_FLAGS_TERMIO ) {
18781885 if (!-- ctrl -> iocnt )
18791886 wake_up (& ctrl -> ioabort_wait );
18801887 }
@@ -2314,7 +2321,7 @@ nvme_fc_create_hw_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize)
23142321 return 0 ;
23152322
23162323delete_queues :
2317- for (; i >= 0 ; i -- )
2324+ for (; i > 0 ; i -- )
23182325 __nvme_fc_delete_hw_queue (ctrl , & ctrl -> queues [i ], i );
23192326 return ret ;
23202327}
@@ -2433,7 +2440,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
24332440 return ;
24342441
24352442 dev_warn (ctrl -> ctrl .device ,
2436- "NVME-FC{%d}: transport association error detected : %s\n" ,
2443+ "NVME-FC{%d}: transport association event : %s\n" ,
24372444 ctrl -> cnum , errmsg );
24382445 dev_warn (ctrl -> ctrl .device ,
24392446 "NVME-FC{%d}: resetting controller\n" , ctrl -> cnum );
@@ -2446,15 +2453,20 @@ nvme_fc_timeout(struct request *rq, bool reserved)
24462453{
24472454 struct nvme_fc_fcp_op * op = blk_mq_rq_to_pdu (rq );
24482455 struct nvme_fc_ctrl * ctrl = op -> ctrl ;
2456+ struct nvme_fc_cmd_iu * cmdiu = & op -> cmd_iu ;
2457+ struct nvme_command * sqe = & cmdiu -> sqe ;
24492458
24502459 /*
2451- * we can't individually ABTS an io without affecting the queue,
2452- * thus killing the queue, and thus the association.
2453- * So resolve by performing a controller reset, which will stop
2454- * the host/io stack, terminate the association on the link,
2455- * and recreate an association on the link.
2460+ * Attempt to abort the offending command. Command completion
2461+ * will detect the aborted io and will fail the connection.
24562462 */
2457- nvme_fc_error_recovery (ctrl , "io timeout error" );
2463+ dev_info (ctrl -> ctrl .device ,
2464+ "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d w10/11: "
2465+ "x%08x/x%08x\n" ,
2466+ ctrl -> cnum , op -> queue -> qnum , sqe -> common .opcode ,
2467+ sqe -> connect .fctype , sqe -> common .cdw10 , sqe -> common .cdw11 );
2468+ if (__nvme_fc_abort_op (ctrl , op ))
2469+ nvme_fc_error_recovery (ctrl , "io timeout abort failed" );
24582470
24592471 /*
24602472 * the io abort has been initiated. Have the reset timer
@@ -2726,6 +2738,7 @@ nvme_fc_complete_rq(struct request *rq)
27262738 struct nvme_fc_ctrl * ctrl = op -> ctrl ;
27272739
27282740 atomic_set (& op -> state , FCPOP_STATE_IDLE );
2741+ op -> flags &= ~FCOP_FLAGS_TERMIO ;
27292742
27302743 nvme_fc_unmap_data (ctrl , rq , op );
27312744 nvme_complete_rq (rq );
@@ -2876,11 +2889,14 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl)
28762889 if (ret )
28772890 goto out_delete_hw_queues ;
28782891
2879- if (prior_ioq_cnt != nr_io_queues )
2892+ if (prior_ioq_cnt != nr_io_queues ) {
28802893 dev_info (ctrl -> ctrl .device ,
28812894 "reconnect: revising io queue count from %d to %d\n" ,
28822895 prior_ioq_cnt , nr_io_queues );
2883- blk_mq_update_nr_hw_queues (& ctrl -> tag_set , nr_io_queues );
2896+ nvme_wait_freeze (& ctrl -> ctrl );
2897+ blk_mq_update_nr_hw_queues (& ctrl -> tag_set , nr_io_queues );
2898+ nvme_unfreeze (& ctrl -> ctrl );
2899+ }
28842900
28852901 return 0 ;
28862902
@@ -3090,26 +3106,19 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
30903106 return ret ;
30913107}
30923108
3109+
30933110/*
3094- * This routine stops operation of the controller on the host side.
3095- * On the host os stack side: Admin and IO queues are stopped,
3096- * outstanding ios on them terminated via FC ABTS.
3097- * On the link side: the association is terminated.
3111+ * This routine runs through all outstanding commands on the association
3112+ * and aborts them. This routine is typically called by the
3113+ * delete_association routine. It is also called due to an error during
3114+ * reconnect. In that scenario, it is most likely a command that initializes
3115+ * the controller, including fabric Connect commands on io queues, that
3116+ * may have timed out or failed thus the io must be killed for the connect
3117+ * thread to see the error.
30983118 */
30993119static void
3100- nvme_fc_delete_association (struct nvme_fc_ctrl * ctrl )
3120+ __nvme_fc_abort_outstanding_ios (struct nvme_fc_ctrl * ctrl , bool start_queues )
31013121{
3102- struct nvmefc_ls_rcv_op * disls = NULL ;
3103- unsigned long flags ;
3104-
3105- if (!test_and_clear_bit (ASSOC_ACTIVE , & ctrl -> flags ))
3106- return ;
3107-
3108- spin_lock_irqsave (& ctrl -> lock , flags );
3109- set_bit (FCCTRL_TERMIO , & ctrl -> flags );
3110- ctrl -> iocnt = 0 ;
3111- spin_unlock_irqrestore (& ctrl -> lock , flags );
3112-
31133122 /*
31143123 * If io queues are present, stop them and terminate all outstanding
31153124 * ios on them. As FC allocates FC exchange for each io, the
@@ -3127,6 +3136,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
31273136 blk_mq_tagset_busy_iter (& ctrl -> tag_set ,
31283137 nvme_fc_terminate_exchange , & ctrl -> ctrl );
31293138 blk_mq_tagset_wait_completed_request (& ctrl -> tag_set );
3139+ if (start_queues )
3140+ nvme_start_queues (& ctrl -> ctrl );
31303141 }
31313142
31323143 /*
@@ -3143,13 +3154,34 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
31433154
31443155 /*
31453156 * clean up the admin queue. Same thing as above.
3146- * use blk_mq_tagset_busy_itr() and the transport routine to
3147- * terminate the exchanges.
31483157 */
31493158 blk_mq_quiesce_queue (ctrl -> ctrl .admin_q );
31503159 blk_mq_tagset_busy_iter (& ctrl -> admin_tag_set ,
31513160 nvme_fc_terminate_exchange , & ctrl -> ctrl );
31523161 blk_mq_tagset_wait_completed_request (& ctrl -> admin_tag_set );
3162+ }
3163+
3164+ /*
3165+ * This routine stops operation of the controller on the host side.
3166+ * On the host os stack side: Admin and IO queues are stopped,
3167+ * outstanding ios on them terminated via FC ABTS.
3168+ * On the link side: the association is terminated.
3169+ */
3170+ static void
3171+ nvme_fc_delete_association (struct nvme_fc_ctrl * ctrl )
3172+ {
3173+ struct nvmefc_ls_rcv_op * disls = NULL ;
3174+ unsigned long flags ;
3175+
3176+ if (!test_and_clear_bit (ASSOC_ACTIVE , & ctrl -> flags ))
3177+ return ;
3178+
3179+ spin_lock_irqsave (& ctrl -> lock , flags );
3180+ set_bit (FCCTRL_TERMIO , & ctrl -> flags );
3181+ ctrl -> iocnt = 0 ;
3182+ spin_unlock_irqrestore (& ctrl -> lock , flags );
3183+
3184+ __nvme_fc_abort_outstanding_ios (ctrl , false);
31533185
31543186 /* kill the aens as they are a separate path */
31553187 nvme_fc_abort_aen_ops (ctrl );
@@ -3263,22 +3295,27 @@ static void
32633295__nvme_fc_terminate_io (struct nvme_fc_ctrl * ctrl )
32643296{
32653297 /*
3266- * if state is connecting - the error occurred as part of a
3267- * reconnect attempt. The create_association error paths will
3268- * clean up any outstanding io.
3269- *
3270- * if it's a different state - ensure all pending io is
3271- * terminated. Given this can delay while waiting for the
3272- * aborted io to return, we recheck adapter state below
3273- * before changing state.
3298+ * if state is CONNECTING - the error occurred as part of a
3299+ * reconnect attempt. Abort any ios on the association and
3300+ * let the create_association error paths resolve things.
32743301 */
3275- if (ctrl -> ctrl .state != NVME_CTRL_CONNECTING ) {
3276- nvme_stop_keep_alive (& ctrl -> ctrl );
3277-
3278- /* will block will waiting for io to terminate */
3279- nvme_fc_delete_association (ctrl );
3302+ if (ctrl -> ctrl .state == NVME_CTRL_CONNECTING ) {
3303+ __nvme_fc_abort_outstanding_ios (ctrl , true);
3304+ return ;
32803305 }
32813306
3307+ /*
3308+ * For any other state, kill the association. As this routine
3309+ * is a common io abort routine for resetting and such, after
3310+ * the association is terminated, ensure that the state is set
3311+ * to CONNECTING.
3312+ */
3313+
3314+ nvme_stop_keep_alive (& ctrl -> ctrl );
3315+
3316+ /* will block while waiting for io to terminate */
3317+ nvme_fc_delete_association (ctrl );
3318+
32823319 if (ctrl -> ctrl .state != NVME_CTRL_CONNECTING &&
32833320 !nvme_change_ctrl_state (& ctrl -> ctrl , NVME_CTRL_CONNECTING ))
32843321 dev_err (ctrl -> ctrl .device ,
@@ -3403,7 +3440,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
34033440{
34043441 struct nvme_fc_ctrl * ctrl ;
34053442 unsigned long flags ;
3406- int ret , idx ;
3443+ int ret , idx , ctrl_loss_tmo ;
34073444
34083445 if (!(rport -> remoteport .port_role &
34093446 (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET ))) {
@@ -3429,6 +3466,19 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
34293466 goto out_free_ctrl ;
34303467 }
34313468
3469+ /*
3470+ * if ctrl_loss_tmo is being enforced and the default reconnect delay
3471+ * is being used, change to a shorter reconnect delay for FC.
3472+ */
3473+ if (opts -> max_reconnects != -1 &&
3474+ opts -> reconnect_delay == NVMF_DEF_RECONNECT_DELAY &&
3475+ opts -> reconnect_delay > NVME_FC_DEFAULT_RECONNECT_TMO ) {
3476+ ctrl_loss_tmo = opts -> max_reconnects * opts -> reconnect_delay ;
3477+ opts -> reconnect_delay = NVME_FC_DEFAULT_RECONNECT_TMO ;
3478+ opts -> max_reconnects = DIV_ROUND_UP (ctrl_loss_tmo ,
3479+ opts -> reconnect_delay );
3480+ }
3481+
34323482 ctrl -> ctrl .opts = opts ;
34333483 ctrl -> ctrl .nr_reconnects = 0 ;
34343484 if (lport -> dev )
0 commit comments