Skip to content

Commit d9a7295

Browse files
committed
Fix for rare deadlock with P2P callbacks.
When looking up connections by their handle, take both the table and the connection lock using a timeout. If we fail, release both locks and start all over. This will proptect against a deadlock if we take the locks in the opposite order. We currently don't ever encounter this situation solely within this library. However, we did come across a case with P2P callbacks calling into a custom signal handler. We could probably fix the custom signal handler to not do this, but it seems like there are going to be other edge cases, and this change is low risk because contention is low and most lock attempts will succeed immediately. And deadlocks are very hard to reproduce and debug. CR:saml P4:7307214
1 parent a581422 commit d9a7295

2 files changed

Lines changed: 60 additions & 35 deletions

File tree

src/steamnetworkingsockets/clientlib/csteamnetworkingsockets.cpp

Lines changed: 58 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -265,29 +265,43 @@ static CSteamNetworkConnectionBase *InternalGetConnectionByHandle( HSteamNetConn
265265
{
266266
if ( sock == 0 )
267267
return nullptr;
268-
TableScopeLock tableScopeLock( g_tables_lock );
269-
int idx = g_mapConnections.Find( uint16( sock ) );
270-
if ( idx == g_mapConnections.InvalidIndex() )
271-
return nullptr;
272-
CSteamNetworkConnectionBase *pResult = g_mapConnections[ idx ];
273-
if ( !pResult )
274-
{
275-
AssertMsg( false, "g_mapConnections corruption!" );
276-
return nullptr;
277-
}
278-
if ( uint16( pResult->m_hConnectionSelf ) != uint16( sock ) )
279-
{
280-
AssertMsg( false, "Connection map corruption!" );
281-
return nullptr;
282-
}
283268

284-
// Make sure connection is not in the process of being self-destructed
285-
bool bLocked = false;
269+
// Take locks by "trying" and using a short timeout. The table lock
270+
// should only be held by any one thread for an extremely short period
271+
// of time, so if we end up waiting/looping here, it should be blocked
272+
// on the connection lock. In general we should very seldom need
273+
// to actually loop here, but doing this protects against a deadlock,
274+
// because in rare situations we do need to take the locks in the
275+
// opposite order.
286276
for (;;)
287277
{
288278

279+
// First take the table lock
280+
TableScopeLock tableScopeLock;
281+
if ( !tableScopeLock.TryLock( g_tables_lock, 1, nullptr ) )
282+
continue;
283+
284+
int idx = g_mapConnections.Find( uint16( sock ) );
285+
if ( idx == g_mapConnections.InvalidIndex() )
286+
break;
287+
CSteamNetworkConnectionBase *pResult = g_mapConnections[ idx ];
288+
if ( !pResult )
289+
{
290+
AssertMsg( false, "g_mapConnections corruption!" );
291+
break;
292+
}
293+
if ( uint16( pResult->m_hConnectionSelf ) != uint16( sock ) )
294+
{
295+
AssertMsg( false, "Connection map corruption!" );
296+
break;
297+
}
298+
289299
// Fetch the state of the connection. This is OK to do
290-
// even if we don't have the lock.
300+
// even if we don't have the lock. If the connection
301+
// is already dead we can avoid even trying to take
302+
// the lock. That's good because cleaning up is one
303+
// of the rare cases where we take locks in the opposite
304+
// order, so we want to avoid that.
291305
ESteamNetworkingConnectionState s = pResult->GetState();
292306
if ( s == k_ESteamNetworkingConnectionState_Dead )
293307
break;
@@ -297,25 +311,36 @@ static CSteamNetworkConnectionBase *InternalGetConnectionByHandle( HSteamNetConn
297311
break;
298312
}
299313

300-
// Have we locked already? Then we're good
301-
if ( bLocked )
302-
{
303-
// NOTE: We unlock the table lock here, OUT OF ORDER!
304-
return pResult;
305-
}
306-
307314
// State looks good, try to lock the connection.
308315
// NOTE: we still (briefly) hold the table lock!
309-
// We *should* be able to totally block here
310-
// without creating a deadlock, but looping here
311-
// isn't so bad
312-
bLocked = scopeLock.TryLock( *pResult->m_pLock, 5, pszLockTag );
313-
}
316+
// Taking the locks in this order (table, then connection)
317+
// is the most common case by far, but in very rare
318+
// cases we might need to take them in the opposite
319+
// order, hence the loop
320+
if ( !scopeLock.TryLock( *pResult->m_pLock, 1, pszLockTag ) )
321+
continue;
314322

315-
// Connection found in table, but should not be returned to the caller.
316-
// Unlock the connection, if we locked it
317-
if ( bLocked )
323+
// Re-check the connection state, in case something changed
324+
// while we were waiting on the lock
325+
s = pResult->GetState();
326+
if ( s != k_ESteamNetworkingConnectionState_Dead )
327+
{
328+
if ( !bForAPI || BConnectionStateExistsToAPI( s ) )
329+
{
330+
// NOTE: Exiting this scope will invoke TableScopeLock destructor,
331+
// which will unlock the table lock here, OUT OF ORDER of the order
332+
// that we took the locks. That's intentional! We don't need that
333+
// lock anymore, we have locked the connection that we want.
334+
return pResult;
335+
}
336+
}
337+
338+
// Connection found in table, but should not be returned to the caller.
339+
// Go ahead and immediately unlock the connection lock, the caller should not
340+
// need it.
318341
scopeLock.Unlock();
342+
break;
343+
}
319344

320345
return nullptr;
321346
}

src/steamnetworkingsockets/clientlib/steamnetworkingsockets_connections.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1110,8 +1110,8 @@ extern CUtlHashMap<uint16, CSteamNetworkConnectionBase *, std::equal_to<uint16>,
11101110
extern CUtlHashMap<int, CSteamNetworkPollGroup *, std::equal_to<int>, Identity<int> > g_mapPollGroups;
11111111

11121112
// All of the tables above are projected by the same lock, since we expect to only access it briefly
1113-
struct TableLock : Lock<RecursiveMutexImpl> {
1114-
TableLock() : Lock<RecursiveMutexImpl>( "table", LockDebugInfo::k_nFlag_Table ) {}
1113+
struct TableLock : Lock<RecursiveTimedMutexImpl> {
1114+
TableLock() : Lock<RecursiveTimedMutexImpl>( "table", LockDebugInfo::k_nFlag_Table ) {}
11151115
};
11161116
using TableScopeLock = ScopeLock<TableLock>;
11171117
extern TableLock g_tables_lock;

0 commit comments

Comments
 (0)