Hi Boris,
I appreciate your patience, sorry for the late
response (both of my kids and myself caught some bug
this week :( ).
I ran the receiver example program in ddd/gdb several
times. As in previous attempts, the sender program
runs very quickly to completion, and the receiver
lags behind until it stalls/hangs with the CPU usage
pegged. My DDD source window shows that the CPU
intensive thread is hanging in the following method:
-------
template <class EXT_ID, class INT_ID, class HASH_KEY,
class COMPARE_KEYS, class ACE_LOCK> ACE_INLINE int
ACE_Hash_Map_Manager_Ex<EXT_ID, INT_ID, HASH_KEY,
COMPARE_KEYS, ACE_LOCK>::equal (const EXT_ID &id1,
const EXT_ID &id2)
{
return this->compare_keys_ (id1, id2);
}
-------
Here is the stack output when I use export
LD_ASSUME_KERNEL=2.4.1 to set LWP/Linux threads. After
the sender completed and the receiver was pegging the
CPU, I did an interrupt in DDD. Also, I'm really not
sure why there are 5 threads here instead of 4--but
that is consistent when I switch to Linux threads.
Note that in this trace, Process PID 4249 is taking
almost all of the CPU:
0xb738b916 in nanosleep () from
/lib/i686/libpthread.so.0
(gdb) bt
#0 0xb738b916 in nanosleep () from
/lib/i686/libpthread.so.0
#1 0x00000000 in ?? ()
(gdb) thread 4
[Switching to thread 4 (Thread 32771 (LWP 4248))]#0
0xb738b916 in nanosleep () from
/lib/i686/libpthread.so.0
(gdb) thread 4
[Switching to thread 4 (Thread 32771 (LWP 4248))]#0
0xb738b916 in nanosleep () from
/lib/i686/libpthread.so.0
(gdb) thread 5
[Switching to thread 5 (Thread 49156 (LWP 4249))]#0
0xb75cb91a in ACE_Hash_Map_Manager_Ex<unsigned long
long, ACE_RMCast::Acknowledge::Descr,
ACE_Hash<unsigned long long>, ACE_Equal_To<unsigned
long long>, ACE_Null_Mutex>::equal (this=0xb0d8104,
id1=@0xb87ea80, id2=@0xb6d51cdc) at
Hash_Map_Manager_T.inl:80
(gdb) thread 5
[Switching to thread 5 (Thread 49156 (LWP 4249))]#0
0xb75cb91a in ACE_Hash_Map_Manager_Ex<unsigned long
long, ACE_RMCast::Acknowledge::Descr,
ACE_Hash<unsigned long long>, ACE_Equal_To<unsigned
long long>, ACE_Null_Mutex>::equal (this=0xb0d8104,
id1=@0xb87ea80, id2=@0xb6d51cdc) at
Hash_Map_Manager_T.inl:80
(gdb) bt
#0 0xb75cb91a in ACE_Hash_Map_Manager_Ex<unsigned
long long, ACE_RMCast::Acknowledge::Descr,
ACE_Hash<unsigned long long>, ACE_Equal_To<unsigned
long long>, ACE_Null_Mutex>::equal (this=0xb0d8104,
id1=@0xb87ea80, id2=@0xb6d51cdc) at
Hash_Map_Manager_T.inl:80
#1 0xb75caafa in ACE_Hash_Map_Manager_Ex<unsigned
long long, ACE_RMCast::Acknowledge::Descr,
ACE_Hash<unsigned long long>, ACE_Equal_To<unsigned
long long>, ACE_Null_Mutex>::shared_find
(this=0xb0d8104, ext_id=@0xb6d51cdc,
entry=@0xb6d51c04, loc=@0xb6d51c00) at
Hash_Map_Manager_T.cpp:304
#2 0xb75c9c68 in ACE_Hash_Map_Manager_Ex<unsigned
long long, ACE_RMCast::Acknowledge::Descr,
ACE_Hash<unsigned long long>, ACE_Equal_To<unsigned
long long>, ACE_Null_Mutex>::find_i (this=0xb0d8104,
ext_id=@0xb6d51cdc) at Hash_Map_Manager_T.inl:197
#3 0xb75c8657 in ACE_Hash_Map_Manager_Ex<unsigned
long long, ACE_RMCast::Acknowledge::Descr,
ACE_Hash<unsigned long long>, ACE_Equal_To<unsigned
long long>, ACE_Null_Mutex>::find (this=0xb0d8104,
ext_id=@0xb6d51cdc) at Hash_Map_Manager_T.inl:222
#4 0xb75c77f4 in ACE_RMCast::Acknowledge::track_queue
(this=0x8064df8, addr=@0xb0d80e8, q=@0xb0d8104,
msgs=@0xb6d51dc4) at Acknowledge.cpp:229
#5 0xb75c6fce in ACE_RMCast::Acknowledge::track
(this=0x8064df8) at Acknowledge.cpp:104
#6 0xb75c82c7 in ACE_RMCast::Acknowledge::track_thunk
(obj=0x8064df8) at Acknowledge.cpp:384
#7 0xb754d56d in ACE_Thread_Adapter::invoke_i
(this=0xb085490) at Thread_Adapter.cpp:149
#8 0xb754d4b0 in ACE_Thread_Adapter::invoke
(this=0xb085490) at Thread_Adapter.cpp:93
#9 0xb74cd3c2 in ace_thread_adapter (args=0xb085490)
at Base_Thread_Adapter.cpp:131
#10 0xb7385e21 in pthread_start_thread () from
/lib/i686/libpthread.so.0
#11 0xb7385fb5 in pthread_start_thread_event () from
/lib/i686/libpthread.so.0
#12 0xb723108a in clone () from /lib/i686/libc.so.6
(gdb) thread 4
[Switching to thread 4 (Thread 32771 (LWP 4248))]#0
0xb738b916 in nanosleep () from
/lib/i686/libpthread.so.0
(gdb) thread 4
[Switching to thread 4 (Thread 32771 (LWP 4248))]#0
0xb738b916 in nanosleep () from
/lib/i686/libpthread.so.0
(gdb) bt
#0 0xb738b916 in nanosleep () from
/lib/i686/libpthread.so.0
#1 0x00000000 in ?? ()
(gdb) thread 3
[Switching to thread 3 (Thread 16386 (LWP 4247))]#0
0xb722a5f1 in select () from /lib/i686/libc.so.6
(gdb) thread 3
[Switching to thread 3 (Thread 16386 (LWP 4247))]#0
0xb722a5f1 in select () from /lib/i686/libc.so.6
(gdb) bt
#0 0xb722a5f1 in select () from /lib/i686/libc.so.6
#1 0xb75a7920 in __JCR_LIST__ () from
/opt/ACE_wrappers/ace/libACE.so.5.4.7
#2 0xb738f5d0 in __pthread_last_event () from
/lib/i686/libpthread.so.0
#3 0x0b087a00 in ?? ()
#4 0x00000000 in ?? ()
(gdb) thread 2
[Switching to thread 2 (Thread 32769 (LWP 4246))]#0
0xb722838a in poll () from /lib/i686/libc.so.6
(gdb) bt
#0 0xb722838a in poll () from /lib/i686/libc.so.6
#1 0xb7384d5e in __pthread_manager () from
/lib/i686/libpthread.so.0
#2 0xb738502a in __pthread_manager_event () from
/lib/i686/libpthread.so.0
#3 0xb723108a in clone () from /lib/i686/libc.so.6
(gdb) thread 1
[Switching to thread 1 (Thread 16384 (LWP 4243))]#0
0xb7388074 in __pthread_sigsuspend () from
/lib/i686/libpthread.so.0
(gdb) bt
#0 0xb7388074 in __pthread_sigsuspend () from
/lib/i686/libpthread.so.0
#1 0xb7387a28 in __pthread_wait_for_restart_signal ()
from /lib/i686/libpthread.so.0
#2 0xb738407b in pthread_cond_wait@GLIBC_2.0 () from
/lib/i686/libpthread.so.0
#3 0xb75cc00a in cond_wait (cv=0x804fcf8,
external_mutex=0x804fcdc) at OS_NS_Thread.inl:390
#4 0xb75cb26d in
ACE_Condition<ACE_Thread_Mutex>::wait (this=0x804fcf8)
at Condition_T.cpp:158
#5 0xb75ddaa4 in ACE_RMCast::Socket_Impl::size_
(this=0x804fcb0, timeout=0x0) at Socket.cpp:239
#6 0xb75de236 in ACE_RMCast::Socket::size
(this=0xbffff080) at Socket.cpp:357
#7 0x08049a7f in main (argc=2, argv=0xbffff114) at
Receiver.cpp:75
(gdb)
-------------------
Here, in this stack output, I am using the default
NPTL threads. As before, I am allowing the sender
program to run to completion, and then while receiver
is pegging the CPU, I do a DDD interrupt and dump the
trace of each of the 4 threads. If I use 'top' to show
threads, the thread that is performing ACE_Hash_Map
equal() is the culprit (same as for Linux threads
test).
0xb73c71fb in pthread_cond_wait@@GLIBC_2.3.2 () from
/lib/tls/libpthread.so.0
(gdb) bt
#0 0xb73c71fb in pthread_cond_wait@@GLIBC_2.3.2 ()
from /lib/tls/libpthread.so.0
#1 0xb75cc00a in cond_wait (cv=0x804f520,
external_mutex=0x804f504) at OS_NS_Thread.inl:390
#2 0xb75cb26d in
ACE_Condition<ACE_Thread_Mutex>::wait (this=0x804f520)
at Condition_T.cpp:158
#3 0xb75ddaa4 in ACE_RMCast::Socket_Impl::size_
(this=0x804f4d8, timeout=0x0) at Socket.cpp:239
#4 0xb75de236 in ACE_RMCast::Socket::size
(this=0xbfffb9f0) at Socket.cpp:357
#5 0x08049a7f in main (argc=2, argv=0xbfffbac4) at
Receiver.cpp:75
(gdb) thread 2
[Switching to thread 2 (Thread -1223083088 (LWP
4036))]#0 0xb726a337 in ___newselect_nocancel () from
/lib/tls/libc.so.6
(gdb) thread 2
[Switching to thread 2 (Thread -1223083088 (LWP
4036))]#0 0xb726a337 in ___newselect_nocancel () from
/lib/tls/libc.so.6
(gdb) bt
#0 0xb726a337 in ___newselect_nocancel () from
/lib/tls/libc.so.6
#1 0xb74b5f31 in select (width=11, rfds=0xb71937cc,
wfds=0x0, efds=0x0, timeout=0xb7193988) at
OS_NS_sys_select.inl:44
#2 0xb7540fab in ACE_SOCK_Dgram::recv
(this=0xb084a8c, buf=0xb0c5cc0, n=4, addr=@0xb7193990,
flags=2, timeout=0xb7193988) at SOCK_Dgram.cpp:446
#3 0xb75d4651 in ACE_RMCast::Link::recv
(this=0xb084a40) at Link.cpp:201
#4 0xb75d53fb in ACE_RMCast::Link::recv_thunk
(obj=0xb084a40) at Link.cpp:324
#5 0xb754d56d in ACE_Thread_Adapter::invoke_i
(this=0xb084cb8) at Thread_Adapter.cpp:149
#6 0xb754d4b0 in ACE_Thread_Adapter::invoke
(this=0xb084cb8) at Thread_Adapter.cpp:93
#7 0xb74cd3c2 in ace_thread_adapter (args=0xb084cb8)
at Base_Thread_Adapter.cpp:131
#8 0xb73c4dec in start_thread () from
/lib/tls/libpthread.so.0
#9 0xb7270e8a in clone () from /lib/tls/libc.so.6
(gdb) thread 3
[Switching to thread 3 (Thread -1233572944 (LWP
4037))]#0 0xb73c73b6 in
pthread_cond_timedwait@@GLIBC_2.3.2 () from
/lib/tls/libpthread.so.0
(gdb) bt
#0 0xb73c73b6 in pthread_cond_timedwait@@GLIBC_2.3.2
() from /lib/tls/libpthread.so.0
#1 0xb75cc0a7 in cond_timedwait (cv=0xb079848,
external_mutex=0xb07982c, timeout=0xb67929a0) at
OS_NS_Thread.inl:455
#2 0xb75ca143 in
ACE_Condition<ACE_Thread_Mutex>::wait (this=0xb079848,
mutex=@0xb07982c, abstime=0xb67929a0) at
Condition_T.cpp:177
#3 0xb75c8d94 in
ACE_Condition<ACE_Thread_Mutex>::wait (this=0xb079848,
abstime=0xb67929a0) at Condition_T.cpp:191
#4 0xb75da408 in ACE_RMCast::Retransmit::track
(this=0xb079800) at Retransmit.cpp:144
#5 0xb75da2e7 in ACE_RMCast::Retransmit::track_thunk
(obj=0xb079800) at Retransmit.cpp:112
#6 0xb754d56d in ACE_Thread_Adapter::invoke_i
(this=0xb084d98) at Thread_Adapter.cpp:149
#7 0xb754d4b0 in ACE_Thread_Adapter::invoke
(this=0xb084d98) at Thread_Adapter.cpp:93
#8 0xb74cd3c2 in ace_thread_adapter (args=0xb084d98)
at Base_Thread_Adapter.cpp:131
#9 0xb73c4dec in start_thread () from
/lib/tls/libpthread.so.0
#10 0xb7270e8a in clone () from /lib/tls/libc.so.6
(gdb) thread 4
[Switching to thread 4 (Thread -1244062800 (LWP
4038))]#0 0xb75cb91a in
ACE_Hash_Map_Manager_Ex<unsigned long long,
ACE_RMCast::Acknowledge::Descr, ACE_Hash<unsigned long
long>, ACE_Equal_To<unsigned long long>,
ACE_Null_Mutex>::equal (this=0xb0d44e4,
id1=@0xb6e0120, id2=@0xb5d918c8) at
Hash_Map_Manager_T.inl:80
(gdb) thread 4
[Switching to thread 4 (Thread -1244062800 (LWP
4038))]#0 0xb75cb91a in
ACE_Hash_Map_Manager_Ex<unsigned long long,
ACE_RMCast::Acknowledge::Descr, ACE_Hash<unsigned long
long>, ACE_Equal_To<unsigned long long>,
ACE_Null_Mutex>::equal (this=0xb0d44e4,
id1=@0xb6e0120, id2=@0xb5d918c8) at
Hash_Map_Manager_T.inl:80
(gdb) bt
#0 0xb75cb91a in ACE_Hash_Map_Manager_Ex<unsigned
long long, ACE_RMCast::Acknowledge::Descr,
ACE_Hash<unsigned long long>, ACE_Equal_To<unsigned
long long>, ACE_Null_Mutex>::equal (this=0xb0d44e4,
id1=@0xb6e0120, id2=@0xb5d918c8) at
Hash_Map_Manager_T.inl:80
#1 0xb75caafa in ACE_Hash_Map_Manager_Ex<unsigned
long long, ACE_RMCast::Acknowledge::Descr,
ACE_Hash<unsigned long long>, ACE_Equal_To<unsigned
long long>, ACE_Null_Mutex>::shared_find
(this=0xb0d44e4, ext_id=@0xb5d918c8,
entry=@0xb5d917f0, loc=@0xb5d917ec) at
Hash_Map_Manager_T.cpp:304
#2 0xb75c9c68 in ACE_Hash_Map_Manager_Ex<unsigned
long long, ACE_RMCast::Acknowledge::Descr,
ACE_Hash<unsigned long long>, ACE_Equal_To<unsigned
long long>, ACE_Null_Mutex>::find_i (this=0xb0d44e4,
ext_id=@0xb5d918c8) at Hash_Map_Manager_T.inl:197
#3 0xb75c8657 in ACE_Hash_Map_Manager_Ex<unsigned
long long, ACE_RMCast::Acknowledge::Descr,
ACE_Hash<unsigned long long>, ACE_Equal_To<unsigned
long long>, ACE_Null_Mutex>::find (this=0xb0d44e4,
ext_id=@0xb5d918c8) at Hash_Map_Manager_T.inl:222
#4 0xb75c77f4 in ACE_RMCast::Acknowledge::track_queue
(this=0x8064620, addr=@0xb0d44c8, q=@0xb0d44e4,
msgs=@0xb5d919b0) at Acknowledge.cpp:229
#5 0xb75c6fce in ACE_RMCast::Acknowledge::track
(this=0x8064620) at Acknowledge.cpp:104
#6 0xb75c82c7 in ACE_RMCast::Acknowledge::track_thunk
(obj=0x8064620) at Acknowledge.cpp:384
#7 0xb754d56d in ACE_Thread_Adapter::invoke_i
(this=0xb084e78) at Thread_Adapter.cpp:149
#8 0xb754d4b0 in ACE_Thread_Adapter::invoke
(this=0xb084e78) at Thread_Adapter.cpp:93
#9 0xb74cd3c2 in ace_thread_adapter (args=0xb084e78)
at Base_Thread_Adapter.cpp:131
#10 0xb73c4dec in start_thread () from
/lib/tls/libpthread.so.0
#11 0xb7270e8a in clone () from /lib/tls/libc.so.6
(gdb)
-------------------
thanks,
stephen
--- Boris Kolpackov <boris@...> wrote:
> Stephen,
>
> Stephen Horton <shorton3@...> writes:
>
> > I have done 'make clean' and recompiled ACE lib,
> > RMCast lib, and the protocol examples with debug=1
> > optimize=0 exceptions=1 threads=1. I have also set
> > LD_ASSUME_KERNEL=2.4.1 as you suggested to go back
> to
> > linux threads.
>
> I tried to figure out what was going on but without
> any success so far.
> From the thread stacks you sent, three of them do
> not appear to be from
> this process (23308, 23309, 23310, o maybe they are
> some worker thread
> from LinuxThreads lib). Also two other threads that
> should be there are
> missing (RMCast creates 3 threads for each socket +
> one for main()).
>
> So I was wondering if you could do the following:
>
> 1. Try to figure out which thread is using the CPU.
>
> 2. Send stack traces for both NPTL and LinuxThreads
> (everything
> compiled with debug info).
>
> 3. Try to dump stack several times and see if the
> threads are
> stuck/spinning at the same place.
>
> thanks,
> -boris
>
>
> > Here is the back trace for each of the 5 light
> weight
> > process threads:
> >
> > (gdb) attach 23307
> > 0xb7388074 in __pthread_sigsuspend () from
> > /lib/i686/libpthread.so.0
> > (gdb) bt
> > #0 0xb7388074 in __pthread_sigsuspend () from
> > /lib/i686/libpthread.so.0
> > #1 0xb7387a28 in
> __pthread_wait_for_restart_signal ()
> > from /lib/i686/libpthread.so.0
> > #2 0xb738407b in pthread_cond_wait@GLIBC_2.0 ()
> from
> > /lib/i686/libpthread.so.0
> > #3 0xb75cc00a in cond_wait (cv=0x804fcf8,
> > external_mutex=0x804fcdc) at OS_NS_Thread.inl:390
> > #4 0xb75cb26d in
> > ACE_Condition<ACE_Thread_Mutex>::wait
> (this=0x804fcf8)
> > at Condition_T.cpp:158
> > #5 0xb75ddaa4 in ACE_RMCast::Socket_Impl::size_
> > (this=0x804fcb0, timeout=0x0) at Socket.cpp:239
> > #6 0xb75de236 in ACE_RMCast::Socket::size
> > (this=0xbfffc440) at Socket.cpp:357
> > #7 0x08049a7f in main (argc=2, argv=0xbfffc4d4)
> at
> > Receiver.cpp:75
> > (gdb) detach
> > (gdb) attach 23308
> > 0xb722838a in poll () from /lib/i686/libc.so.6
> > (gdb) bt
> > #0 0xb722838a in poll () from /lib/i686/libc.so.6
> > #1 0xb7384d5e in __pthread_manager () from
> > /lib/i686/libpthread.so.0
> > #2 0xb723108a in clone () from
> /lib/i686/libc.so.6
> > (gdb) detach
> > (gdb) attach 23309
> > 0xb722a5f1 in select () from /lib/i686/libc.so.6
> > (gdb) bt
> > #0 0xb722a5f1 in select () from
> /lib/i686/libc.so.6
> > #1 0xb75a7920 in __JCR_LIST__ () from
> > /opt/ACE_wrappers/ace/libACE.so.5.4.7
> > #2 0xb738f5d0 in __pthread_last_event () from
> > /lib/i686/libpthread.so.0
> > #3 0x0b087a00 in ?? ()
> > #4 0x00000000 in ?? ()
> > (gdb) detach
> > (gdb) attach 23310
> > 0xb738b916 in nanosleep () from
> > /lib/i686/libpthread.so.0
> > (gdb) bt
> > #0 0xb738b916 in nanosleep () from
> > /lib/i686/libpthread.so.0
> > #1 0x00000000 in ?? ()
> > (gdb) detach
> > (gdb) attach 23311
> > 0xb75caacb in ACE_Hash_Map_Manager_Ex<unsigned
> long
> > long, ACE_RMCast::Acknowledge::Descr,
> > ACE_Hash<unsigned long long>,
> ACE_Equal_To<unsigned
> > long long>, ACE_Null_Mutex>::shared_find
> > (this=0xb0d810c, ext_id=@0xb6d51d1c,
> > entry=@0xb6d51c44, loc=@0xb6d51c40) at
> > Hash_Map_Manager_T.cpp:302
> > (gdb) bt
> > #0 0xb75caacb in ACE_Hash_Map_Manager_Ex<unsigned
> > long long, ACE_RMCast::Acknowledge::Descr,
> > ACE_Hash<unsigned long long>,
> ACE_Equal_To<unsigned
> > long long>, ACE_Null_Mutex>::shared_find
> > (this=0xb0d810c, ext_id=@0xb6d51d1c,
> > entry=@0xb6d51c44, loc=@0xb6d51c40) at
> > Hash_Map_Manager_T.cpp:302
> > #1 0xb75c9c68 in ACE_Hash_Map_Manager_Ex<unsigned
> > long long, ACE_RMCast::Acknowledge::Descr,
> > ACE_Hash<unsigned long long>,
> ACE_Equal_To<unsigned
> > long long>, ACE_Null_Mutex>::find_i
> (this=0xb0d810c,
> > ext_id=@0xb6d51d1c) at Hash_Map_Manager_T.inl:197
> > #2 0xb75c8657 in ACE_Hash_Map_Manager_Ex<unsigned
> > long long, ACE_RMCast::Acknowledge::Descr,
> > ACE_Hash<unsigned long long>,
> ACE_Equal_To<unsigned
> > long long>, ACE_Null_Mutex>::find (this=0xb0d810c,
> > ext_id=@0xb6d51d1c) at Hash_Map_Manager_T.inl:222
> > #3 0xb75c77f4 in
> ACE_RMCast::Acknowledge::track_queue
> > (this=0x8064df8, addr=@0xb0d80f0, q=@0xb0d810c,
> > msgs=@0xb6d51e04) at Acknowledge.cpp:229
> > #4 0xb75c6fce in ACE_RMCast::Acknowledge::track
> > (this=0x8064df8) at Acknowledge.cpp:104
> > #5 0xb75c82c7 in
> ACE_RMCast::Acknowledge::track_thunk
> > (obj=0x8064df8) at Acknowledge.cpp:384
> > #6 0xb754d56d in ACE_Thread_Adapter::invoke_i
> > (this=0xb085490) at Thread_Adapter.cpp:149
> > #7 0xb754d4b0 in ACE_Thread_Adapter::invoke
> > (this=0xb085490) at Thread_Adapter.cpp:93
> > #8 0xb74cd3c2 in ace_thread_adapter
> (args=0xb085490)
> > at Base_Thread_Adapter.cpp:131
> > #9 0xb7385e21 in pthread_start_thread () from
> > /lib/i686/libpthread.so.0
> > #10 0xb723108a in clone () from
> /lib/i686/libc.so.6
> > (gdb)
>
__________________________________
Yahoo! Mail - PC Magazine Editors' Choice 2005
http://mail.yahoo.com