Libpjsip.so crashed

Hi

Asterisk sometimes crashed at libasteriskpj.so.2.
There is no problem after restarting with safe_asterisk.
It seems to occasionally crash when 30 calls are concentrated at the same time.
Rather than occur every day, the frequency is once every 7 to 10 days.

Feb 22 10:03:30 asterisk kernel: [25378.044530] asterisk[10977]: segfault at 7f3b2e573918 ip 00007f3c79ba32c7 sp 00007f3c0be64390 error 4 in libasteriskpj.so.2[7f3c79a8e000+15a000]

OS is Amazon Linux 4.4.5-15.26.amzn1.x86_64、
The version of Asterisk is 16.2.0, using bundled PJSIP.

It is optimized because it is a commercial environment, but I attach backtrace obtained by ast_coredumper.

!@!@!@! thread1.txt !@!@!@!

$1 = {si_signo = 11, si_errno = 0, si_code = 1, _sifields = {_pad = {777468184, 32571, 0 <repeats 26 times>}, _kill = {si_pid = 777468184, si_uid = 32571}, _timer = {si_tid = 777468184, si_overrun = 32571, si_sigval = {sival_int = 0, sival_ptr = 0x0}}, _rt = {si_pid = 777468184, si_uid = 32571, si_sigval = {sival_int = 0, sival_ptr = 0x0}}, _sigchld = {si_pid = 777468184, si_uid = 32571, si_status = 0, si_utime = 0, si_stime = 0}, _sigfault = {si_addr = 0x7f3b2e573918}, _sigpoll = {si_band = 139892157266200, si_fd = 0}}}
Signal        Stop      Print   Pass to program Description
SIGSEGV       Yes       Yes     Yes             Segmentation fault

Thread 1 (Thread 0x7f3c0be65700 (LWP 10977)):
#0  0x00007f3c79ba32c7 in pop_freelist (ht=<optimized out>, ht=<optimized out>) at ../src/pj/timer.c:150
        new_id = -1172499704
#1  schedule_entry (future_time=0x7f3c0be643b0, entry=0x7f3c45f456d0, ht=0x1b5ca18) at ../src/pj/timer.c:313
No locals.
#2  schedule_w_grp_lock (ht=0x1b5ca18, entry=entry@entry=0x7f3c45f456d0, delay=delay@entry=0x7f3c0be64380, set_id=set_id@entry=1, id_val=id_val@entry=1, grp_lock=0x7f3c45f1bef8) at ../src/pj/timer.c:515
        expires = {sec = 25388, msec = 44}
#3  0x00007f3c79ba39af in schedule_w_grp_lock (grp_lock=<optimized out>, id_val=id_val@entry=1, set_id=1, delay=delay@entry=0x7f3c0be64380, entry=entry@entry=0x7f3c45f456d0, ht=<optimized out>) at ../src/pj/timer.c:505
No locals.
#4  pj_timer_heap_schedule_w_grp_lock (ht=<optimized out>, entry=entry@entry=0x7f3c45f456d0, delay=delay@entry=0x7f3c0be64420, id_val=id_val@entry=1, grp_lock=<optimized out>) at ../src/pj/timer.c:568
No locals.
#5  0x00007f3c79b3e3fa in pj_stun_session_send_msg (sess=0x7f3c45b16598, token=0x7f3c45f45810, cache_res=cache_res@entry=1, retransmit=retransmit@entry=1, server=server@entry=0x7f3c0be64760, addr_len=addr_len@entry=16, tdata=0x7f3c45f45618) at ../src/pjnath/stun_session.c:1024
        timeout = {sec = 10, msec = 0}
        status = <optimized out>
        cache_res = 1
        token = 0x7f3c45f45810
        sess = 0x7f3c45b16598
        tdata = <optimized out>
        addr_len = 16
        server = 0x7f3c0be64760
        retransmit = 1
#6  0x00007f3c79b317c9 in on_stun_rx_request (sess=<optimized out>, pkt=<optimized out>, pkt_len=<optimized out>, rdata=<optimized out>, token=<optimized out>, src_addr=0x7f3c0be64760, src_addr_len=16) at ../src/pjnath/ice_session.c:2666
        sd = 0x7f3c45460e98
        msg = <optimized out>
        ice = 0x7f3c45f1e178
        prio_attr = 0x7f3c45f18d80
        uc_attr = 0x0
        role_attr = 0x7f3c45f18d70
        tdata = 0x7f3c45f45618
        rcheck = <optimized out>
        tmp_rcheck = {prev = 0x0, next = 0x6f3f6a6c00000000, comp_id = 1701341246, transport_id = 1832796015, src_addr = {addr = {sa_family = 27247}, ipv4 = {sin_family = 27247, sin_port = 25662, sin_addr = {s_addr = 1044265326}, sin_zero = "hk9oijih"}, ipv6 = {sin6_family = 27247, sin6_port = 25662, sin6_flowinfo = 1044265326, sin6_addr = {s6_addr = "hk9oijih??9l\\", u6_addr32 = {1866034024, 1751738985, 1815691071, 1549556828}}, sin6_scope_id = 1549556828}}, src_addr_len = 1549556828, use_candidate = 1549556828, priority = 1549556828, role_attr = 0x5c5c5c5c5c5c5c5c}
        source_addr = 0x7f3c0be64760
        source_addr_len = 16
        status = <optimized out>
#7  0x00007f3c79b3efb2 in on_incoming_indication (tmp_pool=<optimized out>, src_addr_len=<optimized out>, src_addr=<optimized out>, msg=<optimized out>, in_pkt_len=<optimized out>, in_pkt=<optimized out>, token=<optimized out>, sess=<optimized out>) at ../src/pjnath/stun_session.c:1390
No locals.
#8  pj_stun_session_on_rx_pkt (sess=<optimized out>, packet=packet@entry=0x7f3c45b13ad0, pkt_size=pkt_size@entry=124, options=options@entry=1, token=token@entry=0x7f3c45f20140, parsed_len=parsed_len@entry=0x0, src_addr=src_addr@entry=0x7f3c0be64760, src_addr_len=src_addr_len@entry=16) at ../src/pjnath/stun_session.c:1495
        msg = 0x7f3c45f18c78
        response = 0x0
        status = <optimized out>
#9  0x00007f3c79b33913 in pj_ice_sess_on_rx_pkt (ice=0x7f3c45f1e178, comp_id=comp_id@entry=1, transport_id=transport_id@entry=0, pkt=pkt@entry=0x7f3c45b13ad0, pkt_size=pkt_size@entry=124, src_addr=src_addr@entry=0x7f3c0be64760, src_addr_len=16) at ../src/pjnath/ice_session.c:3082
        status = <optimized out>
        msg_data = 0x7f3c45f20140
        i = <optimized out>
#10 0x00007f3c2b5cded3 in __rtp_recvfrom (instance=instance@entry=0x7f3c45f174b0, buf=buf@entry=0x7f3c45b13ad0, sa=sa@entry=0x7f3c0be64940, rtcp=rtcp@entry=0, flags=0, size=8192) at res_rtp_asterisk.c:2930
        combined = {ptr = 0x7f3c45fbd5b8 "x.x.x.x:63137", slen = 19}
        address = {addr = {sa_family = 2}, ipv4 = {sin_family = 2, sin_port = 41462, sin_addr = {s_addr = 4029552246}, sin_zero = "000000000000000000000"}, ipv6 = {sin6_family = 2, sin6_port = 41462, sin6_flowinfo = 4029552246, sin6_addr = {s6_addr = "000000000000000000000000001000000000063060000", u6_addr32 = {0, 0, 1, 12339}}, sin6_scope_id = 255}}
        status = <optimized out>
        ice = 0x7f3c45f1b598
        len = <optimized out>
        rtp = 0x7f3c45b13a00
        in = 0x7f3c45b13ad0 ""
        loop = <optimized out>
#11 0x00007f3c2b5d794f in rtp_recvfrom (sa=0x7f3c0be64940, flags=0, size=8192, buf=0x7f3c45b13ad0, instance=0x7f3c45f174b0) at res_rtp_asterisk.c:2974
No locals.
#12 ast_rtp_read (instance=0x7f3c45f174b0, rtcp=0) at res_rtp_asterisk.c:6594
        rtp = 0x7f3c45b13a00
        srtp = <optimized out>
        child = 0x0
        addr = {ss = {ss_family = 2, __ss_align = 0, __ss_padding = "177I346v<177000000262311E000000000000000360336e000000000000000%310E000000000000000251310e000000000000000215Vb000000000000000340032b000000000000000001000000000000000000000262241e", '000' <repeats 13 times>, "340032b000000000000000003000000000<177000000060 213001000000000000262311E000000000000"}, len = 16}
        version = <optimized out>
        payloadtype = <optimized out>
        mark = <optimized out>
        read_area = 0x7f3c45b13ad0 ""
        rtpheader = 0x7f3c45b13ad0
        seqno = <optimized out>
        prev_seqno = <optimized out>
        remote_address = {ss = {ss_family = 0, __ss_align = 0, __ss_padding = '000' <repeats 111 times>}, len = 0}
        frames = <optimized out>
        frame = <optimized out>
        __PRETTY_FUNCTION__ = "ast_rtp_read"
#13 0x0000000000539066 in ast_rtp_instance_read (instance=0x7f3c45f174b0, rtcp=rtcp@entry=0) at rtp_engine.c:568
        frame = <optimized out>
        __PRETTY_FUNCTION__ = "ast_rtp_instance_read"
#14 0x00007f3c18403088 in media_session_rtp_read_callback (session=<optimized out>, session_media=0x7f3c44566f48) at res_pjsip_sdp_rtp.c:1832
        f = <optimized out>
#15 0x00007f3c171c2d5f in chan_pjsip_read_stream (ast=0x7f3c45f165f0) at chan_pjsip.c:821
        channel = 0x7f3c441880b0
        session = 0x7f3c44903e70
        callback_state = 0x7f3c45e28a80
        f = <optimized out>
        fdno = <optimized out>
        __PRETTY_FUNCTION__ = "chan_pjsip_read_stream"
#16 0x00000000004ac7a6 in __ast_read (chan=0x7f3c45f165f0, dropaudio=dropaudio@entry=0, dropnondefault=dropnondefault@entry=0) at channel.c:3674
        f = 0x0
        prestate = 6
        cause = 0
        stream = 0x0
        default_stream = 0x0
        __PRETTY_FUNCTION__ = "__ast_read"
#17 0x00000000004ae119 in ast_read_stream (chan=<optimized out>) at channel.c:4176
No locals.
#18 0x000000000048637c in bridge_handle_trip (bridge_channel=0x7f3c45f1c8e0) at bridge_channel.c:2545
        frame = <optimized out>
        blocked = <optimized out>
#19 bridge_channel_wait (bridge_channel=0x7f3c45f1c8e0) at bridge_channel.c:2760
        ms = -1
        outfd = -99999
        chan = 0x7f3c45f165f0
#20 bridge_channel_internal_join (bridge_channel=bridge_channel@entry=0x7f3c45f1c8e0) at bridge_channel.c:2911
        res = 0
        indicate_src_change = <optimized out>
        channel_features = <optimized out>
        swap = 0x0
        __PRETTY_FUNCTION__ = "bridge_channel_internal_join"
#21 0x000000000046e45e in bridge_channel_ind_thread (data=data@entry=0x7f3c45f1c8e0) at bridge.c:1816
        bridge_channel = 0x7f3c45f1c8e0
        chan = <optimized out>
        __PRETTY_FUNCTION__ = "bridge_channel_ind_thread"
#22 0x000000000059ac39 in dummy_start (data=<optimized out>) at utils.c:1249
        __cancel_buf = {__cancel_jmp_buf = {{__cancel_jmp_buf = {139896839771216, -5875437358096595964, 0, 139895874410944, 139895874410240, 507904, 5768643927764091908, -5875437842218761212}, __mask_was_saved = 0}}, __pad = {0x7f3c0be64ef0, 0x0, 0x0, 0x0}}
        __cancel_arg = 0x7f3c0be65700
        __not_first_call = <optimized out>
        ret = <optimized out>
        a = {start_routine = 0x46e440 <bridge_channel_ind_thread>, data = 0x7f3c45f1c8e0, name = 0x7f3c45709450 "bridge_channel_ind_thread started at [ 1908] bridge.c bridge_impart_internal()"}
        __PRETTY_FUNCTION__ = "dummy_start"
#23 0x00007f3c78141dc5 in start_thread () from /lib64/libpthread.so.0
No symbol table info available.
#24 0x00007f3c76fdac9d in clone () from /lib64/libc.so.6
No symbol table info available.

I found a similar back trace, but are there any relevance?
https://issues.asterisk.org/jira/browse/ASTERISK-28161

Please tell me if you have solutions or workarounds. Thank you.

You compiling asterisk disabled flag BUILD_NATIVE ?
I am the same problem, but i am use chan_sip.
Today i recompile asterisk with --disable BUILD_NATIVE option, but is crash now.
I am use CentOS 7 AMI in Amazon EC2.

No, I compiled without --disabled BUILD_NATIVE.

The BUILD_NATIVE problem with some virtual machines results in an Illegal Instruction trap, not a a segmentation violation one.

The performance penalty for disabling compiler optimisation is not very high, and with it disabled, people are not going to put much effort into analysing the dump. (I’m personally not familiar with chan_psjip internals, so probably counldn’t analyse it, anyway.

You mentioned you have the same problem but using chan_sip? Please elaborate: Are you getting libasteriskpj.so.2 segmentation faults using chan_sip, not chan_pjsip?

I got similar situation.
It seems to happen occasionally like once every 3 days and when registration process is concentrated.

segfault at 7feae2d3c4c8 ip 00007feb73d8d7c6 sp 00007feadef45d70 error 4 in libasteriskpj.so.2

Asterisk version 16.2.1 using bundled PJSIP.

coredump show

!@!@!@! thread1.txt !@!@!@!

$1 = {si_signo = 11, si_errno = 0, si_code = 2, _sifields = {_pad = {-393983880, 32617, 0 <repeats 26 times>}, _kill = {si_pid = -393983880, si_uid = 32617}, _timer = {si_tid = -393983880, si_overrun = 32617, si_sigval = {sival_int = 0, sival_ptr = 0x0}}, _rt = {si_pid = -393983880, si_uid = 32617, si_sigval = {sival_int = 0, sival_ptr = 0x0}}, _sigchld = {si_pid = -393983880, si_uid = 32617, si_status = 0, si_utime = 0, si_stime = 0}, _sigfault = {si_addr = 0x7f69e8844878, _addr_lsb = 0, _addr_bnd = {_lower = 0x0, _upper = 0x0}}, _sigpoll = {si_band = 140092849277048, si_fd = 0}}}
Signal        Stop	Print	Pass to program	Description
SIGSEGV       Yes	Yes	Yes		Segmentation fault

Thread 1 (Thread 0x7f69db516700 (LWP 18920)):
#0  pop_freelist (ht=0x2df1508, ht=0x2df1508) at ../src/pj/timer.c:149
        new_id = -48195608
#1  schedule_entry (future_time=0x7f69db515cb0, entry=0x7f69ec752f70, ht=0x2df1508) at ../src/pj/timer.c:313
No locals.
#2  schedule_w_grp_lock (ht=0x2df1508, entry=0x7f69ec752f70, delay=<optimized out>, set_id=set_id@entry=1, id_val=id_val@entry=1, grp_lock=0x7f69ec721588) at ../src/pj/timer.c:515
        expires = {sec = 2232617, msec = 276}
#3  0x00007f6a883a8f20 in schedule_w_grp_lock (grp_lock=<optimized out>, id_val=id_val@entry=1, set_id=1, delay=delay@entry=0x7f69db515d40, entry=entry@entry=0x7f69ec752f70, ht=ht@entry=0x2df1508) at ../src/pj/timer.c:569
No locals.
#4  pj_timer_heap_schedule_w_grp_lock (ht=ht@entry=0x2df1508, entry=entry@entry=0x7f69ec752f70, delay=delay@entry=0x7f69db515d40, id_val=id_val@entry=1, grp_lock=<optimized out>) at ../src/pj/timer.c:568
No locals.
#5  0x00007f6a8832ff87 in start_periodic_check (te=0x7f69ec752f70, th=0x2df1508) at ../src/pjnath/ice_session.c:1932
        timeout = {sec = 0, msec = 20}
        td = <optimized out>
        ice = 0x7f69ec744eb8
        clist = <optimized out>
        i = <optimized out>
        status = <optimized out>
        start_count = 1
#6  periodic_timer (th=0x2df1508, te=0x7f69ec752f70) at ../src/pjnath/ice_session.c:2005
No locals.
#7  0x00007f6a883a9197 in pj_timer_heap_poll (ht=0x2df1508, next_delay=next_delay@entry=0x0) at ../src/pj/timer.c:649
        node = <optimized out>
        node_timer_id = 102
        grp_lock = 0x7f69ec721588
        now = {sec = 2232617, msec = 256}
        count = 2
#8  0x00007f6a10318778 in timer_worker_thread (data=<optimized out>) at res_rtp_asterisk.c:2453
        delay = {sec = 0, msec = 10}
        ioqueue = 0x7f69f4000918
#9  0x00007f6a88392eb0 in thread_main (param=0x2df6860) at ../src/pj/os_core_unix.c:541
        rec = 0x2df6860
        result = <optimized out>
#10 0x00007f6a866926ba in start_thread (arg=0x7f69db516700) at pthread_create.c:333
        __res = <optimized out>
        pd = 0x7f69db516700
        now = <optimized out>
        unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140092627838720, -932150821418068601, 0, 140726497506831, 140092627839424, 48174176, 991886374364353927, 992297244075201927}, mask_was_saved = 0}}, priv = {pad = {0x0, 0x0, 0x0, 0x0}, data = {prev = 0x0, cleanup = 0x0, canceltype = 0}}}
        not_first_call = <optimized out>
        pagesize_m1 = <optimized out>
        sp = <optimized out>
        freesize = <optimized out>
        __PRETTY_FUNCTION__ = "start_thread"
#11 0x00007f6a85c6c41d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
No locals.

For my problem, this patch works.

Hello @syunyama, I have asterisk 18.2.0 and I have similar problem.

segfault at 270 ip 00007fd83aab0587 sp 00007fd511777d48 error 6 in libasteriskpj.so.2

The problem happens 1 or 2 times per week.

How can apply this patch ?