Problem with EAGI audio stream during external calls

Hi, I’m working on an EAGI script for real-time speech recognition. I bought a VoIP service from Zadarma, connected the provided SIP account as a trunk in my FreePBX configuration, and created an inbound route. At first I routed it to my internal softphone and it worked — I could talk with the other person. The problem appeared when routing external calls to the speech-recognition EAGI script: it works perfectly for internal calls from Zoiper or other softphones, but for external incoming calls it doesn’t work at all, i.e. it cannot read the audio stream. Any ideas?

#!/usr/bin/python3

import os

# Credentials must be exported before the google.cloud client is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/var/lib/asterisk/agi-bin/gcloud.json"

import sys
from asterisk.agi import *
from google.cloud import speech
from google.api_core import client_options

# Single debug switch (the script previously assigned DEBUG twice; only the
# later assignment ever took effect, so the dead `DEBUG = False` was removed).
DEBUG = True

if DEBUG:
    import logging
    logging.basicConfig(level=logging.DEBUG, filename='/tmp/log_asterisk.log', filemode='w',
                        format='[%(levelname)s]: %(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.debug("Starting EAGI")

##### Start AGI connector
agi = AGI()
if DEBUG:
    agi.verbose("EAGI started")

##### Read command line arguments

# Fail fast with a clear message instead of an IndexError if the two
# mandatory arguments are missing.
if len(sys.argv) < 3:
    agi.verbose("Usage: <model: default|enhanced|phone_call> <mainLanguage> [altLang1 altLang2 altLang3]")
    sys.exit(1)

# "default" for the default model, "enhanced" for the enhanced default model,
# or "phone_call" for the model fine-tuned on phone calls. Mandatory argument.
# Some languages have no "phone_call" model:
# https://cloud.google.com/speech-to-text/docs/languages
modelInput = sys.argv[1]

model = None
enhancedModel = False

if modelInput == "enhanced":
    enhancedModel = True
elif modelInput == "phone_call":
    # phone_call models only support single_utterance mode when use_enhanced is True:
    # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#google.cloud.speech.v1.StreamingRecognitionConfig
    model = modelInput
    enhancedModel = True

mainLanguage = sys.argv[2]  # main recognition language. Mandatory argument.

alternativeLanguages = []  # up to 3 alternative recognition languages. Optional.
# Alternative languages are not supported by the "phone_call" model:
# https://cloud.google.com/speech-to-text/docs/reference/rest/v1p1beta1/RecognitionConfig
if len(sys.argv[3:]) > 0 and model != "phone_call":
    alternativeLanguages = sys.argv[3:]

if DEBUG:
    agi.verbose(str(enhancedModel) + " " + str(model) + " " + str(mainLanguage) + " " + str(alternativeLanguages))

##### Google Response Parser
class Parser:
    """Parse Google streaming responses, detect caller silence, and publish
    the result to the dialplan via the AGI variable TRANSCRIPT."""

    def __init__(self, stream) -> None:
        # stream: the AudioStream feeding Google; used only for debug timing.
        self.stream = stream

    def isEndOfSpeech(self, response):
        """Return True when Google signals END_OF_SINGLE_UTTERANCE."""
        if str(response.speech_event_type) == "SpeechEventType.END_OF_SINGLE_UTTERANCE":
            if DEBUG:
                logging.debug("END_OF_SINGLE_UTTERANCE detected")
                logging.debug(str(self.stream.getRecordingTime()))
            return True
        return False

    def parseGoogleResponse(self, responses):
        """
        Consume the (blocking) response generator until end of speech, then
        set TRANSCRIPT to the final transcript, or to "_SILENCE_" when the
        caller said nothing.
        """
        speechDetected = False

        for response in responses:

            if self.isEndOfSpeech(response):
                if DEBUG:
                    agi.verbose("End of speech detected")
                break

            if not response.results:
                continue

            result = response.results[0]
            if not result.alternatives:
                continue

            # An interim transcript arrived: the caller started saying something.
            speechDetected = True

        if speechDetected:
            # The response following END_OF_SINGLE_UTTERANCE carries the final
            # result. Guard against the stream ending early (e.g. caller hangup
            # or no media flowing) instead of crashing with StopIteration /
            # IndexError.
            try:
                response = next(responses)
                transcript = response.results[0].alternatives[0].transcript
            except (StopIteration, IndexError):
                if DEBUG:
                    logging.error("No final result received after end of speech")
                agi.set_variable("TRANSCRIPT", "_SILENCE_")
                return
            agi.set_variable("TRANSCRIPT", transcript)
            if DEBUG:
                logging.debug("Final transcript: " + transcript)
        else:
            agi.set_variable("TRANSCRIPT", "_SILENCE_")
            if DEBUG:
                logging.debug("No speech detected. User in silence")


##### Audio Streaming Generator
class AudioStream(object):
    """Opens the EAGI phone-call audio (file descriptor 3) as a generator
    yielding raw audio chunks."""

    # EAGI delivers the channel's audio on file descriptor 3.
    AUDIO_FD = 3

    def __init__(self):
        self.close = False      # set True to stop the generator
        self.samplerate = 8000  # Hz; must match the RecognitionConfig
        self.chunk = 1024       # bytes requested per read
        self.totalSignal = 0    # total bytes actually consumed so far

    def __enter__(self):
        if DEBUG:
            agi.verbose("Opening audio stream")

        self.file = os.fdopen(self.AUDIO_FD, 'rb')
        return self

    def __exit__(self, type=None, value=None, traceback=None):

        if DEBUG:
            logging.debug("Closing audio stream")
            if type != None:  # error detected. log to file because agi.verbose does not work anymore
                logging.error("Error while streaming")
                logging.error(value)
                logging.error(agi._got_sighup)  # if hang up, _got_sighup == True

        self.close = True
        # Close via the file object so fd 3 is closed exactly once; a raw
        # os.close(3) on top of fdopen() double-closes the descriptor when
        # the file object is finalized.
        self.file.close()

    def getRecordingTime(self):
        """Approximate seconds of audio consumed so far."""
        return self.totalSignal / 16076.8  # approximately

    def generator(self):
        """Yield audio chunks until the stream is closed or hits EOF."""
        while not self.close:
            audioData = self.file.read(self.chunk)
            if not audioData:
                # EOF: channel hung up, or no media is reaching Asterisk
                # (e.g. a re-INVITE moved the RTP elsewhere). Stop instead
                # of yielding empty chunks forever.
                break
            # Count the bytes actually read, not the requested chunk size,
            # so getRecordingTime() stays accurate on short reads.
            self.totalSignal += len(audioData)
            yield audioData


##### Configure google speech service
if DEBUG:
    agi.verbose("Configuring Google STT model")

# Use the Europe regional endpoint. Bound to a new name so we do not shadow
# the imported `client_options` module.
eu_client_options = client_options.ClientOptions(
    api_endpoint="eu-speech.googleapis.com"
)

client = speech.SpeechClient(client_options=eu_client_options)

# single_utterance only works with the normal model or phone_call + enhanced
# model. Check in Google's docs whether the desired language supports
# phone_call models (German does not).
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code=mainLanguage,
    alternative_language_codes=alternativeLanguages,
    model=model,
    use_enhanced=enhancedModel
)
streaming_config = speech.StreamingRecognitionConfig(config=config, interim_results=True, single_utterance=True)

##### Start to listen
if DEBUG:
    agi.verbose("Creating audio stream and starting speech recognition")

with AudioStream() as stream:
    audio_generator = stream.generator()

    # requests is also a generator: each audio chunk is wrapped lazily in a
    # StreamingRecognizeRequest as Google pulls from it.
    requests = (
        speech.StreamingRecognizeRequest(audio_content=content)
        for content in audio_generator
    )

    responses = client.streaming_recognize(streaming_config, requests)

    parser = Parser(stream)

    # Blocks here processing Google responses until the loop inside
    # parseGoogleResponse breaks (end of speech or silence).
    parser.parseGoogleResponse(responses)


if DEBUG:
    # Reached only when recognition finished without error. If the caller
    # hangs up during this EAGI, the error is caught inside __exit__ and this
    # verbose line is not executed; to continue the script despite an error,
    # return True from the __exit__ function.
    agi.verbose("Closed audio stream successfully. Finalising speech to text eagi script")

Is this your script?

Are your external calls doing a re-INVITE, so that the media flows outside of Asterisk?