Google Streaming Speech Recognition on an Audio Stream in Python

Backend · Open · 2 answers · 1615 views

暖寄归人 · 2020-12-11 03:06

I have searched through all the available docs of Google but I could not find an example of streaming speech recognition on an audio stream in Python.

Currently, I a…

2 Answers
  • 2020-12-11 03:42

    Google provides an example of the streaming Python API here.

    Rather than opening an audio file to create the stream (as on line 34 of that example), pass the stream directly to the audio sample object (as on line 36).
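
    A minimal sketch of that idea, assuming the v1 google-cloud-speech client, a 16 kHz LINEAR16 source, and a file-like object named audio_stream standing in for whatever live stream you already have (the names and chunk size are illustrative, not from Google's example):

    from google.cloud import speech
    from google.cloud.speech import enums, types

    def request_stream(audio_stream, chunk_size=4096):
        """Yield streaming requests read from any file-like audio source."""
        while True:
            chunk = audio_stream.read(chunk_size)
            if not chunk:
                return
            yield types.StreamingRecognizeRequest(audio_content=chunk)

    client = speech.SpeechClient()
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US')
    streaming_config = types.StreamingRecognitionConfig(config=config)

    # The requests come straight from the live stream, not from an opened file.
    responses = client.streaming_recognize(streaming_config, request_stream(audio_stream))
    for response in responses:
        for result in response.results:
            if result.is_final:
                print(result.alternatives[0].transcript)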

  • 2020-12-11 03:42

    Here is working code for the above requirement.

    Code:

    import asyncio
    import websockets
    import json
    import threading
    from six.moves import queue
    from google.cloud import speech
    from google.cloud.speech import types
    
    
    IP = '0.0.0.0'
    PORT = 8000
    
    class Transcoder(object):
        """
        Converts audio chunks to text
        """
        def __init__(self, encoding, rate, language):
            self.buff = queue.Queue()
            self.encoding = encoding
            self.language = language
            self.rate = rate
            self.closed = True
            self.transcript = None
    
        def start(self):
            """Start up streaming speech call"""
            threading.Thread(target=self.process).start()
    
        def response_loop(self, responses):
            """
            Pick up the final result of Speech to text conversion
            """
            for response in responses:
                if not response.results:
                    continue
                result = response.results[0]
                if not result.alternatives:
                    continue
                transcript = result.alternatives[0].transcript
                if result.is_final:
                    self.transcript = transcript
    
        def process(self):
            """
            Audio stream recognition and result parsing
            """
            #You can add speech contexts for better recognition
            cap_speech_context = types.SpeechContext(phrases=["Add your phrases here"])
            client = speech.SpeechClient()
            config = types.RecognitionConfig(
                encoding=self.encoding,
                sample_rate_hertz=self.rate,
                language_code=self.language,
                speech_contexts=[cap_speech_context,],
                model='command_and_search'
            )
            streaming_config = types.StreamingRecognitionConfig(
                config=config,
                interim_results=False,
                single_utterance=False)
            audio_generator = self.stream_generator()
            requests = (types.StreamingRecognizeRequest(audio_content=content)
                        for content in audio_generator)
    
            responses = client.streaming_recognize(streaming_config, requests)
            try:
                self.response_loop(responses)
            except Exception:
                # Restart the streaming call if it fails (e.g. the stream
                # times out or no audio has been buffered yet).
                self.start()
    
        def stream_generator(self):
            while not self.closed:
                chunk = self.buff.get()
                if chunk is None:
                    return
                data = [chunk]
                while True:
                    try:
                        chunk = self.buff.get(block=False)
                        if chunk is None:
                            return
                        data.append(chunk)
                    except queue.Empty:
                        break
                yield b''.join(data)
    
        def write(self, data):
            """
            Writes data to the buffer
            """
            self.buff.put(data)
    
    
    async def audio_processor(websocket, path):
        """
        Collects audio from the stream, writes it to the buffer, and returns the output of Google Speech-to-Text
        """
        config = await websocket.recv()
        if not isinstance(config, str):
            print("ERROR, no config")
            return
        config = json.loads(config)
        transcoder = Transcoder(
            encoding=config["format"],
            rate=config["rate"],
            language=config["language"]
        )
        transcoder.start()
        while True:
            try:
                data = await websocket.recv()
            except websockets.ConnectionClosed:
                print("Connection closed")
                break
            transcoder.write(data)
            transcoder.closed = False
            if transcoder.transcript:
                print(transcoder.transcript)
                await websocket.send(transcoder.transcript)
                transcoder.transcript = None
    
    start_server = websockets.serve(audio_processor, IP, PORT)
    asyncio.get_event_loop().run_until_complete(start_server)
    asyncio.get_event_loop().run_forever()
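
    For reference, a client-side sketch that exercises this server, assuming the same handshake (one JSON text frame with "format", "rate" and "language", then raw binary audio chunks); the file name, chunk size, and pacing are illustrative:

    import asyncio
    import json
    import websockets

    async def send_audio(uri='ws://localhost:8000', path='sample.raw'):
        async with websockets.connect(uri) as ws:
            # The server expects a JSON config message first.
            await ws.send(json.dumps(
                {"format": "LINEAR16", "rate": 16000, "language": "en-US"}))
            with open(path, 'rb') as f:       # raw 16-bit PCM at 16 kHz
                while True:
                    chunk = f.read(4096)
                    if not chunk:
                        break
                    await ws.send(chunk)
                    await asyncio.sleep(0.1)  # roughly pace the stream
            # A final transcript comes back as a text frame once the
            # recognizer detects the end of an utterance.
            print(await ws.recv())

    asyncio.get_event_loop().run_until_complete(send_audio())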
    