Google speech to text API in C#

前端 未结 4 2070
遇见更好的自我
遇见更好的自我 2020-12-10 08:34

My program gets a correct response from Google when the FLAC file is recorded manually with Windows's Sound Recorder and then converted using a software converter.
But when I

相关标签:
4条回答
  • 2020-12-10 08:44
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.IO;
    using System.Net;
    using Newtonsoft.Json;
    
    namespace google_speech_api_trial4
    {
        /// <summary>
        /// Console sample that posts a FLAC file to the Google speech-api v2
        /// "recognize" endpoint and prints the transcript found in the response.
        /// </summary>
        class Program
        {
            // NOTE(review): this API key is committed in source. Move it to configuration
            // or an environment variable and rotate the key before real use.
            public static string ACCESS_GOOGLE_SPEECH_KEY =     "AIzaSyDC8nM1S0cLpXvRc8TXrDoey-tqQsoBGnM";

            static void Main(string[] args)
            {
                GoogleSpeechRequest();
                Console.ReadLine();
            }

            /// <summary>
            /// Uploads "my.flac" (resolved against the current directory) and writes the
            /// recognized transcript to the console. An empty line is written when no
            /// line of the response carries a non-empty result.
            /// </summary>
            public static void GoogleSpeechRequest()
            {
                // ReadAllBytes reads the whole file and closes the handle; a single
                // Stream.Read call is not guaranteed to return every requested byte.
                byte[] audioBytes = File.ReadAllBytes("my.flac");

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create("https://www.google.com/speech-api/v2/recognize?output=json&lang=en-us&key=" + ACCESS_GOOGLE_SPEECH_KEY);
                request.Credentials = CredentialCache.DefaultCredentials;
                request.Method = "POST";
                request.ContentType = "audio/x-flac; rate=44100";
                request.ContentLength = audioBytes.Length;

                using (Stream requestStream = request.GetRequestStream())
                {
                    requestStream.Write(audioBytes, 0, audioBytes.Length);
                }

                string responseFromServer;
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (StreamReader reader = new StreamReader(response.GetResponseStream()))
                {
                    responseFromServer = reader.ReadToEnd();
                }

                // The body holds one JSON document per line; the first is typically an
                // empty {"result":[]} and a later line carries the transcript.
                string text = "";
                foreach (string line in responseFromServer.Split('\n'))
                {
                    if (string.IsNullOrWhiteSpace(line))
                    {
                        continue;
                    }

                    dynamic jsonObject = JsonConvert.DeserializeObject(line);
                    if (jsonObject == null || jsonObject.result == null || jsonObject.result.Count <= 0)
                    {
                        continue;
                    }

                    // The last line with a non-empty result wins, as before.
                    text = jsonObject.result[0].alternative[0].transcript;
                }
                Console.WriteLine(text);
            }
        }
    }
    

    I searched for 3 hours; I kept getting {"result":[]} when I printed the text, so I thought the audio wasn't being converted. However, the JSON response has two lines, and the second line contains the recognized text; to print it, we need to parse each line. I also ran into a lot of problems with imports, references, and using statements, but this code finally works.

    0 讨论(0)
  • 2020-12-10 08:57

    This is clearly described in the Google Cloud API documentation, i.e.

    https://cloud.google.com/speech-to-text/docs/async-recognize#speech-async-recognize-gcs-protocol

    If the operation has not completed, you can poll the endpoint by repeatedly making the GET request until the done property of the response is true.

            {
          "name": "operationname here",
          "metadata": {
            "@type": "type.googleapis.com/google.cloud.speech.v1.LongRunningRecognizeMetadata",
            "progressPercent": 0,
            "startTime": "2018-12-18T10:56:09.425584Z",
            "lastUpdateTime": "2018-12-18T11:10:27.147310Z"
          },
          "done": true,
        }
    

    Poll the endpoint by repeatedly making the GET request until the done property of the response is true, or check "progressPercent" until its value becomes 100. Once it reaches 100 percent, the done property becomes true.

    I did the same in my code using the operation name; for reference, here is the code:

    /// <summary>
        /// Polls a long-running recognize operation with repeated GET requests until the
        /// response reports <c>done</c>, then returns the raw JSON of that final response.
        /// </summary>
        /// <param name="operationName">Operation name appended to <c>googleSpeechBaseUrl</c> to form the polling URL.</param>
        /// <returns>The body of the last polling response.</returns>
        /// <exception cref="HttpRequestException">A polling request returned a non-success status code.</exception>
        /// <exception cref="InvalidOperationException">A polling response body deserialized to null.</exception>
        public async Task<string> TranscribeLongMediaFile(string operationName)
        {
            // Lower bound between polls so a response that reports 100% but is not yet
            // done cannot turn the loop into a tight request loop.
            const int minimumPollDelayMs = 1000;

            string bearerToken = GetOAuthToken();
            var operationUri = new Uri(googleSpeechBaseUrl + operationName);
            string resultContent = string.Empty;
            using (var client = new HttpClient())
            {
                // Typed header instead of building the header name from an enum member:
                // HttpRequestHeader.ContentType.ToString() is "ContentType", not "Content-Type",
                // and a GET request has no body to describe, so that header is not sent.
                client.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", bearerToken);

                client.Timeout = TimeSpan.FromMilliseconds(Timeout.Infinite);

                bool done = false;
                while (!done)
                {
                    using (var result = await client.GetAsync(operationUri))
                    {
                        resultContent = await result.Content.ReadAsStringAsync();

                        if (!result.IsSuccessStatusCode)
                        {
                            throw new HttpRequestException("Polling the operation failed with status " + (int)result.StatusCode + ": " + resultContent);
                        }
                    }

                    ResponseObject responseObject = JsonConvert.DeserializeObject<ResponseObject>(resultContent);
                    if (responseObject == null)
                    {
                        throw new InvalidOperationException("The operation status response was empty.");
                    }

                    // "done" alone ends the loop; requiring progressPercent == 100 as well
                    // would poll forever when a finished operation reports another value.
                    done = responseObject.done;

                    if (!done)
                    {
                        int currentPercentage = responseObject.metadata != null ? responseObject.metadata.progressPercent : 0;

                        // Wait longer while progress is low, shorter as it nears completion.
                        await Task.Delay(Math.Max(CalculateDealy(currentPercentage), minimumPollDelayMs));
                    }
                }
            }
            return resultContent;
        }
    

    In order to delay the get request:

    /// <summary>
        /// Computes how long to wait before the next poll: 1500 ms for every tenth
        /// of the work that is still outstanding (integer division, so 0-9% counts
        /// as ten tenths remaining and 100% as none).
        /// </summary>
        /// <param name="currentPercentage">Progress percentage reported by the operation.</param>
        /// <returns>The delay in milliseconds.</returns>
        private int CalculateDealy(int currentPercentage)
        {
            const int millisecondsPerTenth = 1500;

            int completedTenths = currentPercentage / 10;
            int remainingTenths = 10 - completedTenths;

            return remainingTenths * millisecondsPerTenth;
        }
    

    Get auth token:

    /// <summary>
        /// Requests an access token from the underlying credential of
        /// <c>googleCredential</c> and returns it synchronously.
        /// </summary>
        /// <returns>The access token produced by the credential.</returns>
        public string GetOAuthToken()
        {
            var credential = googleCredential.UnderlyingCredential;
            var pendingToken = credential.GetAccessTokenForRequestAsync("https://accounts.google.com/o/oauth2/v2/auth", CancellationToken.None);

            // .Result blocks the calling thread until the token task has completed.
            return pendingToken.Result;
        }
    

    At last, you will get the result like:

        {
      "name": "operationname here",
      "metadata": {
        "@type": "type.googleapis.com/google.cloud.speech.v1.LongRunningRecognizeMetadata",
        "progressPercent": 100,
        "startTime": "2018-12-18T10:56:09.425584Z",
        "lastUpdateTime": "2018-12-18T11:10:27.147310Z"
      },
      "done": true,
      "response": {
        "@type": "type.googleapis.com/google.cloud.speech.v1.LongRunningRecognizeResponse",
        "results": [
          {
            "alternatives": [
              {
                "transcript": "okay let's get started",
                "confidence": 0.97442055
              }
            ]
          }, and so on .....
    

    Things required:

    1. api-key.json file
    2. Install the Google.Apis.Auth NuGet package (namespace Google.Apis.Auth.OAuth2) in order to authorize the HTTP web request

    Thanks

    0 讨论(0)
  • 2020-12-10 09:00

    Use this to extract the utterance (phrase) and the confidence (%):

         // Placeholder: assign the raw JSON text returned by Google to toParse.
         string toParse=(VALUE RETURNED BY GOOGLE)
            // Literal markers that surround the first transcript and its confidence value.
            var trsc1 = @"transcript"":""";
            var trsc2 = @""",""confidence"":";
            var trsc3 = @"}],""final"":";

            int transcriptMarker = toParse.IndexOf(trsc1, StringComparison.Ordinal);
            if (transcriptMarker < 0)
            {
                throw new FormatException("The response does not contain a transcript.");
            }

            // Search each closing marker only after the previous one, so an earlier
            // occurrence elsewhere in the text cannot produce a negative length.
            var start = transcriptMarker + trsc1.Length;
            var end = toParse.IndexOf(trsc2, start, StringComparison.Ordinal);
            var end2 = end < 0 ? -1 : toParse.IndexOf(trsc3, end + trsc2.Length, StringComparison.Ordinal);
            if (end < 0 || end2 < 0)
            {
                throw new FormatException("The response does not contain a confidence value for the transcript.");
            }

            var vv1 = toParse.Substring(start, end - start);
            var vv2 = toParse.Substring(end + trsc2.Length, end2 - (end + trsc2.Length)).Trim();

            // JSON numbers always use '.' as the decimal separator; parse with the
            // invariant culture instead of rewriting the text for the current culture.
            float confidence = (float)Math.Round(double.Parse(vv2, System.Globalization.CultureInfo.InvariantCulture), 2);
            string utterance = vv1;
    
    0 讨论(0)
  • 2020-12-10 09:05

    I was also having the same issue but came up with a neat solution. I used Fiddler (http://www.telerik.com/fiddler/) to figure out how Chrome does the speech recognition and then created some code to emulate chrome sending the request. This approach uses a different URI and there is also a 16-character value called pair which is different for each request. I use a simple random value generator function to create one for the request and I also changed the output value to 'json'.

    Note: The result can sometimes be empty as in your case above but there's also another json object in the response that contains the alternatives.

        /// <summary>
        /// Posts a FLAC file to the speech-api "full-duplex" endpoint with the same request
        /// settings as the original snippet, and shows the JSON response body in <c>textBox1</c>.
        /// Failures while sending the request or reading the response are reported in a message box.
        /// </summary>
        private void GoogleSpeechToText()
        {
            // NOTE(review): the API key is embedded in the URL; move it to configuration.
            string uri = "https://www.google.com/speech-api/full-duplex/v1/up?output=json&key=AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw&pair=" + GenerateUnique(16) + "&lang=en-US&pFilter=2&maxAlternatives=10&client=chromium";
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
            request.Timeout = 10000;
            request.Method = "POST";
            request.Host = "www.google.com";
            request.KeepAlive = true;
            request.SendChunked = true;
            request.ContentType = "audio/x-flac; rate=16000";
            request.Headers.Set(HttpRequestHeader.AcceptLanguage, "en-GB,en-US;q=0.8,en;q=0.6");
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36";

            string path = @"C:\TestFolder\test_audio.flac";

            // ReadAllBytes reads the complete file; a single Stream.Read call is
            // allowed to return fewer bytes than requested.
            byte[] data = File.ReadAllBytes(path);

            try
            {
                // Sending the body is inside the try block as well, because obtaining or
                // writing the request stream can fail with a WebException.
                using (Stream reqStream = request.GetRequestStream())
                {
                    reqStream.Write(data, 0, data.Length);
                }

                using (WebResponse response = request.GetResponse())
                using (Stream respStream = response.GetResponseStream())
                {
                    // Compare the media type only, so a different charset parameter or
                    // letter casing does not cause the body to be ignored.
                    string contentType = response.ContentType ?? string.Empty;
                    if (contentType.StartsWith("application/json", StringComparison.OrdinalIgnoreCase))
                    {
                        using (var sr = new StreamReader(respStream))
                        {
                            textBox1.Text = sr.ReadToEnd();
                        }
                    }
                }
            }
            catch (Exception ex) { MessageBox.Show(ex.Message, "Error", MessageBoxButtons.OK); }
        }
    
        /// <summary>
        /// Builds a random string of <paramref name="length"/> characters, each drawn
        /// either from the upper-case letters A-Z or from the digits 0-9.
        /// </summary>
        /// <param name="length">Number of characters to generate; zero or less yields an empty string.</param>
        /// <returns>The generated string.</returns>
        private string GenerateUnique(int length)
        {
            const string letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
            const string digits = "0123456789";

            var rng = new Random();
            var result = new System.Text.StringBuilder();

            int produced = 0;
            while (produced < length)
            {
                // First draw picks the pool (1 = letters, 0 = digits), second draw picks the character.
                bool pickLetter = rng.Next(2) == 1;
                string pool = pickLetter ? letters : digits;
                result.Append(pool[rng.Next(pool.Length)]);
                produced++;
            }

            return result.ToString();
        }
    
    0 讨论(0)
提交回复
热议问题