AWS Transcript: file to text returns nonsense

╄→гoц情女王★ 提交于 2021-01-29 15:42:54

问题


This is a follow-on question to AWS Transcribe S3 .wav file to text. I use a stream to read and send a .wav file contents to AWS.

Instead of getting back the correct transcript, I get nonsense like a bunch of "Yeah." statements. It looks like AWS isn't able to interpret the byte stream correctly, but I'm not sure what's wrong. I'm wondering if the file needs to be encoded somehow, ie, I can't send the raw .wav bytes straight from the file? Or perhaps I need to tell the service that this is .wav format?

What's wrong here? The input file is a valid .wav voice file that sounds intelligible when I listen to it.

Here is my java code:

package com.amazonaws.transcribe;

import org.reactivestreams.Publisher;
import org.reactivestreams.Subscriber;
import org.reactivestreams.Subscription;
import software.amazon.awssdk.core.SdkBytes;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.transcribestreaming.TranscribeStreamingAsyncClient;
import software.amazon.awssdk.services.transcribestreaming.model.*;

import javax.sound.sampled.*;
import java.io.*;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;


public class TranscribeFileFromStream {
    private static final Region REGION = Region.US_EAST_1;
    private static TranscribeStreamingAsyncClient client;

    public static void main(String args[]) throws URISyntaxException, ExecutionException, InterruptedException, LineUnavailableException {
        System.out.println(System.getProperty("java.version"));
        client = TranscribeStreamingAsyncClient.builder()
                .region(REGION)
                .build();
        try {
            CompletableFuture<Void> result = client.startStreamTranscription(getRequest(16000),
                    new AudioStreamPublisher(getStreamFromFile()),
                    getResponseHandler());
            result.get();
        } finally {
            if (client != null) {
                client.close();
            }
        }
    }

    private static InputStream getStreamFromFile() {
        try {
            File inputFile = new File("~/work/transcribe/src/main/resources/story/media/Story3.m4a.wav");
            InputStream audioStream = new FileInputStream(inputFile);
            return audioStream;
        } catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    private static StartStreamTranscriptionRequest getRequest(Integer mediaSampleRateHertz) {
        return StartStreamTranscriptionRequest.builder()
                .languageCode(LanguageCode.EN_US)
                .mediaEncoding(MediaEncoding.PCM)
                .mediaSampleRateHertz(mediaSampleRateHertz)
                .build();
    }

    private static StartStreamTranscriptionResponseHandler getResponseHandler() {
        return StartStreamTranscriptionResponseHandler.builder()
                .onResponse(r -> {
                    System.out.println("Received Initial response");
                })
                .onError(e -> {
                    System.out.println(e.getMessage());
                    StringWriter sw = new StringWriter();
                    e.printStackTrace(new PrintWriter(sw));
                    System.out.println("Error Occurred: " + sw.toString());
                })
                .onComplete(() -> {
                    System.out.println("=== All records stream successfully ===");
                })
                .subscriber(event -> {
                    List<Result> results = ((TranscriptEvent) event).transcript().results();
                    if (results.size() > 0) {
                        if (!results.get(0).alternatives().get(0).transcript().isEmpty()) {
                            System.out.println(results.get(0).alternatives().get(0).transcript());
                        } else {
                            System.out.println("Empty result");
                        }
                    } else {
                        System.out.println("No results");
                    }
                })
                .build();
    }

    private static class AudioStreamPublisher implements Publisher<AudioStream> {
        private final InputStream inputStream;
        private static Subscription currentSubscription;


        private AudioStreamPublisher(InputStream inputStream) {
            this.inputStream = inputStream;
        }

        @Override
        public void subscribe(Subscriber<? super AudioStream> s) {

            if (this.currentSubscription == null) {
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            } else {
                this.currentSubscription.cancel();
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            }
            s.onSubscribe(currentSubscription);
        }
    }

    public static class SubscriptionImpl implements Subscription {
        private static final int CHUNK_SIZE_IN_BYTES = 1024 * 1;
        private final Subscriber<? super AudioStream> subscriber;
        private final InputStream inputStream;
        private ExecutorService executor = Executors.newFixedThreadPool(1);
        private AtomicLong demand = new AtomicLong(0);

        SubscriptionImpl(Subscriber<? super AudioStream> s, InputStream inputStream) {
            this.subscriber = s;
            this.inputStream = inputStream;
        }

        @Override
        public void request(long n) {
            if (n <= 0) {
                subscriber.onError(new IllegalArgumentException("Demand must be positive"));
            }

            demand.getAndAdd(n);

            executor.submit(() -> {
                try {
                    do {
                        ByteBuffer audioBuffer = getNextEvent();
                        if (audioBuffer.remaining() > 0) {
                            AudioEvent audioEvent = audioEventFromBuffer(audioBuffer);
                            subscriber.onNext(audioEvent);
                        } else {
                            subscriber.onComplete();
                            break;
                        }
                    } while (demand.decrementAndGet() > 0);
                } catch (Exception e) {
                    subscriber.onError(e);
                }
            });
        }

        @Override
        public void cancel() {
            executor.shutdown();
        }

        private ByteBuffer getNextEvent() {
            ByteBuffer audioBuffer = null;
            byte[] audioBytes = new byte[CHUNK_SIZE_IN_BYTES];

            int len = 0;
            try {
                len = inputStream.read(audioBytes);

                if (len <= 0) {
                    audioBuffer = ByteBuffer.allocate(0);
                } else {
                    audioBuffer = ByteBuffer.wrap(audioBytes, 0, len);
                }
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }

            return audioBuffer;
        }

        private AudioEvent audioEventFromBuffer(ByteBuffer bb) {
            return AudioEvent.builder()
                    .audioChunk(SdkBytes.fromByteBuffer(bb))
                    .build();
        }
    }
}

Here's my program output:

Received Initial response
No results
No results
Yeah.
No results
Yeah.
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
Yeah.
No results
No results
Oh,
No results
Oh,
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
Oh,
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results

回答1:


The audio file had a sample rate of 44.1 kHz. It was converted to 16 kHz, and it worked:

https://drive.google.com/file/d/1mYVbNlYK3SpGT4NbFRYGn86177eTCqhd/view?usp=sharing




回答2:


As smac2020 pointed out, the sample rate was wrong. Debugging incorrect metadata values passed to AWS is tricky because there's no errors from AWS. You just get back an incorrect transcription. So, the lesson here is, make sure you know what the right values are. Some of them can be automatically detected.

If you're on mac, the tool mediainfo is quite useful.

 brew install mediainfo

So is ffmpeg:

brew install ffmpeg

Here is an updated example where I automatically detect the sample rate using AudioFormat.java. Ideally, the AWS sdk would do this for you. If the media file is outside the parameters of what can be transcribed, then it would throw an exception. Note, I had to modify my original file to 16,000 sample rate using the tool: nch.com.au/switch/index.html. It would be great (hint, hint) if the SDK would also have the ability to modify sample rate, etc, so that files can be changed to fit within input parameters.

package com.amazonaws.transcribe;

import org.reactivestreams.Publisher;
import org.reactivestreams.Subscriber;
import org.reactivestreams.Subscription;
import software.amazon.awssdk.core.SdkBytes;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.transcribestreaming.TranscribeStreamingAsyncClient;
import software.amazon.awssdk.services.transcribestreaming.model.*;

import javax.sound.sampled.*;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;

import static javax.sound.sampled.AudioFormat.Encoding.*;


public class TranscribeFileFromStream {
    private static final Region REGION = Region.US_EAST_1;
    private static TranscribeStreamingAsyncClient client;

    public static void main(String args[]) throws Exception {
        System.setProperty("AWS_ACCESS_KEY_ID", "myId");
        System.setProperty("AWS_SECRET_ACCESS_KEY", "myKey");

        System.out.println(System.getProperty("java.version"));
       // BasicConfigurator.configure();
        client = TranscribeStreamingAsyncClient.builder()
                .region(REGION)
                .build();
        try {
            File inputFile = new File("/home/me/work/transcribe/src/main/resources/test-file.wav");
            CompletableFuture<Void> result = client.startStreamTranscription(
                    getRequest(inputFile),
                    new AudioStreamPublisher(getStreamFromFile(inputFile)),
                    getResponseHandler());
            result.get();
        } finally {
            if (client != null) {
                client.close();
            }
        }
    }

    private static InputStream getStreamFromFile(File inputFile) {
        try {
            return new FileInputStream(inputFile);
        } catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    private static StartStreamTranscriptionRequest getRequest(File inputFile) throws IOException, UnsupportedAudioFileException {
        //TODO: I read the file twice in this example.  Can this be more performant?
        AudioInputStream audioInputStream = AudioSystem.getAudioInputStream(inputFile);
        AudioFormat audioFormat = audioInputStream.getFormat();
        return StartStreamTranscriptionRequest.builder()
                .languageCode(LanguageCode.EN_US)
                //.mediaEncoding(MediaEncoding.PCM)
                .mediaEncoding(getAwsMediaEncoding(audioFormat))
                .mediaSampleRateHertz(getAwsSampleRate(audioFormat))
                .build();
    }

    private static MediaEncoding getAwsMediaEncoding(AudioFormat audioFormat) {
        final String javaMediaEncoding = audioFormat.getEncoding().toString();

        if (PCM_SIGNED.toString().equals(javaMediaEncoding)) {
            return MediaEncoding.PCM;
        } else if (PCM_UNSIGNED.toString().equals(javaMediaEncoding)){
            return MediaEncoding.PCM;
        } /*else if (ALAW.toString().equals(javaMediaEncoding)){
            //WARNING: I have no idea how ALAW maps to AWS media encodings.
            return MediaEncoding.OGG_OPUS;
        } else if (ULAW.toString().equals(javaMediaEncoding)){
            //WARNING: I have no idea how ULAW maps to AWS encodings.  
            return MediaEncoding.FLAC;
        }*/

        throw new IllegalArgumentException("Not a recognized media encoding:" + javaMediaEncoding);
    }

    private static Integer getAwsSampleRate(AudioFormat audioFormat) {
        return Math.round(audioFormat.getSampleRate());
    }

    private static StartStreamTranscriptionResponseHandler getResponseHandler() {
        return StartStreamTranscriptionResponseHandler.builder()
                .onResponse(r -> {
                    System.out.println("Received Initial response");
                })
                .onError(e -> {
                    System.out.println(e.getMessage());
                    StringWriter sw = new StringWriter();
                    e.printStackTrace(new PrintWriter(sw));
                    System.out.println("Error Occurred: " + sw.toString());
                })
                .onComplete(() -> {
                    System.out.println("=== All records stream successfully ===");
                })
                .subscriber(event -> {
                    List<Result> results = ((TranscriptEvent) event).transcript().results();
                    if (results.size() > 0) {
                        if (!results.get(0).alternatives().get(0).transcript().isEmpty()) {
                            System.out.println(results.get(0).alternatives().get(0).transcript());
                        } else {
                            System.out.println("Empty result");
                        }
                    } else {
                        System.out.println("No results");
                    }
                })
                .build();
    }

    private static class AudioStreamPublisher implements Publisher<AudioStream> {
        private final InputStream inputStream;
        private static Subscription currentSubscription;


        private AudioStreamPublisher(InputStream inputStream) {
            this.inputStream = inputStream;
        }

        @Override
        public void subscribe(Subscriber<? super AudioStream> s) {

            if (this.currentSubscription == null) {
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            } else {
                this.currentSubscription.cancel();
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            }
            s.onSubscribe(currentSubscription);
        }
    }

    public static class SubscriptionImpl implements Subscription {
        private static final int CHUNK_SIZE_IN_BYTES = 1024 * 1;
        private final Subscriber<? super AudioStream> subscriber;
        private final InputStream inputStream;
        private ExecutorService executor = Executors.newFixedThreadPool(1);
        private AtomicLong demand = new AtomicLong(0);

        SubscriptionImpl(Subscriber<? super AudioStream> s, InputStream inputStream) {
            this.subscriber = s;
            this.inputStream = inputStream;
        }

        @Override
        public void request(long n) {
            if (n <= 0) {
                subscriber.onError(new IllegalArgumentException("Demand must be positive"));
            }

            demand.getAndAdd(n);

            executor.submit(() -> {
                try {
                    do {
                        ByteBuffer audioBuffer = getNextEvent();
                        if (audioBuffer.remaining() > 0) {
                            AudioEvent audioEvent = audioEventFromBuffer(audioBuffer);
                            subscriber.onNext(audioEvent);
                        } else {
                            subscriber.onComplete();
                            break;
                        }
                    } while (demand.decrementAndGet() > 0);
                } catch (Exception e) {
                    subscriber.onError(e);
                }
            });
        }

        @Override
        public void cancel() {
            executor.shutdown();
        }

        private ByteBuffer getNextEvent() {
            ByteBuffer audioBuffer = null;
            byte[] audioBytes = new byte[CHUNK_SIZE_IN_BYTES];

            int len = 0;
            try {
                len = inputStream.read(audioBytes);

                if (len <= 0) {
                    audioBuffer = ByteBuffer.allocate(0);
                } else {
                    audioBuffer = ByteBuffer.wrap(audioBytes, 0, len);
                }
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }

            return audioBuffer;
        }

        private AudioEvent audioEventFromBuffer(ByteBuffer bb) {
            return AudioEvent.builder()
                    .audioChunk(SdkBytes.fromByteBuffer(bb))
                    .build();
        }
    }
}


来源:https://stackoverflow.com/questions/65836411/aws-transcript-file-to-text-returns-nonsense

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!