Google Cloud Speech to Text API는 공식적으로 Unity를 직접 지원하지 않습니다.
그러나, REST API를 통해 Google Cloud Speech-to-Text를 Unity에서 사용할 수 있습니다.

📜소스 코드

아래 코드는 VAD(Voice Activity Detection)를 자체적으로 구현한 예제(MicrophoneInput)입니다. 간단한 음성 활동 검출을 위해 마이크 입력의 샘플을 분석하여, 일정 수준의 음량(Threshold)을 초과하는 경우를 음성 활동으로 간주합니다.

더 정교한 VAD 알고리즘을 사용하려면 GCP Speech-to-Text API 자체의 VAD 기능으로 스트리밍 인식을 사용하는 것이 좋습니다.
NuGet 패키지 매니저 콘솔에서 Google.Cloud.Speech.V1 패키지를 반드시 설치해야 합니다.

- GoogleSTTService는 전달받은 음성 데이터를 Google STT API에 보내고, 변환된 텍스트를 TranscriptView로 반환합니다.

- MicrophoneInput은 음성 데이터를 수집하고, 이를 OnMaxLevelChangeCommand를 통해 구독자(TranscriptView)에게 전달합니다. 이 데이터는 최종적으로 GoogleSTTService로 전달됩니다.

- TranscriptView는 MicrophoneInput 및 GoogleSTTService와 상호작용합니다. MicrophoneInput이 보낸 음성 데이터를 GoogleSTTService에 넘기고, 인식된 텍스트를 TMP_Text에 표시합니다.

using Cysharp.Threading.Tasks;
using System;
using System.Text;
using UnityEngine;
using UnityEngine.Networking;
using Newtonsoft.Json.Linq;
using UniRx;

/// <summary>
/// Sends raw LINEAR16 audio to the Google Cloud Speech-to-Text REST endpoint
/// (<c>speech:recognize</c>) and publishes the result through
/// <see cref="OnRecognizeSpeechCommand"/>.
/// </summary>
/// <remarks>
/// Publishing contract: a non-empty transcript is published on success, an empty
/// string is published when the request fails, and nothing is published when the
/// service answered successfully but recognized no speech.
/// </remarks>
public class GoogleSTTService : MonoBehaviour
{
    // NOTE(review): an API key embedded in client code can be extracted from a build;
    // consider restricting the key or proxying the request through a backend.
    private const string API_KEY = "YOUR_API_KEY";
    private const string URL = "https://speech.googleapis.com/v1/speech:recognize?key=";
    private const string Locale =
        // "en-US"
        "ko-KR"
        ;

    /// <summary>Fires with the recognized text, or with an empty string when recognition failed.</summary>
    public ReactiveCommand<string> OnRecognizeSpeechCommand = new();

    /// <summary>
    /// Recognizes speech in <paramref name="audioData"/> and publishes the outcome.
    /// </summary>
    /// <param name="audioData">16-bit little-endian PCM, 16 kHz, as declared in the request config.</param>
    /// <remarks>
    /// Kept as <c>async void</c> so existing fire-and-forget callers keep compiling;
    /// every failure of the request itself is therefore handled inside
    /// <see cref="TryRecognizeAsync"/> instead of being allowed to escape.
    /// </remarks>
    public async void RecognizeSpeech(byte[] audioData)
    {
        string transcript = await TryRecognizeAsync(audioData);

        // null means "request succeeded but nothing was recognized": stay silent.
        if (transcript != null)
        {
            OnRecognizeSpeechCommand.Execute(transcript);
        }
    }

    /// <summary>
    /// Performs the HTTP round trip and extracts the first transcript.
    /// </summary>
    /// <returns>
    /// The transcript, <c>null</c> when the response contains no transcript,
    /// or <see cref="string.Empty"/> when the request or response parsing failed.
    /// </returns>
    private static async UniTask<string> TryRecognizeAsync(byte[] audioData)
    {
        if (audioData == null || audioData.Length == 0)
        {
            Debug.LogWarning("GoogleSTTService.RecognizeSpeech() called without audio data.");
            return string.Empty;
        }

        try
        {
            string audioContent = Convert.ToBase64String(audioData);
            string requestJson = $"{{\"config\": {{\"encoding\":\"LINEAR16\",\"sampleRateHertz\":16000,\"languageCode\":\"{Locale}\"}},\"audio\":{{\"content\":\"{audioContent}\"}}}}";

            using var request = new UnityWebRequest(URL + API_KEY, "POST");
            request.uploadHandler = new UploadHandlerRaw(Encoding.UTF8.GetBytes(requestJson));
            request.downloadHandler = new DownloadHandlerBuffer();
            request.SetRequestHeader("Content-Type", "application/json");

            // UniTask's awaiter throws UnityWebRequestException on connection/protocol
            // errors, so the failure path is the catch block below.
            await request.SendWebRequest();

            // Defensive: covers awaiters that complete without throwing on failure.
            if (request.result != UnityWebRequest.Result.Success)
            {
                Debug.LogError($"GoogleSTTService.RecognizeSpeech() request.error is [{request.error}]");
                return string.Empty;
            }

            // SelectToken returns null for a missing property or an out-of-range index,
            // unlike the JToken indexer which throws on an empty "results" array.
            var json = JObject.Parse(request.downloadHandler.text);
            string transcript = json.SelectToken("results[0].alternatives[0].transcript")?.ToString();
            return string.IsNullOrEmpty(transcript) ? null : transcript;
        }
        catch (UnityWebRequestException e)
        {
            Debug.LogError($"GoogleSTTService.RecognizeSpeech() request.error is [{e.Error}]");
            return string.Empty;
        }
        catch (Exception e)
        {
            // Malformed JSON or any other unexpected failure: log it and report failure.
            Debug.LogException(e);
            return string.Empty;
        }
    }
}

using System;
using UnityEngine;
using UniRx;

/// <summary>
/// Records the default microphone into a looping clip, applies a simple
/// peak-level voice activity detector, and publishes each detected utterance as
/// 16-bit little-endian PCM through <see cref="OnMaxLevelChangeCommand"/>.
/// </summary>
/// <remarks>
/// An utterance starts when the peak level first exceeds <c>VoiceThreshold</c>
/// and ends after <c>VADTimeout</c> seconds without voice. Nothing is published
/// while only silence has been observed.
/// </remarks>
public class MicrophoneInput : MonoBehaviour
{
    private const int SampleWindow = 128;
    private const float VoiceThreshold = 0.25f;
    private const float VADTimeout = 1.0f; // 1 second timeout for VAD
    private const int SampleRate = 16000;
    private const int BufferLengthSeconds = 10;
    // Audio kept from before the threshold was crossed, so a soft onset is not cut off.
    private const float PreRollSeconds = 0.5f;
    // Flush before the ring buffer (BufferLengthSeconds) can overwrite the segment start.
    private const float MaxSegmentSeconds = 8.0f;

    // Reused every tick to avoid a per-tick allocation.
    private readonly float[] levelWindow = new float[SampleWindow];

    private AudioClip microphoneClip;
    private float lastVoiceDetectedTime;
    private bool isCapturingSpeech;
    private int segmentStartPosition;
    private float segmentStartTime;

    /// <summary>Fires with the PCM bytes of each captured speech segment.</summary>
    public ReactiveCommand<byte[]> OnMaxLevelChangeCommand = new();

    private void Start()
    {
        if (Microphone.devices.Length == 0)
        {
            Debug.LogError("MicrophoneInput.Start() no microphone device is available.");
            enabled = false;
            return;
        }

        microphoneClip = Microphone.Start(null, true, BufferLengthSeconds, SampleRate);
        if (microphoneClip == null)
        {
            Debug.LogError("MicrophoneInput.Start() failed to start recording.");
            enabled = false;
            return;
        }

        lastVoiceDetectedTime = Time.time;
    }

    private void OnDestroy()
    {
        // Release the capture device; otherwise recording continues after this object is gone.
        if (Microphone.IsRecording(null))
        {
            Microphone.End(null);
        }
    }

    private void FixedUpdate()
    {
        CheckMaxLevel();

        float now = Time.time;
        bool silenceTimedOut = now - lastVoiceDetectedTime > VADTimeout;
        bool segmentFull = isCapturingSpeech && now - segmentStartTime > MaxSegmentSeconds;
        if (!silenceTimedOut && !segmentFull)
        {
            return;
        }

        if (isCapturingSpeech)
        {
            int endPosition = Microphone.GetPosition(null);
            var microphoneData = GetMicrophoneData(endPosition);
            if (microphoneData != null)
            {
                OnMaxLevelChangeCommand.Execute(microphoneData);
            }

            if (silenceTimedOut || microphoneData == null)
            {
                isCapturingSpeech = false;
            }
            else
            {
                // Speech is still ongoing: continue with a new segment right where this one ended.
                segmentStartPosition = endPosition;
                segmentStartTime = now;
            }
        }

        if (silenceTimedOut)
        {
            lastVoiceDetectedTime = now; // Reset the timer after the silence window elapsed
        }
    }

    /// <summary>
    /// Measures the peak level of the most recent <c>SampleWindow</c> samples and,
    /// when it exceeds <c>VoiceThreshold</c>, marks voice activity and opens a
    /// segment if none is open.
    /// </summary>
    private void CheckMaxLevel()
    {
        int position = Microphone.GetPosition(null);
        if (position <= 0)
        {
            return; // recording has not produced data yet
        }

        int totalFrames = microphoneClip.samples;
        // Modulo keeps the window valid right after the ring buffer wraps;
        // AudioClip.GetData itself wraps reads that run past the end of the clip.
        int windowStart = (position - SampleWindow + totalFrames) % totalFrames;
        microphoneClip.GetData(levelWindow, windowStart);

        float maxLevel = 0f;
        foreach (var sample in levelWindow)
        {
            float absSample = Mathf.Abs(sample);
            if (absSample > maxLevel)
            {
                maxLevel = absSample;
            }
        }

        if (maxLevel <= VoiceThreshold)
        {
            return;
        }

        lastVoiceDetectedTime = Time.time; // Update the last detected time when voice is detected

        if (!isCapturingSpeech)
        {
            int preRollFrames = Mathf.Min((int)(PreRollSeconds * microphoneClip.frequency), totalFrames - 1);
            segmentStartPosition = (position - preRollFrames + totalFrames) % totalFrames;
            segmentStartTime = Time.time;
            isCapturingSpeech = true;
        }
    }

    /// <summary>
    /// Copies the current segment (from <c>segmentStartPosition</c> up to
    /// <paramref name="endPosition"/>) out of the ring buffer in chronological order.
    /// </summary>
    /// <param name="endPosition">Current microphone write position, in frames.</param>
    /// <returns>16-bit little-endian PCM bytes, or <c>null</c> when there is nothing to send.</returns>
    private byte[] GetMicrophoneData(int endPosition)
    {
        if (endPosition <= 0)
        {
            return null;
        }

        int totalFrames = microphoneClip.samples;
        int frameCount = (endPosition - segmentStartPosition + totalFrames) % totalFrames;
        if (frameCount == 0)
        {
            return null;
        }

        float[] samples = new float[frameCount * microphoneClip.channels];
        microphoneClip.GetData(samples, segmentStartPosition);

        byte[] audioData = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp so an out-of-range float cannot overflow the 16-bit conversion.
            short sample = (short)(Mathf.Clamp(samples[i], -1f, 1f) * short.MaxValue);
            // Explicit little-endian layout, independent of platform byte order, with no per-sample allocation.
            audioData[i * 2] = (byte)(sample & 0xFF);
            audioData[i * 2 + 1] = (byte)((sample >> 8) & 0xFF);
        }
        return audioData;
    }
}

using Cysharp.Threading.Tasks;
using UnityEngine;
using TMPro;
using UniRx;

/// <summary>
/// Glue component: forwards microphone audio to the speech service and shows
/// the recognized text in a TextMeshPro label.
/// </summary>
public class TranscriptView : MonoBehaviour
{
    [SerializeField] private MicrophoneInput microphoneInput;
    [SerializeField] private GoogleSTTService googleSTTService;

    public TMP_Text transcriptText;

    private void Awake()
    {
        // Fall back to components on the same GameObject when nothing was assigned in the Inspector.
        if (microphoneInput == null)
        {
            microphoneInput = GetComponent<MicrophoneInput>();
        }

        if (googleSTTService == null)
        {
            googleSTTService = GetComponent<GoogleSTTService>();
        }

        // Captured audio -> speech service.
        microphoneInput.OnMaxLevelChangeCommand
            .Subscribe(audio => googleSTTService.RecognizeSpeech(audio))
            .AddTo(this);

        // Recognized text -> label.
        googleSTTService.OnRecognizeSpeechCommand
            .Subscribe(text => transcriptText.text = text)
            .AddTo(this);
    }
}

📚 음성 활동 검출(Voice Activity Detection, VAD) 알고리즘

음성 활동 검출(Voice Activity Detection, VAD) 알고리즘은 오디오 신호에서 음성과 비음성 구간을 구분하는 기술입니다.

이 알고리즘은 다양한 응용 분야에서 사용됩니다. 예를 들어, 음성 인식 시스템에서 VAD는 음성 구간을 식별하여 불필요한 잡음을 제거하고 음성 인식의 정확성을 높입니다. 또한, 통신 시스템에서는 전송할 데이터를 줄여 대역폭을 절약할 수 있습니다.

📖 VAD 알고리즘의 기본 원리

에너지 기반 방법
음성 신호는 일반적으로 비음성 구간보다 높은 에너지를 가지기 때문에, 신호의 에너지를 측정하여 음성 구간을 탐지합니다.
주파수 도메인 방법
음성 신호와 비음성 신호는 주파수 스펙트럼에서 다른 특성을 가지므로, 주파수 분석을 통해 구분할 수 있습니다.
통계적 방법
신호의 통계적 특성을 이용하여 음성 구간과 비음성 구간을 구분합니다. 예를 들어, 신호의 자기상관 함수나 크로스 엔트로피 등을 이용할 수 있습니다.
기계 학습 방법
음성 데이터와 비음성 데이터를 학습하여 분류 모델을 생성합니다. 최근에는 딥러닝을 활용한 VAD 모델도 많이 사용됩니다.

📖 C#으로 VAD 구현 예시

아래는 이해를 돕기 위한 예제이며, 이번 글의 주제에서 사용된 예제는 아닙니다.

C#에서 VAD 알고리즘을 구현하기 위해 NAudio 라이브러리를 사용할 수 있습니다.

NAudio는 오디오 처리를 위한 라이브러리로, WAV 파일의 로드, 재생, 처리 등을 지원합니다.

using System;
using System.IO;
using NAudio.Wave;

/// <summary>
/// Minimal energy-based voice activity detection: copies only the chunks of a
/// WAV file whose mean energy exceeds a threshold into a new WAV file.
/// </summary>
class VAD
{
    /// <summary>
    /// Entry point.
    /// </summary>
    /// <param name="args">
    /// Optional: <c>args[0]</c> is the input path (default <c>input.wav</c>),
    /// <c>args[1]</c> is the output path (default <c>output.wav</c>).
    /// </param>
    static void Main(string[] args)
    {
        string inputFilePath = args.Length > 0 ? args[0] : "input.wav";
        string outputFilePath = args.Length > 1 ? args[1] : "output.wav";
        const float threshold = 0.01f;

        if (!File.Exists(inputFilePath))
        {
            Console.Error.WriteLine($"Input file not found: {inputFilePath}");
            return;
        }

        using (var reader = new AudioFileReader(inputFilePath))
        using (var writer = new WaveFileWriter(outputFilePath, reader.WaveFormat))
        {
            var sampleProvider = reader.ToSampleProvider();

            // Size the chunk in whole frames (one sample per channel). A chunk holding a
            // partial frame would shift channel alignment in the output once it is dropped.
            float[] buffer = new float[reader.WaveFormat.SampleRate * reader.WaveFormat.Channels];
            int samplesRead;

            while ((samplesRead = sampleProvider.Read(buffer, 0, buffer.Length)) > 0)
            {
                // Mean energy of the chunk (average of squared samples).
                double energy = 0;
                for (int i = 0; i < samplesRead; i++)
                {
                    energy += buffer[i] * buffer[i];
                }
                energy /= samplesRead;

                // Write only the chunks classified as speech to the output file.
                if (energy > threshold)
                {
                    writer.WriteSamples(buffer, 0, samplesRead);
                }
            }
        }

        Console.WriteLine($"VAD 처리가 완료되었습니다. 결과는 {outputFilePath} 파일에 저장되었습니다.");
    }
}

이 예제에서는 NAudio 라이브러리를 사용하여 WAV 파일을 읽고, 일정 길이의 버퍼 단위로 샘플의 평균 에너지(샘플 값 제곱의 평균)를 계산하여 음성 구간을 판별합니다. 평균 에너지가 특정 임계값(threshold)보다 큰 구간을 음성으로 간주하고, 해당 구간만 출력 파일에 기록합니다.

이 코드를 실행하기 위해서는 NAudio 라이브러리를 설치해야 합니다. NuGet 패키지 매니저 콘솔에서 다음 명령을 사용하여 설치할 수 있습니다.

Install-Package NAudio

저작자표시 비영리 변경금지

'⚙️ Programming > C# & Unity' 카테고리의 다른 글

2024 UDay Seoul: Industry \| Industry Vision & Roadmap Review (7)	2024.10.06
Unity 6 Web의 최적화 기법과 WebGL 활용 - 차세대 웹 애플리케이션의 가능성 (1)	2024.10.02
[Unity] OpenAI API(ChatGPT) 사용해보기 (2)	2024.06.20
[Unity] Profiling (0)	2024.01.17
[Unity] URP vs Built-In, Stencil과 렌더링 이론, Instancing 기법, SSAO와 Decals (3)	2024.01.16

블로그의 정보

Sugar

Sugar0810

[Unity] Google Cloud Speech to Text API + VAD 알고리즘

📜소스 코드

📚 음성 활동 검출(Voice Activity Detection, VAD) 알고리즘

📖 VAD 알고리즘의 기본 원리

📖 C#으로 VAD 구현 예시

'⚙️ Programming > C# & Unity' 카테고리의 다른 글

블로그의 정보

활동하기

티스토리툴바

📜소스 코드

📚 음성 활동 검출(Voice Activity Detection, VAD) 알고리즘

📖 VAD 알고리즘의 기본 원리

📖 C#으로 VAD 구현 예시

'⚙️ Programming > C# & Unity' 카테고리의 다른 글

블로그의 정보

활동하기

공유하기

다른 글

티스토리툴바