Implement speech-to-text functionality using microphone recording and the Whisper model

Speech Recognition

The SDK provides complete speech recognition functionality, including microphone recording, Voice Activity Detection (VAD), and audio-to-text conversion. This allows you to implement voice commands, voice chat, and other features in your games.

For example, players can speak to NPCs through voice, or use voice commands to control the game.

Before You Begin

Make sure you've completed SDK initialization
Understand microphone permission requirements for different platforms
Note that the WebGL platform does not support PlayKit_MicrophoneRecorder (see Platform Support)

Two Core Components

PlayKit_MicrophoneRecorder

A MonoBehaviour component for recording audio, with built-in Voice Activity Detection.

PlayKit_AudioTranscriptionClient

A client for converting audio to text.

Microphone Recording

Basic Recording

using PlayKit_SDK;
using PlayKit_SDK.Public;
using UnityEngine;

/// <summary>
/// Minimal recording example: attaches a <c>PlayKit_MicrophoneRecorder</c> to this
/// GameObject and exposes start/stop entry points (for example, for UI buttons).
/// </summary>
public class BasicRecording : MonoBehaviour
{
    private PlayKit_MicrophoneRecorder recorder;

    void Start()
    {
        // Create the recorder on this GameObject and configure it
        var micRecorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();
        micRecorder.maxRecordingSeconds = 60f;
        micRecorder.sampleRate = 16000;
        recorder = micRecorder;
    }

    /// <summary>Begins recording unless a recording is already in progress.</summary>
    public void StartRecording()
    {
        if (recorder.IsRecording)
        {
            return;
        }

        // StartRecording reports whether the recording could be started
        if (recorder.StartRecording())
        {
            Debug.Log("Recording started");
        }
        else
        {
            Debug.LogError("Failed to start recording");
        }
    }

    /// <summary>Stops the current recording, if any, and logs the clip length.</summary>
    public void StopRecording()
    {
        if (!recorder.IsRecording)
        {
            return;
        }

        AudioClip recorded = recorder.StopRecording();
        if (recorded == null)
        {
            return;
        }

        Debug.Log($"Recording complete: {recorded.length} seconds");
        // Use AudioClip
    }
}

Using Event Listeners

/// <summary>
/// Demonstrates reacting to recorder events (started, stopped, volume changed)
/// and removing the subscriptions when this component is destroyed.
/// </summary>
public class RecordingWithEvents : MonoBehaviour
{
    private PlayKit_MicrophoneRecorder recorder;

    void Start()
    {
        recorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();

        // Subscribe to events
        recorder.OnRecordingStarted += HandleRecordingStarted;
        recorder.OnRecordingStopped += HandleRecordingStopped;
        recorder.OnVolumeChanged += HandleVolumeChanged;
    }

    void OnDestroy()
    {
        // Unsubscribe
        if (recorder != null)
        {
            recorder.OnRecordingStarted -= HandleRecordingStarted;
            recorder.OnRecordingStopped -= HandleRecordingStopped;
            recorder.OnVolumeChanged -= HandleVolumeChanged;
        }
    }

    void HandleRecordingStarted()
    {
        Debug.Log("Recording has started");
    }

    /// <summary>Logs the length of the finished recording.</summary>
    /// <param name="clip">The recorded audio; may be null when no audio was captured.</param>
    void HandleRecordingStopped(AudioClip clip)
    {
        // Guard before dereferencing: a stop can be reported without a usable clip
        if (clip == null)
        {
            Debug.LogWarning("Recording stopped without audio");
            return;
        }

        Debug.Log($"Recording stopped: {clip.length} seconds");
    }

    void HandleVolumeChanged(float volume)
    {
        // Update volume indicator
        // volume range: 0.0 - 1.0
    }
}

Voice Activity Detection (VAD)

VAD can automatically detect silence and stop recording, providing a better user experience.

Configure VAD

/// <summary>
/// Configures the recorder's Voice Activity Detection settings and logs a message
/// whenever the recorder raises OnRecordingStopped.
/// </summary>
public class VADRecording : MonoBehaviour
{
    private PlayKit_MicrophoneRecorder recorder;

    void Start()
    {
        recorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();

        // VAD settings
        recorder.useVAD = true;
        recorder.silenceThreshold = 0.01f;
        recorder.maxSilenceDuration = 2.0f;

        // Listen for auto-stop
        recorder.OnRecordingStopped += HandleAutoStop;
    }

    /// <summary>Starts a recording that VAD is expected to end on its own.</summary>
    public void StartVADRecording()
    {
        // VAD will automatically detect silence and stop
        recorder.StartRecording();
    }

    // Invoked by the recorder's OnRecordingStopped event; the clip itself is not used here.
    void HandleAutoStop(AudioClip clip)
    {
        Debug.Log("Silence detected, recording stopped automatically");
    }
}

Real-time Volume Display

using UnityEngine.UI;

/// <summary>
/// Drives a slider and a fill image from the recorder's OnVolumeChanged event.
/// </summary>
public class VolumeIndicator : MonoBehaviour
{
    [SerializeField] private Slider volumeSlider;
    [SerializeField] private Image volumeFill;

    private PlayKit_MicrophoneRecorder recorder;

    void Start()
    {
        recorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();
        recorder.OnVolumeChanged += UpdateVolumeDisplay;
    }

    /// <summary>Mirrors the reported volume on the slider and tints the fill.</summary>
    /// <param name="volume">Current input volume reported by the recorder.</param>
    void UpdateVolumeDisplay(float volume)
    {
        volumeSlider.value = volume;

        // Green while the volume is above the recorder's silence threshold, gray otherwise
        bool soundDetected = volume > recorder.silenceThreshold;
        volumeFill.color = soundDetected ? Color.green : Color.gray;
    }
}

Audio to Text

Basic Transcription

using Cysharp.Threading.Tasks;
using PlayKit_SDK;
using PlayKit_SDK.Public;
using UnityEngine;

/// <summary>
/// Minimal transcription example: creates a "whisper-1" transcription client and
/// converts an AudioClip to text.
/// </summary>
public class BasicTranscription : MonoBehaviour
{
    private PlayKit_AudioTranscriptionClient transcriptionClient;

    void Start()
    {
        transcriptionClient = PlayKitSDK.Factory.CreateTranscriptionClient("whisper-1");
    }

    /// <summary>Transcribes the clip and logs either the recognized text or the error.</summary>
    /// <param name="audioClip">Audio to transcribe.</param>
    public async UniTask TranscribeAudio(AudioClip audioClip)
    {
        // Token tied to this component's lifetime
        var destroyToken = this.GetCancellationTokenOnDestroy();

        // Transcribe audio
        PlayKit_TranscriptionResult outcome = await transcriptionClient.TranscribeAudioClipAsync(
            audioClip: audioClip,
            language: null,  // Auto-detect language
            prompt: null,    // Optional context hint
            cancellationToken: destroyToken
        );

        if (!outcome.Success)
        {
            Debug.LogError($"Transcription failed: {outcome.Error}");
            return;
        }

        Debug.Log($"Recognized text: {outcome.Text}");
        Debug.Log($"Language: {outcome.Language}");
        Debug.Log($"Duration: {outcome.DurationInSeconds} seconds");

        // Use the text
        ProcessTranscribedText(outcome.Text);
    }

    void ProcessTranscribedText(string text)
    {
        // Process recognized text
    }
}

Specify Language

// Specify the source language to improve accuracy.
// Fragment: must run inside an async method of a MonoBehaviour; `transcriptionClient`
// and `audioClip` are declared by the surrounding code.
var result = await transcriptionClient.TranscribeAudioClipAsync(
    audioClip: audioClip,
    language: "en",  // English
    prompt: null,
    cancellationToken: this.GetCancellationTokenOnDestroy()
);

Supported language codes:

"en" - English
"zh" - Chinese
"ja" - Japanese
"ko" - Korean
"es" - Spanish
And more (all languages supported by Whisper model)

Use Prompts to Improve Accuracy

// Provide context hints to improve recognition of specific terms.
// Here the hint is a comma-separated list of vocabulary expected in the audio.
string prompt = "Unity, Unreal Engine, GameObject, Transform";

// Fragment: must run inside an async method; `transcriptionClient` and `audioClip`
// are declared by the surrounding code.
var result = await transcriptionClient.TranscribeAudioClipAsync(
    audioClip: audioClip,
    language: "en",
    prompt: prompt,  // Hint for technical terms
    cancellationToken: this.GetCancellationTokenOnDestroy()
);

View Detailed Segments

Transcription results include timestamp information:

// Arguments elided; pass the same parameters as in the earlier examples.
var result = await transcriptionClient.TranscribeAudioClipAsync(/*...*/);

// Segments is checked for null before iterating.
if (result.Success && result.Segments != null)
{
    Debug.Log("Transcription segments:");
    foreach (var segment in result.Segments)
    {
        // Prints each segment's start/end (formatted to two decimals) followed by its text
        Debug.Log($"[{segment.Start:F2}s - {segment.End:F2}s]: {segment.Text}");
    }
}

Complete Speech-to-Text Flow

Record and Transcribe

using Cysharp.Threading.Tasks;
using PlayKit_SDK;
using PlayKit_SDK.Public;
using UnityEngine;
using UnityEngine.UI;

/// <summary>
/// Complete record-and-transcribe flow: one button toggles recording, and the
/// recorded clip is transcribed when the recorder reports that it has stopped.
/// </summary>
public class SpeechToText : MonoBehaviour
{
    [SerializeField] private Button recordButton;
    [SerializeField] private Text statusText;
    [SerializeField] private Text resultText;

    private PlayKit_MicrophoneRecorder recorder;
    private PlayKit_AudioTranscriptionClient transcriptionClient;

    // True while a transcription request is in flight; blocks new recordings.
    private bool isProcessing = false;

    void Start()
    {
        // Initialize components
        recorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();
        recorder.useVAD = true;
        recorder.maxSilenceDuration = 2.0f;

        transcriptionClient = PlayKitSDK.Factory.CreateTranscriptionClient("whisper-1");
        recordButton.onClick.AddListener(ToggleRecording);
        recorder.OnRecordingStopped += OnRecordingStopped;
    }

    void OnDestroy()
    {
        // Remove the handlers registered in Start
        if (recorder != null)
        {
            recorder.OnRecordingStopped -= OnRecordingStopped;
        }

        if (recordButton != null)
        {
            recordButton.onClick.RemoveListener(ToggleRecording);
        }
    }

    /// <summary>Starts a recording, or stops the one in progress.</summary>
    void ToggleRecording()
    {
        if (isProcessing)
        {
            return;
        }

        if (recorder.IsRecording)
        {
            // Stop recording; the clip is delivered through OnRecordingStopped,
            // so the return value is intentionally not used here.
            recorder.StopRecording();
            return;
        }

        // Start recording
        if (recorder.StartRecording())
        {
            statusText.text = "Recording...";
            SetButtonLabel("Stop");
        }
        else
        {
            // Surface the failure instead of leaving the UI unchanged
            statusText.text = "Failed to start recording";
        }
    }

    /// <summary>Transcribes the finished recording and shows the result.</summary>
    /// <param name="clip">The recorded audio, or null when recording failed.</param>
    async void OnRecordingStopped(AudioClip clip)
    {
        if (clip == null)
        {
            statusText.text = "Recording failed";
            // Reset the label so the button does not keep showing "Stop"
            SetButtonLabel("Start Recording");
            return;
        }

        isProcessing = true;
        statusText.text = "Recognizing...";
        recordButton.interactable = false;

        try
        {
            // Transcribe audio
            var result = await transcriptionClient.TranscribeAudioClipAsync(
                audioClip: clip,
                language: null,
                prompt: null,
                cancellationToken: this.GetCancellationTokenOnDestroy()
            );

            // Display result
            if (result.Success)
            {
                resultText.text = result.Text;
                statusText.text = "Recognition complete";
            }
            else
            {
                statusText.text = $"Recognition failed: {result.Error}";
            }
        }
        catch (System.OperationCanceledException)
        {
            // The token is cancelled when this component is destroyed; if the SDK
            // surfaces that as an exception, the UI may already be gone, so stop here.
            return;
        }
        catch (System.Exception ex)
        {
            // async void: nothing upstream can observe this, so report it here
            Debug.LogException(ex);
            statusText.text = "Recognition failed";
        }

        isProcessing = false;
        recordButton.interactable = true;
        SetButtonLabel("Start Recording");
    }

    // Updates the caption of the Text found under the record button.
    void SetButtonLabel(string label)
    {
        recordButton.GetComponentInChildren<Text>().text = label;
    }
}

Voice Chat Integration

Combine speech recognition with AI conversation:

using System.Collections.Generic;
using Cysharp.Threading.Tasks;
using PlayKit_SDK;
using PlayKit_SDK.Public;
using UnityEngine;
using UnityEngine.UI;

/// <summary>
/// Voice chat example: records the player's speech, transcribes it, sends the text
/// to the chat client together with the running history, and shows the reply.
/// </summary>
public class VoiceChat : MonoBehaviour
{
    [SerializeField] private Button talkButton;
    [SerializeField] private Text chatDisplay;

    private PlayKit_MicrophoneRecorder recorder;
    private PlayKit_AudioTranscriptionClient transcriptionClient;
    private PlayKit_AIChatClient chatClient;

    // Conversation so far (system, user and assistant messages), sent with every request.
    private List<PlayKit_ChatMessage> chatHistory = new List<PlayKit_ChatMessage>();

    void Start()
    {
        // Initialize components
        recorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();
        recorder.useVAD = true;
        recorder.OnRecordingStopped += ProcessVoiceInput;

        transcriptionClient = PlayKitSDK.Factory.CreateTranscriptionClient("whisper-1");
        chatClient = PlayKitSDK.Factory.CreateChatClient();

        // Set system message
        chatHistory.Add(new PlayKit_ChatMessage
        {
            Role = "system",
            Content = "You are a friendly game NPC."
        });

        talkButton.onClick.AddListener(StartVoiceInput);
    }

    void OnDestroy()
    {
        // Remove the handlers registered in Start
        if (recorder != null)
        {
            recorder.OnRecordingStopped -= ProcessVoiceInput;
        }

        if (talkButton != null)
        {
            talkButton.onClick.RemoveListener(StartVoiceInput);
        }
    }

    void StartVoiceInput()
    {
        if (!recorder.IsRecording)
        {
            recorder.StartRecording();
            talkButton.GetComponentInChildren<Text>().text = "Recording...";
        }
    }

    /// <summary>Handles a finished recording: runs one conversation turn and restores the button.</summary>
    /// <param name="clip">The recorded audio, or null when no audio was captured.</param>
    async void ProcessVoiceInput(AudioClip clip)
    {
        if (clip == null)
        {
            // Nothing to transcribe; restore the button instead of sending a null clip
            chatDisplay.text += "\nRecording failed";
            ResetTalkButton();
            return;
        }

        talkButton.interactable = false;

        try
        {
            await RunConversationTurn(clip);
        }
        catch (System.OperationCanceledException)
        {
            // The token is cancelled when this component is destroyed; if the SDK
            // surfaces that as an exception, the UI may already be gone, so stop here.
            return;
        }
        catch (System.Exception ex)
        {
            // async void: nothing upstream can observe this, so report it here
            Debug.LogException(ex);
        }

        // Runs on every non-cancelled path, so the label never stays on "Recording..."
        ResetTalkButton();
    }

    // Transcribes the clip, appends the player's text to the history, requests the
    // AI reply and appends the outcome to the chat display.
    async UniTask RunConversationTurn(AudioClip clip)
    {
        // 1. Speech to text
        chatDisplay.text += "\n[Recognizing...]";

        var transcription = await transcriptionClient.TranscribeAudioClipAsync(
            clip,
            null,
            null,
            this.GetCancellationTokenOnDestroy()
        );

        if (!transcription.Success)
        {
            chatDisplay.text += $"\nRecognition failed: {transcription.Error}";
            return;
        }

        string userText = transcription.Text;
        chatDisplay.text += $"\nPlayer: {userText}";

        // 2. Send to AI
        chatHistory.Add(new PlayKit_ChatMessage
        {
            Role = "user",
            Content = userText
        });

        var config = new PlayKit_ChatConfig(chatHistory) { Temperature = 0.7f };
        var aiResponse = await chatClient.TextGenerationAsync(config, this.GetCancellationTokenOnDestroy());

        if (aiResponse.Success)
        {
            chatHistory.Add(new PlayKit_ChatMessage
            {
                Role = "assistant",
                Content = aiResponse.Response
            });

            chatDisplay.text += $"\nNPC: {aiResponse.Response}";
        }
        else
        {
            chatDisplay.text += $"\nAI response failed: {aiResponse.ErrorMessage}";
        }
    }

    // Re-enables the talk button and restores its idle caption.
    void ResetTalkButton()
    {
        talkButton.interactable = true;
        talkButton.GetComponentInChildren<Text>().text = "Press to Talk";
    }
}

Microphone Device Management

List Available Devices

/// <summary>
/// Fills a dropdown with the available microphone devices and applies the chosen
/// device to the PlayKit_MicrophoneRecorder on the same GameObject.
/// </summary>
public class MicrophoneDeviceSelector : MonoBehaviour
{
    [SerializeField] private Dropdown deviceDropdown;

    // Snapshot of the device list shown in the dropdown. Selection indexes into this
    // snapshot so an option always maps to the name that was displayed, even if the
    // set of connected devices changes afterwards.
    private string[] devices = new string[0];

    void Start()
    {
        // Get all microphone devices
        devices = PlayKit_MicrophoneRecorder.GetAvailableDevices() ?? new string[0];

        // Populate dropdown
        deviceDropdown.ClearOptions();
        deviceDropdown.AddOptions(new List<string>(devices));

        // Select device
        deviceDropdown.onValueChanged.AddListener(SelectDevice);
    }

    /// <summary>Applies the device at the given dropdown index to the recorder.</summary>
    /// <param name="index">Index of the selected dropdown option.</param>
    void SelectDevice(int index)
    {
        // Reject indices outside the populated list (covers an empty device list too)
        if (index < 0 || index >= devices.Length)
        {
            return;
        }

        // This component does not create the recorder itself, so it may be absent
        var recorder = GetComponent<PlayKit_MicrophoneRecorder>();
        if (recorder == null)
        {
            Debug.LogWarning("No PlayKit_MicrophoneRecorder found on this GameObject");
            return;
        }

        string selectedDevice = devices[index];
        recorder.SetMicrophoneDevice(selectedDevice);

        Debug.Log($"Selected microphone: {selectedDevice}");
    }
}

Use Default Device

// The two calls below are alternatives; use one or the other.

// Use system default microphone
recorder.StartRecording(null);

// Or specify device
// (the string is an example device name; real names come from GetAvailableDevices())
recorder.StartRecording("Microphone (Realtek High Definition Audio)");

Platform Support

Windows / macOS / Linux

Full support for microphone recording and speech recognition.

iOS / Android

Microphone permissions must be declared in the platform configuration files before building:

iOS: Add to Info.plist:

<key>NSMicrophoneUsageDescription</key>
<string>Microphone access is required for voice input</string>

Android: Add to AndroidManifest.xml:

<uses-permission android:name="android.permission.RECORD_AUDIO" />

WebGL

WebGL platform does not support the PlayKit_MicrophoneRecorder component because Unity's Microphone API is not available on WebGL.

If you need speech recognition on WebGL, you'll need to use the browser's Web Audio API and integrate via JavaScript plugins.

Audio Formats

Supported Formats

The transcription client supports multiple audio formats:

WAV - Recommended format, 16kHz PCM16
MP3
FLAC
OGG

Convert AudioClip to Byte Array

If you need to process audio data manually:

/// <summary>
/// Encodes an AudioClip as an in-memory WAV file (RIFF header followed by 16-bit PCM samples).
/// The SDK handles this internally; use this only when you need the raw bytes yourself.
/// </summary>
/// <param name="clip">The clip to encode.</param>
/// <returns>A byte array containing the 44-byte WAV header and the sample data.</returns>
/// <exception cref="System.ArgumentNullException">Thrown when <paramref name="clip"/> is null.</exception>
public byte[] AudioClipToWAV(AudioClip clip)
{
    if (clip == null)
    {
        throw new System.ArgumentNullException(nameof(clip));
    }

    // Read every sample of every channel into one buffer
    var samples = new float[clip.samples * clip.channels];
    clip.GetData(samples, 0);

    const int headerSize = 44;
    const short bitsPerSample = 16;
    const int bytesPerSample = bitsPerSample / 8;

    int channels = clip.channels;
    int sampleRate = clip.frequency;
    int dataSize = samples.Length * bytesPerSample;

    // BinaryWriter emits little-endian values, which is the byte order WAV uses
    using (var stream = new System.IO.MemoryStream(headerSize + dataSize))
    using (var writer = new System.IO.BinaryWriter(stream))
    {
        // RIFF chunk descriptor
        writer.Write(System.Text.Encoding.ASCII.GetBytes("RIFF"));
        writer.Write(headerSize - 8 + dataSize);                    // bytes remaining after this field
        writer.Write(System.Text.Encoding.ASCII.GetBytes("WAVE"));

        // "fmt " sub-chunk
        writer.Write(System.Text.Encoding.ASCII.GetBytes("fmt "));
        writer.Write(16);                                           // sub-chunk size for PCM
        writer.Write((short)1);                                     // audio format 1 = uncompressed PCM
        writer.Write((short)channels);
        writer.Write(sampleRate);
        writer.Write(sampleRate * channels * bytesPerSample);       // byte rate
        writer.Write((short)(channels * bytesPerSample));           // block align
        writer.Write(bitsPerSample);

        // "data" sub-chunk
        writer.Write(System.Text.Encoding.ASCII.GetBytes("data"));
        writer.Write(dataSize);

        foreach (float sample in samples)
        {
            // Clamp so out-of-range samples saturate instead of wrapping on the cast
            float clamped = Mathf.Clamp(sample, -1f, 1f);
            writer.Write((short)(clamped * short.MaxValue));
        }

        writer.Flush();
        return stream.ToArray();
    }
}

Direct Byte Array Transcription

// If you already have audio byte array
// NOTE: LoadAudioFile is not defined on this page; treat it as a placeholder for your
// own code that returns the encoded file contents as bytes.
byte[] audioData = LoadAudioFile("recording.wav");

// Fragment: must run inside an async method; `transcriptionClient` is declared by
// the surrounding code.
var result = await transcriptionClient.TranscribeAsync(
    audioData: audioData,
    language: null,
    prompt: null,
    cancellationToken: this.GetCancellationTokenOnDestroy()
);

Best Practices

Use VAD: Enable Voice Activity Detection for better user experience
Appropriate sample rate: Use 16 kHz (sampleRate = 16000) for the Whisper model
Specify language: If you know the source language, specify it for better accuracy
Provide prompts: Use prompts to improve recognition of technical terms
Limit recording duration: Set reasonable maxRecordingSeconds to avoid excessively long recordings
Show status: Clearly display recording and recognition status to users
Handle permissions: Properly request and handle microphone permissions on mobile platforms
Error handling: Always check result.Success and handle failure cases

Next Steps

Learn about NPC Conversations for intelligent dialogue
Read the API Reference for complete API documentation

Speech Recognition

On this page