Speech Recognition
Implement speech-to-text functionality using microphone recording and the Whisper model
Speech Recognition
The SDK provides complete speech recognition functionality, including microphone recording, Voice Activity Detection (VAD), and audio-to-text conversion. This allows you to implement voice commands, voice chat, and other features in your games.
For example, players can speak to NPCs through voice, or use voice commands to control the game.
Before You Begin
- Make sure you've completed SDK initialization
- Understand microphone permission requirements for different platforms
- Note that the WebGL platform does not support PlayKit_MicrophoneRecorder
Two Core Components
PlayKit_MicrophoneRecorder
A MonoBehaviour component for recording audio, with built-in Voice Activity Detection.
PlayKit_AudioTranscriptionClient
A client for converting audio to text.
Microphone Recording
Basic Recording
using PlayKit_SDK;
using PlayKit_SDK.Public;
using UnityEngine;
/// <summary>
/// Minimal recording example: adds a PlayKit_MicrophoneRecorder to this GameObject
/// and exposes methods to start and stop a recording.
/// </summary>
public class BasicRecording : MonoBehaviour
{
    private PlayKit_MicrophoneRecorder recorder;

    /// <summary>Creates the recorder component and applies the recording settings.</summary>
    void Start()
    {
        recorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();
        recorder.maxRecordingSeconds = 60f;
        recorder.sampleRate = 16000;
    }

    /// <summary>Starts a recording unless one is already running, and logs the outcome.</summary>
    public void StartRecording()
    {
        if (recorder.IsRecording)
        {
            return;
        }

        bool started = recorder.StartRecording();
        if (!started)
        {
            Debug.LogError("Failed to start recording");
            return;
        }

        Debug.Log("Recording started");
    }

    /// <summary>Stops the running recording, if any, and logs the length of the resulting clip.</summary>
    public void StopRecording()
    {
        if (!recorder.IsRecording)
        {
            return;
        }

        AudioClip recorded = recorder.StopRecording();
        if (recorded == null)
        {
            return;
        }

        Debug.Log($"Recording complete: {recorded.length} seconds");
        // Use AudioClip
    }
}
Using Event Listeners
/// <summary>
/// Event-driven recording example: subscribes to the recorder's started, stopped and
/// volume-changed events in Start and removes the handlers again in OnDestroy.
/// </summary>
public class RecordingWithEvents : MonoBehaviour
{
    private PlayKit_MicrophoneRecorder recorder;

    void Start()
    {
        recorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();

        // Subscribe to events
        recorder.OnRecordingStarted += HandleRecordingStarted;
        recorder.OnRecordingStopped += HandleRecordingStopped;
        recorder.OnVolumeChanged += HandleVolumeChanged;
    }

    void OnDestroy()
    {
        // Unsubscribe
        if (recorder != null)
        {
            recorder.OnRecordingStarted -= HandleRecordingStarted;
            recorder.OnRecordingStopped -= HandleRecordingStopped;
            recorder.OnVolumeChanged -= HandleVolumeChanged;
        }
    }

    /// <summary>Called when the recorder reports that recording has started.</summary>
    void HandleRecordingStarted()
    {
        Debug.Log("Recording has started");
    }

    /// <summary>Called when the recorder reports that recording has stopped.</summary>
    /// <param name="clip">The recorded audio; may be null when the recording failed.</param>
    void HandleRecordingStopped(AudioClip clip)
    {
        // A failed recording can deliver a null clip, so guard before reading its length.
        if (clip == null)
        {
            Debug.LogWarning("Recording stopped without producing an audio clip");
            return;
        }

        Debug.Log($"Recording stopped: {clip.length} seconds");
    }

    /// <summary>Called whenever the recorder reports a new input volume.</summary>
    /// <param name="volume">Current volume, in the range 0.0 - 1.0.</param>
    void HandleVolumeChanged(float volume)
    {
        // Update volume indicator
        // volume range: 0.0 - 1.0
    }
}
Voice Activity Detection (VAD)
VAD can automatically detect silence and stop recording, providing a better user experience.
Configure VAD
/// <summary>
/// Recording example with Voice Activity Detection enabled, so the recorder stops
/// on its own after a period of silence.
/// </summary>
public class VADRecording : MonoBehaviour
{
    private PlayKit_MicrophoneRecorder recorder;

    void Start()
    {
        recorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();
        recorder.useVAD = true;
        recorder.silenceThreshold = 0.01f;
        recorder.maxSilenceDuration = 2.0f;

        // Listen for auto-stop. A named handler (rather than a lambda) is used so
        // that it can be removed again in OnDestroy.
        recorder.OnRecordingStopped += HandleRecordingStopped;
    }

    void OnDestroy()
    {
        if (recorder != null)
        {
            recorder.OnRecordingStopped -= HandleRecordingStopped;
        }
    }

    /// <summary>Starts a recording; VAD ends it automatically once silence is detected.</summary>
    public void StartVADRecording()
    {
        // StartRecording reports failure through its return value; surface it
        // instead of silently ignoring it.
        if (!recorder.StartRecording())
        {
            Debug.LogError("Failed to start recording");
        }
        // VAD will automatically detect silence and stop
    }

    /// <summary>Called when the recorder stops.</summary>
    /// <param name="clip">The recorded audio delivered by the recorder.</param>
    void HandleRecordingStopped(AudioClip clip)
    {
        Debug.Log("Silence detected, recording stopped automatically");
    }
}
Real-time Volume Display
using UnityEngine.UI;
/// <summary>
/// Drives a slider and a fill image from the recorder's volume events so the player
/// can see whether the microphone is picking up sound.
/// </summary>
public class VolumeIndicator : MonoBehaviour
{
    [SerializeField] private Slider volumeSlider;
    [SerializeField] private Image volumeFill;

    private PlayKit_MicrophoneRecorder recorder;

    void Start()
    {
        recorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();
        recorder.OnVolumeChanged += UpdateVolumeDisplay;
    }

    void OnDestroy()
    {
        // Remove the handler so the recorder does not call into a destroyed indicator.
        if (recorder != null)
        {
            recorder.OnVolumeChanged -= UpdateVolumeDisplay;
        }
    }

    /// <summary>Updates the slider and fill color for the latest volume reading.</summary>
    /// <param name="volume">Current input volume reported by the recorder.</param>
    void UpdateVolumeDisplay(float volume)
    {
        volumeSlider.value = volume;

        // Change color based on volume: green when above the recorder's silence
        // threshold (sound detected), gray otherwise (silence).
        bool soundDetected = volume > recorder.silenceThreshold;
        volumeFill.color = soundDetected ? Color.green : Color.gray;
    }
}
Audio to Text
Basic Transcription
using Cysharp.Threading.Tasks;
using PlayKit_SDK;
using PlayKit_SDK.Public;
using UnityEngine;
/// <summary>
/// Minimal transcription example: creates a "whisper-1" transcription client and
/// converts an AudioClip to text.
/// </summary>
public class BasicTranscription : MonoBehaviour
{
    private PlayKit_AudioTranscriptionClient transcriptionClient;

    void Start()
    {
        transcriptionClient = PlayKitSDK.Factory.CreateTranscriptionClient("whisper-1");
    }

    /// <summary>Transcribes the given clip and logs the recognized text or the error.</summary>
    /// <param name="audioClip">The audio to transcribe.</param>
    public async UniTask TranscribeAudio(AudioClip audioClip)
    {
        // Transcribe audio
        PlayKit_TranscriptionResult transcription = await transcriptionClient.TranscribeAudioClipAsync(
            audioClip: audioClip,
            language: null, // Auto-detect language
            prompt: null,   // Optional context hint
            cancellationToken: this.GetCancellationTokenOnDestroy()
        );

        if (!transcription.Success)
        {
            Debug.LogError($"Transcription failed: {transcription.Error}");
            return;
        }

        Debug.Log($"Recognized text: {transcription.Text}");
        Debug.Log($"Language: {transcription.Language}");
        Debug.Log($"Duration: {transcription.DurationInSeconds} seconds");

        // Use the text
        ProcessTranscribedText(transcription.Text);
    }

    /// <summary>Hook for game-specific handling of the recognized text.</summary>
    void ProcessTranscribedText(string text)
    {
        // Process recognized text
    }
}
Specify Language
// Passing the source language explicitly (instead of null) improves accuracy
var destroyToken = this.GetCancellationTokenOnDestroy();
var result = await transcriptionClient.TranscribeAudioClipAsync(
    audioClip: audioClip,
    language: "en", // English
    prompt: null,
    cancellationToken: destroyToken
);
Supported language codes:
- "en" - English
- "zh" - Chinese
- "ja" - Japanese
- "ko" - Korean
- "es" - Spanish
- And more (all languages supported by the Whisper model)
Use Prompts to Improve Accuracy
// A prompt listing domain-specific terms helps the model recognize them
string vocabularyHint = "Unity, Unreal Engine, GameObject, Transform";
var destroyToken = this.GetCancellationTokenOnDestroy();
var result = await transcriptionClient.TranscribeAudioClipAsync(
    audioClip: audioClip,
    language: "en",
    prompt: vocabularyHint, // Hint for technical terms
    cancellationToken: destroyToken
);
View Detailed Segments
Transcription results include timestamp information:
var result = await transcriptionClient.TranscribeAudioClipAsync(/*...*/);

// Segments are only read when the call succeeded and segment data is present.
bool hasSegments = result.Success && result.Segments != null;
if (hasSegments)
{
    Debug.Log("Transcription segments:");
    foreach (var part in result.Segments)
    {
        // One line per segment: start/end time (two decimals) followed by its text.
        string line = $"[{part.Start:F2}s - {part.End:F2}s]: {part.Text}";
        Debug.Log(line);
    }
}
Complete Speech-to-Text Flow
Record and Transcribe
using Cysharp.Threading.Tasks;
using PlayKit_SDK;
using PlayKit_SDK.Public;
using UnityEngine;
using UnityEngine.UI;
/// <summary>
/// Complete record-and-transcribe flow: a button toggles recording, and every finished
/// recording is sent to the transcription client and the result is shown in the UI.
/// </summary>
public class SpeechToText : MonoBehaviour
{
    private const string IdleButtonLabel = "Start Recording";
    private const string RecordingButtonLabel = "Stop";

    [SerializeField] private Button recordButton;
    [SerializeField] private Text statusText;
    [SerializeField] private Text resultText;

    private PlayKit_MicrophoneRecorder recorder;
    private PlayKit_AudioTranscriptionClient transcriptionClient;

    // True while a transcription request is in flight; blocks new recordings.
    private bool isProcessing = false;

    void Start()
    {
        // Initialize components
        recorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();
        recorder.useVAD = true;
        recorder.maxSilenceDuration = 2.0f;
        recorder.OnRecordingStopped += OnRecordingStopped;

        transcriptionClient = PlayKitSDK.Factory.CreateTranscriptionClient("whisper-1");

        recordButton.onClick.AddListener(ToggleRecording);
    }

    void OnDestroy()
    {
        // Unsubscribe so neither the recorder nor the button calls into a destroyed object.
        if (recorder != null)
        {
            recorder.OnRecordingStopped -= OnRecordingStopped;
        }
        if (recordButton != null)
        {
            recordButton.onClick.RemoveListener(ToggleRecording);
        }
    }

    /// <summary>
    /// Button handler: starts a recording when idle, stops it when one is running.
    /// Ignored while a transcription is still being processed.
    /// </summary>
    void ToggleRecording()
    {
        if (isProcessing)
        {
            return;
        }

        if (recorder.IsRecording)
        {
            // Stop recording. The clip is delivered through OnRecordingStopped,
            // so the return value is not needed here.
            recorder.StopRecording();
            return;
        }

        // Start recording
        if (recorder.StartRecording())
        {
            statusText.text = "Recording...";
            SetButtonLabel(RecordingButtonLabel);
        }
        else
        {
            statusText.text = "Failed to start recording";
        }
    }

    /// <summary>
    /// Recorder callback: transcribes the finished clip and displays the result.
    /// </summary>
    /// <param name="clip">The recorded audio; null means the recording failed.</param>
    async void OnRecordingStopped(AudioClip clip)
    {
        if (clip == null)
        {
            statusText.text = "Recording failed";
            // The recording may have been stopped while the button still read "Stop".
            SetButtonLabel(IdleButtonLabel);
            return;
        }

        isProcessing = true;
        statusText.text = "Recognizing...";
        recordButton.interactable = false;

        try
        {
            // Transcribe audio
            var result = await transcriptionClient.TranscribeAudioClipAsync(
                audioClip: clip,
                language: null,
                prompt: null,
                cancellationToken: this.GetCancellationTokenOnDestroy()
            );

            // The object may have been destroyed while awaiting; its UI is then gone too.
            if (this != null)
            {
                // Display result
                if (result.Success)
                {
                    resultText.text = result.Text;
                    statusText.text = "Recognition complete";
                }
                else
                {
                    statusText.text = $"Recognition failed: {result.Error}";
                }
            }
        }
        catch (System.OperationCanceledException)
        {
            // Request was cancelled (the token is tied to this object's destruction);
            // nothing to report.
        }
        catch (System.Exception ex)
        {
            // async void: an unhandled exception here could not be observed by any caller.
            Debug.LogException(ex);
            if (this != null)
            {
                statusText.text = $"Recognition failed: {ex.Message}";
            }
        }
        finally
        {
            // Always leave the processing state, even when the request threw.
            isProcessing = false;
            if (this != null)
            {
                recordButton.interactable = true;
                SetButtonLabel(IdleButtonLabel);
            }
        }
    }

    /// <summary>Sets the caption of the record button's child Text component.</summary>
    void SetButtonLabel(string label)
    {
        recordButton.GetComponentInChildren<Text>().text = label;
    }
}
Voice Chat Integration
Combine speech recognition with AI conversation:
using System.Collections.Generic;
using Cysharp.Threading.Tasks;
using PlayKit_SDK;
using PlayKit_SDK.Public;
using UnityEngine;
using UnityEngine.UI;
/// <summary>
/// Voice chat example: records the player's speech, transcribes it, sends the text to
/// the AI chat client together with the conversation history, and appends the exchange
/// to a text display.
/// </summary>
public class VoiceChat : MonoBehaviour
{
    private const string IdleButtonLabel = "Press to Talk";

    [SerializeField] private Button talkButton;
    [SerializeField] private Text chatDisplay;

    private PlayKit_MicrophoneRecorder recorder;
    private PlayKit_AudioTranscriptionClient transcriptionClient;
    private PlayKit_AIChatClient chatClient;

    // Full conversation (system, user and assistant messages) sent with every request.
    private List<PlayKit_ChatMessage> chatHistory = new List<PlayKit_ChatMessage>();

    void Start()
    {
        // Initialize components
        recorder = gameObject.AddComponent<PlayKit_MicrophoneRecorder>();
        recorder.useVAD = true;
        recorder.OnRecordingStopped += ProcessVoiceInput;

        transcriptionClient = PlayKitSDK.Factory.CreateTranscriptionClient("whisper-1");
        chatClient = PlayKitSDK.Factory.CreateChatClient();

        // Set system message
        chatHistory.Add(new PlayKit_ChatMessage
        {
            Role = "system",
            Content = "You are a friendly game NPC."
        });

        talkButton.onClick.AddListener(StartVoiceInput);
    }

    void OnDestroy()
    {
        // Unsubscribe so neither the recorder nor the button calls into a destroyed object.
        if (recorder != null)
        {
            recorder.OnRecordingStopped -= ProcessVoiceInput;
        }
        if (talkButton != null)
        {
            talkButton.onClick.RemoveListener(StartVoiceInput);
        }
    }

    /// <summary>Button handler: starts a recording unless one is already running.</summary>
    void StartVoiceInput()
    {
        if (recorder.IsRecording)
        {
            return;
        }

        if (recorder.StartRecording())
        {
            talkButton.GetComponentInChildren<Text>().text = "Recording...";
        }
        else
        {
            chatDisplay.text += "\nFailed to start recording";
        }
    }

    /// <summary>
    /// Recorder callback: transcribes the clip, forwards the text to the chat client and
    /// shows both sides of the exchange.
    /// </summary>
    /// <param name="clip">The recorded audio; null means the recording failed.</param>
    async void ProcessVoiceInput(AudioClip clip)
    {
        if (clip == null)
        {
            chatDisplay.text += "\nRecording failed";
            ResetTalkButton();
            return;
        }

        talkButton.interactable = false;

        try
        {
            var destroyToken = this.GetCancellationTokenOnDestroy();

            // 1. Speech to text
            chatDisplay.text += "\n[Recognizing...]";
            var transcription = await transcriptionClient.TranscribeAudioClipAsync(
                clip,
                null,
                null,
                destroyToken
            );

            // The object may have been destroyed while awaiting; its UI is then gone too.
            if (this == null)
            {
                return;
            }

            if (!transcription.Success)
            {
                chatDisplay.text += $"\nRecognition failed: {transcription.Error}";
                return;
            }

            string userText = transcription.Text;
            chatDisplay.text += $"\nPlayer: {userText}";

            // 2. Send to AI
            chatHistory.Add(new PlayKit_ChatMessage
            {
                Role = "user",
                Content = userText
            });

            var config = new PlayKit_ChatConfig(chatHistory) { Temperature = 0.7f };
            var aiResponse = await chatClient.TextGenerationAsync(config, destroyToken);

            if (this == null)
            {
                return;
            }

            if (aiResponse.Success)
            {
                // Keep the reply in the history so later turns have the full context.
                chatHistory.Add(new PlayKit_ChatMessage
                {
                    Role = "assistant",
                    Content = aiResponse.Response
                });
                chatDisplay.text += $"\nNPC: {aiResponse.Response}";
            }
            else
            {
                chatDisplay.text += $"\nAI response failed: {aiResponse.ErrorMessage}";
            }
        }
        catch (System.OperationCanceledException)
        {
            // Request was cancelled (the token is tied to this object's destruction);
            // nothing to report.
        }
        catch (System.Exception ex)
        {
            // async void: an unhandled exception here could not be observed by any caller.
            Debug.LogException(ex);
            if (this != null)
            {
                chatDisplay.text += $"\nVoice input failed: {ex.Message}";
            }
        }
        finally
        {
            // Re-enable the button on every exit path, including failures.
            if (this != null)
            {
                ResetTalkButton();
            }
        }
    }

    /// <summary>Makes the talk button clickable again and restores its idle caption.</summary>
    void ResetTalkButton()
    {
        talkButton.interactable = true;
        talkButton.GetComponentInChildren<Text>().text = IdleButtonLabel;
    }
}
Microphone Device Management
List Available Devices
/// <summary>
/// Fills a dropdown with the available microphone devices and applies the chosen
/// device to the PlayKit_MicrophoneRecorder on the same GameObject.
/// </summary>
public class MicrophoneDeviceSelector : MonoBehaviour
{
    [SerializeField] private Dropdown deviceDropdown;

    void Start()
    {
        // Get all microphone devices
        string[] devices = PlayKit_MicrophoneRecorder.GetAvailableDevices();

        // Populate dropdown. List is fully qualified because this example has no
        // using directive for System.Collections.Generic.
        deviceDropdown.ClearOptions();
        deviceDropdown.AddOptions(new System.Collections.Generic.List<string>(devices));

        // Select device
        deviceDropdown.onValueChanged.AddListener(SelectDevice);
    }

    void OnDestroy()
    {
        if (deviceDropdown != null)
        {
            deviceDropdown.onValueChanged.RemoveListener(SelectDevice);
        }
    }

    /// <summary>Applies the device at the given dropdown index to the recorder.</summary>
    /// <param name="index">Index of the selected dropdown option.</param>
    void SelectDevice(int index)
    {
        // The device list is queried again on every selection, so validate the index
        // against the current list in both directions.
        string[] devices = PlayKit_MicrophoneRecorder.GetAvailableDevices();
        if (index < 0 || index >= devices.Length)
        {
            return;
        }

        // This component does not add a recorder itself, so one may be missing.
        var recorder = GetComponent<PlayKit_MicrophoneRecorder>();
        if (recorder == null)
        {
            Debug.LogWarning("No PlayKit_MicrophoneRecorder found on this GameObject");
            return;
        }

        string selectedDevice = devices[index];
        recorder.SetMicrophoneDevice(selectedDevice);
        Debug.Log($"Selected microphone: {selectedDevice}");
    }
}
Use Default Device
// Passing null selects the system default microphone
recorder.StartRecording(null);

// Alternatively, pass the name of a specific device
const string deviceName = "Microphone (Realtek High Definition Audio)";
recorder.StartRecording(deviceName);
Platform Support
Windows / macOS / Linux
Full support for microphone recording and speech recognition.
iOS / Android
Microphone permissions must be requested in build settings:
iOS: Add to Info.plist:
<key>NSMicrophoneUsageDescription</key>
<string>Microphone access is required for voice input</string>
Android: Add to AndroidManifest.xml:
<uses-permission android:name="android.permission.RECORD_AUDIO" />
WebGL
WebGL platform does not support the PlayKit_MicrophoneRecorder component because Unity's Microphone API is not available on WebGL.
If you need speech recognition on WebGL, you'll need to use the browser's Web Audio API and integrate via JavaScript plugins.
Audio Formats
Supported Formats
The transcription client supports multiple audio formats:
- WAV - Recommended format, 16kHz PCM16
- MP3
- FLAC
- OGG
Convert AudioClip to Byte Array
If you need to process audio data manually:
/// <summary>
/// Encodes an AudioClip as a 16-bit PCM WAV file held in memory.
/// The SDK handles this internally; use this only if you need manual processing.
/// </summary>
/// <param name="clip">The clip to encode.</param>
/// <returns>The complete WAV file (44-byte header followed by the sample data).</returns>
/// <exception cref="System.ArgumentNullException">Thrown when <paramref name="clip"/> is null.</exception>
public byte[] AudioClipToWAV(AudioClip clip)
{
    if (clip == null)
    {
        throw new System.ArgumentNullException(nameof(clip));
    }

    // GetData fills the buffer with the clip's samples for all channels.
    var samples = new float[clip.samples * clip.channels];
    clip.GetData(samples, 0);

    const int bitsPerSample = 16;
    const int bytesPerSample = bitsPerSample / 8;
    const int headerSize = 44;

    int channels = clip.channels;
    int sampleRate = clip.frequency;
    int blockAlign = channels * bytesPerSample;
    int dataSize = samples.Length * bytesPerSample;

    using (var stream = new System.IO.MemoryStream(headerSize + dataSize))
    using (var writer = new System.IO.BinaryWriter(stream))
    {
        // RIFF chunk descriptor. BinaryWriter writes integers little-endian,
        // which is what the WAV format requires.
        writer.Write(System.Text.Encoding.ASCII.GetBytes("RIFF"));
        writer.Write(headerSize - 8 + dataSize); // file size minus the first 8 bytes
        writer.Write(System.Text.Encoding.ASCII.GetBytes("WAVE"));

        // "fmt " sub-chunk describing uncompressed PCM audio.
        writer.Write(System.Text.Encoding.ASCII.GetBytes("fmt "));
        writer.Write(16);                       // sub-chunk size for PCM
        writer.Write((short)1);                 // audio format 1 = PCM
        writer.Write((short)channels);
        writer.Write(sampleRate);
        writer.Write(sampleRate * blockAlign);  // byte rate
        writer.Write((short)blockAlign);
        writer.Write((short)bitsPerSample);

        // "data" sub-chunk: samples converted from float to signed 16-bit.
        writer.Write(System.Text.Encoding.ASCII.GetBytes("data"));
        writer.Write(dataSize);
        foreach (float sample in samples)
        {
            // Clamp first so out-of-range values cannot overflow the 16-bit range.
            float clamped = Mathf.Clamp(sample, -1f, 1f);
            writer.Write((short)Mathf.RoundToInt(clamped * short.MaxValue));
        }

        writer.Flush();
        return stream.ToArray();
    }
}
Direct Byte Array Transcription
// When the audio is already available as a byte array, transcribe it directly
byte[] wavBytes = LoadAudioFile("recording.wav");
var destroyToken = this.GetCancellationTokenOnDestroy();
var result = await transcriptionClient.TranscribeAsync(
    audioData: wavBytes,
    language: null,
    prompt: null,
    cancellationToken: destroyToken
);
Best Practices
- Use VAD: Enable Voice Activity Detection for better user experience
- Appropriate sample rate: Use 16kHz for Whisper model
- Specify language: If you know the source language, specify it for better accuracy
- Provide prompts: Use prompts to improve recognition of technical terms
- Limit recording duration: Set a reasonable maxRecordingSeconds to avoid excessively long recordings
- Show status: Clearly display recording and recognition status to users
- Handle permissions: Properly request and handle microphone permissions on mobile platforms
- Error handling: Always check result.Success and handle failure cases
Next Steps
- Learn about NPC Conversations for intelligent dialogue
- Read the API Reference for complete API documentation