2using Newtonsoft.Json.Linq;
8using System.Collections.Generic;
10using System.Threading;
11using System.Threading.Tasks;
12using Unity.Collections;
14using UnityEngine.Events;
16using Utilities.Encoding.Wav;
55 private OpenAIClient
api =
null;
59 private List<OpenAI.Tool>
tools =
new List<OpenAI.Tool>();
79 Func<CancellationToken, Task> run = async (CancellationToken cancellationToken) => {
89 }
catch (Exception e) {
91 case TaskCanceledException:
break;
92 case OperationCanceledException:
break;
93 default: Debug.LogException(e);
break;
98 Debug.Log(
"Director's session disposed.");
101 _ = run(cancellationToken);
114 throw new System.Exception(
"Director.ListenForNextInput can only be invoked if the status is Waiting!");
124 }
catch (Exception e) {
125 Debug.LogException(e);
135 statusMutex.value =
Status.Waiting;
150 await
session.SendAsync(
new InputAudioBufferClearRequest(), cancellationToken);
153 response.userTranscript = message;
154 await
session.SendAsync(
new OpenAI.Realtime.ConversationItemCreateRequest(message), cancellationToken);
155 await
session.SendAsync(
new OpenAI.Realtime.CreateResponseRequest(), cancellationToken);
162 return new OpenAI.Realtime.SessionConfiguration(
164 modalities: Modality.Text,
165 instructions:
this.config ?
this.config.instructions :
null,
166 inputAudioTranscriptionSettings:
new OpenAI.Realtime.InputAudioTranscriptionSettings(Model.Transcribe_GPT_4o, language:
"en"),
167 turnDetectionSettings:
new OpenAI.Realtime.ServerVAD(silenceDuration: 2000, createResponse:
false),
169 toolChoice:
"required");
173 var memoryStream =
new MemoryStream();
174 var semaphore =
new SemaphoreSlim(1, 1);
177 byte[] emptyBuffer =
new byte[1024 * 16];
178 async Task BufferCallback(NativeArray<byte> buffer) {
180 await semaphore.WaitAsync(CancellationToken.None).ConfigureAwait(
false);
181 for (
int i = 0; i < buffer.Length; ++i) { memoryStream.WriteByte(buffer[i]); }
189 RecordingManager.StartRecordingStream<WavEncoder>(BufferCallback, 24000, cancellationToken);
192 byte[] voiceBuffer = ArrayPool<byte>.Shared.Rent(1024 * 16);
196 await semaphore.WaitAsync(cancellationToken).ConfigureAwait(
false);
197 memoryStream.Position = 0;
198 bytesRead = await memoryStream.ReadAsync(voiceBuffer, 0, (
int)Math.Min(voiceBuffer.Length, memoryStream.Length), cancellationToken).ConfigureAwait(
false);
199 memoryStream.SetLength(0);
207 await
session.SendAsync(
new InputAudioBufferAppendRequest(voiceBuffer.AsMemory(0, bytesRead)), cancellationToken).ConfigureAwait(
false);
211 await
session.SendAsync(
new InputAudioBufferAppendRequest(emptyBuffer.AsMemory(0, bytesRead)), cancellationToken).ConfigureAwait(
false);
216 }
catch (Exception e) {
219 case TaskCanceledException:
break;
220 case OperationCanceledException:
break;
221 default: Debug.LogError(e);
break;
224 ArrayPool<byte>.Shared.Return(voiceBuffer);
226 }
while (!cancellationToken.IsCancellationRequested);
228 RecordingManager.EndRecording();
229 }
catch (Exception e) {
232 case TaskCanceledException:
break;
233 case OperationCanceledException:
break;
234 default: Debug.LogError(e);
break;
237 await memoryStream.DisposeAsync();
258 case RealtimeEventError error:
throw error;
259 case SessionResponse sessionResponse:
break;
260 case RealtimeConversationResponse conversationResponse:
break;
261 case ConversationItemCreatedResponse conversationItemCreated:
break;
262 case ConversationItemInputAudioTranscriptionResponse conversationItemTranscription:
263 if (!conversationItemTranscription.IsCompleted) {
return; }
264 Debug.Log($
"[{conversationItemTranscription.ItemId}] Director's User Transcription: " + conversationItemTranscription.Transcript.Trim());
276 if (
latestItemId != conversationItemTranscription.ItemId) {
277 Debug.Log(
"Director.OnServerEvent ConversationItemInputAudioTranscriptionResponse ignored as it is outdated.");
282 response.userTranscript = conversationItemTranscription.Transcript.Trim();
285 Debug.Log(
"Director.OnServerEvent ConversationItemInputAudioTranscriptionResponse ignored as the status is not ReplyToVoice.");
288 case ConversationItemTruncatedResponse conversationItemTruncated:
break;
289 case ConversationItemDeletedResponse conversationItemDeleted:
break;
290 case InputAudioBufferCommittedResponse committedResponse:
291 Debug.Log($
"[{committedResponse.ItemId}] Director.OnServerEvent InputAudioBufferCommittedResponse");
297 session.Send(
new OpenAI.Realtime.CreateResponseRequest());
299 Debug.Log(
"Director.OnServerEvent InputAudioBufferCommittedResponse ignored.");
302 case InputAudioBufferClearedResponse clearedResponse:
303 Debug.Log($
"Director.OnServerEvent InputAudioBufferClearedResponse");
305 case InputAudioBufferStartedResponse startedResponse:
306 Debug.Log($
"Director.OnServerEvent InputAudioBufferStartedResponse");
308 case InputAudioBufferStoppedResponse stoppedResponse:
break;
309 case RealtimeResponse realtimeResponse:
310 switch (realtimeResponse.Response.Status) {
311 case RealtimeResponseStatus.InProgress:
312 Debug.Log(
"Director Realtime Response InProgress.");
314 case RealtimeResponseStatus.Completed:
315 Debug.Log(
"Director Realtime Response Completed.");
318 case RealtimeResponseStatus.Cancelled:
319 Debug.Log(
"Director Realtime Response Cancelled.");
321 case RealtimeResponseStatus.Failed:
322 Debug.Log(
"Director Realtime Response Failed.");
324 case RealtimeResponseStatus.Incomplete:
325 Debug.Log(
"Director Realtime Response Incomplete.");
329 case ResponseOutputItemResponse outputItemResponse:
break;
330 case ResponseContentPartResponse contentPartResponse:
break;
331 case ResponseTextResponse textResponse:
break;
332 case ResponseAudioResponse audioResponse:
break;
333 case ResponseAudioTranscriptResponse transcriptResponse:
break;
334 case ResponseFunctionCallArgumentsResponse functionCallArgumentsResponse:
335 if (!functionCallArgumentsResponse.IsDone) {
return; }
336 Debug.Log($
"Director's Function Call: " + functionCallArgumentsResponse.Name +
", Arguments: " + functionCallArgumentsResponse.Arguments.ToString());
339 string output =
string.Empty;
340 if (functionCallArgumentsResponse.Name ==
"trigger_discussion") {
341 response.discussionTopic =
ParseDiscussionTopic(functionCallArgumentsResponse.Arguments.ToString());
342 }
else if (functionCallArgumentsResponse.Name ==
"trigger_response") {
343 response.speakerOrder =
ParseSpeakerOrder(functionCallArgumentsResponse.Arguments.ToString());
347 ConversationItem functionCallOutput =
new ConversationItem((ToolCall)functionCallArgumentsResponse, output);
348 session.Send(
new OpenAI.Realtime.ConversationItemCreateRequest(functionCallOutput));
350 case RateLimitsResponse rateLimitsResponse:
break;
360 statusMutex.value = value;
376 speaker_order =
new {
378 description =
"The order which the AI models will respond to the user.",
381 @
enum = speakers.ToArray()
384 maxItems = speakers.Count,
388 required =
new[] {
"speaker_order" }
390 string parameters = JsonConvert.SerializeObject(args, Formatting.Indented);
391 return new OpenAI.Function(
"trigger_response",
"Triggers the AI models to respond to the user. No output is given.", JToken.Parse(parameters));
405 description =
"The topic of the discussion to trigger.",
406 @
enum = topics.ToArray()
409 required =
new[] {
"topic" }
411 string parameters = JsonConvert.SerializeObject(args, Formatting.Indented);
412 return new OpenAI.Function(
"trigger_discussion",
"Triggers a discussion based on a topic. No output is given. If there are no input choices given, do not invoke this function.", JToken.Parse(parameters));
416 JToken parsedArgs = JToken.Parse(args);
417 if (parsedArgs ==
null || parsedArgs[
"speaker_order"] ==
null) {
418 return new List<string>();
420 return new List<string>(parsedArgs[
"speaker_order"].ToObject<
string[]>());
424 JToken parsedArgs = JToken.Parse(args);
425 if (parsedArgs ==
null || parsedArgs[
"topic"] ==
null) {
428 return parsedArgs[
"topic"].ToString();
Helper method to load the authentication file and retrieve API keys.
static OpenAIAuthentication GetOpenAIAuthentication()
List< string > speakerOrder
Helper class to act as a mutex for the status value, as it may be read by multiple threads.
void SetStatus(Status value)
List< string > ParseSpeakerOrder(string args)
async void RecordInputAudio(OpenAI.Realtime.RealtimeSession session, CancellationToken cancellationToken)
void Initialise(UnityAction< Director.Response > onDirectorResponse, CancellationToken cancellationToken)
bool IsStatus(Status value)
void InvokeOnDirectorResponse()
List< OpenAI.Tool > tools
OpenAI.Function BuildTriggerResponseTool(List< string > speakers)
bool TestAndSetStatus(Status condition, Status value)
Director.Response response
string ParseDiscussionTopic(string args)
OpenAI.Realtime.SessionConfiguration GetSessionConfiguration()
async Awaitable< bool > SubmitUserTextInput(string message, CancellationToken cancellationToken)
void OnServerEvent(IServerEvent @event)
OpenAI.Function BuildTriggerDiscussionTool(List< string > topics)
async void ListenForNextUserInput(DirectorConfig config, List< string > speakers, List< string > topics, CancellationToken cancellationToken)
UnityAction< Director.Response > onDirectorResponse
Status
Current status of the director.
@ Waiting
Idling and waiting for VoiceChat system.
@ VoiceInput
Replying to voice input.
@ Listening
Listening for user input.
@ TextInput
Replying to text input.