ESPHome  2024.9.0
voice_assistant.cpp
Go to the documentation of this file.
1 #include "voice_assistant.h"
2 
3 #ifdef USE_VOICE_ASSISTANT
4 
5 #include "esphome/core/log.h"
6 
7 #include <cinttypes>
8 #include <cstdio>
9 
10 namespace esphome {
11 namespace voice_assistant {
12 
13 static const char *const TAG = "voice_assistant";
14 
15 #ifdef SAMPLE_RATE_HZ
16 #undef SAMPLE_RATE_HZ
17 #endif
18 
// Audio sizing constants. INPUT_BUFFER_SIZE and BUFFER_SIZE are element counts that the code
// multiplies by sizeof(int16_t) when it needs bytes; the remaining sizes are used directly as
// byte counts.
19 static const size_t SAMPLE_RATE_HZ = 16000;
// 512 elements: one 32 ms microphone read at 16 kHz.
20 static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms
// 8192 elements (512 ms at 16 kHz); sizes the microphone ring buffer.
21 static const size_t BUFFER_SIZE = 512 * SAMPLE_RATE_HZ / 1000;
// Bytes drained from the ring buffer per outgoing audio chunk (one input buffer's worth).
22 static const size_t SEND_BUFFER_SIZE = INPUT_BUFFER_SIZE * sizeof(int16_t);
// Maximum bytes requested from the socket per read of incoming audio.
23 static const size_t RECEIVE_SIZE = 1024;
// Capacity of the staging buffer for received speaker audio (16 receive chunks).
24 static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
25 
27 
// Body of a bool-returning routine whose signature line (listing line 28) is missing from this
// extract; the call site `this->start_udp_socket_()` suggests that is its name — TODO confirm.
// Creates a non-blocking AF_INET/SOCK_DGRAM socket, binds it to port 6055 on the "any" address
// when a speaker is configured, and sets udp_socket_running_ on success. Every hard failure calls
// mark_failed() and returns false.
29  this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
30  if (this->socket_ == nullptr) {
31  ESP_LOGE(TAG, "Could not create socket");
32  this->mark_failed();
33  return false;
34  }
35  int enable = 1;
36  int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
37  if (err != 0) {
// NOTE(review): the message says "errno" but the value printed is the call's return code `err`;
// the bind/sockaddr branches below print the real errno. Same pattern for setblocking() below.
38  ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
39  // we can still continue
40  }
41  err = this->socket_->setblocking(false);
42  if (err != 0) {
43  ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
44  this->mark_failed();
45  return false;
46  }
47 
48 #ifdef USE_SPEAKER
// Binding is only done when a speaker is configured (the speaker path reads audio from this socket).
49  if (this->speaker_ != nullptr) {
50  struct sockaddr_storage server;
51 
// set_sockaddr_any() returns the populated address length; 0 is treated as failure.
52  socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
53  if (sl == 0) {
54  ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
55  this->mark_failed();
56  return false;
57  }
58 
// NOTE(review): bind() is given sizeof(server) rather than the length `sl` computed above —
// confirm the socket layer accepts the full sockaddr_storage size.
59  err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
60  if (err != 0) {
61  ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
62  this->mark_failed();
63  return false;
64  }
65  }
66 #endif
67  this->udp_socket_running_ = true;
68  return true;
69 }
70 
// Fragment of what is presumably the component set-up routine (TODO confirm): the signature
// (listing line 71) and one statement (listing line 74) are missing from this extract. The only
// behaviour visible here is the configuration log message.
72  ESP_LOGCONFIG(TAG, "Setting up Voice Assistant...");
73 
75 }
76 
// Body of a bool-returning routine whose signature (listing line 77) is missing; the call site
// `this->allocate_buffers_()` suggests that is its name — TODO confirm. The allocator declarations
// (listing lines 84, 93, 110) are also missing from this extract.
// Allocates, in order: the speaker buffer (only with a speaker), the input buffer, the VAD
// instance (USE_ESP_ADF only), the ring buffer and the send buffer. Returns false on the first
// allocation failure, true when everything is available.
// NOTE(review): the "already allocated" test only looks at send_buffer_, which is allocated last.
// After a partial failure, a later call would allocate the earlier buffers again and overwrite
// their pointers without freeing them — confirm whether callers release buffers on failure.
78  if (this->send_buffer_ != nullptr) {
79  return true; // Already allocated
80  }
81 
82 #ifdef USE_SPEAKER
83  if (this->speaker_ != nullptr) {
85  this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
86  if (this->speaker_buffer_ == nullptr) {
87  ESP_LOGW(TAG, "Could not allocate speaker buffer");
88  return false;
89  }
90  }
91 #endif
92 
// Element count, not bytes: the buffer is later cleared with INPUT_BUFFER_SIZE * sizeof(int16_t).
94  this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
95  if (this->input_buffer_ == nullptr) {
96  ESP_LOGW(TAG, "Could not allocate input buffer");
97  return false;
98  }
99 
100 #ifdef USE_ESP_ADF
// The result of vad_create() is not checked here.
101  this->vad_instance_ = vad_create(VAD_MODE_4);
102 #endif
103 
104  this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
105  if (this->ring_buffer_ == nullptr) {
106  ESP_LOGW(TAG, "Could not allocate ring buffer");
107  return false;
108  }
109 
111  this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
112  if (send_buffer_ == nullptr) {
113  ESP_LOGW(TAG, "Could not allocate send buffer");
114  return false;
115  }
116 
117  return true;
118 }
119 
// Body of a routine whose signature (listing line 120) is missing; the call site
// `this->clear_buffers_()` suggests that is its name — TODO confirm.
// Zeroes every buffer that is currently allocated and resets the ring buffer and the speaker
// bookkeeping counters. Nothing is freed; unallocated (nullptr) buffers are skipped.
121  if (this->send_buffer_ != nullptr) {
122  memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
123  }
124 
125  if (this->input_buffer_ != nullptr) {
126  memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
127  }
128 
129  if (this->ring_buffer_ != nullptr) {
// `->reset()` is a call on the RingBuffer object itself; the buffer stays allocated.
130  this->ring_buffer_->reset();
131  }
132 
133 #ifdef USE_SPEAKER
134  if (this->speaker_buffer_ != nullptr) {
135  memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
136 
137  this->speaker_buffer_size_ = 0;
138  this->speaker_buffer_index_ = 0;
139  this->speaker_bytes_received_ = 0;
140  }
141 #endif
142 }
143 
// Body of the buffer-release routine; its signature and the three deallocator declarations
// (listing lines 144-145, 161, 167) are missing from this extract, and its name is not visible
// anywhere in it (presumably the counterpart of allocate_buffers_ — TODO confirm).
// Releases the send buffer, ring buffer, VAD instance (USE_ESP_ADF only), input buffer and speaker
// buffer, and nulls each pointer afterwards.
// NOTE(review): send_buffer_ and input_buffer_ are passed to deallocate() without a null check,
// unlike speaker_buffer_ — confirm deallocate() tolerates nullptr.
146  send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
147  this->send_buffer_ = nullptr;
148 
149  if (this->ring_buffer_ != nullptr) {
// `.reset()` here is a call on the owning pointer (contrast with `->reset()` used when clearing);
// presumably ring_buffer_ is a std::unique_ptr so this destroys the RingBuffer, making the
// following nullptr assignment redundant — TODO confirm the member type.
150  this->ring_buffer_.reset();
151  this->ring_buffer_ = nullptr;
152  }
153 
154 #ifdef USE_ESP_ADF
155  if (this->vad_instance_ != nullptr) {
156  vad_destroy(this->vad_instance_);
157  this->vad_instance_ = nullptr;
158  }
159 #endif
160 
162  input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE);
163  this->input_buffer_ = nullptr;
164 
165 #ifdef USE_SPEAKER
166  if (this->speaker_buffer_ != nullptr) {
168  speaker_deallocator.deallocate(this->speaker_buffer_, SPEAKER_BUFFER_SIZE);
169  this->speaker_buffer_ = nullptr;
170  }
171 #endif
172 }
173 
// Body of a routine whose signature (listing line 174) is missing; the timeout callback elsewhere
// in this file calls `this->reset_conversation_id()`, which suggests that is its name — TODO confirm.
// Clears the stored conversation id and logs that it did so.
175  this->conversation_id_ = "";
176  ESP_LOGD(TAG, "reset conversation ID");
177 }
178 
// Body of a routine whose signature (listing line 179) is missing; call sites use
// `this->read_microphone_()` and treat the result as a byte count — TODO confirm the name.
// Reads up to one input buffer of audio from the microphone and appends it to the ring buffer.
// Returns the number of bytes read; 0 when the microphone is not running or produced no data.
180  size_t bytes_read = 0;
181  if (this->mic_->is_running()) { // Read audio into input buffer
182  bytes_read = this->mic_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
183  if (bytes_read == 0) {
// Nothing was read: zero the input buffer so stale samples are not reused.
184  memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
185  return 0;
186  }
187  // Write audio into ring buffer
// The return value of write() is ignored here.
188  this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
189  } else {
190  ESP_LOGD(TAG, "microphone not running");
191  }
192  return bytes_read;
193 }
194 
// Body of the per-iteration state-machine step (presumably VoiceAssistant::loop() — TODO confirm;
// the signature at listing line 195 is missing). The extraction dropped many lines inside this
// body — most `case` labels, the set_state_() calls, several declarations and the tail of the
// first condition — so braces and branches below do not line up as written. Nothing has been
// reconstructed; the comments describe only the statements that are visible.
//
// First guard: with no API client and a state other than IDLE/STOP_MICROPHONE (the rest of the
// condition is missing), stop continuous mode, signal stop, clear buffers and return.
196  if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
198  if (this->mic_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
200  } else {
202  }
203  this->continuous_ = false;
204  this->signal_stop_();
205  this->clear_buffers_();
206  return;
207  }
208  switch (this->state_) {
209  case State::IDLE: {
// In continuous mode with nothing else requested, fire the idle trigger (the restart statements in
// both branches are missing); otherwise release the high-frequency loop request.
210  if (this->continuous_ && this->desired_state_ == State::IDLE) {
211  this->idle_trigger_->trigger();
212 #ifdef USE_ESP_ADF
213  if (this->use_wake_word_) {
215  } else
216 #endif
217  {
219  }
220  } else {
221  this->high_freq_.stop();
222  }
223  break;
224  }
// (case label missing) Allocate and clear buffers, then start the microphone and request
// high-frequency looping. Allocation failure sets an error status and returns without changing
// state, so this branch runs again on the next iteration.
226  ESP_LOGD(TAG, "Starting Microphone");
227  if (!this->allocate_buffers_()) {
228  this->status_set_error("Failed to allocate buffers");
229  return;
230  }
231  if (this->status_has_error()) {
232  this->status_clear_error();
233  }
234  this->clear_buffers_();
235 
236  this->mic_->start();
237  this->high_freq_.start();
239  break;
240  }
// (case label missing) Once the microphone reports running, move on to the desired state.
242  if (this->mic_->is_running()) {
243  this->set_state_(this->desired_state_);
244  }
245  break;
246  }
247 #ifdef USE_ESP_ADF
248  case State::WAIT_FOR_VAD: {
249  this->read_microphone_();
250  ESP_LOGD(TAG, "Waiting for speech...");
252  break;
253  }
254  case State::WAITING_FOR_VAD: {
// vad_counter_ counts up on frames classified as speech and down otherwise; speech is declared
// once the counter has reached vad_threshold_ and another speech frame arrives.
255  size_t bytes_read = this->read_microphone_();
256  if (bytes_read > 0) {
257  vad_state_t vad_state =
258  vad_process(this->vad_instance_, this->input_buffer_, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
259  if (vad_state == VAD_SPEECH) {
260  if (this->vad_counter_ < this->vad_threshold_) {
261  this->vad_counter_++;
262  } else {
263  ESP_LOGD(TAG, "VAD detected speech");
265 
266  // Reset for next time
267  this->vad_counter_ = 0;
268  }
269  } else {
270  if (this->vad_counter_ > 0) {
271  this->vad_counter_--;
272  }
273  }
274  }
275  break;
276  }
277 #endif
278  case State::START_PIPELINE: {
// Keep draining the microphone while the start request is built and sent. The statements that
// set `flags` under the two `if`s and the declaration of `msg` are missing from this extract.
279  this->read_microphone_();
280  ESP_LOGD(TAG, "Requesting start...");
281  uint32_t flags = 0;
282  if (this->use_wake_word_)
284  if (this->silence_detection_)
286  api::VoiceAssistantAudioSettings audio_settings;
287  audio_settings.noise_suppression_level = this->noise_suppression_level_;
288  audio_settings.auto_gain = this->auto_gain_;
289  audio_settings.volume_multiplier = this->volume_multiplier_;
290 
292  msg.start = true;
293  msg.conversation_id = this->conversation_id_;
294  msg.flags = flags;
295  msg.audio_settings = audio_settings;
296  msg.wake_word_phrase = this->wake_word_;
// The wake word phrase is consumed by this request and cleared.
297  this->wake_word_ = "";
298 
299  if (this->api_client_ == nullptr || !this->api_client_->send_voice_assistant_request(msg)) {
300  ESP_LOGW(TAG, "Could not request start");
301  this->error_trigger_->trigger("not-connected", "Could not request start");
302  this->continuous_ = false;
304  break;
305  }
// (Re)arm the named timeout that clears the conversation id after conversation_timeout_.
307  this->set_timeout("reset-conversation_id", this->conversation_timeout_,
308  [this]() { this->reset_conversation_id(); });
309  break;
310  }
312  this->read_microphone_();
313  break; // State changed when udp server port received
314  }
// (case label missing) Forward microphone audio: drain the ring buffer in SEND_BUFFER_SIZE chunks
// and send each chunk through the API connection or the UDP socket, depending on audio_mode_.
// The API branch's message declaration and send call are missing from this extract.
316  this->read_microphone_();
317  size_t available = this->ring_buffer_->available();
318  while (available >= SEND_BUFFER_SIZE) {
319  size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
320  if (this->audio_mode_ == AUDIO_MODE_API) {
322  msg.data.assign((char *) this->send_buffer_, read_bytes);
324  } else {
// The UDP socket is created on first use; if that fails the loop is abandoned.
325  if (!this->udp_socket_running_) {
326  if (!this->start_udp_socket_()) {
328  break;
329  }
330  }
// The result of sendto() is not checked.
331  this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
332  sizeof(this->dest_addr_));
333  }
334  available = this->ring_buffer_->available();
335  }
336 
337  break;
338  }
339  case State::STOP_MICROPHONE: {
340  if (this->mic_->is_running()) {
341  this->mic_->stop();
343  } else {
344  this->set_state_(this->desired_state_);
345  }
346  break;
347  }
// (case label missing) Wait for the microphone to report stopped before moving on.
349  if (this->mic_->is_stopped()) {
350  this->set_state_(this->desired_state_);
351  }
352  break;
353  }
355  break; // State changed by events
356  }
// (case label missing) Response playback: pull incoming audio (UDP mode), feed the speaker once
// enough has been buffered, and detect when playback is under way.
358  bool playing = false;
359 #ifdef USE_SPEAKER
360  if (this->speaker_ != nullptr) {
361  ssize_t received_len = 0;
362  if (this->audio_mode_ == AUDIO_MODE_UDP) {
// Only read when a full RECEIVE_SIZE chunk still fits behind the current write index.
363  if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
364  received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
365  if (received_len > 0) {
366  this->speaker_buffer_index_ += received_len;
367  this->speaker_buffer_size_ += received_len;
368  this->speaker_bytes_received_ += received_len;
369  }
370  } else {
371  ESP_LOGD(TAG, "Receive buffer full");
372  }
373  }
374  // Build a small buffer of audio before sending to the speaker
// End of stream needs the stream-ended flag plus, in UDP mode, a negative read result.
375  bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
376  if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
377  this->write_speaker_();
378  if (this->wait_for_stream_end_) {
379  this->cancel_timeout("playing");
380  if (end_of_stream) {
381  ESP_LOGD(TAG, "End of audio stream received");
382  this->cancel_timeout("speaker-timeout");
384  }
385  break; // We dont want to timeout here as the STREAM_END event will take care of that.
386  }
387  playing = this->speaker_->is_running();
388  }
389 #endif
390 #ifdef USE_MEDIA_PLAYER
391  if (this->media_player_ != nullptr) {
393  }
394 #endif
// While playback is detected, keep re-arming a 2000 ms "playing" timeout; most of the callback
// body (including the declaration of `msg`) is missing from this extract.
395  if (playing) {
396  this->set_timeout("playing", 2000, [this]() {
397  this->cancel_timeout("speaker-timeout");
399 
401  msg.success = true;
403  });
404  }
405  break;
406  }
// (case label missing) Flush any remaining buffered audio to the speaker, wait until the speaker
// has no buffered data and is no longer running, then stop it, cancel the timeouts and reset the
// stream flags and buffers.
408 #ifdef USE_SPEAKER
409  if (this->speaker_ != nullptr) {
410  if (this->speaker_buffer_size_ > 0) {
411  this->write_speaker_();
412  break;
413  }
414  if (this->speaker_->has_buffered_data() || this->speaker_->is_running()) {
415  break;
416  }
417  ESP_LOGD(TAG, "Speaker has finished outputting all audio");
418  this->speaker_->stop();
419  this->cancel_timeout("speaker-timeout");
420  this->cancel_timeout("playing");
421 
422  this->clear_buffers_();
423 
424  this->wait_for_stream_end_ = false;
425  this->stream_ended_ = false;
426 
428  }
429 #endif
431  break;
432  }
433  default:
434  break;
435  }
436 }
437 
438 #ifdef USE_SPEAKER
// Body of a routine whose signature (listing line 439) is missing; call sites use
// `this->write_speaker_()` — TODO confirm the name.
// Hands at most 4 KiB from the front of speaker_buffer_ to the speaker. Whatever the speaker
// accepted is removed by shifting the remaining bytes to the front and reducing both the size and
// the write index by the same amount. Each successful write re-arms a 5000 ms "speaker-timeout"
// that stops the speaker if no further write follows.
440  if (this->speaker_buffer_size_ > 0) {
441  size_t write_chunk = std::min<size_t>(this->speaker_buffer_size_, 4 * 1024);
442  size_t written = this->speaker_->play(this->speaker_buffer_, write_chunk);
443  if (written > 0) {
// memmove is used because source and destination ranges overlap.
444  memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written);
445  this->speaker_buffer_size_ -= written;
446  this->speaker_buffer_index_ -= written;
447  this->set_timeout("speaker-timeout", 5000, [this]() { this->speaker_->stop(); });
448  } else {
449  ESP_LOGV(TAG, "Speaker buffer full, trying again next loop");
450  }
451  }
452 }
453 #endif
454 
// Body of a routine taking `client` and `subscribe`; the signature (listing line 455) is missing —
// presumably client_subscription(api::APIConnection *client, bool subscribe), TODO confirm.
// Listing lines 462 and 474 (one statement after each api_client_ assignment) are also missing.
// Tracks the single API client allowed to drive the voice assistant:
//  - unsubscribe: only honoured when `client` is the currently registered client;
//  - subscribe: rejected (with both clients logged) when another client is already registered.
456  if (!subscribe) {
457  if (this->api_client_ == nullptr || client != this->api_client_) {
458  ESP_LOGE(TAG, "Client attempting to unsubscribe that is not the current API Client");
459  return;
460  }
461  this->api_client_ = nullptr;
463  return;
464  }
465 
466  if (this->api_client_ != nullptr) {
467  ESP_LOGE(TAG, "Multiple API Clients attempting to connect to Voice Assistant");
468  ESP_LOGE(TAG, "Current client: %s", this->api_client_->get_client_combined_info().c_str());
469  ESP_LOGE(TAG, "New client: %s", client->get_client_combined_info().c_str());
470  return;
471  }
472 
473  this->api_client_ = client;
475 }
476 
477 static const LogString *voice_assistant_state_to_string(State state) {
478  switch (state) {
479  case State::IDLE:
480  return LOG_STR("IDLE");
482  return LOG_STR("START_MICROPHONE");
484  return LOG_STR("STARTING_MICROPHONE");
485  case State::WAIT_FOR_VAD:
486  return LOG_STR("WAIT_FOR_VAD");
488  return LOG_STR("WAITING_FOR_VAD");
490  return LOG_STR("START_PIPELINE");
492  return LOG_STR("STARTING_PIPELINE");
494  return LOG_STR("STREAMING_MICROPHONE");
496  return LOG_STR("STOP_MICROPHONE");
498  return LOG_STR("STOPPING_MICROPHONE");
500  return LOG_STR("AWAITING_RESPONSE");
502  return LOG_STR("STREAMING_RESPONSE");
504  return LOG_STR("RESPONSE_FINISHED");
505  default:
506  return LOG_STR("UNKNOWN");
507  }
508 };
509 
// Body of the single-argument state setter; the signature (listing line 510) is missing —
// presumably VoiceAssistant::set_state_(State state), TODO confirm.
// Stores the new state and logs the transition from the previous one. desired_state_ is not
// touched here.
511  State old_state = this->state_;
512  this->state_ = state;
513  ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
514  LOG_STR_ARG(voice_assistant_state_to_string(state)));
515 }
516 
517 void VoiceAssistant::set_state_(State state, State desired_state) {
518  this->set_state_(state);
519  this->desired_state_ = desired_state;
520  ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
521 }
522 
// Body of the handler for a failed pipeline/server start; the signature (listing line 523) and the
// statement at listing line 526 are missing, and the routine's name is not visible in this extract.
// Logs the failure and fires the error trigger with code "failed-to-start".
524  ESP_LOGE(TAG, "Failed to start server. See Home Assistant logs for more details.");
525  this->error_trigger_->trigger("failed-to-start", "Failed to start server. See Home Assistant logs for more details.");
527 }
528 
// Body of what is presumably the argument-less start_streaming() overload (signature at listing
// line 529 missing — TODO confirm); it mirrors the (addr, port) overload but selects API audio mode.
// Only acts while the pipeline is being started; in any other state it signals stop and returns.
// The statements inside the final if/else (listing lines 539 and 541) are missing from this extract.
530  if (this->state_ != State::STARTING_PIPELINE) {
531  this->signal_stop_();
532  return;
533  }
534 
535  ESP_LOGD(TAG, "Client started, streaming microphone");
536  this->audio_mode_ = AUDIO_MODE_API;
537 
538  if (this->mic_->is_running()) {
540  } else {
542  }
543 }
544 
// Start streaming microphone audio over UDP to the given peer.
//
// addr: peer address; copied into dest_addr_ (sizeof(dest_addr_) bytes are read from it).
// port: destination port in host byte order; written into the copied address via htons().
//
// Only acts while the pipeline is being started; in any other state it signals stop and returns.
// An address family other than AF_INET (or AF_INET6 when LWIP_IPV6 is set) is logged and the
// function returns without signalling stop or changing state.
// The statements inside the final if/else (listing lines 569 and 571) are missing from this extract.
545 void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
546  if (this->state_ != State::STARTING_PIPELINE) {
547  this->signal_stop_();
548  return;
549  }
550 
551  ESP_LOGD(TAG, "Client started, streaming microphone");
552  this->audio_mode_ = AUDIO_MODE_UDP;
553 
554  memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
555  if (this->dest_addr_.ss_family == AF_INET) {
556  ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
557  }
558 #if LWIP_IPV6
559  else if (this->dest_addr_.ss_family == AF_INET6) {
560  ((struct sockaddr_in6 *) &this->dest_addr_)->sin6_port = htons(port);
561  }
562 #endif
563  else {
564  ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family);
565  return;
566  }
567 
568  if (this->mic_->is_running()) {
570  } else {
572  }
573 }
574 
// Request that a voice session be started.
//
// continuous: stored in continuous_ when the request is accepted.
// silence_detection: stored in silence_detection_ when the request is accepted.
//
// Without a connected API client the request is rejected: an error is logged and continuous_ is
// cleared. Otherwise the request is only accepted from State::IDLE; in any other state the call
// has no visible effect.
// The state-changing statements (listing lines 578, 587 and 591) are missing from this extract.
575 void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
576  if (this->api_client_ == nullptr) {
577  ESP_LOGE(TAG, "No API client connected");
579  this->continuous_ = false;
580  return;
581  }
582  if (this->state_ == State::IDLE) {
583  this->continuous_ = continuous;
584  this->silence_detection_ = silence_detection;
585 #ifdef USE_ESP_ADF
586  if (this->use_wake_word_) {
588  } else
589 #endif
590  {
592  }
593  }
594 }
595 
// Body of a routine whose signature (listing line 596) is missing; a deferred callback elsewhere
// in this file calls `this->request_stop()`, which suggests that is its name — TODO confirm.
// Always clears continuous_, then reacts according to the current state. Most `case` labels and
// several statements were dropped by the extraction, so which states map to which branch cannot
// be read from here. The visible actions are: nothing for IDLE; signal_stop_() in one branch;
// setting desired_state_ to IDLE in another; and nothing in the last branch, whose original
// comment says the incoming audio stream is left to finish.
597  this->continuous_ = false;
598 
599  switch (this->state_) {
600  case State::IDLE:
601  break;
604  case State::WAIT_FOR_VAD:
608  break;
611  this->signal_stop_();
613  break;
616  this->desired_state_ = State::IDLE;
617  break;
621  break; // Let the incoming audio stream finish then it will go to idle.
622  }
623 }
624 
// Body of a routine whose signature (listing line 625) is missing; call sites use
// `this->signal_stop_()` — TODO confirm the name.
// Always zeroes the stored UDP destination address. When an API client is connected it also
// builds a message with start = false; the message declaration (listing line 631) and the send
// call (listing line 633) are missing from this extract.
626  memset(&this->dest_addr_, 0, sizeof(this->dest_addr_));
627  if (this->api_client_ == nullptr) {
628  return;
629  }
630  ESP_LOGD(TAG, "Signaling stop...");
632  msg.start = false;
634 }
635 
// Body of the pipeline event handler (presumably on_event(const api::VoiceAssistantEventResponse
// &msg) — TODO confirm; the signature at listing line 636 is missing). The extraction dropped
// every `case` label except `default` and several statements, so the labels are not reconstructed
// here; each branch is identifiable only by its log message.
// Pattern used throughout: user-facing triggers are fired from defer() callbacks rather than
// inline, and values needed by a callback (text, url, code, message) are captured by copy.
// Branches that fail to find their required argument log a warning and `return` from the handler.
637  ESP_LOGD(TAG, "Event Type: %" PRId32, msg.event_type);
638  switch (msg.event_type) {
640  ESP_LOGD(TAG, "Assist Pipeline running");
641  this->defer([this]() { this->start_trigger_->trigger(); });
642  break;
644  break;
646  ESP_LOGD(TAG, "Wake word detected");
647  this->defer([this]() { this->wake_word_detected_trigger_->trigger(); });
648  break;
649  }
651  ESP_LOGD(TAG, "STT started");
652  this->defer([this]() { this->listening_trigger_->trigger(); });
653  break;
// Speech-to-text result: take the "text" argument (the last one wins if repeated).
655  std::string text;
// `arg` is a per-iteration copy, so moving from it leaves msg.data untouched.
656  for (auto arg : msg.data) {
657  if (arg.name == "text") {
658  text = std::move(arg.value);
659  }
660  }
661  if (text.empty()) {
662  ESP_LOGW(TAG, "No text in STT_END event");
663  return;
664  }
665  ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
666  this->defer([this, text]() { this->stt_end_trigger_->trigger(text); });
667  break;
668  }
670  ESP_LOGD(TAG, "Intent started");
671  this->defer([this]() { this->intent_start_trigger_->trigger(); });
672  break;
// Intent finished: remember the conversation id, if one was supplied, for later requests.
674  for (auto arg : msg.data) {
675  if (arg.name == "conversation_id") {
676  this->conversation_id_ = std::move(arg.value);
677  }
678  }
679  this->defer([this]() { this->intent_end_trigger_->trigger(); });
680  break;
681  }
// Text-to-speech started: publish the response text and, with a speaker configured, start it.
683  std::string text;
684  for (auto arg : msg.data) {
685  if (arg.name == "text") {
686  text = std::move(arg.value);
687  }
688  }
689  if (text.empty()) {
690  ESP_LOGW(TAG, "No text in TTS_START event");
691  return;
692  }
693  ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
694  this->defer([this, text]() {
695  this->tts_start_trigger_->trigger(text);
696 #ifdef USE_SPEAKER
697  if (this->speaker_ != nullptr) {
698  this->speaker_->start();
699  }
700 #endif
701  });
702  break;
703  }
// Text-to-speech finished: publish the response URL. The media-player statement (listing line
// 719) and the declaration of `new_state` (listing line 724) are missing from this extract.
705  std::string url;
706  for (auto arg : msg.data) {
707  if (arg.name == "url") {
708  url = std::move(arg.value);
709  }
710  }
711  if (url.empty()) {
712  ESP_LOGW(TAG, "No url in TTS_END event");
713  return;
714  }
715  ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
716  this->defer([this, url]() {
717 #ifdef USE_MEDIA_PLAYER
718  if (this->media_player_ != nullptr) {
720  }
721 #endif
722  this->tts_end_trigger_->trigger(url);
723  });
725  this->set_state_(new_state, new_state);
726  break;
727  }
// Pipeline run ended. If audio was still being streamed the ring buffer is reset; the state
// changes in each branch (listing lines 735, 739, 743) are missing from this extract.
729  ESP_LOGD(TAG, "Assist Pipeline ended");
730  if (this->state_ == State::STREAMING_MICROPHONE) {
731  this->ring_buffer_->reset();
732 #ifdef USE_ESP_ADF
733  if (this->use_wake_word_) {
734  // No need to stop the microphone since we didn't use the speaker
736  } else
737 #endif
738  {
740  }
741  } else if (this->state_ == State::AWAITING_RESPONSE) {
742  // No TTS start event ("nevermind")
744  }
745  this->defer([this]() { this->end_trigger_->trigger(); });
746  break;
747  }
// Error event: extract "code" and "message", then handle in three tiers — codes that are ignored
// silently, codes that stop the assistant and notify, and everything else (log, signal stop when
// not idle, notify).
749  std::string code = "";
750  std::string message = "";
751  for (auto arg : msg.data) {
752  if (arg.name == "code") {
753  code = std::move(arg.value);
754  } else if (arg.name == "message") {
755  message = std::move(arg.value);
756  }
757  }
758  if (code == "wake-word-timeout" || code == "wake_word_detection_aborted" || code == "no_wake_word") {
759  // Don't change state here since either the "tts-end" or "run-end" events will do it.
760  return;
761  } else if (code == "wake-provider-missing" || code == "wake-engine-missing") {
762  // Wake word is not set up or not ready on Home Assistant so stop and do not retry until user starts again.
763  this->defer([this, code, message]() {
764  this->request_stop();
765  this->error_trigger_->trigger(code, message);
766  });
767  return;
768  }
769  ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str());
770  if (this->state_ != State::IDLE) {
771  this->signal_stop_();
773  }
774  this->defer([this, code, message]() { this->error_trigger_->trigger(code, message); });
775  break;
776  }
// TTS audio stream start: flag that the playback state should wait for the stream-end event.
778 #ifdef USE_SPEAKER
779  this->wait_for_stream_end_ = true;
780  ESP_LOGD(TAG, "TTS stream start");
781  this->defer([this] { this->tts_stream_start_trigger_->trigger(); });
782 #endif
783  break;
784  }
// TTS audio stream end: only sets the flag; the playback state acts on it.
786 #ifdef USE_SPEAKER
787  this->stream_ended_ = true;
788  ESP_LOGD(TAG, "TTS stream end");
789 #endif
790  break;
791  }
793  ESP_LOGD(TAG, "Starting STT by VAD");
794  this->defer([this]() { this->stt_vad_start_trigger_->trigger(); });
795  break;
797  ESP_LOGD(TAG, "STT by VAD end");
799  this->defer([this]() { this->stt_vad_end_trigger_->trigger(); });
800  break;
801  default:
802  ESP_LOGD(TAG, "Unhandled event type: %" PRId32, msg.event_type);
803  break;
804  }
805 }
806 
// Body of the handler for audio received through the API connection (presumably
// on_audio(const api::VoiceAssistantAudio &msg) — TODO confirm; the signature at listing line 807
// is missing).
// Appends msg.data to speaker_buffer_ at the current write index and advances the index, the
// buffered size and the received-bytes counter. A chunk that would not fit is dropped as a whole
// and an error is logged. The comparison is strict, so a chunk that would fill the buffer exactly
// is also rejected.
808 #ifdef USE_SPEAKER // We should never get to this function if there is no speaker anyway
809  if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) {
810  memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length());
811  this->speaker_buffer_index_ += msg.data.length();
812  this->speaker_buffer_size_ += msg.data.length();
813  this->speaker_bytes_received_ += msg.data.length();
// NOTE(review): "%u" is used with a length() result; confirm this matches the argument's type on
// every supported target.
814  ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data.length());
815  } else {
816  ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
817  }
818 #endif
819 }
820 
// Body of the timer event handler (presumably
// on_timer_event(const api::VoiceAssistantTimerEventResponse &msg) — TODO confirm; the signature
// at listing line 821 and all four `case` labels are missing from this extract).
// Builds a Timer from the message, stores it in timers_ under its id, logs it, fires the trigger
// matching the event type, and finally starts or stops the 1-second "timer-event" interval
// depending on whether any timers remain.
822  Timer timer = {
823  .id = msg.timer_id,
824  .name = msg.name,
825  .total_seconds = msg.total_seconds,
826  .seconds_left = msg.seconds_left,
827  .is_active = msg.is_active,
828  };
// Insert-or-overwrite happens before the switch, so even a cancelled/finished timer is stored
// first and then erased in its branch.
829  this->timers_[timer.id] = timer;
830  ESP_LOGD(TAG, "Timer Event");
831  ESP_LOGD(TAG, " Type: %" PRId32, msg.event_type);
832  ESP_LOGD(TAG, " %s", timer.to_string().c_str());
833 
// Case labels are missing; the branches are, in order: started, updated, cancelled, finished
// (as indicated by the trigger each one fires). The last two also remove the timer.
834  switch (msg.event_type) {
836  this->timer_started_trigger_->trigger(timer);
837  break;
839  this->timer_updated_trigger_->trigger(timer);
840  break;
842  this->timer_cancelled_trigger_->trigger(timer);
843  this->timers_.erase(timer.id);
844  break;
846  this->timer_finished_trigger_->trigger(timer);
847  this->timers_.erase(timer.id);
848  break;
849  }
850 
// timer_tick_running_ guards against registering the interval more than once.
851  if (this->timers_.empty()) {
852  this->cancel_interval("timer-event");
853  this->timer_tick_running_ = false;
854  } else if (!this->timer_tick_running_) {
855  this->set_interval("timer-event", 1000, [this]() { this->timer_tick_(); });
856  this->timer_tick_running_ = true;
857  }
858 }
859 
// Body of the interval callback registered as "timer-event"; the signature (listing line 860) is
// missing, and the registration calls `this->timer_tick_()` — TODO confirm the name.
// Counts every active timer down by one second (never below zero), then passes a snapshot of all
// timers — active or not — to the tick trigger.
861  std::vector<Timer> res;
862  res.reserve(this->timers_.size());
863  for (auto &pair : this->timers_) {
// Reference into the map: the decrement updates the stored timer, not a copy.
864  auto &timer = pair.second;
865  if (timer.is_active && timer.seconds_left > 0) {
866  timer.seconds_left--;
867  }
868  res.push_back(timer);
869  }
870  this->timer_tick_trigger_->trigger(res);
871 }
872 
// Body of the announcement handler (presumably
// on_announce(const api::VoiceAssistantAnnounceRequest &msg) — TODO confirm; the signature at
// listing line 873 and the statements at listing lines 877-878 are missing from this extract).
// Does nothing unless built with USE_MEDIA_PLAYER and a media player is configured. In that case
// it fires the TTS-start trigger with msg.text, the TTS-end trigger with msg.media_id and then the
// end trigger, all inline (no defer).
874 #ifdef USE_MEDIA_PLAYER
875  if (this->media_player_ != nullptr) {
876  this->tts_start_trigger_->trigger(msg.text);
879  this->tts_end_trigger_->trigger(msg.media_id);
880  this->end_trigger_->trigger();
881  }
882 #endif
883 }
884 
// Global pointer to a VoiceAssistant, null until assigned. The assignment is not visible in this
// extract (presumably done during component set-up — TODO confirm).
885 VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
886 
887 } // namespace voice_assistant
888 } // namespace esphome
889 
890 #endif // USE_VOICE_ASSISTANT
virtual size_t play(const uint8_t *data, size_t length)=0
bool is_running() const
Definition: speaker.h:31
void set_interval(const std::string &name, uint32_t interval, std::function< void()> &&f)
Set an interval function with a unique name.
Definition: component.cpp:52
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Definition: component.cpp:27
std::unordered_map< std::string, Timer > timers_
bool cancel_timeout(const std::string &name)
Cancel a timeout function.
Definition: component.cpp:73
enums::VoiceAssistantTimerEvent event_type
Definition: api_pb2.h:1814
HighFrequencyLoopRequester high_freq_
VoiceAssistant * global_voice_assistant
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
Definition: socket.cpp:51
std::unique_ptr< socket::Socket > socket_
sa_family_t ss_family
Definition: headers.h:92
Trigger< std::string > * tts_start_trigger_
void set_timeout(const std::string &name, uint32_t timeout, std::function< void()> &&f)
Set a timeout function with a unique name.
Definition: component.cpp:69
bool cancel_interval(const std::string &name)
Cancel an interval function.
Definition: component.cpp:56
void defer(const std::string &name, std::function< void()> &&f)
Defer a callback to the next loop() call.
Definition: component.cpp:130
void deallocate(T *p, size_t n)
Definition: helpers.h:678
Trigger< std::string > * tts_end_trigger_
uint32_t socklen_t
Definition: headers.h:97
VoiceAssistantAudioSettings audio_settings
Definition: api_pb2.h:1751
Trigger< std::vector< Timer > > * timer_tick_trigger_
enums::VoiceAssistantEvent event_type
Definition: api_pb2.h:1788
void client_subscription(api::APIConnection *client, bool subscribe)
virtual bool has_buffered_data() const =0
std::vector< VoiceAssistantEventData > data
Definition: api_pb2.h:1789
std::string get_client_combined_info() const
void trigger(Ts... x)
Inform the parent automation that the event has triggered.
Definition: automation.h:95
bool status_has_error() const
Definition: component.cpp:150
void status_set_error(const char *message="unspecified")
Definition: component.cpp:159
media_player::MediaPlayer * media_player_
void start()
Start running the loop continuously.
Definition: helpers.cpp:648
bool send_voice_assistant_request(const VoiceAssistantRequest &msg)
void stop()
Stop running the loop continuously.
Definition: helpers.cpp:654
const uint32_t flags
Definition: stm32flash.h:85
bool send_voice_assistant_announce_finished(const VoiceAssistantAnnounceFinished &msg)
std::unique_ptr< RingBuffer > ring_buffer_
bool send_voice_assistant_audio(const VoiceAssistantAudio &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
void status_clear_error()
Definition: component.cpp:172
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
MediaPlayerCall & set_announcement(bool announce)
virtual size_t read(int16_t *buf, size_t len)=0
virtual void start()=0
virtual void mark_failed()
Mark this component as failed.
Definition: component.cpp:118
Implementation of SPI Controller mode.
Definition: a01nyub.cpp:7
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
static std::unique_ptr< RingBuffer > create(size_t len)
Definition: ring_buffer.cpp:14
MediaPlayerCall & set_media_url(const std::string &url)
virtual void stop()=0
void on_event(const api::VoiceAssistantEventResponse &msg)
bool state
Definition: fan.h:34
Trigger< std::string > * stt_end_trigger_
void request_start(bool continuous, bool silence_detection)
Trigger< std::string, std::string > * error_trigger_
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.