ESPHome  2025.2.0
voice_assistant.cpp
Go to the documentation of this file.
1 #include "voice_assistant.h"
2 #include "esphome/core/defines.h"
3 
4 #ifdef USE_VOICE_ASSISTANT
5 
6 #include "esphome/core/log.h"
7 
8 #include <cinttypes>
9 #include <cstdio>
10 
11 namespace esphome {
12 namespace voice_assistant {
13 
14 static const char *const TAG = "voice_assistant";
15 
// If a macro named SAMPLE_RATE_HZ is already defined, remove it so the
// identifier can be declared as a constant below.
16 #ifdef SAMPLE_RATE_HZ
17 #undef SAMPLE_RATE_HZ
18 #endif
19 
// Audio sample rate in Hz; the sample-count constants below are derived from it.
20 static const size_t SAMPLE_RATE_HZ = 16000;
// Number of int16_t samples in one input buffer (512 samples). Byte sizes are
// obtained elsewhere in this file by multiplying with sizeof(int16_t).
21 static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms
// Sample count used to size the ring buffer (8192 samples; by analogy with the
// line above this corresponds to 512 ms of audio). The ring buffer is created
// with BUFFER_SIZE * sizeof(int16_t) bytes.
22 static const size_t BUFFER_SIZE = 512 * SAMPLE_RATE_HZ / 1000;
// Size in bytes of the send buffer: one input buffer's worth of int16_t samples.
23 static const size_t SEND_BUFFER_SIZE = INPUT_BUFFER_SIZE * sizeof(int16_t);
// Maximum number of bytes requested per socket read when receiving speaker audio.
24 static const size_t RECEIVE_SIZE = 1024;
// Size in bytes of the speaker buffer (16 receive chunks).
25 static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
26 
28 
30 
// NOTE(review): the signature line of this function is missing from this
// extract; presumably this is VoiceAssistant::start_udp_socket_() (it is the
// only body that sets udp_socket_running_) -- confirm against the header.
//
// Creates the datagram socket, switches it to non-blocking mode and, when a
// speaker is configured, binds it to the "any" address on port 6055.
// Returns true on success. On a fatal error the component is marked as failed
// and false is returned.
32  this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
33  if (this->socket_ == nullptr) {
34  ESP_LOGE(TAG, "Could not create socket");
35  this->mark_failed();
36  return false;
37  }
38  int enable = 1;
39  int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
40  if (err != 0) {
    // NOTE(review): the message is labelled "errno" but the value printed is the
    // return value of setsockopt(); the later checks in this function print the
    // global errno instead -- confirm which one is intended.
41  ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
42  // we can still continue
43  }
44  err = this->socket_->setblocking(false);
45  if (err != 0) {
    // NOTE(review): same as above -- the return value is printed as "errno".
46  ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
47  this->mark_failed();
48  return false;
49  }
50 
    // The socket is only bound when a speaker exists; without USE_SPEAKER (or
    // without a speaker instance) it is left unbound.
51 #ifdef USE_SPEAKER
52  if (this->speaker_ != nullptr) {
53  struct sockaddr_storage server;
54 
    // Fill `server` with the "any" address and port 6055; a zero length signals failure.
55  socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
56  if (sl == 0) {
57  ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
58  this->mark_failed();
59  return false;
60  }
61 
62  err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
63  if (err != 0) {
64  ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
65  this->mark_failed();
66  return false;
67  }
68  }
69 #endif
    // Remember that the socket is ready so callers do not set it up again.
70  this->udp_socket_running_ = true;
71  return true;
72 }
73 
75  if (this->send_buffer_ != nullptr) {
76  return true; // Already allocated
77  }
78 
79 #ifdef USE_SPEAKER
80  if (this->speaker_ != nullptr) {
82  this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
83  if (this->speaker_buffer_ == nullptr) {
84  ESP_LOGW(TAG, "Could not allocate speaker buffer");
85  return false;
86  }
87  }
88 #endif
89 
91  this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE);
92  if (this->input_buffer_ == nullptr) {
93  ESP_LOGW(TAG, "Could not allocate input buffer");
94  return false;
95  }
96 
97 #ifdef USE_ESP_ADF
98  this->vad_instance_ = vad_create(VAD_MODE_4);
99 #endif
100 
101  this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
102  if (this->ring_buffer_ == nullptr) {
103  ESP_LOGW(TAG, "Could not allocate ring buffer");
104  return false;
105  }
106 
108  this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE);
109  if (send_buffer_ == nullptr) {
110  ESP_LOGW(TAG, "Could not allocate send buffer");
111  return false;
112  }
113 
114  return true;
115 }
116 
// NOTE(review): the signature line of this function is missing from this
// extract; presumably this is VoiceAssistant::clear_buffers_() -- confirm.
//
// Zeroes every audio buffer that is currently allocated and resets the speaker
// bookkeeping counters. Buffers that are still nullptr are skipped, and nothing
// is freed here.
118  if (this->send_buffer_ != nullptr) {
119  memset(this->send_buffer_, 0, SEND_BUFFER_SIZE);
120  }
121 
122  if (this->input_buffer_ != nullptr) {
    // INPUT_BUFFER_SIZE counts int16_t samples, hence the multiplication to get bytes.
123  memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
124  }
125 
126  if (this->ring_buffer_ != nullptr) {
    // Calls reset() on the RingBuffer object itself (operator->), not on the
    // owning pointer; presumably this discards any buffered audio -- confirm.
127  this->ring_buffer_->reset();
128  }
129 
130 #ifdef USE_SPEAKER
131  if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
132  memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
133 
    // Reset fill level, write position and the running received-bytes counter.
134  this->speaker_buffer_size_ = 0;
135  this->speaker_buffer_index_ = 0;
136  this->speaker_bytes_received_ = 0;
137  }
138 #endif
139 }
140 
143  send_deallocator.deallocate(this->send_buffer_, SEND_BUFFER_SIZE);
144  this->send_buffer_ = nullptr;
145 
146  if (this->ring_buffer_ != nullptr) {
147  this->ring_buffer_.reset();
148  this->ring_buffer_ = nullptr;
149  }
150 
151 #ifdef USE_ESP_ADF
152  if (this->vad_instance_ != nullptr) {
153  vad_destroy(this->vad_instance_);
154  this->vad_instance_ = nullptr;
155  }
156 #endif
157 
159  input_deallocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE);
160  this->input_buffer_ = nullptr;
161 
162 #ifdef USE_SPEAKER
163  if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
165  speaker_deallocator.deallocate(this->speaker_buffer_, SPEAKER_BUFFER_SIZE);
166  this->speaker_buffer_ = nullptr;
167  }
168 #endif
169 }
170 
// NOTE(review): the signature line of this function is missing from this
// extract; presumably this is VoiceAssistant::reset_conversation_id() -- confirm.
//
// Clears the stored conversation ID and logs that it was reset.
172  this->conversation_id_ = "";
173  ESP_LOGD(TAG, "reset conversation ID");
174 }
175 
// NOTE(review): the signature line of this function is missing from this
// extract; presumably this is VoiceAssistant::read_microphone_() -- confirm.
//
// Reads one chunk of audio from the microphone into input_buffer_ and appends
// it to the ring buffer. Returns the number of bytes read; 0 when the
// microphone is not running or delivered no data.
177  size_t bytes_read = 0;
178  if (this->mic_->is_running()) { // Read audio into input buffer
179  bytes_read = this->mic_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
180  if (bytes_read == 0) {
    // Nothing was read: zero the input buffer and skip the ring-buffer write.
181  memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t));
182  return 0;
183  }
184  // Write audio into ring buffer
    // The return value of write() is not checked here.
185  this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
186  } else {
187  ESP_LOGD(TAG, "microphone not running");
188  }
189  return bytes_read;
190 }
191 
193  if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
195  if (this->mic_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
197  } else {
199  }
200  this->continuous_ = false;
201  this->signal_stop_();
202  this->clear_buffers_();
203  return;
204  }
205  switch (this->state_) {
206  case State::IDLE: {
207  if (this->continuous_ && this->desired_state_ == State::IDLE) {
208  this->idle_trigger_->trigger();
209 #ifdef USE_ESP_ADF
210  if (this->use_wake_word_) {
212  } else
213 #endif
214  {
216  }
217  } else {
218  this->high_freq_.stop();
219  }
220  break;
221  }
223  ESP_LOGD(TAG, "Starting Microphone");
224  if (!this->allocate_buffers_()) {
225  this->status_set_error("Failed to allocate buffers");
226  return;
227  }
228  if (this->status_has_error()) {
229  this->status_clear_error();
230  }
231  this->clear_buffers_();
232 
233  this->mic_->start();
234  this->high_freq_.start();
236  break;
237  }
239  if (this->mic_->is_running()) {
240  this->set_state_(this->desired_state_);
241  }
242  break;
243  }
244 #ifdef USE_ESP_ADF
245  case State::WAIT_FOR_VAD: {
246  this->read_microphone_();
247  ESP_LOGD(TAG, "Waiting for speech...");
249  break;
250  }
251  case State::WAITING_FOR_VAD: {
252  size_t bytes_read = this->read_microphone_();
253  if (bytes_read > 0) {
254  vad_state_t vad_state =
255  vad_process(this->vad_instance_, this->input_buffer_, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
256  if (vad_state == VAD_SPEECH) {
257  if (this->vad_counter_ < this->vad_threshold_) {
258  this->vad_counter_++;
259  } else {
260  ESP_LOGD(TAG, "VAD detected speech");
262 
263  // Reset for next time
264  this->vad_counter_ = 0;
265  }
266  } else {
267  if (this->vad_counter_ > 0) {
268  this->vad_counter_--;
269  }
270  }
271  }
272  break;
273  }
274 #endif
275  case State::START_PIPELINE: {
276  this->read_microphone_();
277  ESP_LOGD(TAG, "Requesting start...");
278  uint32_t flags = 0;
279  if (this->use_wake_word_)
281  if (this->silence_detection_)
283  api::VoiceAssistantAudioSettings audio_settings;
284  audio_settings.noise_suppression_level = this->noise_suppression_level_;
285  audio_settings.auto_gain = this->auto_gain_;
286  audio_settings.volume_multiplier = this->volume_multiplier_;
287 
289  msg.start = true;
290  msg.conversation_id = this->conversation_id_;
291  msg.flags = flags;
292  msg.audio_settings = audio_settings;
293  msg.wake_word_phrase = this->wake_word_;
294  this->wake_word_ = "";
295 
296  if (this->api_client_ == nullptr || !this->api_client_->send_voice_assistant_request(msg)) {
297  ESP_LOGW(TAG, "Could not request start");
298  this->error_trigger_->trigger("not-connected", "Could not request start");
299  this->continuous_ = false;
301  break;
302  }
304  this->set_timeout("reset-conversation_id", this->conversation_timeout_,
305  [this]() { this->reset_conversation_id(); });
306  break;
307  }
309  this->read_microphone_();
310  break; // State changed when udp server port received
311  }
313  this->read_microphone_();
314  size_t available = this->ring_buffer_->available();
315  while (available >= SEND_BUFFER_SIZE) {
316  size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
317  if (this->audio_mode_ == AUDIO_MODE_API) {
319  msg.data.assign((char *) this->send_buffer_, read_bytes);
321  } else {
322  if (!this->udp_socket_running_) {
323  if (!this->start_udp_socket_()) {
325  break;
326  }
327  }
328  this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
329  sizeof(this->dest_addr_));
330  }
331  available = this->ring_buffer_->available();
332  }
333 
334  break;
335  }
336  case State::STOP_MICROPHONE: {
337  if (this->mic_->is_running()) {
338  this->mic_->stop();
340  } else {
341  this->set_state_(this->desired_state_);
342  }
343  break;
344  }
346  if (this->mic_->is_stopped()) {
347  this->set_state_(this->desired_state_);
348  }
349  break;
350  }
352  break; // State changed by events
353  }
355  bool playing = false;
356 #ifdef USE_SPEAKER
357  if (this->speaker_ != nullptr) {
358  ssize_t received_len = 0;
359  if (this->audio_mode_ == AUDIO_MODE_UDP) {
360  if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
361  received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
362  if (received_len > 0) {
363  this->speaker_buffer_index_ += received_len;
364  this->speaker_buffer_size_ += received_len;
365  this->speaker_bytes_received_ += received_len;
366  }
367  } else {
368  ESP_LOGD(TAG, "Receive buffer full");
369  }
370  }
371  // Build a small buffer of audio before sending to the speaker
372  bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
373  if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
374  this->write_speaker_();
375  if (this->wait_for_stream_end_) {
376  this->cancel_timeout("playing");
377  if (end_of_stream) {
378  ESP_LOGD(TAG, "End of audio stream received");
379  this->cancel_timeout("speaker-timeout");
381  }
382  break; // We dont want to timeout here as the STREAM_END event will take care of that.
383  }
384  playing = this->speaker_->is_running();
385  }
386 #endif
387 #ifdef USE_MEDIA_PLAYER
388  if (this->media_player_ != nullptr) {
390  }
391 #endif
392  if (playing) {
393  this->start_playback_timeout_();
394  }
395  break;
396  }
398 #ifdef USE_SPEAKER
399  if (this->speaker_ != nullptr) {
400  if (this->speaker_buffer_size_ > 0) {
401  this->write_speaker_();
402  break;
403  }
404  if (this->speaker_->has_buffered_data() || this->speaker_->is_running()) {
405  break;
406  }
407  ESP_LOGD(TAG, "Speaker has finished outputting all audio");
408  this->speaker_->stop();
409  this->cancel_timeout("speaker-timeout");
410  this->cancel_timeout("playing");
411 
412  this->clear_buffers_();
413 
414  this->wait_for_stream_end_ = false;
415  this->stream_ended_ = false;
416 
418  }
419 #endif
421  break;
422  }
423  default:
424  break;
425  }
426 }
427 
428 #ifdef USE_SPEAKER
// NOTE(review): the signature line of this function is missing from this
// extract; presumably this is VoiceAssistant::write_speaker_() -- confirm.
//
// Hands the front of the speaker buffer to the speaker, at most 4 KiB per call.
// Does nothing when there is no speaker, no speaker buffer, or no buffered data.
430  if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
431  if (this->speaker_buffer_size_ > 0) {
432  size_t write_chunk = std::min<size_t>(this->speaker_buffer_size_, 4 * 1024);
    // play() reports how many bytes it accepted, which may be fewer than requested.
433  size_t written = this->speaker_->play(this->speaker_buffer_, write_chunk);
434  if (written > 0) {
    // Move the bytes that were not consumed to the start of the buffer and
    // pull the fill level and write position back by the same amount.
435  memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written);
436  this->speaker_buffer_size_ -= written;
437  this->speaker_buffer_index_ -= written;
    // Set the "speaker-timeout" timeout (5000, presumably milliseconds --
    // confirm), whose callback stops the speaker.
438  this->set_timeout("speaker-timeout", 5000, [this]() { this->speaker_->stop(); });
439  } else {
440  ESP_LOGV(TAG, "Speaker buffer full, trying again next loop");
441  }
442  }
443  }
444 }
445 #endif
446 
448  if (!subscribe) {
449  if (this->api_client_ == nullptr || client != this->api_client_) {
450  ESP_LOGE(TAG, "Client attempting to unsubscribe that is not the current API Client");
451  return;
452  }
453  this->api_client_ = nullptr;
455  return;
456  }
457 
458  if (this->api_client_ != nullptr) {
459  ESP_LOGE(TAG, "Multiple API Clients attempting to connect to Voice Assistant");
460  ESP_LOGE(TAG, "Current client: %s", this->api_client_->get_client_combined_info().c_str());
461  ESP_LOGE(TAG, "New client: %s", client->get_client_combined_info().c_str());
462  return;
463  }
464 
465  this->api_client_ = client;
467 }
468 
// Maps a State value to a log string holding its name, for use in log messages.
// Values without a matching case yield "UNKNOWN".
// NOTE(review): most `case State::...:` label lines are missing from this
// extract, so several `return` statements appear without their label; each
// returned string presumably belongs to the State enumerator of the same name --
// restore the labels from the original source before editing this function.
469 static const LogString *voice_assistant_state_to_string(State state) {
470  switch (state) {
471  case State::IDLE:
472  return LOG_STR("IDLE");
474  return LOG_STR("START_MICROPHONE");
476  return LOG_STR("STARTING_MICROPHONE");
477  case State::WAIT_FOR_VAD:
478  return LOG_STR("WAIT_FOR_VAD");
480  return LOG_STR("WAITING_FOR_VAD");
482  return LOG_STR("START_PIPELINE");
484  return LOG_STR("STARTING_PIPELINE");
486  return LOG_STR("STREAMING_MICROPHONE");
488  return LOG_STR("STOP_MICROPHONE");
490  return LOG_STR("STOPPING_MICROPHONE");
492  return LOG_STR("AWAITING_RESPONSE");
494  return LOG_STR("STREAMING_RESPONSE");
496  return LOG_STR("RESPONSE_FINISHED");
497  default:
498  return LOG_STR("UNKNOWN");
499  }
500 };
501 
// NOTE(review): the signature line of this function is missing from this
// extract; presumably this is the single-argument VoiceAssistant::set_state_(State)
// overload -- confirm.
//
// Stores the new state and logs the transition (old name -> new name) at debug level.
503  State old_state = this->state_;
504  this->state_ = state;
505  ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
506  LOG_STR_ARG(voice_assistant_state_to_string(state)));
507 }
508 
// Sets the current state and, in the same call, the state that should be entered next.
//
// @param state          State to switch to now (applied via the single-argument overload).
// @param desired_state  State stored in desired_state_; other code in this file
//                       later passes it back to set_state_() once a transitional
//                       step has completed.
509 void VoiceAssistant::set_state_(State state, State desired_state) {
510  this->set_state_(state);
511  this->desired_state_ = desired_state;
512  ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
513 }
514 
516  ESP_LOGE(TAG, "Failed to start server. See Home Assistant logs for more details.");
517  this->error_trigger_->trigger("failed-to-start", "Failed to start server. See Home Assistant logs for more details.");
519 }
520 
522  if (this->state_ != State::STARTING_PIPELINE) {
523  this->signal_stop_();
524  return;
525  }
526 
527  ESP_LOGD(TAG, "Client started, streaming microphone");
528  this->audio_mode_ = AUDIO_MODE_API;
529 
530  if (this->mic_->is_running()) {
532  } else {
534  }
535 }
536 
// Switches to UDP audio mode and records where microphone audio must be sent.
//
// @param addr  Destination address; copied into dest_addr_.
// @param port  Destination port in host byte order; stored in network byte order.
//
// If the state machine is not in STARTING_PIPELINE the request is rejected and
// a stop is signalled instead.
537 void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
538  if (this->state_ != State::STARTING_PIPELINE) {
539  this->signal_stop_();
540  return;
541  }
542 
543  ESP_LOGD(TAG, "Client started, streaming microphone");
544  this->audio_mode_ = AUDIO_MODE_UDP;
545 
    // Copy the whole address first, then patch in the port at the position that
    // matches the address family. For an unsupported family the function returns
    // without changing the state (audio_mode_ has already been set above).
546  memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
547  if (this->dest_addr_.ss_family == AF_INET) {
548  ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
549  }
550 #if LWIP_IPV6
551  else if (this->dest_addr_.ss_family == AF_INET6) {
552  ((struct sockaddr_in6 *) &this->dest_addr_)->sin6_port = htons(port);
553  }
554 #endif
555  else {
556  ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family);
557  return;
558  }
559 
    // NOTE(review): the statements inside both branches below are missing from
    // this extract; presumably each branch selects the next state depending on
    // whether the microphone is already running -- restore from the original source.
560  if (this->mic_->is_running()) {
562  } else {
564  }
565 }
566 
// Requests the start of a voice assistant run.
//
// @param continuous         Stored in continuous_ when the request is accepted.
// @param silence_detection  Stored in silence_detection_ when the request is accepted.
//
// The request is only acted upon while the state machine is IDLE; in any other
// state the call has no effect. Without a connected API client an error is
// logged and continuous mode is switched off.
567 void VoiceAssistant::request_start(bool continuous, bool silence_detection) {
568  if (this->api_client_ == nullptr) {
569  ESP_LOGE(TAG, "No API client connected");
    // NOTE(review): one statement is missing from this extract at this position.
571  this->continuous_ = false;
572  return;
573  }
574  if (this->state_ == State::IDLE) {
575  this->continuous_ = continuous;
576  this->silence_detection_ = silence_detection;
    // NOTE(review): the statements inside both branches below are missing from
    // this extract; presumably they select the first state of the run, with a
    // different choice when use_wake_word_ is set under USE_ESP_ADF -- restore
    // from the original source.
577 #ifdef USE_ESP_ADF
578  if (this->use_wake_word_) {
580  } else
581 #endif
582  {
584  }
585  }
586 }
587 
589  this->continuous_ = false;
590 
591  switch (this->state_) {
592  case State::IDLE:
593  break;
596  case State::WAIT_FOR_VAD:
600  break;
603  this->signal_stop_();
605  break;
608  this->desired_state_ = State::IDLE;
609  break;
611  this->signal_stop_();
612  break;
615  break; // Let the incoming audio stream finish then it will go to idle.
616  }
617 }
618 
620  memset(&this->dest_addr_, 0, sizeof(this->dest_addr_));
621  if (this->api_client_ == nullptr) {
622  return;
623  }
624  ESP_LOGD(TAG, "Signaling stop...");
626  msg.start = false;
628 }
629 
631  this->set_timeout("playing", 100, [this]() {
632  this->cancel_timeout("speaker-timeout");
634 
636  msg.success = true;
638  });
639 }
640 
642  ESP_LOGD(TAG, "Event Type: %" PRId32, msg.event_type);
643  switch (msg.event_type) {
645  ESP_LOGD(TAG, "Assist Pipeline running");
646  this->defer([this]() { this->start_trigger_->trigger(); });
647  break;
649  break;
651  ESP_LOGD(TAG, "Wake word detected");
652  this->defer([this]() { this->wake_word_detected_trigger_->trigger(); });
653  break;
654  }
656  ESP_LOGD(TAG, "STT started");
657  this->defer([this]() { this->listening_trigger_->trigger(); });
658  break;
660  std::string text;
661  for (auto arg : msg.data) {
662  if (arg.name == "text") {
663  text = std::move(arg.value);
664  }
665  }
666  if (text.empty()) {
667  ESP_LOGW(TAG, "No text in STT_END event");
668  return;
669  }
670  ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str());
671  this->defer([this, text]() { this->stt_end_trigger_->trigger(text); });
672  break;
673  }
675  ESP_LOGD(TAG, "Intent started");
676  this->defer([this]() { this->intent_start_trigger_->trigger(); });
677  break;
679  for (auto arg : msg.data) {
680  if (arg.name == "conversation_id") {
681  this->conversation_id_ = std::move(arg.value);
682  }
683  }
684  this->defer([this]() { this->intent_end_trigger_->trigger(); });
685  break;
686  }
688  std::string text;
689  for (auto arg : msg.data) {
690  if (arg.name == "text") {
691  text = std::move(arg.value);
692  }
693  }
694  if (text.empty()) {
695  ESP_LOGW(TAG, "No text in TTS_START event");
696  return;
697  }
698  ESP_LOGD(TAG, "Response: \"%s\"", text.c_str());
699  this->defer([this, text]() {
700  this->tts_start_trigger_->trigger(text);
701 #ifdef USE_SPEAKER
702  if (this->speaker_ != nullptr) {
703  this->speaker_->start();
704  }
705 #endif
706  });
707  break;
708  }
710  std::string url;
711  for (auto arg : msg.data) {
712  if (arg.name == "url") {
713  url = std::move(arg.value);
714  }
715  }
716  if (url.empty()) {
717  ESP_LOGW(TAG, "No url in TTS_END event");
718  return;
719  }
720  ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str());
721  this->defer([this, url]() {
722 #ifdef USE_MEDIA_PLAYER
723  if (this->media_player_ != nullptr) {
725  // Start the playback timeout, as the media player state isn't immediately updated
726  this->start_playback_timeout_();
727  }
728 #endif
729  this->tts_end_trigger_->trigger(url);
730  });
732  this->set_state_(new_state, new_state);
733  break;
734  }
736  ESP_LOGD(TAG, "Assist Pipeline ended");
737  if ((this->state_ == State::STARTING_PIPELINE) || (this->state_ == State::AWAITING_RESPONSE)) {
738  // Pipeline ended before starting microphone
739  // Or there wasn't a TTS start event ("nevermind")
741  } else if (this->state_ == State::STREAMING_MICROPHONE) {
742  this->ring_buffer_->reset();
743 #ifdef USE_ESP_ADF
744  if (this->use_wake_word_) {
745  // No need to stop the microphone since we didn't use the speaker
747  } else
748 #endif
749  {
751  }
752  }
753  this->defer([this]() { this->end_trigger_->trigger(); });
754  break;
755  }
757  std::string code = "";
758  std::string message = "";
759  for (auto arg : msg.data) {
760  if (arg.name == "code") {
761  code = std::move(arg.value);
762  } else if (arg.name == "message") {
763  message = std::move(arg.value);
764  }
765  }
766  if (code == "wake-word-timeout" || code == "wake_word_detection_aborted" || code == "no_wake_word") {
767  // Don't change state here since either the "tts-end" or "run-end" events will do it.
768  return;
769  } else if (code == "wake-provider-missing" || code == "wake-engine-missing") {
770  // Wake word is not set up or not ready on Home Assistant so stop and do not retry until user starts again.
771  this->defer([this, code, message]() {
772  this->request_stop();
773  this->error_trigger_->trigger(code, message);
774  });
775  return;
776  }
777  ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str());
778  if (this->state_ != State::IDLE) {
779  this->signal_stop_();
781  }
782  this->defer([this, code, message]() { this->error_trigger_->trigger(code, message); });
783  break;
784  }
786 #ifdef USE_SPEAKER
787  if (this->speaker_ != nullptr) {
788  this->wait_for_stream_end_ = true;
789  ESP_LOGD(TAG, "TTS stream start");
790  this->defer([this] { this->tts_stream_start_trigger_->trigger(); });
791  }
792 #endif
793  break;
794  }
796 #ifdef USE_SPEAKER
797  if (this->speaker_ != nullptr) {
798  this->stream_ended_ = true;
799  ESP_LOGD(TAG, "TTS stream end");
800  }
801 #endif
802  break;
803  }
805  ESP_LOGD(TAG, "Starting STT by VAD");
806  this->defer([this]() { this->stt_vad_start_trigger_->trigger(); });
807  break;
809  ESP_LOGD(TAG, "STT by VAD end");
811  this->defer([this]() { this->stt_vad_end_trigger_->trigger(); });
812  break;
813  default:
814  ESP_LOGD(TAG, "Unhandled event type: %" PRId32, msg.event_type);
815  break;
816  }
817 }
818 
// NOTE(review): the signature line of this function is missing from this
// extract; presumably this is VoiceAssistant::on_audio(const api::VoiceAssistantAudio &msg)
// -- confirm.
//
// Appends the audio payload of `msg` to the speaker buffer. If the payload does
// not fit, it is dropped as a whole and an error is logged.
820 #ifdef USE_SPEAKER // We should never get to this function if there is no speaker anyway
821  if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
    // Strict '<': a payload that would fill the buffer exactly is rejected as well.
822  if (this->speaker_buffer_index_ + msg.data.length() < SPEAKER_BUFFER_SIZE) {
823  memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), msg.data.length());
824  this->speaker_buffer_index_ += msg.data.length();
825  this->speaker_buffer_size_ += msg.data.length();
826  this->speaker_bytes_received_ += msg.data.length();
    // NOTE(review): %u expects unsigned int; if length() returns size_t this
    // only matches on targets where both have the same width -- confirm.
827  ESP_LOGV(TAG, "Received audio: %u bytes from API", msg.data.length());
828  } else {
829  ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
830  }
831  }
832 #endif
833 }
834 
836  Timer timer = {
837  .id = msg.timer_id,
838  .name = msg.name,
839  .total_seconds = msg.total_seconds,
840  .seconds_left = msg.seconds_left,
841  .is_active = msg.is_active,
842  };
843  this->timers_[timer.id] = timer;
844  ESP_LOGD(TAG, "Timer Event");
845  ESP_LOGD(TAG, " Type: %" PRId32, msg.event_type);
846  ESP_LOGD(TAG, " %s", timer.to_string().c_str());
847 
848  switch (msg.event_type) {
850  this->timer_started_trigger_->trigger(timer);
851  break;
853  this->timer_updated_trigger_->trigger(timer);
854  break;
856  this->timer_cancelled_trigger_->trigger(timer);
857  this->timers_.erase(timer.id);
858  break;
860  this->timer_finished_trigger_->trigger(timer);
861  this->timers_.erase(timer.id);
862  break;
863  }
864 
865  if (this->timers_.empty()) {
866  this->cancel_interval("timer-event");
867  this->timer_tick_running_ = false;
868  } else if (!this->timer_tick_running_) {
869  this->set_interval("timer-event", 1000, [this]() { this->timer_tick_(); });
870  this->timer_tick_running_ = true;
871  }
872 }
873 
// NOTE(review): the signature line of this function is missing from this
// extract; presumably this is VoiceAssistant::timer_tick_(), the callback of the
// "timer-event" interval -- confirm.
//
// Counts every active timer down by one and fires timer_tick_trigger_ with a
// snapshot of all timers (active or not).
875  std::vector<Timer> res;
876  res.reserve(this->timers_.size());
877  for (auto &pair : this->timers_) {
    // Bound by reference so the decrement updates the stored timer.
878  auto &timer = pair.second;
    // Inactive timers and timers already at zero are left unchanged.
879  if (timer.is_active && timer.seconds_left > 0) {
880  timer.seconds_left--;
881  }
    // The trigger receives copies, taken after the decrement.
882  res.push_back(timer);
883  }
884  this->timer_tick_trigger_->trigger(res);
885 }
886 
888 #ifdef USE_MEDIA_PLAYER
889  if (this->media_player_ != nullptr) {
890  this->tts_start_trigger_->trigger(msg.text);
893  this->tts_end_trigger_->trigger(msg.media_id);
894  this->end_trigger_->trigger();
895  }
896 #endif
897 }
898 
// Global pointer to a VoiceAssistant instance; it is nullptr until assigned.
// NOTE(review): the assignment site is not visible in this extract -- presumably
// the VoiceAssistant constructor sets it; confirm.
899 VoiceAssistant *global_voice_assistant = nullptr; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
900 
901 } // namespace voice_assistant
902 } // namespace esphome
903 
904 #endif // USE_VOICE_ASSISTANT
bool is_running() const
Definition: speaker.h:66
void set_interval(const std::string &name, uint32_t interval, std::function< void()> &&f)
Set an interval function with a unique name.
Definition: component.cpp:52
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
Definition: component.cpp:27
std::unordered_map< std::string, Timer > timers_
bool cancel_timeout(const std::string &name)
Cancel a timeout function.
Definition: component.cpp:73
enums::VoiceAssistantTimerEvent event_type
Definition: api_pb2.h:1815
HighFrequencyLoopRequester high_freq_
VoiceAssistant * global_voice_assistant
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
Definition: socket.cpp:51
std::unique_ptr< socket::Socket > socket_
sa_family_t ss_family
Definition: headers.h:92
Trigger< std::string > * tts_start_trigger_
void set_timeout(const std::string &name, uint32_t timeout, std::function< void()> &&f)
Set a timeout function with a unique name.
Definition: component.cpp:69
bool cancel_interval(const std::string &name)
Cancel an interval function.
Definition: component.cpp:56
void defer(const std::string &name, std::function< void()> &&f)
Defer a callback to the next loop() call.
Definition: component.cpp:130
Trigger< std::string > * tts_end_trigger_
T * allocate(size_t n)
Definition: helpers.h:703
uint32_t socklen_t
Definition: headers.h:97
VoiceAssistantAudioSettings audio_settings
Definition: api_pb2.h:1752
Trigger< std::vector< Timer > > * timer_tick_trigger_
enums::VoiceAssistantEvent event_type
Definition: api_pb2.h:1789
void client_subscription(api::APIConnection *client, bool subscribe)
virtual bool has_buffered_data() const =0
std::vector< VoiceAssistantEventData > data
Definition: api_pb2.h:1790
std::string get_client_combined_info() const
void trigger(Ts... x)
Inform the parent automation that the event has triggered.
Definition: automation.h:95
bool status_has_error() const
Definition: component.cpp:150
void status_set_error(const char *message="unspecified")
Definition: component.cpp:159
media_player::MediaPlayer * media_player_
void start()
Start running the loop continuously.
Definition: helpers.cpp:674
bool send_voice_assistant_request(const VoiceAssistantRequest &msg)
void stop()
Stop running the loop continuously.
Definition: helpers.cpp:680
const uint32_t flags
Definition: stm32flash.h:85
bool send_voice_assistant_announce_finished(const VoiceAssistantAnnounceFinished &msg)
void deallocate(T *p, size_t n)
Definition: helpers.h:720
std::unique_ptr< RingBuffer > ring_buffer_
bool send_voice_assistant_audio(const VoiceAssistantAudio &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
void status_clear_error()
Definition: component.cpp:172
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
MediaPlayerCall & set_announcement(bool announce)
virtual size_t read(int16_t *buf, size_t len)=0
virtual void start()=0
virtual void mark_failed()
Mark this component as failed.
Definition: component.cpp:118
Implementation of SPI Controller mode.
Definition: a01nyub.cpp:7
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
virtual size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait)
Plays the provided audio data.
Definition: speaker.h:38
An STL allocator that uses SPI or internal RAM.
Definition: helpers.h:683
static std::unique_ptr< RingBuffer > create(size_t len)
Definition: ring_buffer.cpp:22
MediaPlayerCall & set_media_url(const std::string &url)
virtual void stop()=0
void on_event(const api::VoiceAssistantEventResponse &msg)
bool state
Definition: fan.h:34
Trigger< std::string > * stt_end_trigger_
void request_start(bool continuous, bool silence_detection)
Trigger< std::string, std::string > * error_trigger_
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.