4 #ifdef USE_VOICE_ASSISTANT 12 namespace voice_assistant {
// Log tag: passed as the first argument to the ESP_LOGx macros used in this file.
14 static const char *
const TAG =
"voice_assistant";
// Sample rate in Hz; the millisecond-based buffer sizes below are derived from it.
20 static const size_t SAMPLE_RATE_HZ = 16000;
// 32 ms worth of samples (32 * 16000 / 1000 = 512). Counted in int16_t elements:
// the input buffer is cleared with INPUT_BUFFER_SIZE * sizeof(int16_t) bytes.
21 static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000;
// 512 ms worth of samples (512 * 16000 / 1000 = 8192).
// NOTE(review): the buffer this sizes is not visible in this excerpt - presumably the ring buffer; confirm.
22 static const size_t BUFFER_SIZE = 512 * SAMPLE_RATE_HZ / 1000;
// Size in bytes of INPUT_BUFFER_SIZE int16_t samples; the send loop drains data
// while at least this many bytes are available.
23 static const size_t SEND_BUFFER_SIZE = INPUT_BUFFER_SIZE *
sizeof(int16_t);
// NOTE(review): presumably the maximum number of bytes taken per receive call - usage is not visible here; confirm.
24 static const size_t RECEIVE_SIZE = 1024;
// Sixteen RECEIVE_SIZE units (16 * 1024 = 16384).
25 static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;
34 ESP_LOGE(TAG,
"Could not create socket");
39 int err = this->
socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable,
sizeof(
int));
41 ESP_LOGW(TAG,
"Socket unable to set reuseaddr: errno %d", err);
44 err = this->
socket_->setblocking(
false);
46 ESP_LOGE(TAG,
"Socket unable to set nonblocking mode: errno %d", err);
57 ESP_LOGE(TAG,
"Socket unable to set sockaddr: errno %d", errno);
64 ESP_LOGE(TAG,
"Socket unable to bind: errno %d", errno);
84 ESP_LOGW(TAG,
"Could not allocate speaker buffer");
93 ESP_LOGW(TAG,
"Could not allocate input buffer");
103 ESP_LOGW(TAG,
"Could not allocate ring buffer");
110 ESP_LOGW(TAG,
"Could not allocate send buffer");
123 memset(this->
input_buffer_, 0, INPUT_BUFFER_SIZE *
sizeof(int16_t));
173 ESP_LOGD(TAG,
"reset conversation ID");
177 size_t bytes_read = 0;
180 if (bytes_read == 0) {
181 memset(this->
input_buffer_, 0, INPUT_BUFFER_SIZE *
sizeof(int16_t));
187 ESP_LOGD(TAG,
"microphone not running");
223 ESP_LOGD(TAG,
"Starting Microphone");
247 ESP_LOGD(TAG,
"Waiting for speech...");
253 if (bytes_read > 0) {
254 vad_state_t vad_state =
256 if (vad_state == VAD_SPEECH) {
260 ESP_LOGD(TAG,
"VAD detected speech");
277 ESP_LOGD(TAG,
"Requesting start...");
297 ESP_LOGW(TAG,
"Could not request start");
315 while (available >= SEND_BUFFER_SIZE) {
319 msg.
data.assign((
char *) this->send_buffer_, read_bytes);
329 sizeof(this->dest_addr_));
355 bool playing =
false;
358 ssize_t received_len = 0;
362 if (received_len > 0) {
368 ESP_LOGD(TAG,
"Receive buffer full");
378 ESP_LOGD(TAG,
"End of audio stream received");
387 #ifdef USE_MEDIA_PLAYER 407 ESP_LOGD(TAG,
"Speaker has finished outputting all audio");
436 this->speaker_buffer_size_ -= written;
440 ESP_LOGV(TAG,
"Speaker buffer full, trying again next loop");
450 ESP_LOGE(TAG,
"Client attempting to unsubscribe that is not the current API Client");
459 ESP_LOGE(TAG,
"Multiple API Clients attempting to connect to Voice Assistant");
469 static const LogString *voice_assistant_state_to_string(
State state) {
472 return LOG_STR(
"IDLE");
474 return LOG_STR(
"START_MICROPHONE");
476 return LOG_STR(
"STARTING_MICROPHONE");
478 return LOG_STR(
"WAIT_FOR_VAD");
480 return LOG_STR(
"WAITING_FOR_VAD");
482 return LOG_STR(
"START_PIPELINE");
484 return LOG_STR(
"STARTING_PIPELINE");
486 return LOG_STR(
"STREAMING_MICROPHONE");
488 return LOG_STR(
"STOP_MICROPHONE");
490 return LOG_STR(
"STOPPING_MICROPHONE");
492 return LOG_STR(
"AWAITING_RESPONSE");
494 return LOG_STR(
"STREAMING_RESPONSE");
496 return LOG_STR(
"RESPONSE_FINISHED");
498 return LOG_STR(
"UNKNOWN");
505 ESP_LOGD(TAG,
"State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
506 LOG_STR_ARG(voice_assistant_state_to_string(state)));
512 ESP_LOGD(TAG,
"Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
516 ESP_LOGE(TAG,
"Failed to start server. See Home Assistant logs for more details.");
517 this->
error_trigger_->
trigger(
"failed-to-start",
"Failed to start server. See Home Assistant logs for more details.");
527 ESP_LOGD(TAG,
"Client started, streaming microphone");
543 ESP_LOGD(TAG,
"Client started, streaming microphone");
569 ESP_LOGE(TAG,
"No API client connected");
624 ESP_LOGD(TAG,
"Signaling stop...");
642 ESP_LOGD(TAG,
"Event Type: %" PRId32, msg.
event_type);
645 ESP_LOGD(TAG,
"Assist Pipeline running");
651 ESP_LOGD(TAG,
"Wake word detected");
656 ESP_LOGD(TAG,
"STT started");
661 for (
auto arg : msg.
data) {
662 if (arg.name ==
"text") {
663 text = std::move(arg.value);
667 ESP_LOGW(TAG,
"No text in STT_END event");
670 ESP_LOGD(TAG,
"Speech recognised as: \"%s\"", text.c_str());
675 ESP_LOGD(TAG,
"Intent started");
679 for (
auto arg : msg.
data) {
680 if (arg.name ==
"conversation_id") {
689 for (
auto arg : msg.
data) {
690 if (arg.name ==
"text") {
691 text = std::move(arg.value);
695 ESP_LOGW(TAG,
"No text in TTS_START event");
698 ESP_LOGD(TAG,
"Response: \"%s\"", text.c_str());
699 this->
defer([
this, text]() {
711 for (
auto arg : msg.
data) {
712 if (arg.name ==
"url") {
713 url = std::move(arg.value);
717 ESP_LOGW(TAG,
"No url in TTS_END event");
720 ESP_LOGD(TAG,
"Response URL: \"%s\"", url.c_str());
721 this->
defer([
this, url]() {
722 #ifdef USE_MEDIA_PLAYER 736 ESP_LOGD(TAG,
"Assist Pipeline ended");
757 std::string code =
"";
758 std::string message =
"";
759 for (
auto arg : msg.
data) {
760 if (arg.name ==
"code") {
761 code = std::move(arg.value);
762 }
else if (arg.name ==
"message") {
763 message = std::move(arg.value);
766 if (code ==
"wake-word-timeout" || code ==
"wake_word_detection_aborted" || code ==
"no_wake_word") {
769 }
else if (code ==
"wake-provider-missing" || code ==
"wake-engine-missing") {
771 this->
defer([
this, code, message]() {
777 ESP_LOGE(TAG,
"Error: %s - %s", code.c_str(), message.c_str());
789 ESP_LOGD(TAG,
"TTS stream start");
799 ESP_LOGD(TAG,
"TTS stream end");
805 ESP_LOGD(TAG,
"Starting STT by VAD");
809 ESP_LOGD(TAG,
"STT by VAD end");
814 ESP_LOGD(TAG,
"Unhandled event type: %" PRId32, msg.
event_type);
820 #ifdef USE_SPEAKER // We should never get to this function if there is no speaker anyway 827 ESP_LOGV(TAG,
"Received audio: %u bytes from API", msg.
data.length());
829 ESP_LOGE(TAG,
"Cannot receive audio, buffer is full");
844 ESP_LOGD(TAG,
"Timer Event");
845 ESP_LOGD(TAG,
" Type: %" PRId32, msg.
event_type);
846 ESP_LOGD(TAG,
" %s", timer.
to_string().c_str());
875 std::vector<Timer> res;
876 res.reserve(this->
timers_.size());
877 for (
auto &pair : this->
timers_) {
878 auto &timer = pair.second;
879 if (timer.is_active && timer.seconds_left > 0) {
880 timer.seconds_left--;
882 res.push_back(timer);
888 #ifdef USE_MEDIA_PLAYER 904 #endif // USE_VOICE_ASSISTANT Trigger * tts_stream_start_trigger_
std::string wake_word_phrase
Trigger< Timer > * timer_cancelled_trigger_
bool wait_for_stream_end_
void set_interval(const std::string &name, uint32_t interval, std::function< void()> &&f)
Set an interval function with a unique name.
const float AFTER_CONNECTION
For components that should be initialized after a data connection (API/MQTT) is connected.
std::unordered_map< std::string, Timer > timers_
bool cancel_timeout(const std::string &name)
Cancel a timeout function.
enums::VoiceAssistantTimerEvent event_type
void deallocate_buffers_()
Trigger * intent_end_trigger_
HighFrequencyLoopRequester high_freq_
VoiceAssistant * global_voice_assistant
std::string conversation_id_
socklen_t set_sockaddr_any(struct sockaddr *addr, socklen_t addrlen, uint16_t port)
Set a sockaddr to the any address and specified port for the IP version used by socket_ip().
std::string conversation_id
api::APIConnection * api_client_
Trigger< Timer > * timer_finished_trigger_
float get_setup_priority() const override
vad_handle_t vad_instance_
std::unique_ptr< socket::Socket > socket_
uint32_t noise_suppression_level
Trigger< std::string > * tts_start_trigger_
void set_timeout(const std::string &name, uint32_t timeout, std::function< void()> &&f)
Set a timeout function with a unique name.
bool cancel_interval(const std::string &name)
Cancel an interval function.
void defer(const std::string &name, std::function< void()> &&f)
Defer a callback to the next loop() call.
Trigger< std::string > * tts_end_trigger_
VoiceAssistantAudioSettings audio_settings
microphone::Microphone * mic_
Trigger< std::vector< Timer > > * timer_tick_trigger_
enums::VoiceAssistantEvent event_type
void client_subscription(api::APIConnection *client, bool subscribe)
virtual bool has_buffered_data() const =0
std::vector< VoiceAssistantEventData > data
std::string get_client_combined_info() const
uint32_t conversation_timeout_
size_t speaker_bytes_received_
uint8_t * speaker_buffer_
uint8_t noise_suppression_level_
void trigger(Ts... x)
Inform the parent automation that the event has triggered.
bool status_has_error() const
Trigger< Timer > * timer_started_trigger_
Trigger * listening_trigger_
void status_set_error(const char *message="unspecified")
media_player::MediaPlayer * media_player_
void start()
Start running the loop continuously.
Trigger * client_connected_trigger_
bool send_voice_assistant_request(const VoiceAssistantRequest &msg)
void reset_conversation_id()
speaker::Speaker * speaker_
void stop()
Stop running the loop continuously.
void set_state_(State state)
Trigger * stt_vad_start_trigger_
bool send_voice_assistant_announce_finished(const VoiceAssistantAnnounceFinished &msg)
void deallocate(T *p, size_t n)
std::unique_ptr< RingBuffer > ring_buffer_
bool send_voice_assistant_audio(const VoiceAssistantAudio &msg)
void on_audio(const api::VoiceAssistantAudio &msg)
Trigger * client_disconnected_trigger_
void status_clear_error()
void on_timer_event(const api::VoiceAssistantTimerEventResponse &msg)
Trigger * stt_vad_end_trigger_
struct sockaddr_storage dest_addr_
virtual size_t read(int16_t *buf, size_t len)=0
Trigger * wake_word_detected_trigger_
Trigger * tts_stream_end_trigger_
virtual void mark_failed()
Mark this component as failed.
Implementation of SPI Controller mode.
size_t speaker_buffer_size_
std::string to_string() const
Trigger * intent_start_trigger_
void on_announce(const api::VoiceAssistantAnnounceRequest &msg)
virtual size_t play(const uint8_t *data, size_t length, TickType_t ticks_to_wait)
Plays the provided audio data.
An STL allocator that uses SPI or internal RAM.
static std::unique_ptr< RingBuffer > create(size_t len)
Trigger< Timer > * timer_updated_trigger_
void start_playback_timeout_()
size_t speaker_buffer_index_
void on_event(const api::VoiceAssistantEventResponse &msg)
Trigger< std::string > * stt_end_trigger_
void request_start(bool continuous, bool silence_detection)
Trigger< std::string, std::string > * error_trigger_
std::unique_ptr< Socket > socket(int domain, int type, int protocol)
Create a socket of the given domain, type and protocol.