├── Makefile ├── Makefile.am ├── README.md └── mod_ppmessagespeechdetect.c /Makefile: -------------------------------------------------------------------------------- 1 | BASE=../../../.. 2 | include $(BASE)/build/modmake.rules 3 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | include $(top_srcdir)/build/modmake.rulesam 2 | MODNAME=mod_ppmessagespeechdetect 3 | 4 | mod_LTLIBRARIES = mod_ppmessagespeechdetect.la 5 | mod_ppmessage_la_SOURCES = mod_ppmessagespeechdetect.c 6 | mod_ppmessage_la_CFLAGS = $(AM_CFLAGS) 7 | mod_ppmessage_la_LIBADD = $(switch_builddir)/libfreeswitch.la 8 | mod_ppmessage_la_LDFLAGS = -avoid-version -module -no-undefined -shared 9 | 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | FreeSWITCH speech detection module for PPMESSAGE 2 | 3 | Speech detection for FreeSWITCH. It implements the standard FreeSWITCH ASR interface and sends the detected voice data via ESL. 4 | 5 | -------------------------------------------------------------------------------- /mod_ppmessagespeechdetect.c: -------------------------------------------------------------------------------- 1 | /* 2 | * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application 3 | * Copyright (C) 2005-2013, Anthony Minessale II 4 | * 5 | * Version: MPL 1.1 6 | * 7 | * The contents of this file are subject to the Mozilla Public License Version 8 | * 1.1 (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * http://www.mozilla.org/MPL/ 11 | * 12 | * Software distributed under the License is distributed on an "AS IS" basis, 13 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the License 14 | * for the specific language governing rights and limitations under the 15 | * License. 16 | * 17 | * The Original Code is FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application 18 | * 19 | * The Initial Developer of the Original Code is 20 | * Anthony Minessale II 21 | * Portions created by the Initial Developer are Copyright (C) 22 | * the Initial Developer. All Rights Reserved. 23 | * 24 | * Contributor(s): 25 | * 26 | * Guijin Ding 27 | * 28 | * mod_ppmessage - PPMessage 29 | * 30 | * 31 | */ 32 | 33 | #include 34 | 35 | SWITCH_MODULE_LOAD_FUNCTION(mod_ppmessage_load); 36 | SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_ppmessage_shutdown); 37 | SWITCH_MODULE_DEFINITION(mod_ppmessage, mod_ppmessage_load, mod_ppmessage_shutdown, NULL); 38 | 39 | 40 | #define MAX_SEGMENTS 1024 41 | 42 | static switch_mutex_t *MUTEX = NULL; 43 | static switch_event_node_t *NODE = NULL; 44 | 45 | typedef enum { 46 | AVD_NULL = (1 << 0), 47 | AVD_VOICE = (1 << 1), 48 | AVD_SILENCE = (1 << 2) 49 | } avd_status_t; 50 | 51 | typedef struct { 52 | char* speech; 53 | int length; 54 | int index; 55 | } avd_segment_t; 56 | 57 | static struct { 58 | char *model8k; 59 | char *model16k; 60 | char *dictionary; 61 | char *language_weight; 62 | uint32_t thresh; 63 | int no_input_timeout; 64 | int speech_timeout; 65 | switch_bool_t start_input_timers; 66 | int confidence_threshold; 67 | uint32_t silence_hits; 68 | uint32_t listen_hits; 69 | int auto_reload; 70 | switch_memory_pool_t *pool; 71 | } globals; 72 | 73 | typedef enum { 74 | PSFLAG_HAS_TEXT = (1 << 0), 75 | PSFLAG_READY = (1 << 1), 76 | PSFLAG_BARGE = (1 << 2), 77 | PSFLAG_ALLOCATED = (1 << 3), 78 | PSFLAG_INPUT_TIMERS = (1 << 4), 79 | PSFLAG_START_OF_SPEECH = (1 << 5), 80 | PSFLAG_NOINPUT_TIMEOUT = (1 << 6), 81 | PSFLAG_SPEECH_TIMEOUT = (1 << 7), 82 | PSFLAG_NOINPUT = (1 << 8), 83 | PSFLAG_NOMATCH = (1 << 9) 84 | } psflag_t; 85 | 86 | typedef struct { 87 | avd_status_t pre_status; 88 | avd_status_t 
avd_status; 89 | avd_segment_t **segments; 90 | int segment_index; 91 | 92 | uint32_t flags; 93 | switch_mutex_t *flag_mutex; 94 | uint32_t org_voice_hits; 95 | uint32_t org_silence_hits; 96 | uint32_t thresh; 97 | uint32_t voice_hits; 98 | uint32_t silence_hits; 99 | uint32_t listen_hits; 100 | uint32_t listening; 101 | uint32_t countdown; 102 | int no_input_timeout; 103 | int speech_timeout; 104 | switch_bool_t start_input_timers; 105 | switch_time_t silence_time; 106 | int confidence_threshold; 107 | char *hyp; 108 | char *grammar; 109 | int32_t score; 110 | int32_t confidence; 111 | char const *uttid; 112 | } ppmessage_t; 113 | 114 | 115 | static double get_avg_energy_score(int16_t* data, unsigned int samples) 116 | { 117 | uint32_t score, j = 0; 118 | double energy = 0; 119 | 120 | /* Do simple energy threshold for VAD */ 121 | for (j = 0; j < samples; j++) { 122 | energy += abs(data[j]); 123 | } 124 | 125 | score = (uint32_t) (energy / samples); 126 | 127 | //switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "score %u\n", score); 128 | return score; 129 | } 130 | 131 | static void init_segments(ppmessage_t* ps) 132 | { 133 | ps->segment_index = 0; 134 | memset(ps->segments, 0, sizeof(avd_segment_t*) * MAX_SEGMENTS); 135 | return; 136 | } 137 | 138 | static void push_segment(ppmessage_t* ps, void* data, unsigned int len) 139 | { 140 | avd_segment_t* segment = NULL; 141 | void* speech = NULL; 142 | 143 | speech = malloc(len); 144 | segment = (avd_segment_t*)malloc(sizeof(avd_segment_t)); 145 | 146 | memset(segment, 0, sizeof(avd_segment_t)); 147 | memcpy(speech, data, len); 148 | 149 | segment->index = ps->segment_index; 150 | segment->speech = speech; 151 | segment->length = len; 152 | 153 | ps->segments[ps->segment_index] = segment; 154 | ps->segment_index++; 155 | return; 156 | } 157 | 158 | static char* concat_segments(ppmessage_t* ps) 159 | { 160 | int i, len, offset, need_bytes; 161 | char *dst, *encoded; 162 | 163 | if (!ps->segment_index) { 164 | 
return NULL; 165 | } 166 | 167 | len = 0; 168 | for (i = 0; i < ps->segment_index; i++) { 169 | len += ps->segments[i]->length; 170 | } 171 | 172 | dst = (char*)malloc(len); 173 | offset = 0; 174 | for (i = 0; i < ps->segment_index; i++) { 175 | memcpy(&dst[offset], ps->segments[i]->speech, ps->segments[i]->length); 176 | offset += ps->segments[i]->length; 177 | free(ps->segments[i]->speech); 178 | free(ps->segments[i]); 179 | ps->segments[i] = NULL; 180 | } 181 | 182 | need_bytes = 3 * len + 1; 183 | encoded = (char*)malloc(need_bytes); 184 | memset(encoded, 0, need_bytes); 185 | switch_b64_encode((unsigned char*)dst, len, (unsigned char*)encoded, (unsigned int)need_bytes); 186 | free(dst); 187 | 188 | return encoded; 189 | } 190 | 191 | static void avd_status_null(ppmessage_t *ps, int16_t *data, unsigned int samples) 192 | { 193 | double score = get_avg_energy_score(data, samples); 194 | 195 | ps->voice_hits = 0; 196 | ps->silence_hits = 0; 197 | 198 | if (score >= ps->thresh) { 199 | init_segments(ps); 200 | ps->avd_status = AVD_VOICE; 201 | return; 202 | } 203 | 204 | ps->avd_status = AVD_SILENCE; 205 | return; 206 | } 207 | 208 | static void avd_status_silence(ppmessage_t *ps, int16_t *data, unsigned int samples) 209 | { 210 | double score = get_avg_energy_score(data, samples); 211 | 212 | if (score >= ps->thresh) { 213 | ps->voice_hits = ps->voice_hits + 1; 214 | if (ps->voice_hits >= ps->org_voice_hits) { 215 | init_segments(ps); 216 | ps->avd_status = AVD_VOICE; 217 | } 218 | return; 219 | } 220 | 221 | ps->voice_hits = 0; 222 | return; 223 | } 224 | 225 | static void avd_status_voice(ppmessage_t *ps, int16_t *data, unsigned int samples) 226 | { 227 | double score = get_avg_energy_score(data, samples); 228 | 229 | if (score < ps->thresh) { 230 | ps->silence_hits = ps->silence_hits + 1; 231 | if (ps->silence_hits >= ps->org_silence_hits) { 232 | ps->avd_status = AVD_SILENCE; 233 | } 234 | return; 235 | } 236 | 237 | if (ps->segment_index + 1 == MAX_SEGMENTS) 
{ 238 | ps->avd_status = AVD_SILENCE; 239 | return; 240 | } 241 | 242 | ps->silence_hits = 0; 243 | return; 244 | 245 | } 246 | 247 | static void transfer_avd_status(ppmessage_t *ps, int16_t *data, unsigned int samples) 248 | { 249 | ps->pre_status = ps->avd_status; 250 | 251 | if (ps->avd_status == AVD_NULL) { 252 | avd_status_null(ps, data, samples); 253 | return; 254 | } 255 | 256 | if (ps->avd_status == AVD_SILENCE) { 257 | avd_status_silence(ps, data, samples); 258 | return; 259 | } 260 | 261 | if (ps->avd_status == AVD_VOICE) { 262 | avd_status_voice(ps, data, samples); 263 | return; 264 | } 265 | 266 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "error avd status %d\n", ps->avd_status); 267 | return; 268 | } 269 | 270 | /*! function to open the asr interface */ 271 | static switch_status_t ppmessage_asr_open(switch_asr_handle_t *ah, const char *codec, int rate, const char *dest, switch_asr_flag_t *flags) 272 | { 273 | 274 | ppmessage_t *ps; 275 | 276 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ppmessage_asr_open ....\n"); 277 | 278 | if (!(ps = (ppmessage_t *) switch_core_alloc(ah->memory_pool, sizeof(*ps)))) { 279 | return SWITCH_STATUS_MEMERR; 280 | } 281 | 282 | ps->avd_status = AVD_NULL; 283 | ps->pre_status = AVD_NULL; 284 | ps->segments = (avd_segment_t**)malloc(sizeof(avd_segment_t*) * MAX_SEGMENTS); 285 | init_segments(ps); 286 | 287 | switch_mutex_init(&ps->flag_mutex, SWITCH_MUTEX_NESTED, ah->memory_pool); 288 | ah->private_info = ps; 289 | 290 | if (rate == 8000) { 291 | ah->rate = 8000; 292 | } else if (rate == 16000) { 293 | ah->rate = 16000; 294 | } else { 295 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Invalid rate %d. 
Only 8000 and 16000 are supported.\n", rate); 296 | } 297 | 298 | codec = "L16"; 299 | 300 | ah->codec = switch_core_strdup(ah->memory_pool, codec); 301 | 302 | globals.thresh = 300; 303 | globals.silence_hits = 10; 304 | 305 | ps->thresh = globals.thresh; 306 | ps->org_silence_hits = globals.silence_hits; 307 | ps->silence_hits = 0; 308 | ps->voice_hits = 0; 309 | ps->org_voice_hits = 0; 310 | 311 | ps->listen_hits = globals.listen_hits; 312 | ps->start_input_timers = globals.start_input_timers; 313 | ps->no_input_timeout = globals.no_input_timeout; 314 | ps->speech_timeout = globals.speech_timeout; 315 | ps->confidence_threshold = globals.confidence_threshold; 316 | 317 | return SWITCH_STATUS_SUCCESS; 318 | } 319 | 320 | /*! function to load a grammar to the asr interface */ 321 | static switch_status_t ppmessage_asr_load_grammar(switch_asr_handle_t *ah, const char *grammar, const char *name) 322 | { 323 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ppmessage_asr_load_grammar ....\n"); 324 | return SWITCH_STATUS_SUCCESS; 325 | } 326 | 327 | /*! function to unload a grammar to the asr interface */ 328 | static switch_status_t ppmessage_asr_unload_grammar(switch_asr_handle_t *ah, const char *name) 329 | { 330 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ppmessage_asr_unload_grammar ....\n"); 331 | 332 | return SWITCH_STATUS_SUCCESS; 333 | } 334 | 335 | /*! function to close the asr interface */ 336 | static switch_status_t ppmessage_asr_close(switch_asr_handle_t *ah, switch_asr_flag_t *flags) 337 | { 338 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ppmessage_asr_close ....\n"); 339 | ppmessage_t *ps = (ppmessage_t *) ah->private_info; 340 | switch_safe_free(ps->segments); 341 | return SWITCH_STATUS_SUCCESS; 342 | } 343 | 344 | /*! 
function to feed audio to the ASR */ 345 | static switch_status_t ppmessage_asr_feed(switch_asr_handle_t *ah, void *data, unsigned int len, switch_asr_flag_t *flags) 346 | { 347 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ppmessage_asr_feed: %d\n", len); 348 | ppmessage_t *ps = (ppmessage_t *) ah->private_info; 349 | transfer_avd_status(ps, (int16_t *) data, len / 2); 350 | 351 | if (ps->avd_status == AVD_VOICE) { 352 | push_segment(ps, data, len); 353 | } 354 | return SWITCH_STATUS_SUCCESS; 355 | } 356 | 357 | /*! function to pause recognizer */ 358 | static switch_status_t ppmessage_asr_pause(switch_asr_handle_t *ah) 359 | { 360 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ppmessage_asr_pause ....\n"); 361 | return SWITCH_STATUS_SUCCESS; 362 | } 363 | 364 | /*! function to resume recognizer */ 365 | static switch_status_t ppmessage_asr_resume(switch_asr_handle_t *ah) 366 | { 367 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ppmessage_asr_resume ....\n"); 368 | return SWITCH_STATUS_SUCCESS; 369 | } 370 | 371 | /*! function to read results from the ASR*/ 372 | static switch_status_t ppmessage_asr_check_results(switch_asr_handle_t *ah, switch_asr_flag_t *flags) 373 | { 374 | ppmessage_t *ps = (ppmessage_t *) ah->private_info; 375 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "asr_check_results.\n"); 376 | 377 | /* if (ps->avd_status == AVD_VOICE) { */ 378 | /* return SWITCH_STATUS_SUCCESS; */ 379 | /* } */ 380 | 381 | if (ps->avd_status == AVD_SILENCE && ps->pre_status == AVD_VOICE) { 382 | return SWITCH_STATUS_SUCCESS; 383 | } 384 | 385 | return SWITCH_STATUS_FALSE; 386 | } 387 | 388 | /*! 
function to read results from the ASR */ 389 | static switch_status_t ppmessage_asr_get_results(switch_asr_handle_t *ah, char **xmlstr, switch_asr_flag_t *flags) 390 | { 391 | ppmessage_t *ps = (ppmessage_t *) ah->private_info; 392 | 393 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "asr_get_results.\n"); 394 | if (ps->avd_status == AVD_VOICE) { 395 | return SWITCH_STATUS_BREAK; 396 | } 397 | 398 | if (ps->pre_status == AVD_VOICE && ps->avd_status == AVD_SILENCE) { 399 | char* segments = concat_segments(ps); 400 | if (segments == NULL) { 401 | return SWITCH_STATUS_BREAK; 402 | } 403 | *xmlstr = segments; 404 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "%lu.\n", strlen(*xmlstr)); 405 | return SWITCH_STATUS_SUCCESS; 406 | } 407 | 408 | return SWITCH_STATUS_BREAK; 409 | } 410 | 411 | /*! function to start input timeouts */ 412 | static switch_status_t ppmessage_asr_start_input_timers(switch_asr_handle_t *ah) 413 | { 414 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ppmessage_asr_start_input_timers ....\n"); 415 | return SWITCH_STATUS_SUCCESS; 416 | } 417 | 418 | /*! set text parameter */ 419 | static void ppmessage_asr_text_param(switch_asr_handle_t *ah, char *param, const char *val) 420 | { 421 | return; 422 | } 423 | 424 | /*! set numeric parameter */ 425 | static void ppmessage_asr_numeric_param(switch_asr_handle_t *ah, char *param, int val) 426 | { 427 | return; 428 | } 429 | 430 | /*! 
set float parameter */ 431 | static void ppmessage_asr_float_param(switch_asr_handle_t *ah, char *param, double val) 432 | { 433 | return; 434 | } 435 | 436 | static switch_status_t load_config(void) 437 | { 438 | return SWITCH_STATUS_SUCCESS; 439 | } 440 | 441 | static void do_load(void) 442 | { 443 | switch_mutex_lock(MUTEX); 444 | load_config(); 445 | switch_mutex_unlock(MUTEX); 446 | } 447 | 448 | static void event_handler(switch_event_t *event) 449 | { 450 | if (globals.auto_reload) { 451 | do_load(); 452 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "PPMessage Reloaded\n"); 453 | } 454 | } 455 | 456 | SWITCH_MODULE_LOAD_FUNCTION(mod_ppmessage_load) 457 | { 458 | switch_asr_interface_t *asr_interface; 459 | 460 | switch_mutex_init(&MUTEX, SWITCH_MUTEX_NESTED, pool); 461 | 462 | globals.pool = pool; 463 | 464 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "mod_ppmessage_load!\n"); 465 | if ((switch_event_bind_removable(modname, SWITCH_EVENT_RELOADXML, NULL, event_handler, NULL, &NODE) != SWITCH_STATUS_SUCCESS)) { 466 | switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't bind!\n"); 467 | } 468 | 469 | do_load(); 470 | 471 | /* connect my internal structure to the blank pointer passed to me */ 472 | *module_interface = switch_loadable_module_create_module_interface(pool, modname); 473 | 474 | asr_interface = switch_loadable_module_create_interface(*module_interface, SWITCH_ASR_INTERFACE); 475 | asr_interface->interface_name = "ppmessage"; 476 | asr_interface->asr_open = ppmessage_asr_open; 477 | asr_interface->asr_load_grammar = ppmessage_asr_load_grammar; 478 | asr_interface->asr_unload_grammar = ppmessage_asr_unload_grammar; 479 | asr_interface->asr_close = ppmessage_asr_close; 480 | asr_interface->asr_feed = ppmessage_asr_feed; 481 | asr_interface->asr_resume = ppmessage_asr_resume; 482 | asr_interface->asr_pause = ppmessage_asr_pause; 483 | asr_interface->asr_check_results = ppmessage_asr_check_results; 484 | 
asr_interface->asr_get_results = ppmessage_asr_get_results; 485 | asr_interface->asr_start_input_timers = ppmessage_asr_start_input_timers; 486 | asr_interface->asr_text_param = ppmessage_asr_text_param; 487 | asr_interface->asr_numeric_param = ppmessage_asr_numeric_param; 488 | asr_interface->asr_float_param = ppmessage_asr_float_param; 489 | 490 | /* indicate that the module should continue to be loaded */ 491 | return SWITCH_STATUS_SUCCESS; 492 | } 493 | 494 | SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_ppmessage_shutdown) 495 | { 496 | switch_event_unbind(&NODE); 497 | return SWITCH_STATUS_UNLOAD; 498 | } 499 | 500 | 501 | /* For Emacs: 502 | * Local Variables: 503 | * mode:c 504 | * indent-tabs-mode:t 505 | * tab-width:4 506 | * c-basic-offset:4 507 | * End: 508 | * For VIM: 509 | * vim:set softtabstop=4 shiftwidth=4 tabstop=4 noet: 510 | */ 511 | --------------------------------------------------------------------------------