├── .gitignore ├── LICENSE ├── PREREQUISITES.md ├── README.md ├── docs ├── QA │ ├── cosyvoice.md │ ├── ffmpeg.md │ └── paraformer.md └── image │ ├── ai-assistant.png │ ├── asr_realtime.png │ ├── groups.png │ ├── html-asr.png │ ├── js-cosyvoice.png │ ├── js-paraformer.png │ ├── logo.svg │ ├── translator.png │ └── tts-with-subtitles │ ├── java-tts-with-subtitles.png │ └── python-tts-with-subtitles.png └── samples ├── .dev_tools ├── google-java-format-1.7-all-deps.jar └── run_ci.sh ├── .pre-commit-config.yaml ├── gallery ├── cosyvoice-js │ ├── README.md │ ├── audio_player.js │ ├── cosyvoice_api.js │ └── index.html ├── input-audio-out-text-html │ └── python │ │ ├── README.md │ │ ├── audio_recorder.js │ │ ├── index.html │ │ ├── recorder_worklet.js │ │ ├── requirements.txt │ │ └── server.py ├── input-text-out-audio-html-ai-assistant │ └── python │ │ ├── README.md │ │ ├── audio_player.js │ │ ├── index.html │ │ ├── requirements.txt │ │ └── server.py ├── paraformer-realtime-js │ ├── README.md │ ├── audio_recorder.js │ ├── index.html │ ├── paraformer_realtime_api.js │ └── recorder_worklet.js ├── read-and-display-subtitles │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── reading-story-in-multiple-role │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ ├── run.py │ │ └── story.json ├── recognize_speech_from_video_and_decode_to_opus │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── record-from-microphone-and-display-realtime-subtitle │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py └── translate-audio-from-microphone-and-play-in-realtime │ └── python │ ├── README.md │ ├── requirements.txt │ └── run.py ├── lint.sh ├── sample-data ├── asr_example_chat.wav ├── hello_world_male_16k_16bit_mono.wav ├── sample_audio.mp3 ├── sample_for_incalculable_value.mp4 ├── sample_video_poetry.mp4 └── sample_video_story.mp4 ├── speech-plus └── transcribe-video-and-do-translation-summarization-and-qa │ ├── README.md │ └── python │ ├── README.md │ ├── ossUtil.py │ ├── requirements.txt │ └── run.py ├── speech-recognition ├── recognize_speech_and_rich_information_from_files_by_batch_mode │ ├── README.md │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ └── examples │ │ │ │ └── speech_recognition │ │ │ │ └── TranscriptFilesByRestfulApi.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── recognize_speech_from_files_by_batch_mode │ ├── README.md │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ └── examples │ │ │ │ └── speech_recognition │ │ │ │ └── TranscriptFilesByRestfulApi.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── recognize_speech_from_files_by_realtime_mode │ ├── README.md │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ └── examples │ │ │ │ └── speech_recognition │ │ │ │ └── RecognizeSpeechFromFilesByAsyncRealtimeApi.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── recognize_speech_from_microphone │ ├── README.md │ ├── java │ │ ├── README.md │ 
│ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ └── examples │ │ │ │ └── speech_recognition │ │ │ │ └── RecognizeSpeechFromMicrophoneUsingFlowable.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── recognize_speech_from_single_file │ ├── README.md │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ └── examples │ │ │ │ └── speech_recognition │ │ │ │ └── RecognizeSpeechFromSingleFile.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── translate_speech_from_files_by_realtime_mode │ ├── README.md │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ └── examples │ │ │ │ └── speech_recognition │ │ │ │ └── TranslateFromFilesByAsyncRealtimeApi.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── translate_speech_from_files_for_one_sentence_by_realtime_mode │ ├── README.md │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ └── examples │ │ │ │ └── speech_recognition │ │ │ │ └── TranslateFromFilesForOneSentenceByAsyncApi.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── translate_speech_from_microphone_for_one_sentence │ ├── README.md │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ └── examples │ │ │ │ └── speech_recognition │ │ │ │ └── OneSentenceTranslateFromMic.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py └── translate_speech_from_microphone_for_realtime_stream │ ├── README.md │ ├── java │ ├── README.md │ ├── pom.xml │ ├── run.bat │ ├── run.sh │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── alibaba │ │ │ └── speech │ │ │ └── examples │ │ │ └── speech_recognition │ │ │ └── TranslateSpeechFromMicrophone.java │ │ └── resources │ │ └── logback.xml │ └── python │ ├── README.md │ ├── requirements.txt │ └── run.py ├── speech-synthesizer ├── synthesize_speech_from_llm_by_streaming_mode │ ├── README.md │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ ├── examples │ │ │ │ └── speech_synthesizer │ │ │ │ │ └── SynthesizeSpeechFromLlmByStreamingMode.java │ │ │ │ └── utils │ │ │ │ └── RealtimeMp3Player.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── synthesize_speech_from_text │ ├── README.md │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ └── examples │ │ │ │ └── speech_synthesizer │ │ │ │ └── SynthesizeSpeechFromTextAndSave.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── 
requirements.txt │ │ └── run.py ├── synthesize_speech_from_text_by_streaming_mode │ ├── README.md │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ ├── examples │ │ │ │ └── speech_synthesizer │ │ │ │ │ └── SynthesizeSpeechFromTextByStreamingMode.java │ │ │ │ └── utils │ │ │ │ └── RealtimeMp3Player.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── synthesize_speech_from_text_concurrently │ ├── README.md │ ├── java │ │ ├── README.md │ │ ├── pom.xml │ │ ├── run.bat │ │ ├── run.sh │ │ └── src │ │ │ └── main │ │ │ ├── java │ │ │ └── org │ │ │ │ └── alibaba │ │ │ │ └── speech │ │ │ │ └── examples │ │ │ │ └── speech_synthesizer │ │ │ │ └── SynthesizeTextToSpeechWithCallbackConcurrently.java │ │ │ └── resources │ │ │ └── logback.xml │ └── python │ │ ├── README.md │ │ ├── requirements.txt │ │ └── run.py ├── synthesize_speech_from_text_using_asyncio │ └── python │ │ ├── README.md │ │ └── run.py └── synthesize_speech_from_text_with_cloned_voice │ ├── README.md │ ├── java │ ├── README.md │ ├── pom.xml │ ├── run.bat │ ├── run.sh │ └── src │ │ └── main │ │ ├── java │ │ └── org │ │ │ └── alibaba │ │ │ └── speech │ │ │ ├── examples │ │ │ └── speech_synthesizer │ │ │ │ └── CloneVoiceAndSynthesisTextAndPlay.java │ │ │ └── utils │ │ │ └── RealtimeMp3Player.java │ │ └── resources │ │ └── logback.xml │ └── python │ ├── README.md │ ├── record.py │ ├── requirements.txt │ └── run.py └── utils └── python ├── AudioDecoder.py ├── RealtimeMp3Player.py └── TranscriptionResultUtil.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | 4 | samples/speech-synthesizer/synthesize_speech_from_text_concurrently/python/results 5 | samples/speech-synthesizer/synthesize_speech_from_text/python/*.mp3 6 | samples/speech-recognition/recognize_speech_from_files_by_batch_mode/python/*.json 7 | samples/speech-recognition/recognize_speech_with_paralinguistics_from_files_by_batch_mode/python/*.json 8 | samples/speech-recognition/recognize_speech_from_single_file/python/*.json 9 | samples/speech-recognition/recognize_speech_from_single_file/python/*.json 10 | samples/speech-synthesizer/synthesize_speech_from_text_by_streaming_mode/python/*.mp3 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Alibaba Cloud 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PREREQUISITES.md: -------------------------------------------------------------------------------- 1 | # 运行示例代码的前提条件 2 | 3 | ## :point_right: 概述 4 | 通义实验室语音大模型(包括[CosyVoice](https://fun-audio-llm.github.io/)、[Paraformer](https://github.com/modelscope/FunASR)、[SenseVoice](https://fun-audio-llm.github.io/)等)可以通过阿里云百炼提供的API服务进行调用,实现语音识别(语音转文字)、语音生成(文字转语音)等功能。这些功能通过与阿里云百炼上的大语言模型API服务(包括通义千问、百川、月之暗面、零一万物、MiniMax等)结合,还可以实现语音聊天对话、语音分析理解、语音翻译等高阶AI功能。 5 | 6 | 运行本代码库中的示例代码需要调用阿里云百炼模型服务,该服务会提供一定的免费额度供开发者试用各个模型。要获得免费试用的额度并运行本代码库中的示例,开发者只需要简单地遵循以下四个步骤以满足前提条件: 7 | 8 | 1. 开通阿里云账号 9 | 1. 开通阿里云百炼模型服务 10 | 1. 创建阿里云百炼模型服务API-KEY并将其配置到环境变量 11 | 1. 安装阿里云百炼SDK(DashScope SDK) 12 | 13 | ## :point_right: 免费开通及配置安装步骤 14 | 1. ### 开通阿里云账号 15 | 16 | 您需要开通阿里云账号以使用阿里云百炼模型服务。有关开通阿里云账号的操作,请参见[开通阿里云账号](https://help.aliyun.com/zh/account/user-guide/ali-cloud-account-registration-process)。 17 | 18 | 1. ### 开通阿里云百炼模型服务 19 | 20 | 登录阿里云账号后,您还需要开通阿里云百炼模型服务。有关开通阿里云百炼模型服务的操作,请参见[开通阿里云百炼大模型服务平台](https://help.aliyun.com/zh/model-studio/getting-started/activate-alibaba-cloud-model-studio)。 21 | 22 | 23 | 1. ### 创建阿里云百炼模型服务API-KEY并将其配置到环境变量 24 | 25 | 阿里云百炼模型服务的所有模型均通过统一的API-KEY进行调用,您需要通过控制台创建自己的API-KEY。有关创建阿里云百炼API-KEY的操作,请参见[API-KEY管理](https://help.aliyun.com/zh/model-studio/user-guide/api-key-management)。 26 | 27 | 通过在环境变量中配置API-KEY,您可以避免在运行示例代码时通过明文显式地指定API-KEY,从而降低API-KEY泄漏的风险。有关在环境变量中配置API-KEY的操作,请参见[通过环境变量配置API-KEY](https://help.aliyun.com/zh/model-studio/developer-reference/configure-api-key-through-environment-variables)。 28 | 29 | 30 | 1. ### 安装阿里云百炼SDK(DashScope SDK) 31 | 32 | #### Python 33 | 34 | - 前提条件 35 | 36 | 已安装Python 3.8及以上版本。 37 | 38 | - 操作步骤 39 | 40 | ```bash 41 | pip3 install dashscope 42 | ``` 43 | 44 | #### Java 45 | 46 | - 前提条件 47 | 48 | 已安装JDK 1.8及以上版本。DashScope Java SDK版本请参见[Maven](https://mvnrepository.com/artifact/com.alibaba/dashscope-sdk-java)。 49 | 50 | - 操作步骤 51 | 52 | 在pom.xml中添加以下Maven依赖,并将the-latest-version替换为最新版本。 53 | 54 | ```xml 55 | <dependency> 56 |     <groupId>com.alibaba</groupId> 57 |     <artifactId>dashscope-sdk-java</artifactId> 58 |     <version>the-latest-version</version> 59 | </dependency> 60 | ``` 61 | 62 | 或者通过Gradle添加依赖。 63 | 64 | ```groovy 65 | // https://mvnrepository.com/artifact/com.alibaba/dashscope-sdk-java 66 | implementation group: 'com.alibaba', name: 'dashscope-sdk-java', version: 'the-latest-version' 67 | ``` 68 | 69 | ## :point_right: 下一步 70 | 71 | 在成功完成以上步骤后,请转到[应用场景与开发示例](https://github.com/aliyun/alibabacloud-bailian-speech-demo#point_right-应用场景与开发示例)章节,根据您所感兴趣的应用场景选择示例运行。 72 | 73 | ## :point_right: 技术支持 74 | 75 | -------------------------------------------------------------------------------- /docs/QA/ffmpeg.md: -------------------------------------------------------------------------------- 1 | # 如何安装ffmpeg 2 | 3 | FFmpeg 是一个开源跨平台多媒体框架,用于编解码、转换、播放、录制、流式传输、分析多媒体文件。它提供了一组用于处理音频、视频和字幕的库和应用程序。在本项目的部分示例中,我们使用了ffmpeg的下述功能: 4 | 1. 以流式输入、流式输出的方式将mp3格式音频解码为pcm格式。 5 | 2. 以流式输入、流式输出的方式提取mp4格式视频中的音轨并转为pcm格式音频。 6 | 3. 将mp4格式视频中的音轨保存为opus格式音频文件。 7 | 8 | ## 如何在 macOS 安装 ffmpeg 9 | 10 | 可以通过Homebrew直接安装: 11 | 12 | ```bash 13 | brew install ffmpeg 14 | ``` 15 | 16 | ## 如何在 Windows 安装 ffmpeg 17 | 18 | 1. 下载安装包:请参考[ffmpeg官网](https://www.ffmpeg.org/download.html#build-windows),下载已经编译好的可执行文件。 19 | 2.
解压缩:将下载好的zip/7z文件解压缩,并且进入`bin`目录,复制`bin`目录的路径。 20 | 3. 在`设置`中搜索`编辑系统环境变量`,在`环境变量`子窗口中选择编辑用户变量`Path`,选择`新建`并将ffmpeg的bin路径复制到新建的变量中。 21 | 22 | ## 如何通过源码安装ffmpeg 23 | 24 | 在Linux系统或其他不支持直接安装ffmpeg的操作系统中,可以通过源码安装ffmpeg。 25 | 1. 下载ffmpeg源码:从[ffmpeg官网](https://www.ffmpeg.org/download.html#build-linux)下载ffmpeg源码。 26 | 2. 编译并安装: 27 | ```bash 28 | cd ffmpeg 29 | ./configure --prefix=/usr/local/ffmpeg --enable-openssl --disable-x86asm 30 | make && make install 31 | ``` 32 | 33 | ## 检查是否安装成功 34 | 35 | 请确保成功安装ffmpeg并且将ffmpeg加入环境变量。 36 | 在终端执行`ffmpeg -version`命令,如果输出版本信息,则表示安装成功。 37 | 输出示例: 38 | ``` 39 | ffmpeg version 7.0.1 Copyright (c) 2000-2024 the FFmpeg developers 40 | built with Apple clang version 15.0.0 (clang-1500.3.9.4) 41 | configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.0.1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack --disable-indev=jack --enable-videotoolbox --enable-audiotoolbox --enable-neon 42 | libavutil 59. 8.100 / 59. 8.100 43 | libavcodec 61. 3.100 / 61. 3.100 44 | libavformat 61. 1.100 / 61. 1.100 45 | libavdevice 61. 1.100 / 61. 1.100 46 | libavfilter 10. 1.100 / 10. 1.100 47 | libswscale 8. 1.100 / 8. 1.100 48 | libswresample 5. 1.100 / 5. 1.100 49 | libpostproc 58. 1.100 / 58. 
1.100 50 | ``` -------------------------------------------------------------------------------- /docs/image/ai-assistant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/docs/image/ai-assistant.png -------------------------------------------------------------------------------- /docs/image/asr_realtime.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/docs/image/asr_realtime.png -------------------------------------------------------------------------------- /docs/image/groups.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/docs/image/groups.png -------------------------------------------------------------------------------- /docs/image/html-asr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/docs/image/html-asr.png -------------------------------------------------------------------------------- /docs/image/js-cosyvoice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/docs/image/js-cosyvoice.png -------------------------------------------------------------------------------- /docs/image/js-paraformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/docs/image/js-paraformer.png -------------------------------------------------------------------------------- /docs/image/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /docs/image/translator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/docs/image/translator.png -------------------------------------------------------------------------------- /docs/image/tts-with-subtitles/java-tts-with-subtitles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/docs/image/tts-with-subtitles/java-tts-with-subtitles.png -------------------------------------------------------------------------------- /docs/image/tts-with-subtitles/python-tts-with-subtitles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/docs/image/tts-with-subtitles/python-tts-with-subtitles.png -------------------------------------------------------------------------------- /samples/.dev_tools/google-java-format-1.7-all-deps.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/samples/.dev_tools/google-java-format-1.7-all-deps.jar -------------------------------------------------------------------------------- /samples/.dev_tools/run_ci.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # get all java files 5 | FILES_TO_CHECK=$(find . -type f -name "*.java" | grep "./*/src/.*java") 6 | 7 | HAS_CHANGES=false 8 | 9 | for FILE in $FILES_TO_CHECK; do 10 | echo "Checking file: $FILE" 11 | java -jar .dev_tools/google-java-format-1.7-all-deps.jar "$FILE" | diff "$FILE" - 12 | if [ $? -ne 0 ]; then 13 | echo "File $FILE has changes after formatting." 14 | HAS_CHANGES=true 15 | else 16 | echo "File $FILE has no changes after formatting." 17 | fi 18 | done 19 | 20 | if [ "$HAS_CHANGES" = true ]; then 21 | echo "Run formatting failed, please try to run 'sh lint.sh' and re-commit your java files!" 22 | exit 1 23 | fi 24 | 25 | mvn package 26 | 27 | if [ $? -ne 0 ]; then 28 | echo "mvn package failed, please check if any unit test failed!" 29 | exit 1 30 | fi 31 | 32 | echo "CI passed." -------------------------------------------------------------------------------- /samples/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/mirrors-yapf.git 3 | rev: v0.30.0 4 | hooks: 5 | - id: yapf 6 | exclude: | 7 | (?x)^( 8 | tests/data 9 | )$ 10 | - repo: https://github.com/pre-commit/pre-commit-hooks.git 11 | rev: v3.1.0 12 | hooks: 13 | - id: trailing-whitespace 14 | exclude: thirdparty/ 15 | - id: check-yaml 16 | exclude: thirdparty/ 17 | - id: requirements-txt-fixer 18 | exclude: thirdparty/ 19 | - id: double-quote-string-fixer 20 | exclude: thirdparty/ 21 | - id: check-merge-conflict 22 | exclude: thirdparty/ 23 | - id: fix-encoding-pragma 24 | exclude: thirdparty/ 25 | args: ["--remove"] 26 | - id: mixed-line-ending 27 | exclude: thirdparty/ 28 | args: ["--fix=lf"] 29 | -------------------------------------------------------------------------------- /samples/gallery/cosyvoice-js/README.md: -------------------------------------------------------------------------------- 1 | # CosyVoice语音合成JavaScript示例 2 | 本示例演示如何通过 JavaScript 接入百炼平台的 CosyVoice 语音合成服务。示例同时提供一个播放器模块,可以在浏览器中播放合成的流式音频。 3 | 4 | ## 前提条件 5 | 6 | #### 配置阿里云百炼API-KEY 7 | 在使用百炼SDK进行语音合成之前,您需要先在阿里云控制台开通百炼模型服务并获取API-KEY。 8 | - 在[百炼控制台](https://bailian.console.aliyun.com/)界面右上角头像位置,鼠标悬浮后,展示API-KEY,点击后进入API-KEY管理页面。 9 | - 点击【创建新的API-KEY】,会自动创建一条属于这个账号的API-KEY。列表上展示API-KEY密文,点击【查看】可以看到API-KEY的明文信息。请注意保存API-KEY的明文信息,后续使用API-KEY时需要用到。 10 | - 更多百炼配置信息请参考:[PREREQUISITES.md](../../../PREREQUISITES.md) 11 | 12 | ## 运行示例 13 | 14 | 本目录展示了前端集成CosyVoice的示例,需要在本地搭建http服务,以支持通过浏览器导入本地js脚本。 15 | 16 | 请在本目录运行一个http服务: 17 | ``` 18 | python -m http.server 9000 19 | ``` 20 | 21 | 之后您可以在浏览器输入`http://localhost:9000`打开测试网页。输入API-KEY、待合成文本,并点击`Send`按钮合成文本并播放。 22 | 23 | 24 | 25 | ### 关于流式输入说明 26 | 27 | 流式输入可以通过多次调用`sendText`实现。本示例中不进行演示。 28 | 29 | ### 关于播放器的说明 30 | 31 | 在`audio_player.js`中,我们使用 Web Audio API 开发了 PCMAudioPlayer 播放器播放流式PCM格式的音频,将16bit采样点转化为float写入audioBuffer播放,并且在上一段音频播放结束的onended回调中立刻播放下一段音频。 32 | >注意⚠️ : 33 | >1.
使用MediaSource播放流式音频是一个更加简洁的方案,但是MediaSource不支持如下浏览器:Safari、基于Safari的iOS WebView、微信小程序。更多兼容信息参见 [MediaSource](https://developer.mozilla.org/zh-CN/docs/Web/API/MediaSource) 34 | >2. 使用[openai-realtime-console](https://github.com/openai/openai-realtime-console/tree/websockets)中集成的wavtools在移动端和safari浏览器中播放时会有噪声。 35 | 36 | ### 关于鉴权和账号安全 37 | 38 | 在百炼 Websockets 服务中,由于 JavaScript 不支持添加自定义 HTTP Header,因此 API Key 需要通过 URL 参数进行传递以完成鉴权。 39 | 40 | #### 安全性说明 41 | 42 | 通过 URL 添加永久有效的 API Key 进行鉴权的方式虽然简单易用,但在安全性方面存在一定的风险: 43 | - API Key 暴露风险:API Key 直接暴露在前端代码或 URL 中,可能被恶意用户通过浏览器开发者工具、网络抓包或日志记录等方式轻易获取。 44 | - 潜在后果:一旦 API Key 泄露,攻击者可以利用其长期访问您的服务,可能导致数据泄露、资源滥用或其他安全问题。 45 | 46 | #### 免责声明 47 | 请注意,使用此方式接入服务时,您需自行承担因 API Key 泄露而导致的一切后果。我们强烈建议您采取以下措施以提升安全性: 48 | 49 | 1. 避免直接暴露永久 API Key:考虑使用短期有效的动态令牌(如 JWT)代替永久 API Key,并通过后端生成和分发这些令牌。 50 | 2. 启用 HTTPS:确保所有通信都通过加密的 HTTPS 连接进行,以防止 API Key 在传输过程中被窃取。 51 | 限制 API Key 权限范围:为 API Key 设置最小权限,确保即使泄露也不会对系统造成严重影响。 52 | 53 | 54 | 如果您对安全性有更高要求,建议部署转发服务。 -------------------------------------------------------------------------------- /samples/gallery/cosyvoice-js/audio_player.js: -------------------------------------------------------------------------------- 1 | class PCMAudioPlayer { 2 | constructor(sampleRate) { 3 | this.sampleRate = sampleRate; 4 | this.audioContext = null; 5 | this.audioQueue = []; 6 | this.isPlaying = false; 7 | this.currentSource = null; 8 | const bufferThreshold = 2; 9 | } 10 | 11 | connect() { 12 | if (!this.audioContext) { 13 | this.audioContext = new (window.AudioContext || window.webkitAudioContext)(); 14 | } 15 | } 16 | 17 | pushPCM(arrayBuffer) { 18 | this.audioQueue.push(arrayBuffer); 19 | this._playNextAudio(); 20 | } 21 | 22 | /** 23 | * 将arrayBuffer转为audioBuffer 24 | */ 25 | _bufferPCMData(pcmData) { 26 | const sampleRate = this.sampleRate; // 设置为 PCM 数据的采样率 27 | const length = pcmData.byteLength / 2; // 假设 PCM 数据为 16 位,需除以 2 28 | const audioBuffer = this.audioContext.createBuffer(1, length, sampleRate); 29 | const channelData = audioBuffer.getChannelData(0); 30 | const int16Array = new Int16Array(pcmData); // 将 PCM 数据转换为 Int16Array 31 | 32 | for (let i = 0; i < length; i++) { 33 | // 将 16 位 PCM 转换为浮点数 (-1.0 到 1.0) 34 | channelData[i] = int16Array[i] / 32768; // 16 位数据转换范围 35 | } 36 | let audioLength = length/sampleRate*1000; 37 | console.log(`prepare audio: ${length} samples, ${audioLength} ms`) 38 | 39 | return audioBuffer; 40 | } 41 | 42 | async _playAudio(arrayBuffer) { 43 | if (this.audioContext.state === 'suspended') { 44 | await this.audioContext.resume(); 45 | } 46 | 47 | const audioBuffer = this._bufferPCMData(arrayBuffer); 48 | 49 | this.currentSource = this.audioContext.createBufferSource(); 50 | this.currentSource.buffer = audioBuffer; 51 | this.currentSource.connect(this.audioContext.destination); 52 | 53 | this.currentSource.onended = () => { 54 | console.log('Audio playback ended.'); 55 | this.isPlaying = false; 56 | this.currentSource = null; 57 | this._playNextAudio(); // Play the next audio in the queue 58 | }; 59 | this.currentSource.start(); 60 | this.isPlaying = true; 61 | } 62 | 63 | _playNextAudio() { 64 | if (this.audioQueue.length > 0 && !this.isPlaying) { 65 | // 计算总的字节长度 66 | const totalLength = this.audioQueue.reduce((acc, buffer) => acc + buffer.byteLength, 0); 67 | const combinedBuffer = new Uint8Array(totalLength); 68 | let offset = 0; 69 | 70 | // 将所有 audioQueue 中的 buffer 拼接到一个新的 Uint8Array 中 71 | for (const buffer of this.audioQueue) { 72 | combinedBuffer.set(new Uint8Array(buffer), offset); 73 | offset 
+= buffer.byteLength; 74 | } 75 | 76 | // 清空 audioQueue,因为我们已经拼接完所有数据 77 | this.audioQueue = []; 78 | // 发送拼接的 audio 数据给 playAudio 79 | this._playAudio(combinedBuffer.buffer); 80 | } 81 | } 82 | stop() { 83 | if (this.currentSource) { 84 | this.currentSource.stop(); // 停止当前音频播放 85 | this.currentSource = null; // 清除音频源引用 86 | this.isPlaying = false; // 更新播放状态 87 | } 88 | this.audioQueue = []; // 清空音频队列 89 | console.log('Playback stopped and queue cleared.'); 90 | } 91 | 92 | } 93 | 94 | export default PCMAudioPlayer; -------------------------------------------------------------------------------- /samples/gallery/input-audio-out-text-html/python/README.md: -------------------------------------------------------------------------------- 1 | # 在网页中录音并进行语音识别 2 | 本项目在本地搭建了http服务和websocket语音识别服务,在网页中录音并实时显示识别结果。您可以参考这个示例项目在自己的网页中加入语音识别功能。 3 | 4 | ## 前提条件 5 | 6 | #### 安装 Python 依赖 7 | 8 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。 9 | 运行本场景DEMO依赖的环境可以通过[PyPI](https://pypi.org/)安装。 10 | 11 | 您可以使用`pip install -r requirements.txt` 命令来安装本文件夹下的requirements依赖文件。或者手动安装下方的依赖。 12 | 13 | - 导入百炼SDK 14 | ```commandline 15 | pip3 install dashscope  # 安装阿里云百炼SDK 16 | pip3 install websockets  # 安装websocket服务依赖 17 | ``` 18 | 19 | #### 配置阿里云百炼API-KEY 20 | 在使用百炼SDK进行语音识别之前,您需要先在阿里云控制台开通百炼模型服务并获取API-KEY。 21 | - 在[百炼控制台](https://bailian.console.aliyun.com/)界面右上角头像位置,鼠标悬浮后,展示API-KEY,点击后进入API-KEY管理页面。 22 | - 点击【创建新的API-KEY】,会自动创建一条属于这个账号的API-KEY。列表上展示API-KEY密文,点击【查看】可以看到API-KEY的明文信息。请注意保存API-KEY的明文信息,后续使用API-KEY时需要用到。 23 | - 更多百炼配置信息请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 24 | 25 | ## 运行示例 26 | 27 | 本目录展示了前后端分离的语音识别示例,通过websocket连接前后端,以及如何处理实时更新的识别结果。 28 | 29 | 30 | 当您点击“开始录音”按钮后,网页会和python websocket服务创建连接,开始从麦克风录制音频,并将录音实时地发送给websocket服务。服务器会调用paraformer-realtime-v2语音识别模型,并将实时语音识别结果返回给网页展示。实时识别每一句的结果会在同一行内不断更新,直到分句后进入下一行。 31 | 32 | 首先,请在环境变量中配置好百炼API-KEY,并运行`server.py`,默认会在本地的9090端口运行websocket服务。 33 | ``` 34 | export DASHSCOPE_API_KEY=xxxxxxx 35 | python server.py 36 | ``` 37 | 38 | 之后请在本目录运行一个http服务,用于支持通过浏览器访问当前目录的文件。 39 | ``` 40 | python -m http.server 9000 41 | ``` 42 | 43 | 之后您可以在浏览器输入`http://localhost:9000`打开测试网页。点击`开始录音`按钮后对麦克风说话。 44 | 45 | 46 | 47 | ## 关于录音的说明 48 | 49 | 在`audio_recorder.js`中,我们使用 Web Audio API 开发了 PCMAudioRecorder 录制PCM格式的音频,并通过 AudioWorkletNode 异步将采样点从浮点数转化为16bit的Int16Array并通过回调返回。buffer默认大小为1600采样点,即100ms。 50 | -------------------------------------------------------------------------------- /samples/gallery/input-audio-out-text-html/python/audio_recorder.js: -------------------------------------------------------------------------------- 1 | class PCMAudioRecorder { 2 | constructor() { 3 | this.audioContext = null; 4 | this.stream = null; 5 | this.currentSource = null; 6 | this.audioCallback = null; 7 | } 8 | 9 | async connect(audioCallback) { 10 | this.audioCallback = audioCallback; 11 | if (!this.audioContext) { 12 | this.audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 }); 13 | } 14 | 15 | this.stream = await navigator.mediaDevices.getUserMedia({ audio: true }); 16 | this.currentSource = this.audioContext.createMediaStreamSource(this.stream); 17 | 18 | // 加载 AudioWorklet 19 | try { 20 | await this.audioContext.audioWorklet.addModule('./recorder_worklet.js'); 21 | } catch (e) { 22 | console.error('Error loading AudioWorklet:', e); 23 | return; 24 | } 25 | 26 | // 创建 AudioWorkletNode 27 | this.processorNode = new AudioWorkletNode(this.audioContext, 'pcm-processor'); 28 | 29 | // 监听从 AudioWorkletProcessor 发来的消息 30 |
this.processorNode.port.onmessage = (event) => { 31 | if (event.data instanceof Int16Array) { 32 | if (this.audioCallback) { 33 | this.audioCallback(event.data); 34 | } 35 | } else { 36 | console.log('Received message from AudioWorkletProcessor:', event.data); 37 | 38 | if (event.data == 'stopped') { 39 | console.log('Recorder stopped.'); 40 | // this.processorNode.disconnect(); 41 | // this.processorNode.port.close(); 42 | // this.processorNode = null; 43 | } 44 | } 45 | }; 46 | 47 | // 连接节点 48 | 49 | this.currentSource.connect(this.processorNode); 50 | this.processorNode.connect(this.audioContext.destination); 51 | console.log('Recorder connected.'); 52 | } 53 | 54 | stop() { 55 | // 断开 AudioWorkletNode 56 | if (this.processorNode) { 57 | this.processorNode.port.postMessage('stop'); 58 | } 59 | 60 | // 停止音频流 61 | if (this.stream) { 62 | const tracks = this.stream.getTracks(); 63 | tracks.forEach(track => track.stop()); 64 | } 65 | 66 | // 断开音频链接 67 | if (this.currentSource) { 68 | this.currentSource.disconnect(); 69 | this.currentSource = null; 70 | } 71 | 72 | // 关闭音频上下文 73 | if (this.audioContext) { 74 | this.audioContext.close(); 75 | this.audioContext = null; 76 | } 77 | 78 | // 重置音频回调 79 | this.audioCallback = null; 80 | if (this.processorNode) { 81 | this.processorNode.port.postMessage('stop'); 82 | this.processorNode.disconnect(); 83 | this.processorNode.port.close(); 84 | this.processorNode = null; 85 | } 86 | } 87 | } 88 | 89 | export default PCMAudioRecorder; -------------------------------------------------------------------------------- /samples/gallery/input-audio-out-text-html/python/recorder_worklet.js: -------------------------------------------------------------------------------- 1 | class PCMProcessor extends AudioWorkletProcessor { 2 | constructor() { 3 | super(); 4 | this.port.onmessage = (event) => { 5 | if (event.data === 'stop') { 6 | this.port.postMessage('prepare to stop'); 7 | this.isStopped = true; 8 | if (this.buffer.length > 0) { 9 | this.port.postMessage(new Int16Array(this.buffer)); // 发送剩余的样本 10 | this.port.postMessage({'event':'stopped'}); 11 | this.buffer = []; // 清空缓冲区 12 | } 13 | } 14 | }; 15 | this.buffer = []; // 初始化缓冲区来存储采样点 16 | this.targetSampleCount = 1600; // 目标样本数量100ms 17 | } 18 | 19 | process(inputs, outputs, parameters) { 20 | const input = inputs[0]; 21 | if (input.length > 0) { 22 | // 获取输入缓冲区的第一个通道 23 | const inputData = input[0]; 24 | 25 | // 将Float32Array转换为Int16Array,并积累到缓冲区 26 | for (let i = 0; i < inputData.length; i++) { 27 | const sample = Math.max(-32768, Math.min(32767, Math.round(inputData[i] * 32767))); 28 | this.buffer.push(sample); 29 | } 30 | 31 | // 当缓冲区里的样本数量达到目标值时,发送给主线程 32 | while (this.buffer.length >= this.targetSampleCount) { 33 | // 从缓冲区中取出目标数量的样本 34 | const pcmData = this.buffer.splice(0, this.targetSampleCount); 35 | this.port.postMessage(new Int16Array(pcmData)); // 将选定的样本发送到主线程 36 | this.port.postMessage({'event':'sending'}); 37 | } 38 | } 39 | 40 | return true; // 继续处理 41 | } 42 | } 43 | 44 | registerProcessor('pcm-processor', PCMProcessor); 45 | -------------------------------------------------------------------------------- /samples/gallery/input-audio-out-text-html/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | websockets 3 | -------------------------------------------------------------------------------- /samples/gallery/input-text-out-audio-html-ai-assistant/python/README.md: 
-------------------------------------------------------------------------------- 1 | # AI Assistant网页版语音助手 2 | AI Assistant网页版模拟了目前主流的大模型网站提供的交互服务,在多轮对话的基础上增加了实时、低延迟朗读大模型输出的能力。您可以参考这个示例项目搭建自己的ChatGPT网站。 3 | 4 | ## 前提条件 5 | 6 | #### 安装 Python 依赖 7 | 8 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。 9 | 运行本场景DEMO依赖的环境可以通过[PyPI](https://pypi.org/)安装。 10 | 11 | 您可以使用`pip install -r requirements.txt` 命令来安装本文件夹下的requirements依赖文件。或者手动安装下方的依赖。 12 | 13 | - 导入百炼SDK 14 | ```commandline 15 | pip3 install dashscope  # 安装阿里云百炼SDK 16 | pip3 install websockets  # 安装websocket服务依赖 17 | ``` 18 | 19 | #### 配置阿里云百炼API-KEY 20 | 在使用百炼SDK之前,您需要先在阿里云控制台开通百炼模型服务并获取API-KEY。 21 | - 在[百炼控制台](https://bailian.console.aliyun.com/)界面右上角头像位置,鼠标悬浮后,展示API-KEY,点击后进入API-KEY管理页面。 22 | - 点击【创建新的API-KEY】,会自动创建一条属于这个账号的API-KEY。列表上展示API-KEY密文,点击【查看】可以看到API-KEY的明文信息。请注意保存API-KEY的明文信息,后续使用API-KEY时需要用到。 23 | - 更多百炼配置信息请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 24 | 25 | ## 运行示例 26 | 27 | 本目录展示了前后端分离的AI Assistant示例,通过websocket连接前后端。 28 | 29 | 30 | 当您点击Send按钮后,网页会和python websocket服务创建连接,将文本框中的输入发送到服务器。服务器会调用qwen大模型和cosyvoice语音大模型,并将回答的文本片段和语音合成的PCM格式音频片段实时返回给网页播放。(本节末尾附有这条LLM到TTS链路的最小Python示意。) 31 | 在上一段音频播放时,如果您点击了Send按钮,会打断上一段音频,开始下一段语音合成和播放。 32 | 33 | 首先,请运行`server.py`,默认会在本地的11111端口运行websocket服务。 34 | ``` 35 | python server.py 36 | ``` 37 | 38 | 之后请在本目录运行一个http服务,用于支持通过浏览器访问当前目录的文件。 39 | ``` 40 | python -m http.server 9000 41 | ``` 42 | 43 | 之后您可以在浏览器输入`http://localhost:9000`打开测试网页。输入提问并点击`Send`按钮发送消息后,会自动调用百炼SDK的接口,并在收到大模型回复后立刻朗读。 44 | 45 | 本示例可以支持多轮交互,默认缓存十轮历史对话消息。 46 | 47 | 48 | 49 | ### 关于播放器的说明 50 | 51 | 在`audio_player.js`中,我们使用 Web Audio API 开发了 PCMAudioPlayer 播放器播放流式PCM格式的音频,将16bit采样点转化为float写入audioBuffer播放,并且在上一段音频播放结束的onended回调中立刻播放下一段音频。 52 | >注意⚠️ : 53 | >1. 使用MediaSource播放流式音频是一个更加简洁的方案,但是MediaSource不支持如下浏览器:Safari、基于Safari的iOS WebView、微信小程序。更多兼容信息参见 [MediaSource](https://developer.mozilla.org/zh-CN/docs/Web/API/MediaSource) 54 | >2.
使用[openai-realtime-console](https://github.com/openai/openai-realtime-console/tree/websockets)中集成的wavtools在移动端和safari浏览器中播放时会有噪声。 -------------------------------------------------------------------------------- /samples/gallery/input-text-out-audio-html-ai-assistant/python/audio_player.js: -------------------------------------------------------------------------------- 1 | class PCMAudioPlayer { 2 | constructor(sampleRate) { 3 | this.sampleRate = sampleRate; 4 | this.audioContext = null; 5 | this.audioQueue = []; 6 | this.isPlaying = false; 7 | this.currentSource = null; 8 | const bufferThreshold = 2; 9 | } 10 | 11 | connect() { 12 | if (!this.audioContext) { 13 | this.audioContext = new (window.AudioContext || window.webkitAudioContext)(); 14 | } 15 | } 16 | 17 | pushPCM(arrayBuffer) { 18 | this.audioQueue.push(arrayBuffer); 19 | this._playNextAudio(); 20 | } 21 | 22 | /** 23 | * 将arrayBuffer转为audioBuffer 24 | */ 25 | _bufferPCMData(pcmData) { 26 | const sampleRate = this.sampleRate; // 设置为 PCM 数据的采样率 27 | const length = pcmData.byteLength / 2; // 假设 PCM 数据为 16 位,需除以 2 28 | const audioBuffer = this.audioContext.createBuffer(1, length, sampleRate); 29 | const channelData = audioBuffer.getChannelData(0); 30 | const int16Array = new Int16Array(pcmData); // 将 PCM 数据转换为 Int16Array 31 | 32 | for (let i = 0; i < length; i++) { 33 | // 将 16 位 PCM 转换为浮点数 (-1.0 到 1.0) 34 | channelData[i] = int16Array[i] / 32768; // 16 位数据转换范围 35 | } 36 | let audioLength = length/sampleRate*1000; 37 | console.log(`prepare audio: ${length} samples, ${audioLength} ms`) 38 | 39 | return audioBuffer; 40 | } 41 | 42 | async _playAudio(arrayBuffer) { 43 | if (this.audioContext.state === 'suspended') { 44 | await this.audioContext.resume(); 45 | } 46 | 47 | const audioBuffer = this._bufferPCMData(arrayBuffer); 48 | 49 | this.currentSource = this.audioContext.createBufferSource(); 50 | this.currentSource.buffer = audioBuffer; 51 | this.currentSource.connect(this.audioContext.destination); 52 | 53 | this.currentSource.onended = () => { 54 | console.log('Audio playback ended.'); 55 | this.isPlaying = false; 56 | this.currentSource = null; 57 | this._playNextAudio(); // Play the next audio in the queue 58 | }; 59 | this.currentSource.start(); 60 | this.isPlaying = true; 61 | } 62 | 63 | _playNextAudio() { 64 | if (this.audioQueue.length > 0 && !this.isPlaying) { 65 | // 计算总的字节长度 66 | const totalLength = this.audioQueue.reduce((acc, buffer) => acc + buffer.byteLength, 0); 67 | const combinedBuffer = new Uint8Array(totalLength); 68 | let offset = 0; 69 | 70 | // 将所有 audioQueue 中的 buffer 拼接到一个新的 Uint8Array 中 71 | for (const buffer of this.audioQueue) { 72 | combinedBuffer.set(new Uint8Array(buffer), offset); 73 | offset += buffer.byteLength; 74 | } 75 | 76 | // 清空 audioQueue,因为我们已经拼接完所有数据 77 | this.audioQueue = []; 78 | // 发送拼接的 audio 数据给 playAudio 79 | this._playAudio(combinedBuffer.buffer); 80 | } 81 | } 82 | stop() { 83 | if (this.currentSource) { 84 | this.currentSource.stop(); // 停止当前音频播放 85 | this.currentSource = null; // 清除音频源引用 86 | this.isPlaying = false; // 更新播放状态 87 | } 88 | this.audioQueue = []; // 清空音频队列 89 | console.log('Playback stopped and queue cleared.'); 90 | } 91 | 92 | } 93 | 94 | export default PCMAudioPlayer; -------------------------------------------------------------------------------- /samples/gallery/input-text-out-audio-html-ai-assistant/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | websockets 3 | 
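（补充说明）上文AI Assistant示例的服务端核心链路,是把通义千问的流式文本输出逐段送入CosyVoice流式语音合成。下面给出一个脱离websocket传输环节的最小Python示意:其中模型名`qwen-plus`、音色`longxiaochun`与PCM输出格式均为便于演示的假设,并非server.py的实际实现(server.py的内容未在本文中展示)。

```python
# Minimal sketch of the LLM -> streaming-TTS pipeline behind the AI assistant
# sample. Assumptions (hedged, not taken from server.py): model names
# 'qwen-plus' / 'cosyvoice-v1', voice 'longxiaochun', and PCM output format.
# The real server additionally forwards text and audio over a websocket.
import os

import dashscope
from dashscope import Generation
from dashscope.audio.tts_v2 import (AudioFormat, ResultCallback,
                                    SpeechSynthesizer)

dashscope.api_key = os.environ['DASHSCOPE_API_KEY']


class PcmCollector(ResultCallback):
    """Buffers synthesized PCM chunks; a real server would push them out."""
    def __init__(self):
        self.audio = bytearray()

    def on_data(self, data: bytes) -> None:
        self.audio.extend(data)


collector = PcmCollector()
synthesizer = SpeechSynthesizer(model='cosyvoice-v1',
                                voice='longxiaochun',
                                format=AudioFormat.PCM_22050HZ_MONO_16BIT,
                                callback=collector)

messages = [{'role': 'user', 'content': '用两句话介绍一下通义千问。'}]
responses = Generation.call(model='qwen-plus',
                            messages=messages,
                            result_format='message',
                            stream=True,
                            incremental_output=True)
for response in responses:
    chunk = response.output.choices[0].message.content
    if chunk:
        print(chunk, end='', flush=True)
        # Feed each incremental text piece into the streaming synthesizer.
        synthesizer.streaming_call(chunk)
synthesizer.streaming_complete()
print(f'\nsynthesized {len(collector.audio)} bytes of PCM audio')
```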
-------------------------------------------------------------------------------- /samples/gallery/paraformer-realtime-js/README.md: -------------------------------------------------------------------------------- 1 | # Paraformer实时语音识别Javascript示例 2 | 本示例演示如何通过 javascript 接入百炼平台的 paraformer 实时语音识别服务。示例同时提供一个录音器模块,可以在浏览器中录制音频、调用语音识别服务并显示识别结果。 3 | 4 | ## 前提条件 5 | 6 | #### 配置阿里云百炼API-KEY 7 | 在使用百炼SDK进行语音识别之前,您需要先在阿里云控制台创建语音识别服务并获取API-KEY。 8 | - 在[百炼控制台](https://bailian.console.aliyun.com/)界面右上角头像位置,鼠标悬浮后,展示API-KEY,点击后进入API-KEY管理页面。 9 | - 点击【创建新的API-KEY】,会自动创建一条属于这个账号的API-KEY。列表上展示API-KEY密文,点击【查看】可以看到API-KEY的明文信息。请注意保存API-KEY的明文信息,后续使用API-KEY时需要用到。 10 | - 更多百炼配置信息请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | ## 运行示例 13 | 14 | 本目录展示了前端集成Paraformer的示例,需要在本地搭建http服务支持通过浏览器导入本地js脚本。 15 | 16 | 请在本目录运行一个http服务: 17 | ``` 18 | python -m http.server 9000 19 | ``` 20 | 21 | 之后您可以在浏览器输入`http://localhost:9000`打开测试网页。输入apikey,并点击`开始录音`按钮发送消息后对麦克风说话,点击`停止录音`按钮结束录音。 22 | 23 | 24 | 25 | ### 关于录音的说明 26 | 27 | 在`audio_recorder.js`中,我们使用 Web Audio API 开发了 PCMAudioRecorder 录制PCM格式的音频,并通过 AudioWorkletNode 异步将采样点从浮点数转化为16bit的Int16Array并通过回调返回。buffer默认大小为1600采样点,即100ms。 28 | 29 | 30 | ### 关于鉴权和账号安全 31 | 32 | 在百炼 Websockets 服务中,由于 JavaScript 不支持添加自定义 HTTP Header,因此 API Key 需要通过 URL 参数进行传递以完成鉴权。 33 | 34 | #### 安全性说明 35 | 36 | 通过 URL 添加永久有效的 API Key 进行鉴权的方式虽然简单易用,但在安全性方面存在一定的风险: 37 | - API Key 暴露风险:API Key 直接暴露在前端代码或 URL 中,可能被恶意用户通过浏览器开发者工具、网络抓包或日志记录等方式轻易获取。 38 | - 潜在后果:一旦 API Key 泄露,攻击者可以利用其长期访问您的服务,可能导致数据泄露、资源滥用或其他安全问题。 39 | 40 | #### 免责声明 41 | 请注意,使用此方式接入服务时,您需自行承担因 API Key 泄露而导致的一切后果。我们强烈建议您采取以下措施以提升安全性: 42 | 43 | 1. 避免直接暴露永久 API Key:考虑使用短期有效的动态令牌(如 JWT)代替永久 API Key,并通过后端生成和分发这些令牌。 44 | 2. 启用 HTTPS:确保所有通信都通过加密的 HTTPS 连接进行,以防止 API Key 在传输过程中被窃取。 45 | 限制 API Key 权限范围:为 API Key 设置最小权限,确保即使泄露也不会对系统造成严重影响。 46 | 47 | 48 | 如果您对安全性有更高要求,建议部署转发服务。 -------------------------------------------------------------------------------- /samples/gallery/paraformer-realtime-js/audio_recorder.js: -------------------------------------------------------------------------------- 1 | class PCMAudioRecorder { 2 | constructor() { 3 | this.audioContext = null; 4 | this.stream = null; 5 | this.currentSource = null; 6 | this.audioCallback = null; 7 | } 8 | 9 | async connect(audioCallback) { 10 | this.audioCallback = audioCallback; 11 | if (!this.audioContext) { 12 | this.audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 }); 13 | } 14 | 15 | this.stream = await navigator.mediaDevices.getUserMedia({ audio: true }); 16 | this.currentSource = this.audioContext.createMediaStreamSource(this.stream); 17 | 18 | // 加载 AudioWorklet 19 | try { 20 | await this.audioContext.audioWorklet.addModule('./recorder_worklet.js'); 21 | } catch (e) { 22 | console.error('Error loading AudioWorklet:', e); 23 | return; 24 | } 25 | 26 | // 创建 AudioWorkletNode 27 | this.processorNode = new AudioWorkletNode(this.audioContext, 'pcm-processor'); 28 | 29 | // 监听从 AudioWorkletProcessor 发来的消息 30 | this.processorNode.port.onmessage = (event) => { 31 | if (event.data instanceof Int16Array) { 32 | if (this.audioCallback) { 33 | this.audioCallback(event.data); 34 | } 35 | } else { 36 | console.log('Received message from AudioWorkletProcessor:', event.data); 37 | 38 | if (event.data == 'stopped') { 39 | console.log('Recorder stopped.'); 40 | // this.processorNode.disconnect(); 41 | // this.processorNode.port.close(); 42 | // this.processorNode = null; 43 | } 44 | } 45 | }; 46 | 47 | // 连接节点 48 | 49 | 
this.currentSource.connect(this.processorNode); 50 | this.processorNode.connect(this.audioContext.destination); 51 | console.log('Recorder connected.'); 52 | } 53 | 54 | stop() { 55 | // 断开 AudioWorkletNode 56 | if (this.processorNode) { 57 | this.processorNode.port.postMessage('stop'); 58 | } 59 | 60 | // 停止音频流 61 | if (this.stream) { 62 | const tracks = this.stream.getTracks(); 63 | tracks.forEach(track => track.stop()); 64 | } 65 | 66 | // 断开音频链接 67 | if (this.currentSource) { 68 | this.currentSource.disconnect(); 69 | this.currentSource = null; 70 | } 71 | 72 | // 关闭音频上下文 73 | if (this.audioContext) { 74 | this.audioContext.close(); 75 | this.audioContext = null; 76 | } 77 | 78 | // 重置音频回调 79 | this.audioCallback = null; 80 | if (this.processorNode) { 81 | this.processorNode.port.postMessage('stop'); 82 | this.processorNode.disconnect(); 83 | this.processorNode.port.close(); 84 | this.processorNode = null; 85 | } 86 | } 87 | } 88 | 89 | export default PCMAudioRecorder; -------------------------------------------------------------------------------- /samples/gallery/paraformer-realtime-js/recorder_worklet.js: -------------------------------------------------------------------------------- 1 | class PCMProcessor extends AudioWorkletProcessor { 2 | constructor() { 3 | super(); 4 | this.port.onmessage = (event) => { 5 | if (event.data === 'stop') { 6 | this.port.postMessage('prepare to stop'); 7 | this.isStopped = true; 8 | if (this.buffer.length > 0) { 9 | this.port.postMessage(new Int16Array(this.buffer)); // 发送剩余的样本 10 | this.port.postMessage({'event':'stopped'}); 11 | this.buffer = []; // 清空缓冲区 12 | } 13 | } 14 | }; 15 | this.buffer = []; // 初始化缓冲区来存储采样点 16 | this.targetSampleCount = 1600; // 目标样本数量100ms 17 | } 18 | 19 | process(inputs, outputs, parameters) { 20 | const input = inputs[0]; 21 | if (input.length > 0) { 22 | // 获取输入缓冲区的第一个通道 23 | const inputData = input[0]; 24 | 25 | // 将Float32Array转换为Int16Array,并积累到缓冲区 26 | for (let i = 0; i < inputData.length; i++) { 27 | const sample = Math.max(-32768, Math.min(32767, Math.round(inputData[i] * 32767))); 28 | this.buffer.push(sample); 29 | } 30 | 31 | // 当缓冲区里的样本数量达到目标值时,发送给主线程 32 | while (this.buffer.length >= this.targetSampleCount) { 33 | // 从缓冲区中取出目标数量的样本 34 | const pcmData = this.buffer.splice(0, this.targetSampleCount); 35 | this.port.postMessage(new Int16Array(pcmData)); // 将选定的样本发送到主线程 36 | this.port.postMessage({'event':'sending'}); 37 | } 38 | } 39 | 40 | return true; // 继续处理 41 | } 42 | } 43 | 44 | registerProcessor('pcm-processor', PCMProcessor); 45 | -------------------------------------------------------------------------------- /samples/gallery/read-and-display-subtitles/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 按句展示语音合成字幕 3 | 流式输入语音合成字幕项目是通过将大语言模型生成的文本进行语音合成,并同步显示字幕的一种技术实现。该项目结合了语音合成技术与字幕显示技术,适用于多种场景,如视频配音、有声读物、在线教育等需要文字转语音同步展示的场景。 4 | 11 | 12 | 13 | ## Python 14 | 15 | [comment]: # (prerequisites) 16 | ### :point_right: 前提条件 17 | 18 | 1. #### 配置阿里云百炼API-KEY 19 | 20 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 21 | 22 | 1. 
#### 安装Python依赖 23 | 24 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 25 | ```commandline 26 | pip3 install -r requirements.txt 27 | ``` 28 | 29 | [comment]: # (how to run the sample and expected results) 30 | ### :point_right: 运行示例 31 | 32 | 33 | ```commandline 34 | python3 read_aloud_the_text_generated_by_llm_and_display_subtitles.py 35 | ``` 36 | 37 | 本目录下提供了调用通义Speech流式输入流式输出语音合成及通义千问两个服务接口,实现低延迟、实时展示语音合成字幕示例。 38 | 39 | 本示例提供了一个简化的GUI 界面,用来展示字幕。 40 | 41 | 42 | 43 | 示例将LLM合成的文本流实时提交到任务队列中,并且按照中文句号作为字幕换行符发送句子结束信号到任务队列。 44 | 45 | 任务处理线程调用流式输入流式输出语音合成服务将队列中的文本朗读,并通过回调将合成的音频流和句子结束信号保存到SubtitlePlayer的音频缓存队列中。 46 | 47 | SubtitlePlayer是一个包含实时音频播放器的简单GUI界面。它会扫描音频缓存队列,立刻播放音频,直到遇到句子结束信号,并且等待当前音频朗读结束。 48 | 49 | 50 | **请注意:** 51 | - 播放: 52 | 为了方便演示,示例中集成了简单的录音和播放功能。并且在每一行文本朗读结束之后播放器会阻塞主线程等待朗读结束,因此会阻塞主线程获取音频。 53 | 您可以灵活在业务中调整播放器策略,比如采用其他可以在线程中等待的播放器。 54 | 55 | 56 | 57 | [comment]: # (technical support of the sample) 58 | ### :point_right: 技术支持 59 | 60 | -------------------------------------------------------------------------------- /samples/gallery/read-and-display-subtitles/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | ffmpeg-python 3 | pyaudio 4 | wxPython 5 | -------------------------------------------------------------------------------- /samples/gallery/reading-story-in-multiple-role/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 分角色朗读故事 3 | 分角色朗读故事是通过调用不同音色的语音合成朗读一个完整的故事。适用于多种场景,如有声读物、在线教育等需要区分说话人的场景。 4 | 11 | 12 | 13 | ## Python 14 | 15 | [comment]: # (prerequisites) 16 | ### :point_right: 前提条件 17 | 18 | 1. #### 配置阿里云百炼API-KEY 19 | 20 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 21 | 22 | 1. #### 安装Python依赖 23 | 24 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 25 | ```commandline 26 | pip3 install -r requirements.txt 27 | ``` 28 | 29 | [comment]: # (how to run the sample and expected results) 30 | ### :point_right: 运行示例 31 | 32 | 33 | ```commandline 34 | python3 run.py 35 | ``` 36 | 运行脚本后将会使用不同音色按照“鸭子妈妈”,“鸭子宝宝”和“旁白”三种不同的角色朗读小鸭子的故事。故事内容存放在story.json中。 37 | 38 | 39 | [comment]: # (technical support of the sample) 40 | ### :point_right: 技术支持 41 | 42 | -------------------------------------------------------------------------------- /samples/gallery/reading-story-in-multiple-role/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | ffmpeg-python 3 | pyaudio 4 | -------------------------------------------------------------------------------- /samples/gallery/reading-story-in-multiple-role/python/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) alibaba.. All Rights Reserved. 3 | # MIT License (https://opensource.org/licenses/MIT) 4 | import json 5 | import os 6 | import sys 7 | import time 8 | 9 | import dashscope 10 | from dashscope.audio.tts_v2 import * 11 | 12 | sys.path.append( 13 | os.path.join(os.path.dirname(os.path.abspath(__file__)), 14 | '../../../utils/python')) 15 | 16 | from RealtimeMp3Player import RealtimeMp3Player 17 | 18 | # This sample code demonstrates how to decode MP3 audio into PCM format and play it using subprocess and pyaudio. 19 | # Decoding MP3 to PCM before playback is a common approach to audio data handling. 
20 | # Alternatively, other libraries can be utilized either to decode MP3 or to play the audio directly. 21 | 22 | # Set your DashScope API key. More information: https://help.aliyun.com/document_detail/2712195.html 23 | if 'DASHSCOPE_API_KEY' in os.environ: 24 | dashscope.api_key = os.environ['DASHSCOPE_API_KEY'] 25 | # in fact,if you have set DASHSCOPE_API_KEY in your environment variable, 26 | # you can ignore this, and the sdk will automatically get the api_key from the environment variable 27 | else: 28 | dashscope.api_key = '' 29 | # if you can not set api_key in your environment variable, 30 | # you can set it here by code 31 | 32 | player = RealtimeMp3Player() 33 | # start player 34 | player.start() 35 | 36 | # Define a callback to handle the result 37 | 38 | 39 | class Callback(ResultCallback): 40 | def on_open(self): 41 | print('websocket is open.') 42 | 43 | def on_complete(self): 44 | print('speech synthesis task complete successfully.') 45 | 46 | def on_error(self, message): 47 | print(f'speech synthesis task failed, {message}') 48 | 49 | def on_close(self): 50 | print('websocket is closed.') 51 | 52 | def on_event(self, message): 53 | # print(f'recv speech synthsis message {message}') 54 | pass 55 | 56 | def on_data(self, data: bytes) -> None: 57 | player.write(data) 58 | 59 | 60 | # Call the speech synthesizer callback 61 | synthesizer_callback = Callback() 62 | 63 | voice_narrator = 'longshu' 64 | voice_motherDuck = 'longyue' 65 | vocie_babyDuck = 'longtong' 66 | 67 | # Please replace the path with your audio file path 68 | current_dir = os.path.dirname(os.path.abspath(__file__)) 69 | file_path = os.path.join(current_dir, 'story.json') 70 | print('Input file_path is: %s' % file_path) 71 | 72 | with open(file_path, 'r', encoding='utf-8') as file: 73 | data = json.load(file) 74 | 75 | story = data['story'] 76 | voice_name = '' 77 | for item in story: 78 | print(item) 79 | if item['role'] == 'narrator': 80 | voice_name = voice_narrator 81 | elif item['role'] == 'motherDuck': 82 | voice_name = voice_motherDuck 83 | elif item['role'] == 'babyDuck': 84 | voice_name = vocie_babyDuck 85 | 86 | time.sleep(1) 87 | # Synthesize speech with given text, sync call and return the audio data in result 88 | # you can customize the synthesis parameters, like model, format, sample_rate or other parameters 89 | # for more information, please refer to https://help.aliyun.com/document_detail/2712523.html 90 | synthesizer = SpeechSynthesizer( 91 | model='cosyvoice-v1', 92 | voice=voice_name, 93 | format=AudioFormat.MP3_22050HZ_MONO_256KBPS, 94 | callback=synthesizer_callback, 95 | ) 96 | synthesizer.streaming_call(item['text']) 97 | time.sleep(0.1) 98 | synthesizer.streaming_complete() 99 | # stop realtime mp3 player 100 | player.stop() 101 | -------------------------------------------------------------------------------- /samples/gallery/reading-story-in-multiple-role/python/story.json: -------------------------------------------------------------------------------- 1 | { 2 | "story": [ 3 | { 4 | "role": "narrator", 5 | "text": "在一个阳光明媚的早晨,鸭妈妈决定带着小鸭子们去池塘边学游泳。鸭妈妈说。" 6 | 7 | }, 8 | { 9 | "role": "motherDuck", 10 | "text": "小鸭子们,排好队,跟着妈妈走,我们今天要学游泳啦!" 11 | 12 | }, 13 | { 14 | "role": "narrator", 15 | "text": "小鸭子们排成一条线,摇摆着小身体,紧跟在鸭妈妈的身后。他们来到池塘边,小鸭子们睁大了眼睛,好奇地看着水面。他们看到水中自己的倒影,觉得非常神奇。" 16 | 17 | }, 18 | { 19 | "role": "babyDuck", 20 | "text": "妈妈,我们要怎样才能在水里游泳呢?" 
21 | 22 | }, 23 | { 24 | "role": "motherDuck", 25 | "text": "别急,孩子们。首先,你们要记得不要害怕水,要相信自己能做到。我们鸭子天生会游泳,只需要一点点练习就可以了。" 26 | 27 | }, 28 | { 29 | "role": "motherDuck", 30 | "text": "小心地走到水边,然后慢慢地把爪子放进水里,像这样,不要着急,慢慢来。" 31 | 32 | }, 33 | { 34 | "role": "narrator", 35 | "text": "小鸭子们照着妈妈说的做,他们一个个把小爪子放进了水里,感觉水凉凉的,很奇妙。" 36 | 37 | }, 38 | { 39 | "role": "babyDuck", 40 | "text": "好舒服呀!" 41 | 42 | }, 43 | { 44 | "role": "motherDuck", 45 | "text": "现在,试着把身体放进水里,用你的脚掌轻轻地扑打水面,你就会发现你能漂浮起来。" 46 | 47 | }, 48 | { 49 | "role": "narrator", 50 | "text": "小鸭子们开始尝试,一个个扑通扑通地跳进水里。他们用小爪子扑腾水面,发现自己的身体真的浮了起来。有几个小鸭子一开始有些紧张,但很快就适应了,开始开心地在水里游来游去。" 51 | 52 | }, 53 | { 54 | "role": "babyDuck", 55 | "text": "妈妈,我们做到了!我们会游泳了!" 56 | 57 | }, 58 | { 59 | "role": "motherDuck", 60 | "text": "你们都很棒!记住,任何事情,只要坚持练习和勇敢尝试,你们都能做到。" 61 | 62 | }, 63 | { 64 | "role": "narrator", 65 | "text": "就这样,小鸭子们在鸭妈妈的带领下,度过了一个快乐而充实的上午。他们不仅学会了游泳,还学会了勇敢和自信。从此,每一个阳光明媚的早晨,池塘边都会响起小鸭子们欢快的嘎嘎声,他们开心地在水中游来游去,享受着学习和成长的乐趣。" 66 | 67 | } 68 | 69 | ] 70 | 71 | } 72 | -------------------------------------------------------------------------------- /samples/gallery/recognize_speech_from_video_and_decode_to_opus/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 视频文件的实时语音识别 3 | ## Python 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: 前提条件 7 | 8 | 1. #### 配置阿里云百炼API-KEY 9 | 10 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md)。 11 | 12 | 1. #### 安装ffmpeg 13 | 14 | 示例需要用到ffmpeg进行音视频解码。推荐从官方网站下载安装,并将ffmpeg安装路径配置进环境变量:[ffmpeg官方网站下载](https://www.ffmpeg.org/download.html)。也可以参考文档[如何安装ffmpeg](../../../../docs/QA/ffmpeg.md)。 15 | 16 | 1. #### 安装Python依赖 17 | 18 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 19 | ```commandline 20 | pip3 install -r requirements.txt 21 | ``` 22 | 23 | [comment]: # (how to run the sample and expected results) 24 | ### :point_right: 运行示例 25 | 您可以使用以下命令运行本示例: 26 | 27 | ```commandline 28 | python3 run.py 29 | ``` 30 | 31 | 示例运行时,本地视频文件会通过ffmpeg实时转为16k opus音频格式,之后再被实时转写为文字,并通过控制台打印结果。 32 | 33 | 本示例引入了AudioDecoder类,使用ffmpeg实现了音视频文件的转码。借助于ffmpeg丰富的格式支持,示例可以将目前市面上大多数的音视频文件格式转换为适用于语音识别的音频。 34 | 35 | ### :point_right: 预期结果 36 | 37 | 完整的识别结果会以json格式保存在```result.json```文件中。完整结果包含句级别和字级别的时间戳信息等。语音识别的纯文本会同时在控制台打印: 38 | ```text 39 | The brief result is: 40 | 横看成岭侧成峰,远近高低各不同。不识庐山真面目,只缘身在此山中。 41 | ``` 42 | 43 | [comment]: # (technical support of the sample) 44 | ### :point_right: 技术支持 45 | 46 | 47 | -------------------------------------------------------------------------------- /samples/gallery/recognize_speech_from_video_and_decode_to_opus/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | -------------------------------------------------------------------------------- /samples/gallery/record-from-microphone-and-display-realtime-subtitle/python/README.md: -------------------------------------------------------------------------------- 1 | 2 | [comment]: # (title and brief introduction of the sample) 3 | ## 语音识别并实时上屏 4 | 5 | 本示例展示了如何使用百炼平台 Gummy 实时语音翻译模型结果中的word字段和stash字段,实现低延迟、实时的语音识别字幕。 6 | 7 | [comment]: # (list of scenarios of the sample) 8 | ### :point_right: 适用场景 9 | 10 | | 应用场景 | 典型用法 | 使用说明 | 11 | |----------------| ----- |----------------------| 12 | | **实时字幕** | 实时字幕上屏 | *实时识别麦克风音频并实时上屏* | 13 | 14 | [comment]: # (prerequisites) 15 | ### :point_right: 前提条件 16 | 17 | 1. #### 配置阿里云百炼API-KEY 18 | 19 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API-KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 20 | 21 | 2. #### 安装Python依赖 22 | 23 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 24 | ```commandline 25 | pip3 install -r requirements.txt 26 | ``` 27 | 28 | [comment]: # (how to run the sample and expected results) 29 | ### :point_right: 运行示例 30 | 您可以使用以下命令运行本示例: 31 | 32 | ```commandline 33 | python3 run.py 34 | ``` 35 | 在示例运行后,将会开始通过麦克风录制语音并识别,打开一个简单的GUI界面,用来动态显示实时识别结果。 36 | 37 | 38 | 39 | SubtitleFrame是双语字幕GUI界面,它会扫描队列中的语音识别结果并追加显示到界面上。其中黑色文本为已固定的词汇和句子(fixed),蓝色文本为未固定的词汇,这一部分文本可能会随着识别变化。 40 | 41 | 关于如何正确解析识别结果并实现最低延迟的上屏,请参考`update_text`函数。 42 | 43 | [comment]: # (technical support of the sample) 44 | ### :point_right: 技术支持 45 | 46 | 47 | -------------------------------------------------------------------------------- /samples/gallery/record-from-microphone-and-display-realtime-subtitle/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | pyaudio 3 | wxPython 4 | -------------------------------------------------------------------------------- /samples/gallery/translate-audio-from-microphone-and-play-in-realtime/python/README.md: -------------------------------------------------------------------------------- 1 | 2 | [comment]: # (title and brief introduction of the sample) 3 | ## 中文语音翻译成英文并实时播放 4 | 5 | 本示例展示了通过调用百炼平台的 Gummy 实时语音翻译模型和 CosyVoice 流式语音合成模型,实现低延迟、实时的同声传译和实时双语字幕。 6 | 7 | [comment]: # (list of scenarios of the sample) 8 | ### :point_right: 适用场景 9 | 10 | | 应用场景 | 典型用法 | 使用说明 | 11 | |----------------| ----- |----------------------| 12 | | **实时翻译** | 同声传译,实时双语字幕 | *实时翻译麦克风采集的语音,并朗读和显示翻译结果* | 13 | 14 | [comment]: # (prerequisites) 15 | ### :point_right: 前提条件 16 | 17 | 1. #### 配置阿里云百炼API-KEY 18 | 19 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API-KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 20 | 21 | 2. #### 安装Python依赖 22 | 23 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 24 | ```commandline 25 | pip3 install -r requirements.txt 26 | ``` 27 | 28 | [comment]: # (how to run the sample and expected results) 29 | ### :point_right: 运行示例 30 | 您可以使用以下命令运行本示例: 31 | 32 | ```commandline 33 | python3 run.py 34 | ``` 35 | 在示例运行后,将会开始通过麦克风录制语音并识别、翻译成英文,并将翻译结果使用 loongbella 音色朗读出来。同时将打开一个简单的GUI界面,用来显示实时双语字幕。 36 | 37 | 38 | 39 | 示例会开启两个线程执行语音任务: 40 | - 语音翻译线程:在while循环中通过麦克风获取音频数据,并发送给百炼平台实时翻译服务,通过回调收到实时结果并储存在队列中。 41 | - 语音合成线程:在while循环中从队列取出实时翻译结果,并且通过流式语音合成服务生成音频,通过回调收到合成结果并播放。 42 | SubtitleFrame是双语字幕GUI界面,它会扫描队列中的文本并追加显示到界面上。 43 | 44 | [comment]: # (technical support of the sample) 45 | ### :point_right: 技术支持 46 | 47 | 48 | -------------------------------------------------------------------------------- /samples/gallery/translate-audio-from-microphone-and-play-in-realtime/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | pyaudio 3 | wxPython 4 | -------------------------------------------------------------------------------- /samples/lint.sh: -------------------------------------------------------------------------------- 1 | java -jar .dev_tools/google-java-format-1.7-all-deps.jar -i $(find .
-type f -name "*.java" | grep "./*/src/.*java") 2 | -------------------------------------------------------------------------------- /samples/sample-data/asr_example_chat.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/samples/sample-data/asr_example_chat.wav -------------------------------------------------------------------------------- /samples/sample-data/hello_world_male_16k_16bit_mono.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/samples/sample-data/hello_world_male_16k_16bit_mono.wav -------------------------------------------------------------------------------- /samples/sample-data/sample_audio.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/samples/sample-data/sample_audio.mp3 -------------------------------------------------------------------------------- /samples/sample-data/sample_for_incalculable_value.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/samples/sample-data/sample_for_incalculable_value.mp4 -------------------------------------------------------------------------------- /samples/sample-data/sample_video_poetry.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/samples/sample-data/sample_video_poetry.mp4 -------------------------------------------------------------------------------- /samples/sample-data/sample_video_story.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aliyun/alibabacloud-bailian-speech-demo/3e18d9d5f97396d1d2a76d02a46dfb2ef3518674/samples/sample-data/sample_video_story.mp4 -------------------------------------------------------------------------------- /samples/speech-plus/transcribe-video-and-do-translation-summarization-and-qa/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 视频转写并进行翻译摘要和问答 3 | 本示例展示了将一个视频文件转码为opus音频文件,通过录音文件转写服务识别为文本,然后调用通义千问大模型进行翻译、内容摘要和问答的过程。 4 | 5 | [comment]: # (list of scenarios of the sample) 6 | ### :point_right: 适用场景 7 | 8 | | 应用场景 | 典型用法 | 使用说明 | 9 | |----------------| ----- |----------------------| 10 | | **音视频语音分析理解** | 音视频摘要与问答 | *对音视频文件进行语音识别,并使用大模型进行摘要总结和问答* | 11 | 12 | [comment]: # (supported programming languages of the sample) 13 | ### :point_right: 编程语言 14 | - [Python](./python) 15 | 16 | 17 | [comment]: # (model and interface of the sample) 18 | ### :point_right: 参考详情 19 | 20 | | 推荐模型 | API详情 | 21 | |---------------|---------------------------------------------------------------------------------------------------| 22 | | paraformer-v2 | [Paraformer录音文件识别](https://help.aliyun.com/zh/dashscope/developer-reference/api-details-13)| 23 | | qwen-plus | [通义千问大语言模型](https://help.aliyun.com/zh/model-studio/developer-reference/what-is-qwen-llm?spm=a2c4g.11186623.0.0.5dbb76776EGFHK)| 24 | 
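结合上表,本示例的核心链路是先用 paraformer-v2 完成录音文件转写,再把转写文本交给 qwen-plus 做翻译、摘要和问答。下面给出一个省略了转码、OSS上传和错误处理的最小串联示意(其中的文件URL为占位符,转写结果JSON的下载与文本拼接从略,接口细节请以上表中的API文档和本目录的run.py为准):

```python
import dashscope

# 第一步:提交录音文件转写任务并等待完成(假设文件已上传OSS并拿到可访问URL)
task = dashscope.audio.asr.Transcription.async_call(
    model='paraformer-v2',
    file_urls=['https://example.com/meeting_audio.opus'])  # 占位URL
result = dashscope.audio.asr.Transcription.wait(task=task.output.task_id)
# result.output 中的 transcription_url 指向完整识别结果(JSON文件),
# 此处假设已经下载该文件并拼接出纯文本 transcript
transcript = '(此处为下载并拼接得到的转写文本)'

# 第二步:把转写文本交给 qwen-plus 做摘要(翻译、问答只需更换提示词)
summary = dashscope.Generation.call(
    model='qwen-plus',
    messages=[{'role': 'user', 'content': '请分条总结以下内容:\n' + transcript}],
    result_format='message')
print(summary.output.choices[0].message.content)
```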
25 | [comment]: # (dependency of the sample) 26 | ### :point_right: 依赖说明 27 | 28 | 本示例中,我们首先展示了如何将一个视频文件转码为[OPUS](https://opus-codec.org/)格式的音频文件再上传到OSS调用。这个文件预处理的过程可以大幅减少您的存储成本和网络传输成本。同时节省的传输时间也能大大加快视频文件转写的吞吐效率。在这个过程中,我们使用了ffmpeg进行音频转码,使用了OSS作为云存储和网络分发服务。以下是具体说明: 29 | 30 | 1. 安装ffmpeg: 请前往[ffmpeg官方网站下载](https://www.ffmpeg.org/download.html)。 31 | 2. 使用OSS:请前往[阿里云OSS](https://help.aliyun.com/zh/oss/getting-started/getting-started-with-oss)开通服务并进行必要配置。本示例下提供了一个简单的工具类[ossUtil.py](./python/ossUtil.py) 用来上传文件到OSS并获得文件的分享链接。请配置您的鉴权和bucket等信息,才可以使用。 32 | 33 | 34 | 35 | [comment]: # (technical support of the sample) 36 | ### :point_right: 技术支持 37 | 38 | -------------------------------------------------------------------------------- /samples/speech-plus/transcribe-video-and-do-translation-summarization-and-qa/python/README.md: -------------------------------------------------------------------------------- 1 | 2 | [comment]: # (title and brief introduction of the sample) 3 | ## 会议视频转写并进行翻译摘要和问答 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: 前提条件 7 | 8 | 1. #### 配置阿里云百炼API-KEY 9 | 10 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 2. #### 安装Python依赖 13 | 14 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 15 | ```commandline 16 | pip3 install -r requirements.txt 17 | ``` 18 | 19 | [comment]: # (how to run the sample and expected results) 20 | ### :point_right: 运行示例 21 | 您可以使用以下命令运行本示例: 22 | 23 | ```commandline 24 | python3 run.py 25 | ``` 26 | 提交需要处理的视频文件。控制台会依次输出: 27 | ```text 28 | ============ transcribe and translate === START === 29 | transcribe==> 一百多年前,电灯第一次进入了人类的生活世界,迎来了新的时代。 30 | translate ==> More than a hundred years ago, electric lights entered human life for the first time, ushering in a new era. 31 | transcribe==> 有了电,人类才发明了电视机,才有了夜生活。 32 | translate ==> With electricity, humans were able to invent television and thus, nightlife came into existence. 33 | transcribe==> 现代工业和交通的出现,创造了现代的城市。 34 | translate ==> The emergence of modern industry and transportation gave rise to modern cities. 35 | transcribe==> 人类社会发展的速度超过了历史上任何时候,进入了伟大的电气化时代。 36 | translate ==> Human society has developed at a faster pace than ever before in history, ushering in the grand era of electrification. 37 | transcribe==> 今天,我们又迎来了一轮百年未遇的科技变局。 38 | translate ==> Today, we are witnessing another round of technological transformations unprecedented in a century. 39 | ============= transcribe and translate === END === 40 | 41 | 42 | ============= summary === START === 43 | 1. 电气化时代:电灯的发明引领人类进入电气化时代,促进工业、交通发展,加速社会发展进程。 44 | 2. 数字化时代变革:当前正处于百年未遇的科技变局,数字化转型改变人类生活、生产方式及生存状态。 45 | 3. 云计算新时代:阿里云推动云计算成为新计算时代基础,让计算资源普及,如同电一般无处不在,重塑世界,激发无限想象力。 46 | 47 | 总结:从电气化到数字化,云计算正如同电一样,深刻改变世界格局,开启创新纪元。 48 | ============= summary === END === 49 | 50 | 51 | ============= QA === START === 52 | question is: 人类什么时候发明的电灯 53 | result is: 人类发明电灯大约在一百多年前,这标志着电灯首次进入了人类的生活,开启了新的时代。 54 | ============= QA === END === 55 | ``` 56 | 57 | [comment]: # (technical support of the sample) 58 | ### :point_right: 技术支持 59 | 60 | 61 | -------------------------------------------------------------------------------- /samples/speech-plus/transcribe-video-and-do-translation-summarization-and-qa/python/ossUtil.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # Copyright (C) Alibaba Group. All Rights Reserved. 
3 | # MIT License (https://opensource.org/licenses/MIT) 4 | """ 5 | ossUtil.py: Provides utility functions for Alibaba Cloud OSS object storage service. 6 | 7 | This module defines a function `upload_file_and_get_link`, which uploads a local file to OSS 8 | and returns a temporary access link after successful upload. 9 | """ 10 | 11 | import os 12 | 13 | import oss2 14 | from oss2.credentials import EnvironmentVariableCredentialsProvider 15 | 16 | 17 | def upload_file_and_get_link(local_path: str, file_name: str) -> str: 18 | """ 19 | Uploads a file to OSS and gets a timed access link. 20 | Refer: https://help.aliyun.com/zh/oss/user-guide/simple-upload 21 | 22 | Parameters: 23 | local_path (str): The local path of the file to be uploaded. 24 | file_name (str): The name of the file once uploaded to OSS. 25 | 26 | Returns: 27 | str: A timed access link provided by OSS, valid for 3600 seconds. 28 | """ 29 | # Get access credentials 30 | auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider()) 31 | 32 | # Initialize the Bucket instance 33 | bucket = oss2.Bucket(auth, 'https://oss-cn-hangzhou.aliyuncs.com', 34 | 'examplebucket') 35 | 36 | # Open the local file for uploading 37 | with open(local_path, 'rb') as fileobj: 38 | # Upload from the beginning of the file; seeking past byte 0 39 | # here would silently drop the start of the uploaded object 40 | fileobj.seek(0, os.SEEK_SET) 41 | # Upload the file to OSS 42 | bucket.put_object(file_name, fileobj) 43 | 44 | # Generate a timed access link valid for 3600 seconds 45 | url = bucket.sign_url('GET', file_name, 3600, slash_safe=True) 46 | # Output the generated link 47 | print('The signed URL is:', url) 48 | # Return the link 49 | return url 50 | -------------------------------------------------------------------------------- /samples/speech-plus/transcribe-video-and-do-translation-summarization-and-qa/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | requests 3 | #oss2 4 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_and_rich_information_from_files_by_batch_mode/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件富信息语音识别(批量模式) 3 | 本示例展示了如何批量的提交存储于云存储(例如OSS)中的音视频文件URL,并调用阿里云百炼语音识别大模型离线文件转写API,实现批量语音识别的过程。 4 | 5 | 通过使用SenseVoice语音大模型,可以对多语种语音进行识别,并同时返回情感、音频事件等富信息。音视频文件**富信息语音识别**更适合需要识别更丰富的语种、情感、音频事件等内容的场景。对一般的音视频文件语音识别场景,仍建议使用更具性价比的Paraformer模型,请参考示例:[批量音视频文件语音识别(批量模式)](../recognize_speech_from_files_by_batch_mode/)。 6 | 7 | [comment]: # (list of scenarios of the sample) 8 | ### :point_right: 适用场景 9 | 10 | | 应用场景 | 典型用法 | 使用说明 | 11 | |----------------| ----- |----------------------| 12 | | **音视频语音分析理解场景** | 音视频批量富文本语音识别 | *对音视频文件中的文本/情绪/事件进行识别* | 13 | 14 | 15 | [comment]: # (supported programming languages of the sample) 16 | ### :point_right: 编程语言 17 | - [Python](./python) 18 | - [Java](./java) 19 | 20 | [comment]: # (model and interface of the sample) 21 | ### :point_right: 参考详情 22 | 23 | | 推荐模型 | API详情 | 模型特色 | 24 | | ----- |--------------------------|-----------------------------------| 25 | | **sensevoice-v1** | [SenseVoice录音文件语音识别](https://help.aliyun.com/zh/model-studio/developer-reference/sensevoice-api) | 多达50+语种
情感识别
音频事件检测 | 26 | 27 | ### :point_right: 预期结果 28 | 运行示例,示例会访问文件转写服务,提交您输入的转写文件列表,等待识别结束。 29 | 30 | 识别结束,服务会以json列表的形式返回提交文件```file_url```和对应的识别结果文件链接```transcription_url```,您可以复制控制台打印链接到浏览器打开或下载。 31 | 32 | 在控制台,会打印每个文件对应的简略的识别结果。 33 | 34 | [comment]: # (technical support of the sample) 35 | ### :point_right: 技术支持 36 | 37 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_and_rich_information_from_files_by_batch_mode/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件富文本语音识别(批量模式) 3 | ## Java 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: 前提条件 7 | 8 | 1. #### 配置阿里云百炼API-KEY 9 | 10 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 2. #### Java运行环境 13 | 14 | 在运行本示例之前,您需要安装Java运行环境和Maven构建工具。 15 | 16 | [comment]: # (how to run the sample and expected results) 17 | ### :point_right: 运行示例 18 | 19 | 您可以通过运行run.sh (Linux, Mac系统)或run.bat (Windows系统)来运行本示例。 20 | 21 | 运行示例,录音文件识别服务将会将提交的文件列表进行后台转写。转写成功后,每个文件的识别结果,将会使用SenseVoiceParser工具解析并打印在终端。 22 | 23 | [comment]: # (technical support of the sample) 24 | ### :point_right: 技术支持 25 | 26 | 27 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_and_rich_information_from_files_by_batch_mode/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-shade-plugin 50 | 3.2.4 51 | 52 | 53 | package 54 | 55 | shade 56 | 57 | 58 | 59 | 60 | org.alibaba.speech.examples.speech_recognition.TranscriptFilesByRestfulApi 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_and_rich_information_from_files_by_batch_mode/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_and_rich_information_from_files_by_batch_mode/java/run.sh: -------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_and_rich_information_from_files_by_batch_mode/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- 
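在进入下面的Python示例之前,这里给出一个提交 sensevoice-v1 离线转写任务的最小示意(错误处理从略,文件URL为占位符,返回结构与富信息标记的具体格式请以SenseVoice API文档为准)。识别文本中会携带类似 <|HAPPY|>、<|Applause|> 的情感/事件标记,下文Python示例中的 parse_sensevoice_result 工具正是用于解析这类标记:

```python
import dashscope

# 提交批量转写任务:sensevoice-v1 与 paraformer 系列共用同一套文件转写接口
task = dashscope.audio.asr.Transcription.async_call(
    model='sensevoice-v1',
    file_urls=['https://example.com/rich_text_example.wav'])  # 占位URL
response = dashscope.audio.asr.Transcription.wait(task=task.output.task_id)

# 任务完成后,output中按文件返回 file_url 与识别结果文件链接 transcription_url
for item in response.output['results']:
    print(item['file_url'], '->', item['transcription_url'])
```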
/samples/speech-recognition/recognize_speech_and_rich_information_from_files_by_batch_mode/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件富文本语音识别 3 | ## Python 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: 前提条件 7 | 8 | 1. #### 配置阿里云百炼API-KEY 9 | 10 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 1. #### 安装Python依赖 13 | 14 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 15 | ```commandline 16 | pip3 install -r requirements.txt 17 | ``` 18 | 19 | [comment]: # (how to run the sample and expected results) 20 | ### :point_right: 运行示例 21 | 您可以使用以下命令运行本示例: 22 | 23 | ```commandline 24 | python3 run.py 25 | ``` 26 | 27 | 运行示例,录音文件识别服务将会将提交的文件列表进行后台转写。转写成功后,每个文件的识别结果,将会使用parse_sensevoice_result工具解析并打印在终端。 28 | 29 | [comment]: # (technical support of the sample) 30 | ### :point_right: 技术支持 31 | 32 | 33 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_and_rich_information_from_files_by_batch_mode/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_batch_mode/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件语音识别(批量模式) 3 | 本示例展示了如何大批量的提交存储于云存储(例如OSS)中的音视频文件URL,并调用阿里云百炼语音识别大模型离线文件转写API,实现批量语音识别的过程。 4 | 5 | 录音文件识别提供了更多音视频格式,以及更准确、信息更丰富的识别结果供用户使用。音视频文件语音识别的**批量模式**更适合对大批量云端文件进行生产任务处理、且不需要即时返回结果的场景。如果您需要对本地文件进行处理,且希望即时返回结果,请参考示例:[批量音视频文件语音识别(实时模式)](../recognize_speech_from_files_by_realtime_mode/)。 6 | 7 | [comment]: # (list of scenarios of the sample) 8 | ### :point_right: 适用场景 9 | 10 | | 应用场景 | 典型用法 | 使用说明 | 11 | |----------------| ----- |----------------------| 12 | | **音视频语音分析理解场景** | 音视频批量语音识别 | *对音视频文件进行批量语音识别* | 13 | | **会议语音分析理解场景** | 会议录音批量语音识别 | *对会议录音文件进行批量语音识别* | 14 | | **电话客服中心机器人及对话分析理解场景**| 通话录音批量语音识别 | *对客服中心通话录音文件进行批量语音识别* | 15 | 16 | [comment]: # (supported programming languages of the sample) 17 | ### :point_right: 编程语言 18 | - [Python](./python) 19 | - [Java](./java) 20 | 21 | [comment]: # (model and interface of the sample) 22 | ### :point_right: 参考详情 23 | 24 | | 推荐模型 | API详情 | 25 | |-------------------------------------------------------------|---------------------------------------------------------------------------------------------------| 26 | | **paraformer-v2**
paraformer-v1
paraformer-8k-v1
paraformer-mtl-v1 | [录音文件识别API详情](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-api)| 27 | 28 | ### :point_right: 预期结果 29 | 30 | 运行示例,示例会访问文件转写服务,提交您输入的转写文件列表,等待识别结束。 31 | 32 | 识别结束,服务会以json列表的形式返回提交文件```file_url```和对应的识别结果文件链接```transcription_url```,您可以复制控制台打印链接到浏览器打开或下载。 33 | 34 | [comment]: # (best practices) 35 | ### :point_right: 最佳实践 36 | 37 | 虽然阿里云百炼语音识别大模型的文件转写API可以兼容多种格式的音视频文件,但由于视频文件尺寸通常较大、传输较为耗时,建议对其进行预处理,仅提取需要进行语音识别的音轨,并进行合理压缩,从而显著降低文件尺寸。这样做将大大加快视频文件转写的吞吐效率。以下步骤展示了如何使用ffmpeg进行有关的预处理。 38 | 39 | 1. 安装ffmpeg:请前往[ffmpeg官方网站下载](https://www.ffmpeg.org/download.html)。也可以参考文档[如何安装ffmpeg](../../../docs/QA/ffmpeg.md)。 40 | 41 | 1. 预处理视频文件:使用ffmpeg提取视频文件中的音轨、降采样到16kHz 16bit Mono、并压缩编码为opus文件进行存储。 42 | ``` 43 | ffmpeg -i input-video-file -ac 1 -ar 16000 -acodec libopus output-audio-file.opus 44 | ``` 45 | 46 | 1. 将压缩后的纯音频文件```output-audio-file.opus```上载到云存储并获取其URL。向阿里云百炼文件转写服务提交该URL。 47 | 48 | [comment]: # (technical support of the sample) 49 | ### :point_right: 技术支持 50 | 51 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_batch_mode/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件语音识别(批量模式) 3 | ## Java 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: 前提条件 7 | 8 | 1. #### 配置阿里云百炼API-KEY 9 | 10 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 2. #### Java运行环境 13 | 14 | 在运行本示例之前,您需要安装Java运行环境和Maven构建工具。 15 | 16 | [comment]: # (how to run the sample and expected results) 17 | ### :point_right: 运行示例 18 | 19 | 您可以通过运行run.sh (Linux, Mac系统)或run.bat (Windows系统)来运行本示例。 20 | 21 | 运行示例,录音文件识别服务将会将提交的文件列表进行后台转写。转写成功后,每个文件的识别结果将会打印在终端。 22 | 23 | [comment]: # (technical support of the sample) 24 | ### :point_right: 技术支持 25 | 26 | 27 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_batch_mode/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-shade-plugin 50 | 3.2.4 51 | 52 | 53 | package 54 | 55 | shade 56 | 57 | 58 | 59 | 60 | org.alibaba.speech.examples.speech_recognition.TranscriptFilesByRestfulApi 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_batch_mode/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_batch_mode/java/run.sh: -------------------------------------------------------------------------------- 1 | 
mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_batch_mode/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_batch_mode/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件语音识别(批量模式) 3 | ## Python 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: 前提条件 7 | 8 | 1. #### 配置阿里云百炼API-KEY 9 | 10 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 1. #### 安装Python依赖 13 | 14 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 15 | ```commandline 16 | pip3 install -r requirements.txt 17 | ``` 18 | 19 | [comment]: # (how to run the sample and expected results) 20 | ### :point_right: 运行示例 21 | 您可以使用以下命令运行本示例: 22 | 23 | ```commandline 24 | python3 run.py 25 | ``` 26 | 27 | 运行示例,录音文件识别服务将会对提交的文件列表进行后台转写。转写成功后,每个文件的识别结果将会存储在一个远程JSON文件中,您可以通过URL在浏览器中查看或者下载文件进行后续处理。 28 | 29 | [comment]: # (technical support of the sample) 30 | ### :point_right: 技术支持 31 | 32 | 33 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_batch_mode/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_batch_mode/python/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group. All Rights Reserved. 3 | # MIT License (https://opensource.org/licenses/MIT) 4 | 5 | import json 6 | import os 7 | import sys 8 | from http import HTTPStatus 9 | 10 | import dashscope 11 | from dashscope.api_entities.dashscope_response import TranscriptionResponse 12 | 13 | sys.path.append( 14 | os.path.join(os.path.dirname(os.path.abspath(__file__)), 15 | '../../../utils/python')) 16 | 17 | from TranscriptionResultUtil import handle_transcription_result 18 | 19 | 20 | def init_dashscope_api_key(): 21 | """ 22 | Set your DashScope API-key. 
More information: 23 | https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md 24 | """ 25 | 26 | if 'DASHSCOPE_API_KEY' in os.environ: 27 | dashscope.api_key = os.environ[ 28 | 'DASHSCOPE_API_KEY'] # load API-key from environment variable DASHSCOPE_API_KEY 29 | else: 30 | dashscope.api_key = '' # set API-key manually 31 | 32 | 33 | def submit_transcription_job() -> TranscriptionResponse: 34 | """ 35 | Submit the transcription task files list 36 | the transcription api supports most of the common audio formats 37 | you can check supported formats and other parameters here: https://help.aliyun.com/document_detail/2712535.html 38 | transcription api supports 100 files at most in one job, and each file size should be less than 2GB 39 | """ 40 | # 41 | 42 | # Submit the transcription task 43 | task_response = dashscope.audio.asr.Transcription.async_call( 44 | model='paraformer-v2', 45 | # 'paraformer-8k-v1', 'paraformer-mtl-v1' 46 | file_urls=[ 47 | 'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/sensevoice/rich_text_example_1.wav', 48 | 'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/sensevoice/sample_video_poetry.mp4', 49 | 'https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/sensevoice/long_audio_demo_cn.mp3' 50 | ]) 51 | # This is the description of 'file_urls'. 52 | # You need to provide a URL from which the file can be downloaded via HTTP. 53 | # Typically, we can **store these files in public cloud storage services (such as Alibaba Cloud OSS)** 54 | # and share a publicly accessible link. 55 | # Note that it is best to add an expiration time to these links, 56 | # to prevent third-party access if the file address is leaked. 57 | return task_response 58 | 59 | 60 | def retrieve_transcription_result( 61 | transcription_response: TranscriptionResponse) -> None: 62 | """ 63 | get the transcription result 64 | """ 65 | 66 | transcribe_response = dashscope.audio.asr.Transcription.wait( 67 | task=transcription_response.output.task_id) 68 | if transcribe_response.status_code == HTTPStatus.OK: 69 | print('transcription result : ') 70 | print( 71 | json.dumps(transcribe_response.output, 72 | indent=4, 73 | ensure_ascii=False)) 74 | # you will get the transcription result in the transcribe_response.output by param : transcription_url 75 | # transcription_url is a downloadable file of json format transcription result 76 | handle_transcription_result(transcribe_response) 77 | 78 | 79 | # run the transcription script 80 | if __name__ == '__main__': 81 | init_dashscope_api_key() 82 | transcription_response = submit_transcription_job() 83 | print('transcription task id: ', transcription_response.output.task_id) 84 | retrieve_transcription_result(transcription_response) 85 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_realtime_mode/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件语音识别(实时模式) 3 | 本示例展示了如何批量的调用实时语音识别接口,实现多个文件流的输入,并实时返回多个文件对应的识别结果。 4 | 5 | 音视频文件语音识别的**实时模式**更适合对本地文件进行处理且即时返回结果,或搭建处理流式音频的服务端,收集前端的音频流,即时返回识别结果的场景。 6 | 7 | 如果您使用Java搭建语音服务,请参考[高并发示例文档](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-in-high-concurrency-scenarios)获得最佳的性能。 8 | 9 | 如果您需要对大批量云端文件进行生产任务处理、且不需要即时返回结果,请参考示例:[批量音视频文件语音识别(批量模式)](../recognize_speech_from_files_by_batch_mode//)。 10 | 11 | [comment]: # (list of scenarios of 
the sample) 12 | ### :point_right: 适用场景 13 | 14 | | 应用场景 | 典型用法 | 使用说明 | 15 | |---------|---------|-------------------| 16 | | **入门场景**| 音视频文件语音识别 | *批量对音视频文件进行语音识别* | 17 | 18 | [comment]: # (supported programming languages of the sample) 19 | ### :point_right: 编程语言 20 | - [Python](./python) 21 | - [Java](./java) 22 | 23 | [comment]: # (model and interface of the sample) 24 | ### :point_right: 参考详情 25 | 26 | | 推荐模型 | API详情 | 27 | | ----- | ----- | 28 | | **paraformer-realtime-v2**
paraformer-realtime-v1
paraformer-realtime-8k-v1 | [Paraformer实时语音识别API详情](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-real-time-speech-recognition-api) | 29 | 30 | 31 | ### :point_right: 预期结果 32 | 33 | 运行示例,在控制台会输出流式识别结果。每个文件的识别对应一个[process id], 每个文件的结果会不断返回增量的结果。如 34 | ``` 35 | //process: 51389 对应sample_audio.mp3的识别结果1 36 | [process 51389]RecognitionCallback text: 那河畔的金柳是夕阳中的 37 | 38 | //process: 51392 对应sample_video_story.mp4的识别结果 39 | [process 51392]RecognitionCallback text: 在一个阳光明媚的早晨,鸭妈妈决定带着小鸭子们 40 | 41 | //process: 51389 对应sample_audio.mp3的识别结果2 42 | [process 51389]RecognitionCallback text: 那河畔的金柳是夕阳中的新娘。 43 | ``` 44 | 45 | [comment]: # (technical support of the sample) 46 | ### :point_right: 技术支持 47 | 48 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_realtime_mode/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件语音识别(实时模式) 3 | 批量音视频文件语音识别(实时模式)是指并发的将多个音视频文件通过实时的方式将语音数据发送给语音识别服务,并实时地将语音转换为文字的过程。 4 | 5 | 如果您使用Java搭建语音服务请参考[高并发示例文档](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-in-high-concurrency-scenarios)获得最佳的性能。 6 | 7 | ## Java 8 | 9 | [comment]: # (prerequisites) 10 | ### :point_right: 前提条件 11 | 12 | 1. #### 配置阿里云百炼API-KEY 13 | 14 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 15 | 16 | 2. #### Java运行环境 17 | 18 | 在运行本示例之前,您需要安装Java运行环境和Maven构建工具。 19 | 20 | 21 | [comment]: # (how to run the sample and expected results) 22 | ### :point_right: 运行示例 23 | 24 | 您可以通过运行run.sh (Linux, Mac系统)或run.bat (Windows系统)来运行本示例。 25 | 26 | 示例使用了对象池和线程池实现并发运行。在示例运行时,程序会并发的读取您输入的多个音视频文件,将其独立的转为实时识别结果并分别以callback的方式回调。 27 | 28 | [comment]: # (technical support of the sample) 29 | ### :point_right: 技术支持 30 | 31 | 32 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_realtime_mode/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | org.apache.commons 47 | commons-pool2 48 | 2.11.1 49 | 50 | 51 | 52 | 53 | 54 | org.apache.maven.plugins 55 | maven-shade-plugin 56 | 3.2.4 57 | 58 | 59 | package 60 | 61 | shade 62 | 63 | 64 | 65 | 66 | org.alibaba.speech.examples.speech_recognition.RecognizeSpeechFromFilesByAsyncRealtimeApi 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_realtime_mode/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_realtime_mode/java/run.sh: 
-------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_realtime_mode/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_realtime_mode/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件语音识别(实时模式) 3 | 批量音视频文件语音识别(实时模式)是指并发的将多个音视频文件通过实时的方式将语音数据发送给语音识别服务,并实时地将语音转换为文字的过程。 4 | 5 | 6 | ## Python 7 | 8 | [comment]: # (prerequisites) 9 | ### :point_right: 前提条件 10 | 11 | 1. #### 配置阿里云百炼API-KEY 12 | 13 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 14 | 15 | 1. #### 安装ffmpeg 16 | 17 | 示例需要用到ffmpeg进行音视频解码。推荐从官方网站下载安装,并将ffmpeg安装路径配置进环境变量:[ffmpeg官方网站下载](https://www.ffmpeg.org/download.html)。也可以参考文档[如何安装ffmpeg](../../../docs/QA/ffmpeg.md)。 18 | 19 | 1. #### 安装Python依赖 20 | 21 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 22 | ```commandline 23 | pip3 install -r requirements.txt 24 | ``` 25 | 26 | [comment]: # (how to run the sample and expected results) 27 | ### :point_right: 运行示例 28 | - 您可以使用以下命令运行本示例: 29 | 30 | ```commandline 31 | python3 run.py 32 | ``` 33 | 34 | 示例使用了multiprocessing实现并发运行。在示例运行时,程序会并发的读取您输入的多个音视频文件,将其独立的转为实时识别结果并分别以callback的方式回调识别结果。 35 | 36 | [comment]: # (technical support of the sample) 37 | ### :point_right: 技术支持 38 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_files_by_realtime_mode/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_microphone/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 麦克风实时语音识别 3 | 本示例展示了如何从麦克风录制音频,并将获取的音频流发送至阿里云百炼模型服务进行实时语音识别。运行示例时,用户对麦克风所说的内容会被实时显示在屏幕上。我们使用VAD断句从而获得更快的响应速度。 4 | 5 | [comment]: # (list of scenarios of the sample) 6 | ### :point_right: 适用场景 7 | 8 | | 应用场景 | 典型用法 | 使用说明 | 9 | |----------| ----- | ----- | 10 | | **入门场景** | 麦克风语音识别 | *实时从麦克风录音并进行语音识别* | 11 | | **电话客服中心机器人及对话分析理解场景** | 实时通话语音识别 | *实时对电话系统通话进行语音识别* | 12 | | **会议语音分析理解场景** | 实时会议语音识别 | *实时对会议语音进行语音识别* | 13 | 14 | [comment]: # (supported programming languages of the sample) 15 | ### :point_right: 编程语言 16 | - [Python](./python) 17 | - [Java](./java) 18 | 19 | [comment]: # (model and interface of the sample) 20 | ### :point_right: 参考详情 21 | 22 | | 推荐模型 | API详情 | 23 | | ----- | ----- | 24 | | **paraformer-realtime-v2**
paraformer-realtime-v1
paraformer-realtime-8k-v1 | [Paraformer实时语音识别API详情](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-real-time-speech-recognition-api) | 25 | 26 | ### :point_right: 预期结果 27 | 28 | 运行示例,在控制台会提示您开始说话,控制台输入'Ctrl+C' 即可结束识别。识别结果文本会在控制台打印。 29 | ```text 30 | Press 'Ctrl+C' to stop recording and recognition 31 | RecognitionCallback text: 一 32 | RecognitionCallback text: 1234 33 | ``` 34 | 35 | [comment]: # (technical support of the sample) 36 | ### :point_right: 技术支持 37 | 38 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_microphone/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 麦克风实时语音识别 3 | ## Java 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: 前提条件 7 | 8 | 1. #### 配置阿里云百炼API-KEY 9 | 10 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 2. #### Java运行环境 13 | 14 | 在运行本示例之前,您需要安装Java运行环境和Maven构建工具。 15 | 16 | 17 | [comment]: # (how to run the sample and expected results) 18 | ### :point_right: 运行示例 19 | 20 | 您可以通过运行run.sh (Linux, Mac系统)或run.bat (Windows系统)来运行本示例。 21 | 22 | 示例运行时,您通过麦克风所录制的语音将会被实时识别为文字,这些文字会打印在屏幕上。当从键盘上按下“Ctrl+C”时,示例将停止运行。 23 | 24 | [comment]: # (technical support of the sample) 25 | ### :point_right: 技术支持 26 | 27 | 28 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_microphone/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.16.10-CHAT-PRE-SNAPSHOT 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-shade-plugin 50 | 3.2.4 51 | 52 | 53 | package 54 | 55 | shade 56 | 57 | 58 | 59 | 60 | org.alibaba.speech.examples.speech_recognition.RecognizeSpeechFromMicrophoneUsingFlowable 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_microphone/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_microphone/java/run.sh: -------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_microphone/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 
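麦克风实时识别的核心流程是:建立识别连接后,循环读取麦克风PCM数据并逐帧发送,识别结果通过回调异步返回。下面是一个与本目录示例等价的简化示意(省略了错误处理与优雅退出,回调方法与结果字段请以DashScope SDK文档及下文的Python示例为准):

```python
import pyaudio
import dashscope
from dashscope.audio.asr import (Recognition, RecognitionCallback,
                                 RecognitionResult)


class MyCallback(RecognitionCallback):
    def on_event(self, result: RecognitionResult) -> None:
        # 每次回调携带当前句子的增量识别结果
        sentence = result.get_sentence()
        if 'text' in sentence:
            print('RecognitionCallback text:', sentence['text'])


recognition = Recognition(model='paraformer-realtime-v2', format='pcm',
                          sample_rate=16000, callback=MyCallback())
recognition.start()

mic = pyaudio.PyAudio()
stream = mic.open(format=pyaudio.paInt16, channels=1, rate=16000,
                  frames_per_buffer=3200, input=True)
try:
    while True:  # 按 Ctrl+C 结束录音与识别
        recognition.send_audio_frame(stream.read(3200))
except KeyboardInterrupt:
    recognition.stop()
    stream.stop_stream()
    stream.close()
    mic.terminate()
```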
-------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_microphone/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 麦克风实时语音识别 3 | ## Python 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: 前提条件 7 | 8 | 1. #### 配置阿里云百炼API-KEY 9 | 10 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 1. #### 安装Python依赖 13 | 14 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 15 | ```commandline 16 | pip3 install -r requirements.txt 17 | ``` 18 | 19 | [comment]: # (how to run the sample and expected results) 20 | ### :point_right: 运行示例 21 | 您可以使用以下命令运行本示例: 22 | 23 | ```commandline 24 | python3 run.py 25 | ``` 26 | 27 | 示例运行时,您通过麦克风所录制的语音将会被实时识别为文字,这些文字会打印在屏幕上。当从键盘上按下“Ctrl+C”时,示例将停止运行。 28 | 29 | [comment]: # (technical support of the sample) 30 | ### :point_right: 技术支持 31 | 32 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_microphone/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | pyaudio 3 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_single_file/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 语音识别本地的单个文件 3 | 本示例展示了如何对一个本地音视频文件进行语音识别。示例会读取本地的wav格式音频文件,并调用阿里云百炼语音识别大模型API,实现语音转文字的过程。 4 | 5 | [comment]: # (list of scenarios of the sample) 6 | ### :point_right: 适用场景 7 | 8 | | 应用场景 | 典型用法 | 使用说明 | 9 | |----------|-----------|-----------------| 10 | | **入门场景** | 音视频文件语音识别 | *对音视频文件进行语音识别* | 11 | 12 | [comment]: # (supported programming languages of the sample) 13 | ### :point_right: 编程语言 14 | - [Python](./python) 15 | - [Java](./java) 16 | 17 | [comment]: # (model and interface of the sample) 18 | ### :point_right: 参考详情 19 | 20 | | 推荐模型 | API详情 | 21 | | ----- | ----- | 22 | | **paraformer-realtime-v2**
paraformer-realtime-v1
paraformer-realtime-8k-v1 | [Paraformer实时语音识别API详情](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-real-time-speech-recognition-api) | 23 | 24 | ### :point_right: 预期结果 25 | 26 | 完整的识别结果会以json格式保存在```result.json```文件中。完整结果包含句级别和字级别的时间戳信息等。语音识别的纯文本会同时在控制台打印: 27 | ```text 28 | The brief result is: 29 | Hello world, 这里是阿里巴巴语音实验室。 30 | [Metric] requestId: 3d53b7bf-0bb2-4b4d-96e2-f42caa3eab92, first package delay ms: 1505, last package delay ms: 244 31 | ``` 32 | 33 | [comment]: # (technical support of the sample) 34 | ### :point_right: 技术支持 35 | 36 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_single_file/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 语音识别本地的单个文件 3 | ## Java 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: 前提条件 7 | 8 | 1. #### 配置阿里云百炼API-KEY 9 | 10 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 2. #### Java运行环境 13 | 14 | 在运行本示例之前,您需要安装Java运行环境和Maven构建工具。 15 | 16 | 17 | [comment]: # (how to run the sample and expected results) 18 | ### :point_right: 运行示例 19 | 20 | 您可以通过运行run.sh (Linux, Mac系统)或run.bat (Windows系统)来运行本示例。 21 | 22 | 示例程序会通过调用call()接口提交文件,并同步返回识别结果。完整的识别结果会以json格式保存在```result.json```文件中。完整结果包含句级别和字级别的时间戳信息等。 23 | 24 | [comment]: # (technical support of the sample) 25 | ### :point_right: 技术支持 26 | 27 | 28 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_single_file/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-shade-plugin 50 | 3.2.4 51 | 52 | 53 | package 54 | 55 | shade 56 | 57 | 58 | 59 | 60 | org.alibaba.speech.examples.speech_recognition.RecognizeSpeechFromSingleFile 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_single_file/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_single_file/java/run.sh: -------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_single_file/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 
| 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_single_file/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 语音识别本地的单个文件 3 | ## Python 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: 前提条件 7 | 8 | 1. #### 配置阿里云百炼API-KEY 9 | 10 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md)。 11 | 12 | 2. #### 安装Python依赖 13 | 14 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 15 | ```commandline 16 | pip3 install -r requirements.txt 17 | ``` 18 | 19 | [comment]: # (how to run the sample and expected results) 20 | ### :point_right: 运行示例 21 | 您可以使用以下命令运行本示例: 22 | 23 | ```commandline 24 | python3 run.py 25 | ``` 26 | 27 | 示例运行时,会通过同步接口识别本地wav音频文件,并通过控制台打印结果。完整的识别结果会以json格式保存在```result.json```文件中。完整结果包含句级别和字级别的时间戳信息等。 28 | 29 | [comment]: # (technical support of the sample) 30 | ### :point_right: 技术支持 31 | 32 | 33 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_single_file/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | -------------------------------------------------------------------------------- /samples/speech-recognition/recognize_speech_from_single_file/python/run.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # Copyright (C) Alibaba Group. All Rights Reserved. 3 | # MIT License (https://opensource.org/licenses/MIT) 4 | 5 | import json 6 | import os 7 | import sys 8 | 9 | import dashscope 10 | 11 | from dashscope.audio.asr import * 12 | 13 | 14 | def init_dashscope_api_key(): 15 | """ 16 | Set your DashScope API-key. 
More information: 17 | https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md 18 | """ 19 | 20 | if 'DASHSCOPE_API_KEY' in os.environ: 21 | dashscope.api_key = os.environ[ 22 | 'DASHSCOPE_API_KEY'] # load API-key from environment variable DASHSCOPE_API_KEY 23 | else: 24 | dashscope.api_key = '' # set API-key manually 25 | 26 | 27 | # main function 28 | if __name__ == '__main__': 29 | init_dashscope_api_key() 30 | 31 | # Please replace the path with your audio file path 32 | current_dir = os.path.dirname(os.path.abspath(__file__)) 33 | file_path = os.path.join(current_dir, '../../..', 'sample-data', 34 | 'hello_world_male_16k_16bit_mono.wav') 35 | print('Input file is: %s' % file_path) 36 | 37 | recognition = Recognition( 38 | model='paraformer-realtime-v2', 39 | format='wav', 40 | sample_rate=16000, 41 | callback=None, 42 | ) 43 | result = recognition.call(file_path) 44 | sentence_list = result.get_sentence() 45 | if sentence_list is None: 46 | print('No result') 47 | print(result) 48 | else: 49 | print('The brief result is: ') 50 | for sentence in sentence_list: 51 | print(sentence['text']) 52 | print( 53 | '[Metric] requestId: {}, first package delay ms: {}, last package delay ms: {}' 54 | .format( 55 | recognition.get_last_request_id(), 56 | recognition.get_first_package_delay(), 57 | recognition.get_last_package_delay(), 58 | )) 59 | if sentence_list is not None: 60 | with open('result.json', 'w', encoding='utf-8') as f: 61 | f.write(json.dumps(sentence_list, indent=4)) 62 | print('Full recognition result is saved into file: result.json') 63 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_by_realtime_mode/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件语音翻译(实时模式) 3 | 本示例展示了如何批量的调用实时语音翻译接口,实现多个文件流的输入,并实时返回多个文件对应的翻译结果。 4 | 5 | 音视频文件语音翻译的**实时模式**更适合对本地文件进行处理且即时返回结果,或搭建处理流式音频的服务端,收集前端的音频流,即时返回翻译结果的场景。 6 | 7 | [comment]: # (list of scenarios of the sample) 8 | ### :point_right: 适用场景 9 | 10 | | 应用场景 | 典型用法 | 使用说明 | 11 | |----------| ----- | ----- | 12 | | **实时双语字幕** | 自动生成音视频不同语言字幕 | *实时对视频流进行语音识别和翻译,生成双语字幕* | 13 | | **会议语音分析理解场景** | 实时会议语音识别 | *实时对会议语音进行语音识别并翻译* | 14 | 15 | [comment]: # (supported programming languages of the sample) 16 | ### :point_right: 编程语言 17 | - [Python](./python) 18 | - [Java](./java) 19 | 20 | [comment]: # (model and interface of the sample) 21 | ### :point_right: 参考详情 22 | 23 | | 推荐模型 | API详情 | 24 | | ----- | ----- | 25 | | **gummy-realtime-v1** | [Paraformer实时语音识别API详情(TODO:更新)](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-real-time-speech-recognition-api) | 26 | 27 | 28 | ### :point_right: 预期结果 29 | 30 | 运行示例,在控制台会输出流式识别结果。每个文件的识别对应一个[process id], 每个文件的结果会不断返回增量的结果。如 31 | ``` 32 | translation with file :hello_world_male_16k_16bit_mono.wav 33 | [process 94459] TranslationRecognizerCallback open. 34 | translation with file :hello_world_male_16k_16bit_mono.wav 35 | [process 94461] TranslationRecognizerCallback open. 36 | [process 94459] Transcript ==> hello ,word,这里是阿里巴巴语音实验室。 37 | [process 94459] Translate ==> Hello, world. This is Alibaba's voice lab. 38 | [process 94459] Translation completed 39 | [process 94459] TranslationRecognizerCallback close. 
40 | [Metric] requestId: xxxxxxxxx, first package delay ms: 448.789794921875, last package delay ms: 1169.598876953125 41 | [process 94461] Transcript ==> hello ,word,这里是阿里巴巴语音实验室。 42 | [process 94461] Translate ==> Hello, world. This is Alibaba's voice lab. 43 | [process 94461] Translation completed 44 | [process 94461] TranslationRecognizerCallback close. 45 | [Metric] requestId: xxxxxxxxx, first package delay ms: 409.506103515625, last package delay ms: 1175.384033203125 46 | ``` 47 | 48 | [comment]: # (technical support of the sample) 49 | ### :point_right: 技术支持 50 | 51 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_by_realtime_mode/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件语音翻译(实时模式) 3 | 批量音视频文件语音翻译(实时模式)是指并发的将多个音频文件通过实时的方式将语音数据发送给语音翻译服务,并实时地返回翻译结果文本的过程。 4 | 5 | 如果您使用Java搭建语音服务请参考[高并发示例文档](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-in-high-concurrency-scenarios)获得最佳的性能。 6 | 7 | ## Java 8 | 9 | [comment]: # (prerequisites) 10 | ### :point_right: 前提条件 11 | 12 | 1. #### 配置阿里云百炼API-KEY 13 | 14 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 15 | 16 | 2. #### Java运行环境 17 | 18 | 在运行本示例之前,您需要安装Java运行环境和Maven构建工具。 19 | 20 | 21 | [comment]: # (how to run the sample and expected results) 22 | ### :point_right: 运行示例 23 | 24 | 您可以通过运行run.sh (Linux, Mac系统)或run.bat (Windows系统)来运行本示例。 25 | 26 | 示例使用了对象池和线程池实现并发运行。在示例运行时,程序会并发的读取您输入的多个音视频文件,将其独立的转为实时识别结果并分别以callback的方式回调。 27 | 28 | [comment]: # (technical support of the sample) 29 | ### :point_right: 技术支持 30 | 31 | 32 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_by_realtime_mode/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | org.apache.commons 47 | commons-pool2 48 | 2.11.1 49 | 50 | 51 | 52 | 53 | 54 | org.apache.maven.plugins 55 | maven-shade-plugin 56 | 3.2.4 57 | 58 | 59 | package 60 | 61 | shade 62 | 63 | 64 | 65 | 66 | org.alibaba.speech.examples.speech_recognition.TranslateFromFilesByAsyncRealtimeApi 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_by_realtime_mode/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_by_realtime_mode/java/run.sh: -------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar 
target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_by_realtime_mode/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_by_realtime_mode/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音视频文件语音翻译(实时模式) 3 | 批量音视频文件语音翻译(实时模式)是指并发的将多个音频文件通过实时的方式将语音数据发送给语音翻译服务,并实时地返回翻译结果文本的过程。 4 | 5 | 6 | ## Python 7 | 8 | [comment]: # (prerequisites) 9 | ### :point_right: 前提条件 10 | 11 | 1. #### 配置阿里云百炼API-KEY 12 | 13 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 14 | 15 | 2. #### 安装Python依赖 16 | 17 | 阿里云百炼SDK运行环境需要Python 3.8及以上版本。您可以使用以下命令来安装本示例的依赖: 18 | ```commandline 19 | pip3 install -r requirements.txt 20 | ``` 21 | 22 | [comment]: # (how to run the sample and expected results) 23 | ### :point_right: 运行示例 24 | - 您可以使用以下命令运行本示例: 25 | 26 | ```commandline 27 | python3 run.py 28 | ``` 29 | 30 | 示例使用了multiprocessing实现并发运行。在示例运行时,程序会并发的读取您输入的多个音频文件,将其独立的转为实时识别结果并分别以callback的方式回调识别结果。 31 | 32 | [comment]: # (technical support of the sample) 33 | ### :point_right: 技术支持 34 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_by_realtime_mode/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_for_one_sentence_by_realtime_mode/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音频文件一句话语音识别和翻译(实时模式) 3 | 本示例展示了如何批量的调用一句话语音翻译接口,实现多个文件流的输入,并实时返回多个文件对应的识别结果。 4 | 5 | 音频文件一句话语音识别和翻译的**实时模式**更适合对本地文件进行处理且即时返回结果,或搭建处理流式音频的服务端,收集前端的音频流,即时返回识别结果的场景。 6 | 7 | 如果您使用Java搭建语音服务,请参考[高并发示例文档](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-in-high-concurrency-scenarios)获得最佳的性能。 8 | 9 | [comment]: # (list of scenarios of the sample) 10 | ### :point_right: 适用场景 11 | 12 | | 应用场景 | 典型用法 | 使用说明 | 13 | |----------| ----- | ----- | 14 | | **入门场景** | 麦克风语音翻译 | *实时从麦克风录音并进行语音翻译* | 15 | | **实时双语字幕** | 自动生成音视频不同语言字幕 | *实时对视频流进行语音识别和翻译,生成双语字幕* | 16 | | **会议语音分析理解场景** | 实时会议语音识别 | *实时对会议语音进行语音识别并翻译* | 17 | 18 | [comment]: # (supported programming languages of the sample) 19 | ### :point_right: 编程语言 20 | - [Python](./python) 21 | - [Java](./java) 22 | 23 | [comment]: # (model and interface of the sample) 24 | ### :point_right: 参考详情 25 | 26 | | 推荐模型 | API详情 | 27 | | ----- | ----- | 28 | | **gummy-chat-v1** | [Paraformer实时语音识别API详情(TODO:更新)](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-real-time-speech-recognition-api) | 29 | 30 | 31 | ### :point_right: 预期结果 32 | 33 | 运行示例,在控制台会输出流式识别结果。每个文件的识别对应一个[process id], 每个文件的结果会不断返回增量的结果。如 34 | ``` 35 | translation with file :asr_example_chat.wav 36 | [process 92485] TranslationRecognizerCallback open. 
37 | translation with file :asr_example_chat.wav 38 | [process 92483] TranslationRecognizerCallback open. 39 | [process 92483] Transcript ==> hello,word这里是阿里巴巴语音实验室。 40 | [process 92483] Translate ==> Hello world, this is the Alibaba Speech Lab. 41 | [process 92483] Translation completed 42 | [process 92485] Transcript ==> hello,word,这里是阿里巴巴语音实验室。 43 | [process 92485] Translate ==> Hello, world. This is the Alibaba Speech Lab. 44 | [process 92485] Translation completed 45 | [process 92483] TranslationRecognizerCallback close. 46 | [Metric] requestId: xxxxxxxx, first package delay ms: 444.7109375, last package delay ms: 665.9140625 47 | [process 92485] TranslationRecognizerCallback close. 48 | [Metric] requestId: xxxxxxxx, first package delay ms: 626.47509765625, last package delay ms: 963.182861328125 49 | ``` 50 | 51 | [comment]: # (technical support of the sample) 52 | ### :point_right: 技术支持 53 | 54 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_for_one_sentence_by_realtime_mode/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 批量音频文件一句话语音识别和翻译(实时模式) 3 | 批量音频文件一句话语音识别和翻译(实时模式)是指并发的将多个音频文件通过实时的方式将语音数据发送给一句话语音翻译服务,并实时地返回翻译结果文本的过程。 4 | 5 | 如果您使用Java搭建语音服务请参考[高并发示例文档](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-in-high-concurrency-scenarios)获得最佳的性能。 6 | 7 | ## Java 8 | 9 | [comment]: # (prerequisites) 10 | ### :point_right: 前提条件 11 | 12 | 1. #### 配置阿里云百炼API-KEY 13 | 14 | 在运行本示例之前,您需要开通阿里云账号、获得阿里云百炼API_KEY,并进行必要的环境配置。有关API-KEY的详细配置步骤请参考:[PREREQUISITES.md](../../../../PREREQUISITES.md) 15 | 16 | 2. #### Java运行环境 17 | 18 | 在运行本示例之前,您需要安装Java运行环境和Maven构建工具。 19 | 20 | 21 | [comment]: # (how to run the sample and expected results) 22 | ### :point_right: 运行示例 23 | 24 | 您可以通过运行run.sh (Linux, Mac系统)或run.bat (Windows系统)来运行本示例。 25 | 26 | 示例使用了对象池和线程池实现并发运行。在示例运行时,程序会并发的读取您输入的多个音视频文件,将其独立的转为实时识别结果并分别以callback的方式回调。 27 | 28 | [comment]: # (technical support of the sample) 29 | ### :point_right: 技术支持 30 | 31 | 32 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_for_one_sentence_by_realtime_mode/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | org.apache.commons 47 | commons-pool2 48 | 2.11.1 49 | 50 | 51 | 52 | 53 | 54 | org.apache.maven.plugins 55 | maven-shade-plugin 56 | 3.2.4 57 | 58 | 59 | package 60 | 61 | shade 62 | 63 | 64 | 65 | 66 | org.alibaba.speech.examples.speech_recognition.TranslateFromFilesForOneSentenceByAsyncApi 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_for_one_sentence_by_realtime_mode/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar 
target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_for_one_sentence_by_realtime_mode/java/run.sh: -------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_for_one_sentence_by_realtime_mode/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_for_one_sentence_by_realtime_mode/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Batch one-sentence speech recognition and translation of audio files (real-time mode) 3 | Batch one-sentence speech recognition and translation of audio files in real-time mode concurrently streams the speech data of multiple audio files to the one-sentence speech translation service and returns the translated text in real time. 4 | 5 | ## Python 6 | 7 | [comment]: # (prerequisites) 8 | ### :point_right: Prerequisites 9 | 10 | 1. #### Configure your Alibaba Cloud Bailian API-KEY 11 | 12 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 13 | 14 | 1. #### Install ffmpeg 15 | 16 | The sample uses ffmpeg to decode audio and video. We recommend downloading it from the official site and adding the ffmpeg install path to your environment variables: [ffmpeg official downloads](https://www.ffmpeg.org/download.html). You can also refer to the doc [how to install ffmpeg](../../../docs/QA/ffmpeg.md). 17 | 18 | 1. #### Install the Python dependencies 19 | 20 | The Alibaba Cloud Bailian SDK requires Python 3.8 or later. You can install this sample's dependencies with: 21 | ```commandline 22 | pip3 install -r requirements.txt 23 | ``` 24 | 25 | [comment]: # (how to run the sample and expected results) 26 | ### :point_right: Run the sample 27 | - You can run this sample with: 28 | 29 | ```commandline 30 | python3 run.py 31 | ``` 32 | 33 | The sample uses multiprocessing for concurrency. While it runs, the program reads the audio/video files you provide concurrently, transcribes each one independently in real time, and delivers each file's recognition results through its own callback. 34 | 35 | [comment]: # (technical support of the sample) 36 | ### :point_right: Technical support 37 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_files_for_one_sentence_by_realtime_mode/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_one_sentence/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Real-time one-sentence speech recognition and translation from the microphone 3 | This sample shows how to record audio from the microphone and send the captured audio stream to the Alibaba Cloud Bailian model service for real-time one-sentence speech translation. When the sample runs, the first sentence you speak into the microphone is translated into English in real time and shown on screen. 4 | 5 | [comment]: # (list of scenarios of the sample) 6 | ### :point_right: Applicable scenarios 7 | 8 | | Scenario | Typical usage | Notes | 9 | |----------| ----- | ----- | 10 | | **Getting started** | Microphone speech translation | *Record from the microphone and translate speech in real time* | 11 | | **Real-time bilingual subtitles** | Automatically generate subtitles in different languages for audio/video | *Recognize and translate a video stream in real time to produce bilingual subtitles* | 12 | | **Meeting speech analytics** | Real-time meeting transcription | *Recognize and translate meeting speech in real time* | 13 | 14 | [comment]: # (supported programming languages of the sample) 15 | ### :point_right: Programming languages 16 | - [Python](./python) 17 | - [Java](./java) 18 | 19 | [comment]: # (model and interface of the sample) 20 | ### :point_right: Reference details 21 | 22 | | Recommended model | API details | 23 | | ----- | ----- | 24 |
**gummy-realtime-v1** | [Paraformer real-time speech recognition API details (TODO: update)](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-real-time-speech-recognition-api) | 25 | 26 | ### :point_right: Expected results 27 | 28 | When the sample runs, the console prompts you to start speaking. The server uses VAD (Voice Activity Detection) to detect the end of speech, so recording stops automatically once you stop talking. Recognized text is printed to the console, and the recorded audio is saved to a `request_id_record.pcm` file. 29 | ```text 30 | [log] Initializing ... 31 | [log] TranslationRecognizerCallback open. 32 | [log] Recording... 33 | [log] Translation started, request_id: a66eac0a04a24dddadeca3acc4a64c01 34 | translation will stop after recording one sentence... 35 | - - - - - - - - - - - 36 | [2024-12-19 14:00:06.757] transcript : 测试一 37 | [2024-12-19 14:00:06.757] translate to en: Test 38 | - - - - - - - - - - - 39 | [2024-12-19 14:00:06.956] transcript : 测试一 40 | [2024-12-19 14:00:06.956] translate to en: Test 41 | - - - - - - - - - - - 42 | [2024-12-19 14:00:07.154] transcript : 测试一 43 | [2024-12-19 14:00:07.154] translate to en: Test 44 | - - - - - - - - - - - 45 | [2024-12-19 14:00:07.353] transcript : 测试一句话识别。 46 | [2024-12-19 14:00:07.353] translate to en: Test sentence recognition. 47 | - - - - - - - - - - - 48 | [2024-12-19 14:00:07.553] transcript : 测试一句话识别。 49 | [2024-12-19 14:00:07.554] <=== [vad pre_end] silence start at 1560 ms, detected at 2000 ms ===> 50 | [2024-12-19 14:00:07.554] translate to en: Test sentence recognition. 51 | - - - - - - - - - - - 52 | [2024-12-19 14:00:07.955] transcript : 测试一句话识别。 53 | [2024-12-19 14:00:07.955] translate to en: Test sentence recognition. 54 | - - - - - - - - - - - 55 | [2024-12-19 14:00:08.157] transcript : 测试一句话识别。 56 | [2024-12-19 14:00:08.157] translate to en: Test sentence recognition. 57 | request id: a66eac0a04a24dddadeca3acc4a64c01 usage: {'duration': 4} 58 | [log] sentence end, stop sending 59 | [log] Translation completed. 60 | [log] TranslationRecognizerCallback close. 61 | [log] Recorded audio saved to a66eac0a04a24dddadeca3acc4a64c01_record.pcm 62 | ``` 63 | 64 | [comment]: # (technical support of the sample) 65 | ### :point_right: Technical support 66 | 67 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_one_sentence/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Real-time one-sentence speech recognition and translation from the microphone 3 | ## Java 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: Prerequisites 7 | 8 | 1. #### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 2.
#### Java runtime environment 13 | 14 | Before running this sample, you need to install a Java runtime environment and the Maven build tool. 15 | 16 | 17 | [comment]: # (how to run the sample and expected results) 18 | ### :point_right: Run the sample 19 | 20 | You can run this sample with run.sh (Linux, macOS) or run.bat (Windows). 21 | 22 | After it starts, the sample records the first sentence you speak into the microphone and translates it into English in real time, printing the text on screen. The sample relies on VAD end-of-speech detection, so recording stops automatically once you stop talking. The recorded audio is saved to a `request_id_record.pcm` file. 23 | 24 | [comment]: # (technical support of the sample) 25 | ### :point_right: Technical support 26 | 27 | 28 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_one_sentence/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-shade-plugin 50 | 3.2.4 51 | 52 | 53 | package 54 | 55 | shade 56 | 57 | 58 | 59 | 60 | org.alibaba.speech.examples.speech_recognition.OneSentenceTranslateFromMic 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_one_sentence/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_one_sentence/java/run.sh: -------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_one_sentence/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_one_sentence/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Real-time one-sentence speech recognition and translation from the microphone 3 | ## Python 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: Prerequisites 7 | 8 | 1. #### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 1.
#### Install the Python dependencies 13 | 14 | The Alibaba Cloud Bailian SDK requires Python 3.8 or later. You can install this sample's dependencies with: 15 | ```commandline 16 | pip3 install -r requirements.txt 17 | ``` 18 | 19 | [comment]: # (how to run the sample and expected results) 20 | ### :point_right: Run the sample 21 | You can run this sample with: 22 | 23 | ```commandline 24 | python3 run.py 25 | ``` 26 | 27 | After it starts, the sample records the first sentence you speak into the microphone and translates it into English in real time, printing the text on screen. The sample relies on VAD end-of-speech detection, so recording stops automatically once you stop talking. The recorded audio is saved to a `request_id_record.pcm` file. 28 | 29 | [comment]: # (technical support of the sample) 30 | ### :point_right: Technical support 31 | 32 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_one_sentence/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | pyaudio 3 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_realtime_stream/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Real-time speech translation from the microphone 3 | This sample shows how to record audio from the microphone and send the captured audio stream to the Alibaba Cloud Bailian model service for real-time speech translation. When the sample runs, what you say into the microphone is translated into English in real time and shown on screen. 4 | 5 | [comment]: # (list of scenarios of the sample) 6 | ### :point_right: Applicable scenarios 7 | 8 | | Scenario | Typical usage | Notes | 9 | |----------| ----- | ----- | 10 | | **Getting started** | Microphone speech translation | *Record from the microphone and translate speech in real time* | 11 | | **Real-time bilingual subtitles** | Automatically generate subtitles in different languages for audio/video | *Recognize and translate a video stream in real time to produce bilingual subtitles* | 12 | | **Meeting speech analytics** | Real-time meeting transcription | *Recognize and translate meeting speech in real time* | 13 | 14 | [comment]: # (supported programming languages of the sample) 15 | ### :point_right: Programming languages 16 | - [Python](./python) 17 | - [Java](./java) 18 | 19 | [comment]: # (model and interface of the sample) 20 | ### :point_right: Reference details 21 | 22 | | Recommended model | API details | 23 | | ----- | ----- | 24 | | **gummy-realtime-v1** | [Paraformer real-time speech recognition API details (TODO: update)](https://help.aliyun.com/zh/model-studio/developer-reference/paraformer-real-time-speech-recognition-api) | 25 | 26 | ### :point_right: Expected results 27 | 28 | When the sample runs, the console prompts you to start speaking; press 'Ctrl+C' in the console to stop recognition. Recognized text is printed to the console. 29 | ```text 30 | Press 'Ctrl+C' to stop recording and recognition 31 | RecognitionCallback text: 一 32 | translate to en: The. 33 | translate to en: The. 34 | translate to en: The. 35 | translate to en: The. 36 | translate to en: This is 37 | translate to en: This is a sentence 38 | translate to en: This is a test audio. 39 | translate to en: This is a test audio. 40 | ``` 41 | 42 | [comment]: # (technical support of the sample) 43 | ### :point_right: Technical support 44 | 45 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_realtime_stream/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Real-time speech translation from the microphone 3 | ## Java 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: Prerequisites 7 | 8 | 1. #### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 2.
#### Java runtime environment 13 | 14 | Before running this sample, you need to install a Java runtime environment and the Maven build tool. 15 | 16 | 17 | [comment]: # (how to run the sample and expected results) 18 | ### :point_right: Run the sample 19 | 20 | You can run this sample with run.sh (Linux, macOS) or run.bat (Windows). 21 | 22 | After the real-time microphone translation sample starts, it begins recording and calls streamCall() to return recognition results as a stream. 23 | 24 | [comment]: # (technical support of the sample) 25 | ### :point_right: Technical support 26 | 27 | 28 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_realtime_stream/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-shade-plugin 50 | 3.2.4 51 | 52 | 53 | package 54 | 55 | shade 56 | 57 | 58 | 59 | 60 | org.alibaba.speech.examples.speech_recognition.TranslateSpeechFromMicrophone 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_realtime_stream/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_realtime_stream/java/run.sh: -------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_realtime_stream/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_realtime_stream/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Real-time speech translation from the microphone 3 | ## Python 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: Prerequisites 7 | 8 | 1. #### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 1.
#### Install the Python dependencies 13 | 14 | The Alibaba Cloud Bailian SDK requires Python 3.8 or later. You can install this sample's dependencies with: 15 | ```commandline 16 | pip3 install -r requirements.txt 17 | ``` 18 | 19 | [comment]: # (how to run the sample and expected results) 20 | ### :point_right: Run the sample 21 | You can run this sample with: 22 | 23 | ```commandline 24 | python3 run.py 25 | ``` 26 | 27 | While the sample runs, the speech you record through the microphone is recognized and translated into English in real time, and the text is printed on screen. Press "Ctrl+C" to stop the sample. 28 | 29 | [comment]: # (technical support of the sample) 30 | ### :point_right: Technical support 31 | 32 | -------------------------------------------------------------------------------- /samples/speech-recognition/translate_speech_from_microphone_for_realtime_stream/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | pyaudio 3 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_llm_by_streaming_mode/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Synthesize and play LLM output in real time (streaming mode) 3 | This sample shows how to synthesize the text stream produced by a large language model (LLM) into an audio stream and play it through the speaker. 4 | 5 | [comment]: # (list of scenarios of the sample) 6 | ### :point_right: Applicable scenarios 7 | 8 | | Scenario | Typical usage | Notes | 9 | | ----- | ----- | ----- | 10 | | **Voice broadcasting** | Information broadcasting | *Broadcast LLM-generated news, summaries, and similar content in real time* | 11 | | **Call center** | Agent replies to speech | *Generate customer-service replies with an LLM and broadcast them in real time* | 12 | | **Digital human** | News broadcasting, digital-human livestreams, online education, voice chat | *Use an LLM to drive digital humans for news broadcasting, virtual livestreaming, online education, language learning, voice chat, and more* | 13 | 14 | [comment]: # (supported programming languages of the sample) 15 | ### :point_right: Programming languages 16 | - [Python](./python) 17 | - [Java](./java) 18 | 19 | [comment]: # (model and interface of the sample) 20 | ### :point_right: Reference details 21 | | Recommended model | API details | 22 | | --- | --- | 23 | | **cosyvoice-v1** | [CosyVoice large-model speech synthesis API details](https://help.aliyun.com/zh/model-studio/developer-reference/api-details-25)
[Voice list](https://help.aliyun.com/zh/model-studio/developer-reference/model-list-1)| 24 | 25 | ### :point_right: Expected results 26 | 27 | When the sample runs, it calls the Alibaba Cloud Bailian Qwen LLM (qwen-plus) to answer the question "番茄炒鸡蛋怎么做?" ("How do you make scrambled eggs with tomatoes?"), streams the model's answer text to the synthesizer using the loongstella voice, and streams the resulting audio back for playback through the speaker. 28 | 29 | [comment]: # (technical support of the sample) 30 | ### :point_right: Technical support 31 | 32 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_llm_by_streaming_mode/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Synthesize and play LLM output in real time (streaming mode) 3 | ## Java 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: Prerequisites 7 | 8 | 1. #### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 2. #### Java runtime environment 13 | 14 | Before running this sample, you need to install a Java runtime environment and the Maven build tool. 15 | 16 | 17 | [comment]: # (how to run the sample and expected results) 18 | ### :point_right: Run the sample 19 | 20 | You can run this sample with run.sh (Linux, macOS) or run.bat (Windows). 21 | 22 | Run the sample. It calls the Alibaba Cloud Bailian Qwen LLM (qwen-turbo) to answer the question "番茄炒鸡蛋怎么做?", streams the model's answer text for synthesis using the longmiao voice, and streams the audio back for playback through the speaker. 23 | 24 | You can change the question by modifying `query_to_llm`. 25 | 26 | [comment]: # (technical support of the sample) 27 | ### :point_right: Technical support 28 | 29 | 30 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_llm_by_streaming_mode/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | com.googlecode.soundlibs 47 | mp3spi 48 | 1.9.5.4 49 | 50 | 51 | 52 | 53 | 54 | org.apache.maven.plugins 55 | maven-shade-plugin 56 | 3.2.4 57 | 58 | 59 | package 60 | 61 | shade 62 | 63 | 64 | 65 | 66 | org.alibaba.speech.examples.speech_synthesizer.SynthesizeSpeechFromLlmByStreamingMode 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_llm_by_streaming_mode/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_llm_by_streaming_mode/java/run.sh: -------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_llm_by_streaming_mode/java/src/main/java/org/alibaba/speech/utils/RealtimeMp3Player.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) Alibaba Group. All Rights Reserved.
3 | * MIT License (https://opensource.org/licenses/MIT) 4 | */ 5 | package org.alibaba.speech.utils; 6 | 7 | import java.io.IOException; 8 | import java.io.PipedInputStream; 9 | import java.io.PipedOutputStream; 10 | import java.nio.ByteBuffer; 11 | import java.util.concurrent.CountDownLatch; 12 | import javazoom.jl.decoder.JavaLayerException; 13 | import javazoom.jl.player.advanced.AdvancedPlayer; 14 | import javazoom.jl.player.advanced.PlaybackEvent; 15 | import javazoom.jl.player.advanced.PlaybackListener; 16 | 17 | // JLayer library is utilized in this demo for audio decoding and playback, but you can employ other 18 | // methods suited to your needs. 19 | public class RealtimeMp3Player { 20 | 21 | // audio player 22 | private static AdvancedPlayer player; 23 | // init pipe stream, input/output 24 | private static PipedOutputStream pipedOutputStream; // use to write audio data to pipe stream 25 | private static PipedInputStream pipedInputStream; // use to read audio data from pipe stream 26 | CountDownLatch latch = new CountDownLatch(1); 27 | 28 | public void start() { 29 | try { 30 | System.out.println("build pipe stream for audio to play"); 31 | pipedOutputStream = new PipedOutputStream(); 32 | pipedInputStream = new PipedInputStream(pipedOutputStream, 1024 * 256); 33 | } catch (IOException e) { 34 | e.printStackTrace(); 35 | } 36 | 37 | new Thread( 38 | () -> { 39 | try { 40 | player = new AdvancedPlayer(pipedInputStream); 41 | 42 | // Create a listener to respond to playback events 43 | player.setPlayBackListener( 44 | new PlaybackListener() { 45 | @Override 46 | public void playbackFinished(PlaybackEvent event) { 47 | System.out.println("Playback finished."); 48 | latch.countDown(); 49 | System.exit(0); 50 | } 51 | }); 52 | 53 | // System.out.println("player start"); 54 | player.play(); 55 | } catch (JavaLayerException e) { 56 | e.printStackTrace(); 57 | } 58 | }) 59 | .start(); 60 | } 61 | 62 | // write audio data to pipe stream 63 | public void write(ByteBuffer audioData) { 64 | try { 65 | pipedOutputStream.write(audioData.array()); 66 | pipedOutputStream.flush(); 67 | // System.out.printf("write audio data to pipe stream %d \n", audioData.array().length); 68 | } catch (IOException e) { 69 | throw new RuntimeException(e); 70 | } 71 | } 72 | 73 | // stop feeding audio data to the pipe stream 74 | public void stop() { 75 | // System.out.println("Stop AudioPlayer data feed"); 76 | try { 77 | pipedOutputStream.close(); 78 | } catch (IOException e) { 79 | throw new RuntimeException(e); 80 | } 81 | try { 82 | latch.await(); 83 | } catch (InterruptedException e) { 84 | throw new RuntimeException(e); 85 | } 86 | System.out.println("AudioPlayerStopped"); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_llm_by_streaming_mode/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_llm_by_streaming_mode/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Synthesize and play LLM output in real time (streaming mode) 3 | ## Python 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: Prerequisites 7 | 8 | 1.
#### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 1. #### Install ffmpeg 13 | 14 | The sample uses ffmpeg to decode audio and video. We recommend downloading it from the official site and adding the ffmpeg install path to your environment variables: [ffmpeg official downloads](https://www.ffmpeg.org/download.html). You can also refer to the doc [how to install ffmpeg](../../../docs/QA/ffmpeg.md). 15 | 16 | 1. #### Install the Python dependencies 17 | 18 | The Alibaba Cloud Bailian SDK requires Python 3.8 or later. You can install this sample's dependencies with: 19 | ```commandline 20 | pip3 install -r requirements.txt 21 | ``` 22 | See the [ffmpeg-python documentation](https://github.com/kkroening/ffmpeg-python) for installing ffmpeg 23 | 24 | [comment]: # (how to run the sample and expected results) 25 | ### :point_right: Run the sample 26 | You can run this sample with: 27 | 28 | ```commandline 29 | python3 run.py 30 | ``` 31 | 32 | When the sample runs, it calls the Alibaba Cloud Bailian Qwen LLM (qwen-turbo) to answer the question "番茄炒鸡蛋怎么做?", streams the model's answer text for synthesis using the longmiao voice, and streams the audio back for playback through the speaker. 33 | 34 | A sample run looks like: 35 | ``` 36 | >>>提问:番茄炒鸡蛋怎么做? 37 | >>>回答:做番茄炒鸡蛋挺简单的,你先准备好材料:几个新鲜的番茄和几个鸡蛋。先把鸡蛋打在碗里,加一点点盐,然后用筷子搅匀。 38 | 接着热锅凉油,油温上来后就把鸡蛋液倒进去,等它稍微凝固一点就可以用铲子翻炒几下,鸡蛋变金黄色就可以盛出来备用。 39 | 40 | 然后锅里再加点油,把切好的番茄块放进去翻炒,番茄会出一些汁水,你可以根据口味加点糖中和酸味。等番茄差不多了,就把刚才炒好 41 | 的鸡蛋倒回去一起翻炒均匀,最后尝尝味道,如果需要可以再调一下味,撒点葱花就可以出锅啦! 42 | 43 | 试试看吧,记得注意火候哦! synthesize and play over with requestId: 09690564096a47a5b7fae07dbb615117 44 | ``` 45 | 46 | You can change the question by modifying `query_to_llm`. 47 | 48 | [comment]: # (technical support of the sample) 49 | ### :point_right: Technical support 50 | 51 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_llm_by_streaming_mode/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | pyaudio 3 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Synthesize speech and save to a file (simple mode) 3 | This sample shows how to synthesize speech from given text and save it to a file. **Simple mode** saves the synthesized speech to a file, which suits scenarios that do not need real-time playback. For real-time playback, see [Synthesize and play speech (streaming mode)](../synthesize_speech_from_text_by_streaming_mode/). 4 | 5 | [comment]: # (list of scenarios of the sample) 6 | ### :point_right: Applicable scenarios 7 | 8 | | Scenario | Typical usage | Notes | 9 | | ----- | ----- | ----- | 10 | | **Getting started** | One-shot synthesis | *Convert a piece of text into speech* | 11 | | **Video dubbing** | Video dubbing, news dubbing | *Use speech synthesis to voice subtitles and other text in videos* | 12 | | **Audiobooks** | Novel narration, picture-book narration | *Use different voices for different characters to narrate novels, picture books, and other audiobooks* | 13 | 14 | [comment]: # (supported programming languages of the sample) 15 | ### :point_right: Programming languages 16 | - [Python](./python) 17 | - [Java](./java) 18 | 19 | [comment]: # (model and interface of the sample) 20 | ### :point_right: Reference details 21 | | Recommended model | API details | 22 | | --- | --- | 23 | | **cosyvoice-v1** | [CosyVoice large-model speech synthesis API details](https://help.aliyun.com/zh/model-studio/developer-reference/api-details-25)
[Voice list](https://help.aliyun.com/zh/model-studio/developer-reference/model-list-1)| 24 | 25 | ### :point_right: Expected results 26 | 27 | When the sample runs, it uses the loongstella voice to synthesize the sample text "想不到时间过得这么快!昨天和你视频聊天,看到你那自豪又满意的笑容,我的心里呀,就如同喝了一瓶蜜一样甜呢!真心为你开心呢!" and saves it to the `result.mp3` file. 28 | 29 | [comment]: # (technical support of the sample) 30 | ### :point_right: Technical support 31 | 32 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Synthesize speech and save to a file (simple mode) 3 | ## Java 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: Prerequisites 7 | 8 | 1. #### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 2. #### Java runtime environment 13 | 14 | Before running this sample, you need to install a Java runtime environment and the Maven build tool. 15 | 16 | 17 | [comment]: # (how to run the sample and expected results) 18 | ### :point_right: Run the sample 19 | 20 | You can run this sample with run.sh (Linux, macOS) or run.bat (Windows). 21 | 22 | Run the sample. It uses the loongstella voice to synthesize the sample text "想不到时间过得这么快!昨天和你视频聊天,看到你那自豪又满意的笑容,我的心里呀,就如同喝了一瓶蜜一样甜呢!真心为你开心呢!" and saves it to the `result.mp3` file. 23 | You can synthesize your own text by modifying `textToSynthesize`. 24 | 25 | [comment]: # (technical support of the sample) 26 | ### :point_right: Technical support 27 | 28 | 29 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-shade-plugin 50 | 3.2.4 51 | 52 | 53 | package 54 | 55 | shade 56 | 57 | 58 | 59 | 60 | org.alibaba.speech.examples.speech_synthesizer.SynthesizeSpeechFromTextAndSave 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text/java/run.sh: -------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text/java/src/main/java/org/alibaba/speech/examples/speech_synthesizer/SynthesizeSpeechFromTextAndSave.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C)
3 | * MIT License (https://opensource.org/licenses/MIT) 4 | */ 5 | 6 | package org.alibaba.speech.examples.speech_synthesizer; 7 | 8 | import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam; 9 | import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer; 10 | import com.alibaba.dashscope.exception.NoApiKeyException; 11 | import com.alibaba.dashscope.utils.ApiKey; 12 | import java.io.FileOutputStream; 13 | import java.io.IOException; 14 | import java.nio.ByteBuffer; 15 | 16 | // This demo showcases how to use Alibaba Cloud's DashScope model for real-time synthesis and 17 | // playback of MP3 audio streams. 18 | public class SynthesizeSpeechFromTextAndSave { 19 | public static void main(String[] args) throws NoApiKeyException { 20 | // set speech synthesis params 21 | SpeechSynthesisParam param = 22 | SpeechSynthesisParam.builder() 23 | .model("cosyvoice-v1") 24 | .voice("loongstella") 25 | .apiKey(getDashScopeApiKey()) // Set your API key 26 | .build(); 27 | System.out.println("init params done"); 28 | 29 | // Create a speech synthesizer 30 | SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, null); 31 | 32 | String textToSynthesize = "想不到时间过得这么快!昨天和你视频聊天,看到你那自豪又满意的笑容,我的心里呀,就如同喝了一瓶蜜一样甜呢!真心为你开心呢!"; 33 | 34 | // Start the synthesizer with Text 35 | System.out.printf("start synthesizer : %s \n", textToSynthesize); 36 | ByteBuffer audio = synthesizer.call(textToSynthesize); 37 | try (FileOutputStream fos = new FileOutputStream("result.mp3")) { 38 | fos.write(audio.array()); 39 | System.out.println("synthesis done!"); 40 | } catch (IOException e) { 41 | throw new RuntimeException(e); 42 | } 43 | System.out.println( 44 | "[Metric] requestId: " 45 | + synthesizer.getLastRequestId() 46 | + ", first package delay ms: " 47 | + synthesizer.getFirstPackageDelay()); 48 | System.exit(0); 49 | } 50 | 51 | /** 52 | * Set your DashScope API key. More information: ... In fact, if you have set 54 | * DASHSCOPE_API_KEY in your environment variable, you can ignore this, and the SDK will 55 | * automatically get the api_key from the environment variable 56 | */ 57 | private static String getDashScopeApiKey() throws NoApiKeyException { 58 | String dashScopeApiKey = null; 59 | try { 60 | ApiKey apiKey = new ApiKey(); 61 | dashScopeApiKey = apiKey.getApiKey(null); // Retrieve from environment variable. 62 | } catch (NoApiKeyException e) { 63 | System.out.println("No API key found in environment."); 64 | } 65 | if (dashScopeApiKey == null) { 66 | // If you cannot set api_key in your environment variable, 67 | // you can set it here by code 68 | dashScopeApiKey = "your-dashscope-api-key"; 69 | } 70 | return dashScopeApiKey; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## 语音合成并保存文件(简单模式) 3 | ## Python 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: 前提条件 7 | 8 | 1. 
#### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 1. #### Install the Python dependencies 13 | 14 | The Alibaba Cloud Bailian SDK requires Python 3.8 or later. You can install this sample's dependencies with: 15 | ```commandline 16 | pip3 install -r requirements.txt 17 | ``` 18 | See the [ffmpeg-python documentation](https://github.com/kkroening/ffmpeg-python) for installing ffmpeg 19 | 20 | [comment]: # (how to run the sample and expected results) 21 | ### :point_right: Run the sample 22 | You can run this sample with: 23 | 24 | ```commandline 25 | python3 run.py 26 | ``` 27 | 28 | When the sample runs, it uses the loongstella voice to synthesize the sample text "想不到时间过得这么快!昨天和你视频聊天,看到你那自豪又满意的笑容,我的心里呀,就如同喝了一瓶蜜一样甜呢!真心为你开心呢!" (as set in the code below) and saves it to the `result.mp3` file. 29 | You can synthesize your own text by modifying `text_to_synthesize`. 30 | 31 | [comment]: # (technical support of the sample) 32 | ### :point_right: Technical support 33 | 34 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text/python/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group. All Rights Reserved. 3 | # MIT License (https://opensource.org/licenses/MIT) 4 | 5 | import os 6 | import sys 7 | 8 | import dashscope 9 | from dashscope.audio.tts_v2 import SpeechSynthesizer 10 | 11 | text_to_synthesize = '想不到时间过得这么快!昨天和你视频聊天,看到你那自豪又满意的笑容,我的心里呀,就如同喝了一瓶蜜一样甜呢!真心为你开心呢!' 12 | file_to_save = 'result.mp3' 13 | 14 | 15 | def init_dashscope_api_key(): 16 | ''' 17 | Set your DashScope API-key. More information: 18 | https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md 19 | ''' 20 | if 'DASHSCOPE_API_KEY' in os.environ: 21 | dashscope.api_key = os.environ[ 22 | 'DASHSCOPE_API_KEY'] # load API-key from environment variable DASHSCOPE_API_KEY 23 | else: 24 | dashscope.api_key = '' # set API-key manually 25 | 26 | 27 | def synthesize_speech_from_text(text, file_path): 28 | ''' 29 | Synthesize speech with given text, sync call and save the audio data into file_path 30 | For more information, please refer to https://help.aliyun.com/document_detail/2712523.html 31 | ''' 32 | # Initialize the speech synthesizer 33 | # you can customize the synthesis parameters, like voice, format, sample_rate or other parameters 34 | speech_synthesizer = SpeechSynthesizer(model='cosyvoice-v1', 35 | voice='loongstella', 36 | callback=None) 37 | audio = speech_synthesizer.call(text) 38 | # Save the synthesized audio to a file 39 | with open(file_path, 'wb') as f: 40 | f.write(audio) 41 | print(f'Synthesized text {text} to file : {file_path}') 42 | print('[Metric] requestId: {}, first package delay ms: {}'.format( 43 | speech_synthesizer.get_last_request_id(), 44 | speech_synthesizer.get_first_package_delay())) 45 | 46 | 47 | # main function 48 | if __name__ == '__main__': 49 | init_dashscope_api_key() 50 | synthesize_speech_from_text(text=text_to_synthesize, 51 | file_path=file_to_save) 52 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_by_streaming_mode/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Synthesize and play speech (streaming mode) 3 | This sample shows how to synthesize speech from given text, receive the audio back as a stream, and play it in real time. It also shows how to save the audio to a file inside the streaming callback; see the sketch below. 4 | 5 |
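The core of the streaming pattern is a result callback: each MP3 chunk arrives in `on_data`, where it can be forwarded to a player and appended to a file at the same time. Here is a minimal sketch of that callback shape, based on the dashscope Python SDK used by the runnable sample in the python directory below; `player` stands for any streaming MP3 sink, such as this repo's RealtimeMp3Player utility:

```python
from dashscope.audio.tts_v2 import ResultCallback, SpeechSynthesizer

class SaveAndPlayCallback(ResultCallback):
    def on_open(self):
        self.file = open('result.mp3', 'wb')  # connection ready: open the output file

    def on_data(self, data: bytes) -> None:
        player.write(data)     # stream the chunk to the speaker as soon as it arrives...
        self.file.write(data)  # ...and append the same bytes to result.mp3

    def on_complete(self):
        self.file.close()      # all audio has been received

synthesizer = SpeechSynthesizer(model='cosyvoice-v1', voice='loongstella',
                                callback=SaveAndPlayCallback())
synthesizer.call('想不到时间过得这么快!')
```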
[comment]: # (list of scenarios of the sample) 6 | ### :point_right: Applicable scenarios 7 | 8 | | Scenario | Typical usage | Notes | 9 | | ----- | ----- | ----- | 10 | | **Call center** | Agent replies to speech | *Use text-to-speech to voice a customer-service bot's replies in real time* | 11 | | **Digital human** | News broadcasting | *Broadcast text content such as news via speech synthesis* | 12 | 13 | [comment]: # (supported programming languages of the sample) 14 | ### :point_right: Programming languages 15 | - [Python](./python) 16 | - [Java](./java) 17 | 18 | [comment]: # (model and interface of the sample) 19 | ### :point_right: Reference details 20 | | Recommended model | API details | 21 | | --- | --- | 22 | | **cosyvoice-v1** | [CosyVoice large-model speech synthesis API details](https://help.aliyun.com/zh/model-studio/developer-reference/api-details-25)
[Voice list](https://help.aliyun.com/zh/model-studio/developer-reference/model-list-1)| 23 | 24 | ### :point_right: Expected results 25 | 26 | When the sample runs, it uses the loongstella voice to synthesize the sample text "想不到时间过得这么快!昨天和你视频聊天,看到你那自豪又满意的笑容,我的心里呀,就如同喝了一瓶蜜一样甜呢!真心为你开心呢!"; the synthesized audio is streamed back, played through the speaker, and saved to the `result.mp3` file. 27 | 28 | [comment]: # (technical support of the sample) 29 | ### :point_right: Technical support 30 | 31 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_by_streaming_mode/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Synthesize and play speech (streaming mode) 3 | ## Java 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: Prerequisites 7 | 8 | 1. #### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 2. #### Java runtime environment 13 | 14 | Before running this sample, you need to install a Java runtime environment and the Maven build tool. 15 | 16 | 17 | [comment]: # (how to run the sample and expected results) 18 | ### :point_right: Run the sample 19 | 20 | You can run this sample with run.sh (Linux, macOS) or run.bat (Windows). 21 | 22 | Run the sample. It uses the loongstella voice to synthesize the sample text "想不到时间过得这么快!昨天和你视频聊天,看到你那自豪又满意的笑容,我的心里呀,就如同喝了一瓶蜜一样甜呢!真心为你开心呢!"; the synthesized audio is streamed back, played through the speaker, and saved to the `result.mp3` file. 23 | 24 | You can synthesize your own text by modifying `textToSynthesize`. 25 | 26 | [comment]: # (technical support of the sample) 27 | ### :point_right: Technical support 28 | 29 | 30 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_by_streaming_mode/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | com.googlecode.soundlibs 47 | mp3spi 48 | 1.9.5.4 49 | 50 | 51 | 52 | 53 | 54 | org.apache.maven.plugins 55 | maven-shade-plugin 56 | 3.2.4 57 | 58 | 59 | package 60 | 61 | shade 62 | 63 | 64 | 65 | 66 | org.alibaba.speech.examples.speech_synthesizer.SynthesizeSpeechFromTextByStreamingMode 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_by_streaming_mode/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_by_streaming_mode/java/run.sh: -------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_by_streaming_mode/java/src/main/java/org/alibaba/speech/utils/RealtimeMp3Player.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C)
Alibaba Group. All Rights Reserved. 3 | * MIT License (https://opensource.org/licenses/MIT) 4 | */ 5 | package org.alibaba.speech.utils; 6 | 7 | import java.io.IOException; 8 | import java.io.PipedInputStream; 9 | import java.io.PipedOutputStream; 10 | import java.nio.ByteBuffer; 11 | import java.util.concurrent.CountDownLatch; 12 | import javazoom.jl.decoder.JavaLayerException; 13 | import javazoom.jl.player.advanced.AdvancedPlayer; 14 | import javazoom.jl.player.advanced.PlaybackEvent; 15 | import javazoom.jl.player.advanced.PlaybackListener; 16 | 17 | // JLayer library is utilized in this demo for audio decoding and playback, but you can employ other 18 | // methods suited to your needs. 19 | public class RealtimeMp3Player { 20 | 21 | // audio player 22 | private static AdvancedPlayer player; 23 | // init pipe stream, input/output 24 | private static PipedOutputStream pipedOutputStream; // use to write audio data to pipe stream 25 | private static PipedInputStream pipedInputStream; // use to read audio data from pipe stream 26 | CountDownLatch latch = new CountDownLatch(1); 27 | 28 | public void start() { 29 | try { 30 | System.out.println("build pipe stream for audio to play"); 31 | pipedOutputStream = new PipedOutputStream(); 32 | pipedInputStream = new PipedInputStream(pipedOutputStream, 1024 * 256); 33 | } catch (IOException e) { 34 | e.printStackTrace(); 35 | } 36 | System.out.println("build pipe stream for audio to play"); 37 | 38 | new Thread( 39 | () -> { 40 | try { 41 | player = new AdvancedPlayer(pipedInputStream); 42 | 43 | // Create a listener to respond to playback events 44 | player.setPlayBackListener( 45 | new PlaybackListener() { 46 | @Override 47 | public void playbackFinished(PlaybackEvent event) { 48 | System.out.println("Playback finished."); 49 | latch.countDown(); 50 | System.exit(0); 51 | } 52 | }); 53 | 54 | System.out.println("player start"); 55 | player.play(); 56 | } catch (JavaLayerException e) { 57 | e.printStackTrace(); 58 | } 59 | }) 60 | .start(); 61 | } 62 | 63 | // write audio data to pipe stream 64 | public void write(ByteBuffer audioData) { 65 | try { 66 | pipedOutputStream.write(audioData.array()); 67 | pipedOutputStream.flush(); 68 | // System.out.printf("write audio data to pipe stream %d \n", audioData.array().length); 69 | } catch (IOException e) { 70 | throw new RuntimeException(e); 71 | } 72 | } 73 | 74 | // stop feeding audio data to the pipe stream 75 | public void stop() { 76 | System.out.println("Stop AudioPlayer data feed"); 77 | try { 78 | pipedOutputStream.close(); 79 | } catch (IOException e) { 80 | throw new RuntimeException(e); 81 | } 82 | try { 83 | latch.await(); 84 | } catch (InterruptedException e) { 85 | throw new RuntimeException(e); 86 | } 87 | System.out.println("AudioPlayerStopped"); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_by_streaming_mode/java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_by_streaming_mode/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Synthesize and play speech (streaming mode) 3 | ## Python 4 | 5 |
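The run.py in this directory imports the playback helper `RealtimeMp3Player` from `samples/utils/python`. To see what such a helper boils down to, here is a hypothetical minimal stand-in (not the repo's actual implementation) that pipes the streamed MP3 chunks through an ffmpeg subprocess for decoding and plays the decoded PCM with pyaudio; the 22050 Hz mono format is an assumption and must match your synthesis settings:

```python
import subprocess
import threading

import pyaudio


class MiniMp3Player:
    """Hypothetical minimal stand-in for the repo's RealtimeMp3Player utility."""

    def start(self):
        # ffmpeg decodes the MP3 byte stream from stdin into raw 16-bit PCM on stdout
        self._ff = subprocess.Popen(
            ['ffmpeg', '-i', 'pipe:0', '-f', 's16le', '-ar', '22050', '-ac', '1', 'pipe:1'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
        self._pa = pyaudio.PyAudio()
        self._stream = self._pa.open(format=pyaudio.paInt16, channels=1,
                                     rate=22050, output=True)
        # pump decoded PCM to the sound card on a background thread
        self._pump = threading.Thread(target=self._play, daemon=True)
        self._pump.start()

    def _play(self):
        while True:
            pcm = self._ff.stdout.read(4096)
            if not pcm:
                break
            self._stream.write(pcm)

    def write(self, mp3_chunk: bytes):
        self._ff.stdin.write(mp3_chunk)  # feed streamed MP3 data as it arrives
        self._ff.stdin.flush()

    def stop(self):
        self._ff.stdin.close()   # signal end of input to ffmpeg
        self._pump.join()        # wait until playback drains
        self._stream.stop_stream()
        self._stream.close()
        self._pa.terminate()
```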
[comment]: # (prerequisites) 6 | ### :point_right: Prerequisites 7 | 8 | 1. #### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 1. #### Install ffmpeg 13 | 14 | The sample uses ffmpeg to decode audio and video. We recommend downloading it from the official site and adding the ffmpeg install path to your environment variables: [ffmpeg official downloads](https://www.ffmpeg.org/download.html). You can also refer to the doc [how to install ffmpeg](../../../docs/QA/ffmpeg.md). 15 | 16 | 1. #### Install the Python dependencies 17 | 18 | The Alibaba Cloud Bailian SDK requires Python 3.8 or later. You can install this sample's dependencies with: 19 | ```commandline 20 | pip3 install -r requirements.txt 21 | ``` 22 | See the [ffmpeg-python documentation](https://github.com/kkroening/ffmpeg-python) for installing ffmpeg 23 | 24 | [comment]: # (how to run the sample and expected results) 25 | ### :point_right: Run the sample 26 | You can run this sample with: 27 | 28 | ```commandline 29 | python3 run.py 30 | ``` 31 | 32 | When the sample runs, it uses the loongstella voice to synthesize the sample text "想不到时间过得这么快!昨天和你视频聊天,看到你那自豪又满意的笑容,我的心里呀,就如同喝了一瓶蜜一样甜呢!真心为你开心呢!"; the synthesized audio is streamed back, played through the speaker, and saved to the `result.mp3` file. 33 | 34 | You can synthesize your own text by modifying `text_to_synthesize`. 35 | 36 | [comment]: # (technical support of the sample) 37 | ### :point_right: Technical support 38 | 39 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_by_streaming_mode/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | pyaudio 3 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_by_streaming_mode/python/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group. All Rights Reserved. 3 | # MIT License (https://opensource.org/licenses/MIT) 4 | 5 | import os 6 | import sys 7 | import threading 8 | 9 | import dashscope 10 | from dashscope.audio.tts_v2 import * 11 | 12 | sys.path.append( 13 | os.path.join(os.path.dirname(os.path.abspath(__file__)), 14 | '../../../utils/python')) 15 | 16 | from RealtimeMp3Player import RealtimeMp3Player 17 | 18 | text_to_synthesize = '想不到时间过得这么快!昨天和你视频聊天,看到你那自豪又满意的笑容,我的心里呀,就如同喝了一瓶蜜一样甜呢!真心为你开心呢!' 19 | 20 | 21 | def init_dashscope_api_key(): 22 | ''' 23 | Set your DashScope API-key. More information: 24 | https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md 25 | ''' 26 | if 'DASHSCOPE_API_KEY' in os.environ: 27 | dashscope.api_key = os.environ[ 28 | 'DASHSCOPE_API_KEY'] # load API-key from environment variable DASHSCOPE_API_KEY 29 | else: 30 | dashscope.api_key = '' # set API-key manually 31 | 32 | 33 | def synthesis_text_to_speech_and_play_by_streaming_mode(text): 34 | ''' 35 | Synthesize speech with given text by streaming mode, async call and play the synthesized audio in real-time.
36 | for more information, please refer to https://help.aliyun.com/document_detail/2712523.html 37 | ''' 38 | player = RealtimeMp3Player() 39 | # start player 40 | player.start() 41 | 42 | complete_event = threading.Event() 43 | 44 | # Define a callback to handle the result 45 | 46 | class Callback(ResultCallback): 47 | def on_open(self): 48 | self.file = open('result.mp3', 'wb') 49 | print('websocket is open.') 50 | 51 | def on_complete(self): 52 | print('speech synthesis task complete successfully.') 53 | complete_event.set() 54 | 55 | def on_error(self, message: str): 56 | print(f'speech synthesis task failed, {message}') 57 | 58 | def on_close(self): 59 | print('websocket is closed.') 60 | 61 | def on_event(self, message): 62 | # print(f'recv speech synthesis message {message}') 63 | pass 64 | 65 | def on_data(self, data: bytes) -> None: 66 | # send to player 67 | player.write(data) 68 | # save audio to file 69 | self.file.write(data) 70 | 71 | # Call the speech synthesizer callback 72 | synthesizer_callback = Callback() 73 | 74 | # Initialize the speech synthesizer 75 | # you can customize the synthesis parameters, like voice, format, sample_rate or other parameters 76 | speech_synthesizer = SpeechSynthesizer(model='cosyvoice-v1', 77 | voice='loongstella', 78 | callback=synthesizer_callback) 79 | 80 | speech_synthesizer.call(text) 81 | print('Synthesized text: {}'.format(text)) 82 | complete_event.wait() 83 | player.stop() 84 | print('[Metric] requestId: {}, first package delay ms: {}'.format( 85 | speech_synthesizer.get_last_request_id(), 86 | speech_synthesizer.get_first_package_delay())) 87 | 88 | 89 | # main function 90 | if __name__ == '__main__': 91 | init_dashscope_api_key() 92 | synthesis_text_to_speech_and_play_by_streaming_mode( 93 | text=text_to_synthesize) 94 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_concurrently/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Concurrent speech synthesis calls 3 | This sample shows how to synthesize speech for multiple texts concurrently and save each result as a separate file. 4 | 5 | [comment]: # (list of scenarios of the sample) 6 | ### :point_right: Applicable scenarios 7 | 8 | | Scenario | Typical usage | Notes | 9 | | ----- | ----- | ----- | 10 | | **Getting started** | Concurrent speech synthesis calls | *Synthesize text into speech concurrently* | 11 | 12 | [comment]: # (supported programming languages of the sample) 13 | ### :point_right: Programming languages 14 | - [Python](./python) 15 | - [Java](./java) 16 | 17 | [comment]: # (model and interface of the sample) 18 | ### :point_right: Reference details 19 | | Recommended model | API details | 20 | | --- | --- | 21 | | **cosyvoice-v1** | [CosyVoice large-model speech synthesis API details](https://help.aliyun.com/zh/model-studio/developer-reference/api-details-25)
[Voice list](https://help.aliyun.com/zh/model-studio/developer-reference/model-list-1)| 22 | 23 | ### :point_right: Expected results 24 | 25 | When the sample runs, it uses three different voices to concurrently synthesize "我是<voice name>,欢迎体验阿里云百炼大模型语音合成服务!" and saves the results to `results/result_v<voice name>_p<thread id>.mp3` files. 26 | 27 | [comment]: # (technical support of the sample) 28 | ### :point_right: Technical support 29 | 30 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_concurrently/java/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Concurrent speech synthesis calls 3 | ## Java 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: Prerequisites 7 | 8 | 1. #### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 2. #### Java runtime environment 13 | 14 | Before running this sample, you need to install a Java runtime environment and the Maven build tool. 15 | 16 | 17 | [comment]: # (how to run the sample and expected results) 18 | ### :point_right: Run the sample 19 | 20 | You can run this sample with run.sh (Linux, macOS) or run.bat (Windows). 21 | 22 | Run the sample. It uses the longxiaochun voice to concurrently synthesize "欢迎体验阿里云百炼大模型语音合成服务!" and saves the results as `.mp3` files. 23 | 24 | The Java concurrency sample uses three resource pools: a connection pool, an object pool, and a thread pool. Reusing objects and connections effectively reduces the time spent establishing connections. 25 | 26 | You can add or remove tasks in `task_list` to synthesize any number of texts, change the maximum number of worker threads with the `peakThreadNum` parameter (keeping it at or below the machine's CPU core count is recommended), and set how many times the tasks run with the `runTimes` parameter. 27 | 28 | :information_source: **Note**: a personal account's appkey currently supports only 3 concurrent requests. Contact us if you need higher concurrency. 29 | 30 | [comment]: # (technical support of the sample) 31 | ### :point_right: Technical support 32 | 33 | 34 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_concurrently/java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.example 8 | alibabacloud-bailian-speech-demo-java 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 8 13 | 8 14 | UTF-8 15 | 16 | 17 | 18 | 19 | com.alibaba 20 | dashscope-sdk-java 21 | 2.18.0 22 | 23 | 24 | 25 | org.projectlombok 26 | lombok 27 | 1.18.24 28 | provided 29 | 30 | 31 | 32 | 33 | org.slf4j 34 | slf4j-api 35 | 1.7.32 36 | 37 | 38 | 39 | 40 | ch.qos.logback 41 | logback-classic 42 | 1.2.6 43 | 44 | 45 | 46 | org.apache.commons 47 | commons-pool2 48 | 2.11.1 49 | 50 | 51 | 52 | 53 | 54 | org.apache.maven.plugins 55 | maven-shade-plugin 56 | 3.2.4 57 | 58 | 59 | package 60 | 61 | shade 62 | 63 | 64 | 65 | 66 | org.alibaba.speech.examples.speech_synthesizer.SynthesizeTextToSpeechWithCallbackConcurrently 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_concurrently/java/run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call mvn clean 4 | call mvn package 5 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 6 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_concurrently/java/run.sh: -------------------------------------------------------------------------------- 1 | mvn clean package 2 | java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar 3 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_concurrently/java/src/main/resources/logback.xml:
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_concurrently/python/README.md: -------------------------------------------------------------------------------- 1 | [comment]: # (title and brief introduction of the sample) 2 | ## Concurrent speech synthesis calls 3 | ## Python 4 | 5 | [comment]: # (prerequisites) 6 | ### :point_right: Prerequisites 7 | 8 | 1. #### Configure your Alibaba Cloud Bailian API-KEY 9 | 10 | Before running this sample, you need to create an Alibaba Cloud account, obtain an Alibaba Cloud Bailian API_KEY, and complete the necessary environment configuration. For detailed API-KEY setup steps, see: [PREREQUISITES.md](../../../../PREREQUISITES.md) 11 | 12 | 1. #### Install the Python dependencies 13 | 14 | The Alibaba Cloud Bailian SDK requires Python 3.8 or later. You can install this sample's dependencies with: 15 | ```commandline 16 | pip3 install -r requirements.txt 17 | ``` 18 | 19 | [comment]: # (how to run the sample and expected results) 20 | ### :point_right: Run the sample 21 | You can run this sample with: 22 | 23 | ```commandline 24 | python3 run.py 25 | ``` 26 | 27 | When the sample runs, it uses three different voices to concurrently synthesize "我是XXX,欢迎体验阿里云百炼大模型语音合成服务!" and saves the results to `results/result_v<voice name>_p<thread id>.mp3` files. 28 | 29 | You can add or remove tasks in `task_list` to synthesize any number of texts, and change the maximum number of processes via the `processes` parameter of `multiprocessing.Pool` (keeping it at or below the machine's CPU core count is recommended). 30 | 31 | Because Python has a global interpreter lock (GIL), the Python sample uses multiple processes to achieve concurrency. 32 | 33 | :information_source: **Note**: a personal account's appkey currently supports only 3 concurrent requests. Contact us if you need higher concurrency. 34 | 35 | 36 | 37 | [comment]: # (technical support of the sample) 38 | ### :point_right: Technical support 39 | 40 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_concurrently/python/requirements.txt: -------------------------------------------------------------------------------- 1 | dashscope 2 | -------------------------------------------------------------------------------- /samples/speech-synthesizer/synthesize_speech_from_text_concurrently/python/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (C) Alibaba Group. All Rights Reserved. 3 | # MIT License (https://opensource.org/licenses/MIT) 4 | 5 | import multiprocessing 6 | import os 7 | 8 | import dashscope 9 | from dashscope.audio.tts_v2 import SpeechSynthesizer 10 | 11 | 12 | def init_dashscope_api_key(): 13 | ''' 14 | Set your DashScope API-key.
def init_dashscope_api_key():
    '''
    Set your DashScope API-key. More information:
    https://github.com/aliyun/alibabacloud-bailian-speech-demo/blob/master/PREREQUISITES.md
    '''
    if 'DASHSCOPE_API_KEY' in os.environ:
        # load the API-key from the environment variable DASHSCOPE_API_KEY
        dashscope.api_key = os.environ['DASHSCOPE_API_KEY']
    else:
        dashscope.api_key = ''  # set your API-key here manually


def synthesis_one_text_to_speech(task):
    '''
    Synthesize speech from the given text with the given voice in a sync call,
    and save the audio to a local file.
    For more information, please refer to https://help.aliyun.com/document_detail/2712523.html
    '''
    init_dashscope_api_key()
    text_to_synthesize = task[0]
    voice = task[1]
    pid = os.getpid()
    file_to_save = f'result_{voice}_p{pid}.mp3'

    # Initialize the speech synthesizer.
    # You can customize synthesis parameters such as voice, format or sample_rate.
    speech_synthesizer = SpeechSynthesizer(model='cosyvoice-v1',
                                           voice=voice,
                                           callback=None)

    # Synthesize speech from the given text; the sync call returns the audio data.
    # For more information, please refer to https://help.aliyun.com/document_detail/2712523.html
    audio_data = speech_synthesizer.call(text_to_synthesize)
    print('[Process {}][Metric] requestId: {}, first package delay ms: {}'.
          format(pid, speech_synthesizer.get_last_request_id(),
                 speech_synthesizer.get_first_package_delay()))
    if audio_data is not None:
        # Save the synthesized audio to a file
        with open(file_to_save, 'wb') as f:
            f.write(audio_data)

        print('[Process {}] Synthesized text {} to file : {}'.format(
            pid, text_to_synthesize, file_to_save))
    else:
        print('[Process {}] Synthesis Fail'.format(pid))


def multi_process_pool():
    # Create a pool with a fixed number of worker processes
    process_pool = multiprocessing.Pool(processes=3)

    # Alternatively, size the pool by the number of CPU cores available:
    # num_cores = multiprocessing.cpu_count()
    # print(f'Number of CPU cores: {num_cores}')
    # process_pool = multiprocessing.Pool(processes=num_cores)

    # Please replace the texts below with your own texts to synthesize
    task_list = [
        ['我是龙小淳,欢迎体验阿里云百炼语音合成大模型服务!', 'longxiaochun'],
        ['我是龙小白,欢迎体验阿里云百炼语音合成大模型服务!', 'longxiaobai'],
        ['我是龙媛,欢迎体验阿里云百炼语音合成大模型服务!', 'longyuan'],
    ]

    # Use map to distribute the tasks across the pool and wait for all of them
    process_pool.map(synthesis_one_text_to_speech, task_list)
    process_pool.close()
    process_pool.join()


if __name__ == '__main__':
    multi_process_pool()
--------------------------------------------------------------------------------
/samples/speech-synthesizer/synthesize_speech_from_text_using_asyncio/python/README.md:
--------------------------------------------------------------------------------
[comment]: # (title and brief introduction of the sample)
## Speech Synthesis with Asyncio (Streaming Mode)
This sample shows how to synthesize speech from a given text, save the audio to a file from the streaming callback, and wait asynchronously in a coroutine for the synthesis to finish.

[comment]: # (list of scenarios of the sample)
### :point_right: Use Cases

This sample demonstrates how to use Python's coroutine library `asyncio` to wait asynchronously for speech synthesis to finish without blocking the current coroutine's event loop. It suits asynchronous I/O programs or systems that call the CosyVoice speech synthesis model.

[comment]: # (supported programming languages of the sample)
### :point_right: Programming Languages
- [Python](./python)

[comment]: # (model and interface of the sample)
### :point_right: Reference Details
| Recommended Model | API Details |
| --- | --- |
| **cosyvoice-v1** | [CosyVoice speech synthesis API details](https://help.aliyun.com/zh/model-studio/developer-reference/api-details-25)<br>[Voice list](https://help.aliyun.com/zh/model-studio/developer-reference/model-list-1)|

### :point_right: Expected Results

When the sample runs, it uses the loongstella voice to synthesize the sample text "想不到时间过得这么快!昨天和你视频聊天,看到你那自豪又满意的笑容,我的心里呀,就如同喝了一瓶蜜一样甜呢!真心为你开心呢!". The synthesized audio is delivered in streaming mode and saved to the file `result.mp3`.

### :point_right: Notes on the Asynchronous Call

In this sample, the `async_streaming_complete` function first sends the TTS finish signal, and a thread-safe `ThreadSafeAsyncioEvent` is then awaited asynchronously until the synthesis task ends.

[comment]: # (technical support of the sample)
### :point_right: Technical Support
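The waiting pattern named above is easy to get wrong, because a plain `asyncio.Event` may only be set from the event-loop thread. Below is a minimal, self-contained sketch of the idea, not the sample's actual code: the timer stands in for the SDK's callback thread, and all names other than `ThreadSafeAsyncioEvent` are illustrative.

```python
import asyncio
import threading


class ThreadSafeAsyncioEvent(asyncio.Event):
    """asyncio.Event is not thread-safe; route set() through the loop so a
    callback running on a worker thread can wake a waiting coroutine."""
    def set_threadsafe(self, loop: asyncio.AbstractEventLoop) -> None:
        loop.call_soon_threadsafe(super().set)


async def wait_for_synthesis() -> None:
    loop = asyncio.get_running_loop()
    done = ThreadSafeAsyncioEvent()

    def on_complete() -> None:
        # In the real sample this would be the synthesizer callback's
        # completion hook, fired after async_streaming_complete().
        done.set_threadsafe(loop)

    # Stand-in for the SDK's callback thread:
    threading.Timer(0.5, on_complete).start()

    await done.wait()  # suspends this coroutine without blocking the event loop
    print('synthesis finished')


asyncio.run(wait_for_synthesis())
```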
--------------------------------------------------------------------------------
/samples/speech-synthesizer/synthesize_speech_from_text_with_cloned_voice/README.md:
--------------------------------------------------------------------------------
[comment]: # (title and brief introduction of the sample)
## Clone Your Voice, Then Synthesize and Play Speech (Streaming Mode)
This sample shows how to record audio by following the prompts, clone your own voice, and synthesize speech from a given text. The sample fetches the returned audio as a stream and plays it in real time, and also shows how to save the audio to a file from the streaming callback.

[comment]: # (list of scenarios of the sample)
### :point_right: Use Cases

| Scenario | Typical Usage | Notes |
| ----- | ----- | ----- |
| **Call-center telephony** | Text-to-speech for service replies | *Use a custom voice to speak a service bot's replies in real time* |
| **Digital human** | News broadcasting | *Use a custom voice in news and similar scenarios to read out text via speech synthesis* |

[comment]: # (supported programming languages of the sample)
### :point_right: Programming Languages
- [Python](./python)
- [Java](./java)

[comment]: # (model and interface of the sample)
### :point_right: Reference Details
| Recommended Model | API Details |
| --- | --- |
| **cosyvoice-v1** | [CosyVoice speech synthesis API details](https://help.aliyun.com/zh/model-studio/developer-reference/api-details-25)<br>[Voice cloning](https://help.aliyun.com/zh/model-studio/developer-reference/cosyvoice-clone-api)|

### :point_right: Expected Results

This sample has two parts: recording audio and cloning the voice.

#### Record audio

When the sample runs, it starts recording. Follow the prompts to record a clip for cloning; it is saved to `your_record_file.wav`. Upload it to Alibaba Cloud OSS or another cloud-storage service to obtain a downloadable HTTP link.

#### Clone the voice

When the sample runs, it creates a cloned voice from the recording you provide, then uses the cloned voice to synthesize the sample text "你好,现在我在用你自己克隆的语音朗读这一段文本~". The synthesized audio is delivered in streaming mode, played through the speaker, and saved to the file `result.mp3`.

[comment]: # (technical support of the sample)
### :point_right: Technical Support

--------------------------------------------------------------------------------
/samples/speech-synthesizer/synthesize_speech_from_text_with_cloned_voice/java/README.md:
--------------------------------------------------------------------------------
[comment]: # (title and brief introduction of the sample)
## Clone Your Voice, Then Synthesize and Play Speech (Streaming Mode)
## Java

[comment]: # (prerequisites)
### :point_right: Prerequisites

1. #### Configure your Alibaba Cloud Model Studio (Bailian) API key

    Before running this sample, you need an Alibaba Cloud account, a Model Studio API key, and the necessary environment configuration. For detailed API-key setup steps, see [PREREQUISITES.md](../../../../PREREQUISITES.md)

2. #### Java runtime

    Before running this sample, you need a Java runtime and the Maven build tool.


[comment]: # (how to run the sample and expected results)
### :point_right: Run the Sample

You can run this sample with run.sh (Linux, macOS) or run.bat (Windows).

When the sample runs, it creates a cloned voice from a sample recording and uses it to synthesize the sample text "你好,欢迎使用阿里巴巴通义语音实验室的音色复刻服务~". The synthesized audio is delivered in streaming mode, played through the speaker, and saved to the file `result.mp3`.

You can change `audioUrl` to replace the sample recording with your own audio file.

You can change `textArray` to synthesize your own text.

You can run `DeleteVoiceByPrefix` to batch-delete voices with a given prefix.

[comment]: # (technical support of the sample)
### :point_right: Technical Support


--------------------------------------------------------------------------------
/samples/speech-synthesizer/synthesize_speech_from_text_with_cloned_voice/java/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>alibabacloud-bailian-speech-demo-java</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>dashscope-sdk-java</artifactId>
            <version>2.18.0</version>
        </dependency>

        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.24</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.32</version>
        </dependency>

        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>1.2.6</version>
        </dependency>

        <dependency>
            <groupId>com.googlecode.soundlibs</groupId>
            <artifactId>mp3spi</artifactId>
            <version>1.9.5.4</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.4</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>org.alibaba.speech.examples.speech_synthesizer.CloneVoiceAndSynthesisTextAndPlay</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/samples/speech-synthesizer/synthesize_speech_from_text_with_cloned_voice/java/run.bat:
--------------------------------------------------------------------------------
@echo off

call mvn clean
call mvn package
java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar
--------------------------------------------------------------------------------
/samples/speech-synthesizer/synthesize_speech_from_text_with_cloned_voice/java/run.sh:
--------------------------------------------------------------------------------
mvn clean package
java -jar target/alibabacloud-bailian-speech-demo-java-1.0-SNAPSHOT.jar
--------------------------------------------------------------------------------
/samples/speech-synthesizer/synthesize_speech_from_text_with_cloned_voice/java/src/main/java/org/alibaba/speech/utils/RealtimeMp3Player.java:
--------------------------------------------------------------------------------
/*
 * Copyright (C) Alibaba Group. All Rights Reserved.
 * MIT License (https://opensource.org/licenses/MIT)
 */
package org.alibaba.speech.utils;

import java.io.IOException;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.nio.ByteBuffer;
import java.util.concurrent.CountDownLatch;
import javazoom.jl.decoder.JavaLayerException;
import javazoom.jl.player.advanced.AdvancedPlayer;
import javazoom.jl.player.advanced.PlaybackEvent;
import javazoom.jl.player.advanced.PlaybackListener;

// The JLayer library is used in this demo for audio decoding and playback,
// but you can employ other methods suited to your needs.
public class RealtimeMp3Player {

  // audio player
  private static AdvancedPlayer player;
  // piped streams: the synthesizer writes mp3 data to the output end,
  // and the player reads it from the input end
  private static PipedOutputStream pipedOutputStream; // used to write audio data to the pipe
  private static PipedInputStream pipedInputStream; // used to read audio data from the pipe
  CountDownLatch latch = new CountDownLatch(1);

  public void start() {
    System.out.println("build pipe stream for audio to play");
    try {
      pipedOutputStream = new PipedOutputStream();
      pipedInputStream = new PipedInputStream(pipedOutputStream, 1024 * 256);
    } catch (IOException e) {
      e.printStackTrace();
    }

    new Thread(
            () -> {
              try {
                player = new AdvancedPlayer(pipedInputStream);

                // Create a listener to respond to playback events
                player.setPlayBackListener(
                    new PlaybackListener() {
                      @Override
                      public void playbackFinished(PlaybackEvent event) {
                        System.out.println("Playback finished.");
                        latch.countDown();
                        // this demo exits once playback completes
                        System.exit(0);
                      }
                    });

                System.out.println("player start");
                player.play();
              } catch (JavaLayerException e) {
                e.printStackTrace();
              }
            })
        .start();
  }

  // write audio data to the pipe stream
  public void write(ByteBuffer audioData) {
    try {
      pipedOutputStream.write(audioData.array());
      pipedOutputStream.flush();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  // stop feeding audio data to the pipe stream and wait for playback to finish
  public void stop() {
    System.out.println("Stop AudioPlayer data feed");
    try {
      pipedOutputStream.close();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    try {
      latch.await();
    } catch (InterruptedException e) {
      throw new RuntimeException(e);
    }
    System.out.println("AudioPlayer stopped");
  }
}
--------------------------------------------------------------------------------
/samples/speech-synthesizer/synthesize_speech_from_text_with_cloned_voice/java/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>

    <root level="INFO">
        <appender-ref ref="STDOUT"/>
    </root>
</configuration>
--------------------------------------------------------------------------------
/samples/speech-synthesizer/synthesize_speech_from_text_with_cloned_voice/python/README.md:
--------------------------------------------------------------------------------
[comment]: # (title and brief introduction of the sample)
## Clone Your Voice, Then Synthesize and Play Speech (Streaming Mode)
## Python

[comment]: # (prerequisites)
### :point_right: Prerequisites

1. #### Configure your Alibaba Cloud Model Studio (Bailian) API key

    Before running this sample, you need an Alibaba Cloud account, a Model Studio API key, and the necessary environment configuration. For detailed API-key setup steps, see [PREREQUISITES.md](../../../../PREREQUISITES.md)

1. #### Install ffmpeg

    This sample uses ffmpeg for audio/video decoding. We recommend downloading it from the official site and adding the ffmpeg install path to your environment variables: [ffmpeg downloads](https://www.ffmpeg.org/download.html). You can also refer to [How to install ffmpeg](../../../docs/QA/ffmpeg.md).

1. #### Install Python dependencies

    The Model Studio SDK requires Python 3.8 or later. You can install this sample's dependencies with:
    ```commandline
    pip3 install -r requirements.txt
    ```
    See the [ffmpeg-python documentation](https://github.com/kkroening/ffmpeg-python) for installing ffmpeg.

[comment]: # (how to run the sample and expected results)
### :point_right: Run the Sample

#### Record audio
First, run the following command and follow the prompts to record a clip for cloning; it is saved to `your_record_file.wav`. Upload it to Alibaba Cloud OSS or another cloud-storage service to obtain a downloadable HTTP link.

```commandline
python3 record.py
```

#### Clone the voice and synthesize speech

Once you have a downloadable HTTP link, run the sample with the following command, substituting your recording's HTTP link for the parameter:

```commandline
python3 run.py
```

When the sample runs, it creates a cloned voice from the recording you provide, then uses the cloned voice to synthesize the sample text "你好,欢迎使用阿里巴巴通义语音实验室的音色复刻服务~". The synthesized audio is delivered in streaming mode, played through the speaker, and saved to the file `result.mp3`.

If no URL parameter is provided, a bundled sample recording is used for cloning.

You can change `text_to_synthesize` to synthesize your own text.

You can run `delete_voice_by_prefix` to batch-delete voices with a given prefix.

[comment]: # (technical support of the sample)
### :point_right: Technical Support
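The clone-then-synthesize step boils down to passing the enrolled voice id where a built-in voice name would normally go. Below is a minimal sketch, assuming you already have a cloned voice id from enrollment; the id shown is a hypothetical placeholder, while `SpeechSynthesizer` usage mirrors the concurrent sample's run.py above.

```python
import os

import dashscope
from dashscope.audio.tts_v2 import SpeechSynthesizer

dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY', '')

# Hypothetical placeholder: the voice id returned when you enroll a recording.
cloned_voice_id = 'cosyvoice-mydemo-xxxxxxxx'

# Pass the cloned voice id where a built-in voice name would go.
synthesizer = SpeechSynthesizer(model='cosyvoice-v1', voice=cloned_voice_id)
audio = synthesizer.call('你好,现在我在用你自己克隆的语音朗读这一段文本~')
if audio is not None:
    with open('result.mp3', 'wb') as f:
        f.write(audio)  # save the synthesized speech in your cloned voice
```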
--------------------------------------------------------------------------------
/samples/speech-synthesizer/synthesize_speech_from_text_with_cloned_voice/python/record.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# Copyright (C) Alibaba Group. All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import time
import wave

import pyaudio

CHUNK = 3200
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000


def record_audio_for_voice_clone():
    mic = pyaudio.PyAudio()
    stream = mic.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True)
    start_time = time.time()
    audio_buffer = []
    print('请在录音开始后朗读以下中文示例文案,或您准备好的其他文案。')
    print('示例文案:在这个世界上,最重要的事情就是保持好奇心,不断去探索未知的领域,只有这样我们才能成长并有所突破。')
    print('开始录音...')
    # record for 10 seconds
    while time.time() - start_time < 10:
        data = stream.read(CHUNK, exception_on_overflow=False)
        audio_buffer.append(data)
    stream.stop_stream()
    mic.terminate()
    # save the recorded pcm data as a wave file
    output_wave_file = 'your_record_file.wav'
    with wave.open(output_wave_file, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(mic.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(audio_buffer))
    print(f'录音保存在 {output_wave_file}')
    print('... 请将录制好的音频上传到云端并获取可下载http链接。')
    print('... 如果您没有云存储能力,可以使用阿里云的OSS(对象存储服务),上传录制好的音频到oss,并获取一个具有时效的url链接。')
    print('... https://help.aliyun.com/zh/oss/user-guide/simple-upload')


# main function
if __name__ == '__main__':
    record_audio_for_voice_clone()
--------------------------------------------------------------------------------
/samples/speech-synthesizer/synthesize_speech_from_text_with_cloned_voice/python/requirements.txt:
--------------------------------------------------------------------------------
dashscope
pyaudio
--------------------------------------------------------------------------------
/samples/utils/python/RealtimeMp3Player.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# Copyright (C) Alibaba Group. All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import subprocess
import threading

import pyaudio


# Decode mp3 audio with ffmpeg and play the resulting pcm with pyaudio in real time
class RealtimeMp3Player:
    def __init__(self, verbose=False):
        self.ffmpeg_process = None
        self._stream = None
        self._player = None
        self.play_thread = None
        self.stop_event = threading.Event()
        self.verbose = verbose

    def reset(self):
        self.ffmpeg_process = None
        self._stream = None
        self._player = None
        self.play_thread = None
        self.stop_event = threading.Event()

    def start(self):
        self._player = pyaudio.PyAudio()  # initialize pyaudio to play audio
        self._stream = self._player.open(
            format=pyaudio.paInt16, channels=1, rate=22050,
            output=True)  # initialize the pyaudio output stream
        try:
            self.ffmpeg_process = subprocess.Popen(
                [
                    'ffmpeg', '-i', 'pipe:0', '-f', 's16le', '-ar', '22050',
                    '-ac', '1', 'pipe:1'
                ],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
            )  # start ffmpeg to decode mp3 from stdin to pcm on stdout
            if self.verbose:
                print('mp3 audio player is started')
        except subprocess.CalledProcessError as e:
            # capture ffmpeg exceptions and print error details
            print(f'An error occurred: {e}')

    def stop(self):
        try:
            self.ffmpeg_process.stdin.close()
            self.ffmpeg_process.wait()
            if self.play_thread:  # the play thread only exists after the first write
                self.play_thread.join()
            self._stream.stop_stream()
            self._stream.close()
            self._player.terminate()
            if self.ffmpeg_process:
                self.ffmpeg_process.terminate()
            if self.verbose:
                print('mp3 audio player is stopped')
        except subprocess.CalledProcessError as e:
            # capture ffmpeg exceptions and print error details
            print(f'An error occurred: {e}')

    def play_audio(self):
        # play pcm data decoded by ffmpeg until the decoded stream is drained
        try:
            while not self.stop_event.is_set():
                pcm_data = self.ffmpeg_process.stdout.read(1024)
                if pcm_data:
                    self._stream.write(pcm_data)
                else:
                    break
        except subprocess.CalledProcessError as e:
            # capture ffmpeg exceptions and print error details
            print(f'An error occurred: {e}')

    def write(self, data: bytes) -> None:
        # feed mp3 bytes to ffmpeg; start the playback thread on the first write
        try:
            self.ffmpeg_process.stdin.write(data)
            if self.play_thread is None:
                self._stream.start_stream()
                self.play_thread = threading.Thread(target=self.play_audio)
                self.play_thread.start()
        except subprocess.CalledProcessError as e:
            # capture ffmpeg exceptions and print error details
            print(f'An error occurred: {e}')
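A minimal usage sketch for this player, assuming ffmpeg is on the PATH and a `result.mp3` file exists; the chunk size is arbitrary and in the real samples the bytes arrive from the synthesizer's streaming callback rather than from a file.

```python
from RealtimeMp3Player import RealtimeMp3Player

player = RealtimeMp3Player(verbose=True)
player.start()  # spawns ffmpeg and opens the pyaudio output stream
with open('result.mp3', 'rb') as f:
    while True:
        chunk = f.read(4096)
        if not chunk:
            break
        player.write(chunk)  # feed mp3 bytes as they arrive
player.stop()  # close ffmpeg stdin, drain decoded audio, release pyaudio
```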
--------------------------------------------------------------------------------
/samples/utils/python/TranscriptionResultUtil.py:
--------------------------------------------------------------------------------
# !/usr/bin/env python3
# Copyright (C) Alibaba Group. All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
import json
import random
import time

import requests
from dashscope.api_entities.dashscope_response import TranscriptionResponse


def handle_transcription_result(transcribe_response: TranscriptionResponse):
    """
    Handle the transcription result: download each result URL and print the
    transcription content.
    """
    if transcribe_response.output.task_status == 'SUCCEEDED':
        results = transcribe_response.output.get('results')
        if results:
            __result_turn = 0
            # round-trip through json to normalize the results to plain dicts
            for result in json.loads(json.dumps(results)):
                transcription_url = result['transcription_url']
                if transcription_url:
                    # download the transcription result
                    __local_file_path = './' + str(
                        __result_turn) + '_transcription_result.json'
                    download_file(transcription_url, __local_file_path)
                    # read and print the transcription result
                    read_file_and_print_content(__local_file_path)
                    __result_turn += 1


def download_file(url, local_path):
    """
    Download a file from the given URL to a local path.

    Parameters:
    - url: file URL
    - local_path: local path where the file will be saved

    Returns:
    None
    """
    try:
        response = requests.get(url, stream=True, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        # retry once after a short random backoff; note that the retry skips
        # TLS certificate verification as a last resort
        print(f'Failed to download the file: {e}, retrying...')
        time.sleep(random.randint(1, 5))
        response = requests.get(url,
                                allow_redirects=True,
                                verify=False,
                                timeout=15)

    with open(local_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)


def read_file_and_print_content(file_path):
    """
    Read a downloaded transcription file and print its sentences.

    Parameter:
    - file_path: file path

    Returns:
    None
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        trans_result = f.read()

    if trans_result:
        trans_result = json.loads(trans_result)
        print('============= transcription for file : ',
              trans_result['file_url'], ' === START ===')
        for transcript in trans_result['transcripts']:
            for sentence in transcript['sentences']:
                text = sentence['text']
                print('==>: ', text)
        print('============= transcription for file : ',
              trans_result['file_url'], ' === END ===\n\n')
--------------------------------------------------------------------------------
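For context, here is a minimal sketch of how a batch-mode sample might drive this helper with DashScope's file-transcription interface. The model name and file URL are placeholder assumptions; the batch-mode sample READMEs in this repo are authoritative for the exact call.

```python
import os

import dashscope
from dashscope.audio.asr import Transcription

from TranscriptionResultUtil import handle_transcription_result

dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY', '')

# Submit a batch transcription task (model and URL are placeholders).
task = Transcription.async_call(
    model='paraformer-v2',
    file_urls=['https://example.com/sample_audio.mp3'])

# Block until the task finishes, then download and print the transcripts.
response = Transcription.wait(task=task.output.task_id)
handle_transcription_result(response)
```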