├── local
│   └── .gitkeep
├── youtube
│   └── .gitkeep
├── requirements.txt
├── .gitignore
├── logseq_whisper_subtitles_server
│   ├── audio_chinese.mp3
│   ├── audio_english.mp3
│   ├── app.py
│   └── services.py
├── run.sh
├── README.ja.md
└── README.md

--------------------------------------------------------------------------------
/local/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/youtube/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Flask
openai-whisper
yt-dlp

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
.vscode
local/*.mp3
youtube/*.mp3
.DS_Store

--------------------------------------------------------------------------------
/logseq_whisper_subtitles_server/audio_chinese.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usoonees/logseq-whisper-subtitles-server/HEAD/logseq_whisper_subtitles_server/audio_chinese.mp3

--------------------------------------------------------------------------------
/logseq_whisper_subtitles_server/audio_english.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usoonees/logseq-whisper-subtitles-server/HEAD/logseq_whisper_subtitles_server/audio_english.mp3

--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# One-time environment setup:
# conda create -n logseq-whisper-subtitles python=3.8
# conda activate logseq-whisper-subtitles
# pip3 install -r requirements.txt
export FLASK_APP=./logseq_whisper_subtitles_server/app.py
# export FLASK_ENV=development  # uncomment for development
flask run --host=0.0.0.0 --port=5014

--------------------------------------------------------------------------------
/logseq_whisper_subtitles_server/app.py:
--------------------------------------------------------------------------------
from flask import Flask, request, jsonify
from services import download_youtube, transcribe_audio, extract_audio_from_local_video, is_audio_file
import re
import os
import traceback

app = Flask(__name__)


@app.route('/transcribe', methods=['POST'])
def transcribe():
    try:
        text = request.form['text'].strip()
        min_length = request.form.get('min_length', '')
        model_size = request.form.get('model_size', '')
        graph_path = request.form.get('graph_path', '')
        zh_type = request.form.get('zh_type', 'zh-cn')

        source = None
        audio_path = None
        youtube_pattern = r"https://www\.youtube\.com/watch\?v=[a-zA-Z0-9_-]+|https://youtu\.be/[a-zA-Z0-9_-]+"
        youtube_match = re.search(youtube_pattern, text)

        # Match Markdown asset embeds (group 2), Logseq renderer macros (group 4),
        # and org-style links (group 5).
        local_file_pattern = r'(!\[.*?\]\((.*?)\))|(\{\{renderer :[a-zA-Z]+, (.*?)\}\})|\[\[(.*?)\]\[.*?\]\]'
        local_file_match = re.search(local_file_pattern, text)

        if youtube_match:
            youtube_url = youtube_match.group()
            audio_path = download_youtube(youtube_url)
            source = "youtube"

        elif local_file_match:
            if local_file_match.group(2) is not None:
                local_file_path = local_file_match.group(2)
            elif local_file_match.group(4) is not None:
                local_file_path = local_file_match.group(4)
            elif local_file_match.group(5) is not None:
                local_file_path = local_file_match.group(5)
            else:
                return jsonify({
                    "source": "",
                    "segments": [],
                    "error": "No local file path found"
                })

            if local_file_path.startswith(("http://", "https://")):
                print("This is a URL, not a local file")
                return jsonify({
                    "source": "",
                    "segments": [],
                    "error": "This is a URL, not a local file"
                })

            source = "local"
            if local_file_path.startswith("../"):
                local_file_path = os.path.join(graph_path, local_file_path[3:])

            audio_path = local_file_path
            if not is_audio_file(local_file_path):
                audio_path = extract_audio_from_local_video(local_file_path)
                print(f"Extracted audio path: {audio_path}")

        else:
            return jsonify({
                "source": "",
                "segments": [],
                "error": "Unsupported source"
            })

        return jsonify({
            "error": "",
            "source": source,  # "youtube" or "local"
            "segments": transcribe_audio(audio_path, min_length, model_size, zh_type)
        })
    except Exception as e:
        traceback.print_exc()
        return jsonify({
            "error": "logseq-whisper-subtitles-server error: " + str(e),
            "source": "",
            "segments": []
        })


if __name__ == '__main__':
    app.run(debug=True, port=5014)
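For orientation, here is a minimal client sketch for the `/transcribe` endpoint above. It is not part of the repository: it assumes the server is already running on port 5014 (per run.sh), uses only the standard library, and the YouTube video ID is a placeholder. The form fields mirror those read in app.py.

```python
# Minimal client sketch for POST /transcribe (assumption: server on port 5014).
import json
import urllib.parse
import urllib.request

form = urllib.parse.urlencode({
    "text": "https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder video ID
    "model_size": "base",  # any size accepted by whisper.load_model
    "min_length": "100",   # merge segments shorter than this many characters
    "zh_type": "zh-cn",    # only consulted for Chinese audio
}).encode()

with urllib.request.urlopen("http://localhost:5014/transcribe", data=form) as resp:
    result = json.load(resp)

print(result["error"] or result["source"])  # "youtube" on success
for seg in result["segments"]:
    print(seg["startTime"], seg["segment"])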
--------------------------------------------------------------------------------
/README.ja.md:
--------------------------------------------------------------------------------
## logseq-whisper-subtitles-server

Whisper 文字起こし プラグイン(Logseq)用の専用サーバー(ローカルにセットアップ) [English](README.md) | [日本語](README.ja.md)

### 概要
* このサーバーは、[logseq-plugin-whisper-subtitles](https://github.com/usoonees/logseq-plugin-whisper-subtitles) というLogseq用プラグインと連携して動作することを意図しています。
* PC上のローカルにインストールされたWhisperサービス(処理サーバー)に要求し、そのデータを受信するために設計されたローカルサーバーです。
> サーバーのセットアップだけでなく、その依存関係のインストールも必要です。

### 依存関係

両方ともインストールしてください。
1. **Python:** サーバーの実行には、Python が必須です。
   > 次のいずれかを参考にしてください。
   1. [Windows版 Python のインストール](https://www.python.jp/install/windows/install.html)
   2. [macOS版 Python のインストール](https://www.python.jp/install/macos/install_python.html)
1. **ffmpeg:** Whisper サービスを動作させるために必須です。
   1. コマンドプロンプト(ターミナル)などから、次のいずれかのコマンドを実行し、インストールしてください。※[Whisper セットアップ ドキュメント (英語)](https://github.com/openai/whisper#setup)より
      ```
      # macOS では、Homebrew (https://brew.sh/) を使います。
      brew install ffmpeg

      # Windows では、Chocolatey (https://chocolatey.org/) を使います。
      choco install ffmpeg
      ```

### セットアップ

> 専用サーバーをセットアップする前に、依存関係がインストールされていることを確認してください。

1. PC上のローカルに、リポジトリをクローンします
   > どこか安全な場所に新しいフォルダを作成し、そのフォルダに対して、次のコマンドを実行します。Windows 11 の場合は、そのフォルダを右クリックしてターミナルを開きます。
   ```bash
   git clone https://github.com/usoonees/logseq-whisper-subtitles-server.git
   cd logseq-whisper-subtitles-server
   ```

1. Python用パッケージをインストールします

   ```bash
   pip3 install -r requirements.txt
   ```

1. 初回のため、依存関係に問題がないかテスト動作をおこない、Whisper が正しく使える状態かどうか確認します

   ```bash
   cd logseq_whisper_subtitles_server
   python services.py
   ```

1. コマンドプロンプトなどに表示される結果が、次のような出力であれば、セットアップは成功しています。

   ```
   Loading base whisper model...
   Loading base whisper model done.
   /Users/usoon/miniforge3/envs/test/lib/python3.9/site-packages/whisper/transcribe.py:114: UserWarning: FP16 is not supported on CPU; using FP32 instead
     warnings.warn("FP16 is not supported on CPU; using FP32 instead")
   00:00:00 --> 00:00:11
   When you hear the term artificial intelligence, what comes to mind?


   00:00:09 --> 00:00:13
   Superpowered robots?


   00:00:11 --> 00:00:18
   Hyperintelligent devices?


   00:00:13 --> 00:00:29
   Science fiction has familiarized the world with the concept, but outside of Hollywood, what is artificial intelligence and what can AI actually do?
   ....
   ```

1. セットアップが終わったら、次に進み、専用サーバーを起動します。

### 起動方法

専用サーバーを起動するには、2つのオプションがあります:

1. **サーバーを手動で起動する**:
   - セットアップで作成されたフォルダ(**logseq-whisper-subtitles-server**)の中にある **logseq_whisper_subtitles_server** というフォルダに対して、次のコマンドを実行します。
   > とても紛らわしいですが、"logseq-whisper-subtitles-server > logseq_whisper_subtitles_server" というフォルダ構造になっています。その中に、app.py というPythonスクリプトが格納されています。
   1. "app.py" という専用アプリを Python で起動します。
      ```bash
      python3 app.py
      ```
      > "python3" でエラーがでる場合は、"python" でも大丈夫です。

1. bash スクリプトを使用(オプション)

   ```bash
   bash run.sh
   ```

Logseq で専用プラグイン(**logseq-plugin-whisper-subtitles**)を使用する場合は、サーバーが実行されていることを確認してください。

### 関連リポジトリ

- [logseq-plugin-whisper-subtitles](https://github.com/usoonees/logseq-plugin-whisper-subtitles) - ローカルのWhisperに文字起こしを要求し、その結果をLogseqに取り込むまでをサポートするプラグインです。
- [Whisper](https://github.com/openai/whisper) - 音声認識モデルです。動画から抽出した音声をテキストに変換します。

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# logseq-whisper-subtitles-server

[English](README.md) | [日本語](README.ja.md)

### Overview
* This server is designed to work in conjunction with the Logseq plugin called [logseq-plugin-whisper-subtitles](https://github.com/usoonees/logseq-plugin-whisper-subtitles).
* It is a local server designed to send requests to the Whisper service (processing server) installed locally on the PC and receive data from it.
> In addition to setting up the server, it is necessary to install its dependencies.

### Dependencies

Please install both of the following:
1. **Python:** Python is required for running the server.
   1. [Python 3 Installation & Setup Guide](https://realpython.com/installing-python/)
1. **ffmpeg:** This is essential for running the Whisper service.
   1. Execute one of the following commands from the command prompt (terminal) or similar to install it. *[Whisper Setup Documentation](https://github.com/openai/whisper#setup)*
      ```
      # On macOS, use Homebrew (https://brew.sh/).
      brew install ffmpeg

      # On Windows, use Chocolatey (https://chocolatey.org/).
      choco install ffmpeg
      ```

### Setup

> Before setting up the dedicated server, ensure that the dependencies are installed.

1. Clone the repository to the local PC.
   > Create a new folder in a safe location and execute the following commands in that folder. On Windows 11, right-click the folder and open a terminal.
   ```bash
   git clone https://github.com/usoonees/logseq-whisper-subtitles-server.git
   cd logseq-whisper-subtitles-server
   ```

1. Install the Python packages.

   ```bash
   pip3 install -r requirements.txt
   ```

1. For the initial setup, run a quick test to confirm there are no issues with the dependencies and that Whisper is functioning correctly.

   ```bash
   cd logseq_whisper_subtitles_server
   python services.py
   ```

1. If the results displayed in the command prompt resemble the following output, the setup is successful:

   ```
   Loading base whisper model...
   Loading base whisper model done.
   /Users/usoon/miniforge3/envs/test/lib/python3.9/site-packages/whisper/transcribe.py:114: UserWarning: FP16 is not supported on CPU; using FP32 instead
     warnings.warn("FP16 is not supported on CPU; using FP32 instead")
   00:00:00 --> 00:00:11
   When you hear the term artificial intelligence, what comes to mind?


   00:00:09 --> 00:00:13
   Superpowered robots?


   00:00:11 --> 00:00:18
   Hyperintelligent devices?


   00:00:13 --> 00:00:29
   Science fiction has familiarized the world with the concept, but outside of Hollywood, what is artificial intelligence and what can AI actually do?
   ....
   ```

1. Once the setup is complete, proceed to start the dedicated server.

### Starting the Server

There are two options for starting the dedicated server:

1. **Start the server manually**:
   - Execute the following command in the folder named **logseq_whisper_subtitles_server** located inside the folder created during setup (**logseq-whisper-subtitles-server**).
   > It's a bit confusing, but the folder structure is "logseq-whisper-subtitles-server > logseq_whisper_subtitles_server," and inside it there's a Python script called "app.py."
   1. Launch the dedicated app called "app.py" using Python.
      ```bash
      python3 app.py
      ```
      > If you encounter an error with "python3," "python" should work as well.

1. Use the bash script (optional):

   ```bash
   bash run.sh
   ```

Make sure that the server is running if you intend to use the dedicated plugin (**logseq-plugin-whisper-subtitles**) in Logseq.

### Related Repositories

- [logseq-plugin-whisper-subtitles](https://github.com/usoonees/logseq-plugin-whisper-subtitles) - The Logseq plugin that interfaces with this server to extract subtitles and timestamps from videos.
- [whisper](https://github.com/openai/whisper) - The speech-recognition model used to transcribe the audio extracted from videos.
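As a quick way to confirm the server from the section above is actually listening before enabling the plugin, the following standard-library sketch can be used. It is an illustrative check, not part of the repository: it assumes the default port 5014 from run.sh, and an HTTP 404 on the root path still proves the Flask process answered, since only `/transcribe` is routed in app.py.

```python
# Minimal reachability check (assumption: server on the default port 5014).
import urllib.error
import urllib.request

try:
    urllib.request.urlopen("http://localhost:5014/", timeout=3)
except urllib.error.HTTPError:
    # Flask answered; a 404 on "/" is expected because only /transcribe is routed.
    print("Server is up.")
except OSError:
    print("Server is not reachable; start it first (e.g. `bash run.sh`).")
else:
    print("Server is up.")
```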
--------------------------------------------------------------------------------
/logseq_whisper_subtitles_server/services.py:
--------------------------------------------------------------------------------
import yt_dlp
from datetime import timedelta
import whisper
import uuid
import os
import subprocess

EN_SEGMENT_SYMBOLS = ['.', '?', '!']
PUNCTUATION = [',', '。', '?', '!']
DEFAULT_MIN_LENGTH = 100  # set to 0 to disable merging segments
DEFAULT_MODEL_SIZE = "base"

print("Loading base whisper model...")
models = {
    DEFAULT_MODEL_SIZE: whisper.load_model(DEFAULT_MODEL_SIZE)
}
print("Loading base whisper model done.")


def is_audio_file(filename):
    audio_extensions = ['.mp3', '.wav', '.aac', '.ogg', '.flac', '.m4a', '.wma']
    _, file_extension = os.path.splitext(filename)
    return file_extension.lower() in audio_extensions


def extract_audio_from_local_video(video_path):
    audio_output_path = os.path.join('local', f'local_audio_{uuid.uuid4().hex}.mp3')
    if not os.path.exists('local'):
        os.makedirs('local')
    command = [
        'ffmpeg',
        '-i', video_path,   # input video file path
        '-q:a', '0',        # audio quality (0 means best)
        '-map', 'a',        # extract the audio stream
        '-vn',              # no video output
        audio_output_path   # output audio file path
    ]
    try:
        print("Converting local video to audio ...")
        # check=True makes a non-zero ffmpeg exit raise CalledProcessError;
        # capture_output=True makes e.stderr available in the handler below.
        subprocess.run(command, check=True, capture_output=True)
        print("Converting local video to audio done.")
    except subprocess.CalledProcessError as e:
        print("Converting local video to audio failed.")
        raise RuntimeError(f"Failed to convert local video to audio: {e.stderr.decode()}") from e

    return audio_output_path


def download_youtube(video_url):
    print(f"Downloading the video: {video_url} into audio ...")
    vid = uuid.uuid4().hex
    if not os.path.exists('youtube'):
        os.makedirs('youtube')
    audio_name = os.path.join('youtube', f'youtube_audio_{vid}.mp3')

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': audio_name,
        'keepvideo': True,  # keep the originally downloaded file
        'postprocessor_args': [
            '-ar', '16000'  # set the audio sample rate to 16 kHz, matching the original code
        ],
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])
        print(f"Downloading the video: {video_url} into audio done.")

        # Check whether the file exists; if not, try another possible filename.
        if not os.path.exists(audio_name):
            possible_name = audio_name + '.mp3'
            if os.path.exists(possible_name):
                os.rename(possible_name, audio_name)
            else:
                raise FileNotFoundError(f"Could not find the downloaded audio file: {audio_name}")

        return audio_name
    except Exception as e:
        print(f"Error downloading video: {str(e)}")
        raise


def replace_punctuation(text):
    # Replace half-width punctuation with full-width equivalents for CJK output.
    return text.replace(",", ",").replace(".", "。").replace("?", "?").replace("!", "!")


def transcribe_audio(audio_path, min_length=DEFAULT_MIN_LENGTH, model_size=DEFAULT_MODEL_SIZE, zh_type='zh-cn'):
    if not min_length:
        min_length = DEFAULT_MIN_LENGTH

    if not model_size:
        model_size = DEFAULT_MODEL_SIZE

    # Models are loaded lazily and cached so repeated requests reuse them.
    if model_size not in models:
        print(f"Loading {model_size} whisper model...")
        models[model_size] = whisper.load_model(model_size)

    model = models[model_size]

    print("Using model: ", model_size)

    if zh_type.strip() == 'zh-cn':
        print("Transcribing Chinese simplified audio ...")
        # The initial prompt nudges Whisper toward Simplified Chinese output
        # (it otherwise tends to emit Traditional Chinese).
        transcribe = model.transcribe(audio=audio_path, verbose=True, initial_prompt="对于普通话句子,以中文简体输出")
    else:
        transcribe = model.transcribe(audio=audio_path, verbose=True)

    segments = transcribe['segments']
    detect_language = transcribe.get('language', '')
    print("detected language: ", detect_language)

    previous_segment = None
    previous_start_time = None
    previous_start_time_format = None
    previous_end_time_format = None
    previous_connect_space = " "
    res = []
    for segment in segments:
        start_time = int(segment['start'])
        # str(timedelta(...)) renders as "H:MM:SS"; the "0" prefix yields "0H:MM:SS".
        start_time_format = str(0) + str(timedelta(seconds=int(segment['start'])))
        end_time_format = str(0) + str(timedelta(seconds=int(segment['end'])))
        text = segment['text'].strip()
        if not text:
            continue
        cur_connect_space = previous_connect_space
        if detect_language in ['zh', 'ja']:
            text = replace_punctuation(text)
            if text[-1] in PUNCTUATION:
                previous_connect_space = ""
            else:
                previous_connect_space = ","

        # Check whether the current segment ends a sentence; if not (or if the
        # accumulated segment is still shorter than min_length), keep merging.
        is_segment_symbol = text[-1] in EN_SEGMENT_SYMBOLS

        if detect_language != 'en':
            is_segment_symbol = True

        if previous_segment and (not is_segment_symbol or len(previous_segment) < int(min_length)):
            previous_segment = f"{previous_segment}{cur_connect_space}{text}"
            previous_end_time_format = end_time_format
        else:
            # If this is not the first iteration, flush the previous segment
            if previous_segment:
                merged_segment = f"{previous_start_time_format} --> {previous_end_time_format}\n{previous_segment}\n\n"
                print(merged_segment)
                res.append({
                    "startTime": previous_start_time,
                    "segment": previous_segment
                })

            # Start a new segment
            previous_segment = text
            previous_start_time_format = start_time_format
            previous_start_time = start_time
            previous_end_time_format = end_time_format

    if previous_segment:
        last_segment = f"{previous_start_time_format} --> {previous_end_time_format}\n{previous_segment}\n\n"
        print(last_segment)
        res.append({
            "startTime": previous_start_time,
            "segment": previous_segment
        })

    return res


if __name__ == "__main__":
    print("=== English audio test")
    res_en = transcribe_audio("audio_english.mp3")
    print(res_en)

    print("=== Chinese audio test")
    res_cn = transcribe_audio("audio_chinese.mp3")
    print(res_cn)
--------------------------------------------------------------------------------
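A standalone sketch of the timestamp idiom used in transcribe_audio above: `str(timedelta(seconds=71))` renders as `0:01:11`, so prefixing `"0"` produces the `00:01:11` form seen in the sample output in the READMEs. The `format_timestamp` helper name is illustrative and not part of services.py.

```python
# Standalone sketch of the timestamp formatting in transcribe_audio.
from datetime import timedelta

def format_timestamp(seconds):  # illustrative helper, not part of the codebase
    return str(0) + str(timedelta(seconds=int(seconds)))

assert format_timestamp(11) == "00:00:11"
assert format_timestamp(71) == "00:01:11"
assert format_timestamp(3671) == "01:01:11"
print(format_timestamp(36000))  # "010:00:00" -- the idiom only pads correctly below 10 hours
```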