├── local
│   └── .gitkeep
├── youtube
│   └── .gitkeep
├── requirements.txt
├── .gitignore
├── logseq_whisper_subtitles_server
│   ├── audio_chinese.mp3
│   ├── audio_english.mp3
│   ├── app.py
│   └── services.py
├── run.sh
├── README.ja.md
└── README.md

--------------------------------------------------------------------------------
/local/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/youtube/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Flask
openai-whisper
yt-dlp

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
.vscode
local/*.mp3
youtube/*.mp3
.DS_Store

--------------------------------------------------------------------------------
/logseq_whisper_subtitles_server/audio_chinese.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usoonees/logseq-whisper-subtitles-server/HEAD/logseq_whisper_subtitles_server/audio_chinese.mp3

--------------------------------------------------------------------------------
/logseq_whisper_subtitles_server/audio_english.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usoonees/logseq-whisper-subtitles-server/HEAD/logseq_whisper_subtitles_server/audio_english.mp3

--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# One-time environment setup:
# conda create -n logseq-whisper-subtitles python=3.8
# conda activate logseq-whisper-subtitles
# pip3 install -r requirements.txt
export FLASK_APP=./logseq_whisper_subtitles_server/app.py
# export FLASK_ENV=development  # uncomment for development
flask run --host=0.0.0.0 --port=5014

--------------------------------------------------------------------------------
/logseq_whisper_subtitles_server/app.py:
--------------------------------------------------------------------------------
from flask import Flask, request, jsonify
from services import download_youtube, transcribe_audio, extract_audio_from_local_video, is_audio_file
import re
import os
import traceback

app = Flask(__name__)


@app.route('/transcribe', methods=['POST'])
def transcribe():
    try:
        text = request.form['text'].strip()
        min_length = request.form.get('min_length', '')
        model_size = request.form.get('model_size', '')
        graph_path = request.form.get('graph_path', '')
        zh_type = request.form.get('zh_type', 'zh-cn')

        source = None
        audio_path = None
        youtube_pattern = r"https://www\.youtube\.com/watch\?v=[a-zA-Z0-9_-]+|https://youtu\.be/[a-zA-Z0-9_-]+"
        youtube_match = re.search(youtube_pattern, text)

        # Match Markdown asset embeds (group 2), Logseq renderer macros (group 4),
        # and org-style links (group 5).
        local_file_pattern = r'(!\[.*?\]\((.*?)\))|(\{\{renderer :[a-zA-Z]+, (.*?)\}\})|\[\[(.*?)\]\[.*?\]\]'
        local_file_match = re.search(local_file_pattern, text)

        if youtube_match:
            youtube_url = youtube_match.group()
            audio_path = download_youtube(youtube_url)
            source = "youtube"

        elif local_file_match:
            if local_file_match.group(2) is not None:
                local_file_path = local_file_match.group(2)
            elif local_file_match.group(4) is not None:
                local_file_path = local_file_match.group(4)
            elif local_file_match.group(5) is not None:
                local_file_path = local_file_match.group(5)
            else:
                return jsonify({
                    "source": "",
                    "segments": [],
                    "error": "No local file path found"
                })

            if local_file_path.startswith(("http://", "https://")):
                print("This is a URL, not a local file")
                return jsonify({
                    "source": "",
                    "segments": [],
                    "error": "This is a URL, not a local file"
                })

            source = "local"
            if local_file_path.startswith("../"):
                local_file_path = os.path.join(graph_path, local_file_path[3:])

            audio_path = local_file_path
            if not is_audio_file(local_file_path):
                audio_path = extract_audio_from_local_video(local_file_path)
                print(f"Extracted audio path: {audio_path}")

        else:
            return jsonify({
                "source": "",
                "segments": [],
                "error": "Unsupported source"
            })

        return jsonify({
            "error": "",
            "source": source,  # "youtube" or "local"
            "segments": transcribe_audio(audio_path, min_length, model_size, zh_type)
        })
    except Exception as e:
        traceback.print_exc()
        return jsonify({
            "error": "logseq-whisper-subtitles-server error: " + str(e),
            "source": "",
            "segments": []
        })


if __name__ == '__main__':
    app.run(debug=True, port=5014)
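For orientation, here is a minimal client sketch for the `/transcribe` endpoint above. It is not part of the repository: it assumes the server is already running on port 5014 (per run.sh), uses only the standard library, and the YouTube video ID is a placeholder. The form fields mirror those read in app.py.

```python
# Minimal client sketch for POST /transcribe (assumption: server on port 5014).
import json
import urllib.parse
import urllib.request

form = urllib.parse.urlencode({
    "text": "https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder video ID
    "model_size": "base",  # any size accepted by whisper.load_model
    "min_length": "100",   # merge segments shorter than this many characters
    "zh_type": "zh-cn",    # only consulted for Chinese audio
}).encode()

with urllib.request.urlopen("http://localhost:5014/transcribe", data=form) as resp:
    result = json.load(resp)

print(result["error"] or result["source"])  # "youtube" on success
for seg in result["segments"]:
    print(seg["startTime"], seg["segment"])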
--------------------------------------------------------------------------------
/README.ja.md:
--------------------------------------------------------------------------------
## logseq-whisper-subtitles-server

Whisper 文字起こし プラグイン(Logseq)用の専用サーバー(ローカルにセットアップ) [English](README.md) | [日本語](README.ja.md)

### 概要
* このサーバーは、[logseq-plugin-whisper-subtitles](https://github.com/usoonees/logseq-plugin-whisper-subtitles) というLogseq用プラグインと連携して動作することを意図しています。
* PC上のローカルにインストールされたWhisperサービス(処理サーバー)に要求し、そのデータを受信するために設計されたローカルサーバーです。
> サーバーのセットアップだけでなく、その依存関係のインストールも必要です。

### 依存関係

両方ともインストールしてください。
1. **Python:** サーバーの実行には、Python が必須です。
   > 次のいずれかを参考にしてください。
   1. [Windows版 Python のインストール](https://www.python.jp/install/windows/install.html)
   2. [macOS版 Python のインストール](https://www.python.jp/install/macos/install_python.html)
1. **ffmpeg:** Whisper サービスを動作させるために必須です。
   1. コマンドプロンプト(ターミナル)などから、次のいずれかのコマンドを実行し、インストールしてください。※[Whisper セットアップ ドキュメント (英語)](https://github.com/openai/whisper#setup)より
      ```
      # macOS では、Homebrew (https://brew.sh/) を使います。
      brew install ffmpeg

      # Windows では、Chocolatey (https://chocolatey.org/) を使います。
      choco install ffmpeg
      ```

### セットアップ

> 専用サーバーをセットアップする前に、依存関係がインストールされていることを確認してください。

1. PC上のローカルに、リポジトリをクローンします
   > どこか安全な場所に新しいフォルダを作成し、そのフォルダに対して、次のコマンドを実行します。Windows 11 の場合は、そのフォルダを右クリックしてターミナルを開きます。
   ```bash
   git clone https://github.com/usoonees/logseq-whisper-subtitles-server.git
   cd logseq-whisper-subtitles-server
   ```

1. Python用パッケージをインストールします

   ```bash
   pip3 install -r requirements.txt
   ```

1. 初回のため、依存関係に問題がないかテスト動作をおこない、Whisper が正しく使える状態かどうか確認します

   ```bash
   cd logseq_whisper_subtitles_server
   python services.py
   ```

1. コマンドプロンプトなどに表示される結果が、次のような出力であれば、セットアップは成功しています。

   ```
   Loading base whisper model...
   Loading base whisper model done.
   /Users/usoon/miniforge3/envs/test/lib/python3.9/site-packages/whisper/transcribe.py:114: UserWarning: FP16 is not supported on CPU; using FP32 instead
     warnings.warn("FP16 is not supported on CPU; using FP32 instead")
   00:00:00 --> 00:00:11
   When you hear the term artificial intelligence, what comes to mind?


   00:00:09 --> 00:00:13
   Superpowered robots?


   00:00:11 --> 00:00:18
   Hyperintelligent devices?


   00:00:13 --> 00:00:29
   Science fiction has familiarized the world with the concept, but outside of Hollywood, what is artificial intelligence and what can AI actually do?
   ....
   ```

1. セットアップが終わったら、次に進み、専用サーバーを起動します。

### 起動方法

専用サーバーを起動するには、2つのオプションがあります:

1. **サーバーを手動で起動する**:
   - セットアップで作成されたフォルダ(**logseq-whisper-subtitles-server**)の中にある **logseq_whisper_subtitles_server** というフォルダに対して、次のコマンドを実行します。
   > とても紛らわしいですが、"logseq-whisper-subtitles-server > logseq_whisper_subtitles_server" というフォルダ構造になっています。その中に、app.py というPythonスクリプトが格納されています。
   1. "app.py" という専用アプリを Python で起動します。
      ```bash
      python3 app.py
      ```
      > "python3" でエラーがでる場合は、"python" でも大丈夫です。

1. bash スクリプトを使用(オプション)

   ```bash
   bash run.sh
   ```

Logseq で専用プラグイン(**logseq-plugin-whisper-subtitles**)を使用する場合は、サーバーが実行されていることを確認してください。

### 関連リポジトリ

- [logseq-plugin-whisper-subtitles](https://github.com/usoonees/logseq-plugin-whisper-subtitles) - ローカルのWhisperに文字起こしを要求し、その結果をLogseqに取り込むまでをサポートするプラグインです。
- [Whisper](https://github.com/openai/whisper) - 音声認識モデルです。動画から抽出した音声をテキストに変換します。

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# logseq-whisper-subtitles-server

[English](README.md) | [日本語](README.ja.md)

### Overview
* This server is designed to work in conjunction with the Logseq plugin called [logseq-plugin-whisper-subtitles](https://github.com/usoonees/logseq-plugin-whisper-subtitles).
* It is a local server designed to send requests to the Whisper service (processing server) installed locally on the PC and receive data from it.
> In addition to setting up the server, it is necessary to install its dependencies.

### Dependencies

Please install both of the following:
1. **Python:** Python is required for running the server.
   1. [Python 3 Installation & Setup Guide](https://realpython.com/installing-python/)
1. **ffmpeg:** This is essential for running the Whisper service.
   1. Execute one of the following commands from the command prompt (terminal) or similar to install it. *[Whisper Setup Documentation](https://github.com/openai/whisper#setup)*
      ```
      # On macOS, use Homebrew (https://brew.sh/).
      brew install ffmpeg

      # On Windows, use Chocolatey (https://chocolatey.org/).
      choco install ffmpeg
      ```

### Setup

> Before setting up the dedicated server, ensure that the dependencies are installed.

1. Clone the repository to the local PC.
   > Create a new folder in a safe location and execute the following commands in that folder. On Windows 11, right-click the folder and open a terminal.
   ```bash
   git clone https://github.com/usoonees/logseq-whisper-subtitles-server.git
   cd logseq-whisper-subtitles-server
   ```

1. Install the Python packages.

   ```bash
   pip3 install -r requirements.txt
   ```

1. For the initial setup, run a quick test to confirm there are no issues with the dependencies and that Whisper is functioning correctly.

   ```bash
   cd logseq_whisper_subtitles_server
   python services.py
   ```

1. If the results displayed in the command prompt resemble the following output, the setup is successful:

   ```
   Loading base whisper model...
   Loading base whisper model done.
   /Users/usoon/miniforge3/envs/test/lib/python3.9/site-packages/whisper/transcribe.py:114: UserWarning: FP16 is not supported on CPU; using FP32 instead
     warnings.warn("FP16 is not supported on CPU; using FP32 instead")
   00:00:00 --> 00:00:11
   When you hear the term artificial intelligence, what comes to mind?


   00:00:09 --> 00:00:13
   Superpowered robots?


   00:00:11 --> 00:00:18
   Hyperintelligent devices?


   00:00:13 --> 00:00:29
   Science fiction has familiarized the world with the concept, but outside of Hollywood, what is artificial intelligence and what can AI actually do?
   ....
   ```

1. Once the setup is complete, proceed to start the dedicated server.

### Starting the Server

There are two options for starting the dedicated server:

1. **Start the server manually**:
   - Execute the following command in the folder named **logseq_whisper_subtitles_server** located inside the folder created during setup (**logseq-whisper-subtitles-server**).
   > It's a bit confusing, but the folder structure is "logseq-whisper-subtitles-server > logseq_whisper_subtitles_server," and inside it there's a Python script called "app.py."
   1. Launch the dedicated app called "app.py" using Python.
      ```bash
      python3 app.py
      ```
      > If you encounter an error with "python3," "python" should work as well.

1. Use the bash script (optional):

   ```bash
   bash run.sh
   ```

Make sure that the server is running if you intend to use the dedicated plugin (**logseq-plugin-whisper-subtitles**) in Logseq.

### Related Repositories

- [logseq-plugin-whisper-subtitles](https://github.com/usoonees/logseq-plugin-whisper-subtitles) - The Logseq plugin that interfaces with this server to extract subtitles and timestamps from videos.
- [whisper](https://github.com/openai/whisper) - The speech-recognition model used to transcribe the audio extracted from videos.
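As a quick way to confirm the server from the section above is actually listening before enabling the plugin, the following standard-library sketch can be used. It is an illustrative check, not part of the repository: it assumes the default port 5014 from run.sh, and an HTTP 404 on the root path still proves the Flask process answered, since only `/transcribe` is routed in app.py.

```python
# Minimal reachability check (assumption: server on the default port 5014).
import urllib.error
import urllib.request

try:
    urllib.request.urlopen("http://localhost:5014/", timeout=3)
except urllib.error.HTTPError:
    # Flask answered; a 404 on "/" is expected because only /transcribe is routed.
    print("Server is up.")
except OSError:
    print("Server is not reachable; start it first (e.g. `bash run.sh`).")
else:
    print("Server is up.")
```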
--------------------------------------------------------------------------------
/logseq_whisper_subtitles_server/services.py:
--------------------------------------------------------------------------------
import yt_dlp
from datetime import timedelta
import whisper
import uuid
import os
import subprocess

EN_SEGMENT_SYMBOLS = ['.', '?', '!']
PUNCTUATION = [',', '。', '?', '!']
DEFAULT_MIN_LENGTH = 100  # set to 0 to disable merging segments
DEFAULT_MODEL_SIZE = "base"

print("Loading base whisper model...")
models = {
    DEFAULT_MODEL_SIZE: whisper.load_model(DEFAULT_MODEL_SIZE)
}
print("Loading base whisper model done.")


def is_audio_file(filename):
    audio_extensions = ['.mp3', '.wav', '.aac', '.ogg', '.flac', '.m4a', '.wma']
    _, file_extension = os.path.splitext(filename)
    return file_extension.lower() in audio_extensions


def extract_audio_from_local_video(video_path):
    audio_output_path = os.path.join('local', f'local_audio_{uuid.uuid4().hex}.mp3')
    if not os.path.exists('local'):
        os.makedirs('local')
    command = [
        'ffmpeg',
        '-i', video_path,   # input video file path
        '-q:a', '0',        # audio quality (0 means best)
        '-map', 'a',        # extract the audio stream
        '-vn',              # no video output
        audio_output_path   # output audio file path
    ]
    try:
        print("Converting local video to audio ...")
        # check=True makes a non-zero ffmpeg exit raise CalledProcessError;
        # capture_output=True makes e.stderr available in the handler below.
        subprocess.run(command, check=True, capture_output=True)
        print("Converting local video to audio done.")
    except subprocess.CalledProcessError as e:
        print("Converting local video to audio failed.")
        raise RuntimeError(f"Failed to convert local video to audio: {e.stderr.decode()}") from e

    return audio_output_path


def download_youtube(video_url):
    print(f"Downloading the video: {video_url} into audio ...")
    vid = uuid.uuid4().hex
    if not os.path.exists('youtube'):
        os.makedirs('youtube')
    audio_name = os.path.join('youtube', f'youtube_audio_{vid}.mp3')

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': audio_name,
        'keepvideo': True,  # keep the originally downloaded file
        'postprocessor_args': [
            '-ar', '16000'  # set the audio sample rate to 16 kHz, matching the original code
        ],
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([video_url])
        print(f"Downloading the video: {video_url} into audio done.")

        # Check whether the file exists; if not, try another possible filename.
        if not os.path.exists(audio_name):
            possible_name = audio_name + '.mp3'
            if os.path.exists(possible_name):
                os.rename(possible_name, audio_name)
            else:
                raise FileNotFoundError(f"Could not find the downloaded audio file: {audio_name}")

        return audio_name
    except Exception as e:
        print(f"Error downloading video: {str(e)}")
        raise


def replace_punctuation(text):
    # Replace half-width punctuation with full-width equivalents for CJK output.
    return text.replace(",", ",").replace(".", "。").replace("?", "?").replace("!", "!")


def transcribe_audio(audio_path, min_length=DEFAULT_MIN_LENGTH, model_size=DEFAULT_MODEL_SIZE, zh_type='zh-cn'):
    if not min_length:
        min_length = DEFAULT_MIN_LENGTH

    if not model_size:
        model_size = DEFAULT_MODEL_SIZE

    # Models are loaded lazily and cached so repeated requests reuse them.
    if model_size not in models:
        print(f"Loading {model_size} whisper model...")
        models[model_size] = whisper.load_model(model_size)

    model = models[model_size]

    print("Using model: ", model_size)

    if zh_type.strip() == 'zh-cn':
        print("Transcribing Chinese simplified audio ...")
        # The initial prompt nudges Whisper toward Simplified Chinese output
        # (it otherwise tends to emit Traditional Chinese).
        transcribe = model.transcribe(audio=audio_path, verbose=True, initial_prompt="对于普通话句子,以中文简体输出")
    else:
        transcribe = model.transcribe(audio=audio_path, verbose=True)

    segments = transcribe['segments']
    detect_language = transcribe.get('language', '')
    print("detected language: ", detect_language)

    previous_segment = None
    previous_start_time = None
    previous_start_time_format = None
    previous_end_time_format = None
    previous_connect_space = " "
    res = []
    for segment in segments:
        start_time = int(segment['start'])
        # str(timedelta(...)) renders as "H:MM:SS"; the "0" prefix yields "0H:MM:SS".
        start_time_format = str(0) + str(timedelta(seconds=int(segment['start'])))
        end_time_format = str(0) + str(timedelta(seconds=int(segment['end'])))
        text = segment['text'].strip()
        if not text:
            continue
        cur_connect_space = previous_connect_space
        if detect_language in ['zh', 'ja']:
            text = replace_punctuation(text)
            if text[-1] in PUNCTUATION:
                previous_connect_space = ""
            else:
                previous_connect_space = ","

        # Check whether the current segment ends a sentence; if not (or if the
        # accumulated segment is still shorter than min_length), keep merging.
        is_segment_symbol = text[-1] in EN_SEGMENT_SYMBOLS

        if detect_language != 'en':
            is_segment_symbol = True

        if previous_segment and (not is_segment_symbol or len(previous_segment) < int(min_length)):
            previous_segment = f"{previous_segment}{cur_connect_space}{text}"
            previous_end_time_format = end_time_format
        else:
            # If this is not the first iteration, flush the previous segment
            if previous_segment:
                merged_segment = f"{previous_start_time_format} --> {previous_end_time_format}\n{previous_segment}\n\n"
                print(merged_segment)
                res.append({
                    "startTime": previous_start_time,
                    "segment": previous_segment
                })

            # Start a new segment
            previous_segment = text
            previous_start_time_format = start_time_format
            previous_start_time = start_time
            previous_end_time_format = end_time_format

    if previous_segment:
        last_segment = f"{previous_start_time_format} --> {previous_end_time_format}\n{previous_segment}\n\n"
        print(last_segment)
        res.append({
            "startTime": previous_start_time,
            "segment": previous_segment
        })

    return res


if __name__ == "__main__":
    print("=== English audio test")
    res_en = transcribe_audio("audio_english.mp3")
    print(res_en)

    print("=== Chinese audio test")
    res_cn = transcribe_audio("audio_chinese.mp3")
    print(res_cn)
--------------------------------------------------------------------------------
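A standalone sketch of the timestamp idiom used in transcribe_audio above: `str(timedelta(seconds=71))` renders as `0:01:11`, so prefixing `"0"` produces the `00:01:11` form seen in the sample output in the READMEs. The `format_timestamp` helper name is illustrative and not part of services.py.

```python
# Standalone sketch of the timestamp formatting in transcribe_audio.
from datetime import timedelta

def format_timestamp(seconds):  # illustrative helper, not part of the codebase
    return str(0) + str(timedelta(seconds=int(seconds)))

assert format_timestamp(11) == "00:00:11"
assert format_timestamp(71) == "00:01:11"
assert format_timestamp(3671) == "01:01:11"
print(format_timestamp(36000))  # "010:00:00" -- the idiom only pads correctly below 10 hours
```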