','',context)\n",
1742 | " context=context.replace(\"
\",\"\\n\").replace(\"\",\"\\n\")\n",
1743 | " context=re.sub('<.*?>',\"\",context).split(\"\\n\")\n",
1744 | " context=[itm.strip() for itm in context if len(itm)>0]\n",
1745 | " text+=\"\\n\\n\".join(context)+\"\\n\\n\"\n",
1746 | " text=chinese.s2t(text)\n",
1747 | "\n",
1748 | " # print(text)\n",
1749 | " with open(f\"{art_id}.txt\",mode=\"w\",encoding=\"utf-8\") as f:\n",
1750 | " f.write(text)\n",
1751 | "\n",
1752 | "\n",
1753 | "#@markdown 書籍目錄網址\n",
1754 | "url=\"https://www.52shuku.vip/yanqing/b/bjPGg.html\" #@param {type:'string'}\n",
1755 | "title=\"\\u59D1\\u5A18\\u5979\\u7F8E\\u8C8C\\u5374\\u66B4\\u529B\" #@param {type:\"string\"}\n",
1756 | "author=\"\\u4E60\\u6829\\u5112\\u751F\" #@param {type:\"string\"}\n",
1757 | "#@markdown 打勾,將會直接變成 epub\n",
1758 | "file2epub = True #@param {type:\"boolean\"}\n",
1759 | "\n",
1760 | "# 標題設定義\n",
1761 | "YAML=f'''---\n",
1762 | "title: {title}\n",
1763 | "author: {author}\n",
1764 | "language: zh-Hant\n",
1765 | "---'''\n",
1766 | "\n",
1767 | "with open(\"title.txt\",mode=\"w\",encoding='utf-8') as f:\n",
1768 | " f.write(YAML)\n",
1769 | "\n",
1770 | "sites=url[:url.find(\"/\",8)]\n",
1771 | "# sites=url[:url.rfind(\"/\")]\n",
1772 | "\n",
1773 | "reg=requests.get(url)\n",
1774 | "reg.encoding=\"utf-8\"\n",
1775 | "# soup=BeautifulSoup(reg.text,\"html.parser\")\n",
1776 | "soup=BeautifulSoup(reg.text)\n",
1777 | "output_name=soup.find(\"h1\").getText()\n",
1778 | "articles=soup.find(name=\"ul\",class_=\"list\").find_all(\"a\")\n",
1779 | "\n",
1780 | "\n",
1781 | "links=[]\n",
1782 | "# len(articles)\n",
1783 | "for i in articles:\n",
1784 | " href=i.get(\"href\")\n",
1785 | " if href[-4:]!= \"html\":\n",
1786 | " continue\n",
1787 | " links.append([i.text,f\"{href}\"])\n",
1788 | "\n",
1789 | "files_text=[link[1][link[1].rfind(\"/\")+1:link[1].rfind(\".\")]+\".txt\" for link in links]\n",
1790 | "\n",
1791 | "\n",
1792 | "\n",
1793 | "# 暫時無法使用,目前 colab 只開放 2 個執行緒、只能運作 60 秒\n",
1794 | "# # 同時建立及啟用10個執行緒\n",
1795 | "# with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:\n",
1796 | "# executor.map(get_html, links)\n",
1797 | "for link in links:\n",
1798 | " get_html(link)\n",
1799 | "\n",
1800 | "output_name=soup.find(\"h1\").getText()\n",
1801 | "# files_text=os.listdir()\n",
1802 | "# files_text=[file for file in files_text if file.endswith(\".txt\")]\n",
1803 | "# 檔案排序,需要考慮 檔案名稱長短不一的問題,目前是透過數字的處理\n",
1804 | "# files_text.sort(key=lambda x:int(x[:-4]))\n",
1805 | "if file2epub:\n",
1806 | " mdfiles=[ itm for itm in files_text]\n",
1807 | " os.system(\"pandoc -o \\\"../{}.epub\\\" title.txt {}\".format(title,\" \".join(mdfiles)))\n",
1808 | " from google.colab import files\n",
1809 | " files.download('../{}.epub'.format(title))\n",
1810 | " pass\n",
1811 | "else:\n",
1812 | " with open(f\"../{output_name}.txt\",\"w\",encoding='utf-8') as f:\n",
1813 | " for file in files_text[::-1]:\n",
1814 | " with open(file,\"r\") as f2:\n",
1815 | " f.write(f2.read())\n",
1816 | " from google.colab import files\n",
1817 | " files.download('../{}.txt'.format(output_name))\n",
1818 | "\n"
1819 | ]
1820 | },
1821 | {
1822 | "cell_type": "markdown",
1823 | "metadata": {
1824 | "id": "XJOqF-eIbaCT"
1825 | },
1826 | "source": [
1827 | "# 測試後,暫時沒有使用的程式碼區塊"
1828 | ]
1829 | },
1830 | {
1831 | "cell_type": "markdown",
1832 | "metadata": {
1833 | "id": "eNxKvxlfhvO6"
1834 | },
1835 | "source": [
1836 | "# 效率\n",
1837 | "\n",
1838 | "透過下述的方法合併檔案時,輸出檔需要被反覆開啟太多次,而且檔案大小逐漸增加,導致效能下跌。\n",
1839 | "\n",
1840 | "```python\n",
1841 | "for file in files:\n",
1842 | " os.system(\"cat {}>> ../{}.txt\".format(file,output_name))\n",
1843 | "```\n",
1844 | "若改用下述的方法, output 檔,只需要開啟一次。可以大大縮短時間。\n",
1845 | "\n",
1846 | "```python\n",
1847 | "with open(f\"../{output_name}.txt\",\"w\",encoding='utf-8') as f:\n",
1848 | " for file in files_text:\n",
1849 | " with open(file,\"r\") as f2:\n",
1850 | " f.write(f2.read())\n",
1851 | "```"
1852 | ]
1853 | },
1854 | {
1855 | "cell_type": "markdown",
1856 | "metadata": {
1857 | "id": "8KCQpgIUtZ3M"
1858 | },
1859 | "source": [
1860 | "# 參考資料"
1861 | ]
1862 | },
1863 | {
1864 | "cell_type": "code",
1865 | "execution_count": null,
1866 | "metadata": {
1867 | "cellView": "form",
1868 | "id": "4CCMkYp5OxmZ"
1869 | },
1870 | "outputs": [],
1871 | "source": [
1872 | "#@title 多執行緒參考程式範例\n",
1873 | "\n",
1874 | "from bs4 import BeautifulSoup\n",
1875 | "import concurrent.futures\n",
1876 | "import requests\n",
1877 | "import time\n",
1878 | "\n",
1879 | "\n",
1880 | "def scrape(urls):\n",
1881 | "\n",
1882 | " response = requests.get(urls)\n",
1883 | "\n",
1884 | " soup = BeautifulSoup(response.content, \"lxml\")\n",
1885 | "\n",
1886 | " # 爬取文章標題\n",
1887 | " titles = soup.find_all(\"h3\", {\"class\": \"post_title\"})\n",
1888 | "\n",
1889 | " for title in titles:\n",
1890 | " print(title.getText().strip())\n",
1891 | "\n",
1892 | " time.sleep(2)\n",
1893 | "\n",
1894 | "\n",
1895 | "base_url = \"https://www.inside.com.tw/tag/AI\"\n",
1896 | "urls = [f\"{base_url}?page={page}\" for page in range(1, 6)] # 1~5頁的網址清單\n",
1897 | "print(urls)\n",
1898 | "start_time = time.time() # 開始時間\n",
1899 | "# scrape(urls)\n",
1900 | "\n",
1901 | "\n",
1902 | "# 同時建立及啟用10個執行緒\n",
1903 | "# with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:\n",
1904 | "# executor.map(scrape, urls)\n",
1905 | "\n",
1906 | "end_time = time.time()\n",
1907 | "print(f\"{end_time - start_time} 秒爬取 {len(urls)} 頁的文章\")"
1908 | ]
1909 | }
1910 | ],
1911 | "metadata": {
1912 | "colab": {
1913 | "provenance": [],
1914 | "mount_file_id": "11h2gvU2w0w-cWs1O2ciOwdgK4Wi6qndj",
1915 | "authorship_tag": "ABX9TyM7n29sOH7wFTuDQLfmH77G",
1916 | "include_colab_link": true
1917 | },
1918 | "kernelspec": {
1919 | "display_name": "Python 3",
1920 | "name": "python3"
1921 | }
1922 | },
1923 | "nbformat": 4,
1924 | "nbformat_minor": 0
1925 | }
--------------------------------------------------------------------------------
/oTranscribe_txt_to_srt_格式轉換.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "oTranscribe txt to srt 格式轉換",
7 | "provenance": [],
8 | "collapsed_sections": [],
9 | "authorship_tag": "ABX9TyPY0cE4hDO0EpD8PepnE1dl",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | }
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | "

"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "wfFfVeSgXqTT"
32 | },
33 | "source": [
34 | "# oTranscribe txt 匯出檔轉換為 srt 格式\r\n",
35 | "\r\n",
36 | "srt 為 SubRip (.srt) 的格式,可用於 YouTube cc 字幕。"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "metadata": {
42 | "cellView": "form",
43 | "id": "T9JBJrZYZbZe"
44 | },
45 | "source": [
46 | "#@title 需求模組預載\r\n",
47 | "#@markdown 此區塊一定要執行\r\n",
48 | "\r\n",
49 | "from google.colab import files\r\n",
50 | "import re"
51 | ],
52 | "execution_count": null,
53 | "outputs": []
54 | },
55 | {
56 | "cell_type": "code",
57 | "metadata": {
58 | "cellView": "form",
59 | "id": "MkCN-ubYYYSe"
60 | },
61 | "source": [
62 | "#@title 上傳檔案\r\n",
63 | "uploaded = files.upload()"
64 | ],
65 | "execution_count": null,
66 | "outputs": []
67 | },
68 | {
69 | "cell_type": "code",
70 | "metadata": {
71 | "cellView": "form",
72 | "id": "1UYrae9gEnBI"
73 | },
74 | "source": [
75 | "#@title 環境設定\r\n",
76 | "\r\n",
77 | "#@markdown 上傳檔案名稱\r\n",
78 | "input_filename='1.txt' #@param {type:\"string\"} \r\n",
79 | "\r\n",
80 | "#@markdown 輸出檔案名稱\r\n",
81 | "output_filename=\"srt_output.txt\" #@param {type:\"string\"}"
82 | ],
83 | "execution_count": null,
84 | "outputs": []
85 | },
86 | {
87 | "cell_type": "code",
88 | "metadata": {
89 | "cellView": "form",
90 | "id": "MzZEky3BHApL"
91 | },
92 | "source": [
93 | "#@title 執行格式轉換\r\n",
94 | "with open(input_filename,'r',encoding='utf-8') as f:\r\n",
95 | " text=f.read().replace(\"\\xa0\",' ')\r\n",
96 | "\r\n",
97 | "re_patten=r'([0-9:]+)\\s{0,2}(.*)\\s?\\n'\r\n",
98 | "aa=re.findall(re_patten,text)\r\n",
99 | "content=''\r\n",
100 | "end=len(aa)-1\r\n",
101 | "for idx,itm in enumerate(aa):\r\n",
102 | " if len(itm)==0:\r\n",
103 | " continue\r\n",
104 | " content+=\"%d\\n\"%(idx+1)\r\n",
105 | " if idx != end:\r\n",
106 | " content+=\"00:{} --> 00:{}\\n\".format(itm[0],aa[idx+1][0])\r\n",
107 | " else:\r\n",
108 | " tmp=itm[0].split(\":\")\r\n",
109 | " tmp[-1]=str(int(tmp[-1])+5)\r\n",
110 | " # print(\":\".join(tmp))\r\n",
111 | " content+=\"00:{} --> 00:{}\\n\".format(itm[0],\":\".join(tmp))\r\n",
112 | " content+=\"%s\\n\\n\"%itm[1]\r\n",
113 | " \r\n",
114 | "with open(output_filename,\"w\",encoding='utf-8') as f:\r\n",
115 | " f.write(content)\r\n",
116 | " "
117 | ],
118 | "execution_count": null,
119 | "outputs": []
120 | },
121 | {
122 | "cell_type": "code",
123 | "metadata": {
124 | "cellView": "form",
125 | "id": "QyeMdPh5HZJA"
126 | },
127 | "source": [
128 | "#@title 下載檔案\r\n",
129 | "from google.colab import files\r\n",
130 | "files.download(output_filename)"
131 | ],
132 | "execution_count": null,
133 | "outputs": []
134 | }
135 | ]
136 | }
--------------------------------------------------------------------------------
/whisper_Test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "gpuType": "T4",
8 | "mount_file_id": "1z5py79AseIPWuKFO0oZ95uo4t3nA_0iG",
9 | "authorship_tag": "ABX9TyPUgpFfyXfEk66r4MgTLpTv",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "language_info": {
17 | "name": "python"
18 | },
19 | "accelerator": "GPU"
20 | },
21 | "cells": [
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {
25 | "id": "view-in-github",
26 | "colab_type": "text"
27 | },
28 | "source": [
29 | "

"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "source": [
35 | "# 語音轉文字 AI 工具\n",
36 | "本工具使用 [OpenAI 的開源工具 Whisper](https://github.com/openai/whisper) 模型, 可以相對精準地將語音轉成文字。\n",
37 | "\n",
38 | "# (一) 選擇適合的運作環境: T4 GPU\n",
39 | "本 Colab 虛擬機器使用為免費、多GPU的環境。已指定 T4 GPU 版本。\n",
40 | "\n",
41 | "若由 Github 直接開啟,可以忽略此說明。"
42 | ],
43 | "metadata": {
44 | "id": "Z8j9agRoP2Ef"
45 | }
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {
51 | "id": "ESQe_Qm7Ceoz"
52 | },
53 | "outputs": [],
54 | "source": [
55 | "# @title (1) 安裝 whisper\n",
56 | "!pip install git+https://github.com/openai/whisper.git"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "source": [
62 | "### (2) 掛載雲端硬碟\n",
63 | "1. 透過 Colab 左邊的操作介面掛載\n",
64 | "2. 上傳音檔/影像檔到 Google drive\n",
65 | " - 個人偏好在 Google drive 建一個 tmp 資料夾\n",
66 | " - 將音檔上傳到 tmp 資料夾\n",
67 | " - 在 Colab 左邊的掛載介面找到 drive => MyDrive => tmp\n",
68 | " - 點選上載的音檔,按滑鼠右鍵,點選 複製路徑\n",
69 | "3. 將複製的路徑貼到轉檔區塊的 filename 欄位中\n"
70 | ],
71 | "metadata": {
72 | "id": "vj1rk1zOKoh7"
73 | }
74 | },
75 | {
76 | "cell_type": "code",
77 | "source": [
78 | "# @title (3) 轉檔\n",
79 | "import os\n",
80 | "filename = \"/content/drive/MyDrive/tmp/phison.mp4\" # @param {type:\"string\"}\n",
81 | "#@markdown 設定使用的模型, 請參考 [Whisper Model Card](https://github.com/openai/whisper/blob/main/model-card.md) 選擇適合的模型\n",
82 | "model= \"medium\" # @param {type:\"string\"}\n",
83 | "\n",
84 | "#@markdown 設定主要的語言,如 Chinese, English,其它請參考 [tokenizer 文件](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py)\n",
85 | "language = \"English\" # @param {type:\"string\"}\n",
86 | "os.chdir(os.path.dirname(filename))\n",
87 | "os.getcwd()\n",
88 | "!whisper \"{filename}\" --model {model} --language {language}"
89 | ],
90 | "metadata": {
91 | "id": "t1k2RIWHDfhz",
92 | "cellView": "form"
93 | },
94 | "execution_count": null,
95 | "outputs": []
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "source": [
100 | "# (二) 取得法說會文字\n",
101 | "## 1. 請先執行 (1) 安裝 whisper\n",
102 | "## 2. 再執行 (4) 法說會逐字稿, ...."
103 | ],
104 | "metadata": {
105 | "id": "OjboOjHQ6qOJ"
106 | }
107 | },
108 | {
109 | "cell_type": "code",
110 | "source": [
111 | "# @title (4) 法說會逐字稿,當影片在 Youtube 可直接使用這個\n",
112 | "!pip install yt-dlp\n",
113 | "\n",
114 | "tubeUrl = \"https://www.youtube.com/watch?v=Q6sI_eY6sdU\" # @param {type:\"string\"}\n",
115 | "import os\n",
116 | "from yt_dlp import YoutubeDL\n",
117 | "companyName=\"科技小電報\" # @param {type:\"string\"}\n",
118 | "model= \"large\" # @param {type:\"string\"}\n",
119 | "language = \"Chinese\" # @param {type:\"string\"}\n",
120 | "\n",
121 | "\n",
122 | "\n",
123 | "filename = companyName+\".m4a\"\n",
124 | "ydl_opts = {'overwrites': True, 'format': 'bestaudio[ext=m4a]', 'outtmpl': filename}\n",
125 | "with YoutubeDL(ydl_opts) as ydl:\n",
126 | " ydl.download([tubeUrl])\n",
127 | "\n",
128 | "!whisper \"{filename}\" --model {model} --language {language}\n",
129 | "\n",
130 | "from google.colab import files\n",
131 | "exts=[\"txt\",\"srt\",\"tsv\",\"vtt\"]\n",
132 | "for ext in exts:\n",
133 | " files.download('{}.{}'.format(companyName,ext))"
134 | ],
135 | "metadata": {
136 | "id": "2kNMQIdNgS48",
137 | "cellView": "form"
138 | },
139 | "execution_count": null,
140 | "outputs": []
141 | }
142 | ]
143 | }
--------------------------------------------------------------------------------
/youtuber_逐字稿.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "youtuber 逐字稿",
7 | "provenance": [],
8 | "mount_file_id": "1PSXhAQ7DI5i_3836QB96XgGK7vi9kJ5z",
9 | "authorship_tag": "ABX9TyPv/Hov1nyluiLhL1f4L3FD",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "language_info": {
17 | "name": "python"
18 | }
19 | },
20 | "cells": [
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "id": "view-in-github",
25 | "colab_type": "text"
26 | },
27 | "source": [
28 | "

"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "source": [
34 | "!pip install pytube\n",
35 | "!pip install inlp\n",
36 | "!pip install speechrecognition\n",
37 | "#@markdown 安裝必要套件\n",
38 | "\n",
39 | "#!pip install you-get"
40 | ],
41 | "metadata": {
42 | "id": "gTr19YDIVkKC",
43 | "cellView": "form"
44 | },
45 | "execution_count": null,
46 | "outputs": []
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {
52 | "id": "i0rNL4f_VV3c",
53 | "cellView": "form"
54 | },
55 | "outputs": [],
56 | "source": [
57 | "#@title 輸入 Youtube 網址\n",
58 | "import os\n",
59 | "from pytube import YouTube\n",
60 | "\n",
61 | "url = \"https://www.youtube.com/watch?v=Jp8xnYRhWnw\" #@param {type:\"string\"}\n",
62 | "\n",
63 | "def onProgress(stream, chunk, remains):\n",
64 | " total = stream.filesize\n",
65 | " percent = (total-remains) / total * 100\n",
66 | " print('下載中… {:05.2f}%'.format(percent), end='\\r')\n",
67 | "\n",
68 | "\n",
69 | "# yt.streams.filter().get_highest_resolution().download()\n",
70 | "\n",
71 | "yt=YouTube(url,on_progress_callback=onProgress)\n",
72 | "yfilename=yt.streams.filter(type=\"audio\").first().download()\n",
73 | "\n",
74 | "# filename=yt.streams.filter(type=\"audio\").first().default_filename\n",
75 | "\n",
76 | "musicname=yfilename[:yfilename.rfind(\".\")]\n",
77 | "# yt.streams.filter(type=\"audio\").first().download()\n",
78 | "print(\"完成下載: \",yfilename)\n",
79 | "\n",
80 | "# print(\"完成下載: \",yt.streams.first().download())\n",
81 | "# print(\"轉檔中........\")\n",
82 | "# os.system('{} -i \"{}\" -vn -sn -dn \"{}.mp3\"'.format(\"ffmpeg\",filename, musicname))\n",
83 | "# print(\"完成轉檔: {}.mp3\".format(musicname))\n",
84 | "\n",
85 | "\n",
86 | "# # from google.colab import files\n",
87 | "# # files.download(\"{}.mp4\".format(filename[:filename.rfind(\".\")]))\n",
88 | "# from google.colab import files\n",
89 | "# files.download(f\"{musicname}.mp3\")\n",
90 | "\n",
91 | "import os\n",
92 | "import shutil\n",
93 | "import speech_recognition as sr\n",
94 | "import concurrent.futures\n",
95 | "import wave\n",
96 | "import json\n",
97 | "import numpy as np\n",
98 | "from inlp.convert import chinese\n",
99 | "\n",
100 | " \n",
101 | "\n",
102 | "mp3Name= yfilename\n",
103 | " \n",
104 | "CutTimeDef = 20 \n",
105 | "wav_path='wav' \n",
106 | "txt_path='txt' \n",
107 | "thread_num = 10 \n",
108 | "\n",
109 | "workpath=os.path.dirname(mp3Name)\n",
110 | "mp3Name=os.path.basename(mp3Name)\n",
111 | "FileName = mp3Name[:mp3Name.rfind(\".\")]+\".wav\"\n",
112 | "os.chdir(workpath)\n",
113 | "chk='y' \n",
114 | " \n",
115 | "def reset_dir(path):\n",
116 | " try:\n",
117 | " os.mkdir(path)\n",
118 | " except Exception:\n",
119 | " if chk==\"y\":\n",
120 | " shutil.rmtree(path)\n",
121 | " os.mkdir(path)\n",
122 | " \n",
123 | "def CutFile(FileName, target_path):\n",
124 | " \n",
125 | " # print(\"CutFile File Name is \", FileName)\n",
126 | " f = wave.open(FileName, \"rb\")\n",
127 | " params = f.getparams() \n",
128 | " nchannels, sampwidth, framerate, nframes = params[:4]\n",
129 | " CutFrameNum = framerate * CutTimeDef\n",
130 | " # 讀取格式資訊\n",
131 | " # 一次性返回所有的WAV檔案的格式資訊,它返回的是一個組元(tuple):聲道數, 量化位數(byte 單位), 採\n",
132 | " # 樣頻率, 取樣點數, 壓縮型別, 壓縮型別的描述。wave模組只支援非壓縮的資料,因此可以忽略最後兩個資訊\n",
133 | " \n",
134 | " # print(\"CutFrameNum=%d\" % (CutFrameNum))\n",
135 | " # print(\"nchannels=%d\" % (nchannels))\n",
136 | " # print(\"sampwidth=%d\" % (sampwidth))\n",
137 | " # print(\"framerate=%d\" % (framerate))\n",
138 | " # print(\"nframes=%d\" % (nframes))\n",
139 | " \n",
140 | " str_data = f.readframes(nframes)\n",
141 | " f.close() # 將波形資料轉換成陣列\n",
142 | " # Cutnum =nframes/framerate/CutTimeDef\n",
143 | " # 需要根據聲道數和量化單位,將讀取的二進位制資料轉換為一個可以計算的陣列\n",
144 | " wave_data = np.frombuffer(str_data, dtype=np.short)\n",
145 | " wave_data.shape = -1, 2\n",
146 | " wave_data = wave_data.T\n",
147 | " temp_data = wave_data.T\n",
148 | " # StepNum = int(nframes/200)\n",
149 | " StepNum = CutFrameNum\n",
150 | " StepTotalNum = 0\n",
151 | " haha = 0\n",
152 | " while StepTotalNum < nframes:\n",
153 | " # for j in range(int(Cutnum)):\n",
154 | " # print(\"Stemp=%d\" % (haha))\n",
155 | " SaveFile = \"%s-%03d.wav\" % (FileName[:-4], (haha+1))\n",
156 | " # print(FileName)\n",
157 | " if haha % 3==0:\n",
158 | " print(\"*\",end='')\n",
159 | " temp_dataTemp = temp_data[StepNum * (haha):StepNum * (haha + 1)]\n",
160 | " haha = haha + 1\n",
161 | " StepTotalNum = haha * StepNum\n",
162 | " temp_dataTemp.shape = 1, -1\n",
163 | " temp_dataTemp = temp_dataTemp.astype(np.short) # 開啟WAV文件\n",
164 | " f = wave.open(target_path+\"/\" + SaveFile, \"wb\")\n",
165 | " # 配置聲道數、量化位數和取樣頻率\n",
166 | " f.setnchannels(nchannels)\n",
167 | " f.setsampwidth(sampwidth)\n",
168 | " f.setframerate(framerate)\n",
169 | " # 將wav_data轉換為二進位制資料寫入檔案\n",
170 | " f.writeframes(temp_dataTemp.tobytes())\n",
171 | " f.close()\n",
172 | " \n",
173 | "\n",
174 | " \n",
175 | "def texts_to_one(path, target_file):\n",
176 | " files = os.listdir(path)\n",
177 | " files.sort()\n",
178 | " files = [path+\"/\" + f for f in files if f.endswith(\".txt\")]\n",
179 | " with open(target_file, \"w\", encoding=\"utf-8\") as f:\n",
180 | " for file in files:\n",
181 | " with open(file, \"r\", encoding='utf-8') as f2:\n",
182 | " f.write(f2.read())\n",
183 | " print(\"完成合併, 檔案位於 %s \" % target_file)\n",
184 | " \n",
185 | " \n",
186 | "def texts2otr(path, target_file, audio_name, timeperiod):\n",
187 | " template = '''
{}{}
\n",
188 | " '''\n",
189 | " files = os.listdir(path)\n",
190 | " files.sort()\n",
191 | " content = ''\n",
192 | " files = [path+\"/\" + f for f in files if f.endswith(\".txt\")]\n",
193 | " with open(target_file, \"w\", encoding=\"utf-8\") as f:\n",
194 | " \n",
195 | " for file in files:\n",
196 | " with open(file, \"r\", encoding=\"utf-8\") as f2:\n",
197 | " txt = f2.read().split(\"\\n\")\n",
198 | " if len(txt) < 2:\n",
199 | " continue\n",
200 | " pos=txt[0].rfind(\".\")\n",
201 | " time=int(txt[0][pos-3:pos])\n",
202 | " # times = (int(txt[0].split(\"-\")[1][:-5])-1)*CutTimeDef\n",
203 | " times=(time-1)*CutTimeDef\n",
204 | " secs, mins = times % 60, (times//60) % 60\n",
205 | " hours = (times//60)//60\n",
206 | " timeF = \"{:02d}:{:02d}:{:02d}\".format(hours, mins, secs)\n",
207 | " content += template.format(times, timeF, txt[1])\n",
208 | " \n",
209 | " output = {\"text\": content, \"media\": audio_name,\n",
210 | " \"media-time\": timeperiod}\n",
211 | " f.write(json.dumps(output, ensure_ascii=False))\n",
212 | " print(\"完成合併, otr 檔案位於 %s \" % target_file)\n",
213 | " \n",
214 | "#@title 執行音頻轉換與分割\n",
215 | " \n",
216 | "print(\" mp3 轉 wav 檔 \".center(100,'=')) \n",
217 | "os.system('{} -i \"{}\" \"{}\"'.format(\"ffmpeg\",mp3Name, FileName))\n",
218 | "print(\" Wav 檔名為 {} \".format(FileName).center(96))\n",
219 | "reset_dir(wav_path)\n",
220 | "reset_dir(txt_path)\n",
221 | "# # Cut Wave Setting\n",
222 | "\n",
223 | "print(\" 音頻以每{}秒分割 \".format(CutTimeDef).center(94,'='))\n",
224 | "CutFile(FileName, wav_path)\n",
225 | "print(\"\")\n",
226 | "print(\" 完成分割 \".center(100,'-'))\n",
227 | "#@title 執行語音轉文字 (需要耗費不少時間)\n",
228 | "\n",
229 | "#@markdown 指定翻譯的語言類型,如何設定語系請參考 [支援列表](https://cloud.google.com/speech-to-text/docs/languages)\n",
230 | "voiceLanguage=\"cmn-Hant-TW\" #@param {type:\"string\"}\n",
231 | "\n",
232 | "def VoiceToText_thread(file):\n",
233 | " txt_file = \"%s/%s.txt\" % (txt_path, file[:-4])\n",
234 | " \n",
235 | " if os.path.isfile(txt_file):\n",
236 | " return\n",
237 | " with open(\"%s/%s.txt\" % (txt_path, file[:-4]), \"w\", encoding=\"utf-8\") as f:\n",
238 | " f.write(\"%s:\\n\" % file)\n",
239 | " r = sr.Recognizer() # 預設辨識英文\n",
240 | " with sr.WavFile(wav_path+\"/\"+file) as source: # 讀取wav檔\n",
241 | " audio = r.record(source)\n",
242 | " # r.adjust_for_ambient_noise(source)\n",
243 | " # audio = r.listen(source)\n",
244 | " try:\n",
245 | " text = r.recognize_google(audio,language = voiceLanguage)\n",
246 | " text = chinese.s2t(text)\n",
247 | " # r.recognize_google(audio)\n",
248 | " \n",
249 | " if len(text) == 0:\n",
250 | " print(\"===無資料==\")\n",
251 | " return\n",
252 | "\n",
253 | " print(f\"{file}\\t{text}\")\n",
254 | " f.write(\"%s \\n\\n\" % text)\n",
255 | " if file == files[-1]:\n",
256 | " print(\"結束翻譯\")\n",
257 | " except sr.RequestError as e:\n",
258 | " print(\"無法翻譯{0}\".format(e))\n",
259 | " # 兩個 except 是當語音辨識不出來的時候 防呆用的\n",
260 | " # 使用Google的服務\n",
261 | " except LookupError:\n",
262 | " print(\"Could not understand audio\")\n",
263 | " except sr.UnknownValueError:\n",
264 | " print(f\"Error: 無法識別 Audio\\t {file}\")\n",
265 | " \n",
266 | "\n",
267 | "\n",
268 | "\n",
269 | "files = os.listdir(wav_path)\n",
270 | "files.sort()\n",
271 | "\n",
272 | "with concurrent.futures.ThreadPoolExecutor(max_workers=thread_num) as executor:\n",
273 | " executor.map(VoiceToText_thread, files)\n",
274 | " \n",
275 | "# VoiceToText(wav_path, files, txt_path)\n",
276 | " \n",
277 | "target_txtfile = \"{}.txt\".format(FileName[:-4])\n",
278 | "texts_to_one(txt_path, target_txtfile)\n",
279 | "otr_file = \"{}.otr\".format(FileName[:-4])\n",
280 | "with wave.open(FileName, \"rb\") as f:\n",
281 | " params = f.getparams()\n",
282 | "texts2otr(txt_path, otr_file, FileName, params.nframes)"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "source": [
288 | "`"
289 | ],
290 | "metadata": {
291 | "id": "f5CFfXa-ZomL"
292 | }
293 | },
294 | {
295 | "cell_type": "code",
296 | "source": [
297 | "#@title 下載 otr\n",
298 | "import os\n",
299 | "import shutil\n",
300 | "# 搬移音錄檔案到特定的目錄下\n",
301 | "source_dir=\"/content/\"\n",
302 | "target_dir=\"/content/drive/MyDrive/tmp/\"\n",
303 | "def findotr(arr):\n",
304 | " for itm in arr[::-1]:\n",
305 | " if \"otr\" in itm:\n",
306 | " return itm\n",
307 | "otrfilename=findotr(os.listdir(\".\"))\n",
308 | "from google.colab import files\n",
309 | "files.download(\"{}\".format(otrfilename))\n",
310 | "# shutil.move(f\"{source_dir}{yfilename}\",target_dir)"
311 | ],
312 | "metadata": {
313 | "colab": {
314 | "base_uri": "https://localhost:8080/",
315 | "height": 17
316 | },
317 | "id": "XseEE1QBiBQu",
318 | "outputId": "7b4a5a66-f31b-4202-9ed8-c4c051161754",
319 | "cellView": "form"
320 | },
321 | "execution_count": 14,
322 | "outputs": [
323 | {
324 | "output_type": "display_data",
325 | "data": {
326 | "text/plain": [
327 | "
"
328 | ],
329 | "application/javascript": [
330 | "\n",
331 | " async function download(id, filename, size) {\n",
332 | " if (!google.colab.kernel.accessAllowed) {\n",
333 | " return;\n",
334 | " }\n",
335 | " const div = document.createElement('div');\n",
336 | " const label = document.createElement('label');\n",
337 | " label.textContent = `Downloading \"${filename}\": `;\n",
338 | " div.appendChild(label);\n",
339 | " const progress = document.createElement('progress');\n",
340 | " progress.max = size;\n",
341 | " div.appendChild(progress);\n",
342 | " document.body.appendChild(div);\n",
343 | "\n",
344 | " const buffers = [];\n",
345 | " let downloaded = 0;\n",
346 | "\n",
347 | " const channel = await google.colab.kernel.comms.open(id);\n",
348 | " // Send a message to notify the kernel that we're ready.\n",
349 | " channel.send({})\n",
350 | "\n",
351 | " for await (const message of channel.messages) {\n",
352 | " // Send a message to notify the kernel that we're ready.\n",
353 | " channel.send({})\n",
354 | " if (message.buffers) {\n",
355 | " for (const buffer of message.buffers) {\n",
356 | " buffers.push(buffer);\n",
357 | " downloaded += buffer.byteLength;\n",
358 | " progress.value = downloaded;\n",
359 | " }\n",
360 | " }\n",
361 | " }\n",
362 | " const blob = new Blob(buffers, {type: 'application/binary'});\n",
363 | " const a = document.createElement('a');\n",
364 | " a.href = window.URL.createObjectURL(blob);\n",
365 | " a.download = filename;\n",
366 | " div.appendChild(a);\n",
367 | " a.click();\n",
368 | " div.remove();\n",
369 | " }\n",
370 | " "
371 | ]
372 | },
373 | "metadata": {}
374 | },
375 | {
376 | "output_type": "display_data",
377 | "data": {
378 | "text/plain": [
379 | ""
380 | ],
381 | "application/javascript": [
382 | "download(\"download_c235d0ba-d21a-43be-ad8a-cb63194b34f6\", \"\\u5b64\\u7368\\u7684\\u6211\\u662f\\u5e78\\u798f\\u7684\\uff5c\\u6587\\u68ee\\u8aaa\\u66f8.otr\", 13904)"
383 | ]
384 | },
385 | "metadata": {}
386 | }
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "source": [
392 | "#@title 下載 音檔\n",
393 | "from google.colab import files\n",
394 | "files.download(\"{}\".format(yfilename))"
395 | ],
396 | "metadata": {
397 | "colab": {
398 | "base_uri": "https://localhost:8080/",
399 | "height": 17
400 | },
401 | "cellView": "form",
402 | "id": "hUXzAOTfbvE2",
403 | "outputId": "31939c83-611d-4e56-fa6c-7f97f136d77f"
404 | },
405 | "execution_count": 15,
406 | "outputs": [
407 | {
408 | "output_type": "display_data",
409 | "data": {
410 | "text/plain": [
411 | ""
412 | ],
413 | "application/javascript": [
414 | "\n",
415 | " async function download(id, filename, size) {\n",
416 | " if (!google.colab.kernel.accessAllowed) {\n",
417 | " return;\n",
418 | " }\n",
419 | " const div = document.createElement('div');\n",
420 | " const label = document.createElement('label');\n",
421 | " label.textContent = `Downloading \"${filename}\": `;\n",
422 | " div.appendChild(label);\n",
423 | " const progress = document.createElement('progress');\n",
424 | " progress.max = size;\n",
425 | " div.appendChild(progress);\n",
426 | " document.body.appendChild(div);\n",
427 | "\n",
428 | " const buffers = [];\n",
429 | " let downloaded = 0;\n",
430 | "\n",
431 | " const channel = await google.colab.kernel.comms.open(id);\n",
432 | " // Send a message to notify the kernel that we're ready.\n",
433 | " channel.send({})\n",
434 | "\n",
435 | " for await (const message of channel.messages) {\n",
436 | " // Send a message to notify the kernel that we're ready.\n",
437 | " channel.send({})\n",
438 | " if (message.buffers) {\n",
439 | " for (const buffer of message.buffers) {\n",
440 | " buffers.push(buffer);\n",
441 | " downloaded += buffer.byteLength;\n",
442 | " progress.value = downloaded;\n",
443 | " }\n",
444 | " }\n",
445 | " }\n",
446 | " const blob = new Blob(buffers, {type: 'application/binary'});\n",
447 | " const a = document.createElement('a');\n",
448 | " a.href = window.URL.createObjectURL(blob);\n",
449 | " a.download = filename;\n",
450 | " div.appendChild(a);\n",
451 | " a.click();\n",
452 | " div.remove();\n",
453 | " }\n",
454 | " "
455 | ]
456 | },
457 | "metadata": {}
458 | },
459 | {
460 | "output_type": "display_data",
461 | "data": {
462 | "text/plain": [
463 | ""
464 | ],
465 | "application/javascript": [
466 | "download(\"download_d7ea9f3e-65ed-4da1-8b0e-609894886090\", \"\\u5b64\\u7368\\u7684\\u6211\\u662f\\u5e78\\u798f\\u7684\\uff5c\\u6587\\u68ee\\u8aaa\\u66f8.mp4\", 4164951)"
467 | ]
468 | },
469 | "metadata": {}
470 | }
471 | ]
472 | }
473 | ]
474 | }
--------------------------------------------------------------------------------
/台股_Q1~Q3_EPS_抓取.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "台股 Q1~Q3 EPS 抓取",
7 | "provenance": [],
8 | "collapsed_sections": [],
9 | "authorship_tag": "ABX9TyPn0uCcBvxxCRI0YDMGbTl3",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "language_info": {
17 | "name": "python"
18 | }
19 | },
20 | "cells": [
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "id": "view-in-github",
25 | "colab_type": "text"
26 | },
27 | "source": [
28 | "
"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "source": [
34 | "# 自動取得台股 Q1~Q3 EPS\n",
35 | "\n",
36 | "文字檔格式:數字和文字間用 tab 區分,一檔股票一行。\n",
37 | "\n",
38 | "可以直接從 excel 貼到 txt 檔即可\n",
39 | "\n",
40 | "---\n",
41 | "2330 台積電"
42 | ],
43 | "metadata": {
44 | "id": "lCqeZiJAPLwc"
45 | }
46 | },
47 | {
48 | "cell_type": "code",
49 | "source": [
50 | "import requests\n",
51 | "import concurrent.futures\n",
52 | "from bs4 import BeautifulSoup, UnicodeDammit\n",
53 | "import pandas as pd\n",
54 | "\n",
55 | "def addeps(stockid):\n",
56 | " \n",
57 | " url=url_template.format(stockid)\n",
58 | " reg=requests.get(url)\n",
59 | " soup=BeautifulSoup(reg.text)\n",
60 | " stockData=[]\n",
61 | " for itm in soup.find(\"section\",id=\"qsp-eps-table\").find_all(\"span\",class_=\"\")[1:7:2]:\n",
62 | " stockData.insert(0,itm.getText())\n",
63 | " \n",
64 | " stockids.get(stockid).extend(stockData)\n",
65 | "\n",
66 | "\n",
67 | "mytest=list()\n",
68 | "#@markdown 用 txt 檔存股票代碼\n",
69 | "stock_txt=\"/content/stock_id_name.txt\" #@param {type:'string'} \n",
70 | "with open(stock_txt,'rb') as f:\n",
71 | " encode=UnicodeDammit(f.read()).original_encoding\n",
72 | "with open(stock_txt,\"r\",encoding=encode) as f:\n",
73 | " content=[itm.split() for itm in f.read().splitlines()]\n",
74 | "\n",
75 | "'''\n",
76 | "Converting a list to dictionary with list elements as keys in dictionary\n",
77 | "All keys will have same value\n",
78 | "''' \n",
79 | "# create Stock_id data\n",
80 | "stockids = { i[0] : i for i in content }\n",
81 | "url_template=\"https://tw.stock.yahoo.com/quote/{}/eps\" \n",
82 | "with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:\n",
83 | " executor.map(addeps,stockids)\n",
84 | "df=pd.DataFrame(stockids.values(),columns=[\"股票代碼\",\"股票名稱\",\"Q1 EPS\",\"Q2 EPS\",\"Q3 EPS\"])\n",
85 | "df.to_excel(\"TWstocks_EPS.xlsx\",index=False)\n",
86 | "df\n",
87 | "\n",
88 | " "
89 | ],
90 | "metadata": {
91 | "id": "6hjtu_OMGJud"
92 | },
93 | "execution_count": null,
94 | "outputs": []
95 | }
96 | ]
97 | }
--------------------------------------------------------------------------------
/技術議題關鍵字擴展.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "技術議題關鍵字擴展.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": [],
9 | "authorship_tag": "ABX9TyO5GIToPuAyaKLz4lnvSqaj",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "language_info": {
17 | "name": "python"
18 | }
19 | },
20 | "cells": [
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "id": "view-in-github",
25 | "colab_type": "text"
26 | },
27 | "source": [
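28 | "<a href=\"https://colab.research.google.com/github/reic/colab_python/blob/main/技術議題關鍵字擴展.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"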
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "metadata": {
34 | "id": "hQN-VXYIGSKX",
35 | "cellView": "form"
36 | },
37 | "source": [
38 | "#@title 關鍵字擴展\n",
39 | "#@markdown 關鍵字擴展是用 政府研究資訊網 (grb.gov.tw) 的民國 105-110年公開資料做為資料集,目前只支援中文字關鍵字的擴展\n",
40 | "\n",
41 | "import requests\n",
42 | "\n",
43 | "print(\"資料載入中 .......\")\n",
44 | "url=\"https://raw.githubusercontent.com/reic/colab_python/main/data/\"\n",
45 | "fnames=[\"GRB_105.txt\",\"GRB_106.txt\",\"GRB_107.txt\",\"GRB_108.txt\",\"GRB_109.txt\",\"GRB_110.txt\"]\n",
46 | "\n",
47 | "#@markdown 關鍵字\n",
48 | "print(f\"{len(fnames)} 個資料源\")\n",
49 | "inputword = \"\\u865B\\u64EC\\u5BE6\\u5883\" #@param {type:\"string\"}\n",
50 | "#@markdown 列出的擴展關鍵字之數量\n",
51 | "extendNumber =20 #@param {type:\"number\"}\n",
52 | "content=[]\n",
53 | "\n",
54 | "for fname in fnames:\n",
55 | " # print(f\"從 Github 下載資料檔 {fname}\")\n",
56 | " reg=requests.get(f\"{url}{fname}\")\n",
57 | " content.extend(reg.text.splitlines())\n",
58 | "print(\"=== 資料載入完成 \".ljust(100,\"=\"))\n",
59 | "print(\"\")\n",
60 | "projectData = {}\n",
61 | "for itm in content:\n",
62 | " # print(itm)\n",
63 | " [id, keyword] = itm.split('\\t')\n",
64 | " projectData[id] = keyword.split(\":\")\n",
65 | "# print(projectData)\n",
66 | "\n",
67 | "inputword = inputword.lower()\n",
68 | "keywords = []\n",
69 | "projects = []\n",
70 | "for itm in content:\n",
71 | " if inputword in itm.lower():\n",
72 | " pro = itm.split(\"\\t\")\n",
73 | " projects.append(pro[0])\n",
74 | " getkeywordset = pro[1].split(\":\")\n",
75 | " keywords.extend(getkeywordset)\n",
76 | "\n",
77 | "# # print(len(keywords))\n",
78 | "# # print(len(list(set(keywords))))\n",
79 | "# # print(len(projects))\n",
80 | "# # print(projectData[projects[0]])\n",
81 | "uniqueKeywordCount = dict.fromkeys(keywords, 0)\n",
82 | "for itm in uniqueKeywordCount:\n",
83 | " uniqueKeywordCount[itm] = keywords.count(itm)\n",
84 | "\n",
85 | "keywordExtend=[]\n",
86 | "for key,value in uniqueKeywordCount.items():\n",
87 | " # if int(value) <2: \n",
88 | " # continue\n",
89 | " # print(value)\n",
90 | " keywordExtend.append([key,value])\n",
91 | "keywordExtend.sort(key=lambda x:x[1],reverse=True)\n",
92 | "\n",
93 | "if extendNumber > len(keywordExtend):\n",
94 | " extendNumber=len(keywordExtend)\n",
95 | "\n",
96 | "for itm in keywordExtend[:extendNumber]:\n",
97 | " print(f\"{itm[0]:15s}\\t{itm[1]}\")\n",
98 | "\n",
99 | "\n"
100 | ],
101 | "execution_count": null,
102 | "outputs": []
103 | },
104 | {
105 | "cell_type": "code",
106 | "source": [
107 | "#@title 產業趨勢關鍵字探索\n",
108 | "#@markdown 關鍵字擴展由 科技產業資訊室(iknow.stpi.narl.org.tw) 提供。關鍵字由 iKnow 定義,可以掌握特定關鍵字和其它產業關鍵字的共現關係。\n",
109 | "\n",
110 | "import requests\n",
111 | "\n",
112 | "print(\"資料載入中 .......\")\n",
113 | "url=\"https://raw.githubusercontent.com/reic/colab_python/main/data/\"\n",
114 | "fnames=[\"iKnow_2017-2021.txt\"]\n",
115 | "\n",
116 | "#@markdown 關鍵字\n",
117 | "print(f\"{len(fnames)} 個資料源\")\n",
118 | "inputword = \"Apple\" #@param {type:\"string\"}\n",
119 | "\n",
120 | "#@markdown 列出的擴展關鍵字之數量\n",
121 | "extendNumber =20 #@param {type:\"number\"}\n",
122 | "content=[]\n",
123 | "\n",
124 | "for fname in fnames:\n",
125 | " # print(f\"從 Github 下載資料檔 {fname}\")\n",
126 | " reg=requests.get(f\"{url}{fname}\")\n",
127 | " content.extend(reg.text.splitlines())\n",
128 | "print(\"=== 資料載入完成 \".ljust(100,\"=\"))\n",
129 | "print(\"\")\n",
130 | "projectData = {}\n",
131 | "for itm in content:\n",
132 | " # print(itm)\n",
133 | " [id, keyword] = itm.split('\\t')\n",
134 | " projectData[id] = keyword.split(\":\")\n",
135 | "# print(projectData)\n",
136 | "\n",
137 | "inputword = inputword.lower()\n",
138 | "keywords = []\n",
139 | "projects = []\n",
140 | "for itm in content:\n",
141 | " if inputword in itm.lower():\n",
142 | " pro = itm.split(\"\\t\")\n",
143 | " projects.append(pro[0])\n",
144 | " getkeywordset = pro[1].split(\":\")\n",
145 | " keywords.extend(getkeywordset)\n",
146 | "\n",
147 | "# # print(len(keywords))\n",
148 | "# # print(len(list(set(keywords))))\n",
149 | "# # print(len(projects))\n",
150 | "# # print(projectData[projects[0]])\n",
151 | "uniqueKeywordCount = dict.fromkeys(keywords, 0)\n",
152 | "for itm in uniqueKeywordCount:\n",
153 | " uniqueKeywordCount[itm] = keywords.count(itm)\n",
154 | "\n",
155 | "keywordExtend=[]\n",
156 | "for key,value in uniqueKeywordCount.items():\n",
157 | " # if int(value) <2: \n",
158 | " # continue\n",
159 | " # print(value)\n",
160 | " keywordExtend.append([key,value])\n",
161 | "keywordExtend.sort(key=lambda x:x[1],reverse=True)\n",
162 | "\n",
163 | "if extendNumber > len(keywordExtend):\n",
164 | " extendNumber=len(keywordExtend)\n",
165 | "\n",
166 | "for itm in keywordExtend[:extendNumber]:\n",
167 | " print(f\"{itm[0]:15s}\\t{itm[1]}\")\n",
168 | "\n",
169 | "\n"
170 | ],
171 | "metadata": {
172 | "cellView": "form",
173 | "id": "mcHn3ncMl-en"
174 | },
175 | "execution_count": null,
176 | "outputs": []
177 | }
178 | ]
179 | }
--------------------------------------------------------------------------------
/英文單字計算.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "collapsed_sections": [],
8 | "mount_file_id": "1z6G-heUGHnpYJkvVwXvsAnv3P2QRc19r",
9 | "authorship_tag": "ABX9TyM8taDxTDLlejhnxOuDq0au",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | },
16 | "language_info": {
17 | "name": "python"
18 | }
19 | },
20 | "cells": [
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "id": "view-in-github",
25 | "colab_type": "text"
26 | },
27 | "source": [
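28 | "<a href=\"https://colab.research.google.com/github/reic/colab_python/blob/main/英文單字計算.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"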
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {
35 | "id": "QxEVE_SZC_Bc",
36 | "cellView": "form"
37 | },
38 | "outputs": [],
39 | "source": [
40 | "#@title 在 context 輸入 zotero 標注的英文單字,並將單字存至 wordtank.txt\n",
41 | "#@markdown 請先掛載 google drive ,wordtank 是放在 google雲端硬碟內\n",
42 | "\n",
43 | "import re\n",
44 | "from nltk.stem import PorterStemmer\n",
45 | "import os\n",
46 | "\n",
47 | "def checkFileexist(path,file):\n",
48 | " fileWithPath=\"{}/{}\".format(path,file)\n",
49 | " if os.path.isfile(fileWithPath):\n",
50 | " return\n",
51 | " with open(fileWithPath,mode=\"w\",encoding='utf-8') as f:\n",
52 | " f.write(\"fromReic\")\n",
53 | "\n",
54 | "def stemworkcheck(worda,wordb):\n",
55 | " if len(wordb)>len(worda):\n",
56 | " return worda\n",
57 | " return wordb\n",
58 | " \n",
59 | "context = \"\\u201Celectrolyte\\u201D ([Wang \\u7B49\\u3002, 2022, p. 1](zotero://select/library/items/VHI4XUQN)) ([pdf](zotero://open-pdf/library/items/3N7DAHAP?page=1&annotation=IG6RK45L)) electrolyte \\u82F1 [\\u026A\\u02C8lektr\\u0259la\\u026At] \\u7F8E [\\u026A\\u02C8lektr\\u0259la\\u026At] n. \\u7535\\u89E3\\u6DB2\\uFF0C\\u7535\\u89E3\\u8D28\\uFF1B\\u7535\\u89E3 [ \\u590D\\u6570 electrolytes ] \\u201Ccontradictions\\u201D ([Wang \\u7B49\\u3002, 2022, p. 2](zotero://select/library/items/VHI4XUQN)) ([pdf](zotero://open-pdf/library/items/3N7DAHAP?page=2&annotation=G3VLNTTD)) \\u77DB\\u76FE \\u201Climbs\\u201D ([Wang \\u7B49\\u3002, 2022, p. 2](zotero://select/library/items/VHI4XUQN)) ([pdf](zotero://open-pdf/library/items/3N7DAHAP?page=2&annotation=UD9D4T95)) limbs \\u82F1 [l\\u026Amz] \\u7F8E [l\\u026Amz] n. [\\u89E3\\u5256]\\u56DB\\u80A2\\uFF08limb \\u7684\\u590D\\u6570\\uFF09\" #@param {type:\"string\"}\n",
60 | "savedir = \"/content/drive/MyDrive/reic\" #@param {type:\"string\"}\n",
61 | "wordtank = \"wordtank.txt\" #@param {type:\"string\"}\n",
62 | "\n",
63 | "\n",
64 | "\n",
65 | "re_pattern=\"“(\\w+)[, .]?”\"\n",
66 | "ps=PorterStemmer()\n",
67 | "req=re.findall(re_pattern, context)\n",
68 | "req=[itm.lower() for itm in req]\n",
69 | "\n",
70 | "checkFileexist(savedir,wordtank)\n",
71 | "\n",
72 | "with open(\"{}/{}\".format(savedir,wordtank),mode=\"a\",encoding=\"utf-8\") as f:\n",
73 | " f.write(\", {}\".format(\", \".join(req))) "
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "source": [
79 | "#@title 列出重複查詢次數最多的英文單字\n",
80 | "toprank = 10 #@param {type:\"number\"}\n",
81 | "with open(\"{}/{}\".format(savedir,wordtank),mode=\"r\",encoding=\"utf-8\") as f:\n",
82 | " content=f.read().split(\", \")\n",
83 | "\n",
84 | "word=dict()\n",
85 | "wordcount=dict()\n",
86 | "for itm in content:\n",
87 | " wordstem=ps.stem(itm)\n",
88 | " word[wordstem]=stemworkcheck(word.get(wordstem,itm.lower()),itm.lower())\n",
89 | " wordcount[wordstem]=wordcount.get(wordstem,0)+1\n",
90 | "\n",
91 | "arr=sorted(wordcount.items(),key=lambda x:x[1],reverse=True)\n",
92 | "for key,value in arr[:toprank]:\n",
93 | " print(word[key],value)"
94 | ],
95 | "metadata": {
96 | "colab": {
97 | "base_uri": "https://localhost:8080/"
98 | },
99 | "id": "aGrzd_DBJ2R3",
100 | "outputId": "307e0750-7dff-40dc-be8b-b53eb296c97c",
101 | "cellView": "form"
102 | },
103 | "execution_count": 9,
104 | "outputs": [
105 | {
106 | "output_type": "stream",
107 | "name": "stdout",
108 | "text": [
109 | "fromreic 1\n",
110 | "electrolyte 1\n",
111 | "contradictions 1\n",
112 | "limbs 1\n"
113 | ]
114 | }
115 | ]
116 | }
117 | ]
118 | }
--------------------------------------------------------------------------------
/錄音檔轉文字.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "view-in-github",
7 | "colab_type": "text"
8 | },
9 | "source": [
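10 | "<a href=\"https://colab.research.google.com/github/reic/colab_python/blob/main/錄音檔轉文字.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"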
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {
16 | "id": "sfatoyUwFvio"
17 | },
18 | "source": [
19 | "# 語音轉文字小工具\n",
20 | "\n",
21 | "此工具採用 Python 開發,可應用於**訪談錄音檔**轉文字、**影片的字幕**的生成,及其它相關應用。\n",
22 | "\n",
23 | "本工具透過 Google Colab 平台與 Google 的語音轉文字工具完成語音轉文字的工作,因此只需要有 Google 帳號,即可具備執行此程式的環境;輔以簡單的設定,不會寫程式的使用者也可以完成相關的工作。\n",
24 | "\n",
25 | "# 新版的 AI 語音轉文字工具,結果更精準\n",
26 | "可以試試我用 Whisper AI 模型撰寫的新語音轉文字工具,**文字更精準**\n",
27 | "https://github.com/reic/colab_python/blob/main/whisper_Test.ipynb\n",
28 | "\n",
29 | "\n",
30 | "by 瑞課\n",
31 | "\n",
32 | "== 更新記錄 ===\n",
33 | "- 2024/2/20 Colab 調整執行緒限制:最多 2 個執行緒,且只能執行 60 秒,因此多執行緒功能已無法正常使用。\n",
34 | "- 2023/4/11 調整 txt 檔的輸出模式,並將預設語言改為「繁體中文」\n",
35 | "- 2023/3/15 調整修改未完全的函式錯誤。 謝謝「左埕安」的回報\n",
36 | "- 2023/3/15 調整 txt檔的內容呈現\n",
37 | "- 2021/6/1 修正檔名有空白時,無法轉成 wav 和切割問題\n",
38 | "- 2021/5/12 增加不同翻譯語言變數的設定,並於檔案中提供語系參考表。 謝謝 chin ho Lau 的回饋。\n",
39 | "- 2021/5/9 修正因檔名無法產生 OTR 檔的問題,謝謝「彩虹小馬」的回饋\n",
40 | "- 2021/5/3 增加多執行緒的方法,縮短翻譯的時間\n",
41 | "\n",
42 | "\n",
43 | "\n"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {
49 | "id": "FQPGQ9dlTlNI"
50 | },
51 | "source": [
52 | "## 1.安裝需求套件\n",
53 | "* 語音轉文字套件\n",
54 | "* 繁簡轉換套件"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "cellView": "form",
62 | "id": "zzDanp7lDmSC"
63 | },
64 | "outputs": [],
65 | "source": [
66 | "#@title 安裝運作所需套件\n",
67 | "!pip3 install SpeechRecognition\n",
68 | "!pip3 install iNLP"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {
74 | "id": "YHUEj0k9HhSc"
75 | },
76 | "source": [
77 | "## 2.掛載 google 雲端硬碟\n",
78 | "\n",
79 | "可點選左側的 **檔案** 圖示,掛載 Google Drive 雲端硬碟,或執行程式"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {
86 | "cellView": "form",
87 | "id": "nn9tQeSLF8oF"
88 | },
89 | "outputs": [],
90 | "source": [
91 | "#@title 掛載 Google雲端硬碟\n",
92 | "from google.colab import drive\n",
93 | "drive.mount('/content/drive')"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {
99 | "id": "AoJDchjgHxl8"
100 | },
101 | "source": [
102 | "## 3.設定環境變數與函數預載\n",
103 | "\n",
104 | "需給予**錄音檔**的路徑、 wav 切割檔的暫存目錄、txt 輸出檔的暫存目錄。請確定在**錄音檔**目錄下,沒有相同名稱目錄、或相同名稱目錄下沒有重要的資料。\n",
105 | "\n",
106 | "若要自行建立目錄者,請將 **chk** 設定為 n\n"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 12,
112 | "metadata": {
113 | "id": "R4kZJGQdHdjy",
114 | "cellView": "form"
115 | },
116 | "outputs": [],
117 | "source": [
118 | "#@title 基礎環境設定\n",
119 | "import os\n",
120 | "import shutil\n",
121 | "import speech_recognition as sr\n",
122 | "import concurrent.futures\n",
123 | "import wave\n",
124 | "import json\n",
125 | "import numpy as np\n",
126 | "from google.colab import files\n",
127 | "from inlp.convert import chinese\n",
128 | "\n",
129 | "\n",
130 | "#@markdown 錄音檔的位置\n",
131 | "mp3Name= '/content/drive/MyDrive/tmp/240113_0558.mp3' #@param {type:\"string\"}\n",
132 | "\n",
133 | "#@markdown 設定錄音檔的分割大小,單位:秒。時間太長,轉文字的效果會較差。\n",
134 | "CutTimeDef = 20 #@param {type:\"integer\"}\n",
135 | "#@markdown 設定 wav 切割檔的暫存目錄\n",
136 | "wav_path='wav' #@param {type:\"string\"}\n",
137 | "#@markdown 設定文字檔暫存目錄。將特定秒數(CutTimeDef)的音檔轉為文字\n",
138 | "txt_path='txt' #@param {type:\"string\"}\n",
139 | "# #@markdown 執行緖的數量\n",
140 | "# thread_num = 1 #@param {type:\"number\"}\n",
141 | "\n",
142 | "workpath=os.path.dirname(mp3Name)\n",
143 | "mp3Name=os.path.basename(mp3Name)\n",
144 | "FileName = mp3Name[:mp3Name.rfind(\".\")]+\".wav\"\n",
145 | "os.chdir(workpath)\n",
146 | "#@markdown 若 wav_path, txt_path 目錄存在是否移除重建\n",
147 | "chk='y' #@param [\"y\",\"n\"]\n",
148 | "\n",
149 | "def reset_dir(path):\n",
150 | " try:\n",
151 | " os.mkdir(path)\n",
152 | " except Exception:\n",
153 | " if chk==\"y\":\n",
154 | " shutil.rmtree(path)\n",
155 | " os.mkdir(path)\n",
156 | "\n",
157 | "def CutFile(FileName, target_path):\n",
158 | "\n",
159 | " # print(\"CutFile File Name is \", FileName)\n",
160 | " f = wave.open(FileName, \"rb\")\n",
161 | " params = f.getparams()\n",
162 | " nchannels, sampwidth, framerate, nframes = params[:4]\n",
163 | " CutFrameNum = framerate * CutTimeDef\n",
164 | " # 讀取格式資訊\n",
165 | " # 一次性返回所有的WAV檔案的格式資訊,它返回的是一個組元(tuple):聲道數, 量化位數(byte 單位), 採\n",
166 | " # 樣頻率, 取樣點數, 壓縮型別, 壓縮型別的描述。wave模組只支援非壓縮的資料,因此可以忽略最後兩個資訊\n",
167 | "\n",
168 | " # print(\"CutFrameNum=%d\" % (CutFrameNum))\n",
169 | " # print(\"nchannels=%d\" % (nchannels))\n",
170 | " # print(\"sampwidth=%d\" % (sampwidth))\n",
171 | " # print(\"framerate=%d\" % (framerate))\n",
172 | " # print(\"nframes=%d\" % (nframes))\n",
173 | "\n",
174 | " str_data = f.readframes(nframes)\n",
175 | " f.close() # 將波形資料轉換成陣列\n",
176 | " # Cutnum =nframes/framerate/CutTimeDef\n",
177 | " # 需要根據聲道數和量化單位,將讀取的二進位制資料轉換為一個可以計算的陣列\n",
178 | " wave_data = np.frombuffer(str_data, dtype=np.short)\n",
179 | " wave_data.shape = -1, 2\n",
180 | " wave_data = wave_data.T\n",
181 | " temp_data = wave_data.T\n",
182 | " # StepNum = int(nframes/200)\n",
183 | " StepNum = CutFrameNum\n",
184 | " StepTotalNum = 0\n",
185 | " haha = 0\n",
186 | " while StepTotalNum < nframes:\n",
187 | " # for j in range(int(Cutnum)):\n",
188 | " # print(\"Stemp=%d\" % (haha))\n",
189 | " SaveFile = \"%s-%03d.wav\" % (FileName[:-4], (haha+1))\n",
190 | " # print(FileName)\n",
191 | " if haha % 3==0:\n",
192 | " print(\"*\",end='')\n",
193 | " temp_dataTemp = temp_data[StepNum * (haha):StepNum * (haha + 1)]\n",
194 | " haha = haha + 1\n",
195 | " StepTotalNum = haha * StepNum\n",
196 | " temp_dataTemp.shape = 1, -1\n",
197 | " temp_dataTemp = temp_dataTemp.astype(np.short) # 開啟WAV文件\n",
198 | " f = wave.open(target_path+\"/\" + SaveFile, \"wb\")\n",
199 | " # 配置聲道數、量化位數和取樣頻率\n",
200 | " f.setnchannels(nchannels)\n",
201 | " f.setsampwidth(sampwidth)\n",
202 | " f.setframerate(framerate)\n",
203 | " # 將wav_data轉換為二進位制資料寫入檔案\n",
204 | " f.writeframes(temp_dataTemp.tobytes())\n",
205 | " f.close()\n",
206 | "\n",
207 | "\n",
208 | "\n",
209 | "\n",
210 | "def texts_to_one(path, target_file):\n",
211 | " files = os.listdir(path)\n",
212 | " files.sort()\n",
213 | " files = [path+\"/\" + f for f in files if f.endswith(\".txt\")]\n",
214 | " with open(target_file, \"w\", encoding=\"utf-8\") as f:\n",
215 | " for file in files:\n",
216 | " with open(file, \"r\", encoding='utf-8') as f2:\n",
217 | " txt= f2.read().split(\"\\n\")\n",
218 | " if len(txt) < 2:\n",
219 | " continue\n",
220 | " f.write(txt[1])\n",
221 | " print(\"完成合併, 檔案位於 %s \" % target_file)\n",
222 | "\n",
223 | "\n",
224 | "def texts2otr(path, target_file, audio_name, timeperiod):\n",
225 | " template = '''{}{}
\n",
226 | " '''\n",
227 | " files = os.listdir(path)\n",
228 | " files.sort()\n",
229 | " content = ''\n",
230 | " files = [path+\"/\" + f for f in files if f.endswith(\".txt\")]\n",
231 | " with open(target_file, \"w\", encoding=\"utf-8\") as f:\n",
232 | "\n",
233 | " for file in files:\n",
234 | " with open(file, \"r\", encoding=\"utf-8\") as f2:\n",
235 | " txt = f2.read().split(\"\\n\")\n",
236 | " if len(txt) < 2:\n",
237 | " continue\n",
238 | " pos=txt[0].rfind(\".\")\n",
239 | " time=int(txt[0][pos-3:pos])\n",
240 | " # times = (int(txt[0].split(\"-\")[1][:-5])-1)*CutTimeDef\n",
241 | " times=(time-1)*CutTimeDef\n",
242 | " secs, mins = times % 60, (times//60) % 60\n",
243 | " hours = (times//60)//60\n",
244 | " timeF = \"{:02d}:{:02d}:{:02d}\".format(hours, mins, secs)\n",
245 | " content += template.format(times, timeF, txt[1])\n",
246 | "\n",
247 | " output = {\"text\": content, \"media\": audio_name,\n",
248 | " \"media-time\": timeperiod}\n",
249 | " f.write(json.dumps(output, ensure_ascii=False))\n",
250 | " print(\"完成合併, otr 檔案位於 %s \" % target_file)\n",
251 | "\n",
252 | " #files.download(target_file)"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {
258 | "id": "RIaMvxr7Jz_W"
259 | },
260 | "source": [
261 | "## 4.音頻轉換與切割\n",
262 | "\n",
263 | "1. 將 mp3 轉成 wav 檔\n",
264 | "2. 將音頻切割,並置於 wav_path 目錄下\n",
265 | "3. 建立 txt_path ,做為語音判識的輸出檔\n",
266 | "\n",
267 | "\n"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 13,
273 | "metadata": {
274 | "cellView": "form",
275 | "id": "rpnUIqKBKBnQ",
276 | "colab": {
277 | "base_uri": "https://localhost:8080/"
278 | },
279 | "outputId": "757b14db-6b7c-47e9-c25c-78e8fd68f020"
280 | },
281 | "outputs": [
282 | {
283 | "output_type": "stream",
284 | "name": "stdout",
285 | "text": [
286 | "=========================================== mp3 轉 wav 檔 ============================================\n",
287 | " Wav 檔名為 240113_0558.wav \n",
288 | "========================================= 音頻以每20秒分割 ==========================================\n",
289 | "********\n",
290 | "----------------------------------------------- 完成分割 -----------------------------------------------\n"
291 | ]
292 | }
293 | ],
294 | "source": [
295 | "#@title 執行音頻轉換與分割\n",
296 | "\n",
297 | "print(\" mp3 轉 wav 檔 \".center(100,'='))\n",
298 | "os.system('{} -i \"{}\" \"{}\"'.format(\"ffmpeg\",mp3Name, FileName))\n",
299 | "print(\" Wav 檔名為 {} \".format(FileName).center(96))\n",
300 | "reset_dir(wav_path)\n",
301 | "reset_dir(txt_path)\n",
302 | "# # Cut Wave Setting\n",
303 | "\n",
304 | "print(\" 音頻以每{}秒分割 \".format(CutTimeDef).center(94,'='))\n",
305 | "CutFile(FileName, wav_path)\n",
306 | "print(\"\")\n",
307 | "print(\" 完成分割 \".center(100,'-'))"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {
313 | "id": "msvFQZENdwGZ"
314 | },
315 | "source": [
316 | "## 5.語音轉文字"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "metadata": {
323 | "id": "rUh0kL6hC6yd",
324 | "cellView": "form"
325 | },
326 | "outputs": [],
327 | "source": [
328 | "#@title 執行語音轉文字 (需要耗費不少時間)\n",
329 | "#@markdown 指定翻譯的語言類型,如何設定語系請參考 [支援列表](https://cloud.google.com/speech-to-text/docs/languages)\n",
330 | "\n",
331 | "#@markdown 繁體中文:zh-TW(or cmn-Hant-TW)、英文: en-US\n",
332 | "voiceLanguage=\"zh-TW\" #@param {type:\"string\"}\n",
333 | "# cmn-Hant-TW\n",
334 | "\n",
335 | "def VoiceToText_thread(file):\n",
336 | " txt_file = \"%s/%s.txt\" % (txt_path, file[:-4])\n",
337 | "\n",
338 | " if os.path.isfile(txt_file):\n",
339 | " return\n",
340 | " with open(\"%s/%s.txt\" % (txt_path, file[:-4]), \"w\", encoding=\"utf-8\") as f:\n",
341 | " f.write(\"%s:\\n\" % file)\n",
342 | " r = sr.Recognizer() # 預設辨識英文\n",
343 | " with sr.WavFile(wav_path+\"/\"+file) as source: # 讀取wav檔\n",
344 | " audio = r.record(source)\n",
345 | " # r.adjust_for_ambient_noise(source)\n",
346 | " # audio = r.listen(source)\n",
347 | " try:\n",
348 | " text = r.recognize_google(audio,language = voiceLanguage)\n",
349 | " text = chinese.s2t(text)\n",
350 | " # r.recognize_google(audio)\n",
351 | "\n",
352 | " if len(text) == 0:\n",
353 | " print(\"===無資料==\")\n",
354 | " return\n",
355 | "\n",
356 | " print(f\"{file}\\t{text}\")\n",
357 | " f.write(\"%s \\n\\n\" % text)\n",
358 | " if file == files[-1]:\n",
359 | " print(\"結束翻譯\")\n",
360 | " except sr.RequestError as e:\n",
361 | " print(\"無法翻譯{0}\".format(e))\n",
362 | " # 兩個 except 是當語音辨識不出來的時候 防呆用的\n",
363 | " # 使用Google的服務\n",
364 | " except LookupError:\n",
365 | " print(\"Could not understand audio\")\n",
366 | " except sr.UnknownValueError:\n",
367 | " print(f\"Error: 無法識別 Audio\\t {file}\")\n",
368 | "\n",
369 | "\n",
370 | "\n",
371 | "\n",
372 | "files = os.listdir(wav_path)\n",
373 | "files.sort()\n",
374 | "\n",
375 | "# 因為 colab 調整執行緒的使用原則,max=2 最多 60秒就關閉\n",
376 | "# with concurrent.futures.ThreadPoolExecutor(max_workers=thread_num) as executor:\n",
377 | "# executor.map(VoiceToText_thread, files)\n",
378 | "for file in files:\n",
379 | " VoiceToText_thread(file)\n",
380 | "\n",
381 | "# VoiceToText(wav_path, files, txt_path)\n",
382 | "\n",
383 | "target_txtfile = \"{}.txt\".format(FileName[:-4])\n",
384 | "texts_to_one(txt_path, target_txtfile)\n",
385 | "otr_file = \"{}.otr\".format(FileName[:-4])\n",
386 | "with wave.open(FileName, \"rb\") as f:\n",
387 | " params = f.getparams()\n",
388 | "texts2otr(txt_path, otr_file, FileName, params.nframes)"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "metadata": {
395 | "cellView": "form",
396 | "id": "Rqlix8f26WTs"
397 | },
398 | "outputs": [],
399 | "source": [
400 | "#@title 列出合併的文字檔之檔名\n",
401 | "#@markdown 將會形成 txt 和 [oTranscribe](https://otranscribe.com/) 網站使用的 otr 格式。輸出檔將置於上傳錄音檔同目錄。\n",
402 | "\n",
403 | "#@markdown 若已知道檔名,不需要執行此區塊。\n",
404 | "print(\" 輸出檔名 \".center(100,'='))\n",
405 | "print(target_txtfile)\n",
406 | "print(otr_file)"
407 | ]
408 | },
409 | {
410 | "cell_type": "markdown",
411 | "metadata": {
412 | "id": "nFJ79zpxDuMt"
413 | },
414 | "source": [
415 | "## 6.暫存檔、暫存目錄清理"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 15,
421 | "metadata": {
422 | "cellView": "form",
423 | "id": "XWG0Z-L-D6AK"
424 | },
425 | "outputs": [],
426 | "source": [
427 | "#@title 移除暫存檔、暫存目錄\n",
428 | "\n",
429 | "#@markdown 將會移除 wav, txt 的目錄和 .wav 的暫存檔\n",
430 | "\n",
431 | "#@markdown 你也可以直接在 **Google雲端硬碟** 手動刪除,不透過程式移除\n",
432 | "\n",
433 | "\n",
434 | "shutil.rmtree(wav_path)\n",
435 | "shutil.rmtree(txt_path)\n",
436 | "os.remove(FileName)\n"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": null,
442 | "metadata": {
443 | "cellView": "form",
444 | "id": "hO9kdxCwaadE"
445 | },
446 | "outputs": [],
447 | "source": [
448 | "#@title 卸載 **Google 雲端硬碟**\n",
449 | "drive.flush_and_unmount()"
450 | ]
451 | },
452 | {
453 | "cell_type": "markdown",
454 | "metadata": {
455 | "id": "YnV3vGD6gS-W"
456 | },
457 | "source": [
458 | "## 附錄一.Youtube字幕格式輸出"
459 | ]
460 | },
461 | {
462 | "cell_type": "code",
463 | "execution_count": null,
464 | "metadata": {
465 | "cellView": "form",
466 | "id": "D9M7MJS7a281"
467 | },
468 | "outputs": [],
469 | "source": [
470 | "#@title Youtube 字幕 (.srt) 格式輸出\n",
471 | "def get_timeF(times):\n",
472 | " secs, mins = times % 60, (times//60) % 60\n",
473 | " hours = (times//60)//60\n",
474 | " timeF = \"{:02d}:{:02d}:{:02d}\".format(hours, mins, secs)\n",
475 | " return timeF\n",
476 | "\n",
477 | "def texts2srt(path, target_file):\n",
478 | " template = '''{}\\n{} --> {}\\n{}\\n\\n'''\n",
479 | " files = os.listdir(path)\n",
480 | " files.sort()\n",
481 | " content = ''\n",
482 | " counter = 0\n",
483 | " files = [path+\"/\" + f for f in files if f.endswith(\".txt\")]\n",
484 | " with open(target_file, \"w\", encoding=\"utf-8\") as f:\n",
485 | "\n",
486 | " for file in files:\n",
487 | " with open(file, \"r\", encoding=\"utf-8\") as f2:\n",
488 | " txt = f2.read().split(\"\\n\")\n",
489 | " if len(txt) < 2:\n",
490 | " continue\n",
491 | " counter+=1\n",
492 | " times = (int(txt[0].split(\"-\")[1][:-5])-1)*CutTimeDef\n",
493 | " time_start=get_timeF(times)\n",
494 | " time_end=get_timeF(times+CutTimeDef)\n",
495 | " content += template.format(counter, time_start, time_end, txt[1])\n",
496 | " f.write(content)\n",
497 | " print(\"完成合併, srt 檔案位於 %s \" % target_file)\n",
498 | "\n",
499 | "srt_file = \"{}_srt.txt\".format(FileName[:-4])\n",
500 | "texts2srt(txt_path, srt_file)\n",
501 | "files.download(srt_file)"
502 | ]
503 | }
504 | ],
505 | "metadata": {
506 | "colab": {
507 | "provenance": [],
508 | "mount_file_id": "1SPRxSXsaErSrZ4riQ-1sxHankJ3Hlc9X",
509 | "authorship_tag": "ABX9TyMWp1agJax/qdgy3Ri4I38A",
510 | "include_colab_link": true
511 | },
512 | "kernelspec": {
513 | "display_name": "Python 3",
514 | "name": "python3"
515 | }
516 | },
517 | "nbformat": 4,
518 | "nbformat_minor": 0
519 | }
--------------------------------------------------------------------------------