├── demo.png ├── .gitattributes ├── .gitignore ├── extractDataFromHTML.py ├── extractWordClass.py ├── templates ├── kencollo.css └── kencollo.plist ├── KENCOLLO.lst ├── checkDB.py ├── LICENSE ├── DBSetup.py ├── README.md ├── convertDBToXML.py └── functions.py /demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dodosuke/EpwingToDict/HEAD/demo.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | *.pyc 4 | *.db 5 | *.xml 6 | *.out 7 | *.html 8 | *.db 9 | dictionary.db 10 | .DS_Store 11 | test.py 12 | -------------------------------------------------------------------------------- /extractDataFromHTML.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | from functions import pretreat, storeWCToDB, extractEntryAndIndex, extractMeaning 3 | 4 | # 入力ファイルのパスを指定 5 | f_path = 'KENCOLLO.html' 6 | 7 | # HTML ファイルを前処理し、temp ファイルを出力する 8 | pretreat(f_path) 9 | 10 | # 前処理した HTML からデータを抜き出す。 11 | # 品詞の分類を保存 12 | storeWCToDB() 13 | 14 | # 項目を抽出し、保存 15 | extractEntryAndIndex() 16 | 17 | # 説明文を抽出し、保存 18 | extractMeaning() 19 | -------------------------------------------------------------------------------- /extractWordClass.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | # Extract wordclass type from HTML file 4 | f = codecs.open('KENCOLLO2', 'r', 'utf-8') 5 | categories = [] 6 | for line in f: 7 | start = line.find("【") 8 | if start > 0: 9 | end = line.rfind("】") 10 | category = line[start:end+1] 11 | if category not in categories: 12 | print(category) 13 | categories.append(category) 14 | f.close() 15 | 16 | print(categories) 17 | -------------------------------------------------------------------------------- /templates/kencollo.css: -------------------------------------------------------------------------------- 1 | @charset "UTF-8"; 2 | @namespace d url(http://www.apple.com/DTDs/DictionaryService-1.0.rng); 3 | 4 | d|entry { 5 | } 6 | 7 | h1 { 8 | font-size: 150%; 9 | } 10 | 11 | h3 { 12 | font-size: 100%; 13 | } 14 | 15 | span.column { 16 | display: block; 17 | border: solid 2px #c0c0c0; 18 | margin-left: 2em; 19 | margin-right: 2em; 20 | margin-top: 0.5em; 21 | margin-bottom: 0.5em; 22 | padding: 0.5em; 23 | } 24 | 25 | @media (prefers-dark-interface) 26 | { 27 | body { 28 | color: white; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /KENCOLLO.lst: -------------------------------------------------------------------------------- 1 |  2 |  3 |  ✓ 4 |  嗉 5 |  胳 6 |  骶 7 |  稃 8 |  炻 9 |  絇 10 |  蒴 11 |  © 12 |  ☨ 13 |  14 |  15 |  * 16 |  = 17 |  ç 18 |  ə 19 |  ­ʃ 20 |  ŋ 21 |  ː 22 |  ɛ̃ 23 |  ɑ̃ 24 |  ɑ 25 |  ñ 26 |  $ 27 |  28 |  29 |  30 |  ɔ́ 31 |  É 32 |  á 33 |  é 34 |  í 35 |  à 36 |  è 37 |  ë 38 |  ï 39 |  ö 40 |  â 41 |  ê 42 |  ô 43 |  ō 44 | -------------------------------------------------------------------------------- /checkDB.py: -------------------------------------------------------------------------------- 1 | #import sqlite3 2 | #connect = sqlite3.connect("dictionary.db") 3 | #c = connect.cursor() 4 | #c.execute("SELECT * FROM IndexTag order by id limit 10") 5 | #print(c.fetchall()) 6 | 7 | from sqlalchemy import create_engine 8 | from sqlalchemy.orm import sessionmaker 9 | from DBSetup import Base, Entry, WordClass, Meaning, IndexTag 10 | engine = create_engine('sqlite:///dictionary.db') 11 | Base.metadata.bind = engine 12 | DBSession = sessionmaker(bind=engine) 13 | session = DBSession() 14 | 15 | entry = session.query(Entry).filter_by(entryId='0003804D03D0').one() 16 | meaning = session.query(Meaning).filter_by(entryId='8566').all() 17 | 18 | for i in meaning: 19 | print(i.sentence) 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Keisuke Kishida 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /templates/kencollo.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | CFBundleDevelopmentRegion 6 | English 7 | CFBundleIdentifier 8 | com.apple.dictionary.kencollo 9 | CFBundleName 10 | $DICT_NAME 11 | CFBundleShortVersionString 12 | 1.0 13 | DCSDictionaryCopyright 14 | Copyright © 2007 Apple Inc. 15 | DCSDictionaryManufacturerName 16 | Kenkyusha CO., Ltd. 17 | DCSDictionaryFrontMatterReferenceID 18 | front_back_matter 19 | DCSDictionaryPrefsHTML 20 | MyDictionary_prefs.html 21 | DCSDictionaryXSL 22 | MyDictionary.xsl 23 | DCSDictionaryUseSystemAppearance 24 | 25 | DCSDictionaryDefaultPrefs 26 | 27 | pronunciation 28 | 0 29 | display-column 30 | 1 31 | display-picture 32 | 1 33 | version 34 | 1 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /DBSetup.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, ForeignKey, Integer, String, create_engine 2 | from sqlalchemy.ext.declarative import declarative_base 3 | from sqlalchemy.orm import relationship 4 | 5 | Base = declarative_base() 6 | 7 | # Entry 8 | class Entry(Base): 9 | __tablename__ = 'entry' 10 | id = Column(Integer, primary_key=True) 11 | entryId = Column(String(12), index=True) 12 | title = Column(String(128), nullable=False) 13 | headword = Column(String(1000)) 14 | 15 | # Index 16 | class IndexTag(Base): 17 | __tablename__ = 'indextag' 18 | id = Column(Integer, primary_key=True) 19 | value = Column(String(256), nullable=False) 20 | title = Column(String(256), nullable=False) 21 | yomi = Column(String(256)) 22 | entryId = Column(Integer, ForeignKey('entry.id')) 23 | entry = relationship(Entry) 24 | 25 | # Categorized by the adjacent word type 26 | class WordClass(Base): 27 | __tablename__ = 'wordclass' 28 | id = Column(Integer, primary_key=True) 29 | type = Column(String(1024), nullable=False) 30 | 31 | # Meaning 32 | class Meaning(Base): 33 | __tablename__ = 'meaning' 34 | id = Column(Integer, primary_key=True) 35 | sentence = Column(String(1024), nullable=False) 36 | entryId = Column(Integer, ForeignKey('entry.id')) 37 | entry = relationship(Entry) 38 | wcId = Column(Integer, ForeignKey('wordclass.id')) 39 | wordclass = relationship(WordClass) 40 | 41 | engine = create_engine('sqlite:///dictionary.db') 42 | Base.metadata.create_all(engine) 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EpwingToDict 2 | LogoVista社製「研究社 新編 英和活用大辞典(EPWING形式)」を Mac に標準搭載されている辞書アプリで使えるように変換するための python スクリプトです。 3 | 4 | This is a tool to make a dictionary for Mac from Kenkyusha Eiwa-Katsuyo-Daijiten. 5 | 6 | ## Description 7 | EBDump + ebd2html で出力した HTML をスクレイピングし、データベースを作成し、辞書用の XML ファイルを出力する Python スクリプトです。 8 | LogoVista社の 英和活用大辞典(EPWING) 専用であり、他の辞書やバージョンには対応していません。 9 | * extractDataFromHTML.py: HTML からデータを抜き出すスクリプト 10 | * convertDBToXML.py: XML 出力するスクリプト 11 | * DBSetup.py: データベースの設定 12 | * ExtractFunctions: HTML の前処理及び抽出に関するファイル 13 | * checkDB.py: データベースの内容確認用スクリプト(開発用) 14 | * extractWordClass.py: 品詞の分類抜き出し用スクリプト(開発用) 15 | * KENCOLLO.lst: 外字ファイル(参考資料) 16 | * /templates: Dictionary Development Kit 用 template ファイル例。ダークテーマ対応済。 17 | 18 | なお、今回のスクリプト作成にあたっては、下記のブログを参考にしています。英辞郎やCoubuildの変換に関しては、こちらをご参照ください。 19 | http://www.binword.com/blog/archives/000588.html 20 | 21 | ## Demo 22 | ![Demo](https://github.com/dodosuke/EpwingToDict/blob/master/demo.png) 23 | 24 | ## Requirement 25 | * 辞書データ:LogoVista 社「徹底英語活用セット」(2002年発売版、EPWING形式) 26 | * HTML変換:Windows + EBDump + ebd2html 27 | * 辞書作成:Mac + Python 3 28 | 29 | ## Usage 30 | 1. まずは、下記サイトを参考に HTML ファイルを作成します。作成したファイルの名前を "KENCOLLO.html" にします。(外字を修正したい場合は、このタイミングで HTML ファイルを直接編集してください。) 31 | http://hp.vector.co.jp/authors/VA000022/ebd2html/ebd2html.html 32 | 33 | 2. extractDataFromHTML.py で、HTML を前処理した後、スクレイピングし、データベース(dictionary.db)にデータを格納します(所用時間:20分程度)。 34 | 35 | 3. convertDBToXML.py で、データベースの情報からXMLファイルを作成します(所用時間:30分程度)。 36 | 37 | 4. 作成された XML ファイルを使い、Macの辞書を作成し、インストールします(所用時間:数分)。 38 | 39 | なお、所要時間は、MacBook Pro (late 2013)で実行時した場合の実測値です。 40 | 41 | ## Version 42 | 43 | 2018/10/16 (ver. 1.3): Dark Theme 対応 44 | 45 | 2017/ 8/17 (ver. 1.1): Bug fix 及び コード修正 46 | 47 | 2017/ 8/18 (ver. 1.2): Bug fix 48 | 49 | 2017/ 8/16 (ver. 1.0): 初版公開 50 | 51 | ## TODO 52 | 外字を自動で書き換えない、かなで検索できない、などについては、気が向いたら対応予定。 53 | 54 | ## Licence 55 | [MIT Licence](https://github.com/dodosuke/EpwingToDict/LICENCE) 56 | 57 | ## Author 58 | [dodosuke @ Github](https://github.com/dodosuke) 59 | 60 | [dodosuke0920 @ Twitter](https://twitter.com/dodosuke0920) 61 | -------------------------------------------------------------------------------- /convertDBToXML.py: -------------------------------------------------------------------------------- 1 | import codecs, re 2 | from tqdm import tqdm 3 | from functions import deleteLink 4 | 5 | # データベースを呼び出す 6 | from sqlalchemy import create_engine 7 | from sqlalchemy.orm import sessionmaker 8 | from DBSetup import Base, Entry, WordClass, Meaning, IndexTag 9 | engine = create_engine('sqlite:///dictionary.db') 10 | Base.metadata.bind = engine 11 | DBSession = sessionmaker(bind=engine) 12 | session = DBSession() 13 | 14 | # 出力するファイルを設定 15 | xml_out = codecs.open('KENCOLLO.xml', 'w', 'utf-8') 16 | xml_out.write('\n') 17 | xml_out.write('\n') 18 | 19 | # Entryの数を数える 20 | lastEntry = session.query(Entry).order_by(Entry.id.desc()).first() 21 | numberOfEntries = lastEntry.id 22 | 23 | # 変換プロセスの可視化 24 | pbar = tqdm(range(numberOfEntries)) 25 | 26 | # データベース上のエントリーを出力する 27 | for i in range(numberOfEntries): 28 | 29 | # データベースからエントリーとインデックスを読み込み 30 | entry = session.query(Entry).filter_by(id=i+1).one() 31 | indices = session.query(IndexTag).filter_by(entryId=i+1).all() 32 | 33 | # エントリー名を書く 34 | xml_out.write('\n') 35 | 36 | # Indexを書く 37 | for index in indices: 38 | xml_out.write('\t\n') 39 | 40 | # 検索結果の頭の表記を書く 41 | headword = entry.headword 42 | if headword.find(" -1: 43 | headword = deleteLink(headword) 44 | xml_out.write('\t

' + headword + '

\n') 45 | 46 | # データベースから説明文を読み込み 47 | meanings = session.query(Meaning).filter_by(entryId=i+1).all() 48 | 49 | # 説明文が無い場合は、エントリーを閉じて次の項目へ行こう 50 | if len(meanings) == 0: 51 | xml_out.write('
\n') 52 | pbar.update(1) 53 | continue 54 | 55 | # 説明文を出力 56 | for i in range(len(meanings)): 57 | wcId = meanings[i].wcId 58 | sentence = meanings[i].sentence 59 | 60 | if sentence.find(" -1: 61 | sentence = deleteLink(sentence) 62 | 63 | # 最初 かつ 小分類(1)(2)...がある場合 64 | if wcId == 0 and i == 0: 65 | xml_out.write('\t
\n\t\t

' + sentence + '

\n') 66 | 67 | # 最後に小分類がある場合 68 | elif wcId == 0 and i == len(meanings)-1: 69 | xml_out.write('\t\t\n\t
\n\t
\n\t\t

' + sentence + '

\n\t\t
    \n') 70 | 71 | # 小分類 72 | elif wcId == 0: 73 | xml_out.write('\t\t
\n\t
\n\t
\n\t\t

' + sentence + '

\n') 74 | 75 | # 説明文の最初、【品詞】+説明文 76 | elif i == 0: 77 | wordclass = session.query(WordClass).filter_by(id = wcId).one() 78 | xml_out.write('\t
\n\t\t

' + wordclass.type + '

\n\t\t
    \n') 79 | xml_out.write('\t\t\t
  • ' + sentence + '
  • \n') 80 | 81 | # 説明文のみ記載 82 | elif wcId == meanings[i-1].wcId: 83 | xml_out.write('\t\t\t
  • ' + sentence + '
  • \n') 84 | 85 | # 直前が小分類 の場合 86 | elif meanings[i-1].wcId == 0: 87 | wordclass = session.query(WordClass).filter_by(id = wcId).one() 88 | xml_out.write('\t\t

    ' + wordclass.type + '

    \n\t\t
      \n') 89 | xml_out.write('\t\t\t
    • ' + sentence + '
    • \n') 90 | 91 | # 品詞が切り替わるタイミング 92 | else: 93 | wordclass = session.query(WordClass).filter_by(id = wcId).one() 94 | xml_out.write('\t\t
    \n\t
\n\t
\n\t\t

' + wordclass.type + '

\n\t\t
    \n') 95 | xml_out.write('\t\t\t
  • ' + sentence + '
  • \n') 96 | 97 | # エントリーを閉じる 98 | xml_out.write('\t\t
\n\t
\n\n') 99 | 100 | # tqdm を使って進捗を示す 101 | pbar.update(1) 102 | 103 | # 辞書を閉じる 104 | xml_out.write('\n') 105 | xml_out.close() 106 | -------------------------------------------------------------------------------- /functions.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import re 3 | import mojimoji 4 | from tqdm import tqdm 5 | 6 | # For Importing dictionary 7 | from sqlalchemy import create_engine 8 | from sqlalchemy.orm import sessionmaker 9 | from DBSetup import Base, Entry, WordClass, Meaning, IndexTag 10 | engine = create_engine('sqlite:///dictionary.db') 11 | Base.metadata.bind = engine 12 | DBSession = sessionmaker(bind=engine) 13 | session = DBSession() 14 | 15 | # 一時ファイルのパスを指定 16 | f_temp_path = 'temp.out' 17 | 18 | # HTMLファイルの前処理 19 | def pretreat(f_path): 20 | 21 | print('preprocessing HTML file') 22 | 23 | # ファイルの読み込み 24 | f = codecs.open(f_path, 'r', 'utf-8') 25 | f_temp = codecs.open(f_temp_path, 'w', 'utf-8') 26 | 27 | for line in f: 28 | # , , タグを全削除する 29 | line = line.replace("", "").replace("", "") 30 | line = line.replace("", "").replace("", "") 31 | line = re.sub('(.+?)', '', line) 32 | 33 | # 半角文字前の全角数字を削除する 34 | line = re.sub('[0-9]([a-xA-Z0-9_])', '\\1', line) 35 | 36 | # 全角スペースを半角スペースに変換する 37 | line = line.replace(" ", " ") 38 | 39 | # 不要な改行を削除する 40 | line = line.replace("\n", "") 41 | if line.find(" ") == 0 : 42 | f_temp.write(line) 43 | else: 44 | f_temp.write("\n" + line) 45 | 46 | f.close() 47 | f_temp.close() 48 | 49 | # 品詞の分類をDBへ保存 50 | def storeWCToDB(): 51 | WCs = ['【動詞+】', '【+動詞】', '【形容詞・名詞+】', '【副詞】', '【副詞1】', \ 52 | '【副詞2】', '【前置詞+】', '【+前置詞】', '【雑】', '【+to do】', '【+doing】', \ 53 | '【+that節】', '【+wh.】', '【+how】', '【+that節】【+補】', '【+whether】', \ 54 | '【+whether [if]】', '【+to do】【+doing】', '【+-self】', '【+補】', '【+-self】【+補】'] 55 | for i in range(len(WCs)): 56 | wordclass = WordClass(type=WCs[i]) 57 | session.add(wordclass) 58 | session.commit() 59 | 60 | # 各アイテムをデータベースへ保存 61 | def storeEntryToDB(entryId, title, headword=None): 62 | entry = Entry(entryId=entryId, title=title, headword=headword) 63 | session.add(entry) 64 | session.commit() 65 | 66 | def storeIndexToDB(entryId, value, title): 67 | indextag = IndexTag(value=value, title=title, entryId=entryId) 68 | session.add(indextag) 69 | session.commit() 70 | 71 | def storeMeaningToDB(sentence, entryId, wcId): 72 | meaning = Meaning(sentence=sentence, entryId=entryId, wcId=wcId) 73 | session.add(meaning) 74 | session.commit() 75 | 76 | # HTML から Entry と Index を抜き出し、データベースへ保存 77 | def extractEntryAndIndex(): 78 | 79 | print('Start extracting entry and index') 80 | 81 | # 一時ファイルの読み込み 82 | f = codecs.open(f_temp_path, 'r', 'utf-8') 83 | entryIdForIndex = 0 84 | pbar = tqdm(range(15958)) 85 | 86 | for line in f: 87 | start = line.find("
-1: 99 | # Ignore the Kana type and store index into database 100 | if line.find('type="かな"') < 0: 101 | value_end = line.find("") 102 | title_end = line.find("type=") 103 | value = line[title_end+10:value_end] 104 | value = mojimoji.zen_to_han(value, kana=False).lower() 105 | title = line[12:title_end-2] 106 | storeIndexToDB(entryIdForIndex, value, title) 107 | 108 | elif line.find("") > 0: 109 | break 110 | 111 | f.close() 112 | 113 | # HTML から説明文を抜き出し、データベースへ保存 114 | def extractMeaning(): 115 | 116 | print("Start extracting items.") 117 | # 一時ファイルの読み込み 118 | f = codecs.open(f_temp_path, 'r', 'utf-8') 119 | 120 | entryIdForMeaning = 0 121 | wcId = 0 122 | pbar = tqdm(range(15958)) 123 | 124 | # Extract meanings 125 | # Extract headword and store into Entry DB 126 | for line in f: 127 | if line.find("") > 0: 128 | end = line.find("
") 129 | headword = line[20:end] 130 | entryIdForMeaning += 1 131 | entry = session.query(Entry).filter_by(id=entryIdForMeaning).one() 132 | entry.headword = headword 133 | session.commit() 134 | pbar.update(1) 135 | 136 | elif entryIdForMeaning == 0: 137 | continue 138 | 139 | # Extract subcategory 140 | elif line.find("(") > 0: 141 | end = line.find("
") 142 | sentence = line[20:end] 143 | storeMeaningToDB(sentence,entryIdForMeaning, 0) 144 | 145 | # Extract word class and save wcId for later 146 | elif line.find("【") > 0: 147 | end = line.find("
") 148 | wcType = line[20:end] 149 | wordclass = session.query(WordClass).filter_by(type=wcType).one() 150 | wcId = wordclass.id 151 | 152 | # 普通の項目、最初の点は除く 153 | elif line.find("") > 0: 154 | end = line.find("", "") 163 | a1 = word.find('') 165 | word_out = word[:a1] + word[a2+2:] 166 | return word_out 167 | --------------------------------------------------------------------------------