├── demo.png
├── .gitattributes
├── .gitignore
├── extractDataFromHTML.py
├── extractWordClass.py
├── templates
    ├── kencollo.css
    └── kencollo.plist
├── KENCOLLO.lst
├── checkDB.py
├── LICENSE
├── DBSetup.py
├── README.md
├── convertDBToXML.py
└── functions.py


/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dodosuke/EpwingToDict/HEAD/demo.png


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | *.pyc
 4 | *.db
 5 | *.xml
 6 | *.out
 7 | *.html
 8 | *.db
 9 | dictionary.db
10 | .DS_Store
11 | test.py
12 | 


--------------------------------------------------------------------------------
/extractDataFromHTML.py:
--------------------------------------------------------------------------------
 1 | import codecs
 2 | from functions import pretreat, storeWCToDB, extractEntryAndIndex, extractMeaning
 3 | 
 4 | # Path of the input HTML file
 5 | f_path = 'KENCOLLO.html'
 6 | 
 7 | # Preprocess the HTML file and write the result to a temp file
 8 | pretreat(f_path)
 9 | 
10 | # Extract data from the preprocessed HTML.
11 | # Store the word-class categories first
12 | storeWCToDB()
13 | 
14 | # Extract the entries and their index keys, and store them
15 | extractEntryAndIndex()
16 | 
17 | # Extract the explanatory sentences and store them
18 | extractMeaning()
19 | 


--------------------------------------------------------------------------------
/extractWordClass.py:
--------------------------------------------------------------------------------
 1 | import codecs
 2 | 
 3 | # Extract wordclass type from HTML file
 4 | f = codecs.open('KENCOLLO2', 'r', 'utf-8')
 5 | categories = []
 6 | for line in f:
 7 |     start = line.find("【")
 8 |     if start > 0:
 9 |         end = line.rfind("】")
10 |         category = line[start:end+1]
11 |         if category not in categories:
12 |             print(category)
13 |             categories.append(category)
14 | f.close()
15 | 
16 | print(categories)
17 | 


--------------------------------------------------------------------------------
/templates/kencollo.css:
--------------------------------------------------------------------------------
 1 | @charset "UTF-8";
 2 | @namespace d url(http://www.apple.com/DTDs/DictionaryService-1.0.rng);
 3 | 
 4 | d|entry {
 5 | }
 6 | 
 7 | h1	{
 8 | 	font-size: 150%;
 9 | }
10 | 
11 | h3	{
12 | 	font-size: 100%;
13 | }
14 | 
15 | span.column {
16 | 	display: block;
17 | 	border: solid 2px #c0c0c0;
18 | 	margin-left: 2em;
19 | 	margin-right: 2em;
20 | 	margin-top: 0.5em;
21 | 	margin-bottom: 0.5em;
22 | 	padding: 0.5em;
23 | }
24 | 
25 | @media (prefers-dark-interface)
26 | {
27 | 	body {
28 | 		color: white;
29 | 	}
30 | }
31 | 


--------------------------------------------------------------------------------
/KENCOLLO.lst:
--------------------------------------------------------------------------------
 1 | &#xE000;
 2 | &#xE001;
 3 | &#xE002;	✓
 4 | &#xE003;	嗉
 5 | &#xE004;	胳
 6 | &#xE005;	骶
 7 | &#xE006;	稃
 8 | &#xE007;	炻
 9 | &#xE008;	絇
10 | &#xE009;	蒴
11 | &#xE00A;	©
12 | &#xE00B;	☨
13 | &#xE00C;
14 | &#xE00D;
15 | &#xE00E;	*
16 | &#xE00F;	=
17 | &#xE010;	ç
18 | &#xE011;	ə
19 | &#xE012;	­ʃ
20 | &#xE013;	ŋ
21 | &#xE014;	ː
22 | &#xE015;	ɛ̃
23 | &#xE016;	ɑ̃
24 | &#xE017;	ɑ
25 | &#xE018;	ñ
26 | &#xE019;	$
27 | &#xE01A;
28 | &#xE01B;
29 | &#xE01C;
30 | &#xE01D;	ɔ́
31 | &#xE01E;	É
32 | &#xE01F;	á
33 | &#xE020;	é
34 | &#xE021;	í
35 | &#xE022;	à
36 | &#xE023;	è
37 | &#xE024;	ë
38 | &#xE025;	ï
39 | &#xE026;	ö
40 | &#xE027;	â
41 | &#xE028;	ê
42 | &#xE029;	ô
43 | &#xE02A;	ō
44 | 


--------------------------------------------------------------------------------
/checkDB.py:
--------------------------------------------------------------------------------
 1 | #import sqlite3
 2 | #connect = sqlite3.connect("dictionary.db")
 3 | #c = connect.cursor()
 4 | #c.execute("SELECT * FROM IndexTag order by id limit 10")
 5 | #print(c.fetchall())
 6 | 
 7 | from sqlalchemy import create_engine
 8 | from sqlalchemy.orm import sessionmaker
 9 | from DBSetup import Base, Entry, WordClass, Meaning, IndexTag
10 | engine = create_engine('sqlite:///dictionary.db')  # SQLite file in the current working directory
11 | Base.metadata.bind = engine
12 | DBSession = sessionmaker(bind=engine)
13 | session = DBSession()
14 | 
15 | entry = session.query(Entry).filter_by(entryId='0003804D03D0').one()  # sample lookup by the string entry id; the result is not used below
16 | meaning = session.query(Meaning).filter_by(entryId='8566').all()  # NOTE(review): Meaning.entryId is declared Integer in DBSetup.py but a string is passed here - confirm
17 | 
18 | for i in meaning:
19 |     print(i.sentence)  # print every sentence stored for that entry
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Keisuke Kishida
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/templates/kencollo.plist:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
 3 | <plist version="1.0">
 4 | <dict>
 5 | 	<key>CFBundleDevelopmentRegion</key>
 6 | 	<string>English</string>
 7 | 	<key>CFBundleIdentifier</key>
 8 | 	<string>com.apple.dictionary.kencollo</string>
 9 | 	<key>CFBundleName</key>
10 | 	<string>$DICT_NAME</string>
11 | 	<key>CFBundleShortVersionString</key>
12 | 	<string>1.0</string>
13 | 	<key>DCSDictionaryCopyright</key>
14 | 	<string>Copyright © 2007 Apple Inc.</string>
15 | 	<key>DCSDictionaryManufacturerName</key>
16 | 	<string>Kenkyusha CO., Ltd.</string>
17 | 	<key>DCSDictionaryFrontMatterReferenceID</key>
18 | 	<string>front_back_matter</string>
19 | 	<key>DCSDictionaryPrefsHTML</key>
20 | 	<string>MyDictionary_prefs.html</string>
21 | 	<key>DCSDictionaryXSL</key>
22 | 	<string>MyDictionary.xsl</string>
23 | 	<key>DCSDictionaryUseSystemAppearance</key>
24 |     <true/>
25 | 	<key>DCSDictionaryDefaultPrefs</key>
26 | 	<dict>
27 | 		<key>pronunciation</key>
28 | 		<string>0</string>
29 | 		<key>display-column</key>
30 | 		<string>1</string>
31 | 		<key>display-picture</key>
32 | 		<string>1</string>
33 | 		<key>version</key>
34 | 		<string>1</string>
35 | 	</dict>
36 | </dict>
37 | </plist>
38 | 


--------------------------------------------------------------------------------
/DBSetup.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
 2 | from sqlalchemy.ext.declarative import declarative_base
 3 | from sqlalchemy.orm import relationship
 4 | 
 5 | Base = declarative_base()
 6 | 
 7 | # Entry: one dictionary entry, stored in table 'entry'
 8 | class Entry(Base):
 9 |     __tablename__ = 'entry'
10 |     id = Column(Integer, primary_key=True)  # integer key referenced by IndexTag.entryId and Meaning.entryId
11 |     entryId = Column(String(12), index=True)  # indexed string id; convertDBToXML.py writes it as the d:entry id attribute
12 |     title = Column(String(128), nullable=False)  # required; convertDBToXML.py writes it as the d:title attribute
13 |     headword = Column(String(1000))  # nullable; storeEntryToDB leaves it unset and extractMeaning assigns it later
14 | 
15 | # Index: one search key that points at an entry, stored in table 'indextag'
16 | class IndexTag(Base):
17 |     __tablename__ = 'indextag'
18 |     id = Column(Integer, primary_key=True)
19 |     value = Column(String(256), nullable=False)  # required; written as d:value of a d:index element
20 |     title = Column(String(256), nullable=False)  # required; written as d:title of a d:index element
21 |     yomi = Column(String(256))  # nullable; storeIndexToDB in functions.py never sets it
22 |     entryId = Column(Integer, ForeignKey('entry.id'))  # refers to the integer Entry.id, not to the string Entry.entryId
23 |     entry = relationship(Entry)
24 | 
25 | # Word class: category label keyed by the adjacent word type, stored in table 'wordclass'
26 | class WordClass(Base):
27 |     __tablename__ = 'wordclass'
28 |     id = Column(Integer, primary_key=True)
29 |     type = Column(String(1024), nullable=False)  # bracketed label such as '【動詞＋】'; rows are seeded by storeWCToDB in functions.py
30 | 
31 | # Meaning: one explanatory sentence that belongs to an entry, stored in table 'meaning'
32 | class Meaning(Base):
33 |     __tablename__ = 'meaning'
34 |     id = Column(Integer, primary_key=True)
35 |     sentence = Column(String(1024), nullable=False)
36 |     entryId = Column(Integer, ForeignKey('entry.id'))  # refers to the integer Entry.id
37 |     entry = relationship(Entry)
38 |     wcId = Column(Integer, ForeignKey('wordclass.id'))  # extractMeaning stores 0 for sub-category lines; presumably no WordClass row has id 0 - confirm
39 |     wordclass = relationship(WordClass)
40 | 
41 | engine = create_engine('sqlite:///dictionary.db')  # SQLite file in the current working directory
42 | Base.metadata.create_all(engine)  # top-level statement: runs every time DBSetup is imported
43 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # EpwingToDict
 2 | LogoVista社製「研究社 新編 英和活用大辞典（EPWING形式）」を Mac に標準搭載されている辞書アプリで使えるように変換するための python スクリプトです。
 3 | 
 4 | This is a tool to make a dictionary for Mac from Kenkyusha Eiwa-Katsuyo-Daijiten.
 5 | 
 6 | ## Description
 7 | EBDump + ebd2html で出力した HTML をスクレイピングし、データベースを作成し、辞書用の XML ファイルを出力する Python スクリプトです。
 8 | LogoVista社の 英和活用大辞典（EPWING） 専用であり、他の辞書やバージョンには対応していません。
 9 | * extractDataFromHTML.py: HTML からデータを抜き出すスクリプト
10 | * convertDBToXML.py: XML 出力するスクリプト
11 | * DBSetup.py: データベースの設定
12 | * functions.py: HTML の前処理及び抽出に関するファイル
13 | * checkDB.py: データベースの内容確認用スクリプト（開発用）
14 | * extractWordClass.py: 品詞の分類抜き出し用スクリプト（開発用）
15 | * KENCOLLO.lst: 外字ファイル（参考資料）
16 | * /templates:  Dictionary Development Kit 用 template ファイル例。ダークテーマ対応済。
17 | 
18 | なお、今回のスクリプト作成にあたっては、下記のブログを参考にしています。英辞郎やCobuildの変換に関しては、こちらをご参照ください。
19 | http://www.binword.com/blog/archives/000588.html
20 | 
21 | ## Demo
22 | ![Demo](https://github.com/dodosuke/EpwingToDict/blob/master/demo.png)
23 | 
24 | ## Requirement
25 | * 辞書データ：LogoVista 社「徹底英語活用セット」（2002年発売版、EPWING形式）
26 | * HTML変換：Windows + EBDump + ebd2html
27 | * 辞書作成：Mac + Python 3
28 | 
29 | ## Usage
30 | 1. まずは、下記サイトを参考に HTML ファイルを作成します。作成したファイルの名前を "KENCOLLO.html" にします。（外字を修正したい場合は、このタイミングで HTML ファイルを直接編集してください。）
31 | http://hp.vector.co.jp/authors/VA000022/ebd2html/ebd2html.html
32 | 
33 | 2. extractDataFromHTML.py で、HTML を前処理した後、スクレイピングし、データベース(dictionary.db)にデータを格納します（所要時間：20分程度）。
34 | 
35 | 3. convertDBToXML.py で、データベースの情報からXMLファイルを作成します（所要時間：30分程度）。
36 | 
37 | 4. 作成された XML ファイルを使い、Macの辞書を作成し、インストールします（所要時間：数分）。
38 | 
39 | なお、所要時間は、MacBook Pro (late 2013) で実行した場合の実測値です。
40 | 
41 | ## Version
42 | 
43 | 2018/10/16 (ver. 1.3): Dark Theme 対応
44 | 
45 | 2017/ 8/17 (ver. 1.1): Bug fix 及び コード修正
46 | 
47 | 2017/ 8/18 (ver. 1.2): Bug fix
48 | 
49 | 2017/ 8/16 (ver. 1.0): 初版公開
50 | 
51 | ## TODO
52 | 外字を自動で書き換えない、かなで検索できない、などについては、気が向いたら対応予定。
53 | 
54 | ## Licence
55 | [MIT Licence](https://github.com/dodosuke/EpwingToDict/blob/master/LICENSE)
56 | 
57 | ## Author
58 | [dodosuke @ Github](https://github.com/dodosuke)
59 | 
60 | [dodosuke0920 @ Twitter](https://twitter.com/dodosuke0920)
61 | 


--------------------------------------------------------------------------------
/convertDBToXML.py:
--------------------------------------------------------------------------------
  1 | import codecs, re
  2 | from tqdm import tqdm
  3 | from functions import deleteLink
  4 | 
  5 | # データベースを呼び出す
  6 | from sqlalchemy import create_engine
  7 | from sqlalchemy.orm import sessionmaker
  8 | from DBSetup import Base, Entry, WordClass, Meaning, IndexTag
  9 | engine = create_engine('sqlite:///dictionary.db')
 10 | Base.metadata.bind = engine
 11 | DBSession = sessionmaker(bind=engine)
 12 | session = DBSession()
 13 | 
 14 | # 出力するファイルを設定
 15 | xml_out = codecs.open('KENCOLLO.xml', 'w', 'utf-8')
 16 | xml_out.write('<?xml version="1.0" encoding="UTF-8"?>\n')
 17 | xml_out.write('<d:dictionary xmlns="http://www.w3.org/1999/xhtml" xmlns:d="http://www.apple.com/DTDs/DictionaryService-1.0.rng">\n')
 18 | 
 19 | # Entryの数を数える
 20 | lastEntry = session.query(Entry).order_by(Entry.id.desc()).first()
 21 | numberOfEntries = lastEntry.id
 22 | 
 23 | # 変換プロセスの可視化
 24 | pbar = tqdm(range(numberOfEntries))
 25 | 
 26 | # データベース上のエントリーを出力する
 27 | for i in range(numberOfEntries):
 28 | 
 29 |     # データベースからエントリーとインデックスを読み込み
 30 |     entry = session.query(Entry).filter_by(id=i+1).one()
 31 |     indices = session.query(IndexTag).filter_by(entryId=i+1).all()
 32 | 
 33 |     # エントリー名を書く
 34 |     xml_out.write('<d:entry id="' + entry.entryId + '" d:title="' + entry.title + '">\n')
 35 | 
 36 |     # Indexを書く
 37 |     for index in indices:
 38 |         xml_out.write('\t<d:index d:value="' + index.value + '" d:title="' + index.title + '" />\n')
 39 | 
 40 |     # 検索結果の頭の表記を書く
 41 |     headword = entry.headword
 42 |     if headword.find("<a") > -1:
 43 |         headword = deleteLink(headword)
 44 |     xml_out.write('\t<h1><span class="headword">' + headword + '</span></h1>\n')
 45 | 
 46 |     # データベースから説明文を読み込み
 47 |     meanings = session.query(Meaning).filter_by(entryId=i+1).all()
 48 | 
 49 |     # 説明文が無い場合は、エントリーを閉じて次の項目へ行こう
 50 |     if len(meanings) == 0:
 51 |         xml_out.write('</d:entry>\n')
 52 |         pbar.update(1)
 53 |         continue
 54 | 
 55 |     # 説明文を出力
 56 |     for i in range(len(meanings)):
 57 |         wcId = meanings[i].wcId
 58 |         sentence = meanings[i].sentence
 59 | 
 60 |         if sentence.find("<a") > -1:
 61 |             sentence = deleteLink(sentence)
 62 | 
 63 |         # 最初　かつ　小分類（１）（２）...がある場合
 64 |         if wcId == 0 and i == 0:
 65 |             xml_out.write('\t<div>\n\t\t<p>' + sentence + '</p>\n')
 66 | 
 67 |         # 最後に小分類がある場合
 68 |         elif wcId == 0 and i == len(meanings)-1:
 69 |             xml_out.write('\t\t</ul>\n\t</div>\n\t<div>\n\t\t<p>' + sentence + '</p>\n\t\t<ul>\n')
 70 | 
 71 |         # 小分類
 72 |         elif wcId == 0:
 73 |             xml_out.write('\t\t</ul>\n\t</div>\n\t<div>\n\t\t<p>' + sentence + '</p>\n')
 74 | 
 75 |         # 説明文の最初、【品詞】＋説明文
 76 |         elif i == 0:
 77 |             wordclass = session.query(WordClass).filter_by(id = wcId).one()
 78 |             xml_out.write('\t<div>\n\t\t<p>' + wordclass.type + '</p>\n\t\t<ul>\n')
 79 |             xml_out.write('\t\t\t<li>' + sentence + '</li>\n')
 80 | 
 81 |         # 説明文のみ記載
 82 |         elif wcId == meanings[i-1].wcId:
 83 |             xml_out.write('\t\t\t<li>' + sentence + '</li>\n')
 84 | 
 85 |         # 直前が小分類 の場合
 86 |         elif meanings[i-1].wcId == 0:
 87 |             wordclass = session.query(WordClass).filter_by(id = wcId).one()
 88 |             xml_out.write('\t\t<p>' + wordclass.type + '</p>\n\t\t<ul>\n')
 89 |             xml_out.write('\t\t\t<li>' + sentence + '</li>\n')
 90 | 
 91 |         # 品詞が切り替わるタイミング
 92 |         else:
 93 |             wordclass = session.query(WordClass).filter_by(id = wcId).one()
 94 |             xml_out.write('\t\t</ul>\n\t</div>\n\t<div>\n\t\t<p>' + wordclass.type + '</p>\n\t\t<ul>\n')
 95 |             xml_out.write('\t\t\t<li>' + sentence + '</li>\n')
 96 | 
 97 |     # エントリーを閉じる
 98 |     xml_out.write('\t\t</ul>\n\t</div>\n</d:entry>\n')
 99 | 
100 |     # tqdm を使って進捗を示す
101 |     pbar.update(1)
102 | 
103 | # 辞書を閉じる
104 | xml_out.write('</d:dictionary>\n')
105 | xml_out.close()
106 | 


--------------------------------------------------------------------------------
/functions.py:
--------------------------------------------------------------------------------
  1 | import codecs
  2 | import re
  3 | import mojimoji
  4 | from tqdm import tqdm
  5 | 
  6 | # For Importing dictionary
  7 | from sqlalchemy import create_engine
  8 | from sqlalchemy.orm import sessionmaker
  9 | from DBSetup import Base, Entry, WordClass, Meaning, IndexTag
 10 | engine = create_engine('sqlite:///dictionary.db')
 11 | Base.metadata.bind = engine
 12 | DBSession = sessionmaker(bind=engine)
 13 | session = DBSession()  # single module-level session used by the store* and extract* functions below
 14 | 
 15 | # Path of the temporary file
 16 | f_temp_path = 'temp.out'  # written by pretreat, read by extractEntryAndIndex and extractMeaning
 17 | 
 18 | # HTMLファイルの前処理
 19 | def pretreat(f_path):
 20 | 
 21 |     print('preprocessing HTML file')
 22 | 
 23 |     # ファイルの読み込み
 24 |     f = codecs.open(f_path, 'r', 'utf-8')
 25 |     f_temp = codecs.open(f_temp_path, 'w', 'utf-8')
 26 | 
 27 |     for line in f:
 28 |         # <nobr>, <sub>, <sup> タグを全削除する
 29 |         line = line.replace("<nobr>", "").replace("</nobr>", "")
 30 |         line = line.replace("<sub>", "").replace("</sub>", "")
 31 |         line = re.sub('<sup>(.+?)</sup>', '', line)
 32 | 
 33 |         # 半角文字前の全角数字を削除する
 34 |         line = re.sub('[０-９]([a-xA-Z0-9_])', '\\1', line)
 35 | 
 36 |         # 全角スペースを半角スペースに変換する
 37 |         line = line.replace("　", " ")
 38 | 
 39 |         # 不要な改行を削除する
 40 |         line = line.replace("\n", "")
 41 |         if line.find(" ") == 0 :
 42 |             f_temp.write(line)
 43 |         else:
 44 |             f_temp.write("\n" + line)
 45 | 
 46 |     f.close()
 47 |     f_temp.close()
 48 | 
 49 | # 品詞の分類をDBへ保存
 50 | def storeWCToDB():
 51 |     WCs = ['【動詞＋】', '【＋動詞】', '【形容詞・名詞＋】', '【副詞】', '【副詞１】', \
 52 |     '【副詞２】', '【前置詞＋】', '【＋前置詞】', '【雑】', '【＋to do】', '【＋doing】', \
 53 |     '【＋that節】', '【＋wh.】', '【＋how】', '【＋that節】【＋補】', '【＋whether】', \
 54 |     '【＋whether [if]】', '【＋to do】【＋doing】', '【＋-self】', '【＋補】', '【＋-self】【＋補】']
 55 |     for i in range(len(WCs)):
 56 |         wordclass = WordClass(type=WCs[i])
 57 |         session.add(wordclass)
 58 |         session.commit()
 59 | 
 60 | # 各アイテムをデータベースへ保存
 61 | def storeEntryToDB(entryId, title, headword=None):
 62 |     entry = Entry(entryId=entryId, title=title, headword=headword)
 63 |     session.add(entry)
 64 |     session.commit()
 65 | 
 66 | def storeIndexToDB(entryId, value, title):
 67 |     indextag = IndexTag(value=value, title=title, entryId=entryId)
 68 |     session.add(indextag)
 69 |     session.commit()
 70 | 
 71 | def storeMeaningToDB(sentence, entryId, wcId):
 72 |     meaning = Meaning(sentence=sentence, entryId=entryId, wcId=wcId)
 73 |     session.add(meaning)
 74 |     session.commit()
 75 | 
 76 | # Extract Entry and Index records from the preprocessed HTML and store them in the database
 77 | def extractEntryAndIndex():
 78 | 
 79 |     print('Start extracting entry and index')
 80 | 
 81 |     # Open the temporary file written by pretreat
 82 |     f = codecs.open(f_temp_path, 'r', 'utf-8')
 83 |     entryIdForIndex = 0  # counts the entries stored so far; passed on as IndexTag.entryId
 84 |     pbar = tqdm(range(15958))  # progress-bar length is hard-coded
 85 | 
 86 |     for line in f:
 87 |         start = line.find("<dt id=")
 88 |         # Extract Entry
 89 |         if start > -1:
 90 |             end = line.find("<a")
 91 |             entryId = line[start+8:20]  # skips '<dt id=' plus one more character (presumably the opening quote - confirm), up to absolute column 20
 92 |             title = line[22:end-1]  # absolute column 22 up to one character before the first '<a'
 93 |             storeEntryToDB(entryId, title)
 94 |             entryIdForIndex += 1  # presumably equals the new row's Entry.id; that only holds when the table started empty - confirm
 95 |             pbar.update(1)
 96 | 
 97 |         # Extract index
 98 |         elif line.find("<key") > -1:
 99 |             # Ignore the Kana type and store index into database
100 |             if line.find('type="かな"') < 0:
101 |                 value_end = line.find("</key>")
102 |                 title_end = line.find("type=")
103 |                 value = line[title_end+10:value_end]  # fixed offset of 10 past 'type=' up to '</key>'
104 |                 value = mojimoji.zen_to_han(value, kana=False).lower()
105 |                 title = line[12:title_end-2]  # absolute column 12 up to two characters before 'type='
106 |                 storeIndexToDB(entryIdForIndex, value, title)
107 | 
108 |         elif line.find("") > 0:  # NOTE(review): the search string is empty in this view, so find() returns 0 and this branch never runs; presumably it held a gaiji marker character - confirm
109 |             break
110 | 
111 |     f.close()
112 | 
113 | # Extract the explanatory sentences from the preprocessed HTML and store them in the database
114 | def extractMeaning():
115 | 
116 |     print("Start extracting items.")
117 |     # Open the temporary file written by pretreat
118 |     f = codecs.open(f_temp_path, 'r', 'utf-8')
119 | 
120 |     entryIdForMeaning = 0  # counts headword lines seen; used as Entry.id in the lookup below
121 |     wcId = 0  # id of the most recent word-class line; stays 0 until one is seen
122 |     pbar = tqdm(range(15958))  # progress-bar length is hard-coded
123 | 
124 |     # Extract meanings
125 |     # Extract headword and store into Entry DB
126 |     for line in f:
127 |         if line.find("") > 0:  # NOTE(review): the search string is empty in this view, so this test is never true and the loop stores nothing; presumably it held a gaiji marker character - confirm
128 |             end = line.find("<br>")
129 |             headword = line[20:end]  # absolute column 20 up to the first '<br>'
130 |             entryIdForMeaning += 1
131 |             entry = session.query(Entry).filter_by(id=entryIdForMeaning).one()
132 |             entry.headword = headword
133 |             session.commit()
134 |             pbar.update(1)
135 | 
136 |         elif entryIdForMeaning == 0:  # skip every line before the first headword line
137 |             continue
138 | 
139 |         # Extract subcategory
140 |         elif line.find("(") > 0:
141 |             end = line.find("<br>")
142 |             sentence = line[20:end]
143 |             storeMeaningToDB(sentence,entryIdForMeaning, 0)  # sub-category lines are stored with wcId 0
144 | 
145 |         # Extract word class and save wcId for later
146 |         elif line.find("【") > 0:
147 |             end = line.find("<br>")
148 |             wcType = line[20:end]
149 |             wordclass = session.query(WordClass).filter_by(type=wcType).one()
150 |             wcId = wordclass.id
151 | 
152 |         # Ordinary item; the leading dot is left out
153 |         elif line.find("") > 0:  # NOTE(review): empty search string in this view as well; presumably the bullet gaiji character - confirm
154 |             end = line.find("<a")
155 |             sentence = line[21:end]  # starts one column later than the other branches
156 |             storeMeaningToDB(sentence, entryIdForMeaning, wcId)
157 | 
158 |     f.close()
159 | 
# Function that removes links
def deleteLink(word):
    """Return *word* with every <a ...> and </a> tag removed.

    Only the tags are removed; the link text and everything around it is
    kept. Each opening tag is matched on its own, so text between two links
    survives, and a string without any link is returned unchanged.
    """
    without_closing = word.replace("</a>", "")
    # \b keeps other tags that merely start with "a" (e.g. <abbr>) intact.
    return re.sub(r'<a\b[^>]*>', '', without_closing)
167 | 


--------------------------------------------------------------------------------