\n\t\t
' + sentence + '
\n')
74 |
75 | # 説明文の最初、【品詞】+説明文
76 | elif i == 0:
77 | wordclass = session.query(WordClass).filter_by(id = wcId).one()
78 | xml_out.write('\t
\n\t\t
' + wordclass.type + '
\n\t\t
\n')
79 | xml_out.write('\t\t\t- ' + sentence + '
\n')
80 |
81 | # 説明文のみ記載
82 | elif wcId == meanings[i-1].wcId:
83 | xml_out.write('\t\t\t- ' + sentence + '
\n')
84 |
85 | # 直前が小分類 の場合
86 | elif meanings[i-1].wcId == 0:
87 | wordclass = session.query(WordClass).filter_by(id = wcId).one()
88 | xml_out.write('\t\t' + wordclass.type + '
\n\t\t\n')
89 | xml_out.write('\t\t\t- ' + sentence + '
\n')
90 |
91 | # 品詞が切り替わるタイミング
92 | else:
93 | wordclass = session.query(WordClass).filter_by(id = wcId).one()
94 | xml_out.write('\t\t
\n\t
\n\t
\n\t\t
' + wordclass.type + '
\n\t\t
\n')
95 | xml_out.write('\t\t\t- ' + sentence + '
\n')
96 |
97 | # エントリーを閉じる
98 | xml_out.write('\t\t
\n\t
\n\n')
99 |
100 | # tqdm を使って進捗を示す
101 | pbar.update(1)
102 |
103 | # 辞書を閉じる
104 | xml_out.write('\n')
105 | xml_out.close()
106 |
--------------------------------------------------------------------------------
/functions.py:
--------------------------------------------------------------------------------
1 | import codecs
2 | import re
3 | import mojimoji
4 | from tqdm import tqdm
5 |
6 | # For Importing dictionary
7 | from sqlalchemy import create_engine
8 | from sqlalchemy.orm import sessionmaker
9 | from DBSetup import Base, Entry, WordClass, Meaning, IndexTag
# Engine for the SQLite database stored in dictionary.db.
engine = create_engine('sqlite:///dictionary.db')
# Attach that engine to the metadata of the declarative Base imported from DBSetup.
Base.metadata.bind = engine
# Session factory bound to the engine, and the single module-level session
# that the store*ToDB functions in this file add to and commit.
DBSession = sessionmaker(bind=engine)
session = DBSession()

# Path of the temporary file: written by pretreat() and read back by
# extractEntryAndIndex().
f_temp_path = 'temp.out'
17 |
# Preprocess the HTML file
def pretreat(f_path):
    """Clean up the HTML file at ``f_path`` and write it to ``f_temp_path``.

    Every input line goes through a fixed series of string and regex
    substitutions and has its newline removed. A line that then starts with a
    space is appended to the previous output line; any other line is written
    on a new line (so the output begins with a newline).

    Both files are opened as UTF-8 and are closed even when processing raises.

    Args:
        f_path: Path of the HTML file to read.
    """
    print('preprocessing HTML file')

    # Context managers release both handles on every exit path, including
    # exceptions raised while reading, substituting or writing.
    with codecs.open(f_path, 'r', 'utf-8') as f, \
            codecs.open(f_temp_path, 'w', 'utf-8') as f_temp:
        for line in f:
            # Remove all occurrences of the unwanted tags.
            # NOTE(review): the tag names in the original comment and in the
            # literals/pattern below appear empty in this copy of the file;
            # they are reproduced as-is -- confirm against the real source.
            line = line.replace("", "").replace("", "")
            line = line.replace("", "").replace("", "")
            line = re.sub('(.+?)', '', line)

            # Drop a digit that directly precedes a half-width character,
            # keeping that following character (group 1).
            # NOTE(review): the original comment says "full-width digit" but
            # the class shown is `[0-9]`, and the range `a-x` skips y and z --
            # confirm both are intended.
            line = re.sub('[0-9]([a-xA-Z0-9_])', '\\1', line)

            # Convert full-width spaces to half-width spaces.
            # NOTE(review): both literals look identical here -- confirm the
            # first one is a full-width space in the real source.
            line = line.replace(" ", " ")

            # Remove unnecessary line breaks: strip the newline, then decide
            # whether this line continues the previous one.
            line = line.replace("\n", "")
            if line.startswith(" "):
                f_temp.write(line)
            else:
                f_temp.write("\n" + line)
48 |
# Save the word-class categories to the DB
def storeWCToDB():
    """Insert one ``WordClass`` row for each known word-class label.

    Rows are added to the module-level session in list order and committed
    together once every label has been added.
    """
    WCs = [
        '【動詞+】', '【+動詞】', '【形容詞・名詞+】', '【副詞】', '【副詞1】',
        '【副詞2】', '【前置詞+】', '【+前置詞】', '【雑】', '【+to do】', '【+doing】',
        '【+that節】', '【+wh.】', '【+how】', '【+that節】【+補】', '【+whether】',
        '【+whether [if]】', '【+to do】【+doing】', '【+-self】', '【+補】', '【+-self】【+補】',
    ]
    # Walk the labels directly; the position index is not needed.
    for label in WCs:
        session.add(WordClass(type=label))
    # Single commit after the loop so all labels are written together.
    session.commit()
59 |
# Save individual items to the database
def storeEntryToDB(entryId, title, headword=None):
    """Add one ``Entry`` row built from the given fields and commit it.

    Args:
        entryId: Value stored in the row's ``entryId`` field.
        title: Value stored in the row's ``title`` field.
        headword: Value stored in the row's ``headword`` field (default None).
    """
    session.add(Entry(entryId=entryId, title=title, headword=headword))
    session.commit()
65 |
def storeIndexToDB(entryId, value, title):
    """Add one ``IndexTag`` row built from the given fields and commit it.

    Args:
        entryId: Value stored in the row's ``entryId`` field.
        value: Value stored in the row's ``value`` field.
        title: Value stored in the row's ``title`` field.
    """
    session.add(IndexTag(value=value, title=title, entryId=entryId))
    session.commit()
70 |
def storeMeaningToDB(sentence, entryId, wcId):
    """Add one ``Meaning`` row built from the given fields and commit it.

    Args:
        sentence: Value stored in the row's ``sentence`` field.
        entryId: Value stored in the row's ``entryId`` field.
        wcId: Value stored in the row's ``wcId`` field.
    """
    session.add(Meaning(sentence=sentence, entryId=entryId, wcId=wcId))
    session.commit()
75 |
76 | # HTML から Entry と Index を抜き出し、データベースへ保存
77 | def extractEntryAndIndex():
78 |
79 | print('Start extracting entry and index')
80 |
81 | # 一時ファイルの読み込み
82 | f = codecs.open(f_temp_path, 'r', 'utf-8')
83 | entryIdForIndex = 0
84 | pbar = tqdm(range(15958))
85 |
86 | for line in f:
87 | start = line.find("