├── ._wscript
├── .lock-waf_darwin_build
├── LICENSE
├── README.rst
├── history.txt
├── src
    ├── abstime_expression_normalizer.cpp
    ├── abstime_expression_normalizer.hpp
    ├── abstime_expression_normalizer_test.cpp
    ├── dic
    │   ├── en
    │   │   └── .DS_Store
    │   ├── ja
    │   │   ├── abstime_expression_json.txt
    │   │   ├── abstime_prefix_counter_json.txt
    │   │   ├── abstime_prefix_json.txt
    │   │   ├── abstime_suffix_json.txt
    │   │   ├── chinese_character.txt
    │   │   ├── duration_expression_json.txt
    │   │   ├── duration_prefix_counter_json.txt
    │   │   ├── duration_prefix_json.txt
    │   │   ├── duration_suffix_json.txt
    │   │   ├── inappropriate_strings_json.txt
    │   │   ├── num_counter_json.txt
    │   │   ├── num_prefix_counter_json.txt
    │   │   ├── num_prefix_json.txt
    │   │   ├── num_suffix_json.txt
    │   │   ├── raw
    │   │   │   ├── abstime_date.txt
    │   │   │   ├── abstime_dayweek.txt
    │   │   │   ├── abstime_dayweek_pattern.txt
    │   │   │   ├── abstime_nengou.txt
    │   │   │   ├── abstime_prefix_counter.txt
    │   │   │   ├── abstime_settouji.txt
    │   │   │   ├── abstime_setubiji.txt
    │   │   │   ├── abstime_time.txt
    │   │   │   ├── create_dic_abstime.py
    │   │   │   ├── create_dic_abstime_date+time.py
    │   │   │   ├── create_dic_abstime_prefix_counter.py
    │   │   │   ├── create_dic_dayweek.py
    │   │   │   ├── create_dic_duration.py
    │   │   │   ├── create_dic_inappropriate.py
    │   │   │   ├── create_dic_num.py
    │   │   │   ├── create_dic_num_prefix_counter.py
    │   │   │   ├── create_dic_number_modifier.py
    │   │   │   ├── create_dic_reltime.py
    │   │   │   ├── create_dic_reltime_prefix_counter.py
    │   │   │   ├── duration_prefix_counter.txt
    │   │   │   ├── duration_setouji.txt
    │   │   │   ├── duration_setubiji.txt
    │   │   │   ├── duration_time_position.txt
    │   │   │   ├── inappropriate_strings.txt
    │   │   │   ├── make_dictionary.sh
    │   │   │   ├── make_dictionary.sh~
    │   │   │   ├── num.txt
    │   │   │   ├── num_SItanni_hankaku.txt
    │   │   │   ├── num_SItanni_katakana.txt
    │   │   │   ├── num_SItanni_settouji_hankaku.txt
    │   │   │   ├── num_SItanni_settouji_katakana.txt
    │   │   │   ├── num_SItanni_settouji_zenkaku.txt
    │   │   │   ├── num_SItanni_zenkaku.txt
    │   │   │   ├── num_expand.txt
    │   │   │   ├── num_prefix_counter.txt
    │   │   │   ├── num_settouji.txt
    │   │   │   ├── num_setubiji.txt
    │   │   │   ├── num_wari.txt
    │   │   │   ├── reltime_prefix_counter.txt
    │   │   │   ├── reltime_settouji.txt
    │   │   │   ├── reltime_specific.txt
    │   │   │   ├── reltime_time_option.txt
    │   │   │   ├── reltime_time_position.txt
    │   │   │   └── reltime_time_pre_option.txt
    │   │   ├── reltime_expression_json.txt
    │   │   ├── reltime_prefix_counter_json.txt
    │   │   ├── reltime_prefix_json.txt
    │   │   └── reltime_suffix_json.txt
    │   └── zh
    │   │   ├── .DS_Store
    │   │   ├── ._chinese_character.txt
    │   │   ├── abstime_expression_json.txt
    │   │   ├── chinese_character.txt
    │   │   └── num_counter_json.txt
    ├── dictionary_dirpath.cpp
    ├── dictionary_dirpath.hpp
    ├── digit_utility.cpp
    ├── digit_utility.hpp
    ├── digit_utility_test.cpp
    ├── duration_expression_normalizer.cpp
    ├── duration_expression_normalizer.hpp
    ├── duration_expression_normalizer_test.cpp
    ├── inappropriate_expression_remover.cpp
    ├── inappropriate_expression_remover.hpp
    ├── main.cpp
    ├── normalize_numexp.cpp
    ├── normalize_numexp.hpp
    ├── normalize_numexp_test.cpp
    ├── normalizer_template.hpp
    ├── normalizer_utility.cpp
    ├── normalizer_utility.hpp
    ├── normalizer_utility_test.cpp
    ├── number_normalizer.cpp
    ├── number_normalizer.hpp
    ├── number_normalizer_test.cpp
    ├── numerical_expression_extractor.pyc
    ├── numerical_expression_normalizer.cpp
    ├── numerical_expression_normalizer.hpp
    ├── numerical_expression_normalizer_test.cpp
    ├── optparse.h
    ├── reltime_expression_normalizer.cpp
    ├── reltime_expression_normalizer.hpp
    ├── reltime_expression_normalizer_test.cpp
    └── wscript
├── swig
    ├── java
    │   ├── TestNormalizeNumexp.java
    │   ├── compile.sh
    │   └── readme.txt
    ├── normalize_numexp.i
    ├── python
    │   ├── compile.sh
    │   ├── readme.txt
    │   └── test_normalize_numexp.py
    └── ruby
    │   ├── compile.sh
    │   ├── readme.txt
    │   └── test-normalize-numexp.rb
├── unittest_gtest.py
├── waf
└── wscript


/._wscript:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/._wscript


--------------------------------------------------------------------------------
/.lock-waf_darwin_build:
--------------------------------------------------------------------------------
1 | argv = ['./waf', 'configure']
2 | environ = {'TERM_SESSION_ID': '8DDB165E-368B-48DD-A813-50947A275351', 'PYTHONPATH': '/Users/katsuma/', 'SSH_AUTH_SOCK': '/tmp/launch-ZM3Pmm/Listeners', 'TERM_PROGRAM_VERSION': '303.2', 'Apple_PubSub_Socket_Render': '/tmp/launch-ueodjD/Render', 'LOGNAME': 'katsuma', 'USER': 'katsuma', 'HOME': '/Users/katsuma', 'PKG_CONFIG_PATH': '/usr/local/lib/pkgconfig/', 'PATH': '/opt/local/bin/:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin:/usr/X11/bin', 'PS1': '\\[\\e[0;32m\\]\\u@\\h\\[\\e[0m\\][\\t]:\\W$ ', 'DISPLAY': '/tmp/launch-xm9ozw/org.x:0', '_': './waf', 'TERM_PROGRAM': 'Apple_Terminal', 'LANG': 'ja_JP.UTF-8', '__CF_USER_TEXT_ENCODING': '0x1F6:1:14', 'TERM': 'xterm-256color', 'SHELL': '/bin/bash', 'SHLVL': '1', 'OLDPWD': '/Users/katsuma', 'HISTSIZE': '10000', 'HISTCONTROL': 'ignoreboth', 'Apple_Ubiquity_Message': '/tmp/launch-UtGZZs/Apple_Ubiquity_Message', 'PWD': '/Users/katsuma/src/normalizeNumexp', 'TMPDIR': '/var/folders/68/zvn0f60d2cqgsjrn3pnd9rr00000gp/T/', 'CLICOLOR': '1', 'COMMAND_MODE': 'unix2003', 'LSCOLORS': 'gxfxcxdxbxegedabagacad'}
3 | files = ['/Users/katsuma/src/normalizeNumexp/wscript']
4 | hash = -8290367226182741820
5 | options = {'files': '', 'checkall': False, 'targets': '', 'jobs': 2, 'verbose': 0, 'nocache': False, 'progress_bar': 0, 'checkone': False, 'top': '', 'destdir': '', 'keep': 0, 'zones': '', 'prefix': '/usr/local/', 'download': False, 'force': False, 'out': '', 'check_cxx_compiler': 'g++', 'check': False, 'checkfilter': False}
6 | out_dir = '/Users/katsuma/src/normalizeNumexp/build'
7 | run_dir = '/Users/katsuma/src/normalizeNumexp'
8 | top_dir = '/Users/katsuma/src/normalizeNumexp'
9 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012, Katsuma Narisawa.
 2 | 
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 |     * Redistributions of source code must retain the above copyright
 9 |       notice, this list of conditions and the following disclaimer.
10 | 
11 |     * Redistributions in binary form must reproduce the above
12 |       copyright notice, this list of conditions and the following
13 |       disclaimer in the documentation and/or other materials provided
14 |       with the distribution.
15 | 
16 |     * Neither the name of tanakh nor the names of other
17 |       contributors may be used to endorse or promote products derived
18 |       from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | ==========================================================
 2 | normalizeNumexp : Numerical/Temporal expression normalizer
 3 | ==========================================================
 4 | 
 5 | About
 6 | =====
 7 | 
 8 | This is a tool for normalizing numerical/temporal expressions.
 9 | 
10 | 
11 | Necessary Libraries
12 | ===================
13 | ux(More Succinct Trie Data structure):http://code.google.com/p/ux-trie/wiki/Tutorial_Japanese
14 | 
15 | pficommon(General purpose C++ library for PFI):https://github.com/pfi/pficommon
16 | 
17 | 
18 | Install
19 | =======
20 | 
21 | Run the following commands.
22 | 
23 | ..
24 | 
25 |   $ ./waf configure
26 | 
27 |   $ ./waf build
28 | 
29 |   $ ./waf install
30 | 
31 | To check that the installation has completed successfully, run:
32 | 
33 | ..
34 | 
35 |   $ ./waf --checkall
36 | 
37 | 
38 | How to Use
39 | ==========
40 | 
41 | This utility normalizes (Japanese) numerical and temporal expressions in the input sentence.
42 | 
43 | ..
44 | 
45 |   $ normalizeNumexp
46 | 
47 |   魔女狩りは15世紀〜18世紀にかけてみられ、全ヨーロッパで4万人が処刑された
48 | 
49 |   >numerical*4万人*29*32*人*40000*40000*
50 | 
51 |   >abstime*15世紀〜18世紀*5*14*none*1401-XX-XX*1800-XX-XX*
52 | 
53 | 
54 | If you want to know more details about this tool, please read the following documentation.
55 | http://www.cl.ecei.tohoku.ac.jp/~katsuma/software/normalizeNumexp/
56 | 
57 | 
58 | 
59 | 
60 | 
61 | 


--------------------------------------------------------------------------------
/history.txt:
--------------------------------------------------------------------------------
  1 | 2012/12/? ver3.0を公開。
  2 | 
  3 | 
  4 | ver2.0 -> 3.0 の主な違い
  5 | ・より簡潔な実装を目指してリファクタリングを行いました。
  6 | ・MeCabの使用をやめました。形態素区切り情報が利用できず、若干精度が落ちますが、処理が簡潔になりました。精度が落ちる問題は、外部モジュールを作成し今後対応する予定です
  7 | ・pficommonを用いることで、wstringを使用せずに日本語が扱えるようになりました。これによりlocale周りの問題が解決しています。
  8 | ・uxの使用。
  9 | ・辞書をjson形式に
 10 | 
 11 | 
 12 | 
 13 | 
 14 | 
 15 | 
 16 | *****************
 17 | 
 18 | 以下に制作者の個人的なメモを載せておきます。
 19 | 細かい仕様などが気になる方以外は特に読む必要はありません。
 20 | 
 21 | 
 22 | ■バグリスト
 23 | 「3分の1」 = 33%として抽出する？
 24 | 「戦後五十年間」の扱い
 25 | ぐらい
 26 | ミリ秒
 27 | 「7年ぶり」の扱い（扱わない）
 28 | 十六世　とらない
 29 | 翌三年　明日三日、と同じ。どうするのだっけ？
 30 | 五十五分ごろ、がdurationになる（abstimeの許容値を超えてしまっているため）
 31 | １０時—１８時 五百円—千百円
 32 | 約三○度
 33 | 「年度」どう扱う？（評価ではとりあえず無視）
 34 | 「60t」はトン？
 35 | ３００〜７００万円
 36 | 
 37 | 
 38 | ■TODO List
 39 | ・多倍長整数の実装（現在は数値をすべてdoubleで扱っている）
 40 | ・辞書の整備　SI単位系、世界の貨幣、各種専門用語
 41 | ・曖昧性の解消
 42 | 	・num,abs,rel,durで複数ヒットした場合の処理。現状では適当な順序で最長マッチさせてるだけ。
 43 | 	・一般名詞を認識してしまう。除去リストの作成の必要
 44 | 	・URLとか英字羅列で認識してしまう。URLは頻繁に出てくるので、なんとかする。
 45 | ・英語の表現（特に時間）
 46 | ・数の認識
 47 | 	・並列表記（１、２）への対応　
 48 | 		・x,x+1となる数のみ対応。1991,92年などは未対応
 49 | 		            ・これに限って言えば、abstimeのパターンとしてとってしまってprocessで処理すれば処理可能。
 50 | 			    ・「数万」の扱い
 51 | 			    ・1千1千など、不適当な表記　ある程度やったが、他に変な表現はあるかも。「100百20十」とかは無理。
 52 | 			    ・30-40万年前　30年から40万年前で認識
 53 | ・数量表現
 54 | 	・「代」「台」の問題　特に対処していない
 55 | ・絶対時間表現
 56 | 	・1989.3  3.11の違いを判定
 57 | 	・序数はすべて持続時間。absではない。
 58 | 	・曜日　９月２９日（金）〜１０月１８日（水）　　2001.4.29 Friday 1:30
 59 | ・2回以上の接尾辞、接頭辞はとっていない
 60 | ・形態素区切り情報をいれていない　「シャンプー1本」「総ページ数100頁」
 61 | ・<= <の区別を表示させる。1920年代　とかで間違えてしまう
 62 | ・その他
 63 | 	・月額２，６０４円（税込）から　（税込）まで認識しないと、範囲表現がとれない
 64 | ・「2の10乗キロメートル」「三分の一キログラム」このような特殊な数の表記については対応していない
 65 | 
 66 | 
 67 | 
 68 | 
 69 | 
 70 | ■対応が難しいもの
 71 | ・数量を含まない表現
 72 | 	・半世紀、数世紀　などの表現。数は入っていないが、数として本当は認識したい
 73 | ・曖昧性の解消
 74 | 	・五輪、 　一体（除去してしまっているが、本当に除去していいのか判定する必要）
 75 | 	・（20）、
 76 | 	・評価用アイスセンサーキット：　\120,000
 77 | 	・頻度表現、現状の表現法で良い？
 78 | 	・同三日　　どうしようもない
 79 | 	・キロ、センチ、ミリ（すべてmに統一）
 80 | 	・30年の歴史　　「三年生」もとっちゃってる
 81 | 	・ＤＣカプラーＣＰ４５Ｗ　ＬＤＲ−２１６シリーズ ２３区、　一戸〜八戸
 82 | 	・「雑居ビル6F」　ファラデーとして抽出してしまう
 83 | ・特殊な例
 84 | 	・年齢表現　生後６ヶ月〜８０才前後
 85 | ・その他
 86 | 	・ひらがなは難しい。「１ねんぶんのじゃがいも」「３にんがたべました」
 87 | 	・３月第三週　　序数はどうする？
 88 | 
 89 | 
 90 | 
 91 | その他メモ：
 92 | ・二週間以内：持続時間
 93 | ・比などはとっていない　　 1:3:5、　1,3,5の割合で〜とか
 94 | ・３１ページで紹介した〜 -> 31ページ「目」で紹介した〜　ということ？未考慮
 95 | ・一番お値打ち　＜「最も」の意味。数量表現か？
 96 | ・「周年」は経過した時間を示す。どれとも言い辛いが、数量表現とする。
 97 | ・「13月」など存在しない絶対時間表現は抽出しない。
 98 | ・昔の数字にも対応したが、precisionが下がる&入れなくてもほぼrecallは下がらないので、外した方がいいかもしれない。
 99 | 
100 | 
101 | 未対応・注意点リスト（具体的に、細かく。）
102 | ・「20人〜」「20人から」「〜20人」は「20人」に等しいとする（20~∞とは扱わない）
103 | ・「30人まで」は「−∞〜30」と扱う
104 | ・「先月1日」は「先日（相対時間表現）」＋「1日（絶対時間表現）」という構成をもつ相対時間表現。これを相対時間表現とするために、相対時間表現の実装の中で絶対時間表現の実装と重複する処理をたくさん行っている
105 | ・３月第三週　　分割して認識している。
106 | ・「9Paまで下げる」「9階まで降りる」これは範囲表現ではないと考えられる。現在はとってしまっている。
107 | ・「数年」「数週間」　数字が入っていないので抽出対象外
108 | ・およそ１００人〜５００人　およそが二重にかかるバグ <本当ならaboutなどの処理は最後にやるべき。やはり、処理はせずにoptionとして出力した方が良い？
109 | ・80歳前後、で0.7がけするのはやりすぎ。aboutの範囲はかなり雑に決められている
110 | ・何百円　扱っていない
111 | ・「台分」「人分」「◯分」分は色々な単位につきうる。数量表現？？
112 | 	・とりあえず出てきたやつを追加している
113 | 	・「単位」＋「分」で検索かけて、でてきたやつを単位に追加しよう <<< 後で
114 | 	・※「台」だと車を数えていて、「台分」だとそれによりできるスペースを示している
115 | ・「およそほぼ約30人」　修飾語は2語までしかとっていない
116 | ・h, hour, m, min, s, sec 対応していない
117 | ・1歳未満　厳密な意味は0~1だけど-INF~1になっている
118 | ・3割5分　5尺6寸 1円30銭　対応していない（どんな数量表現にする？）
119 | ・定価１，５００円（税込）、　家族４人、　　余計な表現は含めない
120 | ・電話番号、住所etcはとらない
121 | ・２ヶ月に１回　一日に三回　現状では、とりあえず分割して考える
122 | ・グッチペンダントネックレス１４５１７１−Ｊ８４００−８１０６価格：４０６３５円　【グッチ】ＧＵＣＣＩ　商品名、番号はとらない
123 | ・直径１．６ｃｍ < 直径が1.6cm。できるだけ抽出する、が評価実験の際、抽出できなくても負例とはしない。
124 | ・5階、305号室、3丁目　＜名詞化。場所を示しているのであって、量を示してはいない。抽出しない。（「階」はとれてしまっている
125 | ・月号　＜量ではない。とらない。
126 | ・数量表現かまだ迷っているもの、のうち抽出するもの：３倍速
127 | ・3歳児：数量表現でない
128 | ・固有名詞中の数量表現
129 |          * 正例：マガジン3月3日号、特集国家百年
130 |          * 負例：そろそろ三日兎にいこうぜ　<- 店の名前？
131 |          * 固有名詞中の曖昧な表現はとらなくて良い、ということで（マガジン2006）
132 |          * ? : 3L缶（3リットル缶？）
133 |       * text:で，車の一時入校許可証で気付いたのだが，今日は１２３２１な日でした．　なんとなくメモ
134 | ・「一番◯◯な〜」この「一番」は「最も」の意味ではあるが、数量表現としても捉えられなくはないのでOKとする
135 | 
136 | ・メモ：
137 | 	・パート3　　　
138 | 	・段落的な意味の数量とか　「1. はじめに　2. 関連研究」　　
139 | 	・ベスト３　（ベストなもの3つ）
140 | 	・2chのスレッド数っぽく　　〜なんだけど（31）
141 | 	・fnを見つけるのはめんどいので、「一時」みたいにとらずともとってもどっちもいいような場合は、とりあえず取る
142 | 	・〜の五十人の（うち）一人
143 | 	・23.6%増　　増、は数量の属性を表しているので〜　＜なんか、境界が曖昧じゃない？
144 | 	・取引銀行３行
145 | 	・チャンネル　チャンネル数を表す単位のときもある？
146 |       * 時間表現
147 |       * 計18時間 <- 18時間の属性を付加しているだけで、時間表現としては「18時間」というだけ　税込みとかと同じ理論
148 |       * 八年半ぶり　＜ぶりってつけないんだっけ？
149 |       * 曜日、「第四月曜」、「毎月2のつく日」
150 |       * 今月十一日 < 11日だけでok。対象としない
151 |       * 戦後：1945の年号として捉える
152 |       * 曖昧性：2/8　これは多分日付だけど…
153 |       * 月50時間、週2回：per月として認識
154 | 
155 | 
156 | 
157 | 
158 | 数を含まないもの
159 | 昨年、前年、来年、再来年、
160 | 先月、来月、
161 | 明日、昨日、同日、
162 | 正月
163 | 


--------------------------------------------------------------------------------
/src/abstime_expression_normalizer.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef ABSTIME_EXPRESSION_NORMALIZER_H_
 2 | #define ABSTIME_EXPRESSION_NORMALIZER_H_
 3 | #include <string>
 4 | #include "digit_utility.hpp"
 5 | #include "number_normalizer.hpp"
 6 | #include "normalizer_utility.hpp"
 7 | #include "normalizer_template.hpp"
 8 | #include <ux/ux.hpp>
 9 | 
10 | namespace abstime_expression_normalizer{
11 | 
 12 | struct AbstimeExpression : normalizer_utility::NormalizedExpressionTemplate{ // Absolute-time expression record seeded from one digit_utility::Number.
 13 |   AbstimeExpression(digit_utility::Number number) // Builds the record from `number`, which is taken by value.
 14 |           : normalizer_utility::NormalizedExpressionTemplate(number.original_expression, number.position_start, number.position_end), // Forwards the number's text and start/end positions to the base.
 15 |             org_value_lowerbound(number.value_lowerbound), // Keeps the number's own lower bound.
 16 |             org_value_upperbound(number.value_upperbound), // Keeps the number's own upper bound.
 17 |             value_lowerbound(normalizer_utility::Time(INFINITY)), // Lower time bound starts at Time(+INFINITY)...
 18 |             value_upperbound(normalizer_utility::Time(-INFINITY)), // ...and upper at Time(-INFINITY); presumably an "unset" sentinel pair -- TODO confirm against normalizer_utility::Time.
 19 |             ordinary(false) // Flag starts out false.
 20 |             {}
 21 | 
 22 |   double org_value_lowerbound, org_value_upperbound; // Numeric bounds copied from the source Number in the constructor.
 23 |   normalizer_utility::Time value_lowerbound, value_upperbound; // Time-valued bounds; not filled in by this header.
 24 |   bool ordinary; // NOTE(review): presumably marks ordinal expressions -- confirm against the normalizer implementation.
 25 | };
26 | 
27 |   
 28 | class LimitedAbstimeExpression : public normalizer_utility::LimitedExpressionTemplate{ // Dictionary-entry type paired with AbstimeExpression by AbstimeExpressionNormalizer.
 29 | public:
 30 |   template <class Archive>
 31 |   void serialize(Archive &ar){ // Registers this entry's fields with the archive `ar`.
 32 |     ar & MEMBER(pattern) & MEMBER(corresponding_time_position) & MEMBER(process_type) & MEMBER(ordinary) & MEMBER(option); // pattern, ordinary and option are not declared here -- presumably inherited from LimitedExpressionTemplate; MEMBER is defined outside this header.
 33 |   }
 34 | 
 35 |   std::vector<std::string> corresponding_time_position; // List of strings; serialized under this member's name.
 36 |   std::vector<std::string> process_type; // List of strings; serialized under this member's name.
 37 | };
38 | 
39 |   
 40 | class AbstimeExpressionNormalizer : public normalizer_template::NormalizerTemplate<AbstimeExpression, LimitedAbstimeExpression>{ // Normalizer specialised on AbstimeExpression / LimitedAbstimeExpression.
 41 | public:
 42 |   AbstimeExpressionNormalizer(const std::string& language) : NN(language) { language_ = language; init(); } // Constructs NN with `language`, stores it in language_ (not declared here -- presumably a NormalizerTemplate member), then calls init().
 43 | 
 44 | private: // Declarations only; the definitions are not in this header.
 45 |   void init(); // Invoked once from the constructor.
 46 |   void normalize_number(const std::string& text, std::vector<digit_utility::Number>& numbers); // `numbers` is a non-const reference, so it can be modified by the call.
 47 |   void revise_any_type_expression_by_matching_limited_expression(std::vector<AbstimeExpression>& abstimeexps, int& expression_id, LimitedAbstimeExpression matching_limited_abstime_expression); // The matched entry is passed by value; abstimeexps and expression_id are mutable references.
 48 |   void revise_any_type_expression_by_matching_prefix_counter(AbstimeExpression& any_type_expression, const LimitedAbstimeExpression& matching_limited_expression); // Only any_type_expression is mutable here.
 49 |   void revise_any_type_expression_by_number_modifier(AbstimeExpression& abstimeexp, const normalizer_utility::NumberModifier& number_modifier); // Only abstimeexp is mutable here.
 50 |   void delete_not_any_type_expression(std::vector<AbstimeExpression>& abstimeexps); // Takes the expression list by mutable reference.
 51 |   void fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector<AbstimeExpression>& abstimeexps); // Text is a read-only pficommon ustring; the expression list is mutable.
 52 |   
 53 |   number_normalizer::NumberNormalizer NN; // Number normalizer constructed with the same language string as this object.
 54 | };
55 | 
56 | } //namespace abstime_expression_normalizer
57 | 
58 | #endif //ABSTIME_EXPRESSION_NORMALIZER_H_
59 | 


--------------------------------------------------------------------------------
/src/dic/en/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/dic/en/.DS_Store


--------------------------------------------------------------------------------
/src/dic/ja/abstime_prefix_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"だいたい", "process_type":"about"}
 2 | {"pattern":"およそ", "process_type":"about"}
 3 | {"pattern":"ちょうど", "process_type":"none"}
 4 | {"pattern":"~", "process_type":"kara_prefix"}
 5 | {"pattern":"〜", "process_type":"kara_prefix"}
 6 | {"pattern":"～", "process_type":"kara_prefix"}
 7 | {"pattern":"-", "process_type":"kara_prefix"}
 8 | {"pattern":"−", "process_type":"kara_prefix"}
 9 | {"pattern":"ー", "process_type":"kara_prefix"}
10 | {"pattern":"から", "process_type":"kara_prefix"}
11 | {"pattern":"PM", "process_type":"gogo"}
12 | {"pattern":"AM", "process_type":"gozen"}
13 | {"pattern":"ＰＭ", "process_type":"gogo"}
14 | {"pattern":"ＡＭ", "process_type":"gozen"}
15 | {"pattern":"PM", "process_type":"gogo"}
16 | {"pattern":"AM", "process_type":"gozen"}
17 | {"pattern":"ＰＭ　", "process_type":"gogo"}
18 | {"pattern":"ＡＭ　", "process_type":"gozen"}
19 | {"pattern":"朝", "process_type":"asa"}
20 | {"pattern":"夜", "process_type":"yoru"}
21 | {"pattern":"深夜", "process_type":"sinnya"}
22 | 


--------------------------------------------------------------------------------
/src/dic/ja/abstime_suffix_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"以前", "process_type":"or_less"}
 2 | {"pattern":"まで", "process_type":"made"}
 3 | {"pattern":"迄", "process_type":"or_less"}
 4 | {"pattern":"より前", "process_type":"less"}
 5 | {"pattern":"以降", "process_type":"or_over"}
 6 | {"pattern":"より後", "process_type":"over"}
 7 | {"pattern":"~", "process_type":"kara_suffix"}
 8 | {"pattern":"〜", "process_type":"kara_suffix"}
 9 | {"pattern":"～", "process_type":"kara_suffix"}
10 | {"pattern":"-", "process_type":"kara_suffix"}
11 | {"pattern":"−", "process_type":"kara_suffix"}
12 | {"pattern":"ー", "process_type":"kara_suffix"}
13 | {"pattern":"から", "process_type":"kara_suffix"}
14 | {"pattern":"くらい", "process_type":"about"}
15 | {"pattern":"ばかり", "process_type":"about"}
16 | {"pattern":"前後", "process_type":"about"}
17 | {"pattern":"近く", "process_type":"about"}
18 | {"pattern":"頃", "process_type":"about"}
19 | {"pattern":"ごろ", "process_type":"about"}
20 | {"pattern":"頭", "process_type":"zenhan"}
21 | {"pattern":"前半", "process_type":"zenhan"}
22 | {"pattern":"前期", "process_type":"zenhan"}
23 | {"pattern":"初頭", "process_type":"zenhan"}
24 | {"pattern":"初期", "process_type":"zenhan"}
25 | {"pattern":"初め", "process_type":"zenhan"}
26 | {"pattern":"始め", "process_type":"zenhan"}
27 | {"pattern":"はじめ", "process_type":"zenhan"}
28 | {"pattern":"後半", "process_type":"kouhan"}
29 | {"pattern":"後期", "process_type":"kouhan"}
30 | {"pattern":"終盤", "process_type":"kouhan"}
31 | {"pattern":"終わり", "process_type":"kouhan"}
32 | {"pattern":"末", "process_type":"kouhan"}
33 | {"pattern":"半ば", "process_type":"nakaba"}
34 | {"pattern":"中期", "process_type":"nakaba"}
35 | {"pattern":"中盤", "process_type":"nakaba"}
36 | {"pattern":"中頃", "process_type":"nakaba"}
37 | {"pattern":"中ごろ", "process_type":"nakaba"}
38 | {"pattern":"中旬", "process_type":"nakaba"}
39 | {"pattern":"上旬", "process_type":"joujun"}
40 | {"pattern":"中旬", "process_type":"tyujun"}
41 | {"pattern":"下旬", "process_type":"gejun"}
42 | {"pattern":"PM", "process_type":"gogo"}
43 | {"pattern":"AM", "process_type":"gozen"}
44 | {"pattern":"ＰＭ", "process_type":"gogo"}
45 | {"pattern":"ＡＭ", "process_type":"gozen"}
46 | {"pattern":"PM", "process_type":"gogo"}
47 | {"pattern":"AM", "process_type":"gozen"}
48 | {"pattern":"　ＰＭ", "process_type":"gogo"}
49 | {"pattern":"　ＡＭ", "process_type":"gozen"}
50 | 


--------------------------------------------------------------------------------
/src/dic/ja/chinese_character.txt:
--------------------------------------------------------------------------------
 1 | {"character":"〇", "value":0, "NotationType":"09"}
 2 | {"character":"一", "value":1, "NotationType":"09"}
 3 | {"character":"二", "value":2, "NotationType":"09"}
 4 | {"character":"三", "value":3, "NotationType":"09"}
 5 | {"character":"四", "value":4, "NotationType":"09"}
 6 | {"character":"五", "value":5, "NotationType":"09"}
 7 | {"character":"六", "value":6, "NotationType":"09"}
 8 | {"character":"七", "value":7, "NotationType":"09"}
 9 | {"character":"八", "value":8, "NotationType":"09"}
10 | {"character":"九", "value":9, "NotationType":"09"}
11 | {"character":"零", "value":0, "NotationType":"09"}
12 | {"character":"十", "value":1, "NotationType":"sen"}
13 | {"character":"百", "value":2, "NotationType":"sen"}
14 | {"character":"千", "value":3, "NotationType":"sen"}
15 | {"character":"万", "value":4, "NotationType":"man"}
16 | {"character":"億", "value":8, "NotationType":"man"}
17 | {"character":"兆", "value":12, "NotationType":"man"}
18 | {"character":"京", "value":16, "NotationType":"man"}
19 | {"character":"壱", "value":1, "NotationType":"09"}
20 | {"character":"弐", "value":2, "NotationType":"09"}
21 | {"character":"参", "value":3, "NotationType":"09"}
22 | {"character":"伍", "value":5, "NotationType":"09"}
23 | {"character":"拾", "value":1, "NotationType":"sen"}
24 | {"character":"佰", "value":2, "NotationType":"sen"}
25 | {"character":"阡", "value":3, "NotationType":"sen"}
26 | {"character":"萬", "value":4, "NotationType":"man"}


--------------------------------------------------------------------------------
/src/dic/ja/duration_expression_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"世紀", "corresponding_time_position":["seiki"], "process_type":[], "ordinary":false, "option":""}
 2 | {"pattern":"世紀半", "corresponding_time_position":["seiki"], "process_type":["han"], "ordinary":false, "option":""}
 3 | {"pattern":"年間", "corresponding_time_position":["y"], "process_type":[], "ordinary":false, "option":""}
 4 | {"pattern":"年間半", "corresponding_time_position":["y"], "process_type":["han"], "ordinary":false, "option":""}
 5 | {"pattern":"年", "corresponding_time_position":["y"], "process_type":[], "ordinary":false, "option":""}
 6 | {"pattern":"年半", "corresponding_time_position":["y"], "process_type":["han"], "ordinary":false, "option":""}
 7 | {"pattern":"ヶ月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""}
 8 | {"pattern":"ヶ月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""}
 9 | {"pattern":"か月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""}
10 | {"pattern":"か月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""}
11 | {"pattern":"カ月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""}
12 | {"pattern":"カ月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""}
13 | {"pattern":"ヵ月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""}
14 | {"pattern":"ヵ月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""}
15 | {"pattern":"ケ月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""}
16 | {"pattern":"ケ月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""}
17 | {"pattern":"箇月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""}
18 | {"pattern":"箇月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""}
19 | {"pattern":"週間", "corresponding_time_position":["w"], "process_type":[], "ordinary":false, "option":""}
20 | {"pattern":"週間半", "corresponding_time_position":["w"], "process_type":["han"], "ordinary":false, "option":""}
21 | {"pattern":"日間", "corresponding_time_position":["d"], "process_type":[], "ordinary":false, "option":""}
22 | {"pattern":"日間半", "corresponding_time_position":["d"], "process_type":["han"], "ordinary":false, "option":""}
23 | {"pattern":"時間", "corresponding_time_position":["h"], "process_type":[], "ordinary":false, "option":""}
24 | {"pattern":"時間半", "corresponding_time_position":["h"], "process_type":["han"], "ordinary":false, "option":""}
25 | {"pattern":"分間", "corresponding_time_position":["mn"], "process_type":[], "ordinary":false, "option":""}
26 | {"pattern":"分間半", "corresponding_time_position":["mn"], "process_type":["han"], "ordinary":false, "option":""}
27 | {"pattern":"秒間", "corresponding_time_position":["s"], "process_type":[], "ordinary":false, "option":""}
28 | {"pattern":"秒間半", "corresponding_time_position":["s"], "process_type":["han"], "ordinary":false, "option":""}
29 | {"pattern":"年", "corresponding_time_position":["y"], "process_type":[], "ordinary":false, "option":""}
30 | {"pattern":"年半", "corresponding_time_position":["y"], "process_type":["han"], "ordinary":false, "option":""}
31 | {"pattern":"月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""}
32 | {"pattern":"月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""}
33 | {"pattern":"週", "corresponding_time_position":["w"], "process_type":[], "ordinary":false, "option":""}
34 | {"pattern":"週半", "corresponding_time_position":["w"], "process_type":["han"], "ordinary":false, "option":""}
35 | {"pattern":"日", "corresponding_time_position":["d"], "process_type":[], "ordinary":false, "option":""}
36 | {"pattern":"日半", "corresponding_time_position":["d"], "process_type":["han"], "ordinary":false, "option":""}
37 | {"pattern":"分", "corresponding_time_position":["mn"], "process_type":[], "ordinary":false, "option":""}
38 | {"pattern":"分半", "corresponding_time_position":["mn"], "process_type":["han"], "ordinary":false, "option":""}
39 | {"pattern":"秒", "corresponding_time_position":["s"], "process_type":[], "ordinary":false, "option":""}
40 | {"pattern":"秒半", "corresponding_time_position":["s"], "process_type":["han"], "ordinary":false, "option":""}
41 | {"pattern":"年ǂヶ月", "corresponding_time_position":["y", "m"], "process_type":[], "ordinary":false, "option":""}
42 | {"pattern":"年ǂヶ月半", "corresponding_time_position":["y", "m"], "process_type":["han"], "ordinary":false, "option":""}
43 | {"pattern":"時間ǂ分", "corresponding_time_position":["h", "mn"], "process_type":[], "ordinary":false, "option":""}
44 | {"pattern":"時間ǂ分半", "corresponding_time_position":["h", "mn"], "process_type":["han"], "ordinary":false, "option":""}
45 | {"pattern":"分ǂ秒", "corresponding_time_position":["mn", "s"], "process_type":[], "ordinary":false, "option":""}
46 | {"pattern":"分ǂ秒半", "corresponding_time_position":["mn", "s"], "process_type":["han"], "ordinary":false, "option":""}
47 | 


--------------------------------------------------------------------------------
/src/dic/ja/duration_prefix_counter_json.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/dic/ja/duration_prefix_counter_json.txt


--------------------------------------------------------------------------------
/src/dic/ja/duration_prefix_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"約", "process_type":"about"}
 2 | {"pattern":"だいたい", "process_type":"about"}
 3 | {"pattern":"ほぼ", "process_type":"about"}
 4 | {"pattern":"およそ", "process_type":"about"}
 5 | {"pattern":"ほとんど", "process_type":"about"}
 6 | {"pattern":"全", "process_type":"none"}
 7 | {"pattern":"ちょうど", "process_type":"none"}
 8 | {"pattern":"第", "process_type":"ordinary"}
 9 | {"pattern":"~", "process_type":"kara_prefix"}
10 | {"pattern":"〜", "process_type":"kara_prefix"}
11 | {"pattern":"～", "process_type":"kara_prefix"}
12 | {"pattern":"-", "process_type":"kara_prefix"}
13 | {"pattern":"−", "process_type":"kara_prefix"}
14 | {"pattern":"ー", "process_type":"kara_prefix"}
15 | {"pattern":"から", "process_type":"kara_prefix"}
16 | 


--------------------------------------------------------------------------------
/src/dic/ja/duration_suffix_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"目", "process_type":"ordinary"}
 2 | {"pattern":"以下", "process_type":"or_less"}
 3 | {"pattern":"以前", "process_type":"or_less"}
 4 | {"pattern":"以内", "process_type":"or_less"}
 5 | {"pattern":"まで", "process_type":"made"}
 6 | {"pattern":"迄", "process_type":"or_less"}
 7 | {"pattern":"未満", "process_type":"less"}
 8 | {"pattern":"以上", "process_type":"or_over"}
 9 | {"pattern":"以降", "process_type":"or_over"}
10 | {"pattern":"超", "process_type":"or_over"}
11 | {"pattern":"越え", "process_type":"or_over"}
12 | {"pattern":"超え", "process_type":"or_over"}
13 | {"pattern":"~", "process_type":"kara_suffix"}
14 | {"pattern":"〜", "process_type":"kara_suffix"}
15 | {"pattern":"～", "process_type":"kara_suffix"}
16 | {"pattern":"-", "process_type":"kara_suffix"}
17 | {"pattern":"−", "process_type":"kara_suffix"}
18 | {"pattern":"ー", "process_type":"kara_suffix"}
19 | {"pattern":"から", "process_type":"kara_suffix"}
20 | {"pattern":"くらい", "process_type":"about"}
21 | {"pattern":"ばかり", "process_type":"about"}
22 | {"pattern":"前後", "process_type":"about"}
23 | {"pattern":"程度", "process_type":"about"}
24 | {"pattern":"ほど", "process_type":"about"}
25 | {"pattern":"近く", "process_type":"about"}
26 | {"pattern":"頃", "process_type":"about"}
27 | {"pattern":"ごろ", "process_type":"about"}
28 | {"pattern":"余り", "process_type":"kyou"}
29 | {"pattern":"強", "process_type":"kyou"}
30 | {"pattern":"弱", "process_type":"jaku"}
31 | {"pattern":"台", "process_type":"dai"}
32 | {"pattern":"代", "process_type":"dai"}
33 | {"pattern":"毎", "process_type":"per"}
34 | 


--------------------------------------------------------------------------------
/src/dic/ja/inappropriate_strings_json.txt:
--------------------------------------------------------------------------------
 1 | {"str":"一切"}
 2 | {"str":"一部"}
 3 | {"str":"一連"}
 4 | {"str":"三振"}
 5 | {"str":"一段"}
 6 | {"str":"一体"}
 7 | {"str":"九州"}
 8 | {"str":"四国"}
 9 | {"str":"一種"}
10 | {"str":"一番"}
11 | 


--------------------------------------------------------------------------------
/src/dic/ja/num_prefix_counter_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"￥", "counter":"円", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"}
 2 | {"pattern":"¥", "counter":"円", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"}
 3 | {"pattern":"$", "counter":"ドル", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"}
 4 | {"pattern":"＄", "counter":"ドル", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"}
 5 | {"pattern":"€", "counter":"ユーロ", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"}
 6 | {"pattern":"£", "counter":"ポンド", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"}
 7 | {"pattern":"小さじ", "counter":"小さじ", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"}
 8 | {"pattern":"大さじ", "counter":"大さじ", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"}
 9 | {"pattern":"時速", "counter":"/h", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"}
10 | {"pattern":"毎時", "counter":"/h", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"}
11 | {"pattern":"分速", "counter":"/m", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"}
12 | {"pattern":"毎分", "counter":"/m", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"}
13 | {"pattern":"秒速", "counter":"/s", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"}
14 | {"pattern":"毎秒", "counter":"/s", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"}
15 | {"pattern":"週", "counter":"/week", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"}
16 | {"pattern":"月", "counter":"/month", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"}
17 | {"pattern":"年", "counter":"/year", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"}
18 | {"pattern":"最大", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"saidai"}
19 | {"pattern":"最長", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"saityou"}
20 | {"pattern":"最高", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"saikou"}
21 | {"pattern":"華氏", "counter":"℉", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"}
22 | {"pattern":"摂氏", "counter":"℃", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"}
23 | {"pattern":"風速", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"fusoku"}
24 | {"pattern":"水温", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"suion"}
25 | {"pattern":"北緯", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"hokui"}
26 | {"pattern":"南緯", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"nanni"}
27 | {"pattern":"東経", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"toukei"}
28 | {"pattern":"西経", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"seikei"}
29 | 


--------------------------------------------------------------------------------
/src/dic/ja/num_prefix_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"第", "process_type":"ordinary"}
 2 | {"pattern":"約", "process_type":"about"}
 3 | {"pattern":"だいたい", "process_type":"about"}
 4 | {"pattern":"ほぼ", "process_type":"about"}
 5 | {"pattern":"およそ", "process_type":"about"}
 6 | {"pattern":"ほとんど", "process_type":"about"}
 7 | {"pattern":"全", "process_type":"none"}
 8 | {"pattern":"ちょうど", "process_type":"none"}
 9 | {"pattern":"~", "process_type":"kara_prefix"}
10 | {"pattern":"〜", "process_type":"kara_prefix"}
11 | {"pattern":"～", "process_type":"kara_prefix"}
12 | {"pattern":"-", "process_type":"kara_prefix"}
13 | {"pattern":"−", "process_type":"kara_prefix"}
14 | {"pattern":"ー", "process_type":"kara_prefix"}
15 | {"pattern":"から", "process_type":"kara_prefix"}
16 | 


--------------------------------------------------------------------------------
/src/dic/ja/num_suffix_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"目", "process_type":"ordinary"}
 2 | {"pattern":"以下", "process_type":"or_less"}
 3 | {"pattern":"以前", "process_type":"or_less"}
 4 | {"pattern":"以内", "process_type":"or_less"}
 5 | {"pattern":"まで", "process_type":"made"}
 6 | {"pattern":"迄", "process_type":"or_less"}
 7 | {"pattern":"未満", "process_type":"less"}
 8 | {"pattern":"以上", "process_type":"or_over"}
 9 | {"pattern":"以降", "process_type":"or_over"}
10 | {"pattern":"超", "process_type":"or_over"}
11 | {"pattern":"越え", "process_type":"or_over"}
12 | {"pattern":"超え", "process_type":"or_over"}
13 | {"pattern":"~", "process_type":"kara_suffix"}
14 | {"pattern":"〜", "process_type":"kara_suffix"}
15 | {"pattern":"～", "process_type":"kara_suffix"}
16 | {"pattern":"-", "process_type":"kara_suffix"}
17 | {"pattern":"−", "process_type":"kara_suffix"}
18 | {"pattern":"ー", "process_type":"kara_suffix"}
19 | {"pattern":"から", "process_type":"kara_suffix"}
20 | {"pattern":"くらい", "process_type":"about"}
21 | {"pattern":"ばかり", "process_type":"about"}
22 | {"pattern":"前後", "process_type":"about"}
23 | {"pattern":"程度", "process_type":"about"}
24 | {"pattern":"ほど", "process_type":"about"}
25 | {"pattern":"近く", "process_type":"about"}
26 | {"pattern":"頃", "process_type":"about"}
27 | {"pattern":"ごろ", "process_type":"about"}
28 | {"pattern":"余り", "process_type":"kyou"}
29 | {"pattern":"強", "process_type":"kyou"}
30 | {"pattern":"弱", "process_type":"jaku"}
31 | {"pattern":"台", "process_type":"dai"}
32 | {"pattern":"代", "process_type":"dai"}
33 | {"pattern":"毎", "process_type":"per"}
34 | {"pattern":"半", "process_type":"han"}
35 | {"pattern":"／時", "process_type":"/h"}
36 | {"pattern":"／分", "process_type":"/min"}
37 | {"pattern":"／秒", "process_type":"/sec"}
38 | {"pattern":"/時", "process_type":"/h"}
39 | {"pattern":"/分", "process_type":"/min"}
40 | {"pattern":"/秒", "process_type":"/sec"}
41 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/abstime_date.txt:
--------------------------------------------------------------------------------
 1 | 世紀
 2 | seiki
 3 | 年
 4 | y
 5 | 年*月
 6 | y m
 7 | 年*月*日
 8 | y m d
 9 | 月
10 | m
11 | 月*日
12 | m d
13 | 日
14 | d
15 | /*
16 | m d
17 | /*/*
18 | y m d
19 | ／*
20 | m d
21 | ／*／*
22 | y m d
23 | -*-*
24 | y m d
25 | −*−*
26 | y m d
27 | ー*ー*
28 | y m d
29 | .*
30 | m d
31 | .*.*
32 | y m d
33 | ．*
34 | m d
35 | ．*．*
36 | y m d
37 | ・*
38 | m d
39 | ・*・*
40 | y m d
41 | ，*
42 | m d
43 | ，*，*
44 | y m d
45 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/abstime_dayweek.txt:
--------------------------------------------------------------------------------
 1 | 月	Mon
 2 | 火	Tue
 3 | 水	Wed
 4 | 木	Thu
 5 | 金	Fri
 6 | 土	Sat
 7 | 日	Sun
 8 | 月曜	Mon
 9 | 火曜	Tue
10 | 水曜	Wed
11 | 木曜	Thu
12 | 金曜	Fri
13 | 土曜	Sat
14 | 日曜	Sun
15 | 月曜日	Mon
16 | 火曜日	Tue
17 | 水曜日	Wed
18 | 木曜日	Thu
19 | 金曜日	Fri
20 | 土曜日	Sat
21 | 日曜日	Sun
22 | Monday	Mon
23 | Tuesday	Tue
24 | Wednesday	Wed
25 | Thursday	Thu
26 | Friday	Fri
27 | Saturday	Sat
28 | Sunday	Sun
29 | Mon	Mon
30 | Tue	Tue
31 | Wed	Wed
32 | Thu	Thu
33 | Fri	Fri
34 | Sat	Sat
35 | Sun	Sun
36 | Ｍｏｎｄａｙ	Mon
37 | Ｔｕｅｓｄａｙ	Tue
38 | Ｗｅｄｎｅｓｄａｙ	Wed
39 | Ｔｈｕｒｓｄａｙ	Thu
40 | Ｆｒｉｄａｙ	Fri
41 | Ｓａｔｕｒｄａｙ	Sat
42 | Ｓｕｎｄａｙ	Sun
43 | Ｍｏｎ	Mon
44 | Ｔｕｅ	Tue
45 | Ｗｅｄ	Wed
46 | Ｔｈｕ	Thu
47 | Ｆｒｉ	Fri
48 | Ｓａｔ	Sat
49 | Ｓｕｎ	Sun


--------------------------------------------------------------------------------
/src/dic/ja/raw/abstime_dayweek_pattern.txt:
--------------------------------------------------------------------------------
1 | 
2 | 　
3 |  
4 | *
5 | (*)
6 | （*）
7 |  *
8 | 　*　
9 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/abstime_nengou.txt:
--------------------------------------------------------------------------------
  1 | 西暦	0
  2 | 飛鳥時代	644
  3 | 白雉	649
  4 | 朱鳥	685
  5 | 大宝	700
  6 | 慶雲	703
  7 | 和銅	707
  8 | 奈良時代	714
  9 | 養老	716
 10 | 神亀	723
 11 | 天平	728
 12 | 天平感宝	748
 13 | 天平勝宝	748
 14 | 天平宝字	756
 15 | 天平神護	764
 16 | 神護景雲	766
 17 | 宝亀	769
 18 | 天応	780
 19 | 延暦	781
 20 | 平安時代	805
 21 | 弘仁	809
 22 | 天長	823
 23 | 承和	833
 24 | 嘉祥	847
 25 | 仁寿	850
 26 | 斎衡	853
 27 | 天安	856
 28 | 貞観	858
 29 | 元慶	876
 30 | 仁和	884
 31 | 寛平	888
 32 | 昌泰	897
 33 | 延喜	900
 34 | 延長	922
 35 | 承平	930
 36 | 天慶	937
 37 | 天暦	946
 38 | 天徳	956
 39 | 応和	960
 40 | 康保	963
 41 | 安和	967
 42 | 天禄	969
 43 | 天延	972
 44 | 貞元	975
 45 | 天元	977
 46 | 永観	982
 47 | 寛和	984
 48 | 永延	986
 49 | 永祚	988
 50 | 正暦	989
 51 | 長徳	994
 52 | 長保	998
 53 | 寛弘	1003
 54 | 長和	1011
 55 | 寛仁	1016
 56 | 治安	1020
 57 | 万寿	1023
 58 | 長元	1027
 59 | 長暦	1036
 60 | 長久	1039
 61 | 寛徳	1043
 62 | 永承	1045
 63 | 天喜	1052
 64 | 康平	1057
 65 | 治暦	1064
 66 | 延久	1068
 67 | 承保	1073
 68 | 承暦	1076
 69 | 永保	1080
 70 | 応徳	1083
 71 | 寛治	1086
 72 | 嘉保	1093
 73 | 永長	1095
 74 | 承徳	1096
 75 | 康和	1098
 76 | 長治	1103
 77 | 嘉承	1105
 78 | 天仁	1107
 79 | 天永	1109
 80 | 永久	1112
 81 | 元永	1117
 82 | 保安	1119
 83 | 天治	1123
 84 | 大治	1125
 85 | 天承	1130
 86 | 長承	1131
 87 | 保延	1134
 88 | 永治	1140
 89 | 康治	1141
 90 | 天養	1143
 91 | 久安	1144
 92 | 仁平	1150
 93 | 久寿	1153
 94 | 保元	1155
 95 | 平治	1158
 96 | 永暦	1159
 97 | 応保	1160
 98 | 長寛	1162
 99 | 永万	1164
100 | 仁安	1165
101 | 嘉応	1168
102 | 承安	1170
103 | 安元	1174
104 | 治承	1176
105 | 養和	1180
106 | 寿永	1181
107 | 元暦	1183
108 | 文治	1184
109 | 鎌倉時代	1189
110 | 正治	1198
111 | 建仁	1200
112 | 元久	1203
113 | 建永	1205
114 | 承元	1206
115 | 建暦	1210
116 | 建保	1212
117 | 承久	1218
118 | 貞応	1221
119 | 元仁	1223
120 | 嘉禄	1224
121 | 安貞	1226
122 | 寛喜	1228
123 | 貞永	1231
124 | 天福	1232
125 | 文暦	1233
126 | 嘉禎	1234
127 | 暦仁	1237
128 | 延応	1238
129 | 仁治	1239
130 | 寛元	1242
131 | 宝治	1246
132 | 建長	1248
133 | 康元	1255
134 | 正嘉	1256
135 | 正元	1258
136 | 文応	1259
137 | 弘長	1260
138 | 文永	1263
139 | 建治	1274
140 | 弘安	1277
141 | 正応	1287
142 | 永仁	1292
143 | 正安	1298
144 | 乾元	1301
145 | 嘉元	1302
146 | 徳治	1305
147 | 延慶	1307
148 | 応長	1310
149 | 正和	1311
150 | 文保	1316
151 | 元応	1318
152 | 元亨	1320
153 | 正中	1323
154 | 嘉暦	1325
155 | 南北朝時代	1328
156 | 北朝	1328
157 | 南朝	1330
158 | 北朝	1331
159 | 南朝	1333
160 | 北朝	1333
161 | 南朝	1335
162 | 北朝	1337
163 | 南朝	1339
164 | 北朝	1341
165 | 北朝	1344
166 | 南朝	1345
167 | 北朝	1349
168 | 北朝	1351
169 | 北朝	1355
170 | 北朝	1360
171 | 北朝	1361
172 | 北朝	1367
173 | 南朝	1369
174 | 南朝	1371
175 | 南朝	1374
176 | 北朝	1374
177 | 北朝	1378
178 | 南朝	1380
179 | 北朝	1380
180 | 南朝	1383
181 | 北朝	1383
182 | 北朝	1386
183 | 北朝	1388
184 | 北朝	1389
185 | 室町時代	1391
186 | 応永	1393
187 | 正長	1427
188 | 永享	1428
189 | 嘉吉	1440
190 | 文安	1443
191 | 宝徳	1448
192 | 享徳	1451
193 | 康正	1454
194 | 長禄	1456
195 | 寛正	1459
196 | 文正	1465
197 | 応仁	1466
198 | 文明	1468
199 | 長享	1486
200 | 延徳	1488
201 | 明応	1491
202 | 文亀	1500
203 | 永正	1503
204 | 大永	1520
205 | 享禄	1527
206 | 天文	1531
207 | 弘治	1554
208 | 永禄	1557
209 | 元亀	1569
210 | 安土桃山時代	1572
211 | 文禄	1591
212 | 慶長	1595
213 | 江戸時代	1614
214 | 寛永	1623
215 | 正保	1643
216 | 慶安	1647
217 | 承応	1651
218 | 明暦	1654
219 | 万治	1657
220 | 寛文	1660
221 | 延宝	1672
222 | 天和	1680
223 | 貞享	1683
224 | 元禄	1687
225 | 宝永	1703
226 | 正徳	1710
227 | 享保	1715
228 | 元文	1735
229 | 寛保	1740
230 | 延享	1743
231 | 寛延	1747
232 | 宝暦	1750
233 | 明和	1763
234 | 安永	1771
235 | 天明	1780
236 | 寛政	1788
237 | 享和	1800
238 | 文化	1803
239 | 文政	1817
240 | 天保	1829
241 | 弘化	1843
242 | 嘉永	1847
243 | 安政	1853
244 | 万延	1859
245 | 文久	1860
246 | 元治	1863
247 | 慶応	1864
248 | 明治	1867
249 | 大正	1911
250 | 昭和	1925
251 | S	1925
252 | Ｓ	1925
253 | 戦後	1945
254 | 平成	1988
255 | H	1988
256 | Ｈ	1988


--------------------------------------------------------------------------------
/src/dic/ja/raw/abstime_prefix_counter.txt:
--------------------------------------------------------------------------------
1 | 紀元前	kigenzen
2 | 午前	gozen
3 | 午後	gogo
4 | AM	gozen
5 | PM	gogo


--------------------------------------------------------------------------------
/src/dic/ja/raw/abstime_settouji.txt:
--------------------------------------------------------------------------------
 1 | だいたい	about
 2 | およそ	about
 3 | ちょうど	none
 4 | ~	kara_prefix
 5 | 〜	kara_prefix
 6 | ～	kara_prefix
 7 | -	kara_prefix
 8 | −	kara_prefix
 9 | ー	kara_prefix
10 | から	kara_prefix
11 | PM	gogo
12 | AM	gozen
13 | ＰＭ	gogo
14 | ＡＭ	gozen
15 | PM 	gogo
16 | AM 	gozen
17 | ＰＭ　	gogo
18 | ＡＭ　	gozen
19 | 朝	asa
20 | 夜	yoru
21 | 深夜	sinnya


--------------------------------------------------------------------------------
/src/dic/ja/raw/abstime_setubiji.txt:
--------------------------------------------------------------------------------
 1 | 以前	>=
 2 | まで	made
 3 | 迄	>=
 4 | より前	>
 5 | 以降	<=
 6 | より後	<
 7 | ~	kara_suffix
 8 | 〜	kara_suffix
 9 | ～	kara_suffix
10 | -	kara_suffix
11 | −	kara_suffix
12 | ー	kara_suffix
13 | から	kara_suffix
14 | くらい	about
15 | ばかり	about
16 | 前後	about
17 | 近く	about
18 | 頃	about
19 | ごろ	about
20 | 頭	zenhan
21 | 前半	zenhan
 22 | 前期	zenhan
23 | 初頭	zenhan
24 | 初期	zenhan
25 | 初め	zenhan
26 | 始め	zenhan
27 | はじめ	zenhan
28 | 後半	kouhan
29 | 後期	kouhan
30 | 終盤	kouhan
31 | 終わり	kouhan
32 | 末	kouhan
33 | 半ば	nakaba
34 | 中期	nakaba
35 | 中盤	nakaba
36 | 中頃	nakaba
37 | 中ごろ	nakaba
38 | 中旬	nakaba
39 | 上旬	joujun
40 | 中旬	tyujun
41 | 下旬	gejun
42 | PM	gogo
43 | AM	gozen
44 | ＰＭ	gogo
45 | ＡＭ	gozen
46 |  PM	gogo
47 |  AM	gozen
48 | 　ＰＭ	gogo
49 | 　ＡＭ	gozen
50 | 朝	asa
51 | 夜	yoru


--------------------------------------------------------------------------------
/src/dic/ja/raw/abstime_time.txt:
--------------------------------------------------------------------------------
 1 | 時
 2 | h
 3 | 時半
 4 | h
 5 | 時*分
 6 | h mn
 7 | 時*分*秒
 8 | h mn s
 9 | 分*秒
10 | mn s
11 | :*
12 | h mn
13 | ：*
14 | h mn
15 | :*:*
16 | h mn s
17 | ：*：*
18 | h mn s


--------------------------------------------------------------------------------
/src/dic/ja/raw/create_dic_abstime.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | 
 4 | def create_list_expression(lst) :
 5 |   if len(lst)==0 : return "[]"
 6 |   ret = "["
 7 |   for l in lst :
 8 |     ret += "\""+l+"\""
 9 |     ret += ", "
10 |   ret = ret[:-2] + "]"
11 |   return ret
12 | 	
def create_process_type(str):
    """Return the list of process-type tags that apply to a pattern string.

    "han" is reported when the pattern ends with 半, and "unclear" when the
    pattern is exactly one of the single-separator patterns listed below.
    """
    # The hyphen form ("3-3" read as March 3rd) is deliberately not listed
    # because it seems unlikely to occur.
    ambiguous_patterns = ("/ǂ", "／ǂ", ".ǂ", "・ǂ", "．ǂ", "，ǂ")
    checks = (
        ("han", str.endswith("半")),
        ("unclear", str in ambiguous_patterns),
    )
    return [tag for tag, applies in checks if applies]
19 | 	
# load dayweeks (original note: handling of expressions such as 午後 is also
# meant to live here)
#
# Builds lst_dayweek, a list of [text, type] pairs: every day-of-week word from
# abstime_dayweek.txt is substituted into every template from
# abstime_dayweek_pattern.txt, where "*" marks the substitution point.

# The template file is the same for every day-of-week word, so read it once
# instead of reopening it (and leaking the handle) on each iteration.
with open("abstime_dayweek_pattern.txt", "r") as fin2:
    dayweek_patterns = [pattern.rstrip("\n") for pattern in fin2]

lst_dayweek = []
with open("abstime_dayweek.txt", "r") as fin:
    for line in fin:
        # Each line is "<day-of-week word> <type>", whitespace separated.
        dayweek, dayweek_type = line.rstrip().split()
        for pattern in dayweek_patterns:
            # A template without "*" contains no day-of-week word, so it is
            # stored with an empty type.
            tmp_type = dayweek_type if "*" in pattern else ""
            lst_dayweek.append([pattern.replace("*", dayweek), tmp_type])
31 | 			
#create date expression
# abstime_date.txt stores each entry on two lines: the pattern, then the
# whitespace-separated time positions the pattern fills (e.g. "y m d").
# "*" in the raw file is rewritten to "ǂ" in the output.
with open("abstime_date.txt") as fin:
    date_lines = [line.rstrip().replace("*", "ǂ") for line in fin]

# One JSON object per output line.  print() is always called with a single
# argument, so it behaves the same on Python 2 and is also valid Python 3
# syntax.
entry_format = ('{"pattern":"%s", "corresponding_time_position":%s, '
                '"process_type":%s, "ordinary":false, "option":"%s"}')

# Pair line 1 with line 2, line 3 with line 4, ...; a trailing unpaired line
# produces no output.
for date_pattern, positions in zip(date_lines[0::2], date_lines[1::2]):
    corresponding_time_position = create_list_expression(positions.split())
    process_type = create_process_type(date_pattern)
    print(entry_format % (date_pattern, corresponding_time_position,
                          create_list_expression(process_type), ""))
    if "d" not in positions:
        continue
    # Patterns whose positions contain "d" also get one variant per
    # day-of-week expression appended to the pattern.
    for dayweek in lst_dayweek:
        # Trim trailing ASCII and full-width spaces; the entry is updated in
        # place, exactly as before.
        dayweek[0] = dayweek[0].rstrip().rstrip("　")
        if dayweek[0] == "":
            continue
        process_type = create_process_type(date_pattern + dayweek[0])
        print(entry_format % (date_pattern + dayweek[0],
                              corresponding_time_position,
                              create_list_expression(process_type),
                              dayweek[1]))
51 | 
52 | 
#create time expression
# abstime_time.txt stores each entry on two lines: the pattern, then the
# whitespace-separated time positions it fills (e.g. "h mn s").
# "*" in the raw file is rewritten to "ǂ" in the output.
with open("abstime_time.txt") as fin:
    time_lines = [line.rstrip().replace("*", "ǂ") for line in fin]

# Pair line 1 with line 2, line 3 with line 4, ...; a trailing unpaired line
# produces no output.  print() gets a single argument so the call behaves the
# same on Python 2 and is also valid Python 3 syntax.
for time_pattern, positions in zip(time_lines[0::2], time_lines[1::2]):
    corresponding_time_position = create_list_expression(positions.split())
    process_type = create_process_type(time_pattern)
    print('{"pattern":"' + time_pattern
          + '", "corresponding_time_position":' + corresponding_time_position
          + ', "process_type":' + create_list_expression(process_type)
          + ', "ordinary":false, "option":""}')
66 | 
67 | """
68 | #普通にやろうとすると18MBほどになる。検索時間はあまり変わらないが、最初の読み込み時に5秒程時間がかかる。どうする？
69 | 
70 | #create date+time expression
71 | fin = open("abstime_date.txt")
72 | lst = []
73 | for line in fin.readlines() :
74 |   l = line.rstrip()
75 |   l = l.replace("*","ǂ")
76 |   lst.append(l)
77 |   if len(lst) == 2 :
78 | 		if lst[1].find("d") != -1 :
79 | 			for dayweek in lst_dayweek :
80 | 				fin2 = open("abstime_time.txt")
81 | 				lst2 = []
82 | 				for line2 in fin2.readlines() :
83 | 					l2 = l2.replace("*","ǂ")
84 | 					lst2.append(l2)
85 | 					if len(lst2) == 2 :
86 | 						if lst2[1].find("h") == -1 : continue
87 | 						tmp = lst[1].split()
88 | 						tmp2 = lst2[1].split()
89 | 						tmp.extend(tmp2)
90 | 						corresponding_time_position = create_list_expression(tmp)
91 | 						process_type = create_process_type(lst[0]+dayweek+lst2[0])
92 | 						print "{\"pattern\":\""+lst[0]+dayweek+"ǂ"+lst2[0]+"\", \"corresponding_time_position\":"+corresponding_time_position+", \"process_type\":"+create_list_expression(process_type)+", \"ordinary\":false, \"option\":\"\"}"
93 | 						lst2 = []
94 | 		lst = []
95 | fin.close()
96 | """


--------------------------------------------------------------------------------
/src/dic/ja/raw/create_dic_abstime_date+time.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | 
 4 | def create_list_expression(lst) :
 5 |   if len(lst)==0 : return "[]"
 6 |   ret = "["
 7 |   for l in lst :
 8 |     ret += "\""+l+"\""
 9 |     ret += ", "
10 |   ret = ret[:-2] + "]"
11 |   return ret
12 | 	
def create_process_type(str):
    """Return the process-type tags implied by a pattern string.

    Tags are reported in this fixed order:
      "gozen" - the pattern contains 午前 or AM
      "gogo"  - the pattern contains 午後 or PM
      "seiki" - the pattern is exactly 世紀
      "han"   - the pattern ends with 半
    """
    checks = (
        ("gozen", "午前" in str or "AM" in str),
        ("gogo", "午後" in str or "PM" in str),
        ("seiki", str == "世紀"),
        ("han", str.endswith("半")),
    )
    return [tag for tag, applies in checks if applies]
20 | 	
21 | 
#load dayweeks (the 午前/午後/AM/PM markers are attached here as well)
#
# Builds lst_dayweek: every day-of-week word substituted into every template
# from abstime_dayweek_pattern.txt ("*" marks the substitution point), each
# followed by every entry of lst_gogo.
lst_dayweek = []
lst_gogo = ["", "午前", "午後", "AM", "PM"]

# The template file does not change between day-of-week words, so read it
# once instead of reopening it (and leaking the handle) on each iteration.
with open("abstime_dayweek_pattern.txt", "r") as fin2:
    dayweek_patterns = [pattern.rstrip("\n") for pattern in fin2]

with open("abstime_dayweek.txt", "r") as fin:
    for line in fin:
        fields = line.split()
        if not fields:
            continue
        # abstime_dayweek.txt has two columns (word, type).  Only the word
        # belongs in the pattern; using the whole line would embed the tab
        # and the type name in every generated pattern.
        dayweek = fields[0]
        for pattern in dayweek_patterns:
            for gogo in lst_gogo:
                lst_dayweek.append(pattern.replace("*", dayweek) + gogo)
33 | 			
34 | 
35 | 
#create time expression
# abstime_time.txt stores each entry on two lines: the pattern, then the
# whitespace-separated time positions it fills (e.g. "h mn s").
# "*" in the raw file is rewritten to "ǂ" in the output.
with open("abstime_time.txt") as fin:
    time_lines = [line.rstrip().replace("*", "ǂ") for line in fin]

# Pair line 1 with line 2, line 3 with line 4, ...; a trailing unpaired line
# produces no output.  print() gets a single argument so the call behaves the
# same on Python 2 and is also valid Python 3 syntax.
for time_pattern, positions in zip(time_lines[0::2], time_lines[1::2]):
    corresponding_time_position = create_list_expression(positions.split())
    process_type = create_process_type(time_pattern)
    print('{"pattern":"' + time_pattern
          + '", "corresponding_time_position":' + corresponding_time_position
          + ', "process_type":' + create_list_expression(process_type)
          + ', "ordinary":false, "option":""}')
49 | 
#create date expression
# abstime_date.txt stores each entry on two lines: the pattern, then the
# whitespace-separated time positions it fills (e.g. "y m d").
# "*" in the raw file is rewritten to "ǂ" in the output.
with open("abstime_date.txt") as fin:
    date_lines = [line.rstrip().replace("*", "ǂ") for line in fin]

# Pair line 1 with line 2, line 3 with line 4, ...; a trailing unpaired line
# produces no output.  print() gets a single argument so the call behaves the
# same on Python 2 and is also valid Python 3 syntax.
for date_pattern, positions in zip(date_lines[0::2], date_lines[1::2]):
    corresponding_time_position = create_list_expression(positions.split())
    # The bare date pattern is always written with an empty process_type.
    print('{"pattern":"' + date_pattern
          + '", "corresponding_time_position":' + corresponding_time_position
          + ', "process_type":[], "ordinary":false, "option":""}')
    if "d" not in positions:
        continue
    # Patterns whose positions contain "d" also get one variant per
    # day-of-week expression appended to the pattern.
    for dayweek in lst_dayweek:
        # Trim trailing ASCII and full-width spaces.
        dayweek = dayweek.rstrip().rstrip("　")
        process_type = create_process_type(date_pattern + dayweek)
        print('{"pattern":"' + date_pattern + dayweek
              + '", "corresponding_time_position":' + corresponding_time_position
              + ', "process_type":' + create_list_expression(process_type)
              + ', "ordinary":false, "option":""}')
67 | 
68 | 
# Original note (translated): done naively the output grows to about 18 MB.
# Lookup time barely changes, but the initial load takes about five seconds.
# What should be done about this?

#create date+time expression
# Both raw files store each entry on two lines: the pattern, then the
# whitespace-separated time positions it fills.  "*" is rewritten to "ǂ".
with open("abstime_date.txt") as fin:
    date_lines = [line.rstrip().replace("*", "ǂ") for line in fin]

# Read the time patterns once rather than reopening the file for every
# day-of-week expression.  Each line is taken from line2 (the old loop used
# l2 before assigning it), and pairs without an hour position are filtered
# out here so that skipping one no longer drops every pair after it.
with open("abstime_time.txt") as fin2:
    time_lines = [line2.rstrip().replace("*", "ǂ") for line2 in fin2]
time_entries = [
    (time_pattern, time_positions)
    for time_pattern, time_positions in zip(time_lines[0::2], time_lines[1::2])
    if "h" in time_positions
]

for date_pattern, date_positions in zip(date_lines[0::2], date_lines[1::2]):
    # Only date patterns whose positions contain "d" are combined with times.
    if "d" not in date_positions:
        continue
    for dayweek in lst_dayweek:
        for time_pattern, time_positions in time_entries:
            corresponding_time_position = create_list_expression(
                date_positions.split() + time_positions.split())
            process_type = create_process_type(
                date_pattern + dayweek + time_pattern)
            # "ǂ" between the day-of-week text and the time pattern is the
            # placeholder that precedes the time pattern.
            print('{"pattern":"' + date_pattern + dayweek + "ǂ" + time_pattern
                  + '", "corresponding_time_position":' + corresponding_time_position
                  + ', "process_type":' + create_list_expression(process_type)
                  + ', "ordinary":false, "option":""}')
97 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/create_dic_abstime_prefix_counter.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | 
 4 | fin = open("abstime_nengou.txt", "r")
 5 | for line in fin.readlines() :
 6 |   lst = line.rstrip().split()
 7 |   print "{\"pattern\":\""+lst[0]+"\", \"corresponding_time_position\":[], \"process_type\":[\""+lst[1]+"\"], \"ordinary\":false, \"option\":\"seireki\"}"
 8 | 
 9 | fin = open("abstime_prefix_counter.txt", "r")
10 | for line in fin.readlines() :
11 |   lst = line.rstrip().split()
12 |   print "{\"pattern\":\""+lst[0]+"\", \"corresponding_time_position\":[], \"process_type\":[], \"ordinary\":false, \"option\":\""+lst[1]+"\"}"


--------------------------------------------------------------------------------
/src/dic/ja/raw/create_dic_dayweek.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
# Print one {"pattern":"..."} object for every combination of a day-of-week
# word and a template from abstime_dayweek_pattern.txt ("*" marks where the
# word is substituted).

# The template file is identical for every word, so read it once instead of
# reopening it (and leaking the handle) per word.
with open("abstime_dayweek_pattern.txt", "r") as fin2:
    dayweek_patterns = [pattern.rstrip("\n") for pattern in fin2]

with open("abstime_dayweek.txt", "r") as fin:
    for line in fin:
        fields = line.split()
        if not fields:
            continue
        # abstime_dayweek.txt has two columns (word, type); only the word
        # belongs in the pattern.
        dayweek = fields[0]
        for pattern in dayweek_patterns:
            # The closing quote after the pattern value was missing, which
            # made every output line invalid JSON.
            print('{"pattern":"' + pattern.replace("*", dayweek) + '"}')
9 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/create_dic_duration.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | def create_list_expression(lst_org) :
 4 |   lst = lst_org.split(",")
 5 |   if len(lst)==0 : return "[]"
 6 |   ret = "["
 7 |   for l in lst :
 8 |     ret += "\""+l+"\""
 9 |     ret += ", "
10 |   ret = ret[:-2] + "]"
11 |   return ret
12 | 
# For each line of duration_time_position.txt ("<pattern> <positions>") emit
# two JSON objects: the pattern itself, and the pattern followed by 半 tagged
# with the "han" process type.  "*" in the pattern is rewritten to "ǂ".
# The with-block closes the file; print() gets a single argument so the call
# behaves the same on Python 2 and is also valid Python 3 syntax.
with open("duration_time_position.txt", "r") as fin:
    for line in fin:
        lst = line.rstrip().split()
        duration_pattern = lst[0].replace("*", "ǂ")
        positions = create_list_expression(lst[1])
        print('{"pattern":"' + duration_pattern
              + '", "corresponding_time_position":' + positions
              + ', "process_type":[], "ordinary":false, "option":""}')
        print('{"pattern":"' + duration_pattern
              + '半", "corresponding_time_position":' + positions
              + ', "process_type":["han"], "ordinary":false, "option":""}')


--------------------------------------------------------------------------------
/src/dic/ja/raw/create_dic_inappropriate.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
# Wrap every line of inappropriate_strings.txt in a {"str":"..."} object.
# The with-block closes the file; print() gets a single argument so the call
# behaves the same on Python 2 and is also valid Python 3 syntax.
with open("inappropriate_strings.txt", "r") as fin:
    for line in fin:
        print('{"str":"' + line.rstrip() + '"}')
7 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/create_dic_num.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | fin = open("num.txt", "r")
 4 | for line in fin.readlines() :
 5 |   l = line.rstrip().split()
 6 |   print "{\"pattern\":\""+l[0]+"\", \"counter\":\""+l[1]+"\", \"SI_prefix\":0, \"optional_power_of_ten\":0, \"ordinary\":false, \"option\":\"\"}"
 7 | 
 8 | fin = open("num_SItanni_settouji_katakana.txt")
 9 | prefix = [["",0]]
10 | for line in fin.readlines() :
11 |   l = line.rstrip().split()
12 |   prefix.append(l)
13 | fin = open("num_SItanni_katakana.txt", "r")
14 | for line in fin.readlines() :
15 |   l = line.rstrip().split()
16 |   for p in prefix :
17 |     print "{\"pattern\":\""+p[0]+l[0]+"\", \"counter\":\""+l[1]+"\", \"SI_prefix\":"+str(p[1])+", \"optional_power_of_ten\":0, \"ordinary\":false, \"option\":\"\"}"
18 | 
19 | fin = open("num_SItanni_settouji_hankaku.txt")
20 | prefix = [["",0]]
21 | for line in fin.readlines() :
22 |   l = line.rstrip().split()
23 |   prefix.append(l)
24 | fin = open("num_SItanni_hankaku.txt", "r")
25 | for line in fin.readlines() :
26 |   l = line.rstrip().split()
27 |   for p in prefix :
28 |     print "{\"pattern\":\""+p[0]+l[0]+"\", \"counter\":\""+l[1]+"\", \"SI_prefix\":"+str(p[1])+", \"optional_power_of_ten\":0, \"ordinary\":false, \"option\":\"\"}"
29 | 
30 | fin = open("num_SItanni_settouji_zenkaku.txt")
31 | prefix = [["",0]]
32 | for line in fin.readlines() :
33 |   l = line.rstrip().split()
34 |   prefix.append(l)
35 | fin = open("num_SItanni_zenkaku.txt", "r")
36 | for line in fin.readlines() :
37 |   l = line.rstrip().split()
38 |   for p in prefix :
39 |     print "{\"pattern\":\""+p[0]+l[0]+"\", \"counter\":\""+l[1]+"\", \"SI_prefix\":"+str(p[1])+", \"optional_power_of_ten\":0, \"ordinary\":false, \"option\":\"\"}"
40 | 
41 | fin = open("num_expand.txt")
42 | for line in fin.readlines() :
43 |   l = line.rstrip().split()
44 |   print "{\"pattern\":\""+l[0]+"\", \"counter\":\""+l[1]+"\", \"SI_prefix\":0, \"optional_power_of_ten\":"+l[2]+", \"ordinary\":false, \"option\":\"\"}"
45 | 
46 | fin = open("num_wari.txt")
47 | for line in fin.readlines() :
48 | 	l = line.rstrip().replace("*","ǂ")
49 | 	print l
50 | 
51 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/create_dic_num_prefix_counter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
# Emit one JSON object per line of num_prefix_counter.txt, whose columns are
# "<pattern> <counter> <option>".
# The with-block closes the file; print() gets a single argument so the call
# behaves the same on Python 2 and is also valid Python 3 syntax.
with open("num_prefix_counter.txt", "r") as fin:
    for line in fin:
        l = line.rstrip().split()
        print('{"pattern":"' + l[0] + '", "counter":"' + l[1]
              + '", "SI_prefix":0, "optional_power_of_ten":0, '
              + '"ordinary":false, "option":"' + l[2] + '"}')
7 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/create_dic_number_modifier.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import sys
 3 | 
 4 | filename = sys.argv[1]
 5 | fin = open(filename, "r")
 6 | for line in fin.readlines() :
 7 |   line = line.replace("<=", "or_over").replace(">=", "or_less").replace("<","over").replace(">", "less")
 8 |   l = line.rstrip().split()
 9 |   print "{\"pattern\":\""+l[0]+"\", \"process_type\":\""+l[1]+"\"}"
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/create_dic_reltime.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | 
 4 | def create_list_expression(fugou,lst_org) :
 5 |   lst = lst_org.split(",")
 6 |   if len(lst)==0 : return "[]"
 7 |   ret = "["
 8 |   for l in lst :
 9 |     ret += "\""+fugou+l+"\""
10 |     ret += ", "
11 |   ret = ret[:-2] + "]"
12 |   return ret
13 | 
# Generates the relative-time dictionary as one JSON object per line.
# print() is always called with a single argument so it behaves the same on
# Python 2 and is also valid Python 3 syntax.

# reltime_time_option.txt is the same for every time-position line, so read
# it once instead of reopening it (and leaking the handle) per line.
# Each option line is "<suffix> <sign> [<process type>]".
with open("reltime_time_option.txt", "r") as fin2:
    reltime_options = [line2.rstrip().split() for line2 in fin2]

reltime_entry_format = ('{"pattern":"%s", "corresponding_time_position":%s, '
                        '"process_type":%s, "ordinary":false, "option":""}')

with open("reltime_time_position.txt", "r") as fin:
    for line in fin:
        lst = line.rstrip().split()
        lst[0] = lst[0].replace("*", "ǂ")
        for lst2 in reltime_options:
            positions = create_list_expression(lst2[1], lst[1])
            # The third column is optional; without it the base entry has no
            # process type at all.
            process_types = [lst2[2]] if len(lst2) == 3 else []
            plain = "[" + ", ".join('"' + t + '"' for t in process_types) + "]"
            with_han = "[" + ", ".join(
                '"' + t + '"' for t in process_types + ["han"]) + "]"
            print(reltime_entry_format % (lst[0] + lst2[0], positions, plain))
            # Same entry with 半 inserted before the suffix, tagged "han".
            print(reltime_entry_format
                  % (lst[0] + "半" + lst2[0], positions, with_han))

# Hand-written entries are appended verbatim.
with open("reltime_specific.txt", "r") as fin:
    for line in fin:
        print(line.rstrip())

# The generated absolute-time dictionary is appended verbatim as well.
with open("../abstime_expression_json.txt", "r") as fin:
    for line in fin:
        print(line.rstrip())


--------------------------------------------------------------------------------
/src/dic/ja/raw/create_dic_reltime_prefix_counter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
# Emit one JSON object per line of reltime_prefix_counter.txt, whose columns
# are "<pattern> <time position> <process type>"; the option is always
# "add_relation".
# The with-block closes the file; print() gets a single argument so the call
# behaves the same on Python 2 and is also valid Python 3 syntax.
with open("reltime_prefix_counter.txt", "r") as fin:
    for line in fin:
        lst = line.rstrip().split()
        print('{"pattern":"' + lst[0]
              + '", "corresponding_time_position":["' + lst[1]
              + '"], "process_type":["' + lst[2]
              + '"], "ordinary":false, "option":"add_relation"}')
7 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/duration_prefix_counter.txt:
--------------------------------------------------------------------------------
1 | 週	/week
2 | 月	/month
3 | 年	/year
4 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/duration_setouji.txt:
--------------------------------------------------------------------------------
 1 | 約	about
 2 | だいたい	about
 3 | ほぼ	about
 4 | およそ	about
 5 | ほとんど	about
 6 | 全	none
 7 | ちょうど	none
 8 | 第	ordinary
 9 | ~	kara_prefix
10 | 〜	kara_prefix
11 | ～	kara_prefix
12 | -	kara_prefix
13 | −	kara_prefix
14 | ー	kara_prefix
15 | から	kara_prefix


--------------------------------------------------------------------------------
/src/dic/ja/raw/duration_setubiji.txt:
--------------------------------------------------------------------------------
 1 | 目	ordinary
 2 | 以下	>=
 3 | 以前	>=
 4 | 以内	>=
 5 | まで	made
 6 | 迄	>=
 7 | 未満	>
 8 | 以上	<=
 9 | 以降	<=
10 | 超	<=
11 | 越え	<=
12 | 超え	<=
13 | ~	kara_suffix
14 | 〜	kara_suffix
15 | ～	kara_suffix
16 | -	kara_suffix
17 | −	kara_suffix
18 | ー	kara_suffix
19 | から	kara_suffix
20 | くらい	about
21 | ばかり	about
22 | 前後	about
23 | 程度	about
24 | ほど	about
25 | 近く	about
26 | 頃	about
27 | ごろ	about
28 | 余り	kyou
29 | 強	kyou
30 | 弱	jaku
31 | 台	dai
32 | 代	dai
33 | 毎	per
34 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/duration_time_position.txt:
--------------------------------------------------------------------------------
 1 | 世紀	seiki
 2 | 年間	y
 3 | 年	y
 4 | ヶ月	m
 5 | か月	m
 6 | カ月	m
 7 | ヵ月	m
 8 | ケ月	m
 9 | 箇月	m
10 | 週間	w
11 | 日間	d
12 | 時間	h
13 | 分間	mn
14 | 秒間	s
15 | 年	y
16 | 月	m
17 | 週	w
18 | 日	d
19 | 分	mn
20 | 秒	s
21 | 年*ヶ月	y,m
22 | 時間*分	h,mn
23 | 分*秒	mn,s


--------------------------------------------------------------------------------
/src/dic/ja/raw/inappropriate_strings.txt:
--------------------------------------------------------------------------------
 1 | 一切
 2 | 一部
 3 | 一連
 4 | 三振
 5 | 一段
 6 | 一体
 7 | 九州
 8 | 四国
 9 | 一種
10 | 一番


--------------------------------------------------------------------------------
/src/dic/ja/raw/make_dictionary.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | python create_dic_number_modifier.py num_settouji.txt > ../num_prefix_json.txt
 3 | python create_dic_number_modifier.py num_setubiji.txt > ../num_suffix_json.txt
 4 | python create_dic_number_modifier.py abstime_settouji.txt > ../abstime_prefix_json.txt
 5 | python create_dic_number_modifier.py abstime_setubiji.txt > ../abstime_suffix_json.txt
 6 | python create_dic_number_modifier.py reltime_settouji.txt > ../reltime_prefix_json.txt
 7 | python create_dic_number_modifier.py abstime_setubiji.txt > ../reltime_suffix_json.txt #reltime自体の接尾辞は存在しない。相対絶対表現でabsの接尾辞を使う
 8 | python create_dic_number_modifier.py duration_setouji.txt > ../duration_prefix_json.txt
 9 | python create_dic_number_modifier.py duration_setubiji.txt > ../duration_suffix_json.txt
10 | python create_dic_num.py > ../num_counter_json.txt
11 | python create_dic_num_prefix_counter.py > ../num_prefix_counter_json.txt
12 | python create_dic_abstime.py > ../abstime_expression_json.txt
13 | python create_dic_abstime_prefix_counter.py > ../abstime_prefix_counter_json.txt
14 | python create_dic_reltime.py > ../reltime_expression_json.txt
15 | python create_dic_reltime_prefix_counter.py > ../reltime_prefix_counter_json.txt
16 | python create_dic_duration.py > ../duration_expression_json.txt
17 | python create_dic_inappropriate.py > ../inappropriate_strings_json.txt


--------------------------------------------------------------------------------
/src/dic/ja/raw/make_dictionary.sh~:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | python create_dic_number_modifier.py num_settouji.txt > ../num_prefix_json.txt
 3 | python create_dic_number_modifier.py num_setubiji.txt > ../num_suffix_json.txt
 4 | python create_dic_number_modifier.py abstime_settouji.txt > ../abstime_prefix_json.txt
 5 | python create_dic_number_modifier.py abstime_setubiji.txt > ../abstime_suffix_json.txt
 6 | python create_dic_number_modifier.py reltime_settouji.txt > ../reltime_prefix_json.txt
 7 | python create_dic_number_modifier.py duration_setouji.txt > ../duration_prefix_json.txt
 8 | python create_dic_number_modifier.py duration_setubiji.txt > ../duration_suffix_json.txt
 9 | python create_dic_num.py > ../num_counter_json.txt
10 | python create_dic_abstime.py > ../abstime_expression_json.txt
11 | python create_dic_abstime_prefix_counter.py > ../abstime_prefix_counter_json.txt
12 | python create_dic_reltime.py > ../reltime_expression_json.txt
13 | python create_dic_duration.py > ../duration_expression_json.txt
14 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/num.txt:
--------------------------------------------------------------------------------
  1 | カナダドル	カナダドル
  2 | シンガポールドル	シンガポールドル
  3 | オクターブ	オクターブ
  4 | パーセント	%
  5 | オクターヴ	オクターブ
  6 | フィート	フィート
  7 | カラット	カラット
  8 | グループ	グループ
  9 | USドル	ドル
 10 | タイトル	タイトル
 11 | シリーズ	シリーズ
 12 | ポイント	ポイント
 13 | ジャンル	ジャンル
 14 | ステージ	ステージ
 15 | パターン	パターン
 16 | ラウンド	ラウンド
 17 | フラン	フラン
 18 | クローネ	クローネ
 19 | ポンド	ポンド
 20 | ユーロ	ユーロ
 21 | レース	レース
 22 | ウォン	ウォン
 23 | ルピー	ルピー
 24 | インチ	インチ
 25 | ノット	ノット
 26 | モーラ	モーラ
 27 | コース	コース
 28 | ページ	ページ
 29 | テイク	テイク
 30 | タイプ	タイプ
 31 | ゲーム	ゲーム
 32 | チーム	チーム
 33 | 新ペソ	新ペソ
 34 | ペソ	ペソ
 35 | 選手	選手
 36 | 拍子	拍子
 37 | 音節	音節
 38 | 区画	区画
 39 | 切れ	切れ
 40 | 人前	人前
 41 | ドル	ドル
 42 | 海里	海里
 43 | ペア	ペア
 44 | 種類	種類
 45 | 集落	集落
 46 | 手法	手法
 47 | 言語	言語
 48 | 地域	地域
 49 | 議席	議席
 50 | カ国	カ国
 51 | ヶ国	カ国
 52 | か国	カ国
 53 | ケ国	カ国
 54 | ヵ国	カ国
 55 | 年分	年分
 56 | 民族	民族
 57 | 種目	種目
 58 | 分割	分割
 59 | 母音	母音
 60 | 箇所	箇所
 61 | ケ所	箇所
 62 | ヶ所	箇所
 63 | ヵ所	箇所
 64 | カ所	箇所
 65 | か所	箇所
 66 | 個所	箇所
 67 | 文節	文節
 68 | 回生	回生
 69 | 単位	単位
 70 | 次元	次元
 71 | 連勝	連勝
 72 | 連敗	連敗
 73 | 重奏	重奏
 74 | 年制	年制
 75 | 試合	試合
 76 | 文字	文字
 77 | 作品	作品
 78 | 世代	世代
 79 | 大会	大会
 80 | 得点	得点
 81 | 方向	方向
 82 | 店舗	店舗
 83 | 世帯	世帯
 84 | 師団	師団
 85 | 艦隊	艦隊
 86 | 要素	要素
 87 | 領域	領域
 88 | 音素	音素
 89 | 段階	段階
 90 | 連隊	連隊
 91 | 階級	階級
 92 | 連覇	連覇
 93 | 路線	路線
 94 | byte	バイト
 95 | 便	便
 96 | 勝	勝
 97 | 敗	敗
 98 | 等	等
 99 | 人	人
100 | 個	個
101 | つ	つ
102 | 枚	枚
103 | 面	面
104 | 段	段
105 | 本	本
106 | 匹	匹
107 | 羽	羽
108 | 灯	灯
109 | 頭	頭
110 | 本	本
111 | 張	張
112 | 戸	戸
113 | 軒	軒
114 | 棟	棟
115 | 杯	杯
116 | 匹	匹
117 | 枚	枚
118 | 架	架
119 | 体	体
120 | 柱	柱
121 | 府	府
122 | 党	党
123 | 氏	氏
124 | 団体	団体
125 | 局	局
126 | 番	番
127 | 脚	脚
128 | 本	本
129 | 基	基
130 | 着	着
131 | 具	具
132 | 羽	羽
133 | 頭	頭
134 | 席	席
135 | 献	献
136 | 柄	柄
137 | 玉	玉
138 | 杯	杯
139 | 巻	巻
140 | 枝	枝
141 | 尾	尾
142 | 港	港
143 | 掛	掛
144 | 番	番
145 | 封	封
146 | 筋	筋
147 | 挺	挺
148 | 条	条
149 | 錠	錠
150 | 丈	丈
151 | 幅	幅
152 | 株	株
153 | 刎	刎
154 | 座	座
155 | 騎	騎
156 | 行	行
157 | 服	服
158 | 包	包
159 | 果	果
160 | 菓	菓
161 | 足	足
162 | 領	領
163 | 丁	丁
164 | 俵	俵
165 | 膳	膳
166 | 喉	喉
167 | 斤	斤
168 | 叺	叺
169 | 貫	貫
170 | 篇	篇
171 | 尊	尊
172 | 棹	棹
173 | 台	台
174 | 両	両
175 | 連	連
176 | 部	部
177 | 頁	ページ
178 | 球	球
179 | 部	部
180 | 句	句
181 | 門	門
182 | 問	問
183 | 戦	戦
184 | 畳	畳
185 | 棹	棹
186 | 反	反
187 | 卓	卓
188 | 口	口
189 | 壷	壷
190 | 通	通
191 | 振	振
192 | 腰	腰
193 | 剣	剣
194 | 刀	刀
195 | 票	票
196 | 帖	帖
197 | 句	句
198 | 輪	輪
199 | 片	片
200 | 機	機
201 | 名	名
202 | 拍	拍
203 | 躯	躯
204 | 隻	隻
205 | 粒	粒
206 | 顆	顆
207 | 札	札
208 | 冊	冊
209 | 品	品
210 | ℃	℃
211 | rad	rad
212 | 円	円
213 | 種	種類
214 | 級	級
215 | 度	度
216 | こ	個
217 | 倍	倍
218 | ％	%
219 | 回	回
220 | 弦	弦
221 | 校	校
222 | 次	次
223 | 項	項
224 | 歳	歳
225 | 才	歳
226 | 国	国
227 | 州	州
228 | 件	件
229 | 区	区
230 | 話	話
231 | 選	選
232 | 位	位
233 | 合	合
234 | 階	階
235 | 波	波
236 | 節	節
237 | bit	ビット
238 | 期	期
239 | 切	切
240 | 音	音
241 | 手	手
242 | 尺	尺
243 | 寸	寸
244 | 県	県
245 | 章	章
246 | 泊	泊
247 | 曲	曲
248 | 列	駅
249 | 線	線
250 | 社	社
251 | 弾	弾
252 | 組	組
253 | 役	役
254 | 桁	桁
255 | 字	字
256 | 点	点
257 | 店	店
258 | 石	石
259 | 版	版
260 | 藩	藩
261 | 号	号
262 | 課	課
263 | 作	作
264 | 集	集
265 | 州	州
266 | 周	周
267 | 袋	袋
268 | rpm	rpm
269 | ｒｐｍ	rpm
270 | 代	代
271 | 項	項
272 | °	度
273 | ％	%
274 | %	%
275 | 日分	日分
276 | ヶ月分	ヶ月分
277 | 年分	年分
278 | 行	行
279 | 碗	碗
280 | 台分	台分
281 | 人分	人分
282 | 倍速	倍速
283 | コ	個
284 | こま	コマ
285 | コマ	コマ
286 | マルク	マルク
287 | リラ	リラ
288 | ペセタ	ペセタ
289 | 箱	箱
290 | カウント	カウント
291 | ハイ	ハイ
292 | KB	KB
293 | MB	MB
294 | GB	GB
295 | TB	TB
296 | PB	PB
297 | ＫＢ	KB
298 | ＭＢ	MB
299 | ＧＢ	GB
300 | ＴＢ	TB
301 | ＰＢ	PB
302 | 事例	事例
303 | 周年	周年
304 | 例	例
305 | ppm	ppm
306 | G	G
307 | Ｇ	G
308 | 色	色
309 | 気圧	気圧
310 | 光年	光年
311 | 里	里
312 | セット	セット


--------------------------------------------------------------------------------
/src/dic/ja/raw/num_SItanni_hankaku.txt:
--------------------------------------------------------------------------------
 1 | mol	mol
 2 | bps	bps
 3 | m/h	m/h
 4 | m/s	m/s
 5 | m/m	m/m
 6 | g/l	g/l
 7 | N/m2	N/m2
 8 | cd	cd
 9 | Pa	Pa
10 | Ω	Ω
11 | Wb	Wb
12 | Hz	Hz
13 | sr	sr
14 | ha	ha
15 | cc	cc
16 | m	m
17 | g	g
18 | N	N
19 | l	l
20 | s	s
21 | A	A
22 | K	K
23 | J	J
24 | W	W
25 | C	C
26 | V	V
27 | F	F
28 | S	S
29 | T	T
30 | H	H
31 | Sv	Sv
32 | B	バイト
33 | Bq	Bq
34 | dB	dB
35 | pixel	pixel
36 | cal	cal


--------------------------------------------------------------------------------
/src/dic/ja/raw/num_SItanni_katakana.txt:
--------------------------------------------------------------------------------
 1 | モル	mol
 2 | カンデラ	cd
 3 | パスカル	Pa
 4 | オーム	Ω
 5 | ウェーバ	Wb
 6 | ヘルツ	Hz
 7 | ステラジアン	sr
 8 | ヘクタール	ha
 9 | シーシー	cc
10 | メートル	m
11 | グラム	g
12 | ニュートン	N
13 | リットル	l
14 | アンペア	A
15 | ケルビン	K
16 | ジュール	J
17 | ワット	W
18 | クーロン	C
19 | ボルト	V
20 | ファラド	F
21 | ジーメンス	S
22 | テスラ	T
23 | ヘンリー	H
24 | シーベルト	Sv
25 | バイト	バイト
26 | ベクレル	Bq
27 | デシベル	dB
28 | ピクセル	pixel
29 | カロリー	cal
30 | ビット	ビット
31 | マイル	マイル
32 | フィート	フィート
33 | ヤード	ヤード
34 | インチ	インチ
35 | エーカー	エーカー
36 | オンス	オンス
37 | パイント	パイント
38 | ガロン	ガロン
39 | バレル	バレル
40 | オンス	オンス
41 | ポンド	ポンド


--------------------------------------------------------------------------------
/src/dic/ja/raw/num_SItanni_settouji_hankaku.txt:
--------------------------------------------------------------------------------
 1 | G	9
 2 | M	6
 3 | k	3
 4 | h	2
 5 | da	1
 6 | d	-1
 7 | c	-2
 8 | m	-3
 9 | μ	-6
10 | n	-9
11 | p	-12


--------------------------------------------------------------------------------
/src/dic/ja/raw/num_SItanni_settouji_katakana.txt:
--------------------------------------------------------------------------------
 1 | ギガ	9
 2 | メガ	6
 3 | キロ	3
 4 | ヘクト	2
 5 | デシ	-1
 6 | センチ	-2
 7 | ミリ	-3
 8 | マイクロ	-6
 9 | ナノ	-9
10 | ピコ	-12
11 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/num_SItanni_settouji_zenkaku.txt:
--------------------------------------------------------------------------------
 1 | Ｇ	9
 2 | Ｍ	6
 3 | ｋ	3
 4 | ｈ	2
 5 | ｄａ	1
 6 | ｄ	-1
 7 | ｃ	-2
 8 | ｍ	-3
 9 | μ	-6
10 | ｎ	-9
11 | ｐ	-12


--------------------------------------------------------------------------------
/src/dic/ja/raw/num_SItanni_zenkaku.txt:
--------------------------------------------------------------------------------
 1 | ｍｏｌ	mol
 2 | ｂｐｓ	bps
 3 | ｍ／ｈ	m/h
 4 | ｍ／ｓ	m/s
 5 | ｍ／ｍ	m/m
 6 | ｇ／ｌ	g/l
 7 | Ｎ／ｍ２	N/m2
 8 | ｃｄ	cd
 9 | Ｐａ	Pa
10 | Ｗｂ	Wb
11 | Ｈｚ	Hz
12 | ｓｒ	sr
13 | ｈａ	ha
14 | ｃｃ	cc
15 | ｍ	m
16 | ｇ	g
17 | Ｎ	N
18 | ｌ	l
19 | ｓ	s
20 | Ａ	A
21 | Ｋ	K
22 | Ｊ	J
23 | Ｗ	W
24 | Ｃ	C
25 | Ｖ	V
26 | Ｆ	F
27 | Ｓ	S
28 | Ｔ	T
29 | Ｈ	H
30 | Ｓｖ	Sv
31 | Ｂ	バイト


--------------------------------------------------------------------------------
/src/dic/ja/raw/num_expand.txt:
--------------------------------------------------------------------------------
 1 | t	g	6
 2 | トン	g	6
 3 | センチ	m 	-2
 4 | キロ	m	3
 5 | ミリ	m	-3
 6 | 平方キロメートル	m2	6
 7 | 平方ミリメートル	m2	-6
 8 | 立方キロメートル	m3	9
 9 | 立方ミリメートル	m3	-9
10 | km²	m2	6
11 | mm²	m2	-6
12 | km³	m3	9
13 | mm³	m3	-9
14 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/num_prefix_counter.txt:
--------------------------------------------------------------------------------
 1 | ￥	円	counter
 2 | ¥	円	counter
 3 | $	ドル	counter
 4 | ＄	ドル	counter
 5 | €	ユーロ	counter
 6 | £	ポンド	counter
 7 | 小さじ	小さじ	counter
 8 | 大さじ	大さじ	counter
 9 | 時速	/h	add_suffix_counter
10 | 毎時	/h	add_suffix_counter
11 | 分速	/m	add_suffix_counter
12 | 毎分	/m	add_suffix_counter
13 | 秒速	/s	add_suffix_counter
14 | 毎秒	/s	add_suffix_counter
15 | 週	/week	add_suffix_counter
16 | 月	/month	add_suffix_counter
17 | 年	/year	add_suffix_counter
18 | 最大	*	saidai
19 | 最長	*	saityou
20 | 最高	*	saikou
21 | 華氏	℉	counter
22 | 摂氏	℃	counter
23 | 風速	*	fusoku
24 | 水温	*	suion
25 | 北緯	*	hokui
26 | 南緯	*	nanni
27 | 東経	*	toukei
28 | 西経	*	seikei


--------------------------------------------------------------------------------
/src/dic/ja/raw/num_settouji.txt:
--------------------------------------------------------------------------------
 1 | 第	ordinary
 2 | 約	about
 3 | だいたい	about
 4 | ほぼ	about
 5 | およそ	about
 6 | ほとんど	about
 7 | 全	none
 8 | ちょうど	none
 9 | ~	kara_prefix
10 | 〜	kara_prefix
11 | ～	kara_prefix
12 | -	kara_prefix
13 | −	kara_prefix
14 | ー	kara_prefix
15 | から	kara_prefix


--------------------------------------------------------------------------------
/src/dic/ja/raw/num_setubiji.txt:
--------------------------------------------------------------------------------
 1 | 目	ordinary
 2 | 以下	>=
 3 | 以前	>=
 4 | 以内	>=
 5 | まで	made
 6 | 迄	>=
 7 | 未満	>
 8 | 以上	<=
 9 | 以降	<=
10 | 超	<=
11 | 越え	<=
12 | 超え	<=
13 | ~	kara_suffix
14 | 〜	kara_suffix
15 | ～	kara_suffix
16 | -	kara_suffix
17 | −	kara_suffix
18 | ー	kara_suffix
19 | から	kara_suffix
20 | くらい	about
21 | ばかり	about
22 | 前後	about
23 | 程度	about
24 | ほど	about
25 | 近く	about
26 | 頃	about
27 | ごろ	about
28 | 余り	kyou
29 | 強	kyou
30 | 弱	jaku
31 | 台	dai
32 | 代	dai
33 | 毎	per
34 | 半	han
35 | ／時	/h
36 | ／分	/min
37 | ／秒	/sec
38 | /時	/h
39 | /分	/min
40 | /秒	/sec
41 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/num_wari.txt:
--------------------------------------------------------------------------------
1 | {"pattern":"割", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"}
2 | {"pattern":"分", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"}
3 | {"pattern":"厘", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"}
4 | {"pattern":"割*分", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"}
5 | {"pattern":"分*厘", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"}
6 | {"pattern":"割*分*厘", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"}


--------------------------------------------------------------------------------
/src/dic/ja/raw/reltime_prefix_counter.txt:
--------------------------------------------------------------------------------
 1 | 去年	y	-1
 2 | 昨年	y	-1
 3 | 一昨年	y	-2
 4 | 今年	y	0
 5 | 来年	y	+1
 6 | 先月	m	-1
 7 | 先々月	m	-2
 8 | 今月	m	0
 9 | 来月	m	+1
10 | 来来月	m	+2
11 | 昨日	d	-1
12 | 一昨日	d	-2
13 | 今日	d	0
14 | 本日	d	0
15 | 明日	d	+1
16 | 明後日	d	+2


--------------------------------------------------------------------------------
/src/dic/ja/raw/reltime_settouji.txt:
--------------------------------------------------------------------------------
 1 | 約	about
 2 | だいたい	about
 3 | ほぼ	about
 4 | およそ	about
 5 | ほとんど	about
 6 | 全	none
 7 | ちょうど	none
 8 | ~	kara_prefix
 9 | 〜	kara_prefix
10 | ～	kara_prefix
11 | -	kara_prefix
12 | −	kara_prefix
13 | ー	kara_prefix
14 | から	kara_prefix


--------------------------------------------------------------------------------
/src/dic/ja/raw/reltime_specific.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/dic/ja/raw/reltime_specific.txt


--------------------------------------------------------------------------------
/src/dic/ja/raw/reltime_time_option.txt:
--------------------------------------------------------------------------------
 1 | 前	-	 
 2 | 以上前	-	or_over
 3 | くらい前	-	about
 4 | ぐらい前	-	about
 5 | ほど前	-	about
 6 | 程度前	-	about
 7 | ばかり前	-	about
 8 | 近く前	-	about
 9 | より前	-	over
10 | よりも前	-	over
11 | 後	+	 
12 | 以上後	+	or_over
13 | より後	+	over
14 | よりも後	+	over
15 | ほど後	+	about
16 | くらい後	+	about
17 | ぐらい後	+	about
18 | 程度後	+	about
19 | ばかり後	+	about
20 | 近く後	+	about 


--------------------------------------------------------------------------------
/src/dic/ja/raw/reltime_time_position.txt:
--------------------------------------------------------------------------------
 1 | 世紀	seiki
 2 | 年	y
 3 | ヶ月	m
 4 | か月	m
 5 | カ月	m
 6 | ヵ月	m
 7 | ケ月	m
 8 | 箇月	m
 9 | 週	w
10 | 週間	w
11 | 日	d
12 | 日間	d
13 | 時間	h
14 | 分	mn
15 | 秒	s
16 | 年*ヶ月	y,m
17 | 年*ヶ月*日間	y,m,d
18 | 


--------------------------------------------------------------------------------
/src/dic/ja/raw/reltime_time_pre_option.txt:
--------------------------------------------------------------------------------
1 | 半	han


--------------------------------------------------------------------------------
/src/dic/ja/reltime_prefix_counter_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"去年", "corresponding_time_position":["y"], "process_type":["-1"], "ordinary":false, "option":"add_relation"}
 2 | {"pattern":"昨年", "corresponding_time_position":["y"], "process_type":["-1"], "ordinary":false, "option":"add_relation"}
 3 | {"pattern":"一昨年", "corresponding_time_position":["y"], "process_type":["-2"], "ordinary":false, "option":"add_relation"}
 4 | {"pattern":"今年", "corresponding_time_position":["y"], "process_type":["0"], "ordinary":false, "option":"add_relation"}
 5 | {"pattern":"来年", "corresponding_time_position":["y"], "process_type":["+1"], "ordinary":false, "option":"add_relation"}
 6 | {"pattern":"先月", "corresponding_time_position":["m"], "process_type":["-1"], "ordinary":false, "option":"add_relation"}
 7 | {"pattern":"先々月", "corresponding_time_position":["m"], "process_type":["-2"], "ordinary":false, "option":"add_relation"}
 8 | {"pattern":"今月", "corresponding_time_position":["m"], "process_type":["0"], "ordinary":false, "option":"add_relation"}
 9 | {"pattern":"来月", "corresponding_time_position":["m"], "process_type":["+1"], "ordinary":false, "option":"add_relation"}
10 | {"pattern":"来来月", "corresponding_time_position":["m"], "process_type":["+2"], "ordinary":false, "option":"add_relation"}
11 | {"pattern":"昨日", "corresponding_time_position":["d"], "process_type":["-1"], "ordinary":false, "option":"add_relation"}
12 | {"pattern":"一昨日", "corresponding_time_position":["d"], "process_type":["-2"], "ordinary":false, "option":"add_relation"}
13 | {"pattern":"今日", "corresponding_time_position":["d"], "process_type":["0"], "ordinary":false, "option":"add_relation"}
14 | {"pattern":"本日", "corresponding_time_position":["d"], "process_type":["0"], "ordinary":false, "option":"add_relation"}
15 | {"pattern":"明日", "corresponding_time_position":["d"], "process_type":["+1"], "ordinary":false, "option":"add_relation"}
16 | {"pattern":"明後日", "corresponding_time_position":["d"], "process_type":["+2"], "ordinary":false, "option":"add_relation"}
17 | 


--------------------------------------------------------------------------------
/src/dic/ja/reltime_prefix_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"約", "process_type":"about"}
 2 | {"pattern":"だいたい", "process_type":"about"}
 3 | {"pattern":"ほぼ", "process_type":"about"}
 4 | {"pattern":"およそ", "process_type":"about"}
 5 | {"pattern":"ほとんど", "process_type":"about"}
 6 | {"pattern":"全", "process_type":"none"}
 7 | {"pattern":"ちょうど", "process_type":"none"}
 8 | {"pattern":"~", "process_type":"kara_prefix"}
 9 | {"pattern":"〜", "process_type":"kara_prefix"}
10 | {"pattern":"～", "process_type":"kara_prefix"}
11 | {"pattern":"-", "process_type":"kara_prefix"}
12 | {"pattern":"−", "process_type":"kara_prefix"}
13 | {"pattern":"ー", "process_type":"kara_prefix"}
14 | {"pattern":"から", "process_type":"kara_prefix"}
15 | 


--------------------------------------------------------------------------------
/src/dic/ja/reltime_suffix_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"以前", "process_type":"or_less"}
 2 | {"pattern":"まで", "process_type":"made"}
 3 | {"pattern":"迄", "process_type":"or_less"}
 4 | {"pattern":"より前", "process_type":"less"}
 5 | {"pattern":"以降", "process_type":"or_over"}
 6 | {"pattern":"より後", "process_type":"over"}
 7 | {"pattern":"~", "process_type":"kara_suffix"}
 8 | {"pattern":"〜", "process_type":"kara_suffix"}
 9 | {"pattern":"～", "process_type":"kara_suffix"}
10 | {"pattern":"-", "process_type":"kara_suffix"}
11 | {"pattern":"−", "process_type":"kara_suffix"}
12 | {"pattern":"ー", "process_type":"kara_suffix"}
13 | {"pattern":"から", "process_type":"kara_suffix"}
14 | {"pattern":"くらい", "process_type":"about"}
15 | {"pattern":"ばかり", "process_type":"about"}
16 | {"pattern":"前後", "process_type":"about"}
17 | {"pattern":"近く", "process_type":"about"}
18 | {"pattern":"頃", "process_type":"about"}
19 | {"pattern":"ごろ", "process_type":"about"}
20 | {"pattern":"頭", "process_type":"zenhan"}
21 | {"pattern":"前半", "process_type":"zenhan"}
22 | {"pattern":"前記", "process_type":"zenhan"}
23 | {"pattern":"初頭", "process_type":"zenhan"}
24 | {"pattern":"初期", "process_type":"zenhan"}
25 | {"pattern":"初め", "process_type":"zenhan"}
26 | {"pattern":"始め", "process_type":"zenhan"}
27 | {"pattern":"はじめ", "process_type":"zenhan"}
28 | {"pattern":"後半", "process_type":"kouhan"}
29 | {"pattern":"後期", "process_type":"kouhan"}
30 | {"pattern":"終盤", "process_type":"kouhan"}
31 | {"pattern":"終わり", "process_type":"kouhan"}
32 | {"pattern":"末", "process_type":"kouhan"}
33 | {"pattern":"半ば", "process_type":"nakaba"}
34 | {"pattern":"中期", "process_type":"nakaba"}
35 | {"pattern":"中盤", "process_type":"nakaba"}
36 | {"pattern":"中頃", "process_type":"nakaba"}
37 | {"pattern":"中ごろ", "process_type":"nakaba"}
38 | {"pattern":"中旬", "process_type":"nakaba"}
39 | {"pattern":"上旬", "process_type":"joujun"}
40 | {"pattern":"中旬", "process_type":"tyujun"}
41 | {"pattern":"下旬", "process_type":"gejun"}
42 | {"pattern":"PM", "process_type":"gogo"}
43 | {"pattern":"AM", "process_type":"gozen"}
44 | {"pattern":"ＰＭ", "process_type":"gogo"}
45 | {"pattern":"ＡＭ", "process_type":"gozen"}
46 | {"pattern":"PM", "process_type":"gogo"}
47 | {"pattern":"AM", "process_type":"gozen"}
48 | {"pattern":"　ＰＭ", "process_type":"gogo"}
49 | {"pattern":"　ＡＭ", "process_type":"gozen"}
50 | 


--------------------------------------------------------------------------------
/src/dic/zh/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/dic/zh/.DS_Store


--------------------------------------------------------------------------------
/src/dic/zh/._chinese_character.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/dic/zh/._chinese_character.txt


--------------------------------------------------------------------------------
/src/dic/zh/abstime_expression_json.txt:
--------------------------------------------------------------------------------
 1 | {"pattern":"世紀", "corresponding_time_position":["seiki"], "process_type":[], "ordinary":false, "option":""}
 2 | {"pattern":"年", "corresponding_time_position":["y"], "process_type":[], "ordinary":false, "option":""}
 3 | {"pattern":"年ǂ月", "corresponding_time_position":["y", "m"], "process_type":[], "ordinary":false, "option":""}
 4 | {"pattern":"年ǂ月ǂ日", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""}
 5 | {"pattern":"年ǂ月ǂ日ǂ時", "corresponding_time_position":["y", "m", "d", "h"], "process_type":[], "ordinary":false, "option":""}
 6 | {"pattern":"年ǂ月ǂ日ǂ時ǂ分", "corresponding_time_position":["y", "m", "d", "h", "mn"], "process_type":[], "ordinary":false, "option":""}
 7 | {"pattern":"年ǂ月ǂ日ǂ時ǂ分ǂ秒", "corresponding_time_position":["y", "m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""}
 8 | {"pattern":"年ǂ月ǂ日ǂ：ǂ", "corresponding_time_position":["y", "m", "d", "h", "mn"], "process_type":[], "ordinary":false, "option":""}
 9 | {"pattern":"月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""}
10 | {"pattern":"月ǂ日", "corresponding_time_position":["m", "d"], "process_type":[], "ordinary":false, "option":""}
11 | {"pattern":"月ǂ日ǂ時", "corresponding_time_position":["m", "d", "h"], "process_type":[], "ordinary":false, "option":""}
12 | {"pattern":"月ǂ日ǂ時ǂ分", "corresponding_time_position":["m", "d", "h", "mn"], "process_type":[], "ordinary":false, "option":""}
13 | {"pattern":"月ǂ日ǂ時ǂ分ǂ秒", "corresponding_time_position":["m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""}
14 | {"pattern":"月ǂ日ǂ：ǂ", "corresponding_time_position":["m", "d", "h", "mn"], "process_type":[], "ordinary":false, "option":""}
15 | {"pattern":"日", "corresponding_time_position":["d"], "process_type":[], "ordinary":false, "option":""}
16 | {"pattern":"日ǂ時", "corresponding_time_position":["d", "h"], "process_type":[], "ordinary":false, "option":""}
17 | {"pattern":"日ǂ時ǂ分", "corresponding_time_position":["d", "h", "mn"], "process_type":[], "ordinary":false, "option":""}
18 | {"pattern":"日ǂ時ǂ分ǂ秒", "corresponding_time_position":["d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""}
19 | {"pattern":"日ǂ：ǂ", "corresponding_time_position":["d", "h", "mn"], "process_type":[], "ordinary":false, "option":""}
20 | {"pattern":"時", "corresponding_time_position":["h"], "process_type":[], "ordinary":false, "option":""}
21 | {"pattern":"時ǂ分", "corresponding_time_position":["h", "mn"], "process_type":[], "ordinary":false, "option":""}
22 | {"pattern":"時ǂ分ǂ秒", "corresponding_time_position":["h", "mn", "s"], "process_type":[], "ordinary":false, "option":""}
23 | {"pattern":"分ǂ秒", "corresponding_time_position":["mn", "s"], "process_type":[], "ordinary":false, "option":""}
24 | {"pattern":"/ǂ", "corresponding_time_position":["m", "d"], "process_type":[], "ordinary":false, "option":""}
25 | {"pattern":"/ǂ/ǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""}
26 | {"pattern":"／ǂ", "corresponding_time_position":["m", "d"], "process_type":[], "ordinary":false, "option":""}
27 | {"pattern":"／ǂ／ǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""}
28 | {"pattern":":ǂ", "corresponding_time_position":["h", "mn"], "process_type":[], "ordinary":false, "option":""}
29 | {"pattern":"：ǂ", "corresponding_time_position":["h", "mn"], "process_type":[], "ordinary":false, "option":""}
30 | {"pattern":":ǂ:ǂ", "corresponding_time_position":["h", "mn", "s"], "process_type":[], "ordinary":false, "option":""}
31 | {"pattern":"：ǂ：ǂ", "corresponding_time_position":["h", "mn", "s"], "process_type":[], "ordinary":false, "option":""}
32 | {"pattern":"年ǂ月ǂ日午前ǂ時", "corresponding_time_position":["y", "m", "d", "h"], "process_type":[], "ordinary":false, "option":""}
33 | {"pattern":"年ǂ月ǂ日午前ǂ時ǂ分", "corresponding_time_position":["y", "m", "d", "h", "mn"], "process_type":[], "ordinary":false, "option":""}
34 | {"pattern":"年ǂ月ǂ日午前ǂ時ǂ分ǂ秒", "corresponding_time_position":["y", "m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""}
35 | {"pattern":"月ǂ日午前ǂ時", "corresponding_time_position":["m", "d", "h"], "process_type":[], "ordinary":false, "option":""}
36 | {"pattern":"月ǂ日午前ǂ時ǂ分", "corresponding_time_position":["m", "d", "h", "mn"], "process_type":[], "ordinary":false, "option":""}
37 | {"pattern":"月ǂ日午前ǂ時ǂ分ǂ秒", "corresponding_time_position":["m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""}
38 | {"pattern":"日午前ǂ時", "corresponding_time_position":["d", "h"], "process_type":[], "ordinary":false, "option":""}
39 | {"pattern":"日午前ǂ時ǂ分", "corresponding_time_position":["d", "h", "mn"], "process_type":[], "ordinary":false, "option":""}
40 | {"pattern":"日午前ǂ時ǂ分ǂ秒", "corresponding_time_position":["d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""}
41 | {"pattern":"-ǂ-ǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""}
42 | {"pattern":"−ǂ−ǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""}
43 | {"pattern":"ーǂーǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""}
44 | {"pattern":"ǂ／ǂ／ǂ　ǂ：ǂ：ǂ", "corresponding_time_position":["y", "m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""}
45 | {"pattern":"ǂ／ǂ／ǂ　ǂ：ǂ：ǂ", "corresponding_time_position":["y", "m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""}
46 | {"pattern":".ǂ.ǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""}
47 | {"pattern":".ǂ", "corresponding_time_position":["y", "m"], "process_type":[], "ordinary":false, "option":""}
48 | {"pattern":"年ǂ月ǂ日午後ǂ時", "corresponding_time_position":["y", "m", "d", "h"], "process_type":["gogo"], "ordinary":false, "option":""}
49 | {"pattern":"年ǂ月ǂ日午後ǂ時ǂ分", "corresponding_time_position":["y", "m", "d", "h", "mn"], "process_type":["gogo"], "ordinary":false, "option":""}
50 | {"pattern":"年ǂ月ǂ日午後ǂ時ǂ分ǂ秒", "corresponding_time_position":["y", "m", "d", "h", "mn", "s"], "process_type":["gogo"], "ordinary":false, "option":""}
51 | {"pattern":"月ǂ日午後ǂ時", "corresponding_time_position":["m", "d", "h"], "process_type":["gogo"], "ordinary":false, "option":""}
52 | {"pattern":"月ǂ日午後ǂ時ǂ分", "corresponding_time_position":["m", "d", "h", "mn"], "process_type":["gogo"], "ordinary":false, "option":""}
53 | {"pattern":"月ǂ日午後ǂ時ǂ分ǂ秒", "corresponding_time_position":["m", "d", "h", "mn", "s"], "process_type":["gogo"], "ordinary":false, "option":""}
54 | {"pattern":"日午後ǂ時", "corresponding_time_position":["d", "h"], "process_type":["gogo"], "ordinary":false, "option":""}
55 | {"pattern":"日午後ǂ時ǂ分", "corresponding_time_position":["d", "h", "mn"], "process_type":["gogo"], "ordinary":false, "option":""}
56 | {"pattern":"日午後ǂ時ǂ分ǂ秒", "corresponding_time_position":["d", "h", "mn", "s"], "process_type":["gogo"], "ordinary":false, "option":""}
57 | {"pattern":"年ǂ月ǂ日ǂ時半", "corresponding_time_position":["y", "m", "d", "h"], "process_type":["han"], "ordinary":false, "option":""}
58 | {"pattern":"年ǂ月ǂ日午前ǂ時半", "corresponding_time_position":["y", "m", "d", "h"], "process_type":["han"], "ordinary":false, "option":""}
59 | {"pattern":"年ǂ月ǂ日午後ǂ時半", "corresponding_time_position":["y", "m", "d", "h"], "process_type":["gogo", "han"], "ordinary":false, "option":""}
60 | {"pattern":"月ǂ日ǂ時半", "corresponding_time_position":["m", "d", "h"], "process_type":["han"], "ordinary":false, "option":""}
61 | {"pattern":"月ǂ日午前ǂ時半", "corresponding_time_position":["m", "d", "h"], "process_type":["han"], "ordinary":false, "option":""}
62 | {"pattern":"月ǂ日午後ǂ時半", "corresponding_time_position":["m", "d", "h"], "process_type":["gogo", "han"], "ordinary":false, "option":""}
63 | {"pattern":"日ǂ時半", "corresponding_time_position":["d", "h"], "process_type":["han"], "ordinary":false, "option":""}
64 | {"pattern":"日午前ǂ時半", "corresponding_time_position":["d", "h"], "process_type":["han"], "ordinary":false, "option":""}
65 | {"pattern":"日午後ǂ時半", "corresponding_time_position":["d", "h"], "process_type":["gogo", "han"], "ordinary":false, "option":""}
66 | {"pattern":"時半", "corresponding_time_position":["h"], "process_type":["han"], "ordinary":false, "option":""}
67 | 


--------------------------------------------------------------------------------
/src/dic/zh/chinese_character.txt:
--------------------------------------------------------------------------------
 1 | {"character":"〇", "value":0, "NotationType":"09"}
 2 | {"character":"一", "value":1, "NotationType":"09"}
 3 | {"character":"二", "value":2, "NotationType":"09"}
 4 | {"character":"三", "value":3, "NotationType":"09"}
 5 | {"character":"四", "value":4, "NotationType":"09"}
 6 | {"character":"五", "value":5, "NotationType":"09"}
 7 | {"character":"六", "value":6, "NotationType":"09"}
 8 | {"character":"七", "value":7, "NotationType":"09"}
 9 | {"character":"八", "value":8, "NotationType":"09"}
10 | {"character":"九", "value":9, "NotationType":"09"}
11 | {"character":"十", "value":1, "NotationType":"sen"}
12 | {"character":"百", "value":2, "NotationType":"sen"}
13 | {"character":"千", "value":3, "NotationType":"sen"}
14 | {"character":"万", "value":4, "NotationType":"man"}
15 | {"character":"億", "value":8, "NotationType":"man"}
16 | {"character":"兆", "value":12, "NotationType":"man"}
17 | {"character":"京", "value":16, "NotationType":"man"}
18 | {"character":"零", "value":0, "NotationType":"09"}
19 | {"character":"壹", "value":1, "NotationType":"09"}
20 | {"character":"贰", "value":2, "NotationType":"09"}
21 | {"character":"两", "value":2, "NotationType":"09"}
22 | {"character":"叁", "value":3, "NotationType":"09"}
23 | {"character":"肆", "value":4, "NotationType":"09"}
24 | {"character":"伍", "value":5, "NotationType":"09"}
25 | {"character":"陆", "value":6, "NotationType":"09"}
26 | {"character":"柒", "value":7, "NotationType":"09"}
27 | {"character":"捌", "value":8, "NotationType":"09"}
28 | {"character":"玖", "value":9, "NotationType":"09"}
29 | {"character":"拾", "value":1, "NotationType":"sen"}
30 | {"character":"佰", "value":2, "NotationType":"sen"}
31 | {"character":"仟", "value":3, "NotationType":"sen"}
32 | {"character":"萬", "value":4, "NotationType":"man"}
33 | {"character":"亿", "value":8, "NotationType":"man"}


--------------------------------------------------------------------------------
/src/dic/zh/num_counter_json.txt:
--------------------------------------------------------------------------------
1 | {"pattern":"元", "counter":"元", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":""}
2 | {"pattern":"美元", "counter":"美元", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":""}
3 | {"pattern":"円", "counter":"円", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":""}


--------------------------------------------------------------------------------
/src/dictionary_dirpath.cpp:
--------------------------------------------------------------------------------
1 | 
2 | #include "dictionary_dirpath.hpp"
namespace dictionary_dirpath {

// Returns the hard-coded directory path under which the dictionary files are
// looked up. The returned path ends with a trailing slash.
std::string get_dictionary_dirpath() {
  const std::string dictionary_directory("/usr/local/lib/normalizeNumexp/dic/");
  return dictionary_directory;
}

}  // namespace dictionary_dirpath


--------------------------------------------------------------------------------
/src/dictionary_dirpath.hpp:
--------------------------------------------------------------------------------
1 | #ifndef DICTIONARY_DIRPATH_H_
2 | #define DICTIONARY_DIRPATH_H_
3 | #include <string>
4 | 
namespace dictionary_dirpath {
// Returns the path of the directory that contains the dictionary files.
std::string get_dictionary_dirpath();
}
8 | 
9 | #endif //DICTIONARY_DIRPATH_H_


--------------------------------------------------------------------------------
/src/digit_utility.cpp:
--------------------------------------------------------------------------------
  1 | #include "digit_utility.hpp"
  2 | #include "dictionary_dirpath.hpp"
  3 | #include <pficommon/text/json.h>
  4 | //debug
  5 | namespace digit_utility {
  6 | 
// Lookup tables filled by init_kansuji(); every key is the "character" field
// of a dictionary record.
// Character -> notation type flag.
std::map<std::string, ENotationType> string_to_notation_type;
// Character whose notation type is KANSUJI_09 -> its dictionary "value".
std::map<std::string, int> kansuji_09_to_value;
// Character whose notation type is KANSUJI_KURAI_SEN or KANSUJI_KURAI_MAN -> its dictionary "value".
std::map<std::string, int> kansuji_kurai_to_power_value;
 10 | 
 11 | struct ChineseCharacter {
 12 |   template <class Archive>
 13 |   void serialize(Archive &ar){
 14 |     ar & MEMBER(character) & MEMBER(NotationType) & MEMBER(value);
 15 |   }
 16 |   
 17 |   std::string character, NotationType;
 18 |   int value;
 19 | };
 20 | 
 21 | void load_json_from_file(const std::string& filepath, pfi::text::json::json& js) {
 22 |   std::ifstream in(filepath.c_str());
 23 |   pfi::text::json::json_parser parser(in);
 24 |   try {
 25 |     while (true) {
 26 |       js.add(parser.parse());
 27 |     }
 28 |   } catch (const pfi::lang::end_of_data&) {
 29 |   }
 30 | }
 31 | 
 32 | template <class T>
 33 | void load_from_dictionary(const std::string& dictionary_path, std::vector<T>& load_target) {
 34 |   load_target.clear();
 35 |   pfi::text::json::json js = pfi::text::json::json(new pfi::text::json::json_array());
 36 |   load_json_from_file(dictionary_path, js);
 37 |   pfi::text::json::from_json(js, load_target);
 38 | }
 39 | 
 40 | void init_kansuji(const std::string& language){
 41 |   std::vector<ChineseCharacter> chinese_characters;
 42 |   std::string dictionary_path;
 43 |   dictionary_path += dictionary_dirpath::get_dictionary_dirpath();
 44 |   if(language == "ja"){
 45 |     dictionary_path += "ja/chinese_character.txt";
 46 |   }else if (language == "zh"){
 47 |     dictionary_path += "zh/chinese_character.txt";
 48 |   }else {
 49 |     return;
 50 |   }
 51 |   load_from_dictionary(dictionary_path, chinese_characters);
 52 |   for(int i=0; i<static_cast<int>(chinese_characters.size()); i++){
 53 |     ENotationType notation_type = NOT_NUMBER;
 54 |     if(chinese_characters[i].NotationType == "09") notation_type = KANSUJI_09;
 55 |     else if(chinese_characters[i].NotationType == "sen") notation_type = KANSUJI_KURAI_SEN;
 56 |     else if(chinese_characters[i].NotationType == "man") notation_type = KANSUJI_KURAI_MAN;
 57 |     string_to_notation_type[chinese_characters[i].character] = notation_type;
 58 |     if(notation_type == KANSUJI_09) kansuji_09_to_value[chinese_characters[i].character] = chinese_characters[i].value;
 59 |     else if(notation_type == KANSUJI_KURAI_MAN || notation_type == KANSUJI_KURAI_SEN) kansuji_kurai_to_power_value[chinese_characters[i].character] = chinese_characters[i].value;
 60 |   }
 61 |   kansuji_kurai_to_power_value["　"] = 0;
 62 | }
 63 |   
 64 | bool is_hankakusuji(const pfi::data::string::uchar uc) {
 65 |   return (pfi::data::string::string_to_uchar("0") <= uc
 66 |       && uc <= pfi::data::string::string_to_uchar("9"));
 67 | }
 68 | 
 69 | bool is_zenkakusuji(const pfi::data::string::uchar uc) {
 70 |   return (pfi::data::string::string_to_uchar("０") <= uc
 71 |       && uc <= pfi::data::string::string_to_uchar("９"));
 72 | }
 73 | 
 74 | bool is_arabic(const pfi::data::string::uchar uc) {
 75 |   return (is_hankakusuji(uc) || is_zenkakusuji(uc));
 76 | }
 77 | 
 78 | bool is_notation_type(const pfi::data::string::uchar uc, ENotationType NOTATION_TYPE) {
 79 |   std::map<std::string, ENotationType>::const_iterator itr =
 80 |       string_to_notation_type.find(pfi::data::string::uchar_to_string(uc));
 81 |   if (itr == string_to_notation_type.end())
 82 |     return 0;
 83 |   return (itr->second) & NOTATION_TYPE;
 84 | }
 85 | 
 86 | bool is_kansuji(const pfi::data::string::uchar uc) {
 87 |   return is_notation_type(uc, KANSUJI);
 88 | }
 89 | 
 90 | bool is_kansuji_09(const pfi::data::string::uchar uc) {
 91 |   return is_notation_type(uc, KANSUJI_09);
 92 | }
 93 | 
 94 | bool is_kansuji_kurai_sen(const pfi::data::string::uchar uc) {
 95 |   return is_notation_type(uc, KANSUJI_KURAI_SEN);
 96 | }
 97 | 
 98 | bool is_kansuji_kurai_man(const pfi::data::string::uchar uc) {
 99 |   return is_notation_type(uc, KANSUJI_KURAI_MAN);
100 | }
101 | 
102 | bool is_kansuji_kurai(const pfi::data::string::uchar uc) {
103 |   return is_notation_type(uc, KANSUJI_KURAI);
104 | }
105 | 
106 | bool is_comma(const pfi::data::string::uchar uc) {
107 |   std::string str = pfi::data::string::uchar_to_string(uc);
108 |   return (str == "," || str == "、" || str == "，");
109 | }
110 | 
111 | bool is_decimal_point(const pfi::data::string::ustring& ustr) {
112 |   std::string str = pfi::data::string::ustring_to_string(ustr);
113 |   return (str == "." || str == "・" || str == "．");
114 | }
115 | 
116 | bool is_range_expression(const pfi::data::string::ustring& ustr) {
117 |   std::string str = pfi::data::string::ustring_to_string(ustr);
118 |   return (str == "~" || str == "〜" || str == "～" || str == "-" || str == "−" || str == "ー" || str == "―" || str == "から");
119 | }
120 | 
121 | bool is_number(const pfi::data::string::uchar uc) {
122 |   return is_hankakusuji(uc) or is_zenkakusuji(uc) or is_kansuji(uc);
123 | }
124 | 
125 | int convert_kansuji_09_to_value(const pfi::data::string::uchar uc) {
126 |   std::string str = pfi::data::string::uchar_to_string(uc);
127 |   std::map<std::string, int>::const_iterator itr = kansuji_09_to_value.find(str);
128 |   if (itr == kansuji_09_to_value.end()) {
129 |     //例外処理。どうする？
130 |     throw "Exception : is not kansuji09";
131 |   }
132 |   return (itr->second);
133 | }
134 | 
135 | int convert_kansuji_kurai_to_power_value(const pfi::data::string::uchar uc) {
136 |   std::string str = pfi::data::string::uchar_to_string(uc);
137 |   std::map<std::string, int>::const_iterator itr = kansuji_kurai_to_power_value.find(str);
138 |   if (itr == kansuji_kurai_to_power_value.end()) {
139 |     //例外処理。どうする？
140 |     throw "Exception : is not kansuji_kurai";
141 |   }
142 |   return (itr->second);
143 | }
144 | } //namespace digit_utility
145 | 


--------------------------------------------------------------------------------
/src/digit_utility.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef DIGIT_UTILITY_H_
 2 | #define DIGIT_UTILITY_H_
 3 | #include <map>
 4 | #include <string>
 5 | #include <float.h>
 6 | #include <complex>
 7 | #include <pficommon/data/string/ustring.h>
 8 | 
 9 | namespace digit_utility {
10 | //const double DOUBLE_NULL = INFINITY;
11 | //std::string dictionary_dirpath("/home/katsuma/src/digit_utils/src/dic/");
12 | 
// Bit flags describing how a character writes a number. The combined values
// KANSUJI_KURAI and KANSUJI allow a single bitwise AND to test a whole group.
enum ENotationType {
  NOT_NUMBER = 0,
  KANSUJI_09 = 1 << 0,
  KANSUJI_KURAI_SEN = 1 << 1,
  KANSUJI_KURAI_MAN = 1 << 2,
  KANSUJI_KURAI = KANSUJI_KURAI_SEN | KANSUJI_KURAI_MAN,
  KANSUJI = KANSUJI_09 | KANSUJI_KURAI,
  ZENKAKU = 1 << 3,
  HANKAKU = 1 << 4
};
23 | 
24 | struct Number {
25 |   Number()
26 |       : original_expression(pfi::data::string::string_to_ustring("")),
27 |         position_start(-1),
28 |         position_end(-1),
29 |         value_lowerbound(INFINITY),
30 |         value_upperbound(-INFINITY),
31 |         notation_type(NOT_NUMBER) {
32 |   }
33 | 
34 |   Number(pfi::data::string::ustring& original_expression, int position_start, int position_end)
35 |       : original_expression(original_expression),
36 |         position_start(position_start),
37 |         position_end(position_end),
38 |         value_lowerbound(INFINITY),
39 |         value_upperbound(-INFINITY),
40 |         notation_type(NOT_NUMBER) {
41 |   }
42 | 
43 |   pfi::data::string::ustring original_expression;
44 |   int position_start;
45 |   int position_end;
46 |   double value_lowerbound;
47 |   double value_upperbound;
48 |   int notation_type;
49 | };
50 | 
// Loads the character tables for `language` ("ja" or "zh"); other values are ignored.
void init_kansuji(const std::string& language);
// Half-width digit "0".."9".
bool is_hankakusuji(pfi::data::string::uchar uc);
// Full-width digit "０".."９".
bool is_zenkakusuji(pfi::data::string::uchar uc);
// Half-width or full-width digit.
bool is_arabic(pfi::data::string::uchar uc);
// Character registered with any KANSUJI notation type.
bool is_kansuji(pfi::data::string::uchar uc);
// Character registered as KANSUJI_09.
bool is_kansuji_09(pfi::data::string::uchar uc);
// Character registered as KANSUJI_KURAI_SEN.
bool is_kansuji_kurai_sen(pfi::data::string::uchar uc);
// Character registered as KANSUJI_KURAI_MAN.
bool is_kansuji_kurai_man(pfi::data::string::uchar uc);
// Character registered as KANSUJI_KURAI_SEN or KANSUJI_KURAI_MAN.
bool is_kansuji_kurai(pfi::data::string::uchar uc);
// Digit (half-width or full-width) or registered kansuji character.
bool is_number(pfi::data::string::uchar uc);
// One of the accepted comma characters.
bool is_comma(pfi::data::string::uchar uc);
// String equal to one of the accepted decimal point characters.
bool is_decimal_point(const pfi::data::string::ustring& ustr);
// String equal to one of the accepted range markers.
bool is_range_expression(const pfi::data::string::ustring& ustr);
// Value registered for a KANSUJI_09 character; throws a string literal when unregistered.
int convert_kansuji_09_to_value(pfi::data::string::uchar uc);
// Value registered for a KANSUJI_KURAI character; throws a string literal when unregistered.
int convert_kansuji_kurai_to_power_value(pfi::data::string::uchar uc);
66 | }
67 | 
68 | #endif //DIGIT_UTILITY_H_
69 | 


--------------------------------------------------------------------------------
/src/digit_utility_test.cpp:
--------------------------------------------------------------------------------
 1 | #include <gtest/gtest.h>
 2 | #include <string>
 3 | #include "digit_utility.hpp"
 4 | 
 5 | #include <pficommon/data/string/ustring.h>
 6 | 
 7 | using namespace digit_utility;
 8 | 
 9 | class DigitUtilityTest: public testing::Test {
10 | public:
11 |   void SetUp() {
12 |     std::string language("ja");
13 |     init_kansuji(language);
14 |   }
15 |   void TearDown() {
16 |   }
17 | };
18 | 
19 | TEST_F(DigitUtilityTest, isHankaku) {
20 |   EXPECT_TRUE(is_hankakusuji(pfi::data::string::string_to_uchar("1")));
21 |   EXPECT_FALSE(is_hankakusuji(pfi::data::string::string_to_uchar("１")));
22 |   EXPECT_FALSE(is_hankakusuji(pfi::data::string::string_to_uchar("一")));
23 |   EXPECT_FALSE(is_hankakusuji(pfi::data::string::string_to_uchar("あ")));
24 | }
25 | 
26 | TEST_F(DigitUtilityTest, isZenkaku) {
27 |   EXPECT_FALSE(is_zenkakusuji(pfi::data::string::string_to_uchar("1")));
28 |   EXPECT_TRUE(is_zenkakusuji(pfi::data::string::string_to_uchar("１")));
29 |   EXPECT_FALSE(is_zenkakusuji(pfi::data::string::string_to_uchar("一")));
30 |   EXPECT_FALSE(is_zenkakusuji(pfi::data::string::string_to_uchar("あ")));
31 | }
32 | 
33 | TEST_F(DigitUtilityTest, isArabic) {
34 |   EXPECT_TRUE(is_arabic(pfi::data::string::string_to_uchar("1")));
35 |   EXPECT_TRUE(is_arabic(pfi::data::string::string_to_uchar("１")));
36 |   EXPECT_FALSE(is_arabic(pfi::data::string::string_to_uchar("一")));
37 |   EXPECT_FALSE(is_arabic(pfi::data::string::string_to_uchar("あ")));
38 | }
39 | 
40 | TEST_F(DigitUtilityTest, isKansuji) {
41 |   EXPECT_FALSE(is_kansuji(pfi::data::string::string_to_uchar("1")));
42 |   EXPECT_FALSE(is_kansuji(pfi::data::string::string_to_uchar("１")));
43 |   EXPECT_TRUE(is_kansuji(pfi::data::string::string_to_uchar("一")));
44 |   EXPECT_FALSE(is_kansuji(pfi::data::string::string_to_uchar("あ")));
45 | }
46 | 
47 | TEST_F(DigitUtilityTest, isKansuji09) {
48 |   EXPECT_FALSE(is_kansuji_09(pfi::data::string::string_to_uchar("1")));
49 |   EXPECT_FALSE(is_kansuji_09(pfi::data::string::string_to_uchar("１")));
50 |   EXPECT_TRUE(is_kansuji_09(pfi::data::string::string_to_uchar("一")));
51 |   EXPECT_FALSE(is_kansuji_09(pfi::data::string::string_to_uchar("十")));
52 |   EXPECT_FALSE(is_kansuji_09(pfi::data::string::string_to_uchar("万")));
53 |   EXPECT_FALSE(is_kansuji_09(pfi::data::string::string_to_uchar("あ")));
54 | }
55 | 
56 | TEST_F(DigitUtilityTest, isKansujiKuraiSen) {
57 |   EXPECT_FALSE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("1")));
58 |   EXPECT_FALSE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("１")));
59 |   EXPECT_FALSE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("一")));
60 |   EXPECT_TRUE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("十")));
61 |   EXPECT_TRUE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("百")));
62 |   EXPECT_TRUE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("千")));
63 |   EXPECT_FALSE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("万")));
64 |   EXPECT_FALSE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("あ")));
65 | }
66 | 
67 | TEST_F(DigitUtilityTest, isKansujiKuraiMan) {
68 |   EXPECT_FALSE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("1")));
69 |   EXPECT_FALSE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("１")));
70 |   EXPECT_FALSE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("一")));
71 |   EXPECT_FALSE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("十")));
72 |   EXPECT_TRUE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("万")));
73 |   EXPECT_TRUE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("億")));
74 |   EXPECT_TRUE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("兆")));
75 |   EXPECT_FALSE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("あ")));
76 | }
77 | 
78 | TEST_F(DigitUtilityTest, isKansujiKurai) {
79 |   EXPECT_FALSE(is_kansuji_kurai(pfi::data::string::string_to_uchar("1")));
80 |   EXPECT_FALSE(is_kansuji_kurai(pfi::data::string::string_to_uchar("１")));
81 |   EXPECT_FALSE(is_kansuji_kurai(pfi::data::string::string_to_uchar("一")));
82 |   EXPECT_TRUE(is_kansuji_kurai(pfi::data::string::string_to_uchar("十")));
83 |   EXPECT_TRUE(is_kansuji_kurai(pfi::data::string::string_to_uchar("万")));
84 |   EXPECT_FALSE(is_kansuji_kurai(pfi::data::string::string_to_uchar("あ")));
85 | }
86 | 
87 | TEST_F(DigitUtilityTest, isNumber) {
88 |   EXPECT_TRUE(is_number(pfi::data::string::string_to_uchar("1")));
89 |   EXPECT_TRUE(is_number(pfi::data::string::string_to_uchar("１")));
90 |   EXPECT_TRUE(is_number(pfi::data::string::string_to_uchar("一")));
91 |   EXPECT_TRUE(is_number(pfi::data::string::string_to_uchar("十")));
92 |   EXPECT_TRUE(is_number(pfi::data::string::string_to_uchar("万")));
93 |   EXPECT_FALSE(is_number(pfi::data::string::string_to_uchar("あ")));
94 | }
95 | 


--------------------------------------------------------------------------------
/src/duration_expression_normalizer.cpp:
--------------------------------------------------------------------------------
  1 | #include <string>
  2 | #include <math.h>
  3 | #include "duration_expression_normalizer.hpp"
  4 | #include "digit_utility.hpp"
  5 | #include "number_normalizer.hpp"
  6 | 
  7 | namespace duration_expression_normalizer {
  8 | 
  9 | void DurationExpressionNormalizer::init() {
 10 | 	load_from_dictionaries("duration_expression_json.txt", "duration_prefix_counter_json.txt", "duration_prefix_json.txt", "duration_suffix_json.txt");
 11 | }
 12 | 
 13 | void DurationExpressionNormalizer::normalize_number(const std::string& text, std::vector<digit_utility::Number>& numbers) {
 14 | 	NN.process(text, numbers);
 15 | }
 16 | 
 17 | void set_time(DurationExpression& durationexp, const std::string& corresponding_time_position, const DurationExpression& integrate_durationexp) {
 18 | 	if (corresponding_time_position == "y") {
 19 | 		durationexp.value_lowerbound.year = integrate_durationexp.org_value_lowerbound;
 20 | 		durationexp.value_upperbound.year = integrate_durationexp.org_value_upperbound;
 21 | 	} else if (corresponding_time_position == "m") {
 22 | 		durationexp.value_lowerbound.month = integrate_durationexp.org_value_lowerbound;
 23 | 		durationexp.value_upperbound.month = integrate_durationexp.org_value_upperbound;
 24 | 	} else if (corresponding_time_position == "d") {
 25 | 		durationexp.value_lowerbound.day = integrate_durationexp.org_value_lowerbound;
 26 | 		durationexp.value_upperbound.day = integrate_durationexp.org_value_upperbound;
 27 | 	} else if (corresponding_time_position == "h") {
 28 | 		durationexp.value_lowerbound.hour = integrate_durationexp.org_value_lowerbound;
 29 | 		durationexp.value_upperbound.hour = integrate_durationexp.org_value_upperbound;
 30 | 	} else if (corresponding_time_position == "mn") {
 31 | 		durationexp.value_lowerbound.minute = integrate_durationexp.org_value_lowerbound;
 32 | 		durationexp.value_upperbound.minute = integrate_durationexp.org_value_upperbound;
 33 | 	} else if (corresponding_time_position == "s") {
 34 | 		durationexp.value_lowerbound.second = integrate_durationexp.org_value_lowerbound;
 35 | 		durationexp.value_upperbound.second = integrate_durationexp.org_value_upperbound;
 36 | 	} else if (corresponding_time_position == "seiki") {
 37 | 		durationexp.value_lowerbound.year = integrate_durationexp.org_value_lowerbound*100;
 38 | 		durationexp.value_upperbound.year = integrate_durationexp.org_value_upperbound*100;
 39 | 	} else if (corresponding_time_position == "w") {
 40 | 		durationexp.value_lowerbound.day = integrate_durationexp.org_value_lowerbound*7;
 41 | 		durationexp.value_upperbound.day = integrate_durationexp.org_value_upperbound*7;
 42 | 	}
 43 | }
 44 | 
 45 | void do_option_han(DurationExpression& durationexp, const std::string& corresponding_time_position){
 46 | 	if (corresponding_time_position == "y") {
 47 | 		durationexp.value_lowerbound.year += 0.5;
 48 | 		durationexp.value_upperbound.year += 0.5;
 49 | 	} else if (corresponding_time_position == "m") {
 50 | 		durationexp.value_lowerbound.month += 0.5;
 51 | 		durationexp.value_upperbound.month += 0.5;
 52 | 	} else if (corresponding_time_position == "d") {
 53 | 		durationexp.value_lowerbound.day += 0.5;
 54 | 		durationexp.value_upperbound.day += 0.5;
 55 | 	} else if (corresponding_time_position == "h") {
 56 | 		durationexp.value_lowerbound.hour += 0.5;
 57 | 		durationexp.value_upperbound.hour += 0.5;
 58 | 	} else if (corresponding_time_position == "mn") {
 59 | 		durationexp.value_lowerbound.minute += 0.5;
 60 | 		durationexp.value_upperbound.minute += 0.5;
 61 | 	} else if (corresponding_time_position == "s") {
 62 | 		durationexp.value_lowerbound.second += 0.5;
 63 | 		durationexp.value_upperbound.second += 0.5;
 64 | 	} else if (corresponding_time_position == "seiki") {
 65 | 		durationexp.value_lowerbound.year += 50;
 66 | 		durationexp.value_upperbound.year += 50;
 67 | 	}
 68 | }
 69 | 
 70 | void revise_durationexp_by_process_type(DurationExpression& durationexp, std::string process_type, const LimitedDurationExpression& matching_limited_duration_expression) {
 71 | 	if (process_type == "han") {
 72 | 		if(matching_limited_duration_expression.corresponding_time_position.empty()) return;
 73 | 		std::string corresponding_time_position = matching_limited_duration_expression.corresponding_time_position[matching_limited_duration_expression.corresponding_time_position.size()-1];
 74 | 		do_option_han(durationexp, corresponding_time_position);
 75 | 	}
 76 | }
 77 | 
 78 | void DurationExpressionNormalizer::revise_any_type_expression_by_matching_limited_expression(std::vector<DurationExpression>& durationexps, int &expression_id,
 79 | 																																														const LimitedDurationExpression matching_limited_duration_expression) {
 80 | 	int final_integrated_durationexp_id = expression_id + matching_limited_duration_expression.total_number_of_place_holder;
 81 | 	durationexps[expression_id].position_end = durationexps[final_integrated_durationexp_id].position_end
 82 | 	+ matching_limited_duration_expression.length_of_strings_after_final_place_holder;
 83 | 	for (int i = 0; i < static_cast<int>(matching_limited_duration_expression.corresponding_time_position.size()); i++) {
 84 | 		set_time(durationexps[expression_id], matching_limited_duration_expression.corresponding_time_position[i], durationexps[expression_id + i]);
 85 | 	}
 86 | 	for (int i = 0; i < static_cast<int>(matching_limited_duration_expression.process_type.size()); i++) {
 87 | 		revise_durationexp_by_process_type(durationexps[expression_id], matching_limited_duration_expression.process_type[i], matching_limited_duration_expression);
 88 | 	}
 89 | 	durationexps[expression_id].ordinary = matching_limited_duration_expression.ordinary;
 90 | 	
 91 | 	durationexps.erase(durationexps.begin() + expression_id + 1,
 92 | 										durationexps.begin() + expression_id + 1 + matching_limited_duration_expression.total_number_of_place_holder);
 93 | }
 94 | 
 95 | void DurationExpressionNormalizer::revise_any_type_expression_by_matching_prefix_counter(DurationExpression& any_type_expression, const LimitedDurationExpression& matching_limited_expression) {} //持続時間にprefix_counterは存在しない（今のところ）
 96 | 
 97 | 
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 	
112 | 
113 | 
114 | 
115 | /*
116 |  　修飾語による規格化表現の補正処理。
117 |  */
118 | 
119 | void do_time_about(DurationExpression& durationexp) {
120 | 	normalizer_utility::Time &tvl = durationexp.value_lowerbound, &tvu = durationexp.value_upperbound;
121 | 	const std::string target_time_position = normalizer_utility::identify_time_detail(durationexp.value_lowerbound);
122 | 	if (target_time_position == "y") {
123 | 		tvl.year -= 5;
124 | 		tvu.year += 5;
125 | 	} else if (target_time_position == "m") {
126 | 		tvl.month -= 1;
127 | 		tvu.month += 1;
128 | 	} else if (target_time_position == "d") {
129 | 		tvl.day -= 1;
130 | 		tvu.day += 1;
131 | 	} else if (target_time_position == "h") {
132 | 		tvl.hour -= 1;
133 | 		tvu.hour += 1;
134 | 	} else if (target_time_position == "mn") {
135 | 		tvl.minute -= 5;
136 | 		tvu.minute += 5;
137 | 	} else if (target_time_position == "s") {
138 | 		tvl.second -= 5;
139 | 		tvu.second += 5;
140 | 	}
141 | }
142 | 
143 | 
144 | void do_time_kyou(DurationExpression& durationexp) {
145 | 	normalizer_utility::Time &tvu = durationexp.value_upperbound;
146 | 	const std::string target_time_position = normalizer_utility::identify_time_detail(durationexp.value_lowerbound);
147 | 	if (target_time_position == "y") {
148 | 		tvu.year += 5;
149 | 	} else if (target_time_position == "m") {
150 | 		tvu.month += 1;
151 | 	} else if (target_time_position == "d") {
152 | 		tvu.day += 1;
153 | 	} else if (target_time_position == "h") {
154 | 		tvu.hour += 1;
155 | 	} else if (target_time_position == "mn") {
156 | 		tvu.minute += 5;
157 | 	} else if (target_time_position == "s") {
158 | 		tvu.second += 5;
159 | 	}
160 | }
161 | 
162 | 
163 | void do_time_jaku(DurationExpression& durationexp) {
164 | 	normalizer_utility::Time &tvl = durationexp.value_lowerbound;
165 | 	const std::string target_time_position = normalizer_utility::identify_time_detail(durationexp.value_lowerbound);
166 | 	if (target_time_position == "y") {
167 | 		tvl.year -= 5;
168 | 	} else if (target_time_position == "m") {
169 | 		tvl.month -= 1;
170 | 	} else if (target_time_position == "d") {
171 | 		tvl.day -= 1;
172 | 	} else if (target_time_position == "h") {
173 | 		tvl.hour -= 1;
174 | 	} else if (target_time_position == "mn") {
175 | 		tvl.minute -= 5;
176 | 	} else if (target_time_position == "s") {
177 | 		tvl.second -= 5;
178 | 	}
179 | }	
180 | 
181 | 
182 | void DurationExpressionNormalizer::revise_any_type_expression_by_number_modifier(DurationExpression& durationexp,
183 | 																																								const normalizer_utility::NumberModifier& number_modifier) {
184 | 	std::string process_type = number_modifier.process_type;
185 | 	if (process_type == "or_over") {
186 | 		durationexp.value_upperbound = normalizer_utility::Time(INFINITY);
187 | 	} else if (process_type == "or_less") {
188 | 		durationexp.value_lowerbound = normalizer_utility::Time(-INFINITY);
189 | 	} else if (process_type == "over") {
190 | 		durationexp.value_upperbound = normalizer_utility::Time(INFINITY);
191 | 		durationexp.include_lowerbound = false;
192 | 	} else if (process_type == "less") {
193 | 		durationexp.value_lowerbound = normalizer_utility::Time(-INFINITY);
194 | 		durationexp.include_upperbound = false;
195 | 	} else if (process_type == "ordinary") { //TODO : 序数は絶対時間として扱う？持続時間として扱う？　未定
196 | 		durationexp.ordinary = true;
197 | 	} else if (process_type == "none") {
198 | 		;
199 | 	} else if (process_type == "per") {
200 | 		// TODO : 「1日毎」など? どんな処理をするか未定。
201 | 	} else if (process_type == "dai") {
202 | 		// TODO : 「1秒台」など。　どんな処理をするか未定。　これは持続時間？（ではなさそう）
203 | 	} else if (process_type == "about") {
204 | 		do_time_about(durationexp);
205 | 	} else if (process_type == "kyou") {
206 | 		do_time_kyou(durationexp);
207 | 	} else if (process_type == "jaku") {
208 | 		do_time_jaku(durationexp);				
209 | 	} else if (process_type == "made") {
210 | 		if(durationexp.value_lowerbound == durationexp.value_upperbound){
211 | 			durationexp.value_lowerbound = normalizer_utility::Time(-INFINITY);
212 | 		} else{
213 | 			
214 | 		}
215 | 	} else {
216 | 		durationexp.options.push_back(process_type);
217 | 	}
218 | 	
219 | }
220 | 
221 | void DurationExpressionNormalizer::delete_not_any_type_expression(std::vector<DurationExpression>& durationexps){
222 | 	for(int i=0; i<static_cast<int>(durationexps.size()); i++){
223 | 		if(normalizer_utility::is_null_time(durationexps[i].value_lowerbound) && normalizer_utility::is_null_time(durationexps[i].value_upperbound)){
224 | 			durationexps.erase(durationexps.begin() + i);
225 | 			i--;
226 | 		}
227 | 	}
228 | }
229 | 
230 | void DurationExpressionNormalizer::fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector<DurationExpression>& durationexps) {
231 | 	for(int i=0; i<static_cast<int>(durationexps.size()-1); i++){
232 | 		if(have_kara_suffix(durationexps[i].options) && have_kara_prefix(durationexps[i+1].options) && durationexps[i].position_end +2 >= durationexps[i+1].position_start){
233 | 			durationexps[i].value_upperbound = durationexps[i+1].value_upperbound;
234 | 			durationexps[i].position_end = durationexps[i+1].position_end;
235 | 			durationexps[i].set_original_expression_from_position(utext);
236 | 			merge_options(durationexps[i].options, durationexps[i+1].options);
237 | 			durationexps.erase(durationexps.begin()+i+1);
238 | 		}
239 | 	}
240 | }
241 | } //namespace duration_expression_normalizer
242 | 
243 | 


--------------------------------------------------------------------------------
/src/duration_expression_normalizer.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef DURATION_EXPRESSION_NORMALIZER_H_
 2 | #define DURATION_EXPRESSION_NORMALIZER_H_
 3 | #include <string>
 4 | #include "digit_utility.hpp"
 5 | #include "number_normalizer.hpp"
 6 | #include "normalizer_utility.hpp"
 7 | #include "normalizer_template.hpp"
 8 | #include <ux/ux.hpp>
 9 | 
10 | namespace duration_expression_normalizer {
11 |   
12 | struct DurationExpression : normalizer_utility::NormalizedExpressionTemplate{
13 |   DurationExpression(digit_utility::Number number)
14 |   : normalizer_utility::NormalizedExpressionTemplate(number.original_expression, number.position_start, number.position_end),
15 |     org_value_lowerbound(number.value_lowerbound),
16 |     org_value_upperbound(number.value_upperbound),
17 |     value_lowerbound(normalizer_utility::Time(INFINITY)),
18 |     value_upperbound(normalizer_utility::Time(-INFINITY)),
19 |     ordinary(false)
20 |   {}
21 |   
22 |   double org_value_lowerbound, org_value_upperbound;
23 |   normalizer_utility::Time value_lowerbound, value_upperbound;
24 |   bool ordinary;
25 | };
26 | 
27 |   
28 | class LimitedDurationExpression : public normalizer_utility::LimitedExpressionTemplate{
29 | public:
30 |   template <class Archive>
31 |   void serialize(Archive &ar){
32 |     ar & MEMBER(pattern) & MEMBER(corresponding_time_position) & MEMBER(process_type) & MEMBER(ordinary) & MEMBER(option);
33 |   }
34 |   
35 |   std::vector<std::string> corresponding_time_position;
36 |   std::vector<std::string> process_type;
37 | };
38 | 
39 |   
40 | class DurationExpressionNormalizer : public normalizer_template::NormalizerTemplate<DurationExpression, LimitedDurationExpression>{
41 | public:
42 |   DurationExpressionNormalizer(const std::string& language) : NN(language) { language_ = language; init(); }
43 |   
44 | private:
45 |   void init();
46 |   void normalize_number(const std::string& text, std::vector<digit_utility::Number>& numbers);
47 |   void revise_any_type_expression_by_matching_limited_expression(std::vector<DurationExpression>& durationexps, int& expression_id, LimitedDurationExpression matching_limited_duration_expression);
48 |   void revise_any_type_expression_by_matching_prefix_counter(DurationExpression& any_type_expression, const LimitedDurationExpression& matching_limited_expression);
49 |   void revise_any_type_expression_by_number_modifier(DurationExpression& durationexp, const normalizer_utility::NumberModifier& number_modifier);
50 |   void delete_not_any_type_expression(std::vector<DurationExpression>& durationexps);
51 |   void fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector<DurationExpression>& durationexps);
52 |   
53 |   number_normalizer::NumberNormalizer NN;
54 | };
55 |   
56 | } //namespace duration_expression_normalizer
57 | 
58 | #endif //DURATION_EXPRESSION_NORMALIZER_H_
59 | 


--------------------------------------------------------------------------------
/src/duration_expression_normalizer_test.cpp:
--------------------------------------------------------------------------------
  1 | #include <gtest/gtest.h>
  2 | #include <string>
  3 | #include "normalizer_utility.hpp"
  4 | #include "duration_expression_normalizer.hpp"
  5 | 
  6 | #include <pficommon/data/string/ustring.h>
  7 | #include <ux/ux.hpp>
  8 | 
  9 | using namespace normalizer_utility;
 10 | using namespace std;
 11 | using namespace pfi::data::string;
 12 | using namespace duration_expression_normalizer;
 13 | 
 14 | class DurationexpNormalizerTest : public testing::Test {
 15 | public:
 16 |   void SetUp() {}
 17 |   void TearDown() {}
 18 | };
 19 | 
 20 | bool is_same_time(const Time& a, const Time& b){
 21 |   return
 22 |   a.year == b.year &&
 23 |   a.month == b.month &&
 24 |   a.day == b.day &&
 25 |   a.hour == b.hour &&
 26 |   a.minute == b.minute &&
 27 |   a.second == b.second;
 28 | }
 29 | 
 30 | TEST_F(DurationexpNormalizerTest, simple1) {
 31 |   DurationExpressionNormalizer DEN("ja");
 32 |   std::string text("あの人は三時間も耐えた");
 33 |   std::vector<DurationExpression> durationexps;
 34 |   DEN.process(text, durationexps);
 35 |   ASSERT_EQ(1u, durationexps.size());
 36 |   
 37 |   Time ex1_lower(INFINITY, INFINITY, INFINITY, 3, INFINITY, INFINITY);
 38 |   Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 3, -INFINITY, -INFINITY);
 39 |   
 40 |   EXPECT_EQ("三時間", ustring_to_string(durationexps[0].original_expression));
 41 |   EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound));
 42 |   EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound));
 43 | }
 44 | 
 45 | TEST_F(DurationexpNormalizerTest, simple2) {
 46 |   DurationExpressionNormalizer DEN("ja");
 47 |   std::string text("それは3年5ヶ月の間にも");
 48 |   std::vector<DurationExpression> durationexps;
 49 |   DEN.process(text, durationexps);
 50 |   ASSERT_EQ(1u, durationexps.size());
 51 |   Time ex1_lower(3, 5, INFINITY, INFINITY, INFINITY, INFINITY);
 52 |   Time ex1_upper(3, 5, -INFINITY, -INFINITY, -INFINITY, -INFINITY);
 53 |   
 54 |   EXPECT_EQ("3年5ヶ月", ustring_to_string(durationexps[0].original_expression));
 55 |   EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound));
 56 |   EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound));
 57 | }
 58 | 
 59 | TEST_F(DurationexpNormalizerTest, seiki1) {
 60 |   DurationExpressionNormalizer DEN("ja");
 61 |   std::string text("あの人は三世紀も耐えた");
 62 |   std::vector<DurationExpression> durationexps;
 63 |   DEN.process(text, durationexps);
 64 |   ASSERT_EQ(1u, durationexps.size());
 65 |   
 66 |   Time ex1_lower(300, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY);
 67 |   Time ex1_upper(300, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY);
 68 |   
 69 |   EXPECT_EQ("三世紀", ustring_to_string(durationexps[0].original_expression));
 70 |   EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound));
 71 |   EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound));
 72 | }
 73 | 
 74 | TEST_F(DurationexpNormalizerTest, han1) {
 75 |   DurationExpressionNormalizer DEN("ja");
 76 |   std::string text("あの人は三世紀半も耐えた");
 77 |   std::vector<DurationExpression> durationexps;
 78 |   DEN.process(text, durationexps);
 79 |   ASSERT_EQ(1u, durationexps.size());
 80 |   
 81 |   Time ex1_lower(350, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY);
 82 |   Time ex1_upper(350, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY);
 83 |   
 84 |   EXPECT_EQ("三世紀半", ustring_to_string(durationexps[0].original_expression));
 85 |   EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound));
 86 |   EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound));
 87 | }
 88 | 
 89 | TEST_F(DurationexpNormalizerTest, han2) {
 90 |   DurationExpressionNormalizer DEN("ja");
 91 |   std::string text("あの人は三時間半も耐えた");
 92 |   std::vector<DurationExpression> durationexps;
 93 |   DEN.process(text, durationexps);
 94 |   ASSERT_EQ(1u, durationexps.size());
 95 |   
 96 |   Time ex1_lower(INFINITY, INFINITY, INFINITY, 3.5, INFINITY, INFINITY);
 97 |   Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 3.5, -INFINITY, -INFINITY);
 98 |   
 99 |   EXPECT_EQ("三時間半", ustring_to_string(durationexps[0].original_expression));
100 |   EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound));
101 |   EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound));
102 | }
103 | 
104 | TEST_F(DurationexpNormalizerTest, plural1) {
105 |   DurationExpressionNormalizer DEN("ja");
106 |   std::string text("三年間と五ヶ月の間");
107 |   std::vector<DurationExpression> durationexps;
108 |   DEN.process(text, durationexps);
109 |   ASSERT_EQ(2u, durationexps.size());
110 |   
111 |   Time ex1_lower(3, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY);
112 |   Time ex1_upper(3, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY);
113 |   
114 |   EXPECT_EQ("三年間", ustring_to_string(durationexps[0].original_expression));
115 |   EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound));
116 |   EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound));
117 |   
118 |   Time ex2_lower(INFINITY, 5, INFINITY, INFINITY, INFINITY, INFINITY);
119 |   Time ex2_upper(-INFINITY, 5, -INFINITY, -INFINITY, -INFINITY, -INFINITY);
120 |   
121 |   EXPECT_EQ("五ヶ月", ustring_to_string(durationexps[1].original_expression));
122 |   EXPECT_TRUE(is_same_time(ex2_lower, durationexps[1].value_lowerbound));
123 |   EXPECT_TRUE(is_same_time(ex2_upper, durationexps[1].value_upperbound));
124 | }
125 | 
126 | TEST_F(DurationexpNormalizerTest, or_over1) {
127 |   DurationExpressionNormalizer DEN("ja");
128 |   std::string text("あの人は三時間以上も耐えた");
129 |   std::vector<DurationExpression> durationexps;
130 |   DEN.process(text, durationexps);
131 |   ASSERT_EQ(1u, durationexps.size());
132 |   
133 |   Time ex1_lower(INFINITY, INFINITY, INFINITY, 3, INFINITY, INFINITY);
134 |   Time ex1_upper(INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY);
135 |   
136 |   EXPECT_EQ("三時間以上", ustring_to_string(durationexps[0].original_expression));
137 |   EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound));
138 |   EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound));
139 | }
140 | 
141 | TEST_F(DurationexpNormalizerTest, about_suffix) {
142 |   DurationExpressionNormalizer DEN("ja");
143 |   std::string text("あの人は三時間くらいは耐えた");
144 |   std::vector<DurationExpression> durationexps;
145 |   DEN.process(text, durationexps);
146 |   ASSERT_EQ(1u, durationexps.size());
147 |   
148 |   Time ex1_lower(INFINITY, INFINITY, INFINITY, 2, INFINITY, INFINITY);
149 |   Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 4, -INFINITY, -INFINITY);
150 |   
151 |   EXPECT_EQ("三時間くらい", ustring_to_string(durationexps[0].original_expression));
152 |   EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound));
153 |   EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound));
154 | }
155 | 
156 | 
157 | TEST_F(DurationexpNormalizerTest, about_prefix) {
158 |   DurationExpressionNormalizer DEN("ja");
159 |   std::string text("あの人はほぼ三時間は耐えた");
160 |   std::vector<DurationExpression> durationexps;
161 |   DEN.process(text, durationexps);
162 |   ASSERT_EQ(1u, durationexps.size());
163 |   
164 |   Time ex1_lower(INFINITY, INFINITY, INFINITY, 2, INFINITY, INFINITY);
165 |   Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 4, -INFINITY, -INFINITY);
166 |   
167 |   EXPECT_EQ("ほぼ三時間", ustring_to_string(durationexps[0].original_expression));
168 |   EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound));
169 |   EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound));
170 | }
171 | 
172 | 
173 | TEST_F(DurationexpNormalizerTest, kyou) {
174 |   DurationExpressionNormalizer DEN("ja");
175 |   std::string text("あの人は三時間強は耐えた");
176 |   std::vector<DurationExpression> durationexps;
177 |   DEN.process(text, durationexps);
178 |   ASSERT_EQ(1u, durationexps.size());
179 |   
180 |   Time ex1_lower(INFINITY, INFINITY, INFINITY, 3, INFINITY, INFINITY);
181 |   Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 4, -INFINITY, -INFINITY);
182 |   
183 |   EXPECT_EQ("三時間強", ustring_to_string(durationexps[0].original_expression));
184 |   EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound));
185 |   EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound));
186 | }
187 | 
188 | 
189 | TEST_F(DurationexpNormalizerTest, jaku) {
190 |   DurationExpressionNormalizer DEN("ja");
191 |   std::string text("あの人は三時間弱は耐えた");
192 |   std::vector<DurationExpression> durationexps;
193 |   DEN.process(text, durationexps);
194 |   ASSERT_EQ(1u, durationexps.size());
195 |   
196 |   Time ex1_lower(INFINITY, INFINITY, INFINITY, 2, INFINITY, INFINITY);
197 |   Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 3, -INFINITY, -INFINITY);
198 |   
199 |   EXPECT_EQ("三時間弱", ustring_to_string(durationexps[0].original_expression));
200 |   EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound));
201 |   EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound));
202 | }


--------------------------------------------------------------------------------
/src/inappropriate_expression_remover.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef INAPPROPRIATE_EXPRESSION_REMOVER_H_
 2 | #define INAPPROPRIATE_EXPRESSION_REMOVER_H_
 3 | #include "numerical_expression_normalizer.hpp"
 4 | #include "abstime_expression_normalizer.hpp"
 5 | #include "reltime_expression_normalizer.hpp"
 6 | #include "duration_expression_normalizer.hpp"
 7 | 
 8 | namespace inappropriate_expression_remover{
 9 | 	struct InappropriateStrings {
10 | 		template <class Archive>
11 | 		void serialize(Archive &ar){
12 | 			ar & MEMBER(str);
13 | 		}
14 | 		std::string str;
15 | 	};
16 | 	
17 | 	class InappropriateExpressionRemover{
18 | 	public:
19 | 		InappropriateExpressionRemover(const std::string& language);
20 | 		void remove_inappropriate_extraction(const std::string& text,
21 | 																				 std::vector<numerical_expression_normalizer::NumericalExpression>& numexps,
22 | 																				 std::vector<abstime_expression_normalizer::AbstimeExpression>& abstimeexps,
23 | 																				 std::vector<reltime_expression_normalizer::ReltimeExpression>& reltimeexps,
24 | 																				 std::vector<duration_expression_normalizer::DurationExpression>& durationexps);
25 | 	private:
26 | 		template <class AnyTypeExpression>
27 | 		void delete_inappropriate_extraction_using_dictionary_one_type(std::vector<AnyTypeExpression>& any_type_expressions);
28 | 		template <class AnyTypeExpression>
29 | 		bool is_url_strings(const std::string& text, const AnyTypeExpression& any_type_expression);
30 | 		template <class AnyTypeExpression>
31 | 		void delete_url_strings(const std::string& text, std::vector<AnyTypeExpression>& any_type_expressions);
32 | 		void delete_inappropriate_extraction_using_dictionary(const std::string& text,
33 | 																													 std::vector<numerical_expression_normalizer::NumericalExpression>& numexps,
34 | 																													 std::vector<abstime_expression_normalizer::AbstimeExpression>& abstimeexps,
35 | 																													 std::vector<reltime_expression_normalizer::ReltimeExpression>& reltimeexps,
36 | 																													 std::vector<duration_expression_normalizer::DurationExpression>& durationexps);
37 | 		void init_inappropriate_stringss(const std::string& language);
38 | 		void init_url_strings();
39 | 		
40 | 		std::map<std::string, bool> inappropriate_strings_to_bool;
41 | 		std::map<std::string, bool> url_strings_to_bool;
42 | 	};
43 | } //namespace inappropriate_expression_remover.hpp
44 | 
45 | #endif
46 | 
47 | 


--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
 1 | #include "normalize_numexp.hpp"
 2 | #include "optparse.h"
 3 | #include <iostream>
 4 | using namespace normalize_numexp;
 5 | 
 6 | class option : public optparse
 7 | {
 8 | public:
 9 | 	bool help, version, cnt;
10 | 	std::string show;
11 | 	
12 | public:
13 | 	option()
14 | 	: help(false), version(false) {}
15 | 	
16 | 	BEGIN_OPTION_MAP_INLINE()
17 | 	
18 | 	ON_OPTION(SHORTOPT('v') || LONGOPT("version"))
19 | 	version = true;
20 | 	
21 | 	ON_OPTION(SHORTOPT('h') || LONGOPT("help"))
22 | 	help = true;
23 | 	
24 | 	END_OPTION_MAP()
25 | };
26 | 
// Writes the usage/help text to `os`, using `argv0` as the program name.
// Always returns 0.
int usage(std::ostream& os, const char *argv0)
{
	os << "USAGE: " << argv0 << " [OPTIONS]" << std::endl
	   << "This utility normalize (Japanese) numerical and temporal expressions in the input sentence." << std::endl
	   << std::endl
	   << "OPTIONS:" << std::endl
	   << "  -v, --version         show this version information and exit" << std::endl
	   << "  -h, --help            show this help message and exit" << std::endl
	   << std::endl;
	return 0;
}
38 | 
39 | int version(std::ostream& os)
40 | {
41 | 	os << NORMALIZENUMEXP_NAME;
42 | 	os << NORMALIZENUMEXP_VERSION << " ";
43 | 	os << NORMALIZENUMEXP_COPYRIGHT << std::endl;
44 | 	os << std::endl;
45 | 	return 0;
46 | }
47 | 
48 | 
49 | int main(int argc, char * argv[]){
50 | 	option opt;
51 | 	try { 
52 | 		//int arg_used = opt.parse(argv, argc);
53 | 	} catch (const optparse::unrecognized_option& e) {
54 | 		std::cerr << "ERROR: unrecognized option: " << e.what() << std::endl;
55 | 		return 1;
56 | 	} catch (const optparse::invalid_value& e) {
57 | 		std::cerr << "ERROR: " << e.what() << std::endl;
58 | 		return 1;
59 | 	}
60 | 	if(opt.help){
61 | 		usage(std::cerr, argv[0]);
62 | 		return 1;
63 | 	}else if(opt.version){
64 | 		version(std::cerr);
65 | 		return 1;
66 | 	}
67 | 	
68 |   NormalizeNumexp NN("ja");
69 |   std::string sentence;
70 |   std::vector<std::string> result;
71 | 	
72 |   while(1) {
73 |     sentence = "";
74 |     std::getline(std::cin, sentence);
75 |     if(sentence.empty()) break;
76 |     NN.normalize(sentence, result);
77 | 
78 |     //show results
79 |     for(int i=0; i<static_cast<int>(result.size()); i++){
80 |       std::cout << result[i] << std::endl;
81 |     }	
82 |     std::cout << "END" << std::endl;
83 |   }
84 | }
85 | 


--------------------------------------------------------------------------------
/src/normalize_numexp.cpp:
--------------------------------------------------------------------------------
  1 | #include "normalize_numexp.hpp"
  2 | #include <sstream>
  3 | 
  4 | namespace normalize_numexp{
  5 |     
  6 | 	void NormalizeNumexp::normalize_each_type_expressions(const std::string& text,
  7 | 																	   std::vector<numerical_expression_normalizer::NumericalExpression>& numexps,
  8 | 																	   std::vector<abstime_expression_normalizer::AbstimeExpression>& abstimeexps,
  9 | 																	   std::vector<reltime_expression_normalizer::ReltimeExpression>& reltimeexps,
 10 | 																	   std::vector<duration_expression_normalizer::DurationExpression>& durationexps){
 11 | 		NEN.process(text, numexps);
 12 | 		AEN.process(text, abstimeexps);
 13 | 		REN.process(text, reltimeexps);
 14 | 		DEN.process(text, durationexps);
 15 | 	}
 16 | 	
 17 | 	
	// Joins the non-empty entries of `any_type_expression.options` with ",",
	// appending "ordinary" when the `ordinary` flag is set, and returns the
	// joined string (empty when there is nothing to show).
	//
	// Differences from the previous implementation:
	//  * a comma is written only between two emitted entries, so an empty
	//    option at the end no longer leaves a trailing ",";
	//  * the expression is no longer modified ("ordinary" used to be pushed
	//    into `options` on every call);
	//  * the whole joined text is returned instead of only the part before
	//    the first whitespace character.
	// The parameter stays a non-const reference so existing callers compile
	// unchanged.
	template <class AnyTypeExpression>
	std::string show_options(AnyTypeExpression& any_type_expression){
		std::stringstream ss;
		bool wrote_any = false;
		const size_t sz = any_type_expression.options.size();
		for(size_t i=0; i<sz; i++){
			if(any_type_expression.options[i] == "") continue;
			if(wrote_any) ss << ",";
			ss << any_type_expression.options[i];
			wrote_any = true;
		}
		if(any_type_expression.ordinary){
			if(wrote_any) ss << ",";
			ss << "ordinary";
		}
		return ss.str();
	}
 32 | 	
 33 | 	
 34 | 	
 35 | 	//resultの生成
 36 | 	void merge_normalize_expressions_into_result( std::vector<numerical_expression_normalizer::NumericalExpression> numexps,  std::vector<abstime_expression_normalizer::AbstimeExpression> abstimeexps,  std::vector<reltime_expression_normalizer::ReltimeExpression> reltimeexps,  std::vector<duration_expression_normalizer::DurationExpression> durationexps, std::vector<std::string>& result){
 37 | 
 38 | 	 //TODO : それぞれの正規形に、toString関数をつける？逆に分かり辛い？　とりあえずここで処理
 39 | 	 std::string kugiri("*");
 40 | 	 std::string tmpstr;
 41 | 	 std::stringstream ss;
 42 | 	 result.clear();
 43 | 	 
 44 | 	 for(int i=0; i<static_cast<int>(numexps.size()); i++){
 45 | 	 ss.clear(); ss.str("");
 46 | 	 ss << "numerical" << "*" << numexps[i].original_expression << "*" << numexps[i].position_start << "*" << numexps[i].position_end << "*" << numexps[i].counter << "*" << numexps[i].value_lowerbound << "*" << numexps[i].value_upperbound << "*" << show_options(numexps[i]);
 47 | 	 ss >> tmpstr;
 48 | 	 result.push_back(tmpstr);
 49 | 	 }
 50 | 	 
 51 | 	 for(int i=0; i<static_cast<int>(abstimeexps.size()); i++){
 52 | 	 ss.clear(); ss.str("");
 53 | 	 ss << "abstime" << "*" << abstimeexps[i].original_expression << "*" << abstimeexps[i].position_start << "*" << abstimeexps[i].position_end << "*" << "none" << "*" << abstimeexps[i].value_lowerbound.to_string(false) << "*" << abstimeexps[i].value_upperbound.to_string(true) << "*" << show_options(abstimeexps[i]);
 54 | 	 ss >> tmpstr;
 55 | 	 result.push_back(tmpstr);
 56 | 	 }
 57 | 	 
 58 | 	 for(int i=0; i<static_cast<int>(reltimeexps.size()); i++){
 59 | 	 ss.clear(); ss.str("");
 60 | 	 //TODO : 相対時間表現を、どう表示させるか？
 61 | 	 ss << "reltime" << "*" << reltimeexps[i].original_expression << "*" << reltimeexps[i].position_start << "*" << reltimeexps[i].position_end << "*" << "none" << "*" << reltimeexps[i].value_lowerbound_abs.to_string(false) << "," << reltimeexps[i].value_lowerbound_rel.to_duration_string(false) << "*" << reltimeexps[i].value_upperbound_abs.to_string(true) << "," << reltimeexps[i].value_upperbound_rel.to_duration_string(true) << "*" << show_options(reltimeexps[i]);
 62 | 	 ss >> tmpstr;
 63 | 	 result.push_back(tmpstr);
 64 | 	 }
 65 | 	 
 66 | 	 for(int i=0; i<static_cast<int>(durationexps.size()); i++){
 67 | 	 ss.clear(); ss.str("");
 68 | 	 ss << "duration" << "*" << durationexps[i].original_expression << "*" << durationexps[i].position_start << "*" << durationexps[i].position_end << "*" << "none" << "*" << durationexps[i].value_lowerbound.to_duration_string(false) << "*" << durationexps[i].value_upperbound.to_duration_string(true) << "*" << show_options(durationexps[i]);
 69 | 	 ss >> tmpstr;
 70 | 	 result.push_back(tmpstr);
 71 | 	 }
 72 | 	 }
 73 | 
 74 | 
 75 | 	
 76 | 	NormalizeNumexp::NormalizeNumexp(const std::string& language) : NEN(language), AEN(language), REN(language), DEN(language), IER(language) {}
 77 | 
 78 | 
 79 | 	void NormalizeNumexp::normalize(const std::string& text, std::vector<std::string>& result){
 80 | 		result.clear();
 81 | 		std::vector<numerical_expression_normalizer::NumericalExpression> numexps;
 82 | 		std::vector<abstime_expression_normalizer::AbstimeExpression> abstimeexps;
 83 | 		std::vector<reltime_expression_normalizer::ReltimeExpression> reltimeexps;
 84 | 		std::vector<duration_expression_normalizer::DurationExpression> durationexps;
 85 | 		
 86 | 		//4つのnormalizerで処理を行う
 87 | 		normalize_each_type_expressions(text, numexps, abstimeexps, reltimeexps, durationexps);
 88 | 		
 89 | 		//それぞれの結果より、不適当な抽出を削除
 90 | 		IER.remove_inappropriate_extraction(text, numexps, abstimeexps, reltimeexps, durationexps);
 91 | 		
 92 | 		//string型に変換し、resultにまとめる
 93 | 		merge_normalize_expressions_into_result(numexps, abstimeexps, reltimeexps, durationexps, result);
 94 | 	}
 95 | 	
 96 | } //namespace normalize_numexp
 97 | 
 98 | 
 99 | 
100 | 


--------------------------------------------------------------------------------
/src/normalize_numexp.hpp:
--------------------------------------------------------------------------------
 1 | #define NORMALIZENUMEXP_NAME "normalizeNumexp"
 2 | #define NORMALIZENUMEXP_VERSION "3.0"
 3 | #define NORMALIZENUMEXP_COPYRIGHT "Copyright (c) 2012 Katsuma Narisawa"
 4 | 
 5 | #include "inappropriate_expression_remover.hpp"
 6 | 
 7 | namespace normalize_numexp{
 8 | 
 9 | 	class NormalizeNumexp{
10 | 	public:
11 | 		NormalizeNumexp(const std::string& language);
12 | 		void normalize(const std::string& text, std::vector<std::string>& result);
13 | 		
14 | 	private:
15 | 		void normalize_each_type_expressions(const std::string& text,
16 | 											 std::vector<numerical_expression_normalizer::NumericalExpression>& numexps,
17 | 											 std::vector<abstime_expression_normalizer::AbstimeExpression>& abstimeexps,
18 | 											 std::vector<reltime_expression_normalizer::ReltimeExpression>& reltimeexps,
19 | 											 std::vector<duration_expression_normalizer::DurationExpression>& durationexps);
20 | 		
21 | 		numerical_expression_normalizer::NumericalExpressionNormalizer NEN;
22 | 		abstime_expression_normalizer::AbstimeExpressionNormalizer AEN;
23 | 		reltime_expression_normalizer::ReltimeExpressionNormalizer REN;
24 | 		duration_expression_normalizer::DurationExpressionNormalizer DEN;
25 | 		inappropriate_expression_remover::InappropriateExpressionRemover IER;
26 | 	};
27 | } //namespace normalize_numexp
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/src/normalize_numexp_test.cpp:
--------------------------------------------------------------------------------
  1 | #include <gtest/gtest.h>
  2 | #include <string>
  3 | #include "normalize_numexp.hpp"
  4 | 
  5 | using namespace normalize_numexp;
  6 | using namespace std;
  7 | 
  8 | class NumexpExtractorTest : public testing::Test {
  9 | public:
 10 |     void SetUp() {}
 11 |     void TearDown() {}
 12 | };
 13 | 
 14 | TEST_F(NumexpExtractorTest, simple1) {
 15 |   vector<string> result;
 16 |   string language("ja");
 17 |   string text("1911年から2011年の間、その100年間において、9.3万人もの死傷者がでた。");
 18 |   NormalizeNumexp NN(language);
 19 |   NN.normalize(text, result);
 20 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
 21 | 	  cout << result[i] << endl;
 22 |   }
 23 |   ASSERT_EQ(3u, result.size());
 24 |   EXPECT_EQ("numerical*9.3万人*27*32*人*93000*93000*", result[0]);
 25 |   EXPECT_EQ("abstime*1911年から2011年*0*12*none*1911-XX-XX*2011-XX-XX*", result[1]);
 26 |   EXPECT_EQ("duration*100年間*17*22*none*P100Y*P100Y*", result[2]);
 27 | }
 28 | 
 29 | TEST_F(NumexpExtractorTest, simple2) {
 30 | 	vector<string> result;
 31 | 	string language("ja");
 32 | 	string text("15年前、戦争があった");
 33 | 	NormalizeNumexp NN(language);
 34 | 	NN.normalize(text, result);
 35 | 	ASSERT_EQ(1u, result.size());
 36 | 	EXPECT_EQ("reltime*15年前*0*4*none*XX:XX:XX,P-15Y*XX:XX:XX,P-15Y*", result[0]);
 37 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
 38 | 		cout << result[i] << endl;
 39 | 	}
 40 | }
 41 | 
 42 | TEST_F(NumexpExtractorTest, simple3) {
 43 | 	vector<string> result;
 44 | 	string language("ja");
 45 | 	string text("昨年3月、僕たち２人は結婚した");
 46 | 	NormalizeNumexp NN(language);
 47 | 	NN.normalize(text, result);
 48 | 	ASSERT_EQ(2u, result.size());
 49 | 	EXPECT_EQ("numerical*２人*8*10*人*2*2*", result[0]);
 50 | 	EXPECT_EQ("reltime*昨年3月*0*4*none*XXXX-03-XX,P-1Y*XXXX-03-XX,P-1Y*", result[1]);
 51 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
 52 | 		cout << result[i] << endl;
 53 | 	}
 54 | }
 55 | 
 56 | TEST_F(NumexpExtractorTest, simple4) {
 57 | 	vector<string> result;
 58 | 	string language("ja");
 59 | 	string text("131.1ポイントというスコアを叩き出した");
 60 | 	NormalizeNumexp NN(language);
 61 | 	NN.normalize(text, result);
 62 | 	ASSERT_EQ(1u, result.size());
 63 | 	EXPECT_EQ("numerical*131.1ポイント*0*9*ポイント*131.1*131.1*", result[0]);
 64 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
 65 | 		cout << result[i] << endl;
 66 | 	}
 67 | }
 68 | 
 69 | TEST_F(NumexpExtractorTest, simple5) {
 70 | 	vector<string> result;
 71 | 	string language("ja");
 72 | 	string text("午後3時45分に待ち合わせ");
 73 | 	NormalizeNumexp NN(language);
 74 | 	NN.normalize(text, result);
 75 | 	ASSERT_EQ(1u, result.size());
 76 | 	EXPECT_EQ("abstime*午後3時45分*0*7*none*15:45:XX*15:45:XX*", result[0]);
 77 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
 78 | 		cout << result[i] << endl;
 79 | 	}
 80 | }
 81 | 
 82 | TEST_F(NumexpExtractorTest, day_of_week1) {
 83 | 	vector<string> result;
 84 | 	string language("ja");
 85 | 	string text("5月3日(水)");
 86 | 	NormalizeNumexp NN(language);
 87 | 	NN.normalize(text, result);
 88 | 	ASSERT_EQ(1u, result.size());
 89 | 	EXPECT_EQ("abstime*5月3日(水)*0*7*none*XXXX-05-03*XXXX-05-03*Wed", result[0]);
 90 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
 91 | 		cout << result[i] << endl;
 92 | 	}
 93 | }
 94 | 
 95 | /*
 96 | //辞書にはあるが認識してくれない。uxが空白を認識してくれていない??
 97 | TEST_F(NumexpExtractorTest, day_of_week2) {
 98 | 	vector<string> result;
 99 | 	string language("ja");
100 | 	string text("2001/3/3 Sat");
101 | 	NormalizeNumexp NN(language);
102 | 	NN.normalize(text, result);
103 | 	ASSERT_EQ(1u, result.size());
104 | 	EXPECT_EQ("abstime*2001/3/3*8*16*none*2001-03-3*2001-03-3*Sat", result[0]);
105 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
106 | 		cout << result[i] << endl;
107 | 	}
108 | }
109 | */
110 | 
111 | TEST_F(NumexpExtractorTest, real_example1) {
112 | 	vector<string> result;
113 | 	string language("ja");
114 | 	string text("【今日から開催】The Fruits of Adventures @ ZEIT-FOTO SALON(東京・京橋)  4/26(Tue)まで");
115 | 	NormalizeNumexp NN(language);
116 | 	NN.normalize(text, result);
117 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
118 | 		cout << result[i] << endl;
119 | 	}
120 | 	ASSERT_EQ(1u, result.size());
121 | 	EXPECT_EQ("abstime*4/26(Tue)まで*59*70*none*XXXX-04-26*XXXX-04-26*Tue", result[0]);
122 | }
123 | 
124 | TEST_F(NumexpExtractorTest, inappropriate_range1) {
125 | 	vector<string> result;
126 | 	string language("ja");
127 | 	string text("中国から30匹の鳥がきた");
128 | 	NormalizeNumexp NN(language);
129 | 	NN.normalize(text, result);
130 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
131 | 		cout << result[i] << endl;
132 | 	}
133 | 	ASSERT_EQ(1u, result.size());
134 | 	EXPECT_EQ("numerical*30匹*4*7*匹*30*30*kara_prefix", result[0]);
135 | }
136 | 
137 | 
138 | TEST_F(NumexpExtractorTest, inappropriate_range2) {
139 | 	vector<string> result;
140 | 	string language("ja");
141 | 	string text("30匹からのプレゼント");
142 | 	NormalizeNumexp NN(language);
143 | 	NN.normalize(text, result);
144 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
145 | 		cout << result[i] << endl;
146 | 	}
147 | 	ASSERT_EQ(1u, result.size());
148 | 	EXPECT_EQ("numerical*30匹*0*3*匹*30*30*kara_suffix", result[0]);
149 | }
150 | 
151 | TEST_F(NumexpExtractorTest, inappropriate_range3) {
152 | 	vector<string> result;
153 | 	string language("ja");
154 | 	string text("一万年と二千年前から愛してる");
155 | 	NormalizeNumexp NN(language);
156 | 	NN.normalize(text, result);
157 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
158 | 		cout << result[i] << endl;
159 | 	}
160 | 	ASSERT_EQ(2u, result.size());
161 | 	EXPECT_EQ("reltime*二千年前*4*8*none*XX:XX:XX,P-2000Y*XX:XX:XX,P-2000Y*kara_suffix", result[0]);
162 | 	EXPECT_EQ("duration*一万年*0*3*none*P10000Y*P10000Y*", result[1]);
163 | }
164 | 
165 | TEST_F(NumexpExtractorTest, inappropriate_range4) {
166 | 	vector<string> result;
167 | 	string language("ja");
168 | 	string text("話をしよう。あれは今から36万年前………いや、1万4000年前だったか。");
169 | 	NormalizeNumexp NN(language);
170 | 	NN.normalize(text, result);
171 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
172 | 		cout << result[i] << endl;
173 | 	}
174 | 	ASSERT_EQ(2u, result.size());
175 | 	EXPECT_EQ("reltime*36万年前*12*17*none*XX:XX:XX,P-360000Y*XX:XX:XX,P-360000Y*kara_prefix", result[0]);
176 | 	EXPECT_EQ("reltime*1万4000年前*23*31*none*XX:XX:XX,P-14000Y*XX:XX:XX,P-14000Y*", result[1]);
177 | }
178 | 
179 | TEST_F(NumexpExtractorTest, inappropriate_strings1) {
180 | 	vector<string> result;
181 | 	string language("ja");
182 | 	string text("一体それがどうしたというのだね。九州。四国。");
183 | 	NormalizeNumexp NN(language);
184 | 	NN.normalize(text, result);
185 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
186 | 		cout << result[i] << endl;
187 | 	}
188 | 	ASSERT_EQ(0u, result.size());
189 | }
190 | 
191 | 
192 | TEST_F(NumexpExtractorTest, inappropriate_prefix1) {
193 | 	vector<string> result;
194 | 	string language("ja");
195 | 	string text("ver2.3.4。ver２．３。");
196 | 	NormalizeNumexp NN(language);
197 | 	NN.normalize(text, result);
198 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
199 | 		cout << result[i] << endl;
200 | 	}
201 | 	ASSERT_EQ(0u, result.size());
202 | }
203 | 
204 | 
205 | TEST_F(NumexpExtractorTest, inappropriate_abstime1) {
206 | 	vector<string> result;
207 | 	string language("ja");
208 | 	string text("080-6006-4451。ver2.0。");
209 | 	NormalizeNumexp NN(language);
210 | 	NN.normalize(text, result);
211 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
212 | 		cout << result[i] << endl;
213 | 	}
214 | 	ASSERT_EQ(0u, result.size());
215 | }
216 | 
217 | TEST_F(NumexpExtractorTest, inappropriate_abstime2) {
218 | 	vector<string> result;
219 | 	string language("ja");
220 | 	string text("198999年30月41日。");
221 | 	NormalizeNumexp NN(language);
222 | 	NN.normalize(text, result);
223 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
224 | 		cout << result[i] << endl;
225 | 	}
226 | 	ASSERT_EQ(3u, result.size()); //durationとして認識される
227 | }
228 | 
229 | TEST_F(NumexpExtractorTest, url1) {
230 | 	vector<string> result;
231 | 	string language("ja");
232 | 	string text("tttp3gl3molggg");
233 | 	NormalizeNumexp NN(language);
234 | 	NN.normalize(text, result);
235 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
236 | 		cout << result[i] << endl;
237 | 	}
238 | 	ASSERT_EQ(0u, result.size());
239 | }
240 | 
241 | TEST_F(NumexpExtractorTest, revise_abstime1) {
242 | 	vector<string> result;
243 | 	string language("ja");
244 | 	string text("09年5月。99年5月");
245 | 	NormalizeNumexp NN(language);
246 | 	NN.normalize(text, result);
247 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
248 | 		cout << result[i] << endl;
249 | 	}
250 | 	ASSERT_EQ(2u, result.size());
251 | 	EXPECT_EQ("abstime*09年5月*0*5*none*2009-05-XX*2009-05-XX*", result[0]);
252 | 	EXPECT_EQ("abstime*99年5月*6*11*none*1999-05-XX*1999-05-XX*", result[1]);	
253 | }
254 | 
255 | TEST_F(NumexpExtractorTest, not_abstime1) {
256 | 	vector<string> result;
257 | 	string language("ja");
258 | 	string text("1.2.2 2-2-2");
259 | 	NormalizeNumexp NN(language);
260 | 	NN.normalize(text, result);
261 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
262 | 		cout << result[i] << endl;
263 | 	}
264 | 	ASSERT_EQ(0u, result.size());
265 | }
266 | 
267 | TEST_F(NumexpExtractorTest, revise_abstime2) {
268 | 	vector<string> result;
269 | 	string language("ja");
270 | 	string text("西暦99年5月");
271 | 	NormalizeNumexp NN(language);
272 | 	NN.normalize(text, result);
273 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
274 | 		cout << result[i] << endl;
275 | 	}
276 | 	ASSERT_EQ(1u, result.size());
277 | 	EXPECT_EQ("abstime*西暦99年5月*0*7*none*0099-05-XX*0099-05-XX*", result[0]);
278 | }
279 | 
280 | TEST_F(NumexpExtractorTest, su1) {
281 | 	vector<string> result;
282 | 	string language("ja");
283 | 	string text("数十人が十数人と喧嘩して、百数十円落とした");
284 | 	NormalizeNumexp NN(language);
285 | 	NN.normalize(text, result);
286 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
287 | 		cout << result[i] << endl;
288 | 	}
289 | 	ASSERT_EQ(3u, result.size());
290 | 	EXPECT_EQ("numerical*数十人*0*3*人*10*90*", result[0]);
291 | 	EXPECT_EQ("numerical*十数人*4*7*人*11*19*", result[1]);
292 | 	EXPECT_EQ("numerical*百数十円*13*17*円*110*190*", result[2]);
293 | }
294 | 
295 | TEST_F(NumexpExtractorTest, range1) {
296 | 	vector<string> result;
297 | 	string language("ja");
298 | 	string text("2012/4/3~6に行われる");
299 | 	NormalizeNumexp NN(language);
300 | 	NN.normalize(text, result);
301 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
302 | 		cout << result[i] << endl;
303 | 	}
304 | 	ASSERT_EQ(1u, result.size());
305 | 	EXPECT_EQ("abstime*2012/4/3~6*0*10*none*2012-04-03*2012-04-06*", result[0]);
306 | }
307 | 
308 | TEST_F(NumexpExtractorTest, range2) {
309 | 	vector<string> result;
310 | 	string language("ja");
311 | 	string text("2012/4/3~2012/4/6に行われる");
312 | 	NormalizeNumexp NN(language);
313 | 	NN.normalize(text, result);
314 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
315 | 		cout << result[i] << endl;
316 | 	}
317 | 	ASSERT_EQ(1u, result.size());
318 | 	EXPECT_EQ("abstime*2012/4/3~2012/4/6*0*17*none*2012-04-03*2012-04-06*", result[0]);
319 | }
320 | 
321 | TEST_F(NumexpExtractorTest, wari1) {
322 | 	vector<string> result;
323 | 	string language("ja");
324 | 	string text("彼の打率は3割4分5厘だ");
325 | 	NormalizeNumexp NN(language);
326 | 	NN.normalize(text, result);
327 | 	for(int i=0; i<static_cast<int>(result.size()); i++){
328 | 		cout << result[i] << endl;
329 | 	}
330 | 	ASSERT_EQ(1u, result.size());
331 | 	EXPECT_EQ("numerical*3割4分5厘*5*11*%*34.5*34.5*", result[0]);
332 | }


--------------------------------------------------------------------------------
/src/normalizer_template.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  数量表現（「三人」「約1000円」などといった表現）や時間表現（「1989年3月」「3:30」「百年後」などといった表現）は以下のように構成される
  3 |  
  4 |  【接頭辞 + 前置助数詞 + 数量表現or時間表現の基本パターン + 接尾辞】
  5 | 　　・接頭辞：「約」「およそ」など
  6 | 　　・前置助数詞：数量表現における「時速」「￥」や、絶対時間表現における年号など（本来はパターンに含めたいところだが、基本パターンをprefixSearchで探索している都合上、今回は別の構成要素として考える）
  7 | 　　・基本パターン：「*人」「*円」「*年*月」「*:*」などの正規表現パターン。
  8 | 　　・接尾辞：「以降」「くらい」など
  9 |  
 10 |  この構成性に着目し、この規格化モジュールでは、文中の数の周囲を正規表現でマッチングさせ、表現を認識させる。
 11 |  （「数」 -> 「数」＋「助数詞」 -> 「前置助数詞」＋「数」＋「助数詞」 -> 「前置助数詞」＋「数」＋「助数詞」＋「接尾辞」 -> 「接頭辞」＋「前置助数詞」＋「数」＋「助数詞」　と認識範囲を増やしていく）
 12 |  認識した際には、認識したパターンに対応する処理を、辞書を参照して実行し、規格化表現を作成していく。
 13 |  
 14 |  この基底クラスでは、上のようにパターンを順番に認識していく処理を書いている。
 15 |  派生クラスとなるnumerical_expression_normalizer, abstime_expression_normalizer, reltime_expression_normalizer, duration_expression_normalizerでは、認識したパターンに対応する処理を書く。
 16 |  
 17 |  */
 18 | 
 19 | #ifndef NORMALIZER_TEMPLATE_H_
 20 | #define NORMALIZER_TEMPLATE_H_
 21 | #include <string>
 22 | #include <ux/ux.hpp>
 23 | #include "digit_utility.hpp"
 24 | #include "number_normalizer.hpp"
 25 | #include "normalizer_utility.hpp"
 26 | #include "dictionary_dirpath.hpp"
 27 | #include <unistd.h>
 28 | 
 29 | namespace normalizer_template{
 30 | 
 31 | template <class AnyTypeExpression, class AnyTypeLimitedExpression>
 32 | class NormalizerTemplate{
 33 | public:
 34 |   virtual void init() = 0;
 35 |   virtual void normalize_number(const std::string& text, std::vector<digit_utility::Number>& numbers) = 0;
 36 |   virtual void revise_any_type_expression_by_matching_limited_expression(std::vector<AnyTypeExpression>& any_type_expressions, int& expression_id, AnyTypeLimitedExpression matching_limited_expression) = 0;
 37 |   virtual void revise_any_type_expression_by_matching_prefix_counter(AnyTypeExpression& any_type_expression, const AnyTypeLimitedExpression& matching_limited_expression) = 0;
 38 |   virtual void revise_any_type_expression_by_number_modifier(AnyTypeExpression& any_type_expression, const normalizer_utility::NumberModifier& number_modifier) = 0;
 39 |   virtual void delete_not_any_type_expression(std::vector<AnyTypeExpression>& any_type_expressions) = 0;
 40 |   virtual void fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector<AnyTypeExpression>& any_type_expressions) = 0;
 41 |   
 42 |   void build_limited_expression_patterns_from_limited_expressions(){
 43 |     //limited_expressionのpatternでprefixSearchするために、patternをキーとするトライ木を生成する。
 44 |     std::vector<std::pair<std::string, int> > limited_expression_pattern_table;
 45 |     for(int i=0; i<static_cast<int>(limited_expressions_.size()); i++){
 46 |       limited_expression_pattern_table.push_back(make_pair(limited_expressions_[i].pattern, i));
 47 |     }
 48 |     limited_expression_patterns_.build(limited_expression_pattern_table);
 49 |   }
 50 | 
 51 |   void load_json_from_file(const std::string& filepath, pfi::text::json::json& js) {
 52 |     std::ifstream in(filepath.c_str());
 53 |     pfi::text::json::json_parser parser(in);
 54 |     try {
 55 |       while (true) {
 56 |         js.add(parser.parse());
 57 |       }
 58 |     } catch (const pfi::lang::end_of_data&) {
 59 |     }
 60 |   }
 61 |   
 62 |   template <class T>
 63 |   void load_from_dictionary(const std::string& dictionary_path, std::vector<T>& load_target) {
 64 |     load_target.clear();
 65 |     try {
 66 |       pfi::text::json::json js = pfi::text::json::json(new pfi::text::json::json_array());
 67 |       load_json_from_file(dictionary_path, js);
 68 |       pfi::text::json::from_json(js, load_target);
 69 |     } catch( ... ) {
 70 |       std::cout << "dictionary load error" << std::endl; //TODO : error処理
 71 |       exit(1);
 72 |     }
 73 |   }
 74 |   
 75 |   template <class T>
 76 |   void build_patterns_rev(const std::vector<T>& originals, ux::Map<int>& patterns) {
 77 |     //prefixSearchをつかってsuffixSearchを実現するため、uxに格納するパターンを予め前後逆にしておく
 78 |     std::vector<std::pair<std::string, int> > kvs;
 79 |     for (int i = 0; i < static_cast<int>(originals.size()); i++) {
 80 |       kvs.push_back(std::make_pair(normalizer_utility::reverse_string(originals[i].pattern), i));
 81 |     }
 82 |     patterns.build(kvs);
 83 |   }
 84 |   
 85 |   template <class T>
 86 |   void build_patterns(const std::vector<T>& originals, ux::Map<int>& patterns) {
 87 |     std::vector<std::pair<std::string, int> > kvs;
 88 |     for (int i = 0; i < static_cast<int>(originals.size()); i++) {
 89 |       kvs.push_back(std::make_pair(originals[i].pattern, i));
 90 |     }
 91 |     patterns.build(kvs);
 92 |   }
 93 |   
 94 |   void load_from_dictionaries(const std::string& limited_expression_dictionary, const std::string& prefix_counter_dictionary, const std::string& prefix_number_modifier_dictionary, const std::string& suffix_number_modifier_dictionary){
 95 | 	std::string dictionary_path;
 96 | 	dictionary_path += dictionary_dirpath::get_dictionary_dirpath();
 97 |     dictionary_path += language_; 
 98 | 	dictionary_path += "/";
 99 |     load_from_dictionary(dictionary_path+limited_expression_dictionary, limited_expressions_);
100 |     load_from_dictionary(dictionary_path+prefix_counter_dictionary, prefix_counters_);
101 |     load_from_dictionary(dictionary_path+suffix_number_modifier_dictionary, suffix_number_modifier_);
102 |     load_from_dictionary(dictionary_path+prefix_number_modifier_dictionary, prefix_number_modifier_);
103 |     
104 |     build_patterns(limited_expressions_, limited_expression_patterns_);
105 |     build_patterns_rev(prefix_counters_, prefix_counter_patterns_);
106 |     build_patterns_rev(prefix_number_modifier_, prefix_number_modifier_patterns_);
107 |     build_patterns(suffix_number_modifier_, suffix_number_modifier_patterns_);
108 |     
109 |     for(int i=0; i<static_cast<int>(limited_expressions_.size()); i++){
110 |       limited_expressions_[i].set_total_number_of_place_holder();
111 |       limited_expressions_[i].set_length_of_strings_after_final_place_holder();
112 |     }
113 |   }
114 |   
115 |   void search_matching_limited_expression(const pfi::data::string::ustring& utext_replaced, const AnyTypeExpression& any_type_expression, int& matching_pattern_id){
116 |     pfi::data::string::ustring string_after_expression;
117 |     normalizer_utility::extract_after_string(utext_replaced, any_type_expression.position_end, string_after_expression);
118 |     normalizer_utility::prefixSearch(string_after_expression, limited_expression_patterns_, matching_pattern_id);
119 |   }
120 |   
121 |   void search_matching_prefix_counter(const pfi::data::string::ustring& utext_replaced, const AnyTypeExpression& any_type_expression, int& matching_pattern_id){
122 |     pfi::data::string::ustring string_before_expression;
123 |     normalizer_utility::extract_before_string(utext_replaced, any_type_expression.position_start, string_before_expression);
124 |     normalizer_utility::suffixSearch(string_before_expression, prefix_counter_patterns_, matching_pattern_id);
125 |   }
126 |   
127 |   void revise_any_type_expression_by_matching_prefix_number_modifier(AnyTypeExpression& any_type_expression, const normalizer_utility::NumberModifier& number_modifier){
128 |     any_type_expression.position_start -= pfi::data::string::string_to_ustring(number_modifier.pattern).size();
129 |     revise_any_type_expression_by_number_modifier(any_type_expression, number_modifier);
130 |   }
131 |   
132 |   void revise_any_type_expression_by_matching_suffix_number_modifier(AnyTypeExpression& any_type_expression, const normalizer_utility::NumberModifier& number_modifier){
133 |     any_type_expression.position_end += pfi::data::string::string_to_ustring(number_modifier.pattern).size();
134 |     revise_any_type_expression_by_number_modifier(any_type_expression, number_modifier);
135 |   }
136 |   
137 |   bool normalize_limited_expression(const pfi::data::string::ustring& utext_replaced, std::vector<AnyTypeExpression>& any_type_expressions, int &i){
138 |     int matching_pattern_id;
139 |     search_matching_limited_expression(utext_replaced, any_type_expressions[i], matching_pattern_id);
140 |     if(matching_pattern_id == -1) return false;
141 |     revise_any_type_expression_by_matching_limited_expression(any_type_expressions, i, limited_expressions_[matching_pattern_id]);
142 |     return true;
143 |   }
144 |   
145 |   void normalize_prefix_counter(const pfi::data::string::ustring& utext_replaced, AnyTypeExpression& any_type_expression){
146 |     int matching_pattern_id;
147 |     search_matching_prefix_counter(utext_replaced, any_type_expression, matching_pattern_id);
148 |     if(matching_pattern_id == -1) return;
149 |     revise_any_type_expression_by_matching_prefix_counter(any_type_expression, prefix_counters_[matching_pattern_id]);
150 |     return;
151 |   }
152 |   
153 |   bool normalize_suffix_number_modifier(const pfi::data::string::ustring& utext_replaced, AnyTypeExpression& any_type_expression){
154 |     int matching_pattern_id;
155 |     normalizer_utility::search_suffix_number_modifier(utext_replaced, any_type_expression.position_end, suffix_number_modifier_patterns_, matching_pattern_id);
156 |     if(matching_pattern_id == -1) return false;
157 |     revise_any_type_expression_by_matching_suffix_number_modifier(any_type_expression, suffix_number_modifier_[matching_pattern_id]);
158 | 		return true;
159 |   }
160 |   
161 |   bool normalize_prefix_number_modifier(const pfi::data::string::ustring& utext_replaced, AnyTypeExpression& any_type_expression){
162 |     int matching_pattern_id;
163 |     normalizer_utility::search_prefix_number_modifier(utext_replaced, any_type_expression.position_start, prefix_number_modifier_patterns_, matching_pattern_id);
164 |     if(matching_pattern_id == -1) return false;
165 |     revise_any_type_expression_by_matching_prefix_number_modifier(any_type_expression, prefix_number_modifier_[matching_pattern_id]);
166 | 		return true;
167 |   }
168 |   
169 |   void convert_numbers_to_any_type_expressions(const std::vector<digit_utility::Number>& numbers, std::vector<AnyTypeExpression>& any_type_expressions){
170 |     for(int i=0; i<static_cast<int>(numbers.size()); i++){
171 |       any_type_expressions.push_back(numbers[i]);
172 |     }
173 |   }
174 |   
175 |   bool have_kara_prefix(const std::vector<std::string>& options){
176 |     return find(options.begin(), options.end(), "kara_prefix") != options.end();
177 |   }
178 |   
179 |   bool have_kara_suffix(const std::vector<std::string>& options){
180 |     return find(options.begin(), options.end(), "kara_suffix") != options.end();
181 |   }
182 |   
183 |   void merge_options(std::vector<std::string>& options1, std::vector<std::string>& options2){
184 | 		//範囲表現の統合の際に使われる。kara_suffix, kara_prefixはここで削除する
185 | 		//TODO : 削除するというのが非常に分かり辛い。どうにかする。
186 | 		for(int i=0; i<static_cast<int>(options1.size()); i++){
187 | 			if(options1[i] == "kara_suffix"){
188 | 				options1.erase(options1.begin() + i);
189 | 				break;
190 | 			}
191 | 		}
192 |     for(int i=0; i<static_cast<int>(options2.size()); i++){
193 | 			if(options2[i] == "kara_prefix") continue;
194 |       options1.push_back(options2[i]);
195 |     }
196 |   }
197 |   
198 |   void process(const std::string& text, std::vector<AnyTypeExpression>& any_type_expressions) {
199 |     any_type_expressions.clear();
200 |     pfi::data::string::ustring utext = pfi::data::string::string_to_ustring(text);
201 |     
202 |     //numbersの作成
203 |     std::vector<digit_utility::Number> numbers;
204 |     normalize_number(text, numbers);
205 |     
206 |     //numbersを変換して、ベースとなるany_type_expressionsを作成
207 |     convert_numbers_to_any_type_expressions(numbers, any_type_expressions);
208 |     
209 |     //searchするために、text中の数を*に置換しておく
210 |     pfi::data::string::ustring utext_replaced;
211 |     normalizer_utility::replace_numbers_in_text(utext, numbers, utext_replaced);
212 |     
213 |     //単位の探索、規格化
214 |     for(int i=0; i<static_cast<int>(any_type_expressions.size()); i++){
215 |       if(!normalize_limited_expression(utext_replaced, any_type_expressions, i)){
216 |         //TODO : 単位が存在しなかった場合の処理をどうするか、相談して決める
217 |       }
218 |       normalize_prefix_counter(utext_replaced, any_type_expressions[i]);
219 |       if(normalize_suffix_number_modifier(utext_replaced, any_type_expressions[i])) normalize_suffix_number_modifier(utext_replaced, any_type_expressions[i]);  //TODO : 2回以上の繰り返しを本当に含めて良いのか？
220 |       if(normalize_prefix_number_modifier(utext_replaced, any_type_expressions[i])) normalize_prefix_counter(utext_replaced, any_type_expressions[i]);
221 | 			any_type_expressions[i].set_original_expression_from_position(utext);
222 |     }
223 |     
224 |     //TODO : 範囲表現の処理
225 |     fix_by_range_expression(utext, any_type_expressions);
226 |     
227 |     //規格化されなかったnumberを削除
228 |     delete_not_any_type_expression(any_type_expressions);
229 |   }
230 |   
231 |   ux::Map<int> limited_expression_patterns_, prefix_counter_patterns_, suffix_number_modifier_patterns_, prefix_number_modifier_patterns_;
232 |   std::vector<AnyTypeLimitedExpression> limited_expressions_, prefix_counters_;
233 |   std::vector<normalizer_utility::NumberModifier> suffix_number_modifier_, prefix_number_modifier_;
234 |   std::string language_;
235 | };
236 | 
} //namespace normalizer_template
238 | 
239 | #endif //NORMALIZER_TEMPLATE_H_
240 | 


--------------------------------------------------------------------------------
/src/normalizer_utility.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include "normalizer_utility.hpp"
  3 | #include <pficommon/lang/exception.h>
  4 | 
  5 | namespace normalizer_utility {
  6 |   
  7 | void NormalizedExpressionTemplate::set_original_expression_from_position(const pfi::data::string::ustring& utext){
  8 |   original_expression = utext.substr(position_start, position_end - position_start);
  9 | }
 10 | 
 11 | pfi::data::string::ustring reverse_string(const pfi::data::string::ustring& ustr) {
 12 |   return pfi::data::string::ustring(ustr.rbegin(), ustr.rend());
 13 | }
 14 |   
 15 | std::string reverse_string(const std::string& str) {
 16 |   pfi::data::string::ustring ustr = pfi::data::string::string_to_ustring(str);
 17 |   return pfi::data::string::ustring_to_string(reverse_string(ustr));
 18 | }
 19 | 
 20 | void extract_after_string(const pfi::data::string::ustring& text, const int i, pfi::data::string::ustring& after_string) {
 21 |   after_string = text.substr(i, text.size() - i);
 22 | }
 23 | 
 24 | void extract_before_string(const pfi::data::string::ustring& text, const int i, pfi::data::string::ustring& before_string) {
 25 |   before_string = text.substr(0, i);
 26 | }
 27 | 
 28 | void prefixSearch(const pfi::data::string::ustring& ustr, const ux::Map<int>& patterns, int& matching_pattern_id) {
 29 |   /*patternsの中から、ustrのprefixになっているものを探索（複数ある場合は最長のもの）
 30 |    */
 31 |   pfi::data::string::ustring ustr_shortened;
 32 |   shorten_place_holder_in_text(ustr, ustr_shortened); //ustrは数字が一字一字、「*」に変換されているので、patternsの表記と食い違っている。*を縮約する操作を行う
 33 |   std::string str = pfi::data::string::ustring_to_string(ustr_shortened);
 34 |   size_t retLen;
 35 |   int ret = patterns.prefixSearch(str.c_str(), str.size(), retLen, matching_pattern_id);
 36 |   if (ret == -1) matching_pattern_id = -1;
 37 | }
 38 | 
 39 | void suffixSearch(const pfi::data::string::ustring& ustr, const ux::Map<int>& patterns_rev, int& matching_pattern_id) {
 40 |   /*patternsの中から、ustrのsuffixになっているものを探索（複数ある場合は最長のもの）
 41 |    あらかじめpatternsの文字列を逆にしたものを保管しておき（patterns_rev）、ustrも逆にしてしまい、その状態でprefixSearchを行った結果を返す
 42 |    */
 43 |   pfi::data::string::ustring ustr_shortened;
 44 |   shorten_place_holder_in_text(ustr, ustr_shortened); //ustrは数字が一字一字、「*」に変換されているので、patternsの表記と食い違っている。*を縮約する操作を行う
 45 |   pfi::data::string::ustring ustr_rev = reverse_string(ustr_shortened);
 46 |   std::string str_rev = pfi::data::string::ustring_to_string(ustr_rev);
 47 |   size_t retLen;
 48 |   int ret = patterns_rev.prefixSearch(str_rev.c_str(), str_rev.size(), retLen, matching_pattern_id);
 49 |   if (ret == -1) matching_pattern_id = -1;
 50 | }
 51 | 
 52 | void search_suffix_number_modifier(const pfi::data::string::ustring& text, const int exp_position_end,
 53 |     const ux::Map<int>& suffix_number_modifier_patterns, int& matching_pattern_id) {
 54 |   pfi::data::string::ustring string_after_expression;
 55 |   extract_after_string(text, exp_position_end, string_after_expression);
 56 |   prefixSearch(string_after_expression, suffix_number_modifier_patterns, matching_pattern_id);
 57 | }
 58 | 
 59 | void search_prefix_number_modifier(const pfi::data::string::ustring& text, const int exp_position_start,
 60 |     const ux::Map<int>& prefix_number_modifier_patterns, int& matching_pattern_id) {
 61 |   pfi::data::string::ustring string_before_expression;
 62 |   extract_before_string(text, exp_position_start, string_before_expression);
 63 |   suffixSearch(string_before_expression, prefix_number_modifier_patterns, matching_pattern_id);
 64 | }
 65 | 
 66 | void replace_numbers_in_text(const pfi::data::string::ustring& utext, const std::vector<digit_utility::Number>& numbers,
 67 |     pfi::data::string::ustring& utext_replaced) {
 68 |   //「1989年7月」 -> 「****年*月」のように数の部分を置き換える（正規表現で一致させるため）
 69 |   utext_replaced = utext;
 70 |   for (int i = 0; i < static_cast<int>(numbers.size()); i++) {
 71 |     std::fill(utext_replaced.begin() + numbers[i].position_start, utext_replaced.begin() + numbers[i].position_end, PLACE_HOLDER[0]);
 72 |   }
 73 | }
 74 | 
 75 | void shorten_place_holder_in_text(const pfi::data::string::ustring& utext, pfi::data::string::ustring& utext_shortened) {
 76 |   //「****年*月」 -> 「*年*月」のように数の部分を縮約する（uxのprefixSearchで一致させるため）
 77 |   utext_shortened.clear();
 78 |   bool prev_is_place_holder = false;
 79 |   for (int i = 0; i < static_cast<int>(utext.size()); i++) {
 80 |     if (utext[i] == PLACE_HOLDER[0]) {
 81 |       if (prev_is_place_holder) {
 82 |         ;
 83 |       } else {
 84 |         utext_shortened += PLACE_HOLDER;
 85 |         prev_is_place_holder = true;
 86 |       }
 87 |     } else {
 88 |       utext_shortened += utext[i];
 89 |       prev_is_place_holder = false;
 90 |     }
 91 |   }
 92 | }
 93 | 
 94 | void build_number_modifier_patterns_from_number_modifiers(const std::vector<normalizer_utility::NumberModifier>& number_modifiers,
 95 |     ux::Map<int>& number_modifier_patterns) {
 96 |   //patternでの探索を可能にするため、トライ木を構築する。
 97 |   std::vector<std::pair<std::string, int> > kvs;
 98 |   for (int i = 0; i < static_cast<int>(number_modifiers.size()); i++) {
 99 |     kvs.push_back(std::make_pair(number_modifiers[i].pattern, i));
100 |   }
101 |   number_modifier_patterns.build(kvs);
102 | }
103 |   
104 | bool is_place_holder(pfi::data::string::uchar uc) {
105 |   return uc == normalizer_utility::PLACE_HOLDER[0];
106 | }
107 | 
// Returns false for +INFINITY and -INFINITY and true for every other value.
// Note that NaN compares unequal to both infinities and is therefore
// reported as finite.
bool is_finite(double value){
  const bool is_positive_infinity = (value == INFINITY);
  const bool is_negative_infinity = (value == -INFINITY);
  return !(is_positive_infinity || is_negative_infinity);
}
111 |   
112 | bool is_null_time(const Time& t){
113 |   Time positive_inf(INFINITY), negative_inf(-INFINITY);
114 |   return (positive_inf == t) || (negative_inf == t);
115 | }
116 | 
117 | const std::string identify_time_detail(const normalizer_utility::Time& time) {
118 |   if (normalizer_utility::is_finite(time.second)) {
119 |     return "s";
120 |   } else if (normalizer_utility::is_finite(time.minute)) {
121 |     return "mn";
122 |   } else if (normalizer_utility::is_finite(time.hour)) {
123 |     return "h";
124 |   } else if (normalizer_utility::is_finite(time.day)) {
125 |     return "d";
126 |   } else if (normalizer_utility::is_finite(time.month)) {
127 |     return "m";
128 |   } else if (normalizer_utility::is_finite(time.year)) {
129 |     return "y";
130 |   }
131 |   return "";
132 | }
133 | 
134 | void LimitedExpressionTemplate::set_total_number_of_place_holder() {
135 |   pfi::data::string::ustring ustr_pattern = pfi::data::string::string_to_ustring(pattern);
136 |   total_number_of_place_holder = static_cast<int>(count_if(ustr_pattern.begin(), ustr_pattern.end(), normalizer_utility::is_place_holder));
137 | }
138 | 
139 | void LimitedExpressionTemplate::set_length_of_strings_after_final_place_holder() {
140 |   pfi::data::string::ustring ustr_pattern = pfi::data::string::string_to_ustring(pattern);
141 |   length_of_strings_after_final_place_holder = ustr_pattern.size() - ustr_pattern.rfind(normalizer_utility::PLACE_HOLDER[0]) - 1;
142 | }
143 | } //normalizer_utility
144 | 
145 | 


--------------------------------------------------------------------------------
/src/normalizer_utility.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef NORMALIZER_UTILITY_H_
  2 | #define NORMALIZER_UTILITY_H_
  3 | #include <string>
  4 | #include <pficommon/data/string/ustring.h>
  5 | #include <pficommon/data/string/utility.h>
  6 | #include <pficommon/text/json.h>
  7 | #include <ux/ux.hpp>
  8 | #include "digit_utility.hpp"
  9 | 
 10 | namespace normalizer_utility {
 11 | using namespace digit_utility;
 12 | class Time {
 13 | public:
 14 |   Time(const double value) {
 15 |     year = month = day = hour = minute = second = value;
 16 |   }
 17 |   Time(const double &year, const double &month, const double &day, const double &hour, const double &minute, const double &second)
 18 |       : year(year),
 19 |         month(month),
 20 |         day(day),
 21 |         hour(hour),
 22 |         minute(minute),
 23 |         second(second) {
 24 |   }
 25 |   const bool operator==(const Time& t){
 26 |     return t.year == year &&
 27 |            t.month == month &&
 28 |            t.day == day &&
 29 |            t.hour == hour &&
 30 |            t.minute == minute &&
 31 |            t.second == second;
 32 |   }
 33 | 
 34 |   bool is_null_time_element(double t, bool is_upperbound){
 35 | 		if(is_upperbound) return t==-INFINITY;
 36 | 		else return t==INFINITY;
 37 |   }
 38 | 	
 39 | 	bool is_infinity_time_element(double t, bool is_upperbound){
 40 | 		if(is_upperbound) return t==INFINITY;
 41 | 		else return t==-INFINITY;
 42 | 	}
 43 |   
 44 |   std::string to_string_from_time_element(double t, std::string null_string, std::string kugiri, bool is_upperbound, int width){
 45 | 		std::stringstream ss;
 46 | 	  std::string ret;
 47 | 	  if(is_null_time_element(t, is_upperbound)){
 48 | 		  return null_string + kugiri;
 49 | 		}else{
 50 | 			ss.fill('0'); ss.width(width);
 51 | 		  ss << t << kugiri;
 52 | 		  ss >> ret;
 53 | 		  return ret;
 54 | 	  }
 55 |   }
 56 | 	
 57 | 	std::string to_interval_string_from_time_element(double t, std::string time_position, bool is_upperbound){
 58 | 		std::stringstream ss;
 59 | 	  std::string ret;
 60 | 	  if(is_null_time_element(t, is_upperbound)){
 61 | 		  return "";
 62 | 		}else{
 63 | 		  ss << t << time_position;
 64 | 		  ss >> ret;
 65 | 		  return ret;
 66 | 	  }
 67 |   }
 68 | 	
 69 | 	std::string to_string(bool is_upperbound){
 70 | 		if(is_null_time_element(year, is_upperbound) and is_null_time_element(month, is_upperbound) and is_null_time_element(day, is_upperbound)){
 71 | 			return to_time_string(is_upperbound);
 72 | 		}else{
 73 | 			return to_date_string(is_upperbound);
 74 | 		}
 75 | 	}
 76 |   
 77 |   std::string to_date_string(bool is_upperbound){
 78 | 	 std::stringstream ss;
 79 | 	 std::string ret;
 80 | 	 if(is_infinity_time_element(year, is_upperbound)){ 
 81 | 		if(is_upperbound) return "INF";
 82 | 		else return "-INF";
 83 | 	 }
 84 | 	 ss << to_string_from_time_element(year, "XXXX", "-", is_upperbound, 4);
 85 | 	 ss << to_string_from_time_element(month, "XX", "-", is_upperbound, 2);
 86 | 	 ss << to_string_from_time_element(day, "XX", "", is_upperbound, 2);
 87 | 	 ss >> ret;
 88 | 	 return ret;
 89 |   }
 90 | 	
 91 | 	std::string to_time_string(bool is_upperbound){
 92 | 		std::stringstream ss;
 93 | 		std::string ret;
 94 | 		if(is_infinity_time_element(year, is_upperbound)){ 
 95 | 			if(is_upperbound) return "INF";
 96 | 			else return "-INF";
 97 | 		}
 98 | 		ss << to_string_from_time_element(hour, "XX", ":", is_upperbound, 2);
 99 | 		ss << to_string_from_time_element(minute, "XX", ":", is_upperbound, 2);
100 | 		ss << to_string_from_time_element(second, "XX", "", is_upperbound, 2);
101 | 		ss >> ret;
102 | 		return ret;
103 |   }
104 | 	
105 | 	std::string to_duration_string(bool is_upperbound){
106 | 		std::stringstream ss;
107 | 		std::string ret;
108 | 		if(is_infinity_time_element(year, is_upperbound)){ 
109 | 			if(is_upperbound) return "INF";
110 | 			else return "-INF";
111 | 		}
112 | 		ss << "P";
113 | 		ss << to_interval_string_from_time_element(year, "Y", is_upperbound);
114 | 		ss << to_interval_string_from_time_element(month, "M", is_upperbound);
115 | 		ss << to_interval_string_from_time_element(day, "D", is_upperbound);
116 | 		ss << to_interval_string_from_time_element(hour, "h", is_upperbound);
117 | 		ss << to_interval_string_from_time_element(minute, "m", is_upperbound);
118 | 		ss << to_interval_string_from_time_element(second, "s", is_upperbound);
119 | 		ss >> ret;
120 | 		return ret;
121 |   }
122 | 
123 |   double year, month, day, hour, minute, second;
124 | };
125 | 
126 | class NormalizedExpressionTemplate {
127 | public:
128 |   NormalizedExpressionTemplate(const pfi::data::string::ustring& original_expression, const int position_start, const int position_end)
129 |       : original_expression(original_expression),
130 |         position_start(position_start),
131 |         position_end(position_end),
132 |         number_notation_type(NOT_NUMBER),
133 |         include_lowerbound(true),
134 |         include_upperbound(true),
135 |         is_over(false),
136 |         is_less(false),
137 |         ordinary(false) {
138 |       options.clear();
139 |   }
140 |   
141 |   void set_original_expression_from_position(const pfi::data::string::ustring& utext);
142 | 
143 |   pfi::data::string::ustring original_expression;
144 |   int position_start, position_end;
145 |   int number_notation_type;
146 |   bool include_lowerbound, include_upperbound;
147 |   bool is_over, is_less;
148 |   bool ordinary;
149 |   std::vector<std::string> options;
150 | };
151 |   
// Common part of a dictionary entry describing one recognizable pattern.
class LimitedExpressionTemplate {
public:
  // Gives the built-in members a defined value; without this constructor they
  // are indeterminate after default construction until something assigns
  // them.
  LimitedExpressionTemplate()
      : ordinary(false),
        total_number_of_place_holder(0),
        length_of_strings_after_final_place_holder(0) {
  }

  // Recompute the two derived fields below from `pattern`.
  void set_total_number_of_place_holder();
  void set_length_of_strings_after_final_place_holder();
  
  std::string pattern;
  bool ordinary;
  std::string option;
  int total_number_of_place_holder; // number of PLACE_HOLDERs in pattern (e.g. *月*日 -> 2)
  int length_of_strings_after_final_place_holder; // length of the text after the last PLACE_HOLDER in pattern (e.g. *月*日 -> 1); needed to identify positions
};
163 | 
164 | struct NumberModifier {
165 |   template<class Archive>
166 |   void serialize(Archive &ar) {
167 |     ar & MEMBER(pattern)& MEMBER(process_type);
168 |   }
169 |   std::string pattern, process_type;
170 | };
171 | 
172 | void extract_after_string(const pfi::data::string::ustring& text, int i, pfi::data::string::ustring& after_string);
173 | void extract_before_string(const pfi::data::string::ustring& text, int i, pfi::data::string::ustring& before_string);
174 | void prefixSearch(const pfi::data::string::ustring& ustr, const ux::Map<int>& patterns, int& matching_pattern_id);
175 | void suffixSearch(const pfi::data::string::ustring& ustr, const ux::Map<int>& patterns_rev, int& matching_pattern_id);
176 | void search_suffix_number_modifier(const pfi::data::string::ustring& text, const int exp_position_end,
177 |     const ux::Map<int>& suffix_number_modifier_patterns, int& matching_pattern_id);
178 | void search_prefix_number_modifier(const pfi::data::string::ustring& text, const int exp_position_start,
179 |     const ux::Map<int>& prefix_number_modifier_patterns, int& matching_pattern_id);
180 | void replace_numbers_in_text(const pfi::data::string::ustring& utext, const std::vector<digit_utility::Number>& numbers,
181 |     pfi::data::string::ustring& utext_replaced);
182 | void shorten_place_holder_in_text(const pfi::data::string::ustring& utext, pfi::data::string::ustring& utext_shortened);
183 | bool is_place_holder(pfi::data::string::uchar uc);
184 | bool is_finite(double value);
185 | bool is_null_time(const Time& t);
186 | const std::string identify_time_detail(const normalizer_utility::Time& time);
187 | std::string reverse_string(const std::string& str);
188 | 
// Converts `a` to `b` through a string stream: `a` is written with
// operator<< and `b` is read back with operator>>.
template<class T1, class T2>
void cast(const T1& a, T2& b) {
  std::stringstream converter;
  converter << a;
  converter >> b;
}
195 | const pfi::data::string::ustring PLACE_HOLDER = pfi::data::string::string_to_ustring("ǂ"); //LATIN LETTER ALVEOLAR CLICK
196 | 
197 | } //normalizer_utility
198 | 
199 | #endif //NORMALIZER_UTILITY_H_
200 | 


--------------------------------------------------------------------------------
/src/normalizer_utility_test.cpp:
--------------------------------------------------------------------------------
  1 | #include <gtest/gtest.h>
  2 | #include <string>
  3 | #include <complex>
  4 | #include "normalizer_utility.hpp"
  5 | 
  6 | #include <pficommon/data/string/ustring.h>
  7 | #include <ux/ux.hpp>
  8 | 
  9 | using namespace normalizer_utility;
 10 | using namespace std;
 11 | using namespace pfi::data::string;
 12 | class NormalizerUtilityTest: public testing::Test {
 13 | public:
 14 |   ux::Map<int> uxm, uxm_rev;
 15 |   ustring rev(ustring str) {
 16 |     return ustring(str.rbegin(), str.rend());
 17 |   }
 18 | 
 19 |   void SetUp() {
 20 |     vector<pair<string, int> > kvs, kvs_rev;
 21 |     kvs.push_back(make_pair("あ", 1));
 22 |     kvs.push_back(make_pair("あい", 2));
 23 |     kvs.push_back(make_pair("あいう", 3));
 24 |     kvs.push_back(make_pair("いう", 4));
 25 |     kvs.push_back(make_pair("うえ", 5));
 26 |     kvs.push_back(make_pair("うえお", 6));
 27 |     kvs.push_back(make_pair("えお", 7));
 28 |     kvs.push_back(make_pair("いうえおあ", 8));
 29 |     uxm.build(kvs);
 30 | 
 31 |     for (int i = 0; i < static_cast<int>(kvs.size()); i++) {
 32 |       string str_rev = ustring_to_string(rev(string_to_ustring(kvs[i].first)));
 33 |       kvs_rev.push_back(make_pair(str_rev, kvs[i].second));
 34 |     }
 35 |     uxm_rev.build(kvs_rev);
 36 |   }
 37 | 
 38 |   void TearDown() {
 39 |   }
 40 | };
 41 | 
 42 | TEST_F(NormalizerUtilityTest, prefixSearch) {
 43 |   ustring ustr(string_to_ustring("あいうえお"));
 44 |   int matching_pattern_id;
 45 |   prefixSearch(ustr, uxm, matching_pattern_id);
 46 |   EXPECT_EQ(3, matching_pattern_id); // ("あいう", 3)
 47 | }
 48 | 
 49 | TEST_F(NormalizerUtilityTest, prefixSearch2) {
 50 |   ustring ustr(string_to_ustring("いうえおあいうえお"));
 51 |   int matching_pattern_id;
 52 |   prefixSearch(ustr, uxm, matching_pattern_id);
 53 |   EXPECT_EQ(8, matching_pattern_id); // ("いうえおあ", 8)
 54 | }
 55 | 
 56 | TEST_F(NormalizerUtilityTest, suffixSearch) {
 57 |   ustring ustr(string_to_ustring("あいうえお"));
 58 |   int matching_pattern_id;
 59 |   suffixSearch(ustr, uxm_rev, matching_pattern_id);
 60 |   EXPECT_EQ(6, matching_pattern_id); // ("うえお", 6)
 61 | }
 62 | 
 63 | TEST_F(NormalizerUtilityTest, suffixSearch2) {
 64 |   ustring ustr(string_to_ustring("あいうえおあ"));
 65 |   int matching_pattern_id;
 66 |   suffixSearch(ustr, uxm_rev, matching_pattern_id);
 67 |   EXPECT_EQ(8, matching_pattern_id); // ("いうえおあ", 8)
 68 | }
 69 | 
 70 | TEST_F(NormalizerUtilityTest, extract_after_string) {
 71 |   ustring text(string_to_ustring("それは秒速5センチメートルくらいで進む"));
 72 |   ustring str;
 73 |   extract_after_string(text, 6, str);
 74 |   EXPECT_EQ("センチメートルくらいで進む", ustring_to_string(str));
 75 | }
 76 | 
 77 | TEST_F(NormalizerUtilityTest, extract_before_string) {
 78 |   ustring text(string_to_ustring("それは秒速5センチメートルくらいで進む"));
 79 |   ustring str;
 80 |   extract_before_string(text, 5, str);
 81 |   EXPECT_EQ("それは秒速", ustring_to_string(str));
 82 | }
 83 | 
 84 | TEST_F(NormalizerUtilityTest, seach_suffix) {
 85 |   ustring text(string_to_ustring("あいうえおあ5あいうえおごごごごご"));
 86 |   int matching_pattern_id;
 87 |   search_suffix_number_modifier(text, 7, uxm, matching_pattern_id);
 88 |   EXPECT_EQ(3, matching_pattern_id);
 89 | }
 90 | 
 91 | TEST_F(NormalizerUtilityTest, search_prefix) {
 92 |   ustring text(string_to_ustring("あいうえおあ5あいうえおごごごごご"));
 93 |   int matching_pattern_id;
 94 |   search_prefix_number_modifier(text, 6, uxm_rev, matching_pattern_id);
 95 |   EXPECT_EQ(8, matching_pattern_id);
 96 | }
 97 | 
 98 | TEST_F(NormalizerUtilityTest, replace_numbers_in_text) {
 99 |   ustring text(string_to_ustring("その30人がそれは三十五人でボボボ")), text_replaced;
100 |   vector<Number> numbers;
101 |   ustring exp1(string_to_ustring("30人")), exp2(string_to_ustring("三十五人"));
102 |   numbers.push_back(Number(exp1, 2, 4));
103 |   numbers.push_back(Number(exp2, 9, 12));
104 |   replace_numbers_in_text(text, numbers, text_replaced);
105 |   EXPECT_EQ(string_to_ustring("そのǂǂ人がそれはǂǂǂ人でボボボ") , text_replaced);
106 | }
107 | 
108 | TEST_F(NormalizerUtilityTest, shorten_place_holder_in_text) {
109 |   ustring text(string_to_ustring("そのǂǂ人がそれはǂǂǂǂǂǂ人でボボボǂǂǂ")), text_shortened;
110 |   shorten_place_holder_in_text(text, text_shortened);
111 |   EXPECT_EQ(string_to_ustring("そのǂ人がそれはǂ人でボボボǂ") , text_shortened);
112 | }
113 | 
114 | TEST_F(NormalizerUtilityTest, is_place_holder_true) {
115 |   EXPECT_TRUE(is_place_holder(string_to_uchar("ǂ")));
116 | }
117 | 
118 | TEST_F(NormalizerUtilityTest, is_place_holder_false) {
119 |   EXPECT_FALSE(is_place_holder(string_to_uchar("あ")));
120 | }
121 | 
122 | TEST_F(NormalizerUtilityTest, is_finite_false) {
123 |   EXPECT_FALSE(is_finite(INFINITY));
124 | }
125 | 
126 | TEST_F(NormalizerUtilityTest, is_finite_true) {
127 |   EXPECT_TRUE(is_finite(99999.0));
128 | }
129 | 
130 | TEST_F(NormalizerUtilityTest, is_null_time_true) {
131 |   Time t(INFINITY);
132 |   EXPECT_TRUE(is_null_time(t));
133 | }
134 | 
135 | TEST_F(NormalizerUtilityTest, is_null_time_false) {
136 |   Time t(1);
137 |   EXPECT_FALSE(is_null_time(t));
138 | }
139 | 
140 | TEST_F(NormalizerUtilityTest, identify_time_detail) {
141 |   Time t(1,1,1,1,1,INFINITY);
142 |   EXPECT_EQ(identify_time_detail(t), "mn");
143 | }
144 | 
145 | TEST_F(NormalizerUtilityTest, reverse_string) {
146 |   string str("aiueo");
147 |   EXPECT_EQ(reverse_string(str), "oeuia");
148 | }


--------------------------------------------------------------------------------
/src/number_normalizer.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef NUMBER_NORMALIZER_H_
 2 | #define NUMBER_NORMALIZER_H_
 3 | #include <string>
 4 | #include <vector>
 5 | #include <float.h>
 6 | #include <pficommon/data/string/ustring.h>
 7 | #include <pficommon/data/string/utility.h>
 8 | #include "digit_utility.hpp"
 9 | 
10 | namespace number_normalizer {
11 | using namespace digit_utility;
12 | 
13 | 
// Declares the number-extraction step. Only the interface is visible in this
// header; the behaviour notes below are inferred from names and signatures
// and should be confirmed against the implementation file.
class NumberExtractor {
public:
  // Reads `input` and writes its results into `output`.
  // NOTE(review): presumably one Number per numeric string found in `input` - confirm.
  void extract_number(const std::string& input, std::vector<Number>& output);
private:
  // Predicate over an integer notation-type code (the codes are defined elsewhere).
  bool is_invalid_notation_type(int notation_type);
  // Takes the text, an in/out character index `i` and an output string `numstr`.
  // NOTE(review): presumably advances `i` past the longest numeric run - confirm.
  void return_longest_number_strings(const pfi::data::string::ustring& utext, int &i, std::string& numstr);
};
21 | 
22 | 
23 | 
24 | class NumberConverterTemplate {
25 | public:
26 |   void convert_number(const pfi::data::string::ustring& number_string_org, double& value, int& number_type);
27 | protected:
28 |   virtual void convert_arabic_kansuji_mixed_of_4digit(const pfi::data::string::ustring& number_string, int& number_converted) = 0;
29 |   void delete_comma(const pfi::data::string::ustring& ustr, pfi::data::string::ustring& ret);
30 |   void convert_arabic_numerals(const pfi::data::string::ustring& number_string, double& value);
31 |   void convert_arabic_kansuji_kurai_man_mixed(const pfi::data::string::ustring& number_string, double& value);
32 |   void convert_arabic_kansuji_mixed(const pfi::data::string::ustring& number_string, double& value);
33 | };
34 |   
// Converter that supplies the Japanese implementation of the conversion hook
// declared pure virtual in NumberConverterTemplate.
class JapaneseNumberConverter : public NumberConverterTemplate{
private:
  // Overrides the base-class hook; the result is written to `number_converted`.
  void convert_arabic_kansuji_mixed_of_4digit(const pfi::data::string::ustring& number_string, int& number_converted);
};
39 | 
// Converter that supplies the Chinese implementation of the conversion hook
// declared pure virtual in NumberConverterTemplate.
class ChineseNumberConverter : public NumberConverterTemplate{
private:
  // Overrides the base-class hook; the result is written to `number_converted`.
  void convert_arabic_kansuji_mixed_of_4digit(const pfi::data::string::ustring& number_string, int& number_converted);
};
44 | 
// Converter for Arabic-numeral input.
class ArabicNumberConverter : public NumberConverterTemplate{
public:
  // NOTE: this hides (does not override) NumberConverterTemplate::convert_number,
  // which is non-virtual. A call made through a base-class pointer or
  // reference therefore runs the base version, not this one.
  void convert_number(const pfi::data::string::ustring& number_string_org, double& value, int& number_type);
private:
  // Overrides the base-class hook; the result is written to `number_converted`.
  void convert_arabic_kansuji_mixed_of_4digit(const pfi::data::string::ustring& number_string, int& number_converted);
};
51 |   
52 | 
53 | 
54 | class SymbolFixer {
55 | public:
56 |   void fix_numbers_by_symbol(const std::string& text, std::vector<Number>& numbers);
57 | private:
58 |   bool is_plus(const pfi::data::string::ustring& utext, int i, pfi::data::string::ustring& plus_strings);
59 |   bool is_minus(const pfi::data::string::ustring& utext, int i, pfi::data::string::ustring& plus_strings);
60 |   void fix_prefix_symbol(const pfi::data::string::ustring& utext, std::vector<Number>& numbers, int i);
61 |   double create_decimal_value(const Number& number);
62 |   void fix_decimal_point(std::vector<Number>& numbers, int i, pfi::data::string::ustring decimal_strings);
63 |   void fix_range_expression(std::vector<Number>& numbers, int i, pfi::data::string::ustring range_strings);
64 |   void fix_intermediate_symbol(const pfi::data::string::ustring& utext, std::vector<Number>& numbers, int i);
65 |   void fix_suffix_symbol(const pfi::data::string::ustring& utext, std::vector<Number>& numbers, int i);
66 | };
67 | 
68 | 
69 | 
70 | class NumberNormalizer {
71 | public:
72 |   NumberNormalizer(const std::string& language) {language_ = language; digit_utility::init_kansuji(language); }
73 |   ;
74 |   void process(const std::string& input, std::vector<Number>& output);
75 |   void process_dont_fix_by_symbol(const std::string& input, std::vector<Number>& output); //絶対時間表現の規格化の際に使用する（絶対時間表現では、前もって記号を処理させないため）
76 |   
77 | private:
78 |   std::string language_;
79 | };
80 |   
81 | } //namespace number_normalizer
82 | 
83 | #endif //NUMBER_NORMALIZER_H_
84 | 


--------------------------------------------------------------------------------
/src/numerical_expression_extractor.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/numerical_expression_extractor.pyc


--------------------------------------------------------------------------------
/src/numerical_expression_normalizer.cpp:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include "numerical_expression_normalizer.hpp"
  3 | #include "digit_utility.hpp"
  4 | #include "number_normalizer.hpp"
  5 | 
  6 | namespace numerical_expression_normalizer{
  7 | 
  8 | void NumericalExpressionNormalizer::init(){
  9 |   load_from_dictionaries("num_counter_json.txt", "num_prefix_counter_json.txt", "num_prefix_json.txt", "num_suffix_json.txt");
 10 | }
 11 |   
// Number-extraction step: forwards `text` to the owned NumberNormalizer (NN),
// which fills `numbers` via its process() method.
void NumericalExpressionNormalizer::normalize_number(const std::string& text, std::vector<digit_utility::Number>& numbers) {
  NN.process(text, numbers);
}
 15 | 
 16 | void multiply_numexp_value(NumericalExpression& numexp, double x){
 17 |   numexp.value_lowerbound *= x;
 18 |   numexp.value_upperbound *= x;
 19 | }
 20 | 
 21 | 
 22 | void do_option_wari(std::vector<NumericalExpression>& numexps, int expression_id, const Counter matching_limited_expression){
 23 | 	pfi::data::string::ustring upattern = pfi::data::string::string_to_ustring(matching_limited_expression.pattern);
 24 |   numexps[expression_id].position_end += upattern.size();
 25 | 	numexps[expression_id].counter = pfi::data::string::string_to_ustring("%");
 26 | 	numexps[expression_id].ordinary = false;
 27 | 			
 28 | 	//set_value
 29 | 	double value = 0;
 30 | 	for(int i=0; i<static_cast<int>(upattern.size()); i+=2){
 31 | 		if(upattern[i] == pfi::data::string::string_to_ustring("割")[0]){
 32 | 			value += numexps[expression_id + i/2].value_lowerbound * 10; 
 33 | 		}else if(upattern[i] == pfi::data::string::string_to_ustring("分")[0]){
 34 | 			value += numexps[expression_id + i/2].value_lowerbound * 1;
 35 | 		}else if(upattern[i] == pfi::data::string::string_to_ustring("厘")[0]){
 36 | 			value += numexps[expression_id + i/2].value_lowerbound * 0.1;
 37 | 		}
 38 | 	}
 39 | 	numexps[expression_id].value_lowerbound = value;
 40 | 	numexps[expression_id].value_upperbound = value;
 41 | 	
 42 | 	//erase merged numexps
 43 | 	for(int i=2; i<static_cast<int>(upattern.size()); i+=2){
 44 | 		numexps.erase(numexps.begin() + expression_id + 1);
 45 | 	}
 46 | }
 47 | 
 48 | void NumericalExpressionNormalizer::revise_any_type_expression_by_matching_limited_expression(std::vector<NumericalExpression>& numexps, int& expression_id, const Counter matching_limited_expression){
 49 |   //特殊なタイプをここで例外処理
 50 | 	if(matching_limited_expression.option == "wari"){
 51 | 		do_option_wari(numexps, expression_id, matching_limited_expression);
 52 | 		return;
 53 | 	}
 54 |   //TODO : 今のところ特殊なタイプは分数しかないので、とりあえず保留
 55 | 
 56 |   numexps[expression_id].position_end += pfi::data::string::string_to_ustring(matching_limited_expression.pattern).size();
 57 |   numexps[expression_id].counter = pfi::data::string::string_to_ustring(matching_limited_expression.counter);
 58 |   multiply_numexp_value(numexps[expression_id], pow(10, matching_limited_expression.SI_prefix));
 59 |   multiply_numexp_value(numexps[expression_id], pow(10, matching_limited_expression.optional_power_of_ten));
 60 |   numexps[expression_id].ordinary = matching_limited_expression.ordinary;
 61 | }
 62 |   
 63 | void NumericalExpressionNormalizer::revise_any_type_expression_by_matching_prefix_counter(NumericalExpression& numexp, const Counter& matching_limited_expression){    
 64 |   if(matching_limited_expression.option == "counter"){
 65 |     numexp.position_start -= pfi::data::string::string_to_ustring(matching_limited_expression.pattern).size();
 66 |     numexp.counter = pfi::data::string::string_to_ustring(matching_limited_expression.counter);
 67 |     multiply_numexp_value(numexp, pow(10, matching_limited_expression.SI_prefix));
 68 |     multiply_numexp_value(numexp, pow(10, matching_limited_expression.optional_power_of_ten));
 69 |     numexp.ordinary = matching_limited_expression.ordinary;
 70 |   }else if(matching_limited_expression.option == "add_suffix_counter"){
 71 |     if(numexp.counter.empty()) return; //TODO : 単位が空の場合、追加は行わない？
 72 |     numexp.position_start -= pfi::data::string::string_to_ustring(matching_limited_expression.pattern).size();
 73 |     numexp.counter += pfi::data::string::string_to_ustring(matching_limited_expression.counter);
 74 |   }
 75 | }
 76 | 
 77 | void NumericalExpressionNormalizer::revise_any_type_expression_by_number_modifier(NumericalExpression& numexp, const normalizer_utility::NumberModifier& number_modifier){
 78 |   std::string process_type = number_modifier.process_type;
 79 |   /* 「約」などのNumberModifierの処理を行う。
 80 |    */
 81 |   if(process_type == "or_over"){
 82 |     numexp.value_upperbound = INFINITY;
 83 |   }else if(process_type == "or_less"){
 84 |     numexp.value_lowerbound = -INFINITY;
 85 |   }else if(process_type == "over"){
 86 |     numexp.value_upperbound = INFINITY;
 87 |     numexp.include_lowerbound = false;
 88 |   }else if(process_type == "less"){
 89 |     numexp.value_lowerbound = -INFINITY;
 90 |     numexp.include_upperbound = false;
 91 |   }else if(process_type == "dai"){
 92 |     //TODO : どんな処理をするか未定。。　該当する事例は「30代」「9秒台」のみ？
 93 |   }else if(process_type == "ordinary"){
 94 |     numexp.ordinary = true;
 95 |   }else if(process_type == "han"){
 96 |     numexp.value_lowerbound += 0.5;
 97 |     numexp.value_upperbound += 0.5;
 98 |   }else if(process_type[0] == '/'){  // /hour, /minなど
 99 |     numexp.counter += pfi::data::string::string_to_ustring(process_type);
100 |   }else if(process_type == "none"){
101 |     ;
102 |   }else if(process_type == "per"){
103 |     // TODO : どんな処理をするか未定。　該当する事例は「1ページ毎」など。
104 |   }else if(process_type == "about"){
105 |     numexp.value_lowerbound *= 0.7;
106 |     numexp.value_upperbound *= 1.3;
107 | 	}else if(process_type == "kyou"){
108 |     numexp.value_upperbound *= 1.6;
109 |   }else if(process_type == "jaku"){
110 |     numexp.value_lowerbound *= 0.5;
111 | 	}else if(process_type == "made"){
112 | 		if(numexp.value_lowerbound == numexp.value_upperbound) {
113 | 			numexp.value_lowerbound = -INFINITY;
114 | 		}else{
115 | 			;
116 | 		}
117 | 	}else {
118 |     numexp.options.push_back(process_type);
119 | 	}
120 | }
121 |   
122 | void NumericalExpressionNormalizer::delete_not_any_type_expression(std::vector<NumericalExpression>& numexps){
123 |   for(int i=0; i<static_cast<int>(numexps.size()); i++){
124 |     if(numexps[i].counter.empty()){
125 |       numexps.erase(numexps.begin() + i);
126 |       i--;
127 |     }
128 |   }
129 | }
130 |   
131 | void delete_after_slash(pfi::data::string::ustring& ustr){
132 |   if(ustr.find(pfi::data::string::string_to_uchar("/")) == pfi::data::string::ustring::npos) return;
133 |   ustr = ustr.substr(0, ustr.find(pfi::data::string::string_to_uchar("/")));
134 | }
135 | 
136 | bool suffix_match_counter(pfi::data::string::ustring counter1, pfi::data::string::ustring counter2){
137 |   //単位が一致しているかどうかを判断する。
138 |   //「時速50km〜60km」のような事例に対応する（前者は[50km/h], 後者は[60km]と規格化されており、完全一致ではマッチしない）ために、スラッシュより前の単位が一致するかどうかで判断する
139 |   delete_after_slash(counter1);
140 |   delete_after_slash(counter2);
141 |   return counter1 == counter2;
142 | }
143 | 
144 | void NumericalExpressionNormalizer::fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector<NumericalExpression>& numexps) {
145 |   for(int i=0; i<static_cast<int>(numexps.size()-1); i++){
146 |     if(have_kara_suffix(numexps[i].options) && have_kara_prefix(numexps[i+1].options) && numexps[i].position_end +2 >= numexps[i+1].position_start){
147 |       if(!suffix_match_counter(numexps[i].counter, numexps[i+1].counter)) continue;
148 |       numexps[i].value_upperbound = numexps[i+1].value_upperbound;
149 |       numexps[i].position_end = numexps[i+1].position_end;
150 |       numexps[i].set_original_expression_from_position(utext);
151 |       //memo :単位のマージは、必ずiの方がi+1よりも長いので、する必要なし
152 |       merge_options(numexps[i].options, numexps[i+1].options);
153 |       numexps.erase(numexps.begin()+i+1);
154 |     }
155 |   }
156 | }
157 |   
158 | } //namespace numerical_expression_normalizer
159 | 
160 | 
161 | 


--------------------------------------------------------------------------------
/src/numerical_expression_normalizer.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef NUMERICAL_EXPRESSION_NORMALIZER_H_
 2 | #define NUMERICAL_EXPRESSION_NORMALIZER_H_
 3 | #include <string>
 4 | #include <ux/ux.hpp>
 5 | #include "digit_utility.hpp"
 6 | #include "number_normalizer.hpp"
 7 | #include "normalizer_utility.hpp"
 8 | #include "normalizer_template.hpp"
 9 | 
10 | namespace numerical_expression_normalizer{
11 | 
12 | struct NumericalExpression : normalizer_utility::NormalizedExpressionTemplate{
13 |   NumericalExpression(const pfi::data::string::ustring& original_expression, // TODO : 実装方針が変わったので、この初期化リストはテストでしか用いていない。テストを変更して、これは削る。
14 |          const int position_start,
15 |          const int position_end,
16 |          const double value_lowerbound,
17 |          const double value_upperbound)
18 |       : normalizer_utility::NormalizedExpressionTemplate(original_expression, position_start, position_end),
19 |         value_lowerbound(value_lowerbound),
20 |         value_upperbound(value_upperbound),
21 |         counter(pfi::data::string::string_to_ustring("")),
22 |         ordinary(false)
23 |         {}
24 |   
25 |   NumericalExpression(digit_utility::Number number)
26 |       : normalizer_utility::NormalizedExpressionTemplate(number.original_expression, number.position_start, number.position_end),
27 |         value_lowerbound(number.value_lowerbound),
28 |         value_upperbound(number.value_upperbound),
29 |         counter(pfi::data::string::string_to_ustring("")),
30 |         ordinary(false)
31 |         {}
32 | 
33 |   double value_lowerbound, value_upperbound;
34 |   pfi::data::string::ustring counter;
35 |   bool ordinary;
36 | };
37 | 
38 |   
// Dictionary entry describing a counter (unit) pattern. `pattern`, `ordinary`
// and `option` are not declared here; they are serialized below and are
// presumably inherited from LimitedExpressionTemplate (confirm).
struct Counter : public normalizer_utility::LimitedExpressionTemplate{
  // Serialization hook: lists the members that are read from / written to an
  // archive.
  template <class Archive>
  void serialize(Archive &ar){
    ar & MEMBER(pattern) & MEMBER(counter) & MEMBER(SI_prefix) & MEMBER(optional_power_of_ten) & MEMBER(ordinary) & MEMBER(option);
  }

  std::string counter;
  int SI_prefix; // used for notations such as "kilo" and "milli"; applied as a power of ten
  int optional_power_of_ten; // used for special notations such as 「トン」 (the number has to be multiplied by 10^4)
};
49 | 
50 |   
// Normalizer for numerical expressions. It plugs NumericalExpression and
// Counter into the generic NormalizerTemplate and provides the steps declared
// below; number extraction is delegated to the owned NumberNormalizer.
class NumericalExpressionNormalizer : public normalizer_template::NormalizerTemplate<NumericalExpression, Counter>{
public:
  // Constructs the number normalizer for `language`, stores the language and
  // loads the dictionaries through init().
  NumericalExpressionNormalizer(const std::string& language) : NN(language) { language_ = language; init(); }

private:
  // Loads the num_* dictionaries.
  void init();
  // Extracts the numbers in `text` into `numbers`.
  void normalize_number(const std::string& text, std::vector<digit_utility::Number>& numbers);
  // Apply a matched counter entry, found after or before the number respectively.
  void revise_any_type_expression_by_matching_limited_expression(std::vector<NumericalExpression>& numexps, int& expression_id, Counter matching_limited_expression);
  void revise_any_type_expression_by_matching_prefix_counter(NumericalExpression& numexps, const Counter& matching_limited_expression);
  // Applies a number modifier according to its process_type.
  void revise_any_type_expression_by_number_modifier(NumericalExpression& numexp, const normalizer_utility::NumberModifier& number_modifier);
  // Removes expressions whose counter is empty.
  void delete_not_any_type_expression(std::vector<NumericalExpression>& numexps);
  // Merges neighbouring expressions that form a range.
  void fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector<NumericalExpression>& numexps);
  
  number_normalizer::NumberNormalizer NN;
};
66 | 
67 | } //namespace numerical_expression_normalizer
68 | 
69 | #endif //NUMERICAL_EXPRESSION_NORMALIZER_H_
70 | 


--------------------------------------------------------------------------------
/src/numerical_expression_normalizer_test.cpp:
--------------------------------------------------------------------------------
  1 | #include <gtest/gtest.h>
  2 | #include <string>
  3 | #include <fstream>
  4 | #include "normalizer_utility.hpp"
  5 | #include "numerical_expression_normalizer.hpp"
  6 | 
  7 | #include <pficommon/data/string/ustring.h>
  8 | #include <ux/ux.hpp>
  9 | 
 10 | using namespace normalizer_utility;
 11 | using namespace std;
 12 | using namespace pfi::data::string;
 13 | using namespace numerical_expression_normalizer;
// Fixture for the NumericalExpressionNormalizer tests. It holds no shared
// state: every test constructs its own normalizer.
class NumexpNormalizerTest : public testing::Test {
public:
    void SetUp() {}
    void TearDown() {}
};
 19 | 
 20 | bool is_same_numexp(const NumericalExpression &n1, const NumericalExpression &n2){
 21 |   return
 22 |     n1.original_expression==n2.original_expression &&
 23 |     n1.position_start==n2.position_start &&
 24 |     n1.position_end==n2.position_end &&
 25 |     n1.value_lowerbound==n2.value_lowerbound &&
 26 |     n1.value_upperbound==n2.value_upperbound &&
 27 |     n1.counter==n2.counter;
 28 | }
 29 | 
 30 | TEST_F(NumexpNormalizerTest, simple1) {
 31 |   NumericalExpressionNormalizer NEN("ja");
 32 |   std::string text("その三人が死んだ");
 33 |   std::vector<NumericalExpression> numexps;
 34 |   NEN.process(text, numexps);
 35 |   NumericalExpression ex(string_to_ustring("三人"), 2, 4, 3, 3);
 36 |   ex.counter = string_to_ustring("人");
 37 |   
 38 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
 39 | }
 40 | 
 41 | TEST_F(NumexpNormalizerTest, simple2) {
 42 |   NumericalExpressionNormalizer NEN("ja");
 43 |   std::string text("3kgのレッドブルと、2USドルのモンスター");
 44 |   std::vector<NumericalExpression> numexps;
 45 |   NEN.process(text, numexps);
 46 |   NumericalExpression ex1(string_to_ustring("3kg"), 0, 3, 3000, 3000);
 47 |   ex1.counter = string_to_ustring("g");
 48 |   NumericalExpression ex2(string_to_ustring("2USドル"),11, 16, 2, 2);
 49 |   ex2.counter = string_to_ustring("ドル");
 50 |   EXPECT_TRUE(is_same_numexp(ex1, numexps[0]));
 51 |   EXPECT_TRUE(is_same_numexp(ex2, numexps[1]));
 52 | }
 53 | 
 54 | TEST_F(NumexpNormalizerTest, about1) {
 55 |   NumericalExpressionNormalizer NEN("ja");
 56 |   std::string text("その約十人がぼぼぼぼ");
 57 |   std::vector<NumericalExpression> numexps;
 58 |   NEN.process(text, numexps);
 59 |   NumericalExpression ex(string_to_ustring("約十人"), 2, 5, 7, 13.0);
 60 |   ex.counter = string_to_ustring("人");
 61 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
 62 | }
 63 | 
 64 | TEST_F(NumexpNormalizerTest, about2) {
 65 |   NumericalExpressionNormalizer NEN("ja");
 66 |   std::string text("そのおよそ十人がぼぼぼぼ");
 67 |   std::vector<NumericalExpression> numexps;
 68 |   NEN.process(text, numexps);
 69 |   NumericalExpression ex(string_to_ustring("およそ十人"), 2, 7, 7, 13.0);
 70 |   ex.counter = string_to_ustring("人");
 71 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
 72 | }
 73 | 
 74 | TEST_F(NumexpNormalizerTest, or_over) {
 75 |   NumericalExpressionNormalizer NEN("ja");
 76 |   std::string text("その三人以上がぼぼぼぼ");
 77 |   std::vector<NumericalExpression> numexps;
 78 |   NEN.process(text, numexps);
 79 |   NumericalExpression ex(string_to_ustring("三人以上"), 2, 6, 3.0, INFINITY);
 80 |   ex.counter = string_to_ustring("人");
 81 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
 82 | }
 83 | 
 84 | TEST_F(NumexpNormalizerTest, about_and_or_over) {
 85 |   NumericalExpressionNormalizer NEN("ja");
 86 |   std::string text("その約十人以上がぼぼぼぼ");
 87 |   std::vector<NumericalExpression> numexps;
 88 |   NEN.process(text, numexps);
 89 |   NumericalExpression ex(string_to_ustring("約十人以上"), 2, 7, 7.0, INFINITY);
 90 |   ex.counter = string_to_ustring("人");
 91 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
 92 | }
 93 | 
 94 | TEST_F(NumexpNormalizerTest, or_less) {
 95 |   NumericalExpressionNormalizer NEN("ja");
 96 |   std::string text("その三人以下がぼぼぼぼ");
 97 |   std::vector<NumericalExpression> numexps;
 98 |   NEN.process(text, numexps);
 99 |   NumericalExpression ex(string_to_ustring("三人以下"), 2, 6, -INFINITY, 3);
100 |   ex.counter = string_to_ustring("人");
101 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
102 | }
103 | 
104 | TEST_F(NumexpNormalizerTest, kyou) {
105 |   NumericalExpressionNormalizer NEN("ja");
106 |   std::string text("レッドブルを10本強飲んだ");
107 |   std::vector<NumericalExpression> numexps;
108 |   NEN.process(text, numexps);
109 |   NumericalExpression ex(string_to_ustring("10本強"), 6, 10, 10, 16);
110 |   ex.counter = string_to_ustring("本");
111 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
112 | }
113 | 
114 | TEST_F(NumexpNormalizerTest, jaku) {
115 |   NumericalExpressionNormalizer NEN("ja");
116 |   std::string text("レッドブルを10本弱飲んだ");
117 |   std::vector<NumericalExpression> numexps;
118 |   NEN.process(text, numexps);
119 |   NumericalExpression ex(string_to_ustring("10本弱"), 6, 10, 5, 10);
120 |   ex.counter = string_to_ustring("本");
121 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
122 | }
123 | 
124 | TEST_F(NumexpNormalizerTest, ordinary) {
125 |   NumericalExpressionNormalizer NEN("ja");
126 |   std::string text("本日10本目のレッドブル");
127 |   std::vector<NumericalExpression> numexps;
128 |   NEN.process(text, numexps);
129 |   NumericalExpression ex(string_to_ustring("10本目"), 2, 6, 10, 10);
130 |   ex.counter = string_to_ustring("本");
131 |   ex.ordinary = true;
132 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
133 | }
134 | 
135 | TEST_F(NumexpNormalizerTest, han) {
136 |   NumericalExpressionNormalizer NEN("ja");
137 |   std::string text("レッドブルを1本半飲んだ");
138 |   std::vector<NumericalExpression> numexps;
139 |   NEN.process(text, numexps);
140 |   NumericalExpression ex(string_to_ustring("1本半"), 6, 9, 1.5, 1.5);
141 |   ex.counter = string_to_ustring("本");
142 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
143 | }
144 | 
145 | TEST_F(NumexpNormalizerTest, per) {
146 |   NumericalExpressionNormalizer NEN("ja");
147 |   std::string text("１キロメートル／時");
148 |   std::vector<NumericalExpression> numexps;
149 |   NEN.process(text, numexps);
150 |   NumericalExpression ex(string_to_ustring("１キロメートル／時"), 0, 9, 1000, 1000);
151 |   ex.counter = string_to_ustring("m/h");
152 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
153 | }
154 | 
155 | TEST_F(NumexpNormalizerTest, prefix_counter1) {
156 |   NumericalExpressionNormalizer NEN("ja");
157 |   std::string text("それは¥100だ");
158 |   std::vector<NumericalExpression> numexps;
159 |   NEN.process(text, numexps);
160 |   NumericalExpression ex(string_to_ustring("¥100"), 3, 7, 100, 100);
161 |   ex.counter = string_to_ustring("円");
162 |   ASSERT_EQ(1u, numexps.size());
163 | 
164 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
165 | }
166 | 
167 | TEST_F(NumexpNormalizerTest, prefix_counter2) {
168 |   NumericalExpressionNormalizer NEN("ja");
169 |   std::string text("それは時速40キロメートルだ");
170 |   std::vector<NumericalExpression> numexps;
171 |   NEN.process(text, numexps);
172 |   NumericalExpression ex(string_to_ustring("時速40キロメートル"), 3, 13, 40000, 40000);
173 |   ex.counter = string_to_ustring("m/h");
174 |   ASSERT_EQ(1u, numexps.size());
175 |   
176 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
177 | }
178 | 
179 | TEST_F(NumexpNormalizerTest, range1) {
180 |   NumericalExpressionNormalizer NEN("ja");
181 |   std::string text("このアトラクションは3人〜の運用になります");
182 |   std::vector<NumericalExpression> numexps;
183 |   NEN.process(text, numexps);
184 |   NumericalExpression ex(string_to_ustring("3人〜"), 10, 13, 3, 3);
185 |   ex.counter = string_to_ustring("人");
186 |   ASSERT_EQ(1u, numexps.size());
187 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
188 |   ASSERT_EQ(1u, numexps[0].options.size());
189 |   EXPECT_EQ(numexps[0].options[0], "kara_suffix");
190 | }
191 | 
192 | TEST_F(NumexpNormalizerTest, range2) {
193 |   NumericalExpressionNormalizer NEN("ja");
194 |   std::string text("遊び方の欄には「〜8人」と書いてある");
195 |   std::vector<NumericalExpression> numexps;
196 |   NEN.process(text, numexps);
197 |   NumericalExpression ex(string_to_ustring("〜8人"), 8, 11, 8, 8);
198 |   ex.counter = string_to_ustring("人");
199 |   ASSERT_EQ(1u, numexps.size());
200 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
201 |   ASSERT_EQ(1u, numexps[0].options.size());
202 |   EXPECT_EQ(numexps[0].options[0], "kara_prefix");
203 | }
204 | 
205 | TEST_F(NumexpNormalizerTest, range3) {
206 |   NumericalExpressionNormalizer NEN("ja");
207 |   std::string text("遊び方の欄には「5〜8人」と書いてある");
208 |   std::vector<NumericalExpression> numexps;
209 |   NEN.process(text, numexps);
210 |   NumericalExpression ex(string_to_ustring("5〜8人"), 8, 12, 5, 8);
211 |   ex.counter = string_to_ustring("人");
212 |   ASSERT_EQ(1u, numexps.size());
213 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
214 | }
215 | 
216 | TEST_F(NumexpNormalizerTest, range4) {
217 |   NumericalExpressionNormalizer NEN("ja");
218 |   std::string text("遊び方の欄には「5人〜8人」と書いてある");
219 |   std::vector<NumericalExpression> numexps;
220 |   NEN.process(text, numexps);
221 |   NumericalExpression ex(string_to_ustring("5人〜8人"), 8, 13, 5, 8);
222 |   ex.counter = string_to_ustring("人");
223 |   ASSERT_EQ(1u, numexps.size());
224 |   
225 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
226 | }
227 | 
228 | TEST_F(NumexpNormalizerTest, range5) {
229 |   NumericalExpressionNormalizer NEN("ja");
230 |   std::string text("時速50km〜60km");
231 |   std::vector<NumericalExpression> numexps;
232 |   NEN.process(text, numexps);
233 |   NumericalExpression ex(string_to_ustring("時速50km〜60km"), 0, 11, 50000, 60000);
234 |   ex.counter = string_to_ustring("m/h");
235 |   ASSERT_EQ(1u, numexps.size());
236 |   
237 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
238 | }
239 | 
240 | TEST_F(NumexpNormalizerTest, range6) {
241 |   NumericalExpressionNormalizer NEN("ja");
242 |   std::string text("時速50kmから時速60km");
243 |   std::vector<NumericalExpression> numexps;
244 |   NEN.process(text, numexps);
245 |   NumericalExpression ex(string_to_ustring("時速50kmから時速60km"), 0, 14, 50000, 60000);
246 |   ex.counter = string_to_ustring("m/h");
247 |   ASSERT_EQ(1u, numexps.size());
248 |   
249 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
250 | }
251 | 
252 | TEST_F(NumexpNormalizerTest, range7) {
253 |   NumericalExpressionNormalizer NEN("ja");
254 |   std::string text("時速50〜60km");
255 |   std::vector<NumericalExpression> numexps;
256 |   NEN.process(text, numexps);
257 |   NumericalExpression ex(string_to_ustring("時速50〜60km"), 0, 9, 50000, 60000);
258 |   ex.counter = string_to_ustring("m/h");
259 |   ASSERT_EQ(1u, numexps.size());
260 |   
261 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
262 | }
263 | 
264 | TEST_F(NumexpNormalizerTest, range8) {
265 |   NumericalExpressionNormalizer NEN("ja");
266 |   std::string text("世界50カ国から3000人が出席予定だ");
267 |   std::vector<NumericalExpression> numexps;
268 |   NEN.process(text, numexps);
269 |   ASSERT_EQ(2u, numexps.size()); //単位が違うので、マージされない
270 | }
271 | 
272 | TEST_F(NumexpNormalizerTest, range9) {
273 |   NumericalExpressionNormalizer NEN("ja");
274 |   std::string text("およそ時速50km〜60kmくらい");
275 |   std::vector<NumericalExpression> numexps;
276 |   NEN.process(text, numexps);
277 |   NumericalExpression ex(string_to_ustring("およそ時速50km〜60kmくらい"), 0, 17, 35000, 78000);
278 |   ex.counter = string_to_ustring("m/h");
279 |   ASSERT_EQ(1u, numexps.size());
280 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
281 | }
282 | 
283 | TEST_F(NumexpNormalizerTest, chinese1) {
284 |   NumericalExpressionNormalizer NEN("zh");
285 |   std::string text("日本政府受清廷壓力，以千二百三元請孫中山離開日本。");
286 |   std::vector<NumericalExpression> numexps;
287 |   NEN.process(text, numexps);
288 |   ASSERT_EQ(1u, numexps.size());
289 |   NumericalExpression ex(string_to_ustring("千二百三元"), 11, 16, 1230, 1230);
290 |   ex.counter = string_to_ustring("元");
291 |   EXPECT_TRUE(is_same_numexp(ex, numexps[0]));
292 | 
293 | }
294 | //"東京支部の三人"
295 | 


--------------------------------------------------------------------------------
/src/optparse.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *      An event-driven parser for command-line arguments.
  3 |  *  
  4 |  *      Copyright (c) 2004-2005 by Naoaki Okazaki
  5 |  *
  6 |  * This software is provided 'as-is', without any express or implied
  7 |  * warranty.  In no event will the authors be held liable for any damages
  8 |  * arising from the use of this software.
  9 |  *
 10 |  * Permission is granted to anyone to use this software for any purpose,
 11 |  * including commercial applications, and to alter it and redistribute it
 12 |  * freely, subject to the following restrictions (known as zlib license):
 13 |  *
 14 |  * 1. The origin of this software must not be misrepresented; you must not
 15 |  *    claim that you wrote the original software. If you use this software
 16 |  *    in a product, an acknowledgment in the product documentation would be
 17 |  *    appreciated but is not required.
 18 |  * 2. Altered source versions must be plainly marked as such, and must not be
 19 |  *    misrepresented as being the original software.
 20 |  * 3. This notice may not be removed or altered from any source distribution.
 21 |  *
 22 |  * Naoaki Okazaki <okazaki at chokkan dot org>
 23 |  *
 24 |  */
 25 | 
 26 | /* $Id$ */
 27 | 
 28 | /*
 29 |  * Class 'optparse' implements a parser for GNU-style command-line arguments.
 30 |  * Inherit this class to define your own option variables and to implement an
 31 |  * option handler with macros, BEGIN_OPTION_MAP, ON_OPTION(_WITH_ARG), and
 32 |  * END_OPTION_MAP. Consult the sample program attached at the bottom of this
 33 |  * source code.
 34 |  *
 35 |  * This code was confirmed to be compiled with MSVC++ 2003 and gcc 3.3.
 36 |  * Define _BUILD_NCL_SAMPLE if you want to build a sample program.
 37 |  *  $ g++ -D_BUILD_NCL_SAMPLE -xc++ optparse.h
 38 |  */
 39 | 
 40 | #ifndef __NCL_OPTPRASE_H__
 41 | #define __NCL_OPTPRASE_H__
 42 | 
 43 | #include <cstring>
 44 | #include <sstream>
 45 | #include <stdexcept>
 46 | #include <string>
 47 | 
 48 | 
 49 | #ifdef  USE_NCL_NAMESPACE
 50 | namespace ncl {
 51 | #endif/*USE_NCL_NAMESPACE*/
 52 | 	
 53 | 	
 54 | 	/**
 55 | 	 * An event-driven parser for command-line arguments.
 56 | 	 *  @author Naoaki Okazaki
 57 | 	 */
 58 | 	class optparse {
 59 | 	public:
 60 | 		/**
 61 | 		 * Exception class for unrecognized options.
 62 | 		 */
 63 | 		class unrecognized_option : public std::invalid_argument {
 64 | 		public:
 65 | 			unrecognized_option(char shortopt)
 66 |             : std::invalid_argument(std::string("-") + shortopt) {}
 67 | 			unrecognized_option(const std::string& longopt)
 68 |             : std::invalid_argument(std::string("--") + longopt) {}
 69 | 		};
 70 | 		/**
 71 | 		 * Exception class for invalid values.
 72 | 		 */
 73 | 		class invalid_value : public std::invalid_argument {
 74 | 		public:
 75 | 			invalid_value(const std::string& message)
 76 |             : std::invalid_argument(message) {}
 77 | 		};
 78 | 		
 79 | 	public:
 80 | 		/** Construct. */
 81 | 		optparse() {}
 82 | 		/** Destruct. */
 83 | 		virtual ~optparse() {}
 84 | 		
 85 | 		/**
 86 | 		 * Parse options.
 87 | 		 *  @param  argv        array of null-terminated strings; argv[0] is skipped
 88 | 		 *  @param  num_argv    specifies the number, in strings, of the array
 89 | 		 *  @return             index of the first argument not parsed, at most num_argv
 90 | 		 *  @throws             unrecognized_option if handle_option() returns a negative value
 91 | 		 */
 92 | 		int parse(char * const argv[], int num_argv)
 93 | 		{
 94 | 			int i;
 95 | 			for (i = 1;i < num_argv;++i) {
 96 | 				const char *token = argv[i];
 97 | 				if (*token++ == '-') {
 98 | 					const char *next_token = (i+1 < num_argv) ? argv[i+1] : "";  // "" when nothing follows
 99 | 					if (!*token) {
100 | 						break;  // only '-' was found.
101 | 					} else if (*token == '-') {
102 | 						const char *arg = std::strchr(++token, '=');  // long option: "--name" or "--name=value"
103 | 						if (arg) {
104 | 							arg++;
105 | 						} else {
106 | 							arg = next_token;
107 | 						}
108 | 						int ret = handle_option(0, token, arg);
109 | 						if (ret < 0) {
110 | 							throw unrecognized_option(token);
111 | 						}
112 | 						if (arg == next_token) {
113 | 							i += ret;  // the handler consumed the following argv entry
114 | 						}
115 | 					} else {
116 | 						char c;
117 | 						while ((c = *token++) != '\0') {  // bundled short options, e.g. "-qb10"
118 | 							const char *arg = *token ? token : next_token;
119 | 							int ret = handle_option(c, token, arg);
120 | 							if (ret < 0) {
121 | 								throw unrecognized_option(c);
122 | 							}
123 | 							if (ret > 0) {
124 | 								if (arg == token) {
125 | 									token = "";  // the rest of this token was the argument
126 | 								} else {
127 | 									i++;
128 | 								}
129 | 							}
130 | 						} // while
131 | 					} // else (*token == '-') 
132 | 				} else {
133 | 					break;  // a non-option argument was found.
134 | 				} 
135 | 			} // for (i)
136 | 			
137 | 			return (i < num_argv) ? i : num_argv;  // clamp: a trailing option that consumed a missing argument would overshoot
138 | 		}
139 | 		
140 | 	protected:
141 | 		/**
142 | 		 * Option handler
143 | 		 *  This function should be overridden by inheritance class.
144 | 		 *  @param  c           short option character, 0 for long option
145 | 		 *  @param  longname    long option name
146 | 		 *  @param  arg         an argument for the option
147 | 		 *  @return             0 (success);
148 | 		 *                      1 (success with use of an argument);
149 | 		 *                      -1 (failed, unrecognized option)
150 | 		 *  @throws             nothing here; an overriding handler may throw, e.g. invalid_value
151 | 		 */
152 | 		virtual int handle_option(char c, const char *longname, const char *arg)
153 | 		{
154 | 			return 0;
155 | 		}
156 | 		/** Compare the name part of option (text before any '=') with longname; 0 only on an exact match. */
157 | 		int __optstrcmp(const char *option, const char *longname)
158 | 		{
159 | 			const char *p = std::strchr(option, '=');
160 | 			return (p && std::strlen(longname) == static_cast<std::size_t>(p-option)) ?  // lengths must agree, otherwise "--by=3" would match "bytes"
161 |             std::strncmp(option, longname, p-option) :
162 |             std::strcmp(option, longname);
163 | 		}
164 | 	};
165 | 	
166 | 	
167 | 	/** The begin of inline option map. */
168 | #define BEGIN_OPTION_MAP_INLINE() \
169 | virtual int handle_option(char __c, const char *__longname, const char *arg) \
170 | { \
171 | int used_args = 0; \
172 | if (0) { \
173 | 
174 | 	/** Define of option map. */
175 | #define DEFINE_OPTION_MAP() \
176 | virtual int handle_option(char __c, const char *__longname, const char *arg);
177 | 	
178 | 	/** Begin of option map implementation. */
179 | #define BEGIN_OPTION_MAP(_Class) \
180 | int _Class::handle_option(char __c, const char *__longname, const char *arg) \
181 | { \
182 | int used_args = 0; \
183 | if (0) { \
184 | 
185 | 	/** An entry of option map */
186 | #define ON_OPTION(test) \
187 | return used_args; \
188 | } else if (test) { \
189 | used_args = 0; \
190 | 
191 | #define ON_OPTION_WITH_ARG(test) \
192 | return used_args; \
193 | } else if (test) { \
194 | used_args = 1; \
195 | 
196 | 	/** The end of option map implementation */
197 | #define END_OPTION_MAP() \
198 | return used_args; \
199 | } \
200 | return -1; \
201 | } \
202 | 
203 | 	/** A predicate for short options */
204 | #define SHORTOPT(x)     (__c == x)
205 | 	/** A predicate for long options */
206 | #define LONGOPT(x)      (!__c && __optstrcmp(__longname, x) == 0)
207 | 	
208 | 	
209 | #ifdef  USE_NCL_NAMESPACE
210 | };
211 | #endif/*USE_NCL_NAMESPACE*/
212 | 
213 | 
214 | 
215 | 
216 | 
217 | 
218 | #ifdef  _BUILD_NCL_SAMPLE
219 | 
220 | #include <cstdio>
221 | #include <iostream>
222 | 
223 | /**
224 |  * A class to store parameters specified by command-line arguments
225 |  */
226 | class option : public optparse {
227 | public:
228 |     int bytes;   // set by -b / --bytes
229 |     int lines;   // set by -l / --lines
230 |     bool quiet;  // set by -q / --quiet / --silent
231 | 	
232 |     option() : bytes(0), lines(0), quiet(false) {}
233 | 	
234 |     BEGIN_OPTION_MAP_INLINE()
235 | 	ON_OPTION(SHORTOPT('b') || LONGOPT("bytes"))
236 | 	bytes = std::atoi(arg);
237 | 	used_args = 1;  // ON_OPTION resets used_args to 0, so report the consumed argument by hand.
238 | 	
239 | 	ON_OPTION_WITH_ARG(SHORTOPT('l') || LONGOPT("lines"))
240 | 	lines = std::atoi(arg);
241 | 	// ON_OPTION_WITH_ARG already set used_args to 1, so nothing to report here.
242 | 	
243 | 	ON_OPTION(SHORTOPT('q') || LONGOPT("quiet") || LONGOPT("silent"))
244 | 	quiet = true;
245 | 	
246 |     END_OPTION_MAP()
247 | };
248 | 
249 | int main(int argc, char *argv[])
250 | {
251 |     try {
252 |         option opt;
253 |         int argused = opt.parse(argv, argc); // parse() starts at index 1, so it skips argv[0] itself.
254 | 		
255 |         std::cout << "used argv: " << argused << std::endl;
256 |         std::cout << "bytes: " << opt.bytes << std::endl;
257 |         std::cout << "lines: " << opt.lines << std::endl;
258 |         std::cout << "quiet: " << opt.quiet << std::endl;
259 |     } catch (const optparse::unrecognized_option& e) {
260 |         std::cout << "unrecognized option: " << e.what() << std::endl;
261 |         return 1;  // an option the map does not know
262 |     } catch (const optparse::invalid_value& e) {
263 |         std::cout << "invalid value: " << e.what() << std::endl;
264 |         return 1;
265 |     }
266 | 	
267 |     return 0;
268 | }
269 | 
270 | #endif/*_BUILD_NCL_SAMPLE*/
271 | 
272 | 
273 | #endif/*__NCL_OPTPRASE_H__*/
274 | 


--------------------------------------------------------------------------------
/src/reltime_expression_normalizer.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef RELTIME_EXPRESSION_NORMALIZER_H_
 2 | #define RELTIME_EXPRESSION_NORMALIZER_H_
 3 | #include <string>
 4 | #include "digit_utility.hpp"
 5 | #include "number_normalizer.hpp"
 6 | #include "normalizer_utility.hpp"
 7 | #include "normalizer_template.hpp"
 8 | #include <ux/ux.hpp>
 9 | 
10 | namespace reltime_expression_normalizer{
11 |   
12 | struct ReltimeExpression : normalizer_utility::NormalizedExpressionTemplate{
13 |   ReltimeExpression(digit_utility::Number number)  // built from a Number: its text and span go to the base class
14 |   : normalizer_utility::NormalizedExpressionTemplate(number.original_expression, number.position_start, number.position_end),
15 |     org_value_lowerbound(number.value_lowerbound),
16 |     org_value_upperbound(number.value_upperbound),
17 |     value_lowerbound_abs(normalizer_utility::Time(INFINITY)),   // lower bounds start from Time(INFINITY) ...
18 |     value_upperbound_abs(normalizer_utility::Time(-INFINITY)),  // ... upper bounds from Time(-INFINITY)
19 |     value_lowerbound_rel(normalizer_utility::Time(INFINITY)),
20 |     value_upperbound_rel(normalizer_utility::Time(-INFINITY)),
21 |   ordinary(false)
22 |   {}
23 |   // NOTE(review): the infinite initial bounds look like "not set yet" markers - confirm in the .cpp
24 |   double org_value_lowerbound, org_value_upperbound;  // bounds copied from the source Number
25 |   normalizer_utility::Time value_lowerbound_abs, value_upperbound_abs;
26 |   normalizer_utility::Time value_lowerbound_rel, value_upperbound_rel;
27 |   bool ordinary;
28 | };
29 | 
30 |   
31 | class LimitedReltimeExpression : public normalizer_utility::LimitedExpressionTemplate{
32 | public:
33 |   template <class Archive>
34 |   void serialize(Archive &ar){  // pattern, ordinary and option are not declared here; presumably inherited from the base class
35 |     ar & MEMBER(pattern) & MEMBER(corresponding_time_position) & MEMBER(process_type) & MEMBER(ordinary) & MEMBER(option);
36 |   }
37 |   // Fields added by this class; both take part in serialize() above.
38 |   std::vector<std::string> corresponding_time_position;
39 |   std::vector<std::string> process_type;
40 | };
41 |   
42 | 
43 | class ReltimeExpressionNormalizer : public normalizer_template::NormalizerTemplate<ReltimeExpression, LimitedReltimeExpression>{
44 | public:
45 |   ReltimeExpressionNormalizer(const std::string& language) : NN(language) { language_ = language; init(); }
46 |   // language_ is not declared in this class; presumably a member of NormalizerTemplate - confirm
47 | private:
48 |   void init();
49 |   void normalize_number(const std::string& text, std::vector<digit_utility::Number>& numbers);
50 |   void revise_any_type_expression_by_matching_limited_expression(std::vector<ReltimeExpression>& reltimeexps, int& expression_id, LimitedReltimeExpression matching_limited_reltime_expression);
51 |   void revise_any_type_expression_by_matching_prefix_counter(ReltimeExpression& reltimeexp, const LimitedReltimeExpression& matching_limited_expression);
52 |   void revise_any_type_expression_by_number_modifier(ReltimeExpression& reltimeexp, const normalizer_utility::NumberModifier& number_modifier);
53 |   void delete_not_any_type_expression(std::vector<ReltimeExpression>& reltimeexps);
54 |   void fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector<ReltimeExpression>& reltimeexps);
55 |   // Number normalizer constructed with the same language string as this object.
56 |   number_normalizer::NumberNormalizer NN;
57 | };
58 |   
59 | } //namespace reltime_expression_normalizer
60 | 
61 | #endif //RELTIME_EXPRESSION_NORMALIZER_H_
62 | 


--------------------------------------------------------------------------------
/src/wscript:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | def build(bld):  # waf build hook for src/: one shared library, one program, and the gtest programs
 4 |     def define_test(source):  # declare a gtest program for a single test source file
 5 |         target = source.split('.')[0].replace('/', '_')  # text before the first '.', with '/' turned into '_'
 6 |         bld.program(
 7 |             features = 'gtest',
 8 |             source = source,
 9 |             target = target,
10 |             use = ['PFICOMMON','normalize_numexp', 'UX', 'boost_regex'])
11 |     # Shared library holding all normalizer sources.
12 |     bld.shlib(
13 |         source = ['dictionary_dirpath.cpp', 'digit_utility.cpp', 'number_normalizer.cpp', 'normalizer_utility.cpp',  'numerical_expression_normalizer.cpp', 'abstime_expression_normalizer.cpp', 'reltime_expression_normalizer.cpp', 'duration_expression_normalizer.cpp', 'inappropriate_expression_remover.cpp', 'normalize_numexp.cpp'],
14 |         use = ['PFICOMMON', 'UX'],
15 |         target = 'normalize_numexp'
16 |         )
17 |     # Program built from main.cpp. NOTE(review): this target uses `uselib_local` while the others use `use` - confirm the bundled waf honours both.
18 |     bld(features = 'cxx cprogram',
19 |         source = 'main.cpp',
20 |         use = ['PFICOMMON'],
21 |         target = 'normalizeNumexp',
22 |         uselib_local = 'normalize_numexp')
23 |     # One gtest program per test source.
24 |     define_test('digit_utility_test.cpp')
25 |     define_test('number_normalizer_test.cpp')
26 |     define_test('normalizer_utility_test.cpp')
27 |     define_test('numerical_expression_normalizer_test.cpp')
28 |     define_test('abstime_expression_normalizer_test.cpp')
29 |     define_test('reltime_expression_normalizer_test.cpp')
30 |     define_test('duration_expression_normalizer_test.cpp')
31 |     define_test('normalize_numexp_test.cpp')
32 |     #define_test('normalizer_tester.cpp')
33 | 


--------------------------------------------------------------------------------
/swig/java/TestNormalizeNumexp.java:
--------------------------------------------------------------------------------
 1 | import java.util.Scanner;
 2 | import jp.ac.tohoku.ecei.cl.numexp.*;
 3 | 
 4 | public class TestNormalizeNumexp {
 5 |     static {
 6 |         System.loadLibrary("normalize_numexp"); // native half of the SWIG binding
 7 |     }
 8 |     public static void main(String [] args) {
 9 |         final String text = "魔女狩りは15世紀〜18世紀にかけてみられ、全ヨーロッパで4万人が処刑された";
10 |         final NormalizeNumexp normalizer = new NormalizeNumexp("ja");
11 |         // Output vector handed to normalize(); its contents are printed below.
12 |         final StringVector expressions = new StringVector(0);
13 |         normalizer.normalize(text, expressions);
14 | 
15 |         System.out.println("text:" + text);
16 |         for (int idx = 0, count = (int) expressions.size(); idx < count; idx++) {
17 |             System.out.println(expressions.get(idx));
18 |         }
19 |     }
20 | }
21 | 


--------------------------------------------------------------------------------
/swig/java/compile.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #http://www.swig.org/Doc2.0/SWIGDocumentation.html
 3 | 
 4 | mkdir -p src
 5 | mkdir -p classes
 6 | 
 7 | #1. create _wrap.cxx and wrapper source files by swig (you can change the package name)
 8 | swig -c++ -java -I../../src/  -o normalize_numexp_wrap.cxx -package jp.ac.tohoku.ecei.cl.numexp  -outdir src ../normalize_numexp.i
 9 | 
10 | #2. compile _wrap.cxx (JNI header files are necessary)
11 | gcc -O2 -fPIC -c normalize_numexp_wrap.cxx -I../../src/ -I/usr/lib/jvm/java-7-oracle/include -I/usr/lib/jvm/java-7-oracle/include/linux
12 | 
13 | #3. create shared library
14 | gcc -shared ../../build/src/dictionary_dirpath.cpp.1.o ../../build/src/normalize_numexp.cpp.1.o ../../build/src/abstime_expression_normalizer.cpp.1.o ../../build/src/digit_utility.cpp.1.o ../../build/src/duration_expression_normalizer.cpp.1.o ../../build/src/normalizer_utility.cpp.1.o ../../build/src/number_normalizer.cpp.1.o ../../build/src/numerical_expression_normalizer.cpp.1.o ../../build/src/reltime_expression_normalizer.cpp.1.o ../../build/src/inappropriate_expression_remover.cpp.1.o normalize_numexp_wrap.o -o libnormalize_numexp.so -I/usr/lib/jvm/java-7-oracle/include -L/usr/local/lib -lpficommon -lpficommon_visualization -lpficommon_text -lpficommon_network_base -lpficommon_concurrent -lpficommon_data -lpficommon_math -lpficommon_system -lpficommon_network_http -lpficommon_lang -lpficommon_network_rpc -lpficommon_network_cgi -lux
15 | 
16 | #4. create wrapper classes
17 | javac -d classes src/*.java
18 | 
19 | echo finished!
20 | echo 'how to use: "java -Djava.library.path=<path to libnormalize_numexp.so> -classpath <path to wrapper classes> 〜"'
21 | 


--------------------------------------------------------------------------------
/swig/java/readme.txt:
--------------------------------------------------------------------------------
1 | ■swigによるJava連携について
2 | compile.shを実行した後、生成されたclassesディレクトリをclasspathに含めるようにしてください。
3 | また生成された.soファイルをおいたディレクトリを、java.library.pathシステムプロパティに含めるようにしてください。
4 | 


--------------------------------------------------------------------------------
/swig/normalize_numexp.i:
--------------------------------------------------------------------------------
 1 | %module normalize_numexp
 2 | 
 3 | %{
 4 |   #define SWIG_FILE_WITH_INIT
 5 |   #include "normalize_numexp.hpp"
 6 | %}
 7 | 
 8 | %include "std_string.i"
 9 | %include "std_vector.i"
10 | 
11 | namespace std {
12 |   %template(IntVector) vector<int>;
13 |   %template(DoubleVector) vector<double>;
14 |   %template(StringVector) vector<std::string>;
15 | }
16 | 
17 | %include "normalize_numexp.hpp"
18 | 


--------------------------------------------------------------------------------
/swig/python/compile.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #http://www.swig.org/Doc2.0/SWIGDocumentation.html
 3 | 
 4 | #1. create _wrap.cxx and wrapper .py by swig.
 5 | swig -c++ -python -I../../src/  -o normalize_numexp_wrap.cxx ../normalize_numexp.i
 6 | 
 7 | #2. compile _wrap.cxx (Python.h is necessary.)
 8 | gcc -O2 -fPIC -c normalize_numexp_wrap.cxx -I/usr/include/python2.7 -I../../src/ -I/usr/local/include
 9 | 
10 | #3. create shared object
11 | gcc -shared ../../build/src/dictionary_dirpath.cpp.1.o ../../build/src/normalize_numexp.cpp.1.o ../../build/src/abstime_expression_normalizer.cpp.1.o ../../build/src/digit_utility.cpp.1.o ../../build/src/duration_expression_normalizer.cpp.1.o ../../build/src/normalizer_utility.cpp.1.o ../../build/src/number_normalizer.cpp.1.o ../../build/src/numerical_expression_normalizer.cpp.1.o ../../build/src/reltime_expression_normalizer.cpp.1.o ../../build/src/inappropriate_expression_remover.cpp.1.o normalize_numexp_wrap.o -o _normalize_numexp.so -I/usr/include/python2.7 -L/usr/local/lib -lpficommon -lpficommon_visualization -lpficommon_text -lpficommon_network_base -lpficommon_concurrent -lpficommon_data -lpficommon_math -lpficommon_system -lpficommon_network_http -lpficommon_lang -lpficommon_network_rpc -lpficommon_network_cgi -lux
12 | 
13 | #4. report completion and the remaining manual install step
14 | echo finished!
15 | echo 'please copy "_normalize_numexp.so" and "normalize_numexp.py" to your python "site-packages"'
16 | 


--------------------------------------------------------------------------------
/swig/python/readme.txt:
--------------------------------------------------------------------------------
1 | ■swigによるpython連携について
2 | distutilsが上手く使えなかったため、通常のインストール方法ではなく、手動でのインストールになります。
3 | compile.shを実行した後、生成された.soファイルと.pyファイルをpythonのsite-packagesにコピーして下さい。
4 | 


--------------------------------------------------------------------------------
/swig/python/test_normalize_numexp.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | import sys
 3 | from normalize_numexp import *
 4 | 
 5 | n = NormalizeNumexp("ja")
 6 | text = "魔女狩りは15世紀〜18世紀にかけてみられ、全ヨーロッパで4万人が処刑された"
 7 | result = StringVector(0)
 8 | 
 9 | n.normalize(text, result)
10 | print "text:",text
11 | for r in result :
12 |     print r
13 | 


--------------------------------------------------------------------------------
/swig/ruby/compile.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #http://www.swig.org/Doc2.0/SWIGDocumentation.html
 3 | 
 4 | #1. create _wrap.cxx by swig.
 5 | swig -c++ -ruby -I../../src/  -o normalize_numexp_wrap.cxx ../normalize_numexp.i
 6 | 
 7 | #2. compile _wrap.cxx (ruby.h and config.h are necessary.)
 8 | gcc -O2 -fPIC -c normalize_numexp_wrap.cxx -I../../src/ -I/usr/include/ruby-1.9.1 -I/usr/include/ruby-1.9.1/x86_64-linux
 9 | 
10 | #3. create shared object
11 | gcc -shared ../../build/src/dictionary_dirpath.cpp.1.o ../../build/src/normalize_numexp.cpp.1.o ../../build/src/abstime_expression_normalizer.cpp.1.o ../../build/src/digit_utility.cpp.1.o ../../build/src/duration_expression_normalizer.cpp.1.o ../../build/src/normalizer_utility.cpp.1.o ../../build/src/number_normalizer.cpp.1.o ../../build/src/numerical_expression_normalizer.cpp.1.o ../../build/src/reltime_expression_normalizer.cpp.1.o ../../build/src/inappropriate_expression_remover.cpp.1.o normalize_numexp_wrap.o -o normalize_numexp.so -I/usr/include/ruby-1.9.1 -L/usr/local/lib -lpficommon -lpficommon_visualization -lpficommon_text -lpficommon_network_base -lpficommon_concurrent -lpficommon_data -lpficommon_math -lpficommon_system -lpficommon_network_http -lpficommon_lang -lpficommon_network_rpc -lpficommon_network_cgi -lux
12 | 
13 | #4. report completion and the remaining manual install step
14 | echo finished!
15 | echo 'please copy "normalize_numexp.so" to your ruby "site-ruby"'
16 | 


--------------------------------------------------------------------------------
/swig/ruby/readme.txt:
--------------------------------------------------------------------------------
1 | ■swigによるruby連携について
2 | compile.shを実行した後、生成された.soファイルをsite-rubyにコピーして下さい。
3 | 


--------------------------------------------------------------------------------
/swig/ruby/test-normalize-numexp.rb:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | require "normalize_numexp"
 3 | 
 4 | n = Normalize_numexp::NormalizeNumexp::new("ja")  # normalizer created with the "ja" language code
 5 | text = "魔女狩りは15世紀〜18世紀にかけてみられ、全ヨーロッパで4万人が処刑された"
 6 | result = Normalize_numexp::StringVector::new(0)
 7 | # `result` is the output vector handed to normalize(); it is printed below.
 8 | n.normalize(text, result)
 9 | print "text:#{text}\n"
10 | result.each do |r|
11 | 		print "#{r}\n"
12 | end
13 | 


--------------------------------------------------------------------------------
/unittest_gtest.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/unittest_gtest.py


--------------------------------------------------------------------------------
/waf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/waf


--------------------------------------------------------------------------------
/wscript:
--------------------------------------------------------------------------------
 1 | import os
 2 | APPNAME = 'numerical and temporal expression normalizer'
 3 | VERSION = '0.5.0'
 4 | 
 5 | def options(opt):  # waf hook: load the same two tools that configure() loads
 6 |   opt.load('compiler_cxx')
 7 |   opt.load('unittest_gtest')
 8 | 
 9 | def configure(conf):  # waf hook: set compiler flags, load the tools, check for pficommon and ux
10 |   conf.env.CXXFLAGS += ['-O2', '-Wall', '-g', '-pipe']
11 |   conf.load('compiler_cxx')
12 |   conf.load('unittest_gtest')
13 |   conf.check_cfg(package = 'pficommon', args = '--cflags --libs')  # presumably resolved through pkg-config - confirm
14 |   conf.check_cfg(package = 'ux', args = '--cflags --libs')
15 |   #conf.check_cxx(lib='re2', libpath=conf.env.LIBDIR)
16 |   #conf.check_cfg(package = 'boost', args = '--cflags --libs')
17 |   #conf.check_cxx(lib='libname', header_name = 'header.h')
18 |   pass
19 | 
20 | def build(bld):
21 |   create_dic_file(bld)
22 |   bld.recurse('src')
23 | #  bld.install_files('${PREFIX}/include', 'src/*.hpp') #cannot install
24 |   for dpath, dnames, fnames in os.walk("src") :
25 |     for fname in fnames :
26 |       if not fname.endswith(".hpp") : continue
27 |       bld.install_files('${PREFIX}/include/normalizeNumexp/', [dpath+"/"+fname])
28 |   for dpath, dnames, fnames in os.walk("src/dic") :
29 |     for fname in fnames :
30 |       if not fname.endswith(".txt") : continue
31 |       bld.install_files('${PREFIX}/lib/normalizeNumexp/'+dpath[4:], [dpath+"/"+fname])
32 |   
33 | 
34 | def create_dic_file(bld) :
35 | #辞書ファイルの場所を指定
36 | 	dictionary_dirpath = str(bld.env.PREFIX) + "/lib/normalizeNumexp/dic/"
37 | #	dictionary_dirpath = "/home/katsuma/usr/local/lib/normalizeNumexp/dic/"	
38 | 	source = """
39 | #include "dictionary_dirpath.hpp"
40 | namespace dictionary_dirpath {
41 | std::string get_dictionary_dirpath(){
42 | 			return \"%s\";}}"""
43 | 			
44 | 	source = source % dictionary_dirpath
45 | 
46 | 	fout = open("./src/dictionary_dirpath.cpp", "w")
47 | 	fout.write(source)
48 | 


--------------------------------------------------------------------------------