├── ._wscript ├── .lock-waf_darwin_build ├── LICENSE ├── README.rst ├── history.txt ├── src ├── abstime_expression_normalizer.cpp ├── abstime_expression_normalizer.hpp ├── abstime_expression_normalizer_test.cpp ├── dic │ ├── en │ │ └── .DS_Store │ ├── ja │ │ ├── abstime_expression_json.txt │ │ ├── abstime_prefix_counter_json.txt │ │ ├── abstime_prefix_json.txt │ │ ├── abstime_suffix_json.txt │ │ ├── chinese_character.txt │ │ ├── duration_expression_json.txt │ │ ├── duration_prefix_counter_json.txt │ │ ├── duration_prefix_json.txt │ │ ├── duration_suffix_json.txt │ │ ├── inappropriate_strings_json.txt │ │ ├── num_counter_json.txt │ │ ├── num_prefix_counter_json.txt │ │ ├── num_prefix_json.txt │ │ ├── num_suffix_json.txt │ │ ├── raw │ │ │ ├── abstime_date.txt │ │ │ ├── abstime_dayweek.txt │ │ │ ├── abstime_dayweek_pattern.txt │ │ │ ├── abstime_nengou.txt │ │ │ ├── abstime_prefix_counter.txt │ │ │ ├── abstime_settouji.txt │ │ │ ├── abstime_setubiji.txt │ │ │ ├── abstime_time.txt │ │ │ ├── create_dic_abstime.py │ │ │ ├── create_dic_abstime_date+time.py │ │ │ ├── create_dic_abstime_prefix_counter.py │ │ │ ├── create_dic_dayweek.py │ │ │ ├── create_dic_duration.py │ │ │ ├── create_dic_inappropriate.py │ │ │ ├── create_dic_num.py │ │ │ ├── create_dic_num_prefix_counter.py │ │ │ ├── create_dic_number_modifier.py │ │ │ ├── create_dic_reltime.py │ │ │ ├── create_dic_reltime_prefix_counter.py │ │ │ ├── duration_prefix_counter.txt │ │ │ ├── duration_setouji.txt │ │ │ ├── duration_setubiji.txt │ │ │ ├── duration_time_position.txt │ │ │ ├── inappropriate_strings.txt │ │ │ ├── make_dictionary.sh │ │ │ ├── make_dictionary.sh~ │ │ │ ├── num.txt │ │ │ ├── num_SItanni_hankaku.txt │ │ │ ├── num_SItanni_katakana.txt │ │ │ ├── num_SItanni_settouji_hankaku.txt │ │ │ ├── num_SItanni_settouji_katakana.txt │ │ │ ├── num_SItanni_settouji_zenkaku.txt │ │ │ ├── num_SItanni_zenkaku.txt │ │ │ ├── num_expand.txt │ │ │ ├── num_prefix_counter.txt │ │ │ ├── num_settouji.txt │ │ │ ├── 
num_setubiji.txt │ │ │ ├── num_wari.txt │ │ │ ├── reltime_prefix_counter.txt │ │ │ ├── reltime_settouji.txt │ │ │ ├── reltime_specific.txt │ │ │ ├── reltime_time_option.txt │ │ │ ├── reltime_time_position.txt │ │ │ └── reltime_time_pre_option.txt │ │ ├── reltime_expression_json.txt │ │ ├── reltime_prefix_counter_json.txt │ │ ├── reltime_prefix_json.txt │ │ └── reltime_suffix_json.txt │ └── zh │ │ ├── .DS_Store │ │ ├── ._chinese_character.txt │ │ ├── abstime_expression_json.txt │ │ ├── chinese_character.txt │ │ └── num_counter_json.txt ├── dictionary_dirpath.cpp ├── dictionary_dirpath.hpp ├── digit_utility.cpp ├── digit_utility.hpp ├── digit_utility_test.cpp ├── duration_expression_normalizer.cpp ├── duration_expression_normalizer.hpp ├── duration_expression_normalizer_test.cpp ├── inappropriate_expression_remover.cpp ├── inappropriate_expression_remover.hpp ├── main.cpp ├── normalize_numexp.cpp ├── normalize_numexp.hpp ├── normalize_numexp_test.cpp ├── normalizer_template.hpp ├── normalizer_utility.cpp ├── normalizer_utility.hpp ├── normalizer_utility_test.cpp ├── number_normalizer.cpp ├── number_normalizer.hpp ├── number_normalizer_test.cpp ├── numerical_expression_extractor.pyc ├── numerical_expression_normalizer.cpp ├── numerical_expression_normalizer.hpp ├── numerical_expression_normalizer_test.cpp ├── optparse.h ├── reltime_expression_normalizer.cpp ├── reltime_expression_normalizer.hpp ├── reltime_expression_normalizer_test.cpp └── wscript ├── swig ├── java │ ├── TestNormalizeNumexp.java │ ├── compile.sh │ └── readme.txt ├── normalize_numexp.i ├── python │ ├── compile.sh │ ├── readme.txt │ └── test_normalize_numexp.py └── ruby │ ├── compile.sh │ ├── readme.txt │ └── test-normalize-numexp.rb ├── unittest_gtest.py ├── waf └── wscript /._wscript: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/._wscript 
-------------------------------------------------------------------------------- /.lock-waf_darwin_build: -------------------------------------------------------------------------------- 1 | argv = ['./waf', 'configure'] 2 | environ = {'TERM_SESSION_ID': '8DDB165E-368B-48DD-A813-50947A275351', 'PYTHONPATH': '/Users/katsuma/', 'SSH_AUTH_SOCK': '/tmp/launch-ZM3Pmm/Listeners', 'TERM_PROGRAM_VERSION': '303.2', 'Apple_PubSub_Socket_Render': '/tmp/launch-ueodjD/Render', 'LOGNAME': 'katsuma', 'USER': 'katsuma', 'HOME': '/Users/katsuma', 'PKG_CONFIG_PATH': '/usr/local/lib/pkgconfig/', 'PATH': '/opt/local/bin/:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin:/usr/X11/bin', 'PS1': '\\[\\e[0;32m\\]\\u@\\h\\[\\e[0m\\][\\t]:\\W$ ', 'DISPLAY': '/tmp/launch-xm9ozw/org.x:0', '_': './waf', 'TERM_PROGRAM': 'Apple_Terminal', 'LANG': 'ja_JP.UTF-8', '__CF_USER_TEXT_ENCODING': '0x1F6:1:14', 'TERM': 'xterm-256color', 'SHELL': '/bin/bash', 'SHLVL': '1', 'OLDPWD': '/Users/katsuma', 'HISTSIZE': '10000', 'HISTCONTROL': 'ignoreboth', 'Apple_Ubiquity_Message': '/tmp/launch-UtGZZs/Apple_Ubiquity_Message', 'PWD': '/Users/katsuma/src/normalizeNumexp', 'TMPDIR': '/var/folders/68/zvn0f60d2cqgsjrn3pnd9rr00000gp/T/', 'CLICOLOR': '1', 'COMMAND_MODE': 'unix2003', 'LSCOLORS': 'gxfxcxdxbxegedabagacad'} 3 | files = ['/Users/katsuma/src/normalizeNumexp/wscript'] 4 | hash = -8290367226182741820 5 | options = {'files': '', 'checkall': False, 'targets': '', 'jobs': 2, 'verbose': 0, 'nocache': False, 'progress_bar': 0, 'checkone': False, 'top': '', 'destdir': '', 'keep': 0, 'zones': '', 'prefix': '/usr/local/', 'download': False, 'force': False, 'out': '', 'check_cxx_compiler': 'g++', 'check': False, 'checkfilter': False} 6 | out_dir = '/Users/katsuma/src/normalizeNumexp/build' 7 | run_dir = '/Users/katsuma/src/normalizeNumexp' 8 | top_dir = '/Users/katsuma/src/normalizeNumexp' 9 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Katsuma Narisawa. 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of tanakh nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ========================================================== 2 | normalizeNumexp : Numerical/Temporal expression normalizer 3 | ========================================================== 4 | 5 | About 6 | ===== 7 | 8 | This is a tool for normalizing numerical/temporal expressions. 9 | 10 | 11 | Necessary Libraries 12 | =================== 13 | ux(More Succinct Trie Data structure):http://code.google.com/p/ux-trie/wiki/Tutorial_Japanese 14 | 15 | pficommon(General purpose C++ library for PFI):https://github.com/pfi/pficommon 16 | 17 | 18 | Install 19 | ======= 20 | 21 | Run the following commands. 22 | 23 | .. 24 | 25 | $ ./waf configure 26 | 27 | $ ./waf build 28 | 29 | $ ./waf install 30 | 31 | To check that the installation has completed successfully, run: 32 | 33 | .. 34 | 35 | $ ./waf --checkall 36 | 37 | 38 | How to Use 39 | ========== 40 | 41 | This utility normalizes (Japanese) numerical and temporal expressions in the input sentence. 42 | 43 | .. 44 | 45 | $ normalizeNumexp 46 | 47 | 魔女狩りは15世紀〜18世紀にかけてみられ、全ヨーロッパで4万人が処刑された 48 | 49 | >numerical*4万人*29*32*人*40000*40000* 50 | 51 | >abstime*15世紀〜18世紀*5*14*none*1401-XX-XX*1800-XX-XX* 52 | 53 | 54 | For more details about this tool, please read the following documentation. 55 | http://www.cl.ecei.tohoku.ac.jp/~katsuma/software/normalizeNumexp/ 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /history.txt: -------------------------------------------------------------------------------- 1 | 2012/12/? 
ver3.0を公開。 2 | 3 | 4 | ver2.0 -> 3.0 の主な違い 5 | ・より簡潔な実装を目指してリファクタリングを行いました。 6 | ・MeCabの使用をやめました。形態素区切り情報が利用できず、若干精度が落ちますが、処理が簡潔になりました。精度が落ちる問題は、外部モジュールを作成し今後対応する予定です 7 | ・pficommonを用いることで、wstringを使用せずに日本語が扱えるようになりました。これによりlocale周りの問題が解決しています。 8 | ・uxの使用。 9 | ・辞書をjson形式に 10 | 11 | 12 | 13 | 14 | 15 | 16 | ***************** 17 | 18 | 以下に制作者の個人的なメモを乗せておきます。 19 | 細かい仕様などが気になる方以外は特に読む必要はありません。 20 | 21 | 22 | ■バグリスト 23 | 「3分の1」 = 33%として抽出する? 24 | 「戦後五十年間」の扱い 25 | ぐらい 26 | ミリ秒 27 | 「7年ぶり」の扱い(扱わない) 28 | 十六世 とらない 29 | 翌三年 明日三日、と同じ。どうするのだっけ? 30 | 五十五分ごろ、がdurationになる(abstimeの許容値を超えてしまっているため) 31 | 10時—18時 五百円—千百円 32 | 約三○度 33 | 「年度」どう扱う?(評価ではとりあえず無視) 34 | 「60t」はトン? 35 | 300〜700万円 36 | 37 | 38 | ■TODO List 39 | ・多倍長整数の実装(現在は数値をすべてdoubleで扱っている) 40 | ・辞書の整備 SI単位系、世界の貨幣、各種専門用語 41 | ・曖昧性の解消 42 | ・num,abs,rel,durで複数ヒットした場合の処理。現状では適当な順序で最長マッチさせてるだけ。 43 | ・一般名詞を認識してしまう。除去リストの作成の必要 44 | ・URLとか英字羅列で認識してしまう。URLは頻繁に出てくるので、なんとかする。 45 | ・英語の表現(特に時間) 46 | ・数の認識 47 | ・並列表記(1、2)への対応  48 | ・x,x+1となる数のみ対応。1991,92年などは未対応 49 | ・これに限って言えば、abstimeのパターンとしてとってしまってprocessで処理すれば処理可能。 50 | ・「数万」の扱い 51 | ・1千1千など、不適当な表記 ある程度やったが、他に変な表現はあるかも。「100百20十」とかは無理。 52 | ・30-40万年前 30年から40万年前で認識 53 | ・数量表現 54 | ・「代」「台」の問題 特に対処していない 55 | ・絶対時間表現 56 | ・1989.3 3.11の違いを判定 57 | ・序数はすべて持続時間。absではない。 58 | ・曜日 9月29日(金)〜10月18日(水)  2001.4.29 Friday 1:30 59 | ・2回以上の接尾辞、接頭辞はとっていない 60 | ・形態素区切り情報をいれていない 「シャンプー1本」「総ページ数100頁」 61 | ・<= <の区別を表示させる。1920年代 とかで間違えてしまう 62 | ・その他 63 | ・月額2,604円(税込)から (税込)まで認識しないと、範囲表現がとれない 64 | ・「2の10乗キロメートル」「三分の一キログラム」このような特殊な数の表記については対応していない 65 | 66 | 67 | 68 | 69 | 70 | ■対応が難しいもの 71 | ・数量を含まない表現 72 | ・半世紀、数世紀 などの表現。数は入っていないが、数として本当は認識したい 73 | ・曖昧性の解消 74 | ・五輪、  一体(除去してしまっているが、本当に除去していいのか判定する必要) 75 | ・(20)、 76 | ・評価用アイスセンサーキット: \120,000 77 | ・頻度表現、現状の表現法で良い? 
78 | ・同三日  どうしようもない 79 | ・キロ、センチ、ミリ(すべてmに統一) 80 | ・30年の歴史  「三年生」もとっちゃってる 81 | ・DCカプラーCP45W LDR−216シリーズ 23区、 一戸〜八戸 82 | ・「雑居ビル6F」 ファラデーとして抽出してしまう 83 | ・特殊な例 84 | ・年齢表現 生後6ヶ月〜80才前後 85 | ・その他 86 | ・ひらがなは難しい。「1ねんぶんのじゃがいも」「3にんがたべました」 87 | ・3月第三週  序数はどうする? 88 | 89 | 90 | 91 | その他メモ: 92 | ・二週間以内:持続時間 93 | ・比などはとっていない   1:3:5、 1,3,5の割合で〜とか 94 | ・31ページで紹介した〜 -> 31ページ「目」で紹介した〜 ということ?未考慮 95 | ・一番お値打ち <「最も」の意味。数量表現か? 96 | ・「周年」は経過した時間を示す。どれとも言い辛いが、数量表現とする。 97 | ・「13月」など存在しない絶対時間表現は抽出しない。 98 | ・昔の数字にも対応したが、precisionが下がる&入れなくてもほぼrecallは下がらないので、外した方がいいかもしれない。 99 | 100 | 101 | 未対応・注意点リスト(具体的に、細かく。) 102 | ・「20人〜」「20人から」「〜20人」は「20人」に等しいとする(20~∞とは扱わない) 103 | ・「30人まで」は「−∞〜30」と扱う 104 | ・「先月1日」は「先日(相対時間表現)」+「1日(絶対時間表現)」という構成をもつ相対時間表現。これを相対時間表現とするために、相対時間表現の実装の中で絶対時間表現の実装と重複する処理をたくさん行っている 105 | ・3月第三週  分割して認識している。 106 | ・「9Paまで下げる」「9階まで降りる」これは範囲表現ではないと考えられる。現在はとってしまっている。 107 | ・「数年」「数週間」 数字が入っていないので抽出対象外 108 | ・およそ100人〜500人 およそが二重にかかるバグ <本当ならaboutなどの処理は最後にやるべき。やはり、処理はせずにoptionとして出力した方が良い? 109 | ・80歳前後、で0.7がけするのはやりすぎ。aboutの範囲はかなり雑に決められている 110 | ・何百円 扱っていない 111 | ・「台分」「人分」「◯分」分は色々な単位につきうる。数量表現?? 112 | ・とりあえず出てきたやつを追加している 113 | ・「単位」+「分」で検索かけて、でてきたやつを単位に追加しよう <<< 後で 114 | ・※「台」だと車を数えていて、「台分」だとそれによりできるスペースを示している 115 | ・「およそほぼ約30人」 修飾語は2語までしかとっていない 116 | ・h, hour, m, min, s, sec 対応していない 117 | ・1歳未満 厳密な意味は0~1だけど-INF~1になっている 118 | ・3割5分 5尺6寸 1円30銭 対応していない(どんな数量表現にする?) 119 | ・定価1,500円(税込)、 家族4人、  余計な表現は含めない 120 | ・電話番号、住所etcはとらない 121 | ・2ヶ月に1回 一日に三回 現状では、とりあえず分割して考える 122 | ・グッチペンダントネックレス145171−J8400−8106価格:40635円 【グッチ】GUCCI 商品名、番号はとらない 123 | ・直径1.6cm < 直径が1.6cm。できるだけ抽出する、が評価実験の際、抽出できなくても負例とはしない。 124 | ・5階、305号室、3丁目 <名詞化。場所を示しているのであって、量を示してはいない。抽出しない。(「階」はとれてしまっている 125 | ・月号 <量ではない。とらない。 126 | ・数量表現かまだ迷っているもの、のうち抽出するもの:3倍速 127 | ・3歳児:数量表現でない 128 | ・固有名詞中の数量表現 129 | * 正例:マガジン3月3日号、特集国家百年 130 | * 負例:そろそろ三日兎にいこうぜ <- 店の名前? 131 | * 固有名詞中の曖昧な表現はとらなくて良い、ということで(マガジン2006) 132 | * ? : 3L缶(3リットル缶?) 133 | * text:で,車の一時入校許可証で気付いたのだが,今日は12321な日でした. 
なんとなくメモ 134 | ・「一番◯◯な〜」この「一番」は「最も」の意味ではあるが、数量表現としても捉えられなくはないのでOKとする 135 | 136 | ・メモ: 137 | ・パート3    138 | ・段落的な意味の数量とか 「1. はじめに 2. 関連研究」   139 | ・ベスト3 (ベストなもの3つ) 140 | ・2chのスレッド数っぽく  〜なんだけど(31) 141 | ・fnを見つけるのはめんどいので、「一時」みたいにとらずともとってもどっちもいいような場合は、とりあえず取る 142 | ・〜の五十人の(うち)一人 143 | ・23.6%増  増、は数量の属性を表しているので〜 <なんか、境界が曖昧じゃない? 144 | ・取引銀行3行 145 | ・チャンネル チャンネル数を表す単位のときもある? 146 | * 時間表現 147 | * 計18時間 <- 18時間の属性を付加しているだけで、時間表現としては「18時間」というだけ 税込みとかと同じ理論 148 | * 八年半ぶり <ぶりってつけないんだっけ? 149 | * 曜日、「第四月曜」、「毎月2のつく日」 150 | * 今月十一日 < 11日だけでok。対象としない 151 | * 戦後:1945の年号として捉える 152 | * 曖昧性:2/8 これは多分日付だけど… 153 | * 月50時間、週2回:per月として認識 154 | 155 | 156 | 157 | 158 | 数を含まないもの 159 | 昨年、前年、来年、再来年、 160 | 先月、来月、 161 | 明日、昨日、同日、 162 | 正月 163 | -------------------------------------------------------------------------------- /src/abstime_expression_normalizer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ABSTIME_EXPRESSION_NORMALIZER_H_ 2 | #define ABSTIME_EXPRESSION_NORMALIZER_H_ 3 | #include 4 | #include "digit_utility.hpp" 5 | #include "number_normalizer.hpp" 6 | #include "normalizer_utility.hpp" 7 | #include "normalizer_template.hpp" 8 | #include 9 | 10 | namespace abstime_expression_normalizer{ 11 | 12 | struct AbstimeExpression : normalizer_utility::NormalizedExpressionTemplate{ 13 | AbstimeExpression(digit_utility::Number number) 14 | : normalizer_utility::NormalizedExpressionTemplate(number.original_expression, number.position_start, number.position_end), 15 | org_value_lowerbound(number.value_lowerbound), 16 | org_value_upperbound(number.value_upperbound), 17 | value_lowerbound(normalizer_utility::Time(INFINITY)), 18 | value_upperbound(normalizer_utility::Time(-INFINITY)), 19 | ordinary(false) 20 | {} 21 | 22 | double org_value_lowerbound, org_value_upperbound; 23 | normalizer_utility::Time value_lowerbound, value_upperbound; 24 | bool ordinary; 25 | }; 26 | 27 | 28 | class LimitedAbstimeExpression : public 
normalizer_utility::LimitedExpressionTemplate{ 29 | public: 30 | template 31 | void serialize(Archive &ar){ 32 | ar & MEMBER(pattern) & MEMBER(corresponding_time_position) & MEMBER(process_type) & MEMBER(ordinary) & MEMBER(option); 33 | } 34 | 35 | std::vector corresponding_time_position; 36 | std::vector process_type; 37 | }; 38 | 39 | 40 | class AbstimeExpressionNormalizer : public normalizer_template::NormalizerTemplate{ 41 | public: 42 | AbstimeExpressionNormalizer(const std::string& language) : NN(language) { language_ = language; init(); } 43 | 44 | private: 45 | void init(); 46 | void normalize_number(const std::string& text, std::vector& numbers); 47 | void revise_any_type_expression_by_matching_limited_expression(std::vector& abstimeexps, int& expression_id, LimitedAbstimeExpression matching_limited_abstime_expression); 48 | void revise_any_type_expression_by_matching_prefix_counter(AbstimeExpression& any_type_expression, const LimitedAbstimeExpression& matching_limited_expression); 49 | void revise_any_type_expression_by_number_modifier(AbstimeExpression& abstimeexp, const normalizer_utility::NumberModifier& number_modifier); 50 | void delete_not_any_type_expression(std::vector& abstimeexps); 51 | void fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector& abstimeexps); 52 | 53 | number_normalizer::NumberNormalizer NN; 54 | }; 55 | 56 | } //namespace abstime_expression_normalizer 57 | 58 | #endif //ABSTIME_EXPRESSON_NORMALIZER_H_ 59 | -------------------------------------------------------------------------------- /src/dic/en/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/dic/en/.DS_Store -------------------------------------------------------------------------------- /src/dic/ja/abstime_prefix_json.txt: -------------------------------------------------------------------------------- 1 
| {"pattern":"だいたい", "process_type":"about"} 2 | {"pattern":"およそ", "process_type":"about"} 3 | {"pattern":"ちょうど", "process_type":"none"} 4 | {"pattern":"~", "process_type":"kara_prefix"} 5 | {"pattern":"〜", "process_type":"kara_prefix"} 6 | {"pattern":"~", "process_type":"kara_prefix"} 7 | {"pattern":"-", "process_type":"kara_prefix"} 8 | {"pattern":"−", "process_type":"kara_prefix"} 9 | {"pattern":"ー", "process_type":"kara_prefix"} 10 | {"pattern":"から", "process_type":"kara_prefix"} 11 | {"pattern":"PM", "process_type":"gogo"} 12 | {"pattern":"AM", "process_type":"gozen"} 13 | {"pattern":"PM", "process_type":"gogo"} 14 | {"pattern":"AM", "process_type":"gozen"} 15 | {"pattern":"PM", "process_type":"gogo"} 16 | {"pattern":"AM", "process_type":"gozen"} 17 | {"pattern":"PM ", "process_type":"gogo"} 18 | {"pattern":"AM ", "process_type":"gozen"} 19 | {"pattern":"朝", "process_type":"asa"} 20 | {"pattern":"夜", "process_type":"yoru"} 21 | {"pattern":"深夜", "process_type":"sinnya"} 22 | -------------------------------------------------------------------------------- /src/dic/ja/abstime_suffix_json.txt: -------------------------------------------------------------------------------- 1 | {"pattern":"以前", "process_type":"or_less"} 2 | {"pattern":"まで", "process_type":"made"} 3 | {"pattern":"迄", "process_type":"or_less"} 4 | {"pattern":"より前", "process_type":"less"} 5 | {"pattern":"以降", "process_type":"or_over"} 6 | {"pattern":"より後", "process_type":"over"} 7 | {"pattern":"~", "process_type":"kara_suffix"} 8 | {"pattern":"〜", "process_type":"kara_suffix"} 9 | {"pattern":"~", "process_type":"kara_suffix"} 10 | {"pattern":"-", "process_type":"kara_suffix"} 11 | {"pattern":"−", "process_type":"kara_suffix"} 12 | {"pattern":"ー", "process_type":"kara_suffix"} 13 | {"pattern":"から", "process_type":"kara_suffix"} 14 | {"pattern":"くらい", "process_type":"about"} 15 | {"pattern":"ばかり", "process_type":"about"} 16 | {"pattern":"前後", "process_type":"about"} 17 | {"pattern":"近く", 
"process_type":"about"} 18 | {"pattern":"頃", "process_type":"about"} 19 | {"pattern":"ごろ", "process_type":"about"} 20 | {"pattern":"頭", "process_type":"zenhan"} 21 | {"pattern":"前半", "process_type":"zenhan"} 22 | {"pattern":"前記", "process_type":"zenhan"} 23 | {"pattern":"初頭", "process_type":"zenhan"} 24 | {"pattern":"初期", "process_type":"zenhan"} 25 | {"pattern":"初め", "process_type":"zenhan"} 26 | {"pattern":"始め", "process_type":"zenhan"} 27 | {"pattern":"はじめ", "process_type":"zenhan"} 28 | {"pattern":"後半", "process_type":"kouhan"} 29 | {"pattern":"後期", "process_type":"kouhan"} 30 | {"pattern":"終盤", "process_type":"kouhan"} 31 | {"pattern":"終わり", "process_type":"kouhan"} 32 | {"pattern":"末", "process_type":"kouhan"} 33 | {"pattern":"半ば", "process_type":"nakaba"} 34 | {"pattern":"中期", "process_type":"nakaba"} 35 | {"pattern":"中盤", "process_type":"nakaba"} 36 | {"pattern":"中頃", "process_type":"nakaba"} 37 | {"pattern":"中ごろ", "process_type":"nakaba"} 38 | {"pattern":"中旬", "process_type":"nakaba"} 39 | {"pattern":"上旬", "process_type":"joujun"} 40 | {"pattern":"中旬", "process_type":"tyujun"} 41 | {"pattern":"下旬", "process_type":"gejun"} 42 | {"pattern":"PM", "process_type":"gogo"} 43 | {"pattern":"AM", "process_type":"gozen"} 44 | {"pattern":"PM", "process_type":"gogo"} 45 | {"pattern":"AM", "process_type":"gozen"} 46 | {"pattern":"PM", "process_type":"gogo"} 47 | {"pattern":"AM", "process_type":"gozen"} 48 | {"pattern":" PM", "process_type":"gogo"} 49 | {"pattern":" AM", "process_type":"gozen"} 50 | -------------------------------------------------------------------------------- /src/dic/ja/chinese_character.txt: -------------------------------------------------------------------------------- 1 | {"character":"〇", "value":0, "NotationType":"09"} 2 | {"character":"一", "value":1, "NotationType":"09"} 3 | {"character":"二", "value":2, "NotationType":"09"} 4 | {"character":"三", "value":3, "NotationType":"09"} 5 | {"character":"四", "value":4, "NotationType":"09"} 6 | 
{"character":"五", "value":5, "NotationType":"09"} 7 | {"character":"六", "value":6, "NotationType":"09"} 8 | {"character":"七", "value":7, "NotationType":"09"} 9 | {"character":"八", "value":8, "NotationType":"09"} 10 | {"character":"九", "value":9, "NotationType":"09"} 11 | {"character":"零", "value":0, "NotationType":"09"} 12 | {"character":"十", "value":1, "NotationType":"sen"} 13 | {"character":"百", "value":2, "NotationType":"sen"} 14 | {"character":"千", "value":3, "NotationType":"sen"} 15 | {"character":"万", "value":4, "NotationType":"man"} 16 | {"character":"億", "value":8, "NotationType":"man"} 17 | {"character":"兆", "value":12, "NotationType":"man"} 18 | {"character":"京", "value":16, "NotationType":"man"} 19 | {"character":"壱", "value":1, "NotationType":"09"} 20 | {"character":"弐", "value":2, "NotationType":"09"} 21 | {"character":"参", "value":3, "NotationType":"09"} 22 | {"character":"伍", "value":5, "NotationType":"09"} 23 | {"character":"拾", "value":1, "NotationType":"sen"} 24 | {"character":"佰", "value":2, "NotationType":"sen"} 25 | {"character":"阡", "value":3, "NotationType":"sen"} 26 | {"character":"萬", "value":4, "NotationType":"man"} -------------------------------------------------------------------------------- /src/dic/ja/duration_expression_json.txt: -------------------------------------------------------------------------------- 1 | {"pattern":"世紀", "corresponding_time_position":["seiki"], "process_type":[], "ordinary":false, "option":""} 2 | {"pattern":"世紀半", "corresponding_time_position":["seiki"], "process_type":["han"], "ordinary":false, "option":""} 3 | {"pattern":"年間", "corresponding_time_position":["y"], "process_type":[], "ordinary":false, "option":""} 4 | {"pattern":"年間半", "corresponding_time_position":["y"], "process_type":["han"], "ordinary":false, "option":""} 5 | {"pattern":"年", "corresponding_time_position":["y"], "process_type":[], "ordinary":false, "option":""} 6 | {"pattern":"年半", "corresponding_time_position":["y"], 
"process_type":["han"], "ordinary":false, "option":""} 7 | {"pattern":"ヶ月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""} 8 | {"pattern":"ヶ月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""} 9 | {"pattern":"か月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""} 10 | {"pattern":"か月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""} 11 | {"pattern":"カ月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""} 12 | {"pattern":"カ月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""} 13 | {"pattern":"ヵ月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""} 14 | {"pattern":"ヵ月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""} 15 | {"pattern":"ケ月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""} 16 | {"pattern":"ケ月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""} 17 | {"pattern":"箇月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""} 18 | {"pattern":"箇月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""} 19 | {"pattern":"週間", "corresponding_time_position":["w"], "process_type":[], "ordinary":false, "option":""} 20 | {"pattern":"週間半", "corresponding_time_position":["w"], "process_type":["han"], "ordinary":false, "option":""} 21 | {"pattern":"日間", "corresponding_time_position":["d"], "process_type":[], "ordinary":false, "option":""} 22 | {"pattern":"日間半", "corresponding_time_position":["d"], "process_type":["han"], "ordinary":false, "option":""} 23 | {"pattern":"時間", "corresponding_time_position":["h"], "process_type":[], "ordinary":false, "option":""} 24 | {"pattern":"時間半", 
"corresponding_time_position":["h"], "process_type":["han"], "ordinary":false, "option":""} 25 | {"pattern":"分間", "corresponding_time_position":["mn"], "process_type":[], "ordinary":false, "option":""} 26 | {"pattern":"分間半", "corresponding_time_position":["mn"], "process_type":["han"], "ordinary":false, "option":""} 27 | {"pattern":"秒間", "corresponding_time_position":["s"], "process_type":[], "ordinary":false, "option":""} 28 | {"pattern":"秒間半", "corresponding_time_position":["s"], "process_type":["han"], "ordinary":false, "option":""} 29 | {"pattern":"年", "corresponding_time_position":["y"], "process_type":[], "ordinary":false, "option":""} 30 | {"pattern":"年半", "corresponding_time_position":["y"], "process_type":["han"], "ordinary":false, "option":""} 31 | {"pattern":"月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""} 32 | {"pattern":"月半", "corresponding_time_position":["m"], "process_type":["han"], "ordinary":false, "option":""} 33 | {"pattern":"週", "corresponding_time_position":["w"], "process_type":[], "ordinary":false, "option":""} 34 | {"pattern":"週半", "corresponding_time_position":["w"], "process_type":["han"], "ordinary":false, "option":""} 35 | {"pattern":"日", "corresponding_time_position":["d"], "process_type":[], "ordinary":false, "option":""} 36 | {"pattern":"日半", "corresponding_time_position":["d"], "process_type":["han"], "ordinary":false, "option":""} 37 | {"pattern":"分", "corresponding_time_position":["mn"], "process_type":[], "ordinary":false, "option":""} 38 | {"pattern":"分半", "corresponding_time_position":["mn"], "process_type":["han"], "ordinary":false, "option":""} 39 | {"pattern":"秒", "corresponding_time_position":["s"], "process_type":[], "ordinary":false, "option":""} 40 | {"pattern":"秒半", "corresponding_time_position":["s"], "process_type":["han"], "ordinary":false, "option":""} 41 | {"pattern":"年ǂヶ月", "corresponding_time_position":["y", "m"], "process_type":[], "ordinary":false, "option":""} 42 | 
{"pattern":"年ǂヶ月半", "corresponding_time_position":["y", "m"], "process_type":["han"], "ordinary":false, "option":""} 43 | {"pattern":"時間ǂ分", "corresponding_time_position":["h", "mn"], "process_type":[], "ordinary":false, "option":""} 44 | {"pattern":"時間ǂ分半", "corresponding_time_position":["h", "mn"], "process_type":["han"], "ordinary":false, "option":""} 45 | {"pattern":"分ǂ秒", "corresponding_time_position":["mn", "s"], "process_type":[], "ordinary":false, "option":""} 46 | {"pattern":"分ǂ秒半", "corresponding_time_position":["mn", "s"], "process_type":["han"], "ordinary":false, "option":""} 47 | -------------------------------------------------------------------------------- /src/dic/ja/duration_prefix_counter_json.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/dic/ja/duration_prefix_counter_json.txt -------------------------------------------------------------------------------- /src/dic/ja/duration_prefix_json.txt: -------------------------------------------------------------------------------- 1 | {"pattern":"約", "process_type":"about"} 2 | {"pattern":"だいたい", "process_type":"about"} 3 | {"pattern":"ほぼ", "process_type":"about"} 4 | {"pattern":"およそ", "process_type":"about"} 5 | {"pattern":"ほとんど", "process_type":"about"} 6 | {"pattern":"全", "process_type":"none"} 7 | {"pattern":"ちょうど", "process_type":"none"} 8 | {"pattern":"第", "process_type":"ordinary"} 9 | {"pattern":"~", "process_type":"kara_prefix"} 10 | {"pattern":"〜", "process_type":"kara_prefix"} 11 | {"pattern":"~", "process_type":"kara_prefix"} 12 | {"pattern":"-", "process_type":"kara_prefix"} 13 | {"pattern":"−", "process_type":"kara_prefix"} 14 | {"pattern":"ー", "process_type":"kara_prefix"} 15 | {"pattern":"から", "process_type":"kara_prefix"} 16 | -------------------------------------------------------------------------------- /src/dic/ja/duration_suffix_json.txt: 
-------------------------------------------------------------------------------- 1 | {"pattern":"目", "process_type":"ordinary"} 2 | {"pattern":"以下", "process_type":"or_less"} 3 | {"pattern":"以前", "process_type":"or_less"} 4 | {"pattern":"以内", "process_type":"or_less"} 5 | {"pattern":"まで", "process_type":"made"} 6 | {"pattern":"迄", "process_type":"or_less"} 7 | {"pattern":"未満", "process_type":"less"} 8 | {"pattern":"以上", "process_type":"or_over"} 9 | {"pattern":"以降", "process_type":"or_over"} 10 | {"pattern":"超", "process_type":"or_over"} 11 | {"pattern":"越え", "process_type":"or_over"} 12 | {"pattern":"超え", "process_type":"or_over"} 13 | {"pattern":"~", "process_type":"kara_suffix"} 14 | {"pattern":"〜", "process_type":"kara_suffix"} 15 | {"pattern":"~", "process_type":"kara_suffix"} 16 | {"pattern":"-", "process_type":"kara_suffix"} 17 | {"pattern":"−", "process_type":"kara_suffix"} 18 | {"pattern":"ー", "process_type":"kara_suffix"} 19 | {"pattern":"から", "process_type":"kara_suffix"} 20 | {"pattern":"くらい", "process_type":"about"} 21 | {"pattern":"ばかり", "process_type":"about"} 22 | {"pattern":"前後", "process_type":"about"} 23 | {"pattern":"程度", "process_type":"about"} 24 | {"pattern":"ほど", "process_type":"about"} 25 | {"pattern":"近く", "process_type":"about"} 26 | {"pattern":"頃", "process_type":"about"} 27 | {"pattern":"ごろ", "process_type":"about"} 28 | {"pattern":"余り", "process_type":"kyou"} 29 | {"pattern":"強", "process_type":"kyou"} 30 | {"pattern":"弱", "process_type":"jaku"} 31 | {"pattern":"台", "process_type":"dai"} 32 | {"pattern":"代", "process_type":"dai"} 33 | {"pattern":"毎", "process_type":"per"} 34 | -------------------------------------------------------------------------------- /src/dic/ja/inappropriate_strings_json.txt: -------------------------------------------------------------------------------- 1 | {"str":"一切"} 2 | {"str":"一部"} 3 | {"str":"一連"} 4 | {"str":"三振"} 5 | {"str":"一段"} 6 | {"str":"一体"} 7 | {"str":"九州"} 8 | {"str":"四国"} 9 | {"str":"一種"} 10 | 
{"str":"一番"} 11 | -------------------------------------------------------------------------------- /src/dic/ja/num_prefix_counter_json.txt: -------------------------------------------------------------------------------- 1 | {"pattern":"¥", "counter":"円", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"} 2 | {"pattern":"¥", "counter":"円", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"} 3 | {"pattern":"$", "counter":"ドル", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"} 4 | {"pattern":"$", "counter":"ドル", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"} 5 | {"pattern":"€", "counter":"ユーロ", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"} 6 | {"pattern":"£", "counter":"ポンド", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"} 7 | {"pattern":"小さじ", "counter":"小さじ", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"} 8 | {"pattern":"大さじ", "counter":"大さじ", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"} 9 | {"pattern":"時速", "counter":"/h", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"} 10 | {"pattern":"毎時", "counter":"/h", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"} 11 | {"pattern":"分速", "counter":"/m", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"} 12 | {"pattern":"毎分", "counter":"/m", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"} 13 | {"pattern":"秒速", "counter":"/s", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"} 14 | {"pattern":"毎秒", "counter":"/s", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"} 15 | {"pattern":"週", "counter":"/week", "SI_prefix":0, 
"optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"} 16 | {"pattern":"月", "counter":"/month", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"} 17 | {"pattern":"年", "counter":"/year", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"add_suffix_counter"} 18 | {"pattern":"最大", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"saidai"} 19 | {"pattern":"最長", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"saityou"} 20 | {"pattern":"最高", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"saikou"} 21 | {"pattern":"華氏", "counter":"℉", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"} 22 | {"pattern":"摂氏", "counter":"℃", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"counter"} 23 | {"pattern":"風速", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"fusoku"} 24 | {"pattern":"水温", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"suion"} 25 | {"pattern":"北緯", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"hokui"} 26 | {"pattern":"南緯", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"nanni"} 27 | {"pattern":"東経", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"toukei"} 28 | {"pattern":"西経", "counter":"*", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"seikei"} 29 | -------------------------------------------------------------------------------- /src/dic/ja/num_prefix_json.txt: -------------------------------------------------------------------------------- 1 | {"pattern":"第", "process_type":"ordinary"} 2 | {"pattern":"約", "process_type":"about"} 3 | {"pattern":"だいたい", "process_type":"about"} 4 | {"pattern":"ほぼ", "process_type":"about"} 5 
| {"pattern":"およそ", "process_type":"about"} 6 | {"pattern":"ほとんど", "process_type":"about"} 7 | {"pattern":"全", "process_type":"none"} 8 | {"pattern":"ちょうど", "process_type":"none"} 9 | {"pattern":"~", "process_type":"kara_prefix"} 10 | {"pattern":"〜", "process_type":"kara_prefix"} 11 | {"pattern":"~", "process_type":"kara_prefix"} 12 | {"pattern":"-", "process_type":"kara_prefix"} 13 | {"pattern":"−", "process_type":"kara_prefix"} 14 | {"pattern":"ー", "process_type":"kara_prefix"} 15 | {"pattern":"から", "process_type":"kara_prefix"} 16 | -------------------------------------------------------------------------------- /src/dic/ja/num_suffix_json.txt: -------------------------------------------------------------------------------- 1 | {"pattern":"目", "process_type":"ordinary"} 2 | {"pattern":"以下", "process_type":"or_less"} 3 | {"pattern":"以前", "process_type":"or_less"} 4 | {"pattern":"以内", "process_type":"or_less"} 5 | {"pattern":"まで", "process_type":"made"} 6 | {"pattern":"迄", "process_type":"or_less"} 7 | {"pattern":"未満", "process_type":"less"} 8 | {"pattern":"以上", "process_type":"or_over"} 9 | {"pattern":"以降", "process_type":"or_over"} 10 | {"pattern":"超", "process_type":"or_over"} 11 | {"pattern":"越え", "process_type":"or_over"} 12 | {"pattern":"超え", "process_type":"or_over"} 13 | {"pattern":"~", "process_type":"kara_suffix"} 14 | {"pattern":"〜", "process_type":"kara_suffix"} 15 | {"pattern":"~", "process_type":"kara_suffix"} 16 | {"pattern":"-", "process_type":"kara_suffix"} 17 | {"pattern":"−", "process_type":"kara_suffix"} 18 | {"pattern":"ー", "process_type":"kara_suffix"} 19 | {"pattern":"から", "process_type":"kara_suffix"} 20 | {"pattern":"くらい", "process_type":"about"} 21 | {"pattern":"ばかり", "process_type":"about"} 22 | {"pattern":"前後", "process_type":"about"} 23 | {"pattern":"程度", "process_type":"about"} 24 | {"pattern":"ほど", "process_type":"about"} 25 | {"pattern":"近く", "process_type":"about"} 26 | {"pattern":"頃", "process_type":"about"} 27 | {"pattern":"ごろ", 
"process_type":"about"} 28 | {"pattern":"余り", "process_type":"kyou"} 29 | {"pattern":"強", "process_type":"kyou"} 30 | {"pattern":"弱", "process_type":"jaku"} 31 | {"pattern":"台", "process_type":"dai"} 32 | {"pattern":"代", "process_type":"dai"} 33 | {"pattern":"毎", "process_type":"per"} 34 | {"pattern":"半", "process_type":"han"} 35 | {"pattern":"/時", "process_type":"/h"} 36 | {"pattern":"/分", "process_type":"/min"} 37 | {"pattern":"/秒", "process_type":"/sec"} 38 | {"pattern":"/時", "process_type":"/h"} 39 | {"pattern":"/分", "process_type":"/min"} 40 | {"pattern":"/秒", "process_type":"/sec"} 41 | -------------------------------------------------------------------------------- /src/dic/ja/raw/abstime_date.txt: -------------------------------------------------------------------------------- 1 | 世紀 2 | seiki 3 | 年 4 | y 5 | 年*月 6 | y m 7 | 年*月*日 8 | y m d 9 | 月 10 | m 11 | 月*日 12 | m d 13 | 日 14 | d 15 | /* 16 | m d 17 | /*/* 18 | y m d 19 | /* 20 | m d 21 | /*/* 22 | y m d 23 | -*-* 24 | y m d 25 | −*−* 26 | y m d 27 | ー*ー* 28 | y m d 29 | .* 30 | m d 31 | .*.* 32 | y m d 33 | .* 34 | m d 35 | .*.* 36 | y m d 37 | ・* 38 | m d 39 | ・*・* 40 | y m d 41 | ,* 42 | m d 43 | ,*,* 44 | y m d 45 | -------------------------------------------------------------------------------- /src/dic/ja/raw/abstime_dayweek.txt: -------------------------------------------------------------------------------- 1 | 月 Mon 2 | 火 Tue 3 | 水 Wed 4 | 木 Thu 5 | 金 Fri 6 | 土 Sat 7 | 日 Sun 8 | 月曜 Mon 9 | 火曜 Tue 10 | 水曜 Wed 11 | 木曜 Thu 12 | 金曜 Fri 13 | 土曜 Sat 14 | 日曜 Sun 15 | 月曜日 Mon 16 | 火曜日 Tue 17 | 水曜日 Wed 18 | 木曜日 Thu 19 | 金曜日 Fri 20 | 土曜日 Sat 21 | 日曜日 Sun 22 | Monday Mon 23 | Tuesday Tue 24 | Wednesday Wed 25 | Thursday Thu 26 | Friday Fri 27 | Saturday Sat 28 | Sunday Sun 29 | Mon Mon 30 | Tue Tue 31 | Wed Wed 32 | Thu Thu 33 | Fri Fri 34 | Sat Sat 35 | Sun Sun 36 | Monday Mon 37 | Tuesday Tue 38 | Wednesday Wed 39 | Thursday Thu 40 | Friday Fri 41 | Saturday Sat 42 | Sunday Sun 43 | Mon Mon 44 | Tue 
Tue 45 | Wed Wed 46 | Thu Thu 47 | Fri Fri 48 | Sat Sat 49 | Sun Sun -------------------------------------------------------------------------------- /src/dic/ja/raw/abstime_dayweek_pattern.txt: -------------------------------------------------------------------------------- 1 | 2 |   3 | 4 | * 5 | (*) 6 | (*) 7 | * 8 |  *  9 | -------------------------------------------------------------------------------- /src/dic/ja/raw/abstime_nengou.txt: -------------------------------------------------------------------------------- 1 | 西暦 0 2 | 飛鳥時代 644 3 | 白雉 649 4 | 朱鳥 685 5 | 大宝 700 6 | 慶雲 703 7 | 和銅 707 8 | 奈良時代 714 9 | 養老 716 10 | 神亀 723 11 | 天平 728 12 | 天平感宝 748 13 | 天平勝宝 748 14 | 天平宝字 756 15 | 天平神護 764 16 | 神護景雲 766 17 | 宝亀 769 18 | 天応 780 19 | 延暦 781 20 | 平安時代 805 21 | 弘仁 809 22 | 天長 823 23 | 承和 833 24 | 嘉祥 847 25 | 仁寿 850 26 | 斎衡 853 27 | 天安 856 28 | 貞観 858 29 | 元慶 876 30 | 仁和 884 31 | 寛平 888 32 | 昌泰 897 33 | 延喜 900 34 | 延長 922 35 | 承平 930 36 | 天慶 937 37 | 天暦 946 38 | 天徳 956 39 | 応和 960 40 | 康保 963 41 | 安和 967 42 | 天禄 969 43 | 天延 972 44 | 貞元 975 45 | 天元 977 46 | 永観 982 47 | 寛和 984 48 | 永延 986 49 | 永祚 988 50 | 正暦 989 51 | 長徳 994 52 | 長保 998 53 | 寛弘 1003 54 | 長和 1011 55 | 寛仁 1016 56 | 治安 1020 57 | 万寿 1023 58 | 長元 1027 59 | 長暦 1036 60 | 長久 1039 61 | 寛徳 1043 62 | 永承 1045 63 | 天喜 1052 64 | 康平 1057 65 | 治暦 1064 66 | 延久 1068 67 | 承保 1073 68 | 承暦 1076 69 | 永保 1080 70 | 応徳 1083 71 | 寛治 1086 72 | 嘉保 1093 73 | 永長 1095 74 | 承徳 1096 75 | 康和 1098 76 | 長治 1103 77 | 嘉承 1105 78 | 天仁 1107 79 | 天永 1109 80 | 永久 1112 81 | 元永 1117 82 | 保安 1119 83 | 天治 1123 84 | 大治 1125 85 | 天承 1130 86 | 長承 1131 87 | 保延 1134 88 | 永治 1140 89 | 康治 1141 90 | 天養 1143 91 | 久安 1144 92 | 仁平 1150 93 | 久寿 1153 94 | 保元 1155 95 | 平治 1158 96 | 永暦 1159 97 | 応保 1160 98 | 長寛 1162 99 | 永万 1164 100 | 仁安 1165 101 | 嘉応 1168 102 | 承安 1170 103 | 安元 1174 104 | 治承 1176 105 | 養和 1180 106 | 寿永 1181 107 | 元暦 1183 108 | 文治 1184 109 | 鎌倉時代 1189 110 | 正治 1198 111 | 建仁 1200 112 | 元久 1203 113 | 建永 1205 114 | 承元 1206 115 | 建暦 1210 116 | 
建保 1212 117 | 承久 1218 118 | 貞応 1221 119 | 元仁 1223 120 | 嘉禄 1224 121 | 安貞 1226 122 | 寛喜 1228 123 | 貞永 1231 124 | 天福 1232 125 | 文暦 1233 126 | 嘉禎 1234 127 | 暦仁 1237 128 | 延応 1238 129 | 仁治 1239 130 | 寛元 1242 131 | 宝治 1246 132 | 建長 1248 133 | 康元 1255 134 | 正嘉 1256 135 | 正元 1258 136 | 文応 1259 137 | 弘長 1260 138 | 文永 1263 139 | 建治 1274 140 | 弘安 1277 141 | 正応 1287 142 | 永仁 1292 143 | 正安 1298 144 | 乾元 1301 145 | 嘉元 1302 146 | 徳治 1305 147 | 延慶 1307 148 | 応長 1310 149 | 正和 1311 150 | 文保 1316 151 | 元応 1318 152 | 元亨 1320 153 | 正中 1323 154 | 嘉暦 1325 155 | 南北朝時代 1328 156 | 北朝 1328 157 | 南朝 1330 158 | 北朝 1331 159 | 南朝 1333 160 | 北朝 1333 161 | 南朝 1335 162 | 北朝 1337 163 | 南朝 1339 164 | 北朝 1341 165 | 北朝 1344 166 | 南朝 1345 167 | 北朝 1349 168 | 北朝 1351 169 | 北朝 1355 170 | 北朝 1360 171 | 北朝 1361 172 | 北朝 1367 173 | 南朝 1369 174 | 南朝 1371 175 | 南朝 1374 176 | 北朝 1374 177 | 北朝 1378 178 | 南朝 1380 179 | 北朝 1380 180 | 南朝 1383 181 | 北朝 1383 182 | 北朝 1386 183 | 北朝 1388 184 | 北朝 1389 185 | 室町時代 1391 186 | 応永 1393 187 | 正長 1427 188 | 永享 1428 189 | 嘉吉 1440 190 | 文安 1443 191 | 宝徳 1448 192 | 享徳 1451 193 | 康正 1454 194 | 長禄 1456 195 | 寛正 1459 196 | 文正 1465 197 | 応仁 1466 198 | 文明 1468 199 | 長享 1486 200 | 延徳 1488 201 | 明応 1491 202 | 文亀 1500 203 | 永正 1503 204 | 大永 1520 205 | 享禄 1527 206 | 天文 1531 207 | 弘治 1554 208 | 永禄 1557 209 | 元亀 1569 210 | 安土桃山時代 1572 211 | 文禄 1591 212 | 慶長 1595 213 | 江戸時代 1614 214 | 寛永 1623 215 | 正保 1643 216 | 慶安 1647 217 | 承応 1651 218 | 明暦 1654 219 | 万治 1657 220 | 寛文 1660 221 | 延宝 1672 222 | 天和 1680 223 | 貞享 1683 224 | 元禄 1687 225 | 宝永 1703 226 | 正徳 1710 227 | 享保 1715 228 | 元文 1735 229 | 寛保 1740 230 | 延享 1743 231 | 寛延 1747 232 | 宝暦 1750 233 | 明和 1763 234 | 安永 1771 235 | 天明 1780 236 | 寛政 1788 237 | 享和 1800 238 | 文化 1803 239 | 文政 1817 240 | 天保 1829 241 | 弘化 1843 242 | 嘉永 1847 243 | 安政 1853 244 | 万延 1859 245 | 文久 1860 246 | 元治 1863 247 | 慶応 1864 248 | 明治 1867 249 | 大正 1911 250 | 昭和 1925 251 | S 1925 252 | S 1925 253 | 戦後 1945 254 | 平成 1988 255 | H 1988 256 | H 1988 
-------------------------------------------------------------------------------- /src/dic/ja/raw/abstime_prefix_counter.txt: -------------------------------------------------------------------------------- 1 | 紀元前 kigenzen 2 | 午前 gozen 3 | 午後 gogo 4 | AM gozen 5 | PM gogo -------------------------------------------------------------------------------- /src/dic/ja/raw/abstime_settouji.txt: -------------------------------------------------------------------------------- 1 | だいたい about 2 | およそ about 3 | ちょうど none 4 | ~ kara_prefix 5 | 〜 kara_prefix 6 | ~ kara_prefix 7 | - kara_prefix 8 | − kara_prefix 9 | ー kara_prefix 10 | から kara_prefix 11 | PM gogo 12 | AM gozen 13 | PM gogo 14 | AM gozen 15 | PM gogo 16 | AM gozen 17 | PM  gogo 18 | AM  gozen 19 | 朝 asa 20 | 夜 yoru 21 | 深夜 sinnya -------------------------------------------------------------------------------- /src/dic/ja/raw/abstime_setubiji.txt: -------------------------------------------------------------------------------- 1 | 以前 >= 2 | まで made 3 | 迄 >= 4 | より前 > 5 | 以降 <= 6 | より後 < 7 | ~ kara_suffix 8 | 〜 kara_suffix 9 | ~ kara_suffix 10 | - kara_suffix 11 | − kara_suffix 12 | ー kara_suffix 13 | から kara_suffix 14 | くらい about 15 | ばかり about 16 | 前後 about 17 | 近く about 18 | 頃 about 19 | ごろ about 20 | 頭 zenhan 21 | 前半 zenhan 22 | 前記 zenhan 23 | 初頭 zenhan 24 | 初期 zenhan 25 | 初め zenhan 26 | 始め zenhan 27 | はじめ zenhan 28 | 後半 kouhan 29 | 後期 kouhan 30 | 終盤 kouhan 31 | 終わり kouhan 32 | 末 kouhan 33 | 半ば nakaba 34 | 中期 nakaba 35 | 中盤 nakaba 36 | 中頃 nakaba 37 | 中ごろ nakaba 38 | 中旬 nakaba 39 | 上旬 joujun 40 | 中旬 tyujun 41 | 下旬 gejun 42 | PM gogo 43 | AM gozen 44 | PM gogo 45 | AM gozen 46 | PM gogo 47 | AM gozen 48 |  PM gogo 49 |  AM gozen 50 | 朝 asa 51 | 夜 yoru -------------------------------------------------------------------------------- /src/dic/ja/raw/abstime_time.txt: -------------------------------------------------------------------------------- 1 | 時 2 | h 3 | 時半 4 | h 5 | 時*分 6 | h mn 7 | 時*分*秒 8 | h mn s 9 | 分*秒 
# ---- dump residue kept verbatim: tail of /src/dic/ja/raw/abstime_time.txt ----
# 10 | mn s
# 11 | :*
# 12 | h mn
# 13 | :*
# 14 | h mn
# 15 | :*:*
# 16 | h mn s
# 17 | :*:*
# 18 | h mn s
# ---- /src/dic/ja/raw/create_dic_abstime.py ----
# -*- coding: utf-8 -*-
"""Print the absolute-time expression dictionary, one JSON object per line.

Reads abstime_dayweek.txt, abstime_dayweek_pattern.txt, abstime_date.txt and
abstime_time.txt from the current directory and writes the entries to stdout
(make_dictionary.sh redirects them into ../abstime_expression_json.txt).

``print(x)`` with a single argument is used so the script runs unchanged
under both Python 2 and Python 3.
"""

# NOTE: combined date+time patterns are intentionally not generated here.
# The original author noted that generating them naively produces a
# dictionary of roughly 18 MB; lookup time barely changes, but the initial
# load then takes about five seconds.


def create_list_expression(lst):
    """Render *lst* (a sequence of strings) as a JSON-style array literal.

    An empty sequence yields ``"[]"``; items are quoted but not escaped.
    """
    return "[" + ", ".join("\"" + item + "\"" for item in lst) + "]"


def create_process_type(str):
    """Return the list of process-type tags implied by a pattern string.

    ``"han"`` is added when the pattern ends with 半, ``"unclear"`` when the
    pattern is one of the bare separator forms listed below.
    """
    # The parameter keeps its original name (it shadows the builtin) so the
    # function signature stays unchanged.
    process_type = []
    if str.endswith("半"):
        process_type.append("han")
    # Hyphenated forms such as 3-3 (for March 3rd) seemed unlikely to the
    # original author, so they are deliberately left out of this list.
    if str in ["/ǂ", "/ǂ", ".ǂ", "・ǂ", ".ǂ", ",ǂ"]:
        process_type.append("unclear")
    return process_type


def format_entry(pattern, positions_expr, process_type, option):
    """Build one dictionary line.

    *positions_expr* is an already rendered array literal; *process_type* is
    a list of tags rendered here; *option* is inserted verbatim.
    """
    return ("{\"pattern\":\"" + pattern
            + "\", \"corresponding_time_position\":" + positions_expr
            + ", \"process_type\":" + create_list_expression(process_type)
            + ", \"ordinary\":false, \"option\":\"" + option + "\"}")


def read_pairs(path):
    """Read a two-lines-per-entry file into ``(pattern, positions)`` tuples.

    Every line is right-stripped and each ``*`` placeholder is replaced by
    ``ǂ``.  A trailing unpaired line is ignored, as in the original loop.
    """
    pairs = []
    pending = []
    with open(path, "r") as fin:
        for line in fin:
            pending.append(line.rstrip().replace("*", "ǂ"))
            if len(pending) == 2:
                pairs.append(tuple(pending))
                pending = []
    return pairs


def load_dayweek_suffixes(dayweek_lines, pattern_lines):
    """Expand every day-of-week word through every surrounding pattern.

    *dayweek_lines* hold ``<word> <tag>`` pairs, *pattern_lines* hold
    templates in which ``*`` stands for the word.  Returns a list of
    ``(suffix, option)`` tuples; *option* is the tag when the template
    contained ``*`` and ``""`` otherwise.  Suffixes that are empty after
    stripping trailing whitespace are dropped, and blank day-of-week lines
    are skipped instead of crashing the tuple unpack.
    """
    patterns = [pattern.rstrip("\n") for pattern in pattern_lines]
    suffixes = []
    for line in dayweek_lines:
        fields = line.rstrip().split()
        if not fields:
            continue
        dayweek, dayweek_tag = fields  # still a ValueError on malformed rows
        for pattern in patterns:
            option = dayweek_tag if pattern.count("*") != 0 else ""
            suffix = pattern.replace("*", dayweek).rstrip().rstrip(" ")
            if suffix == "":
                continue
            suffixes.append((suffix, option))
    return suffixes


def expression_entries(pairs, dayweek_suffixes):
    """Yield dictionary lines for *pairs*.

    Each pair produces its base entry; when the positions column contains
    ``d`` the pattern is additionally emitted once per day-of-week suffix,
    with that suffix's tag as the ``option`` value.
    """
    for pattern, positions in pairs:
        positions_expr = create_list_expression(positions.split())
        yield format_entry(pattern, positions_expr,
                           create_process_type(pattern), "")
        if positions.find("d") != -1:
            for suffix, option in dayweek_suffixes:
                full_pattern = pattern + suffix
                yield format_entry(full_pattern, positions_expr,
                                   create_process_type(full_pattern), option)


def main():
    """Generate date entries (with day-of-week variants), then time entries."""
    with open("abstime_dayweek.txt", "r") as fin:
        dayweek_lines = fin.readlines()
    # The template file is read once instead of once per day-of-week line.
    with open("abstime_dayweek_pattern.txt", "r") as fin:
        pattern_lines = fin.readlines()
    suffixes = load_dayweek_suffixes(dayweek_lines, pattern_lines)

    for entry in expression_entries(read_pairs("abstime_date.txt"), suffixes):
        print(entry)
    # Time expressions never take day-of-week suffixes.
    for entry in expression_entries(read_pairs("abstime_time.txt"), []):
        print(entry)


if __name__ == "__main__":
    main()
# ---- /src/dic/ja/raw/create_dic_abstime_date+time.py ----
# -*- coding: utf-8 -*-
"""Print time, date and combined date+time expression entries as JSON lines.

Reads abstime_dayweek.txt, abstime_dayweek_pattern.txt, abstime_time.txt and
abstime_date.txt from the current directory.  The original author noted that
the combined output is about 18 MB and adds roughly five seconds to the
initial dictionary load.
"""

# Optional AM/PM markers appended after every day-of-week variant.
lst_gogo = ["", "午前", "午後", "AM", "PM"]


def create_list_expression(lst):
    """Render *lst* (a sequence of strings) as a JSON-style array literal."""
    return "[" + ", ".join("\"" + item + "\"" for item in lst) + "]"


def create_process_type(str):
    """Return the process-type tags implied by a pattern string.

    Tags are appended in this order: ``gozen`` (contains 午前 or AM),
    ``gogo`` (contains 午後 or PM), ``seiki`` (equals 世紀), ``han`` (ends
    with 半).  The parameter keeps its original, builtin-shadowing name.
    """
    process_type = []
    if str.find("午前") != -1 or str.find("AM") != -1:
        process_type.append("gozen")
    if str.find("午後") != -1 or str.find("PM") != -1:
        process_type.append("gogo")
    if str == "世紀":
        process_type.append("seiki")
    if str.endswith("半"):
        process_type.append("han")
    return process_type


def format_entry(pattern, positions_expr, process_type):
    """Build one dictionary line; ``option`` is always empty in this script."""
    return ("{\"pattern\":\"" + pattern
            + "\", \"corresponding_time_position\":" + positions_expr
            + ", \"process_type\":" + create_list_expression(process_type)
            + ", \"ordinary\":false, \"option\":\"\"}")


def read_pairs(path):
    """Read a two-lines-per-entry file into ``(pattern, positions)`` tuples.

    Lines are right-stripped and ``*`` is replaced by ``ǂ``; a trailing
    unpaired line is ignored.
    """
    pairs = []
    pending = []
    with open(path, "r") as fin:
        for line in fin:
            pending.append(line.rstrip().replace("*", "ǂ"))
            if len(pending) == 2:
                pairs.append(tuple(pending))
                pending = []
    return pairs


def dayweek_variants(dayweek_lines, pattern_lines):
    """Expand day-of-week words through the templates and AM/PM markers.

    Only the first whitespace-separated column of each day-of-week line is
    used: abstime_dayweek.txt carries a second tag column (``月 Mon``) which
    the original code pasted into the pattern by mistake.  Blank lines are
    skipped.
    """
    patterns = [pattern.rstrip("\n") for pattern in pattern_lines]
    variants = []
    for line in dayweek_lines:
        fields = line.split()
        if not fields:
            continue
        for pattern in patterns:
            expanded = pattern.replace("*", fields[0])
            for gogo in lst_gogo:
                variants.append(expanded + gogo)
    return variants


def time_entries(time_pairs):
    """Yield one entry per time pattern."""
    for pattern, positions in time_pairs:
        yield format_entry(pattern, create_list_expression(positions.split()),
                           create_process_type(pattern))


def date_entries(date_pairs, variants):
    """Yield date entries plus, for day-level patterns, one per variant.

    The base entry always carries an empty ``process_type``; variants are
    right-stripped before being appended to the pattern.
    """
    for pattern, positions in date_pairs:
        positions_expr = create_list_expression(positions.split())
        yield format_entry(pattern, positions_expr, [])
        if positions.find("d") != -1:
            for variant in variants:
                full_pattern = pattern + variant.rstrip().rstrip(" ")
                yield format_entry(full_pattern, positions_expr,
                                   create_process_type(full_pattern))


def date_time_entries(date_pairs, time_pairs, variants):
    """Yield entries joining day-level dates, variants and hour-level times.

    Fixes two defects of the original loop: the time line was read through
    an undefined name (NameError on the first iteration), and skipping a
    time pattern without ``h`` left the pair buffer full, silently dropping
    every later time pattern.
    """
    for pattern, positions in date_pairs:
        if positions.find("d") == -1:
            continue
        for variant in variants:
            for time_pattern, time_positions in time_pairs:
                if time_positions.find("h") == -1:
                    continue
                positions_expr = create_list_expression(
                    positions.split() + time_positions.split())
                # Tags are derived from the pattern without the joining ǂ,
                # as in the original.
                tags = create_process_type(pattern + variant + time_pattern)
                yield format_entry(pattern + variant + "ǂ" + time_pattern,
                                   positions_expr, tags)


def main():
    """Print time entries, then date entries, then date+time entries."""
    with open("abstime_dayweek.txt", "r") as fin:
        dayweek_lines = fin.readlines()
    with open("abstime_dayweek_pattern.txt", "r") as fin:
        pattern_lines = fin.readlines()
    variants = dayweek_variants(dayweek_lines, pattern_lines)
    # Both pair files are read once instead of once per inner iteration.
    time_pairs = read_pairs("abstime_time.txt")
    date_pairs = read_pairs("abstime_date.txt")

    for entry in time_entries(time_pairs):
        print(entry)
    for entry in date_entries(date_pairs, variants):
        print(entry)
    for entry in date_time_entries(date_pairs, time_pairs, variants):
        print(entry)


if __name__ == "__main__":
    main()


# ---- /src/dic/ja/raw/create_dic_abstime_prefix_counter.py ----
# -*- coding: utf-8 -*-
"""Print the absolute-time prefix-counter dictionary as JSON lines."""


def nengou_entries(lines):
    """Yield one entry per ``<pattern> <value>`` line of abstime_nengou.txt.

    The second column becomes the single ``process_type`` element and the
    option is always ``seireki``.  Blank lines are skipped.
    """
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        yield ("{\"pattern\":\"" + fields[0]
               + "\", \"corresponding_time_position\":[], \"process_type\":[\""
               + fields[1] + "\"], \"ordinary\":false, \"option\":\"seireki\"}")


def abstime_prefix_counter_entries(lines):
    """Yield one entry per ``<pattern> <option>`` line; blank lines skipped."""
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        yield ("{\"pattern\":\"" + fields[0]
               + "\", \"corresponding_time_position\":[], \"process_type\":[],"
               + " \"ordinary\":false, \"option\":\"" + fields[1] + "\"}")


def main():
    """Print era-name entries followed by the plain prefix counters."""
    with open("abstime_nengou.txt", "r") as fin:
        for entry in nengou_entries(fin):
            print(entry)
    with open("abstime_prefix_counter.txt", "r") as fin:
        for entry in abstime_prefix_counter_entries(fin):
            print(entry)


if __name__ == "__main__":
    main()


# ---- /src/dic/ja/raw/create_dic_dayweek.py ----
# -*- coding: utf-8 -*-
"""Print every day-of-week word expanded through every template."""


def dayweek_entries(dayweek_lines, pattern_lines):
    """Yield ``{"pattern":"..."}`` for each word/template combination.

    Fixes the missing closing quote of the original output and uses only
    the first column of each day-of-week line (the file also carries a tag
    column).  Blank day-of-week lines are skipped; templates are used as
    they are, minus the newline.
    """
    patterns = [pattern.rstrip("\n") for pattern in pattern_lines]
    for line in dayweek_lines:
        fields = line.split()
        if not fields:
            continue
        for pattern in patterns:
            yield "{\"pattern\":\"" + pattern.replace("*", fields[0]) + "\"}"


def main():
    """Read both input files once and print the expanded patterns."""
    with open("abstime_dayweek.txt", "r") as fin:
        dayweek_lines = fin.readlines()
    with open("abstime_dayweek_pattern.txt", "r") as fin:
        pattern_lines = fin.readlines()
    for entry in dayweek_entries(dayweek_lines, pattern_lines):
        print(entry)


if __name__ == "__main__":
    main()


# ---- /src/dic/ja/raw/create_dic_duration.py ----
# -*- coding: utf-8 -*-
"""Print the duration expression dictionary as JSON lines."""


def create_list_expression(lst_org):
    """Render time positions as a JSON-style array literal.

    *lst_org* is normally a comma-separated string (``"y,m"``); any other
    iterable of strings is accepted as well.  An empty input yields
    ``"[]"`` - the original emptiness check could never trigger because
    ``"".split(",")`` returns ``[""]``.
    """
    if hasattr(lst_org, "split"):
        items = lst_org.split(",") if lst_org else []
    else:
        items = list(lst_org)
    return "[" + ", ".join("\"" + item + "\"" for item in items) + "]"


def duration_entries(lines):
    """Yield two entries per ``<pattern> <positions>`` line.

    The first is the pattern itself (``*`` replaced by ``ǂ``); the second
    appends 半 and is tagged ``han``.  Blank lines are skipped.
    """
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        pattern = fields[0].replace("*", "ǂ")
        positions_expr = create_list_expression(fields[1])
        yield ("{\"pattern\":\"" + pattern
               + "\", \"corresponding_time_position\":" + positions_expr
               + ", \"process_type\":[], \"ordinary\":false, \"option\":\"\"}")
        yield ("{\"pattern\":\"" + pattern
               + "半\", \"corresponding_time_position\":" + positions_expr
               + ", \"process_type\":[\"han\"], \"ordinary\":false,"
               + " \"option\":\"\"}")


def main():
    """Convert duration_time_position.txt to dictionary lines on stdout."""
    with open("duration_time_position.txt", "r") as fin:
        for entry in duration_entries(fin):
            print(entry)


if __name__ == "__main__":
    main()
# ---- /src/dic/ja/raw/create_dic_inappropriate.py ----
# -*- coding: utf-8 -*-
"""Print the inappropriate-strings dictionary as JSON lines."""


def inappropriate_entries(lines):
    """Yield ``{"str":"..."}`` per non-blank line (right-stripped).

    Blank lines are skipped so that no empty-string entry is emitted.
    """
    for line in lines:
        text = line.rstrip()
        if not text:
            continue
        yield "{\"str\":\"" + text + "\"}"


def main():
    """Convert inappropriate_strings.txt to dictionary lines on stdout."""
    with open("inappropriate_strings.txt", "r") as fin:
        for entry in inappropriate_entries(fin):
            print(entry)


if __name__ == "__main__":
    main()


# ---- /src/dic/ja/raw/create_dic_num.py ----
# -*- coding: utf-8 -*-
"""Print the numeric counter dictionary as JSON lines.

Sources, in output order: num.txt, the three SI prefix/unit file pairs
(katakana, hankaku, zenkaku), num_expand.txt and num_wari.txt.
"""

# (prefix file, unit file) pairs; the original repeated the same loop three
# times, once per pair.
SI_FILE_PAIRS = [
    ("num_SItanni_settouji_katakana.txt", "num_SItanni_katakana.txt"),
    ("num_SItanni_settouji_hankaku.txt", "num_SItanni_hankaku.txt"),
    ("num_SItanni_settouji_zenkaku.txt", "num_SItanni_zenkaku.txt"),
]


def counter_entry(pattern, counter, si_prefix, power_of_ten):
    """Build one counter line; both numeric fields are inserted via ``str``."""
    return ("{\"pattern\":\"" + pattern + "\", \"counter\":\"" + counter
            + "\", \"SI_prefix\":" + str(si_prefix)
            + ", \"optional_power_of_ten\":" + str(power_of_ten)
            + ", \"ordinary\":false, \"option\":\"\"}")


def plain_counter_entries(lines):
    """Yield an entry per ``<pattern> <counter>`` line; blank lines skipped."""
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        yield counter_entry(fields[0], fields[1], 0, 0)


def si_counter_entries(prefix_lines, unit_lines):
    """Yield every unit combined with every SI prefix.

    *prefix_lines* hold ``<prefix> <value>`` rows; an implicit empty prefix
    with value 0 always comes first.  For each ``<unit> <counter>`` row one
    entry per prefix is produced, in prefix order.
    """
    prefixes = [["", 0]]
    for line in prefix_lines:
        fields = line.split()
        if fields:
            prefixes.append(fields)
    for line in unit_lines:
        fields = line.split()
        if not fields:
            continue
        for prefix in prefixes:
            yield counter_entry(prefix[0] + fields[0], fields[1], prefix[1], 0)


def expanded_counter_entries(lines):
    """Yield an entry per ``<pattern> <counter> <power>`` line.

    The third column is written as ``optional_power_of_ten``.
    """
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        yield counter_entry(fields[0], fields[1], 0, fields[2])


def wari_entries(lines):
    """Yield each line right-stripped with ``*`` replaced by ``ǂ``.

    The lines are passed through otherwise unchanged (blank lines included),
    exactly as the original loop printed them.
    """
    for line in lines:
        yield line.rstrip().replace("*", "ǂ")


def main():
    """Print all counter entries in the original source order."""
    with open("num.txt", "r") as fin:
        for entry in plain_counter_entries(fin):
            print(entry)
    for prefix_path, unit_path in SI_FILE_PAIRS:
        with open(prefix_path) as fin:
            prefix_lines = fin.readlines()
        with open(unit_path, "r") as fin:
            for entry in si_counter_entries(prefix_lines, fin):
                print(entry)
    with open("num_expand.txt") as fin:
        for entry in expanded_counter_entries(fin):
            print(entry)
    with open("num_wari.txt") as fin:
        for entry in wari_entries(fin):
            print(entry)


if __name__ == "__main__":
    main()


# ---- /src/dic/ja/raw/create_dic_num_prefix_counter.py ----
# -*- coding: utf-8 -*-
"""Print the numeric prefix-counter dictionary as JSON lines."""


def num_prefix_counter_entries(lines):
    """Yield an entry per ``<pattern> <counter> <option>`` line.

    ``SI_prefix`` and ``optional_power_of_ten`` are always 0.  Blank lines
    are skipped.
    """
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        yield ("{\"pattern\":\"" + fields[0] + "\", \"counter\":\"" + fields[1]
               + "\", \"SI_prefix\":0, \"optional_power_of_ten\":0,"
               + " \"ordinary\":false, \"option\":\"" + fields[2] + "\"}")


def main():
    """Convert num_prefix_counter.txt to dictionary lines on stdout."""
    with open("num_prefix_counter.txt", "r") as fin:
        for entry in num_prefix_counter_entries(fin):
            print(entry)


if __name__ == "__main__":
    main()


# ---- /src/dic/ja/raw/create_dic_number_modifier.py ----
# -*- coding: utf-8 -*-
"""Print a prefix/suffix modifier dictionary for the file given as argv[1]."""
import sys


def modifier_entries(lines):
    """Yield ``{"pattern":..., "process_type":...}`` per non-blank line.

    Comparison symbols in the type column are spelled out: ``<=`` becomes
    ``or_over``, ``>=`` ``or_less``, ``<`` ``over`` and ``>`` ``less``.  The
    rewrite is applied to the type column only; the original ran it over the
    whole line and would have mangled a pattern containing ``<`` or ``>``.
    """
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        process_type = (fields[1].replace("<=", "or_over")
                        .replace(">=", "or_less")
                        .replace("<", "over")
                        .replace(">", "less"))
        yield ("{\"pattern\":\"" + fields[0]
               + "\", \"process_type\":\"" + process_type + "\"}")


def main():
    """Convert the modifier file named on the command line."""
    if len(sys.argv) < 2:
        # Explicit usage message instead of a bare IndexError.
        sys.exit("usage: create_dic_number_modifier.py <modifier file>")
    with open(sys.argv[1], "r") as fin:
        for entry in modifier_entries(fin):
            print(entry)


if __name__ == "__main__":
    main()


# ---- /src/dic/ja/raw/create_dic_reltime.py ----
# -*- coding: utf-8 -*-
"""Print the relative-time expression dictionary as JSON lines.

After the generated entries the script copies reltime_specific.txt and the
previously generated ../abstime_expression_json.txt to stdout unchanged
(right-stripped).
"""


def create_list_expression(fugou, lst_org):
    """Render comma-separated positions as an array, each prefixed by *fugou*.

    ``create_list_expression("+", "y,m")`` gives ``["+y", "+m"]``.  An empty
    *lst_org* yields ``"[]"``; the original emptiness check was unreachable
    because ``"".split(",")`` returns ``[""]``.
    """
    items = lst_org.split(",") if lst_org else []
    return "[" + ", ".join("\"" + fugou + item + "\"" for item in items) + "]"


def reltime_entries(position_lines, option_lines):
    """Yield two entries for every position/option combination.

    *position_lines* hold ``<pattern> <positions>``; *option_lines* hold
    ``<suffix> <sign> [<process type>]``.  The first entry is pattern +
    suffix; the second inserts 半 between them and adds the ``han`` tag.
    Blank lines in either input are skipped.
    """
    options = [line.split() for line in option_lines]
    options = [fields for fields in options if fields]
    for line in position_lines:
        fields = line.split()
        if not fields:
            continue
        pattern = fields[0].replace("*", "ǂ")
        for option in options:
            positions_expr = create_list_expression(option[1], fields[1])
            # Only three-column option rows carry a process type.
            tags = [option[2]] if len(option) == 3 else []
            for infix, extra in (("", []), ("半", ["han"])):
                tag_expr = ("[" + ", ".join("\"" + tag + "\""
                                            for tag in tags + extra) + "]")
                yield ("{\"pattern\":\"" + pattern + infix + option[0]
                       + "\", \"corresponding_time_position\":" + positions_expr
                       + ", \"process_type\":" + tag_expr
                       + ", \"ordinary\":false, \"option\":\"\"}")


def main():
    """Print generated entries, then the two pass-through files."""
    # The option file is read once instead of once per position line.
    with open("reltime_time_option.txt", "r") as fin:
        option_lines = fin.readlines()
    with open("reltime_time_position.txt", "r") as fin:
        for entry in reltime_entries(fin, option_lines):
            print(entry)
    for path in ("reltime_specific.txt", "../abstime_expression_json.txt"):
        with open(path, "r") as fin:
            for line in fin:
                print(line.rstrip())


if __name__ == "__main__":
    main()


# ---- /src/dic/ja/raw/create_dic_reltime_prefix_counter.py ----
# Head of this file kept verbatim from the replaced dump line; the file
# continues on the following dump line:
# 1 | # -*- coding: utf-8 -*- 2 | 3 | fin = open("reltime_prefix_counter.txt", "r") 4 | for line in fin.readlines() : 5
| lst = line.rstrip().split() 6 | print "{\"pattern\":\""+lst[0]+"\", \"corresponding_time_position\":[\""+lst[1]+"\"], \"process_type\":[\""+lst[2]+"\"], \"ordinary\":false, \"option\":\"add_relation\"}" 7 | -------------------------------------------------------------------------------- /src/dic/ja/raw/duration_prefix_counter.txt: -------------------------------------------------------------------------------- 1 | 週 /week 2 | 月 /month 3 | 年 /year 4 | -------------------------------------------------------------------------------- /src/dic/ja/raw/duration_setouji.txt: -------------------------------------------------------------------------------- 1 | 約 about 2 | だいたい about 3 | ほぼ about 4 | およそ about 5 | ほとんど about 6 | 全 none 7 | ちょうど none 8 | 第 ordinary 9 | ~ kara_prefix 10 | 〜 kara_prefix 11 | ~ kara_prefix 12 | - kara_prefix 13 | − kara_prefix 14 | ー kara_prefix 15 | から kara_prefix -------------------------------------------------------------------------------- /src/dic/ja/raw/duration_setubiji.txt: -------------------------------------------------------------------------------- 1 | 目 ordinary 2 | 以下 >= 3 | 以前 >= 4 | 以内 >= 5 | まで made 6 | 迄 >= 7 | 未満 > 8 | 以上 <= 9 | 以降 <= 10 | 超 <= 11 | 越え <= 12 | 超え <= 13 | ~ kara_suffix 14 | 〜 kara_suffix 15 | ~ kara_suffix 16 | - kara_suffix 17 | − kara_suffix 18 | ー kara_suffix 19 | から kara_suffix 20 | くらい about 21 | ばかり about 22 | 前後 about 23 | 程度 about 24 | ほど about 25 | 近く about 26 | 頃 about 27 | ごろ about 28 | 余り kyou 29 | 強 kyou 30 | 弱 jaku 31 | 台 dai 32 | 代 dai 33 | 毎 per 34 | -------------------------------------------------------------------------------- /src/dic/ja/raw/duration_time_position.txt: -------------------------------------------------------------------------------- 1 | 世紀 seiki 2 | 年間 y 3 | 年 y 4 | ヶ月 m 5 | か月 m 6 | カ月 m 7 | ヵ月 m 8 | ケ月 m 9 | 箇月 m 10 | 週間 w 11 | 日間 d 12 | 時間 h 13 | 分間 mn 14 | 秒間 s 15 | 年 y 16 | 月 m 17 | 週 w 18 | 日 d 19 | 分 mn 20 | 秒 s 21 | 年*ヶ月 y,m 22 | 時間*分 h,mn 23 | 分*秒 mn,s 
-------------------------------------------------------------------------------- /src/dic/ja/raw/inappropriate_strings.txt: -------------------------------------------------------------------------------- 1 | 一切 2 | 一部 3 | 一連 4 | 三振 5 | 一段 6 | 一体 7 | 九州 8 | 四国 9 | 一種 10 | 一番 -------------------------------------------------------------------------------- /src/dic/ja/raw/make_dictionary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python create_dic_number_modifier.py num_settouji.txt > ../num_prefix_json.txt 3 | python create_dic_number_modifier.py num_setubiji.txt > ../num_suffix_json.txt 4 | python create_dic_number_modifier.py abstime_settouji.txt > ../abstime_prefix_json.txt 5 | python create_dic_number_modifier.py abstime_setubiji.txt > ../abstime_suffix_json.txt 6 | python create_dic_number_modifier.py reltime_settouji.txt > ../reltime_prefix_json.txt 7 | python create_dic_number_modifier.py abstime_setubiji.txt > ../reltime_suffix_json.txt #reltime自体の接尾辞は存在しない。相対絶対表現でabsの接尾辞を使う 8 | python create_dic_number_modifier.py duration_setouji.txt > ../duration_prefix_json.txt 9 | python create_dic_number_modifier.py duration_setubiji.txt > ../duration_suffix_json.txt 10 | python create_dic_num.py > ../num_counter_json.txt 11 | python create_dic_num_prefix_counter.py > ../num_prefix_counter_json.txt 12 | python create_dic_abstime.py > ../abstime_expression_json.txt 13 | python create_dic_abstime_prefix_counter.py > ../abstime_prefix_counter_json.txt 14 | python create_dic_reltime.py > ../reltime_expression_json.txt 15 | python create_dic_reltime_prefix_counter.py > ../reltime_prefix_counter_json.txt 16 | python create_dic_duration.py > ../duration_expression_json.txt 17 | python create_dic_inappropriate.py > ../inappropriate_strings_json.txt -------------------------------------------------------------------------------- /src/dic/ja/raw/make_dictionary.sh~: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python create_dic_number_modifier.py num_settouji.txt > ../num_prefix_json.txt 3 | python create_dic_number_modifier.py num_setubiji.txt > ../num_suffix_json.txt 4 | python create_dic_number_modifier.py abstime_settouji.txt > ../abstime_prefix_json.txt 5 | python create_dic_number_modifier.py abstime_setubiji.txt > ../abstime_suffix_json.txt 6 | python create_dic_number_modifier.py reltime_settouji.txt > ../reltime_prefix_json.txt 7 | python create_dic_number_modifier.py duration_setouji.txt > ../duration_prefix_json.txt 8 | python create_dic_number_modifier.py duration_setubiji.txt > ../duration_suffix_json.txt 9 | python create_dic_num.py > ../num_counter_json.txt 10 | python create_dic_abstime.py > ../abstime_expression_json.txt 11 | python create_dic_abstime_prefix_counter.py > ../abstime_prefix_counter_json.txt 12 | python create_dic_reltime.py > ../reltime_expression_json.txt 13 | python create_dic_duration.py > ../duration_expression_json.txt 14 | -------------------------------------------------------------------------------- /src/dic/ja/raw/num.txt: -------------------------------------------------------------------------------- 1 | カナダドル カナダドル 2 | シンガポールドル シンガポールドル 3 | オクターブ オクターブ 4 | パーセント % 5 | オクターヴ オクターブ 6 | フィート フィート 7 | カラット カラット 8 | グループ グループ 9 | USドル ドル 10 | タイトル タイトル 11 | シリーズ シリーズ 12 | ポイント ポイント 13 | ジャンル ジャンル 14 | ステージ ステージ 15 | パターン パターン 16 | ラウンド ラウンド 17 | フラン フラン 18 | クローネ クローネ 19 | ポンド ポンド 20 | ユーロ ユーロ 21 | レース レース 22 | ウォン ウォン 23 | ルピー ルピー 24 | インチ インチ 25 | ノット ノット 26 | モーラ モーラ 27 | コース コース 28 | ページ ページ 29 | テイク テイク 30 | タイプ タイプ 31 | ゲーム ゲーム 32 | チーム チーム 33 | 新ペソ 新ペソ 34 | ペソ ペソ 35 | 選手 選手 36 | 拍子 拍子 37 | 音節 音節 38 | 区画 区画 39 | 切れ 切れ 40 | 人前 人前 41 | ドル ドル 42 | 海里 海里 43 | ペア ペア 44 | 種類 種類 45 | 集落 集落 46 | 手法 手法 47 | 言語 言語 48 | 地域 地域 49 | 議席 議席 50 | カ国 カ国 51 | ヶ国 カ国 52 | か国 カ国 53 | ケ国 カ国 54 | ヵ国 カ国 55 | 年分 年分 56 | 民族 民族 57 | 種目 種目 58 | 分割 分割 59 | 
母音 母音 60 | 箇所 箇所 61 | ケ所 箇所 62 | ヶ所 箇所 63 | ヵ所 箇所 64 | カ所 箇所 65 | か所 箇所 66 | 個所 箇所 67 | 文節 文節 68 | 回生 回生 69 | 単位 単位 70 | 次元 次元 71 | 連勝 連勝 72 | 連敗 連敗 73 | 重奏 重奏 74 | 年制 年制 75 | 試合 試合 76 | 文字 文字 77 | 作品 作品 78 | 世代 世代 79 | 大会 大会 80 | 得点 得点 81 | 方向 方向 82 | 店舗 店舗 83 | 世帯 世帯 84 | 師団 師団 85 | 艦隊 艦隊 86 | 要素 要素 87 | 領域 領域 88 | 音素 音素 89 | 段階 段階 90 | 連隊 連隊 91 | 階級 階級 92 | 連覇 連覇 93 | 路線 路線 94 | bite バイト 95 | 便 便 96 | 勝 勝 97 | 敗 敗 98 | 等 等 99 | 人 人 100 | 個 個 101 | つ つ 102 | 枚 枚 103 | 面 面 104 | 段 段 105 | 本 本 106 | 匹 匹 107 | 羽 羽 108 | 灯 灯 109 | 頭 頭 110 | 本 本 111 | 張 張 112 | 戸 戸 113 | 軒 軒 114 | 棟 棟 115 | 杯 杯 116 | 匹 匹 117 | 枚 枚 118 | 架 架 119 | 体 体 120 | 柱 柱 121 | 府 府 122 | 党 党 123 | 氏 氏 124 | 団体 団体 125 | 局 局 126 | 番 番 127 | 脚 脚 128 | 本 本 129 | 基 基 130 | 着 着 131 | 具 具 132 | 羽 羽 133 | 頭 頭 134 | 席 席 135 | 献 献 136 | 柄 柄 137 | 玉 玉 138 | 杯 杯 139 | 巻 巻 140 | 枝 枝 141 | 尾 尾 142 | 港 港 143 | 掛 掛 144 | 番 番 145 | 封 封 146 | 筋 筋 147 | 挺 挺 148 | 条 条 149 | 錠 錠 150 | 丈 丈 151 | 幅 幅 152 | 株 株 153 | 刎 刎 154 | 座 座 155 | 騎 騎 156 | 行 行 157 | 服 服 158 | 包 包 159 | 果 果 160 | 菓 菓 161 | 足 足 162 | 領 領 163 | 丁 丁 164 | 俵 俵 165 | 膳 膳 166 | 喉 喉 167 | 斤 斤 168 | 叺 叺 169 | 貫 貫 170 | 篇 篇 171 | 尊 尊 172 | 棹 棹 173 | 台 台 174 | 両 両 175 | 連 連 176 | 部 部 177 | 頁 ページ 178 | 球 球 179 | 部 部 180 | 句 句 181 | 門 門 182 | 問 問 183 | 戦 戦 184 | 畳 畳 185 | 棹 棹 186 | 反 反 187 | 卓 卓 188 | 口 口 189 | 壷 壷 190 | 通 通 191 | 振 振 192 | 腰 腰 193 | 剣 剣 194 | 刀 刀 195 | 票 票 196 | 帖 帖 197 | 句 句 198 | 輪 輪 199 | 片 片 200 | 機 機 201 | 名 名 202 | 拍 拍 203 | 躯 躯 204 | 隻 隻 205 | 粒 粒 206 | 顆 顆 207 | 札 札 208 | 冊 冊 209 | 品 品 210 | ℃ ℃ 211 | rad rad 212 | 円 円 213 | 種 種類 214 | 級 級 215 | 度 度 216 | こ 個 217 | 倍 倍 218 | % % 219 | 回 回 220 | 弦 弦 221 | 校 校 222 | 次 次 223 | 項 項 224 | 歳 歳 225 | 才 歳 226 | 国 国 227 | 州 州 228 | 件 件 229 | 区 区 230 | 話 話 231 | 選 選 232 | 位 位 233 | 合 合 234 | 階 階 235 | 波 波 236 | 節 節 237 | bit ビット 238 | 期 期 239 | 切 切 240 | 音 音 241 | 手 手 242 | 尺 尺 243 | 寸 寸 244 | 県 県 245 | 章 章 246 | 泊 泊 247 | 曲 曲 248 | 列 駅 249 | 線 線 250 | 社 社 251 | 弾 弾 252 | 組 組 253 | 役 役 254 | 桁 
桁 255 | 字 字 256 | 点 点 257 | 店 店 258 | 石 石 259 | 版 版 260 | 藩 藩 261 | 号 号 262 | 課 課 263 | 作 作 264 | 集 集 265 | 州 州 266 | 周 周 267 | 袋 袋 268 | rpm rpm 269 | rpm rpm 270 | 代 代 271 | 項 項 272 | ° 度 273 | % % 274 | % % 275 | 日分 日分 276 | ヶ月分 ヶ月分 277 | 年分 年分 278 | 行 行 279 | 碗 碗 280 | 台分 台分 281 | 人分 人分 282 | 倍速 倍速 283 | コ 個 284 | こま コマ 285 | コマ コマ 286 | マルク マルク 287 | リラ リラ 288 | ペセタ ペセタ 289 | 箱 箱 290 | カウント カウント 291 | ハイ ハイ 292 | KB KB 293 | MB MB 294 | GB GB 295 | TB TB 296 | PB PB 297 | KB KB 298 | MB MB 299 | GB GB 300 | TB TB 301 | PB PB 302 | 事例 事例 303 | 周年 周年 304 | 例 例 305 | ppm ppm 306 | G G 307 | G G 308 | 色 色 309 | 気圧 気圧 310 | 光年 光年 311 | 里 里 312 | セット セット -------------------------------------------------------------------------------- /src/dic/ja/raw/num_SItanni_hankaku.txt: -------------------------------------------------------------------------------- 1 | mol mol 2 | bps bps 3 | m/h m/h 4 | m/s m/s 5 | m/m m/m 6 | g/l g/l 7 | N/m2 N/m2 8 | cd cd 9 | Pa Pa 10 | Ω Ω 11 | Wb Wb 12 | Hz Hz 13 | sr sr 14 | ha ha 15 | cc cc 16 | m m 17 | g g 18 | N N 19 | l l 20 | s s 21 | A A 22 | K K 23 | J J 24 | W W 25 | C C 26 | V V 27 | F F 28 | S S 29 | T T 30 | H H 31 | Sv Sv 32 | B バイト 33 | Bq Bq 34 | dB dB 35 | pixel pixel 36 | cal cal -------------------------------------------------------------------------------- /src/dic/ja/raw/num_SItanni_katakana.txt: -------------------------------------------------------------------------------- 1 | モル mol 2 | カンデラ cd 3 | パスカル Pa 4 | オーム Ω 5 | ウェーバ Wb 6 | ヘルツ Hz 7 | ステラジアン sr 8 | ヘクタール ha 9 | シーシー cc 10 | メートル m 11 | グラム g 12 | ニュートン N 13 | リットル l 14 | アンペア A 15 | ケルビン K 16 | ジュール J 17 | ワット W 18 | クーロン C 19 | ボルト V 20 | ファラド F 21 | ジーメンス S 22 | テスラ T 23 | ヘンリー H 24 | シーベルト Sv 25 | バイト バイト 26 | ベクレル Bq 27 | デシベル dB 28 | ピクセル pixel 29 | カロリー cal 30 | ビット ビット 31 | マイル マイル 32 | フィート フィート 33 | ヤード ヤード 34 | インチ インチ 35 | エーカー エーカー 36 | オンス オンス 37 | パイント パイント 38 | ガロン ガロン 39 | バレル バレル 40 | オンス オンス 41 | ポンド ポンド 
-------------------------------------------------------------------------------- /src/dic/ja/raw/num_SItanni_settouji_hankaku.txt: -------------------------------------------------------------------------------- 1 | G 9 2 | M 6 3 | k 3 4 | h 3 5 | da 2 6 | d -1 7 | c -2 8 | m -3 9 | μ -6 10 | n -9 11 | p -12 -------------------------------------------------------------------------------- /src/dic/ja/raw/num_SItanni_settouji_katakana.txt: -------------------------------------------------------------------------------- 1 | ギガ 9 2 | メガ 6 3 | キロ 3 4 | ヘクト 3 5 | デシ -1 6 | センチ -2 7 | ミリ -3 8 | マイクロ -6 9 | ナノ -9 10 | ピコ -12 11 | -------------------------------------------------------------------------------- /src/dic/ja/raw/num_SItanni_settouji_zenkaku.txt: -------------------------------------------------------------------------------- 1 | G 9 2 | M 6 3 | k 3 4 | h 3 5 | da 2 6 | d -1 7 | c -2 8 | m -3 9 | μ -6 10 | n -9 11 | p -12 -------------------------------------------------------------------------------- /src/dic/ja/raw/num_SItanni_zenkaku.txt: -------------------------------------------------------------------------------- 1 | mol mol 2 | bps bps 3 | m/h m/h 4 | m/s m/s 5 | m/m m/m 6 | g/l g/l 7 | N/m2 N/m2 8 | cd cd 9 | Pa Pa 10 | Wb Wb 11 | Hz Hz 12 | sr sr 13 | ha ha 14 | cc cc 15 | m m 16 | g g 17 | N N 18 | l l 19 | s s 20 | A A 21 | K K 22 | J J 23 | W W 24 | C C 25 | V V 26 | F F 27 | S S 28 | T T 29 | H H 30 | Sv Sv 31 | B B -------------------------------------------------------------------------------- /src/dic/ja/raw/num_expand.txt: -------------------------------------------------------------------------------- 1 | t g 6 2 | トン g 6 3 | センチ m -2 4 | キロ m 3 5 | ミリ m -3 6 | 平方キロメートル m2 6 7 | 平方ミリメートル m2 -6 8 | 立方キロメートル m3 9 9 | 立方ミリメートル m3 -9 10 | km² m2 6 11 | mm² m2 -6 12 | km³ m3 9 13 | mm³ m3 -9 14 | -------------------------------------------------------------------------------- /src/dic/ja/raw/num_prefix_counter.txt: 
-------------------------------------------------------------------------------- 1 | ¥ 円 counter 2 | ¥ 円 counter 3 | $ ドル counter 4 | $ ドル counter 5 | € ユーロ counter 6 | £ ポンド counter 7 | 小さじ 小さじ counter 8 | 大さじ 大さじ counter 9 | 時速 /h add_suffix_counter 10 | 毎時 /h add_suffix_counter 11 | 分速 /m add_suffix_counter 12 | 毎分 /m add_suffix_counter 13 | 秒速 /s add_suffix_counter 14 | 毎秒 /s add_suffix_counter 15 | 週 /week add_suffix_counter 16 | 月 /month add_suffix_counter 17 | 年 /year add_suffix_counter 18 | 最大 * saidai 19 | 最長 * saityou 20 | 最高 * saikou 21 | 華氏 ℉ counter 22 | 摂氏 ℃ counter 23 | 風速 * fusoku 24 | 水温 * suion 25 | 北緯 * hokui 26 | 南緯 * nanni 27 | 東経 * toukei 28 | 西経 * seikei -------------------------------------------------------------------------------- /src/dic/ja/raw/num_settouji.txt: -------------------------------------------------------------------------------- 1 | 第 ordinary 2 | 約 about 3 | だいたい about 4 | ほぼ about 5 | およそ about 6 | ほとんど about 7 | 全 none 8 | ちょうど none 9 | ~ kara_prefix 10 | 〜 kara_prefix 11 | ~ kara_prefix 12 | - kara_prefix 13 | − kara_prefix 14 | ー kara_prefix 15 | から kara_prefix -------------------------------------------------------------------------------- /src/dic/ja/raw/num_setubiji.txt: -------------------------------------------------------------------------------- 1 | 目 ordinary 2 | 以下 >= 3 | 以前 >= 4 | 以内 >= 5 | まで made 6 | 迄 >= 7 | 未満 > 8 | 以上 <= 9 | 以降 <= 10 | 超 <= 11 | 越え <= 12 | 超え <= 13 | ~ kara_suffix 14 | 〜 kara_suffix 15 | ~ kara_suffix 16 | - kara_suffix 17 | − kara_suffix 18 | ー kara_suffix 19 | から kara_suffix 20 | くらい about 21 | ばかり about 22 | 前後 about 23 | 程度 about 24 | ほど about 25 | 近く about 26 | 頃 about 27 | ごろ about 28 | 余り kyou 29 | 強 kyou 30 | 弱 jaku 31 | 台 dai 32 | 代 dai 33 | 毎 per 34 | 半 han 35 | /時 /h 36 | /分 /min 37 | /秒 /sec 38 | /時 /h 39 | /分 /min 40 | /秒 /sec 41 | -------------------------------------------------------------------------------- /src/dic/ja/raw/num_wari.txt: 
-------------------------------------------------------------------------------- 1 | {"pattern":"割", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"} 2 | {"pattern":"分", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"} 3 | {"pattern":"厘", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"} 4 | {"pattern":"割*分", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"} 5 | {"pattern":"分*厘", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"} 6 | {"pattern":"割*分*厘", "counter":"%", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":"wari"} -------------------------------------------------------------------------------- /src/dic/ja/raw/reltime_prefix_counter.txt: -------------------------------------------------------------------------------- 1 | 去年 y -1 2 | 昨年 y -1 3 | 一昨年 y -2 4 | 今年 y 0 5 | 来年 y +1 6 | 先月 m -1 7 | 先々月 m -2 8 | 今月 m 0 9 | 来月 m +1 10 | 来来月 m +2 11 | 昨日 d -1 12 | 一昨日 d -2 13 | 今日 d 0 14 | 本日 d 0 15 | 明日 d +1 16 | 明後日 d +2 -------------------------------------------------------------------------------- /src/dic/ja/raw/reltime_settouji.txt: -------------------------------------------------------------------------------- 1 | 約 about 2 | だいたい about 3 | ほぼ about 4 | およそ about 5 | ほとんど about 6 | 全 none 7 | ちょうど none 8 | ~ kara_prefix 9 | 〜 kara_prefix 10 | ~ kara_prefix 11 | - kara_prefix 12 | − kara_prefix 13 | ー kara_prefix 14 | から kara_prefix -------------------------------------------------------------------------------- /src/dic/ja/raw/reltime_specific.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/dic/ja/raw/reltime_specific.txt 
-------------------------------------------------------------------------------- /src/dic/ja/raw/reltime_time_option.txt: -------------------------------------------------------------------------------- 1 | 前 - 2 | 以上前 - or_over 3 | くらい前 - about 4 | ぐらい前 - about 5 | ほど前 - about 6 | 程度前 - about 7 | ばかり前 - about 8 | 近く前 - about 9 | より前 - over 10 | よりも前 - over 11 | 後 + 12 | 以上後 + or_over 13 | より後 + over 14 | よりも後 + over 15 | ほど後 + about 16 | くらい後 + about 17 | ぐらい後 + about 18 | 程度後 + about 19 | ばかり後 + about 20 | 近く後 + about -------------------------------------------------------------------------------- /src/dic/ja/raw/reltime_time_position.txt: -------------------------------------------------------------------------------- 1 | 世紀 seiki 2 | 年 y 3 | ヶ月 m 4 | か月 m 5 | カ月 m 6 | ヵ月 m 7 | ケ月 m 8 | 箇月 m 9 | 週 w 10 | 週間 w 11 | 日 d 12 | 日間 d 13 | 時間 h 14 | 分 mn 15 | 秒 s 16 | 年*ヶ月 y,m 17 | 年*ヶ月*日間 y,m,d 18 | -------------------------------------------------------------------------------- /src/dic/ja/raw/reltime_time_pre_option.txt: -------------------------------------------------------------------------------- 1 | 半 han -------------------------------------------------------------------------------- /src/dic/ja/reltime_prefix_counter_json.txt: -------------------------------------------------------------------------------- 1 | {"pattern":"去年", "corresponding_time_position":["y"], "process_type":["-1"], "ordinary":false, "option":"add_relation"} 2 | {"pattern":"昨年", "corresponding_time_position":["y"], "process_type":["-1"], "ordinary":false, "option":"add_relation"} 3 | {"pattern":"一昨年", "corresponding_time_position":["y"], "process_type":["-2"], "ordinary":false, "option":"add_relation"} 4 | {"pattern":"今年", "corresponding_time_position":["y"], "process_type":["0"], "ordinary":false, "option":"add_relation"} 5 | {"pattern":"来年", "corresponding_time_position":["y"], "process_type":["+1"], "ordinary":false, "option":"add_relation"} 6 | {"pattern":"先月", 
"corresponding_time_position":["m"], "process_type":["-1"], "ordinary":false, "option":"add_relation"} 7 | {"pattern":"先々月", "corresponding_time_position":["m"], "process_type":["-2"], "ordinary":false, "option":"add_relation"} 8 | {"pattern":"今月", "corresponding_time_position":["m"], "process_type":["0"], "ordinary":false, "option":"add_relation"} 9 | {"pattern":"来月", "corresponding_time_position":["m"], "process_type":["+1"], "ordinary":false, "option":"add_relation"} 10 | {"pattern":"来来月", "corresponding_time_position":["m"], "process_type":["+2"], "ordinary":false, "option":"add_relation"} 11 | {"pattern":"昨日", "corresponding_time_position":["d"], "process_type":["-1"], "ordinary":false, "option":"add_relation"} 12 | {"pattern":"一昨日", "corresponding_time_position":["d"], "process_type":["-2"], "ordinary":false, "option":"add_relation"} 13 | {"pattern":"今日", "corresponding_time_position":["d"], "process_type":["0"], "ordinary":false, "option":"add_relation"} 14 | {"pattern":"本日", "corresponding_time_position":["d"], "process_type":["0"], "ordinary":false, "option":"add_relation"} 15 | {"pattern":"明日", "corresponding_time_position":["d"], "process_type":["+1"], "ordinary":false, "option":"add_relation"} 16 | {"pattern":"明後日", "corresponding_time_position":["d"], "process_type":["+2"], "ordinary":false, "option":"add_relation"} 17 | -------------------------------------------------------------------------------- /src/dic/ja/reltime_prefix_json.txt: -------------------------------------------------------------------------------- 1 | {"pattern":"約", "process_type":"about"} 2 | {"pattern":"だいたい", "process_type":"about"} 3 | {"pattern":"ほぼ", "process_type":"about"} 4 | {"pattern":"およそ", "process_type":"about"} 5 | {"pattern":"ほとんど", "process_type":"about"} 6 | {"pattern":"全", "process_type":"none"} 7 | {"pattern":"ちょうど", "process_type":"none"} 8 | {"pattern":"~", "process_type":"kara_prefix"} 9 | {"pattern":"〜", "process_type":"kara_prefix"} 10 | {"pattern":"~", 
"process_type":"kara_prefix"} 11 | {"pattern":"-", "process_type":"kara_prefix"} 12 | {"pattern":"−", "process_type":"kara_prefix"} 13 | {"pattern":"ー", "process_type":"kara_prefix"} 14 | {"pattern":"から", "process_type":"kara_prefix"} 15 | -------------------------------------------------------------------------------- /src/dic/ja/reltime_suffix_json.txt: -------------------------------------------------------------------------------- 1 | {"pattern":"以前", "process_type":"or_less"} 2 | {"pattern":"まで", "process_type":"made"} 3 | {"pattern":"迄", "process_type":"or_less"} 4 | {"pattern":"より前", "process_type":"less"} 5 | {"pattern":"以降", "process_type":"or_over"} 6 | {"pattern":"より後", "process_type":"over"} 7 | {"pattern":"~", "process_type":"kara_suffix"} 8 | {"pattern":"〜", "process_type":"kara_suffix"} 9 | {"pattern":"~", "process_type":"kara_suffix"} 10 | {"pattern":"-", "process_type":"kara_suffix"} 11 | {"pattern":"−", "process_type":"kara_suffix"} 12 | {"pattern":"ー", "process_type":"kara_suffix"} 13 | {"pattern":"から", "process_type":"kara_suffix"} 14 | {"pattern":"くらい", "process_type":"about"} 15 | {"pattern":"ばかり", "process_type":"about"} 16 | {"pattern":"前後", "process_type":"about"} 17 | {"pattern":"近く", "process_type":"about"} 18 | {"pattern":"頃", "process_type":"about"} 19 | {"pattern":"ごろ", "process_type":"about"} 20 | {"pattern":"頭", "process_type":"zenhan"} 21 | {"pattern":"前半", "process_type":"zenhan"} 22 | {"pattern":"前記", "process_type":"zenhan"} 23 | {"pattern":"初頭", "process_type":"zenhan"} 24 | {"pattern":"初期", "process_type":"zenhan"} 25 | {"pattern":"初め", "process_type":"zenhan"} 26 | {"pattern":"始め", "process_type":"zenhan"} 27 | {"pattern":"はじめ", "process_type":"zenhan"} 28 | {"pattern":"後半", "process_type":"kouhan"} 29 | {"pattern":"後期", "process_type":"kouhan"} 30 | {"pattern":"終盤", "process_type":"kouhan"} 31 | {"pattern":"終わり", "process_type":"kouhan"} 32 | {"pattern":"末", "process_type":"kouhan"} 33 | {"pattern":"半ば", 
"process_type":"nakaba"} 34 | {"pattern":"中期", "process_type":"nakaba"} 35 | {"pattern":"中盤", "process_type":"nakaba"} 36 | {"pattern":"中頃", "process_type":"nakaba"} 37 | {"pattern":"中ごろ", "process_type":"nakaba"} 38 | {"pattern":"中旬", "process_type":"nakaba"} 39 | {"pattern":"上旬", "process_type":"joujun"} 40 | {"pattern":"中旬", "process_type":"tyujun"} 41 | {"pattern":"下旬", "process_type":"gejun"} 42 | {"pattern":"PM", "process_type":"gogo"} 43 | {"pattern":"AM", "process_type":"gozen"} 44 | {"pattern":"PM", "process_type":"gogo"} 45 | {"pattern":"AM", "process_type":"gozen"} 46 | {"pattern":"PM", "process_type":"gogo"} 47 | {"pattern":"AM", "process_type":"gozen"} 48 | {"pattern":" PM", "process_type":"gogo"} 49 | {"pattern":" AM", "process_type":"gozen"} 50 | -------------------------------------------------------------------------------- /src/dic/zh/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/dic/zh/.DS_Store -------------------------------------------------------------------------------- /src/dic/zh/._chinese_character.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/dic/zh/._chinese_character.txt -------------------------------------------------------------------------------- /src/dic/zh/abstime_expression_json.txt: -------------------------------------------------------------------------------- 1 | {"pattern":"世紀", "corresponding_time_position":["seiki"], "process_type":[], "ordinary":false, "option":""} 2 | {"pattern":"年", "corresponding_time_position":["y"], "process_type":[], "ordinary":false, "option":""} 3 | {"pattern":"年ǂ月", "corresponding_time_position":["y", "m"], "process_type":[], "ordinary":false, "option":""} 4 | {"pattern":"年ǂ月ǂ日", 
"corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""} 5 | {"pattern":"年ǂ月ǂ日ǂ時", "corresponding_time_position":["y", "m", "d", "h"], "process_type":[], "ordinary":false, "option":""} 6 | {"pattern":"年ǂ月ǂ日ǂ時ǂ分", "corresponding_time_position":["y", "m", "d", "h", "mn"], "process_type":[], "ordinary":false, "option":""} 7 | {"pattern":"年ǂ月ǂ日ǂ時ǂ分ǂ秒", "corresponding_time_position":["y", "m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""} 8 | {"pattern":"年ǂ月ǂ日ǂ:ǂ", "corresponding_time_position":["y", "m", "d", "h", "mn"], "process_type":[], "ordinary":false, "option":""} 9 | {"pattern":"月", "corresponding_time_position":["m"], "process_type":[], "ordinary":false, "option":""} 10 | {"pattern":"月ǂ日", "corresponding_time_position":["m", "d"], "process_type":[], "ordinary":false, "option":""} 11 | {"pattern":"月ǂ日ǂ時", "corresponding_time_position":["m", "d", "h"], "process_type":[], "ordinary":false, "option":""} 12 | {"pattern":"月ǂ日ǂ時ǂ分", "corresponding_time_position":["m", "d", "h", "mn"], "process_type":[], "ordinary":false, "option":""} 13 | {"pattern":"月ǂ日ǂ時ǂ分ǂ秒", "corresponding_time_position":["m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""} 14 | {"pattern":"月ǂ日ǂ:ǂ", "corresponding_time_position":["m", "d", "h", "mn"], "process_type":[], "ordinary":false, "option":""} 15 | {"pattern":"日", "corresponding_time_position":["d"], "process_type":[], "ordinary":false, "option":""} 16 | {"pattern":"日ǂ時", "corresponding_time_position":["d", "h"], "process_type":[], "ordinary":false, "option":""} 17 | {"pattern":"日ǂ時ǂ分", "corresponding_time_position":["d", "h", "mn"], "process_type":[], "ordinary":false, "option":""} 18 | {"pattern":"日ǂ時ǂ分ǂ秒", "corresponding_time_position":["d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""} 19 | {"pattern":"日ǂ:ǂ", "corresponding_time_position":["d", "h", "mn"], "process_type":[], "ordinary":false, "option":""} 20 | {"pattern":"時", 
"corresponding_time_position":["h"], "process_type":[], "ordinary":false, "option":""} 21 | {"pattern":"時ǂ分", "corresponding_time_position":["h", "mn"], "process_type":[], "ordinary":false, "option":""} 22 | {"pattern":"時ǂ分ǂ秒", "corresponding_time_position":["h", "mn", "s"], "process_type":[], "ordinary":false, "option":""} 23 | {"pattern":"分ǂ秒", "corresponding_time_position":["mn", "s"], "process_type":[], "ordinary":false, "option":""} 24 | {"pattern":"/ǂ", "corresponding_time_position":["m", "d"], "process_type":[], "ordinary":false, "option":""} 25 | {"pattern":"/ǂ/ǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""} 26 | {"pattern":"/ǂ", "corresponding_time_position":["m", "d"], "process_type":[], "ordinary":false, "option":""} 27 | {"pattern":"/ǂ/ǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""} 28 | {"pattern":":ǂ", "corresponding_time_position":["h", "mn"], "process_type":[], "ordinary":false, "option":""} 29 | {"pattern":":ǂ", "corresponding_time_position":["h", "mn"], "process_type":[], "ordinary":false, "option":""} 30 | {"pattern":":ǂ:ǂ", "corresponding_time_position":["h", "mn", "s"], "process_type":[], "ordinary":false, "option":""} 31 | {"pattern":":ǂ:ǂ", "corresponding_time_position":["h", "mn", "s"], "process_type":[], "ordinary":false, "option":""} 32 | {"pattern":"年ǂ月ǂ日午前ǂ時", "corresponding_time_position":["y", "m", "d", "h"], "process_type":[], "ordinary":false, "option":""} 33 | {"pattern":"年ǂ月ǂ日午前ǂ時ǂ分", "corresponding_time_position":["y", "m", "d", "h", "mn"], "process_type":[], "ordinary":false, "option":""} 34 | {"pattern":"年ǂ月ǂ日午前ǂ時ǂ分ǂ秒", "corresponding_time_position":["y", "m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""} 35 | {"pattern":"月ǂ日午前ǂ時", "corresponding_time_position":["m", "d", "h"], "process_type":[], "ordinary":false, "option":""} 36 | {"pattern":"月ǂ日午前ǂ時ǂ分", "corresponding_time_position":["m", "d", 
"h", "mn"], "process_type":[], "ordinary":false, "option":""} 37 | {"pattern":"月ǂ日午前ǂ時ǂ分ǂ秒", "corresponding_time_position":["m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""} 38 | {"pattern":"日午前ǂ時", "corresponding_time_position":["d", "h"], "process_type":[], "ordinary":false, "option":""} 39 | {"pattern":"日午前ǂ時ǂ分", "corresponding_time_position":["d", "h", "mn"], "process_type":[], "ordinary":false, "option":""} 40 | {"pattern":"日午前ǂ時ǂ分ǂ秒", "corresponding_time_position":["d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""} 41 | {"pattern":"-ǂ-ǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""} 42 | {"pattern":"−ǂ−ǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""} 43 | {"pattern":"ーǂーǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""} 44 | {"pattern":"ǂ/ǂ/ǂ ǂ:ǂ:ǂ", "corresponding_time_position":["y", "m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""} 45 | {"pattern":"ǂ/ǂ/ǂ ǂ:ǂ:ǂ", "corresponding_time_position":["y", "m", "d", "h", "mn", "s"], "process_type":[], "ordinary":false, "option":""} 46 | {"pattern":".ǂ.ǂ", "corresponding_time_position":["y", "m", "d"], "process_type":[], "ordinary":false, "option":""} 47 | {"pattern":".ǂ", "corresponding_time_position":["y", "m"], "process_type":[], "ordinary":false, "option":""} 48 | {"pattern":"年ǂ月ǂ日午後ǂ時", "corresponding_time_position":["y", "m", "d", "h"], "process_type":["gogo"], "ordinary":false, "option":""} 49 | {"pattern":"年ǂ月ǂ日午後ǂ時ǂ分", "corresponding_time_position":["y", "m", "d", "h", "mn"], "process_type":["gogo"], "ordinary":false, "option":""} 50 | {"pattern":"年ǂ月ǂ日午後ǂ時ǂ分ǂ秒", "corresponding_time_position":["y", "m", "d", "h", "mn", "s"], "process_type":["gogo"], "ordinary":false, "option":""} 51 | {"pattern":"月ǂ日午後ǂ時", "corresponding_time_position":["m", "d", "h"], "process_type":["gogo"], 
"ordinary":false, "option":""} 52 | {"pattern":"月ǂ日午後ǂ時ǂ分", "corresponding_time_position":["m", "d", "h", "mn"], "process_type":["gogo"], "ordinary":false, "option":""} 53 | {"pattern":"月ǂ日午後ǂ時ǂ分ǂ秒", "corresponding_time_position":["m", "d", "h", "mn", "s"], "process_type":["gogo"], "ordinary":false, "option":""} 54 | {"pattern":"日午後ǂ時", "corresponding_time_position":["d", "h"], "process_type":["gogo"], "ordinary":false, "option":""} 55 | {"pattern":"日午後ǂ時ǂ分", "corresponding_time_position":["d", "h", "mn"], "process_type":["gogo"], "ordinary":false, "option":""} 56 | {"pattern":"日午後ǂ時ǂ分ǂ秒", "corresponding_time_position":["d", "h", "mn", "s"], "process_type":["gogo"], "ordinary":false, "option":""} 57 | {"pattern":"年ǂ月ǂ日ǂ時半", "corresponding_time_position":["y", "m", "d", "h"], "process_type":["han"], "ordinary":false, "option":""} 58 | {"pattern":"年ǂ月ǂ日午前ǂ時半", "corresponding_time_position":["y", "m", "d", "h"], "process_type":["han"], "ordinary":false, "option":""} 59 | {"pattern":"年ǂ月ǂ日午後ǂ時半", "corresponding_time_position":["y", "m", "d", "h"], "process_type":["gogo", "han"], "ordinary":false, "option":""} 60 | {"pattern":"月ǂ日ǂ時半", "corresponding_time_position":["m", "d", "h"], "process_type":["han"], "ordinary":false, "option":""} 61 | {"pattern":"月ǂ日午前ǂ時半", "corresponding_time_position":["m", "d", "h"], "process_type":["han"], "ordinary":false, "option":""} 62 | {"pattern":"月ǂ日午後ǂ時半", "corresponding_time_position":["m", "d", "h"], "process_type":["gogo", "han"], "ordinary":false, "option":""} 63 | {"pattern":"日ǂ時半", "corresponding_time_position":["d", "h"], "process_type":["han"], "ordinary":false, "option":""} 64 | {"pattern":"日午前ǂ時半", "corresponding_time_position":["d", "h"], "process_type":["han"], "ordinary":false, "option":""} 65 | {"pattern":"日午後ǂ時半", "corresponding_time_position":["d", "h"], "process_type":["gogo", "han"], "ordinary":false, "option":""} 66 | {"pattern":"時半", "corresponding_time_position":["h"], "process_type":["han"], "ordinary":false, 
"option":""} 67 | -------------------------------------------------------------------------------- /src/dic/zh/chinese_character.txt: -------------------------------------------------------------------------------- 1 | {"character":"〇", "value":0, "NotationType":"09"} 2 | {"character":"一", "value":1, "NotationType":"09"} 3 | {"character":"二", "value":2, "NotationType":"09"} 4 | {"character":"三", "value":3, "NotationType":"09"} 5 | {"character":"四", "value":4, "NotationType":"09"} 6 | {"character":"五", "value":5, "NotationType":"09"} 7 | {"character":"六", "value":6, "NotationType":"09"} 8 | {"character":"七", "value":7, "NotationType":"09"} 9 | {"character":"八", "value":8, "NotationType":"09"} 10 | {"character":"九", "value":9, "NotationType":"09"} 11 | {"character":"十", "value":1, "NotationType":"sen"} 12 | {"character":"百", "value":2, "NotationType":"sen"} 13 | {"character":"千", "value":3, "NotationType":"sen"} 14 | {"character":"万", "value":4, "NotationType":"man"} 15 | {"character":"億", "value":8, "NotationType":"man"} 16 | {"character":"兆", "value":12, "NotationType":"man"} 17 | {"character":"京", "value":16, "NotationType":"man"} 18 | {"character":"零", "value":0, "NotationType":"09"} 19 | {"character":"壹", "value":1, "NotationType":"09"} 20 | {"character":"贰", "value":2, "NotationType":"09"} 21 | {"character":"两", "value":2, "NotationType":"09"} 22 | {"character":"叁", "value":3, "NotationType":"09"} 23 | {"character":"肆", "value":4, "NotationType":"09"} 24 | {"character":"伍", "value":5, "NotationType":"09"} 25 | {"character":"陆", "value":6, "NotationType":"09"} 26 | {"character":"柒", "value":7, "NotationType":"09"} 27 | {"character":"捌", "value":8, "NotationType":"09"} 28 | {"character":"玖", "value":9, "NotationType":"09"} 29 | {"character":"拾", "value":1, "NotationType":"sen"} 30 | {"character":"佰", "value":2, "NotationType":"sen"} 31 | {"character":"仟", "value":3, "NotationType":"sen"} 32 | {"character":"萬", "value":4, "NotationType":"man"} 33 | 
{"character":"亿", "value":8, "NotationType":"man"} -------------------------------------------------------------------------------- /src/dic/zh/num_counter_json.txt: -------------------------------------------------------------------------------- 1 | {"pattern":"元", "counter":"元", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":""} 2 | {"pattern":"美元", "counter":"美元", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":""} 3 | {"pattern":"円", "counter":"円", "SI_prefix":0, "optional_power_of_ten":0, "ordinary":false, "option":""} -------------------------------------------------------------------------------- /src/dictionary_dirpath.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dictionary_dirpath.hpp" 3 | namespace dictionary_dirpath { 4 | std::string get_dictionary_dirpath(){ 5 | return "/usr/local/lib/normalizeNumexp/dic/";}} -------------------------------------------------------------------------------- /src/dictionary_dirpath.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DICTIONARY_DIRPATH_H_ 2 | #define DICTIONARY_DIRPATH_H_ 3 | #include 4 | 5 | namespace dictionary_dirpath { 6 | std::string get_dictionary_dirpath(); 7 | } 8 | 9 | #endif //DICTIONARY_DIRPATH_H_ -------------------------------------------------------------------------------- /src/digit_utility.cpp: -------------------------------------------------------------------------------- 1 | #include "digit_utility.hpp" 2 | #include "dictionary_dirpath.hpp" 3 | #include 4 | //debug 5 | namespace digit_utility { 6 | 7 | std::map string_to_notation_type; 8 | std::map kansuji_09_to_value; 9 | std::map kansuji_kurai_to_power_value; 10 | 11 | struct ChineseCharacter { 12 | template 13 | void serialize(Archive &ar){ 14 | ar & MEMBER(character) & MEMBER(NotationType) & MEMBER(value); 15 | } 16 | 17 | std::string character, NotationType; 18 | int value; 19 | }; 20 | 
21 | void load_json_from_file(const std::string& filepath, pfi::text::json::json& js) { 22 | std::ifstream in(filepath.c_str()); 23 | pfi::text::json::json_parser parser(in); 24 | try { 25 | while (true) { 26 | js.add(parser.parse()); 27 | } 28 | } catch (const pfi::lang::end_of_data&) { 29 | } 30 | } 31 | 32 | template 33 | void load_from_dictionary(const std::string& dictionary_path, std::vector& load_target) { 34 | load_target.clear(); 35 | pfi::text::json::json js = pfi::text::json::json(new pfi::text::json::json_array()); 36 | load_json_from_file(dictionary_path, js); 37 | pfi::text::json::from_json(js, load_target); 38 | } 39 | 40 | void init_kansuji(const std::string& language){ 41 | std::vector chinese_characters; 42 | std::string dictionary_path; 43 | dictionary_path += dictionary_dirpath::get_dictionary_dirpath(); 44 | if(language == "ja"){ 45 | dictionary_path += "ja/chinese_character.txt"; 46 | }else if (language == "zh"){ 47 | dictionary_path += "zh/chinese_character.txt"; 48 | }else { 49 | return; 50 | } 51 | load_from_dictionary(dictionary_path, chinese_characters); 52 | for(int i=0; i(chinese_characters.size()); i++){ 53 | ENotationType notation_type = NOT_NUMBER; 54 | if(chinese_characters[i].NotationType == "09") notation_type = KANSUJI_09; 55 | else if(chinese_characters[i].NotationType == "sen") notation_type = KANSUJI_KURAI_SEN; 56 | else if(chinese_characters[i].NotationType == "man") notation_type = KANSUJI_KURAI_MAN; 57 | string_to_notation_type[chinese_characters[i].character] = notation_type; 58 | if(notation_type == KANSUJI_09) kansuji_09_to_value[chinese_characters[i].character] = chinese_characters[i].value; 59 | else if(notation_type == KANSUJI_KURAI_MAN || notation_type == KANSUJI_KURAI_SEN) kansuji_kurai_to_power_value[chinese_characters[i].character] = chinese_characters[i].value; 60 | } 61 | kansuji_kurai_to_power_value[" "] = 0; 62 | } 63 | 64 | bool is_hankakusuji(const pfi::data::string::uchar uc) { 65 | return 
(pfi::data::string::string_to_uchar("0") <= uc 66 | && uc <= pfi::data::string::string_to_uchar("9")); 67 | } 68 | 69 | bool is_zenkakusuji(const pfi::data::string::uchar uc) { 70 | return (pfi::data::string::string_to_uchar("0") <= uc 71 | && uc <= pfi::data::string::string_to_uchar("9")); 72 | } 73 | 74 | bool is_arabic(const pfi::data::string::uchar uc) { 75 | return (is_hankakusuji(uc) || is_zenkakusuji(uc)); 76 | } 77 | 78 | bool is_notation_type(const pfi::data::string::uchar uc, ENotationType NOTATION_TYPE) { 79 | std::map::const_iterator itr = 80 | string_to_notation_type.find(pfi::data::string::uchar_to_string(uc)); 81 | if (itr == string_to_notation_type.end()) 82 | return 0; 83 | return (itr->second) & NOTATION_TYPE; 84 | } 85 | 86 | bool is_kansuji(const pfi::data::string::uchar uc) { 87 | return is_notation_type(uc, KANSUJI); 88 | } 89 | 90 | bool is_kansuji_09(const pfi::data::string::uchar uc) { 91 | return is_notation_type(uc, KANSUJI_09); 92 | } 93 | 94 | bool is_kansuji_kurai_sen(const pfi::data::string::uchar uc) { 95 | return is_notation_type(uc, KANSUJI_KURAI_SEN); 96 | } 97 | 98 | bool is_kansuji_kurai_man(const pfi::data::string::uchar uc) { 99 | return is_notation_type(uc, KANSUJI_KURAI_MAN); 100 | } 101 | 102 | bool is_kansuji_kurai(const pfi::data::string::uchar uc) { 103 | return is_notation_type(uc, KANSUJI_KURAI); 104 | } 105 | 106 | bool is_comma(const pfi::data::string::uchar uc) { 107 | std::string str = pfi::data::string::uchar_to_string(uc); 108 | return (str == "," || str == "、" || str == ","); 109 | } 110 | 111 | bool is_decimal_point(const pfi::data::string::ustring& ustr) { 112 | std::string str = pfi::data::string::ustring_to_string(ustr); 113 | return (str == "." 
|| str == "・" || str == "."); 114 | } 115 | 116 | bool is_range_expression(const pfi::data::string::ustring& ustr) { 117 | std::string str = pfi::data::string::ustring_to_string(ustr); 118 | return (str == "~" || str == "〜" || str == "~" || str == "-" || str == "−" || str == "ー" || str == "―" || str == "から"); 119 | } 120 | 121 | bool is_number(const pfi::data::string::uchar uc) { 122 | return is_hankakusuji(uc) or is_zenkakusuji(uc) or is_kansuji(uc); 123 | } 124 | 125 | int convert_kansuji_09_to_value(const pfi::data::string::uchar uc) { 126 | std::string str = pfi::data::string::uchar_to_string(uc); 127 | std::map::const_iterator itr = kansuji_09_to_value.find(str); 128 | if (itr == kansuji_09_to_value.end()) { 129 | //例外処理。どうする? 130 | throw "Exception : is not kansuji09"; 131 | } 132 | return (itr->second); 133 | } 134 | 135 | int convert_kansuji_kurai_to_power_value(const pfi::data::string::uchar uc) { 136 | std::string str = pfi::data::string::uchar_to_string(uc); 137 | std::map::const_iterator itr = kansuji_kurai_to_power_value.find(str); 138 | if (itr == kansuji_kurai_to_power_value.end()) { 139 | //例外処理。どうする? 
140 | throw "Exception : is not kansuji_kurai"; 141 | } 142 | return (itr->second); 143 | } 144 | } //namespace digit_utility 145 | -------------------------------------------------------------------------------- /src/digit_utility.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DIGIT_UTILITY_H_ 2 | #define DIGIT_UTILITY_H_ 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace digit_utility { 10 | //const double DOUBLE_NULL = INFINITY; 11 | //std::string dictionary_dirpath("/home/katsuma/src/digit_utils/src/dic/"); 12 | 13 | enum ENotationType { 14 | NOT_NUMBER = 0, 15 | KANSUJI_09 = 1, 16 | KANSUJI_KURAI_SEN = 2, 17 | KANSUJI_KURAI_MAN = 4, 18 | KANSUJI_KURAI = 6, 19 | KANSUJI = 7, 20 | ZENKAKU = 8, 21 | HANKAKU = 16, 22 | }; 23 | /* Record holding an original expression, its start/end positions, a lower/upper value bound and a notation type. Both constructors start with lower bound +INFINITY, upper bound -INFINITY and notation_type NOT_NUMBER. */ 24 | struct Number { 25 | /* Default: empty expression, both positions -1. */ Number() 26 | : original_expression(pfi::data::string::string_to_ustring("")), 27 | position_start(-1), 28 | position_end(-1), 29 | value_lowerbound(INFINITY), 30 | value_upperbound(-INFINITY), 31 | notation_type(NOT_NUMBER) { 32 | } 33 | /* Copies the given expression and positions. The expression is taken by const reference because it is only copied, so const values and temporaries are accepted too. */ 34 | Number(const pfi::data::string::ustring& original_expression, int position_start, int position_end) 35 | : original_expression(original_expression), 36 | position_start(position_start), 37 | position_end(position_end), 38 | value_lowerbound(INFINITY), 39 | value_upperbound(-INFINITY), 40 | notation_type(NOT_NUMBER) { 41 | } 42 | 43 | pfi::data::string::ustring original_expression; 44 | int position_start; 45 | int position_end; 46 | double value_lowerbound; 47 | double value_upperbound; 48 | int notation_type; 49 | }; 50 | 51 | void init_kansuji(const std::string& language); 52 | bool is_hankakusuji(pfi::data::string::uchar uc); 53 | bool is_zenkakusuji(pfi::data::string::uchar uc); 54 | bool is_arabic(pfi::data::string::uchar uc); 55 | bool is_kansuji(pfi::data::string::uchar uc); 56 | bool is_kansuji_09(pfi::data::string::uchar uc); 57 | bool is_kansuji_kurai_sen(pfi::data::string::uchar uc); 58 | 
bool is_kansuji_kurai_man(pfi::data::string::uchar uc); 59 | bool is_kansuji_kurai(pfi::data::string::uchar uc); 60 | bool is_number(pfi::data::string::uchar uc); 61 | bool is_comma(pfi::data::string::uchar uc); 62 | bool is_decimal_point(const pfi::data::string::ustring& ustr); 63 | bool is_range_expression(const pfi::data::string::ustring& ustr); 64 | int convert_kansuji_09_to_value(pfi::data::string::uchar uc); 65 | int convert_kansuji_kurai_to_power_value(pfi::data::string::uchar uc); 66 | } 67 | 68 | #endif //DIGIT_UTILITY_H_ 69 | -------------------------------------------------------------------------------- /src/digit_utility_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "digit_utility.hpp" 4 | 5 | #include 6 | 7 | using namespace digit_utility; 8 | 9 | class DigitUtilityTest: public testing::Test { 10 | public: 11 | void SetUp() { 12 | std::string language("ja"); 13 | init_kansuji(language); 14 | } 15 | void TearDown() { 16 | } 17 | }; 18 | 19 | TEST_F(DigitUtilityTest, isHankaku) { 20 | EXPECT_TRUE(is_hankakusuji(pfi::data::string::string_to_uchar("1"))); 21 | EXPECT_FALSE(is_hankakusuji(pfi::data::string::string_to_uchar("1"))); 22 | EXPECT_FALSE(is_hankakusuji(pfi::data::string::string_to_uchar("一"))); 23 | EXPECT_FALSE(is_hankakusuji(pfi::data::string::string_to_uchar("あ"))); 24 | } 25 | 26 | TEST_F(DigitUtilityTest, isZenkaku) { 27 | EXPECT_FALSE(is_zenkakusuji(pfi::data::string::string_to_uchar("1"))); 28 | EXPECT_TRUE(is_zenkakusuji(pfi::data::string::string_to_uchar("1"))); 29 | EXPECT_FALSE(is_zenkakusuji(pfi::data::string::string_to_uchar("一"))); 30 | EXPECT_FALSE(is_zenkakusuji(pfi::data::string::string_to_uchar("あ"))); 31 | } 32 | 33 | TEST_F(DigitUtilityTest, isArabic) { 34 | EXPECT_TRUE(is_arabic(pfi::data::string::string_to_uchar("1"))); 35 | EXPECT_TRUE(is_arabic(pfi::data::string::string_to_uchar("1"))); 36 | 
EXPECT_FALSE(is_arabic(pfi::data::string::string_to_uchar("一"))); 37 | EXPECT_FALSE(is_arabic(pfi::data::string::string_to_uchar("あ"))); 38 | } 39 | 40 | TEST_F(DigitUtilityTest, isKansuji) { 41 | EXPECT_FALSE(is_kansuji(pfi::data::string::string_to_uchar("1"))); 42 | EXPECT_FALSE(is_kansuji(pfi::data::string::string_to_uchar("1"))); 43 | EXPECT_TRUE(is_kansuji(pfi::data::string::string_to_uchar("一"))); 44 | EXPECT_FALSE(is_kansuji(pfi::data::string::string_to_uchar("あ"))); 45 | } 46 | 47 | TEST_F(DigitUtilityTest, isKansuji09) { 48 | EXPECT_FALSE(is_kansuji_09(pfi::data::string::string_to_uchar("1"))); 49 | EXPECT_FALSE(is_kansuji_09(pfi::data::string::string_to_uchar("1"))); 50 | EXPECT_TRUE(is_kansuji_09(pfi::data::string::string_to_uchar("一"))); 51 | EXPECT_FALSE(is_kansuji_09(pfi::data::string::string_to_uchar("十"))); 52 | EXPECT_FALSE(is_kansuji_09(pfi::data::string::string_to_uchar("万"))); 53 | EXPECT_FALSE(is_kansuji_09(pfi::data::string::string_to_uchar("あ"))); 54 | } 55 | 56 | TEST_F(DigitUtilityTest, isKansujiKuraiSen) { 57 | EXPECT_FALSE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("1"))); 58 | EXPECT_FALSE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("1"))); 59 | EXPECT_FALSE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("一"))); 60 | EXPECT_TRUE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("十"))); 61 | EXPECT_TRUE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("百"))); 62 | EXPECT_TRUE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("千"))); 63 | EXPECT_FALSE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("万"))); 64 | EXPECT_FALSE(is_kansuji_kurai_sen(pfi::data::string::string_to_uchar("あ"))); 65 | } 66 | 67 | TEST_F(DigitUtilityTest, isKansujiKuraiMan) { 68 | EXPECT_FALSE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("1"))); 69 | EXPECT_FALSE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("1"))); 70 | 
EXPECT_FALSE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("一"))); 71 | EXPECT_FALSE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("十"))); 72 | EXPECT_TRUE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("万"))); 73 | EXPECT_TRUE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("億"))); 74 | EXPECT_TRUE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("兆"))); 75 | EXPECT_FALSE(is_kansuji_kurai_man(pfi::data::string::string_to_uchar("あ"))); 76 | } 77 | 78 | TEST_F(DigitUtilityTest, isKansujiKurai) { 79 | EXPECT_FALSE(is_kansuji_kurai(pfi::data::string::string_to_uchar("1"))); 80 | EXPECT_FALSE(is_kansuji_kurai(pfi::data::string::string_to_uchar("1"))); 81 | EXPECT_FALSE(is_kansuji_kurai(pfi::data::string::string_to_uchar("一"))); 82 | EXPECT_TRUE(is_kansuji_kurai(pfi::data::string::string_to_uchar("十"))); 83 | EXPECT_TRUE(is_kansuji_kurai(pfi::data::string::string_to_uchar("万"))); 84 | EXPECT_FALSE(is_kansuji_kurai(pfi::data::string::string_to_uchar("あ"))); 85 | } 86 | 87 | TEST_F(DigitUtilityTest, isNumber) { 88 | EXPECT_TRUE(is_number(pfi::data::string::string_to_uchar("1"))); 89 | EXPECT_TRUE(is_number(pfi::data::string::string_to_uchar("1"))); 90 | EXPECT_TRUE(is_number(pfi::data::string::string_to_uchar("一"))); 91 | EXPECT_TRUE(is_number(pfi::data::string::string_to_uchar("十"))); 92 | EXPECT_TRUE(is_number(pfi::data::string::string_to_uchar("万"))); 93 | EXPECT_FALSE(is_number(pfi::data::string::string_to_uchar("あ"))); 94 | } 95 | -------------------------------------------------------------------------------- /src/duration_expression_normalizer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "duration_expression_normalizer.hpp" 4 | #include "digit_utility.hpp" 5 | #include "number_normalizer.hpp" 6 | 7 | namespace duration_expression_normalizer { 8 | 9 | void DurationExpressionNormalizer::init() { 10 | 
load_from_dictionaries("duration_expression_json.txt", "duration_prefix_counter_json.txt", "duration_prefix_json.txt", "duration_suffix_json.txt"); 11 | } 12 | 13 | void DurationExpressionNormalizer::normalize_number(const std::string& text, std::vector& numbers) { 14 | NN.process(text, numbers); 15 | } 16 | 17 | void set_time(DurationExpression& durationexp, const std::string& corresponding_time_position, const DurationExpression& integrate_durationexp) { 18 | if (corresponding_time_position == "y") { 19 | durationexp.value_lowerbound.year = integrate_durationexp.org_value_lowerbound; 20 | durationexp.value_upperbound.year = integrate_durationexp.org_value_upperbound; 21 | } else if (corresponding_time_position == "m") { 22 | durationexp.value_lowerbound.month = integrate_durationexp.org_value_lowerbound; 23 | durationexp.value_upperbound.month = integrate_durationexp.org_value_upperbound; 24 | } else if (corresponding_time_position == "d") { 25 | durationexp.value_lowerbound.day = integrate_durationexp.org_value_lowerbound; 26 | durationexp.value_upperbound.day = integrate_durationexp.org_value_upperbound; 27 | } else if (corresponding_time_position == "h") { 28 | durationexp.value_lowerbound.hour = integrate_durationexp.org_value_lowerbound; 29 | durationexp.value_upperbound.hour = integrate_durationexp.org_value_upperbound; 30 | } else if (corresponding_time_position == "mn") { 31 | durationexp.value_lowerbound.minute = integrate_durationexp.org_value_lowerbound; 32 | durationexp.value_upperbound.minute = integrate_durationexp.org_value_upperbound; 33 | } else if (corresponding_time_position == "s") { 34 | durationexp.value_lowerbound.second = integrate_durationexp.org_value_lowerbound; 35 | durationexp.value_upperbound.second = integrate_durationexp.org_value_upperbound; 36 | } else if (corresponding_time_position == "seiki") { 37 | durationexp.value_lowerbound.year = integrate_durationexp.org_value_lowerbound*100; 38 | durationexp.value_upperbound.year = 
integrate_durationexp.org_value_upperbound*100; 39 | } else if (corresponding_time_position == "w") { 40 | durationexp.value_lowerbound.day = integrate_durationexp.org_value_lowerbound*7; 41 | durationexp.value_upperbound.day = integrate_durationexp.org_value_upperbound*7; 42 | } 43 | } 44 | /* Applies the "han" (half) option: adds half of one unit to both bounds of the field selected by corresponding_time_position. "seiki" and "w" are stored by set_time as years*100 and days*7, so their halves are 50 years and 3.5 days. Any other position string leaves durationexp unchanged. */ 45 | void do_option_han(DurationExpression& durationexp, const std::string& corresponding_time_position){ 46 | if (corresponding_time_position == "y") { 47 | durationexp.value_lowerbound.year += 0.5; 48 | durationexp.value_upperbound.year += 0.5; 49 | } else if (corresponding_time_position == "m") { 50 | durationexp.value_lowerbound.month += 0.5; 51 | durationexp.value_upperbound.month += 0.5; 52 | } else if (corresponding_time_position == "d") { 53 | durationexp.value_lowerbound.day += 0.5; 54 | durationexp.value_upperbound.day += 0.5; 55 | } else if (corresponding_time_position == "h") { 56 | durationexp.value_lowerbound.hour += 0.5; 57 | durationexp.value_upperbound.hour += 0.5; 58 | } else if (corresponding_time_position == "mn") { 59 | durationexp.value_lowerbound.minute += 0.5; 60 | durationexp.value_upperbound.minute += 0.5; 61 | } else if (corresponding_time_position == "s") { 62 | durationexp.value_lowerbound.second += 0.5; 63 | durationexp.value_upperbound.second += 0.5; 64 | } else if (corresponding_time_position == "seiki") { 65 | durationexp.value_lowerbound.year += 50; 66 | durationexp.value_upperbound.year += 50; 67 | } else if (corresponding_time_position == "w") { /* half a week, added to the day field that set_time fills for weeks */ durationexp.value_lowerbound.day += 3.5; durationexp.value_upperbound.day += 3.5; } 68 | } 69 | 70 | void revise_durationexp_by_process_type(DurationExpression& durationexp, std::string process_type, const LimitedDurationExpression& matching_limited_duration_expression) { 71 | if (process_type == "han") { 72 | if(matching_limited_duration_expression.corresponding_time_position.empty()) return; 73 | std::string corresponding_time_position = matching_limited_duration_expression.corresponding_time_position[matching_limited_duration_expression.corresponding_time_position.size()-1]; 74 | do_option_han(durationexp, 
corresponding_time_position); 75 | } 76 | } 77 | 78 | void DurationExpressionNormalizer::revise_any_type_expression_by_matching_limited_expression(std::vector& durationexps, int &expression_id, 79 | const LimitedDurationExpression matching_limited_duration_expression) { 80 | int final_integrated_durationexp_id = expression_id + matching_limited_duration_expression.total_number_of_place_holder; 81 | durationexps[expression_id].position_end = durationexps[final_integrated_durationexp_id].position_end 82 | + matching_limited_duration_expression.length_of_strings_after_final_place_holder; 83 | for (int i = 0; i < static_cast(matching_limited_duration_expression.corresponding_time_position.size()); i++) { 84 | set_time(durationexps[expression_id], matching_limited_duration_expression.corresponding_time_position[i], durationexps[expression_id + i]); 85 | } 86 | for (int i = 0; i < static_cast(matching_limited_duration_expression.process_type.size()); i++) { 87 | revise_durationexp_by_process_type(durationexps[expression_id], matching_limited_duration_expression.process_type[i], matching_limited_duration_expression); 88 | } 89 | durationexps[expression_id].ordinary = matching_limited_duration_expression.ordinary; 90 | 91 | durationexps.erase(durationexps.begin() + expression_id + 1, 92 | durationexps.begin() + expression_id + 1 + matching_limited_duration_expression.total_number_of_place_holder); 93 | } 94 | 95 | void DurationExpressionNormalizer::revise_any_type_expression_by_matching_prefix_counter(DurationExpression& any_type_expression, const LimitedDurationExpression& matching_limited_expression) {} //持続時間にprefix_counterは存在しない(今のところ) 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | /* 116 |  修飾語による規格化表現の補正処理。 117 | */ 118 | 119 | void do_time_about(DurationExpression& durationexp) { 120 | normalizer_utility::Time &tvl = durationexp.value_lowerbound, &tvu = durationexp.value_upperbound; 121 | const 
std::string target_time_position = normalizer_utility::identify_time_detail(durationexp.value_lowerbound); 122 | if (target_time_position == "y") { 123 | tvl.year -= 5; 124 | tvu.year += 5; 125 | } else if (target_time_position == "m") { 126 | tvl.month -= 1; 127 | tvu.month += 1; 128 | } else if (target_time_position == "d") { 129 | tvl.day -= 1; 130 | tvu.day += 1; 131 | } else if (target_time_position == "h") { 132 | tvl.hour -= 1; 133 | tvu.hour += 1; 134 | } else if (target_time_position == "mn") { 135 | tvl.minute -= 5; 136 | tvu.minute += 5; 137 | } else if (target_time_position == "s") { 138 | tvl.second -= 5; 139 | tvu.second += 5; 140 | } 141 | } 142 | 143 | 144 | void do_time_kyou(DurationExpression& durationexp) { 145 | normalizer_utility::Time &tvu = durationexp.value_upperbound; 146 | const std::string target_time_position = normalizer_utility::identify_time_detail(durationexp.value_lowerbound); 147 | if (target_time_position == "y") { 148 | tvu.year += 5; 149 | } else if (target_time_position == "m") { 150 | tvu.month += 1; 151 | } else if (target_time_position == "d") { 152 | tvu.day += 1; 153 | } else if (target_time_position == "h") { 154 | tvu.hour += 1; 155 | } else if (target_time_position == "mn") { 156 | tvu.minute += 5; 157 | } else if (target_time_position == "s") { 158 | tvu.second += 5; 159 | } 160 | } 161 | 162 | 163 | void do_time_jaku(DurationExpression& durationexp) { 164 | normalizer_utility::Time &tvl = durationexp.value_lowerbound; 165 | const std::string target_time_position = normalizer_utility::identify_time_detail(durationexp.value_lowerbound); 166 | if (target_time_position == "y") { 167 | tvl.year -= 5; 168 | } else if (target_time_position == "m") { 169 | tvl.month -= 1; 170 | } else if (target_time_position == "d") { 171 | tvl.day -= 1; 172 | } else if (target_time_position == "h") { 173 | tvl.hour -= 1; 174 | } else if (target_time_position == "mn") { 175 | tvl.minute -= 5; 176 | } else if (target_time_position == "s") { 
177 | tvl.second -= 5; 178 | } 179 | } 180 | 181 | 182 | void DurationExpressionNormalizer::revise_any_type_expression_by_number_modifier(DurationExpression& durationexp, 183 | const normalizer_utility::NumberModifier& number_modifier) { 184 | std::string process_type = number_modifier.process_type; 185 | if (process_type == "or_over") { 186 | durationexp.value_upperbound = normalizer_utility::Time(INFINITY); 187 | } else if (process_type == "or_less") { 188 | durationexp.value_lowerbound = normalizer_utility::Time(-INFINITY); 189 | } else if (process_type == "over") { 190 | durationexp.value_upperbound = normalizer_utility::Time(INFINITY); 191 | durationexp.include_lowerbound = false; 192 | } else if (process_type == "less") { 193 | durationexp.value_lowerbound = normalizer_utility::Time(-INFINITY); 194 | durationexp.include_upperbound = false; 195 | } else if (process_type == "ordinary") { //TODO : 序数は絶対時間として扱う?持続時間として扱う? 未定 196 | durationexp.ordinary = true; 197 | } else if (process_type == "none") { 198 | ; 199 | } else if (process_type == "per") { 200 | // TODO : 「1日毎」など? 
どんな処理をするか未定。 201 | } else if (process_type == "dai") { 202 | // TODO : 「1秒台」など。 どんな処理をするか未定。 これは持続時間?(ではなさそう) 203 | } else if (process_type == "about") { 204 | do_time_about(durationexp); 205 | } else if (process_type == "kyou") { 206 | do_time_kyou(durationexp); 207 | } else if (process_type == "jaku") { 208 | do_time_jaku(durationexp); 209 | } else if (process_type == "made") { 210 | if(durationexp.value_lowerbound == durationexp.value_upperbound){ 211 | durationexp.value_lowerbound = normalizer_utility::Time(-INFINITY); 212 | } else{ 213 | 214 | } 215 | } else { 216 | durationexp.options.push_back(process_type); 217 | } 218 | 219 | } 220 | 221 | void DurationExpressionNormalizer::delete_not_any_type_expression(std::vector& durationexps){ 222 | for(int i=0; i(durationexps.size()); i++){ 223 | if(normalizer_utility::is_null_time(durationexps[i].value_lowerbound) && normalizer_utility::is_null_time(durationexps[i].value_upperbound)){ 224 | durationexps.erase(durationexps.begin() + i); 225 | i--; 226 | } 227 | } 228 | } 229 | 230 | void DurationExpressionNormalizer::fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector& durationexps) { 231 | for(int i=0; i(durationexps.size()-1); i++){ 232 | if(have_kara_suffix(durationexps[i].options) && have_kara_prefix(durationexps[i+1].options) && durationexps[i].position_end +2 >= durationexps[i+1].position_start){ 233 | durationexps[i].value_upperbound = durationexps[i+1].value_upperbound; 234 | durationexps[i].position_end = durationexps[i+1].position_end; 235 | durationexps[i].set_original_expression_from_position(utext); 236 | merge_options(durationexps[i].options, durationexps[i+1].options); 237 | durationexps.erase(durationexps.begin()+i+1); 238 | } 239 | } 240 | } 241 | } //namespace duration_expression_normalizer 242 | 243 | -------------------------------------------------------------------------------- /src/duration_expression_normalizer.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef DURATION_EXPRESSION_NORMALIZER_H_ 2 | #define DURATION_EXPRESSION_NORMALIZER_H_ 3 | #include 4 | #include "digit_utility.hpp" 5 | #include "number_normalizer.hpp" 6 | #include "normalizer_utility.hpp" 7 | #include "normalizer_template.hpp" 8 | #include 9 | 10 | namespace duration_expression_normalizer { 11 | 12 | struct DurationExpression : normalizer_utility::NormalizedExpressionTemplate{ 13 | DurationExpression(digit_utility::Number number) 14 | : normalizer_utility::NormalizedExpressionTemplate(number.original_expression, number.position_start, number.position_end), 15 | org_value_lowerbound(number.value_lowerbound), 16 | org_value_upperbound(number.value_upperbound), 17 | value_lowerbound(normalizer_utility::Time(INFINITY)), 18 | value_upperbound(normalizer_utility::Time(-INFINITY)), 19 | ordinary(false) 20 | {} 21 | 22 | double org_value_lowerbound, org_value_upperbound; 23 | normalizer_utility::Time value_lowerbound, value_upperbound; 24 | bool ordinary; 25 | }; 26 | 27 | 28 | class LimitedDurationExpression : public normalizer_utility::LimitedExpressionTemplate{ 29 | public: 30 | template 31 | void serialize(Archive &ar){ 32 | ar & MEMBER(pattern) & MEMBER(corresponding_time_position) & MEMBER(process_type) & MEMBER(ordinary) & MEMBER(option); 33 | } 34 | 35 | std::vector corresponding_time_position; 36 | std::vector process_type; 37 | }; 38 | 39 | 40 | class DurationExpressionNormalizer : public normalizer_template::NormalizerTemplate{ 41 | public: 42 | DurationExpressionNormalizer(const std::string& language) : NN(language) { language_ = language; init(); } 43 | 44 | private: 45 | void init(); 46 | void normalize_number(const std::string& text, std::vector& numbers); 47 | void revise_any_type_expression_by_matching_limited_expression(std::vector& durationexps, int& expression_id, LimitedDurationExpression matching_limited_duration_expression); 48 | void 
revise_any_type_expression_by_matching_prefix_counter(DurationExpression& any_type_expression, const LimitedDurationExpression& matching_limited_expression); 49 | void revise_any_type_expression_by_number_modifier(DurationExpression& durationexp, const normalizer_utility::NumberModifier& number_modifier); 50 | void delete_not_any_type_expression(std::vector& durationexps); 51 | void fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector& durationexps); 52 | 53 | number_normalizer::NumberNormalizer NN; 54 | }; 55 | 56 | } //namespace duration_expression_normalizer 57 | 58 | #endif //DURATION_EXPRESSION_NORMALIZER_H_ 59 | -------------------------------------------------------------------------------- /src/duration_expression_normalizer_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "normalizer_utility.hpp" 4 | #include "duration_expression_normalizer.hpp" 5 | 6 | #include 7 | #include 8 | 9 | using namespace normalizer_utility; 10 | using namespace std; 11 | using namespace pfi::data::string; 12 | using namespace duration_expression_normalizer; 13 | 14 | class DurationexpNormalizerTest : public testing::Test { 15 | public: 16 | void SetUp() {} 17 | void TearDown() {} 18 | }; 19 | /* Test helper: returns true when a and b compare equal with == on every field (year, month, day, hour, minute, second). */ 20 | bool is_same_time(const Time& a, const Time& b){ 21 | return 22 | a.year == b.year && 23 | a.month == b.month && 24 | a.day == b.day && 25 | a.hour == b.hour && 26 | a.minute == b.minute && 27 | a.second == b.second; 28 | } 29 | 30 | TEST_F(DurationexpNormalizerTest, simple1) { 31 | DurationExpressionNormalizer DEN("ja"); 32 | std::string text("あの人は三時間も耐えた"); 33 | std::vector durationexps; 34 | DEN.process(text, durationexps); 35 | ASSERT_EQ(1u, durationexps.size()); 36 | 37 | Time ex1_lower(INFINITY, INFINITY, INFINITY, 3, INFINITY, INFINITY); 38 | Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 3, -INFINITY, -INFINITY); 39 | 40 | EXPECT_EQ("三時間", 
ustring_to_string(durationexps[0].original_expression)); 41 | EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound)); 42 | EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound)); 43 | } 44 | 45 | TEST_F(DurationexpNormalizerTest, simple2) { 46 | DurationExpressionNormalizer DEN("ja"); 47 | std::string text("それは3年5ヶ月の間にも"); 48 | std::vector durationexps; 49 | DEN.process(text, durationexps); 50 | ASSERT_EQ(1u, durationexps.size()); 51 | Time ex1_lower(3, 5, INFINITY, INFINITY, INFINITY, INFINITY); 52 | Time ex1_upper(3, 5, -INFINITY, -INFINITY, -INFINITY, -INFINITY); 53 | 54 | EXPECT_EQ("3年5ヶ月", ustring_to_string(durationexps[0].original_expression)); 55 | EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound)); 56 | EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound)); 57 | } 58 | 59 | TEST_F(DurationexpNormalizerTest, seiki1) { 60 | DurationExpressionNormalizer DEN("ja"); 61 | std::string text("あの人は三世紀も耐えた"); 62 | std::vector durationexps; 63 | DEN.process(text, durationexps); 64 | ASSERT_EQ(1u, durationexps.size()); 65 | 66 | Time ex1_lower(300, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY); 67 | Time ex1_upper(300, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY); 68 | 69 | EXPECT_EQ("三世紀", ustring_to_string(durationexps[0].original_expression)); 70 | EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound)); 71 | EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound)); 72 | } 73 | 74 | TEST_F(DurationexpNormalizerTest, han1) { 75 | DurationExpressionNormalizer DEN("ja"); 76 | std::string text("あの人は三世紀半も耐えた"); 77 | std::vector durationexps; 78 | DEN.process(text, durationexps); 79 | ASSERT_EQ(1u, durationexps.size()); 80 | 81 | Time ex1_lower(350, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY); 82 | Time ex1_upper(350, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY); 83 | 84 | EXPECT_EQ("三世紀半", ustring_to_string(durationexps[0].original_expression)); 
85 | EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound)); 86 | EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound)); 87 | } 88 | 89 | TEST_F(DurationexpNormalizerTest, han2) { 90 | DurationExpressionNormalizer DEN("ja"); 91 | std::string text("あの人は三時間半も耐えた"); 92 | std::vector durationexps; 93 | DEN.process(text, durationexps); 94 | ASSERT_EQ(1u, durationexps.size()); 95 | 96 | Time ex1_lower(INFINITY, INFINITY, INFINITY, 3.5, INFINITY, INFINITY); 97 | Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 3.5, -INFINITY, -INFINITY); 98 | 99 | EXPECT_EQ("三時間半", ustring_to_string(durationexps[0].original_expression)); 100 | EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound)); 101 | EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound)); 102 | } 103 | 104 | TEST_F(DurationexpNormalizerTest, plural1) { 105 | DurationExpressionNormalizer DEN("ja"); 106 | std::string text("三年間と五ヶ月の間"); 107 | std::vector durationexps; 108 | DEN.process(text, durationexps); 109 | ASSERT_EQ(2u, durationexps.size()); 110 | 111 | Time ex1_lower(3, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY); 112 | Time ex1_upper(3, -INFINITY, -INFINITY, -INFINITY, -INFINITY, -INFINITY); 113 | 114 | EXPECT_EQ("三年間", ustring_to_string(durationexps[0].original_expression)); 115 | EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound)); 116 | EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound)); 117 | 118 | Time ex2_lower(INFINITY, 5, INFINITY, INFINITY, INFINITY, INFINITY); 119 | Time ex2_upper(-INFINITY, 5, -INFINITY, -INFINITY, -INFINITY, -INFINITY); 120 | 121 | EXPECT_EQ("五ヶ月", ustring_to_string(durationexps[1].original_expression)); 122 | EXPECT_TRUE(is_same_time(ex2_lower, durationexps[1].value_lowerbound)); 123 | EXPECT_TRUE(is_same_time(ex2_upper, durationexps[1].value_upperbound)); 124 | } 125 | 126 | TEST_F(DurationexpNormalizerTest, or_over1) { 127 | DurationExpressionNormalizer DEN("ja"); 128 | 
std::string text("あの人は三時間以上も耐えた"); 129 | std::vector durationexps; 130 | DEN.process(text, durationexps); 131 | ASSERT_EQ(1u, durationexps.size()); 132 | 133 | Time ex1_lower(INFINITY, INFINITY, INFINITY, 3, INFINITY, INFINITY); 134 | Time ex1_upper(INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY); 135 | 136 | EXPECT_EQ("三時間以上", ustring_to_string(durationexps[0].original_expression)); 137 | EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound)); 138 | EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound)); 139 | } 140 | 141 | TEST_F(DurationexpNormalizerTest, about_suffix) { 142 | DurationExpressionNormalizer DEN("ja"); 143 | std::string text("あの人は三時間くらいは耐えた"); 144 | std::vector durationexps; 145 | DEN.process(text, durationexps); 146 | ASSERT_EQ(1u, durationexps.size()); 147 | 148 | Time ex1_lower(INFINITY, INFINITY, INFINITY, 2, INFINITY, INFINITY); 149 | Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 4, -INFINITY, -INFINITY); 150 | 151 | EXPECT_EQ("三時間くらい", ustring_to_string(durationexps[0].original_expression)); 152 | EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound)); 153 | EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound)); 154 | } 155 | 156 | 157 | TEST_F(DurationexpNormalizerTest, about_prefix) { 158 | DurationExpressionNormalizer DEN("ja"); 159 | std::string text("あの人はほぼ三時間は耐えた"); 160 | std::vector durationexps; 161 | DEN.process(text, durationexps); 162 | ASSERT_EQ(1u, durationexps.size()); 163 | 164 | Time ex1_lower(INFINITY, INFINITY, INFINITY, 2, INFINITY, INFINITY); 165 | Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 4, -INFINITY, -INFINITY); 166 | 167 | EXPECT_EQ("ほぼ三時間", ustring_to_string(durationexps[0].original_expression)); 168 | EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound)); 169 | EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound)); 170 | } 171 | 172 | 173 | TEST_F(DurationexpNormalizerTest, kyou) { 174 | 
DurationExpressionNormalizer DEN("ja"); 175 | std::string text("あの人は三時間強は耐えた"); 176 | std::vector durationexps; 177 | DEN.process(text, durationexps); 178 | ASSERT_EQ(1u, durationexps.size()); 179 | 180 | Time ex1_lower(INFINITY, INFINITY, INFINITY, 3, INFINITY, INFINITY); 181 | Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 4, -INFINITY, -INFINITY); 182 | 183 | EXPECT_EQ("三時間強", ustring_to_string(durationexps[0].original_expression)); 184 | EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound)); 185 | EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound)); 186 | } 187 | 188 | 189 | TEST_F(DurationexpNormalizerTest, jaku) { 190 | DurationExpressionNormalizer DEN("ja"); 191 | std::string text("あの人は三時間弱は耐えた"); 192 | std::vector durationexps; 193 | DEN.process(text, durationexps); 194 | ASSERT_EQ(1u, durationexps.size()); 195 | 196 | Time ex1_lower(INFINITY, INFINITY, INFINITY, 2, INFINITY, INFINITY); 197 | Time ex1_upper(-INFINITY, -INFINITY, -INFINITY, 3, -INFINITY, -INFINITY); 198 | 199 | EXPECT_EQ("三時間弱", ustring_to_string(durationexps[0].original_expression)); 200 | EXPECT_TRUE(is_same_time(ex1_lower, durationexps[0].value_lowerbound)); 201 | EXPECT_TRUE(is_same_time(ex1_upper, durationexps[0].value_upperbound)); 202 | } -------------------------------------------------------------------------------- /src/inappropriate_expression_remover.hpp: -------------------------------------------------------------------------------- 1 | #ifndef INAPPROPRIATE_EXPRESSION_REMOVER_H_ 2 | #define INAPPROPRIATE_EXPRESSION_REMOVER_H_ 3 | #include "numerical_expression_normalizer.hpp" 4 | #include "abstime_expression_normalizer.hpp" 5 | #include "reltime_expression_normalizer.hpp" 6 | #include "duration_expression_normalizer.hpp" 7 | 8 | namespace inappropriate_expression_remover{ 9 | struct InappropriateStrings { 10 | template 11 | void serialize(Archive &ar){ 12 | ar & MEMBER(str); 13 | } 14 | std::string str; 15 | }; 16 | 17 | class 
InappropriateExpressionRemover{ 18 | public: 19 | InappropriateExpressionRemover(const std::string& language); 20 | void remove_inappropriate_extraction(const std::string& text, 21 | std::vector& numexps, 22 | std::vector& abstimeexps, 23 | std::vector& reltimeexps, 24 | std::vector& durationexps); 25 | private: 26 | template 27 | void delete_inappropriate_extraction_using_dictionary_one_type(std::vector& any_type_expressions); 28 | template 29 | bool is_url_strings(const std::string& text, const AnyTypeExpression& any_type_expression); 30 | template 31 | void delete_url_strings(const std::string& text, std::vector& any_type_expressions); 32 | void delete_inappropriate_extraction_using_dictionary(const std::string& text, 33 | std::vector& numexps, 34 | std::vector& abstimeexps, 35 | std::vector& reltimeexps, 36 | std::vector& durationexps); 37 | void init_inappropriate_stringss(const std::string& language); 38 | void init_url_strings(); 39 | 40 | std::map inappropriate_strings_to_bool; 41 | std::map url_strings_to_bool; 42 | }; 43 | } //namespace inappropriate_expression_remover 44 | 45 | #endif 46 | 47 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "normalize_numexp.hpp" 2 | #include "optparse.h" 3 | #include 4 | using namespace normalize_numexp; 5 | /* Command-line options of this program: -v/--version sets version, -h/--help sets help. No option is mapped to cnt or show in this option map. */ 6 | class option : public optparse 7 | { 8 | public: 9 | bool help, version, cnt; 10 | std::string show; 11 | 12 | public: 13 | /* All flags start false; cnt is initialized as well so that no bool member is left indeterminate. */ option() 14 | : help(false), version(false), cnt(false) {} 15 | 16 | BEGIN_OPTION_MAP_INLINE() 17 | 18 | ON_OPTION(SHORTOPT('v') || LONGOPT("version")) 19 | version = true; 20 | 21 | ON_OPTION(SHORTOPT('h') || LONGOPT("help")) 22 | help = true; 23 | 24 | END_OPTION_MAP() 25 | }; 26 | 27 | int usage(std::ostream& os, const char *argv0) 28 | { 29 | os << "USAGE: " << argv0 << " [OPTIONS]" << std::endl; 30 | os << "This utility normalize (Japanese) numerical 
and temporal expressions in the input sentence." << std::endl; 31 | os << std::endl; 32 | os << "OPTIONS:" << std::endl; 33 | os << " -v, --version show this version information and exit" << std::endl; 34 | os << " -h, --help show this help message and exit" << std::endl; 35 | os << std::endl; 36 | return 0; 37 | } 38 | 39 | int version(std::ostream& os) 40 | { 41 | os << NORMALIZENUMEXP_NAME; 42 | os << NORMALIZENUMEXP_VERSION << " "; 43 | os << NORMALIZENUMEXP_COPYRIGHT << std::endl; 44 | os << std::endl; 45 | return 0; 46 | } 47 | 48 | 49 | int main(int argc, char * argv[]){ 50 | option opt; 51 | try { 52 | //int arg_used = opt.parse(argv, argc); 53 | } catch (const optparse::unrecognized_option& e) { 54 | std::cerr << "ERROR: unrecognized option: " << e.what() << std::endl; 55 | return 1; 56 | } catch (const optparse::invalid_value& e) { 57 | std::cerr << "ERROR: " << e.what() << std::endl; 58 | return 1; 59 | } 60 | if(opt.help){ 61 | usage(std::cerr, argv[0]); 62 | return 1; 63 | }else if(opt.version){ 64 | version(std::cerr); 65 | return 1; 66 | } 67 | 68 | NormalizeNumexp NN("ja"); 69 | std::string sentence; 70 | std::vector result; 71 | 72 | while(1) { 73 | sentence = ""; 74 | std::getline(std::cin, sentence); 75 | if(sentence.empty()) break; 76 | NN.normalize(sentence, result); 77 | 78 | //show results 79 | for(int i=0; i(result.size()); i++){ 80 | std::cout << result[i] << std::endl; 81 | } 82 | std::cout << "END" << std::endl; 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/normalize_numexp.cpp: -------------------------------------------------------------------------------- 1 | #include "normalize_numexp.hpp" 2 | #include 3 | 4 | namespace normalize_numexp{ 5 | 6 | void NormalizeNumexp::normalize_each_type_expressions(const std::string& text, 7 | std::vector& numexps, 8 | std::vector& abstimeexps, 9 | std::vector& reltimeexps, 10 | std::vector& durationexps){ 11 | NEN.process(text, numexps); 12 | 
AEN.process(text, abstimeexps); 13 | REN.process(text, reltimeexps); 14 | DEN.process(text, durationexps); 15 | } 16 | 17 | 18 | template 19 | std::string show_options(AnyTypeExpression& any_type_expression){ 20 | std::stringstream ss; 21 | if(any_type_expression.ordinary) any_type_expression.options.push_back("ordinary"); 22 | int sz = static_cast(any_type_expression.options.size()); 23 | for(int i=0; i> ret; 30 | return ret; 31 | } 32 | 33 | 34 | 35 | //resultの生成 36 | void merge_normalize_expressions_into_result( std::vector numexps, std::vector abstimeexps, std::vector reltimeexps, std::vector durationexps, std::vector& result){ 37 | 38 | //TODO : それぞれの正規形に、toString関数をつける?逆に分かり辛い? とりあえずここで処理 39 | std::string kugiri("*"); 40 | std::string tmpstr; 41 | std::stringstream ss; 42 | result.clear(); 43 | 44 | for(int i=0; i(numexps.size()); i++){ 45 | ss.clear(); ss.str(""); 46 | ss << "numerical" << "*" << numexps[i].original_expression << "*" << numexps[i].position_start << "*" << numexps[i].position_end << "*" << numexps[i].counter << "*" << numexps[i].value_lowerbound << "*" << numexps[i].value_upperbound << "*" << show_options(numexps[i]); 47 | ss >> tmpstr; 48 | result.push_back(tmpstr); 49 | } 50 | 51 | for(int i=0; i(abstimeexps.size()); i++){ 52 | ss.clear(); ss.str(""); 53 | ss << "abstime" << "*" << abstimeexps[i].original_expression << "*" << abstimeexps[i].position_start << "*" << abstimeexps[i].position_end << "*" << "none" << "*" << abstimeexps[i].value_lowerbound.to_string(false) << "*" << abstimeexps[i].value_upperbound.to_string(true) << "*" << show_options(abstimeexps[i]); 54 | ss >> tmpstr; 55 | result.push_back(tmpstr); 56 | } 57 | 58 | for(int i=0; i(reltimeexps.size()); i++){ 59 | ss.clear(); ss.str(""); 60 | //TODO : 相対時間表現を、どう表示させるか? 
61 | ss << "reltime" << "*" << reltimeexps[i].original_expression << "*" << reltimeexps[i].position_start << "*" << reltimeexps[i].position_end << "*" << "none" << "*" << reltimeexps[i].value_lowerbound_abs.to_string(false) << "," << reltimeexps[i].value_lowerbound_rel.to_duration_string(false) << "*" << reltimeexps[i].value_upperbound_abs.to_string(true) << "," << reltimeexps[i].value_upperbound_rel.to_duration_string(true) << "*" << show_options(reltimeexps[i]); 62 | ss >> tmpstr; 63 | result.push_back(tmpstr); 64 | } 65 | 66 | for(int i=0; i(durationexps.size()); i++){ 67 | ss.clear(); ss.str(""); 68 | ss << "duration" << "*" << durationexps[i].original_expression << "*" << durationexps[i].position_start << "*" << durationexps[i].position_end << "*" << "none" << "*" << durationexps[i].value_lowerbound.to_duration_string(false) << "*" << durationexps[i].value_upperbound.to_duration_string(true) << "*" << show_options(durationexps[i]); 69 | ss >> tmpstr; 70 | result.push_back(tmpstr); 71 | } 72 | } 73 | 74 | 75 | 76 | NormalizeNumexp::NormalizeNumexp(const std::string& language) : NEN(language), AEN(language), REN(language), DEN(language), IER(language) {} 77 | 78 | 79 | void NormalizeNumexp::normalize(const std::string& text, std::vector& result){ 80 | result.clear(); 81 | std::vector numexps; 82 | std::vector abstimeexps; 83 | std::vector reltimeexps; 84 | std::vector durationexps; 85 | 86 | //4つのnormalizerで処理を行う 87 | normalize_each_type_expressions(text, numexps, abstimeexps, reltimeexps, durationexps); 88 | 89 | //それぞれの結果より、不適当な抽出を削除 90 | IER.remove_inappropriate_extraction(text, numexps, abstimeexps, reltimeexps, durationexps); 91 | 92 | //string型に変換し、resultにまとめる 93 | merge_normalize_expressions_into_result(numexps, abstimeexps, reltimeexps, durationexps, result); 94 | } 95 | 96 | } //namespace normalize_numexp 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /src/normalize_numexp.hpp: 
-------------------------------------------------------------------------------- 1 | #define NORMALIZENUMEXP_NAME "normalizeNumexp" 2 | #define NORMALIZENUMEXP_VERSION "3.0" 3 | #define NORMALIZENUMEXP_COPYRIGHT "Copyright (c) 2012 Katsuma Narisawa" 4 | 5 | #include "inappropriate_expression_remover.hpp" 6 | 7 | namespace normalize_numexp{ 8 | 9 | class NormalizeNumexp{ 10 | public: 11 | NormalizeNumexp(const std::string& language); 12 | void normalize(const std::string& text, std::vector& result); 13 | 14 | private: 15 | void normalize_each_type_expressions(const std::string& text, 16 | std::vector& numexps, 17 | std::vector& abstimeexps, 18 | std::vector& reltimeexps, 19 | std::vector& durationexps); 20 | 21 | numerical_expression_normalizer::NumericalExpressionNormalizer NEN; 22 | abstime_expression_normalizer::AbstimeExpressionNormalizer AEN; 23 | reltime_expression_normalizer::ReltimeExpressionNormalizer REN; 24 | duration_expression_normalizer::DurationExpressionNormalizer DEN; 25 | inappropriate_expression_remover::InappropriateExpressionRemover IER; 26 | }; 27 | } //namespace normalize_numexp 28 | 29 | 30 | -------------------------------------------------------------------------------- /src/normalize_numexp_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "normalize_numexp.hpp" 4 | 5 | using namespace normalize_numexp; 6 | using namespace std; 7 | 8 | class NumexpExtractorTest : public testing::Test { 9 | public: 10 | void SetUp() {} 11 | void TearDown() {} 12 | }; 13 | 14 | TEST_F(NumexpExtractorTest, simple1) { 15 | vector result; 16 | string language("ja"); 17 | string text("1911年から2011年の間、その100年間において、9.3万人もの死傷者がでた。"); 18 | NormalizeNumexp NN(language); 19 | NN.normalize(text, result); 20 | for(int i=0; i(result.size()); i++){ 21 | cout << result[i] << endl; 22 | } 23 | ASSERT_EQ(3u, result.size()); 24 | EXPECT_EQ("numerical*9.3万人*27*32*人*93000*93000*", result[0]); 25 | 
EXPECT_EQ("abstime*1911年から2011年*0*12*none*1911-XX-XX*2011-XX-XX*", result[1]); 26 | EXPECT_EQ("duration*100年間*17*22*none*P100Y*P100Y*", result[2]); 27 | } 28 | 29 | TEST_F(NumexpExtractorTest, simple2) { 30 | vector result; 31 | string language("ja"); 32 | string text("15年前、戦争があった"); 33 | NormalizeNumexp NN(language); 34 | NN.normalize(text, result); 35 | ASSERT_EQ(1u, result.size()); 36 | EXPECT_EQ("reltime*15年前*0*4*none*XX:XX:XX,P-15Y*XX:XX:XX,P-15Y*", result[0]); 37 | for(int i=0; i(result.size()); i++){ 38 | cout << result[i] << endl; 39 | } 40 | } 41 | 42 | TEST_F(NumexpExtractorTest, simple3) { 43 | vector result; 44 | string language("ja"); 45 | string text("昨年3月、僕たち2人は結婚した"); 46 | NormalizeNumexp NN(language); 47 | NN.normalize(text, result); 48 | ASSERT_EQ(2u, result.size()); 49 | EXPECT_EQ("numerical*2人*8*10*人*2*2*", result[0]); 50 | EXPECT_EQ("reltime*昨年3月*0*4*none*XXXX-03-XX,P-1Y*XXXX-03-XX,P-1Y*", result[1]); 51 | for(int i=0; i(result.size()); i++){ 52 | cout << result[i] << endl; 53 | } 54 | } 55 | 56 | TEST_F(NumexpExtractorTest, simple4) { 57 | vector result; 58 | string language("ja"); 59 | string text("131.1ポイントというスコアを叩き出した"); 60 | NormalizeNumexp NN(language); 61 | NN.normalize(text, result); 62 | ASSERT_EQ(1u, result.size()); 63 | EXPECT_EQ("numerical*131.1ポイント*0*9*ポイント*131.1*131.1*", result[0]); 64 | for(int i=0; i(result.size()); i++){ 65 | cout << result[i] << endl; 66 | } 67 | } 68 | 69 | TEST_F(NumexpExtractorTest, simple5) { 70 | vector result; 71 | string language("ja"); 72 | string text("午後3時45分に待ち合わせ"); 73 | NormalizeNumexp NN(language); 74 | NN.normalize(text, result); 75 | ASSERT_EQ(1u, result.size()); 76 | EXPECT_EQ("abstime*午後3時45分*0*7*none*15:45:XX*15:45:XX*", result[0]); 77 | for(int i=0; i(result.size()); i++){ 78 | cout << result[i] << endl; 79 | } 80 | } 81 | 82 | TEST_F(NumexpExtractorTest, day_of_week1) { 83 | vector result; 84 | string language("ja"); 85 | string text("5月3日(水)"); 86 | NormalizeNumexp NN(language); 87 | 
NN.normalize(text, result); 88 | ASSERT_EQ(1u, result.size()); 89 | EXPECT_EQ("abstime*5月3日(水)*0*7*none*XXXX-05-03*XXXX-05-03*Wed", result[0]); 90 | for(int i=0; i(result.size()); i++){ 91 | cout << result[i] << endl; 92 | } 93 | } 94 | 95 | /* 96 | //辞書にはあるが認識してくれない。uxが空白を認識してくれていない?? 97 | TEST_F(NumexpExtractorTest, day_of_week2) { 98 | vector result; 99 | string language("ja"); 100 | string text("2001/3/3 Sat"); 101 | NormalizeNumexp NN(language); 102 | NN.normalize(text, result); 103 | ASSERT_EQ(1u, result.size()); 104 | EXPECT_EQ("abstime*2001/3/3*8*16*none*2001-03-3*2001-03-3*Sat", result[0]); 105 | for(int i=0; i(result.size()); i++){ 106 | cout << result[i] << endl; 107 | } 108 | } 109 | */ 110 | 111 | TEST_F(NumexpExtractorTest, real_example1) { 112 | vector result; 113 | string language("ja"); 114 | string text("【今日から開催】The Fruits of Adventures @ ZEIT-FOTO SALON(東京・京橋) 4/26(Tue)まで"); 115 | NormalizeNumexp NN(language); 116 | NN.normalize(text, result); 117 | for(int i=0; i(result.size()); i++){ 118 | cout << result[i] << endl; 119 | } 120 | ASSERT_EQ(1u, result.size()); 121 | EXPECT_EQ("abstime*4/26(Tue)まで*59*70*none*XXXX-04-26*XXXX-04-26*Tue", result[0]); 122 | } 123 | 124 | TEST_F(NumexpExtractorTest, inappropriate_range1) { 125 | vector result; 126 | string language("ja"); 127 | string text("中国から30匹の鳥がきた"); 128 | NormalizeNumexp NN(language); 129 | NN.normalize(text, result); 130 | for(int i=0; i(result.size()); i++){ 131 | cout << result[i] << endl; 132 | } 133 | ASSERT_EQ(1u, result.size()); 134 | EXPECT_EQ("numerical*30匹*4*7*匹*30*30*kara_prefix", result[0]); 135 | } 136 | 137 | 138 | TEST_F(NumexpExtractorTest, inappropriate_range2) { 139 | vector result; 140 | string language("ja"); 141 | string text("30匹からのプレゼント"); 142 | NormalizeNumexp NN(language); 143 | NN.normalize(text, result); 144 | for(int i=0; i(result.size()); i++){ 145 | cout << result[i] << endl; 146 | } 147 | ASSERT_EQ(1u, result.size()); 148 | 
EXPECT_EQ("numerical*30匹*0*3*匹*30*30*kara_suffix", result[0]); 149 | } 150 | 151 | TEST_F(NumexpExtractorTest, inappropriate_range3) { 152 | vector result; 153 | string language("ja"); 154 | string text("一万年と二千年前から愛してる"); 155 | NormalizeNumexp NN(language); 156 | NN.normalize(text, result); 157 | for(int i=0; i(result.size()); i++){ 158 | cout << result[i] << endl; 159 | } 160 | ASSERT_EQ(2u, result.size()); 161 | EXPECT_EQ("reltime*二千年前*4*8*none*XX:XX:XX,P-2000Y*XX:XX:XX,P-2000Y*kara_suffix", result[0]); 162 | EXPECT_EQ("duration*一万年*0*3*none*P10000Y*P10000Y*", result[1]); 163 | } 164 | 165 | TEST_F(NumexpExtractorTest, inappropriate_range4) { 166 | vector result; 167 | string language("ja"); 168 | string text("話をしよう。あれは今から36万年前………いや、1万4000年前だったか。"); 169 | NormalizeNumexp NN(language); 170 | NN.normalize(text, result); 171 | for(int i=0; i(result.size()); i++){ 172 | cout << result[i] << endl; 173 | } 174 | ASSERT_EQ(2u, result.size()); 175 | EXPECT_EQ("reltime*36万年前*12*17*none*XX:XX:XX,P-360000Y*XX:XX:XX,P-360000Y*kara_prefix", result[0]); 176 | EXPECT_EQ("reltime*1万4000年前*23*31*none*XX:XX:XX,P-14000Y*XX:XX:XX,P-14000Y*", result[1]); 177 | } 178 | 179 | TEST_F(NumexpExtractorTest, inappropriate_strings1) { 180 | vector result; 181 | string language("ja"); 182 | string text("一体それがどうしたというのだね。九州。四国。"); 183 | NormalizeNumexp NN(language); 184 | NN.normalize(text, result); 185 | for(int i=0; i(result.size()); i++){ 186 | cout << result[i] << endl; 187 | } 188 | ASSERT_EQ(0u, result.size()); 189 | } 190 | 191 | 192 | TEST_F(NumexpExtractorTest, inappropriate_prefix1) { 193 | vector result; 194 | string language("ja"); 195 | string text("ver2.3.4。ver2.3。"); 196 | NormalizeNumexp NN(language); 197 | NN.normalize(text, result); 198 | for(int i=0; i(result.size()); i++){ 199 | cout << result[i] << endl; 200 | } 201 | ASSERT_EQ(0u, result.size()); 202 | } 203 | 204 | 205 | TEST_F(NumexpExtractorTest, inappropriate_abstime1) { 206 | vector result; 207 | string 
language("ja"); 208 | string text("080-6006-4451。ver2.0。"); 209 | NormalizeNumexp NN(language); 210 | NN.normalize(text, result); 211 | for(int i=0; i(result.size()); i++){ 212 | cout << result[i] << endl; 213 | } 214 | ASSERT_EQ(0u, result.size()); 215 | } 216 | 217 | TEST_F(NumexpExtractorTest, inappropriate_abstime2) { 218 | vector result; 219 | string language("ja"); 220 | string text("198999年30月41日。"); 221 | NormalizeNumexp NN(language); 222 | NN.normalize(text, result); 223 | for(int i=0; i(result.size()); i++){ 224 | cout << result[i] << endl; 225 | } 226 | ASSERT_EQ(3u, result.size()); //durationとして認識される 227 | } 228 | 229 | TEST_F(NumexpExtractorTest, url1) { 230 | vector result; 231 | string language("ja"); 232 | string text("tttp3gl3molggg"); 233 | NormalizeNumexp NN(language); 234 | NN.normalize(text, result); 235 | for(int i=0; i(result.size()); i++){ 236 | cout << result[i] << endl; 237 | } 238 | ASSERT_EQ(0u, result.size()); 239 | } 240 | 241 | TEST_F(NumexpExtractorTest, revise_abstime1) { 242 | vector result; 243 | string language("ja"); 244 | string text("09年5月。99年5月"); 245 | NormalizeNumexp NN(language); 246 | NN.normalize(text, result); 247 | for(int i=0; i(result.size()); i++){ 248 | cout << result[i] << endl; 249 | } 250 | ASSERT_EQ(2u, result.size()); 251 | EXPECT_EQ("abstime*09年5月*0*5*none*2009-05-XX*2009-05-XX*", result[0]); 252 | EXPECT_EQ("abstime*99年5月*6*11*none*1999-05-XX*1999-05-XX*", result[1]); 253 | } 254 | 255 | TEST_F(NumexpExtractorTest, not_abstime1) { 256 | vector result; 257 | string language("ja"); 258 | string text("1.2.2 2-2-2"); 259 | NormalizeNumexp NN(language); 260 | NN.normalize(text, result); 261 | for(int i=0; i(result.size()); i++){ 262 | cout << result[i] << endl; 263 | } 264 | ASSERT_EQ(0u, result.size()); 265 | } 266 | 267 | TEST_F(NumexpExtractorTest, revise_abstime2) { 268 | vector result; 269 | string language("ja"); 270 | string text("西暦99年5月"); 271 | NormalizeNumexp NN(language); 272 | NN.normalize(text, 
result); 273 | for(int i=0; i(result.size()); i++){ 274 | cout << result[i] << endl; 275 | } 276 | ASSERT_EQ(1u, result.size()); 277 | EXPECT_EQ("abstime*西暦99年5月*0*7*none*0099-05-XX*0099-05-XX*", result[0]); 278 | } 279 | 280 | TEST_F(NumexpExtractorTest, su1) { 281 | vector result; 282 | string language("ja"); 283 | string text("数十人が十数人と喧嘩して、百数十円落とした"); 284 | NormalizeNumexp NN(language); 285 | NN.normalize(text, result); 286 | for(int i=0; i(result.size()); i++){ 287 | cout << result[i] << endl; 288 | } 289 | ASSERT_EQ(3u, result.size()); 290 | EXPECT_EQ("numerical*数十人*0*3*人*10*90*", result[0]); 291 | EXPECT_EQ("numerical*十数人*4*7*人*11*19*", result[1]); 292 | EXPECT_EQ("numerical*百数十円*13*17*円*110*190*", result[2]); 293 | } 294 | 295 | TEST_F(NumexpExtractorTest, range1) { 296 | vector result; 297 | string language("ja"); 298 | string text("2012/4/3~6に行われる"); 299 | NormalizeNumexp NN(language); 300 | NN.normalize(text, result); 301 | for(int i=0; i(result.size()); i++){ 302 | cout << result[i] << endl; 303 | } 304 | ASSERT_EQ(1u, result.size()); 305 | EXPECT_EQ("abstime*2012/4/3~6*0*10*none*2012-04-03*2012-04-06*", result[0]); 306 | } 307 | 308 | TEST_F(NumexpExtractorTest, range2) { 309 | vector result; 310 | string language("ja"); 311 | string text("2012/4/3~2012/4/6に行われる"); 312 | NormalizeNumexp NN(language); 313 | NN.normalize(text, result); 314 | for(int i=0; i(result.size()); i++){ 315 | cout << result[i] << endl; 316 | } 317 | ASSERT_EQ(1u, result.size()); 318 | EXPECT_EQ("abstime*2012/4/3~2012/4/6*0*17*none*2012-04-03*2012-04-06*", result[0]); 319 | } 320 | 321 | TEST_F(NumexpExtractorTest, wari1) { 322 | vector result; 323 | string language("ja"); 324 | string text("彼の打率は3割4分5厘だ"); 325 | NormalizeNumexp NN(language); 326 | NN.normalize(text, result); 327 | for(int i=0; i(result.size()); i++){ 328 | cout << result[i] << endl; 329 | } 330 | ASSERT_EQ(1u, result.size()); 331 | EXPECT_EQ("numerical*3割4分5厘*5*11*%*34.5*34.5*", result[0]); 332 | } 
-------------------------------------------------------------------------------- /src/normalizer_template.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | 数量表現(「三人」「約1000円」などといった表現)や時間表現(「1989年3月」「3:30」「百年後」などといった表現)は以下のように構成される 3 | 4 | 【接頭辞 + 前置助数詞 + 数量表現or時間表現の基本パターン + 接尾辞】 5 |   ・接頭辞:「約」「およそ」など 6 |   ・前置助数詞:数量表現における「時速」「¥」や、絶対時間表現における年号など(本来はパターンに含めたいところだが、基本パターンをprefixSearchで探索している都合上、今回は別の構成要素として考える) 7 |   ・基本パターン:「*人」「*円」「*年*月」「*:*」などの正規表現パターン。 8 |   ・接尾辞:「以降」「くらい」など 9 | 10 | この構成性に着目し、この規格化モジュールでは、文中の数の周囲を正規表現でマッチングさせ、表現を認識させる。 11 | (「数」 -> 「数」+「助数詞」 -> 「前置助数詞」+「数」+「助数詞」 -> 「前置助数詞」+「数」+「助数詞」+「接尾辞」 -> 「接頭辞」+「前置助数詞」+「数」+「助数詞」 と認識範囲を増やしていく) 12 | 認識した際には、認識したパターンに対応する処理を、辞書を参照して実行し、規格化表現を作成していく。 13 | 14 | この基底クラスでは、上のようにパターンを順番に認識していく処理を書いている。 15 | 派生クラスとなるnumerical_expression_normalizer, abstime_expression_normalizer, reltime_expression_normalizer, duration_expression_normalizerでは、認識したパターンに対応する処理を書く。 16 | 17 | */ 18 | 19 | #ifndef NORMALIZER_TEMPLATE_H_ 20 | #define NORMALIZER_TEMPLATE_H_ 21 | #include 22 | #include 23 | #include "digit_utility.hpp" 24 | #include "number_normalizer.hpp" 25 | #include "normalizer_utility.hpp" 26 | #include "dictionary_dirpath.hpp" 27 | #include 28 | 29 | namespace normalizer_template{ 30 | 31 | template 32 | class NormalizerTemplate{ 33 | public: 34 | virtual void init() = 0; 35 | virtual void normalize_number(const std::string& text, std::vector& numbers) = 0; 36 | virtual void revise_any_type_expression_by_matching_limited_expression(std::vector& any_type_expressions, int& expression_id, AnyTypeLimitedExpression matching_limited_expression) = 0; 37 | virtual void revise_any_type_expression_by_matching_prefix_counter(AnyTypeExpression& any_type_expression, const AnyTypeLimitedExpression& matching_limited_expression) = 0; 38 | virtual void revise_any_type_expression_by_number_modifier(AnyTypeExpression& any_type_expression, const normalizer_utility::NumberModifier& number_modifier) 
= 0; 39 | virtual void delete_not_any_type_expression(std::vector& any_type_expressions) = 0; 40 | virtual void fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector& any_type_expressions) = 0; 41 | 42 | void build_limited_expression_patterns_from_limited_expressions(){ 43 | //limited_expressionのpatternでprefixSearchするために、patternをキーとするトライ木を生成する。 44 | std::vector > limited_expression_pattern_table; 45 | for(int i=0; i(limited_expressions_.size()); i++){ 46 | limited_expression_pattern_table.push_back(make_pair(limited_expressions_[i].pattern, i)); 47 | } 48 | limited_expression_patterns_.build(limited_expression_pattern_table); 49 | } 50 | 51 | void load_json_from_file(const std::string& filepath, pfi::text::json::json& js) { 52 | std::ifstream in(filepath.c_str()); 53 | pfi::text::json::json_parser parser(in); 54 | try { 55 | while (true) { 56 | js.add(parser.parse()); 57 | } 58 | } catch (const pfi::lang::end_of_data&) { 59 | } 60 | } 61 | 62 | template 63 | void load_from_dictionary(const std::string& dictionary_path, std::vector& load_target) { 64 | load_target.clear(); 65 | try { 66 | pfi::text::json::json js = pfi::text::json::json(new pfi::text::json::json_array()); 67 | load_json_from_file(dictionary_path, js); 68 | pfi::text::json::from_json(js, load_target); 69 | } catch( ... 
) { 70 | std::cout << "dictionary load error" << std::endl; //TODO : error処理 71 | exit(1); 72 | } 73 | } 74 | 75 | template 76 | void build_patterns_rev(const std::vector& originals, ux::Map& patterns) { 77 | //prefixSearchをつかってsuffixSearchを実現するため、uxに格納するパターンを予め前後逆にしておく 78 | std::vector > kvs; 79 | for (int i = 0; i < static_cast(originals.size()); i++) { 80 | kvs.push_back(std::make_pair(normalizer_utility::reverse_string(originals[i].pattern), i)); 81 | } 82 | patterns.build(kvs); 83 | } 84 | 85 | template 86 | void build_patterns(const std::vector& originals, ux::Map& patterns) { 87 | std::vector > kvs; 88 | for (int i = 0; i < static_cast(originals.size()); i++) { 89 | kvs.push_back(std::make_pair(originals[i].pattern, i)); 90 | } 91 | patterns.build(kvs); 92 | } 93 | 94 | void load_from_dictionaries(const std::string& limited_expression_dictionary, const std::string& prefix_counter_dictionary, const std::string& prefix_number_modifier_dictionary, const std::string& suffix_number_modifier_dictionary){ 95 | std::string dictionary_path; 96 | dictionary_path += dictionary_dirpath::get_dictionary_dirpath(); 97 | dictionary_path += language_; 98 | dictionary_path += "/"; 99 | load_from_dictionary(dictionary_path+limited_expression_dictionary, limited_expressions_); 100 | load_from_dictionary(dictionary_path+prefix_counter_dictionary, prefix_counters_); 101 | load_from_dictionary(dictionary_path+suffix_number_modifier_dictionary, suffix_number_modifier_); 102 | load_from_dictionary(dictionary_path+prefix_number_modifier_dictionary, prefix_number_modifier_); 103 | 104 | build_patterns(limited_expressions_, limited_expression_patterns_); 105 | build_patterns_rev(prefix_counters_, prefix_counter_patterns_); 106 | build_patterns_rev(prefix_number_modifier_, prefix_number_modifier_patterns_); 107 | build_patterns(suffix_number_modifier_, suffix_number_modifier_patterns_); 108 | 109 | for(int i=0; i(limited_expressions_.size()); i++){ 110 | 
limited_expressions_[i].set_total_number_of_place_holder(); 111 | limited_expressions_[i].set_length_of_strings_after_final_place_holder(); 112 | } 113 | } 114 | 115 | void search_matching_limited_expression(const pfi::data::string::ustring& utext_replaced, const AnyTypeExpression& any_type_expression, int& matching_pattern_id){ 116 | pfi::data::string::ustring string_after_expression; 117 | normalizer_utility::extract_after_string(utext_replaced, any_type_expression.position_end, string_after_expression); 118 | normalizer_utility::prefixSearch(string_after_expression, limited_expression_patterns_, matching_pattern_id); 119 | } 120 | 121 | void search_matching_prefix_counter(const pfi::data::string::ustring& utext_replaced, const AnyTypeExpression& any_type_expression, int& matching_pattern_id){ 122 | pfi::data::string::ustring string_before_expression; 123 | normalizer_utility::extract_before_string(utext_replaced, any_type_expression.position_start, string_before_expression); 124 | normalizer_utility::suffixSearch(string_before_expression, prefix_counter_patterns_, matching_pattern_id); 125 | } 126 | 127 | void revise_any_type_expression_by_matching_prefix_number_modifier(AnyTypeExpression& any_type_expression, const normalizer_utility::NumberModifier& number_modifier){ 128 | any_type_expression.position_start -= pfi::data::string::string_to_ustring(number_modifier.pattern).size(); 129 | revise_any_type_expression_by_number_modifier(any_type_expression, number_modifier); 130 | } 131 | 132 | void revise_any_type_expression_by_matching_suffix_number_modifier(AnyTypeExpression& any_type_expression, const normalizer_utility::NumberModifier& number_modifier){ 133 | any_type_expression.position_end += pfi::data::string::string_to_ustring(number_modifier.pattern).size(); 134 | revise_any_type_expression_by_number_modifier(any_type_expression, number_modifier); 135 | } 136 | 137 | bool normalize_limited_expression(const pfi::data::string::ustring& utext_replaced, 
std::vector& any_type_expressions, int &i){ 138 | int matching_pattern_id; 139 | search_matching_limited_expression(utext_replaced, any_type_expressions[i], matching_pattern_id); 140 | if(matching_pattern_id == -1) return false; 141 | revise_any_type_expression_by_matching_limited_expression(any_type_expressions, i, limited_expressions_[matching_pattern_id]); 142 | return true; 143 | } 144 | 145 | void normalize_prefix_counter(const pfi::data::string::ustring& utext_replaced, AnyTypeExpression& any_type_expression){ 146 | int matching_pattern_id; 147 | search_matching_prefix_counter(utext_replaced, any_type_expression, matching_pattern_id); 148 | if(matching_pattern_id == -1) return; 149 | revise_any_type_expression_by_matching_prefix_counter(any_type_expression, prefix_counters_[matching_pattern_id]); 150 | return; 151 | } 152 | 153 | bool normalize_suffix_number_modifier(const pfi::data::string::ustring& utext_replaced, AnyTypeExpression& any_type_expression){ 154 | int matching_pattern_id; 155 | normalizer_utility::search_suffix_number_modifier(utext_replaced, any_type_expression.position_end, suffix_number_modifier_patterns_, matching_pattern_id); 156 | if(matching_pattern_id == -1) return false; 157 | revise_any_type_expression_by_matching_suffix_number_modifier(any_type_expression, suffix_number_modifier_[matching_pattern_id]); 158 | return true; 159 | } 160 | 161 | bool normalize_prefix_number_modifier(const pfi::data::string::ustring& utext_replaced, AnyTypeExpression& any_type_expression){ 162 | int matching_pattern_id; 163 | normalizer_utility::search_prefix_number_modifier(utext_replaced, any_type_expression.position_start, prefix_number_modifier_patterns_, matching_pattern_id); 164 | if(matching_pattern_id == -1) return false; 165 | revise_any_type_expression_by_matching_prefix_number_modifier(any_type_expression, prefix_number_modifier_[matching_pattern_id]); 166 | return true; 167 | } 168 | 169 | void convert_numbers_to_any_type_expressions(const 
std::vector& numbers, std::vector& any_type_expressions){ 170 | for(int i=0; i(numbers.size()); i++){ 171 | any_type_expressions.push_back(numbers[i]); 172 | } 173 | } 174 | 175 | bool have_kara_prefix(const std::vector& options){ 176 | return find(options.begin(), options.end(), "kara_prefix") != options.end(); 177 | } 178 | 179 | bool have_kara_suffix(const std::vector& options){ 180 | return find(options.begin(), options.end(), "kara_suffix") != options.end(); 181 | } 182 | 183 | void merge_options(std::vector& options1, std::vector& options2){ 184 | //範囲表現の統合の際に使われる。kara_suffix, kara_prefixはここで削除する 185 | //TODO : 削除するというのが非常に分かり辛い。どうにかする。 186 | for(int i=0; i(options1.size()); i++){ 187 | if(options1[i] == "kara_suffix"){ 188 | options1.erase(options1.begin() + i); 189 | break; 190 | } 191 | } 192 | for(int i=0; i(options2.size()); i++){ 193 | if(options2[i] == "kara_prefix") continue; 194 | options1.push_back(options2[i]); 195 | } 196 | } 197 | 198 | void process(const std::string& text, std::vector& any_type_expressions) { 199 | any_type_expressions.clear(); 200 | pfi::data::string::ustring utext = pfi::data::string::string_to_ustring(text); 201 | 202 | //numbersの作成 203 | std::vector numbers; 204 | normalize_number(text, numbers); 205 | 206 | //numbersを変換して、ベースとなるany_type_expressionsを作成 207 | convert_numbers_to_any_type_expressions(numbers, any_type_expressions); 208 | 209 | //searchするために、text中の数を*に置換しておく 210 | pfi::data::string::ustring utext_replaced; 211 | normalizer_utility::replace_numbers_in_text(utext, numbers, utext_replaced); 212 | 213 | //単位の探索、規格化 214 | for(int i=0; i(any_type_expressions.size()); i++){ 215 | if(!normalize_limited_expression(utext_replaced, any_type_expressions, i)){ 216 | //TODO : 単位が存在しなかった場合の処理をどうするか、相談して決める 217 | } 218 | normalize_prefix_counter(utext_replaced, any_type_expressions[i]); 219 | if(normalize_suffix_number_modifier(utext_replaced, any_type_expressions[i])) normalize_suffix_number_modifier(utext_replaced, 
any_type_expressions[i]); //TODO : 2回以上の繰り返しを本当に含めて良いのか? 220 | if(normalize_prefix_number_modifier(utext_replaced, any_type_expressions[i])) normalize_prefix_counter(utext_replaced, any_type_expressions[i]); 221 | any_type_expressions[i].set_original_expression_from_position(utext); 222 | } 223 | 224 | //TODO : 範囲表現の処理 225 | fix_by_range_expression(utext, any_type_expressions); 226 | 227 | //規格化されなかったnumberを削除 228 | delete_not_any_type_expression(any_type_expressions); 229 | } 230 | 231 | ux::Map limited_expression_patterns_, prefix_counter_patterns_, suffix_number_modifier_patterns_, prefix_number_modifier_patterns_; 232 | std::vector limited_expressions_, prefix_counters_; 233 | std::vector suffix_number_modifier_, prefix_number_modifier_; 234 | std::string language_; 235 | }; 236 | 237 | } //namespace numerical_expression_normalizer 238 | 239 | #endif //NORMALIZER_TEMPLATE_H_ 240 | -------------------------------------------------------------------------------- /src/normalizer_utility.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "normalizer_utility.hpp" 3 | #include 4 | 5 | namespace normalizer_utility { 6 | 7 | void NormalizedExpressionTemplate::set_original_expression_from_position(const pfi::data::string::ustring& utext){ 8 | original_expression = utext.substr(position_start, position_end - position_start); 9 | } 10 | 11 | pfi::data::string::ustring reverse_string(const pfi::data::string::ustring& ustr) { 12 | return pfi::data::string::ustring(ustr.rbegin(), ustr.rend()); 13 | } 14 | 15 | std::string reverse_string(const std::string& str) { 16 | pfi::data::string::ustring ustr = pfi::data::string::string_to_ustring(str); 17 | return pfi::data::string::ustring_to_string(reverse_string(ustr)); 18 | } 19 | 20 | void extract_after_string(const pfi::data::string::ustring& text, const int i, pfi::data::string::ustring& after_string) { 21 | after_string = text.substr(i, text.size() - i); 22 | } 23 | 
24 | void extract_before_string(const pfi::data::string::ustring& text, const int i, pfi::data::string::ustring& before_string) { 25 | before_string = text.substr(0, i); 26 | } 27 | 28 | void prefixSearch(const pfi::data::string::ustring& ustr, const ux::Map& patterns, int& matching_pattern_id) { 29 | /*patternsの中から、ustrのprefixになっているものを探索(複数ある場合は最長のもの) 30 | */ 31 | pfi::data::string::ustring ustr_shortened; 32 | shorten_place_holder_in_text(ustr, ustr_shortened); //ustrは数字が一字一字、「*」に変換されているので、patternsの表記と食い違っている。*を縮約する操作を行う 33 | std::string str = pfi::data::string::ustring_to_string(ustr_shortened); 34 | size_t retLen; 35 | int ret = patterns.prefixSearch(str.c_str(), str.size(), retLen, matching_pattern_id); 36 | if (ret == -1) matching_pattern_id = -1; 37 | } 38 | 39 | void suffixSearch(const pfi::data::string::ustring& ustr, const ux::Map& patterns_rev, int& matching_pattern_id) { 40 | /*patternsの中から、ustrのsuffixになっているものを探索(複数ある場合は最長のもの) 41 | あらかじめpatternsの文字列を逆にしたものを保管しておき(patterns_rev)、ustrも逆にしてしまい、その状態でprefixSearchを行った結果を返す 42 | */ 43 | pfi::data::string::ustring ustr_shortened; 44 | shorten_place_holder_in_text(ustr, ustr_shortened); //ustrは数字が一字一字、「*」に変換されているので、patternsの表記と食い違っている。*を縮約する操作を行う 45 | pfi::data::string::ustring ustr_rev = reverse_string(ustr_shortened); 46 | std::string str_rev = pfi::data::string::ustring_to_string(ustr_rev); 47 | size_t retLen; 48 | int ret = patterns_rev.prefixSearch(str_rev.c_str(), str_rev.size(), retLen, matching_pattern_id); 49 | if (ret == -1) matching_pattern_id = -1; 50 | } 51 | 52 | void search_suffix_number_modifier(const pfi::data::string::ustring& text, const int exp_position_end, 53 | const ux::Map& suffix_number_modifier_patterns, int& matching_pattern_id) { 54 | pfi::data::string::ustring string_after_expression; 55 | extract_after_string(text, exp_position_end, string_after_expression); 56 | prefixSearch(string_after_expression, suffix_number_modifier_patterns, matching_pattern_id); 57 | } 58 | 59 | void 
search_prefix_number_modifier(const pfi::data::string::ustring& text, const int exp_position_start, 60 | const ux::Map& prefix_number_modifier_patterns, int& matching_pattern_id) { 61 | pfi::data::string::ustring string_before_expression; 62 | extract_before_string(text, exp_position_start, string_before_expression); 63 | suffixSearch(string_before_expression, prefix_number_modifier_patterns, matching_pattern_id); 64 | } 65 | 66 | void replace_numbers_in_text(const pfi::data::string::ustring& utext, const std::vector& numbers, 67 | pfi::data::string::ustring& utext_replaced) { 68 | //「1989年7月」 -> 「****年*月」のように数の部分を置き換える(正規表現で一致させるため) 69 | utext_replaced = utext; 70 | for (int i = 0; i < static_cast(numbers.size()); i++) { 71 | std::fill(utext_replaced.begin() + numbers[i].position_start, utext_replaced.begin() + numbers[i].position_end, PLACE_HOLDER[0]); 72 | } 73 | } 74 | 75 | void shorten_place_holder_in_text(const pfi::data::string::ustring& utext, pfi::data::string::ustring& utext_shortened) { 76 | //「****年*月」 -> 「*年*月」のように数の部分を縮約する(uxのprefixSearchで一致させるため) 77 | utext_shortened.clear(); 78 | bool prev_is_place_holder = false; 79 | for (int i = 0; i < static_cast(utext.size()); i++) { 80 | if (utext[i] == PLACE_HOLDER[0]) { 81 | if (prev_is_place_holder) { 82 | ; 83 | } else { 84 | utext_shortened += PLACE_HOLDER; 85 | prev_is_place_holder = true; 86 | } 87 | } else { 88 | utext_shortened += utext[i]; 89 | prev_is_place_holder = false; 90 | } 91 | } 92 | } 93 | 94 | void build_number_modifier_patterns_from_number_modifiers(const std::vector& number_modifiers, 95 | ux::Map& number_modifier_patterns) { 96 | //patternでの探索を可能にするため、トライ木を構築する。 97 | std::vector > kvs; 98 | for (int i = 0; i < static_cast(number_modifiers.size()); i++) { 99 | kvs.push_back(std::make_pair(number_modifiers[i].pattern, i)); 100 | } 101 | number_modifier_patterns.build(kvs); 102 | } 103 | 104 | bool is_place_holder(pfi::data::string::uchar uc) { 105 | return uc == 
normalizer_utility::PLACE_HOLDER[0]; 106 | } 107 | 108 | bool is_finite(double value){ 109 | return value != INFINITY and value != -INFINITY; 110 | } 111 | 112 | bool is_null_time(const Time& t){ 113 | Time positive_inf(INFINITY), negative_inf(-INFINITY); 114 | return (positive_inf == t) || (negative_inf == t); 115 | } 116 | 117 | const std::string identify_time_detail(const normalizer_utility::Time& time) { 118 | if (normalizer_utility::is_finite(time.second)) { 119 | return "s"; 120 | } else if (normalizer_utility::is_finite(time.minute)) { 121 | return "mn"; 122 | } else if (normalizer_utility::is_finite(time.hour)) { 123 | return "h"; 124 | } else if (normalizer_utility::is_finite(time.day)) { 125 | return "d"; 126 | } else if (normalizer_utility::is_finite(time.month)) { 127 | return "m"; 128 | } else if (normalizer_utility::is_finite(time.year)) { 129 | return "y"; 130 | } 131 | return ""; 132 | } 133 | 134 | void LimitedExpressionTemplate::set_total_number_of_place_holder() { 135 | pfi::data::string::ustring ustr_pattern = pfi::data::string::string_to_ustring(pattern); 136 | total_number_of_place_holder = static_cast(count_if(ustr_pattern.begin(), ustr_pattern.end(), normalizer_utility::is_place_holder)); 137 | } 138 | 139 | void LimitedExpressionTemplate::set_length_of_strings_after_final_place_holder() { 140 | pfi::data::string::ustring ustr_pattern = pfi::data::string::string_to_ustring(pattern); 141 | length_of_strings_after_final_place_holder = ustr_pattern.size() - ustr_pattern.rfind(normalizer_utility::PLACE_HOLDER[0]) - 1; 142 | } 143 | } //normalizer_utility 144 | 145 | -------------------------------------------------------------------------------- /src/normalizer_utility.hpp: -------------------------------------------------------------------------------- 1 | #ifndef NORMALIZER_UTILITY_H_ 2 | #define NORMALIZER_UTILITY_H_ 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "digit_utility.hpp" 9 | 10 | namespace 
// A time value broken down into six double fields. The formatting helpers
// treat infinities specially, depending on which end of a range the value is:
//  - upper bound: -INFINITY means "field not set", +INFINITY means "unbounded"
//  - lower bound: +INFINITY means "field not set", -INFINITY means "unbounded"
class Time {
public:
	// Set every field to the same value.
	Time(const double value)
		: year(value), month(value), day(value), hour(value), minute(value), second(value) {
	}
	Time(const double &year, const double &month, const double &day, const double &hour, const double &minute, const double &second)
		: year(year),
		  month(month),
		  day(day),
		  hour(hour),
		  minute(minute),
		  second(second) {
	}

	// Field-by-field equality. Const so that it also works on const objects.
	bool operator==(const Time& t) const {
		return t.year == year &&
			t.month == month &&
			t.day == day &&
			t.hour == hour &&
			t.minute == minute &&
			t.second == second;
	}

	// True when t is the "field not set" marker for the given bound.
	bool is_null_time_element(double t, bool is_upperbound) const {
		return is_upperbound ? (t == -INFINITY) : (t == INFINITY);
	}

	// True when t is the "unbounded" marker for the given bound.
	bool is_infinity_time_element(double t, bool is_upperbound) const {
		return is_upperbound ? (t == INFINITY) : (t == -INFINITY);
	}

	// Format one field zero-padded to width and followed by kugiri (the
	// separator); an unset field yields null_string + kugiri instead.
	std::string to_string_from_time_element(double t, std::string null_string, std::string kugiri, bool is_upperbound, int width) const {
		if (is_null_time_element(t, is_upperbound)) {
			return null_string + kugiri;
		}
		std::stringstream ss;
		std::string ret;
		ss.fill('0');
		ss.width(width); // applies to the number only; width resets after one insertion
		ss << t << kugiri;
		ss >> ret;
		return ret;
	}

	// Format one field followed by its unit letter; an unset field yields "".
	std::string to_interval_string_from_time_element(double t, std::string time_position, bool is_upperbound) const {
		if (is_null_time_element(t, is_upperbound)) {
			return "";
		}
		std::stringstream ss;
		std::string ret;
		ss << t << time_position;
		ss >> ret;
		return ret;
	}

	// Time-of-day form when year, month and day are all unset, date form otherwise.
	std::string to_string(bool is_upperbound) const {
		const bool no_date = is_null_time_element(year, is_upperbound)
			&& is_null_time_element(month, is_upperbound)
			&& is_null_time_element(day, is_upperbound);
		return no_date ? to_time_string(is_upperbound) : to_date_string(is_upperbound);
	}

	// "YYYY-MM-DD" with X's for unset fields; "INF"/"-INF" when year is unbounded.
	std::string to_date_string(bool is_upperbound) const {
		if (is_infinity_time_element(year, is_upperbound)) {
			return is_upperbound ? "INF" : "-INF";
		}
		return to_string_from_time_element(year, "XXXX", "-", is_upperbound, 4)
			+ to_string_from_time_element(month, "XX", "-", is_upperbound, 2)
			+ to_string_from_time_element(day, "XX", "", is_upperbound, 2);
	}

	// "hh:mm:ss" with X's for unset fields; "INF"/"-INF" when year is unbounded.
	std::string to_time_string(bool is_upperbound) const {
		if (is_infinity_time_element(year, is_upperbound)) {
			return is_upperbound ? "INF" : "-INF";
		}
		return to_string_from_time_element(hour, "XX", ":", is_upperbound, 2)
			+ to_string_from_time_element(minute, "XX", ":", is_upperbound, 2)
			+ to_string_from_time_element(second, "XX", "", is_upperbound, 2);
	}

	// "P" followed by every set field with its unit (Y, M, D, h, m, s);
	// "INF"/"-INF" when year is unbounded.
	std::string to_duration_string(bool is_upperbound) const {
		if (is_infinity_time_element(year, is_upperbound)) {
			return is_upperbound ? "INF" : "-INF";
		}
		return std::string("P")
			+ to_interval_string_from_time_element(year, "Y", is_upperbound)
			+ to_interval_string_from_time_element(month, "M", is_upperbound)
			+ to_interval_string_from_time_element(day, "D", is_upperbound)
			+ to_interval_string_from_time_element(hour, "h", is_upperbound)
			+ to_interval_string_from_time_element(minute, "m", is_upperbound)
			+ to_interval_string_from_time_element(second, "s", is_upperbound);
	}

	double year, month, day, hour, minute, second;
};
| : original_expression(original_expression), 130 | position_start(position_start), 131 | position_end(position_end), 132 | number_notation_type(NOT_NUMBER), 133 | include_lowerbound(true), 134 | include_upperbound(true), 135 | is_over(false), 136 | is_less(false), 137 | ordinary(false) { 138 | options.clear(); 139 | } 140 | 141 | void set_original_expression_from_position(const pfi::data::string::ustring& utext); 142 | 143 | pfi::data::string::ustring original_expression; 144 | int position_start, position_end; 145 | int number_notation_type; 146 | bool include_lowerbound, include_upperbound; 147 | bool is_over, is_less; 148 | bool ordinary; 149 | std::vector options; 150 | }; 151 | 152 | class LimitedExpressionTemplate { 153 | public: 154 | void set_total_number_of_place_holder(); 155 | void set_length_of_strings_after_final_place_holder(); 156 | 157 | std::string pattern; 158 | bool ordinary; 159 | std::string option; 160 | int total_number_of_place_holder; //patternが含むPLACE_HOLDERの数( *月*日 -> 2個) 161 | int length_of_strings_after_final_place_holder; //pattern中の最後のPLACE_HOLDERの後に続く文字列の長さ(*月*日 -> 1) positionの同定に必要 162 | }; 163 | 164 | struct NumberModifier { 165 | template 166 | void serialize(Archive &ar) { 167 | ar & MEMBER(pattern)& MEMBER(process_type); 168 | } 169 | std::string pattern, process_type; 170 | }; 171 | 172 | void extract_after_string(const pfi::data::string::ustring& text, int i, pfi::data::string::ustring& after_string); 173 | void extract_before_string(const pfi::data::string::ustring& text, int i, pfi::data::string::ustring& before_string); 174 | void prefixSearch(const pfi::data::string::ustring& ustr, const ux::Map& patterns, int& matching_pattern_id); 175 | void suffixSearch(const pfi::data::string::ustring& ustr, const ux::Map& patterns_rev, int& matching_pattern_id); 176 | void search_suffix_number_modifier(const pfi::data::string::ustring& text, const int exp_position_end, 177 | const ux::Map& suffix_number_modifier_patterns, int& 
// Convert a to type T2 by writing it to a string stream and reading it back
// into b. Reading stops at the first whitespace, as with any operator>>.
template <class T1, class T2>
void cast(const T1& a, T2& b) {
	std::stringstream ss;
	ss << a;
	ss >> b;
}
| kvs.push_back(make_pair("えお", 7)); 28 | kvs.push_back(make_pair("いうえおあ", 8)); 29 | uxm.build(kvs); 30 | 31 | for (int i = 0; i < static_cast(kvs.size()); i++) { 32 | string str_rev = ustring_to_string(rev(string_to_ustring(kvs[i].first))); 33 | kvs_rev.push_back(make_pair(str_rev, kvs[i].second)); 34 | } 35 | uxm_rev.build(kvs_rev); 36 | } 37 | 38 | void TearDown() { 39 | } 40 | }; 41 | 42 | TEST_F(NormalizerUtilityTest, prefixSearch) { 43 | ustring ustr(string_to_ustring("あいうえお")); 44 | int matching_pattern_id; 45 | prefixSearch(ustr, uxm, matching_pattern_id); 46 | EXPECT_EQ(3, matching_pattern_id); // ("あいう", 3) 47 | } 48 | 49 | TEST_F(NormalizerUtilityTest, prefixSearch2) { 50 | ustring ustr(string_to_ustring("いうえおあいうえお")); 51 | int matching_pattern_id; 52 | prefixSearch(ustr, uxm, matching_pattern_id); 53 | EXPECT_EQ(8, matching_pattern_id); // ("いうえおあ", 8) 54 | } 55 | 56 | TEST_F(NormalizerUtilityTest, suffixSearch) { 57 | ustring ustr(string_to_ustring("あいうえお")); 58 | int matching_pattern_id; 59 | suffixSearch(ustr, uxm_rev, matching_pattern_id); 60 | EXPECT_EQ(6, matching_pattern_id); // ("うえお", 6) 61 | } 62 | 63 | TEST_F(NormalizerUtilityTest, suffixSearch2) { 64 | ustring ustr(string_to_ustring("あいうえおあ")); 65 | int matching_pattern_id; 66 | suffixSearch(ustr, uxm_rev, matching_pattern_id); 67 | EXPECT_EQ(8, matching_pattern_id); // ("いうえおあ", 8) 68 | } 69 | 70 | TEST_F(NormalizerUtilityTest, extract_after_string) { 71 | ustring text(string_to_ustring("それは秒速5センチメートルくらいで進む")); 72 | ustring str; 73 | extract_after_string(text, 6, str); 74 | EXPECT_EQ("センチメートルくらいで進む", ustring_to_string(str)); 75 | } 76 | 77 | TEST_F(NormalizerUtilityTest, extract_before_string) { 78 | ustring text(string_to_ustring("それは秒速5センチメートルくらいで進む")); 79 | ustring str; 80 | extract_before_string(text, 5, str); 81 | EXPECT_EQ("それは秒速", ustring_to_string(str)); 82 | } 83 | 84 | TEST_F(NormalizerUtilityTest, seach_suffix) { 85 | ustring text(string_to_ustring("あいうえおあ5あいうえおごごごごご")); 86 | int 
matching_pattern_id; 87 | search_suffix_number_modifier(text, 7, uxm, matching_pattern_id); 88 | EXPECT_EQ(3, matching_pattern_id); 89 | } 90 | 91 | TEST_F(NormalizerUtilityTest, search_prefix) { 92 | ustring text(string_to_ustring("あいうえおあ5あいうえおごごごごご")); 93 | int matching_pattern_id; 94 | search_prefix_number_modifier(text, 6, uxm_rev, matching_pattern_id); 95 | EXPECT_EQ(8, matching_pattern_id); 96 | } 97 | 98 | TEST_F(NormalizerUtilityTest, replace_numbers_in_text) { 99 | ustring text(string_to_ustring("その30人がそれは三十五人でボボボ")), text_replaced; 100 | vector numbers; 101 | ustring exp1(string_to_ustring("30人")), exp2(string_to_ustring("三十五人")); 102 | numbers.push_back(Number(exp1, 2, 4)); 103 | numbers.push_back(Number(exp2, 9, 12)); 104 | replace_numbers_in_text(text, numbers, text_replaced); 105 | EXPECT_EQ(string_to_ustring("そのǂǂ人がそれはǂǂǂ人でボボボ") , text_replaced); 106 | } 107 | 108 | TEST_F(NormalizerUtilityTest, shorten_place_holder_in_text) { 109 | ustring text(string_to_ustring("そのǂǂ人がそれはǂǂǂǂǂǂ人でボボボǂǂǂ")), text_shortened; 110 | shorten_place_holder_in_text(text, text_shortened); 111 | EXPECT_EQ(string_to_ustring("そのǂ人がそれはǂ人でボボボǂ") , text_shortened); 112 | } 113 | 114 | TEST_F(NormalizerUtilityTest, is_place_holder_true) { 115 | EXPECT_TRUE(is_place_holder(string_to_uchar("ǂ"))); 116 | } 117 | 118 | TEST_F(NormalizerUtilityTest, is_place_holder_false) { 119 | EXPECT_FALSE(is_place_holder(string_to_uchar("あ"))); 120 | } 121 | 122 | TEST_F(NormalizerUtilityTest, is_finite_false) { 123 | EXPECT_FALSE(is_finite(INFINITY)); 124 | } 125 | 126 | TEST_F(NormalizerUtilityTest, is_finite_true) { 127 | EXPECT_TRUE(is_finite(99999.0)); 128 | } 129 | 130 | TEST_F(NormalizerUtilityTest, is_null_time_true) { 131 | Time t(INFINITY); 132 | EXPECT_TRUE(is_null_time(t)); 133 | } 134 | 135 | TEST_F(NormalizerUtilityTest, is_null_time_false) { 136 | Time t(1); 137 | EXPECT_FALSE(is_null_time(t)); 138 | } 139 | 140 | TEST_F(NormalizerUtilityTest, identify_time_detail) { 141 | Time 
t(1,1,1,1,1,INFINITY); 142 | EXPECT_EQ(identify_time_detail(t), "mn"); 143 | } 144 | 145 | TEST_F(NormalizerUtilityTest, reverse_string) { 146 | string str("aiueo"); 147 | EXPECT_EQ(reverse_string(str), "oeuia"); 148 | } -------------------------------------------------------------------------------- /src/number_normalizer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef NUMBER_NORMALIZER_H_ 2 | #define NUMBER_NORMALIZER_H_ 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "digit_utility.hpp" 9 | 10 | namespace number_normalizer { 11 | using namespace digit_utility; 12 | 13 | 14 | class NumberExtractor { 15 | public: 16 | void extract_number(const std::string& input, std::vector& output); 17 | private: 18 | bool is_invalid_notation_type(int notation_type); 19 | void return_longest_number_strings(const pfi::data::string::ustring& utext, int &i, std::string& numstr); 20 | }; 21 | 22 | 23 | 24 | class NumberConverterTemplate { 25 | public: 26 | void convert_number(const pfi::data::string::ustring& number_string_org, double& value, int& number_type); 27 | protected: 28 | virtual void convert_arabic_kansuji_mixed_of_4digit(const pfi::data::string::ustring& number_string, int& number_converted) = 0; 29 | void delete_comma(const pfi::data::string::ustring& ustr, pfi::data::string::ustring& ret); 30 | void convert_arabic_numerals(const pfi::data::string::ustring& number_string, double& value); 31 | void convert_arabic_kansuji_kurai_man_mixed(const pfi::data::string::ustring& number_string, double& value); 32 | void convert_arabic_kansuji_mixed(const pfi::data::string::ustring& number_string, double& value); 33 | }; 34 | 35 | class JapaneseNumberConverter : public NumberConverterTemplate{ 36 | private: 37 | void convert_arabic_kansuji_mixed_of_4digit(const pfi::data::string::ustring& number_string, int& number_converted); 38 | }; 39 | 40 | class ChineseNumberConverter : public 
NumberConverterTemplate{ 41 | private: 42 | void convert_arabic_kansuji_mixed_of_4digit(const pfi::data::string::ustring& number_string, int& number_converted); 43 | }; 44 | 45 | class ArabicNumberConverter : public NumberConverterTemplate{ 46 | public: 47 | void convert_number(const pfi::data::string::ustring& number_string_org, double& value, int& number_type); 48 | private: 49 | void convert_arabic_kansuji_mixed_of_4digit(const pfi::data::string::ustring& number_string, int& number_converted); 50 | }; 51 | 52 | 53 | 54 | class SymbolFixer { 55 | public: 56 | void fix_numbers_by_symbol(const std::string& text, std::vector& numbers); 57 | private: 58 | bool is_plus(const pfi::data::string::ustring& utext, int i, pfi::data::string::ustring& plus_strings); 59 | bool is_minus(const pfi::data::string::ustring& utext, int i, pfi::data::string::ustring& plus_strings); 60 | void fix_prefix_symbol(const pfi::data::string::ustring& utext, std::vector& numbers, int i); 61 | double create_decimal_value(const Number& number); 62 | void fix_decimal_point(std::vector& numbers, int i, pfi::data::string::ustring decimal_strings); 63 | void fix_range_expression(std::vector& numbers, int i, pfi::data::string::ustring range_strings); 64 | void fix_intermediate_symbol(const pfi::data::string::ustring& utext, std::vector& numbers, int i); 65 | void fix_suffix_symbol(const pfi::data::string::ustring& utext, std::vector& numbers, int i); 66 | }; 67 | 68 | 69 | 70 | class NumberNormalizer { 71 | public: 72 | NumberNormalizer(const std::string& language) {language_ = language; digit_utility::init_kansuji(language); } 73 | ; 74 | void process(const std::string& input, std::vector& output); 75 | void process_dont_fix_by_symbol(const std::string& input, std::vector& output); //絶対時間表現の規格化の際に使用する(絶対時間表現では、前もって記号を処理させないため) 76 | 77 | private: 78 | std::string language_; 79 | }; 80 | 81 | } //namespace number_normalizer 82 | 83 | #endif //NUMBER_NORMALIZER_H_ 84 | 
-------------------------------------------------------------------------------- /src/numerical_expression_extractor.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/src/numerical_expression_extractor.pyc -------------------------------------------------------------------------------- /src/numerical_expression_normalizer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "numerical_expression_normalizer.hpp" 3 | #include "digit_utility.hpp" 4 | #include "number_normalizer.hpp" 5 | 6 | namespace numerical_expression_normalizer{ 7 | 8 | void NumericalExpressionNormalizer::init(){ 9 | load_from_dictionaries("num_counter_json.txt", "num_prefix_counter_json.txt", "num_prefix_json.txt", "num_suffix_json.txt"); 10 | } 11 | 12 | void NumericalExpressionNormalizer::normalize_number(const std::string& text, std::vector& numbers) { 13 | NN.process(text, numbers); 14 | } 15 | 16 | void multiply_numexp_value(NumericalExpression& numexp, double x){ 17 | numexp.value_lowerbound *= x; 18 | numexp.value_upperbound *= x; 19 | } 20 | 21 | 22 | void do_option_wari(std::vector& numexps, int expression_id, const Counter matching_limited_expression){ 23 | pfi::data::string::ustring upattern = pfi::data::string::string_to_ustring(matching_limited_expression.pattern); 24 | numexps[expression_id].position_end += upattern.size(); 25 | numexps[expression_id].counter = pfi::data::string::string_to_ustring("%"); 26 | numexps[expression_id].ordinary = false; 27 | 28 | //set_value 29 | double value = 0; 30 | for(int i=0; i(upattern.size()); i+=2){ 31 | if(upattern[i] == pfi::data::string::string_to_ustring("割")[0]){ 32 | value += numexps[expression_id + i/2].value_lowerbound * 10; 33 | }else if(upattern[i] == pfi::data::string::string_to_ustring("分")[0]){ 34 | value += numexps[expression_id + 
i/2].value_lowerbound * 1; 35 | }else if(upattern[i] == pfi::data::string::string_to_ustring("厘")[0]){ 36 | value += numexps[expression_id + i/2].value_lowerbound * 0.1; 37 | } 38 | } 39 | numexps[expression_id].value_lowerbound = value; 40 | numexps[expression_id].value_upperbound = value; 41 | 42 | //erase merged numexps 43 | for(int i=2; i(upattern.size()); i+=2){ 44 | numexps.erase(numexps.begin() + expression_id + 1); 45 | } 46 | } 47 | 48 | void NumericalExpressionNormalizer::revise_any_type_expression_by_matching_limited_expression(std::vector& numexps, int& expression_id, const Counter matching_limited_expression){ 49 | //特殊なタイプをここで例外処理 50 | if(matching_limited_expression.option == "wari"){ 51 | do_option_wari(numexps, expression_id, matching_limited_expression); 52 | return; 53 | } 54 | //TODO : 今のところ特殊なタイプは分数しかないので、とりあえず保留 55 | 56 | numexps[expression_id].position_end += pfi::data::string::string_to_ustring(matching_limited_expression.pattern).size(); 57 | numexps[expression_id].counter = pfi::data::string::string_to_ustring(matching_limited_expression.counter); 58 | multiply_numexp_value(numexps[expression_id], pow(10, matching_limited_expression.SI_prefix)); 59 | multiply_numexp_value(numexps[expression_id], pow(10, matching_limited_expression.optional_power_of_ten)); 60 | numexps[expression_id].ordinary = matching_limited_expression.ordinary; 61 | } 62 | 63 | void NumericalExpressionNormalizer::revise_any_type_expression_by_matching_prefix_counter(NumericalExpression& numexp, const Counter& matching_limited_expression){ 64 | if(matching_limited_expression.option == "counter"){ 65 | numexp.position_start -= pfi::data::string::string_to_ustring(matching_limited_expression.pattern).size(); 66 | numexp.counter = pfi::data::string::string_to_ustring(matching_limited_expression.counter); 67 | multiply_numexp_value(numexp, pow(10, matching_limited_expression.SI_prefix)); 68 | multiply_numexp_value(numexp, pow(10, 
matching_limited_expression.optional_power_of_ten)); 69 | numexp.ordinary = matching_limited_expression.ordinary; 70 | }else if(matching_limited_expression.option == "add_suffix_counter"){ 71 | if(numexp.counter.empty()) return; //TODO : 単位が空の場合、追加は行わない? 72 | numexp.position_start -= pfi::data::string::string_to_ustring(matching_limited_expression.pattern).size(); 73 | numexp.counter += pfi::data::string::string_to_ustring(matching_limited_expression.counter); 74 | } 75 | } 76 | 77 | void NumericalExpressionNormalizer::revise_any_type_expression_by_number_modifier(NumericalExpression& numexp, const normalizer_utility::NumberModifier& number_modifier){ 78 | std::string process_type = number_modifier.process_type; 79 | /* 「約」などのNumberModifierの処理を行う。 80 | */ 81 | if(process_type == "or_over"){ 82 | numexp.value_upperbound = INFINITY; 83 | }else if(process_type == "or_less"){ 84 | numexp.value_lowerbound = -INFINITY; 85 | }else if(process_type == "over"){ 86 | numexp.value_upperbound = INFINITY; 87 | numexp.include_lowerbound = false; 88 | }else if(process_type == "less"){ 89 | numexp.value_lowerbound = -INFINITY; 90 | numexp.include_upperbound = false; 91 | }else if(process_type == "dai"){ 92 | //TODO : どんな処理をするか未定。。 該当する事例は「30代」「9秒台」のみ? 
93 | }else if(process_type == "ordinary"){ 94 | numexp.ordinary = true; 95 | }else if(process_type == "han"){ 96 | numexp.value_lowerbound += 0.5; 97 | numexp.value_upperbound += 0.5; 98 | }else if(process_type[0] == '/'){ // /hour, /minなど 99 | numexp.counter += pfi::data::string::string_to_ustring(process_type); 100 | }else if(process_type == "none"){ 101 | ; 102 | }else if(process_type == "per"){ 103 | // TODO : どんな処理をするか未定。 該当する事例は「1ページ毎」など。 104 | }else if(process_type == "about"){ 105 | numexp.value_lowerbound *= 0.7; 106 | numexp.value_upperbound *= 1.3; 107 | }else if(process_type == "kyou"){ 108 | numexp.value_upperbound *= 1.6; 109 | }else if(process_type == "jaku"){ 110 | numexp.value_lowerbound *= 0.5; 111 | }else if(process_type == "made"){ 112 | if(numexp.value_lowerbound == numexp.value_upperbound) { 113 | numexp.value_lowerbound = -INFINITY; 114 | }else{ 115 | ; 116 | } 117 | }else { 118 | numexp.options.push_back(process_type); 119 | } 120 | } 121 | 122 | void NumericalExpressionNormalizer::delete_not_any_type_expression(std::vector& numexps){ 123 | for(int i=0; i(numexps.size()); i++){ 124 | if(numexps[i].counter.empty()){ 125 | numexps.erase(numexps.begin() + i); 126 | i--; 127 | } 128 | } 129 | } 130 | 131 | void delete_after_slash(pfi::data::string::ustring& ustr){ 132 | if(ustr.find(pfi::data::string::string_to_uchar("/")) == pfi::data::string::ustring::npos) return; 133 | ustr = ustr.substr(0, ustr.find(pfi::data::string::string_to_uchar("/"))); 134 | } 135 | 136 | bool suffix_match_counter(pfi::data::string::ustring counter1, pfi::data::string::ustring counter2){ 137 | //単位が一致しているかどうかを判断する。 138 | //「時速50km〜60km」のような事例に対応する(前者は[50km/h], 後者は[60km]と規格化されており、完全一致ではマッチしない)ために、スラッシュより前の単位が一致するかどうかで判断する 139 | delete_after_slash(counter1); 140 | delete_after_slash(counter2); 141 | return counter1 == counter2; 142 | } 143 | 144 | void NumericalExpressionNormalizer::fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector& numexps) { 
145 | for(int i=0; i(numexps.size()-1); i++){ 146 | if(have_kara_suffix(numexps[i].options) && have_kara_prefix(numexps[i+1].options) && numexps[i].position_end +2 >= numexps[i+1].position_start){ 147 | if(!suffix_match_counter(numexps[i].counter, numexps[i+1].counter)) continue; 148 | numexps[i].value_upperbound = numexps[i+1].value_upperbound; 149 | numexps[i].position_end = numexps[i+1].position_end; 150 | numexps[i].set_original_expression_from_position(utext); 151 | //memo :単位のマージは、必ずiの方がi+1よりも長いので、する必要なし 152 | merge_options(numexps[i].options, numexps[i+1].options); 153 | numexps.erase(numexps.begin()+i+1); 154 | } 155 | } 156 | } 157 | 158 | } //namespace numerical_expression_normalizer 159 | 160 | 161 | -------------------------------------------------------------------------------- /src/numerical_expression_normalizer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef NUMERICAL_EXPRESSION_NORMALIZER_H_ 2 | #define NUMERICAL_EXPRESSION_NORMALIZER_H_ 3 | #include 4 | #include 5 | #include "digit_utility.hpp" 6 | #include "number_normalizer.hpp" 7 | #include "normalizer_utility.hpp" 8 | #include "normalizer_template.hpp" 9 | 10 | namespace numerical_expression_normalizer{ 11 | 12 | struct NumericalExpression : normalizer_utility::NormalizedExpressionTemplate{ 13 | NumericalExpression(const pfi::data::string::ustring& original_expression, // TODO : 実装方針が変わったので、この初期化リストはテストでしか用いていない。テストを変更して、これは削る。 14 | const int position_start, 15 | const int position_end, 16 | const double value_lowerbound, 17 | const double value_upperbound) 18 | : normalizer_utility::NormalizedExpressionTemplate(original_expression, position_start, position_end), 19 | value_lowerbound(value_lowerbound), 20 | value_upperbound(value_upperbound), 21 | counter(pfi::data::string::string_to_ustring("")), 22 | ordinary(false) 23 | {} 24 | 25 | NumericalExpression(digit_utility::Number number) 26 | : 
normalizer_utility::NormalizedExpressionTemplate(number.original_expression, number.position_start, number.position_end), 27 | value_lowerbound(number.value_lowerbound), 28 | value_upperbound(number.value_upperbound), 29 | counter(pfi::data::string::string_to_ustring("")), 30 | ordinary(false) 31 | {} 32 | 33 | double value_lowerbound, value_upperbound; 34 | pfi::data::string::ustring counter; 35 | bool ordinary; 36 | }; 37 | 38 | 39 | struct Counter : public normalizer_utility::LimitedExpressionTemplate{ 40 | template 41 | void serialize(Archive &ar){ 42 | ar & MEMBER(pattern) & MEMBER(counter) & MEMBER(SI_prefix) & MEMBER(optional_power_of_ten) & MEMBER(ordinary) & MEMBER(option); 43 | } 44 | 45 | std::string counter; 46 | int SI_prefix; //「キロ」「ミリ」などの表記に使用 47 | int optional_power_of_ten; //「トン」のような特殊な表記(数に10^4を乗算する必要がある)の時に使用 48 | }; 49 | 50 | 51 | class NumericalExpressionNormalizer : public normalizer_template::NormalizerTemplate{ 52 | public: 53 | NumericalExpressionNormalizer(const std::string& language) : NN(language) { language_ = language; init(); } 54 | 55 | private: 56 | void init(); 57 | void normalize_number(const std::string& text, std::vector& numbers); 58 | void revise_any_type_expression_by_matching_limited_expression(std::vector& numexps, int& expression_id, Counter matching_limited_expression); 59 | void revise_any_type_expression_by_matching_prefix_counter(NumericalExpression& numexps, const Counter& matching_limited_expression); 60 | void revise_any_type_expression_by_number_modifier(NumericalExpression& numexp, const normalizer_utility::NumberModifier& number_modifier); 61 | void delete_not_any_type_expression(std::vector& numexps); 62 | void fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector& numexps); 63 | 64 | number_normalizer::NumberNormalizer NN; 65 | }; 66 | 67 | } //namespace numerical_expression_normalizer 68 | 69 | #endif //NUMERICAL_EXPRESSION_NORMALIZER_H_ 70 | 
-------------------------------------------------------------------------------- /src/numerical_expression_normalizer_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "normalizer_utility.hpp" 5 | #include "numerical_expression_normalizer.hpp" 6 | 7 | #include 8 | #include 9 | 10 | using namespace normalizer_utility; 11 | using namespace std; 12 | using namespace pfi::data::string; 13 | using namespace numerical_expression_normalizer; 14 | class NumexpNormalizerTest : public testing::Test { 15 | public: 16 | void SetUp() {} 17 | void TearDown() {} 18 | }; 19 | 20 | bool is_same_numexp(const NumericalExpression &n1, const NumericalExpression &n2){ 21 | return 22 | n1.original_expression==n2.original_expression && 23 | n1.position_start==n2.position_start && 24 | n1.position_end==n2.position_end && 25 | n1.value_lowerbound==n2.value_lowerbound && 26 | n1.value_upperbound==n2.value_upperbound && 27 | n1.counter==n2.counter; 28 | } 29 | 30 | TEST_F(NumexpNormalizerTest, simple1) { 31 | NumericalExpressionNormalizer NEN("ja"); 32 | std::string text("その三人が死んだ"); 33 | std::vector numexps; 34 | NEN.process(text, numexps); 35 | NumericalExpression ex(string_to_ustring("三人"), 2, 4, 3, 3); 36 | ex.counter = string_to_ustring("人"); 37 | 38 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 39 | } 40 | 41 | TEST_F(NumexpNormalizerTest, simple2) { 42 | NumericalExpressionNormalizer NEN("ja"); 43 | std::string text("3kgのレッドブルと、2USドルのモンスター"); 44 | std::vector numexps; 45 | NEN.process(text, numexps); 46 | NumericalExpression ex1(string_to_ustring("3kg"), 0, 3, 3000, 3000); 47 | ex1.counter = string_to_ustring("g"); 48 | NumericalExpression ex2(string_to_ustring("2USドル"),11, 16, 2, 2); 49 | ex2.counter = string_to_ustring("ドル"); 50 | EXPECT_TRUE(is_same_numexp(ex1, numexps[0])); 51 | EXPECT_TRUE(is_same_numexp(ex2, numexps[1])); 52 | } 53 | 54 | TEST_F(NumexpNormalizerTest, about1) { 55 | 
NumericalExpressionNormalizer NEN("ja"); 56 | std::string text("その約十人がぼぼぼぼ"); 57 | std::vector numexps; 58 | NEN.process(text, numexps); 59 | NumericalExpression ex(string_to_ustring("約十人"), 2, 5, 7, 13.0); 60 | ex.counter = string_to_ustring("人"); 61 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 62 | } 63 | 64 | TEST_F(NumexpNormalizerTest, about2) { 65 | NumericalExpressionNormalizer NEN("ja"); 66 | std::string text("そのおよそ十人がぼぼぼぼ"); 67 | std::vector numexps; 68 | NEN.process(text, numexps); 69 | NumericalExpression ex(string_to_ustring("およそ十人"), 2, 7, 7, 13.0); 70 | ex.counter = string_to_ustring("人"); 71 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 72 | } 73 | 74 | TEST_F(NumexpNormalizerTest, or_over) { 75 | NumericalExpressionNormalizer NEN("ja"); 76 | std::string text("その三人以上がぼぼぼぼ"); 77 | std::vector numexps; 78 | NEN.process(text, numexps); 79 | NumericalExpression ex(string_to_ustring("三人以上"), 2, 6, 3.0, INFINITY); 80 | ex.counter = string_to_ustring("人"); 81 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 82 | } 83 | 84 | TEST_F(NumexpNormalizerTest, about_and_or_over) { 85 | NumericalExpressionNormalizer NEN("ja"); 86 | std::string text("その約十人以上がぼぼぼぼ"); 87 | std::vector numexps; 88 | NEN.process(text, numexps); 89 | NumericalExpression ex(string_to_ustring("約十人以上"), 2, 7, 7.0, INFINITY); 90 | ex.counter = string_to_ustring("人"); 91 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 92 | } 93 | 94 | TEST_F(NumexpNormalizerTest, or_less) { 95 | NumericalExpressionNormalizer NEN("ja"); 96 | std::string text("その三人以下がぼぼぼぼ"); 97 | std::vector numexps; 98 | NEN.process(text, numexps); 99 | NumericalExpression ex(string_to_ustring("三人以下"), 2, 6, -INFINITY, 3); 100 | ex.counter = string_to_ustring("人"); 101 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 102 | } 103 | 104 | TEST_F(NumexpNormalizerTest, kyou) { 105 | NumericalExpressionNormalizer NEN("ja"); 106 | std::string text("レッドブルを10本強飲んだ"); 107 | std::vector numexps; 108 | NEN.process(text, numexps); 109 | 
NumericalExpression ex(string_to_ustring("10本強"), 6, 10, 10, 16); 110 | ex.counter = string_to_ustring("本"); 111 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 112 | } 113 | 114 | TEST_F(NumexpNormalizerTest, jaku) { 115 | NumericalExpressionNormalizer NEN("ja"); 116 | std::string text("レッドブルを10本弱飲んだ"); 117 | std::vector numexps; 118 | NEN.process(text, numexps); 119 | NumericalExpression ex(string_to_ustring("10本弱"), 6, 10, 5, 10); 120 | ex.counter = string_to_ustring("本"); 121 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 122 | } 123 | 124 | TEST_F(NumexpNormalizerTest, ordinary) { 125 | NumericalExpressionNormalizer NEN("ja"); 126 | std::string text("本日10本目のレッドブル"); 127 | std::vector numexps; 128 | NEN.process(text, numexps); 129 | NumericalExpression ex(string_to_ustring("10本目"), 2, 6, 10, 10); 130 | ex.counter = string_to_ustring("本"); 131 | ex.ordinary = true; 132 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 133 | } 134 | 135 | TEST_F(NumexpNormalizerTest, han) { 136 | NumericalExpressionNormalizer NEN("ja"); 137 | std::string text("レッドブルを1本半飲んだ"); 138 | std::vector numexps; 139 | NEN.process(text, numexps); 140 | NumericalExpression ex(string_to_ustring("1本半"), 6, 9, 1.5, 1.5); 141 | ex.counter = string_to_ustring("本"); 142 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 143 | } 144 | 145 | TEST_F(NumexpNormalizerTest, per) { 146 | NumericalExpressionNormalizer NEN("ja"); 147 | std::string text("1キロメートル/時"); 148 | std::vector numexps; 149 | NEN.process(text, numexps); 150 | NumericalExpression ex(string_to_ustring("1キロメートル/時"), 0, 9, 1000, 1000); 151 | ex.counter = string_to_ustring("m/h"); 152 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 153 | } 154 | 155 | TEST_F(NumexpNormalizerTest, prefix_counter1) { 156 | NumericalExpressionNormalizer NEN("ja"); 157 | std::string text("それは¥100だ"); 158 | std::vector numexps; 159 | NEN.process(text, numexps); 160 | NumericalExpression ex(string_to_ustring("¥100"), 3, 7, 100, 100); 161 | ex.counter = 
string_to_ustring("円"); 162 | ASSERT_EQ(1u, numexps.size()); 163 | 164 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 165 | } 166 | 167 | TEST_F(NumexpNormalizerTest, prefix_counter2) { 168 | NumericalExpressionNormalizer NEN("ja"); 169 | std::string text("それは時速40キロメートルだ"); 170 | std::vector numexps; 171 | NEN.process(text, numexps); 172 | NumericalExpression ex(string_to_ustring("時速40キロメートル"), 3, 13, 40000, 40000); 173 | ex.counter = string_to_ustring("m/h"); 174 | ASSERT_EQ(1u, numexps.size()); 175 | 176 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 177 | } 178 | 179 | TEST_F(NumexpNormalizerTest, range1) { 180 | NumericalExpressionNormalizer NEN("ja"); 181 | std::string text("このアトラクションは3人〜の運用になります"); 182 | std::vector numexps; 183 | NEN.process(text, numexps); 184 | NumericalExpression ex(string_to_ustring("3人〜"), 10, 13, 3, 3); 185 | ex.counter = string_to_ustring("人"); 186 | ASSERT_EQ(1u, numexps.size()); 187 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 188 | ASSERT_EQ(1u, numexps[0].options.size()); 189 | EXPECT_EQ(numexps[0].options[0], "kara_suffix"); 190 | } 191 | 192 | TEST_F(NumexpNormalizerTest, range2) { 193 | NumericalExpressionNormalizer NEN("ja"); 194 | std::string text("遊び方の欄には「〜8人」と書いてある"); 195 | std::vector numexps; 196 | NEN.process(text, numexps); 197 | NumericalExpression ex(string_to_ustring("〜8人"), 8, 11, 8, 8); 198 | ex.counter = string_to_ustring("人"); 199 | ASSERT_EQ(1u, numexps.size()); 200 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 201 | ASSERT_EQ(1u, numexps[0].options.size()); 202 | EXPECT_EQ(numexps[0].options[0], "kara_prefix"); 203 | } 204 | 205 | TEST_F(NumexpNormalizerTest, range3) { 206 | NumericalExpressionNormalizer NEN("ja"); 207 | std::string text("遊び方の欄には「5〜8人」と書いてある"); 208 | std::vector numexps; 209 | NEN.process(text, numexps); 210 | NumericalExpression ex(string_to_ustring("5〜8人"), 8, 12, 5, 8); 211 | ex.counter = string_to_ustring("人"); 212 | ASSERT_EQ(1u, numexps.size()); 213 | EXPECT_TRUE(is_same_numexp(ex, 
numexps[0])); 214 | } 215 | 216 | TEST_F(NumexpNormalizerTest, range4) { 217 | NumericalExpressionNormalizer NEN("ja"); 218 | std::string text("遊び方の欄には「5人〜8人」と書いてある"); 219 | std::vector numexps; 220 | NEN.process(text, numexps); 221 | NumericalExpression ex(string_to_ustring("5人〜8人"), 8, 13, 5, 8); 222 | ex.counter = string_to_ustring("人"); 223 | ASSERT_EQ(1u, numexps.size()); 224 | 225 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 226 | } 227 | 228 | TEST_F(NumexpNormalizerTest, range5) { 229 | NumericalExpressionNormalizer NEN("ja"); 230 | std::string text("時速50km〜60km"); 231 | std::vector numexps; 232 | NEN.process(text, numexps); 233 | NumericalExpression ex(string_to_ustring("時速50km〜60km"), 0, 11, 50000, 60000); 234 | ex.counter = string_to_ustring("m/h"); 235 | ASSERT_EQ(1u, numexps.size()); 236 | 237 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 238 | } 239 | 240 | TEST_F(NumexpNormalizerTest, range6) { 241 | NumericalExpressionNormalizer NEN("ja"); 242 | std::string text("時速50kmから時速60km"); 243 | std::vector numexps; 244 | NEN.process(text, numexps); 245 | NumericalExpression ex(string_to_ustring("時速50kmから時速60km"), 0, 14, 50000, 60000); 246 | ex.counter = string_to_ustring("m/h"); 247 | ASSERT_EQ(1u, numexps.size()); 248 | 249 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 250 | } 251 | 252 | TEST_F(NumexpNormalizerTest, range7) { 253 | NumericalExpressionNormalizer NEN("ja"); 254 | std::string text("時速50〜60km"); 255 | std::vector numexps; 256 | NEN.process(text, numexps); 257 | NumericalExpression ex(string_to_ustring("時速50〜60km"), 0, 9, 50000, 60000); 258 | ex.counter = string_to_ustring("m/h"); 259 | ASSERT_EQ(1u, numexps.size()); 260 | 261 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 262 | } 263 | 264 | TEST_F(NumexpNormalizerTest, range8) { 265 | NumericalExpressionNormalizer NEN("ja"); 266 | std::string text("世界50カ国から3000人が出席予定だ"); 267 | std::vector numexps; 268 | NEN.process(text, numexps); 269 | ASSERT_EQ(2u, numexps.size()); //単位が違うので、マージされない 
270 | } 271 | 272 | TEST_F(NumexpNormalizerTest, range9) { 273 | NumericalExpressionNormalizer NEN("ja"); 274 | std::string text("およそ時速50km〜60kmくらい"); 275 | std::vector numexps; 276 | NEN.process(text, numexps); 277 | NumericalExpression ex(string_to_ustring("およそ時速50km〜60kmくらい"), 0, 17, 35000, 78000); 278 | ex.counter = string_to_ustring("m/h"); 279 | ASSERT_EQ(1u, numexps.size()); 280 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 281 | } 282 | 283 | TEST_F(NumexpNormalizerTest, chinese1) { 284 | NumericalExpressionNormalizer NEN("zh"); 285 | std::string text("日本政府受清廷壓力,以千二百三元請孫中山離開日本。"); 286 | std::vector numexps; 287 | NEN.process(text, numexps); 288 | ASSERT_EQ(1u, numexps.size()); 289 | NumericalExpression ex(string_to_ustring("千二百三元"), 11, 16, 1230, 1230); 290 | ex.counter = string_to_ustring("元"); 291 | EXPECT_TRUE(is_same_numexp(ex, numexps[0])); 292 | 293 | } 294 | //"東京支部の三人" 295 | -------------------------------------------------------------------------------- /src/optparse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * An event-driven parser for command-line arguments. 3 | * 4 | * Copyright (c) 2004-2005 by Naoaki Okazaki 5 | * 6 | * This software is provided 'as-is', without any express or implied 7 | * warranty. In no event will the authors be held liable for any damages 8 | * arising from the use of this software. 9 | * 10 | * Permission is granted to anyone to use this software for any purpose, 11 | * including commercial applications, and to alter it and redistribute it 12 | * freely, subject to the following restrictions (known as zlib license): 13 | * 14 | * 1. The origin of this software must not be misrepresented; you must not 15 | * claim that you wrote the original software. If you use this software 16 | * in a product, an acknowledgment in the product documentation would be 17 | * appreciated but is not required. 18 | * 2. 
Altered source versions must be plainly marked as such, and must not be 19 | * misrepresented as being the original software. 20 | * 3. This notice may not be removed or altered from any source distribution. 21 | * 22 | * Naoaki Okazaki 23 | * 24 | */ 25 | 26 | /* $Id$ */ 27 | 28 | /* 29 | * Class 'optparse' implements a parser for GNU-style command-line arguments. 30 | * Inherit this class to define your own option variables and to implement an 31 | * option handler with macros, BEGIN_OPTION_MAP, ON_OPTION(_WITH_ARG), and 32 | * END_OPTION_MAP. Consult the sample program attached at the bottom of this 33 | * source code. 34 | * 35 | * This code was confirmed to be compiled with MSVC++ 2003 and gcc 3.3. 36 | * Define _BUILD_NCL_SAMPLE if you want to build a sample program. 37 | * $ g++ -D_BUILD_NCL_SAMPLE -xc++ optparse.h 38 | */ 39 | 40 | #ifndef __NCL_OPTPRASE_H__ 41 | #define __NCL_OPTPRASE_H__ 42 | 43 | #include 44 | #include 45 | #include 46 | #include 47 | 48 | 49 | #ifdef USE_NCL_NAMESPACE 50 | namespace ncl { 51 | #endif/*USE_NCL_NAMESPACE*/ 52 | 53 | 54 | /** 55 | * An event-driven parser for command-line arguments. 56 | * @author Naoaki Okazaki 57 | */ 58 | class optparse { 59 | public: 60 | /** 61 | * Exception class for unrecognized options. 62 | */ 63 | class unrecognized_option : public std::invalid_argument { 64 | public: 65 | unrecognized_option(char shortopt) 66 | : std::invalid_argument(std::string("-") + shortopt) {} 67 | unrecognized_option(const std::string& longopt) 68 | : std::invalid_argument(std::string("--") + longopt) {} 69 | }; 70 | /** 71 | * Exception class for invalid values. 72 | */ 73 | class invalid_value : public std::invalid_argument { 74 | public: 75 | invalid_value(const std::string& message) 76 | : std::invalid_argument(message) {} 77 | }; 78 | 79 | public: 80 | /** Construct. */ 81 | optparse() {} 82 | /** Destruct. */ 83 | virtual ~optparse() {} 84 | 85 | /** 86 | * Parse options.
87 | * @param argv array of null-terminated strings to be parsed (argv[0] is skipped) 88 | * @param num_argv specifies the number, in strings, of the array 89 | * @return the number of used arguments 90 | * @throws unrecognized_option 91 | */ 92 | int parse(char * const argv[], int num_argv) 93 | { 94 | int i; 95 | for (i = 1;i < num_argv;++i) { 96 | const char *token = argv[i]; 97 | if (*token++ == '-') { 98 | const char *next_token = (i+1 < num_argv) ? argv[i+1] : ""; 99 | if (!*token) { 100 | break; // only '-' was found. 101 | } else if (*token == '-') { 102 | const char *arg = std::strchr(++token, '='); 103 | if (arg) { 104 | arg++; 105 | } else { 106 | arg = next_token; 107 | } 108 | int ret = handle_option(0, token, arg); 109 | if (ret < 0) { 110 | throw unrecognized_option(token); 111 | } 112 | if (arg == next_token) { 113 | i += ret; 114 | } 115 | } else { 116 | char c; 117 | while ((c = *token++) != '\0') { 118 | const char *arg = *token ? token : next_token; 119 | int ret = handle_option(c, token, arg); 120 | if (ret < 0) { 121 | throw unrecognized_option(c); 122 | } 123 | if (ret > 0) { 124 | if (arg == token) { 125 | token = ""; 126 | } else { 127 | i++; 128 | } 129 | } 130 | } // while 131 | } // else (*token == '-') 132 | } else { 133 | break; // a non-option argument was found. 134 | } 135 | } // for (i) 136 | 137 | return i; 138 | } 139 | 140 | protected: 141 | /** 142 | * Option handler 143 | * This function should be overridden by inheritance class.
144 | * @param c short option character, 0 for long option 145 | * @param longname long option name 146 | * @param arg an argument for the option 147 | * @return 0 (success); 148 | 1 (success with use of an argument); 149 | -1 (failed, unrecognized option) 150 | * @throws option_parser_exception 151 | */ 152 | virtual int handle_option(char c, const char *longname, const char *arg) 153 | { 154 | return 0; 155 | } 156 | 157 | int __optstrcmp(const char *option, const char *longname) 158 | { 159 | const char *p = std::strchr(option, '='); 160 | return p ? 161 | std::strncmp(option, longname, p-option) : 162 | std::strcmp(option, longname); 163 | } 164 | }; 165 | 166 | 167 | /** The begin of inline option map. */ 168 | #define BEGIN_OPTION_MAP_INLINE() \ 169 | virtual int handle_option(char __c, const char *__longname, const char *arg) \ 170 | { \ 171 | int used_args = 0; \ 172 | if (0) { \ 173 | 174 | /** Declaration of option map. */ 175 | #define DEFINE_OPTION_MAP() \ 176 | virtual int handle_option(char __c, const char *__longname, const char *arg); 177 | 178 | /** Begin of option map implementation. 
*/ 179 | #define BEGIN_OPTION_MAP(_Class) \ 180 | int _Class::handle_option(char __c, const char *__longname, const char *arg) \ 181 | { \ 182 | int used_args = 0; \ 183 | if (0) { \ 184 | 185 | /** An entry of option map */ 186 | #define ON_OPTION(test) \ 187 | return used_args; \ 188 | } else if (test) { \ 189 | used_args = 0; \ 190 | 191 | #define ON_OPTION_WITH_ARG(test) \ 192 | return used_args; \ 193 | } else if (test) { \ 194 | used_args = 1; \ 195 | 196 | /** The end of option map implementation */ 197 | #define END_OPTION_MAP() \ 198 | return used_args; \ 199 | } \ 200 | return -1; \ 201 | } \ 202 | 203 | /** A predicator for short options */ 204 | #define SHORTOPT(x) (__c == x) 205 | /** A predicator for long options */ 206 | #define LONGOPT(x) (!__c && __optstrcmp(__longname, x) == 0) 207 | 208 | 209 | #ifdef USE_NCL_NAMESPACE 210 | }; 211 | #endif/*USE_NCL_NAMESPACE*/ 212 | 213 | 214 | 215 | 216 | 217 | 218 | #ifdef _BUILD_NCL_SAMPLE 219 | 220 | #include 221 | #include 222 | 223 | /** 224 | * A class to store parameters specified by command-line arguments 225 | */ 226 | class option : public optparse { 227 | public: 228 | int bytes; 229 | int lines; 230 | bool quiet; 231 | 232 | option() : bytes(0), lines(0), quiet(false) {} 233 | 234 | BEGIN_OPTION_MAP_INLINE() 235 | ON_OPTION(SHORTOPT('b') || LONGOPT("bytes")) 236 | bytes = std::atoi(arg); 237 | used_args = 1; // Notify the parser of a consumption of argument. 238 | 239 | ON_OPTION_WITH_ARG(SHORTOPT('l') || LONGOPT("lines")) 240 | lines = std::atoi(arg); 241 | // no need of the notification: used_args variable will be set to 1. 242 | 243 | ON_OPTION(SHORTOPT('q') || LONGOPT("quiet") || LONGOPT("silent")) 244 | quiet = true; 245 | 246 | END_OPTION_MAP() 247 | }; 248 | 249 | int main(int argc, char *argv[]) 250 | { 251 | try { 252 | option opt; 253 | int argused = opt.parse(argv, argc); // parse() starts at index 1, so it skips argv[0] itself; passing &argv[1] would drop the first option.
254 | 255 | std::cout << "used argv: " << argused << std::endl; 256 | std::cout << "bytes: " << opt.bytes << std::endl; 257 | std::cout << "lines: " << opt.lines << std::endl; 258 | std::cout << "quiet: " << opt.quiet << std::endl; 259 | } catch (const optparse::unrecognized_option& e) { 260 | std::cout << "unrecognized option: " << e.what() << std::endl; 261 | return 1; 262 | } catch (const optparse::invalid_value& e) { 263 | std::cout << "invalid value: " << e.what() << std::endl; 264 | return 1; 265 | } 266 | 267 | return 0; 268 | } 269 | 270 | #endif/*_BUILD_NCL_SAMPLE*/ 271 | 272 | 273 | #endif/*__NCL_OPTPRASE_H__*/ 274 | -------------------------------------------------------------------------------- /src/reltime_expression_normalizer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef RELTIME_EXPRESSION_NORMALIZER_H_ 2 | #define RELTIME_EXPRESSION_NORMALIZER_H_ 3 | #include 4 | #include "digit_utility.hpp" 5 | #include "number_normalizer.hpp" 6 | #include "normalizer_utility.hpp" 7 | #include "normalizer_template.hpp" 8 | #include 9 | 10 | namespace reltime_expression_normalizer{ 11 | 12 | struct ReltimeExpression : normalizer_utility::NormalizedExpressionTemplate{ 13 | ReltimeExpression(digit_utility::Number number) 14 | : normalizer_utility::NormalizedExpressionTemplate(number.original_expression, number.position_start, number.position_end), 15 | org_value_lowerbound(number.value_lowerbound), 16 | org_value_upperbound(number.value_upperbound), 17 | value_lowerbound_abs(normalizer_utility::Time(INFINITY)), 18 | value_upperbound_abs(normalizer_utility::Time(-INFINITY)), 19 | value_lowerbound_rel(normalizer_utility::Time(INFINITY)), 20 | value_upperbound_rel(normalizer_utility::Time(-INFINITY)), 21 | ordinary(false) 22 | {} 23 | 24 | double org_value_lowerbound, org_value_upperbound; 25 | normalizer_utility::Time value_lowerbound_abs, value_upperbound_abs; 26 | normalizer_utility::Time value_lowerbound_rel, 
value_upperbound_rel; 27 | bool ordinary; 28 | }; 29 | 30 | 31 | class LimitedReltimeExpression : public normalizer_utility::LimitedExpressionTemplate{ 32 | public: 33 | template 34 | void serialize(Archive &ar){ 35 | ar & MEMBER(pattern) & MEMBER(corresponding_time_position) & MEMBER(process_type) & MEMBER(ordinary) & MEMBER(option); 36 | } 37 | 38 | std::vector corresponding_time_position; 39 | std::vector process_type; 40 | }; 41 | 42 | 43 | class ReltimeExpressionNormalizer : public normalizer_template::NormalizerTemplate{ 44 | public: 45 | ReltimeExpressionNormalizer(const std::string& language) : NN(language) { language_ = language; init(); } 46 | 47 | private: 48 | void init(); 49 | void normalize_number(const std::string& text, std::vector& numbers); 50 | void revise_any_type_expression_by_matching_limited_expression(std::vector& reltimeexps, int& expression_id, LimitedReltimeExpression matching_limited_reltime_expression); 51 | void revise_any_type_expression_by_matching_prefix_counter(ReltimeExpression& reltimeexp, const LimitedReltimeExpression& matching_limited_expression); 52 | void revise_any_type_expression_by_number_modifier(ReltimeExpression& reltimeexp, const normalizer_utility::NumberModifier& number_modifier); 53 | void delete_not_any_type_expression(std::vector& reltimeexps); 54 | void fix_by_range_expression(const pfi::data::string::ustring& utext, std::vector& reltimeexps); 55 | 56 | number_normalizer::NumberNormalizer NN; 57 | }; 58 | 59 | } //namespace reltime_expression_normalizer 60 | 61 | #endif //RELTIME_EXPRESSON_NORMALIZER_H_ 62 | -------------------------------------------------------------------------------- /src/wscript: -------------------------------------------------------------------------------- 1 | 2 | 3 | def build(bld): 4 | def define_test(source): 5 | target = source.split('.')[0].replace('/', '_') 6 | bld.program( 7 | features = 'gtest', 8 | source = source, 9 | target = target, 10 | use = ['PFICOMMON','normalize_numexp', 
'UX', 'boost_regex']) 11 | 12 | bld.shlib( 13 | source = ['dictionary_dirpath.cpp', 'digit_utility.cpp', 'number_normalizer.cpp', 'normalizer_utility.cpp', 'numerical_expression_normalizer.cpp', 'abstime_expression_normalizer.cpp', 'reltime_expression_normalizer.cpp', 'duration_expression_normalizer.cpp', 'inappropriate_expression_remover.cpp', 'normalize_numexp.cpp'], 14 | use = ['PFICOMMON', 'UX'], 15 | target = 'normalize_numexp' 16 | ) 17 | 18 | bld(features = 'cxx cprogram', 19 | source = 'main.cpp', 20 | use = ['PFICOMMON'], 21 | target = 'normalizeNumexp', 22 | uselib_local = 'normalize_numexp') 23 | 24 | define_test('digit_utility_test.cpp') 25 | define_test('number_normalizer_test.cpp') 26 | define_test('normalizer_utility_test.cpp') 27 | define_test('numerical_expression_normalizer_test.cpp') 28 | define_test('abstime_expression_normalizer_test.cpp') 29 | define_test('reltime_expression_normalizer_test.cpp') 30 | define_test('duration_expression_normalizer_test.cpp') 31 | define_test('normalize_numexp_test.cpp') 32 | #define_test('normalizer_tester.cpp') 33 | -------------------------------------------------------------------------------- /swig/java/TestNormalizeNumexp.java: -------------------------------------------------------------------------------- 1 | import java.util.Scanner; 2 | import jp.ac.tohoku.ecei.cl.numexp.*; 3 | 4 | public class TestNormalizeNumexp { 5 | static { 6 | System.loadLibrary("normalize_numexp"); 7 | } 8 | public static void main(String [] args) { 9 | NormalizeNumexp n = new NormalizeNumexp("ja"); 10 | StringVector result = new StringVector(0); 11 | 12 | String text = "魔女狩りは15世紀〜18世紀にかけてみられ、全ヨーロッパで4万人が処刑された"; 13 | n.normalize(text, result); 14 | 15 | System.out.println("text:" + text); 16 | for (long i = 0, size = result.size(); i < size; i++) { 17 | System.out.println(result.get((int)i)); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /swig/java/compile.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #http://www.swig.org/Doc2.0/SWIGDocumentation.html 3 | 4 | [ -d src ] || mkdir src 5 | [ -d classes ] || mkdir classes 6 | 7 | #1. create _wrap.cxx and wrapper source files by swig (you can change the package name) 8 | swig -c++ -java -I../../src/ -o normalize_numexp_wrap.cxx -package jp.ac.tohoku.ecei.cl.numexp -outdir src ../normalize_numexp.i 9 | 10 | #2. compile _wrap.cxx (JNI header files are necessary) 11 | gcc -O2 -fPIC -c normalize_numexp_wrap.cxx -I../../src/ -I/usr/lib/jvm/java-7-oracle/include -I/usr/lib/jvm/java-7-oracle/include/linux 12 | 13 | #3. create shared library 14 | gcc -shared ../../build/src/dictionary_dirpath.cpp.1.o ../../build/src/normalize_numexp.cpp.1.o ../../build/src/abstime_expression_normalizer.cpp.1.o ../../build/src/digit_utility.cpp.1.o ../../build/src/duration_expression_normalizer.cpp.1.o ../../build/src/normalizer_utility.cpp.1.o ../../build/src/number_normalizer.cpp.1.o ../../build/src/numerical_expression_normalizer.cpp.1.o ../../build/src/reltime_expression_normalizer.cpp.1.o ../../build/src/inappropriate_expression_remover.cpp.1.o normalize_numexp_wrap.o -o libnormalize_numexp.so -I/usr/lib/jvm/java-7-oracle/include -L/usr/local/lib -lpficommon -lpficommon_visualization -lpficommon_text -lpficommon_network_base -lpficommon_concurrent -lpficommon_data -lpficommon_math -lpficommon_system -lpficommon_network_http -lpficommon_lang -lpficommon_network_rpc -lpficommon_network_cgi -lux 15 | 16 | #4. create wrapper classes 17 | javac -d classes src/*.java 18 | 19 | echo finished! 
20 | echo 'how to use: "java -Djava.library.path= -classpath 〜"' 21 | -------------------------------------------------------------------------------- /swig/java/readme.txt: -------------------------------------------------------------------------------- 1 | ■swigによるJava連携について 2 | compile.shを実行した後、生成されたclassesディレクトリをclasspathに含めるようにしてください。 3 | また生成された.soファイルをおいたディレクトリを、java.library.pathシステムプロパティに含めるようにしてください。 4 | -------------------------------------------------------------------------------- /swig/normalize_numexp.i: -------------------------------------------------------------------------------- 1 | %module normalize_numexp 2 | 3 | %{ 4 | #define SWIG_FILE_WITH_INIT 5 | #include "normalize_numexp.hpp" 6 | %} 7 | 8 | %include "std_string.i" 9 | %include "std_vector.i" 10 | 11 | namespace std { 12 | %template(IntVector) vector; 13 | %template(DoubleVector) vector; 14 | %template(StringVector) vector; 15 | } 16 | 17 | %include "normalize_numexp.hpp" 18 | -------------------------------------------------------------------------------- /swig/python/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #http://www.swig.org/Doc2.0/SWIGDocumentation.html 3 | 4 | #1. create _wrap.cxx and wrapper .py by swig. 5 | swig -c++ -python -I../../src/ -o normalize_numexp_wrap.cxx ../normalize_numexp.i 6 | 7 | #2. compile _wrap.cxx (Python.h is necessary.) 8 | gcc -O2 -fPIC -c normalize_numexp_wrap.cxx -I/usr/include/python2.7 -I../../src/ -I/usr/local/include 9 | 10 | #3. 
create shared object 11 | gcc -shared ../../build/src/dictionary_dirpath.cpp.1.o ../../build/src/normalize_numexp.cpp.1.o ../../build/src/abstime_expression_normalizer.cpp.1.o ../../build/src/digit_utility.cpp.1.o ../../build/src/duration_expression_normalizer.cpp.1.o ../../build/src/normalizer_utility.cpp.1.o ../../build/src/number_normalizer.cpp.1.o ../../build/src/numerical_expression_normalizer.cpp.1.o ../../build/src/reltime_expression_normalizer.cpp.1.o ../../build/src/inappropriate_expression_remover.cpp.1.o normalize_numexp_wrap.o -o _normalize_numexp.so -I/usr/include/python2.7 -L/usr/local/lib -lpficommon -lpficommon_visualization -lpficommon_text -lpficommon_network_base -lpficommon_concurrent -lpficommon_data -lpficommon_math -lpficommon_system -lpficommon_network_http -lpficommon_lang -lpficommon_network_rpc -lpficommon_network_cgi -lux 12 | 13 | #4. 14 | echo finished! 15 | echo 'please copy "_normalize_numexp.so" and "normalize_numexp.py" to your python "site-packages"' 16 | -------------------------------------------------------------------------------- /swig/python/readme.txt: -------------------------------------------------------------------------------- 1 | ■swigによるpython連携について 2 | distutilが上手く使えなかったため、通常のインストール方法ではなく、手動でのインストールになります。 3 | compile.shを実行した後、生成された.soファイルと.pyファイルをpythonのsite-packagesにコピーして下さい。 4 | -------------------------------------------------------------------------------- /swig/python/test_normalize_numexp.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import sys 3 | from normalize_numexp import * 4 | 5 | n = NormalizeNumexp("ja") 6 | text = "魔女狩りは15世紀〜18世紀にかけてみられ、全ヨーロッパで4万人が処刑された" 7 | result = StringVector(0) 8 | 9 | n.normalize(text, result) 10 | print "text:",text 11 | for r in result : 12 | print r 13 | -------------------------------------------------------------------------------- /swig/ruby/compile.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #http://www.swig.org/Doc2.0/SWIGDocumentation.html 3 | 4 | #1. create _wrap.cxx by swig. 5 | swig -c++ -ruby -I../../src/ -o normalize_numexp_wrap.cxx ../normalize_numexp.i 6 | 7 | #2. compile _wrap.cxx (ruby.h and config.h are necessary.) 8 | gcc -O2 -fPIC -c normalize_numexp_wrap.cxx -I../../src/ -I/usr/include/ruby-1.9.1 -I/usr/include/ruby-1.9.1/x86_64-linux 9 | 10 | #3. create shared object 11 | gcc -shared ../../build/src/dictionary_dirpath.cpp.1.o ../../build/src/normalize_numexp.cpp.1.o ../../build/src/abstime_expression_normalizer.cpp.1.o ../../build/src/digit_utility.cpp.1.o ../../build/src/duration_expression_normalizer.cpp.1.o ../../build/src/normalizer_utility.cpp.1.o ../../build/src/number_normalizer.cpp.1.o ../../build/src/numerical_expression_normalizer.cpp.1.o ../../build/src/reltime_expression_normalizer.cpp.1.o ../../build/src/inappropriate_expression_remover.cpp.1.o normalize_numexp_wrap.o -o normalize_numexp.so -I/usr/include/ruby-1.9.1 -L/usr/local/lib -lpficommon -lpficommon_visualization -lpficommon_text -lpficommon_network_base -lpficommon_concurrent -lpficommon_data -lpficommon_math -lpficommon_system -lpficommon_network_http -lpficommon_lang -lpficommon_network_rpc -lpficommon_network_cgi -lux 12 | 13 | #4. 14 | echo finished! 
15 | echo 'please copy "normalize_numexp.so" to your ruby "site-ruby"' 16 | -------------------------------------------------------------------------------- /swig/ruby/readme.txt: -------------------------------------------------------------------------------- 1 | ■swigによるruby連携について 2 | compile.shを実行した後、生成された.soファイルをsite-rubyにコピーして下さい。 3 | -------------------------------------------------------------------------------- /swig/ruby/test-normalize-numexp.rb: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | require "normalize_numexp" 3 | 4 | n = Normalize_numexp::NormalizeNumexp::new("ja") 5 | text = "魔女狩りは15世紀〜18世紀にかけてみられ、全ヨーロッパで4万人が処刑された" 6 | result = Normalize_numexp::StringVector::new(0) 7 | 8 | n.normalize(text, result) 9 | print "text:#{text}\n" 10 | result.each do |r| 11 | print "#{r}\n" 12 | end 13 | -------------------------------------------------------------------------------- /unittest_gtest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/unittest_gtest.py -------------------------------------------------------------------------------- /waf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nullnull/normalizeNumexp/9f0e855fa81cf7f0c6e06e0647054f1d997d7531/waf -------------------------------------------------------------------------------- /wscript: -------------------------------------------------------------------------------- 1 | import os 2 | APPNAME = 'numerical and temporal expression normalizer' 3 | VERSION = '0.5.0' 4 | 5 | def options(opt): 6 | opt.load('compiler_cxx') 7 | opt.load('unittest_gtest') 8 | 9 | def configure(conf): 10 | conf.env.CXXFLAGS += ['-O2', '-Wall', '-g', '-pipe'] 11 | conf.load('compiler_cxx') 12 | conf.load('unittest_gtest') 13 | conf.check_cfg(package = 'pficommon', args = 
'--cflags --libs') 14 | conf.check_cfg(package = 'ux', args = '--cflags --libs') 15 | #conf.check_cxx(lib='re2', libpath=conf.env.LIBDIR) 16 | #conf.check_cfg(package = 'boost', args = '--cflags --libs') 17 | #conf.check_cxx(lib='libname', header_name = 'header.h') 18 | pass 19 | 20 | def build(bld): 21 | create_dic_file(bld) 22 | bld.recurse('src') 23 | # bld.install_files('${PREFIX}/include', 'src/*.hpp') #cannot install 24 | for dpath, dnames, fnames in os.walk("src") : 25 | for fname in fnames : 26 | if not fname.endswith(".hpp") : continue 27 | bld.install_files('${PREFIX}/include/normalizeNumexp/', [dpath+"/"+fname]) 28 | for dpath, dnames, fnames in os.walk("src/dic") : 29 | for fname in fnames : 30 | if not fname.endswith(".txt") : continue 31 | bld.install_files('${PREFIX}/lib/normalizeNumexp/'+dpath[4:], [dpath+"/"+fname]) 32 | 33 | 34 | def create_dic_file(bld) : 35 | #辞書ファイルの場所を指定 36 | dictionary_dirpath = str(bld.env.PREFIX) + "/lib/normalizeNumexp/dic/" 37 | # dictionary_dirpath = "/home/katsuma/usr/local/lib/normalizeNumexp/dic/" 38 | source = """ 39 | #include "dictionary_dirpath.hpp" 40 | namespace dictionary_dirpath { 41 | std::string get_dictionary_dirpath(){ 42 | return \"%s\";}}""" 43 | 44 | source = source % dictionary_dirpath 45 | 46 | fout = open("./src/dictionary_dirpath.cpp", "w") 47 | fout.write(source) 48 | --------------------------------------------------------------------------------