├── LICENSE ├── README.md ├── cleaning.py ├── corpus.zip ├── data └── ng_words.txt ├── eval_ranking.py ├── ranking.zip └── replace_br.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # おーぷん2ちゃんねる対話コーパス 2 | ## 概要 3 | [おーぷん2ちゃんねる](https://open2ch.net/)の「なんでも実況(ジュピター)」「ニュー速VIP」「ニュース速報+」の3つの掲示板をクロールして作成した対話コーパスです. 4 | おーぷん2ちゃんねる開設時から2019年7月20日までのデータを使用して作成しました. 5 | 6 | ## 対話コーパス (corpus.zip) 7 | * データはtsv形式であり,レスアンカー「>>」で指定された投稿同士をタブ区切りで連結しています.各行が1対話となります. 8 | 9 | 例: [投稿1] [tab] [投稿2] [tab] [投稿3] ... 10 | 11 | * 上の例では,投稿2は投稿1へのレスアンカーを(原文では)含み,投稿3は投稿2へのレスアンカーを(原文では)含んでいます.コーパス中ではレスアンカーは削除されています. 12 | * レスアンカーは文頭にあるもののみを使用し,文頭以外でレスアンカーを含む投稿は除外しています. 13 | * 対話は2者が交互に投稿したもののみを収録しています.つまり,偶数番目の投稿と奇数番目の投稿はそれぞれ同じユーザが投稿したものです. 14 | * 改行は「\_\_BR\_\_」という記号に置換されています. 15 | * 以下の条件に該当する投稿は除外しています. 16 | * 文字数が5文字未満,および150文字より大きい投稿 17 | * URLおよび画像を含む投稿 18 | * 2つ以上のレスアンカーを含む投稿 19 | * ひらがな,カタカナ,漢字のいずれも含まない投稿 20 | * 4回以上の改行を含む投稿(対話としてふさわしくないもの,例えば打順や日程の列挙などを含むため) 21 | 22 | ### ファイル構成 23 | | ファイル名 | 取得先 | 対話数 | 平均対話長 | 24 | |-----------|-----------------------|-----------|-----------------------| 25 | | livejupiter.tsv |なんでも実況(ジュピター) | 5948218 | 2.24 | 26 | | news4vip.tsv | ニュー速VIP | 1983626 | 2.41 | 27 | | newsplus.tsv | ニュース速報+ | 217296 | 2.09 | 28 | | | 合計| 8149140 | 2.28 | 29 | 30 | ## 前処理用スクリプト 31 | 様々な用途に利用しやすいよう,不適切な表現を含む投稿を除去するスクリプト(cleaning.py)と改行記号「\_\_BR\_\_」を句読点に置換するスクリプト(replace_br.py)を用意しました.両方のスクリプトを適用することで,不適切表現を除去し,かつ改行記号を句読点に変換することも可能です. 32 | ただし,本スクリプトを適用しても,不適切な表現が完全に除去できるわけではありません. 33 | また,読点とすべきところを句点とするなど,改行記号が正しく置換されない場合があります. 34 | 35 | スクリプトはファイル単位で実行します. 36 | livejupiter.tsvを対象にスクリプトを実行するコマンドは以下のとおりです. 37 | 38 | ### 不適切な用語の除去 39 | ``` 40 | $ python cleaning.py --input_file corpus/livejupiter.tsv --output_file corpus/livejupiter_cleaned.tsv 41 | ``` 42 | ### 改行記号の置換 43 | 実行にはmecab-python3が必要です. 44 | ``` 45 | $ python replace_br.py --input_file corpus/livejupiter.tsv --output_file corpus/livejupiter_replaced.tsv 46 | ``` 47 | 48 | ## 応答順位付けタスク用データ (ranking.zip) 49 | 文脈に対して複数の応答を順位付けするタスクに対応した開発データとテストデータを用意しています. 50 | データの抽出元は同じく「なんでも実況(ジュピター)」「ニュー速VIP」「ニュース速報+」の3つの掲示板です. 51 | 52 | 開発データは2019年8月中に立てられたスレッドのみから,テストデータは同年9月中に立てられたスレッドのみからデータを構築しており,投稿の重複はありません. 53 | 54 | ### データ形式 55 | 1つの文脈(コーパス本体における対話から,最後の投稿を除いたもの)につき,1つの実際の投稿と9つのランダムに抽出した投稿が収録されています. 56 | 57 | データ形式はコーパス本体と同じtsv形式ですが,各行の先頭に,その行の最後の投稿が実際の投稿であるかランダム抽出の投稿であるかを意味するラベルが付与されています(1:実際の投稿,0:ランダム抽出). 58 | 59 | 開発データ(dev.tsv),テストデータ(test.tsv)にはそれぞれ2000個の文脈が収録されており,20000行ずつのファイルとなっています. 60 | 61 | ### 評価用スクリプト 62 | 応答順位付けタスクで評価尺度として用いられるRecall@k(k=1~10)を計算するスクリプトを用意しました. 63 | 64 | 開発データ(dev.tsv),もしくはテストデータ(test.tsv)の各投稿に対するスコアのみが入った20000行のファイル(各行の数値が同じ行の投稿に対応)を用意し,以下のコマンドでスクリプトを実行できます. 65 | ``` 66 | $ python eval_ranking.py --input_file ranking/dev.tsv --score_file ranking/score_sample.txt 67 | ``` 68 | 69 | ## 文献情報 70 | 本コーパスを使用した場合,以下を引用してください. 71 | ``` 72 | @inproceedings{open2chdlc2019, 73 | title={おーぷん2ちゃんねる対話コーパスを用いた用例ベース対話システム}, 74 | author={稲葉 通将}, 75 | booktitle={第87回言語・音声理解と対話処理研究会(第10回対話システムシンポジウム), 人工知能学会研究会資料 SIG-SLUD-B902-33}, 76 | pages={129--132}, 77 | year={2019} 78 | } 79 | ``` 80 | 81 | -------------------------------------------------------------------------------- /cleaning.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import argparse 3 | 4 | 5 | def include_ng(line, ng_words): 6 | line = line.lower() 7 | for w in ng_words: 8 | if w in line: 9 | return True 10 | return False 11 | 12 | 13 | if __name__ == '__main__': 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--input_file", default=None, type=str, required=True, 16 | help="The input tsv file.") 17 | parser.add_argument("--output_file", default=None, type=str, required=True, 18 | help="The output tsv file.") 19 | args = parser.parse_args() 20 | 21 | ng_words = [] 22 | with open("data/ng_words.txt") as f: 23 | for line in f: 24 | ng_words.append(line.strip()) 25 | 26 | 27 | cleaned_lines = [] 28 | with open(args.input_file) as f: 29 | for l in f: 30 | l = l.strip() 31 | while include_ng(l, ng_words): 32 | l = l.rsplit("\t", 1)[0] 33 | if "\t" not in l: 34 | break 35 | if "\t" in l: 36 | cleaned_lines.append(l + "\n") 37 | w = open(args.output_file, "w") 38 | w.writelines(cleaned_lines) 39 | w.close() 40 | -------------------------------------------------------------------------------- /corpus.zip: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:74cc746f6a2b49c9ab1ba4e5decdda99346a050fdb7309a60c7a78e1ad151c56 3 | size 534070324 4 | -------------------------------------------------------------------------------- /data/ng_words.txt: -------------------------------------------------------------------------------- 1 | ガイジ 2 | バカ 3 | ばか 4 | 池沼 5 | えっち 6 | 短小 7 | 漏 8 | 包茎 9 | カス 10 | リョナ 11 | 巨根 12 | 浮気 13 | 痴 14 | 不倫 15 | 氏ね 16 | 亀頭 17 | まんこ 18 | マンコ 19 | アバズレ 20 | あばずれ 21 | アパズレ 22 | あぱずれ 23 | アンダーヘア 24 | ペニ 25 | ヴァギナ 26 | セクロス 27 | フェラ 28 | イラマチオ 29 | イマラチオ 30 | スロート 31 | 中出し 32 | 顔射 33 | マラ 34 | 性器 35 | 遺体 36 | シックスナイン 37 | 体位 38 | 正常位 39 | 騎乗位 40 | ナマポ 41 | シックス・ナイン 42 | 69 43 | 69 44 | せんずり 45 | センズリ 46 | 後背位 47 | 前戯 48 | 後戯 49 | ピロートーク 50 | 西成 51 | 避妊 52 | ファック 53 | fuck 54 | オルガズム 55 | 事故 56 | オーガズム 57 | 絶頂 58 | 性的 59 | オルガズム 60 | 淫 61 | 遊女 62 | 女郎 63 | 売春 64 | 葬式 65 | 娼婦 66 | 風俗嬢 67 | フードル 68 | 愛人 69 | 通夜 70 | セフレ 71 | 援交 72 | 円光 73 | 援助交際 74 | パイプカット 75 | 売り専 76 | 尻軽 77 | ヤリ 78 | 肉便器 79 | リストカット 80 | 初体験 81 | 熟女 82 | ホモ 83 | ゲイ 84 | リスカ 85 | レズ 86 | ノンケ 87 | バイ 88 | ふたなり 89 | 顔射 90 | 近親相姦 91 | 筆おろし 92 | 姫始め 93 | パイパン 94 | 口内 95 | 不感 96 | 男根 97 | 陰茎 98 | クリトリス 99 | 逝 100 | 陰核 101 | 陰唇 102 | 勃起 103 | インポ 104 | 裸 105 | 射精 106 | ザーメン 107 | 精液 108 | 精子 109 | 売女 110 | 四十八手 111 | 座位 112 | 側位 113 | 立位 114 | エクスタシー 115 | 顔面騎乗 116 | まんぐり 117 | ちんぐり 118 | 貝合 119 | 練炭 120 | ペッティング 121 | クンニ 122 | 素股 123 | 乱交 124 | 虐 125 | スマタ 126 | パイズリ 127 | オナ 128 | 手マン 129 | コキ 130 | ハメ撮り 131 | 3p 132 | スカトロ 133 | 老害 134 | 膣 135 | 去勢 136 | フェチ 137 | 姦 138 | 昇天 139 | sm 140 | ブルセラ 141 | イメクラ 142 | ソープ 143 | 誘拐 144 | デリヘル 145 | ヘルス 146 | ラブホ 147 | tenga 148 | 賢者タイム 149 | コンドーム 150 | スケベ 151 | ダッチワイフ 152 | 貞操帯 153 | 電マ 154 | バイアグラ 155 | バイブ 156 | ディルド 157 | パンスト 158 | 張形 159 | ピル 160 | ローター 161 | ローション 162 | ケツ 163 | 女が 164 | 女は 165 | 女って 166 | 女を 167 | 露出 168 | 女より 169 | 女なんて 170 | 女でも 171 | 女なら 172 | 生足 173 | きちがい 174 | キチガイ 175 | 気違い 176 | 基地外 177 | ハァハァ 178 | 梅毒 179 | 淋病 180 | コンジローマ 181 | カンジダ 182 | トリコモナス 183 | 部落 184 | 在日 185 | 支那 186 | めくら 187 | つるぺた 188 | リンチ 189 | 私刑 190 | ボッキ 191 | つんぼ 192 | かたわ 193 | ドキュン 194 | dqn 195 | 天皇 196 | 陛下 197 | 皇后 198 | 皇太子 199 | 性病 200 | 馬鹿 201 | アホ 202 | あほ 203 | くそ 204 | クソ 205 | うんこ 206 | 阿保 207 | 阿呆 208 | おっぱい 209 | うるさい 210 | の犬 211 | ウンコ 212 | 童貞 213 | 処女 214 | 死 215 | 殺 216 | 姦 217 | 邪魔 218 | 乳 219 | 変態 220 | 暴力 221 | 底辺 222 | 障 223 | 犯 224 | セックス 225 | 好きじゃ 226 | ブサイク 227 | 不細工 228 | ぶさいく 229 | 臭 230 | エロ 231 | 宗教 232 | ブス 233 | ぶす 234 | デブ 235 | 自慰 236 | 大人のおもちゃ 237 | 大人のオモチャ 238 | 大人の玩具 239 | 精力 240 | 射精 241 | シコ 242 | 下ネタ 243 | ゲロ 244 | クズ 245 | 屑 246 | バスト 247 | ショタ 248 | アパズレ 249 | キチ 250 | 民度 251 | 韓 252 | チョン 253 | 売国 254 | ビッチ 255 | エッチ 256 | 人妻 257 | 幼女 258 | エッチ 259 | 奴隷 260 | 風俗 261 | アスペ 262 | きめえ 263 | きめぇ 264 | 堕胎 265 | ヤラせ 266 | 茶番 267 | 糞 268 | 膜 269 | イっ 270 | イキ 271 | 抜け 272 | 抜く 273 | 抜い 274 | メンヘラ 275 | ヌけ 276 | ヌく 277 | ヌい 278 | av 279 | jc 280 | js 281 | 詐欺 282 | 創価 283 | 黒い噂 284 | うっとうしい 285 | うっとおしい 286 | 鬱陶し 287 | パンツ 288 | パンチラ 289 | パンティ 290 | 性行為 291 | モロ 292 | ぱんつ 293 | アヌス 294 | アナル 295 | ゴミ 296 | 生理 297 | ふたなり 298 | チン 299 | ちん 300 | 劣化 301 | 外人 302 | 片親 303 | 逝去 304 | 連れ子 305 | 痴呆 306 | 植物人間 307 | 合いの子 308 | 卑猥 309 | 他界 310 | え〇 311 | エ〇 312 | ち〇 313 | チ〇 314 | ま〇 315 | マ〇 316 | 穴兄弟 317 | 竿姉妹 318 | はゴミ 319 | 冥福 320 | しんだ 321 | しんで 322 | 氏んだ 323 | 氏んで 324 | オカズ 325 | 金玉 326 | ロリ 327 | キモ 328 | ウザ 329 | うざ 330 | 底辺 331 | ニート 332 | オフパコ 333 | ガイジ 334 | -------------------------------------------------------------------------------- /eval_ranking.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import argparse 4 | 5 | def recall_at_k(y_true, y_pred, k): 6 | correct = 0 7 | total = 0 8 | for yt, yp in zip(y_true, y_pred): 9 | total += 1 10 | for i in yp.argsort()[::-1][0:k]: 11 | if yt[i] == 1: 12 | correct += 1 13 | return correct / total 14 | 15 | if __name__ == '__main__': 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--input_file", default=None, type=str, required=True, 18 | help="The input dev or test tsv file.") 19 | parser.add_argument("--score_file", default=None, type=str, required=True, 20 | help="The estimated score file.") 21 | args = parser.parse_args() 22 | 23 | y_input = [] 24 | with open(args.input_file) as f: 25 | for l in f: 26 | if "\t" in l: 27 | y_input.append(int(l.split("\t")[0])) 28 | 29 | 30 | y_score = [] 31 | with open(args.score_file) as f: 32 | for l in f: 33 | l = l.strip() 34 | if len(l) > 0: 35 | y_score.append(float(l)) 36 | assert len(y_input) == len(y_score), "行数が一致しません. input: {0}, score: {1}".format(len(y_input), len(y_score)) 37 | y_input = np.array(y_input).reshape(-1, 10) 38 | y_score = np.array(y_score).reshape(-1, 10) 39 | for k in range(1, 11): 40 | print("1 in 10 Recall@" + str(k) + ": " + str(recall_at_k(y_input, y_score, k))) 41 | -------------------------------------------------------------------------------- /ranking.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/1never/open2ch-dialogue-corpus/a8ccdf2cffa81ea5f9ff5ed01c35d0272f0936f3/ranking.zip -------------------------------------------------------------------------------- /replace_br.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import MeCab 3 | import re 4 | import argparse 5 | 6 | mecab = MeCab.Tagger() 7 | mecab.parse('') 8 | 9 | symbols = ["w", "w", "。", "、", ",", ".", ",", ".", ")", ")", "?", "!", "?", "!", "…", "」"] 10 | 11 | def last_word_pos(text): 12 | node = mecab.parseToNode(text) 13 | pos = None 14 | while node: 15 | if "BOS" not in node.feature: 16 | pos = node.feature 17 | node = node.next 18 | return pos 19 | 20 | def replace_br(line): 21 | # 各文の先頭の改行記号は削除 22 | tmp = "" 23 | for l in line.split("\t"): 24 | tmp += re.sub(r'^( )+(__BR__ )+', '', l) + "\t" 25 | line = re.sub(r'\t$', "", tmp) 26 | 27 | if "__BR__" not in line: 28 | return line 29 | else: 30 | # 改行記号が3つ連続の場合は2個に置換 31 | line = line.replace(" __BR__ __BR__ __BR__ ", " __BR__ __BR__ ") 32 | # 改行記号が2つ連続の場合は,直前にsymbolsの記号があれば改行記号を削除.なければ句点に置換. 33 | if " __BR__ __BR__ " in line: 34 | tmp_line = "" 35 | ls = line.split(" __BR__ __BR__ ") 36 | for i in range(len(ls)-1): 37 | l = ls[i] 38 | contains_symbol = False 39 | for s in symbols: 40 | if l.endswith(s): 41 | contains_symbol = True 42 | if contains_symbol: 43 | tmp_line += l 44 | else: 45 | tmp_line += l + "。" 46 | tmp_line += ls[-1] 47 | line = tmp_line 48 | 49 | # 改行記号が存在する場合は,直前にsymbolsの記号があれば削除. 50 | # 改行記号の直前の語が係助詞,格助詞,接続助詞の場合は読点,それ以外は句点に置換, 51 | if " __BR__ " in line: 52 | tmp_line = "" 53 | ls = line.split(" __BR__ ") 54 | for i in range(len(ls)-1): 55 | l = ls[i] 56 | contains_symbol = False 57 | for s in symbols: 58 | if l.endswith(s): 59 | contains_symbol = True 60 | if contains_symbol: 61 | tmp_line += l 62 | else: 63 | lwpos = last_word_pos(l) 64 | if "係助詞"in lwpos or "格助詞" in lwpos or "接続助詞" in lwpos: 65 | tmp_line += l + "、" 66 | else: 67 | tmp_line += l + "。" 68 | tmp_line += ls[-1] 69 | line = tmp_line 70 | return line 71 | 72 | 73 | def main(): 74 | parser = argparse.ArgumentParser() 75 | parser.add_argument("--input_file", default=None, type=str, required=True, 76 | help="The input tsv file.") 77 | parser.add_argument("--output_file", default=None, type=str, required=True, 78 | help="The output tsv file.") 79 | args = parser.parse_args() 80 | 81 | nobr_lines = [] 82 | with open(args.input_file) as f: 83 | for l in f: 84 | l = l.strip() 85 | nobr_lines.append(replace_br(l) + "\n") 86 | 87 | w = open(args.output_file, "w") 88 | w.writelines(nobr_lines) 89 | w.close() 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | 95 | --------------------------------------------------------------------------------