├── 1-1.png ├── Flyon ├── 1-1.png ├── label_pickle │ ├── 入院记录现病史-1.txt.pkl │ └── 入院记录现病史-2.txt.pkl ├── test_data400 │ ├── 入院记录现病史-2.txtoriginal.txt │ └── 入院记录现病史-1.txtoriginal.txt ├── CCKS_CRF │ ├── eval │ │ ├── finall │ │ │ ├── 入院记录现病史-2.txt │ │ │ └── 入院记录现病史-1.txt │ │ ├── onefile.py │ │ ├── CCKS_result │ │ │ ├── 入院记录现病史-2.txt │ │ │ └── 入院记录现病史-1.txt │ │ └── get_reult.py │ ├── pat │ │ └── Tok.pat_ccks │ ├── dic │ │ └── zhenzhaung.txt │ ├── test_label_split │ │ ├── 入院记录现病史-2.txtoriginal.txt │ │ └── 入院记录现病史-1.txtoriginal.txt │ ├── wapiti_ccks.sh │ └── BIO_ccks │ │ ├── 入院记录现病史-1.txtoriginal.txt │ │ └── 入院记录现病史-2.txtoriginal.txt ├── train_data600 │ ├── 入院记录现病史-1.txtoriginal.txt │ ├── 入院记录现病史-1.txt │ ├── 入院记录现病史-2.txtoriginal.txt │ └── 入院记录现病史-2.txt ├── Bio_label │ ├── 入院记录现病史-1.txt │ └── 入院记录现病史-2.txt ├── README.md ├── readme.txt ├── raw2bio.py └── Bio_nolabel │ ├── 入院记录现病史-1.txtoriginal.txt │ └── 入院记录现病史-2.txtoriginal.txt ├── Wapiti ├── wapiti ├── bin │ └── wapiti ├── dat │ ├── pattern.txt │ ├── chpattern.txt │ ├── nppattern.txt │ └── train.txt ├── INSTALL ├── Makefile ├── COPYING ├── README.mkd ├── src │ ├── trainers.h │ ├── progress.h │ ├── thread.h │ ├── quark.h │ ├── decoder.h │ ├── pattern.h │ ├── tools.h │ ├── vmath.h │ ├── wapiti.h │ ├── options.h │ ├── reader.h │ ├── model.h │ ├── gradient.h │ ├── thread.c │ ├── tools.c │ ├── progress.c │ ├── sequence.h │ ├── rprop.c │ ├── sgdl1.c │ ├── quark.c │ └── model.c └── HISTORY ├── README.md └── LICENSE /1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kyzhouhzau/Clinical-NER/HEAD/1-1.png -------------------------------------------------------------------------------- /Flyon/1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kyzhouhzau/Clinical-NER/HEAD/Flyon/1-1.png -------------------------------------------------------------------------------- /Wapiti/wapiti: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kyzhouhzau/Clinical-NER/HEAD/Wapiti/wapiti -------------------------------------------------------------------------------- /Wapiti/bin/wapiti: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kyzhouhzau/Clinical-NER/HEAD/Wapiti/bin/wapiti -------------------------------------------------------------------------------- /Flyon/label_pickle/入院记录现病史-1.txt.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kyzhouhzau/Clinical-NER/HEAD/Flyon/label_pickle/入院记录现病史-1.txt.pkl -------------------------------------------------------------------------------- /Flyon/label_pickle/入院记录现病史-2.txt.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kyzhouhzau/Clinical-NER/HEAD/Flyon/label_pickle/入院记录现病史-2.txt.pkl -------------------------------------------------------------------------------- /Wapiti/dat/pattern.txt: -------------------------------------------------------------------------------- 1 | # Unigram 2 | *1:%x[-2,0] 3 | *2:%x[-1,0] 4 | *3:%x[ 0,0] 5 | *4:%x[ 1,0] 6 | *5:%x[ 2,0] 7 | 8 | # Bigram 9 | *6:%x[-1,0]/%x[0,0] 10 | *7:%x[ 1,0]/%x[0,0] 11 | 12 | # Trigram 13 | *8:%x[-1,0]/%x[0,0]/%x[1,0] 14 | 15 | -------------------------------------------------------------------------------- /Flyon/test_data400/入院记录现病史-2.txtoriginal.txt: -------------------------------------------------------------------------------- 1 | ,患者于半月前无明显诱因出现进食后中上腹不适,每次持续数分钟自行缓解,无恶心、呕吐、反酸、嗳气、烧心,无腹痛、腹胀、腹泻、便秘,无厌油、纳差,未予重视,未特殊处理。,半前至我院门诊行胃镜检查提示:浅表性胃窦炎伴糜烂,十二指肠球炎,,腹部彩超:肝回声增多,胆囊息肉样变。为进一步诊治,门诊“胃炎”收入我科。\U0004 患者本次发病以来,食欲正常, 神志清醒,精神尚可,睡眠尚可,大便正常,小便正常,体重无明显变化。 2 | -------------------------------------------------------------------------------- /Flyon/CCKS_CRF/eval/finall/入院记录现病史-2.txt: 
-------------------------------------------------------------------------------- 1 | 后中上腹 16 20 解剖部位 2 | 不适 20 22 症状描述 3 | 恶心 36 38 独立症状 4 | 呕吐 39 41 独立症状 5 | 反酸 42 44 独立症状 6 | 嗳气 45 47 独立症状 7 | 烧心 48 50 独立症状 8 | 便秘 61 63 独立症状 9 | 纳差 68 70 独立症状 10 | 十二指肠 108 112 解剖部位 11 | 腹部 116 118 解剖部位 12 | 肝 121 122 解剖部位 13 | -------------------------------------------------------------------------------- /Flyon/test_data400/入院记录现病史-1.txtoriginal.txt: -------------------------------------------------------------------------------- 1 | ,缘于入院前5年进食冰冷食物后出现中下腹部闷痛不适,为持续性隐痛,无其它部位放射,无恶心呕吐,无反酸嗳气,无呕血黑便等不适。[主诉单一时间14天主诉单一时间]前进食冰西瓜后出现中下腹部闷痛不适,为持续性隐痛,程度较前加重,无其它部位放射,翻身向右侧时疼痛有所缓解,无恶心、呕吐、返酸、嗳气、纳差、乏力、消瘦、腹泻、便秘,无呕血、黑便,无发热、咳嗽、咳痰,自行服用胃药(具体不详),症状有所好转,,此次至我院门诊行胃镜检查示:“1.胃体溃疡型癌(早癌可能)。2.贲门炎症。3.慢性萎缩性胃炎”,,病理:(体小),:管状腺癌II-III级。遂门诊拟“胃体癌”收入院。自发病以来,精神、睡眠可,食欲一般,大小便如常,体重无明显变化。 2 | -------------------------------------------------------------------------------- /Flyon/train_data600/入院记录现病史-1.txtoriginal.txt: -------------------------------------------------------------------------------- 1 | ,患者3月前因“直肠癌”于在我院于全麻下行直肠癌根治术(DIXON术),手术过程顺利,术后给予抗感染及营养支持治疗,患者恢复好,切口愈合良好。,术后病理示:直肠腺癌(中低度分化),浸润溃疡型,面积3.5*2CM,侵达外膜。两端切线另送“近端”、“远端”及环周底部切除面未查见癌。肠壁一站(10个)、中间组(8个)淋巴结未查见癌。,免疫组化染色示:ERCC1弥漫(+)、TS少部分弱(+)、SYN(-)、CGA(-)。术后查无化疗禁忌后给予3周期化疗,,方案为:奥沙利铂150MG D1,亚叶酸钙0.3G+替加氟1.0G D2-D6,同时给与升白细胞、护肝、止吐、免疫增强治疗,患者副反应轻。院外期间患者一般情况好,无恶心,无腹痛腹胀胀不适,无现患者为行复查及化疗再次来院就诊,门诊以“直肠癌术后”收入院。 近期患者精神可,饮食可,大便正常,小便正常,近期体重无明显变化。 2 | -------------------------------------------------------------------------------- /Flyon/train_data600/入院记录现病史-1.txt: -------------------------------------------------------------------------------- 1 | 直肠 8 10 解剖部位 2 | 直肠癌根治术 21 27 手术 3 | DIXON术 28 34 手术 4 | 直肠 78 80 解剖部位 5 | 肠壁 139 141 解剖部位 6 | 淋巴结 156 159 解剖部位 7 | 奥沙利铂 230 234 药物 8 | 亚叶酸钙 243 247 药物 9 | 替加氟 252 255 药物 10 | 肝 276 277 解剖部位 11 | 吐 279 280 独立症状 12 | 恶心 308 310 独立症状 13 | 腹 312 313 解剖部位 14 | 痛 313 314 症状描述 15 | 腹 314 315 解剖部位 16 | 胀 315 316 症状描述 17 | 胀 316 317 症状描述 18 | 不适 317 319 
症状描述 19 | 直肠 342 344 解剖部位 20 | -------------------------------------------------------------------------------- /Flyon/Bio_label/入院记录现病史-1.txt: -------------------------------------------------------------------------------- 1 | 直肠 8 10 B-解剖部位 2 | 直肠癌 21 24 B-手术 3 | 根治术 24 27 I-手术 4 | DIXON 28 33 B-手术 5 | 术 33 34 I-手术 6 | 直肠 78 80 B-解剖部位 7 | 肠壁 139 141 B-解剖部位 8 | 淋巴结 156 159 B-解剖部位 9 | 奥沙利 230 233 B-药物 10 | 铂 233 234 I-药物 11 | 亚 243 244 B-药物 12 | 叶酸 244 246 I-药物 13 | 钙 246 247 I-药物 14 | 替加 252 254 B-药物 15 | 氟 254 255 I-药物 16 | 肝 276 277 B-解剖部位 17 | 吐 279 280 B-独立症状 18 | 恶心 308 310 B-独立症状 19 | 腹 312 313 B-解剖部位 20 | 痛 313 314 B-症状描述 21 | 腹 314 315 B-解剖部位 22 | 胀 315 316 B-症状描述 23 | 胀 316 317 B-症状描述 24 | 不适 317 319 B-症状描述 25 | 直肠 342 344 B-解剖部位 26 | -------------------------------------------------------------------------------- /Wapiti/INSTALL: -------------------------------------------------------------------------------- 1 | 2 | Wapiti installation 3 | 4 | If you have a recent compiler, normally you can just do the classical: 5 | make 6 | make install 7 | switch to super user for the second. If you want to install somewhere else than 8 | in /usr/local you will have to edit the variable definitions at the head of the 9 | Makefile. 10 | 11 | You can disable the non C99 compliant features by modifying the wapiti.h in the 12 | src/ directory. This should allow you to compile Wapiti on almost any platform 13 | that has a C99 compiler. 
14 | 15 | -------------------------------------------------------------------------------- /Flyon/train_data600/入院记录现病史-2.txtoriginal.txt: -------------------------------------------------------------------------------- 1 | ,患者因罹患“胃癌”于2013-10-29在我院予行全麻下胃癌根治术,,术中见:腹腔内腹水,腹膜无转移,肝脏未触及明显转移性灶,肿瘤位于胃体、胃底部,小弯侧偏后壁,约5*4*2CM大小,肿瘤已侵达浆膜外,第1、3组淋巴结肿大,肿瘤尚能活动,经探查决定行全胃切除,空肠J字代胃术。手术顺利,术后积极予相关对症支持治疗;,后病理示:胃底、体小弯侧低分化腺癌,部分为印戒细胞癌图像,蕈伞型,面积5.2*3.5CM,局部侵达粘膜下层,并于少数腺管内查见癌栓。两端切线及另送“近端切线”未查见癌。呈三组(5/13个)淋巴结癌转移。一组(7个)、四组(13个)、五组(1个)、六组(4个)淋巴结未查见癌。,癌组织免疫组化染色示:ERCC1(+)、β-TUBULIN-III(+)、TS(-)、RRM1(-)、TOPOII阳性细胞数约20%、CERBB-2(2+) 。依据患者病情及肿瘤病理与分期继续术后辅助性化疗指征存在,患者及家属拒绝化疗。自术后出院以来,患者一般情况保持良好;无发热,偶有恶心,无呕吐,无反酸、嗳气,无明显进食不适,偶有进食后轻微腹胀,无腹痛。现患者为行进一步复查并必要时适当处理而再来我院就诊,门诊依情以“胃恶性肿瘤术后”收入院。目前患者精神及情绪状态良好,食欲较术前明显减少,饮食可,夜间睡眠后;今8个月体重减轻18KG。 2 | -------------------------------------------------------------------------------- /Flyon/CCKS_CRF/pat/Tok.pat_ccks: -------------------------------------------------------------------------------- 1 | * 2 | 3 | U:tok:1:-2:%X[-2,0] 4 | U:tok:1:-1:%X[-1,0] 5 | U:tok:1:+0:%X[0,0] 6 | U:tok:1:+1:%X[1,0] 7 | U:tok:1:+2:%X[2,0] 8 | U:tok:2:-1:%X[-1,0]/%X[0,0] 9 | U:tok:2:+0:%X[0,0]/%X[1,0] 10 | U:tok:3:-2:%X[-2,0]/%X[-1,0]/%X[0,0] 11 | U:tok:3:-1:%X[-1,0]/%X[0,0]/%X[1,0] 12 | U:tok:3:+0:%X[0,0]/%X[1,0]/%X[2,0] 13 | U:dr:1:+0:%X[0,2]/%X[0,0] 14 | U:jp:1:+0:%X[0,3]/%X[0,0] 15 | U:ss:1:+0:%X[0,4]/%X[0,0] 16 | U:zz:1:+0:%X[0,5]/%X[0,0] 17 | U:dr:2:-1:%X[-1,2]/%X[0,2] 18 | U:dr:2:+0:%X[0,2]/%X[1,2] 19 | U:pre:1:+0:2:%M[0,0,"^.?.?"] 20 | U:suf:1:+0:2:%M[0,0,".?.?$"] -------------------------------------------------------------------------------- /Flyon/train_data600/入院记录现病史-2.txt: -------------------------------------------------------------------------------- 1 | 胃 7 8 解剖部位 2 | 胃癌根治术 29 34 手术 3 | 腹腔 40 42 解剖部位 4 | 腹 43 44 解剖部位 5 | 腹膜 46 48 解剖部位 6 | 肝脏 52 54 解剖部位 7 | 胃体 68 70 解剖部位 8 | 胃底部 71 74 解剖部位 9 | 小弯侧偏后壁 75 81 解剖部位 10 | 淋巴结 107 110 解剖部位 11 | 全胃切除,空肠J字代胃术 126 138 手术 12 | 胃底、体小弯侧 164 171 解剖部位 
import glob
import os
import codecs

# Merge the per-document prediction files in ./finall into one submission
# file, result.txt. Each input file contributes one output line of the form
#     <doc id>,<entity>;<entity>;...;
# where every <entity> is one stripped line of the input file.

# Documents are numbered 1..NUM_FILES.
NUM_FILES = 400

# The output is opened explicitly as UTF-8, matching how the inputs are read:
# the entity lines contain Chinese text, and the platform default encoding is
# not guaranteed to be able to represent it.
with open("result.txt", 'w', encoding='utf-8') as wf:
    for num in range(1, NUM_FILES + 1):
        file = "./finall/入院记录现病史-" + str(num) + ".txt"
        with codecs.open(file, 'r', encoding='utf-8') as rf:
            for i, line in enumerate(rf):
                line = line.strip()
                if i == 0:
                    # Only the first entity of a document is prefixed with the
                    # document id (the number embedded in the file name).
                    wf.write("{},{};".format(num, line))
                else:
                    wf.write("{};".format(line))
        # One output line per document, even when the input file is empty.
        wf.write('\n')
# Build and install rules for Wapiti.
#
# Targets:
#   wapiti  - optimized build; -DNDEBUG disables assert()
#   debug   - same sources built with -g and without -DNDEBUG
#   install - copy the binary and the man page under $(DESTDIR)$(PREFIX)
#   clean   - remove the built binary

CFLAGS =-std=c99 -W -Wall -Wextra -O3
LIBS   =-lm -lpthread

DESTDIR=
PREFIX =/home/kyang/kyzhou/Wapiti

INSTALL= install -p
INSTALL_EXEC= $(INSTALL) -m 0755
INSTALL_DATA= $(INSTALL) -m 0644

SRC=src/*.c
HDR=src/*.h

wapiti: $(SRC) $(HDR)
	@echo "CC: wapiti.c --> wapiti"
	@$(CC) -DNDEBUG $(CFLAGS) -o wapiti $(SRC) $(LIBS)

# Writes ./wapiti, never a file called "debug", hence phony (see below).
debug: $(SRC) $(HDR)
	@echo "CC: wapiti.c --> wapiti"
	@$(CC) -g $(CFLAGS) -o wapiti $(SRC) $(LIBS)

install: wapiti
	@echo "CP: wapiti --> $(DESTDIR)$(PREFIX)/bin"
	@mkdir -p $(DESTDIR)$(PREFIX)/bin
	@mkdir -p $(DESTDIR)$(PREFIX)/share/man/man1
	@$(INSTALL_EXEC) wapiti     $(DESTDIR)$(PREFIX)/bin
	@$(INSTALL_DATA) doc/wapiti.1 $(DESTDIR)$(PREFIX)/share/man/man1

clean:
	@echo "RM: wapiti"
	@rm -f wapiti

# None of these names is a file produced by its rule.
.PHONY: clean install debug
第二步:模型训练(Linux) 17 | 18 | 19 | ##### $bash wapiti_ccks.sh #训练模型,模型储存在/eval/bio_ccks中 20 | 21 | ### 第三步:获得结果(Linux) 22 | 23 | 24 | ##### $python get_result.py #提取结果文件,结果保存在CCKS_result中其格式为BIO和finall中格式为官方标签格式 25 | ##### $python onefile.py #将结果转成提交格式 26 | 27 | 28 | ### 结果文件 29 | Flyon\CCKS_CRF\eval\result.txt 30 | 31 | 32 | ##### Wapiti is a simple and fast discriminative sequence labeling toolkit ( http://wapiti.limsi.fr ). It is similar to CRF++. 33 | 34 | 35 | ## 注:可以尝试BERT,ALBERT等预训练模型 参见:[NLPGNN](https://github.com/kyzhouhzau/NLPGNN) 36 | 37 | -------------------------------------------------------------------------------- /Wapiti/dat/chpattern.txt: -------------------------------------------------------------------------------- 1 | * 2 | 3 | U:Wrd-1 X=%x[ 0,0] 4 | 5 | U:wrd-1LL=%X[-2,0] 6 | U:wrd-1 L=%X[-1,0] 7 | U:wrd-1 X=%X[ 0,0] 8 | U:wrd-1 R=%X[ 1,0] 9 | U:wrd-1RR=%X[ 2,0] 10 | 11 | U:wrd-2 L=%X[-1,0]/%X[ 0,0] 12 | U:wrd-2 R=%X[ 0,0]/%X[ 1,0] 13 | 14 | *:Pos-1LL=%x[-2,1] 15 | *:Pos-1 L=%x[-1,1] 16 | *:Pos-1 X=%x[ 0,1] 17 | *:Pos-1 R=%x[ 1,1] 18 | *:Pos-1RR=%x[ 2,1] 19 | 20 | U:Pos-2 L=%X[-1,1]/%X[ 0,1] 21 | U:Pos-2 R=%X[ 0,1]/%X[ 1,1] 22 | 23 | *:Pre-1 X=%m[ 0,0,"^.?"] 24 | *:Pre-2 X=%m[ 0,0,"^.?.?"] 25 | *:Pre-3 X=%m[ 0,0,"^.?.?.?"] 26 | *:Pre-4 X=%m[ 0,0,"^.?.?.?.?"] 27 | 28 | *:Suf-1 X=%m[ 0,0,".?$"] 29 | *:Suf-2 X=%m[ 0,0,".?.?$"] 30 | *:Suf-3 X=%m[ 0,0,".?.?.?$"] 31 | *:Suf-4 X=%m[ 0,0,".?.?.?.?$"] 32 | 33 | *:Caps? L=%t[-1,0,"\u"] 34 | *:Caps? X=%t[ 0,0,"\u"] 35 | *:Caps? R=%t[ 1,0,"\u"] 36 | 37 | *:AllC? X=%t[ 0,0,"^\u*$"] 38 | *:BegC? X=%t[ 0,0,"^\u"] 39 | 40 | *:Punc? L=%t[-1,0,"\p"] 41 | *:Punc? X=%t[ 0,0,"\p"] 42 | *:Punc? R=%t[ 1,0,"\p"] 43 | 44 | *:AllP? X=%t[ 0,0,"^\p*$"] 45 | *:InsP? X=%t[ 0,0,".\p."] 46 | 47 | *:Numb? L=%t[-1,0,"\d"] 48 | *:Numb? X=%t[ 0,0,"\d"] 49 | *:Numb? R=%t[ 1,0,"\d"] 50 | 51 | *:AllN? 
X=%t[ 0,0,"^\d*$"] 52 | 53 | -------------------------------------------------------------------------------- /Wapiti/dat/nppattern.txt: -------------------------------------------------------------------------------- 1 | * 2 | 3 | U:Wrd-1 X=%x[ 0,0] 4 | 5 | U:wrd-1LL=%X[-2,0] 6 | U:wrd-1 L=%X[-1,0] 7 | U:wrd-1 X=%X[ 0,0] 8 | U:wrd-1 R=%X[ 1,0] 9 | U:wrd-1RR=%X[ 2,0] 10 | 11 | U:wrd-2 L=%X[-1,0]/%X[ 0,0] 12 | U:wrd-2 R=%X[ 0,0]/%X[ 1,0] 13 | 14 | *:Pos-1LL=%x[-2,1] 15 | *:Pos-1 L=%x[-1,1] 16 | *:Pos-1 X=%x[ 0,1] 17 | *:Pos-1 R=%x[ 1,1] 18 | *:Pos-1RR=%x[ 2,1] 19 | 20 | U:Pos-2 L=%X[-1,1]/%X[ 0,1] 21 | U:Pos-2 R=%X[ 0,1]/%X[ 1,1] 22 | 23 | *:Pre-1 X=%m[ 0,0,"^.?"] 24 | *:Pre-2 X=%m[ 0,0,"^.?.?"] 25 | *:Pre-3 X=%m[ 0,0,"^.?.?.?"] 26 | *:Pre-4 X=%m[ 0,0,"^.?.?.?.?"] 27 | 28 | *:Suf-1 X=%m[ 0,0,".?$"] 29 | *:Suf-2 X=%m[ 0,0,".?.?$"] 30 | *:Suf-3 X=%m[ 0,0,".?.?.?$"] 31 | *:Suf-4 X=%m[ 0,0,".?.?.?.?$"] 32 | 33 | *:Caps? L=%t[-1,0,"\u"] 34 | *:Caps? X=%t[ 0,0,"\u"] 35 | *:Caps? R=%t[ 1,0,"\u"] 36 | 37 | *:AllC? X=%t[ 0,0,"^\u*$"] 38 | *:BegC? X=%t[ 0,0,"^\u"] 39 | 40 | *:Punc? L=%t[-1,0,"\p"] 41 | *:Punc? X=%t[ 0,0,"\p"] 42 | *:Punc? R=%t[ 1,0,"\p"] 43 | 44 | *:AllP? X=%t[ 0,0,"^\p*$"] 45 | *:InsP? X=%t[ 0,0,".\p."] 46 | 47 | *:Numb? L=%t[-1,0,"\d"] 48 | *:Numb? X=%t[ 0,0,"\d"] 49 | *:Numb? R=%t[ 1,0,"\d"] 50 | 51 | *:AllN? 
X=%t[ 0,0,"^\d*$"] 52 | 53 | -------------------------------------------------------------------------------- /Flyon/readme.txt: -------------------------------------------------------------------------------- 1 | @Author zhoukaiyin 2 | 3 | 4 | 运行流程: 5 | 6 | ############################################################################## 7 | #############################第一步:数据处理################################# 8 | ############################################################################## 9 | 10 | $python raw2bio.py -1 #将训练数据分词并贴上字典特征 11 | $python raw2bio.py -2 #将标签数据分词并贴上标签 12 | $python raw2bio.py -3 #将标签保存成pickle文件为了后面将训练数据与标签合在一起 13 | $python raw2bio.py -4 #将标签与训练数据文本接起来构成如下格式 14 | $python raw2bio.py -1 test #将测试数据处理成需要的格式 15 | 16 | ############################################################################# 17 | #############################第二步:模型训练################################ 18 | ############################################################################# 19 | 20 | $bash wapiti_ccks.sh #训练模型,模型储存在/eval/bio_ccks中 21 | 22 | ############################################################################# 23 | #############################第三步:获得结果############################## 24 | ############################################################################# 25 | 26 | $python get_result.py #提取结果文件,结果保存在CCKS_result中其格式为BIO和finall中格式为官方标签格式 27 | $python onefile.py #将结果转成提交格式 28 | 29 | 30 | -------------------------------------------------------------------------------- /Wapiti/COPYING: -------------------------------------------------------------------------------- 1 | Wapiti - A linear-chain CRF tool 2 | 3 | Copyright (c) 2009-2013 CNRS 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 
10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 18 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 | POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /Wapiti/README.mkd: -------------------------------------------------------------------------------- 1 | # Wapiti - A linear-chain CRF tool 2 | 3 | Copyright (c) 2009-2013 CNRS 4 | All rights reserved. 5 | 6 | For more detailed information see the [homepage](http://wapiti.limsi.fr). 7 | 8 | Wapiti is a very fast toolkit for segmenting and labeling sequences with 9 | discriminative models. It is based on maxent models, maximum entropy Markov 10 | models and linear-chain CRF and proposes various optimization and regularization 11 | methods to improve both the computational complexity and the prediction 12 | performance of standard models. Wapiti is ranked first on the sequence tagging 13 | task for more than a year on MLcomp web site. 
14 | 15 | Wapiti is developed by LIMSI-CNRS and was partially funded by ANR projects 16 | CroTaL (ANR-07-MDCO-003) and MGA (ANR-07-BLAN-0311-02). 17 | 18 | For suggestions, comments, or patches, you can contact me at lavergne@limsi.fr 19 | 20 | If you use Wapiti for research purposes, please use the following citation: 21 | 22 | @inproceedings{lavergne2010practical, 23 | author = {Lavergne, Thomas and Capp\'{e}, Olivier and Yvon, 24 | Fran\c{c}ois}, 25 | title = {Practical Very Large Scale {CRFs}}, 26 | booktitle = {Proceedings of the 48th Annual Meeting of the Association 27 | for Computational Linguistics ({ACL})}, 28 | month = {July}, 29 | year = {2010}, 30 | location = {Uppsala, Sweden}, 31 | publisher = {Association for Computational Linguistics}, 32 | pages = {504--513}, 33 | url = {http://www.aclweb.org/anthology/P10-1052} 34 | } 35 | 36 | -------------------------------------------------------------------------------- /Wapiti/src/trainers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
#ifndef trainers_h
#define trainers_h

#include "model.h"

/*
 * Training entry points. Each one takes the model to train as its only
 * argument and returns nothing; the implementations are defined outside this
 * header.
 * NOTE(review): the suffixes presumably name the optimization algorithm
 * (L-BFGS, SGD with L1 regularization, block coordinate descent, RPROP) --
 * confirm against the corresponding source files.
 */
void trn_lbfgs(mdl_t *mdl);
void trn_sgdl1(mdl_t *mdl);
void trn_bcd(mdl_t *mdl);
void trn_rprop(mdl_t *mdl);

#endif
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef progress_h 29 | #define progress_h 30 | 31 | #include 32 | #include 33 | 34 | #include "wapiti.h" 35 | #include "model.h" 36 | 37 | extern bool uit_stop; 38 | 39 | void uit_setup(mdl_t *mdl); 40 | void uit_cleanup(mdl_t *mdl); 41 | bool uit_progress(mdl_t *mdl, uint32_t it, double obj); 42 | 43 | #endif 44 | 45 | -------------------------------------------------------------------------------- /Flyon/CCKS_CRF/dic/zhenzhaung.txt: -------------------------------------------------------------------------------- 1 | 上 2 | 上皮 3 | 上睑 4 | 上臂 5 | 下垂 6 | 下方 7 | 下肢 8 | 下腹 9 | 下降 10 | 不 11 | 不利 12 | 不泽 13 | 不语 14 | 不适 15 | 丘疹 16 | 两侧 17 | 两畔 18 | 两腿 19 | 乳房 20 | 二尖瓣 21 | 体质 22 | 便 23 | 便秘 24 | 偏 25 | 充血 26 | 全身 27 | 关节痛 28 | 前庭 29 | 前胸 30 | 剑 31 | 动作 32 | 动脉 33 | 区 34 | 压痛 35 | 厚腻 36 | 双侧 37 | 双眼 38 | 发僵 39 | 发冷 40 | 发黑 41 | 受限 42 | 变直 43 | 口 44 | 口右 45 | 口唇 46 | 口糜 47 | 口胀 48 | 右 49 | 右侧 50 | 右眼 51 | 右足 52 | 后 53 | 呛 54 | 咽涩 55 | 增加 56 | 增生 57 | 多 58 | 大 59 | 大便 60 | 大腿 61 | 失神 62 | 头白 63 | 如常 64 | 嫩红 65 | 小便 66 | 少 67 | 尚可 68 | 尿 69 | 左 70 | 左侧 71 | 左手 72 | 左耳 73 | 左胁 74 | 左足 75 | 左足心 76 | 干裂 77 | 延后 78 | 异常 79 | 心前 80 | 心悸 81 | 心肌炎 82 | 意识 83 | 憋闷 84 | 扁桃腺 85 | 手臂 86 | 手足 87 | 指 88 | 指关节 89 | 挫伤 90 | 掌 91 | 搏动 92 | 收敛 93 | 斑 94 | 无 95 | 明显 96 | 昏 97 | 昏寐 98 | 暗 99 | 月经 100 | 有津 101 | 染色 102 | 正常 103 | 殷红 104 | 毛 105 | 气窜 106 | 泡沫 107 | 活动 108 | 流血 109 | 浮肿 110 | 消失 111 | 湿 112 | 溃疡 113 | 滤泡 114 | 
点状 115 | 热 116 | 牙齿 117 | 特殊 118 | 疖肿 119 | 疙瘩 120 | 疲劳感 121 | 疼 122 | 疼痛 123 | 病变 124 | 病毒性 125 | 痒 126 | 痛 127 | 痰 128 | 瘢痕 129 | 白睛 130 | 白稍 131 | 皮肤 132 | 皮色 133 | 眠 134 | 着色 135 | 睑下垂 136 | 睡觉 137 | 硬 138 | 神识 139 | 稳 140 | 突下 141 | 筋 142 | 糜烂 143 | 紧张 144 | 紫 145 | 红 146 | 红疹 147 | 红色 148 | 红赤 149 | 经行 150 | 结膜 151 | 网膜 152 | 肌肉 153 | 股 154 | 肢体 155 | 肿大 156 | 肿痛 157 | 肿胀 158 | 胀痛 159 | 胃 160 | 背不舒 161 | 背白 162 | 背部 163 | 胫 164 | 胫后 165 | 胸 166 | 脉实 167 | 脊 168 | 脏 169 | 脱出 170 | 脱落 171 | 腓肠肌 172 | 腕凉 173 | 腮 174 | 腹痛 175 | 腻 176 | 臀部 177 | 舌 178 | 舌下 179 | 舌尖 180 | 舌根 181 | 舌苔 182 | 舌质 183 | 舌边 184 | 色少 185 | 色斑 186 | 色暗 187 | 萎缩 188 | 血压 189 | 角膜 190 | 诱发 191 | 足 192 | 足趾 193 | 趾 194 | 踏实 195 | 转筋 196 | 轻度 197 | 返流 198 | 酸软 199 | 重坠 200 | 闭目 201 | 闷 202 | 阴茎 203 | 隐痛 204 | 隐隐作痛 205 | 静脉 206 | 非 207 | 非面 208 | 面瘦 209 | 面色 210 | 面部 211 | 面颊 212 | 颈 213 | 颈椎 214 | 颈部 215 | 频发 216 | 颞侧 217 | 骨 218 | 麻木 219 | 黄 220 | 黄暗 221 | 黄染 222 | 黑 223 | 鼻 224 | -------------------------------------------------------------------------------- /Wapiti/src/thread.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef thread_h 29 | #define thread_h 30 | 31 | #include 32 | #include 33 | 34 | #include "model.h" 35 | 36 | typedef struct job_s job_t; 37 | 38 | typedef void (func_t)(job_t *job, uint32_t id, uint32_t cnt, void *ud); 39 | 40 | bool mth_getjob(job_t *job, uint32_t *cnt, uint32_t *pos); 41 | void mth_spawn(func_t *f, uint32_t W, void *ud[W], uint32_t size, uint32_t batch); 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /Wapiti/src/quark.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 
#ifndef quark_h
#define quark_h

/* bool, uint64_t and FILE are used in the declarations below. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Opaque string <-> 64-bit id mapping; defined in the implementation. */
typedef struct qrk_s qrk_t;

/* Lifetime: qrk_new returns a new object, qrk_free takes one to dispose of. */
qrk_t *qrk_new(void);
void qrk_free(qrk_t *qrk);

/* NOTE(review): presumably the number of strings stored -- confirm in quark.c. */
uint64_t qrk_count(const qrk_t *qrk);

/*
 * NOTE(review): presumably sets the locked state and returns the previous
 * one; what locking prevents is not visible here -- confirm in quark.c.
 */
bool qrk_lock(qrk_t *qrk, bool lock);

/* Conversions between a string key and its numeric id, in both directions. */
const char *qrk_id2str(const qrk_t *qrk, uint64_t id);
uint64_t qrk_str2id(qrk_t *qrk, const char *key);

/* Read the mapping from, or write it to, an already opened stream. */
void qrk_load(qrk_t *qrk, FILE *file);
void qrk_save(const qrk_t *qrk, FILE *file);

#endif
6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | #ifndef decoder_h 29 | #define decoder_h 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #include "wapiti.h" 36 | #include "model.h" 37 | #include "sequence.h" 38 | 39 | /* Decoding entry points. tag_viterbi takes a model, one read-only sequence and three output buffers (out, sc, psc). tag_nbviterbi additionally takes N, and its out and psc buffers have N columns per row. NOTE(review): presumably out receives the label ids, sc the sequence score(s) and psc the per-position scores -- confirm in decoder.c. */ void tag_viterbi(mdl_t *mdl, const seq_t *seq, 40 | uint32_t out[], double *sc, double psc[]); 41 | void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, uint32_t N, 42 | uint32_t out[][N], double sc[], double psc[][N]); 43 | 44 | /* tag_label takes an input and an output FILE; tag_eval reports through the two double pointers te and se. NOTE(review): presumably te/se are token and sequence error rates -- confirm in decoder.c. */ void tag_label(mdl_t *mdl, FILE *fin, FILE *fout); 45 | void tag_eval(mdl_t *mdl, double *te, double *se); 46 | 47 | #endif 48 | 49 | -------------------------------------------------------------------------------- /Wapiti/src/pattern.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef pattern_h 29 | #define pattern_h 30 | 31 | #include 32 | #include 33 | 34 | #include "sequence.h" 35 | 36 | /* pat_t / pat_item_t: a pattern is a small header (src, ntoks, nitems) followed by a flexible array member of items, so it has to be allocated with room for the items. NOTE(review): presumably src is the pattern source string, nitems the length of items, and each item's offset/column/absolute fields select a token cell -- confirm in pattern.c. */ typedef struct pat_s pat_t; 37 | typedef struct pat_item_s pat_item_t; 38 | struct pat_s { 39 | char *src; 40 | uint32_t ntoks; 41 | uint32_t nitems; 42 | struct pat_item_s { 43 | char type; 44 | bool caps; 45 | char *value; 46 | bool absolute; 47 | int32_t offset; 48 | uint32_t column; 49 | } items[]; 50 | }; 51 | 52 | /* pat_comp takes a non-const string and returns a pat_t pointer; pat_exec takes a read-only pattern, a read-only tok_t and a position at, and returns a char pointer; pat_free takes the pattern to release. NOTE(review): ownership of the string given to pat_comp and of the string returned by pat_exec is not visible here -- confirm in pattern.c. */ pat_t *pat_comp(char *p); 53 | char *pat_exec(const pat_t *pat, const tok_t *tok, uint32_t at); 54 | void pat_free(pat_t *pat); 55 | 56 | #endif 57 | 58 | -------------------------------------------------------------------------------- /Wapiti/src/tools.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution.
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #ifndef tools_h 28 | #define tools_h 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | /* unused(v): evaluates v and casts the result to void. none: the value -1 converted to uint64_t, i.e. the largest uint64_t. NOTE(review): none is presumably used as a "no such id" marker -- confirm at the call sites. */ #define unused(v) ((void)(v)) 36 | #define none ((uint64_t)-1) 37 | 38 | /* min / max: plain ternary macros. Each argument is expanded twice, so do not pass expressions with side effects. */ #define min(a, b) ((a) < (b) ? (a) : (b)) 39 | #define max(a, b) ((a) < (b) ?
(b) : (a)) 40 | 41 | /* Message helpers, each taking a format-like msg followed by variadic arguments. NOTE(review): presumably printf-style, with fatal and pfatal terminating the program -- confirm in tools.c. */ void fatal(const char *msg, ...); 42 | void pfatal(const char *msg, ...); 43 | void warning(const char *msg, ...); 44 | void info(const char *msg, ...); 45 | 46 | /* Allocation helpers with the same shapes as malloc, realloc and strdup. NOTE(review): the x prefix presumably means they abort instead of returning NULL -- confirm in tools.c. */ void *xmalloc(size_t size); 47 | void *xrealloc(void *ptr, size_t size); 48 | char *xstrdup(const char *str); 49 | 50 | /* ns_readstr returns a string read from file, ns_writestr writes str to file. NOTE(review): the on-disk string format and the ownership of the returned string are not visible here -- confirm in tools.c. */ char *ns_readstr(FILE *file); 51 | void ns_writestr(FILE *file, const char *str); 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /Flyon/CCKS_CRF/eval/CCKS_result/入院记录现病史-2.txt: -------------------------------------------------------------------------------- 1 | , 0 1 O 2 | 患者 1 3 O 3 | 于 3 4 O 4 | 半月 4 6 O 5 | 前 6 7 O 6 | 无 7 8 O 7 | 明显 8 10 O 8 | 诱因 10 12 O 9 | 出现 12 14 O 10 | 进食 14 16 O 11 | 后中 16 18 B-解剖部位 12 | 上腹 18 20 I-解剖部位 13 | 不适 20 22 B-症状描述 14 | , 22 23 O 15 | 每次 23 25 O 16 | 持续 25 27 O 17 | 数分钟 27 30 O 18 | 自行 30 32 O 19 | 缓解 32 34 O 20 | , 34 35 O 21 | 无 35 36 O 22 | 恶心 36 38 B-独立症状 23 | 、 38 39 O 24 | 呕吐 39 41 B-独立症状 25 | 、 41 42 O 26 | 反酸 42 44 B-独立症状 27 | 、 44 45 O 28 | 嗳气 45 47 B-独立症状 29 | 、 47 48 O 30 | 烧心 48 50 B-独立症状 31 | , 50 51 O 32 | 无 51 52 O 33 | 腹痛 52 54 O 34 | 、 54 55 O 35 | 腹胀 55 57 O 36 | 、 57 58 O 37 | 腹泻 58 60 O 38 | 、 60 61 O 39 | 便秘 61 63 B-独立症状 40 | , 63 64 O 41 | 无厌 64 66 O 42 | 油 66 67 O 43 | 、 67 68 O 44 | 纳差 68 70 B-独立症状 45 | , 70 71 O 46 | 未予 71 73 O 47 | 重视 73 75 O 48 | , 75 76 O 49 | 未 76 77 O 50 | 特殊 77 79 O 51 | 处理 79 81 O 52 | 。 81 82 O 53 | , 82 83 O 54 | 半前 83 85 O 55 | 至 85 86 O 56 | 我院 86 88 O 57 | 门诊 88 90 O 58 | 行 90 91 O 59 | 胃镜 91 93 O 60 | 检查 93 95 O 61 | 提示 95 97 O 62 | : 97 98 O 63 | 浅表性 98 101 O 64 | 胃窦炎 101 104 O 65 | 伴 104 105 O 66 | 糜烂 105 107 O 67 | , 107 108 O 68 | 十二指肠 108 112 B-解剖部位 69 | 球炎 112 114 O 70 | , 114 115 O 71 | , 115 116 O 72 | 腹部 116 118 B-解剖部位 73 | 彩超 118 120 O 74 | : 120 121 O 75 | 肝 121 122 B-解剖部位 76 | 回声 122 124 O 77 | 增多 124 126 O 78 | , 126 127 O 79 | 胆囊息肉 127 131 O 80 | 样变 131 133 O 81 | 。 133 134 O 82 | 为 134 135 O 83 | 进一步 135 138
O 84 | 诊治 138 140 O 85 | , 140 141 O 86 | 门诊 141 143 O 87 | “ 143 144 O 88 | 胃炎 144 146 O 89 | ” 146 147 O 90 | 收入 147 149 O 91 | 我 149 150 O 92 | 科 150 151 O 93 | 。 151 152 O 94 | \ 152 153 O 95 | U0004 153 158 O 96 | 158 159 O 97 | 159 160 O 98 | 160 161 O 99 | 患者 161 163 O 100 | 本次 163 165 O 101 | 发病 165 167 O 102 | 以来 167 169 O 103 | , 169 170 O 104 | 食欲 170 172 O 105 | 正常 172 174 O 106 | , 174 175 O 107 | 175 176 O 108 | 神志 176 178 O 109 | 清醒 178 180 O 110 | , 180 181 O 111 | 精神 181 183 O 112 | 尚可 183 185 O 113 | , 185 186 O 114 | 睡眠 186 188 O 115 | 尚可 188 190 O 116 | , 190 191 O 117 | 大便 191 193 O 118 | 正常 193 195 O 119 | , 195 196 O 120 | 小便 196 198 O 121 | 正常 198 200 O 122 | , 200 201 O 123 | 体重 201 203 O 124 | 无 203 204 O 125 | 明显 204 206 O 126 | 变化 206 208 O 127 | 。 208 209 O 128 | 209 211 O 129 | -------------------------------------------------------------------------------- /Wapiti/src/vmath.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef vmath_h 29 | #define vmath_h 30 | 31 | #include 32 | 33 | /* xvm_mode: takes no argument and returns a constant string. NOTE(review): presumably names the active implementation (wapiti.h mentions an SSE2 / ANSI switch) -- confirm in vmath.c. */ const char *xvm_mode(void); 34 | 35 | /* xvm_new returns an array of doubles for a length N and xvm_free takes such an array. NOTE(review): presumably the two must be paired because of alignment requirements -- confirm in vmath.c. */ double *xvm_new(uint64_t N); 36 | void xvm_free(double x[]); 37 | 38 | /* Vector helpers over arrays of N doubles: r is the writable result, x and y are read-only inputs, a is a scalar. NOTE(review): the names (neg, sub, scale, unit, norm, dot, axpy, expma) presumably follow the usual vector-math meanings; the exact formulas and the value returned by xvm_unit must be confirmed in vmath.c. */ void xvm_neg(double r[], const double x[], uint64_t N); 39 | void xvm_sub(double r[], const double x[], const double y[], uint64_t N); 40 | void xvm_scale(double r[], const double x[], double a, uint64_t N); 41 | double xvm_unit(double r[], const double x[], uint64_t N); 42 | 43 | double xvm_norm(const double x[], uint64_t N); 44 | double xvm_dot(const double x[], const double y[], uint64_t N); 45 | 46 | void xvm_axpy(double r[], double a, const double x[], const double y[], 47 | uint64_t N); 48 | 49 | void xvm_expma(double r[], const double x[], double a, uint64_t N); 50 | 51 | #endif 52 | 53 | -------------------------------------------------------------------------------- /Wapiti/src/wapiti.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer.
11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #ifndef wapiti_h 28 | #define wapiti_h 29 | 30 | #define VERSION "1.5.0" 31 | 32 | /* XVM_ANSI: 33 | * By uncommenting the following define, you can force Wapiti not to use SSE2 34 | * even if available. 35 | */ 36 | //#define XVM_ANSI 37 | 38 | /* MTH_ANSI: 39 | * By uncommenting the following define, you can disable the use of POSIX 40 | * threads in the multi-threading part of Wapiti, for non-POSIX systems. 41 | */ 42 | //#define MTH_ANSI 43 | 44 | /* ATM_ANSI: 45 | * By uncommenting the following define, you can disable the use of atomic 46 | * operations to update the gradient. This implies that multi-threaded gradient 47 | * computation will require more memory but is more portable. 48 | */ 49 | //#define ATM_ANSI 50 | 51 | /* Without multi-threading we disable atomic updates as they are not needed and 52 | * can only decrease performance in this case.
53 | */ 54 | #ifdef MTH_ANSI 55 | #define ATM_ANSI 56 | #endif 57 | 58 | #endif 59 | 60 | -------------------------------------------------------------------------------- /Wapiti/src/options.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #ifndef options_h 28 | #define options_h 29 | 30 | #include 31 | #include 32 | 33 | #include "wapiti.h" 34 | 35 | /* opt_t: 36 | * This structure holds all user-configurable parameters for Wapiti and is 37 | * filled with parameters from the command line. 38 | */ 39 | typedef struct opt_s opt_t; 40 | struct opt_s { 41 | int mode; 42 | char *input, *output; 43 | bool maxent; 44 | // Options for training 45 | char *type; 46 | char *algo, *pattern; 47 | char *model, *devel; 48 | char *rstate, *sstate; 49 | bool compact, sparse; 50 | uint32_t nthread; 51 | uint32_t jobsize; 52 | uint32_t maxiter; 53 | double rho1, rho2; 54 | // Window size criterion 55 | uint32_t objwin; 56 | uint32_t stopwin; 57 | double stopeps; 58 | // Options specific to L-BFGS 59 | struct { 60 | bool clip; 61 | uint32_t histsz; 62 | uint32_t maxls; 63 | } lbfgs; 64 | // Options specific to SGD-L1 65 | struct { 66 | double eta0; 67 | double alpha; 68 | } sgdl1; 69 | // Options specific to BCD 70 | struct { 71 | double kappa; 72 | } bcd; 73 | // Options specific to RPROP 74 | struct { 75 | double stpmin; 76 | double stpmax; 77 | double stpinc; 78 | double stpdec; 79 | bool cutoff; 80 | } rprop; 81 | // Options for labelling 82 | bool label; 83 | bool check; 84 | bool outsc; 85 | bool lblpost; 86 | uint32_t nbest; 87 | bool force; 88 | // Options for model dump 89 | int prec; 90 | bool all; 91 | }; 92 | 93 | /* opt_defaults: constant opt_t object defined in another translation unit. NOTE(review): presumably holds the default value of every option -- confirm in options.c. */ extern const opt_t opt_defaults; 94 | 95 | /* opt_parse: takes the program arguments (argc, argv) and the opt_t to fill. NOTE(review): behaviour on invalid arguments is not visible here -- confirm in options.c. */ void opt_parse(int argc, char *argv[argc], opt_t *opt); 96 | 97 | #endif 98 | 99 | -------------------------------------------------------------------------------- /Wapiti/src/reader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved.
6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef reader_h 29 | #define reader_h 30 | 31 | #include 32 | #include 33 | 34 | #include "wapiti.h" 35 | #include "pattern.h" 36 | #include "quark.h" 37 | #include "sequence.h" 38 | 39 | /* rdr_t: 40 | * The reader object, which holds all the information needed to parse the input 41 | * file: the patterns and the quarks for labels and observations. We keep separate 42 | * counts of unigram and bigram patterns for simpler allocation of sequences. We 43 | * also store the expected number of columns in the input data to check that 44 | * the patterns are applicable.
45 | */ 46 | typedef struct rdr_s rdr_t; 47 | struct rdr_s { 48 | bool autouni; // Automatically add 'u' prefix 49 | uint32_t npats; // P Total number of patterns 50 | uint32_t nuni, nbi; // Number of unigram and bigram patterns 51 | uint32_t ntoks; // Expected number of tokens in input 52 | pat_t **pats; // [P] List of precompiled patterns 53 | qrk_t *lbl; // Labels database 54 | qrk_t *obs; // Observation database 55 | }; 56 | 57 | /* Construction and release: rdr_new takes an autouni flag and returns a reader; each rdr_free* function takes the object it releases (reader, raw_t, seq_t or dat_t). */ rdr_t *rdr_new(bool autouni); 58 | void rdr_free(rdr_t *rdr); 59 | void rdr_freeraw(raw_t *raw); 60 | void rdr_freeseq(seq_t *seq); 61 | void rdr_freedat(dat_t *dat); 62 | 63 | /* Input functions: each takes the reader plus a FILE (or a raw_t for rdr_raw2seq). NOTE(review): the lbl flag presumably tells whether the input carries labels -- confirm in reader.c. */ void rdr_loadpat(rdr_t *rdr, FILE *file); 64 | raw_t *rdr_readraw(rdr_t *rdr, FILE *file); 65 | seq_t *rdr_raw2seq(rdr_t *rdr, const raw_t *raw, bool lbl); 66 | seq_t *rdr_readseq(rdr_t *rdr, FILE *file, bool lbl); 67 | dat_t *rdr_readdat(rdr_t *rdr, FILE *file, bool lbl); 68 | 69 | /* rdr_load / rdr_save: read or write the reader through a FILE; rdr_save takes a const reader. */ void rdr_load(rdr_t *rdr, FILE *file); 70 | void rdr_save(const rdr_t *rdr, FILE *file); 71 | 72 | /* rdr_readline: returns a char pointer for a line read from file. NOTE(review): presumably heap-allocated and owned by the caller -- confirm in reader.c. */ char *rdr_readline(FILE *file); 73 | 74 | #endif 75 | 76 | -------------------------------------------------------------------------------- /Wapiti/HISTORY: -------------------------------------------------------------------------------- 1 | 18/12/2013 2 | Release v1.5.0: Update mode and bug fixes 3 | 4 | Add precision specifier for model dumping 5 | Add update mode to modify a model 6 | Lots of english corrections in the manual 7 | Fix bug in model format compatibility 8 | Fix memory allocation with large models 9 | Fix small memory bug in quark database 10 | Fix bug with bigram features in raw mode 11 | 12 | 23/04/2012 13 | Release v1.4.0: Forced decoding, optimizer state, and bug fixes 14 | 15 | Add forced decoding to partialy decode sequences 16 | Add optimizer state saving for L-BFGS and R-PROP 17 | Switched to elapsed time instead of wall time in progress 18 | Fix bug in Makefile (thanks to Lars Buitinck) 19 | Fix local normalization decoding in MEMM (thanks to Anoop Deoras) 20 |
Fix bad handling of objwin option 21 | Fix bug in reader for single letter obs and lbl 22 | 23 | 03/11/2011 24 | Release v1.3.0: MEMM, faster gradient and bug fixes 25 | 26 | Added support for Maximum Entropy Markov Models. 27 | Added use of atomic operations in gradient computation 28 | Improved RProp numerical stability 29 | Fix bug with unseen features in raw mode (thanks to Jurgen Van Gael) 30 | Fix bug discarding some features in maxent mode (thanks to George Foster) 31 | Switched code to stdint, should resolve some issues with size_t 32 | on exotic systems. 33 | 34 | 07/07/2011 35 | Release v1.2.0: RProp improvements and bug fix 36 | 37 | Switch from splay-trees to critbit-tries 38 | Add RProp+ and RProp- variants of the RProp algorithm 39 | Add a new projection scheme for RProp with l1 40 | Make maxent work with sequences 41 | Add space matching in regexp 42 | Fix a few small bugs 43 | 44 | 19/03/2011 45 | Release v1.1.3: Some small improvements 46 | 47 | New option --jobsize for fine-grained multi-threading 48 | Improved SGD index construction: a lot faster 49 | Fix a small bug in sparse multi-threaded gradient 50 | 51 | 12/11/2010 52 | Release v1.1.2: Bug fix release 53 | 54 | Fix a small bug in L-BFGS/OWL-QN, should slightly improve 55 | convergence speed in some cases. 56 | Fix a bug in multi-thread job system thanks to Alexander Fraser, 57 | should fix error rates and training speed on large datasets. 58 | Fix two small memory leaks. 59 | Some improvements in quark database handling. 60 | 61 | 24/09/2010 62 | Release v1.1.1: Mainly multi-threading improvements 63 | 64 | RPROP algorithm is now fully multi-threaded. 65 | Error rate estimation during training is now multi-threaded. 66 | Better job scheduling in multi-threaded gradient. 67 | Multi-threading code can be disabled (compilation on Windows should 68 | be simpler). 69 | Fixed bug in L1 optimization with RPROP (should improve stability).
70 | 71 | 08/09/2010 72 | Release v1.1.0: A few new features 73 | 74 | Added maxent mode. 75 | Added decoding through posteriors, this should improve accuracy 76 | at the price of computational time. 77 | Added the RPROP optimization algorithm. 78 | Added absolute indexing in patterns. 79 | Changed the scored output format as the posterior decoding 80 | provide normalized score at each position. The output is now 81 | compatible with CRF++. 82 | Some code cleanup. 83 | 84 | 29/07/2010 85 | Release v1.0.2: Mainly a bug fix version. 86 | 87 | Fixed some memory leaks, thanks to David Keeler 88 | Fixed argument processing to be more user friendly 89 | Fixed small bug in model compaction 90 | Added reading of raw files 91 | Spell corrections in man page 92 | 93 | 18/06/2010 94 | Release v1.0.0: Initial public version. 95 | 96 | -------------------------------------------------------------------------------- /Flyon/CCKS_CRF/eval/get_reult.py: -------------------------------------------------------------------------------- 1 | #! 
# -*- coding:utf-8 -*-
"""
AUTHOR:zhoukaiyin
Time: 2018-07-03

Post-processing of a tagged result file:
  1. split_result() splits the tab-separated tagged file into one token file
     per record;
  2. change_format() merges the B-/I- tagged tokens of every token file into
     one line per entity.
"""
import glob
import os


def del_file(dirname):
    """Remove every regular file located directly inside ``dirname``.

    Sub-directories (and anything else that is not a regular file) are left
    untouched.
    """
    for name in os.listdir(dirname):
        f_path = os.path.join(dirname, name)
        if os.path.isfile(f_path):
            os.remove(f_path)


def split_result(openfile, out_dir="./CCKS_result"):
    """Split the tagged file ``openfile`` into one token file per record.

    Every tab-separated row with at least four columns is read as
    ``word, ..., record id, start, end, label`` (the last four columns are
    addressed from the end of the row) and appended as
    ``word<TAB>start<TAB>end<TAB>label`` to ``入院记录现病史-<record id>.txt``
    inside ``out_dir``. Shorter rows are skipped.

    Files are opened in append mode, so rows accumulate across runs unless
    the output directory is emptied first.
    """
    os.makedirs(out_dir, exist_ok=True)
    with open(openfile, 'r') as rf:
        for line in rf:
            lis = line.split('\t')
            # lis[-4] needs four columns; shorter rows carry no record id.
            if len(lis) < 4:
                continue
            filename = "入院记录现病史-" + lis[-4] + ".txt"
            # The label is the last column and still carries the row's
            # newline, so no separator is added after it.
            with open(os.path.join(out_dir, filename), 'a') as wf:
                wf.write("{}\t{}\t{}\t{}".format(lis[0], lis[-3], lis[-2], lis[-1]))


def change_format(base_path, out_dir="./finall"):
    """Merge B-/I- tagged tokens into one output row per entity.

    For every regular file in ``base_path`` (rows of
    ``word<TAB>start<TAB>end<TAB>label``) a file with the same name is written
    to ``out_dir``. Each output row is ``text<TAB>start<TAB>end<TAB>type``:
    the concatenated words of the entity, the start of its first token, the
    end of its last token, and the part of its first label after the dash.

    An entity starts at a ``B-`` label and runs until the next ``O`` or
    ``B-``. An ``I-`` label directly after an ``O`` (or at the very start of
    the file) has no opening ``B-`` and that token is dropped.
    """
    os.makedirs(out_dir, exist_ok=True)
    for path in glob.glob(base_path + "/*"):
        if not os.path.isfile(path):
            continue
        base_name = os.path.basename(path)
        with open(path, 'r') as rf, open(os.path.join(out_dir, base_name), 'w') as wf:
            groups = []      # token runs, each beginning at an entity start
            words = []       # run currently being collected
            before = "O"     # label of the previous row; nothing precedes row 0
            for line in rf:
                lis = line.rstrip().split('\t')
                # Blank or malformed rows have no word/start/end to emit.
                if len(lis) < 3:
                    continue
                label = lis[-1]
                prefix = label.split('-')[0]
                if prefix == "B":
                    # A new entity closes whatever was being collected.
                    groups.append(words)
                    words = [lis]
                elif prefix == "I" and before == "O":
                    # Orphan continuation: keep the run collected so far,
                    # drop only this token.
                    groups.append(words)
                    words = []
                else:
                    words.append(lis)
                before = label
            groups.append(words)

            for needs in groups:
                # The entity is the leading part of the run, up to the first O.
                entity = []
                for tok in needs:
                    if tok[-1] == "O":
                        break
                    entity.append(tok)
                if entity:
                    label = entity[0][-1].split('-')[-1]
                    text = ''.join(tok[0] for tok in entity)
                    wf.write("{}\t{}\t{}\t{}\n".format(text, entity[0][1], entity[-1][2], label))


def main():
    """Split the first ``bio_ccks/*.tab`` file, then rebuild the entity files."""
    base_path = "CCKS_result"
    files = sorted(glob.glob("bio_ccks/*.tab"))
    if not files:
        raise SystemExit("no .tab file found in bio_ccks/")
    # split_result appends to existing files; call del_file(base_path) first
    # when a clean rebuild is wanted.
    split_result(files[0])
    change_format(base_path)


if __name__ == "__main__":
    main()
129 O 80 | 缓解 129 131 O 81 | , 131 132 O 82 | 无 132 133 O 83 | 恶心 133 135 B-独立症状 84 | 、 135 136 O 85 | 呕吐 136 138 B-独立症状 86 | 、 138 139 O 87 | 返 139 140 B-独立症状 88 | 酸 140 141 I-独立症状 89 | 、 141 142 O 90 | 嗳气 142 144 B-独立症状 91 | 、 144 145 O 92 | 纳差 145 147 B-独立症状 93 | 、 147 148 O 94 | 乏力 148 150 B-独立症状 95 | 、 150 151 O 96 | 消瘦 151 153 B-独立症状 97 | 、 153 154 O 98 | 腹泻 154 156 O 99 | 、 156 157 O 100 | 便秘 157 159 B-独立症状 101 | , 159 160 O 102 | 无 160 161 O 103 | 呕血 161 163 B-独立症状 104 | 、 163 164 O 105 | 黑 164 165 B-独立症状 106 | 便 165 166 I-独立症状 107 | , 166 167 O 108 | 无 167 168 O 109 | 发热 168 170 B-独立症状 110 | 、 170 171 O 111 | 咳嗽 171 173 B-独立症状 112 | 、 173 174 O 113 | 咳痰 174 176 B-独立症状 114 | , 176 177 O 115 | 自行 177 179 O 116 | 服用 179 181 O 117 | 胃药 181 183 O 118 | ( 183 184 O 119 | 具体 184 186 O 120 | 不详 186 188 O 121 | ) 188 189 O 122 | , 189 190 O 123 | 症状 190 192 O 124 | 有所 192 194 O 125 | 好转 194 196 O 126 | , 196 197 O 127 | , 197 198 O 128 | 此次 198 200 O 129 | 至 200 201 O 130 | 我院 201 203 O 131 | 门诊 203 205 O 132 | 行 205 206 O 133 | 胃镜 206 208 O 134 | 检查 208 210 O 135 | 示 210 211 O 136 | : 211 212 O 137 | “ 212 213 O 138 | 1. 213 215 O 139 | 胃体 215 217 B-解剖部位 140 | 溃疡 217 219 O 141 | 型 219 220 O 142 | 癌 220 221 O 143 | ( 221 222 O 144 | 早 222 223 O 145 | 癌 223 224 O 146 | 可能 224 226 O 147 | ) 226 227 O 148 | 。 227 228 O 149 | 2. 228 230 O 150 | 贲门 230 232 B-解剖部位 151 | 炎症 232 234 O 152 | 。 234 235 O 153 | 3. 
235 237 O 154 | 慢性 237 239 O 155 | 萎缩性 239 242 O 156 | 胃炎 242 244 O 157 | ” 244 245 O 158 | , 245 246 O 159 | , 246 247 O 160 | 病理 247 249 O 161 | : 249 250 O 162 | ( 250 251 O 163 | 体小 251 253 O 164 | ) 253 254 O 165 | , 254 255 O 166 | : 255 256 O 167 | 管状 256 258 O 168 | 腺癌 258 260 O 169 | II 260 262 O 170 | - 262 263 O 171 | III 263 266 O 172 | 级 266 267 O 173 | 。 267 268 O 174 | 遂 268 269 O 175 | 门诊 269 271 O 176 | 拟 271 272 O 177 | “ 272 273 O 178 | 胃体 273 275 B-解剖部位 179 | 癌 275 276 O 180 | ” 276 277 O 181 | 收入 277 279 O 182 | 院 279 280 O 183 | 。 280 281 O 184 | 自 281 282 O 185 | 发病 282 284 O 186 | 以来 284 286 O 187 | , 286 287 O 188 | 精神 287 289 O 189 | 、 289 290 O 190 | 睡眠 290 292 O 191 | 可 292 293 O 192 | , 293 294 O 193 | 食欲 294 296 O 194 | 一般 296 298 O 195 | , 298 299 O 196 | 大小便 299 302 O 197 | 如常 302 304 O 198 | , 304 305 O 199 | 体重 305 307 O 200 | 无 307 308 O 201 | 明显 308 310 O 202 | 变化 310 312 O 203 | 。 312 313 O 204 | 313 315 O 205 | -------------------------------------------------------------------------------- /Flyon/CCKS_CRF/test_label_split/入院记录现病史-2.txtoriginal.txt: -------------------------------------------------------------------------------- 1 | , x Nd Nj Ns Nz 2 0 1 2 | 患者 n Nd Nj Ns Nz 2 1 3 3 | 于 p dr Nj Ns Nz 2 3 4 4 | 半月 t Nd Nj Ns Nz 2 4 6 5 | 前 f Nd Nj Ns Nz 2 6 7 6 | 无 v Nd Nj Ns Nz 2 7 8 7 | 明显 a Nd Nj Ns zz 2 8 10 8 | 诱因 n Nd Nj Ns Nz 2 10 12 9 | 出现 v Nd Nj Ns Nz 2 12 14 10 | 进食 v dr Nj Ns Nz 2 14 16 11 | 后中 r Nd Nj Ns Nz 2 16 18 12 | 上腹 n Nd jp Ns Nz 2 18 20 13 | 不适 a Nd Nj Ns zz 2 20 22 14 | , x Nd Nj Ns Nz 2 22 23 15 | 每次 r dr Nj Ns Nz 2 23 25 16 | 持续 vd Nd Nj ss Nz 2 25 27 17 | 数分钟 m Nd Nj Ns Nz 2 27 30 18 | 自行 r Nd Nj Ns Nz 2 30 32 19 | 缓解 v Nd Nj Ns Nz 2 32 34 20 | , x Nd Nj Ns Nz 2 34 35 21 | 无 v Nd Nj Ns Nz 2 35 36 22 | 恶心 n Nd Nj Ns Nz 2 36 38 23 | 、 x Nd Nj Ns Nz 2 38 39 24 | 呕吐 v Nd Nj Ns Nz 2 39 41 25 | 、 x Nd Nj Ns Nz 2 41 42 26 | 反酸 n Nd Nj Ns Nz 2 42 44 27 | 、 x Nd Nj Ns Nz 2 44 45 28 | 嗳气 n Nd Nj Ns Nz 2 45 47 29 | 
、 x Nd Nj Ns Nz 2 47 48 30 | 烧心 v Nd Nj Ns Nz 2 48 50 31 | , x Nd Nj Ns Nz 2 50 51 32 | 无 v Nd Nj Ns Nz 2 51 52 33 | 腹痛 n Nd Nj Ns Nz 2 52 54 34 | 、 x Nd Nj Ns Nz 2 54 55 35 | 腹胀 v Nd Nj Ns Nz 2 55 57 36 | 、 x Nd Nj Ns Nz 2 57 58 37 | 腹泻 v dr Nj Ns Nz 2 58 60 38 | 、 x Nd Nj Ns Nz 2 60 61 39 | 便秘 a Nd Nj Ns Nz 2 61 63 40 | , x Nd Nj Ns Nz 2 63 64 41 | 无厌 v Nd Nj Ns Nz 2 64 66 42 | 油 n dr Nj Ns Nz 2 66 67 43 | 、 x Nd Nj Ns Nz 2 67 68 44 | 纳差 n Nd Nj Ns Nz 2 68 70 45 | , x Nd Nj Ns Nz 2 70 71 46 | 未予 v Nd Nj Ns Nz 2 71 73 47 | 重视 v Nd Nj Ns Nz 2 73 75 48 | , x Nd Nj Ns Nz 2 75 76 49 | 未 d Nd Nj Ns Nz 2 76 77 50 | 特殊 a Nd Nj Ns zz 2 77 79 51 | 处理 v Nd Nj Ns Nz 2 79 81 52 | 。 x Nd Nj Ns Nz 2 81 82 53 | 54 | , x Nd Nj Ns Nz 2 82 83 55 | 半前 t Nd Nj Ns Nz 2 83 85 56 | 至 p Nd Nj Ns Nz 2 85 86 57 | 我院 n Nd Nj Ns Nz 2 86 88 58 | 门诊 n Nd Nj Ns Nz 2 88 90 59 | 行 zg Nd Nj ss Nz 2 90 91 60 | 胃镜 n Nd Nj Ns Nz 2 91 93 61 | 检查 vn Nd Nj Ns Nz 2 93 95 62 | 提示 v Nd Nj Ns Nz 2 95 97 63 | : x Nd Nj Ns Nz 2 97 98 64 | 浅表性 n Nd Nj Ns Nz 2 98 101 65 | 胃窦炎 n Nd Nj Ns Nz 2 101 104 66 | 伴 v Nd Nj Ns Nz 2 104 105 67 | 糜烂 v Nd Nj Ns zz 2 105 107 68 | , x Nd Nj Ns Nz 2 107 108 69 | 十二指肠 l Nd Nj Ns Nz 2 108 112 70 | 球炎 n Nd Nj Ns Nz 2 112 114 71 | , x Nd Nj Ns Nz 2 114 115 72 | , x Nd Nj Ns Nz 2 115 116 73 | 腹部 n Nd Nj Ns Nz 2 116 118 74 | 彩超 j Nd Nj Ns Nz 2 118 120 75 | : x Nd Nj Ns Nz 2 120 121 76 | 肝 n Nd Nj Ns Nz 2 121 122 77 | 回声 v Nd Nj Ns Nz 2 122 124 78 | 增多 v Nd Nj Ns Nz 2 124 126 79 | , x Nd Nj Ns Nz 2 126 127 80 | 胆囊息肉 i Nd Nj ss Nz 2 127 131 81 | 样变 n Nd Nj Ns Nz 2 131 133 82 | 。 x Nd Nj Ns Nz 2 133 134 83 | 84 | 为 p Nd Nj ss Nz 2 134 135 85 | 进一步 d Nd Nj Ns Nz 2 135 138 86 | 诊治 v Nd Nj Ns Nz 2 138 140 87 | , x Nd Nj Ns Nz 2 140 141 88 | 门诊 n Nd Nj Ns Nz 2 141 143 89 | “ x Nd Nj Ns Nz 2 143 144 90 | 胃炎 n dr Nj Ns Nz 2 144 146 91 | ” x Nd Nj Ns Nz 2 146 147 92 | 收入 v Nd Nj Ns Nz 2 147 149 93 | 我 r Nd Nj Ns Nz 2 149 150 94 | 科 n dr Nj Ns Nz 2 150 151 95 | 。 x Nd Nj Ns Nz 2 151 152 96 | 
97 | \ x Nd Nj Ns Nz 2 152 153 98 | U0004 eng Nd Nj Ns Nz 2 153 158 99 | x Nd Nj Ns Nz 2 158 159 100 | x Nd Nj Ns Nz 2 159 160 101 | x Nd Nj Ns Nz 2 160 161 102 | 患者 n Nd Nj Ns Nz 2 161 163 103 | 本次 r Nd Nj Ns Nz 2 163 165 104 | 发病 v Nd Nj Ns Nz 2 165 167 105 | 以来 f Nd Nj Ns Nz 2 167 169 106 | , x Nd Nj Ns Nz 2 169 170 107 | 食欲 n Nd Nj Ns Nz 2 170 172 108 | 正常 d Nd Nj Ns zz 2 172 174 109 | , x Nd Nj Ns Nz 2 174 175 110 | x Nd Nj Ns Nz 2 175 176 111 | 神志 n Nd Nj Ns Nz 2 176 178 112 | 清醒 a dr Nj Ns Nz 2 178 180 113 | , x Nd Nj Ns Nz 2 180 181 114 | 精神 n Nd Nj Ns Nz 2 181 183 115 | 尚可 d Nd Nj Ns zz 2 183 185 116 | , x Nd Nj Ns Nz 2 185 186 117 | 睡眠 v Nd Nj ss Nz 2 186 188 118 | 尚可 d Nd Nj Ns zz 2 188 190 119 | , x Nd Nj Ns Nz 2 190 191 120 | 大便 d Nd Nj Ns Nz 2 191 193 121 | 正常 d Nd Nj Ns zz 2 193 195 122 | , x Nd Nj Ns Nz 2 195 196 123 | 小便 nr Nd Nj Ns zz 2 196 198 124 | 正常 d Nd Nj Ns zz 2 198 200 125 | , x Nd Nj Ns Nz 2 200 201 126 | 体重 n Nd Nj Ns Nz 2 201 203 127 | 无 v Nd Nj Ns Nz 2 203 204 128 | 明显 a Nd Nj Ns zz 2 204 206 129 | 变化 vn Nd Nj Ns Nz 2 206 208 130 | 。 x Nd Nj Ns Nz 2 208 209 131 | 132 | 133 | x Nd Nj Ns Nz 2 209 211 134 | -------------------------------------------------------------------------------- /Wapiti/src/model.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef model_h 29 | #define model_h 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #include "wapiti.h" 36 | #include "options.h" 37 | #include "sequence.h" 38 | #include "reader.h" 39 | 40 | typedef struct timeval tms_t; 41 | 42 | /* mdl_t: 43 | * Represent a linear-chain CRF model. The model contain both unigram and 44 | * bigram features. It is caracterized by the number of labels, 45 | * the number of observations, and the number of features. 46 | * 47 | * Each observations have a corresponding entry in whose first bit is 48 | * set if the observation is unigram and second one if it is bigram. Note that 49 | * an observation can be both. An unigram observation produce Y features and a 50 | * bigram one produce Y * Y features. 51 | * The array keep all features weights. The <*off> array give for each 52 | * observations the offset in the array where the features of the 53 | * observation are stored. 54 | * 55 | * The <*off> and array are initialized only when the model is 56 | * synchronized. 
As you can add new labels and observations after a sync, we 57 | * keep track of the old counts in and to detect inconsistency 58 | * and resynchronize the model if needed. In this case, if the number of 59 | * labels have not changed, the previously trained weights are kept, else they 60 | * are now meaningless so discarded. 61 | */ 62 | typedef struct mdl_s mdl_t; 63 | struct mdl_s { 64 | opt_t *opt; // options for training 65 | int type; // model type 66 | 67 | // Size of various model parameters 68 | uint32_t nlbl; // Y number of labels 69 | uint64_t nobs; // O number of observations 70 | uint64_t nftr; // F number of features 71 | 72 | // Informations about observations 73 | char *kind; // [O] observations type 74 | uint64_t *uoff; // [O] unigram weights offset 75 | uint64_t *boff; // [O] bigram weights offset 76 | 77 | // The model itself 78 | double *theta; // [F] features weights 79 | 80 | // Datasets 81 | dat_t *train; // training dataset 82 | dat_t *devel; // development dataset 83 | rdr_t *reader; 84 | 85 | // Stoping criterion 86 | double *werr; // Window of error rate of last iters 87 | uint32_t wcnt; // Number of iters in the window 88 | uint32_t wpos; // Position for the next iter 89 | 90 | // Timing 91 | tms_t timer; // start time of last iter 92 | double total; // total training time 93 | }; 94 | 95 | mdl_t *mdl_new(rdr_t *rdr); 96 | void mdl_free(mdl_t *mdl); 97 | void mdl_sync(mdl_t *mdl); 98 | void mdl_compact(mdl_t *mdl); 99 | void mdl_save(mdl_t *mdl, FILE *file); 100 | void mdl_load(mdl_t *mdl, FILE *file); 101 | 102 | #endif 103 | -------------------------------------------------------------------------------- /Wapiti/src/gradient.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 
6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef gradient_h 29 | #define gradient_h 30 | 31 | #include "wapiti.h" 32 | #include "model.h" 33 | #include "sequence.h" 34 | 35 | /* grd_st_t: 36 | * State tracker for the gradient computation. To compute the gradient we need 37 | * to perform several steps and communicate between them a lot of intermediate 38 | * values, all these temporary are stored in this object. 39 | * A tracker can be used to compute sequence of length at most, before 40 | * using it you must call grd_stcheck to ensure that the tracker is big enough 41 | * for your sequence. 
42 | * This tracker is used to perform single sample gradient computations or 43 | * partial gradient computation in online algorithms and for decoding with 44 | * posteriors. 45 | */ 46 | typedef struct grd_st_s grd_st_t; 47 | struct grd_st_s { 48 | mdl_t *mdl; 49 | uint32_t len; // =T max length of sequence 50 | double *g; // [F] vector where to put gradient updates 51 | double lloss; // loss value for the sequence 52 | double *psi; // [T][Y][Y] the transitions scores 53 | double *psiuni; // [T][Y] | Same as psi in sparse format 54 | uint32_t *psiyp; // [T][Y][Y] | 55 | uint32_t *psiidx; // [T][Y] | 56 | uint32_t *psioff; // [T] 57 | double *alpha; // [T][Y] forward scores 58 | double *beta; // [T][Y] backward scores 59 | double *scale; // [T] scaling factors of forward scores 60 | double *unorm; // [T] normalization factors for unigrams 61 | double *bnorm; // [T] normalization factors for bigrams 62 | uint32_t first; // first position where gradient is needed 63 | uint32_t last; // last position where gradient is needed 64 | }; 65 | 66 | grd_st_t *grd_stnew(mdl_t *mdl, double *g); 67 | void grd_stfree(grd_st_t *grd_st); 68 | void grd_stcheck(grd_st_t *grd_st, uint32_t len); 69 | 70 | void grd_fldopsi(grd_st_t *grd_st, const seq_t *seq); 71 | void grd_flfwdbwd(grd_st_t *grd_st, const seq_t *seq); 72 | void grd_flupgrad(grd_st_t *grd_st, const seq_t *seq); 73 | 74 | void grd_spdopsi(grd_st_t *grd_st, const seq_t *seq); 75 | void grd_spfwdbwd(grd_st_t *grd_st, const seq_t *seq); 76 | void grd_spupgrad(grd_st_t *grd_st, const seq_t *seq); 77 | 78 | void grd_logloss(grd_st_t *grd_st, const seq_t *seq); 79 | 80 | void grd_dospl(grd_st_t *grd_st, const seq_t *seq); 81 | 82 | /* grd_t: 83 | * Multi-threaded full dataset gradient computer. This is used to compute the 84 | * gradient by algorithm working on the full dataset at each iterations. It 85 | * efficiently compute it using the fact it is additive to use as many threads 86 | * as allowed. 
87 | */ 88 | typedef struct grd_s grd_t; 89 | struct grd_s { 90 | mdl_t *mdl; 91 | grd_st_t **grd_st; 92 | }; 93 | 94 | grd_t *grd_new(mdl_t *mdl, double *g); 95 | void grd_free(grd_t *grd); 96 | double grd_gradient(grd_t *grd); 97 | 98 | #endif 99 | 100 | -------------------------------------------------------------------------------- /Flyon/CCKS_CRF/wapiti_ccks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Test training and test of wapiti on TAC ADR data 4 | # For documentation on wapiti, see wapiti --help 5 | # and https://wapiti.limsi.fr/ 6 | 7 | BINDIR=$(dirname $0) 8 | 9 | #================ 10 | # option processing 11 | 12 | traininput_dir="BIO_ccks" 13 | testinput_dir="test_label_split" 14 | output_dir="eval/bio_ccks" 15 | pattern_file="../CCKS_CRF/pat/Tok.pat_ccks" 16 | #training_options=' -a bcd -t 5 -i 5 -1 0.01' 17 | training_options=' -a sgd-l1 -t 5 -i 10 -1 0.01 --eta0 0.15' 18 | #training_options=' -a l-bfgs -t 5 -i 15 --clip 0.01 ' 19 | debug=0 20 | verbose=0 21 | 22 | while getopts i:o:p:r:t:dvh OPTION 23 | do 24 | case ${OPTION} in 25 | i) traininput_dir=${OPTARG};; 26 | o) output_dir=${OPTARG};; 27 | p) pattern_file=${OPTARG};; 28 | t) training_options=${OPTARG};; 29 | d) debug=1;; 30 | v) verbose=1;; 31 | \?|h) echo "Usage: $0 [ -dvh ] [ -i traininput_dir ] [ -t training_options ] 32 | 33 | Train CRF on training part of the corpus. 34 | Apply CRF model on test part of the corpus. 35 | Evaluate the results. 36 | 37 | -i input_dir Name of input directory containing the corpus (must be in *.tab files starting with uppercase letter). 38 | -o output_dir Name of output directory, in which result files are created. 39 | -p pattern_file Name of pattern file that specifies features for the CRF (wapiti). Must end with '.pat'. 40 | -t training_options Training options for the CRF (wapiti). Protect with quotes because it contains spaces. 
41 | 42 | -h show this message 43 | " 1>&2 44 | exit 2;; 45 | esac 46 | done 47 | 48 | # pass the options 49 | shift $((${OPTIND} - 1)) 50 | #================ 51 | 52 | echo "================ Processing from $traininput_dir to $output_dir ================" 1>&2 53 | if [ ! -d $traininput_dir ]; then 54 | echo "Input directory '$traininput_dir' does not exist" 1>&2 55 | exit 1 56 | fi 57 | 58 | if [ ! -f $pattern_file ]; then 59 | echo "Pattern file '$pattern_file' does not exist" 1>&2 60 | exit 2 61 | fi 62 | 63 | patname=$(basename $pattern_file .pat) 64 | corpus_name=$(basename $traininput_dir) 65 | 66 | mkdir -p $output_dir # create if does not exist 67 | 68 | echo "traininput_dir=$traininput_dir 69 | output_dir=$output_dir 70 | pattern_file='$pattern_file' 71 | training_options='$training_options' 72 | " 1>&2 73 | 74 | 75 | 76 | echo "================ Training $corpus_name (this may take some time) ================" 1>&2 77 | # training: create a MODEL based on PATTERNS and TRAINING-CORPUS 78 | # wapiti train -p PATTERNS TRAINING-CORPUS MODEL 79 | echo "wapiti train $training_options -p $pattern_file <(cat ../CCKS_CRF/$1) ../CCKS_CRF/$output_dir/$patname-train-$corpus_name-$3.mod" 1>&2 80 | 81 | ../../Wapiti/wapiti train $training_options -p $pattern_file <(cat ../CCKS_CRF/$traininput_dir/*.txt) ../CCKS_CRF/$output_dir/$patname-train-$corpus_name.mod 82 | # wapiti train -a bcd -t 2 -i 5 -p t.pat train-bio.tab t-train-bio.mod 83 | # 84 | # Note: The default algorithm, l-bfgs, stops early and does not succeed in annotating any token (all O) 85 | # sgd-l1 works; bcd works 86 | 87 | # To examine the contents of the model, first dump it into a text file then use 'less FILE' to view its contents 88 | ../../Wapiti/wapiti dump ../CCKS_CRF/$output_dir/$patname-train-$corpus_name.mod ../CCKS_CRF/$output_dir/$patname-train-$corpus_name.txt 89 | 90 | echo "================ Inference $corpus_name ================" 1>&2 91 | # inference (labeling): apply the MODEL to label 
the TEST-CORPUS, put results in TEST-RESULTS 92 | # wapiti label -m MODEL TEST-CORPUS TEST-RESULTS 93 | # -c : check (= evaluate) 94 | # <(COMMAND ARGUMENTS ...) : runs COMMAND on ARGUMENTS ... and provides the results as if in a file 95 | echo "wapiti label -c -m ../CCKS_CRF/$output_dir/$patname-train-$corpus_name-$3.mod <(cat ../CCKS_CRF/$1) ../CCKS_CRF$output_dir/$patname-train-test-$corpus_name-$3.tab" 1>&2 96 | ../../Wapiti/wapiti label -c -m ../CCKS_CRF/$output_dir/$patname-train-$corpus_name.mod <(cat ../CCKS_CRF/$testinput_dir/*.txt) ../CCKS_CRF/$output_dir/$patname-train-test-$corpus_name.tab 97 | # wapiti label -c -m t-train-bio.mod test-bio.tab t-train-test-bio.tab 98 | #echo "================ Evaluation with conlleval.pl $corpus_name ================" 1>&2 99 | # evaluate the resulting entities 100 | # $'\t' is a way to obtain a tabulation in bash 101 | echo "$BINDIR/conlleval.pl -d $'\t' <../CCKS_CRF/$output_dir/$patname-train-test-$corpus_name-$3.tab | tee ../CCKS_CRF/$output_dir/$patname-train-test-$corpus_name-$3.eval" 1>&2 102 | perl conlleval.pl -d $'\t' <../CCKS_CRF/$output_dir/$patname-train-test-$corpus_name.tab | tee ../CCKS_CRF/$output_dir/$patname-train-test-$corpus_name.eval 103 | -------------------------------------------------------------------------------- /Flyon/CCKS_CRF/test_label_split/入院记录现病史-1.txtoriginal.txt: -------------------------------------------------------------------------------- 1 | , x Nd Nj Ns Nz 1 0 1 2 | 缘于 p Nd Nj Ns Nz 1 1 3 3 | 入院 n Nd Nj Ns Nz 1 3 5 4 | 前 f Nd Nj Ns Nz 1 5 6 5 | 5 m Nd Nj Ns Nz 1 6 7 6 | 年 m dr Nj Ns Nz 1 7 8 7 | 进食 v dr Nj Ns Nz 1 8 10 8 | 冰冷 z Nd Nj Ns Nz 1 10 12 9 | 食物 n dr Nj Ns Nz 1 12 14 10 | 后 f Nd Nj Ns Nz 1 14 15 11 | 出现 v Nd Nj Ns Nz 1 15 17 12 | 中 f Nd Nj Ns Nz 1 17 18 13 | 下腹部 n Nd Nj Ns Nz 1 18 21 14 | 闷痛 a Nd Nj Ns Nz 1 21 23 15 | 不适 a Nd Nj Ns zz 1 23 25 16 | , x Nd Nj Ns Nz 1 25 26 17 | 为 p Nd Nj ss Nz 1 26 27 18 | 持续性 n Nd Nj ss Nz 1 27 30 19 | 隐痛 a Nd Nj Ns zz 1 30 32 20 | , x 
Nd Nj Ns Nz 1 32 33 21 | 无 v Nd Nj Ns Nz 1 33 34 22 | 其它 r Nd Nj Ns Nz 1 34 36 23 | 部位 n Nd Nj ss Nz 1 36 38 24 | 放射 v Nd Nj Ns Nz 1 38 40 25 | , x Nd Nj Ns Nz 1 40 41 26 | 无 v Nd Nj Ns Nz 1 41 42 27 | 恶心 n Nd Nj Ns Nz 1 42 44 28 | 呕吐 v Nd Nj Ns Nz 1 44 46 29 | , x Nd Nj Ns Nz 1 46 47 30 | 无 v Nd Nj Ns Nz 1 47 48 31 | 反酸 n Nd Nj Ns Nz 1 48 50 32 | 嗳气 n Nd Nj Ns Nz 1 50 52 33 | , x Nd Nj Ns Nz 1 52 53 34 | 无 v Nd Nj Ns Nz 1 53 54 35 | 呕血 v Nd Nj Ns Nz 1 54 56 36 | 黑便 nr Nd Nj Ns Nz 1 56 58 37 | 等 u Nd Nj Ns Nz 1 58 59 38 | 不适 a Nd Nj Ns zz 1 59 61 39 | 。 x Nd Nj Ns Nz 1 61 62 40 | 41 | [ x Nd Nj Ns Nz 1 62 63 42 | 主诉 v Nd Nj Ns Nz 1 63 65 43 | 单一 b Nd Nj ss Nz 1 65 67 44 | 时间 n Nd Nj Ns Nz 1 67 69 45 | 14 m Nd Nj Ns Nz 1 69 71 46 | 天 n dr Nj Ns Nz 1 71 72 47 | 主诉 v Nd Nj Ns Nz 1 72 74 48 | 单一 b Nd Nj ss Nz 1 74 76 49 | 时间 n Nd Nj Ns Nz 1 76 78 50 | ] x Nd Nj Ns Nz 1 78 79 51 | 前进 v Nd Nj Ns Nz 1 79 81 52 | 食冰 n Nd Nj Ns Nz 1 81 83 53 | 西瓜 ns dr Nj Ns Nz 1 83 85 54 | 后 f Nd Nj Ns Nz 1 85 86 55 | 出现 v Nd Nj Ns Nz 1 86 88 56 | 中 f Nd Nj Ns Nz 1 88 89 57 | 下腹部 n Nd Nj Ns Nz 1 89 92 58 | 闷痛 a Nd Nj Ns Nz 1 92 94 59 | 不适 a Nd Nj Ns zz 1 94 96 60 | , x Nd Nj Ns Nz 1 96 97 61 | 为 p Nd Nj ss Nz 1 97 98 62 | 持续性 n Nd Nj ss Nz 1 98 101 63 | 隐痛 a Nd Nj Ns zz 1 101 103 64 | , x Nd Nj Ns Nz 1 103 104 65 | 程度 n Nd Nj Ns Nz 1 104 106 66 | 较 d dr Nj Ns Nz 1 106 107 67 | 前 f Nd Nj Ns Nz 1 107 108 68 | 加重 v Nd Nj Ns Nz 1 108 110 69 | , x Nd Nj Ns Nz 1 110 111 70 | 无 v Nd Nj Ns Nz 1 111 112 71 | 其它 r Nd Nj Ns Nz 1 112 114 72 | 部位 n Nd Nj ss Nz 1 114 116 73 | 放射 v Nd Nj Ns Nz 1 116 118 74 | , x Nd Nj Ns Nz 1 118 119 75 | 翻身 v Nd Nj Ns Nz 1 119 121 76 | 向 p Nd Nj ss Nz 1 121 122 77 | 右侧 f Nd Nj Ns Nz 1 122 124 78 | 时 n Nd Nj Ns Nz 1 124 125 79 | 疼痛 n Nd Nj Ns Nz 1 125 127 80 | 有所 n Nd Nj Ns Nz 1 127 129 81 | 缓解 v Nd Nj Ns Nz 1 129 131 82 | , x Nd Nj Ns Nz 1 131 132 83 | 无 v Nd Nj Ns Nz 1 132 133 84 | 恶心 n Nd Nj Ns Nz 1 133 135 85 | 、 x Nd Nj Ns Nz 1 135 136 86 | 呕吐 v Nd Nj Ns Nz 1 136 
138 87 | 、 x Nd Nj Ns Nz 1 138 139 88 | 返 v Nd jp Ns Nz 1 139 140 89 | 酸 n dr Nj Ns Nz 1 140 141 90 | 、 x Nd Nj Ns Nz 1 141 142 91 | 嗳气 n Nd Nj Ns Nz 1 142 144 92 | 、 x Nd Nj Ns Nz 1 144 145 93 | 纳差 n Nd Nj Ns Nz 1 145 147 94 | 、 x Nd Nj Ns Nz 1 147 148 95 | 乏力 a Nd Nj Ns Nz 1 148 150 96 | 、 x Nd Nj Ns Nz 1 150 151 97 | 消瘦 a Nd Nj Ns Nz 1 151 153 98 | 、 x Nd Nj Ns Nz 1 153 154 99 | 腹泻 v dr Nj Ns Nz 1 154 156 100 | 、 x Nd Nj Ns Nz 1 156 157 101 | 便秘 a Nd Nj Ns Nz 1 157 159 102 | , x Nd Nj Ns Nz 1 159 160 103 | 无 v Nd Nj Ns Nz 1 160 161 104 | 呕血 v Nd Nj Ns Nz 1 161 163 105 | 、 x Nd Nj Ns Nz 1 163 164 106 | 黑 a Nd Nj Ns Nz 1 164 165 107 | 便 d Nd Nj Ns Nz 1 165 166 108 | , x Nd Nj Ns Nz 1 166 167 109 | 无 v Nd Nj Ns Nz 1 167 168 110 | 发热 v Nd Nj Ns Nz 1 168 170 111 | 、 x Nd Nj Ns Nz 1 170 171 112 | 咳嗽 v Nd Nj Ns Nz 1 171 173 113 | 、 x Nd Nj Ns Nz 1 173 174 114 | 咳痰 v dr Nj Ns Nz 1 174 176 115 | , x Nd Nj Ns Nz 1 176 177 116 | 自行 r Nd Nj Ns Nz 1 177 179 117 | 服用 vn dr Nj Ns Nz 1 179 181 118 | 胃药 n dr Nj Ns Nz 1 181 183 119 | ( x Nd Nj Ns Nz 1 183 184 120 | 具体 a Nd Nj Ns Nz 1 184 186 121 | 不详 v Nd Nj Ns Nz 1 186 188 122 | ) x Nd Nj Ns Nz 1 188 189 123 | , x Nd Nj Ns Nz 1 189 190 124 | 症状 n dr Nj Ns Nz 1 190 192 125 | 有所 n Nd Nj Ns Nz 1 192 194 126 | 好转 v Nd Nj Ns Nz 1 194 196 127 | , x Nd Nj Ns Nz 1 196 197 128 | , x Nd Nj Ns Nz 1 197 198 129 | 此次 r Nd Nj Ns Nz 1 198 200 130 | 至 p Nd Nj Ns Nz 1 200 201 131 | 我院 n Nd Nj Ns Nz 1 201 203 132 | 门诊 n Nd Nj Ns Nz 1 203 205 133 | 行 zg Nd Nj ss Nz 1 205 206 134 | 胃镜 n Nd Nj Ns Nz 1 206 208 135 | 检查 vn Nd Nj Ns Nz 1 208 210 136 | 示 v Nd Nj Ns Nz 1 210 211 137 | : x Nd Nj Ns Nz 1 211 212 138 | “ x Nd Nj Ns Nz 1 212 213 139 | 1. 
m Nd Nj Ns Nz 1 213 215 140 | 胃体 n Nd jp Ns Nz 1 215 217 141 | 溃疡 n Nd Nj Ns Nz 1 217 219 142 | 型 k Nd Nj Ns Nz 1 219 220 143 | 癌 n Nd Nj ss Nz 1 220 221 144 | ( x Nd Nj Ns Nz 1 221 222 145 | 早 a Nd Nj Ns Nz 1 222 223 146 | 癌 n Nd Nj ss Nz 1 223 224 147 | 可能 v Nd Nj Ns Nz 1 224 226 148 | ) x Nd Nj Ns Nz 1 226 227 149 | 。 x Nd Nj Ns Nz 1 227 228 150 | 151 | 2. m Nd Nj Ns Nz 1 228 230 152 | 贲门 n Nd Nj Ns Nz 1 230 232 153 | 炎症 n Nd Nj ss Nz 1 232 234 154 | 。 x Nd Nj Ns Nz 1 234 235 155 | 156 | 3. m Nd Nj Ns Nz 1 235 237 157 | 慢性 b Nd Nj ss Nz 1 237 239 158 | 萎缩性 n Nd Nj ss Nz 1 239 242 159 | 胃炎 n dr Nj Ns Nz 1 242 244 160 | ” x Nd Nj Ns Nz 1 244 245 161 | , x Nd Nj Ns Nz 1 245 246 162 | , x Nd Nj Ns Nz 1 246 247 163 | 病理 n Nd Nj Ns Nz 1 247 249 164 | : x Nd Nj Ns Nz 1 249 250 165 | ( x Nd Nj Ns Nz 1 250 251 166 | 体小 nr Nd Nj Ns Nz 1 251 253 167 | ) x Nd Nj Ns Nz 1 253 254 168 | , x Nd Nj Ns Nz 1 254 255 169 | : x Nd Nj Ns Nz 1 255 256 170 | 管状 n Nd Nj Ns Nz 1 256 258 171 | 腺癌 n Nd Nj ss Nz 1 258 260 172 | II eng Nd Nj Ns Nz 1 260 262 173 | - x Nd Nj Ns Nz 1 262 263 174 | III eng Nd Nj Ns Nz 1 263 266 175 | 级 q dr Nj Ns Nz 1 266 267 176 | 。 x Nd Nj Ns Nz 1 267 268 177 | 178 | 遂 d Nd Nj Ns Nz 1 268 269 179 | 门诊 n Nd Nj Ns Nz 1 269 271 180 | 拟 v dr Nj Ns Nz 1 271 272 181 | “ x Nd Nj Ns Nz 1 272 273 182 | 胃体 n Nd jp Ns Nz 1 273 275 183 | 癌 n Nd Nj ss Nz 1 275 276 184 | ” x Nd Nj Ns Nz 1 276 277 185 | 收入 v Nd Nj Ns Nz 1 277 279 186 | 院 n Nd Nj Ns Nz 1 279 280 187 | 。 x Nd Nj Ns Nz 1 280 281 188 | 189 | 自 p Nd jp Ns Nz 1 281 282 190 | 发病 v Nd Nj Ns Nz 1 282 284 191 | 以来 f Nd Nj Ns Nz 1 284 286 192 | , x Nd Nj Ns Nz 1 286 287 193 | 精神 n Nd Nj Ns Nz 1 287 289 194 | 、 x Nd Nj Ns Nz 1 289 290 195 | 睡眠 v Nd Nj ss Nz 1 290 292 196 | 可 v Nd Nj Ns Nz 1 292 293 197 | , x Nd Nj Ns Nz 1 293 294 198 | 食欲 n Nd Nj Ns Nz 1 294 296 199 | 一般 a Nd Nj Ns Nz 1 296 298 200 | , x Nd Nj Ns Nz 1 298 299 201 | 大小便 nr Nd Nj Ns Nz 1 299 302 202 | 如常 t Nd Nj Ns zz 1 302 304 203 | , x Nd Nj Ns Nz 1 304 
305 204 | 体重 n Nd Nj Ns Nz 1 305 307 205 | 无 v Nd Nj Ns Nz 1 307 308 206 | 明显 a Nd Nj Ns zz 1 308 310 207 | 变化 vn Nd Nj Ns Nz 1 310 312 208 | 。 x Nd Nj Ns Nz 1 312 313 209 | 210 | 211 | x Nd Nj Ns Nz 1 313 315 212 | -------------------------------------------------------------------------------- /Wapiti/src/thread.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | #include 29 | 30 | #include "model.h" 31 | #include "tools.h" 32 | #include "thread.h" 33 | #include "wapiti.h" 34 | 35 | /****************************************************************************** 36 | * Multi-threading code 37 | * 38 | * This module handle the thread managment code using POSIX pthreads, on 39 | * non-POSIX systems you will have to rewrite this using your systems threads. 40 | * all code who depend on threads is located here so this process must not be 41 | * too difficult. 42 | * If you don't want to use multithreading on non-POSIX system, just enable 43 | * the definition of MTH_ANSI in wapiti.h. This will disable multithreading. 44 | * 45 | * The jobs system is a simple scheduling system, you have to provide the 46 | * number of jobs to be done and the size of each batch, a call to getjob will 47 | * return the index of the first available and the size of the batch, and mark 48 | * these jobs as done. This is usefull if your jobs are numbered but you can't 49 | * do a trivial split as each of them may require different amount of time to 50 | * be completed like gradient computation which depend on the length of the 51 | * sequences. 52 | * If you provide a count of 0, the job system is disabled. 
53 | ******************************************************************************/ 54 | #ifdef MTH_ANSI 55 | struct job_s { 56 | uint32_t size; 57 | }; 58 | 59 | bool mth_getjob(job_t *job, uint32_t *cnt, uint32_t *pos) { 60 | if (job->size == 0) 61 | return false; 62 | *cnt = job->size; 63 | *pos = 0; 64 | job->size = 0; 65 | return true; 66 | } 67 | 68 | void mth_spawn(func_t *f, uint32_t W, void *ud[W], uint32_t size, uint32_t batch) { 69 | unused(batch); 70 | if (size == 0) { 71 | f(NULL, 0, 1, ud[0]); 72 | } else { 73 | job_t job = {size}; 74 | f(&job, 0, 1, ud[0]); 75 | } 76 | } 77 | 78 | #else 79 | 80 | #include 81 | 82 | struct job_s { 83 | uint32_t size; 84 | uint32_t send; 85 | uint32_t batch; 86 | pthread_mutex_t lock; 87 | }; 88 | 89 | typedef struct mth_s mth_t; 90 | struct mth_s { 91 | job_t *job; 92 | uint32_t id; 93 | uint32_t cnt; 94 | func_t *f; 95 | void *ud; 96 | }; 97 | 98 | /* mth_getjob: 99 | * Get a new bunch of sequence to process. This function will return a new 100 | * batch of sequence to process starting at position and with size 101 | * and return true. If no more batch are available, return false. 102 | * This function use a lock to ensure thread safety as it will be called by 103 | * the multiple workers threads. 104 | */ 105 | bool mth_getjob(job_t *job, uint32_t *cnt, uint32_t *pos) { 106 | if (job == NULL) 107 | return false; 108 | if (job->send == job->size) 109 | return false; 110 | pthread_mutex_lock(&job->lock); 111 | *cnt = min(job->batch, job->size - job->send); 112 | *pos = job->send; 113 | job->send += *cnt; 114 | pthread_mutex_unlock(&job->lock); 115 | return true; 116 | } 117 | 118 | static void *mth_stub(void *ud) { 119 | mth_t *mth = (mth_t *)ud; 120 | mth->f(mth->job, mth->id, mth->cnt, mth->ud); 121 | return NULL; 122 | } 123 | 124 | /* mth_spawn: 125 | * This function spawn W threads for calling the 'f' function. 
The function 126 | * will get a unique identifier between 0 and W-1 and a user data from the 127 | * 'ud' array. 128 | */ 129 | void mth_spawn(func_t *f, uint32_t W, void *ud[W], uint32_t size, uint32_t batch) { 130 | // First prepare the jobs scheduler 131 | job_t job, *pjob = NULL; 132 | if (size != 0) { 133 | pjob = &job; 134 | job.size = size; 135 | job.send = 0; 136 | job.batch = batch; 137 | if (pthread_mutex_init(&job.lock, NULL) != 0) 138 | fatal("failed to create mutex"); 139 | } 140 | // We handle differently the case where user requested a single thread 141 | // for efficiency. 142 | if (W == 1) { 143 | f(&job, 0, 1, ud[0]); 144 | return; 145 | } 146 | // We prepare the parameters structures that will be send to the threads 147 | // with informations for calling the user function. 148 | mth_t p[W]; 149 | for (uint32_t w = 0; w < W; w++) { 150 | p[w].job = pjob; 151 | p[w].id = w; 152 | p[w].cnt = W; 153 | p[w].f = f; 154 | p[w].ud = ud[w]; 155 | } 156 | // We are now ready to spawn the threads and wait for them to finish 157 | // their jobs. So we just create all the thread and try to join them 158 | // waiting for there return. 159 | pthread_attr_t attr; 160 | pthread_attr_init(&attr); 161 | pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSTEM); 162 | pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); 163 | pthread_t th[W]; 164 | for (uint32_t w = 0; w < W; w++) 165 | if (pthread_create(&th[w], &attr, &mth_stub, &p[w]) != 0) 166 | fatal("failed to create thread"); 167 | for (uint32_t w = 0; w < W; w++) 168 | if (pthread_join(th[w], NULL) != 0) 169 | fatal("failed to join thread"); 170 | pthread_attr_destroy(&attr); 171 | } 172 | #endif 173 | 174 | -------------------------------------------------------------------------------- /Wapiti/src/tools.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 
6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "tools.h" 38 | 39 | /******************************************************************************* 40 | * Error handling and memory managment 41 | * 42 | * Wapiti use a very simple system for error handling: violently fail. Errors 43 | * can occurs in two cases, when user feed Wapiti with bad datas or when there 44 | * is a problem on the system side. In both cases, there is nothing we can do, 45 | * so the best thing is to exit with a meaning full error message. 
/* fatal:
 *   Main error function. Prints "error: ", the message formatted like the
 *   printf family and a newline on stderr, then terminates the program with
 *   EXIT_FAILURE. It never returns; releasing resources is left to the OS.
 */
void fatal(const char *msg, ...) {
	va_list args;
	fprintf(stderr, "error: ");
	va_start(args, msg);
	vfprintf(stderr, msg, args);
	va_end(args);
	fprintf(stderr, "\n");
	exit(EXIT_FAILURE);
}

/* pfatal:
 *   Same as fatal, but additionally prints the system error message matching
 *   the value of errno on entry. Use it right after a failing call that sets
 *   errno; do not call anything that may reset errno in between.
 */
void pfatal(const char *msg, ...) {
	// Fetch the error text first: the stdio calls below may modify errno.
	const char *err = strerror(errno);
	va_list args;
	fprintf(stderr, "error: ");
	va_start(args, msg);
	vfprintf(stderr, msg, args);
	va_end(args);
	fprintf(stderr, "\n\t<%s>\n", err);
	exit(EXIT_FAILURE);
}

/* warning:
 *   Less violent than fatal: prints "warning: ", the formatted message and a
 *   newline on stderr, then returns to the caller. Intended to tell the user
 *   that something strange happened and the result may not be the expected
 *   one.
 */
void warning(const char *msg, ...) {
	va_list args;
	fprintf(stderr, "warning: ");
	va_start(args, msg);
	vfprintf(stderr, msg, args);
	va_end(args);
	fprintf(stderr, "\n");
}

/* info:
 *   Function used for all progress reports. For now it is just a printf to
 *   stderr; a verbosity level or a redirection to a log file could be added
 *   here later. Unlike the previous functions, no newline is appended.
 */
void info(const char *msg, ...) {
	va_list args;
	va_start(args, msg);
	vfprintf(stderr, msg, args);
	va_end(args);
}

/*
 * Memory allocation is one of the possible points of failure and it is
 * painful to always remember to check the return value of malloc, so the
 * wrappers below check it and fail through fatal() in case of error.
 */

/* xmalloc:
 *   Wrapper around malloc which calls fatal() if memory cannot be allocated,
 *   so it never returns NULL. A zero-size request is served as a one byte
 *   allocation: malloc(0) is allowed to return NULL and that must not be
 *   reported as an out of memory condition.
 */
void *xmalloc(size_t size) {
	void *ptr = malloc(size != 0 ? size : 1);
	if (ptr == NULL)
		fatal("out of memory");
	return ptr;
}

/* xrealloc:
 *   Wrapper around realloc which calls fatal() on allocation failure and so
 *   never returns NULL. As for xmalloc, a zero size is replaced by one byte
 *   so that the "free and return NULL" behaviour some implementations have
 *   for realloc(ptr, 0) is never mistaken for a failure.
 */
void *xrealloc(void *ptr, size_t size) {
	void *res = realloc(ptr, size != 0 ? size : 1);
	if (res == NULL)
		fatal("out of memory");
	return res;
}

/* xstrdup:
 *   Safe version of strdup: returns a newly allocated copy of <str>
 *   (terminator included) and fails through xmalloc on allocation error.
 *   The caller owns the result and must free() it.
 */
char *xstrdup(const char *str) {
	const size_t len = strlen(str) + 1;
	char *res = xmalloc(sizeof(char) * len);
	memcpy(res, str, len);
	return res;
}

/******************************************************************************
 * Netstring for persistent storage
 *
 *   This follows the format proposed by D.J. Bernstein for safe and portable
 *   storage of strings in persistent files and networks. It is used for
 *   storing strings in saved models.
 *   We just add an additional end-of-line character to make the output files
 *   more readable.
 *
 ******************************************************************************/

/* ns_readstr:
 *   Read a string from the given file in netstring format ("<len>:<bytes>,"
 *   followed by one extra character, the newline added by ns_writestr). The
 *   string is returned as a newly allocated, 0-terminated block of memory
 *   owned by the caller. Any read or format error is fatal.
 */
char *ns_readstr(FILE *file) {
	uint32_t len;
	if (fscanf(file, "%" SCNu32 ":", &len) != 1)
		pfatal("cannot read from file");
	// Compute the size in size_t so that len + 1 cannot wrap around to 0.
	char *buf = xmalloc((size_t)len + 1);
	// fread returns 0 when asked for a zero-sized item, so the empty string
	// must skip the read instead of being reported as an I/O error.
	if (len != 0 && fread(buf, len, 1, file) != 1)
		pfatal("cannot read from file");
	if (fgetc(file) != ',')
		fatal("invalid format");
	buf[len] = '\0';
	// Skip the character following the comma (end-of-line in our files).
	fgetc(file);
	return buf;
}

/* ns_writestr:
 *   Write a string in the netstring format to the given file, followed by a
 *   newline. The length is stored on 32 bits, so longer strings are refused
 *   instead of being written with a truncated length. Write errors are
 *   fatal.
 */
void ns_writestr(FILE *file, const char *str) {
	const size_t len = strlen(str);
	if (len > UINT32_MAX)
		fatal("string too long for netstring storage");
	if (fprintf(file, "%" PRIu32 ":%s,\n", (uint32_t)len, str) < 0)
		pfatal("cannot write to file");
}
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include 36 | #include 37 | #include 38 | 39 | #include "wapiti.h" 40 | #include "decoder.h" 41 | #include "model.h" 42 | #include "options.h" 43 | #include "progress.h" 44 | #include "tools.h" 45 | 46 | /******************************************************************************* 47 | * User interaction during training 48 | * 49 | * Handle progress reporting during training and clean early stoping. Trainers 50 | * have to call uit_progress at the end of each iterations, this will display 51 | * various informations for the user. 52 | * Timing is also done here, an iteration is assumed to take all the time 53 | * between to call to the progress function and evualtion on the devel data 54 | * are included. 55 | * 56 | * This module setup a signal handler for SIGINT. If this signal is catched, 57 | * the uit_stop global variable to inform the trainer that it have to stop as 58 | * early as possible, discarding the recent computations if they cannot be 59 | * integrated very quickly. They must leave the model in a clean state. Any 60 | * further signal will terminate the program. 
So it's simple : 61 | * - 1 signal mean "I can wait a little so try to stop as soon as possible 62 | * but leave me a working model" 63 | * - 2 signal mean "Stop immediatly what you are doing, I can't wait and 64 | * don't care about getting a working model" 65 | ******************************************************************************/ 66 | 67 | /* uit_stop: 68 | * This value is set to true when the user request the trainer to stop. In 69 | * this case, the trainer have to stop as soon as possible in a clean state, 70 | * discarding the lasts computations if it cannot integrate them quickly. 71 | */ 72 | bool uit_stop = false; 73 | 74 | /* uit_signal: 75 | * Signal handler to catch interupt signal. When a signal is received, the 76 | * trainer is aksed to stop as soon as possible leaving the model in a clean 77 | * state. We don't reinstall the handler so if user send a second interupt 78 | * signal, the program will stop imediatly. (to cope with BSD system, we even 79 | * reinstall explicitly the default handler) 80 | */ 81 | static void uit_signal(int sig) { 82 | signal(sig, SIG_DFL); 83 | uit_stop = true; 84 | } 85 | 86 | /* uit_setup: 87 | * Install the signal handler for clean early stop from the user if possible 88 | * and start the timer. 89 | */ 90 | void uit_setup(mdl_t *mdl) { 91 | uit_stop = false; 92 | if (signal(SIGINT, uit_signal) == SIG_ERR) 93 | warning("failed to set signal handler, no clean early stop"); 94 | gettimeofday(&mdl->timer, NULL); 95 | if (mdl->opt->stopwin != 0) 96 | mdl->werr = xmalloc(sizeof(double) * mdl->opt->stopwin); 97 | mdl->wcnt = mdl->wpos = 0; 98 | } 99 | 100 | /* uit_cleanup: 101 | * Remove the signal handler restoring the defaul behavior in case of 102 | * interrupt. 
103 | */ 104 | void uit_cleanup(mdl_t *mdl) { 105 | unused(mdl); 106 | if (mdl->opt->stopwin != 0) { 107 | free(mdl->werr); 108 | mdl->werr = NULL; 109 | } 110 | signal(SIGINT, SIG_DFL); 111 | } 112 | 113 | /* uit_progress: 114 | * Display a progress repport to the user consisting of some informations 115 | * provided by the trainer: iteration count and objective function value, and 116 | * some informations computed here on the current model performances. 117 | * This function return true if the trainer have to keep training the model 118 | * and false if he must stop, so this is were we will implement the trainer 119 | * independant stoping criterion. 120 | */ 121 | bool uit_progress(mdl_t *mdl, uint32_t it, double obj) { 122 | // First we just compute the error rate on devel or train data 123 | double te, se; 124 | tag_eval(mdl, &te, &se); 125 | // Next, we compute the number of active features 126 | uint64_t act = 0; 127 | for (uint64_t f = 0; f < mdl->nftr; f++) 128 | if (mdl->theta[f] != 0.0) 129 | act++; 130 | // Compute timings. As some training algorithms are multi-threaded, we 131 | // cannot use ansi/c function and must rely on posix one to sum time 132 | // spent in main thread and in child ones. 133 | tms_t now; gettimeofday(&now, NULL); 134 | double tm = (now.tv_sec + (double)now.tv_usec * 1.0e-6) 135 | - (mdl->timer.tv_sec + (double)mdl->timer.tv_usec * 1.0e-6); 136 | mdl->total += tm; 137 | mdl->timer = now; 138 | // And display progress report 139 | info(" [%4"PRIu32"]", it); 140 | info(obj >= 0.0 ? " obj=%-10.2f" : " obj=NA", obj); 141 | info(" act=%-8"PRIu64, act); 142 | info(" err=%5.2f%%/%5.2f%%", te, se); 143 | info(" time=%.2fs/%.2fs", tm, mdl->total); 144 | info("\n"); 145 | // If requested, check the error rate stoping criterion. We check if the 146 | // error rate is stable enought over a few iterations. 
147 | bool res = true; 148 | if (mdl->opt->stopwin != 0) { 149 | mdl->werr[mdl->wpos] = te; 150 | mdl->wpos = (mdl->wpos + 1) % mdl->opt->stopwin; 151 | mdl->wcnt++; 152 | if (mdl->wcnt >= mdl->opt->stopwin) { 153 | double emin = 200.0, emax = -100.0; 154 | for (uint32_t i = 0; i < mdl->opt->stopwin; i++) { 155 | emin = min(emin, mdl->werr[i]); 156 | emax = max(emax, mdl->werr[i]); 157 | } 158 | if (emax - emin < mdl->opt->stopeps) 159 | res = false; 160 | } 161 | } 162 | // And return 163 | if (uit_stop) 164 | return false; 165 | return res; 166 | } 167 | 168 | 169 | -------------------------------------------------------------------------------- /Wapiti/src/sequence.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef sequence_h 29 | #define sequence_h 30 | 31 | #include 32 | #include 33 | 34 | #include "wapiti.h" 35 | 36 | /******************************************************************************* 37 | * Sequences and Dataset objects 38 | * 39 | * Sequences represent the input data feeded by the user in Wapiti either for 40 | * training or labelling. The internal form used here is very different from 41 | * the data read from files and the convertion process is done in three steps 42 | * illustrated here: 43 | * +------+ +-------+ +-------+ +-------+ 44 | * | FILE | --> | raw_t | --> | tok_t | --> | seq_t | 45 | * +------+ +-------+ +-------+ +-------+ 46 | * First the sequence is read as a set of lines from the input file, this 47 | * give a raw_t object. Next this set of lines is split in tokens and 48 | * eventually the last one is separated as it will become a label, this result 49 | * in a tok_t object. 50 | * The last step consist in applying all the patterns givens by the user to 51 | * extract from these tokens the observations made on the sequence in order to 52 | * build the seq_t object which can be used by the trainer and tagger. 53 | * 54 | * A dataset object is just a container for a list of sequences in internal 55 | * form used to store either training or development set. 
/*
 * All the conversion process is driven by the reader object and, as it is
 * responsible for creating the objects with a quite special allocation
 * scheme, only the data-structures are described here.
 */

/* raw_t:
 *   Data-structure representing a raw sequence as a set of lines read from
 *   the input file. This is the result of the first step of the interning
 *   process. We keep this form separate from the tokenized one as we want to
 *   be able to output the sequence as it was read in the labelling mode.
 *
 *   This represents a sequence of length <len> and for each position 't' you
 *   find the corresponding line at <lines>[t].
 *
 *   The <lines> array is allocated with the data structure itself (flexible
 *   array member), and the different lines are allocated separately.
 */
typedef struct raw_s raw_t;
struct raw_s {
	uint32_t  len;      //   T     Sequence length
	char     *lines[];  //  [T]    Raw lines directly from file
};

/* tok_t:
 *   Data-structure representing a tokenized sequence. This is the result of
 *   the second step of the interning process, after the raw sequence has been
 *   split in tokens and the eventual labels separated from the observations.
 *
 *   For each position 't' in the sequence of length <len>, you find at
 *   <lbl>[t] the eventual label provided in the input file, and at <toks>[t]
 *   a list of string tokens of length <cnts>[t].
 *
 *   Memory allocation here is a bit special as the first token at each
 *   position points to a memory block which holds a copy of the raw line.
 *   All other tokens and the label are pointers into this block. This
 *   reduces memory fragmentation.
 */
typedef struct tok_s tok_t;
struct tok_s {
	uint32_t   len;     //   T     Sequence length
	char     **lbl;     //  [T]    List of labels strings
	uint32_t  *cnts;    //  [T]    Length of tokens lists
	char     **toks[];  //  [T][]  Tokens lists
};

/* seq_t:
 *   Data-structure representing a sequence of length <len> in the internal
 *   form used by the trainers and the tagger. For each position 't' in the
 *   sequence (0 <= t < <len>) there are some observations made on the data
 *   and an eventual label if provided in the input file.
 *
 *   There are two kinds of features: unigram and bigram ones, built by
 *   combining one observation and one or two labels. At position 't', the
 *   unigram features are built using the list of observations <pos>[t].uobs,
 *   which contains <pos>[t].ucnt items, and the label <pos>[t].lbl. The
 *   bigram features are obtained in the same way using <bobs> and <bcnt>,
 *   and have to be combined also with the label at position t-1.
 *
 *   NOTE(review): the original comment says the label field "will be NULL"
 *   for unlabelled input, but <lbl> is a uint32_t per position here; how
 *   unlabelled sequences are marked is not visible in this block.
 *
 *   The <raw> field is private and used internally for efficient memory
 *   allocation: presumably it backs the <uobs>/<bobs> lists of every
 *   position so that they are allocated together -- confirm in the reader.
 */
typedef struct pos_s pos_t;
typedef struct seq_s seq_t;
struct seq_s {
	uint32_t  len;                  // T    Sequence length
	uint64_t *raw;                  //      Private storage, see above
	struct pos_s {
		uint32_t  lbl;              //      Label at this position
		uint32_t  ucnt,  bcnt;      //      Number of unigram/bigram observations
		uint64_t *uobs, *bobs;      //      Unigram/bigram observation lists
	} pos[];                        // [T]  Per-position data
};
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert raw clinical records and their annotations into BIO-tagged files.

Usage: raw2bio.py STEP [test]

STEP selects one stage of the pipeline:
  -1  segment the raw records with jieba and write one token per line
      (into ./Bio_nolabel/, or ./CCKS_CRF/test_label_split/ with "test");
  -2  segment the annotated entities and write them with B-/I- tags
      into ./Bio_label/ (training only);
  -3  turn each ./Bio_label/ file into a {start: "end&label"} pickle;
  -4  merge the token files with the pickled labels into
      ./CCKS_CRF/BIO_ccks/.
"""
import glob
import jieba.posseg as psg
# jieba.load_userdict("result.txt")
import os
import codecs
import pickle
import sys
import gensim  # NOTE(review): not referenced by the code below

files1 = glob.glob("./train_data600/*.txtoriginal.txt")
files2 = glob.glob("./train_data600/*[0-9].txt")
files3 = glob.glob("./Bio_nolabel/*")
files4 = glob.glob("./Bio_label/*")

# Tokens after which cat_rawfile emits an empty line (sentence boundary).
_SENTENCE_END = ("。", "?", "!")

agv = sys.argv[1]
flag = ''
try:
    agv2 = sys.argv[2]
except IndexError:
    # No second argument: training mode.
    flag = False
else:
    # Only complain when the second argument is not the expected one.
    if agv2 != "test":
        print("Warning: 第二个参数必须是'test'!")
    flag = agv2
    files1 = glob.glob("./test_data400/*")


def _load_terms(path):
    """Return the stripped lines of a dictionary file as a set."""
    # NOTE(review): opened with the platform default encoding, as before;
    # the dictionaries presumably are UTF-8 like the data files -- confirm.
    with open(path, 'r') as rf:
        return {line.strip() for line in rf}


def cat_rawfile(file, flag):
    """Segment a raw record with jieba and write one token per line.

    Each output line holds, tab separated: the token, its jieba POS flag,
    four dictionary features (drug, anatomy, surgery, symptom), the file
    number taken from the file name, and the start/end character offsets of
    the token. An empty line is written after each sentence terminator.

    A dictionary feature is set only when the token belongs to that
    dictionary and to none of the three others.

    file: path of the raw text file (UTF-8).
    flag: "test" to write into ./CCKS_CRF/test_label_split/, anything else
          to write into ./Bio_nolabel/.
    """
    drugs = _load_terms("./CCKS_CRF/dic/drugs.txt")
    jiepou = _load_terms("./CCKS_CRF/dic/jiepou.txt")
    shoushu = _load_terms("./CCKS_CRF/dic/shoushu.txt")
    zhenzhaung = _load_terms("./CCKS_CRF/dic/zhenzhaung.txt")
    # Terms appearing in exactly one dictionary.
    only_drugs = drugs - (jiepou | shoushu | zhenzhaung)
    only_jiepou = jiepou - (drugs | shoushu | zhenzhaung)
    only_shoushu = shoushu - (drugs | zhenzhaung | jiepou)
    only_zhenzhaung = zhenzhaung - (drugs | shoushu | jiepou)

    name = os.path.basename(file)
    name_code = name.split('.')[0].split('-')[-1]
    if flag == "test":
        new_path = os.path.join("./CCKS_CRF/test_label_split/", name)
    else:
        new_path = os.path.join("./Bio_nolabel/", name)

    with codecs.open(file, encoding='utf8') as rf:
        dom = rf.read()

    count = 0  # character offset of the next token
    with codecs.open(new_path, 'w', encoding='utf8') as wf:
        for ww in psg.cut(dom):
            w = ww.word
            if len(w) == 0:
                continue
            pos = ww.flag
            verbd = "dr" if w in only_drugs else "Nd"
            verbj = "jp" if w in only_jiepou else "Nj"
            verbs = "ss" if w in only_shoushu else "Ns"
            verbz = "zz" if w in only_zhenzhaung else "Nz"
            # The end offset is always derived from the current offset, so
            # a terminator at the very beginning of the text or several
            # terminators in a row are handled like any other token.
            end = count + len(w)
            wf.write(w + '\t' + pos + '\t' + verbd + '\t' + verbj + '\t'
                     + verbs + '\t' + verbz + '\t' + name_code + '\t'
                     + str(count) + '\t' + str(end) + '\n')
            if w in _SENTENCE_END:
                wf.write('\n')
            count = end


def cat_labelfile(file):
    """Segment the annotated entities and rewrite them in BIO format.

    Every input line is "<entity>\t<start>\t<end>\t<label>". The entity is
    segmented with jieba; the first token gets the tag "B-<label>", the
    following ones "I-<label>", each with its own start/end offsets. The
    result is written to ./Bio_label/ under the same file name.
    """
    name = os.path.basename(file)
    new_path = os.path.join("./Bio_label/", name)
    with codecs.open(file, encoding='utf8') as rf, \
            codecs.open(new_path, 'w', encoding='utf8') as wf:
        for line in rf:
            if not line.strip():
                continue  # nothing to convert on a blank line
            line_lis = line.strip().split('\t')
            word = line_lis[0]
            # Offsets come from a data file: parse them, never eval() them.
            start = int(line_lis[1])
            label = line_lis[3]
            for i, tok in enumerate(psg.cut(word)):
                w = tok.word
                tag = "B-" if i == 0 else "I-"
                end = start + len(w)
                wf.write(w + '\t' + str(start) + '\t' + str(end) + '\t'
                         + tag + label + '\n')
                start = end


def label2pickle(file4):
    """Encode labels and positions of a BIO label file as a pickled dict.

    The dictionary maps the start offset (string) of each labelled token to
    "<end>&<label>", which makes it easy to merge the labels back into the
    token files. It is written to ./label_pickle/<name>.pkl.
    """
    basename = os.path.basename(file4)
    di = {}
    with codecs.open(file4, 'r', encoding='utf-8') as rf:
        for line in rf:
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            di[fields[1]] = fields[2] + "&" + fields[-1]
    with open("./label_pickle/" + basename + ".pkl", 'wb') as wf:
        pickle.dump(di, wf)


def cat_file_label(file3, end_labeldi):
    """Merge a token file with its labels.

    file3:       token file produced by cat_rawfile.
    end_labeldi: {start: "end&label"} dictionary built by label2pickle.

    A token receives the label stored for its start offset when the end
    offsets agree as well, and "O" otherwise. Lines with fewer than nine
    fields after stripping (the sentence separators, but also tokens made of
    whitespace only, whose first column is removed by strip()) are written
    as empty lines. The result goes to ./CCKS_CRF/BIO_ccks/.
    """
    basename = os.path.basename(file3)
    filename = basename.split('.')[0].split('-')[1]
    with codecs.open(file3, 'r', encoding='utf-8') as rf, \
            codecs.open("./CCKS_CRF/BIO_ccks/" + basename, 'w',
                        encoding='utf-8') as wf:
        for line in rf:
            line_lis = line.strip().split('\t')
            if len(line_lis) < 9:
                wf.write("\n")
                continue
            word, flag, verbd, verbj, verbs, verbz = line_lis[:6]
            start = line_lis[7]
            end = line_lis[8]
            label = "O"
            posible_label = end_labeldi.get(start)
            if posible_label:
                trueend = posible_label.split("&")[0]
                if end == trueend:
                    label = posible_label.split("&")[-1]
            wf.write(word + '\t' + flag + '\t' + verbd + '\t' + verbj + '\t'
                     + verbs + '\t' + verbz + '\t' + filename + "\t" + start
                     + "\t" + end + "\t" + label + "\n")


# Step 1: segment the raw records.
if agv == "-1":
    for file in files1:
        # cat_labelfile(file)
        cat_rawfile(file, flag)
# Step 2 (training only): convert the annotations to BIO format.
elif agv == "-2":
    for file in files2:
        cat_labelfile(file)
# Step 3: build the label dictionaries.
elif agv == "-3":
    for file in files4:
        label2pickle(file)
# Step 4: merge tokens and labels.
elif agv == "-4":
    for file in files3:
        filename = os.path.basename(file)
        parts = filename.split('.')
        # "<name>.txtoriginal.txt" -> "<name>.txt", the annotation file name.
        fil_ = parts[0] + "." + parts[2]
        with open("./label_pickle/" + str(fil_) + ".pkl", 'rb') as ef:
            end_labeldi = pickle.load(ef, encoding='bytes')
        cat_file_label(file, end_labeldi)
Nz 1 68 70 40 | 。 x Nd Nj Ns Nz 1 70 71 41 | 42 | , x Nd Nj Ns Nz 1 71 72 43 | 术后 t Nd Nj ss Nz 1 72 74 44 | 病理 n Nd Nj Ns Nz 1 74 76 45 | 示 v Nd Nj Ns Nz 1 76 77 46 | : x Nd Nj Ns Nz 1 77 78 47 | 直肠 n Nd Nj Ns Nz 1 78 80 48 | 腺癌 n Nd Nj ss Nz 1 80 82 49 | ( x Nd Nj Ns Nz 1 82 83 50 | 中 f Nd Nj Ns Nz 1 83 84 51 | 低度 n Nd Nj Ns Nz 1 84 86 52 | 分化 vn Nd Nj Ns Nz 1 86 88 53 | ) x Nd Nj Ns Nz 1 88 89 54 | , x Nd Nj Ns Nz 1 89 90 55 | 浸润 v Nd Nj Ns Nz 1 90 92 56 | 溃疡 n Nd Nj Ns Nz 1 92 94 57 | 型 k Nd Nj Ns Nz 1 94 95 58 | , x Nd Nj Ns Nz 1 95 96 59 | 面积 n Nd Nj Ns Nz 1 96 98 60 | 3.5 m Nd Nj Ns Nz 1 98 101 61 | * x Nd Nj Ns Nz 1 101 102 62 | 2 m Nd Nj Ns Nz 1 102 103 63 | CM eng Nd Nj Ns Nz 1 103 105 64 | , x Nd Nj Ns Nz 1 105 106 65 | 侵达 v Nd Nj Ns Nz 1 106 108 66 | 外膜 n Nd jp Ns Nz 1 108 110 67 | 。 x Nd Nj Ns Nz 1 110 111 68 | 69 | 两端 m Nd Nj Ns Nz 1 111 113 70 | 切线 n Nd jp Ns Nz 1 113 115 71 | 另 r Nd Nj Ns Nz 1 115 116 72 | 送 v Nd Nj Ns Nz 1 116 117 73 | “ x Nd Nj Ns Nz 1 117 118 74 | 近 a Nd Nj Ns Nz 1 118 119 75 | 端 v Nd Nj Ns Nz 1 119 120 76 | ” x Nd Nj Ns Nz 1 120 121 77 | 、 x Nd Nj Ns Nz 1 121 122 78 | “ x Nd Nj Ns Nz 1 122 123 79 | 远端 n Nd Nj ss Nz 1 123 125 80 | ” x Nd Nj Ns Nz 1 125 126 81 | 及 c Nd Nj Ns Nz 1 126 127 82 | 环周 n Nd Nj Ns Nz 1 127 129 83 | 底部 f Nd jp Ns Nz 1 129 131 84 | 切除 v Nd Nj ss Nz 1 131 133 85 | 面 n Nd Nj Ns Nz 1 133 134 86 | 未查 v Nd Nj Ns Nz 1 134 136 87 | 见 v dr Nj Ns Nz 1 136 137 88 | 癌 n Nd Nj ss Nz 1 137 138 89 | 。 x Nd Nj Ns Nz 1 138 139 90 | 91 | 肠壁 n Nd Nj Ns Nz 1 139 141 92 | 一站 m Nd Nj Ns Nz 1 141 143 93 | ( x Nd Nj Ns Nz 1 143 144 94 | 10 m Nd Nj Ns Nz 1 144 146 95 | 个 m Nd Nj Ns Nz 1 146 147 96 | ) x Nd Nj Ns Nz 1 147 148 97 | 、 x Nd Nj Ns Nz 1 148 149 98 | 中间 f Nd jp Ns Nz 1 149 151 99 | 组 zg Nd Nj Ns Nz 1 151 152 100 | ( x Nd Nj Ns Nz 1 152 153 101 | 8 m Nd Nj Ns Nz 1 153 154 102 | 个 m Nd Nj Ns Nz 1 154 155 103 | ) x Nd Nj Ns Nz 1 155 156 104 | 淋巴结 n Nd Nj Ns Nz 1 156 159 105 | 未查 v Nd Nj Ns Nz 1 159 161 106 | 见 v dr Nj Ns Nz 
1 161 162 107 | 癌 n Nd Nj ss Nz 1 162 163 108 | 。 x Nd Nj Ns Nz 1 163 164 109 | 110 | , x Nd Nj Ns Nz 1 164 165 111 | 免疫组化 n Nd Nj Ns Nz 1 165 169 112 | 染色 n Nd Nj Ns Nz 1 169 171 113 | 示 v Nd Nj Ns Nz 1 171 172 114 | : x Nd Nj Ns Nz 1 172 173 115 | ERCC1 eng Nd Nj Ns Nz 1 173 178 116 | 弥漫 v Nd Nj Ns Nz 1 178 180 117 | ( x Nd Nj Ns Nz 1 180 181 118 | + x Nd Nj Ns Nz 1 181 182 119 | ) x Nd Nj Ns Nz 1 182 183 120 | 、 x Nd Nj Ns Nz 1 183 184 121 | TS eng Nd Nj Ns Nz 1 184 186 122 | 少部分 m Nd Nj Ns Nz 1 186 189 123 | 弱 a dr Nj Ns Nz 1 189 190 124 | ( x Nd Nj Ns Nz 1 190 191 125 | + x Nd Nj Ns Nz 1 191 192 126 | ) x Nd Nj Ns Nz 1 192 193 127 | 、 x Nd Nj Ns Nz 1 193 194 128 | SYN eng Nd Nj Ns Nz 1 194 197 129 | ( x Nd Nj Ns Nz 1 197 198 130 | - x Nd Nj Ns Nz 1 198 199 131 | ) x Nd Nj Ns Nz 1 199 200 132 | 、 x Nd Nj Ns Nz 1 200 201 133 | CGA eng Nd Nj Ns Nz 1 201 204 134 | ( x Nd Nj Ns Nz 1 204 205 135 | - x Nd Nj Ns Nz 1 205 206 136 | ) x Nd Nj Ns Nz 1 206 207 137 | 。 x Nd Nj Ns Nz 1 207 208 138 | 139 | 术后 t Nd Nj ss Nz 1 208 210 140 | 查无 v Nd Nj Ns Nz 1 210 212 141 | 化疗 n Nd Nj Ns Nz 1 212 214 142 | 禁忌 v dr Nj Ns Nz 1 214 216 143 | 后 f Nd Nj Ns Nz 1 216 217 144 | 给予 v Nd Nj Ns Nz 1 217 219 145 | 3 x Nd Nj Ns Nz 1 219 220 146 | 周期 t Nd Nj Ns Nz 1 220 222 147 | 化疗 n Nd Nj Ns Nz 1 222 224 148 | , x Nd Nj Ns Nz 1 224 225 149 | , x Nd Nj Ns Nz 1 225 226 150 | 方案 n Nd Nj Ns Nz 1 226 228 151 | 为 p Nd Nj ss Nz 1 228 229 152 | : x Nd Nj Ns Nz 1 229 230 153 | 奥沙利 ns dr Nj Ns Nz 1 230 233 154 | 铂 n dr Nj Ns Nz 1 233 234 155 | 150 m Nd Nj Ns Nz 1 234 237 156 | MG eng Nd Nj Ns Nz 1 237 239 157 | x Nd Nj Ns Nz 1 239 240 158 | D1 eng Nd Nj Ns Nz 1 240 242 159 | , x Nd Nj Ns Nz 1 242 243 160 | 亚 j dr Nj Ns Nz 1 243 244 161 | 叶酸 n dr Nj Ns Nz 1 244 246 162 | 钙 n dr Nj Ns Nz 1 246 247 163 | 0.3 m Nd Nj Ns Nz 1 247 250 164 | G eng Nd Nj Ns Nz 1 250 251 165 | + x Nd Nj Ns Nz 1 251 252 166 | 替加 v dr Nj Ns Nz 1 252 254 167 | 氟 n dr Nj Ns Nz 1 254 255 168 | 1.0 m Nd Nj Ns Nz 1 255 258 169 | G 
eng Nd Nj Ns Nz 1 258 259 170 | x Nd Nj Ns Nz 1 259 260 171 | D2 eng Nd Nj Ns Nz 1 260 262 172 | - x Nd Nj Ns Nz 1 262 263 173 | D6 eng Nd Nj Ns Nz 1 263 265 174 | , x Nd Nj Ns Nz 1 265 266 175 | 同时 c Nd Nj ss Nz 1 266 268 176 | 给与 v Nd Nj Ns Nz 1 268 270 177 | 升 zg Nd Nj Ns Nz 1 270 271 178 | 白细胞 n dr Nj Ns Nz 1 271 274 179 | 、 x Nd Nj Ns Nz 1 274 275 180 | 护肝 n dr Nj Ns Nz 1 275 277 181 | 、 x Nd Nj Ns Nz 1 277 278 182 | 止吐 v dr Nj Ns Nz 1 278 280 183 | 、 x Nd Nj Ns Nz 1 280 281 184 | 免疫增强 n Nd Nj Ns Nz 1 281 285 185 | 治疗 v Nd Nj Ns Nz 1 285 287 186 | , x Nd Nj Ns Nz 1 287 288 187 | 患者 n Nd Nj Ns Nz 1 288 290 188 | 副反应 n Nd Nj Ns Nz 1 290 293 189 | 轻 a Nd Nj Ns Nz 1 293 294 190 | 。 x Nd Nj Ns Nz 1 294 295 191 | 192 | 院 n Nd Nj Ns Nz 1 295 296 193 | 外 f Nd Nj Ns Nz 1 296 297 194 | 期间 f Nd Nj Ns Nz 1 297 299 195 | 患者 n Nd Nj Ns Nz 1 299 301 196 | 一般 a Nd Nj Ns Nz 1 301 303 197 | 情况 n Nd Nj Ns Nz 1 303 305 198 | 好 a dr Nj Ns Nz 1 305 306 199 | , x Nd Nj Ns Nz 1 306 307 200 | 无 v Nd Nj Ns Nz 1 307 308 201 | 恶心 n Nd Nj Ns Nz 1 308 310 202 | , x Nd Nj Ns Nz 1 310 311 203 | 无 v Nd Nj Ns Nz 1 311 312 204 | 腹痛 n Nd Nj Ns Nz 1 312 314 205 | 腹胀 v Nd Nj Ns Nz 1 314 316 206 | 胀 a Nd Nj Ns Nz 1 316 317 207 | 不适 a Nd Nj Ns zz 1 317 319 208 | , x Nd Nj Ns Nz 1 319 320 209 | 无现 v Nd Nj Ns Nz 1 320 322 210 | 患者 n Nd Nj Ns Nz 1 322 324 211 | 为 p Nd Nj ss Nz 1 324 325 212 | 行 n Nd Nj ss Nz 1 325 326 213 | 复查 vn Nd Nj Ns Nz 1 326 328 214 | 及 c Nd Nj Ns Nz 1 328 329 215 | 化疗 n Nd Nj Ns Nz 1 329 331 216 | 再次 d Nd Nj ss Nz 1 331 333 217 | 来院 n Nd Nj Ns Nz 1 333 335 218 | 就诊 v Nd Nj Ns Nz 1 335 337 219 | , x Nd Nj Ns Nz 1 337 338 220 | 门诊 n Nd Nj Ns Nz 1 338 340 221 | 以 p Nd Nj Ns Nz 1 340 341 222 | “ x Nd Nj Ns Nz 1 341 342 223 | 直肠癌 n Nd Nj Ns Nz 1 342 345 224 | 术后 t Nd Nj ss Nz 1 345 347 225 | ” x Nd Nj Ns Nz 1 347 348 226 | 收入 v Nd Nj Ns Nz 1 348 350 227 | 院 n Nd Nj Ns Nz 1 350 351 228 | 。 x Nd Nj Ns Nz 1 351 352 229 | 230 | x Nd Nj Ns Nz 1 352 353 231 | x Nd Nj Ns Nz 1 353 354 232 | 
x Nd Nj Ns Nz 1 354 355 233 | 近期 t Nd Nj ss Nz 1 355 357 234 | 患者 n Nd Nj Ns Nz 1 357 359 235 | 精神 n Nd Nj Ns Nz 1 359 361 236 | 可 v Nd Nj Ns Nz 1 361 362 237 | , x Nd Nj Ns Nz 1 362 363 238 | 饮食 n Nd Nj Ns Nz 1 363 365 239 | 可 v Nd Nj Ns Nz 1 365 366 240 | , x Nd Nj Ns Nz 1 366 367 241 | 大便 d Nd Nj Ns Nz 1 367 369 242 | 正常 d Nd Nj Ns zz 1 369 371 243 | , x Nd Nj Ns Nz 1 371 372 244 | 小便 nr Nd Nj Ns zz 1 372 374 245 | 正常 d Nd Nj Ns zz 1 374 376 246 | , x Nd Nj Ns Nz 1 376 377 247 | 近期 t Nd Nj ss Nz 1 377 379 248 | 体重 n Nd Nj Ns Nz 1 379 381 249 | 无 v Nd Nj Ns Nz 1 381 382 250 | 明显 a Nd Nj Ns zz 1 382 384 251 | 变化 vn Nd Nj Ns Nz 1 384 386 252 | 。 x Nd Nj Ns Nz 1 386 387 253 | 254 | 255 | x Nd Nj Ns Nz 1 387 389 256 | -------------------------------------------------------------------------------- /Flyon/CCKS_CRF/BIO_ccks/入院记录现病史-1.txtoriginal.txt: -------------------------------------------------------------------------------- 1 | , x Nd Nj Ns Nz 1 0 1 O 2 | 患者 n Nd Nj Ns Nz 1 1 3 O 3 | 3 m Nd Nj Ns Nz 1 3 4 O 4 | 月前 t Nd Nj Ns Nz 1 4 6 O 5 | 因 c dr Nj Ns Nz 1 6 7 O 6 | “ x Nd Nj Ns Nz 1 7 8 O 7 | 直肠癌 n Nd Nj Ns Nz 1 8 11 O 8 | ” x Nd Nj Ns Nz 1 11 12 O 9 | 于 p dr Nj Ns Nz 1 12 13 O 10 | 在 p Nd Nj ss Nz 1 13 14 O 11 | 我院 n Nd Nj Ns Nz 1 14 16 O 12 | 于 p dr Nj Ns Nz 1 16 17 O 13 | 全麻 n Nd Nj Ns Nz 1 17 19 O 14 | 下行 v Nd Nj Ns Nz 1 19 21 O 15 | 直肠癌 n Nd Nj Ns Nz 1 21 24 B-手术 16 | 根治术 n Nd Nj ss Nz 1 24 27 I-手术 17 | ( x Nd Nj Ns Nz 1 27 28 O 18 | DIXON eng Nd Nj Ns Nz 1 28 33 B-手术 19 | 术 v Nd Nj Ns Nz 1 33 34 I-手术 20 | ) x Nd Nj Ns Nz 1 34 35 O 21 | , x Nd Nj Ns Nz 1 35 36 O 22 | 手术过程 n Nd Nj Ns Nz 1 36 40 O 23 | 顺利 ad Nd Nj Ns Nz 1 40 42 O 24 | , x Nd Nj Ns Nz 1 42 43 O 25 | 术后 t Nd Nj ss Nz 1 43 45 O 26 | 给予 v Nd Nj Ns Nz 1 45 47 O 27 | 抗感染 l Nd Nj Ns Nz 1 47 50 O 28 | 及 c Nd Nj Ns Nz 1 50 51 O 29 | 营养 n Nd Nj Ns Nz 1 51 53 O 30 | 支持 v Nd Nj Ns Nz 1 53 55 O 31 | 治疗 v Nd Nj Ns Nz 1 55 57 O 32 | , x Nd Nj Ns Nz 1 57 58 O 33 | 患者 n Nd Nj Ns Nz 1 58 60 O 34 | 恢复 v Nd Nj 
Ns Nz 1 60 62 O 35 | 好 a dr Nj Ns Nz 1 62 63 O 36 | , x Nd Nj Ns Nz 1 63 64 O 37 | 切口 n Nd Nj ss Nz 1 64 66 O 38 | 愈合 v Nd Nj Ns Nz 1 66 68 O 39 | 良好 a Nd Nj Ns Nz 1 68 70 O 40 | 。 x Nd Nj Ns Nz 1 70 71 O 41 | 42 | , x Nd Nj Ns Nz 1 71 72 O 43 | 术后 t Nd Nj ss Nz 1 72 74 O 44 | 病理 n Nd Nj Ns Nz 1 74 76 O 45 | 示 v Nd Nj Ns Nz 1 76 77 O 46 | : x Nd Nj Ns Nz 1 77 78 O 47 | 直肠 n Nd Nj Ns Nz 1 78 80 B-解剖部位 48 | 腺癌 n Nd Nj ss Nz 1 80 82 O 49 | ( x Nd Nj Ns Nz 1 82 83 O 50 | 中 f Nd Nj Ns Nz 1 83 84 O 51 | 低度 n Nd Nj Ns Nz 1 84 86 O 52 | 分化 vn Nd Nj Ns Nz 1 86 88 O 53 | ) x Nd Nj Ns Nz 1 88 89 O 54 | , x Nd Nj Ns Nz 1 89 90 O 55 | 浸润 v Nd Nj Ns Nz 1 90 92 O 56 | 溃疡 n Nd Nj Ns Nz 1 92 94 O 57 | 型 k Nd Nj Ns Nz 1 94 95 O 58 | , x Nd Nj Ns Nz 1 95 96 O 59 | 面积 n Nd Nj Ns Nz 1 96 98 O 60 | 3.5 m Nd Nj Ns Nz 1 98 101 O 61 | * x Nd Nj Ns Nz 1 101 102 O 62 | 2 m Nd Nj Ns Nz 1 102 103 O 63 | CM eng Nd Nj Ns Nz 1 103 105 O 64 | , x Nd Nj Ns Nz 1 105 106 O 65 | 侵达 v Nd Nj Ns Nz 1 106 108 O 66 | 外膜 n Nd jp Ns Nz 1 108 110 O 67 | 。 x Nd Nj Ns Nz 1 110 111 O 68 | 69 | 两端 m Nd Nj Ns Nz 1 111 113 O 70 | 切线 n Nd jp Ns Nz 1 113 115 O 71 | 另 r Nd Nj Ns Nz 1 115 116 O 72 | 送 v Nd Nj Ns Nz 1 116 117 O 73 | “ x Nd Nj Ns Nz 1 117 118 O 74 | 近 a Nd Nj Ns Nz 1 118 119 O 75 | 端 v Nd Nj Ns Nz 1 119 120 O 76 | ” x Nd Nj Ns Nz 1 120 121 O 77 | 、 x Nd Nj Ns Nz 1 121 122 O 78 | “ x Nd Nj Ns Nz 1 122 123 O 79 | 远端 n Nd Nj ss Nz 1 123 125 O 80 | ” x Nd Nj Ns Nz 1 125 126 O 81 | 及 c Nd Nj Ns Nz 1 126 127 O 82 | 环周 n Nd Nj Ns Nz 1 127 129 O 83 | 底部 f Nd jp Ns Nz 1 129 131 O 84 | 切除 v Nd Nj ss Nz 1 131 133 O 85 | 面 n Nd Nj Ns Nz 1 133 134 O 86 | 未查 v Nd Nj Ns Nz 1 134 136 O 87 | 见 v dr Nj Ns Nz 1 136 137 O 88 | 癌 n Nd Nj ss Nz 1 137 138 O 89 | 。 x Nd Nj Ns Nz 1 138 139 O 90 | 91 | 肠壁 n Nd Nj Ns Nz 1 139 141 B-解剖部位 92 | 一站 m Nd Nj Ns Nz 1 141 143 O 93 | ( x Nd Nj Ns Nz 1 143 144 O 94 | 10 m Nd Nj Ns Nz 1 144 146 O 95 | 个 m Nd Nj Ns Nz 1 146 147 O 96 | ) x Nd Nj Ns Nz 1 147 148 O 97 | 、 x Nd Nj Ns Nz 1 148 149 
O 98 | 中间 f Nd jp Ns Nz 1 149 151 O 99 | 组 zg Nd Nj Ns Nz 1 151 152 O 100 | ( x Nd Nj Ns Nz 1 152 153 O 101 | 8 m Nd Nj Ns Nz 1 153 154 O 102 | 个 m Nd Nj Ns Nz 1 154 155 O 103 | ) x Nd Nj Ns Nz 1 155 156 O 104 | 淋巴结 n Nd Nj Ns Nz 1 156 159 B-解剖部位 105 | 未查 v Nd Nj Ns Nz 1 159 161 O 106 | 见 v dr Nj Ns Nz 1 161 162 O 107 | 癌 n Nd Nj ss Nz 1 162 163 O 108 | 。 x Nd Nj Ns Nz 1 163 164 O 109 | 110 | , x Nd Nj Ns Nz 1 164 165 O 111 | 免疫组化 n Nd Nj Ns Nz 1 165 169 O 112 | 染色 n Nd Nj Ns Nz 1 169 171 O 113 | 示 v Nd Nj Ns Nz 1 171 172 O 114 | : x Nd Nj Ns Nz 1 172 173 O 115 | ERCC1 eng Nd Nj Ns Nz 1 173 178 O 116 | 弥漫 v Nd Nj Ns Nz 1 178 180 O 117 | ( x Nd Nj Ns Nz 1 180 181 O 118 | + x Nd Nj Ns Nz 1 181 182 O 119 | ) x Nd Nj Ns Nz 1 182 183 O 120 | 、 x Nd Nj Ns Nz 1 183 184 O 121 | TS eng Nd Nj Ns Nz 1 184 186 O 122 | 少部分 m Nd Nj Ns Nz 1 186 189 O 123 | 弱 a dr Nj Ns Nz 1 189 190 O 124 | ( x Nd Nj Ns Nz 1 190 191 O 125 | + x Nd Nj Ns Nz 1 191 192 O 126 | ) x Nd Nj Ns Nz 1 192 193 O 127 | 、 x Nd Nj Ns Nz 1 193 194 O 128 | SYN eng Nd Nj Ns Nz 1 194 197 O 129 | ( x Nd Nj Ns Nz 1 197 198 O 130 | - x Nd Nj Ns Nz 1 198 199 O 131 | ) x Nd Nj Ns Nz 1 199 200 O 132 | 、 x Nd Nj Ns Nz 1 200 201 O 133 | CGA eng Nd Nj Ns Nz 1 201 204 O 134 | ( x Nd Nj Ns Nz 1 204 205 O 135 | - x Nd Nj Ns Nz 1 205 206 O 136 | ) x Nd Nj Ns Nz 1 206 207 O 137 | 。 x Nd Nj Ns Nz 1 207 208 O 138 | 139 | 术后 t Nd Nj ss Nz 1 208 210 O 140 | 查无 v Nd Nj Ns Nz 1 210 212 O 141 | 化疗 n Nd Nj Ns Nz 1 212 214 O 142 | 禁忌 v dr Nj Ns Nz 1 214 216 O 143 | 后 f Nd Nj Ns Nz 1 216 217 O 144 | 给予 v Nd Nj Ns Nz 1 217 219 O 145 | 3 x Nd Nj Ns Nz 1 219 220 O 146 | 周期 t Nd Nj Ns Nz 1 220 222 O 147 | 化疗 n Nd Nj Ns Nz 1 222 224 O 148 | , x Nd Nj Ns Nz 1 224 225 O 149 | , x Nd Nj Ns Nz 1 225 226 O 150 | 方案 n Nd Nj Ns Nz 1 226 228 O 151 | 为 p Nd Nj ss Nz 1 228 229 O 152 | : x Nd Nj Ns Nz 1 229 230 O 153 | 奥沙利 ns dr Nj Ns Nz 1 230 233 B-药物 154 | 铂 n dr Nj Ns Nz 1 233 234 I-药物 155 | 150 m Nd Nj Ns Nz 1 234 237 O 156 | MG eng Nd Nj Ns Nz 1 237 
239 O 157 | 158 | D1 eng Nd Nj Ns Nz 1 240 242 O 159 | , x Nd Nj Ns Nz 1 242 243 O 160 | 亚 j dr Nj Ns Nz 1 243 244 B-药物 161 | 叶酸 n dr Nj Ns Nz 1 244 246 I-药物 162 | 钙 n dr Nj Ns Nz 1 246 247 I-药物 163 | 0.3 m Nd Nj Ns Nz 1 247 250 O 164 | G eng Nd Nj Ns Nz 1 250 251 O 165 | + x Nd Nj Ns Nz 1 251 252 O 166 | 替加 v dr Nj Ns Nz 1 252 254 B-药物 167 | 氟 n dr Nj Ns Nz 1 254 255 I-药物 168 | 1.0 m Nd Nj Ns Nz 1 255 258 O 169 | G eng Nd Nj Ns Nz 1 258 259 O 170 | 171 | D2 eng Nd Nj Ns Nz 1 260 262 O 172 | - x Nd Nj Ns Nz 1 262 263 O 173 | D6 eng Nd Nj Ns Nz 1 263 265 O 174 | , x Nd Nj Ns Nz 1 265 266 O 175 | 同时 c Nd Nj ss Nz 1 266 268 O 176 | 给与 v Nd Nj Ns Nz 1 268 270 O 177 | 升 zg Nd Nj Ns Nz 1 270 271 O 178 | 白细胞 n dr Nj Ns Nz 1 271 274 O 179 | 、 x Nd Nj Ns Nz 1 274 275 O 180 | 护肝 n dr Nj Ns Nz 1 275 277 O 181 | 、 x Nd Nj Ns Nz 1 277 278 O 182 | 止吐 v dr Nj Ns Nz 1 278 280 O 183 | 、 x Nd Nj Ns Nz 1 280 281 O 184 | 免疫增强 n Nd Nj Ns Nz 1 281 285 O 185 | 治疗 v Nd Nj Ns Nz 1 285 287 O 186 | , x Nd Nj Ns Nz 1 287 288 O 187 | 患者 n Nd Nj Ns Nz 1 288 290 O 188 | 副反应 n Nd Nj Ns Nz 1 290 293 O 189 | 轻 a Nd Nj Ns Nz 1 293 294 O 190 | 。 x Nd Nj Ns Nz 1 294 295 O 191 | 192 | 院 n Nd Nj Ns Nz 1 295 296 O 193 | 外 f Nd Nj Ns Nz 1 296 297 O 194 | 期间 f Nd Nj Ns Nz 1 297 299 O 195 | 患者 n Nd Nj Ns Nz 1 299 301 O 196 | 一般 a Nd Nj Ns Nz 1 301 303 O 197 | 情况 n Nd Nj Ns Nz 1 303 305 O 198 | 好 a dr Nj Ns Nz 1 305 306 O 199 | , x Nd Nj Ns Nz 1 306 307 O 200 | 无 v Nd Nj Ns Nz 1 307 308 O 201 | 恶心 n Nd Nj Ns Nz 1 308 310 B-独立症状 202 | , x Nd Nj Ns Nz 1 310 311 O 203 | 无 v Nd Nj Ns Nz 1 311 312 O 204 | 腹痛 n Nd Nj Ns Nz 1 312 314 O 205 | 腹胀 v Nd Nj Ns Nz 1 314 316 O 206 | 胀 a Nd Nj Ns Nz 1 316 317 B-症状描述 207 | 不适 a Nd Nj Ns zz 1 317 319 B-症状描述 208 | , x Nd Nj Ns Nz 1 319 320 O 209 | 无现 v Nd Nj Ns Nz 1 320 322 O 210 | 患者 n Nd Nj Ns Nz 1 322 324 O 211 | 为 p Nd Nj ss Nz 1 324 325 O 212 | 行 n Nd Nj ss Nz 1 325 326 O 213 | 复查 vn Nd Nj Ns Nz 1 326 328 O 214 | 及 c Nd Nj Ns Nz 1 328 329 O 215 | 化疗 n Nd Nj Ns Nz 1 329 
331 O 216 | 再次 d Nd Nj ss Nz 1 331 333 O 217 | 来院 n Nd Nj Ns Nz 1 333 335 O 218 | 就诊 v Nd Nj Ns Nz 1 335 337 O 219 | , x Nd Nj Ns Nz 1 337 338 O 220 | 门诊 n Nd Nj Ns Nz 1 338 340 O 221 | 以 p Nd Nj Ns Nz 1 340 341 O 222 | “ x Nd Nj Ns Nz 1 341 342 O 223 | 直肠癌 n Nd Nj Ns Nz 1 342 345 O 224 | 术后 t Nd Nj ss Nz 1 345 347 O 225 | ” x Nd Nj Ns Nz 1 347 348 O 226 | 收入 v Nd Nj Ns Nz 1 348 350 O 227 | 院 n Nd Nj Ns Nz 1 350 351 O 228 | 。 x Nd Nj Ns Nz 1 351 352 O 229 | 230 | 231 | 232 | 233 | 近期 t Nd Nj ss Nz 1 355 357 O 234 | 患者 n Nd Nj Ns Nz 1 357 359 O 235 | 精神 n Nd Nj Ns Nz 1 359 361 O 236 | 可 v Nd Nj Ns Nz 1 361 362 O 237 | , x Nd Nj Ns Nz 1 362 363 O 238 | 饮食 n Nd Nj Ns Nz 1 363 365 O 239 | 可 v Nd Nj Ns Nz 1 365 366 O 240 | , x Nd Nj Ns Nz 1 366 367 O 241 | 大便 d Nd Nj Ns Nz 1 367 369 O 242 | 正常 d Nd Nj Ns zz 1 369 371 O 243 | , x Nd Nj Ns Nz 1 371 372 O 244 | 小便 nr Nd Nj Ns zz 1 372 374 O 245 | 正常 d Nd Nj Ns zz 1 374 376 O 246 | , x Nd Nj Ns Nz 1 376 377 O 247 | 近期 t Nd Nj ss Nz 1 377 379 O 248 | 体重 n Nd Nj Ns Nz 1 379 381 O 249 | 无 v Nd Nj Ns Nz 1 381 382 O 250 | 明显 a Nd Nj Ns zz 1 382 384 O 251 | 变化 vn Nd Nj Ns Nz 1 384 386 O 252 | 。 x Nd Nj Ns Nz 1 386 387 O 253 | 254 | 255 | 256 | -------------------------------------------------------------------------------- /Wapiti/src/rprop.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 
11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "wapiti.h" 38 | #include "gradient.h" 39 | #include "model.h" 40 | #include "options.h" 41 | #include "progress.h" 42 | #include "tools.h" 43 | #include "thread.h" 44 | #include "vmath.h" 45 | 46 | #define EPSILON (DBL_EPSILON * 64.0) 47 | 48 | #define sign(v) ((v) < -EPSILON ? -1.0 : ((v) > EPSILON ? 1.0 : 0.0)) 49 | #define sqr(v) ((v) * (v)) 50 | 51 | /****************************************************************************** 52 | * Resilient propagation optimizer 53 | * 54 | * This is an implementation of the RPROP algorithm (resilient propagation) 55 | * described by Riedmiller and Braun in [1] with an adaptation to be useable 56 | * with l1 regularization. 
57 | * The adaptation consist of using a pseudo-gradient similar to the one used 58 | * in OWL-QN to choose an orthant at iterations steps and projecting the step 59 | * in this orthant before the weight update. 60 | * 61 | * [1] A direct adaptive method for faster backpropagation learning: The RPROP 62 | * algorithm, Martin Riedmiller and Heinrich Braun, IEEE International 63 | * Conference on Neural Networks, San Francisco, USA, 586-591, March 1993. 64 | ******************************************************************************/ 65 | typedef struct rprop_s rprop_t; 66 | struct rprop_s { 67 | mdl_t *mdl; 68 | double *xp; 69 | double *stp; 70 | double *g; 71 | double *gp; 72 | }; 73 | 74 | /* trn_rpropsub: 75 | * Partial update of the weight vector including partial gradient in case of 76 | * l1 regularisation. The sub vector updated depend on the id and cnt 77 | * parameter given, the job scheduling system is not used here as we can 78 | * easily split processing in equals parts. 79 | */ 80 | static void trn_rpropsub(job_t *job, uint32_t id, uint32_t cnt, rprop_t *st) { 81 | unused(job); 82 | mdl_t *mdl = st->mdl; 83 | const uint64_t F = mdl->nftr; 84 | const double stpmin = mdl->opt->rprop.stpmin; 85 | const double stpmax = mdl->opt->rprop.stpmax; 86 | const double stpinc = mdl->opt->rprop.stpinc; 87 | const double stpdec = mdl->opt->rprop.stpdec; 88 | const bool wbt = strcmp(mdl->opt->algo, "rprop-"); 89 | const double rho1 = mdl->opt->rho1; 90 | const int l1 = (rho1 != 0.0) ? 
mdl->opt->rprop.cutoff + 1: 0; 91 | double *x = mdl->theta; 92 | double *xp = st->xp, *stp = st->stp; 93 | double *g = st->g, *gp = st->gp; 94 | const uint64_t from = F * id / cnt; 95 | const uint64_t to = F * (id + 1) / cnt; 96 | for (uint64_t f = from; f < to; f++) { 97 | double pg = g[f]; 98 | // If there is a l1 component in the regularization component, 99 | // we either project the gradient in the current orthant or 100 | // check for cutdown depending on the projection scheme wanted. 101 | if (l1 == 1) { 102 | if (x[f] < -EPSILON) pg -= rho1; 103 | else if (x[f] > EPSILON) pg += rho1; 104 | else if (g[f] < -rho1) pg += rho1; 105 | else if (g[f] > rho1) pg -= rho1; 106 | else pg = 0.0; 107 | } else if (l1 && sqr(g[f] + rho1 * sign(x[f])) < sqr(rho1)) { 108 | if (x[f] == 0.0 || ( gp[f] * g[f] < 0.0 109 | && xp[f] * x[f] < 0.0)) { 110 | if (wbt) 111 | xp[f] = x[f]; 112 | x[f] = 0.0; 113 | gp[f] = g[f]; 114 | continue; 115 | } 116 | } 117 | const double sgp = sign(gp[f]); 118 | const double spg = sign(pg); 119 | // Next we adjust the step depending of the new and 120 | // previous gradient values. 121 | if (sgp * spg > 0.0) 122 | stp[f] = min(stp[f] * stpinc, stpmax); 123 | else if (sgp * spg < 0.0) 124 | stp[f] = max(stp[f] * stpdec, stpmin); 125 | // Finally update the weight. if there is l1 penalty 126 | // and the pseudo gradient projection is used, we have to 127 | // project back the update in the choosen orthant. 
128 | if (!wbt || sgp * spg > 0.0) { 129 | double dlt = stp[f] * -sign(g[f]); 130 | if (l1 == 1 && dlt * spg >= 0.0) 131 | dlt = 0.0; 132 | if (wbt) 133 | xp[f] = x[f]; 134 | x[f] += dlt; 135 | } else if (sgp * spg < -0.0) { 136 | x[f] = xp[f]; 137 | g[f] = 0.0; 138 | } else { 139 | xp[f] = x[f]; 140 | if (l1 != 1) 141 | x[f] += stp[f] * -spg; 142 | } 143 | gp[f] = g[f]; 144 | } 145 | } 146 | 147 | void trn_rprop(mdl_t *mdl) { 148 | const uint64_t F = mdl->nftr; 149 | const uint32_t K = mdl->opt->maxiter; 150 | const uint32_t W = mdl->opt->nthread; 151 | const bool wbt = strcmp(mdl->opt->algo, "rprop-"); 152 | const int cut = mdl->opt->rprop.cutoff; 153 | // Allocate state memory and initialize it 154 | double *xp = NULL, *stp = xvm_new(F); 155 | double *g = xvm_new(F), *gp = xvm_new(F); 156 | if (wbt && !cut) 157 | xp = xvm_new(F); 158 | for (uint64_t f = 0; f < F; f++) { 159 | if (wbt && !cut) 160 | xp[f] = 0.0; 161 | gp[f] = 0.0; 162 | stp[f] = 0.1; 163 | } 164 | // Restore a saved state if given by the user 165 | if (mdl->opt->rstate != NULL) { 166 | const char *err = "invalid state file"; 167 | FILE *file = fopen(mdl->opt->rstate, "r"); 168 | if (file == NULL) 169 | fatal("failed to open input state file"); 170 | int type; 171 | uint64_t nftr; 172 | if (fscanf(file, "#state#%d#%"SCNu64"\n", &type, &nftr) != 2) 173 | fatal(err); 174 | if (type != 3) 175 | fatal("state is not for rprop model"); 176 | for (uint64_t i = 0; i < nftr; i++) { 177 | uint64_t f; 178 | double vxp, vstp, vgp; 179 | if (fscanf(file, "%"PRIu64" %la %la %la\n", &f, &vxp, 180 | &vstp, &vgp) != 4) 181 | fatal(err); 182 | if (wbt && !cut) xp[f] = vxp; 183 | gp[f] = vgp; 184 | stp[f] = vstp; 185 | } 186 | fclose(file); 187 | } 188 | // Prepare the rprop state used to send information to the rprop worker 189 | // about updating weight using the gradient. 
190 | rprop_t *st = xmalloc(sizeof(rprop_t)); 191 | st->mdl = mdl; 192 | st->xp = xp; st->stp = stp; 193 | st->g = g; st->gp = gp; 194 | rprop_t *rprop[W]; 195 | for (uint32_t w = 0; w < W; w++) 196 | rprop[w] = st; 197 | // Prepare the gradient state for the distributed gradient computation. 198 | grd_t *grd = grd_new(mdl, g); 199 | // And iterate the gradient computation / weight update process until 200 | // convergence or stop request 201 | for (uint32_t k = 0; !uit_stop && k < K; k++) { 202 | double fx = grd_gradient(grd); 203 | if (uit_stop) 204 | break; 205 | mth_spawn((func_t *)trn_rpropsub, W, (void **)rprop, 0, 0); 206 | if (uit_progress(mdl, k + 1, fx) == false) 207 | break; 208 | } 209 | // Save state if user requested it 210 | if (mdl->opt->sstate != NULL) { 211 | FILE *file = fopen(mdl->opt->sstate, "w"); 212 | if (file == NULL) 213 | fatal("failed to open output state file"); 214 | fprintf(file, "#state#3#%"PRIu64"\n", F); 215 | for (uint64_t f = 0; f < F; f++) { 216 | double vxp = xp != NULL ? xp[f] : 0.0; 217 | double vstp = stp[f], vgp = gp[f]; 218 | fprintf(file, "%"PRIu64" ", f); 219 | fprintf(file, "%la %la %la\n", vxp, vstp, vgp); 220 | } 221 | fclose(file); 222 | } 223 | // Free all allocated memory 224 | if (wbt && !cut) 225 | xvm_free(xp); 226 | xvm_free(g); 227 | xvm_free(gp); 228 | xvm_free(stp); 229 | grd_free(grd); 230 | free(st); 231 | } 232 | 233 | -------------------------------------------------------------------------------- /Wapiti/src/sgdl1.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 
11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include "wapiti.h" 35 | #include "gradient.h" 36 | #include "model.h" 37 | #include "options.h" 38 | #include "progress.h" 39 | #include "sequence.h" 40 | #include "tools.h" 41 | 42 | /****************************************************************************** 43 | * The SGD-L1 trainer 44 | * 45 | * Implementation of the stochatic gradient descend with L1 penalty described 46 | * in [1] by Tsurukoa et al. This allow to build really sparse models with the 47 | * SGD method. 
48 | * 49 | * [1] Stochastic gradient descent training for L1-regularized log-linear 50 | * models with cumulative penalty, Yoshimasa Tsuruoka and Jun'ichi Tsuji 51 | * and Sophia Ananiadou, in Proceedings of the ACL and the 4th IJCNLP of 52 | * the AFNLP, pages 477-485, August 2009 53 | ******************************************************************************/ 54 | typedef struct sgd_idx_s { 55 | uint64_t *uobs; 56 | uint64_t *bobs; 57 | } sgd_idx_t; 58 | 59 | /* applypenalty: 60 | * This macro is quite ugly as it make a lot of things and use local variables 61 | * of the function below. I'm sorry for this but this is allow to not 62 | * duplicate the code below. Due to the way unigrams and bigrams observation 63 | * are stored we must use this two times. As this macro is dangerous when 64 | * called outsize of sgd-l1 we undef it just after. 65 | * This function match exactly the APPLYPENALTY function defined in [1] pp 481 66 | * and the formula on the middle of the page 480. 67 | */ 68 | #define applypenalty(f) do { \ 69 | const double z = w[f]; \ 70 | if (z > 0.0) w[f] = max(0.0, z - (u + q[f])); \ 71 | else if (z < 0.0) w[f] = min(0.0, z + (u - q[f])); \ 72 | q[f] += w[f] - z; \ 73 | } while (false) 74 | 75 | /* sgd_add: 76 | * Add the value in the array of size . If the value is 77 | * already present, we do nothing, else we add it. 78 | */ 79 | static void sgd_add(uint64_t *obs, uint32_t *cnt, uint64_t new) { 80 | // First check if value is already in the array, we do a linear probing 81 | // as it is simpler and since these array will be very short in 82 | // practice, it's efficient enough. 83 | for (uint32_t p = 0; p < *cnt; p++) 84 | if (obs[p] == new) 85 | return; 86 | // Insert the new value at the end since we have not found it. 87 | obs[*cnt] = new; 88 | *cnt = *cnt + 1; 89 | } 90 | 91 | /* trn_sgdl1: 92 | * Train the model with the SGD-l1 algorithm described by tsurukoa et al. 
93 | */ 94 | void trn_sgdl1(mdl_t *mdl) { 95 | const uint64_t Y = mdl->nlbl; 96 | const uint64_t F = mdl->nftr; 97 | const uint32_t U = mdl->reader->nuni; 98 | const uint32_t B = mdl->reader->nbi; 99 | const uint32_t S = mdl->train->nseq; 100 | const uint32_t K = mdl->opt->maxiter; 101 | double *w = mdl->theta; 102 | // First we have to build and index who hold, for each sequences, the 103 | // list of actives observations. 104 | // The index is a simple table indexed by sequences number. Each entry 105 | // point to two lists of observations terminated by , one for 106 | // unigrams obss and one for bigrams obss. 107 | info(" - Build the index\n"); 108 | sgd_idx_t *idx = xmalloc(sizeof(sgd_idx_t) * S); 109 | for (uint32_t s = 0; s < S; s++) { 110 | const seq_t *seq = mdl->train->seq[s]; 111 | const uint32_t T = seq->len; 112 | uint64_t uobs[U * T + 1]; 113 | uint64_t bobs[B * T + 1]; 114 | uint32_t ucnt = 0, bcnt = 0; 115 | for (uint32_t t = 0; t < seq->len; t++) { 116 | const pos_t *pos = &seq->pos[t]; 117 | for (uint32_t p = 0; p < pos->ucnt; p++) 118 | sgd_add(uobs, &ucnt, pos->uobs[p]); 119 | for (uint32_t p = 0; p < pos->bcnt; p++) 120 | sgd_add(bobs, &bcnt, pos->bobs[p]); 121 | } 122 | uobs[ucnt++] = none; 123 | bobs[bcnt++] = none; 124 | idx[s].uobs = xmalloc(sizeof(uint64_t) * ucnt); 125 | idx[s].bobs = xmalloc(sizeof(uint64_t) * bcnt); 126 | memcpy(idx[s].uobs, uobs, ucnt * sizeof(uint64_t)); 127 | memcpy(idx[s].bobs, bobs, bcnt * sizeof(uint64_t)); 128 | } 129 | info(" Done\n"); 130 | // We will process sequences in random order in each iteration, so we 131 | // will have to permute them. The current permutation is stored in a 132 | // vector called shuffled at the start of each iteration. We 133 | // just initialize it with the identity permutation. 134 | // As we use the same gradient function than the other trainers, we need 135 | // an array to store it. 
These functions accumulate the gradient so we 136 | // need to clear it at start and before each new computation. As we now 137 | // which features are active and so which gradient cell are updated, we 138 | // can clear them selectively instead of fully clear the gradient each 139 | // time. 140 | // We also need an aditional vector named who hold the penalty 141 | // already applied to each features. 142 | uint32_t *perm = xmalloc(sizeof(uint32_t) * S); 143 | for (uint32_t s = 0; s < S; s++) 144 | perm[s] = s; 145 | double *g = xmalloc(sizeof(double) * F); 146 | double *q = xmalloc(sizeof(double) * F); 147 | for (uint64_t f = 0; f < F; f++) 148 | g[f] = q[f] = 0.0; 149 | // We can now start training the model, we perform the requested number 150 | // of iteration, each of these going through all the sequences. For 151 | // computing the decay, we will need to keep track of the number of 152 | // already processed sequences, this is tracked by the variable. 153 | double u = 0.0; 154 | grd_st_t *grd_st = grd_stnew(mdl, g); 155 | for (uint32_t k = 0, i = 0; k < K && !uit_stop; k++) { 156 | // First we shuffle the sequence by making a lot of random swap 157 | // of entry in the permutation index. 158 | for (uint32_t s = 0; s < S; s++) { 159 | const uint32_t a = rand() % S; 160 | const uint32_t b = rand() % S; 161 | const uint32_t t = perm[a]; 162 | perm[a] = perm[b]; 163 | perm[b] = t; 164 | } 165 | // And so, we can process sequence in a random order 166 | for (uint32_t sp = 0; sp < S && !uit_stop; sp++, i++) { 167 | const uint32_t s = perm[sp]; 168 | const seq_t *seq = mdl->train->seq[s]; 169 | grd_dospl(grd_st, seq); 170 | // Before applying the gradient, we have to compute the 171 | // learning rate to apply to this sequence. For this we 172 | // use an exponential decay [1, pp 481(5)] 173 | // η_i = η_0 * α^{i/S} 174 | // And at the same time, we update the total penalty 175 | // that must have been applied to each features. 
176 | // u <- u + η * rho1 / S 177 | const double n0 = mdl->opt->sgdl1.eta0; 178 | const double alpha = mdl->opt->sgdl1.alpha; 179 | const double nk = n0 * pow(alpha, (double)i / S); 180 | u = u + nk * mdl->opt->rho1 / S; 181 | // Now we apply the update to all unigrams and bigrams 182 | // observations actives in the current sequence. We must 183 | // not forget to clear the gradient for the next 184 | // sequence. 185 | for (uint32_t n = 0; idx[s].uobs[n] != none; n++) { 186 | uint64_t f = mdl->uoff[idx[s].uobs[n]]; 187 | for (uint32_t y = 0; y < Y; y++, f++) { 188 | w[f] -= nk * g[f]; 189 | applypenalty(f); 190 | g[f] = 0.0; 191 | } 192 | } 193 | for (uint32_t n = 0; idx[s].bobs[n] != none; n++) { 194 | uint64_t f = mdl->boff[idx[s].bobs[n]]; 195 | for (uint32_t d = 0; d < Y * Y; d++, f++) { 196 | w[f] -= nk * g[f]; 197 | applypenalty(f); 198 | g[f] = 0.0; 199 | } 200 | } 201 | } 202 | if (uit_stop) 203 | break; 204 | // Repport progress back to the user 205 | if (!uit_progress(mdl, k + 1, -1.0)) 206 | break; 207 | } 208 | grd_stfree(grd_st); 209 | // Cleanup allocated memory before returning 210 | for (uint32_t s = 0; s < S; s++) { 211 | free(idx[s].uobs); 212 | free(idx[s].bobs); 213 | } 214 | free(idx); 215 | free(perm); 216 | free(g); 217 | free(q); 218 | } 219 | #undef applypenalty 220 | 221 | -------------------------------------------------------------------------------- /Wapiti/src/quark.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 
11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "quark.h" 36 | #include "tools.h" 37 | 38 | /****************************************************************************** 39 | * Map object 40 | * 41 | * Implement quark object for mapping strings to identifiers through crit-bit 42 | * tree (also known as PATRICIA tries). In fact it only store a compressed 43 | * version of the trie to reduce memory footprint. The special trick of using 44 | * the last bit of the reference to differenciate between nodes and leafs come 45 | * from Daniel J. Bernstein implementation of crit-bit tree that can be found 46 | * on his web site. 47 | * [1] Morrison, Donald R. ; PATRICIA-Practical Algorithm To Retrieve 48 | * Information Coded in Alphanumeric, Journal of the ACM 15 (4): pp. 514--534, 49 | * 1968. 
DOI:10.1145/321479.321481 50 | * 51 | * This code is copyright 2002-2013 Thomas Lavergne and licenced under the BSD 52 | * Licence like the remaining of Wapiti. 53 | ******************************************************************************/ 54 | 55 | typedef struct node_s node_t; 56 | typedef struct leaf_s leaf_t; 57 | struct qrk_s { 58 | struct node_s { 59 | node_t *child[2]; 60 | uint32_t pos; 61 | uint8_t byte; 62 | } *root; 63 | struct leaf_s { 64 | uint64_t id; 65 | char key[]; 66 | } **leafs; 67 | bool lock; 68 | uint64_t count; 69 | uint64_t size; 70 | }; 71 | 72 | #define qrk_lf2nd(lf) ((node_t *)((intptr_t)(lf) | 1)) 73 | #define qrk_nd2lf(nd) ((leaf_t *)((intptr_t)(nd) & ~1)) 74 | #define qrk_isleaf(nd) ((intptr_t)(nd) & 1) 75 | 76 | /* qrk_new: 77 | * This initialize the object for holding a new empty trie, with some pre- 78 | * allocations. The returned object must be freed with a call to qrk_free when 79 | * not needed anymore. 80 | */ 81 | qrk_t *qrk_new(void) { 82 | const uint64_t size = 128; 83 | qrk_t *qrk = xmalloc(sizeof(qrk_t)); 84 | qrk->root = NULL; 85 | qrk->count = 0; 86 | qrk->lock = false; 87 | qrk->size = size; 88 | qrk->leafs = xmalloc(sizeof(leaf_t *) * size); 89 | return qrk; 90 | } 91 | 92 | /* qrk_free: 93 | * Release all the memory used by a qrk_t object allocated with qrk_new. This 94 | * will release all key string stored internally so all key returned by 95 | * qrk_unmap become invalid and must not be used anymore. 
96 | */ 97 | void qrk_free(qrk_t *qrk) { 98 | const uint32_t stkmax = 1024; 99 | if (qrk->count != 0) { 100 | node_t *stk[stkmax]; 101 | uint32_t cnt = 0; 102 | stk[cnt++] = qrk->root; 103 | while (cnt != 0) { 104 | node_t *nd = stk[--cnt]; 105 | if (qrk_isleaf(nd)) { 106 | free(qrk_nd2lf(nd)); 107 | continue; 108 | } 109 | stk[cnt++] = nd->child[0]; 110 | stk[cnt++] = nd->child[1]; 111 | free(nd); 112 | } 113 | } 114 | free(qrk->leafs); 115 | free(qrk); 116 | } 117 | 118 | /* qrk_insert: 119 | * Map a key to a uniq identifier. If the key already exist in the map, return 120 | * its identifier, else allocate a new identifier and insert the new (key,id) 121 | * pair inside the quark. This function is not thread safe and should not be 122 | * called on the same map from different thread without locking. 123 | */ 124 | uint64_t qrk_str2id(qrk_t *qrk, const char *key) { 125 | const uint8_t *raw = (void *)key; 126 | const size_t len = strlen(key); 127 | // We first take care of the empty trie case so later we can safely 128 | // assume that the trie is well formed and so there is no NULL pointers 129 | // in it. 130 | if (qrk->count == 0) { 131 | if (qrk->lock == true) 132 | return none; 133 | const size_t size = sizeof(char) * (len + 1); 134 | leaf_t *lf = xmalloc(sizeof(leaf_t) + size); 135 | memcpy(lf->key, key, size); 136 | lf->id = 0; 137 | qrk->root = qrk_lf2nd(lf); 138 | qrk->leafs[0] = lf; 139 | qrk->count = 1; 140 | return 0; 141 | } 142 | // If the trie is not empty, we first go down the trie to the leaf like 143 | // if we are searching for the key. When at leaf there is two case, 144 | // either we have found our key or we have found another key with all 145 | // its critical bit identical to our one. So we search for the first 146 | // differing bit between them to know where we have to add the new node. 147 | const node_t *nd = qrk->root; 148 | while (!qrk_isleaf(nd)) { 149 | const uint8_t chr = nd->pos < len ? 
raw[nd->pos] : 0; 150 | const int side = ((chr | nd->byte) + 1) >> 8; 151 | nd = nd->child[side]; 152 | } 153 | const char *bst = qrk_nd2lf(nd)->key; 154 | size_t pos; 155 | for (pos = 0; pos < len; pos++) 156 | if (key[pos] != bst[pos]) 157 | break; 158 | uint8_t byte; 159 | if (pos != len) 160 | byte = key[pos] ^ bst[pos]; 161 | else if (bst[pos] != '\0') 162 | byte = bst[pos]; 163 | else 164 | return qrk_nd2lf(nd)->id; 165 | if (qrk->lock == true) 166 | return none; 167 | // Now we known the two key are different and we know in which byte. It 168 | // remain to build the mask for the new critical bit and build the new 169 | // internal node and leaf. 170 | while (byte & (byte - 1)) 171 | byte &= byte - 1; 172 | byte ^= 255; 173 | const uint8_t chr = bst[pos]; 174 | const int side = ((chr | byte) + 1) >> 8; 175 | const size_t size = sizeof(char) * (len + 1); 176 | node_t *nx = xmalloc(sizeof(node_t)); 177 | leaf_t *lf = xmalloc(sizeof(leaf_t) + size); 178 | memcpy(lf->key, key, size); 179 | lf->id = qrk->count++; 180 | nx->pos = pos; 181 | nx->byte = byte; 182 | nx->child[1 - side] = qrk_lf2nd(lf); 183 | if (lf->id == qrk->size) { 184 | qrk->size *= 1.4; 185 | const size_t size = sizeof(leaf_t *) * qrk->size; 186 | qrk->leafs = xrealloc(qrk->leafs, size); 187 | } 188 | qrk->leafs[lf->id] = lf; 189 | // And last thing to do: inserting the new node in the trie. We have to 190 | // walk down the trie again as we have to keep the ordering of nodes. So 191 | // we search for the good position to insert it. 192 | node_t **trg = &qrk->root; 193 | while (true) { 194 | node_t *nd = *trg; 195 | if (qrk_isleaf(nd) || nd->pos > pos) 196 | break; 197 | if (nd->pos == pos && nd->byte > byte) 198 | break; 199 | const uint8_t chr = nd->pos < len ? 
raw[nd->pos] : 0; 200 | const int side = ((chr | nd->byte) + 1) >> 8; 201 | trg = &nd->child[side]; 202 | } 203 | nx->child[side] = *trg; 204 | *trg = nx; 205 | return lf->id; 206 | } 207 | 208 | /* qrk_id2str: 209 | * Retrieve the key associated to an identifier. The key is returned as a 210 | * constant string that should not be modified or freed by the caller, it is 211 | * a pointer to the internal copy of the key kept by the map object and 212 | * remain valid only for the life time of the quark, a call to qrk_free will 213 | * make this pointer invalid. 214 | */ 215 | const char *qrk_id2str(const qrk_t *qrk, uint64_t id) { 216 | if (id >= qrk->count) 217 | fatal("invalid identifier"); 218 | return qrk->leafs[id]->key; 219 | } 220 | 221 | /* qrk_save: 222 | * Save list of keys present in the map object in the id order to the given 223 | * file. We put one key per line so, if no key contains a new line, the line 224 | * number correspond to the id. 225 | */ 226 | void qrk_save(const qrk_t *qrk, FILE *file) { 227 | if (fprintf(file, "#qrk#%"PRIu64"\n", qrk->count) < 0) 228 | pfatal("cannot write to file"); 229 | if (qrk->count == 0) 230 | return; 231 | for (uint64_t n = 0; n < qrk->count; n++) 232 | ns_writestr(file, qrk->leafs[n]->key); 233 | } 234 | 235 | /* qrk_load: 236 | * Load a list of key from the given file and add them to the map. Each lines 237 | * of the file is taken as a single key and mapped to the next available id if 238 | * not already present. If all keys are single lines and the given map is 239 | * initilay empty, this will load a map exactly as saved by qrk_save. 
240 | */ 241 | void qrk_load(qrk_t *qrk, FILE *file) { 242 | uint64_t cnt = 0; 243 | if (fscanf(file, "#qrk#%"SCNu64"\n", &cnt) != 1) { 244 | if (ferror(file) != 0) 245 | pfatal("cannot read from file"); 246 | pfatal("invalid format"); 247 | } 248 | for (uint64_t n = 0; n < cnt; ++n) { 249 | char *str = ns_readstr(file); 250 | qrk_str2id(qrk, str); 251 | free(str); 252 | } 253 | } 254 | 255 | /* qrk_count: 256 | * Return the number of mappings stored in the quark. 257 | */ 258 | uint64_t qrk_count(const qrk_t *qrk) { 259 | return qrk->count; 260 | } 261 | 262 | /* qrk_lock: 263 | * Set the lock value of the quark and return the old one. 264 | */ 265 | bool qrk_lock(qrk_t *qrk, bool lock) { 266 | bool old = qrk->lock; 267 | qrk->lock = lock; 268 | return old; 269 | } 270 | 271 | -------------------------------------------------------------------------------- /Flyon/Bio_nolabel/入院记录现病史-2.txtoriginal.txt: -------------------------------------------------------------------------------- 1 | , x Nd Nj Ns Nz 2 0 1 2 | 患者 n Nd Nj Ns Nz 2 1 3 3 | 因 p dr Nj Ns Nz 2 3 4 4 | 罹患 v Nd Nj Ns Nz 2 4 6 5 | “ x Nd Nj Ns Nz 2 6 7 6 | 胃癌 n Nd Nj Ns Nz 2 7 9 7 | ” x Nd Nj Ns Nz 2 9 10 8 | 于 p dr Nj Ns Nz 2 10 11 9 | 2013 m Nd Nj Ns Nz 2 11 15 10 | - x Nd Nj Ns Nz 2 15 16 11 | 10 m Nd Nj Ns Nz 2 16 18 12 | - x Nd Nj Ns Nz 2 18 19 13 | 29 m Nd Nj Ns Nz 2 19 21 14 | 在 p Nd Nj ss Nz 2 21 22 15 | 我院 n Nd Nj Ns Nz 2 22 24 16 | 予行 v Nd Nj Ns Nz 2 24 26 17 | 全麻 n Nd Nj Ns Nz 2 26 28 18 | 下 f Nd Nj Ns Nz 2 28 29 19 | 胃癌 n Nd Nj Ns Nz 2 29 31 20 | 根治术 n Nd Nj ss Nz 2 31 34 21 | , x Nd Nj Ns Nz 2 34 35 22 | , x Nd Nj Ns Nz 2 35 36 23 | 术 v Nd Nj Ns Nz 2 36 37 24 | 中 f Nd Nj Ns Nz 2 37 38 25 | 见 v dr Nj Ns Nz 2 38 39 26 | : x Nd Nj Ns Nz 2 39 40 27 | 腹腔 n Nd Nj Ns Nz 2 40 42 28 | 内 n Nd Nj Ns Nz 2 42 43 29 | 腹水 n Nd Nj Ns Nz 2 43 45 30 | , x Nd Nj Ns Nz 2 45 46 31 | 腹膜 n Nd Nj Ns Nz 2 46 48 32 | 无 v Nd Nj Ns Nz 2 48 49 33 | 转移 v Nd Nj ss Nz 2 49 51 34 | , x Nd Nj Ns Nz 2 51 52 35 | 肝脏 n Nd Nj Ns 
Nz 2 52 54 36 | 未 d Nd Nj Ns Nz 2 54 55 37 | 触及 v Nd Nj Ns Nz 2 55 57 38 | 明显 a Nd Nj Ns zz 2 57 59 39 | 转移性 n Nd Nj Ns Nz 2 59 62 40 | 灶 n Nd Nj ss Nz 2 62 63 41 | , x Nd Nj Ns Nz 2 63 64 42 | 肿瘤 n Nd Nj Ns Nz 2 64 66 43 | 位于 v Nd Nj Ns Nz 2 66 68 44 | 胃体 n Nd jp Ns Nz 2 68 70 45 | 、 x Nd Nj Ns Nz 2 70 71 46 | 胃 n Nd Nj Ns Nz 2 71 72 47 | 底部 f Nd jp Ns Nz 2 72 74 48 | , x Nd Nj Ns Nz 2 74 75 49 | 小 a Nd Nj Ns Nz 2 75 76 50 | 弯 v Nd Nj ss Nz 2 76 77 51 | 侧 v Nd Nj Ns Nz 2 77 78 52 | 偏后 f Nd Nj Ns Nz 2 78 80 53 | 壁 n Nd Nj Ns Nz 2 80 81 54 | , x Nd Nj Ns Nz 2 81 82 55 | 约 d Nd Nj Ns Nz 2 82 83 56 | 5 m Nd Nj Ns Nz 2 83 84 57 | * x Nd Nj Ns Nz 2 84 85 58 | 4 x Nd Nj Ns Nz 2 85 86 59 | * x Nd Nj Ns Nz 2 86 87 60 | 2 m Nd Nj Ns Nz 2 87 88 61 | CM eng Nd Nj Ns Nz 2 88 90 62 | 大小 b Nd Nj Ns Nz 2 90 92 63 | , x Nd Nj Ns Nz 2 92 93 64 | 肿瘤 n Nd Nj Ns Nz 2 93 95 65 | 已 d dr Nj Ns Nz 2 95 96 66 | 侵达 v Nd Nj Ns Nz 2 96 98 67 | 浆膜 n Nd jp Ns Nz 2 98 100 68 | 外 f Nd Nj Ns Nz 2 100 101 69 | , x Nd Nj Ns Nz 2 101 102 70 | 第 m dr Nj Ns Nz 2 102 103 71 | 1 m Nd Nj Ns Nz 2 103 104 72 | 、 x Nd Nj Ns Nz 2 104 105 73 | 3 m Nd Nj Ns Nz 2 105 106 74 | 组 v Nd Nj Ns Nz 2 106 107 75 | 淋巴结 n Nd Nj Ns Nz 2 107 110 76 | 肿大 v Nd Nj Ns zz 2 110 112 77 | , x Nd Nj Ns Nz 2 112 113 78 | 肿瘤 n Nd Nj Ns Nz 2 113 115 79 | 尚 d Nd Nj Ns Nz 2 115 116 80 | 能 v Nd Nj Ns Nz 2 116 117 81 | 活动 vn Nd Nj Ns Nz 2 117 119 82 | , x Nd Nj Ns Nz 2 119 120 83 | 经 n Nd Nj Ns Nz 2 120 121 84 | 探查 vn Nd Nj ss Nz 2 121 123 85 | 决定 v Nd Nj Ns Nz 2 123 125 86 | 行全胃 n Nd Nj Ns Nz 2 125 128 87 | 切除 v Nd Nj ss Nz 2 128 130 88 | , x Nd Nj Ns Nz 2 130 131 89 | 空肠 n Nd Nj Ns Nz 2 131 133 90 | J eng Nd Nj Ns Nz 2 133 134 91 | 字代 n Nd Nj Ns Nz 2 134 136 92 | 胃术 n Nd Nj Ns Nz 2 136 138 93 | 。 x Nd Nj Ns Nz 2 138 139 94 | 95 | 手术 n Nd Nj Ns Nz 2 139 141 96 | 顺利 ad Nd Nj Ns Nz 2 141 143 97 | , x Nd Nj Ns Nz 2 143 144 98 | 术后 t Nd Nj ss Nz 2 144 146 99 | 积极 ad Nd Nj Ns Nz 2 146 148 100 | 予 vg Nd Nj Ns Nz 2 148 149 101 | 相关 v Nd Nj Ns 
Nz 2 149 151 102 | 对症 n Nd Nj Ns Nz 2 151 153 103 | 支持 v Nd Nj Ns Nz 2 153 155 104 | 治疗 v Nd Nj Ns Nz 2 155 157 105 | ; x Nd Nj Ns Nz 2 157 158 106 | , x Nd Nj Ns Nz 2 158 159 107 | 后 f Nd Nj Ns Nz 2 159 160 108 | 病理 n Nd Nj Ns Nz 2 160 162 109 | 示 v Nd Nj Ns Nz 2 162 163 110 | : x Nd Nj Ns Nz 2 163 164 111 | 胃底 n Nd Nj Ns Nz 2 164 166 112 | 、 x Nd Nj Ns Nz 2 166 167 113 | 体小 n Nd Nj Ns Nz 2 167 169 114 | 弯 v Nd Nj ss Nz 2 169 170 115 | 侧 v Nd Nj Ns Nz 2 170 171 116 | 低 a Nd Nj Ns Nz 2 171 172 117 | 分化腺癌 n Nd Nj Ns Nz 2 172 176 118 | , x Nd Nj Ns Nz 2 176 177 119 | 部分 n Nd Nj ss Nz 2 177 179 120 | 为 p Nd Nj ss Nz 2 179 180 121 | 印戒 n Nd Nj Ns Nz 2 180 182 122 | 细胞 n Nd Nj Ns Nz 2 182 184 123 | 癌 zg Nd Nj ss Nz 2 184 185 124 | 图像 n Nd Nj Ns Nz 2 185 187 125 | , x Nd Nj Ns Nz 2 187 188 126 | 蕈 g Nd Nj Ns Nz 2 188 189 127 | 伞型 b Nd Nj Ns Nz 2 189 191 128 | , x Nd Nj Ns Nz 2 191 192 129 | 面积 n Nd Nj Ns Nz 2 192 194 130 | 5.2 m Nd Nj Ns Nz 2 194 197 131 | * x Nd Nj Ns Nz 2 197 198 132 | 3.5 m Nd Nj Ns Nz 2 198 201 133 | CM eng Nd Nj Ns Nz 2 201 203 134 | , x Nd Nj Ns Nz 2 203 204 135 | 局部 n Nd Nj Ns Nz 2 204 206 136 | 侵达 v Nd Nj Ns Nz 2 206 208 137 | 粘膜 n Nd Nj Ns Nz 2 208 210 138 | 下层 n Nd Nj Ns Nz 2 210 212 139 | , x Nd Nj Ns Nz 2 212 213 140 | 并 c Nd Nj Ns Nz 2 213 214 141 | 于 p dr Nj Ns Nz 2 214 215 142 | 少数 m Nd Nj Ns Nz 2 215 217 143 | 腺 n Nd Nj Ns Nz 2 217 218 144 | 管内 n Nd Nj Ns Nz 2 218 220 145 | 查见 v Nd Nj Ns Nz 2 220 222 146 | 癌栓 n Nd Nj ss Nz 2 222 224 147 | 。 x Nd Nj Ns Nz 2 224 225 148 | 149 | 两端 m Nd Nj Ns Nz 2 225 227 150 | 切线 n Nd jp Ns Nz 2 227 229 151 | 及 c Nd Nj Ns Nz 2 229 230 152 | 另 r Nd Nj Ns Nz 2 230 231 153 | 送 v Nd Nj Ns Nz 2 231 232 154 | “ x Nd Nj Ns Nz 2 232 233 155 | 近 a Nd Nj Ns Nz 2 233 234 156 | 端 v Nd Nj Ns Nz 2 234 235 157 | 切线 n Nd jp Ns Nz 2 235 237 158 | ” x Nd Nj Ns Nz 2 237 238 159 | 未查 v Nd Nj Ns Nz 2 238 240 160 | 见 v dr Nj Ns Nz 2 240 241 161 | 癌 n Nd Nj ss Nz 2 241 242 162 | 。 x Nd Nj Ns Nz 2 242 243 163 | 164 | 呈 v Nd Nj Ns 
Nz 2 243 244 165 | 三组 m Nd Nj Ns Nz 2 244 246 166 | ( x Nd Nj Ns Nz 2 246 247 167 | 5 x Nd Nj Ns Nz 2 247 248 168 | / x Nd Nj Ns Nz 2 248 249 169 | 13 m Nd Nj Ns Nz 2 249 251 170 | 个 m Nd Nj Ns Nz 2 251 252 171 | ) x Nd Nj Ns Nz 2 252 253 172 | 淋巴结 n Nd Nj Ns Nz 2 253 256 173 | 癌 zg Nd Nj ss Nz 2 256 257 174 | 转移 v Nd Nj ss Nz 2 257 259 175 | 。 x Nd Nj Ns Nz 2 259 260 176 | 177 | 一组 m Nd Nj Ns Nz 2 260 262 178 | ( x Nd Nj Ns Nz 2 262 263 179 | 7 m Nd Nj Ns Nz 2 263 264 180 | 个 m Nd Nj Ns Nz 2 264 265 181 | ) x Nd Nj Ns Nz 2 265 266 182 | 、 x Nd Nj Ns Nz 2 266 267 183 | 四组 m Nd Nj Ns Nz 2 267 269 184 | ( x Nd Nj Ns Nz 2 269 270 185 | 13 m Nd Nj Ns Nz 2 270 272 186 | 个 m Nd Nj Ns Nz 2 272 273 187 | ) x Nd Nj Ns Nz 2 273 274 188 | 、 x Nd Nj Ns Nz 2 274 275 189 | 五组 m Nd Nj Ns Nz 2 275 277 190 | ( x Nd Nj Ns Nz 2 277 278 191 | 1 m Nd Nj Ns Nz 2 278 279 192 | 个 m Nd Nj Ns Nz 2 279 280 193 | ) x Nd Nj Ns Nz 2 280 281 194 | 、 x Nd Nj Ns Nz 2 281 282 195 | 六组 m Nd Nj Ns Nz 2 282 284 196 | ( x Nd Nj Ns Nz 2 284 285 197 | 4 m Nd Nj Ns Nz 2 285 286 198 | 个 m Nd Nj Ns Nz 2 286 287 199 | ) x Nd Nj Ns Nz 2 287 288 200 | 淋巴结 n Nd Nj Ns Nz 2 288 291 201 | 未查 v Nd Nj Ns Nz 2 291 293 202 | 见 v dr Nj Ns Nz 2 293 294 203 | 癌 n Nd Nj ss Nz 2 294 295 204 | 。 x Nd Nj Ns Nz 2 295 296 205 | 206 | , x Nd Nj Ns Nz 2 296 297 207 | 癌 zg Nd Nj ss Nz 2 297 298 208 | 组织 v Nd Nj Ns Nz 2 298 300 209 | 免疫组化 n Nd Nj Ns Nz 2 300 304 210 | 染色 n Nd Nj Ns Nz 2 304 306 211 | 示 v Nd Nj Ns Nz 2 306 307 212 | : x Nd Nj Ns Nz 2 307 308 213 | ERCC1 eng Nd Nj Ns Nz 2 308 313 214 | ( x Nd Nj Ns Nz 2 313 314 215 | + x Nd Nj Ns Nz 2 314 315 216 | ) x Nd Nj Ns Nz 2 315 316 217 | 、 x Nd Nj Ns Nz 2 316 317 218 | β x Nd Nj Ns Nz 2 317 318 219 | - x Nd Nj Ns Nz 2 318 319 220 | TUBULIN eng Nd Nj Ns Nz 2 319 326 221 | - x Nd Nj Ns Nz 2 326 327 222 | III eng Nd Nj Ns Nz 2 327 330 223 | ( x Nd Nj Ns Nz 2 330 331 224 | + x Nd Nj Ns Nz 2 331 332 225 | ) x Nd Nj Ns Nz 2 332 333 226 | 、 x Nd Nj Ns Nz 2 333 334 227 | TS eng Nd 
Nj Ns Nz 2 334 336 228 | ( x Nd Nj Ns Nz 2 336 337 229 | - x Nd Nj Ns Nz 2 337 338 230 | ) x Nd Nj Ns Nz 2 338 339 231 | 、 x Nd Nj Ns Nz 2 339 340 232 | RRM1 eng Nd Nj Ns Nz 2 340 344 233 | ( x Nd Nj Ns Nz 2 344 345 234 | - x Nd Nj Ns Nz 2 345 346 235 | ) x Nd Nj Ns Nz 2 346 347 236 | 、 x Nd Nj Ns Nz 2 347 348 237 | TOPOII eng Nd Nj Ns Nz 2 348 354 238 | 阳性细胞 n Nd Nj Ns Nz 2 354 358 239 | 数约 n Nd Nj Ns Nz 2 358 360 240 | 20 m Nd Nj Ns Nz 2 360 362 241 | % x Nd Nj Ns Nz 2 362 363 242 | 、 x Nd Nj Ns Nz 2 363 364 243 | CERBB eng Nd Nj Ns Nz 2 364 369 244 | - x Nd Nj Ns Nz 2 369 370 245 | 2 x Nd Nj Ns Nz 2 370 371 246 | ( x Nd Nj Ns Nz 2 371 372 247 | 2 m Nd Nj Ns Nz 2 372 373 248 | + x Nd Nj Ns Nz 2 373 374 249 | ) x Nd Nj Ns Nz 2 374 375 250 | x Nd Nj Ns Nz 2 375 376 251 | 。 x Nd Nj Ns Nz 2 376 377 252 | 253 | 依据 p Nd Nj Ns Nz 2 377 379 254 | 患者 n Nd Nj Ns Nz 2 379 381 255 | 病情 n Nd Nj Ns Nz 2 381 383 256 | 及 c Nd Nj Ns Nz 2 383 384 257 | 肿瘤 n Nd Nj Ns Nz 2 384 386 258 | 病理 n Nd Nj Ns Nz 2 386 388 259 | 与 p Nd Nj Ns Nz 2 388 389 260 | 分期 vn Nd Nj Ns Nz 2 389 391 261 | 继续 v Nd Nj Ns Nz 2 391 393 262 | 术后 t Nd Nj ss Nz 2 393 395 263 | 辅助性 n Nd Nj Ns Nz 2 395 398 264 | 化疗 n Nd Nj Ns Nz 2 398 400 265 | 指征 n Nd Nj Ns Nz 2 400 402 266 | 存在 v Nd Nj Ns Nz 2 402 404 267 | , x Nd Nj Ns Nz 2 404 405 268 | 患者 n Nd Nj Ns Nz 2 405 407 269 | 及 c Nd Nj Ns Nz 2 407 408 270 | 家属 n Nd Nj Ns Nz 2 408 410 271 | 拒绝 v Nd Nj Ns Nz 2 410 412 272 | 化疗 n Nd Nj Ns Nz 2 412 414 273 | 。 x Nd Nj Ns Nz 2 414 415 274 | 275 | 自 p Nd jp Ns Nz 2 415 416 276 | 术后 t Nd Nj ss Nz 2 416 418 277 | 出院 n Nd Nj Ns Nz 2 418 420 278 | 以来 f Nd Nj Ns Nz 2 420 422 279 | , x Nd Nj Ns Nz 2 422 423 280 | 患者 n Nd Nj Ns Nz 2 423 425 281 | 一般 a Nd Nj Ns Nz 2 425 427 282 | 情况 n Nd Nj Ns Nz 2 427 429 283 | 保持良好 n Nd Nj Ns Nz 2 429 433 284 | ; x Nd Nj Ns Nz 2 433 434 285 | 无 v Nd Nj Ns Nz 2 434 435 286 | 发热 v Nd Nj Ns Nz 2 435 437 287 | , x Nd Nj Ns Nz 2 437 438 288 | 偶 d dr Nj Ns Nz 2 438 439 289 | 有 v dr Nj Ns Nz 2 439 440 
290 | 恶心 n Nd Nj Ns Nz 2 440 442 291 | , x Nd Nj Ns Nz 2 442 443 292 | 无 v Nd Nj Ns Nz 2 443 444 293 | 呕吐 v Nd Nj Ns Nz 2 444 446 294 | , x Nd Nj Ns Nz 2 446 447 295 | 无 v Nd Nj Ns Nz 2 447 448 296 | 反酸 n Nd Nj Ns Nz 2 448 450 297 | 、 x Nd Nj Ns Nz 2 450 451 298 | 嗳气 n Nd Nj Ns Nz 2 451 453 299 | , x Nd Nj Ns Nz 2 453 454 300 | 无 v Nd Nj Ns Nz 2 454 455 301 | 明显 a Nd Nj Ns zz 2 455 457 302 | 进食 v dr Nj Ns Nz 2 457 459 303 | 不适 a Nd Nj Ns zz 2 459 461 304 | , x Nd Nj Ns Nz 2 461 462 305 | 偶 d dr Nj Ns Nz 2 462 463 306 | 有 v dr Nj Ns Nz 2 463 464 307 | 进食 v dr Nj Ns Nz 2 464 466 308 | 后 f Nd Nj Ns Nz 2 466 467 309 | 轻微 d Nd Nj Ns Nz 2 467 469 310 | 腹胀 v Nd Nj Ns Nz 2 469 471 311 | , x Nd Nj Ns Nz 2 471 472 312 | 无 v Nd Nj Ns Nz 2 472 473 313 | 腹痛 n Nd Nj Ns Nz 2 473 475 314 | 。 x Nd Nj Ns Nz 2 475 476 315 | 316 | 现 tg Nd Nj Ns Nz 2 476 477 317 | 患者 n Nd Nj Ns Nz 2 477 479 318 | 为 p Nd Nj ss Nz 2 479 480 319 | 行 n Nd Nj ss Nz 2 480 481 320 | 进一步 d Nd Nj Ns Nz 2 481 484 321 | 复查 vn Nd Nj Ns Nz 2 484 486 322 | 并 c Nd Nj Ns Nz 2 486 487 323 | 必要 d dr Nj Ns Nz 2 487 489 324 | 时 n Nd Nj Ns Nz 2 489 490 325 | 适当 a Nd Nj Ns Nz 2 490 492 326 | 处理 v Nd Nj Ns Nz 2 492 494 327 | 而 c dr Nj Ns Nz 2 494 495 328 | 再 d Nd Nj Ns Nz 2 495 496 329 | 来 v dr Nj Ns Nz 2 496 497 330 | 我院 n Nd Nj Ns Nz 2 497 499 331 | 就诊 v Nd Nj Ns Nz 2 499 501 332 | , x Nd Nj Ns Nz 2 501 502 333 | 门诊 n Nd Nj Ns Nz 2 502 504 334 | 依情 n Nd Nj Ns Nz 2 504 506 335 | 以 p Nd Nj Ns Nz 2 506 507 336 | “ x Nd Nj Ns Nz 2 507 508 337 | 胃 n Nd Nj Ns Nz 2 508 509 338 | 恶性肿瘤 l Nd Nj ss Nz 2 509 513 339 | 术后 t Nd Nj ss Nz 2 513 515 340 | ” x Nd Nj Ns Nz 2 515 516 341 | 收入 v Nd Nj Ns Nz 2 516 518 342 | 院 n Nd Nj Ns Nz 2 518 519 343 | 。 x Nd Nj Ns Nz 2 519 520 344 | 345 | 目前 t Nd Nj Ns Nz 2 520 522 346 | 患者 n Nd Nj Ns Nz 2 522 524 347 | 精神 n Nd Nj Ns Nz 2 524 526 348 | 及 c Nd Nj Ns Nz 2 526 527 349 | 情绪 n Nd Nj Ns Nz 2 527 529 350 | 状态 n Nd Nj Ns Nz 2 529 531 351 | 良好 a Nd Nj Ns Nz 2 531 533 352 | , x Nd Nj Ns Nz 2 533 534 
353 | 食欲 n Nd Nj Ns Nz 2 534 536 354 | 较 d dr Nj Ns Nz 2 536 537 355 | 术 v Nd Nj Ns Nz 2 537 538 356 | 前 f Nd Nj Ns Nz 2 538 539 357 | 明显 a Nd Nj Ns zz 2 539 541 358 | 减少 v Nd Nj ss Nz 2 541 543 359 | , x Nd Nj Ns Nz 2 543 544 360 | 饮食 n Nd Nj Ns Nz 2 544 546 361 | 可 v Nd Nj Ns Nz 2 546 547 362 | , x Nd Nj Ns Nz 2 547 548 363 | 夜间 t Nd Nj Ns Nz 2 548 550 364 | 睡眠 v Nd Nj ss Nz 2 550 552 365 | 后 f Nd Nj Ns Nz 2 552 553 366 | ; x Nd Nj Ns Nz 2 553 554 367 | 今 t dr Nj Ns Nz 2 554 555 368 | 8 m Nd Nj Ns Nz 2 555 556 369 | 个 m Nd Nj Ns Nz 2 556 557 370 | 月 m Nd Nj Ns Nz 2 557 558 371 | 体重减轻 i Nd Nj Ns Nz 2 558 562 372 | 18 m Nd Nj Ns Nz 2 562 564 373 | KG eng Nd Nj Ns Nz 2 564 566 374 | 。 x Nd Nj Ns Nz 2 566 567 375 | 376 | 377 | x Nd Nj Ns Nz 2 567 569 378 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Wapiti/dat/train.txt: -------------------------------------------------------------------------------- 1 | 1 1 2 | 2 2 3 | 3 2 4 | 2 2 5 | 2 2 6 | 7 | 1 1 8 | 1 1 9 | 1 1 10 | 1 1 11 | 1 1 12 | 13 | 2 2 14 | 1 2 15 | 2 2 16 | 1 1 17 | 2 1 18 | 19 | 2 2 20 | 2 2 21 | 3 2 22 | 2 2 23 | 2 2 24 | 25 | 2 2 26 | 2 2 27 | 3 2 28 | 2 2 29 | 3 2 30 | 31 | 1 1 32 | 4 1 33 | 2 1 34 | 2 1 35 | 1 1 36 | 37 | 4 4 38 | 4 3 39 | 3 3 40 | 3 3 41 | 2 3 42 | 43 | 1 1 44 | 1 1 45 | 1 1 46 | 1 1 47 | 1 1 48 | 49 | 3 3 50 | 3 3 51 | 3 3 52 | 3 3 53 | 4 4 54 | 55 | 2 2 56 | 3 2 57 | 2 2 58 | 2 2 59 | 1 2 60 | 61 | 2 1 62 | 1 1 63 | 1 1 64 | 2 1 65 | 2 1 66 | 67 | 4 4 68 | 4 4 69 | 4 4 70 | 4 4 71 | 4 4 72 | 73 | 1 1 74 | 1 1 75 | 1 1 76 | 1 1 77 | 1 1 78 | 79 | 1 1 80 | 1 1 81 | 1 1 82 | 2 2 83 | 2 2 84 | 85 | 5 5 86 | 5 5 87 | 5 5 88 | 5 5 89 | 5 5 90 | 91 | 1 1 92 | 1 1 93 | 1 1 94 | 1 1 95 | 1 1 96 | 97 | 5 5 98 | 4 4 99 | 4 4 100 | 4 4 101 
| 4 4 102 | 103 | 1 1 104 | 1 1 105 | 1 1 106 | 1 1 107 | 1 1 108 | 109 | 2 2 110 | 2 2 111 | 1 2 112 | 2 2 113 | 3 3 114 | 115 | 2 1 116 | 1 1 117 | 1 1 118 | 2 1 119 | 2 2 120 | 121 | 1 1 122 | 1 1 123 | 1 1 124 | 1 1 125 | 1 1 126 | 127 | 3 3 128 | 3 3 129 | 3 3 130 | 3 3 131 | 3 3 132 | 133 | 2 2 134 | 2 2 135 | 2 2 136 | 1 1 137 | 1 1 138 | 139 | 3 2 140 | 3 2 141 | 2 2 142 | 2 2 143 | 2 2 144 | 145 | 4 4 146 | 4 4 147 | 4 4 148 | 5 5 149 | 4 5 150 | 151 | 2 2 152 | 1 1 153 | 1 1 154 | 1 1 155 | 3 1 156 | 157 | 4 4 158 | 4 4 159 | 4 6 160 | 5 6 161 | 5 6 162 | 163 | 1 1 164 | 2 1 165 | 2 1 166 | 2 2 167 | 2 2 168 | 169 | 1 1 170 | 2 1 171 | 1 1 172 | 1 1 173 | 1 1 174 | 175 | 2 1 176 | 1 1 177 | 2 1 178 | 1 1 179 | 1 1 180 | 181 | 5 6 182 | 5 6 183 | 5 6 184 | 5 6 185 | 5 6 186 | 187 | 1 1 188 | 1 1 189 | 2 1 190 | 2 2 191 | 2 2 192 | 193 | 1 1 194 | 1 1 195 | 2 1 196 | 1 1 197 | 1 1 198 | 199 | 1 1 200 | 1 1 201 | 1 1 202 | 1 1 203 | 2 2 204 | 205 | 5 5 206 | 5 5 207 | 5 5 208 | 5 5 209 | 5 5 210 | 211 | 4 4 212 | 4 4 213 | 4 4 214 | 4 4 215 | 4 4 216 | 217 | 3 3 218 | 2 2 219 | 2 2 220 | 2 2 221 | 2 2 222 | 223 | 1 1 224 | 2 1 225 | 1 1 226 | 1 1 227 | 4 3 228 | 229 | 3 3 230 | 3 3 231 | 5 3 232 | 3 3 233 | 3 3 234 | 235 | 3 3 236 | 3 3 237 | 3 3 238 | 3 3 239 | 3 3 240 | 241 | 1 1 242 | 1 1 243 | 1 1 244 | 1 1 245 | 1 1 246 | 247 | 1 1 248 | 1 1 249 | 1 1 250 | 1 1 251 | 1 1 252 | 253 | 1 1 254 | 1 1 255 | 1 1 256 | 1 1 257 | 1 1 258 | 259 | 2 2 260 | 4 2 261 | 2 2 262 | 2 2 263 | 2 2 264 | 265 | 2 2 266 | 2 2 267 | 1 2 268 | 2 2 269 | 2 2 270 | 271 | 1 1 272 | 1 1 273 | 2 2 274 | 2 2 275 | 2 2 276 | 277 | 4 4 278 | 4 4 279 | 5 5 280 | 5 5 281 | 5 5 282 | 283 | 2 2 284 | 2 2 285 | 1 1 286 | 2 1 287 | 1 1 288 | 289 | 2 2 290 | 2 2 291 | 2 2 292 | 2 2 293 | 2 2 294 | 295 | 1 1 296 | 1 2 297 | 3 2 298 | 2 2 299 | 2 2 300 | 301 | 2 2 302 | 3 2 303 | 1 2 304 | 2 2 305 | 3 3 306 | 307 | 1 1 308 | 1 1 309 | 1 1 310 | 1 1 311 | 1 1 312 | 313 | 2 2 314 | 2 2 315 | 2 
2 316 | 2 2 317 | 2 2 318 | 319 | 2 1 320 | 1 1 321 | 1 1 322 | 1 1 323 | 1 1 324 | 325 | 2 1 326 | 1 1 327 | 1 1 328 | 1 1 329 | 2 2 330 | 331 | 1 1 332 | 1 1 333 | 1 1 334 | 1 1 335 | 1 1 336 | 337 | 1 1 338 | 1 1 339 | 1 1 340 | 1 1 341 | 2 3 342 | 343 | 1 1 344 | 1 1 345 | 1 1 346 | 1 1 347 | 1 1 348 | 349 | 1 1 350 | 1 1 351 | 1 1 352 | 4 2 353 | 2 2 354 | 355 | 1 1 356 | 2 1 357 | 1 1 358 | 1 1 359 | 1 1 360 | 361 | 1 1 362 | 1 1 363 | 1 1 364 | 1 1 365 | 1 1 366 | 367 | 2 2 368 | 2 2 369 | 2 2 370 | 2 2 371 | 2 2 372 | 373 | 2 2 374 | 2 2 375 | 2 2 376 | 4 4 377 | 4 4 378 | 379 | 1 2 380 | 3 3 381 | 2 3 382 | 2 3 383 | 3 3 384 | 385 | 1 1 386 | 1 1 387 | 1 1 388 | 1 1 389 | 1 1 390 | 391 | 4 3 392 | 3 3 393 | 3 3 394 | 3 4 395 | 4 4 396 | 397 | 3 3 398 | 4 4 399 | 5 4 400 | 4 4 401 | 5 3 402 | 403 | 1 1 404 | 1 1 405 | 1 1 406 | 1 1 407 | 1 1 408 | 409 | 1 1 410 | 1 1 411 | 1 1 412 | 1 2 413 | 2 2 414 | 415 | 2 2 416 | 2 2 417 | 2 2 418 | 2 2 419 | 2 2 420 | 421 | 1 1 422 | 2 1 423 | 1 1 424 | 1 1 425 | 1 1 426 | 427 | 4 4 428 | 4 4 429 | 2 3 430 | 3 3 431 | 3 3 432 | 433 | 1 1 434 | 2 1 435 | 1 1 436 | 1 1 437 | 2 1 438 | 439 | 2 2 440 | 3 3 441 | 3 3 442 | 3 3 443 | 2 3 444 | 445 | 3 2 446 | 2 3 447 | 3 3 448 | 4 4 449 | 4 4 450 | 451 | 3 3 452 | 4 3 453 | 3 3 454 | 3 3 455 | 3 3 456 | 457 | 2 2 458 | 2 2 459 | 1 2 460 | 2 2 461 | 2 3 462 | 463 | 3 3 464 | 3 3 465 | 3 3 466 | 3 4 467 | 4 4 468 | 469 | 2 2 470 | 2 2 471 | 3 3 472 | 3 3 473 | 3 3 474 | 475 | 1 1 476 | 1 1 477 | 2 2 478 | 1 2 479 | 1 2 480 | 481 | 2 2 482 | 2 2 483 | 2 2 484 | 2 2 485 | 1 2 486 | 487 | 3 3 488 | 2 2 489 | 2 2 490 | 3 2 491 | 2 2 492 | 493 | 5 6 494 | 5 6 495 | 5 6 496 | 4 6 497 | 5 6 498 | 499 | 2 1 500 | 1 1 501 | 1 1 502 | 1 1 503 | 2 1 504 | 505 | 4 4 506 | 4 4 507 | 4 4 508 | 4 4 509 | 4 4 510 | 511 | 1 1 512 | 1 1 513 | 1 1 514 | 2 1 515 | 1 1 516 | 517 | 2 2 518 | 2 2 519 | 2 2 520 | 2 2 521 | 2 2 522 | 523 | 2 2 524 | 2 2 525 | 2 2 526 | 3 3 527 | 3 3 528 | 529 | 1 1 
530 | 1 1 531 | 1 1 532 | 1 1 533 | 2 1 534 | 535 | 1 1 536 | 1 1 537 | 1 1 538 | 1 1 539 | 1 1 540 | 541 | 1 1 542 | 2 1 543 | 1 1 544 | 1 1 545 | 2 1 546 | 547 | 4 4 548 | 4 4 549 | 4 4 550 | 3 3 551 | 3 3 552 | 553 | 2 2 554 | 2 2 555 | 2 2 556 | 3 2 557 | 1 1 558 | 559 | 3 3 560 | 2 2 561 | 2 2 562 | 3 2 563 | 2 2 564 | 565 | 5 5 566 | 5 5 567 | 5 6 568 | 5 6 569 | 5 6 570 | 571 | 1 1 572 | 1 1 573 | 1 1 574 | 1 1 575 | 1 1 576 | 577 | 3 3 578 | 4 3 579 | 4 4 580 | 3 4 581 | 3 4 582 | 583 | 3 3 584 | 3 3 585 | 2 3 586 | 3 3 587 | 3 3 588 | 589 | 2 2 590 | 3 2 591 | 1 1 592 | 1 1 593 | 1 1 594 | 595 | 1 1 596 | 1 1 597 | 1 1 598 | 1 1 599 | 1 1 600 | 601 | 1 1 602 | 1 1 603 | 1 1 604 | 2 1 605 | 1 1 606 | 607 | 1 1 608 | 1 1 609 | 2 1 610 | 2 2 611 | 2 2 612 | 613 | 1 1 614 | 1 1 615 | 1 1 616 | 1 1 617 | 1 1 618 | 619 | 1 1 620 | 1 1 621 | 1 1 622 | 1 1 623 | 1 1 624 | 625 | 1 1 626 | 1 1 627 | 1 1 628 | 1 1 629 | 1 1 630 | 631 | 3 3 632 | 3 3 633 | 3 3 634 | 3 3 635 | 2 2 636 | 637 | 1 1 638 | 1 1 639 | 1 1 640 | 1 1 641 | 1 1 642 | 643 | 4 4 644 | 4 4 645 | 4 4 646 | 4 4 647 | 4 4 648 | 649 | 2 2 650 | 2 2 651 | 2 2 652 | 3 3 653 | 3 3 654 | 655 | 1 1 656 | 1 1 657 | 2 1 658 | 1 1 659 | 1 1 660 | 661 | 3 3 662 | 3 3 663 | 3 3 664 | 2 3 665 | 3 3 666 | 667 | 2 2 668 | 2 2 669 | 3 3 670 | 3 3 671 | 3 3 672 | 673 | 1 2 674 | 2 2 675 | 2 2 676 | 1 2 677 | 2 2 678 | 679 | 2 2 680 | 3 3 681 | 3 3 682 | 3 3 683 | 3 3 684 | 685 | 2 2 686 | 2 2 687 | 3 2 688 | 2 2 689 | 2 2 690 | 691 | 1 1 692 | 2 1 693 | 1 1 694 | 1 1 695 | 2 1 696 | 697 | 1 1 698 | 1 1 699 | 3 1 700 | 1 1 701 | 2 1 702 | 703 | 1 1 704 | 1 1 705 | 1 1 706 | 2 2 707 | 3 2 708 | 709 | 2 2 710 | 3 2 711 | 2 2 712 | 2 2 713 | 3 3 714 | 715 | 1 1 716 | 2 2 717 | 2 2 718 | 1 1 719 | 1 1 720 | 721 | 1 1 722 | 1 1 723 | 1 1 724 | 1 1 725 | 1 1 726 | 727 | 2 2 728 | 2 2 729 | 2 2 730 | 2 2 731 | 2 2 732 | 733 | 4 3 734 | 3 3 735 | 3 3 736 | 3 3 737 | 3 3 738 | 739 | 2 2 740 | 2 2 741 | 2 2 742 | 2 2 743 | 3 3 
744 | 745 | 1 1 746 | 2 1 747 | 1 1 748 | 1 1 749 | 1 1 750 | 751 | 3 2 752 | 2 2 753 | 3 3 754 | 3 3 755 | 4 3 756 | 757 | 1 1 758 | 2 1 759 | 1 1 760 | 1 1 761 | 1 1 762 | 763 | 2 2 764 | 2 2 765 | 2 2 766 | 4 2 767 | 3 3 768 | 769 | 1 1 770 | 2 2 771 | 2 2 772 | 2 2 773 | 2 2 774 | 775 | 4 4 776 | 3 4 777 | 4 4 778 | 5 5 779 | 5 5 780 | 781 | 3 3 782 | 3 3 783 | 3 3 784 | 3 3 785 | 3 2 786 | 787 | 4 4 788 | 4 4 789 | 4 4 790 | 4 4 791 | 5 4 792 | 793 | 2 2 794 | 2 2 795 | 2 2 796 | 2 2 797 | 3 2 798 | 799 | 2 2 800 | 2 2 801 | 2 2 802 | 2 2 803 | 5 3 804 | 805 | 1 1 806 | 2 1 807 | 2 2 808 | 2 2 809 | 2 2 810 | 811 | 2 1 812 | 1 1 813 | 2 2 814 | 2 2 815 | 1 1 816 | 817 | 2 1 818 | 1 1 819 | 2 2 820 | 1 2 821 | 2 2 822 | 823 | 1 1 824 | 1 1 825 | 1 1 826 | 1 1 827 | 1 1 828 | 829 | 2 2 830 | 1 1 831 | 1 1 832 | 1 1 833 | 1 1 834 | 835 | 1 1 836 | 1 1 837 | 1 1 838 | 3 1 839 | 2 2 840 | 841 | 3 2 842 | 1 2 843 | 2 1 844 | 2 2 845 | 2 2 846 | 847 | 2 2 848 | 2 2 849 | 3 3 850 | 2 3 851 | 3 3 852 | 853 | 2 2 854 | 2 2 855 | 4 4 856 | 5 4 857 | 2 2 858 | 859 | 1 1 860 | 1 1 861 | 3 1 862 | 1 1 863 | 1 1 864 | 865 | 1 1 866 | 1 1 867 | 1 1 868 | 1 1 869 | 1 1 870 | 871 | 4 4 872 | 4 4 873 | 4 4 874 | 4 4 875 | 3 4 876 | 877 | 5 5 878 | 4 4 879 | 5 5 880 | 4 5 881 | 5 5 882 | 883 | 2 2 884 | 1 1 885 | 1 1 886 | 1 1 887 | 1 1 888 | 889 | 2 2 890 | 2 2 891 | 2 2 892 | 1 2 893 | 2 2 894 | 895 | 1 1 896 | 1 1 897 | 1 1 898 | 1 1 899 | 1 1 900 | 901 | 2 2 902 | 3 2 903 | 2 2 904 | 3 3 905 | 4 4 906 | 907 | 2 2 908 | 2 2 909 | 2 2 910 | 1 1 911 | 1 1 912 | 913 | 1 1 914 | 1 1 915 | 1 1 916 | 2 1 917 | 1 1 918 | 919 | 3 2 920 | 3 2 921 | 2 2 922 | 1 1 923 | 1 1 924 | 925 | 3 3 926 | 3 3 927 | 2 2 928 | 3 3 929 | 3 3 930 | 931 | 2 2 932 | 2 2 933 | 2 2 934 | 2 2 935 | 2 2 936 | 937 | 1 1 938 | 1 1 939 | 1 1 940 | 1 1 941 | 1 1 942 | 943 | 5 5 944 | 4 4 945 | 4 4 946 | 3 3 947 | 2 3 948 | 949 | 1 1 950 | 1 1 951 | 1 1 952 | 2 1 953 | 1 1 954 | 955 | 2 2 956 | 3 3 957 | 3 3 958 
| 3 3 959 | 3 3 960 | 961 | 5 5 962 | 5 5 963 | 5 4 964 | 4 4 965 | 4 4 966 | 967 | 1 1 968 | 1 1 969 | 2 2 970 | 2 2 971 | 3 3 972 | 973 | 1 1 974 | 3 2 975 | 2 2 976 | 2 2 977 | 2 2 978 | 979 | 1 1 980 | 1 1 981 | 1 1 982 | 1 1 983 | 1 1 984 | 985 | 1 1 986 | 2 1 987 | 1 1 988 | 1 2 989 | 2 2 990 | 991 | 2 2 992 | 2 2 993 | 2 2 994 | 2 2 995 | 3 2 996 | 997 | 1 1 998 | 1 1 999 | 1 1 1000 | 1 1 1001 | 1 1 1002 | 1003 | 2 2 1004 | 2 2 1005 | 2 2 1006 | 2 2 1007 | 2 3 1008 | 1009 | 5 5 1010 | 4 5 1011 | 5 5 1012 | 5 6 1013 | 5 5 1014 | 1015 | 2 2 1016 | 3 2 1017 | 2 2 1018 | 1 2 1019 | 3 2 1020 | 1021 | 1 2 1022 | 2 2 1023 | 1 2 1024 | 2 2 1025 | 2 2 1026 | 1027 | 2 1 1028 | 1 1 1029 | 1 1 1030 | 1 1 1031 | 1 1 1032 | 1033 | 4 4 1034 | 3 4 1035 | 4 4 1036 | 4 4 1037 | 4 4 1038 | 1039 | 2 2 1040 | 2 2 1041 | 2 2 1042 | 2 2 1043 | 2 2 1044 | 1045 | 3 3 1046 | 3 3 1047 | 3 3 1048 | 3 3 1049 | 3 3 1050 | 1051 | 1 1 1052 | 1 1 1053 | 1 1 1054 | 1 1 1055 | 2 1 1056 | 1057 | 3 4 1058 | 4 4 1059 | 4 4 1060 | 4 4 1061 | 4 4 1062 | 1063 | 1 1 1064 | 1 1 1065 | 1 1 1066 | 1 1 1067 | 1 1 1068 | 1069 | 2 2 1070 | 2 2 1071 | 1 2 1072 | 2 2 1073 | 2 2 1074 | 1075 | 3 3 1076 | 3 3 1077 | 4 4 1078 | 3 4 1079 | 4 4 1080 | 1081 | 1 1 1082 | 1 1 1083 | 1 2 1084 | 2 2 1085 | 2 2 1086 | 1087 | 2 2 1088 | 3 4 1089 | 4 4 1090 | 4 4 1091 | 5 5 1092 | 1093 | 2 2 1094 | 2 2 1095 | 2 2 1096 | 2 2 1097 | 3 3 1098 | 1099 | 1 1 1100 | 2 2 1101 | 2 2 1102 | 1 1 1103 | 2 2 1104 | 1105 | 3 2 1106 | 3 3 1107 | 3 3 1108 | 3 3 1109 | 3 3 1110 | 1111 | 1 1 1112 | 1 1 1113 | 1 1 1114 | 1 1 1115 | 1 1 1116 | 1117 | 2 2 1118 | 2 2 1119 | 2 2 1120 | 2 2 1121 | 2 2 1122 | 1123 | 2 2 1124 | 3 3 1125 | 4 4 1126 | 4 4 1127 | 4 4 1128 | 1129 | 3 3 1130 | 5 3 1131 | 3 3 1132 | 2 2 1133 | 2 2 1134 | 1135 | 1 1 1136 | 1 1 1137 | 1 1 1138 | 1 1 1139 | 1 1 1140 | 1141 | 5 6 1142 | 5 6 1143 | 5 6 1144 | 5 6 1145 | 5 5 1146 | 1147 | 3 3 1148 | 3 3 1149 | 3 3 1150 | 3 3 1151 | 3 3 1152 | 1153 | 1 1 1154 | 1 1 1155 | 2 2 
1156 | 2 2 1157 | 1 2 1158 | 1159 | 2 1 1160 | 1 1 1161 | 1 1 1162 | 1 1 1163 | 1 1 1164 | 1165 | 4 4 1166 | 1 2 1167 | 2 2 1168 | 2 2 1169 | 2 2 1170 | 1171 | 5 2 1172 | 4 2 1173 | 3 3 1174 | 3 3 1175 | 4 3 1176 | 1177 | 1 1 1178 | 1 1 1179 | 1 1 1180 | 1 1 1181 | 1 1 1182 | 1183 | 1 1 1184 | 2 1 1185 | 2 2 1186 | 2 2 1187 | 4 3 1188 | 1189 | 1 1 1190 | 1 1 1191 | 1 1 1192 | 2 1 1193 | 1 1 1194 | 1195 | 3 3 1196 | 4 4 1197 | 4 4 1198 | 3 4 1199 | 4 4 1200 | 1201 | -------------------------------------------------------------------------------- /Flyon/CCKS_CRF/BIO_ccks/入院记录现病史-2.txtoriginal.txt: -------------------------------------------------------------------------------- 1 | , x Nd Nj Ns Nz 2 0 1 O 2 | 患者 n Nd Nj Ns Nz 2 1 3 O 3 | 因 p dr Nj Ns Nz 2 3 4 O 4 | 罹患 v Nd Nj Ns Nz 2 4 6 O 5 | “ x Nd Nj Ns Nz 2 6 7 O 6 | 胃癌 n Nd Nj Ns Nz 2 7 9 O 7 | ” x Nd Nj Ns Nz 2 9 10 O 8 | 于 p dr Nj Ns Nz 2 10 11 O 9 | 2013 m Nd Nj Ns Nz 2 11 15 O 10 | - x Nd Nj Ns Nz 2 15 16 O 11 | 10 m Nd Nj Ns Nz 2 16 18 O 12 | - x Nd Nj Ns Nz 2 18 19 O 13 | 29 m Nd Nj Ns Nz 2 19 21 O 14 | 在 p Nd Nj ss Nz 2 21 22 O 15 | 我院 n Nd Nj Ns Nz 2 22 24 O 16 | 予行 v Nd Nj Ns Nz 2 24 26 O 17 | 全麻 n Nd Nj Ns Nz 2 26 28 O 18 | 下 f Nd Nj Ns Nz 2 28 29 O 19 | 胃癌 n Nd Nj Ns Nz 2 29 31 B-手术 20 | 根治术 n Nd Nj ss Nz 2 31 34 I-手术 21 | , x Nd Nj Ns Nz 2 34 35 O 22 | , x Nd Nj Ns Nz 2 35 36 O 23 | 术 v Nd Nj Ns Nz 2 36 37 O 24 | 中 f Nd Nj Ns Nz 2 37 38 O 25 | 见 v dr Nj Ns Nz 2 38 39 O 26 | : x Nd Nj Ns Nz 2 39 40 O 27 | 腹腔 n Nd Nj Ns Nz 2 40 42 B-解剖部位 28 | 内 n Nd Nj Ns Nz 2 42 43 O 29 | 腹水 n Nd Nj Ns Nz 2 43 45 O 30 | , x Nd Nj Ns Nz 2 45 46 O 31 | 腹膜 n Nd Nj Ns Nz 2 46 48 B-解剖部位 32 | 无 v Nd Nj Ns Nz 2 48 49 O 33 | 转移 v Nd Nj ss Nz 2 49 51 O 34 | , x Nd Nj Ns Nz 2 51 52 O 35 | 肝脏 n Nd Nj Ns Nz 2 52 54 B-解剖部位 36 | 未 d Nd Nj Ns Nz 2 54 55 O 37 | 触及 v Nd Nj Ns Nz 2 55 57 O 38 | 明显 a Nd Nj Ns zz 2 57 59 O 39 | 转移性 n Nd Nj Ns Nz 2 59 62 O 40 | 灶 n Nd Nj ss Nz 2 62 63 O 41 | , x Nd Nj Ns Nz 2 63 64 O 42 | 肿瘤 n Nd Nj Ns Nz 
2 64 66 O 43 | 位于 v Nd Nj Ns Nz 2 66 68 O 44 | 胃体 n Nd jp Ns Nz 2 68 70 B-解剖部位 45 | 、 x Nd Nj Ns Nz 2 70 71 O 46 | 胃 n Nd Nj Ns Nz 2 71 72 B-解剖部位 47 | 底部 f Nd jp Ns Nz 2 72 74 I-解剖部位 48 | , x Nd Nj Ns Nz 2 74 75 O 49 | 小 a Nd Nj Ns Nz 2 75 76 B-解剖部位 50 | 弯 v Nd Nj ss Nz 2 76 77 I-解剖部位 51 | 侧 v Nd Nj Ns Nz 2 77 78 I-解剖部位 52 | 偏后 f Nd Nj Ns Nz 2 78 80 I-解剖部位 53 | 壁 n Nd Nj Ns Nz 2 80 81 I-解剖部位 54 | , x Nd Nj Ns Nz 2 81 82 O 55 | 约 d Nd Nj Ns Nz 2 82 83 O 56 | 5 m Nd Nj Ns Nz 2 83 84 O 57 | * x Nd Nj Ns Nz 2 84 85 O 58 | 4 x Nd Nj Ns Nz 2 85 86 O 59 | * x Nd Nj Ns Nz 2 86 87 O 60 | 2 m Nd Nj Ns Nz 2 87 88 O 61 | CM eng Nd Nj Ns Nz 2 88 90 O 62 | 大小 b Nd Nj Ns Nz 2 90 92 O 63 | , x Nd Nj Ns Nz 2 92 93 O 64 | 肿瘤 n Nd Nj Ns Nz 2 93 95 O 65 | 已 d dr Nj Ns Nz 2 95 96 O 66 | 侵达 v Nd Nj Ns Nz 2 96 98 O 67 | 浆膜 n Nd jp Ns Nz 2 98 100 O 68 | 外 f Nd Nj Ns Nz 2 100 101 O 69 | , x Nd Nj Ns Nz 2 101 102 O 70 | 第 m dr Nj Ns Nz 2 102 103 O 71 | 1 m Nd Nj Ns Nz 2 103 104 O 72 | 、 x Nd Nj Ns Nz 2 104 105 O 73 | 3 m Nd Nj Ns Nz 2 105 106 O 74 | 组 v Nd Nj Ns Nz 2 106 107 O 75 | 淋巴结 n Nd Nj Ns Nz 2 107 110 B-解剖部位 76 | 肿大 v Nd Nj Ns zz 2 110 112 O 77 | , x Nd Nj Ns Nz 2 112 113 O 78 | 肿瘤 n Nd Nj Ns Nz 2 113 115 O 79 | 尚 d Nd Nj Ns Nz 2 115 116 O 80 | 能 v Nd Nj Ns Nz 2 116 117 O 81 | 活动 vn Nd Nj Ns Nz 2 117 119 O 82 | , x Nd Nj Ns Nz 2 119 120 O 83 | 经 n Nd Nj Ns Nz 2 120 121 O 84 | 探查 vn Nd Nj ss Nz 2 121 123 O 85 | 决定 v Nd Nj Ns Nz 2 123 125 O 86 | 行全胃 n Nd Nj Ns Nz 2 125 128 O 87 | 切除 v Nd Nj ss Nz 2 128 130 I-手术 88 | , x Nd Nj Ns Nz 2 130 131 I-手术 89 | 空肠 n Nd Nj Ns Nz 2 131 133 I-手术 90 | J eng Nd Nj Ns Nz 2 133 134 I-手术 91 | 字代 n Nd Nj Ns Nz 2 134 136 I-手术 92 | 胃术 n Nd Nj Ns Nz 2 136 138 I-手术 93 | 。 x Nd Nj Ns Nz 2 138 139 O 94 | 95 | 手术 n Nd Nj Ns Nz 2 139 141 O 96 | 顺利 ad Nd Nj Ns Nz 2 141 143 O 97 | , x Nd Nj Ns Nz 2 143 144 O 98 | 术后 t Nd Nj ss Nz 2 144 146 O 99 | 积极 ad Nd Nj Ns Nz 2 146 148 O 100 | 予 vg Nd Nj Ns Nz 2 148 149 O 101 | 相关 v Nd Nj Ns Nz 2 149 151 O 102 | 对症 n Nd Nj 
Ns Nz 2 151 153 O 103 | 支持 v Nd Nj Ns Nz 2 153 155 O 104 | 治疗 v Nd Nj Ns Nz 2 155 157 O 105 | ; x Nd Nj Ns Nz 2 157 158 O 106 | , x Nd Nj Ns Nz 2 158 159 O 107 | 后 f Nd Nj Ns Nz 2 159 160 O 108 | 病理 n Nd Nj Ns Nz 2 160 162 O 109 | 示 v Nd Nj Ns Nz 2 162 163 O 110 | : x Nd Nj Ns Nz 2 163 164 O 111 | 胃底 n Nd Nj Ns Nz 2 164 166 B-解剖部位 112 | 、 x Nd Nj Ns Nz 2 166 167 I-解剖部位 113 | 体小 n Nd Nj Ns Nz 2 167 169 I-解剖部位 114 | 弯 v Nd Nj ss Nz 2 169 170 I-解剖部位 115 | 侧 v Nd Nj Ns Nz 2 170 171 I-解剖部位 116 | 低 a Nd Nj Ns Nz 2 171 172 O 117 | 分化腺癌 n Nd Nj Ns Nz 2 172 176 O 118 | , x Nd Nj Ns Nz 2 176 177 O 119 | 部分 n Nd Nj ss Nz 2 177 179 O 120 | 为 p Nd Nj ss Nz 2 179 180 O 121 | 印戒 n Nd Nj Ns Nz 2 180 182 O 122 | 细胞 n Nd Nj Ns Nz 2 182 184 O 123 | 癌 zg Nd Nj ss Nz 2 184 185 O 124 | 图像 n Nd Nj Ns Nz 2 185 187 O 125 | , x Nd Nj Ns Nz 2 187 188 O 126 | 蕈 g Nd Nj Ns Nz 2 188 189 O 127 | 伞型 b Nd Nj Ns Nz 2 189 191 O 128 | , x Nd Nj Ns Nz 2 191 192 O 129 | 面积 n Nd Nj Ns Nz 2 192 194 O 130 | 5.2 m Nd Nj Ns Nz 2 194 197 O 131 | * x Nd Nj Ns Nz 2 197 198 O 132 | 3.5 m Nd Nj Ns Nz 2 198 201 O 133 | CM eng Nd Nj Ns Nz 2 201 203 O 134 | , x Nd Nj Ns Nz 2 203 204 O 135 | 局部 n Nd Nj Ns Nz 2 204 206 O 136 | 侵达 v Nd Nj Ns Nz 2 206 208 O 137 | 粘膜 n Nd Nj Ns Nz 2 208 210 O 138 | 下层 n Nd Nj Ns Nz 2 210 212 O 139 | , x Nd Nj Ns Nz 2 212 213 O 140 | 并 c Nd Nj Ns Nz 2 213 214 O 141 | 于 p dr Nj Ns Nz 2 214 215 O 142 | 少数 m Nd Nj Ns Nz 2 215 217 O 143 | 腺 n Nd Nj Ns Nz 2 217 218 O 144 | 管内 n Nd Nj Ns Nz 2 218 220 O 145 | 查见 v Nd Nj Ns Nz 2 220 222 O 146 | 癌栓 n Nd Nj ss Nz 2 222 224 O 147 | 。 x Nd Nj Ns Nz 2 224 225 O 148 | 149 | 两端 m Nd Nj Ns Nz 2 225 227 O 150 | 切线 n Nd jp Ns Nz 2 227 229 O 151 | 及 c Nd Nj Ns Nz 2 229 230 O 152 | 另 r Nd Nj Ns Nz 2 230 231 O 153 | 送 v Nd Nj Ns Nz 2 231 232 O 154 | “ x Nd Nj Ns Nz 2 232 233 O 155 | 近 a Nd Nj Ns Nz 2 233 234 O 156 | 端 v Nd Nj Ns Nz 2 234 235 O 157 | 切线 n Nd jp Ns Nz 2 235 237 O 158 | ” x Nd Nj Ns Nz 2 237 238 O 159 | 未查 v Nd Nj Ns Nz 2 238 240 O 160 | 见 v dr 
Nj Ns Nz 2 240 241 O 161 | 癌 n Nd Nj ss Nz 2 241 242 O 162 | 。 x Nd Nj Ns Nz 2 242 243 O 163 | 164 | 呈 v Nd Nj Ns Nz 2 243 244 O 165 | 三组 m Nd Nj Ns Nz 2 244 246 O 166 | ( x Nd Nj Ns Nz 2 246 247 O 167 | 5 x Nd Nj Ns Nz 2 247 248 O 168 | / x Nd Nj Ns Nz 2 248 249 O 169 | 13 m Nd Nj Ns Nz 2 249 251 O 170 | 个 m Nd Nj Ns Nz 2 251 252 O 171 | ) x Nd Nj Ns Nz 2 252 253 O 172 | 淋巴结 n Nd Nj Ns Nz 2 253 256 B-解剖部位 173 | 癌 zg Nd Nj ss Nz 2 256 257 O 174 | 转移 v Nd Nj ss Nz 2 257 259 O 175 | 。 x Nd Nj Ns Nz 2 259 260 O 176 | 177 | 一组 m Nd Nj Ns Nz 2 260 262 O 178 | ( x Nd Nj Ns Nz 2 262 263 O 179 | 7 m Nd Nj Ns Nz 2 263 264 O 180 | 个 m Nd Nj Ns Nz 2 264 265 O 181 | ) x Nd Nj Ns Nz 2 265 266 O 182 | 、 x Nd Nj Ns Nz 2 266 267 O 183 | 四组 m Nd Nj Ns Nz 2 267 269 O 184 | ( x Nd Nj Ns Nz 2 269 270 O 185 | 13 m Nd Nj Ns Nz 2 270 272 O 186 | 个 m Nd Nj Ns Nz 2 272 273 O 187 | ) x Nd Nj Ns Nz 2 273 274 O 188 | 、 x Nd Nj Ns Nz 2 274 275 O 189 | 五组 m Nd Nj Ns Nz 2 275 277 O 190 | ( x Nd Nj Ns Nz 2 277 278 O 191 | 1 m Nd Nj Ns Nz 2 278 279 O 192 | 个 m Nd Nj Ns Nz 2 279 280 O 193 | ) x Nd Nj Ns Nz 2 280 281 O 194 | 、 x Nd Nj Ns Nz 2 281 282 O 195 | 六组 m Nd Nj Ns Nz 2 282 284 O 196 | ( x Nd Nj Ns Nz 2 284 285 O 197 | 4 m Nd Nj Ns Nz 2 285 286 O 198 | 个 m Nd Nj Ns Nz 2 286 287 O 199 | ) x Nd Nj Ns Nz 2 287 288 O 200 | 淋巴结 n Nd Nj Ns Nz 2 288 291 B-解剖部位 201 | 未查 v Nd Nj Ns Nz 2 291 293 O 202 | 见 v dr Nj Ns Nz 2 293 294 O 203 | 癌 n Nd Nj ss Nz 2 294 295 O 204 | 。 x Nd Nj Ns Nz 2 295 296 O 205 | 206 | , x Nd Nj Ns Nz 2 296 297 O 207 | 癌 zg Nd Nj ss Nz 2 297 298 O 208 | 组织 v Nd Nj Ns Nz 2 298 300 O 209 | 免疫组化 n Nd Nj Ns Nz 2 300 304 O 210 | 染色 n Nd Nj Ns Nz 2 304 306 O 211 | 示 v Nd Nj Ns Nz 2 306 307 O 212 | : x Nd Nj Ns Nz 2 307 308 O 213 | ERCC1 eng Nd Nj Ns Nz 2 308 313 O 214 | ( x Nd Nj Ns Nz 2 313 314 O 215 | + x Nd Nj Ns Nz 2 314 315 O 216 | ) x Nd Nj Ns Nz 2 315 316 O 217 | 、 x Nd Nj Ns Nz 2 316 317 O 218 | β x Nd Nj Ns Nz 2 317 318 O 219 | - x Nd Nj Ns Nz 2 318 319 O 220 | TUBULIN eng Nd 
Nj Ns Nz 2 319 326 O 221 | - x Nd Nj Ns Nz 2 326 327 O 222 | III eng Nd Nj Ns Nz 2 327 330 O 223 | ( x Nd Nj Ns Nz 2 330 331 O 224 | + x Nd Nj Ns Nz 2 331 332 O 225 | ) x Nd Nj Ns Nz 2 332 333 O 226 | 、 x Nd Nj Ns Nz 2 333 334 O 227 | TS eng Nd Nj Ns Nz 2 334 336 O 228 | ( x Nd Nj Ns Nz 2 336 337 O 229 | - x Nd Nj Ns Nz 2 337 338 O 230 | ) x Nd Nj Ns Nz 2 338 339 O 231 | 、 x Nd Nj Ns Nz 2 339 340 O 232 | RRM1 eng Nd Nj Ns Nz 2 340 344 O 233 | ( x Nd Nj Ns Nz 2 344 345 O 234 | - x Nd Nj Ns Nz 2 345 346 O 235 | ) x Nd Nj Ns Nz 2 346 347 O 236 | 、 x Nd Nj Ns Nz 2 347 348 O 237 | TOPOII eng Nd Nj Ns Nz 2 348 354 O 238 | 阳性细胞 n Nd Nj Ns Nz 2 354 358 O 239 | 数约 n Nd Nj Ns Nz 2 358 360 O 240 | 20 m Nd Nj Ns Nz 2 360 362 O 241 | % x Nd Nj Ns Nz 2 362 363 O 242 | 、 x Nd Nj Ns Nz 2 363 364 O 243 | CERBB eng Nd Nj Ns Nz 2 364 369 O 244 | - x Nd Nj Ns Nz 2 369 370 O 245 | 2 x Nd Nj Ns Nz 2 370 371 O 246 | ( x Nd Nj Ns Nz 2 371 372 O 247 | 2 m Nd Nj Ns Nz 2 372 373 O 248 | + x Nd Nj Ns Nz 2 373 374 O 249 | ) x Nd Nj Ns Nz 2 374 375 O 250 | 251 | 。 x Nd Nj Ns Nz 2 376 377 O 252 | 253 | 依据 p Nd Nj Ns Nz 2 377 379 O 254 | 患者 n Nd Nj Ns Nz 2 379 381 O 255 | 病情 n Nd Nj Ns Nz 2 381 383 O 256 | 及 c Nd Nj Ns Nz 2 383 384 O 257 | 肿瘤 n Nd Nj Ns Nz 2 384 386 O 258 | 病理 n Nd Nj Ns Nz 2 386 388 O 259 | 与 p Nd Nj Ns Nz 2 388 389 O 260 | 分期 vn Nd Nj Ns Nz 2 389 391 O 261 | 继续 v Nd Nj Ns Nz 2 391 393 O 262 | 术后 t Nd Nj ss Nz 2 393 395 O 263 | 辅助性 n Nd Nj Ns Nz 2 395 398 O 264 | 化疗 n Nd Nj Ns Nz 2 398 400 O 265 | 指征 n Nd Nj Ns Nz 2 400 402 O 266 | 存在 v Nd Nj Ns Nz 2 402 404 O 267 | , x Nd Nj Ns Nz 2 404 405 O 268 | 患者 n Nd Nj Ns Nz 2 405 407 O 269 | 及 c Nd Nj Ns Nz 2 407 408 O 270 | 家属 n Nd Nj Ns Nz 2 408 410 O 271 | 拒绝 v Nd Nj Ns Nz 2 410 412 O 272 | 化疗 n Nd Nj Ns Nz 2 412 414 O 273 | 。 x Nd Nj Ns Nz 2 414 415 O 274 | 275 | 自 p Nd jp Ns Nz 2 415 416 O 276 | 术后 t Nd Nj ss Nz 2 416 418 O 277 | 出院 n Nd Nj Ns Nz 2 418 420 O 278 | 以来 f Nd Nj Ns Nz 2 420 422 O 279 | , x Nd Nj Ns Nz 2 422 423 O 280 | 
患者 n Nd Nj Ns Nz 2 423 425 O 281 | 一般 a Nd Nj Ns Nz 2 425 427 O 282 | 情况 n Nd Nj Ns Nz 2 427 429 O 283 | 保持良好 n Nd Nj Ns Nz 2 429 433 O 284 | ; x Nd Nj Ns Nz 2 433 434 O 285 | 无 v Nd Nj Ns Nz 2 434 435 O 286 | 发热 v Nd Nj Ns Nz 2 435 437 B-独立症状 287 | , x Nd Nj Ns Nz 2 437 438 O 288 | 偶 d dr Nj Ns Nz 2 438 439 O 289 | 有 v dr Nj Ns Nz 2 439 440 O 290 | 恶心 n Nd Nj Ns Nz 2 440 442 B-独立症状 291 | , x Nd Nj Ns Nz 2 442 443 O 292 | 无 v Nd Nj Ns Nz 2 443 444 O 293 | 呕吐 v Nd Nj Ns Nz 2 444 446 B-独立症状 294 | , x Nd Nj Ns Nz 2 446 447 O 295 | 无 v Nd Nj Ns Nz 2 447 448 O 296 | 反酸 n Nd Nj Ns Nz 2 448 450 B-独立症状 297 | 、 x Nd Nj Ns Nz 2 450 451 O 298 | 嗳气 n Nd Nj Ns Nz 2 451 453 B-独立症状 299 | , x Nd Nj Ns Nz 2 453 454 O 300 | 无 v Nd Nj Ns Nz 2 454 455 O 301 | 明显 a Nd Nj Ns zz 2 455 457 O 302 | 进食 v dr Nj Ns Nz 2 457 459 B-独立症状 303 | 不适 a Nd Nj Ns zz 2 459 461 I-独立症状 304 | , x Nd Nj Ns Nz 2 461 462 O 305 | 偶 d dr Nj Ns Nz 2 462 463 O 306 | 有 v dr Nj Ns Nz 2 463 464 O 307 | 进食 v dr Nj Ns Nz 2 464 466 O 308 | 后 f Nd Nj Ns Nz 2 466 467 O 309 | 轻微 d Nd Nj Ns Nz 2 467 469 O 310 | 腹胀 v Nd Nj Ns Nz 2 469 471 O 311 | , x Nd Nj Ns Nz 2 471 472 O 312 | 无 v Nd Nj Ns Nz 2 472 473 O 313 | 腹痛 n Nd Nj Ns Nz 2 473 475 O 314 | 。 x Nd Nj Ns Nz 2 475 476 O 315 | 316 | 现 tg Nd Nj Ns Nz 2 476 477 O 317 | 患者 n Nd Nj Ns Nz 2 477 479 O 318 | 为 p Nd Nj ss Nz 2 479 480 O 319 | 行 n Nd Nj ss Nz 2 480 481 O 320 | 进一步 d Nd Nj Ns Nz 2 481 484 O 321 | 复查 vn Nd Nj Ns Nz 2 484 486 O 322 | 并 c Nd Nj Ns Nz 2 486 487 O 323 | 必要 d dr Nj Ns Nz 2 487 489 O 324 | 时 n Nd Nj Ns Nz 2 489 490 O 325 | 适当 a Nd Nj Ns Nz 2 490 492 O 326 | 处理 v Nd Nj Ns Nz 2 492 494 O 327 | 而 c dr Nj Ns Nz 2 494 495 O 328 | 再 d Nd Nj Ns Nz 2 495 496 O 329 | 来 v dr Nj Ns Nz 2 496 497 O 330 | 我院 n Nd Nj Ns Nz 2 497 499 O 331 | 就诊 v Nd Nj Ns Nz 2 499 501 O 332 | , x Nd Nj Ns Nz 2 501 502 O 333 | 门诊 n Nd Nj Ns Nz 2 502 504 O 334 | 依情 n Nd Nj Ns Nz 2 504 506 O 335 | 以 p Nd Nj Ns Nz 2 506 507 O 336 | “ x Nd Nj Ns Nz 2 507 508 O 337 | 胃 n Nd Nj Ns Nz 2 508 
509 B-解剖部位 338 | 恶性肿瘤 l Nd Nj ss Nz 2 509 513 O 339 | 术后 t Nd Nj ss Nz 2 513 515 O 340 | ” x Nd Nj Ns Nz 2 515 516 O 341 | 收入 v Nd Nj Ns Nz 2 516 518 O 342 | 院 n Nd Nj Ns Nz 2 518 519 O 343 | 。 x Nd Nj Ns Nz 2 519 520 O 344 | 345 | 目前 t Nd Nj Ns Nz 2 520 522 O 346 | 患者 n Nd Nj Ns Nz 2 522 524 O 347 | 精神 n Nd Nj Ns Nz 2 524 526 O 348 | 及 c Nd Nj Ns Nz 2 526 527 O 349 | 情绪 n Nd Nj Ns Nz 2 527 529 O 350 | 状态 n Nd Nj Ns Nz 2 529 531 O 351 | 良好 a Nd Nj Ns Nz 2 531 533 O 352 | , x Nd Nj Ns Nz 2 533 534 O 353 | 食欲 n Nd Nj Ns Nz 2 534 536 B-独立症状 354 | 较 d dr Nj Ns Nz 2 536 537 I-独立症状 355 | 术 v Nd Nj Ns Nz 2 537 538 I-独立症状 356 | 前 f Nd Nj Ns Nz 2 538 539 I-独立症状 357 | 明显 a Nd Nj Ns zz 2 539 541 I-独立症状 358 | 减少 v Nd Nj ss Nz 2 541 543 I-独立症状 359 | , x Nd Nj Ns Nz 2 543 544 O 360 | 饮食 n Nd Nj Ns Nz 2 544 546 O 361 | 可 v Nd Nj Ns Nz 2 546 547 O 362 | , x Nd Nj Ns Nz 2 547 548 O 363 | 夜间 t Nd Nj Ns Nz 2 548 550 O 364 | 睡眠 v Nd Nj ss Nz 2 550 552 O 365 | 后 f Nd Nj Ns Nz 2 552 553 O 366 | ; x Nd Nj Ns Nz 2 553 554 O 367 | 今 t dr Nj Ns Nz 2 554 555 O 368 | 8 m Nd Nj Ns Nz 2 555 556 O 369 | 个 m Nd Nj Ns Nz 2 556 557 O 370 | 月 m Nd Nj Ns Nz 2 557 558 O 371 | 体重减轻 i Nd Nj Ns Nz 2 558 562 B-独立症状 372 | 18 m Nd Nj Ns Nz 2 562 564 O 373 | KG eng Nd Nj Ns Nz 2 564 566 O 374 | 。 x Nd Nj Ns Nz 2 566 567 O 375 | 376 | 377 | 378 | -------------------------------------------------------------------------------- /Wapiti/src/model.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Wapiti - A linear-chain CRF tool 3 | * 4 | * Copyright (c) 2009-2013 CNRS 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 
11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | * POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "wapiti.h" 36 | #include "model.h" 37 | #include "options.h" 38 | #include "quark.h" 39 | #include "reader.h" 40 | #include "tools.h" 41 | #include "vmath.h" 42 | 43 | /******************************************************************************* 44 | * Linear chain CRF model 45 | * 46 | * There is three concept that must be well understand here, the labels, 47 | * observations, and features. The labels are the values predicted by the 48 | * model at each point of the sequence and denoted by Y. The observations are 49 | * the values, at each point of the sequence, given to the model in order to 50 | * predict the label and denoted by O. A feature is a test on both labels and 51 | * observations, denoted by F. 
In linear chain CRF there is two kinds of 52 | * features : 53 | * - unigram feature who represent a test on the observations at the current 54 | * point and the label at current point. 55 | * - bigram feature who represent a test on the observation at the current 56 | * point and two labels : the current one and the previous one. 57 | * So for each observation, there Y possible unigram features and Y*Y possible 58 | * bigram features. The kind of features used by the model for a given 59 | * observation depend on the pattern who generated it. 60 | ******************************************************************************/ 61 | 62 | /* mdl_new: 63 | * Allocate a new empty model object linked with the given reader. The model 64 | * have to be synchronized before starting training or labelling. If you not 65 | * provide a reader (as it will loaded from file for example) you must be sure 66 | * to set one in the model before any attempts to synchronize it. 67 | */ 68 | mdl_t *mdl_new(rdr_t *rdr) { 69 | mdl_t *mdl = xmalloc(sizeof(mdl_t)); 70 | mdl->nlbl = mdl->nobs = mdl->nftr = 0; 71 | mdl->kind = NULL; 72 | mdl->uoff = mdl->boff = NULL; 73 | mdl->theta = NULL; 74 | mdl->train = mdl->devel = NULL; 75 | mdl->reader = rdr; 76 | mdl->werr = NULL; 77 | mdl->total = 0.0; 78 | return mdl; 79 | } 80 | 81 | /* mdl_free: 82 | * Free all memory used by a model object inculding the reader and datasets 83 | * loaded in the model. 84 | */ 85 | void mdl_free(mdl_t *mdl) { 86 | free(mdl->kind); 87 | free(mdl->uoff); 88 | free(mdl->boff); 89 | if (mdl->theta != NULL) 90 | xvm_free(mdl->theta); 91 | if (mdl->train != NULL) 92 | rdr_freedat(mdl->train); 93 | if (mdl->devel != NULL) 94 | rdr_freedat(mdl->devel); 95 | if (mdl->reader != NULL) 96 | rdr_free(mdl->reader); 97 | if (mdl->werr != NULL) 98 | free(mdl->werr); 99 | free(mdl); 100 | } 101 | 102 | /* mdl_sync: 103 | * Synchronize the model with its reader. 
As the model is just a placeholder 104 | * for features weights and interned sequences, it know very few about the 105 | * labels and observations, all the informations are kept in the reader. A 106 | * sync will get the labels and observations count as well as the observation 107 | * kind from the reader and build internal structures representing the model. 108 | * 109 | * If the model was already synchronized before, there is an existing model 110 | * incompatible with the new one to be created. In this case there is two 111 | * possibility : 112 | * - If only new observations was added, the weights of the old ones remain 113 | * valid and are kept as they form a probably good starting point for 114 | * training the new model, the new observation get a 0 weight ; 115 | * - If new labels was added, the old model are trully meaningless so we 116 | * have to fully discard them and build a new empty model. 117 | * In any case, you must never change existing labels or observations, if this 118 | * happen, you need to create a new model and destroy this one. 119 | * 120 | * After synchronization, the labels and observations databases are locked to 121 | * prevent new one to be created. You must unlock them explicitly if needed. 122 | * This reduce the risk of mistakes. 123 | */ 124 | void mdl_sync(mdl_t *mdl) { 125 | const uint32_t Y = qrk_count(mdl->reader->lbl); 126 | const uint64_t O = qrk_count(mdl->reader->obs); 127 | // If model is already synchronized, do nothing and just return 128 | if (mdl->nlbl == Y && mdl->nobs == O) 129 | return; 130 | if (Y == 0 || O == 0) 131 | fatal("cannot synchronize an empty model"); 132 | // If new labels was added, we have to discard all the model. In this 133 | // case we also display a warning as this is probably not expected by 134 | // the user. If only new observations was added, we will try to expand 135 | // the model. 
136 | uint64_t oldF = mdl->nftr; 137 | uint64_t oldO = mdl->nobs; 138 | if (mdl->nlbl != Y && mdl->nlbl != 0) { 139 | warning("labels count changed, discarding the model"); 140 | free(mdl->kind); mdl->kind = NULL; 141 | free(mdl->uoff); mdl->uoff = NULL; 142 | free(mdl->boff); mdl->boff = NULL; 143 | if (mdl->theta != NULL) { 144 | xvm_free(mdl->theta); 145 | mdl->theta = NULL; 146 | } 147 | oldF = oldO = 0; 148 | } 149 | mdl->nlbl = Y; 150 | mdl->nobs = O; 151 | // Allocate the observations datastructure. If the model is empty or 152 | // discarded, a new one iscreated, else the old one is expanded. 153 | char *kind = xrealloc(mdl->kind, sizeof(char ) * O); 154 | uint64_t *uoff = xrealloc(mdl->uoff, sizeof(uint64_t) * O); 155 | uint64_t *boff = xrealloc(mdl->boff, sizeof(uint64_t) * O); 156 | mdl->kind = kind; 157 | mdl->uoff = uoff; 158 | mdl->boff = boff; 159 | // Now, we can setup the features. For each new observations we fill the 160 | // kind and offsets arrays and count total number of features as well. 161 | uint64_t F = oldF; 162 | for (uint64_t o = oldO; o < O; o++) { 163 | const char *obs = qrk_id2str(mdl->reader->obs, o); 164 | switch (obs[0]) { 165 | case 'u': kind[o] = 1; break; 166 | case 'b': kind[o] = 2; break; 167 | case '*': kind[o] = 3; break; 168 | } 169 | if (kind[o] & 1) 170 | uoff[o] = F, F += Y; 171 | if (kind[o] & 2) 172 | boff[o] = F, F += Y * Y; 173 | } 174 | mdl->nftr = F; 175 | // We can finally grow the features weights vector itself. We set all 176 | // the new features to 0.0 but don't touch the old ones. 177 | // This is a bit tricky as aligned malloc cannot be simply grown so we 178 | // have to allocate a new vector and copy old values ourself. 
179 | if (oldF != 0) { 180 | double *new = xvm_new(F); 181 | for (uint64_t f = 0; f < oldF; f++) 182 | new[f] = mdl->theta[f]; 183 | xvm_free(mdl->theta); 184 | mdl->theta = new; 185 | } else { 186 | mdl->theta = xvm_new(F); 187 | } 188 | for (uint64_t f = oldF; f < F; f++) 189 | mdl->theta[f] = 0.0; 190 | // And lock the databases 191 | qrk_lock(mdl->reader->lbl, true); 192 | qrk_lock(mdl->reader->obs, true); 193 | } 194 | 195 | /* mdl_compact: 196 | * Comapct the given model by removing from it all observation who lead to 197 | * zero actives features. On model trained with l1 regularization this can 198 | * lead to a drastic model size reduction and so to faster loading, training 199 | * and labeling. 200 | */ 201 | void mdl_compact(mdl_t *mdl) { 202 | const uint32_t Y = mdl->nlbl; 203 | // We first build the new observation list with only observations which 204 | // lead to at least one active feature. At the same time we build the 205 | // translation table which map the new observations index to the old 206 | // ones. 207 | info(" - Scan the model\n"); 208 | qrk_t *old_obs = mdl->reader->obs; 209 | qrk_t *new_obs = qrk_new(); 210 | uint64_t *trans = xmalloc(sizeof(uint64_t) * mdl->nobs); 211 | for (uint64_t oldo = 0; oldo < mdl->nobs; oldo++) { 212 | bool active = false; 213 | if (mdl->kind[oldo] & 1) 214 | for (uint32_t y = 0; y < Y; y++) 215 | if (mdl->theta[mdl->uoff[oldo] + y] != 0.0) 216 | active = true; 217 | if (mdl->kind[oldo] & 2) 218 | for (uint32_t d = 0; d < Y * Y; d++) 219 | if (mdl->theta[mdl->boff[oldo] + d] != 0.0) 220 | active = true; 221 | if (!active) 222 | continue; 223 | const char *str = qrk_id2str(old_obs, oldo); 224 | const uint64_t newo = qrk_str2id(new_obs, str); 225 | trans[newo] = oldo; 226 | } 227 | mdl->reader->obs = new_obs; 228 | // Now we save the old model features informations and build a new one 229 | // corresponding to the compacted model. 
230 | uint64_t *old_uoff = mdl->uoff; mdl->uoff = NULL; 231 | uint64_t *old_boff = mdl->boff; mdl->boff = NULL; 232 | double *old_theta = mdl->theta; mdl->theta = NULL; 233 | free(mdl->kind); 234 | mdl->kind = NULL; 235 | mdl->nlbl = mdl->nobs = mdl->nftr = 0; 236 | mdl_sync(mdl); 237 | // The model is now ready, so we copy in it the features weights from 238 | // the old model for observations we have kept. 239 | info(" - Compact it\n"); 240 | for (uint64_t newo = 0; newo < mdl->nobs; newo++) { 241 | const uint64_t oldo = trans[newo]; 242 | if (mdl->kind[newo] & 1) { 243 | double *src = old_theta + old_uoff[oldo]; 244 | double *dst = mdl->theta + mdl->uoff[newo]; 245 | for (uint32_t y = 0; y < Y; y++) 246 | dst[y] = src[y]; 247 | } 248 | if (mdl->kind[newo] & 2) { 249 | double *src = old_theta + old_boff[oldo]; 250 | double *dst = mdl->theta + mdl->boff[newo]; 251 | for (uint32_t d = 0; d < Y * Y; d++) 252 | dst[d] = src[d]; 253 | } 254 | } 255 | // And cleanup 256 | free(trans); 257 | qrk_free(old_obs); 258 | free(old_uoff); 259 | free(old_boff); 260 | xvm_free(old_theta); 261 | } 262 | 263 | /* mdl_save: 264 | * Save a model to be restored later in a platform independant way. 265 | */ 266 | void mdl_save(mdl_t *mdl, FILE *file) { 267 | uint64_t nact = 0; 268 | for (uint64_t f = 0; f < mdl->nftr; f++) 269 | if (mdl->theta[f] != 0.0) 270 | nact++; 271 | fprintf(file, "#mdl#%d#%"PRIu64"\n", mdl->type, nact); 272 | rdr_save(mdl->reader, file); 273 | for (uint64_t f = 0; f < mdl->nftr; f++) 274 | if (mdl->theta[f] != 0.0) 275 | fprintf(file, "%"PRIu64"=%la\n", f, mdl->theta[f]); 276 | } 277 | 278 | /* mdl_load: 279 | * Read back a previously saved model to continue training or start labeling. 280 | * The returned model is synced and the quarks are locked. You must give to 281 | * this function an empty model fresh from mdl_new. 
282 | */ 283 | void mdl_load(mdl_t *mdl, FILE *file) { 284 | const char *err = "invalid model format"; 285 | uint64_t nact = 0; 286 | int type; 287 | if (fscanf(file, "#mdl#%d#%"SCNu64"\n", &type, &nact) == 2) { 288 | mdl->type = type; 289 | } else { 290 | rewind(file); 291 | if (fscanf(file, "#mdl#%"SCNu64"\n", &nact) == 1) 292 | mdl->type = 0; 293 | else 294 | fatal(err); 295 | } 296 | rdr_load(mdl->reader, file); 297 | mdl_sync(mdl); 298 | for (uint64_t i = 0; i < nact; i++) { 299 | uint64_t f; 300 | double v; 301 | if (fscanf(file, "%"SCNu64"=%la\n", &f, &v) != 2) 302 | fatal(err); 303 | mdl->theta[f] = v; 304 | } 305 | } 306 | 307 | --------------------------------------------------------------------------------