├── .gitignore ├── LICENSE ├── README.md ├── conf ├── current.cancer.properties ├── current.chv.wiki.properties ├── current.chv_paper_022117.properties ├── current.deaf.properties ├── current.init_index.properties ├── current.knn.properties ├── current.socialqa.botanical.properties ├── default.cancer.properties ├── default.cancer_chv.properties ├── default.cancer_chv_ranking.properties ├── default.properties ├── log4j.properties ├── pattern_cancer.txt ├── pattern_cancer_duration.txt └── solrConf │ ├── schema.xml │ └── solrconfig.xml ├── data ├── PennTreebankP.O.S.Tags.html ├── SemGroups.txt ├── SemanticTypes_2013AA.txt ├── clinical_text.txt ├── en-chunker.bin ├── en-parser-chunking.bin ├── en-pos-maxent.bin ├── en-sent.bin ├── en-token.bin ├── pos-transformation.csv ├── prefix.txt ├── stopwords-FOR-clustering.txt ├── stopwords-empty.txt ├── stopwords.txt ├── stopwords_clustering.txt ├── stopwords_sent.txt ├── sty_pairs_preference_extended_STs.txt ├── suffix.txt └── test.text.txt ├── docs ├── dependency-package.jpg └── figurs │ ├── conceptual.png │ ├── cui_duration_heatmap3.png │ ├── evaluation_simiterm.png │ ├── figure8_human_review.png │ ├── sty_distribution.png │ └── work-flow.png ├── libs ├── bin │ └── winutils.exe ├── metamap-api-2.0.jar ├── prologbeans.jar ├── stanford-corenlp.jar └── stanfordNlp-models-url.txt ├── pom.xml ├── py ├── get_ct.py ├── ner200align.py ├── pre_run.py ├── preprocess_index.py └── xsl2csv.py ├── r ├── .Rhistory ├── README.md ├── RVisualisation.pdf ├── classify-rpart.R ├── cross-evaluation-200to300.R ├── cross-evaluation-bow.R ├── cross-evaluation.R ├── data │ ├── cross-evaluate-revice3.txt │ ├── cross-evaluate.txt │ ├── cross-evaluation-bow.txt │ ├── cross-evaluation-tf100-200to300.txt │ ├── cross-evaluation-tf100-cancer.txt │ ├── cross-evaluation-tf100.txt │ ├── cross-evaluation-tf5.txt │ ├── evaluation.txt │ ├── human_review.txt │ ├── ngram_yahoo_tf5.txt │ ├── pca.txt │ ├── result_cancer_1101.txt │ ├── result_cancer_rank.txt │ └── rpart.summary ├── ngram-distribution.R ├── pattern-heatmap.R ├── pca-draw.R ├── review-order-ranking.R ├── review_result.R └── silhouette.R ├── solr_Configuration.md ├── sql-script ├── 0923-test.sql ├── 1007.sql ├── 1018.sql ├── cancerqa_chv.sql ├── chv.sql ├── cluster.sql ├── criteria.sql ├── data_process.docx ├── data_process_0922.docx ├── deaf.sql ├── import_0919.sql ├── import_0922.sql ├── import_0924.sql ├── import_1004.sql ├── import_tag.sql ├── import_tag_0915.sql ├── import_tag_0916.sql ├── linux-test.sql ├── minsook.sql ├── minsook_1023.sql ├── minsook_1103.sql ├── minsook_1229.sql ├── ner200.sql ├── pattern.sql ├── pattern_all.sql ├── pattern_all_disease.sql ├── pattern_bibm2016.sql ├── pattern_diabetes.sql ├── pattern_sty_prefer.sql ├── ret-yahoo.sql ├── sent_1213.sq..sql ├── smb.sql ├── socialqa.sql ├── somelab-sctGraph.sql ├── synonym.sql ├── tumblr.sql ├── umls.sql ├── usuk.sql ├── wiki.sql ├── yahoo.sql └── ytex.sql ├── src ├── main │ ├── java │ │ ├── StanfordCoreNlpDemo.java │ │ └── com │ │ │ └── votors │ │ │ └── umls │ │ │ └── graph │ │ │ ├── HelloJGraphT.java │ │ │ ├── IsaEdge.java │ │ │ ├── SctGraph.java │ │ │ ├── TestJava.java │ │ │ └── UmlsVertex.java │ └── scala │ │ ├── com │ │ └── votors │ │ │ ├── Test.scala │ │ │ ├── TokenizerDemo.scala │ │ │ ├── common │ │ │ └── Utils.scala │ │ │ ├── ml │ │ │ ├── Clustering.scala │ │ │ ├── KNN.scala │ │ │ ├── Ngram.scala │ │ │ ├── Nlp.scala │ │ │ ├── StanfordNLP.scala │ │ │ └── Word2vec.scala │ │ │ └── umls │ │ │ ├── AnalyzeCT.scala │ │ │ ├── MMApi.scala │ │ │ ├── 
SemanticType.scala │ │ │ ├── TermIdentify.scala │ │ │ └── UmlsTagger2.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── mllib │ │ └── clustering │ │ └── MyKmean.scala └── test │ └── com │ └── votors │ └── umls │ └── UmlsTagger2Test.scala └── term_identification.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .idea/ 3 | target/ 4 | *dat.txt 5 | *.bak 6 | *.bk 7 | *.csv 8 | lvg2015/ 9 | lvg2015data/ 10 | VITTA_EC2/.git 11 | VATEC/~/eb-virt 12 | conf/current.properties 13 | r/.Rhistory -------------------------------------------------------------------------------- /conf/current.cancer.properties: -------------------------------------------------------------------------------- 1 | 2 | # configuration for cancer patterns 3 | 4 | # ##################################################################### 5 | # ############# UMLS term matching configuration ###################### 6 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 7 | # # for now, the table mrstr is necessary 8 | # jdbcDriver=jdbc:mysql://localhost:3306/umls?user=root&password=root 9 | # # URL of the Solr server used to match UMLS terms. Solr is not used by default 10 | # solrServerUrl=http://localhost:8983/solr 11 | # 12 | # # caseFactor is a value in [0, 1]. It indicates how much the case matters, and it affects the similarity score 13 | # # when you select a term from Solr. Value 0 means uppercase and lowercase are treated as completely different, and 14 | # # value 1 means case is ignored entirely. 15 | # caseFactor=0.8 16 | # 17 | # # not used for now 18 | # # Should we take the newline as the end of a sentence, or just ignore the newline? 19 | # # 1: replace with space; 2: replace with '.'; 0: do nothing 20 | # ignoreNewLine=2 21 | # 22 | # ####################################################################### 23 | # ########## data source fetching configuration ######################### 24 | # # how to get the text for Ngram extraction; blogId is selected as distinct, and blogTextCol is limited to 1 row. 25 | # blogDbUrl=jdbc:mysql://localhost:3306/ytex?user=root&password=root 26 | # blogTbl=tmp_org_yahoo 27 | # #blogTbl=content_org_new 28 | # blogIdCol=id 29 | # #blogIdCol=blogId 30 | # blogTextCol=concat(subject, ". ", content, ". ", chosenanswer) 31 | # #blogTextCol=text_content 32 | # 33 | # # limit the number of blogs to be analyzed, mainly for testing 34 | # blogLimit=200 35 | # 36 | # # target term info in database 37 | # targetTermTbl=_target_term_ 38 | # targetTermTblDropAndCreate=true 39 | # # if true, use Solr for matching an ngram with target terms; else use a database query for matching 40 | # targetTermUsingSolr=True 41 | # 42 | # ####################################################################### 43 | # ################### NLP-related configuration ######################### 44 | # # root dir of lvg 45 | # lvgdir=C:\\fsu\\ra\\UmlsTagger\\lvg2015\\ 46 | useStanfordNLP=true 47 | # stanfordTokenizerOption= 48 | stanfordTaggerOption=model=edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger 49 | stanfordPatternFile=/data/ra/pattern/pattern_cancer_duration.txt 50 | # # use the dependency tree to find terms before using the syntactic tree. 51 | useDependencyTree=true 52 | partUmlsTermMatch=false 53 | 
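useDependencyTree above means candidate terms are first matched against patterns over the Stanford dependency parse before the syntactic tree is consulted. A minimal sketch of producing those dependency edges with the CoreNLP jar in libs/, using the same annotator set the conf files configure; the pattern syntax of pattern_cancer_duration.txt is the repo's own and is not reproduced here:

```scala
import java.util.Properties
import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation
import scala.collection.JavaConverters._

object DepTreeSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, depparse")
    val pipeline = new StanfordCoreNLP(props)
    val doc = new Annotation("The patient had breast cancer for two years.")
    pipeline.annotate(doc)
    for (sent <- doc.get(classOf[SentencesAnnotation]).asScala) {
      val graph = sent.get(classOf[CollapsedCCProcessedDependenciesAnnotation])
      // each typed dependency edge is what a pattern-file rule would be matched against
      graph.edgeIterable().asScala.foreach { e =>
        println(s"${e.getRelation}(${e.getGovernor.word}, ${e.getDependent.word})")
      }
    }
  }
}
```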
54 | # # for pattern parsing 55 | # analyzNonUmlsTerm = true 56 | # # the maximum length of a sentence (characters, not words) 57 | sentenceLenMax=500 58 | 59 | # # POS-tag inclusion filter. The Ngram (basic terms) has to contain at least one of these POS tags; this is also the definition of 'noun' in this tool. No filter if empty 60 | # #posInclusive=NN NNS NNP NNPS 61 | posInclusive= 62 | # # 0 - 100. if the similarity score for an ngram is greater than this threshold, the ngram will be considered a UMLS term 63 | umlsLikehoodLimit=80 64 | # # the window length used to fetch the context of an ngram 65 | # WinLen=10 66 | # 67 | # # used to force-delimit grams. Delimiter = Pattern.compile("[,;/\\:\\(\\)\\[\\]\\{\\}\"]+") 68 | # delimiter =[,;/\\:\\(\\)\\[\\]\\{\\}\"]+ 69 | # 70 | # # how does an ngram match the stop word list? 0: exact match; 1: ngram contains any stop word; 2: ngram starts or ends with any stop word; others: no filter 71 | # stopwordMatchType=2 72 | # # besides the stop word file, you can specify a regex that defines what a stop word is. 73 | # # exclude grams that start or end with a digit. (remove the matched item) 74 | # # exclude words that only start or end with one letter 75 | # # stopwordRegex=^\\d+.*|.*\\d$|^\\S(\\s.*|$)|(^|.*\\s)\\S ----- for clustering 76 | # # for clinical trials patterns. 77 | # stopwordRegex=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 78 | # # POS-tag filter (remove the matched item). 1: no noun; 2: ^N+P+N 3: not ending with N 79 | # #posFilterRegex=[^N]* [^N]*PN .*[^N]$ 80 | # posFilterRegex=[^N]* 81 | # # a regex to check whether a string as a whole (maybe several words) should be queried for a CUI or not 82 | # # (different from the stopword check, since the stopword check examines every word in a string) 83 | # # 1. no a-z; 2. starts or ends with a word containing no a-z 84 | # cuiStringFilterRegex=[^a-zA-Z]*|^[^a-zA-Z]+\\s.*|.*\\s[^a-zA-Z]+$ 85 | # 86 | # 87 | # # the top semantic types we use as features; only for the 'getUmlsScore' function, not 'select' 88 | # # for chv paper 89 | # #semanticType=T033,T121,T061,T047,T109,T023,T184,T074,T116,T123,T059,T046 90 | # # for clinical trials pattern paper 91 | semanticType=T200,T020,T190,T049,T019,T047,T050,T037,T048,T191,T046,T184,T060,T065,T058,T059,T063,T062,T061 92 | # # all semantic types sorted from largest to smallest in size 93 | # #semanticType=T204,T007,T200,T061,T109,T002,T121,T116,T033,T004,T201,T023,T028,T123,T047,T074,T037,T060,T126,T013,T129,T044,T170,T191,T029,T059,T043,T005,T012,T114,T015,T130,T058,T014,T030,T046,T081,T011,T019,T026,T131,T167,T097,T197,T024,T195,T025,T192,T073,T034,T040,T122,T203,T083,T042,T082,T045,T048,T184,T080,T169,T194,T168,T078,T079,T125,T098,T020,T039,T190,T093,T031,T196,T049,T067,T038,T127,T062,T171,T185,T041,T091,T032,T018,T054,T055,T070,T057,T077,T065,T090,T068,T089,T064,T022,T056,T092,T104,T052,T099,T063,T086,T101,T120,T087,T051,T017,T102,T066,T001,T008,T016,T100,T075,T050,T069,T096,T095,T053,T072,T094,T010,T103,T071,T085,T021,T088 94 | # # filter the semantic types by a regular expression. tag extraction function. 95 | # #sabFilter=SNOMEDCT_US|NCI|GO 96 | # sabFilter=.* 97 | 
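The caseFactor comment in current.cancer.properties above describes blending case-sensitive and case-insensitive matching into one similarity score. A minimal sketch of one such blend; the linear mix and the Levenshtein-based similarity are illustrative assumptions, not the actual scoring in UmlsTagger2.scala:

```scala
object CaseAwareScore {
  // Levenshtein distance, the usual dynamic-programming formulation
  private def lev(a: String, b: String): Int = {
    val d = Array.tabulate(a.length + 1, b.length + 1) { (i, j) =>
      if (i == 0) j else if (j == 0) i else 0
    }
    for (i <- 1 to a.length; j <- 1 to b.length) {
      val cost = if (a(i - 1) == b(j - 1)) 0 else 1
      d(i)(j) = math.min(math.min(d(i - 1)(j) + 1, d(i)(j - 1) + 1), d(i - 1)(j - 1) + cost)
    }
    d(a.length)(b.length)
  }

  private def sim(a: String, b: String): Double =
    1.0 - lev(a, b).toDouble / math.max(a.length, b.length).max(1)

  // caseFactor = 0: fully case-sensitive; caseFactor = 1: case is ignored
  def score(ngram: String, term: String, caseFactor: Double): Double = {
    val sensitive   = sim(ngram, term)
    val insensitive = sim(ngram.toLowerCase, term.toLowerCase)
    (1 - caseFactor) * sensitive + caseFactor * insensitive
  }
}
```

With caseFactor=0.8, a match that differs only in capitalization still scores close to 1.0, which is the behavior the comment describes.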
-------------------------------------------------------------------------------- /conf/current.chv.wiki.properties: -------------------------------------------------------------------------------- 1 | # ############# UMLS term matching configuration ###################### 2 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 3 | # # for now, the table mrstr is necessary 4 | jdbcDriver=jdbc:mysql://localhost:3306/umls?user=root&password=root 5 | 6 | useStanfordNLP=true 7 | stanfordAnnotators=tokenize,ssplit,pos,lemma,depparse 8 | #lvgdir=/data/ra/lvg2015/ 9 | memcached=127.0.0.1:11211 10 | 11 | #####*_*#### get the training data from a (previously saved) file; do not construct the Ngrams again. 12 | clusteringFromFile=false 13 | # read text from the files of a directory, instead of from a database 14 | textFromDirectory=true 15 | # the directory of the files, if textFromDirectory = true 16 | textDirectory=C:\\fsu\\ra\\wikiextractor\\wiki-test\\AA 17 | # save the ngram result to a file. 18 | ngramSaveFile=c:\\fsu\\ra\\data\\ngram_wiki.serd 19 | 20 | # # 0 - 100. if the similarity score for an ngram is greater than this threshold, the ngram will be considered a UMLS term 21 | umlsLikehoodLimit=50 22 | # # the window length used to fetch the context of an ngram 23 | # WinLen=10 24 | # max length of an ngram 25 | ngramN=3 26 | ngramKeepSentence=false 27 | 28 | # POS-tag filter (remove the matched item). 1: no noun; 2: ^N+P+N 3: not ending with N 29 | # 1. we care about N, G, A, W; 2. if more than 2 grams, it must end with N or G. 30 | posFilterRegex=[^NGAW]* [^N]*PN .*[^NAGW]$ .+[^NG]$ 31 | 32 | ####################################################################### 33 | ############### Ngram-related configuration ########################### 34 | preferLongTermTfRatio=0.5 35 | # the threshold of tf when fetching ngrams in a partition 36 | partitionTfFilter=2 37 | # when this number of ngrams is reached in a partition, start to reduce ngrams 38 | partitionReduceStartPoint=100000 39 | # after the start point, reduce ngrams again for each batch of this many new ngrams in the partition 40 | partitionReduceStartStep=10000 41 | # at least try to reduce this many ngrams, as a fraction of 'stage1ReduceStartStep' 42 | partitionReduceFraction=0.01 43 | # the threshold of tf when fetching ngrams in the first stage 44 | stag1TfFilter=2 45 | stag1CvalueFilter=1 46 | # the threshold of tf when fetching ngrams in the second stage 47 | stag2TfFilter=10 48 | stag2CvalueFilter=1 49 | # the threshold of the umls/chv score. no filter if it is -1 50 | stag2UmlsScoreFilter=-1 51 | stag2ChvScoreFilter=-1 52 | 53 | 54 | ######################## bags of words configuration ############## 55 | bagsOfWord=false 56 | bowUmlsOnly=false 57 | bowTfFilter=100 58 | # maximum number of bags of words 59 | bowTopNgram=10000 60 | bowDialogSetOne=false 61 | ######################## end of bags of words configuration ###### 62 | 63 | ####################################################################### 64 | ############# Clustering-related configuration ######################## 65 | # Nlp does not allow multi-threading, so you cannot use local[N] for generating Ngrams, but you can use it to run k-means 66 | sparkMaster=local[4] 67 | partitionNumber=8 68 | repartitionForce=true 69 | ########### only use chv terms as training data 70 | trainOnlyChv=true 71 | # filter the ngrams before running k-means (remove the matched item) 72 | trainedNgramFilterPosRegex=[^N]*PN 73 | # what percentage of the data is sampled as test data (for evaluation); <= 0 means nothing is tested 74 | testSample=30 75 | sampleRuns=1 76 | # number of ngrams for training. For test purposes. <0: no limit; 77 | trainNgramCnt=-1 78 | 79 | # PCA only. Compact the feature space matrix to an N-dimensional space using PCA. <=0, do nothing. 
80 | pcaDimension=0.95 81 | ###### k-means parameters ####### 82 | # whether to run k-means or not 83 | runKmeans=true 84 | # the start/end/step points of k (the cluster number) 85 | k_start=50 86 | k_end=51 87 | k_step=5 88 | # the maximum number of iterations of the k-means algorithm if it does not converge 89 | maxIterations=1000 90 | # run the following number of times for every k, and keep the lowest-cost one 91 | runs=10 92 | # # the top semantic types we use as features; only for the 'getUmlsScore' function, not 'select' 93 | # # all sty 94 | semanticType=T116,T020,T052,T100,T087,T011,T190,T008,T017,T195,T194,T123,T007,T031,T022,T053,T038,T012,T029,T091,T122,T023,T030,T118,T026,T043,T025,T019,T103,T120,T104,T185,T201,T200,T077,T049,T088,T060,T056,T203,T047,T065,T069,T111,T196,T050,T018,T071,T126,T204,T051,T099,T021,T013,T033,T004,T168,T169,T045,T083,T028,T064,T102,T096,T068,T093,T058,T131,T125,T016,T078,T129,T055,T197,T037,T170,T130,T171,T059,T034,T119,T015,T063,T066,T074,T041,T073,T048,T044,T085,T191,T114,T070,T124,T086,T057,T090,T115,T109,T032,T040,T001,T092,T042,T046,T072,T067,T039,T121,T002,T101,T098,T097,T094,T080,T081,T192,T014,T062,T075,T089,T167,T095,T054,T184,T082,T110,T024,T079,T061,T005,T127,T010 95 | # # for clinical trials pattern paper 96 | # # filter the semantic types by a regular expression. tag extraction function. 97 | # #sabFilter=SNOMEDCT_US|NCI|GO 98 | sabFilter=.* 99 | # save the above shown ngrams to a file 100 | saveNgram2file= 101 | 102 | ####################################################### 103 | ############### Output configuration ################################## 104 | # output normalized text for word2vec 105 | # show original ngrams before training 106 | showOrgNgramNum=100 107 | # shown-ngram filter based on N 108 | showOrgNgramOfN=1,2,3,4,5 109 | # shown-ngram filter based on POS tags 110 | showOrgNgramOfPosRegex=.* 111 | # shown-ngram filter based on text 112 | showOrgNgramOfTextRegex=.* 113 | # show this number of ngrams in every cluster. <0, show nothing 114 | showNgramInCluster=0 115 | # show the average and standard deviation of tf in clusters. Not configurable, always true 116 | #showTfAvgSdInCluster=true 117 | # what percentage of ngrams get their details shown after ranking. it shows info for every ngram in the top ${showDetailRankPt} percent; <0: don't show details; 118 | showDetailRankPt=0 119 | # if a Ngram matches this filter (regex), its detailed information will be output to the console. 120 | debugFilterNgram=aaaaaaaaaaaaaaaaaa 121 | 122 | 123 | 124 | 125 | 126 | 127 | 
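current.chv.wiki.properties above sweeps the cluster count from k_start to k_end in steps of k_step, re-running each k `runs` times and keeping the lowest-cost model. A sketch of that loop with Spark MLlib's RDD-based KMeans; how the repo's MyKmean.scala actually wires this up may differ:

```scala
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

def sweepK(data: RDD[Vector], kStart: Int, kEnd: Int, kStep: Int,
           maxIterations: Int, runs: Int): Map[Int, (KMeansModel, Double)] = {
  (kStart to kEnd by kStep).map { k =>
    // re-run 'runs' times with different seeds; keep the lowest-cost model
    val best = (1 to runs).map { seed =>
      val model = new KMeans()
        .setK(k)
        .setMaxIterations(maxIterations)
        .setSeed(seed)
        .run(data)
      (model, model.computeCost(data)) // within-set sum of squared errors
    }.minBy(_._2)
    k -> best
  }.toMap
}
```

Plotting the resulting cost against k is the usual way to pick the cluster count from such a sweep.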
", chosenanswer) 16 | #blogTextCol=text_content 17 | # limit the blog to be analyzed, mainly for test 18 | blogLimit=2000000 19 | 20 | useStanfordNLP=false 21 | memcached=127.0.0.1:11211 22 | 23 | #####*_*####get the training data from (previous save) file, do not construct the Ngram again. 24 | clusteringFromFile=true 25 | ngramSaveFile=/data/ra/ngram_yahoo_022117opennlp.serd 26 | 27 | # # 0 - 100. if the similarity score for a ngram is greater than this threshold, the ngran will be consider as umls term 28 | umlsLikehoodLimit=30 29 | # # the window length to fetch context of a ngram 30 | # WinLen=10 31 | 32 | 33 | ####################################################################### 34 | ############### Ngram relative configuration ################################### 35 | # the threshold of tf when fetch ngram in partition 36 | partitionTfFilter=2 37 | # the threshold of tf when fetch ngram in first stage 38 | stag1TfFilter=2 39 | stag1CvalueFilter=1 40 | # the threshold of tf when fetch ngram in second stage 41 | stag2TfFilter=100 42 | stag2CvalueFilter=1 43 | # the thresholh of umls/chv score. no filter if it is -1 44 | stag2UmlsScoreFilter=-1 45 | stag2ChvScoreFilter=-1 46 | 47 | 48 | ######################## bags of words configuration ############## 49 | bagsOfWord=true 50 | bowUmlsOnly=false 51 | bowTfFilter=100 52 | # maximum number of bag of words 53 | bowTopNgram=10000 54 | bowDialogSetOne=false 55 | ######################## end of bags of words configuration ###### 56 | 57 | ####################################################################### 58 | ############# Clustering relative configuration ########################## 59 | # Nlp do not allow multi-thread, so you can not use local[N] for generating Ngram, but you can use it to run kmeans 60 | sparkMaster= 61 | partitionNumber=8 62 | ########### only use chv term as trainig data 63 | trainOnlyChv=true 64 | # filter the ngran before run kmeans (remove the matched item) 65 | trainedNgramFilterPosRegex=[^N]*PN 66 | # how many percent of the data is sample as test data(for evaluation), <= 0, no thing is test 67 | testSample=30 68 | sampleRuns=10 69 | #number of ngram for training. For test purpose. <0: no limit; 70 | trainNgramCnt=-1 71 | 72 | # PCA only. Compact the feature space matrix to a N dimensions space using PCA. <=0, do nothing. 73 | pcaDimension=0.95 74 | ###### k-mean parameters ####### 75 | # if run k-mean or not 76 | runKmeans=true 77 | # the start/end/step point of the k (cluster number) 78 | k_start=10 79 | k_end=150 80 | k_step=5 81 | # the maximum of iteration of the k-mean algorithm if it is not convergent 82 | maxIterations=1000 83 | # run the following number of times for every k, and take the least cost one 84 | runs=10 85 | # # the top semantic type we make it as features; only for 'getUmlsScore' function, not 'select' 86 | # # for chv paper 87 | semanticType=T033,T121,T061,T047,T109,T023,T184,T074,T116,T123,T059,T046 88 | # # for clinical trails pattern paper 89 | # # filter the semantic type by a regular expression. tag extraction function. 
90 | # #sabFilter=SNOMEDCT_US|NCI|GO 91 | sabFilter=.* 92 | # save the above shown ngrams to a file 93 | saveNgram2file=/tmp/orgGram.txt 94 | 95 | 96 | 97 | 98 | 99 | ####################################################################### 100 | ############### Output configuration ################################## 101 | # output normalized text for word2vec 102 | # show original ngrams before training 103 | showOrgNgramNum=1000000 104 | # shown-ngram filter based on N 105 | showOrgNgramOfN=1,2,3,4,5 106 | # shown-ngram filter based on POS tags 107 | showOrgNgramOfPosRegex=.* 108 | # shown-ngram filter based on text 109 | showOrgNgramOfTextRegex=.* 110 | # show this number of ngrams in every cluster. <0, show nothing 111 | showNgramInCluster=0 112 | # show the average and standard deviation of tf in clusters. Not configurable, always true 113 | #showTfAvgSdInCluster=true 114 | # what percentage of ngrams get their details shown after ranking. it shows info for every ngram in the top ${showDetailRankPt} percent; <0: don't show details; 115 | showDetailRankPt=0 116 | # if a Ngram matches this filter (regex), its detailed information will be output to the console. 117 | debugFilterNgram=aaaaaaaaaaaaaaaaaa 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 
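pcaDimension=0.95 in the file above reads as a retained-variance fraction rather than a dimension count. A sketch of choosing the target dimensionality that way from the singular values of the feature matrix; that this is what the flag means is an assumption:

```scala
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

def pcaByVariance(rows: RDD[Vector], retain: Double): RowMatrix = {
  val mat = new RowMatrix(rows)
  val nCols = mat.numCols().toInt
  // variance captured by component i is proportional to s_i^2
  // (strictly, PCA variance would use the mean-centered matrix; close enough for a sketch)
  val svd = mat.computeSVD(nCols, computeU = false)
  val vars = svd.s.toArray.map(s => s * s)
  val total = vars.sum
  val cumulative = vars.scanLeft(0.0)(_ + _).tail
  val k = cumulative.indexWhere(_ / total >= retain) + 1
  // project the data onto the top-k principal components
  mat.multiply(mat.computePrincipalComponents(k))
}
```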
-------------------------------------------------------------------------------- /conf/current.deaf.properties: -------------------------------------------------------------------------------- 1 | # ############# UMLS term matching configuration ###################### 2 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 3 | # # for now, the table mrstr is necessary 4 | jdbcDriver=jdbc:mysql://somelab12.cci.fsu.edu:3306/umls?user=root&password=root 5 | 6 | useStanfordNLP=true 7 | stanfordAnnotators="tokenize, ssplit" 8 | #lvgdir=/data/ra/lvg2015/ 9 | # split the text into blocks before sentence segmentation. For clinical trials 10 | textBlockDelimiter=#|\\n 11 | # Special segmentation of the text before sentence segmentation (':', ' - ', 'Or|OR', 'No', 'At'). For clinical trials 12 | textBlockDelimiterSpecialEnable=true 13 | 14 | 15 | # # 0 - 100. if the similarity score for an ngram is greater than this threshold, the ngram will be considered a UMLS term 16 | umlsLikehoodLimit=80 17 | # # the window length used to fetch the context of an ngram 18 | 19 | ################# Metamap configuration ########################## 20 | MMenable=true 21 | # output options have to be implemented by yourself; don't use them as an option here. 22 | # -J (--restrict_to_sts) -e (--exclude_sources) -R (--restrict_to_sources) 23 | MMoptions=--allow_concept_gaps -R CHV -y 24 | #MMoptions=--allow_concept_gaps -R SNOMEDCT_US -y 25 | MMscoreThreshold = 800 26 | MMhost= 27 | MMport= 28 | # only perform metamap parsing. 29 | MMonly=true 30 | 31 | ################# end Metamap configuration ####################### 32 | 33 | ####################################################################### 34 | ############### Ngram-related configuration ########################### 35 | # the threshold of tf when fetching ngrams in a partition 36 | partitionTfFilter=2 37 | # the threshold of tf when fetching ngrams in the first stage 38 | stag1TfFilter=2 39 | stag1CvalueFilter=1 40 | # the threshold of tf when fetching ngrams in the second stage 41 | stag2TfFilter=5 42 | stag2CvalueFilter=1 43 | # the threshold of the umls/chv score. no filter if it is -1 44 | stag2UmlsScoreFilter=-1 45 | stag2ChvScoreFilter=-1 46 | 47 | 48 | ######################## bags of words configuration ############## 49 | bagsOfWord=false 50 | bowUmlsOnly=false 51 | bowTfFilter=100 52 | # maximum number of bags of words 53 | bowTopNgram=10000 54 | bowDialogSetOne=false 55 | ######################## end of bags of words configuration ###### 56 | 57 | ######################################################################## 58 | ############## Clustering-related configuration ######################### 59 | ## Nlp does not allow multi-threading, so you cannot use local[N] for generating Ngrams, but you can use it to run k-means 60 | #sparkMaster=local[2] 61 | #partitionNumber=8 62 | ############ only use chv terms as training data 63 | #trainOnlyChv=true 64 | ## filter the ngrams before running k-means (remove the matched item) 65 | #trainedNgramFilterPosRegex=[^N]*PN 66 | ## what percentage of the data is sampled as test data (for evaluation); <= 0 means nothing is tested 67 | #testSample=30 68 | #sampleRuns=1 69 | ## number of ngrams for training. For test purposes. <0: no limit; 70 | #trainNgramCnt=-1 71 | # 72 | ## PCA only. Compact the feature space matrix to an N-dimensional space using PCA. <=0, do nothing. 73 | #pcaDimension=0.95 74 | ####### k-means parameters ####### 75 | ## whether to run k-means or not 76 | #runKmeans=true 77 | ## the start/end/step points of k (the cluster number) 78 | #k_start=10 79 | #k_end=150 80 | #k_step=5 81 | ## the maximum number of iterations of the k-means algorithm if it does not converge 82 | #maxIterations=1000 83 | ## run the following number of times for every k, and keep the lowest-cost one 84 | #runs=10 85 | # # the top semantic types we use as features; only for the 'getUmlsScore' function, not 'select' 86 | # # for chv paper 87 | semanticType=T204,T007,T200,T061,T109,T002,T121,T116,T033,T004,T201,T023,T028,T123,T047,T074,T037,T060,T126,T013,T129,T044,T170,T191,T029,T059,T043,T005,T012,T114,T015,T130,T058,T014,T030,T046,T081,T011,T019,T026,T131,T167,T097,T197,T024,T195,T025,T192,T073,T034,T040,T122,T203,T083,T042,T082,T045,T048,T184,T080,T169,T194,T168,T078,T079,T125,T098,T020,T039,T190,T093,T031,T196,T049,T067,T038,T127,T062,T171,T185,T041,T091,T032,T018,T054,T055,T070,T057,T077,T065,T090,T068,T089,T064,T022,T056,T092,T104,T052,T099,T063,T086,T101,T120,T087,T051,T017,T102,T066,T001,T008,T016,T100,T075,T050,T069,T096,T095,T053,T072,T094,T010,T103,T071,T085,T021,T088 88 | # # for clinical trials pattern paper 89 | # # filter the semantic types by a regular expression. 90 | sabFilter=SNOMEDCT_US|CHV 91 | #sabFilter=.* 92 | # save the above shown ngrams to a file 93 | saveNgram2file=C:/fsu/ra/data/orgGram.txt 94 | 95 | 96 | 97 | 98 | 99 | ####################################################################### 100 | ############### Output configuration ################################## 101 | # output normalized text for word2vec 102 | # show original ngrams before training 103 | showOrgNgramNum=1000000 104 | # shown-ngram filter based on N 105 | showOrgNgramOfN=1,2,3,4,5 106 | # shown-ngram filter based on POS tags 107 | showOrgNgramOfPosRegex=.* 108 | # shown-ngram filter based on text 109 | showOrgNgramOfTextRegex=.* 110 | # show this number of ngrams in every cluster. <0, show nothing 111 | showNgramInCluster=0 112 | # show the average and standard deviation of tf in clusters. Not configurable, always true 113 | #showTfAvgSdInCluster=true 114 | # what percentage of ngrams get their details shown after ranking. it shows info for every ngram in the top ${showDetailRankPt} percent; <0: don't show details; 115 | showDetailRankPt=0 116 | # if a Ngram matches this filter (regex), its detailed information will be output to the console. 117 | debugFilterNgram=aaaaaaaaaaaaaaaaaa 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 
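current.deaf.properties above routes term extraction through MetaMap (MMenable, MMoptions, MMscoreThreshold). A minimal sketch against the MetaMap Java API shipped as libs/metamap-api-2.0.jar, assuming an mmserver is reachable on its defaults; that the threshold is compared against the absolute value of MetaMap's (negative) scores, as MMApi.scala presumably does, is an assumption:

```scala
import gov.nih.nlm.nls.metamap.{MetaMapApi, MetaMapApiImpl}
import scala.collection.JavaConverters._

object MMSketch {
  def main(args: Array[String]): Unit = {
    val api: MetaMapApi = new MetaMapApiImpl() // defaults to localhost:8066
    api.setOptions("--allow_concept_gaps -R CHV -y")
    val results = api.processCitationsFromString("renal insufficiency").asScala
    // walk utterances -> phrases -> mappings -> candidate concepts
    for (r   <- results;
         utt <- r.getUtteranceList.asScala;
         pcm <- utt.getPCMList.asScala;
         m   <- pcm.getMappingList.asScala;
         ev  <- m.getEvList.asScala
         if math.abs(ev.getScore) >= 800) {  // MMscoreThreshold
      println(s"${ev.getConceptId} ${ev.getPreferredName} score=${ev.getScore}")
    }
  }
}
```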
-------------------------------------------------------------------------------- /conf/current.init_index.properties: -------------------------------------------------------------------------------- 1 | # ############# UMLS term matching configuration ###################### 2 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 3 | # # for now, the table mrstr is necessary 4 | jdbcDriver=jdbc:mysql://127.0.0.1:3306/umls?useServerPrepStmts=false&rewriteBatchedStatements=true&user=root&password=root 5 | 6 | ################################# used for target UMLS terms ###################### 7 | # target term info in database 8 | sourceTermTbl=umls.mrconso 9 | targetTermTbl=_target_term_ 10 | targetTermTblDropAndCreate=true 11 | # 'a' and 'b' represent the self-joined mrconso table. 12 | sourceTermQueryOption= a.lat='ENG' and b.lat='ENG' limit 100 13 | # if true, use Solr for matching an ngram with target terms; else use a database query for matching 14 | targetTermUsingSolr=false 15 | 16 | 17 | ####################################################################### 18 | ################### NLP-related configuration ######################### 19 | # root dir of lvg. Using Stanford NLP is recommended. 20 | useStanfordNLP=true 21 | stanfordAnnotators=tokenize, ssplit, pos, lemma
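The init_index configuration above builds the target-term table from a self-join of mrconso, splicing sourceTermQueryOption into the WHERE clause. A hypothetical rendering of that query shape over JDBC; the projected columns (cui, str) are illustrative guesses, not the repo's actual schema:

```scala
import java.sql.DriverManager

object BuildTargetTerms {
  def main(args: Array[String]): Unit = {
    val url = "jdbc:mysql://127.0.0.1:3306/umls?user=root&password=root"
    val queryOption = "a.lat='ENG' and b.lat='ENG' limit 100" // sourceTermQueryOption
    val conn = DriverManager.getConnection(url)
    try {
      val st = conn.createStatement()
      st.executeUpdate("DROP TABLE IF EXISTS _target_term_") // targetTermTblDropAndCreate=true
      // hypothetical projection: one row per (CUI, string) pair of English atoms
      st.executeUpdate(
        s"""CREATE TABLE _target_term_ AS
           |SELECT DISTINCT a.cui, b.str
           |FROM umls.mrconso a JOIN umls.mrconso b ON a.cui = b.cui
           |WHERE $queryOption""".stripMargin)
    } finally conn.close()
  }
}
```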
-------------------------------------------------------------------------------- /conf/current.knn.properties: -------------------------------------------------------------------------------- 1 | # ############# UMLS term matching configuration ###################### 2 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 3 | # # for now, the table mrstr is necessary 4 | jdbcDriver=jdbc:mysql://somelab12.cci.fsu.edu:3306/umls?user=root&password=root 5 | 6 | useStanfordNLP=true 7 | #lvgdir=/data/ra/lvg2015/ 8 | 9 | 10 | #####*_*#### get the training data from a (previously saved) file; do not construct the Ngrams again. 11 | clusteringFromFile=true 12 | ngramSaveFile=C:/fsu/ra/data/ngram_yahoo_022117stanfordnlp.serd 13 | 14 | # # 0 - 100. if the similarity score for an ngram is greater than this threshold, the ngram will be considered a UMLS term 15 | umlsLikehoodLimit=80 16 | # # the window length used to fetch the context of an ngram 17 | 18 | ################# end Metamap configuration ####################### 19 | 20 | ####################################################################### 21 | ############### Ngram-related configuration ########################### 22 | # the threshold of tf when fetching ngrams in a partition 23 | partitionTfFilter=2 24 | # the threshold of tf when fetching ngrams in the first stage 25 | stag1TfFilter=2 26 | stag1CvalueFilter=1 27 | # the threshold of tf when fetching ngrams in the second stage 28 | stag2TfFilter=10 29 | stag2CvalueFilter=1 30 | # the threshold of the umls/chv score. no filter if it is -1 31 | stag2UmlsScoreFilter=-1 32 | stag2ChvScoreFilter=-1 33 | 34 | ####################################################################### 35 | ############# Clustering-related configuration ######################## 36 | # Nlp does not allow multi-threading, so you cannot use local[N] for generating Ngrams, but you can use it to run k-means 37 | sparkMaster=local[2] 38 | partitionNumber=8 39 | ########### only use chv terms as training data 40 | trainOnlyChv=true 41 | # filter the ngrams before running k-means (remove the matched item) 42 | trainedNgramFilterPosRegex=[^N]*PN 43 | # what percentage of the data is sampled as test data (for evaluation); <= 0 means nothing is tested 44 | testSample=30 45 | sampleRuns=1 46 | # number of ngrams for training. For test purposes. <0: no limit; 47 | trainNgramCnt=-1 48 | 49 | # PCA only. Compact the feature space matrix to an N-dimensional space using PCA. <=0, do nothing. 50 | pcaDimension=0 51 | 52 | # # the top semantic types we use as features; only for the 'getUmlsScore' function, not 'select' 53 | # # for chv paper 54 | semanticType=T204,T007,T200,T061,T109,T002,T121,T116,T033,T004,T201,T023,T028,T123,T047,T074,T037,T060,T126,T013,T129,T044,T170,T191,T029,T059,T043,T005,T012,T114,T015,T130,T058,T014,T030,T046,T081,T011,T019,T026,T131,T167,T097,T197,T024,T195,T025,T192,T073,T034,T040,T122,T203,T083,T042,T082,T045,T048,T184,T080,T169,T194,T168,T078,T079,T125,T098,T020,T039,T190,T093,T031,T196,T049,T067,T038,T127,T062,T171,T185,T041,T091,T032,T018,T054,T055,T070,T057,T077,T065,T090,T068,T089,T064,T022,T056,T092,T104,T052,T099,T063,T086,T101,T120,T087,T051,T017,T102,T066,T001,T008,T016,T100,T075,T050,T069,T096,T095,T053,T072,T094,T010,T103,T071,T085,T021,T088 55 | # # for clinical trials pattern paper 56 | # # filter the semantic types by a regular expression. 57 | #sabFilter=SNOMEDCT_US|CHV 58 | sabFilter=.* 59 | # save the above shown ngrams to a file 60 | saveNgram2file=C:/fsu/ra/data/orgGram.txt 61 | 62 | 63 | ####################################################################### 64 | ############### Output configuration ################################## 65 | # output normalized text for word2vec 66 | # show original ngrams before training 67 | showOrgNgramNum=100 68 | # shown-ngram filter based on N 69 | showOrgNgramOfN=1,2,3,4,5 70 | # shown-ngram filter based on POS tags 71 | showOrgNgramOfPosRegex=.* 72 | # shown-ngram filter based on text 73 | showOrgNgramOfTextRegex=.* 74 | # show this number of ngrams in every cluster. <0, show nothing 75 | showNgramInCluster=0 76 | # show the average and standard deviation of tf in clusters. Not configurable, always true 77 | #showTfAvgSdInCluster=true 78 | # what percentage of ngrams get their details shown after ranking. it shows info for every ngram in the top ${showDetailRankPt} percent; <0: don't show details; 79 | showDetailRankPt=0 80 | # if a Ngram matches this filter (regex), its detailed information will be output to the console. 81 | debugFilterNgram=aaaaaaaaaaaaaaaaaa 82 | 83 | 84 | 85 | 86 | 87 | 88 | 
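clusteringFromFile/ngramSaveFile above let a run reload previously constructed n-grams instead of rebuilding them. The .serd extension suggests plain Java serialization; a sketch under that assumption (the real payload is the repo's Ngram class from com.votors.ml, not the Seq[String] used here):

```scala
import java.io._

object NgramCache {
  def save[T <: Serializable](path: String, value: T): Unit = {
    val out = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(path)))
    try out.writeObject(value) finally out.close()
  }
  def load[T](path: String): T = {
    val in = new ObjectInputStream(new BufferedInputStream(new FileInputStream(path)))
    try in.readObject().asInstanceOf[T] finally in.close()
  }
}

// e.g. NgramCache.save("ngram_test.serd", Seq("breast cancer", "bone density"))
//      val ngrams = NgramCache.load[Seq[String]]("ngram_test.serd")
```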
-------------------------------------------------------------------------------- /conf/current.socialqa.botanical.properties: -------------------------------------------------------------------------------- 1 | # ############# UMLS term matching configuration ###################### 2 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 3 | # # for now, the table mrstr is necessary 4 | jdbcDriver=jdbc:mysql://somelab12.cci.fsu.edu:3306/umls?user=root&password=root 5 | 6 | useStanfordNLP=true 7 | stanfordAnnotators=tokenize, ssplit, pos, lemma 8 | 9 | memcached=127.0.0.1:11211 10 | # default expiry time of memcached entries: 1 week. 11 | defaultExpireTime=604800 12 | ehCacheEntities=500000 13 | ##################################################################################### 14 | ################################# fuzzy matching configuration ###################### 15 | ##### You can use Solr or MySQL as the index search server. You have to initialize at least one of them. 16 | ##### Using the database is easier to configure, but it may be a little slower. 17 | # if true, use Solr for matching an ngram with target terms; else use a database query for matching 18 | targetTermUsingSolr=False 19 | targetTermTbl=_target_term_botanical_ 20 | 21 | # # 0 - 100. if the similarity score for an ngram is greater than this threshold, the ngram will be considered a UMLS term 22 | umlsLikehoodLimit=80 23 | # # the window length used to fetch the context of an ngram 24 | 25 | ################# Metamap configuration ########################## 26 | MMenable=false 27 | 28 | ################# end Metamap configuration ####################### 29 | 30 | ####################################################################### 31 | 32 | # whether to use semantic information. if this flag is false, all semantic functions are disabled. 33 | useSemanticeType=false 34 | # when there are multiple matched terms, reduce to one term by semantic type preference. 35 | reduceMatchedTermBySty=false 36 | # # the top semantic types we use as features; only for the 'getUmlsScore' function, not 'select' 37 | # # for chv paper 38 | #semanticType=T204 39 | # # for clinical trials pattern paper 40 | # # filter the semantic types by a regular expression. 41 | sabFilter=.* 42 | 43 | 44 | 45 | 46 | ####################################################################### 47 | ############### Output configuration ################################## 48 | # output normalized text for word2vec 49 | # show original ngrams before training 50 | showOrgNgramNum=1000000 51 | # shown-ngram filter based on N 52 | showOrgNgramOfN=1,2,3,4,5 53 | # shown-ngram filter based on POS tags 54 | showOrgNgramOfPosRegex=.* 55 | # shown-ngram filter based on text 56 | showOrgNgramOfTextRegex=.* 57 | # show this number of ngrams in every cluster. <0, show nothing 58 | showNgramInCluster=0 59 | # show the average and standard deviation of tf in clusters. Not configurable, always true 60 | #showTfAvgSdInCluster=true 61 | # what percentage of ngrams get their details shown after ranking. it shows info for every ngram in the top ${showDetailRankPt} percent; <0: don't show details; 62 | showDetailRankPt=0 63 | # if a Ngram matches this filter (regex), its detailed information will be output to the console. 
64 | debugFilterNgram=aaaaaaaaaaaaaaaaaa 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /conf/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.spark-project.jetty=WARN 10 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | log4j.logger.org.apache.parquet=ERROR 14 | log4j.logger.parquet=ERROR 15 | 16 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 17 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 18 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 19 | -------------------------------------------------------------------------------- /data/PennTreebankP.O.S.Tags.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Penn Treebank P.O.S. Tags 4 | 5 | 6 | 7 | 8 |

Alphabetical list of part-of-speech tags used in the Penn Treebank Project:

Number  Tag     Description
1. CC Coordinating conjunction
2. CD Cardinal number
3. DT Determiner
4. EX Existential there
5. FW Foreign word
6. IN Preposition or subordinating conjunction
7. JJ Adjective
8. JJR Adjective, comparative
9. JJS Adjective, superlative
10. LS List item marker
11. MD Modal
12. NN Noun, singular or mass
13. NNS Noun, plural
14. NNP Proper noun, singular
15. NNPS Proper noun, plural
16. PDT Predeterminer
17. POS Possessive ending
18. PRP Personal pronoun
19. PRP$ Possessive pronoun
20. RB Adverb
21. RBR Adverb, comparative
22. RBS Adverb, superlative
23. RP Particle
24. SYM Symbol
25. TO to
26. UH Interjection
27. VB Verb, base form
28. VBD Verb, past tense
29. VBG Verb, gerund or present participle
30. VBN Verb, past participle
31. VBP Verb, non-3rd person singular present
32. VBZ Verb, 3rd person singular present
33. WDT Wh-determiner
34. WP Wh-pronoun
35. WP$ Possessive wh-pronoun
36. WRB Wh-adverb
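The posFilterRegex values in the conf/ files above operate on a compressed POS string built from this tag set: data/pos-transformation.csv (further down in this listing) maps each Penn tag to a one-letter code (NN/NNS/NNP/NNPS to N, JJ* to A, IN to P, VBG to G, and so on). A sketch of that compression and filtering; the map is an excerpt of the CSV, and reading the config value as space-separated alternative patterns is an assumption:

```scala
object PosFilter {
  // excerpt of data/pos-transformation.csv; extend with the remaining rows
  val abbrev = Map(
    "CC" -> "C", "CD" -> "M", "DT" -> "D", "IN" -> "P",
    "JJ" -> "A", "JJR" -> "A", "JJS" -> "A",
    "NN" -> "N", "NNS" -> "N", "NNP" -> "N", "NNPS" -> "N",
    "RB" -> "R", "VB" -> "V", "VBG" -> "G", "VBN" -> "B")

  // true if the ngram's compressed POS sequence matches any removal pattern
  def removed(tags: Seq[String], posFilterRegex: String): Boolean = {
    val compressed = tags.map(t => abbrev.getOrElse(t, "O")).mkString
    posFilterRegex.split("\\s+").exists(compressed.matches)
  }
}

// e.g. PosFilter.removed(Seq("JJ", "NN"), "[^N]* [^N]*PN")
//      => false: "AN" survives both removal patterns, so the ngram is kept
```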
201 | 202 | 203 | 204 | 205 | -------------------------------------------------------------------------------- /data/SemGroups.txt: -------------------------------------------------------------------------------- 1 | ACTI|Activities & Behaviors|T052|Activity 2 | ACTI|Activities & Behaviors|T053|Behavior 3 | ACTI|Activities & Behaviors|T056|Daily or Recreational Activity 4 | ACTI|Activities & Behaviors|T051|Event 5 | ACTI|Activities & Behaviors|T064|Governmental or Regulatory Activity 6 | ACTI|Activities & Behaviors|T055|Individual Behavior 7 | ACTI|Activities & Behaviors|T066|Machine Activity 8 | ACTI|Activities & Behaviors|T057|Occupational Activity 9 | ACTI|Activities & Behaviors|T054|Social Behavior 10 | ANAT|Anatomy|T017|Anatomical Structure 11 | ANAT|Anatomy|T029|Body Location or Region 12 | ANAT|Anatomy|T023|Body Part, Organ, or Organ Component 13 | ANAT|Anatomy|T030|Body Space or Junction 14 | ANAT|Anatomy|T031|Body Substance 15 | ANAT|Anatomy|T022|Body System 16 | ANAT|Anatomy|T025|Cell 17 | ANAT|Anatomy|T026|Cell Component 18 | ANAT|Anatomy|T018|Embryonic Structure 19 | ANAT|Anatomy|T021|Fully Formed Anatomical Structure 20 | ANAT|Anatomy|T024|Tissue 21 | CHEM|Chemicals & Drugs|T116|Amino Acid, Peptide, or Protein 22 | CHEM|Chemicals & Drugs|T195|Antibiotic 23 | CHEM|Chemicals & Drugs|T123|Biologically Active Substance 24 | CHEM|Chemicals & Drugs|T122|Biomedical or Dental Material 25 | CHEM|Chemicals & Drugs|T118|Carbohydrate 26 | CHEM|Chemicals & Drugs|T103|Chemical 27 | CHEM|Chemicals & Drugs|T120|Chemical Viewed Functionally 28 | CHEM|Chemicals & Drugs|T104|Chemical Viewed Structurally 29 | CHEM|Chemicals & Drugs|T200|Clinical Drug 30 | CHEM|Chemicals & Drugs|T111|Eicosanoid 31 | CHEM|Chemicals & Drugs|T196|Element, Ion, or Isotope 32 | CHEM|Chemicals & Drugs|T126|Enzyme 33 | CHEM|Chemicals & Drugs|T131|Hazardous or Poisonous Substance 34 | CHEM|Chemicals & Drugs|T125|Hormone 35 | CHEM|Chemicals & Drugs|T129|Immunologic Factor 36 | CHEM|Chemicals & Drugs|T130|Indicator, Reagent, or Diagnostic Aid 37 | CHEM|Chemicals & Drugs|T197|Inorganic Chemical 38 | CHEM|Chemicals & Drugs|T119|Lipid 39 | CHEM|Chemicals & Drugs|T124|Neuroreactive Substance or Biogenic Amine 40 | CHEM|Chemicals & Drugs|T114|Nucleic Acid, Nucleoside, or Nucleotide 41 | CHEM|Chemicals & Drugs|T109|Organic Chemical 42 | CHEM|Chemicals & Drugs|T115|Organophosphorus Compound 43 | CHEM|Chemicals & Drugs|T121|Pharmacologic Substance 44 | CHEM|Chemicals & Drugs|T192|Receptor 45 | CHEM|Chemicals & Drugs|T110|Steroid 46 | CHEM|Chemicals & Drugs|T127|Vitamin 47 | CONC|Concepts & Ideas|T185|Classification 48 | CONC|Concepts & Ideas|T077|Conceptual Entity 49 | CONC|Concepts & Ideas|T169|Functional Concept 50 | CONC|Concepts & Ideas|T102|Group Attribute 51 | CONC|Concepts & Ideas|T078|Idea or Concept 52 | CONC|Concepts & Ideas|T170|Intellectual Product 53 | CONC|Concepts & Ideas|T171|Language 54 | CONC|Concepts & Ideas|T080|Qualitative Concept 55 | CONC|Concepts & Ideas|T081|Quantitative Concept 56 | CONC|Concepts & Ideas|T089|Regulation or Law 57 | CONC|Concepts & Ideas|T082|Spatial Concept 58 | CONC|Concepts & Ideas|T079|Temporal Concept 59 | DEVI|Devices|T203|Drug Delivery Device 60 | DEVI|Devices|T074|Medical Device 61 | DEVI|Devices|T075|Research Device 62 | DISO|Disorders|T020|Acquired Abnormality 63 | DISO|Disorders|T190|Anatomical Abnormality 64 | DISO|Disorders|T049|Cell or Molecular Dysfunction 65 | DISO|Disorders|T019|Congenital Abnormality 66 | DISO|Disorders|T047|Disease or Syndrome 67 | 
DISO|Disorders|T050|Experimental Model of Disease 68 | DISO|Disorders|T033|Finding 69 | DISO|Disorders|T037|Injury or Poisoning 70 | DISO|Disorders|T048|Mental or Behavioral Dysfunction 71 | DISO|Disorders|T191|Neoplastic Process 72 | DISO|Disorders|T046|Pathologic Function 73 | DISO|Disorders|T184|Sign or Symptom 74 | GENE|Genes & Molecular Sequences|T087|Amino Acid Sequence 75 | GENE|Genes & Molecular Sequences|T088|Carbohydrate Sequence 76 | GENE|Genes & Molecular Sequences|T028|Gene or Genome 77 | GENE|Genes & Molecular Sequences|T085|Molecular Sequence 78 | GENE|Genes & Molecular Sequences|T086|Nucleotide Sequence 79 | GEOG|Geographic Areas|T083|Geographic Area 80 | LIVB|Living Beings|T100|Age Group 81 | LIVB|Living Beings|T011|Amphibian 82 | LIVB|Living Beings|T008|Animal 83 | LIVB|Living Beings|T194|Archaeon 84 | LIVB|Living Beings|T007|Bacterium 85 | LIVB|Living Beings|T012|Bird 86 | LIVB|Living Beings|T204|Eukaryote 87 | LIVB|Living Beings|T099|Family Group 88 | LIVB|Living Beings|T013|Fish 89 | LIVB|Living Beings|T004|Fungus 90 | LIVB|Living Beings|T096|Group 91 | LIVB|Living Beings|T016|Human 92 | LIVB|Living Beings|T015|Mammal 93 | LIVB|Living Beings|T001|Organism 94 | LIVB|Living Beings|T101|Patient or Disabled Group 95 | LIVB|Living Beings|T002|Plant 96 | LIVB|Living Beings|T098|Population Group 97 | LIVB|Living Beings|T097|Professional or Occupational Group 98 | LIVB|Living Beings|T014|Reptile 99 | LIVB|Living Beings|T010|Vertebrate 100 | LIVB|Living Beings|T005|Virus 101 | OBJC|Objects|T071|Entity 102 | OBJC|Objects|T168|Food 103 | OBJC|Objects|T073|Manufactured Object 104 | OBJC|Objects|T072|Physical Object 105 | OBJC|Objects|T167|Substance 106 | OCCU|Occupations|T091|Biomedical Occupation or Discipline 107 | OCCU|Occupations|T090|Occupation or Discipline 108 | ORGA|Organizations|T093|Health Care Related Organization 109 | ORGA|Organizations|T092|Organization 110 | ORGA|Organizations|T094|Professional Society 111 | ORGA|Organizations|T095|Self-help or Relief Organization 112 | PHEN|Phenomena|T038|Biologic Function 113 | PHEN|Phenomena|T069|Environmental Effect of Humans 114 | PHEN|Phenomena|T068|Human-caused Phenomenon or Process 115 | PHEN|Phenomena|T034|Laboratory or Test Result 116 | PHEN|Phenomena|T070|Natural Phenomenon or Process 117 | PHEN|Phenomena|T067|Phenomenon or Process 118 | PHYS|Physiology|T043|Cell Function 119 | PHYS|Physiology|T201|Clinical Attribute 120 | PHYS|Physiology|T045|Genetic Function 121 | PHYS|Physiology|T041|Mental Process 122 | PHYS|Physiology|T044|Molecular Function 123 | PHYS|Physiology|T032|Organism Attribute 124 | PHYS|Physiology|T040|Organism Function 125 | PHYS|Physiology|T042|Organ or Tissue Function 126 | PHYS|Physiology|T039|Physiologic Function 127 | PROC|Procedures|T060|Diagnostic Procedure 128 | PROC|Procedures|T065|Educational Activity 129 | PROC|Procedures|T058|Health Care Activity 130 | PROC|Procedures|T059|Laboratory Procedure 131 | PROC|Procedures|T063|Molecular Biology Research Technique 132 | PROC|Procedures|T062|Research Activity 133 | PROC|Procedures|T061|Therapeutic or Preventive Procedure 134 | -------------------------------------------------------------------------------- /data/SemanticTypes_2013AA.txt: -------------------------------------------------------------------------------- 1 | aapp|T116|Amino Acid, Peptide, or Protein 2 | acab|T020|Acquired Abnormality 3 | acty|T052|Activity 4 | aggp|T100|Age Group 5 | amas|T087|Amino Acid Sequence 6 | amph|T011|Amphibian 7 | anab|T190|Anatomical Abnormality 8 | 
anim|T008|Animal 9 | anst|T017|Anatomical Structure 10 | antb|T195|Antibiotic 11 | arch|T194|Archaeon 12 | bacs|T123|Biologically Active Substance 13 | bact|T007|Bacterium 14 | bdsu|T031|Body Substance 15 | bdsy|T022|Body System 16 | bhvr|T053|Behavior 17 | biof|T038|Biologic Function 18 | bird|T012|Bird 19 | blor|T029|Body Location or Region 20 | bmod|T091|Biomedical Occupation or Discipline 21 | bodm|T122|Biomedical or Dental Material 22 | bpoc|T023|Body Part, Organ, or Organ Component 23 | bsoj|T030|Body Space or Junction 24 | carb|T118|Carbohydrate 25 | celc|T026|Cell Component 26 | celf|T043|Cell Function 27 | cell|T025|Cell 28 | cgab|T019|Congenital Abnormality 29 | chem|T103|Chemical 30 | chvf|T120|Chemical Viewed Functionally 31 | chvs|T104|Chemical Viewed Structurally 32 | clas|T185|Classification 33 | clna|T201|Clinical Attribute 34 | clnd|T200|Clinical Drug 35 | cnce|T077|Conceptual Entity 36 | comd|T049|Cell or Molecular Dysfunction 37 | crbs|T088|Carbohydrate Sequence 38 | diap|T060|Diagnostic Procedure 39 | dora|T056|Daily or Recreational Activity 40 | drdd|T203|Drug Delivery Device 41 | dsyn|T047|Disease or Syndrome 42 | edac|T065|Educational Activity 43 | eehu|T069|Environmental Effect of Humans 44 | eico|T111|Eicosanoid 45 | elii|T196|Element, Ion, or Isotope 46 | emod|T050|Experimental Model of Disease 47 | emst|T018|Embryonic Structure 48 | enty|T071|Entity 49 | enzy|T126|Enzyme 50 | euka|T204|Eukaryote 51 | evnt|T051|Event 52 | famg|T099|Family Group 53 | ffas|T021|Fully Formed Anatomical Structure 54 | fish|T013|Fish 55 | fndg|T033|Finding 56 | fngs|T004|Fungus 57 | food|T168|Food 58 | ftcn|T169|Functional Concept 59 | genf|T045|Genetic Function 60 | geoa|T083|Geographic Area 61 | gngm|T028|Gene or Genome 62 | gora|T064|Governmental or Regulatory Activity 63 | grpa|T102|Group Attribute 64 | grup|T096|Group 65 | hcpp|T068|Human-caused Phenomenon or Process 66 | hcro|T093|Health Care Related Organization 67 | hlca|T058|Health Care Activity 68 | hops|T131|Hazardous or Poisonous Substance 69 | horm|T125|Hormone 70 | humn|T016|Human 71 | idcn|T078|Idea or Concept 72 | imft|T129|Immunologic Factor 73 | inbe|T055|Individual Behavior 74 | inch|T197|Inorganic Chemical 75 | inpo|T037|Injury or Poisoning 76 | inpr|T170|Intellectual Product 77 | irda|T130|Indicator, Reagent, or Diagnostic Aid 78 | lang|T171|Language 79 | lbpr|T059|Laboratory Procedure 80 | lbtr|T034|Laboratory or Test Result 81 | lipd|T119|Lipid 82 | mamm|T015|Mammal 83 | mbrt|T063|Molecular Biology Research Technique 84 | mcha|T066|Machine Activity 85 | medd|T074|Medical Device 86 | menp|T041|Mental Process 87 | mnob|T073|Manufactured Object 88 | mobd|T048|Mental or Behavioral Dysfunction 89 | moft|T044|Molecular Function 90 | mosq|T085|Molecular Sequence 91 | neop|T191|Neoplastic Process 92 | nnon|T114|Nucleic Acid, Nucleoside, or Nucleotide 93 | npop|T070|Natural Phenomenon or Process 94 | nsba|T124|Neuroreactive Substance or Biogenic Amine 95 | nusq|T086|Nucleotide Sequence 96 | ocac|T057|Occupational Activity 97 | ocdi|T090|Occupation or Discipline 98 | opco|T115|Organophosphorus Compound 99 | orch|T109|Organic Chemical 100 | orga|T032|Organism Attribute 101 | orgf|T040|Organism Function 102 | orgm|T001|Organism 103 | orgt|T092|Organization 104 | ortf|T042|Organ or Tissue Function 105 | patf|T046|Pathologic Function 106 | phob|T072|Physical Object 107 | phpr|T067|Phenomenon or Process 108 | phsf|T039|Physiologic Function 109 | phsu|T121|Pharmacologic Substance 110 | plnt|T002|Plant 111 | podg|T101|Patient or 
Disabled Group 112 | popg|T098|Population Group 113 | prog|T097|Professional or Occupational Group 114 | pros|T094|Professional Society 115 | qlco|T080|Qualitative Concept 116 | qnco|T081|Quantitative Concept 117 | rcpt|T192|Receptor 118 | rept|T014|Reptile 119 | resa|T062|Research Activity 120 | resd|T075|Research Device 121 | rnlw|T089|Regulation or Law 122 | sbst|T167|Substance 123 | shro|T095|Self-help or Relief Organization 124 | socb|T054|Social Behavior 125 | sosy|T184|Sign or Symptom 126 | spco|T082|Spatial Concept 127 | strd|T110|Steroid 128 | tisu|T024|Tissue 129 | tmco|T079|Temporal Concept 130 | topp|T061|Therapeutic or Preventive Procedure 131 | virs|T005|Virus 132 | vita|T127|Vitamin 133 | vtbt|T010|Vertebrate 134 | -------------------------------------------------------------------------------- /data/clinical_text.txt: -------------------------------------------------------------------------------- 1 | Criteria 2 | Inclusion Criteria: 3 | 4 | female 5 | >= 65 years old 6 | postmenopausal for > 5 years (WHO definition of menopause) 7 | Exclusion Criteria: 8 | 9 | currently taking osteoporosis related medication (HRT, SERM, bisphosphonate, PTH, calcitonin, fluoride) 10 | had cancer in past 10 years, likely to metastasize to bone (ie: breast, lung) 11 | have intrinsic bone disease (ie: Paget's Disease, Cushings syndrome) 12 | have untreated malabsorption syndrome (ie: Celiac Disease) 13 | renal insufficiency (CrCl <30ml/min) 14 | hyperparathyroidism, hypoparathyroidism 15 | chronic systemic glucocorticosteroid use > 3mos, dose>2.5mg daily 16 | -------------------------------------------------------------------------------- /data/en-chunker.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/en-chunker.bin -------------------------------------------------------------------------------- /data/en-parser-chunking.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/en-parser-chunking.bin -------------------------------------------------------------------------------- /data/en-pos-maxent.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/en-pos-maxent.bin -------------------------------------------------------------------------------- /data/en-sent.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/en-sent.bin -------------------------------------------------------------------------------- /data/en-token.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/en-token.bin -------------------------------------------------------------------------------- /data/pos-transformation.csv: -------------------------------------------------------------------------------- 1 | Number,Tag,Description,abbr.,,,distinct abbr. 
2 | 1,CC,Coordinating conjunction,C,,,C 3 | 2,CD,Cardinal number,M,,,M 4 | 3,DT,Determiner,D,,,D 5 | 4,EX,Existential?there,E,,,E 6 | 5,FW,Foreign word,F,,,F 7 | 6,IN,Preposition or subordinating conjunction,P,,,P 8 | 7,JJ,Adjective,A,,,A 9 | 8,JJR,"Adjective, comparative",A,,,O 10 | 9,JJS,"Adjective, superlative",A,,,N 11 | 10,LS,List item marker,O,,,U 12 | 11,MD,Modal,O,,,R 13 | 12,NN,"Noun, singular or mass",N,,,T 14 | 13,NNS,"Noun, plural",N,,,V 15 | 14,NNP,"Proper noun, singular",N,,,G 16 | 15,NNPS,"Proper noun, plural",N,,,B 17 | 16,PDT,Predeterminer,D,,, 18 | 17,POS,Possessive ending,O,,, 19 | 18,PRP,Personal pronoun,U,,, 20 | 19,PRP$,Possessive pronoun,U,,, 21 | 20,RB,Adverb,R,,, 22 | 21,RBR,"Adverb, comparative",R,,, 23 | 22,RBS,"Adverb, superlative",R,,, 24 | 23,RP,Particle,O,,, 25 | 24,SYM,Symbol,O,,, 26 | 25,TO,to,T,,, 27 | 26,UH,Interjection,O,,, 28 | 27,VB,"Verb, base form",V,,, 29 | 28,VBD,"Verb, past tense",V,,, 30 | 29,VBG,"Verb, gerund or present participle",G,,, 31 | 30,VBN,"Verb, past participle",B,,, 32 | 31,VBP,"Verb, non-3rd person singular present",V,,, 33 | 32,VBZ,"Verb, 3rd person singular present",V,,, 34 | 33,WDT,Wh-determiner,D,,, 35 | 34,WP,Wh-pronoun,U,,, 36 | 35,WP$,Possessive wh-pronoun,U,,, 37 | 36,WRB,Wh-adverb,R,,, 38 | -------------------------------------------------------------------------------- /data/prefix.txt: -------------------------------------------------------------------------------- 1 | #Prefix 2 | an 3 | anti 4 | apo 5 | auto 6 | bi 7 | bio 8 | bis 9 | circum 10 | co 11 | counter 12 | cryo 13 | de 14 | di 15 | dis 16 | dys 17 | electro 18 | epi 19 | extra 20 | fore 21 | geo 22 | haemo 23 | hemi 24 | hemo 25 | hetero 26 | homo 27 | hydro 28 | hyper 29 | hypo 30 | in 31 | infra 32 | inter 33 | intra 34 | ir 35 | iso 36 | macro 37 | mega 38 | meta 39 | micro 40 | mid 41 | mini 42 | mis 43 | mono 44 | multi 45 | neo 46 | non 47 | ortho 48 | over 49 | pan 50 | para 51 | peri 52 | photo 53 | poly 54 | post 55 | pre 56 | pro 57 | proto 58 | pseudo 59 | pyro 60 | quasi 61 | re 62 | retro 63 | self 64 | semi 65 | socio 66 | sub 67 | super 68 | supra 69 | tele 70 | trans 71 | tri 72 | ultra 73 | un 74 | under 75 | uni -------------------------------------------------------------------------------- /data/stopwords-FOR-clustering.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | across 5 | after 6 | all 7 | almost 8 | also 9 | am 10 | among 11 | an 12 | and 13 | any 14 | are 15 | as 16 | at 17 | be 18 | because 19 | been 20 | but 21 | by 22 | can 23 | cannot 24 | could 25 | dear 26 | did 27 | do 28 | does 29 | either 30 | else 31 | ever 32 | every 33 | for 34 | from 35 | get 36 | got 37 | had 38 | has 39 | have 40 | he 41 | her 42 | hers 43 | him 44 | his 45 | how 46 | however 47 | i 48 | if 49 | in 50 | into 51 | is 52 | it 53 | its 54 | just 55 | #least 56 | let 57 | #like 58 | #likely 59 | may 60 | me 61 | might 62 | #most 63 | must 64 | my 65 | neither 66 | no 67 | nor 68 | not 69 | of 70 | off 71 | often 72 | on 73 | only 74 | or 75 | other 76 | our 77 | own 78 | rather 79 | said 80 | say 81 | says 82 | she 83 | should 84 | since 85 | so 86 | some 87 | than 88 | that 89 | the 90 | their 91 | them 92 | then 93 | there 94 | these 95 | they 96 | this 97 | tis 98 | to 99 | too 100 | twas 101 | us 102 | wants 103 | was 104 | we 105 | were 106 | what 107 | when 108 | where 109 | which 110 | while 111 | who 112 | whom 113 | why 114 | will 115 | with 116 | would 117 | yet 118 | you 119 | 
your 120 | above 121 | afterwards 122 | again 123 | against 124 | alone 125 | along 126 | already 127 | although 128 | always 129 | amongst 130 | amoungst 131 | amount 132 | another 133 | anyhow 134 | anyone 135 | anything 136 | anyway 137 | anywhere 138 | around 139 | back 140 | became 141 | become 142 | becomes 143 | becoming 144 | before 145 | beforehand 146 | behind 147 | being 148 | below 149 | beside 150 | besides 151 | between 152 | #beyond 153 | bill 154 | both 155 | #bottom 156 | call 157 | cant 158 | co 159 | #computer 160 | con 161 | couldnt 162 | cry 163 | de 164 | #describe 165 | #detail 166 | done 167 | down 168 | due 169 | during 170 | each 171 | eg 172 | eight 173 | eleven 174 | elsewhere 175 | empty 176 | enough 177 | etc 178 | even 179 | everyone 180 | everything 181 | everywhere 182 | except 183 | few 184 | fifteen 185 | fify 186 | fill 187 | find 188 | fire 189 | first 190 | five 191 | former 192 | formerly 193 | forty 194 | found 195 | four 196 | front 197 | full 198 | further 199 | give 200 | go 201 | hasnt 202 | hence 203 | here 204 | hereafter 205 | hereby 206 | herein 207 | hereupon 208 | herse 209 | himse 210 | hundred 211 | ie 212 | inc 213 | indeed 214 | interest 215 | itse 216 | keep 217 | last 218 | latter 219 | latterly 220 | less 221 | ltd 222 | made 223 | many 224 | meanwhile 225 | mill 226 | mine 227 | more 228 | moreover 229 | mostly 230 | move 231 | much 232 | myse 233 | name 234 | namely 235 | never 236 | nevertheless 237 | next 238 | nine 239 | #nobody 240 | none 241 | noone 242 | nothing 243 | now 244 | nowhere 245 | once 246 | one 247 | onto 248 | others 249 | otherwise 250 | ours 251 | ourselves 252 | out 253 | over 254 | part 255 | per 256 | perhaps 257 | please 258 | put 259 | re 260 | same 261 | see 262 | seem 263 | seemed 264 | seeming 265 | seems 266 | serious 267 | several 268 | show 269 | side 270 | sincere 271 | six 272 | sixty 273 | somehow 274 | someone 275 | something 276 | sometime 277 | sometimes 278 | somewhere 279 | still 280 | such 281 | system 282 | take 283 | ten 284 | themselves 285 | thence 286 | thereafter 287 | thereby 288 | therefore 289 | therein 290 | thereupon 291 | thick 292 | thin 293 | third 294 | those 295 | though 296 | three 297 | through 298 | throughout 299 | thru 300 | thus 301 | together 302 | top 303 | toward 304 | towards 305 | twelve 306 | twenty 307 | two 308 | un 309 | under 310 | until 311 | up 312 | upon 313 | very 314 | via 315 | well 316 | whatever 317 | whence 318 | whenever 319 | whereafter 320 | whereas 321 | whereby 322 | wherein 323 | whereupon 324 | wherever 325 | whether 326 | whither 327 | whoever 328 | whole 329 | whose 330 | within 331 | without 332 | yours 333 | yourself 334 | yourselves 335 | s 336 | ll 337 | d 338 | im -------------------------------------------------------------------------------- /data/stopwords-empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/stopwords-empty.txt -------------------------------------------------------------------------------- /data/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | across 5 | after 6 | all 7 | almost 8 | also 9 | am 10 | among 11 | an 12 | and 13 | any 14 | are 15 | as 16 | at 17 | be 18 | because 19 | been 20 | but 21 | by 22 | can 23 | cannot 24 | could 25 | dear 26 | did 27 | do 28 | does 29 | either 30 | else 31 | 
ever 32 | every 33 | for 34 | from 35 | get 36 | got 37 | had 38 | has 39 | have 40 | he 41 | her 42 | hers 43 | him 44 | his 45 | how 46 | however 47 | i 48 | if 49 | in 50 | into 51 | is 52 | it 53 | its 54 | just 55 | #least 56 | let 57 | #like 58 | #likely 59 | may 60 | me 61 | might 62 | #most 63 | must 64 | my 65 | neither 66 | no 67 | nor 68 | not 69 | of 70 | off 71 | often 72 | on 73 | only 74 | or 75 | other 76 | our 77 | own 78 | rather 79 | said 80 | say 81 | says 82 | she 83 | should 84 | since 85 | so 86 | some 87 | than 88 | that 89 | the 90 | their 91 | them 92 | then 93 | there 94 | these 95 | they 96 | this 97 | tis 98 | to 99 | too 100 | twas 101 | us 102 | wants 103 | was 104 | we 105 | were 106 | what 107 | when 108 | where 109 | which 110 | while 111 | who 112 | whom 113 | why 114 | will 115 | with 116 | would 117 | yet 118 | you 119 | your 120 | above 121 | afterwards 122 | again 123 | against 124 | alone 125 | along 126 | already 127 | although 128 | always 129 | amongst 130 | amoungst 131 | amount 132 | another 133 | anyhow 134 | anyone 135 | anything 136 | anyway 137 | anywhere 138 | around 139 | #back 140 | became 141 | become 142 | becomes 143 | becoming 144 | before 145 | beforehand 146 | behind 147 | being 148 | below 149 | beside 150 | besides 151 | between 152 | #beyond 153 | #bill 154 | both 155 | #bottom 156 | call 157 | cant 158 | co 159 | #computer 160 | con 161 | couldnt 162 | cry 163 | de 164 | #describe 165 | #detail 166 | done 167 | down 168 | due to 169 | during 170 | each 171 | eg 172 | eight 173 | eleven 174 | elsewhere 175 | #empty 176 | enough 177 | etc 178 | even 179 | everyone 180 | everything 181 | everywhere 182 | except 183 | few 184 | fifteen 185 | fify 186 | fill 187 | find 188 | fire 189 | first 190 | five 191 | former 192 | formerly 193 | forty 194 | #found 195 | four 196 | front 197 | full 198 | further 199 | give 200 | go 201 | hasnt 202 | hence 203 | here 204 | hereafter 205 | hereby 206 | herein 207 | hereupon 208 | herse 209 | himse 210 | hundred 211 | ie 212 | inc 213 | indeed 214 | interest 215 | itse 216 | #keep 217 | last 218 | latter 219 | latterly 220 | less 221 | ltd 222 | made 223 | many 224 | meanwhile 225 | mill 226 | mine 227 | more 228 | moreover 229 | mostly 230 | move 231 | much 232 | myse 233 | #name 234 | namely 235 | never 236 | nevertheless 237 | next 238 | nine 239 | #nobody 240 | none 241 | noone 242 | nothing 243 | now 244 | nowhere 245 | once 246 | one 247 | onto 248 | others 249 | otherwise 250 | ours 251 | ourselves 252 | out 253 | over 254 | part 255 | per 256 | perhaps 257 | please 258 | put 259 | re 260 | same 261 | see 262 | seem 263 | seemed 264 | seeming 265 | seems 266 | #serious 267 | several 268 | show 269 | side 270 | sincere 271 | six 272 | sixty 273 | somehow 274 | someone 275 | something 276 | sometime 277 | sometimes 278 | somewhere 279 | still 280 | such 281 | #system 282 | take 283 | ten 284 | themselves 285 | thence 286 | thereafter 287 | thereby 288 | therefore 289 | therein 290 | thereupon 291 | #thick 292 | thin 293 | third 294 | those 295 | though 296 | three 297 | through 298 | throughout 299 | thru 300 | thus 301 | together 302 | #top 303 | toward 304 | towards 305 | twelve 306 | twenty 307 | two 308 | un 309 | under 310 | until 311 | up 312 | upon 313 | very 314 | via 315 | well 316 | whatever 317 | whence 318 | whenever 319 | whereafter 320 | whereas 321 | whereby 322 | wherein 323 | whereupon 324 | wherever 325 | whether 326 | whither 327 | whoever 328 | whole 329 | whose 330 | 
within 331 | without 332 | yours 333 | yourself 334 | yourselves 335 | s 336 | ll 337 | d 338 | im 339 | 's -------------------------------------------------------------------------------- /data/stopwords_clustering.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | across 5 | after 6 | all 7 | almost 8 | also 9 | am 10 | among 11 | an 12 | and 13 | any 14 | are 15 | as 16 | at 17 | be 18 | because 19 | been 20 | but 21 | by 22 | can 23 | cannot 24 | could 25 | dear 26 | did 27 | do 28 | does 29 | either 30 | else 31 | ever 32 | every 33 | for 34 | from 35 | get 36 | got 37 | had 38 | has 39 | have 40 | he 41 | her 42 | hers 43 | him 44 | his 45 | how 46 | however 47 | i 48 | if 49 | in 50 | into 51 | is 52 | it 53 | its 54 | just 55 | #least 56 | let 57 | #like 58 | #likely 59 | may 60 | me 61 | might 62 | #most 63 | must 64 | my 65 | neither 66 | no 67 | nor 68 | not 69 | of 70 | off 71 | often 72 | on 73 | only 74 | or 75 | other 76 | our 77 | own 78 | rather 79 | said 80 | say 81 | says 82 | she 83 | should 84 | since 85 | so 86 | some 87 | than 88 | that 89 | the 90 | their 91 | them 92 | then 93 | there 94 | these 95 | they 96 | this 97 | tis 98 | to 99 | too 100 | twas 101 | us 102 | wants 103 | was 104 | we 105 | were 106 | what 107 | when 108 | where 109 | which 110 | while 111 | who 112 | whom 113 | why 114 | will 115 | with 116 | would 117 | yet 118 | you 119 | your 120 | above 121 | afterwards 122 | again 123 | against 124 | alone 125 | along 126 | already 127 | although 128 | always 129 | amongst 130 | amoungst 131 | amount 132 | another 133 | anyhow 134 | anyone 135 | anything 136 | anyway 137 | anywhere 138 | around 139 | back 140 | became 141 | become 142 | becomes 143 | becoming 144 | before 145 | beforehand 146 | behind 147 | being 148 | below 149 | beside 150 | besides 151 | between 152 | #beyond 153 | bill 154 | both 155 | #bottom 156 | call 157 | cant 158 | co 159 | #computer 160 | con 161 | couldnt 162 | cry 163 | de 164 | #describe 165 | #detail 166 | done 167 | down 168 | due 169 | during 170 | each 171 | eg 172 | eight 173 | eleven 174 | elsewhere 175 | empty 176 | enough 177 | etc 178 | even 179 | everyone 180 | everything 181 | everywhere 182 | except 183 | few 184 | fifteen 185 | fify 186 | fill 187 | find 188 | fire 189 | first 190 | five 191 | former 192 | formerly 193 | forty 194 | found 195 | four 196 | front 197 | full 198 | further 199 | give 200 | go 201 | hasnt 202 | hence 203 | here 204 | hereafter 205 | hereby 206 | herein 207 | hereupon 208 | herse 209 | himse 210 | hundred 211 | ie 212 | inc 213 | indeed 214 | interest 215 | itse 216 | keep 217 | last 218 | latter 219 | latterly 220 | less 221 | ltd 222 | made 223 | many 224 | meanwhile 225 | mill 226 | mine 227 | more 228 | moreover 229 | mostly 230 | move 231 | much 232 | myse 233 | name 234 | namely 235 | never 236 | nevertheless 237 | next 238 | nine 239 | #nobody 240 | none 241 | noone 242 | nothing 243 | now 244 | nowhere 245 | once 246 | one 247 | onto 248 | others 249 | otherwise 250 | ours 251 | ourselves 252 | out 253 | over 254 | part 255 | per 256 | perhaps 257 | please 258 | put 259 | re 260 | same 261 | see 262 | seem 263 | seemed 264 | seeming 265 | seems 266 | serious 267 | several 268 | show 269 | side 270 | sincere 271 | six 272 | sixty 273 | somehow 274 | someone 275 | something 276 | sometime 277 | sometimes 278 | somewhere 279 | still 280 | such 281 | system 282 | take 283 | ten 284 | themselves 285 | thence 286 | 
thereafter 287 | thereby 288 | therefore 289 | therein 290 | thereupon 291 | thick 292 | thin 293 | third 294 | those 295 | though 296 | three 297 | through 298 | throughout 299 | thru 300 | thus 301 | together 302 | top 303 | toward 304 | towards 305 | twelve 306 | twenty 307 | two 308 | un 309 | under 310 | until 311 | up 312 | upon 313 | very 314 | via 315 | well 316 | whatever 317 | whence 318 | whenever 319 | whereafter 320 | whereas 321 | whereby 322 | wherein 323 | whereupon 324 | wherever 325 | whether 326 | whither 327 | whoever 328 | whole 329 | whose 330 | within 331 | without 332 | yours 333 | yourself 334 | yourselves 335 | s 336 | ll 337 | d 338 | im -------------------------------------------------------------------------------- /data/suffix.txt: -------------------------------------------------------------------------------- 1 | Suffix 2 | able 3 | ably 4 | ad 5 | ade 6 | age 7 | agogy 8 | al 9 | al 10 | ality 11 | an 12 | ance 13 | ancy 14 | ant 15 | ar 16 | ard 17 | ary 18 | arch 19 | archy 20 | arium 21 | asia 22 | ate 23 | athlon 24 | ation 25 | ative 26 | atory 27 | bound 28 | coele 29 | coel 30 | cele 31 | centesis 32 | cephalic 33 | chondrion 34 | cide 35 | city 36 | cy 37 | cycle 38 | dom 39 | ectasis 40 | ectasia 41 | ectomy 42 | ed 43 | ee 44 | eer 45 | eme 46 | emia 47 | en 48 | enchyma 49 | ence 50 | ency 51 | ent 52 | eous 53 | er 54 | ergy 55 | ern 56 | ery 57 | esce 58 | ese 59 | esque 60 | ess 61 | esthesia 62 | esthesis 63 | etic 64 | ette 65 | fare 66 | ful 67 | gate 68 | gnosis 69 | gon 70 | graph 71 | gram 72 | gry 73 | hedron 74 | holic 75 | hood 76 | ia 77 | iable 78 | ial 79 | ian 80 | iant 81 | iary 82 | iasis 83 | iate 84 | ible 85 | ibly 86 | ic 87 | ical 88 | ics 89 | id 90 | iency 91 | ient 92 | ier 93 | fy 94 | ify 95 | ile 96 | illion 97 | ious 98 | ing 99 | ion 100 | ish 101 | ism 102 | ist 103 | ista 104 | ite 105 | itis 106 | itive 107 | itude 108 | ity 109 | ium 110 | ive 111 | isation 112 | ization 113 | ize 114 | ise 115 | izzle 116 | kinesis 117 | less 118 | let 119 | like 120 | ling 121 | ly 122 | man 123 | mancy 124 | mania 125 | ment 126 | meter 127 | metry 128 | mony 129 | morphism 130 | most 131 | ness 132 | nik 133 | ocracy 134 | ogram 135 | ography 136 | oid 137 | ologist 138 | ology 139 | oma 140 | ome 141 | omics 142 | onomy 143 | onym 144 | opsy 145 | or 146 | our 147 | ory 148 | ose 149 | osis 150 | ous 151 | phagia 152 | phagy 153 | philia 154 | phobia 155 | phone 156 | physeal 157 | phyte 158 | polis 159 | #s 160 | science 161 | scope 162 | script 163 | ship 164 | sion 165 | sis 166 | some 167 | stan 168 | ster 169 | eth 170 | #t 171 | th 172 | tion 173 | tome 174 | tom 175 | tropism 176 | ty 177 | uary 178 | ular 179 | ulent 180 | um 181 | uous 182 | ure 183 | us 184 | ville 185 | vorous 186 | vore 187 | wards 188 | ward 189 | ware 190 | ways 191 | wise 192 | wright 193 | #y -------------------------------------------------------------------------------- /data/test.text.txt: -------------------------------------------------------------------------------- 1 | I love Cy, because she is nice. She is so nice. 2 | I miss you, as you know. 
-------------------------------------------------------------------------------- /docs/dependency-package.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/dependency-package.jpg -------------------------------------------------------------------------------- /docs/figurs/conceptual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/conceptual.png -------------------------------------------------------------------------------- /docs/figurs/cui_duration_heatmap3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/cui_duration_heatmap3.png -------------------------------------------------------------------------------- /docs/figurs/evaluation_simiterm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/evaluation_simiterm.png -------------------------------------------------------------------------------- /docs/figurs/figure8_human_review.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/figure8_human_review.png -------------------------------------------------------------------------------- /docs/figurs/sty_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/sty_distribution.png -------------------------------------------------------------------------------- /docs/figurs/work-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/work-flow.png -------------------------------------------------------------------------------- /libs/bin/winutils.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/libs/bin/winutils.exe -------------------------------------------------------------------------------- /libs/metamap-api-2.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/libs/metamap-api-2.0.jar -------------------------------------------------------------------------------- /libs/prologbeans.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/libs/prologbeans.jar -------------------------------------------------------------------------------- /libs/stanford-corenlp.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/libs/stanford-corenlp.jar
--------------------------------------------------------------------------------
/libs/stanfordNlp-models-url.txt:
--------------------------------------------------------------------------------
1 | http://stanfordnlp.github.io/CoreNLP/download.html
2 | Add the downloaded file to the project dependency path.
--------------------------------------------------------------------------------
/py/get_ct.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Jason'
2 |
3 |
4 | import urllib2
5 | from urllib2 import urlopen
6 | from bs4 import BeautifulSoup
7 | import re
8 |
9 |
10 | def visible(element):  # keep only text nodes a reader would actually see on the page
11 |     if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
12 |         return False
13 |     # elif re.match("", str(element)):
14 |     #     return False
15 |     return True
16 |
17 | f = open("C:\\fsu\\ra\\data\\201612\\index_url.txt")
18 | for line in f.readlines():
19 |     if len(line)<10: continue  # skip blank or malformed lines
20 |     (index, url) = line.split('\t',1)
21 |     #url = "https://www.ncbi.nlm.nih.gov/pubmed/20482476"
22 |     # print (url)
23 |     try:
24 |         html = urllib2.urlopen(url).read()
25 |         soup = BeautifulSoup(html, 'html.parser')
26 |         texts = soup.findAll(text=True)
27 |         visible_texts = filter(visible, texts)
28 |         text = filter(lambda x: len(x.strip())>5, visible_texts)
29 |         text2 = " ".join(text)
30 |         match = re.match(".*(\\bNCT\\d{5,15}\\b).*", text2, re.MULTILINE+re.DOTALL+re.UNICODE)  # look for a ClinicalTrials.gov NCT identifier
31 |         if match is not None:
32 |             ct = match.group(1)
33 |             # print (text2.encode('utf-8'))
34 |             print("%s\t%s" % (index, ct))
35 |         #elif None != re.match(".*(\\bclinicaltrials\\.gov\\b).*", text2, re.MULTILINE+re.DOTALL+re.UNICODE+re.IGNORECASE):
36 |         elif None != re.match(".*(\\bclinicaltrials\\b).*", text2, re.MULTILINE+re.DOTALL+re.UNICODE+re.IGNORECASE):
37 |             print("%s\t%s" % (index, "clinicaltrials.gov"))
38 |         else:
39 |             print("%s\t%s" % (index, "None"))
40 |     except Exception:  # record fetch/parse failures and keep going
41 |         print("%s\t%s" % (index, "Error"))
42 |
--------------------------------------------------------------------------------
/py/pre_run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | '''
4 | Prepare the data for the project; for now, just unzip the files.
5 | I haven't found a way to determine the exact type of the files, but I know they can be decompressed by gzip.
6 | So I call the shell command "gzip -d -f" to do it. 7 | ''' 8 | import os 9 | import gzip 10 | 11 | def walk_dir(dir,topdown=True): 12 | for root, dirs, files in os.walk(dir, topdown): 13 | for name in files: 14 | print name 15 | if not name.endswith(".Z"): continue 16 | print("unzip file: " + os.path.join(root,name)) 17 | os.system("gzip -d -f " + os.path.join(root,name)) 18 | 19 | 20 | walk_dir("./data") 21 | 22 | 23 | -------------------------------------------------------------------------------- /py/preprocess_index.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Jason' 2 | 3 | 4 | import csv 5 | import sys,os,re 6 | 7 | 8 | with open(r'C:\fsu\ra\data\201708\Copy of Botanical_with_dsld_cat_termlist.csv', 'w+') as output: 9 | with open(r'C:\fsu\ra\data\201708\Copy of Botanical_with_dsld_cat.csv', 'rb') as csvfile: 10 | spamreader = csv.reader(csvfile, delimiter=',', quotechar='"') 11 | head = True 12 | aui = 0 13 | for row in spamreader: 14 | '''column: id, name, Scientific Name, category_DSLD''' 15 | if head: 16 | head = False 17 | continue 18 | terms = str(row[2]) 19 | terms = re.sub(r'\.\s*Family:',', ',terms) 20 | terms = terms.replace('/',', ') 21 | terms = terms.replace(';',', ') 22 | terms = terms.replace('synonyms','') 23 | terms = terms.replace('synonym','') 24 | 25 | res_list = [] 26 | terms_list = terms.split(', ') 27 | for term in terms_list: 28 | term = term.strip() 29 | term = term.strip('.,?!"\'\r') 30 | # extract (*) 31 | match = re.match(r'(.+?)\((.+?)\)(.*?)',term) 32 | if match == None: 33 | print(term) 34 | res_list.append(term) 35 | else: 36 | res_list.append(match.group(1).strip() + match.group(3).strip()) 37 | res_list.append(match.group(2).strip()) 38 | print(term, match.group(1)+match.group(3), match.group(2)) 39 | # print('\t'.join(res_list)) 40 | 41 | cui = row[0] 42 | sab = 'unknown' 43 | for term in res_list: 44 | aui += 1 45 | preStr = term 46 | output.write('\t'.join([cui,str(aui),sab,term,preStr]) + '\n') 47 | 48 | 49 | -------------------------------------------------------------------------------- /py/xsl2csv.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Jason' 2 | 3 | from openpyxl import load_workbook 4 | wb = load_workbook("C:\\Users\\Jason\\Desktop\\alldeaf_Health_08_20_2015.xlsx") 5 | 6 | ws = wb['Health'] 7 | 8 | print (ws['A2']) 9 | 10 | -------------------------------------------------------------------------------- /r/.Rhistory: -------------------------------------------------------------------------------- 1 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation.R') 2 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation.R') 3 | source('C:/fsu/ra/UmlsTagger/r/review-order-ranking.R') 4 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation-200to300.R') 5 | source('C:/fsu/ra/UmlsTagger/r/evaluation.R') 6 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation.R') 7 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation.R') 8 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation.R') 9 | source('C:/fsu/ra/UmlsTagger/r/pattern-heatmap.R') 10 | source('C:/fsu/ra/UmlsTagger/r/pattern-heatmap.R') 11 | source('C:/fsu/ra/UmlsTagger/r/pattern-heatmap.R') 12 | source('C:/fsu/ra/UmlsTagger/r/pattern-heatmap.R') 13 | -------------------------------------------------------------------------------- /r/README.md: -------------------------------------------------------------------------------- 1 | ## R code 2 | 3 | Most of the R code here is used to draw the figures for our 
publications.
4 |
5 | ### Directory `data`
6 | Stores the data used by the R scripts below.
7 |
8 | ### cross-evaluation.R
9 | Used to draw the precision/recall/f-score figures for the CHV paper, for the semiTerm features.
10 |
11 | ### cross-evaluation-bow.R
12 | Used to draw the precision/recall/f-score figures for the CHV paper, for the bag-of-words features.
13 |
14 | ### cross-evaluation-200to300.R
15 | Used to draw the precision/recall/f-score figures for the CHV paper, for the semiTerm features with the K-means parameter from 200 to 300.
16 | We do this because at the beginning we only processed parameter values 5 to 200.
17 |
18 | ### pattern-heatmap.R
19 | Used to draw the heat map figure for the CHV paper, and also for the BIBM 2016 paper.
20 |
21 | ### pca-draw.R
22 | Used to draw the 3-D scatter figure for the CHV paper, for the semiTerm features.
23 |
24 | ### review_result.R and review-order-ranking.R
25 | Used to draw the human review figure for the CHV paper.
26 |
27 | ### silhouette.R
28 | Draws the silhouette figure.
29 |
30 | ### ngram-distribution.R
31 | Draws the n-gram distribution figure.
32 |
33 | ### classify-rpart.R
34 | Classifies n-grams using rpart.
35 |
--------------------------------------------------------------------------------
/r/RVisualisation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/r/RVisualisation.pdf
--------------------------------------------------------------------------------
/r/classify-rpart.R:
--------------------------------------------------------------------------------
1 | library(rpart)
2 |
3 | #ngrams=read.table("C:\\fsu\\ra\\data\\ngram_vectors_all_0129.txt")
4 | ngrams=read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\ngram_vectors_all_0227.txt")
5 |
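# Each row of ngram_vectors_all_*.txt is the feature vector of one candidate
# n-gram. Judging from the column names assigned below, the 313 features are:
# frequency statistics (tfdf/tf/df/c-value), UMLS/CHV match scores and flags,
# POS-pattern flags (nn/an/pn/anpn), semantic-type indicators (stys),
# capitalization flags, window- and sentence-level UMLS/CHV context counts and
# distances, and prefix/suffix indicator features. Names encode "feature-index".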
colnames(ngrams)=c("tfdf-1","tf-2","df-3","cvalue-4","umls_score-5","chv_score-6","contain_umls-7","contain_chv-8","nn-9","an-10","pn-11","anpn-12","stys-13","stys-14","stys-15","stys-16","stys-17","stys-18","stys-19","stys-20","stys-21","stys-22","stys-23","stys-24","win_pos-25","win_pos-26","win_pos-27","win_pos-28","win_pos-29","win_pos-30","win_pos-31","win_pos-32","win_pos-33","win_pos-34","win_pos-35","win_pos-36","win_pos-37","capt_first-38","capt_all-39","capt_term-40","win_umls-41","win_chv-42","sent_umls-43","sent_chv-44","umls_dist-45","chv_dist-46","prefix-47","prefix-48","prefix-49","prefix-50","prefix-51","prefix-52","prefix-53","prefix-54","prefix-55","prefix-56","prefix-57","prefix-58","prefix-59","prefix-60","prefix-61","prefix-62","prefix-63","prefix-64","prefix-65","prefix-66","prefix-67","prefix-68","prefix-69","prefix-70","prefix-71","prefix-72","prefix-73","prefix-74","prefix-75","prefix-76","prefix-77","prefix-78","prefix-79","prefix-80","prefix-81","prefix-82","prefix-83","prefix-84","prefix-85","prefix-86","prefix-87","prefix-88","prefix-89","prefix-90","prefix-91","prefix-92","prefix-93","prefix-94","prefix-95","prefix-96","prefix-97","prefix-98","prefix-99","prefix-100","prefix-101","prefix-102","prefix-103","prefix-104","prefix-105","prefix-106","prefix-107","prefix-108","prefix-109","prefix-110","prefix-111","prefix-112","prefix-113","prefix-114","prefix-115","prefix-116","prefix-117","prefix-118","prefix-119","prefix-120","suffix-121","suffix-122","suffix-123","suffix-124","suffix-125","suffix-126","suffix-127","suffix-128","suffix-129","suffix-130","suffix-131","suffix-132","suffix-133","suffix-134","suffix-135","suffix-136","suffix-137","suffix-138","suffix-139","suffix-140","suffix-141","suffix-142","suffix-143","suffix-144","suffix-145","suffix-146","suffix-147","suffix-148","suffix-149","suffix-150","suffix-151","suffix-152","suffix-153","suffix-154","suffix-155","suffix-156","suffix-157","suffix-158","suffix-159","suffix-160","suffix-161","suffix-162","suffix-163","suffix-164","suffix-165","suffix-166","suffix-167","suffix-168","suffix-169","suffix-170","suffix-171","suffix-172","suffix-173","suffix-174","suffix-175","suffix-176","suffix-177","suffix-178","suffix-179","suffix-180","suffix-181","suffix-182","suffix-183","suffix-184","suffix-185","suffix-186","suffix-187","suffix-188","suffix-189","suffix-190","suffix-191","suffix-192","suffix-193","suffix-194","suffix-195","suffix-196","suffix-197","suffix-198","suffix-199","suffix-200","suffix-201","suffix-202","suffix-203","suffix-204","suffix-205","suffix-206","suffix-207","suffix-208","suffix-209","suffix-210","suffix-211","suffix-212","suffix-213","suffix-214","suffix-215","suffix-216","suffix-217","suffix-218","suffix-219","suffix-220","suffix-221","suffix-222","suffix-223","suffix-224","suffix-225","suffix-226","suffix-227","suffix-228","suffix-229","suffix-230","suffix-231","suffix-232","suffix-233","suffix-234","suffix-235","suffix-236","suffix-237","suffix-238","suffix-239","suffix-240","suffix-241","suffix-242","suffix-243","suffix-244","suffix-245","suffix-246","suffix-247","suffix-248","suffix-249","suffix-250","suffix-251","suffix-252","suffix-253","suffix-254","suffix-255","suffix-256","suffix-257","suffix-258","suffix-259","suffix-260","suffix-261","suffix-262","suffix-263","suffix-264","suffix-265","suffix-266","suffix-267","suffix-268","suffix-269","suffix-270","suffix-271","suffix-272","suffix-273","suffix-274","suffix-275","suffix-276","suffix-277","suffix-278","suffix-279","suffix-280"
,"suffix-281","suffix-282","suffix-283","suffix-284","suffix-285","suffix-286","suffix-287","suffix-288","suffix-289","suffix-290","suffix-291","suffix-292","suffix-293","suffix-294","suffix-295","suffix-296","suffix-297","suffix-298","suffix-299","suffix-300","suffix-301","suffix-302","suffix-303","suffix-304","suffix-305","suffix-306","suffix-307","suffix-308","suffix-309","suffix-310","suffix-311","suffix-312","suffix-313") 6 | v=ngrams 7 | v[,"chv_score-6"]=v[,"chv_score-6"]>0.3 8 | 9 | setwd("C:\\fsu\\ra\\UmlsTagger\\r\\data") 10 | tree <- rpart(`chv_score-6`~.,v,method="class", control=rpart.control(maxdepth = 30, minsplit = 5, minbucket = 2, cp = 0.0001)) 11 | summary(tree,file="rpart.summary") 12 | 13 | 14 | xmat=xpred.rpart(tree, xval = 10, return.all = FALSE) 15 | xerr=(xmat-1-v$`chv_score-6`)^2 16 | e2=apply(xerr, 2, sum)/nrow(v) # cross-validated error estimate 17 | e3=apply(xerr, 2, sum) 18 | printcp(tree) 19 | 20 | -------------------------------------------------------------------------------- /r/cross-evaluation-200to300.R: -------------------------------------------------------------------------------- 1 | library('base') 2 | 3 | # tf > 100, filter cluster < 3 4 | data = read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cross-evaluation-tf100-200to300.txt",sep='\t') 5 | rd_pc=25.5 6 | 7 | 8 | data.avg=aggregate(data[,1:ncol(data)], list(data[,1]),mean) 9 | 10 | # ev=read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\evaluation.txt", sep = '\t') 11 | ev=data.avg[order(data.avg$Group.1),] 12 | 13 | write.table(data.avg[,2:ncol(data.avg)], "C:\\fsu\\ra\\UmlsTagger\\r\\data\\tmp.txt", sep='\t',row.names = FALSE,col.names = FALSE) 14 | x = matrix(seq(5,200,5),ncol=1) * (100/100) 15 | 16 | # random baseline data 17 | y_rd_pc = rep(rd_pc,40) 18 | dim(y_rd_pc)=c(40,1) 19 | y_rd_rc = seq(5,200,5)*rd_pc/100 20 | dim(y_rd_rc)=c(40,1) 21 | y_rd_fs=(1+0.5^2)*(y_rd_pc*y_rd_rc)/((0.5^2*y_rd_pc+y_rd_rc))/100 22 | 23 | 24 | #precision 25 | startcol=11+40 26 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 27 | y = cbind(y,y_rd_pc) 28 | matplot(x,y,type=c('l'), 29 | #pch=c(1,2,3), 30 | lwd=1, 31 | lty=1, 32 | #add=TRUE, 33 | col=gray.colors(nrow(ev),0.9,0), 34 | xlab="top-N percent", ylab="precision (%)") 35 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 36 | y2 = cbind(y2,y_rd_pc) 37 | matplot(x,y2,type=c('o'), 38 | pch=c(1,5,6), 39 | lwd=1, 40 | lty=1, 41 | lend=3, 42 | add=TRUE, 43 | col=rainbow(3,start=1)) 44 | 45 | 46 | 47 | legend("topright",legend = c("tf", "c-value", "random", "k=5", "k=300"), 48 | col=c(rainbow(3,start=1), 49 | gray.colors(2,0.9,0)), 50 | pch=c(1,5,6,16,16)) # optiona 51 | 52 | 53 | #recall 54 | startcol=11+00 55 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 56 | y = cbind(y,y_rd_rc) 57 | #View(y) 58 | matplot(x,y,type=c('l'), 59 | #pch=c(1,2,3), 60 | lwd=1, 61 | lty=1, 62 | #add=TRUE, 63 | col=gray.colors(nrow(ev),0.9,0), 64 | xlab="top-N percent", ylab="recall (%)") 65 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 66 | y2 = cbind(y2,y_rd_rc) 67 | matplot(x,y2,type=c('o'), 68 | pch=c(1,5,6), 69 | lwd=1, 70 | lty=1, 71 | lend=3, 72 | add=TRUE, 73 | col=rainbow(3,start=1)) 74 | legend("topleft",legend = c("tf", "c-value", "random", "k=5", "k=300"), 75 | col=c(rainbow(3,start=1), 76 | gray.colors(2,0.9,0)), 77 | pch=c(1,5,6,16,16)) # optiona 78 | 79 | 80 | #f-score 81 | startcol=11+80 82 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 83 | y = cbind(y,y_rd_fs) 84 | #View(y) 85 | matplot(x,y,type=c('l'), 86 | #pch=c(1,2,3), 87 | lwd=1, 88 | lty=1, 89 | #add=TRUE, 90 | 
col=gray.colors(nrow(ev),0.9,0), 91 | xlab="top-N percent", ylab="f-score") 92 | 93 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 94 | y2 = cbind(y2,y_rd_fs) 95 | matplot(x,y2,type=c('o'), 96 | pch=c(1,5,6), 97 | lwd=1, 98 | lty=1, 99 | lend=3, 100 | add=TRUE, 101 | col=rainbow(3,start=1)) 102 | 103 | legend("topright",legend = c("tf", "c-value","random", "k=5", "k=300"), 104 | col=c(rainbow(3,start=1), 105 | gray.colors(3,0.9,0)), 106 | pch=c(1,5,6,16,16)) # optiona 107 | -------------------------------------------------------------------------------- /r/cross-evaluation-bow.R: -------------------------------------------------------------------------------- 1 | library('base') 2 | 3 | #tf > 100, filter cluster < 3 4 | data = read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cross-evaluation-bow.txt",sep='\t') 5 | rd_pc=25.5 6 | cnt=c(3541,1895) #(#_ngram_in_test, #_chv_in_test) 7 | tf=100 8 | 9 | 10 | data.avg=aggregate(data[,1:ncol(data)], list(data[,1]),mean) 11 | 12 | ev=data.avg[order(data.avg$Group.1),] 13 | 14 | write.table(data.avg[,2:ncol(data.avg)], "C:\\fsu\\ra\\UmlsTagger\\r\\data\\tmp.txt", sep='\t',row.names = FALSE,col.names = FALSE) 15 | x = matrix(seq(5,200,5),ncol=1) /100 * cnt[2] 16 | 17 | 18 | # random baseline data 19 | y_rd_pc = rep(rd_pc,40) 20 | dim(y_rd_pc)=c(40,1) 21 | y_rd_rc = seq(5,200,5)*rd_pc/100 22 | dim(y_rd_rc)=c(40,1) 23 | y_rd_fs=(1+0.5^2)*(y_rd_pc*y_rd_rc)/((0.5^2*y_rd_pc+y_rd_rc))/100 24 | 25 | 26 | #precision 27 | startcol=11+40 28 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 29 | y = cbind(y,y_rd_pc) 30 | matplot(x,y,type=c('l'), 31 | #pch=c(1,2,3), 32 | lwd=1, 33 | lty=1, 34 | #add=TRUE, 35 | col=gray.colors(nrow(ev),0.9,0), 36 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="precision (%)") 37 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 38 | y2 = cbind(y2,y_rd_pc) 39 | matplot(x,y2,type=c('o'), 40 | pch=c(1,5,6), 41 | lwd=1, 42 | lty=1, 43 | lend=3, 44 | add=TRUE, 45 | col=rainbow(3,start=1)) 46 | 47 | legend("topright",legend = c("tf", "c-value", "random", "BOW (k=5)", "BOW (k=300)"), 48 | col=c(rainbow(3,start=1), 49 | gray.colors(2,0.9,0)), 50 | pch=c(1,5,6,16,16)) # optiona 51 | 52 | 53 | #recall 54 | startcol=11+00 55 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 56 | y = cbind(y,y_rd_rc) 57 | #View(y) 58 | matplot(x,y,type=c('l'), 59 | #pch=c(1,2,3), 60 | lwd=1, 61 | lty=1, 62 | #add=TRUE, 63 | col=gray.colors(nrow(ev),0.9,0), 64 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="recall (%)") 65 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 66 | y2 = cbind(y2,y_rd_rc) 67 | matplot(x,y2,type=c('o'), 68 | pch=c(1,5,6), 69 | lwd=1, 70 | lty=1, 71 | lend=3, 72 | add=TRUE, 73 | col=rainbow(3,start=1)) 74 | legend("topleft",legend = c("tf", "c-value", "random", "BOW (k=5)", "BOW (k=300)"), 75 | col=c(rainbow(3,start=1), 76 | gray.colors(2,0.9,0)), 77 | pch=c(1,5,6,16,16)) # optiona 78 | 79 | 80 | #f-score 81 | startcol=11+80 82 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 83 | y = cbind(y,y_rd_fs) 84 | #View(y) 85 | matplot(x,y,type=c('l'), 86 | #pch=c(1,2,3), 87 | lwd=1, 88 | lty=1, 89 | #add=TRUE, 90 | col=gray.colors(nrow(ev),0.9,0), 91 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="f-score") 92 | 93 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 94 | y2 = cbind(y2,y_rd_fs) 95 | matplot(x,y2,type=c('o'), 96 | pch=c(1,5,6), 97 | lwd=1, 98 | lty=1, 99 | lend=3, 100 | add=TRUE, 101 | col=rainbow(3,start=1)) 102 | 103 | legend("bottomright",legend = c("tf", "c-value","random", "BOW (k=5)", "BOW (k=300)"), 104 | 
col=c(rainbow(3,start=1), 105 | gray.colors(2,0.9,0)), 106 | pch=c(1,5,6,16,16)) # optiona 107 | -------------------------------------------------------------------------------- /r/cross-evaluation.R: -------------------------------------------------------------------------------- 1 | library('base') 2 | 3 | # tf > 100, filter cluster < 3 4 | # data = read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cross-evaluation-tf100.txt",sep='\t') 5 | # rd_pc=25.5 6 | # cnt=c(3541,1895) #(#_ngram_in_test, #_chv_in_test) 7 | # tf=100 8 | 9 | # 10 | # # tf > 100, filter cluster < 3 -- cancer data 11 | data = read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cross-evaluation-tf100-cancer.txt",sep='\t') 12 | rd_pc=25.5 13 | cnt=c(4344,2280) 14 | tf=100 15 | 16 | 17 | #tf > 5 18 | # data = read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cross-evaluation-tf5.txt",sep='\t') 19 | # rd_pc=9.78 20 | # cnt=c(7374,2212) 21 | # tf=5 22 | 23 | data.avg=aggregate(data[,1:ncol(data)], list(data[,1]),mean) 24 | 25 | # ev=read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\evaluation.txt", sep = '\t') 26 | ev=data.avg[order(data.avg$Group.1),] 27 | 28 | write.table(data.avg[,2:ncol(data.avg)], "C:\\fsu\\ra\\UmlsTagger\\r\\data\\tmp.txt", sep='\t',row.names = FALSE,col.names = FALSE) 29 | x = matrix(seq(5,200,5),ncol=1) /100 * cnt[2] 30 | 31 | 32 | # random baseline data 33 | y_rd_pc = rep(rd_pc,40) 34 | dim(y_rd_pc)=c(40,1) 35 | y_rd_rc = seq(5,200,5)*rd_pc/100 36 | dim(y_rd_rc)=c(40,1) 37 | y_rd_fs=(1+0.5^2)*(y_rd_pc*y_rd_rc)/((0.5^2*y_rd_pc+y_rd_rc))/100 38 | 39 | 40 | #precision 41 | startcol=11+40 42 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 43 | y = cbind(y,y_rd_pc) 44 | matplot(x,y,type=c('l'), 45 | #pch=c(1,2,3), 46 | lwd=1, 47 | lty=1, 48 | #add=TRUE, 49 | col=gray.colors(nrow(ev),0.9,0), 50 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="precision (%)") 51 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 52 | y2 = cbind(y2,y_rd_pc) 53 | matplot(x,y2,type=c('o'), 54 | pch=c(1,5,6), 55 | lwd=1, 56 | lty=1, 57 | lend=3, 58 | add=TRUE, 59 | col=rainbow(3,start=1)) 60 | 61 | legend("topright",legend = c("tf", "c-value", "random", "simiTerm (k=5)", "simiTerm (k=300)"), 62 | col=c(rainbow(3,start=1), 63 | gray.colors(2,0.9,0)), 64 | pch=c(1,5,6,16,16)) # optiona 65 | 66 | 67 | #recall 68 | startcol=11+00 69 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 70 | y = cbind(y,y_rd_rc) 71 | #View(y) 72 | matplot(x,y,type=c('l'), 73 | #pch=c(1,2,3), 74 | lwd=1, 75 | lty=1, 76 | #add=TRUE, 77 | col=gray.colors(nrow(ev),0.9,0), 78 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="recall (%)") 79 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 80 | y2 = cbind(y2,y_rd_rc) 81 | matplot(x,y2,type=c('o'), 82 | pch=c(1,5,6), 83 | lwd=1, 84 | lty=1, 85 | lend=3, 86 | add=TRUE, 87 | col=rainbow(3,start=1)) 88 | legend("topleft",legend = c("tf", "c-value", "random", "simiTerm (k=5)", "simiTerm (k=300)"), 89 | col=c(rainbow(3,start=1), 90 | gray.colors(2,0.9,0)), 91 | pch=c(1,5,6,16,16)) # optiona 92 | 93 | 94 | #f-score 95 | startcol=11+80 96 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 97 | y = cbind(y,y_rd_fs) 98 | #View(y) 99 | matplot(x,y,type=c('l'), 100 | #pch=c(1,2,3), 101 | lwd=1, 102 | lty=1, 103 | #add=TRUE, 104 | col=gray.colors(nrow(ev),0.9,0), 105 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="f-score") 106 | 107 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 108 | y2 = cbind(y2,y_rd_fs) 109 | matplot(x,y2,type=c('o'), 110 | pch=c(1,5,6), 111 | lwd=1, 112 | lty=1, 113 | lend=3, 114 | add=TRUE, 
115 | col=rainbow(3,start=1)) 116 | 117 | legend("topright",legend = c("tf", "c-value","random", "simiTerm (k=5)", "simiTerm (k=300)"), 118 | col=c(rainbow(3,start=1), 119 | gray.colors(2,0.9,0)), 120 | pch=c(1,5,6,16,16)) # optiona 121 | -------------------------------------------------------------------------------- /r/data/human_review.txt: -------------------------------------------------------------------------------- 1 | 73 47 44 73 88 2 | 1 0 1 1 2 3 | 26 53 55 26 10 4 | -------------------------------------------------------------------------------- /r/ngram-distribution.R: -------------------------------------------------------------------------------- 1 | org =read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\ngram_yahoo_tf5.txt",header=TRUE,sep='\t') 2 | x=seq(1,nrow(org)) 3 | gram1=subset(org,org['n']==1) 4 | gram2=subset(org,org['n']==2) 5 | gram3=subset(org,org['n']==3) 6 | gram4=subset(org,org['n']==4) 7 | gram5=subset(org,org['n']==5) 8 | gram6=subset(org,org['type']=="chv") 9 | gram7=subset(org,org['type']=="umls") 10 | gram8=subset(org,org['type']=="others") 11 | 12 | n=1 13 | matplot(seq(1,nrow(gram1)),log(gram1[,'tf']),type='l',pch=n,col=n, xlab="Index of ranked n-grams", ylab="log(term frequency)") 14 | matpoints(seq(1,nrow(gram1))[seq(1,nrow(gram1),500)],log(gram1[,'tf'])[seq(1,nrow(gram1),500)],pch=n,col=n) 15 | 16 | n=2 17 | matplot(seq(1,nrow(gram2)),log(gram2[,'tf']),type='l',pch=n,col=rainbow(5,start=0.2), add=TRUE) 18 | matpoints(seq(1,nrow(gram2))[seq(1,nrow(gram2),500)],log(gram2[,'tf'])[seq(1,nrow(gram2),500)],pch=n,col=n) 19 | 20 | n=3 21 | matplot(seq(1,nrow(gram3)),log(gram3[,'tf']),type='l',pch=n,col=rainbow(5,start=0.3), add=TRUE) 22 | matpoints(seq(1,nrow(gram3))[seq(1,nrow(gram3),500)],log(gram3[,'tf'])[seq(1,nrow(gram3),500)],pch=n,col=n) 23 | 24 | n=4 25 | matplot(seq(1,nrow(gram4)),log(gram4[,'tf']),type='l',pch=n,col=rainbow(5,start=0.4), add=TRUE) 26 | matpoints(seq(1,nrow(gram4))[seq(1,nrow(gram4),500)],log(gram4[,'tf'])[seq(1,nrow(gram4),500)],pch=n,col=n) 27 | 28 | n=5 29 | matplot(seq(1,nrow(gram5)),log(gram5[,'tf']),type='l',pch=n,col=rainbow(5,start=0.5), add=TRUE) 30 | matpoints(seq(1,nrow(gram5))[seq(1,nrow(gram5),500)],log(gram5[,'tf'])[seq(1,nrow(gram5),500)],pch=n,col=n) 31 | 32 | legend("topright",legend = c("1-gram","2-gram","3-gram","4-gram","5-gram"), col=1:n, pch=1:n) # optiona 33 | 34 | 35 | 36 | n=6 37 | matplot(seq(1,nrow(gram6)),log(gram6[,'tf']),type='l',pch=n,col=rainbow(n,start=0.1*n), xlab="Index of ranked n-grams", ylab="log(term frequency)") 38 | matpoints(seq(1,nrow(gram6))[seq(1,nrow(gram6),500)],log(gram6[,'tf'])[seq(1,nrow(gram6),500)],pch=n,col=n) 39 | 40 | n=7 41 | matplot(seq(1,nrow(gram7)),log(gram7[,'tf']),type='l',pch=n,col=rainbow(n,start=0.1*n), add=TRUE) 42 | matpoints(seq(1,nrow(gram7))[seq(1,nrow(gram7),500)],log(gram7[,'tf'])[seq(1,nrow(gram7),500)],pch=n,col=n) 43 | 44 | n=8 45 | matplot(seq(1,nrow(gram8)),log(gram8[,'tf']),type='l',pch=n,col=rainbow(n,start=0.1*n), add=TRUE) 46 | matpoints(seq(1,nrow(gram8))[seq(1,nrow(gram8),500)],log(gram8[,'tf'])[seq(1,nrow(gram8),500)],pch=n,col=n) 47 | 48 | legend("topright",legend = c("CHV terms","UMLS w/o CHV terms","other terms"), col=6:n, pch=6:n) # optiona 49 | 50 | -------------------------------------------------------------------------------- /r/pattern-heatmap.R: -------------------------------------------------------------------------------- 1 | ######################################################### 2 | ### A) Installing and loading required packages 3 | 
#########################################################
4 |
5 | if (!require("gplots")) {
6 | install.packages("gplots", dependencies = TRUE)
7 | library(gplots)
8 | }
9 | if (!require("RColorBrewer")) {
10 | install.packages("RColorBrewer", dependencies = TRUE)
11 | library(RColorBrewer)
12 | }
13 | if (!require("d3heatmap")) {
14 | install.packages("d3heatmap", dependencies = TRUE)
15 | library(d3heatmap)
16 | }
17 |
18 |
19 |
20 | df <- read.csv("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cui-duration-freq.csv", sep=",",colClasses = "character")
21 | #df <- read.csv("C:\\fsu\\ra\\data\\201601\\split_criteria\\cui-duration-freq.csv", sep=",",colClasses = "character")
22 |
23 |
24 |
25 | cuis <- unique(df[,"cui"]) # already sorted by the sql
26 | durs <- sort(unique(df[,"month"]))
27 |
28 |
29 | topN = 50
30 | # (the original statements here were lost in extraction; the block below is a
31 | # reconstruction, assuming the csv has a frequency column named "freq")
32 | if (length(cuis) > topN) {
33 |   cuis <- cuis[1:topN]   # keep only the top-N CUIs
34 | } else {
35 |   topN = length(cuis)
36 | }
37 | mat <- matrix(0, nrow=length(cuis), ncol=length(durs), dimnames=list(cuis,durs))
38 | cuiStr <- rep("", length(cuis))
39 | for (r in 1:nrow(df)) {
40 |   cui = df[r,"cui"]
41 |   dur = df[r,"month"]
42 |   if (cui %in% cuis) {
43 |     mat[cui,dur] = mat[cui,dur] + as.integer(df[r,"freq"])
44 |     if (nchar(sprintf("%s(%s)",df[r,"cui_str"],df[r,"sty"])) > nchar(cuiStr[match(cui,cuis)])) {
45 | cuiStr[match(cui,cuis)] = sprintf("%s(%s)",df[r,"cui_str"],df[r,"sty"])
46 | }
47 | print(c(cui,dur,mat[cui,dur]))
48 | maxRow = r
49 | }
50 | }
51 |
52 | print(mat)
53 |
54 | # only keep the columns that contain a non-zero value
55 | print(dim(mat))
56 | print(colSums(mat))
57 | colFilter = colSums(mat)!=0
58 | mat <- mat[,colFilter]
59 | mat <- matrix(mat,nrow=topN,ncol=sum(colFilter))
60 | print(dim(mat))
61 |
62 | rownames(mat) <- cuiStr
63 | colnames(mat) <- as.character(sort(as.integer(durs)))[colFilter]
64 | # create our own color palette: grey for zero, then green through yellow and orange to red
65 | my_palette <- c(colorRampPalette(c("grey90"),1)(n = 1),
66 | colorRampPalette(c("light green", "yellow", "orange", "red"),0.2)(n = 99) )
67 |
68 | # (optional) defines the color breaks manually for a "skewed" color transition
69 | col_breaks = c(seq(0,0.1,length=1), # for grey
70 | seq(0.2,30,length=40), # for green
71 | seq(30.1,150,length=30), # for yellow
72 | seq(150.1,max(mat)+1,length=30)) # for red
73 |
74 | # creates a 5 x 5 inch image
75 | png("C:\\Users\\Jason\\Desktop\\cui_duration_heatmap.png",
76 | width = 5*400, # 5 x 400 pixels
77 | height = 5*400,
78 | res = 400, # 400 pixels per inch
79 | pointsize = 6) # smaller font size
80 |
81 |
82 | labels <- as.character(mat)
83 | labels[mat==0] = ""
84 | dim(labels) = dim(mat)
85 | heatmap.2(mat,
86 | cellnote = labels, # same data set for cell labels
87 | #main = "duration vs frequency for top N CUI", # heat map title
88 | notecol="black", # change font color of cell labels to black
89 | density.info="histogram", # draw a histogram inside the color legend
90 | key.par=list(mar=c(3.5,1,3,1)),
91 | key.title = "frequency to color mapping",
92 | key.xlab = "",
93 | key.ylab = "",
94 | #labCol = colnames(mat),
95 | #labRow = seq(nrow(mat)),#rownames(mat),
96 | xlab = "Number of months",
97 | cexRow = 1.4,
98 | cexCol = 6,
99 | srtRow = -23,
100 | trace="both", # draw trace lines inside the heat map
101 | tracecol="ghostwhite",
102 | margins =c(3.5,6), # widens margins around plot, col and row
103 | col=my_palette, # use our color palette defined earlier
104 | breaks=col_breaks, # enable color transition at specified limits
105 | dendrogram="none", # draw no dendrograms
106 | Colv="NA", # turn off column clustering
107 | lmat=rbind(c(5, 4, 2), c(6, 1, 3)),
108 | lhei=c(1.5, 9),
109 | lwid=c(0.01, 10,1.9)
110 | )
111 | #mtext("Number of months",side=1,line=3)
112 |
113 | #nba_heatmap <- heatmap(mat, Rowv=NA, Colv=NA, col = cm.colors(256), scale="column")
114 |
115 |
116 |
117 | #
118 | # library(d3heatmap)
119 | # url <- "http://datasets.flowingdata.com/ppg2008.csv"
120 | # nba_players <- read.csv(url, row.names =
1) 121 | # d3heatmap(nba_players, scale = "column",dendrogram = "none",color = "Blues") 122 | # 123 | # 124 | # install.packages("heatmaply") 125 | # library(heatmaply) 126 | # heatmaply(mtcars, k_col = 2, k_row = 3) %>% layout(margin = list(l = 130, b = 40)) 127 | 128 | 129 | dev.off() 130 | -------------------------------------------------------------------------------- /r/pca-draw.R: -------------------------------------------------------------------------------- 1 | library(rgl) 2 | 3 | ngramspca=read.table("C:\\fsu\\ra\\data\\pca.txt") 4 | plot(ngramspca[,1:3]) 5 | plot3d(ngramspca[,1:3]) 6 | apply(ngramspca[,1:10], 2, mean) 7 | apply(ngramspca[,1:10], 2, sd) 8 | 9 | 10 | ngramall=read.table("C:\\fsu\\ra\\data\\ngram_vectors_all_0227.txt") 11 | tmp=subset(ngramall,ngramall[,6]>0.3) 12 | #tmp=ngramall 13 | tmp[,6]=0 14 | p=prcomp(tmp,scale. = FALSE) 15 | plot(p) 16 | plot3d(p$x[,1:3]) 17 | plot(p$x[,1:3]) 18 | apply(p$x[,1:10], 2, mean) 19 | apply(p$x[,1:10], 2, sd) 20 | -------------------------------------------------------------------------------- /r/review-order-ranking.R: -------------------------------------------------------------------------------- 1 | 2 | #data =read.table("C:\\fsu\\ra\\data\\rank-review-ranking.csv",header=TRUE,sep=',') 3 | data =read.table("C:\\fsu\\ra\\data\\rank-review-ranking-30sample.csv",header=TRUE,sep=',') 4 | 5 | ds=data[order(-data[,"kTfAvg"],data[,"cost"]),] 6 | ds2 = cbind(ds,seq(1,nrow(ds),1)/nrow(ds)) 7 | chvCnt = sum(ds[,'type']=='chv') 8 | 9 | recall = rep(0,nrow(ds)) 10 | for (i in seq(1,nrow(ds2),1)) { 11 | recall[i]=sum(ds[1:i,'type']=='chv')/chvCnt 12 | } 13 | precision = rep(0,nrow(ds)) 14 | for (i in seq(1,nrow(ds2),1)) { 15 | precision[i]=sum(ds[1:i,'type']=='chv')/i 16 | } 17 | fscore = rep(0,nrow(ds)) 18 | for (i in seq(1,nrow(ds2),1)) { 19 | fscore[i]=(1+0.5^2)*(precision[i]*recall[i]/(0.5^2*precision[i]+recall[i])) 20 | } 21 | 22 | x = seq(1,nrow(ds),1)*100/chvCnt 23 | #precision 24 | y = cbind(recall,precision,fscore) 25 | matplot(x,y,type=c('l'), 26 | pch=c(1,5,6), 27 | lwd=1, 28 | lty=1, 29 | #add=TRUE, 30 | col=rainbow(3,start=1), 31 | xlab="top-N percent", ylab="recall/precision/F-score") 32 | 33 | pp=seq(1,nrow(ds),1) %% 200==0 34 | matpoints(x[pp], y[pp,], type = "p", lty = 1, lwd = 1, pch = c(1,5,6), 35 | col = rainbow(3,start=1)) 36 | 37 | legend("topright",legend = c("recall", "precision", "F-score"), 38 | col=rainbow(3,start=1), 39 | pch=c(1,5,6)) # optiona 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /r/review_result.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/r/review_result.R -------------------------------------------------------------------------------- /r/silhouette.R: -------------------------------------------------------------------------------- 1 | library(cluster) 2 | ngrams=read.table("data\\ngram_vectors_all_0227.txt") 3 | 
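# Silhouette analysis of the k-means clusterings: for each point i,
# s(i) = (b(i) - a(i)) / max(a(i), b(i)), where a(i) is the mean distance from
# i to the points of its own cluster and b(i) is the mean distance to the
# nearest other cluster. The loop below computes the mean s(i) for k = 2..20
# and plots it, so the k with the highest mean score can be chosen.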
colnames(ngrams)=c("tfdf-1","tf-2","df-3","cvalue-4","umls_score-5","chv_score-6","contain_umls-7","contain_chv-8","nn-9","an-10","pn-11","anpn-12","stys-13","stys-14","stys-15","stys-16","stys-17","stys-18","stys-19","stys-20","stys-21","stys-22","stys-23","stys-24","win_pos-25","win_pos-26","win_pos-27","win_pos-28","win_pos-29","win_pos-30","win_pos-31","win_pos-32","win_pos-33","win_pos-34","win_pos-35","win_pos-36","win_pos-37","capt_first-38","capt_all-39","capt_term-40","win_umls-41","win_chv-42","sent_umls-43","sent_chv-44","umls_dist-45","chv_dist-46","prefix-47","prefix-48","prefix-49","prefix-50","prefix-51","prefix-52","prefix-53","prefix-54","prefix-55","prefix-56","prefix-57","prefix-58","prefix-59","prefix-60","prefix-61","prefix-62","prefix-63","prefix-64","prefix-65","prefix-66","prefix-67","prefix-68","prefix-69","prefix-70","prefix-71","prefix-72","prefix-73","prefix-74","prefix-75","prefix-76","prefix-77","prefix-78","prefix-79","prefix-80","prefix-81","prefix-82","prefix-83","prefix-84","prefix-85","prefix-86","prefix-87","prefix-88","prefix-89","prefix-90","prefix-91","prefix-92","prefix-93","prefix-94","prefix-95","prefix-96","prefix-97","prefix-98","prefix-99","prefix-100","prefix-101","prefix-102","prefix-103","prefix-104","prefix-105","prefix-106","prefix-107","prefix-108","prefix-109","prefix-110","prefix-111","prefix-112","prefix-113","prefix-114","prefix-115","prefix-116","prefix-117","prefix-118","prefix-119","prefix-120","suffix-121","suffix-122","suffix-123","suffix-124","suffix-125","suffix-126","suffix-127","suffix-128","suffix-129","suffix-130","suffix-131","suffix-132","suffix-133","suffix-134","suffix-135","suffix-136","suffix-137","suffix-138","suffix-139","suffix-140","suffix-141","suffix-142","suffix-143","suffix-144","suffix-145","suffix-146","suffix-147","suffix-148","suffix-149","suffix-150","suffix-151","suffix-152","suffix-153","suffix-154","suffix-155","suffix-156","suffix-157","suffix-158","suffix-159","suffix-160","suffix-161","suffix-162","suffix-163","suffix-164","suffix-165","suffix-166","suffix-167","suffix-168","suffix-169","suffix-170","suffix-171","suffix-172","suffix-173","suffix-174","suffix-175","suffix-176","suffix-177","suffix-178","suffix-179","suffix-180","suffix-181","suffix-182","suffix-183","suffix-184","suffix-185","suffix-186","suffix-187","suffix-188","suffix-189","suffix-190","suffix-191","suffix-192","suffix-193","suffix-194","suffix-195","suffix-196","suffix-197","suffix-198","suffix-199","suffix-200","suffix-201","suffix-202","suffix-203","suffix-204","suffix-205","suffix-206","suffix-207","suffix-208","suffix-209","suffix-210","suffix-211","suffix-212","suffix-213","suffix-214","suffix-215","suffix-216","suffix-217","suffix-218","suffix-219","suffix-220","suffix-221","suffix-222","suffix-223","suffix-224","suffix-225","suffix-226","suffix-227","suffix-228","suffix-229","suffix-230","suffix-231","suffix-232","suffix-233","suffix-234","suffix-235","suffix-236","suffix-237","suffix-238","suffix-239","suffix-240","suffix-241","suffix-242","suffix-243","suffix-244","suffix-245","suffix-246","suffix-247","suffix-248","suffix-249","suffix-250","suffix-251","suffix-252","suffix-253","suffix-254","suffix-255","suffix-256","suffix-257","suffix-258","suffix-259","suffix-260","suffix-261","suffix-262","suffix-263","suffix-264","suffix-265","suffix-266","suffix-267","suffix-268","suffix-269","suffix-270","suffix-271","suffix-272","suffix-273","suffix-274","suffix-275","suffix-276","suffix-277","suffix-278","suffix-279","suffix-280"
,"suffix-281","suffix-282","suffix-283","suffix-284","suffix-285","suffix-286","suffix-287","suffix-288","suffix-289","suffix-290","suffix-291","suffix-292","suffix-293","suffix-294","suffix-295","suffix-296","suffix-297","suffix-298","suffix-299","suffix-300","suffix-301","suffix-302","suffix-303","suffix-304","suffix-305","suffix-306","suffix-307","suffix-308","suffix-309","suffix-310","suffix-311","suffix-312","suffix-313") 4 | v=subset(ngrams,`chv_score-6`>0.3) 5 | #v$`chv_score-6`=0 6 | dissE <- daisy(v) 7 | 8 | msk = c() 9 | kkk=seq(2,20,1) 10 | for (k in kkk) { 11 | km <- kmeans(v,k,iter.max = 1000,nstart=10) 12 | sk <- silhouette(km$cluster, dissE) 13 | #print(summary(sk)) 14 | print(mean(sk[,3])) 15 | msk = append(msk,mean(sk[,3])) 16 | #plot(sk) 17 | } 18 | 19 | #tiff("Plot2.tif", res = 300) 20 | 21 | plot(cbind(kkk,msk),type='b',xlab = "Number of clusters", ylab="Silhouette score") 22 | 23 | 24 | #sss=c(0.2643342,0.2798345,0.2909478,0.2932777,0.2771142,0.258752,0.2134653,0.217159,0.181204,0.1885766,0.1786644,0.1713124,0.206528,0.1611574,0.1845627,0.1712207,0.152299,0.1767387,0.157803,0.1702419,0.1557435,0.1296721,0.1368338,0.1490912,0.1171227,0.1211971,0.1221018,0.1244477,0.1108768,0.1215782,0.1197631,0.1129861,0.1221741,0.1015608,0.1007896,0.1075403) 25 | #cluster=c(seq(2,20,1),seq(20,100,5)) 26 | #plot(cluster,sss,type='b',xlab = "cluster number", ylab="Silhouette score") 27 | -------------------------------------------------------------------------------- /solr_Configuration.md: -------------------------------------------------------------------------------- 1 | It is tedious to configure Solr, that is why I change it to perform the matching task in Mysql even it is slower. 2 | I do not recommend to use Solr except you has a strong consideration. 3 | 4 | 4. **Download and Customize Solr** (only 4.6.1 is tested, later it will support solr 5.x) 5 | Solr is available for download [here](https://archive.apache.org/dist/lucene/solr/4.6.1/). 6 | After downloading you will need to expand it locally, then update the schema.xml and solrconfig.xml 7 | in the conf subdirectory as shown below: 8 | **(Tips: Instead of modify by yourself as following,, you can just copy the config files in 9 | ${project-root}/conf directory to ${solr-4.6.1}/example/solr/collection1/conf)** 10 | 11 | ``` 12 | tar xvzf solr-4.6.1.tgz 13 | cd solr-4.6.1/example/solr/collection1/conf 14 | ``` 15 | 16 | Update the schema.xml to replace the field definitions with our own. Our fields list and the definition 17 | of the field type "tag" (copied from the documentation of SolrTextTagger) is shown. The "id" field is 18 | just a integer sequence (unique key for Solr), the "cui" and "descr" comes from the CUI and 19 | STR fields from the UMLS database, and the descr_norm, descr_sorted, descr_stemmed are case/punctuation normalized, 20 | alpha sorted and stemmed versions of STR. The descr_tagged field is identical to descr_norm but is analyzed differently as specified below. 21 | (add the new fields to the beginning of the ): 22 | 23 | ``` 24 | 25 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 37 | 38 | 39 | 40 | ... 41 | 42 | ... 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | ... 53 | 54 | 55 | ``` 56 | We then add in the requestHandler definition for SolrTextTagger's tag service into the solrconfig.xml file (also in conf). 
66 | We then add the requestHandler definition for SolrTextTagger's tag service to the solrconfig.xml file (also in conf).
67 | The definition is shown below (add it above the first existing requestHandler). As with the schema, the XML tags were stripped from this page; the parameter names are restored from the SolrTextTagger v1.x documentation, keeping the original values:
68 | 
69 | ```
70 | <!-- reconstructed: parameter names per SolrTextTagger v1.x; the values are the originals -->
71 | <requestHandler name="/tag" class="org.opensextant.solrtexttagger.TaggerRequestHandler">
72 |   <str name="indexedField">descr_tagged</str>
73 |   <str name="storedField">descr_norm</str>
74 |   <bool name="partialMatches">false</bool>
75 |   <int name="valueMaxLen">5000</int>
76 |   <str name="cacheFile">taggerCache.dat</str>
77 | </requestHandler>
78 | ```
79 | Finally, we create a lib directory and copy the solr-text-tagger-1.3-SNAPSHOT.jar into it (this jar is built in step 6 below).
80 | Then go up to the example directory and start Solr. Solr will now be listening on port 8983 on localhost.
81 | 
82 | ```
83 | cd solr-4.6.1/example/solr/collection1
84 | mkdir lib
85 | cp ${SolrTextTagger-path}/SolrTextTagger/target/*jar lib/
86 | cd ../..
87 | java -jar start.jar
88 | ```
89 | 5. **Load Data and Build FST**
90 | We use the same cuistr1.csv file that we exported from our MySQL UMLS database. I guess I could have
91 | written custom code to load the data into the index, but I had started experimenting with SolrTextTagger using curl,
92 | so I just wrote some code that converted the (CUI,STR) CSV format into JSON,
93 | with additional fields created by our case/punctuation normalization, alphabetical sorting and stemming.
94 | I used the same Scala code since I already had the transformations coded up from last week.
95 | Once I generated the JSON file (cuistr1.json), I uploaded it into Solr and built the FST using the following curl commands.
96 | 
97 | ```
98 | cd solr-4.6.1/example/exampledocs
99 | java -Durl=http://localhost:8983/solr/update -Dtype=application/json \
100 |     -jar post.jar ${your-path}/cuistr1.json
101 | 
102 | curl "http://localhost:8983/solr/tag?build=true"   (or you can open this URL in a browser)
103 | ```
104 | 
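105 | For completeness, here is what that conversion step can look like. This is a minimal Python sketch, not the project's actual Scala code: it assumes a two-column, comma-separated (CUI,STR) file named cuistr1.csv, and it uses a simple regex plus NLTK's Porter stemmer as stand-ins for the LVG-based transformations:
106 | 
107 | ```
108 | import csv
109 | import json
110 | import re
111 | 
112 | from nltk.stem import PorterStemmer   # rough stand-in for the LVG-based normalization
113 | 
114 | stemmer = PorterStemmer()
115 | 
116 | def norm(s):
117 |     # case/punctuation normalization: lowercase, drop punctuation, squeeze whitespace
118 |     return re.sub(r"\s+", " ", re.sub(r"[^a-z0-9 ]", " ", s.lower())).strip()
119 | 
120 | docs = []
121 | with open("cuistr1.csv", newline="", encoding="utf-8") as f:
122 |     for i, (cui, descr) in enumerate(csv.reader(f)):
123 |         n = norm(descr)
124 |         words = n.split()
125 |         docs.append({
126 |             "id": i,                                  # integer sequence, the Solr unique key
127 |             "cui": cui,
128 |             "descr": descr,
129 |             "descr_norm": n,
130 |             "descr_sorted": " ".join(sorted(words)),  # alphabetically sorted words
131 |             "descr_stemmed": " ".join(stemmer.stem(w) for w in words),
132 |             "descr_tagged": n,                        # same text as descr_norm
133 |         })
134 | 
135 | with open("cuistr1.json", "w", encoding="utf-8") as f:
136 |     json.dump(docs, f, indent=2)
137 | ```
138 | Once the FST is built, you can sanity-check the tagger by POSTing plain text to the /tag handler and inspecting the tag offsets and matching concepts in the response.
139 | 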
140 | 6. **Download and Build SolrTextTagger**
141 | The code for SolrTextTagger resides on GitHub, so to download and build the custom Solr JAR,
142 | execute the following sequence of commands. This will create a solr-text-tagger-1.3-SNAPSHOT.jar
143 | file in the target subdirectory of the SolrTextTagger project (this is the jar copied into Solr's lib directory in step 4).
144 | 
145 | ```
146 | git clone https://github.com/OpenSextant/SolrTextTagger.git
147 | cd SolrTextTagger
148 | git checkout -b v1x --track origin/v1x
149 | mvn test
150 | mvn package
151 | ```
-------------------------------------------------------------------------------- /sql-script/0923-test.sql: --------------------------------------------------------------------------------
1 | use umls;
2 | /*add a new column for all rel+rela, then get the rel+rela as a string*/
3 | -- alter table umls.content_tag_diabetes_T047_unique2_output add (rel_all text default null);
4 | -- alter table ytex.content_tag_ytex_T047_unique_output add (rel_all text default null);
5 | /*
6 | update content_tag_diabetes_T047_unique2_output as ret set rel_all =
7 | (select GROUP_CONCAT(DISTINCT REL,' ',IFNULL(RELA,'null') SEPARATOR ',') from umls.MRREL as r
8 | where (r.CUI1=ret.cui COLLATE utf8_unicode_ci and r.CUI2 = ret.rel_cui COLLATE utf8_unicode_ci)
9 | or (r.CUI2=ret.cui COLLATE utf8_unicode_ci and r.CUI1=ret.rel_cui COLLATE utf8_unicode_ci)
10 | GROUP BY CUI1,CUI2 limit 1
11 | )
12 | where rel_all is null;
13 | */
14 | drop table if exists umls.tmp_pairs;
15 | create table umls.tmp_pairs as select DISTINCT CUI,REL_CUI FROM umls.content_tag_diabetes_T047_unique2_output where rel_all is null;
16 | 
17 | update umls.content_tag_diabetes_T047_unique2_output as t
18 | inner join
19 | (select r.CUI1,r.CUI2,GROUP_CONCAT(DISTINCT REL,' ',IFNULL(RELA,'null') SEPARATOR ',') as rel_all from
20 | tmp_pairs as ret
21 | inner join
22 | umls.MRREL as r
23 | on (r.CUI1=ret.cui COLLATE utf8_unicode_ci and r.CUI2 = ret.rel_cui COLLATE utf8_unicode_ci)
24 | or (r.CUI2=ret.cui COLLATE utf8_unicode_ci and r.CUI1=ret.rel_cui COLLATE utf8_unicode_ci)
25 | GROUP BY CUI1,CUI2
26 | ) as temp
27 | on (temp.CUI1=t.cui COLLATE utf8_unicode_ci and temp.CUI2 = t.rel_cui COLLATE utf8_unicode_ci)
28 | or (temp.CUI2=t.cui COLLATE utf8_unicode_ci and temp.CUI1=t.rel_cui COLLATE utf8_unicode_ci)
29 | set t.rel_all = temp.rel_all
30 | where t.rel_all is null
31 | ;
32 | 
33 | select count(*) from umls.content_tag_diabetes_T047_unique2_output where rel_all is null;
34 | select count(*) from tmp_pairs;
35 | 
36 | 
37 | 
38 | 
39 | drop table if exists ytex.tmp_pairs;
40 | create table ytex.tmp_pairs as select DISTINCT CUI,REL_CUI FROM ytex.content_tag_ytex_T047_unique_output where rel_all is null;
41 | update ytex.content_tag_ytex_T047_unique_output as t
42 | inner join
43 | (select r.CUI1,r.CUI2,GROUP_CONCAT(DISTINCT REL,' ',IFNULL(RELA,'null') SEPARATOR ',') as rel_all from umls.MRREL as r
44 | inner join ytex.tmp_pairs as ret
45 | on (r.CUI1=ret.cui COLLATE utf8_unicode_ci and r.CUI2 = ret.rel_cui COLLATE utf8_unicode_ci)
46 | or (r.CUI2=ret.cui COLLATE utf8_unicode_ci and r.CUI1=ret.rel_cui COLLATE utf8_unicode_ci)
47 | GROUP BY CUI1,CUI2
48 | ) as temp
49 | on (temp.CUI1=t.cui COLLATE utf8_unicode_ci and temp.CUI2 = t.rel_cui COLLATE utf8_unicode_ci)
50 | or (temp.CUI2=t.cui COLLATE utf8_unicode_ci and temp.CUI1=t.rel_cui COLLATE utf8_unicode_ci)
51 | set t.rel_all = temp.rel_all
52 | where t.rel_all is null
53 | ;
54 | 
55 | select * from ytex.content_tag_ytex_T047_unique_output where rel_all is not null;
56 | 
57 | use umls;
58 | select * from mrconso where SAB='SNOMEDCT_US' AND TTY = 'PT' AND CODE = '251314005';
59 | select * from mrconso where AUI='A3601659';
60 | 
-------------------------------------------------------------------------------- /sql-script/cancerqa_chv.sql: --------------------------------------------------------------------------------
1
| create database cancerqa char set utf8; 2 | use cancerqa; 3 | 4 | create index idx_qid on cancerqa_questions(qid); 5 | create index idx_nick on cancerqa_questions(chosenanswernick); 6 | create index idx_nick2 on cancerqa_answers(usernick); 7 | create index idx_qid2 on cancerqa_answers(qid); 8 | 9 | alter table qa_data add column id int not null auto_increment primary key; 10 | 11 | select * from cancerqa_questions; 12 | select * from cancerqa_answers; 13 | 14 | create table qa_data as select Q.qid, A.usernick, Q.subject, Q.content as question_content, A.content as answer_content from cancerqa_questions Q, cancerqa_answers A where Q.qid=A.qid and Q.chosenanswernick=A.usernick; 15 | 16 | select * from qa_data; 17 | 18 | select qid,usernick, count(*) as cnt from qa_data group by qid,usernick order by cnt desc; 19 | 20 | 21 | select Q.qid, A.usernick, Q.subject, Q.content as question_content, A.content as answer_content from cancerqa_questions Q, cancerqa_answers A where Q.qid=A.qid and Q.chosenanswernick=A.usernick; 22 | 23 | select count(*) from qa_data; 24 | 25 | 26 | select * from cancerqa_answers B join (select A.qid as qid, max(A.rating) as maxrating from cancerqa_answers A group by A.qid) M 27 | on B.qid=M.qid and B.rating=M.maxrating; 28 | 29 | create table qa_data2 as 30 | select Q.qid, A.usernick, Q.subject, Q.content as question_content, A.content as answer_content, M.maxrating 31 | from cancerqa_questions Q, cancerqa_answers A,(select qid as qid, max(rating) as maxrating from cancerqa_answers group by qid) M where Q.qid=A.qid and Q.qid=M.qid and Q.chosenanswernick=A.usernick and M.maxrating=A.rating; 32 | select nested, count(*) from -------------------------------------------------------------------------------- /sql-script/chv.sql: -------------------------------------------------------------------------------- 1 | create database chv char set utf8; 2 | use chv; 3 | create table cancer_ngram ( 4 | ngram varchar(100), 5 | train varchar(100), 6 | n int, 7 | tfdf int, 8 | tf int, 9 | df int, 10 | cvalue float, 11 | nest float, 12 | umls_score float, 13 | chv_score float, 14 | cui_umls varchar(100), 15 | cui_chv varchar(100), 16 | contain_umls varchar(100), 17 | contain_chv varchar(100), 18 | win_umls int, 19 | win_chv int, 20 | sent_umls int, 21 | sent_chv int, 22 | umls_dist int, 23 | chv_dist int, 24 | win_pos varchar(100), 25 | prefix varchar(100), 26 | suffix varchar(100), 27 | bow_total int, 28 | bow_words int, 29 | sytax varchar(100), 30 | nn varchar(100), 31 | an varchar(100), 32 | pn varchar(100), 33 | anpn varchar(100), 34 | isTrain varchar(100), 35 | capt_first varchar(100), 36 | capt_term varchar(100), 37 | capt_all varchar(100), 38 | stys varchar(100), 39 | text_org varchar(100), 40 | sentence text 41 | ); 42 | create table diabetes_ngram like cancer_ngram; 43 | load data local infile '/data/ra/data/ngram_cancer_tf5.txt' into table cancer_ngram fields terminated by '\t' enclosed by '"' lines terminated by '\n' ignore 1 lines; 44 | load data local infile '/data/ra/data/ngram_diabetes_tf5.txt' into table diabetes_ngram fields terminated by '\t' enclosed by '"' lines terminated by '\n' ignore 1 lines; 45 | 46 | select n,sum(length(cui_chv)>0) as chv,sum(length(cui_chv)=0 and length(cui_umls)>0) as `umls-chv`, sum(length(cui_umls)=0) as others, count(*) as cnt from diabetes_ngram group by n; 47 | select n,sum(length(cui_chv)>0) as chv,sum(length(cui_chv)=0 and length(cui_umls)>0) as `umls-chv`, sum(length(cui_umls)=0) as others, count(*) as cnt from cancer_ngram group by 
n; -------------------------------------------------------------------------------- /sql-script/cluster.sql: -------------------------------------------------------------------------------- 1 | create database cluster character set utf8; 2 | use cluster; 3 | 4 | create table k20all ( 5 | `k` int(8) DEFAULT NULL, 6 | `type` varchar(40) DEFAULT NULL, 7 | `ngram` varchar(200) DEFAULT NULL, 8 | `n` int(8) DEFAULT NULL, 9 | `tfdf` float DEFAULT NULL, 10 | `tf` int(8) DEFAULT NULL, 11 | `df` int(8) DEFAULT NULL, 12 | `cvalue` varchar(40) DEFAULT NULL, 13 | `nest` int(8) DEFAULT NULL, 14 | `nest_tf` int(8) DEFAULT NULL, 15 | `umls_score` int(8) DEFAULT NULL, 16 | `chv_score` int(8) DEFAULT NULL, 17 | `contain_umls` varchar(8) DEFAULT NULL, 18 | `contain_chv` varchar(8) DEFAULT NULL, 19 | `win_umls` int(8) DEFAULT NULL, 20 | `win_chv` int(8) DEFAULT NULL, 21 | `sent_umls` int(8) DEFAULT NULL, 22 | `sent_chv` int(8) DEFAULT NULL, 23 | `umls_dist` int(8) DEFAULT NULL, 24 | `chv_dist` int(8) DEFAULT NULL, 25 | `sytax` varchar(40) DEFAULT NULL, 26 | `nn` varchar(8) DEFAULT NULL, 27 | `an` varchar(8) DEFAULT NULL, 28 | `pn` varchar(8) DEFAULT NULL, 29 | `anpn` varchar(8) DEFAULT NULL 30 | ); 31 | 32 | load data local infile 'C:\\fsu\\ra\\data\\ra-cluster.txt' 33 | into table k20all 34 | fields terminated by '\t' 35 | -- enclosed by '"' 36 | lines terminated by '\r\n' 37 | ignore 1 lines; 38 | truncate table k20all; 39 | 40 | select k,type,count(ngram) from k20all group by k,type; 41 | select * from k20all where k=18; 42 | 43 | 44 | select * from umls.mrconso where cui='C1552861'; 45 | select tui from umls.mrsty where cui='C0018684'; 46 | 47 | select count(distinct blogId) from ytex.content_org; 48 | select distinct blogId from ytex.content_org_new; 49 | 50 | select * from umls.mrconso where str = 'help'; -------------------------------------------------------------------------------- /sql-script/data_process.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/sql-script/data_process.docx -------------------------------------------------------------------------------- /sql-script/data_process_0922.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/sql-script/data_process_0922.docx -------------------------------------------------------------------------------- /sql-script/deaf.sql: -------------------------------------------------------------------------------- 1 | 2 | create database deaf char set utf8; 3 | use deaf; 4 | drop table deaf.dataset_deaf; 5 | create table dataset_deaf ( 6 | `PostID` int(20), 7 | `UserID` int(20), 8 | `User` varchar(100), 9 | `Date` varchar(100), 10 | `Year` varchar(100), 11 | `Month` varchar(100), 12 | `Time` varchar(100), 13 | `AMPM` varchar(100), 14 | `Content` text, 15 | `ThreadTitle` varchar(500), 16 | `ThreadPath` varchar(500), 17 | `ThreadNumber` int(20), 18 | -- `LinkName` varchar(100), 19 | `Type` varchar(100) 20 | ) 21 | ; 22 | 23 | drop table dataset_autism; 24 | create table dataset_autism ( 25 | `link` varchar(256), 26 | `topic` varchar(256), 27 | `User` varchar(100), 28 | `userInfo` varchar(500), 29 | `Content` text 30 | -- `date` varchar(100) 31 | ); 32 | alter table dataset_autism add column PostID int primary key AUTO_INCREMENT first; 33 | 34 | alter table dataset_autism drop column 
PostID;
35 | create table deaf_cui like cancer.cancer_cui;
36 | create table noncui like cancer.noncui;
37 | create table deaf_metamap like cancer.cancer_metamap_cui;
38 | -- drop table forum_metamap;
39 | create table forum_metamap like deaf_metamap;
40 | create table qa8000_metamap like deaf_metamap;
41 | 
42 | load data local infile '/tmp/alldeaf_HealthFitness_08_20_2015_Final.txt' into table dataset_deaf fields terminated by '\t' enclosed by '"' lines terminated by '\r\n' ignore 1 lines;
43 | load data local infile '/tmp/autism_health_Final.txt' into table dataset_autism fields terminated by '\t' enclosed by '"' lines terminated by '\r\n' ignore 1 lines;
44 | 
45 | 
46 | select distinct PostID,Content,threadNumber from dataset_deaf
47 | into outfile '/tmp/deaf_dataset.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
48 | select distinct PostID,Content,link from dataset_autism
49 | into outfile '/tmp/autism_dataset.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
50 | 
51 | select distinct tui from umls.mrsty;
52 | 
53 | delete from dataset_deaf where length(PostID) <2;
54 | select count(distinct tid) from deaf_cui;
55 | select count(*) from deaf_metamap where task='autism';
56 | select count(*) from deaf_metamap where task='deaf';
57 | select distinct sab from umls.mrconso;
58 | 
59 | select splitType, count(*) from deaf_metamap group by splitType ;
60 | select * from deaf_metamap where length(sty)>6;
61 | alter table deaf_metamap add column sab varchar(100) after preferStr;
62 | -- truncate deaf_metamap;
63 | 
64 | select task, count(*)/count(distinct tid), count(distinct tid) from deaf_metamap group by task ;
65 | select * from deaf_metamap where task = 'autism';
66 | delete from deaf_metamap where sab='CHV' and sentLen > 51;
67 | 
68 | select count(*) from deaf_metamap where sab = 'SNOMEDCT_US'; -- 1077473
69 | select count(*) from deaf_metamap_no_thread_id where sab = 'SNOMEDCT_US'; -- 1140048
70 | 
71 | select count(*) from deaf_metamap where sab = 'SNOMEDCT_US' and task='deaf'; -- 475902
72 | select count(*) from deaf_metamap_no_thread_id where sab = 'SNOMEDCT_US' and task='deaf'; -- 494458
73 | 
74 | select count(*) from deaf_metamap where sab = 'SNOMEDCT_US' and task='autism';
75 | select count(*) from deaf_metamap_no_thread_id where sab = 'SNOMEDCT_US' and task='autism';
76 | 
77 | select count(*) from deaf_metamap where sentLen > 51 and sab='SNOMEDCT_US'; -- 0
78 | select count(*) from deaf_metamap where sentLen > 51 and sab='CHV'; -- 76004
79 | select sentLen,count(*) from deaf_metamap where sab='SNOMEDCT_US' group by sentLen;
80 | select sentLen,count(*) from deaf_metamap where sab='CHV' group by sentLen;
81 | 
82 | select * from qa8000_metamap;
83 | select * from umls.mrconso where cui = 'C1258068';
84 | 
85 | create index task on deaf_metamap_excluding_sty_distinct_sentences(task);
86 | create index sab on deaf_metamap_excluding_sty_distinct_sentences(sab);
87 | create index cui_str on deaf_metamap_excluding_sty_distinct_sentences(cui_str(10));
88 | 
89 | create index task on qa8000_metamap_excluding_sty_distinct_sentences(task);
90 | create index sab on qa8000_metamap_excluding_sty_distinct_sentences(sab);
91 | create index cui_str on qa8000_metamap_excluding_sty_distinct_sentences(cui_str(10));
92 | 
93 | 
94 | create table qa8000 like qa8000_metamap;
95 | ALTER TABLE `deaf`.`qa8000` 
96 | CHANGE COLUMN `threadId` `userId` VARCHAR(256) NULL DEFAULT NULL ;
97 | rename table qa8000_metamap to qa8000_metamap_without_userid;
98 | rename table qa8000 to qa8000_metamap;
99 | 
100 | -- select terms found in SNOMED but not in CHV
101 | select * from (
102 | select tid,org_str,sentence, count(*) as cnt, group_concat(distinct sab ) as gsab
103 | from autism_metamap_excluding_sty_distinct_sentences_user
104 | group by tid,org_str,sentence ) temp_table
105 | where gsab not like '%CHV%'
106 | order by cnt desc;
-------------------------------------------------------------------------------- /sql-script/import_0919.sql: --------------------------------------------------------------------------------
1 | CREATE TABLE `content_tag_compare_only_ytex` (
2 | `blogId` bigint(20) NOT NULL DEFAULT '0',
3 | `target` longtext,
4 | `CUI` varchar(20) DEFAULT NULL,
5 | `SAB` varchar(40) DEFAULT NULL,
6 | `umlsStr` longtext,
7 | `TUI` varchar(4) DEFAULT NULL,
8 | `styName` varchar(50) DEFAULT NULL, /* semantic type name*/
9 | `worldIndex` int(11) NOT NULL DEFAULT '0', /* the position of the target term in the blog content*/
10 | `sentence` longtext,
11 | `rel_cui` char(8) NOT NULL DEFAULT '', /* the cui that is relevant to the current target term*/
12 | `rel_str` varchar(1000) DEFAULT NULL, /* the preferred string of rel_cui that is relevant to the current target term*/
13 | `id` int(11) NOT NULL DEFAULT '0' /* the primary key of this table; it is unique.*/
14 | ) ;
15 | 
16 | CREATE TABLE `content_tag_compare_only_our` (
17 | `blogId` varchar(40) DEFAULT NULL,
18 | `target` varchar(300) DEFAULT NULL,
19 | `umlsFlag` varchar(10) DEFAULT NULL,
20 | `score` float DEFAULT NULL,
21 | `CUI` varchar(45) DEFAULT NULL,
22 | `SAB` varchar(45) DEFAULT NULL,
23 | `AUI` varchar(45) DEFAULT NULL,
24 | `umlsStr` varchar(1000) DEFAULT NULL,
25 | `TUI` varchar(45) DEFAULT NULL,
26 | `styName` varchar(45) DEFAULT NULL,
27 | `semName` varchar(100) DEFAULT NULL,
28 | `tagId` int(11) DEFAULT '0',
29 | `wordIndex` int(11) DEFAULT '0',
30 | `wordIndexInSentence` int(11) DEFAULT '0',
31 | `sentenceIndex` int(11) DEFAULT '0',
32 | `targetNorm` varchar(300) DEFAULT NULL,
33 | `tags` varchar(500) DEFAULT NULL,
34 | `sentence` varchar(1000) DEFAULT NULL,
35 | `cui1` char(8) NOT NULL,
36 | `cui2` char(8) NOT NULL,
37 | `aui1` varchar(9) DEFAULT NULL,
38 | `aui2` varchar(9) DEFAULT NULL,
39 | `REL` varchar(4) NOT NULL,
40 | `RELA` varchar(100) DEFAULT NULL,
41 | `rel_str` varchar(1000) DEFAULT NULL, /* the preferred string of rel_cui that is relevant to the current target term*/
42 | `id` int(11) NOT NULL DEFAULT '0',
43 | `rel_cui` varchar(45) DEFAULT NULL /* the cui that is relevant to the current target term*/
44 | ) ;
45 | 
46 | load data local infile 'csv_path'
47 | into table table_name
48 | fields terminated by ','
49 | enclosed by '"'
50 | lines terminated by '\n';
-------------------------------------------------------------------------------- /sql-script/import_0924.sql: --------------------------------------------------------------------------------
1 | USE ytex;
2 | drop table if exists TMP_ORG;
3 | CREATE TABLE TMP_ORG (
4 | `blogId` BIGINT(20) DEFAULT NULL, /*----blog id */
5 | `blog_name` varchar(200) DEFAULT NULL,
6 | `text_link_title` varchar(500) DEFAULT NULL,
7 | `text_content` varchar(10000) DEFAULT NULL,
8 | `photo_caption` varchar(300) DEFAULT NULL,
9 | `photo_link` varchar(300) DEFAULT NULL,
10 | `photo_source` varchar(300) DEFAULT NULL,
11 | `link_content` varchar(300) DEFAULT NULL,
12 | `post_likes` varchar(300) DEFAULT NULL,
13 | `post_reblogged` varchar(300) DEFAULT NULL,
14 | `post_hashtag` varchar(300) DEFAULT NULL
15 | );
16 | 
/*load result into the table*/
17 | load data local infile 'C:\\fsu\\ra\\UmlsTagger\\data\\raw_data_CHV_study2.csv.txt'
18 | into table TMP_ORG
19 | fields terminated by '\t'
20 | enclosed by '"'
21 | lines terminated by '`'
22 | ;
23 | 
24 | CREATE TABLE CONTENT_ORG_NEW AS
25 | SELECT o.blogId,o.post_hashtag,o.blog_name,o.text_link_title,o.text_content FROM TMP_ORG o;
26 | 
27 | select * from CONTENT_ORG_NEW into outfile 'C:\\fsu\\ra\\UmlsTagger\\data\\raw_data_CHV_study2.csv'
28 | fields terminated by ',' enclosed by '"' lines terminated by '\n';
29 | 
30 | select distinct blogId INSTANCE_ID from ytex.CONTENT_ORG_NEW ;
31 | select * from ytex.content_org_new where blogId = 0;
32 | 
33 | 
34 | 
35 | 
36 | 
-------------------------------------------------------------------------------- /sql-script/import_1004.sql: --------------------------------------------------------------------------------
1 | USE ytex;
2 | drop table if exists TMP_ORG;
3 | truncate TMP_ORG;
4 | CREATE TABLE TMP_ORG (
5 | `blogId` BIGINT(20) DEFAULT NULL, /*----blog id */
6 | `blog_name` varchar(200) DEFAULT NULL,
7 | `text_link_title` varchar(500) DEFAULT NULL,
8 | `text_content` text DEFAULT NULL,
9 | `photo_caption` varchar(300) DEFAULT NULL,
10 | `photo_link` varchar(300) DEFAULT NULL,
11 | `photo_source` varchar(300) DEFAULT NULL,
12 | `link_content` varchar(300) DEFAULT NULL,
13 | `post_likes` varchar(300) DEFAULT NULL,
14 | `post_reblogged` varchar(300) DEFAULT NULL,
15 | `post_hashtag` varchar(300) DEFAULT NULL
16 | );
17 | CREATE TABLE TMP_ORG (
18 | `blogId` BIGINT(20) DEFAULT NULL, /*----blog id */
19 | `post_hashtag` varchar(300) DEFAULT NULL,
20 | `blog_name` varchar(200) DEFAULT NULL,
21 | `text_link_title` varchar(500) DEFAULT NULL,
22 | `text_content` text DEFAULT NULL
23 | );
24 | -- first load into TMP_ORG, then insert into CONTENT_ORG_NEW, adding a 'disease' column.
25 | truncate TMP_ORG;
26 | /*load result into the table*/
27 | load data local infile 'C:\\fsu\\ra\\UmlsTagger\\data\\newdataset_chronic\\chronic_newdataset_obesity.csv'
28 | into table TMP_ORG
29 | fields terminated by ','
30 | enclosed by '"'
31 | lines terminated by '`'
32 | ignore 1 LINES
33 | ;
34 | 
35 | /*
36 | drop table if exists CONTENT_ORG_NEW;
37 | CREATE TABLE CONTENT_ORG_NEW (
38 | `blogId` BIGINT(20) DEFAULT NULL, -- blog id
39 | `post_hashtag` varchar(300) DEFAULT NULL,
40 | `blog_name` varchar(200) DEFAULT NULL,
41 | `text_link_title` varchar(500) DEFAULT NULL,
42 | `text_content` text DEFAULT NULL,
43 | `disease` varchar(100)
44 | );
45 | */
46 | INSERT ignore CONTENT_ORG_NEW (blogId,post_hashtag,blog_name,text_link_title,text_content,disease)
47 | SELECT distinct o.blogId,o.post_hashtag,o.blog_name,o.text_link_title,o.text_content,'obesity' FROM TMP_ORG o ;
48 | 
49 | select count(distinct disease,blogId) from CONTENT_ORG_NEW;
50 | 
51 | truncate table content_tag_ytex;
52 | /*
53 | drop table if exists content_tag_ytex;
54 | create table content_tag_ytex as
55 | select a.anno_text, d.instance_id, c.*, 'alzheimer' as disease from v_document_cui_sent c
56 | inner join v_annotation a on c.anno_base_id = a.anno_base_id
57 | inner join v_document d on d.document_id = c.document_id;
58 | */ /*,7230*/
59 | 
60 | alter table content_tag_ytex add `disease` varchar(100) default null ;
61 | insert content_tag_ytex
62 | select a.anno_text, d.instance_id, c.*, 'diabetes' as disease from v_document_cui_sent c
63 | inner join v_annotation a on c.anno_base_id = a.anno_base_id
64 | inner join v_document d on d.document_id = c.document_id;
65 | 
66 | select count(distinct code) from content_tag_ytex;
67 | show create table v_document_cui_sent;
68 | 
69 | 
70 | 
-------------------------------------------------------------------------------- /sql-script/import_tag_0916.sql: --------------------------------------------------------------------------------
1 | use umls;
2 | /*DELETE ALL non-English records*/
3 | -- delete from mrconso where lat <> 'ENG';
4 | drop table IF EXISTS CONTENT_TAG_UNIQUE_CUI;
5 | /*create table for the original terms from blogs.*/
6 | CREATE TABLE CONTENT_TAG_UNIQUE_CUI (
7 | `blogId` varchar(40) DEFAULT NULL, /*----blog id */
8 | `target` varchar(300) DEFAULT NULL, /* ----the term found in the content. It must also be found in UMLS; terms not found in UMLS are ignored. */
9 | `umlsFlag` varchar(10) DEFAULT NULL, /* ----whether it is found in UMLS. This column is used by hashTags.*/
10 | `score` float DEFAULT NULL, /* ----the similarity metric between the term in the content and the string in UMLS.*/
11 | `CUI` varchar(45) DEFAULT NULL, /*----- CUI of UMLS*/
12 | `SAB` varchar(45) DEFAULT NULL, /* ----- SAB of UMLS*/
13 | `AUI` varchar(45) DEFAULT NULL, /*----- AUI of UMLS */
14 | `umlsStr` varchar(1000) DEFAULT NULL, /* ---STR of UMLS mrconso table*/
15 | `TUI` varchar(45) DEFAULT NULL, /*------TUI of UMLS MRSTY table */
16 | `styName` varchar(45) DEFAULT NULL, /*------semantic name of UMLS MRSTY table*/
17 | `semName` varchar(100) DEFAULT NULL, /*---semantic group name on the SemGroup website*/
18 | `tagId` int default 0, /*----if the term matches a hash_tag of the blog, tagId is the index of the tag; if it matches no hash_tag, it is 0.*/
19 | `wordIndex` int default 0, /*----the position of the term in the content*/
20 | `wordIndexInSentence` int default 0, /*---- the position of the term in the sentence that it is found in.
*/
21 | `sentenceIndex` int default 0, /*-- the index of the sentence of the target*/
22 | `targetNorm` varchar(300) default NULL, /*--- the normalized string of the term.*/
23 | `tags` varchar(500) default NULL, /*---- all the hash_tags of the blog. */
24 | `sentence` varchar(1000) default NULL, /* The sentence that the target is found in*/
25 | `rel` varchar(4) , /*rel field in mrrel*/
26 | `rela` varchar(100) , /*rela field in mrrel*/
27 | `rel_str` varchar(1000) , /*str field in mrconso for the term relevant to current term */
28 | `id` int(11) /* auto_increment primary key for the result table*/
29 | );
30 | 
31 | /*load result into the table*/
32 | load data local infile 'C:\\fsu\\tag_diabetes_distinct.csv'
33 | into table CONTENT_TAG_UNIQUE_CUI
34 | fields terminated by ','
35 | enclosed by '"'
36 | lines terminated by '\n'
37 | ignore 1 lines;
38 | 
39 | select * from CONTENT_TAG_UNIQUE_CUI;
-------------------------------------------------------------------------------- /sql-script/linux-test.sql: --------------------------------------------------------------------------------
1 | update ret.content_tag_ytex_T047_unique as t
2 | inner join
3 | (select r.CUI1,r.CUI2,GROUP_CONCAT(DISTINCT REL,' ',RELA SEPARATOR ',') as rel_all from umls.MRREL as r
4 | inner join ret.content_tag_ytex_T047_unique as ret
5 | on (r.CUI1=ret.cui COLLATE utf8_unicode_ci and r.CUI2 = ret.rel_cui COLLATE utf8_unicode_ci)
6 | or (r.CUI2=ret.cui COLLATE utf8_unicode_ci and r.CUI1=ret.rel_cui COLLATE utf8_unicode_ci)
7 | GROUP BY CUI1,CUI2
8 | ) as temp
9 | on (temp.CUI1=t.cui COLLATE utf8_unicode_ci and temp.CUI2 = t.rel_cui COLLATE utf8_unicode_ci)
10 | or (temp.CUI2=t.cui COLLATE utf8_unicode_ci and temp.CUI1=t.rel_cui COLLATE utf8_unicode_ci)
11 | set t.rel_all = temp.rel_all
12 | ;
-------------------------------------------------------------------------------- /sql-script/minsook_1023.sql: --------------------------------------------------------------------------------
1 | use ytex;
2 | CREATE TABLE TMP_ORG_1027 (
3 | `blogId` BIGINT(20) DEFAULT NULL, /*----blog id */
4 | `text_content` text DEFAULT NULL
5 | );
6 | 
7 | drop table TMP_ORG_1027;
8 | /*load result into the table*/
9 | load data local infile 'C:\\fsu\\ra\\data\\content_raw_cleaned_ForZhiwei.csv'
10 | into table TMP_ORG_1027
11 | fields terminated by ','
12 | enclosed by '"'
13 | lines terminated by '`'
14 | ignore 1 LINES
15 | ;
16 | select count(distinct blogId) from TMP_ORG_1023 LIMIT 1;
17 | -- 50252
18 | select * from TMP_ORG_1027 where text_content like '%`%';
19 | 
20 | use ytex;
21 | select * from v_document_ontoanno;
22 | select * from v_corpus_group_class;
23 | 
24 | create table ytex.content_tag_ytex_1023 as
25 | select a.anno_text, d.instance_id, c.* from v_document_cui_sent c
26 | inner join v_annotation a on c.anno_base_id = a.anno_base_id
27 | inner join v_document d on d.document_id = c.document_id;
28 | 
29 | select * from ytex.content_tag_ytex_1023 order by instance_id, sentence_text
30 | into outfile 'C:\\fsu\\ra\\data\\content_tag_ytex_1023.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
31 | 
32 | select count(distinct instance_id) from content_tag_ytex_1023;
33 | 
34 | 
35 | show create table ytex.content_tag_ytex_1023;
36 | 
-------------------------------------------------------------------------------- /sql-script/minsook_1103.sql: --------------------------------------------------------------------------------
1 | use tumblr_db;
2 | 
3 | select count(distinct cui) from tem_tag_ids;
4 | -- 935
5 | 
select count(distinct blogId) from tem_tag_ids; 6 | -- 147 7 | select count(distinct code) from content_tag_ytex_noseedtag; 8 | -- 8227 9 | select count(distinct instance_id) from content_tag_ytex_noseedtag; 10 | -- 22171 11 | 12 | select count(*) from content_tag_ytex_noseedtag; 13 | -- 265599 14 | select count(*) from tem_tag_ids; 15 | -- 118729 16 | 17 | drop table tmp_pairs; 18 | create table tmp_pairs as 19 | select distinct c.blogId, c.cui,s.cui as cui_tag from ( 20 | select distinct instance_id as blogId, code as cui from content_tag_ytex_noseedtag) as c 21 | inner join (select distinct blogId,cui from tem_tag_ids) as s 22 | on c.cui<>s.cui and c.blogId = s.blogId 23 | ; 24 | -- 6059 25 | select count(distinct blogId) from tmp_pairs; 26 | -- 120 27 | create table tmp_pairs as 28 | select distinct c.cui,s.cui as cui_tag from ( 29 | select distinct code as cui from content_tag_ytex_noseedtag) as c 30 | inner join (select distinct cui from tem_tag_ids) as s 31 | on c.cui<>s.cui 32 | ; 33 | -- 7692072 34 | alter table content_tag_ytex_noseedtag add (rel_all text default null); 35 | alter table content_tag_ytex_noseedtag add (cui_tag char(8) default null); 36 | 37 | update content_tag_ytex_noseedtag set rel_all=null; 38 | 39 | update content_tag_ytex_noseedtag as t 40 | inner join 41 | (select ret.cui,ret.cui_tag,GROUP_CONCAT(DISTINCT REL,' ',IFNULL(RELA,'null') SEPARATOR ',') as rel_all from umls.mrrel as r 42 | inner join tmp_pairs as ret 43 | on (r.CUI1=ret.cui and r.CUI2 = ret.cui_tag ) 44 | or (r.CUI2=ret.cui and r.CUI1=ret.cui_tag ) 45 | GROUP BY cui,cui_tag 46 | ) as temp 47 | on (temp.cui=t.code and temp.blogId=t.instance_id) 48 | set t.rel_all = temp.rel_all, t.cui_tag= temp.cui_tag 49 | where t.rel_all is null 50 | ; 51 | 52 | update content_tag_ytex_noseedtag as t 53 | inner join 54 | (select ret.blogId, ret.cui,ret.cui_tag,GROUP_CONCAT(DISTINCT REL,' ',IFNULL(RELA,'null') SEPARATOR ',') as rel_all from umls.mrrel as r 55 | inner join tmp_pairs as ret 56 | on (r.CUI1=ret.cui and r.CUI2 = ret.cui_tag ) 57 | or (r.CUI2=ret.cui and r.CUI1=ret.cui_tag ) 58 | GROUP BY blogId,cui,cui_tag 59 | ) as temp 60 | on (temp.cui=t.code and temp.blogId=t.instance_id) 61 | set t.rel_all = temp.rel_all, t.cui_tag= temp.cui_tag 62 | where t.rel_all is null 63 | ; 64 | 65 | select * from content_tag_ytex_noseedtag where cui_tag is not null; 66 | -------------------------------------------------------------------------------- /sql-script/minsook_1229.sql: -------------------------------------------------------------------------------- 1 | select C.sab, C.code, group_concat(distinct C.str), count(*) 2 | 3 | from 4 | 5 | (select A.code, A.str, B.sab 6 | 7 | FROM 8 | 9 | (select distinct c.instance_id, c.sentence_text, c.code, m.str from ret1007.content_tag_ytex c, umls.mrconso m 10 | 11 | where c.code = m.cui and m.lat = 'ENG' and c.disambiguated=1 and c.disease='diabetes'and m.TS='P' AND m.stt='PF' AND m.ispref='Y') as A, 12 | 13 | (select distinct cui, sab from umls.mrconso where lat='ENG') AS B 14 | 15 | WHERE A.code = B.cui) as C 16 | 17 | group by C.sab, C.code 18 | 19 | order by C.sab, count(*) desc 20 | into outfile '/tmp/minsook_1230.ret' fields terminated by '\t' enclosed by '"' lines terminated by '\n'; 21 | 22 | 23 | 24 | select C.sab, C.code, group_concat(distinct C.str), count(*) 25 | 26 | from 27 | 28 | (select A.code, A.str, B.sab 29 | 30 | FROM 31 | 32 | (select distinct c.instance_id, c.sentence_text, c.code, m.str from retyahoo.content_tag_ytex_yahoo_question c, umls.mrconso m 33 | 
34 | where c.code = m.cui and m.lat = 'ENG' and c.disambiguated=1 and m.TS='P' AND m.stt='PF' AND m.ispref='Y') as A, 35 | 36 | (select distinct cui, sab from umls.mrconso where lat='ENG') AS B 37 | 38 | WHERE A.code = B.cui) as C 39 | 40 | group by C.sab, C.code 41 | 42 | order by C.sab, count(*) desc 43 | into outfile '/tmp/minsook_1230_question.ret' fields terminated by '\t' enclosed by '"' lines terminated by '\n'; 44 | 45 | 46 | 47 | select C.sab, C.code, group_concat(distinct C.str), count(*) 48 | 49 | from 50 | 51 | (select A.code, A.str, B.sab 52 | 53 | FROM 54 | 55 | (select distinct c.instance_id, c.sentence_text, c.code, m.str from retyahoo.content_tag_ytex_yahoo_answer c, umls.mrconso m 56 | 57 | where c.code = m.cui and m.lat = 'ENG' and c.disambiguated=1 and m.TS='P' AND m.stt='PF' AND m.ispref='Y') as A, 58 | 59 | (select distinct cui, sab from umls.mrconso where lat='ENG') AS B 60 | 61 | WHERE A.code = B.cui) as C 62 | 63 | group by C.sab, C.code 64 | 65 | order by C.sab, count(*) desc 66 | into outfile '/tmp/minsook_1230_answer.ret' fields terminated by '\t' enclosed by '"' lines terminated by '\n'; 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /sql-script/ner200.sql: -------------------------------------------------------------------------------- 1 | create database ner200 char set utf8; 2 | use ner200; 3 | drop table cancer_cui; 4 | create table cancer_cui like cancer.cancer_cui; 5 | 6 | drop table bioportal; 7 | drop table lvalue; 8 | drop table lvaluerake; 9 | drop table manual; 10 | CREATE TABLE bioportal ( 11 | `tid` varchar(30), 12 | `org_str` varchar(100), 13 | `sentence` text 14 | ); 15 | 16 | CREATE TABLE lvalue ( 17 | `tid` varchar(30), 18 | `org_str` varchar(100), 19 | `sentence` text 20 | ); 21 | 22 | CREATE TABLE lvaluerake ( 23 | `tid` varchar(30), 24 | `org_str` varchar(100), 25 | `sentence` text 26 | ); 27 | 28 | CREATE TABLE manual ( 29 | `tid` varchar(30), 30 | `major` varchar(100), 31 | `others` text, 32 | `durStr` varchar(100), 33 | `sentence` text 34 | ); 35 | 36 | 37 | 38 | load data local infile 'C:\\fsu\\ra\\data\\201601\\split_criteria\\1308_colorectal_trials_criteria_0413_ret.csv.cui' into table cancer_cui fields terminated by '\t' enclosed by '"' lines terminated by '\n' ignore 1 lines; 39 | load data local infile 'C:\\fsu\\ra\\data\\201601\\split_criteria\\1308_colorectal_trials_criteria_0413_ret.csv.mm.cui' into table cancer_mm_cui fields terminated by '\t' enclosed by '"' lines terminated by '\n' ignore 1 lines; 40 | 41 | load data local infile 'C:\\Users\\Jason\\Downloads\\bioportal.txt' into table bioportal fields terminated by '\t' enclosed by '"' lines terminated by '\r\n'; 42 | load data local infile 'C:\\Users\\Jason\\Downloads\\lvalue.txt' into table lvalue fields terminated by '\t' enclosed by '"' lines terminated by '\r\n'; 43 | load data local infile 'C:\\Users\\Jason\\Downloads\\lvaluerake.txt' into table lvaluerake fields terminated by '\t' enclosed by '"' lines terminated by '\r\n'; 44 | load data local infile 'C:\\Users\\Jason\\Downloads\\random_200_sentences_cancer_studies.txt' into table manual fields terminated by '\t' enclosed by '"' lines terminated by '\r\n' ignore 1 lines; 45 | select * from cancer_cui where length(cui)>0; 46 | select * from cancer_mm_cui; 47 | select * from bioportal; 48 | select * from lvalue; 49 | select * from lvaluerake; 50 | select * from manual where cui is not null; 51 | delete from manual where length(tid) < 1; 52 | delete from 
bioportal where length(tid) < 1; 53 | delete from lvalue where length(tid) < 1; 54 | delete from lvaluerake where length(tid) < 1; 55 | 56 | alter table manual add column `cui` varchar(20); 57 | alter table bioportal add column `cui` varchar(20); 58 | alter table lvalue add column `cui` varchar(20); 59 | alter table lvaluerake add column `cui` varchar(20); 60 | update manual m set cui=(select u.cui from umls.mrconso u,umls.mrsty s where u.str=m.major and u.cui=s.cui and s.tui in ("T200","T020","T190","T049","T019","T047","T050","T037","T048","T191","T046","T184","T060","T065","T058","T059","T063","T062","T061") limit 1); 61 | update bioportal m set cui=(select u.cui from umls.mrconso u,umls.mrsty s where u.str=m.org_str and u.cui=s.cui and s.tui in ("T200","T020","T190","T049","T019","T047","T050","T037","T048","T191","T046","T184","T060","T065","T058","T059","T063","T062","T061") limit 1); 62 | update lvalue m set cui=(select u.cui from umls.mrconso u,umls.mrsty s where u.str=m.org_str and u.cui=s.cui and s.tui in ("T200","T020","T190","T049","T019","T047","T050","T037","T048","T191","T046","T184","T060","T065","T058","T059","T063","T062","T061") limit 1); 63 | update lvaluerake m set cui=(select u.cui from umls.mrconso u,umls.mrsty s where u.str=m.org_str and u.cui=s.cui and s.tui in ("T200","T020","T190","T049","T019","T047","T050","T037","T048","T191","T046","T184","T060","T065","T058","T059","T063","T062","T061") limit 1); 64 | 65 | select distinct tid from cancer_cui where pattern!= 'CUI_ALL'; 66 | select distinct pattern from cancer_cui; 67 | 68 | select distinct c.tid,c.org_str,c.sentence from cancer_cui c, cancer_mm_cui m where c.org_str = m.org_str and c.tid = m.tid; 69 | select distinct c.tid,c.org_str,c.sentence from cancer_cui c, bioportal m where c.org_str = m.major and c.tid = m.tid; 70 | 71 | select distinct m.tid,m.major,m.sentence from cancer_cui c, manual m where c.tid = m.tid and length(m.cui)>0 and c.`group`='CUI_DISEASE_MAIN' and instr(c.org_str,m.major) > 0; 72 | select distinct m.tid,m.major,m.sentence from cancer_cui c, manual m where c.tid = m.tid and c.cui is not null and m.cui is not null and instr(c.org_str,m.major) > 0; 73 | select distinct m.tid,m.major,m.sentence from cancer_mm_cui c, manual m where c.tid = m.tid and c.cui is not null and m.cui is not null and instr(c.org_str,m.major) > 0; 74 | select distinct m.tid,m.major,m.sentence from bioportal c, manual m where c.tid = m.tid and c.cui is not null and m.cui is not null and instr(c.org_str,m.major) > 0; 75 | select distinct m.tid,m.major,m.sentence from lvalue c, manual m where c.tid = m.tid and c.cui is not null and m.cui is not null and instr(c.org_str,m.major) > 0; 76 | select distinct m.tid,m.major,m.sentence from lvaluerake c, manual m where c.tid = m.tid and c.cui is not null and m.cui is not null and instr(c.org_str,m.major) > 0; 77 | 78 | select * from cancer_cui; 79 | 80 | select SUM(`group` like '%_PT'),SUM(`group` like '%_AF') from cancer_cui; -------------------------------------------------------------------------------- /sql-script/pattern_all.sql: -------------------------------------------------------------------------------- 1 | use compact_092316; 2 | 3 | select D.Disease, count(*) as cnt from cancer_cui C join all_diseases_trials D 4 | on C.tid=D.tid and C.month>-1 and nested!='nesting' 5 | group by D.Disease order by cnt desc; -------------------------------------------------------------------------------- /sql-script/pattern_all_disease.sql: 
--------------------------------------------------------------------------------
1 | 
2 | SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='cancer_cui';
3 | 
4 | use compact_092316;
5 | 
6 | alter table cancer_cui add column var boolean after tags;
7 | 
8 | create table variable (var varchar(200));
9 | create index idx_var on variable(var) using hash;
10 | 
11 | create index idx_tid on all_diseases_trials(TID) using hash;
12 | create index idx_disease on all_diseases_trials(disease) using hash;
13 | create index idx_sty on cancer_cui(sty) using hash;
14 | create index idx_nested on cancer_cui(nested) using hash;
15 | 
16 | alter table meta add primary key (tid);
17 | create index idx_status on meta (overall_status);
18 | create index idx_min_age2 on meta (minimum_age_in_year);
19 | create index idx_max_age2 on meta (maximum_age_in_year);
20 | create index idx_phase on meta (phase);
21 | create index idx_int on meta (intervention_type);
22 | create index idx_std on meta (study_type);
23 | 
24 | select count(*) from cancer_cui; -- 6179540
25 | select count(*) from meta; -- 225364
26 | select count(*) from all_diseases_trials; -- 1016579
27 | 
28 | select tid, count(*) as cnt from meta group by tid order by cnt desc; -- one to one
29 | select maximum_age_in_year, count(*) as cnt from meta group by maximum_age_in_year order by cnt desc;
30 | 
31 | select count(*) from meta where study_type ='Interventional' and (STR_TO_DATE(start_date,'%M %Y') >= STR_TO_DATE('JANUARY 2000','%M %Y'))
32 | and (STR_TO_DATE(start_date,'%M %Y') <= STR_TO_DATE('SEPTEMBER 2016','%M %Y')); -- 172245
33 | select count(distinct tid) from all_diseases_trials where disease ='diabetes-mellitus-type-2'; -- 5000
34 | 
35 | CREATE TABLE T2DM_0100_0916 AS (select * from meta where study_type ='Interventional' and (STR_TO_DATE(start_date,'%M %Y') >= STR_TO_DATE('JANUARY 2000','%M %Y'))
36 | and (STR_TO_DATE(start_date,'%M %Y') <= STR_TO_DATE('SEPTEMBER 2016','%M %Y'))
37 | and tid in (select distinct tid from all_diseases_trials where disease ='diabetes-mellitus-type-2')); -- 4201
38 | create table T2DM_CUI as (select C.* from cancer_cui C join T2DM_0100_0916 T on C.tid=T.tid ); -- 121341
39 | 
40 | select var, count(*) from cancer_cui group by var;
41 | 
42 | -- Results to be reported:
43 | -- Rank of the diseases by the number of criteria with temporal constraints, (# of umls terms with temporal constraints)
44 | select T.disease, count(*) as cnt from cancer_cui C join all_diseases_trials T on C.tid=T.tid and C.month >= 0 and C.nested != 'nested' and C.var is null group by T.disease order by cnt desc;
45 | -- Rank of the diseases by the average number of criteria with temporal constraints per trial
46 | select T.disease, count(distinct tid) as cnt from all_diseases_trials T group by T.disease order by cnt desc;
47 | -- Distribution of semantic types (overall)
48 | select sty, count(*) as cnt from cancer_cui where nested != 'nested' and var is null group by sty order by cnt desc;
49 | -- Frequency of temporal patterns (overall)
50 | select month, count(*) as cnt from cancer_cui where nested != 'nested' and var is null group by month order by cnt desc;
51 | 
52 | 
53 | select * from cancer_cui where month<-1 ;
54 | 
55 | update cancer_cui d join compact_092316.variable v on (v.var=d.cui_str or v.var=d.org_str) set d.var = true;
56 | 
57 | 
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 
64 | 
-------------------------------------------------------------------------------- /sql-script/pattern_sty_prefer.sql:
--------------------------------------------------------------------------------
1 | use cancer;
2 | select * from cancer_cui where skipNum>0 and length(method)>0 and method != 'fullDep' and sentence not like '% or %';
3 | select distinct method from cancer_cui;
4 | select org_str, sentence from cancer_cui where sentence='Active gastrointestinal tract disease with malabsorption syndrome.';
5 | 
6 | select * from noncui order by freq desc;
7 | select distinct sentence from cancer_cui where splitType='No';
8 | 
9 | create database cancer_more_sty char set utf8;
10 | use cancer_more_sty;
11 | create table cancer_cui like cancer.cancer_cui;
12 | create table noncui like cancer.noncui;
13 | create table cancer_metamap_cui like cancer.cancer_metamap_cui;
14 | 
15 | use ner200;
16 | -- get the sty pairs such that a term belongs to both of them
17 | select A.sty,B.sty, count(*) cnt from cancer_cui A, cancer_cui B where length(A.cui)>0 and length(B.cui)>0 and A.tid=B.tid and A.criteriaId=B.criteriaId and A.sentId=B.sentId and A.org_str=B.org_str and A.sty > B.sty group by A.sty,B.sty order by cnt desc;
18 | 
19 | select count(*) from cancer_more_sty.cancer_cui;
20 | 
21 | select * from sty_prefer_cui where sty_prefer_cui.sty1=null ;
22 | select sty_prefer_cui.sty1=null from sty_prefer_cui;
23 | 
24 | alter table cancer_cui add column flag int after sty;
25 | create table sty_prefer_orgstr like sty_prefer_cui;
26 | 
27 | update sty_prefer_orgstr,sty_prefer_cui set sty_prefer_orgstr.prefer=sty_prefer_cui.prefer,sty_prefer_orgstr.reason=sty_prefer_cui.reason where sty_prefer_orgstr.sty1=sty_prefer_cui.sty1 and sty_prefer_orgstr.sty2=sty_prefer_cui.sty2;
28 | 
29 | select * from sty_prefer_orgstr;
30 | 
31 | select A.sty,B.sty, count(*) cnt from cancer_cui A, cancer_cui B where length(A.cui)>0 and length(B.cui)>0 and A.tid=B.tid and A.criteriaId=B.criteriaId and A.sentId=B.sentId and A.org_str=B.org_str and A.sty > B.sty group by A.sty,B.sty order by cnt desc;
32 | 
33 | update cancer_cui A,cancer_cui B,cancer_more_sty.sty_prefer_cui C set A.sty_ignored = true where length(A.cui)>0 and length(B.cui)>0 and A.tid=B.tid and A.criteriaId=B.criteriaId and A.sentId=B.sentId and A.org_str=B.org_str and A.sty > B.sty and C.prefer != A.sty and ((A.sty=C.sty1 and C.sty2=B.sty) or (A.sty=C.sty2 and C.sty1=B.sty));
34 | 
35 | select * from cancer_cui A where sty='T116';
36 | -- update cancer_cui set flag=null;
37 | select sty, count(*) cnt from cancer_cui group by sty order by cnt desc;
38 | create table sty_prefer_orgstr like cancer_more_sty.sty_prefer_orgstr;
39 | select * from sty_prefer_orgstr where length(prefer)>0;
40 | select * from cancer_more_sty.sty_prefer_orgstr;
41 | update sty_prefer_orgstr A, cancer_more_sty.sty_prefer_cui B set A.prefer=B.prefer,A.reason=B.reason where A.sty1=B.sty1 and A.sty2=B.sty2;
42 | 
43 | select * from cancer_cui where sty_ignored is null;
44 | select count(distinct org_str) from cancer_cui;
45 | 
-------------------------------------------------------------------------------- /sql-script/ret-yahoo.sql: --------------------------------------------------------------------------------
1 | create database if not exists retyahoo character set utf8;
2 | use retyahoo;
3 | 
4 | select * from content_tag_ytex_yahoo;
5 | 
6 | select count(distinct instance_id) from content_tag_ytex_yahoo;
7 | select count(distinct id) from org_yahoo;
8 | 
9 | drop table content_tag_ytex_yahoo;
10 | rename table retyahoo.org_yahoo to ytex.org_yahoo;
11 | 
12 | 
13 | use ytex;
14 | select * from ytex.org_yahoo;
15 | select
max(id) from (select id from org_yahoo where id>= 0 and id < 120753 order by id limit 5000) a; -- the max(id) queries below walk org_yahoo in 5000-row batches and record the last id of each batch
16 | -- 120753
17 | select max(id) from (select id from org_yahoo where id>= 120753 and id < 334572 order by id limit 5000) a;
18 | -- 334572
19 | select max(id) from (select id from org_yahoo where id>= 334572 and id < 640612 order by id limit 5000) a;
20 | -- 640612
21 | select max(id) from (select id from org_yahoo where id>= 640612 and id < 925081 order by id limit 5000) a;
22 | -- 925081
23 | select max(id) from (select id from org_yahoo where id>= 925081 and id < 1219340 order by id limit 5000) a;
24 | -- 1219340
25 | select max(id) from (select id from org_yahoo where id>= 1219340 and id < 1664699 order by id limit 5000) a;
26 | -- 1664699
27 | select max(id) from (select id from org_yahoo where id>= 1664699 and id < 1994240 order by id limit 5000) a;
28 | -- 1994240
29 | select max(id) from (select id from org_yahoo where id>= 1994240 and id < 2685340 order by id limit 5000) a;
30 | -- 2685340
31 | select max(id) from (select id from org_yahoo where id>= 2685340 and id < 3079989 order by id limit 5000) a;
32 | -- 3079989
33 | select max(id) from (select id from org_yahoo where id>= 3079989 and id < 3440146 order by id limit 5000) a;
34 | -- 3440146
35 | select max(id) from (select id from org_yahoo where id>= 3440146 and id < 3680908 order by id limit 5000) a;
36 | -- 3680908
37 | select max(id) from (select id from org_yahoo where id>= 3680908 and id < 1000000000 order by id limit 5000) a;
38 | -- 3822084
39 | 
40 | 
41 | select count(id) from org_yahoo where id>= 640612 and id < 925081 order by id;
42 | select count(id) from org_yahoo where id>= 3680908 and id < 1000000000;
43 | 
44 | select count(distinct document_id) from document;
45 | 
-------------------------------------------------------------------------------- /sql-script/sent_1213.sq..sql: --------------------------------------------------------------------------------
1 | use ytex;
2 | 
3 | /*
4 | sed -i -- 's/concat(subject, " . ", content)/chosenanswer/g' *.xml
5 | sed -i -- 's/question<\/string>/answer<\/string>/g' *.xml
6 | 
7 | */
8 | 
9 | create table yahootumblr (
10 | id bigint(20) default null,
11 | content text default null
12 | );
13 | 
14 | insert yahootumblr (id,content) select distinct blogId, text_content from content_org_new;
15 | insert yahootumblr (id,content) select distinct id, concat(subject, ". ", content, ". 
", chosenanswer) from org_yahoo; 16 | select * from yahootumblr; 17 | delete from yahootumblr where id is null; 18 | 19 | -- question 20 | select count(*) from anno_sentence; -- 267617 21 | select count(sentence_text) from v_document_cui_sent; -- 399076 22 | select count(distinct sentence_text) from v_document_cui_sent; -- 142802 23 | 24 | rename table retyahoo.org_yahoo to ytex.org_yahoo; 25 | select count(distinct id) from org_yahoo; 26 | select count(distinct blogId) from content_org_new; 27 | 28 | -- question 29 | select count(*) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'question'; -- 267617 30 | select count(distinct substr(`d`.`doc_text`,(`b`.`span_begin` + 1),(`b`.`span_end` - `b`.`span_begin`))) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'question'; -- 249013 31 | 32 | -- answer 33 | select count(*) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'answer'; -- 428881 34 | select count(distinct substr(`d`.`doc_text`,(`b`.`span_begin` + 1),(`b`.`span_end` - `b`.`span_begin`))) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'answer'; -- 348793 35 | 36 | -- blog 37 | select count(*) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'blog'; -- 52551 38 | select count(distinct substr(`d`.`doc_text`,(`b`.`span_begin` + 1),(`b`.`span_end` - `b`.`span_begin`))) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'blog'; -- 47413 39 | 40 | select distinct blogId from content_org_new where blogId >=116473932370 order by blogId limit 1000; 41 | 42 | -------------------------------------------------------------------------------- /sql-script/smb.sql: -------------------------------------------------------------------------------- 1 | 2 | use ret1007; 3 | 4 | select * from content_tag_disease_ytex_T047_unique_output order by blogId,sentence 5 | into outfile '/tmp/ret-final-1007.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n'; 6 | select * from content_tag_ytex order by instance_id,sentence_text 7 | into outfile '/tmp/ret-basic-1007.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n'; 8 | 9 | 10 | 11 | select * from co_occur ; 12 | 13 | use ret1018; 14 | select * from content_tag_disease_ytex_unique_output; 15 | 16 | 17 | select * from ret1018.content_tag_disease_ytex_unique_output; 18 | 19 | alter table ret1007.content_tag_ytex add column `sab` varchar(200) after `cui`; 20 | 21 | update ret1007.content_tag_ytex as cty 22 | set cty.sab= ( 23 | select group_concat(distinct sab separator ',') from umls.mrconso as con 24 | where cty.cui = con.cui 25 | group by cty.cui 26 | ); 27 | select * from ret1007.content_tag_ytex; 28 | 29 | rename table ret.content_tag_increased_target to ret1018.content_tag_increased_target; 30 | -------------------------------------------------------------------------------- /sql-script/socialqa.sql: -------------------------------------------------------------------------------- 1 | 2 | use socialqa; 3 | 4 | select count(*) from socialqa.qdataH; -- 3822256 5 | select distinct top_level_category 
from socialqa.qdataH; -- 29
6 | select count(*) from socialqa.qdataH where top_level_category='Health'; -- 2820179
7 | 
8 | create table health_answers_for_8000_questions as (select a.qid,a.content,a.rating, a.userid,a.usernick from adataH a, health_questions_random_8000 q where a.qid=q.qid);
9 | select * from socialqa.qdataH;
10 | select count(*) from `health_questions_random_8000`;
11 | 
12 | -- pick 8000 questions and their answers.
13 | set group_concat_max_len=1024000;
14 | select q.qid, replace(group_concat(q.content,'\n',a.content),'\r','')
15 | from `health_questions_random_8000` q, `health_answers_for_8000_questions` a
16 | where a.qid=q.qid group by qid
17 | into outfile '/tmp/socialqa_8000.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
18 | select q.qid,q.content, q.userid
19 | from `health_questions_random_8000` q
20 | union all
21 | select a.qid,a.content,a.userid
22 | from `health_answers_for_8000_questions` a
23 | into outfile '/tmp/socialqa_8000_uid.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
24 | 
25 | 
26 | select U.userid, count(*) as cnt from
27 | (select userid from health_questions_random_8000
28 | union all
29 | select userid from health_answers_for_8000_questions ) as U
30 | group by U.userid
31 | order by cnt desc;
32 | select userid,count(*) as cnt from health_answers_for_8000_questions group by userid order by cnt desc;
33 | 
34 | -- split all answers into multiple files.
35 | select id, replace(concat(subject, ' ', content,' ',chosenanswer),'\r','') from socialqa.qdataH
36 | where id > 3822256/4*0 and id <= 3822256/4*1
37 | into outfile '/tmp/socialqa_dataset1.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
38 | select id, replace(concat(subject, ' ', content,' ',chosenanswer),'\r','') from socialqa.qdataH
39 | where id > 3822256/4*1 and id <= 3822256/4*2
40 | into outfile '/tmp/socialqa_dataset2.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
41 | select id, replace(concat(subject, ' ', content,' ',chosenanswer),'\r','') from socialqa.qdataH
42 | where id > 3822256/4*2 and id <= 3822256/4*3
43 | into outfile '/tmp/socialqa_dataset3.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
44 | select id, replace(concat(subject, ' ', content,' ',chosenanswer),'\r','') from socialqa.qdataH
45 | where id > 3822256/4*3 and id <= 3822256/4*4 + 4
46 | into outfile '/tmp/socialqa_dataset4.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
47 | 
48 | select count(*) from socialqa.qdataH;
49 | rename table umls._target_term_ to umls._target_term_botanical_;
50 | select * from umls._target_term_botanical_;
51 | 
52 | 
-------------------------------------------------------------------------------- /sql-script/somelab-sctGraph.sql: --------------------------------------------------------------------------------
1 | show databases;
2 | use ret;
3 | use umls;
4 | use gg;
5 | show tables;
6 | select * from content_tag_compare_same;
7 | select count(distinct cui) from umls.mrconso;
8 | 
9 | create database gg character set utf8;
10 | 
11 | drop table social;
12 | CREATE TABLE `social` (
13 | `stt` varchar(300) DEFAULT NULL,
14 | `sty` varchar(300) DEFAULT NULL,
15 | `ptr` varchar(300) DEFAULT NULL,
16 | `aui` varchar(9) DEFAULT NULL,
17 | `sid` bigint DEFAULT NULL,
18 | `fsn` varchar(500) DEFAULT NULL
19 | ) CHARSET=utf8;
20 | 
21 | load data local infile 'C:\\fsu\\ra\\data\\graph-group\\SNOMED_database\\SNOMEDCT_SOCIAL_CONTEXT_with_PATH.tsv'
22 | into table
social
23 | fields terminated by '\t'
24 | -- enclosed by '"'
25 | lines terminated by '\n'
26 | ignore 1 lines;
27 | 
28 | insert observe (stt,sty,ptr,aui) values ('aaa','bbb','111.222.333','444');
29 | insert observe (stt,sty,ptr,aui) values ('aaa','bbb','555.666.777','333');
30 | use gg;
31 | 
32 | ALTER TABLE observe add ( pt_aui varchar(30) default null);
33 | /*get the preferred term by SNOMED CT code from mrconso. Time-consuming: about 4 hours.*/
34 | update observe as o
35 | set o.pt_aui = (
36 | select aui from umls.mrconso where SAB='SNOMEDCT_US' AND TTY = 'PT' AND CODE = o.sid);
37 | ALTER TABLE social add ( pt_aui varchar(30) default null);
38 | UPDATE social AS s
39 | inner join umls.mrconso AS con
40 | ON con.SAB='SNOMEDCT_US' AND con.TTY = 'PT' AND con.CODE = s.sid
41 | SET s.pt_aui = con.AUI
42 | ;
43 | ALTER TABLE observe add ( pt_aui_str varchar(1000) default null);
44 | UPDATE observe AS s
45 | inner join umls.mrconso AS con
46 | ON s.pt_aui = con.AUI
47 | SET s.pt_aui_str = con.STR
48 | ;
49 | ALTER TABLE social add ( pt_aui_str varchar(1000) default null);
50 | UPDATE social AS s
51 | inner join umls.mrconso AS con
52 | ON s.pt_aui = con.AUI
53 | SET s.pt_aui_str = con.STR
54 | ;
55 | 
56 | set group_concat_max_len=102400000;
57 | select count(*) from observe group by stt,sty;
58 | 
59 | create table observe_group as
60 | select o1.stt, o1.sty, count(*) as cnt, group_concat(distinct o1.pt_aui, ' ', IFNULL(o2.pt_aui, 'null') separator ',co_occur') from
61 | (select stt,sty,ptr,pt_aui from observe o) o1
62 | left join observe o2
63 | on o1.stt=o2.stt and o1.sty=o2.sty
64 | and o1.ptr regexp concat('.*',o2.pt_aui,'$')
65 | group by o1.stt,o1.sty
66 | ;
67 | 
68 | /*create the grouped table of (AUI, parent AUI) pairs*/
69 | drop table if exists observe_group;
70 | create table observe_group as
71 | select o1.stt, o1.sty, count(distinct o1.pt_aui) as cnt_all, count(distinct o1.pt_aui,o2.pt_aui) as cnt_parent,
72 | group_concat(distinct o1.pt_aui, '\t', IFNULL(o2.pt_aui, 'null') separator '`') as pairs,
73 | group_concat(distinct o1.pt_aui, '\t', IFNULL(o1.pt_aui_str, 'null') separator '`') as pairs_str1,
74 | group_concat(distinct o2.pt_aui, '\t', IFNULL(o2.pt_aui_str, 'null') separator '`') as pairs_str2
75 | from observe o1
76 | left join observe o2
77 | on o1.stt=o2.stt and o1.sty=o2.sty
78 | and o1.ptr regexp concat('.*',o2.pt_aui,'$')
79 | group by o1.stt,o1.sty
80 | ;
81 | drop table if exists social_group;
82 | create table social_group as
83 | select o1.stt, o1.sty, count(distinct o1.pt_aui) as cnt_all, count(distinct o1.pt_aui,o2.pt_aui) as cnt_parent,
84 | group_concat(distinct o1.pt_aui, '\t', IFNULL(o2.pt_aui, 'null') separator '`') as pairs,
85 | group_concat(distinct o1.pt_aui, '\t', IFNULL(o1.pt_aui_str, 'null') separator '`') as pairs_str1,
86 | group_concat(distinct o2.pt_aui, '\t', IFNULL(o2.pt_aui_str, 'null') separator '`') as pairs_str2
87 | from social o1
88 | left join social o2
89 | on o1.stt=o2.stt and o1.sty=o2.sty
90 | and o1.ptr regexp concat('.*',o2.pt_aui,'$')
91 | group by o1.stt,o1.sty
92 | ;
93 | 
94 | select * from observe_group order by cnt_parent;
95 | select * from social_group order by cnt_parent;
96 | 
97 | select * from observe_group order by cnt_parent into outfile '/tmp/observe_group.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
98 | select * from social_group order by cnt_parent into outfile '/tmp/social_group.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
99 | 
100 | 
101 | use ret;
102 | 
103 | select * from content_tag_ytex_T047_unique
; 104 | 105 | 106 | select distinct A.target2 from (select * from co_occur where sab1 is not null and sab2 is null) A; 107 | select * from co_occur where sab1 is not null and sab2 is null; 108 | 109 | select count(distinct target) from content_tag_our_T047_unique; 110 | select count(distinct target) from content_tag_ytex_T047_unique; 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /sql-script/synonym.sql: -------------------------------------------------------------------------------- 1 | create database synonym char set utf8; 2 | use synonym; 3 | create table test_term_umls ( 4 | term varchar(100), 5 | cui varchar(10), 6 | synonym text 7 | ); 8 | 9 | create table wiki_ngram like chv.cancer_ngram; 10 | create table wiki_ngram_tf5 like wiki_ngram; 11 | 12 | rename table socialqa.wiki_ngram to synonym.wiki_ngram; 13 | 14 | load data local infile '/tmp/wiki.tf5.ngram.novector' into table wiki_ngram_tf5 fields terminated by '\t' enclosed by '"' lines terminated by '\r\n' ignore 1 lines; 15 | 16 | 17 | set group_concat_max_len=10240; 18 | select * from test_term_umls; 19 | -- synonym with at most 3 words 20 | truncate test_term_umls; 21 | insert into test_term_umls (term,cui) select distinct ngram, cui_umls from wiki_ngram; 22 | -- update test_term_umls t set synonym = (select GROUP_CONCAT(distinct s.descr SEPARATOR '|' ) from umls._target_term_ s where cui=t.cui and (length(descr)-length(replace(descr,' ', '')))<= 2); 23 | 24 | update test_term_umls t set synonym = (select GROUP_CONCAT(distinct s.str SEPARATOR '|' ) from umls.mrconso s where s.cui=t.cui and (s.sab like 'SNOMEDCT_US%' or s.sab like 'RXNORM%' OR s.sab like 'ICD%' OR s.sab like 'NCI%' OR s.sab like 'LOINC%') and (length(str)-length(replace(str,' ', '')))<= 3); 25 | 26 | 27 | select * from umls._target_term_ ; 28 | select * from umls.mrconso where cui='C0439234'; 29 | select count(*) from wiki_ngram; 30 | select * from test_term_umls 31 | into outfile '/tmp/freq_term.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n'; 32 | 33 | select distinct sab from umls.mrconso; 34 | select ngram from wiki_ngram_tf5 where n>1 35 | into outfile '/tmp/wiki_ngram_tf5.txt'; 36 | 37 | select cvalue, count(*) as cnt from wiki_ngram group by cvalue order by cvalue; 38 | select count(*) from wiki_ngram where tf>=100 and ngram like '%(A)%'; 39 | select count(*) from wiki_ngram_tf5 where tf>=100 and ngram like '%(A)%' order by tf; 40 | select * from test_term_umls where term like '%mother%'; 41 | -------------------------------------------------------------------------------- /sql-script/umls.sql: -------------------------------------------------------------------------------- 1 | use umls; 2 | 3 | select * from MRCONSO where cui = 'C1875802' limit 2000 ; 4 | select count(*) from mrsmap; 5 | select * from mrcols where col='STY'; 6 | 7 | select count(distinct cui) from mrconso; 8 | 9 | SHOW FULL PROCESSLIST; 10 | 11 | select * from mrsty; 12 | select * from mrrel where rel is null; 13 | 14 | select distinct sab from umls.mrconso; 15 | 16 | show processlist; 17 | 18 | select CUI, AUI, SAB, STR from MRCONSO where LAT = 'ENG' into outfile 'c:/fsu/all.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n'; 19 | 20 | select * from mrconso where stt='PF'; 21 | 22 | select CUI, COUNT(*) FROM MRSTY GROUP BY CUI; 23 | 24 | desc mrsty; 25 | 26 | select sab, count(*) from mrconso group 
by sab; 27 | 28 | select count(*) from mrconso where str like '%diabetes%' or str like '%type 1 diabetes%' or str like '%type 2 diabetes%'; 29 | /*4021 result*/ 30 | 31 | select distinct cui from mrconso where str like '%diabetes%' or str like '%type 1 diabetes%' or str like '%type 2 diabetes%'; 32 | /*more than 1k*/ 33 | 34 | select * from mrconso where sab ='SNOMEDCT_US' AND ( str like '%diabetes%' or str like '%type 1 diabetes%' or str like '%type 2 diabetes%'); 35 | 36 | select distinct cui from mrconso where str = 'diabetes' or str = 'type 1 diabetes' or str = 'type 2 diabetes'; 37 | /*4 results*/ 38 | select count(*) from mrrel where cui1 in ('C0011847','C0011849','C0011854','C0011860') or cui2 in ('C0011847','C0011849','C0011854','C0011860') ; 39 | /*5994 result*/ 40 | 41 | create view rel_diabetes as select * from mrrel where cui1 in ('C0011847','C0011849','C0011854','C0011860') or cui2 in ('C0011847','C0011849','C0011854','C0011860') ; 42 | 43 | create view content_rel as select distinct c.cui from CONTENT_TAG c inner join rel_diabetes r 44 | on c.cui = r.cui1 COLLATE utf8_unicode_ci or c.cui = r.cui2 COLLATE utf8_unicode_ci; 45 | 46 | select count(distinct CUI) from MRCONSO; 47 | select * from tmp_rel_diabetes; 48 | select * from CONTENT_TAG; 49 | delete from content_tag where blogId='post_id'; 50 | 51 | 52 | 53 | drop table if exists CONTENT_ORG; 54 | CREATE TABLE CONTENT_ORG ( 55 | `blogId` BIGINT(20) DEFAULT NULL, /*----blog id */ 56 | `post_hashtag` varchar(300) DEFAULT NULL, 57 | `blog_name` varchar(200) DEFAULT NULL, 58 | `text_link_title` varchar(500) DEFAULT NULL, 59 | `text_content` varchar(10000) DEFAULT NULL 60 | ); 61 | 62 | truncate CONTENT_ORG; 63 | 64 | /*load result in to the table*/ 65 | load data local infile 'C:\\fsu\\ra\\UmlsTagger\\data\\data_content_tag_diabetes_0821.csv' 66 | into table CONTENT_ORG 67 | fields terminated by ',' 68 | enclosed by '"' 69 | lines terminated by '`' 70 | ; 71 | 72 | select *, length(text_content) AS len from content_org ; 73 | select count(distinct blogid) from content_org; 74 | desc content_org; 75 | 76 | select blogId instance_id from umls.CONTENT_ORG; 77 | -- select text_content note_text from umls.CONTENT_ORG where blogId = :instance_id; 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /sql-script/usuk.sql: -------------------------------------------------------------------------------- 1 | create database usuk char set utf8; 2 | use usuk; 3 | 4 | create table UK_T2DM_Trials AS 5 | (select tid, criteria from compact_092316.trials where 6 | (STR_TO_DATE(start_date,'%M %Y') >= STR_TO_DATE('JANUARY 2005','%M %Y')) 7 | and (STR_TO_DATE(start_date,'%M %Y') <= STR_TO_DATE('September 2016','%M %Y')) 8 | and study_type = 'Interventional' and tid in (select tid from compact_092316.all_diseases_trials where disease ='diabetes-mellitus-type-2') 9 | and tid in (select tid from compact_092316.authority where authority like '%United Kingdom%')); 10 | 11 | 12 | create table US_T2DM_Trials AS 13 | (select tid, criteria from compact_092316.trials where 14 | (STR_TO_DATE(start_date,'%M %Y') >= STR_TO_DATE('JANUARY 2005','%M %Y')) 15 | and (STR_TO_DATE(start_date,'%M %Y') <= STR_TO_DATE('September 2016','%M %Y')) 16 | and study_type = 'Interventional' and tid in (select tid from compact_092316.all_diseases_trials where disease ='diabetes-mellitus-type-2') 17 | and tid in (select tid from compact_092316.authority where authority like '%United States%')); 18 | 19 | select tid,criteria from 
US_T2DM_Trials; 20 | 21 | create table cancer_cui like cancer.cancer_cui; 22 | create table noncui like cancer.noncui; 23 | create table cancer_mm_cui like ner200.cancer_mm_cui; 24 | 25 | 26 | -------------------------------------------------------------------------------- /sql-script/yahoo.sql: -------------------------------------------------------------------------------- 1 | use ytex; 2 | drop table tmp_org_yahoo; 3 | CREATE TABLE TMP_ORG_yahoo ( 4 | `qid` varchar(50), 5 | `id` int(11), 6 | `category` varchar(50), 7 | `categoryId` int(11), 8 | `subject` text, 9 | `content` text, 10 | `day` varchar(22), 11 | `link` varchar(256), 12 | `userid` varchar(50), 13 | `usernick` varchar(200), 14 | `numanswers` int(10), 15 | `numcomments` int(10), 16 | `chosenanswer` text, 17 | `chosenanswererid` varchar(50), 18 | `chosenanswerernick` varchar(200), 19 | `chosenanswertimestamp` varchar(20) 20 | ); 21 | load data local infile 'C:\\fsu\\ra\\data\\qdataH.diabetes.all_58425.csv' 22 | into table TMP_ORG_yahoo 23 | fields terminated by ',' 24 | enclosed by '"' 25 | lines terminated by '\r\n' 26 | ignore 1 LINES 27 | ; 28 | 29 | select * from tmp_org_yahoo ; 30 | select count( distinct qid) from tmp_org_yahoo; 31 | 32 | -- ytex using sql: 33 | select distinct id INSTANCE_ID from ytex.TMP_ORG_yahoo where qid is not null; 34 | select concat(subject, ". ", content, ". ", chosenanswer) note_text from ytex.TMP_ORG_yahoo where id = :instance_id limit 1; 35 | select chosenanswer note_text from ytex.TMP_ORG_yahoo where id = :instance_id limit 1; 36 | 37 | drop table content_tag_ytex_yahoo_answer; 38 | create table ytex.content_tag_ytex_yahoo_answer as 39 | select yh.qid, a.anno_text, d.instance_id, c.* from v_document_cui_sent c 40 | inner join v_annotation a on c.anno_base_id = a.anno_base_id 41 | inner join v_document d on d.document_id = c.document_id 42 | inner join TMP_ORG_yahoo yh on yh.id = d.instance_id 43 | ; 44 | insert into ytex.content_tag_ytex_yahoo 45 | select yh.qid, a.anno_text, d.instance_id, c.* from v_document_cui_sent c 46 | inner join v_annotation a on c.anno_base_id = a.anno_base_id 47 | inner join v_document d on d.document_id = c.document_id 48 | inner join TMP_ORG_yahoo yh on yh.id = d.instance_id 49 | ; 50 | select count(distinct qid) from content_tag_ytex_yahoo where analysis_batch='answer'; 51 | -- q: 15378, a:25342, all:40720 52 | 53 | select distinct doc_text from document; 54 | select count(distinct anno_base_id) from anno_base; 55 | select * from anno_named_entity; 56 | select count(distinct code) from anno_ontology_concept; 57 | select count(distinct document_id) from content_tag_ytex_yahoo; 58 | select count(distinct qid) from content_tag_ytex_yahoo_answer; 59 | select count(distinct document_id) from v_document_cui_sent; 60 | 61 | select concat(subject, ". ", content, ". 
", chosenanswer) from TMP_ORG_yahoo; 62 | 63 | select * from ytex.TMP_ORG_yahoo where chosenanswer like '%article%'; 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /src/main/java/StanfordCoreNlpDemo.java: -------------------------------------------------------------------------------- 1 | 2 | import java.io.*; 3 | import java.util.*; 4 | 5 | import edu.stanford.nlp.coref.CorefCoreAnnotations; 6 | 7 | import edu.stanford.nlp.coref.data.CorefChain; 8 | import edu.stanford.nlp.io.*; 9 | import edu.stanford.nlp.ling.*; 10 | import edu.stanford.nlp.pipeline.*; 11 | import edu.stanford.nlp.semgraph.SemanticGraph; 12 | import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; 13 | import edu.stanford.nlp.sentiment.SentimentCoreAnnotations; 14 | import edu.stanford.nlp.trees.*; 15 | import edu.stanford.nlp.util.*; 16 | 17 | /** This class demonstrates building and using a Stanford CoreNLP pipeline. */ 18 | public class StanfordCoreNlpDemo { 19 | 20 | /** Usage: java -cp "*" StanfordCoreNlpDemo [inputFile [outputTextFile [outputXmlFile]]] */ 21 | public static void main(String[] args) throws IOException { 22 | // set up optional output files 23 | PrintWriter out; 24 | if (args.length > 1) { 25 | out = new PrintWriter(args[1]); 26 | } else { 27 | out = new PrintWriter(System.out); 28 | } 29 | PrintWriter xmlOut = null; 30 | if (args.length > 2) { 31 | xmlOut = new PrintWriter(args[2]); 32 | } 33 | 34 | // Create a CoreNLP pipeline. To build the default pipeline, you can just use: 35 | // StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 36 | // Here's a more complex setup example: 37 | // Properties props = new Properties(); 38 | // props.put("annotators", "tokenize, ssplit, pos, lemma, ner, depparse"); 39 | // props.put("ner.model", "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"); 40 | // props.put("ner.applyNumericClassifiers", "false"); 41 | // StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 42 | 43 | // Add in sentiment 44 | Properties props = new Properties(); 45 | props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment"); 46 | 47 | StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 48 | 49 | // Initialize an Annotation with some text to be annotated. The text is the argument to the constructor. 50 | Annotation annotation; 51 | if (args.length > 0) { 52 | annotation = new Annotation(IOUtils.slurpFileNoExceptions(args[0])); 53 | } else { 54 | annotation = new Annotation("Kosgi Santosh sent an email to Stanford University. He didn't get a reply."); 55 | } 56 | 57 | // run all the selected Annotators on this text 58 | pipeline.annotate(annotation); 59 | 60 | // this prints out the results of sentence analysis to file(s) in good formats 61 | pipeline.prettyPrint(annotation, out); 62 | if (xmlOut != null) { 63 | pipeline.xmlPrint(annotation, xmlOut); 64 | } 65 | 66 | // Access the Annotation in code 67 | // The toString() method on an Annotation just prints the text of the Annotation 68 | // But you can see what is in it with other methods like toShorterString() 69 | out.println(); 70 | out.println("The top level annotation"); 71 | out.println(annotation.toShorterString()); 72 | out.println(); 73 | 74 | // An Annotation is a Map with Class keys for the linguistic analysis types. 75 | // You can get and use the various analyses individually. 76 | // For instance, this gets the parse tree of the first sentence in the text. 
77 | List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); 78 | if (sentences != null && ! sentences.isEmpty()) { 79 | CoreMap sentence = sentences.get(0); 80 | out.println("The keys of the first sentence's CoreMap are:"); 81 | out.println(sentence.keySet()); 82 | out.println(); 83 | out.println("The first sentence is:"); 84 | out.println(sentence.toShorterString()); 85 | out.println(); 86 | out.println("The first sentence tokens are:"); 87 | for (CoreMap token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { 88 | out.println(token.toShorterString()); 89 | } 90 | Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); 91 | out.println(); 92 | out.println("The first sentence parse tree is:"); 93 | tree.pennPrint(out); 94 | out.println(); 95 | out.println("The first sentence basic dependencies are:"); 96 | out.println(sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class).toString(SemanticGraph.OutputFormat.LIST)); 97 | out.println("The first sentence collapsed, CC-processed dependencies are:"); 98 | SemanticGraph graph = sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class); 99 | out.println(graph.toString(SemanticGraph.OutputFormat.LIST)); 100 | 101 | // Access coreference. In the coreference link graph, 102 | // each chain stores a set of mentions that co-refer with each other, 103 | // along with a method for getting the most representative mention. 104 | // Both sentence and token offsets start at 1! 105 | out.println("Coreference information"); 106 | Map<Integer, CorefChain> corefChains = 107 | annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class); 108 | if (corefChains == null) { return; } 109 | for (Map.Entry<Integer, CorefChain> entry: corefChains.entrySet()) { 110 | out.println("Chain " + entry.getKey()); 111 | for (CorefChain.CorefMention m : entry.getValue().getMentionsInTextualOrder()) { 112 | // We need to subtract one since the indices count from 1 but the Lists start from 0 113 | List<CoreLabel> tokens = sentences.get(m.sentNum - 1).get(CoreAnnotations.TokensAnnotation.class); 114 | // We subtract two for end: one for 0-based indexing, and one because we want last token of mention not one following. 115 | out.println(" " + m + ", i.e., 0-based character offsets [" + tokens.get(m.startIndex - 1).beginPosition() + 116 | ", " + tokens.get(m.endIndex - 2).endPosition() + ")"); 117 | } 118 | } 119 | out.println(); 120 | 121 | out.println("The first sentence overall sentiment rating is " + sentence.get(SentimentCoreAnnotations.SentimentClass.class)); 122 | } 123 | IOUtils.closeIgnoringExceptions(out); 124 | IOUtils.closeIgnoringExceptions(xmlOut); 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/com/votors/umls/graph/HelloJGraphT.java: -------------------------------------------------------------------------------- 1 | package com.votors.umls.graph; 2 | 3 | import java.net.*; 4 | import org.jgrapht.*; 5 | import org.jgrapht.graph.*; 6 | /** 7 | * A simple introduction to using JGraphT. 8 | * 9 | * @author Barak Naveh 10 | * @since Jul 27, 2003 11 | */ 12 | public final class HelloJGraphT 13 | { 14 | private HelloJGraphT() 15 | { 16 | } // ensure non-instantiability. 17 | 18 | /** 19 | * The starting point for the demo. 20 | * 21 | * @param args ignored.
22 | */ 23 | public static void main(String [] args) 24 | { 25 | UndirectedGraph<String, DefaultEdge> stringGraph = createStringGraph(); 26 | 27 | // note undirected edges are printed as: {<v1>,<v2>} 28 | System.out.println(stringGraph.toString()); 29 | 30 | // create a graph based on URL objects 31 | DirectedGraph<URL, DefaultEdge> hrefGraph = createHrefGraph(); 32 | 33 | // note directed edges are printed as: (<v1>,<v2>) 34 | System.out.println(hrefGraph.toString()); 35 | } 36 | 37 | /** 38 | * Creates a toy directed graph based on URL objects that represents link 39 | * structure. 40 | * 41 | * @return a graph based on URL objects. 42 | */ 43 | private static DirectedGraph<URL, DefaultEdge> createHrefGraph() 44 | { 45 | DirectedGraph<URL, DefaultEdge> g = 46 | new DefaultDirectedGraph<URL, DefaultEdge>(DefaultEdge.class); 47 | 48 | try { 49 | URL amazon = new URL("http://www.amazon.com"); 50 | URL yahoo = new URL("http://www.yahoo.com"); 51 | URL ebay = new URL("http://www.ebay.com"); 52 | 53 | // add the vertices 54 | g.addVertex(amazon); 55 | g.addVertex(yahoo); 56 | g.addVertex(ebay); 57 | 58 | // add edges to create linking structure 59 | g.addEdge(yahoo, amazon); 60 | g.addEdge(yahoo, ebay); 61 | } catch (MalformedURLException e) { 62 | e.printStackTrace(); 63 | } 64 | 65 | return g; 66 | } 67 | 68 | /** 69 | * Create a toy graph based on String objects. 70 | * 71 | * @return a graph based on String objects. 72 | */ 73 | private static UndirectedGraph<String, DefaultEdge> createStringGraph() 74 | { 75 | UndirectedGraph<String, DefaultEdge> g = 76 | new SimpleGraph<String, DefaultEdge>(DefaultEdge.class); 77 | 78 | String v1 = "v1"; 79 | String v2 = "v2"; 80 | String v3 = "v3"; 81 | String v4 = "v4"; 82 | 83 | // add the vertices 84 | g.addVertex(v1); 85 | g.addVertex(v2); 86 | g.addVertex(v3); 87 | g.addVertex(v4); 88 | 89 | // add edges to create a circuit 90 | g.addEdge(v1, v2); 91 | g.addEdge(v2, v3); 92 | g.addEdge(v3, v4); 93 | g.addEdge(v4, v1); 94 | 95 | return g; 96 | } 97 | } 98 | 99 | // End HelloJGraphT.java -------------------------------------------------------------------------------- /src/main/java/com/votors/umls/graph/IsaEdge.java: -------------------------------------------------------------------------------- 1 | package com.votors.umls.graph; 2 | 3 | import org.jgrapht.graph.*; 4 | 5 | import java.util.Objects; 6 | 7 | /** 8 | * Created by Jason on 2015/9/26 0026.
9 | */ 10 | public class IsaEdge extends DefaultEdge{ 11 | 12 | @Override public String toString () {return "";} 13 | @Override public boolean equals(Object obj) { 14 | if (obj instanceof IsaEdge && ((IsaEdge)obj).getSource().equals(this.getSource()) 15 | && ((IsaEdge)obj).getTarget().equals(this.getTarget())) { 16 | return true; 17 | } 18 | return false; 19 | } 20 | @Override public UmlsVertex getTarget() {return (UmlsVertex)super.getTarget();} 21 | @Override public UmlsVertex getSource() {return (UmlsVertex)super.getSource();} 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/com/votors/umls/graph/TestJava.java: -------------------------------------------------------------------------------- 1 | 2 | package com.votors.umls.graph; 3 | 4 | import java.io.*; 5 | import java.util.*; 6 | import java.text.*; 7 | import java.math.*; 8 | import java.util.regex.*; 9 | 10 | public class TestJava { 11 | 12 | public static void main(String[] args) { 13 | Scanner in = new Scanner(System.in); 14 | String time = in.next(); 15 | 16 | boolean pm = false; 17 | if (time.contains("PM"))pm=true; 18 | String[] t = time.substring(0,time.length()-2).split(":"); 19 | int h = Integer.parseInt(t[0]); 20 | int m = Integer.parseInt(t[1]); 21 | int s = Integer.parseInt(t[2]); 22 | if (pm && h < 12) h += 12; 23 | if (!pm && h == 12) h=0; 24 | System.out.println(String.format("%02d:%02d:%02d", h,m,s)); 25 | 26 | 27 | 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/votors/umls/graph/UmlsVertex.java: -------------------------------------------------------------------------------- 1 | package com.votors.umls.graph; 2 | 3 | import org.jgrapht.*; 4 | import org.jgrapht.graph.ListenableDirectedGraph; 5 | 6 | import java.io.Serializable; 7 | 8 | /** 9 | * Created by Jason on 2015/9/26 0026. 10 | */ 11 | public class UmlsVertex implements Serializable { 12 | private String aui = null; 13 | private String auiStr = null; 14 | /*status of the vertex: "root" or "child" */ 15 | public static final String ROOT = "root"; 16 | public static final String ROOT_NEW = "new-root"; 17 | public static final String CHILD = "child"; 18 | //public static final String RELAY = "relay"; 19 | public static final String COPY = "copy"; 20 | public String status = ROOT; 21 | public UmlsVertex root = this; // who is the root of this vertex 22 | public int groupId = 0; // which group this vertex belongs to; 0 means no group yet. 23 | public int layer = 0; // which layer is this vertex located in? used by method SctGraph.fix()
24 | public boolean fix = false; 25 | transient private ListenableDirectedGraph g = null; 26 | private static int copyCnt = 0; 27 | private static UmlsVertex NULL = null; 28 | 29 | public UmlsVertex(String aui) { 30 | this.aui = aui; 31 | } 32 | public UmlsVertex(UmlsVertex cp) { 33 | copyCnt++; 34 | aui = cp.aui + "-copy-"+copyCnt; 35 | root = cp.root; 36 | groupId = cp.groupId; 37 | status = UmlsVertex.COPY; 38 | layer = cp.layer; 39 | auiStr = cp.auiStr; 40 | g = cp.g; 41 | } 42 | public String getAui() { return aui;} 43 | public void setGraph(ListenableDirectedGraph graph) {g = graph;} 44 | public int getOutDegree() { if (g == null) return 0; else return g.outDegreeOf(this);} 45 | public int getInDegree() { if (g == null) return 0; else return g.inDegreeOf(this);} 46 | public void setAuiStr(String str) { auiStr = str;} 47 | public String getAuiStr() { return auiStr;} 48 | 49 | @Override public String toString () { 50 | if (auiStr == null) { 51 | return groupId + ":" + aui; 52 | } else { 53 | return groupId + ":" + aui + "\n" + auiStr; 54 | } 55 | } 56 | @Override public int hashCode() {return aui.hashCode();} 57 | @Override public boolean equals(Object obj) { 58 | if ((obj instanceof UmlsVertex) && aui.equals(((UmlsVertex)obj).aui)) { 59 | return true; 60 | } 61 | return false; 62 | } 63 | 64 | public static UmlsVertex getNULL () { 65 | if (NULL == null) { 66 | NULL = new UmlsVertex("null"); 67 | NULL.status = ROOT; 68 | } 69 | return NULL; 70 | } 71 | 72 | public String toString2() { 73 | return "Aui:" + aui + ",\tstatus: " + status + ",\tgroupId: " + groupId + ",\troot: " 74 | + root.getAui() + ",\tlayer: " + layer + ",\tout: " + getOutDegree() + ",\tin: " + getInDegree(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/com/votors/Test.scala: -------------------------------------------------------------------------------- 1 | package com.votors 2 | 3 | import java.time.Duration 4 | 5 | import com.votors.common.TimeX 6 | 7 | /** 8 | * Created by Jason on 2016/6/15 0015. 9 | */ 10 | object Test { 11 | def main(args:Array[String]) = { 12 | 13 | var splitType = "#" 14 | val criteria = "Fertile patients must use effective contraception during and for 3 months after study No other malignancy within the past 3 years No serious concurrent medical illness or active infection that would preclude study chemotherapy No allergy or sensitivity to imidazole antifungal medications (e.g., fluconazole, ketoconazole, miconazole, itraconazole, and clotrimazole)" 15 | criteria.split("#|\\n").flatMap(s=> { 16 | // if there are more than two ':' in a sentence, we should split on ':', because some clinical trials use ':' as a separator symbol.
17 | if (s.count(_ == ':') >= 3) { 18 | splitType = ":" 19 | s.split(":") 20 | } else if (s.count(_ == '-') >= 3) { 21 | splitType = "-" 22 | s.split(" - ") 23 | } else if (s.split("\\s").count(_=="No") >= 3) { 24 | // some sentences without any punctuation to separate 25 | splitType = "No" 26 | s.split("(?=\\sNo\\s)") 27 | } else if (s.split("\\s").count(s=> s.equals("OR") || s.equals("Or")) >= 3) { 28 | // some sentences without any punctuation to separate 29 | splitType = "or" 30 | s.split("Or|OR") 31 | } else { 32 | s :: Nil 33 | } 34 | }).filter(_.trim.size > 2).foreach(sent_org => { 35 | val sent = sent_org.trim.replaceAll("^[\\p{Punct}\\s]*", "") // the punctuation at the beginning of a sentence 36 | println(sent) 37 | }) 38 | 39 | } 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/votors/TokenizerDemo.scala: -------------------------------------------------------------------------------- 1 | package com.votors 2 | 3 | /** 4 | * Created by Jason on 2016/5/2 0002. 5 | */ 6 | import java.io.FileReader 7 | import java.io.IOException 8 | import java.util 9 | import java.util.List 10 | 11 | import scala.collection.JavaConversions.asScalaIterator 12 | import scala.collection.immutable.{List, Range} 13 | import scala.collection.mutable 14 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 15 | 16 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 17 | 18 | import scala.collection.JavaConversions.asScalaIterator 19 | import scala.collection.immutable.{List, Range} 20 | import scala.collection.mutable 21 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 22 | import scala.io.Source 23 | import scala.io.Codec 24 | 25 | import edu.stanford.nlp.ling.{TaggedWord, CoreLabel, HasWord} 26 | import edu.stanford.nlp.process.CoreLabelTokenFactory 27 | import edu.stanford.nlp.process.DocumentPreprocessor 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | 30 | object TokenizerDemo { 31 | 32 | def main(args: Array[String]) { 33 | for (arg <- args) { 34 | // option #1: By sentence. 35 | val dp = new DocumentPreprocessor(arg).iterator() 36 | val tokens = dp.map(_.toArray().map(_.toString)).flatMap(_.toSeq).toArray 37 | println(tokens.mkString(" ")) 38 | // option #2: By token 39 | val ptbt = new PTBTokenizer(new FileReader(arg), 40 | new CoreLabelTokenFactory(), ""); 41 | while (ptbt.hasNext()) { 42 | val label = ptbt.next(); 43 | System.out.println(label); 44 | } 45 | } 46 | // 47 | // set up grammar and options as appropriate 48 | // val lp = LexicalizedParser.loadModel(); 49 | // val sent3 = Array("I", "can", "do", "it", "." ) 50 | // // Parser gets tag of second "can" wrong without help 51 | // val tag3 = Array( "PRP", "MD", "VB", "PRP", "." 
) 52 | // val sentence3 = new util.ArrayList[TaggedWord]() 53 | // for (i <- 0 to (sent3.length-1)) { 54 | // sentence3.add(new TaggedWord(sent3(i), tag3(i))); 55 | // } 56 | // val sents = Array("I go to school at 9:00 tomorrow.") 57 | // val parse = lp.parse(sentence3); 58 | // //val parse = lp.parseStrings(sents); 59 | // parse.pennPrint(); 60 | 61 | } 62 | } -------------------------------------------------------------------------------- /src/main/scala/com/votors/umls/MMApi.scala: -------------------------------------------------------------------------------- 1 | package com.votors.umls 2 | 3 | import scala.collection.JavaConversions._ 4 | import java.io.{FileReader, FileWriter, PrintStream, PrintWriter} 5 | import java.util.concurrent.atomic.AtomicInteger 6 | import java.io._ 7 | import java.util 8 | 9 | import com.votors.common.{Conf, TimeX} 10 | import com.votors.common.Utils.Trace._ 11 | import com.votors.common.Utils._ 12 | import com.votors.ml.{Nlp, StanfordNLP} 13 | import edu.stanford.nlp.util.IntPair 14 | import gov.nih.nlm.nls.metamap.AcronymsAbbrevs 15 | import gov.nih.nlm.nls.metamap.MetaMapApi 16 | import gov.nih.nlm.nls.metamap.MetaMapApiImpl 17 | import gov.nih.nlm.nls.metamap.Result 18 | 19 | case class MMResult(cui:String, score:Int,orgStr:String,cuiStr:String,pfName:String, sent:String) { 20 | val sourceSet = new util.HashSet[String] 21 | val stySet = new util.HashSet[String] 22 | val span = new IntPair(-1,-1) 23 | var neg = -1 24 | var sentId = 0 25 | var termId = 0 26 | var matchType = 0; //how it matches our result: 1=same cui; 2=same orgStr; 3=both 27 | val matchDesc = new StringBuilder 28 | def shortDesc = { 29 | val sb: StringBuilder = new StringBuilder 30 | sb.append(cui + "|" 31 | + orgStr + "|" 32 | + score) 33 | sb.toString() 34 | } 35 | override def toString = { 36 | val sb: StringBuilder = new StringBuilder 37 | sb.append(cui + "|" 38 | + orgStr + "|" 39 | + cuiStr + "|" 40 | + score + "|" 41 | + span + "|" 42 | + stySet.mkString(" ") + "|" 43 | + sourceSet.mkString(" ") + "|" 44 | + sent) 45 | sb.toString() 46 | } 47 | } 48 | 49 | /** 50 | * Created by Jason on 2016/11/30 0030. 51 | */ 52 | object MMApi { 53 | var api: MetaMapApi = null 54 | 55 | /** 56 | * Given a string (a sentence), return the results from MetaMap. 57 | */ 58 | def process(terms: String, sentId:Int=0): Seq[MMResult] = { 59 | if (!Conf.MMenable) return Seq() 60 | init() 61 | // the character \031 will crash MetaMap, so strip non-printable characters first.
62 | val resultList: util.List[Result] = api.processCitationsFromString(terms.replaceAll("[^\\p{Graph}\\x20\\t\\r\\n]","")) 63 | val mmRets = new util.ArrayList[MMResult]() 64 | for (result <- resultList) { 65 | /** write result as: cui|score|semtypes|sources|utterance */ 66 | for (utterance <- result.getUtteranceList) { 67 | for (pcm <- utterance.getPCMList) { 68 | for (map <- pcm.getMappingList) { 69 | var termId = 0 70 | for (mapEv <- map.getEvList) { 71 | val mmRet = MMResult(mapEv.getConceptId, math.abs(mapEv.getScore), mapEv.getMatchedWords.mkString(" "), mapEv.getConceptName, mapEv.getPreferredName, terms) 72 | mmRet.sentId = sentId 73 | val sb: StringBuilder = new StringBuilder 74 | mmRet.sourceSet.addAll(mapEv.getSources.filter(sab => sab.matches(Conf.sabFilter))) 75 | mmRet.stySet.addAll(mapEv.getSemanticTypes.map(SemanticType.mapAbbr2sty.getOrElse(_,"None")).filter(sty => Conf.semanticType.indexOf(sty) >= 0)) 76 | if (mmRet.sourceSet.size > 0 77 | && mmRet.stySet.size > 0 78 | && mmRet.score >= Conf.MMscoreThreshold 79 | && !Nlp.checkStopword(mmRet.orgStr,true) 80 | && !mmRet.orgStr.matches(Conf.cuiStringFilterRegex) 81 | && !mmRets.exists(mm=>mm.cui.equals(mmRet.cui) && mm.orgStr.equals(mmRet.orgStr) && mm.score==mmRet.score) 82 | //&& !mmRets.exists(mm=>mm.orgStr.toLowerCase.contains(mmRet.orgStr.toLowerCase)) // not exactly what we mean 'overlap'. 83 | ) { 84 | mmRets.add(mmRet) 85 | for (p <- mapEv.getPositionalInfo) { 86 | if (mmRet.span.get(0) == -1 || p.getX < mmRet.span.get(0)) mmRet.span.set(0, p.getX) 87 | if (mmRet.span.get(1) == -1 || p.getX + p.getY > mmRet.span.get(1)) mmRet.span.set(1, p.getX + p.getY) 88 | } 89 | mmRet.neg = mapEv.getNegationStatus 90 | termId += 1 91 | mmRet.termId = termId 92 | println(mmRet.toString) 93 | } else { 94 | println(s"filter by sty:${mmRet.stySet.size}, sab:${mmRet.sourceSet.size}, ${mmRet.score}, ${mmRet.cui}, ${mmRet.orgStr}, or already exists.") 95 | } 96 | } 97 | } 98 | } 99 | } 100 | } 101 | return mmRets.to[Seq] 102 | } 103 | 104 | private def init():Unit = { 105 | if (api != null) return 106 | api = new MetaMapApiImpl 107 | if (Conf.MMhost.trim.size > 0)api.setHost(Conf.MMhost) 108 | if (Conf.MMport.trim.size > 0)api.setPort(Conf.MMport.toInt) 109 | val options: String = Conf.MMoptions 110 | api.setOptions(options) 111 | } 112 | 113 | def main(args: Array[String]) { 114 | init() 115 | var startTime = System.currentTimeMillis() 116 | process("People who don\u0019t smoke but who breathe the smoke of others also have a higher risk of lung cancer.") 117 | println(System.currentTimeMillis() - startTime) 118 | startTime = System.currentTimeMillis() 119 | process("People who don\u0019t smoke but who breathe the smoke of others also have a higher risk of lung cancer.") 120 | println(System.currentTimeMillis() - startTime) 121 | startTime = System.currentTimeMillis() 122 | process("My focus is on the code in word2vec.c for training the skip-gram architecture with negative sampling, so for now I have ignored the CBOW and Hierarchical Softmax code. I also haven't looked much at the testing code.\n\nBecause the code supports both models and both training approaches, I highly recommended viewing the code in an editor which allows you to collapse code blocks. 
The training code is much more readable when you hide the implementations that you aren't interested in..") 123 | println(System.currentTimeMillis() - startTime) 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/main/scala/com/votors/umls/SemanticType.scala: -------------------------------------------------------------------------------- 1 | package com.votors.umls 2 | 3 | import java.io._ 4 | import java.nio.charset.CodingErrorAction 5 | import java.util.regex.Pattern 6 | import java.util.{Date, Properties} 7 | 8 | import com.votors.common.{SqlUtils, Conf} 9 | import com.votors.ml.{Clustering, Nlp} 10 | import edu.stanford.nlp.util.IntPair 11 | import opennlp.tools.sentdetect.{SentenceDetectorME, SentenceModel} 12 | import org.apache.log4j.{Level, Logger} 13 | import org.apache.spark.{SparkContext, SparkConf} 14 | 15 | import scala.collection.JavaConversions.asScalaIterator 16 | import scala.collection.immutable.{List, Range} 17 | import scala.collection.mutable 18 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 19 | import scala.io.Source 20 | import scala.io.Codec 21 | /** 22 | * Created by Jason on 2016/12/2 0002. 23 | */ 24 | case class SemanticType(sty:String,var abbr:String="", var fullName:String="",var groupAbbr:String="", var groupName:String="") 25 | 26 | object SemanticType { 27 | val mapSty = new mutable.HashMap[String, SemanticType]() 28 | val mapAbbr2sty = new mutable.HashMap[String, String]() 29 | def init() = { 30 | val ftype=Source.fromFile(Conf.rootDir + "/data/SemanticTypes_2013AA.txt") 31 | for (line <- ftype.getLines() if line.trim.size > 5) { 32 | val tokens = line.split("\\|") 33 | val sty = mapSty.getOrElseUpdate(tokens(1),SemanticType(tokens(1))) 34 | mapAbbr2sty.getOrElseUpdate(tokens(0),tokens(1)) 35 | sty.abbr = tokens(0) 36 | sty.fullName = tokens(2) 37 | } 38 | val fgroup=Source.fromFile(Conf.rootDir + "/data/SemGroups.txt") 39 | for (line <- fgroup.getLines() if line.trim.size > 5) { 40 | val tokens = line.split("\\|") 41 | val sty = mapSty.getOrElseUpdate(tokens(2),SemanticType(tokens(2))) 42 | sty.groupAbbr = tokens(0) 43 | sty.groupName = tokens(1) 44 | sty.fullName = tokens(3) 45 | } 46 | } 47 | init() 48 | 49 | def main(args:Array[String]) = { 50 | init() 51 | println(mapSty) 52 | println(mapAbbr2sty) 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/votors/umls/TermIdentify.scala: -------------------------------------------------------------------------------- 1 | package com.votors.umls 2 | 3 | import java.io.{FileWriter, PrintWriter} 4 | import java.util.Date 5 | import java.util.concurrent.atomic.AtomicInteger 6 | 7 | import com.votors.common.Utils.Trace 8 | import com.votors.common.{Conf, MyCache, Utils} 9 | import com.votors.ml.{Ngram, Nlp, Sentence} 10 | import org.apache.log4j.{Level, Logger} 11 | import org.apache.spark.{SparkConf, SparkContext} 12 | 13 | import scala.collection.mutable.ListBuffer 14 | import scala.collection.JavaConversions.asScalaIterator 15 | import scala.collection.immutable.{List, Range} 16 | import scala.collection.mutable 17 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 18 | import scala.io.Source 19 | import scala.io.Codec 20 | /** 21 | * Created by Jason on 2016/4/1 0001. 22 | */ 23 | class TermIdentify { 24 | 25 | 26 | 27 | } 28 | 29 | 30 | /** 31 | * input: a csv file, but only identify the text in one column.
32 | */ 33 | object TermIdentify { 34 | 35 | def main(avgs: Array[String]): Unit = { 36 | println("the input args are:\n" + avgs.mkString("\n")) 37 | if (avgs.size < 2) { 38 | println(s"invalid inputs, should be: input-file output-file") 39 | sys.exit(1) 40 | } 41 | val inFile = avgs(0) 42 | val outFile = avgs(1) 43 | // record the start time 44 | val startTime = new Date() 45 | 46 | 47 | val records = Utils.readCsvFile(avgs(0)).toSeq 48 | val headSorted = Range(0,records.head.size()).map(index=>records.head.get(index)).zipWithIndex 49 | val head = headSorted.toMap 50 | var cnt = 0 51 | val tagger = new UmlsTagger2() 52 | 53 | var writer = new PrintWriter(new FileWriter(outFile)) 54 | writer.println("INDEX,"+headSorted.map(_._1).mkString(",")+",CUI,AUI,CODE,STRING") 55 | 56 | records.tail.foreach(rec =>{ 57 | cnt += 1 58 | if (true) { 59 | println(rec.toString) 60 | val hNgrams = mutable.LinkedHashMap[String,Ngram]() 61 | val sents = Nlp.generateSentence(cnt, rec.get(head("display_name")), null) 62 | val gramId = new AtomicInteger() 63 | Nlp.generateNgram(sents.toSeq, gramId, hNgrams) 64 | 65 | hNgrams.foreach(kv=>{ 66 | val key = kv._1 67 | val gram = kv._2 68 | val (umlsBestScore, stys) = tagger.getUmlsScore(gram.text) 69 | if (umlsBestScore._3 != null && umlsBestScore._3.score>Conf.umlsLikehoodLimit) { 70 | val ret = tagger.execQuery(s"select code from umls.mrconso where CUI='${umlsBestScore._3.cui}' and AUI='${umlsBestScore._3.aui}';") 71 | var code = "" 72 | while (ret.next) { 73 | code += ret.getString("code") + ':' 74 | } 75 | println(s"${key},${umlsBestScore._3.cui},${code.dropRight(1)},${umlsBestScore._3.descr}") 76 | writer.println(s"${cnt},"+"\""+headSorted.map(keyIndex=>rec.get(keyIndex._2)).mkString("\",\"")+"\","+s"${umlsBestScore._3.cui},${umlsBestScore._3.aui},${code.dropRight(1)},${umlsBestScore._3.descr}") 77 | 78 | } 79 | }) 80 | } 81 | }) 82 | 83 | System.out.println("### used time: "+(new Date().getTime()-startTime.getTime())+" ###") 84 | } 85 | 86 | } 87 | 88 | 89 | /** 90 | * input: a csv file, but only identify the text in one column.
91 | */ 92 | object TermIdentifySeq{ 93 | 94 | def main(avgs: Array[String]): Unit = { 95 | println("the input args are:\n" + avgs.mkString("\n")) 96 | if (avgs.size < 2) { 97 | println(s"invalid inputs, should be: input_file output-file") 98 | sys.exit(1) 99 | } 100 | val inFile = avgs(0) 101 | val outFile = avgs(1) 102 | // init spark 103 | val startTime = new Date() 104 | 105 | 106 | val records = Utils.readCsvFile(avgs(0)).toSeq 107 | val headSorted = Range(0,records.head.size()).map(index=>records.head.get(index)).zipWithIndex 108 | var cnt = 0 109 | val tagger = new UmlsTagger2() 110 | 111 | var writer = new PrintWriter(new FileWriter(outFile)) 112 | writer.println("qaID\ttermID\tterm\tcui\taui\tscore\tcuiStr\tsentLen\tsentence") 113 | 114 | records.foreach(rec =>{ 115 | cnt += 1 116 | if (true) { 117 | // println(rec.toString) 118 | val hNgrams = new ListBuffer[(Sentence,Ngram)]() 119 | val qaid = rec.get(0) 120 | print(s"${cnt}\t${qaid}\r") 121 | val sents = Nlp.generateSentence(cnt, rec.get(1), null) 122 | val gramId = new AtomicInteger() 123 | Nlp.generateNgramSeq(sents.toSeq, gramId, hNgrams) 124 | var termId = 0 125 | hNgrams.foreach(kv=>{ 126 | val sent = kv._1 127 | val gram = kv._2 128 | val key = gram.key 129 | // println(key) 130 | val (umlsBestScore, stys) = tagger.getUmlsScore(gram.text) 131 | if (umlsBestScore != null && umlsBestScore._3 != null && umlsBestScore._3.score > Conf.umlsLikehoodLimit) { 132 | termId += 1 133 | val sugg = umlsBestScore._3 134 | val outStr = f"${qaid}\t${termId}\t${gram.textOrg}\t${sugg.cui}\t${sugg.aui}\t${sugg.score}%.0f\t${sugg.descr}\t${sent.sentId}\t${sent.words.mkString(" ")}" 135 | println(outStr) 136 | writer.println(outStr) 137 | } 138 | }) 139 | if (cnt%100 == 0) writer.flush() 140 | } 141 | }) 142 | writer.close() 143 | MyCache.close() 144 | System.out.println("### used time: "+(new Date().getTime()-startTime.getTime())+" ###") 145 | } 146 | 147 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/mllib/clustering/MyKmean.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.clustering 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | import org.apache.spark.{SparkContext} 8 | import org.apache.spark.annotation.Experimental 9 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 10 | import org.apache.spark.mllib.linalg.BLAS.{axpy, scal} 11 | import org.apache.spark.mllib.util.MLUtils 12 | import org.apache.spark.rdd.RDD 13 | import org.apache.spark.storage.StorageLevel 14 | import org.apache.spark.util.Utils 15 | import org.apache.spark.util.random.XORShiftRandom 16 | 17 | /** 18 | * Created by Jason on 2015/12/4 0004. 19 | */ 20 | object MyKmean extends KMeans{ 21 | // def pointCost2(centers: TraversableOnce[VectorWithNorm], 22 | // point: Vector) = KMeans.pointCost(centers, new VectorWithNorm(point)) 23 | 24 | /** 25 | * Returns the index of the closest center to the given point, as well as the squared distance. 
26 | */ 27 | def findClosest(centers: TraversableOnce[Vector],p: Vector): (Int,Double) = { 28 | KMeans.findClosest(clusterCentersWithNorm(centers), new VectorWithNorm(p)) 29 | } 30 | def clusterCentersWithNorm(clusterCenters: TraversableOnce[Vector]): TraversableOnce[VectorWithNorm] = 31 | clusterCenters.map(new VectorWithNorm(_)) 32 | 33 | def fastSquaredDistance( v1: Vector, norm1:Double, v2: Vector, norm2:Double): Double = { 34 | KMeans.fastSquaredDistance(new VectorWithNorm(v1,norm1),new VectorWithNorm(v2,norm2)) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/test/com/votors/umls/UmlsTagger2Test.scala: -------------------------------------------------------------------------------- 1 | package com.votors.umls 2 | 3 | import java.io.{FileWriter, PrintWriter, File} 4 | import com.votors.common.Conf 5 | import com.votors.common.Utils.Trace 6 | import com.votors.common.Utils.Trace._ 7 | import org.junit.{AfterClass, Assert, Test} 8 | 9 | import scala.io.Source 10 | 11 | class UmlsTagger2Test { 12 | 13 | // The root config dir of the OpenNLP model and data files 14 | val rootDir = Conf.rootDir 15 | Trace.currLevel = ERROR 16 | 17 | if (! new File(rootDir).exists()) { 18 | println("Error! You have to configure a valid dataDir in class UmlsTagger2Test first") 19 | sys.exit(1) 20 | } 21 | 22 | @Test 23 | def testBuildIndexJson(): Unit = { 24 | val tagger = new UmlsTagger2("",rootDir) 25 | tagger.buildIndexJson( 26 | "C:\\fsu\\ra\\data\\201708\\Copy of Botanical_with_dsld_cat_termlist.csv", 27 | "C:\\fsu\\ra\\data\\201708\\Copy of Botanical_with_dsld_cat_termlist.txt") 28 | } 29 | @Test 30 | def testBuildIndexPlainText(): Unit = { 31 | val tagger = new UmlsTagger2("",rootDir) 32 | tagger.buildIndexPlainText( 33 | "C:\\fsu\\ra\\data\\201708\\Copy of Botanical_with_dsld_cat_termlist.csv", 34 | "C:\\fsu\\ra\\data\\201708\\Copy of Botanical_with_dsld_cat_termlist.txt") 35 | } 36 | /* 37 | @Test 38 | def testBuildIndexXml(): Unit = { 39 | val tagger = new UmlsTagger2("",rootDir) 40 | tagger.buildIndexCsv( 41 | new File("C:\\fsu\\target.terms.csv"), 42 | new File("C:\\fsu\\target.terms.ret.csv")) 43 | }*/ 44 | @Test 45 | def testBuildIndex2db(): Unit = { 46 | val tagger = new UmlsTagger2("",rootDir) 47 | tagger.buildIndex2db() 48 | } 49 | 50 | @Test 51 | def testGetFull(): Unit = { 52 | val tagger = new UmlsTagger2(Conf.solrServerUrl,rootDir) 53 | val phrases = List("age") 54 | phrases.foreach(phrase => { 55 | Console.println() 56 | Console.println("Query: %s".format(phrase)) 57 | val suggestions = tagger.select(phrase) 58 | suggestions match { 59 | case suggestion: Array[Suggestion] => { 60 | suggestion.foreach(s => Console.println(s.toString())) 61 | //Assert.assertNotNull(suggestion.cui) 62 | } 63 | case _ => 64 | Assert.fail("No results for [%s]".format(phrase)) 65 | } 66 | }) 67 | } 68 | 69 | 70 | @Test 71 | def testStermWord(): Unit = { 72 | val tagger = new UmlsTagger2(Conf.solrServerUrl,rootDir) 73 | 74 | val phrases = List("green tea") 75 | phrases.foreach(phrase => { 76 | Console.println(s"$phrase,${tagger.normalizeAll(phrase)}") 77 | // Console.println(s"$phrase,${tagger.normalizeCasePunct(phrase)}") 78 | 79 | }) 80 | 81 | } 82 | 83 | @Test 84 | def testAnnotateSentence() = { 85 | val tagger = new UmlsTagger2(Conf.solrServerUrl, rootDir) 86 | val sent = "I lost a tone of weight i was alway 130 or above , i eat fine but i cant have really big meals .
87 | val sugg=tagger.annotateSentence(sent,5) 88 | sugg.filter(_._2.size>0).foreach(s=>{ 89 | println(s"${s._1}\t${s._2.mkString(",")}") 90 | }) 91 | } 92 | 93 | 94 | @Test 95 | def testAnnotateFile(): Unit = { 96 | val tagger = new UmlsTagger2(Conf.solrServerUrl, rootDir) 97 | // tagger.annotateFile(s"C:/fsu/ra/data/201603/nsrr-canonical-data-dictionary.txt", 98 | // s"C:/fsu/ra/data/201603/ret-nsrr-canonical-data-dictionary.txt", 99 | tagger.annotateFile(s"C:/fsu/ra/data/201603/nsrr-canonical-data-dictionary.txt", 100 | s"C:/fsu/ra/data/201603/ret-nsrr-canonical-data-dictionary.txt", 101 | 2, 102 | 5, 103 | '\t','\n') 104 | } 105 | 106 | // find terms from dictionary for a string 107 | @Test 108 | def testAnnotateTag(): Unit = { 109 | val tagger = new UmlsTagger2(Conf.solrServerUrl, rootDir) 110 | //tagger.annotateTag(s"${rootDir}/data/taglist-zhiwei.txt",s"${rootDir}/data/taglist-zhiwei.csv") 111 | tagger.annotateTagAppend(s"C:/fsu/ra/data/201603/nsrr-canonical-data-dictionary.txt", 112 | s"C:/fsu/ra/data/201603/ret-nsrr-canonical-data-dictionary.txt",1) 113 | 114 | tagger.jdbcClose() 115 | } 116 | 117 | @Test 118 | def testPosFilter():Unit = { 119 | 120 | } 121 | 122 | @Test 123 | def testSql():Unit = { 124 | val tagger = new UmlsTagger2(Conf.solrServerUrl, rootDir) 125 | val rs = tagger.execQuery("select count(*) as cnt from umls.mrsty") 126 | 127 | while (rs.next) { 128 | println(rs.getString("cnt")) 129 | } 130 | 131 | tagger.jdbcClose() 132 | } 133 | 134 | 135 | // @AfterClass 136 | // def cleanup():Unit = { 137 | // 138 | // } 139 | } -------------------------------------------------------------------------------- /term_identification.md: -------------------------------------------------------------------------------- 1 | ## Overviw 2 | Given a list of terms T {(tid,term)} and some textual data set D {(did,text)}, identify any of the term in T occurs in data set D. 3 | 4 | ## Steps of method 5 | * Build the lockup table for the given terms T; 6 | * Convert the text into N-gram, and match the N-gram in the lookup table to see if an N-gram matches any of the terms. 7 | 8 | ## steps of operation 9 | * preparation: 10 | * Compile the project and get the Jar file of the project. 
11 | * Set the aliases for the tasks run-import-term and run-extract-term: 12 | ``` 13 | alias run-import-term='spark-submit --master spark://somelab12.cci.fsu.edu:7077 --jars /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar --driver-class-path /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar --conf 'spark.executor.extraJavaOptions=-DCTM_ROOT_PATH=/tmp/ctm_root' --driver-java-options=-DCTM_ROOT_PATH=/tmp/ctm_root --files /tmp/ctm_root/conf/default.properties --executor-memory 3g --class com.votors.umls.BuildTargetTerm /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar ' 14 | alias run-extract-term='spark-submit --master spark://somelab12.cci.fsu.edu:7077 --jars /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar --driver-class-path /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar --conf 'spark.executor.extraJavaOptions=-DCTM_ROOT_PATH=/tmp/ctm_root' --driver-java-options=-DCTM_ROOT_PATH=/tmp/ctm_root --files /tmp/ctm_root/conf/default.properties --executor-memory 3g --class com.votors.umls.IdentfyTargetTerm /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar ' 15 | ``` 16 | * Configure conf/default.properties properly. 17 | * Store your textual data set in MySQL. Make sure there is a unique integer id for every text. 18 | 19 | * execution: 20 | * Import the term list to build a lookup table. The format of the input should be one term per line: id [tab] term. 21 | You can prepare the list in Excel and save it as a tab-separated (*.txt) file. 22 | ``` 23 | run-import-term /tmp/supp_list.txt 24 | ``` 25 | * Configure conf/default.properties to tell the tool where to find your textual data set. 26 | ``` 27 | blogDbUrl=jdbc:mysql://[hostname or IP]:3306/[database name]?user=[username of MySQL]&password=[password of the user] 28 | blogTbl=the table name of your data set 29 | blogIdCol=the column name of the id in the table; it has to be an integer 30 | blogTextCol=the column name of the text in the table 31 | ``` 32 | * Run the identification command. Note that if the data set is large, it will take a long time, 33 | so you'd better run this command inside screen so that a network problem does not interrupt the processing. 34 | ``` 35 | run-extract-term /tmp/ret_list.csv 36 | ``` 37 | 38 | ## More configuration 39 | ### stop words list: data/stopwords.txt 40 | ### conf/default.properties 41 | * How the text for N-gram generation is fetched: blogId will be selected as distinct, and blogTextCol will be limited to 1 row. 42 | blogDbUrl=jdbc:mysql://localhost:3306/ytex?user=root&password=root 43 | blogTbl=tmp_org_yahoo 44 | blogIdCol=id 45 | blogTextCol=concat(subject, ". ", content, ". ", chosenanswer) 46 | 47 | * Limit the number of blogs to be analyzed, mainly for testing 48 | blogLimit=200 49 | * target term info in the database 50 | targetTermTbl=_target_term_ 51 | targetTermTblDropAndCreate=true 52 | * If true, use solr for matching an N-gram with the target terms; otherwise use a database query for matching 53 | targetTermUsingSolr=false 54 | ### Other configuration items may affect the result too! 55 | 56 | --------------------------------------------------------------------------------
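## Appendix: a minimal sketch of the matching step

To make the method in term_identification.md concrete, here is a minimal, self-contained Scala sketch of the lookup-table-plus-N-gram matching. It is an illustration only, not the project's actual implementation (the real pipeline builds the lookup table with run-import-term, matches through Solr or a MySQL query, and applies the stop-word lists); every name in it (NgramMatchSketch, normalize, the sample terms and document) is hypothetical.

```scala
import scala.collection.mutable

object NgramMatchSketch {

  // Hypothetical stand-in for the project's real normalization
  // (case folding, punctuation stripping, stop-word filtering):
  // lower-case, replace punctuation with spaces, collapse whitespace.
  def normalize(s: String): String =
    s.toLowerCase.replaceAll("\\p{Punct}", " ").trim.replaceAll("\\s+", " ")

  def main(args: Array[String]): Unit = {
    // Step 1: build the lookup table from the term list T {(tid, term)}.
    val terms = Seq((1, "type 2 diabetes"), (2, "lung cancer"), (3, "green tea"))
    val lookup = mutable.HashMap[String, Int]()
    terms.foreach { case (tid, term) => lookup(normalize(term)) = tid }
    // No N-gram longer than the longest term can ever match.
    val maxN = terms.map(t => normalize(t._2).split(" ").length).max

    // Step 2: slide N-gram windows (n = 1..maxN) over each text in D
    // and report every window that hits the lookup table.
    val docs = Seq((100, "My mother was diagnosed with Type 2 Diabetes last year."))
    for ((did, text) <- docs) {
      val tokens = normalize(text).split(" ")
      for (n <- 1 to maxN; window <- tokens.sliding(n) if window.length == n) {
        lookup.get(window.mkString(" ")).foreach { tid =>
          println(s"doc $did matched term $tid: '${window.mkString(" ")}'")
        }
      }
    }
  }
}
```

Because both the terms and the N-grams pass through the same normalize(), case and punctuation differences do not block a match; in the real tool the stop-word list (data/stopwords.txt) and the targetTermUsingSolr setting described above play the analogous roles.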