├── .gitignore ├── LICENSE ├── README.md ├── conf ├── current.cancer.properties ├── current.chv.wiki.properties ├── current.chv_paper_022117.properties ├── current.deaf.properties ├── current.init_index.properties ├── current.knn.properties ├── current.socialqa.botanical.properties ├── default.cancer.properties ├── default.cancer_chv.properties ├── default.cancer_chv_ranking.properties ├── default.properties ├── log4j.properties ├── pattern_cancer.txt ├── pattern_cancer_duration.txt └── solrConf │ ├── schema.xml │ └── solrconfig.xml ├── data ├── PennTreebankP.O.S.Tags.html ├── SemGroups.txt ├── SemanticTypes_2013AA.txt ├── clinical_text.txt ├── en-chunker.bin ├── en-parser-chunking.bin ├── en-pos-maxent.bin ├── en-sent.bin ├── en-token.bin ├── pos-transformation.csv ├── prefix.txt ├── stopwords-FOR-clustering.txt ├── stopwords-empty.txt ├── stopwords.txt ├── stopwords_clustering.txt ├── stopwords_sent.txt ├── sty_pairs_preference_extended_STs.txt ├── suffix.txt └── test.text.txt ├── docs ├── dependency-package.jpg └── figurs │ ├── conceptual.png │ ├── cui_duration_heatmap3.png │ ├── evaluation_simiterm.png │ ├── figure8_human_review.png │ ├── sty_distribution.png │ └── work-flow.png ├── libs ├── bin │ └── winutils.exe ├── metamap-api-2.0.jar ├── prologbeans.jar ├── stanford-corenlp.jar └── stanfordNlp-models-url.txt ├── pom.xml ├── py ├── get_ct.py ├── ner200align.py ├── pre_run.py ├── preprocess_index.py └── xsl2csv.py ├── r ├── .Rhistory ├── README.md ├── RVisualisation.pdf ├── classify-rpart.R ├── cross-evaluation-200to300.R ├── cross-evaluation-bow.R ├── cross-evaluation.R ├── data │ ├── cross-evaluate-revice3.txt │ ├── cross-evaluate.txt │ ├── cross-evaluation-bow.txt │ ├── cross-evaluation-tf100-200to300.txt │ ├── cross-evaluation-tf100-cancer.txt │ ├── cross-evaluation-tf100.txt │ ├── cross-evaluation-tf5.txt │ ├── evaluation.txt │ ├── human_review.txt │ ├── ngram_yahoo_tf5.txt │ ├── pca.txt │ ├── result_cancer_1101.txt │ ├── result_cancer_rank.txt │ └── rpart.summary ├── ngram-distribution.R ├── pattern-heatmap.R ├── pca-draw.R ├── review-order-ranking.R ├── review_result.R └── silhouette.R ├── solr_Configuration.md ├── sql-script ├── 0923-test.sql ├── 1007.sql ├── 1018.sql ├── cancerqa_chv.sql ├── chv.sql ├── cluster.sql ├── criteria.sql ├── data_process.docx ├── data_process_0922.docx ├── deaf.sql ├── import_0919.sql ├── import_0922.sql ├── import_0924.sql ├── import_1004.sql ├── import_tag.sql ├── import_tag_0915.sql ├── import_tag_0916.sql ├── linux-test.sql ├── minsook.sql ├── minsook_1023.sql ├── minsook_1103.sql ├── minsook_1229.sql ├── ner200.sql ├── pattern.sql ├── pattern_all.sql ├── pattern_all_disease.sql ├── pattern_bibm2016.sql ├── pattern_diabetes.sql ├── pattern_sty_prefer.sql ├── ret-yahoo.sql ├── sent_1213.sq..sql ├── smb.sql ├── socialqa.sql ├── somelab-sctGraph.sql ├── synonym.sql ├── tumblr.sql ├── umls.sql ├── usuk.sql ├── wiki.sql ├── yahoo.sql └── ytex.sql ├── src ├── main │ ├── java │ │ ├── StanfordCoreNlpDemo.java │ │ └── com │ │ │ └── votors │ │ │ └── umls │ │ │ └── graph │ │ │ ├── HelloJGraphT.java │ │ │ ├── IsaEdge.java │ │ │ ├── SctGraph.java │ │ │ ├── TestJava.java │ │ │ └── UmlsVertex.java │ └── scala │ │ ├── com │ │ └── votors │ │ │ ├── Test.scala │ │ │ ├── TokenizerDemo.scala │ │ │ ├── common │ │ │ └── Utils.scala │ │ │ ├── ml │ │ │ ├── Clustering.scala │ │ │ ├── KNN.scala │ │ │ ├── Ngram.scala │ │ │ ├── Nlp.scala │ │ │ ├── StanfordNLP.scala │ │ │ └── Word2vec.scala │ │ │ └── umls │ │ │ ├── AnalyzeCT.scala │ │ │ ├── MMApi.scala │ │ │ ├── 
SemanticType.scala │ │ │ ├── TermIdentify.scala │ │ │ └── UmlsTagger2.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── mllib │ │ └── clustering │ │ └── MyKmean.scala └── test │ └── com │ └── votors │ └── umls │ └── UmlsTagger2Test.scala └── term_identification.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .idea/ 3 | target/ 4 | *dat.txt 5 | *.bak 6 | *.bk 7 | *.csv 8 | lvg2015/ 9 | lvg2015data/ 10 | VITTA_EC2/.git 11 | VATEC/~/eb-virt 12 | conf/current.properties 13 | r/.Rhistory -------------------------------------------------------------------------------- /conf/current.cancer.properties: -------------------------------------------------------------------------------- 1 | 2 | # configuration for cancer patterns 3 | 4 | # ##################################################################### 5 | # ############# UMLS term matching configuration ###################### 6 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 7 | # # for now, the table mrstr is necessary 8 | # jdbcDriver=jdbc:mysql://localhost:3306/umls?user=root&password=root 9 | # # URL of the Solr server used to match UMLS terms. Solr is not used by default 10 | # solrServerUrl=http://localhost:8983/solr 11 | # 12 | # # caseFactor is a value in [0, 1]. It indicates how much the case matters, and it affects the similarity score 13 | # # when you select a term from Solr. Value 0 means uppercase and lowercase are treated as completely different, and 14 | # # value 1 means case is ignored entirely. 15 | # caseFactor=0.8 16 | # 17 | # # not used for now 18 | # # Should we take the newline as the end of a sentence, or just ignore the newline? 19 | # # 1: replace with space; 2: replace with '.'; 0: do nothing 20 | # ignoreNewLine=2 21 | # 22 | # ####################################################################### 23 | # ########## data source fetching configuration ######################### 24 | # # how to get the text for Ngram extraction; blogId is selected as distinct, and blogTextCol is limited to 1 row. 25 | # blogDbUrl=jdbc:mysql://localhost:3306/ytex?user=root&password=root 26 | # blogTbl=tmp_org_yahoo 27 | # #blogTbl=content_org_new 28 | # blogIdCol=id 29 | # #blogIdCol=blogId 30 | # blogTextCol=concat(subject, ". ", content, ". ", chosenanswer) 31 | # #blogTextCol=text_content 32 | # 33 | # # limit the number of blogs to be analyzed, mainly for testing 34 | # blogLimit=200 35 | # 36 | # # target term info in database 37 | # targetTermTbl=_target_term_ 38 | # targetTermTblDropAndCreate=true 39 | # # if true, use Solr for matching an ngram with target terms; else use a database query for matching 40 | # targetTermUsingSolr=True 41 | # 42 | # ####################################################################### 43 | # ################### NLP-related configuration ######################### 44 | # # root dir of lvg 45 | # lvgdir=C:\\fsu\\ra\\UmlsTagger\\lvg2015\\ 46 | useStanfordNLP=true 47 | # stanfordTokenizerOption= 48 | stanfordTaggerOption=model=edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger 49 | stanfordPatternFile=/data/ra/pattern/pattern_cancer_duration.txt 50 | # # use the dependency tree to find terms before using the syntactic tree. 51 | useDependencyTree=true 52 | partUmlsTermMatch=false 53 | 
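useDependencyTree above means candidate terms are first matched against patterns over the Stanford dependency parse before the syntactic tree is consulted. A minimal sketch of producing those dependency edges with the CoreNLP jar in libs/, using the same annotator set the conf files configure; the pattern syntax of pattern_cancer_duration.txt is the repo's own and is not reproduced here:

```scala
import java.util.Properties
import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation
import scala.collection.JavaConverters._

object DepTreeSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, depparse")
    val pipeline = new StanfordCoreNLP(props)
    val doc = new Annotation("The patient had breast cancer for two years.")
    pipeline.annotate(doc)
    for (sent <- doc.get(classOf[SentencesAnnotation]).asScala) {
      val graph = sent.get(classOf[CollapsedCCProcessedDependenciesAnnotation])
      // each typed dependency edge is what a pattern-file rule would be matched against
      graph.edgeIterable().asScala.foreach { e =>
        println(s"${e.getRelation}(${e.getGovernor.word}, ${e.getDependent.word})")
      }
    }
  }
}
```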
54 | # # for pattern parsing 55 | # analyzNonUmlsTerm = true 56 | # # the maximum length of a sentence (characters, not words) 57 | sentenceLenMax=500 58 | 59 | # # POS-tag inclusion filter. The Ngram (basic terms) has to contain at least one of these POS tags; this is also the definition of 'noun' in this tool. No filter if empty 60 | # #posInclusive=NN NNS NNP NNPS 61 | posInclusive= 62 | # # 0 - 100. if the similarity score for an ngram is greater than this threshold, the ngram will be considered a UMLS term 63 | umlsLikehoodLimit=80 64 | # # the window length used to fetch the context of an ngram 65 | # WinLen=10 66 | # 67 | # # used to force-delimit grams. Delimiter = Pattern.compile("[,;/\\:\\(\\)\\[\\]\\{\\}\"]+") 68 | # delimiter =[,;/\\:\\(\\)\\[\\]\\{\\}\"]+ 69 | # 70 | # # how does an ngram match the stop word list? 0: exact match; 1: ngram contains any stop word; 2: ngram starts or ends with any stop word; others: no filter 71 | # stopwordMatchType=2 72 | # # besides the stop word file, you can specify a regex that defines what a stop word is. 73 | # # exclude grams that start or end with a digit. (remove the matched item) 74 | # # exclude words that only start or end with one letter 75 | # # stopwordRegex=^\\d+.*|.*\\d$|^\\S(\\s.*|$)|(^|.*\\s)\\S ----- for clustering 76 | # # for clinical trials patterns. 77 | # stopwordRegex=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 78 | # # POS-tag filter (remove the matched item). 1: no noun; 2: ^N+P+N 3: not ending with N 79 | # #posFilterRegex=[^N]* [^N]*PN .*[^N]$ 80 | # posFilterRegex=[^N]* 81 | # # a regex to check whether a string as a whole (maybe several words) should be queried for a CUI or not 82 | # # (different from the stopword check, since the stopword check examines every word in a string) 83 | # # 1. no a-z; 2. starts or ends with a word containing no a-z 84 | # cuiStringFilterRegex=[^a-zA-Z]*|^[^a-zA-Z]+\\s.*|.*\\s[^a-zA-Z]+$ 85 | # 86 | # 87 | # # the top semantic types we use as features; only for the 'getUmlsScore' function, not 'select' 88 | # # for chv paper 89 | # #semanticType=T033,T121,T061,T047,T109,T023,T184,T074,T116,T123,T059,T046 90 | # # for clinical trials pattern paper 91 | semanticType=T200,T020,T190,T049,T019,T047,T050,T037,T048,T191,T046,T184,T060,T065,T058,T059,T063,T062,T061 92 | # # all semantic types sorted from largest to smallest in size 93 | # #semanticType=T204,T007,T200,T061,T109,T002,T121,T116,T033,T004,T201,T023,T028,T123,T047,T074,T037,T060,T126,T013,T129,T044,T170,T191,T029,T059,T043,T005,T012,T114,T015,T130,T058,T014,T030,T046,T081,T011,T019,T026,T131,T167,T097,T197,T024,T195,T025,T192,T073,T034,T040,T122,T203,T083,T042,T082,T045,T048,T184,T080,T169,T194,T168,T078,T079,T125,T098,T020,T039,T190,T093,T031,T196,T049,T067,T038,T127,T062,T171,T185,T041,T091,T032,T018,T054,T055,T070,T057,T077,T065,T090,T068,T089,T064,T022,T056,T092,T104,T052,T099,T063,T086,T101,T120,T087,T051,T017,T102,T066,T001,T008,T016,T100,T075,T050,T069,T096,T095,T053,T072,T094,T010,T103,T071,T085,T021,T088 94 | # # filter the semantic types by a regular expression. tag extraction function. 95 | # #sabFilter=SNOMEDCT_US|NCI|GO 96 | # sabFilter=.* 97 | 
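The caseFactor comment in current.cancer.properties above describes blending case-sensitive and case-insensitive matching into one similarity score. A minimal sketch of one such blend; the linear mix and the Levenshtein-based similarity are illustrative assumptions, not the actual scoring in UmlsTagger2.scala:

```scala
object CaseAwareScore {
  // Levenshtein distance, the usual dynamic-programming formulation
  private def lev(a: String, b: String): Int = {
    val d = Array.tabulate(a.length + 1, b.length + 1) { (i, j) =>
      if (i == 0) j else if (j == 0) i else 0
    }
    for (i <- 1 to a.length; j <- 1 to b.length) {
      val cost = if (a(i - 1) == b(j - 1)) 0 else 1
      d(i)(j) = math.min(math.min(d(i - 1)(j) + 1, d(i)(j - 1) + 1), d(i - 1)(j - 1) + cost)
    }
    d(a.length)(b.length)
  }

  private def sim(a: String, b: String): Double =
    1.0 - lev(a, b).toDouble / math.max(a.length, b.length).max(1)

  // caseFactor = 0: fully case-sensitive; caseFactor = 1: case is ignored
  def score(ngram: String, term: String, caseFactor: Double): Double = {
    val sensitive   = sim(ngram, term)
    val insensitive = sim(ngram.toLowerCase, term.toLowerCase)
    (1 - caseFactor) * sensitive + caseFactor * insensitive
  }
}
```

With caseFactor=0.8, a match that differs only in capitalization still scores close to 1.0, which is the behavior the comment describes.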
-------------------------------------------------------------------------------- /conf/current.chv.wiki.properties: -------------------------------------------------------------------------------- 1 | # ############# UMLS term matching configuration ###################### 2 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 3 | # # for now, the table mrstr is necessary 4 | jdbcDriver=jdbc:mysql://localhost:3306/umls?user=root&password=root 5 | 6 | useStanfordNLP=true 7 | stanfordAnnotators=tokenize,ssplit,pos,lemma,depparse 8 | #lvgdir=/data/ra/lvg2015/ 9 | memcached=127.0.0.1:11211 10 | 11 | #####*_*#### get the training data from a (previously saved) file; do not construct the Ngrams again. 12 | clusteringFromFile=false 13 | # read text from the files of a directory, instead of from a database 14 | textFromDirectory=true 15 | # the directory of the files, if textFromDirectory = true 16 | textDirectory=C:\\fsu\\ra\\wikiextractor\\wiki-test\\AA 17 | # save the ngram result to a file. 18 | ngramSaveFile=c:\\fsu\\ra\\data\\ngram_wiki.serd 19 | 20 | # # 0 - 100. if the similarity score for an ngram is greater than this threshold, the ngram will be considered a UMLS term 21 | umlsLikehoodLimit=50 22 | # # the window length used to fetch the context of an ngram 23 | # WinLen=10 24 | # max length of an ngram 25 | ngramN=3 26 | ngramKeepSentence=false 27 | 28 | # POS-tag filter (remove the matched item). 1: no noun; 2: ^N+P+N 3: not ending with N 29 | # 1. we care about N, G, A, W; 2. if more than 2 grams, it must end with N or G. 30 | posFilterRegex=[^NGAW]* [^N]*PN .*[^NAGW]$ .+[^NG]$ 31 | 32 | ####################################################################### 33 | ############### Ngram-related configuration ########################### 34 | preferLongTermTfRatio=0.5 35 | # the threshold of tf when fetching ngrams in a partition 36 | partitionTfFilter=2 37 | # when this number of ngrams is reached in a partition, start to reduce ngrams 38 | partitionReduceStartPoint=100000 39 | # after the start point, reduce ngrams again for each batch of this many new ngrams in the partition 40 | partitionReduceStartStep=10000 41 | # at least try to reduce this many ngrams, as a fraction of 'stage1ReduceStartStep' 42 | partitionReduceFraction=0.01 43 | # the threshold of tf when fetching ngrams in the first stage 44 | stag1TfFilter=2 45 | stag1CvalueFilter=1 46 | # the threshold of tf when fetching ngrams in the second stage 47 | stag2TfFilter=10 48 | stag2CvalueFilter=1 49 | # the threshold of the umls/chv score. no filter if it is -1 50 | stag2UmlsScoreFilter=-1 51 | stag2ChvScoreFilter=-1 52 | 53 | 54 | ######################## bags of words configuration ############## 55 | bagsOfWord=false 56 | bowUmlsOnly=false 57 | bowTfFilter=100 58 | # maximum number of bags of words 59 | bowTopNgram=10000 60 | bowDialogSetOne=false 61 | ######################## end of bags of words configuration ###### 62 | 63 | ####################################################################### 64 | ############# Clustering-related configuration ######################## 65 | # Nlp does not allow multi-threading, so you cannot use local[N] for generating Ngrams, but you can use it to run k-means 66 | sparkMaster=local[4] 67 | partitionNumber=8 68 | repartitionForce=true 69 | ########### only use chv terms as training data 70 | trainOnlyChv=true 71 | # filter the ngrams before running k-means (remove the matched item) 72 | trainedNgramFilterPosRegex=[^N]*PN 73 | # what percentage of the data is sampled as test data (for evaluation); <= 0 means nothing is tested 74 | testSample=30 75 | sampleRuns=1 76 | # number of ngrams for training. For test purposes. <0: no limit; 77 | trainNgramCnt=-1 78 | 79 | # PCA only. Compact the feature space matrix to an N-dimensional space using PCA. <=0, do nothing. 
80 | pcaDimension=0.95 81 | ###### k-means parameters ####### 82 | # whether to run k-means or not 83 | runKmeans=true 84 | # the start/end/step points of k (the cluster number) 85 | k_start=50 86 | k_end=51 87 | k_step=5 88 | # the maximum number of iterations of the k-means algorithm if it does not converge 89 | maxIterations=1000 90 | # run the following number of times for every k, and keep the lowest-cost one 91 | runs=10 92 | # # the top semantic types we use as features; only for the 'getUmlsScore' function, not 'select' 93 | # # all sty 94 | semanticType=T116,T020,T052,T100,T087,T011,T190,T008,T017,T195,T194,T123,T007,T031,T022,T053,T038,T012,T029,T091,T122,T023,T030,T118,T026,T043,T025,T019,T103,T120,T104,T185,T201,T200,T077,T049,T088,T060,T056,T203,T047,T065,T069,T111,T196,T050,T018,T071,T126,T204,T051,T099,T021,T013,T033,T004,T168,T169,T045,T083,T028,T064,T102,T096,T068,T093,T058,T131,T125,T016,T078,T129,T055,T197,T037,T170,T130,T171,T059,T034,T119,T015,T063,T066,T074,T041,T073,T048,T044,T085,T191,T114,T070,T124,T086,T057,T090,T115,T109,T032,T040,T001,T092,T042,T046,T072,T067,T039,T121,T002,T101,T098,T097,T094,T080,T081,T192,T014,T062,T075,T089,T167,T095,T054,T184,T082,T110,T024,T079,T061,T005,T127,T010 95 | # # for clinical trials pattern paper 96 | # # filter the semantic types by a regular expression. tag extraction function. 97 | # #sabFilter=SNOMEDCT_US|NCI|GO 98 | sabFilter=.* 99 | # save the above shown ngrams to a file 100 | saveNgram2file= 101 | 102 | ####################################################### 103 | ############### Output configuration ################################## 104 | # output normalized text for word2vec 105 | # show original ngrams before training 106 | showOrgNgramNum=100 107 | # shown-ngram filter based on N 108 | showOrgNgramOfN=1,2,3,4,5 109 | # shown-ngram filter based on POS tags 110 | showOrgNgramOfPosRegex=.* 111 | # shown-ngram filter based on text 112 | showOrgNgramOfTextRegex=.* 113 | # show this number of ngrams in every cluster. <0, show nothing 114 | showNgramInCluster=0 115 | # show the average and standard deviation of tf in clusters. Not configurable, always true 116 | #showTfAvgSdInCluster=true 117 | # what percentage of ngrams get their details shown after ranking. it shows info for every ngram in the top ${showDetailRankPt} percent; <0: don't show details; 118 | showDetailRankPt=0 119 | # if a Ngram matches this filter (regex), its detailed information will be output to the console. 120 | debugFilterNgram=aaaaaaaaaaaaaaaaaa 121 | 122 | 123 | 124 | 125 | 126 | 127 | 
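current.chv.wiki.properties above sweeps the cluster count from k_start to k_end in steps of k_step, re-running each k `runs` times and keeping the lowest-cost model. A sketch of that loop with Spark MLlib's RDD-based KMeans; how the repo's MyKmean.scala actually wires this up may differ:

```scala
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

def sweepK(data: RDD[Vector], kStart: Int, kEnd: Int, kStep: Int,
           maxIterations: Int, runs: Int): Map[Int, (KMeansModel, Double)] = {
  (kStart to kEnd by kStep).map { k =>
    // re-run 'runs' times with different seeds; keep the lowest-cost model
    val best = (1 to runs).map { seed =>
      val model = new KMeans()
        .setK(k)
        .setMaxIterations(maxIterations)
        .setSeed(seed)
        .run(data)
      (model, model.computeCost(data)) // within-set sum of squared errors
    }.minBy(_._2)
    k -> best
  }.toMap
}
```

Plotting the resulting cost against k is the usual way to pick the cluster count from such a sweep.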
", chosenanswer) 16 | #blogTextCol=text_content 17 | # limit the blog to be analyzed, mainly for test 18 | blogLimit=2000000 19 | 20 | useStanfordNLP=false 21 | memcached=127.0.0.1:11211 22 | 23 | #####*_*####get the training data from (previous save) file, do not construct the Ngram again. 24 | clusteringFromFile=true 25 | ngramSaveFile=/data/ra/ngram_yahoo_022117opennlp.serd 26 | 27 | # # 0 - 100. if the similarity score for a ngram is greater than this threshold, the ngran will be consider as umls term 28 | umlsLikehoodLimit=30 29 | # # the window length to fetch context of a ngram 30 | # WinLen=10 31 | 32 | 33 | ####################################################################### 34 | ############### Ngram relative configuration ################################### 35 | # the threshold of tf when fetch ngram in partition 36 | partitionTfFilter=2 37 | # the threshold of tf when fetch ngram in first stage 38 | stag1TfFilter=2 39 | stag1CvalueFilter=1 40 | # the threshold of tf when fetch ngram in second stage 41 | stag2TfFilter=100 42 | stag2CvalueFilter=1 43 | # the thresholh of umls/chv score. no filter if it is -1 44 | stag2UmlsScoreFilter=-1 45 | stag2ChvScoreFilter=-1 46 | 47 | 48 | ######################## bags of words configuration ############## 49 | bagsOfWord=true 50 | bowUmlsOnly=false 51 | bowTfFilter=100 52 | # maximum number of bag of words 53 | bowTopNgram=10000 54 | bowDialogSetOne=false 55 | ######################## end of bags of words configuration ###### 56 | 57 | ####################################################################### 58 | ############# Clustering relative configuration ########################## 59 | # Nlp do not allow multi-thread, so you can not use local[N] for generating Ngram, but you can use it to run kmeans 60 | sparkMaster= 61 | partitionNumber=8 62 | ########### only use chv term as trainig data 63 | trainOnlyChv=true 64 | # filter the ngran before run kmeans (remove the matched item) 65 | trainedNgramFilterPosRegex=[^N]*PN 66 | # how many percent of the data is sample as test data(for evaluation), <= 0, no thing is test 67 | testSample=30 68 | sampleRuns=10 69 | #number of ngram for training. For test purpose. <0: no limit; 70 | trainNgramCnt=-1 71 | 72 | # PCA only. Compact the feature space matrix to a N dimensions space using PCA. <=0, do nothing. 73 | pcaDimension=0.95 74 | ###### k-mean parameters ####### 75 | # if run k-mean or not 76 | runKmeans=true 77 | # the start/end/step point of the k (cluster number) 78 | k_start=10 79 | k_end=150 80 | k_step=5 81 | # the maximum of iteration of the k-mean algorithm if it is not convergent 82 | maxIterations=1000 83 | # run the following number of times for every k, and take the least cost one 84 | runs=10 85 | # # the top semantic type we make it as features; only for 'getUmlsScore' function, not 'select' 86 | # # for chv paper 87 | semanticType=T033,T121,T061,T047,T109,T023,T184,T074,T116,T123,T059,T046 88 | # # for clinical trails pattern paper 89 | # # filter the semantic type by a regular expression. tag extraction function. 
90 | # #sabFilter=SNOMEDCT_US|NCI|GO 91 | sabFilter=.* 92 | # save the above shown ngrams to a file 93 | saveNgram2file=/tmp/orgGram.txt 94 | 95 | 96 | 97 | 98 | 99 | ####################################################################### 100 | ############### Output configuration ################################## 101 | # output normalized text for word2vec 102 | # show original ngrams before training 103 | showOrgNgramNum=1000000 104 | # shown-ngram filter based on N 105 | showOrgNgramOfN=1,2,3,4,5 106 | # shown-ngram filter based on POS tags 107 | showOrgNgramOfPosRegex=.* 108 | # shown-ngram filter based on text 109 | showOrgNgramOfTextRegex=.* 110 | # show this number of ngrams in every cluster. <0, show nothing 111 | showNgramInCluster=0 112 | # show the average and standard deviation of tf in clusters. Not configurable, always true 113 | #showTfAvgSdInCluster=true 114 | # what percentage of ngrams get their details shown after ranking. it shows info for every ngram in the top ${showDetailRankPt} percent; <0: don't show details; 115 | showDetailRankPt=0 116 | # if a Ngram matches this filter (regex), its detailed information will be output to the console. 117 | debugFilterNgram=aaaaaaaaaaaaaaaaaa 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 
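pcaDimension=0.95 in the file above reads as a retained-variance fraction rather than a dimension count. A sketch of choosing the target dimensionality that way from the singular values of the feature matrix; that this is what the flag means is an assumption:

```scala
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

def pcaByVariance(rows: RDD[Vector], retain: Double): RowMatrix = {
  val mat = new RowMatrix(rows)
  val nCols = mat.numCols().toInt
  // variance captured by component i is proportional to s_i^2
  // (strictly, PCA variance would use the mean-centered matrix; close enough for a sketch)
  val svd = mat.computeSVD(nCols, computeU = false)
  val vars = svd.s.toArray.map(s => s * s)
  val total = vars.sum
  val cumulative = vars.scanLeft(0.0)(_ + _).tail
  val k = cumulative.indexWhere(_ / total >= retain) + 1
  // project the data onto the top-k principal components
  mat.multiply(mat.computePrincipalComponents(k))
}
```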
-------------------------------------------------------------------------------- /conf/current.deaf.properties: -------------------------------------------------------------------------------- 1 | # ############# UMLS term matching configuration ###################### 2 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 3 | # # for now, the table mrstr is necessary 4 | jdbcDriver=jdbc:mysql://somelab12.cci.fsu.edu:3306/umls?user=root&password=root 5 | 6 | useStanfordNLP=true 7 | stanfordAnnotators="tokenize, ssplit" 8 | #lvgdir=/data/ra/lvg2015/ 9 | # split the text into blocks before sentence segmentation. For clinical trials 10 | textBlockDelimiter=#|\\n 11 | # Special segmentation of the text before sentence segmentation (':', ' - ', 'Or|OR', 'No', 'At'). For clinical trials 12 | textBlockDelimiterSpecialEnable=true 13 | 14 | 15 | # # 0 - 100. if the similarity score for an ngram is greater than this threshold, the ngram will be considered a UMLS term 16 | umlsLikehoodLimit=80 17 | # # the window length used to fetch the context of an ngram 18 | 19 | ################# Metamap configuration ########################## 20 | MMenable=true 21 | # output options have to be implemented by yourself; don't use them as an option here. 22 | # -J (--restrict_to_sts) -e (--exclude_sources) -R (--restrict_to_sources) 23 | MMoptions=--allow_concept_gaps -R CHV -y 24 | #MMoptions=--allow_concept_gaps -R SNOMEDCT_US -y 25 | MMscoreThreshold = 800 26 | MMhost= 27 | MMport= 28 | # only perform metamap parsing. 29 | MMonly=true 30 | 31 | ################# end Metamap configuration ####################### 32 | 33 | ####################################################################### 34 | ############### Ngram-related configuration ########################### 35 | # the threshold of tf when fetching ngrams in a partition 36 | partitionTfFilter=2 37 | # the threshold of tf when fetching ngrams in the first stage 38 | stag1TfFilter=2 39 | stag1CvalueFilter=1 40 | # the threshold of tf when fetching ngrams in the second stage 41 | stag2TfFilter=5 42 | stag2CvalueFilter=1 43 | # the threshold of the umls/chv score. no filter if it is -1 44 | stag2UmlsScoreFilter=-1 45 | stag2ChvScoreFilter=-1 46 | 47 | 48 | ######################## bags of words configuration ############## 49 | bagsOfWord=false 50 | bowUmlsOnly=false 51 | bowTfFilter=100 52 | # maximum number of bags of words 53 | bowTopNgram=10000 54 | bowDialogSetOne=false 55 | ######################## end of bags of words configuration ###### 56 | 57 | ######################################################################## 58 | ############## Clustering-related configuration ######################### 59 | ## Nlp does not allow multi-threading, so you cannot use local[N] for generating Ngrams, but you can use it to run k-means 60 | #sparkMaster=local[2] 61 | #partitionNumber=8 62 | ############ only use chv terms as training data 63 | #trainOnlyChv=true 64 | ## filter the ngrams before running k-means (remove the matched item) 65 | #trainedNgramFilterPosRegex=[^N]*PN 66 | ## what percentage of the data is sampled as test data (for evaluation); <= 0 means nothing is tested 67 | #testSample=30 68 | #sampleRuns=1 69 | ## number of ngrams for training. For test purposes. <0: no limit; 70 | #trainNgramCnt=-1 71 | # 72 | ## PCA only. Compact the feature space matrix to an N-dimensional space using PCA. <=0, do nothing. 73 | #pcaDimension=0.95 74 | ####### k-means parameters ####### 75 | ## whether to run k-means or not 76 | #runKmeans=true 77 | ## the start/end/step points of k (the cluster number) 78 | #k_start=10 79 | #k_end=150 80 | #k_step=5 81 | ## the maximum number of iterations of the k-means algorithm if it does not converge 82 | #maxIterations=1000 83 | ## run the following number of times for every k, and keep the lowest-cost one 84 | #runs=10 85 | # # the top semantic types we use as features; only for the 'getUmlsScore' function, not 'select' 86 | # # for chv paper 87 | semanticType=T204,T007,T200,T061,T109,T002,T121,T116,T033,T004,T201,T023,T028,T123,T047,T074,T037,T060,T126,T013,T129,T044,T170,T191,T029,T059,T043,T005,T012,T114,T015,T130,T058,T014,T030,T046,T081,T011,T019,T026,T131,T167,T097,T197,T024,T195,T025,T192,T073,T034,T040,T122,T203,T083,T042,T082,T045,T048,T184,T080,T169,T194,T168,T078,T079,T125,T098,T020,T039,T190,T093,T031,T196,T049,T067,T038,T127,T062,T171,T185,T041,T091,T032,T018,T054,T055,T070,T057,T077,T065,T090,T068,T089,T064,T022,T056,T092,T104,T052,T099,T063,T086,T101,T120,T087,T051,T017,T102,T066,T001,T008,T016,T100,T075,T050,T069,T096,T095,T053,T072,T094,T010,T103,T071,T085,T021,T088 88 | # # for clinical trials pattern paper 89 | # # filter the semantic types by a regular expression. 90 | sabFilter=SNOMEDCT_US|CHV 91 | #sabFilter=.* 92 | # save the above shown ngrams to a file 93 | saveNgram2file=C:/fsu/ra/data/orgGram.txt 94 | 95 | 96 | 97 | 98 | 99 | ####################################################################### 100 | ############### Output configuration ################################## 101 | # output normalized text for word2vec 102 | # show original ngrams before training 103 | showOrgNgramNum=1000000 104 | # shown-ngram filter based on N 105 | showOrgNgramOfN=1,2,3,4,5 106 | # shown-ngram filter based on POS tags 107 | showOrgNgramOfPosRegex=.* 108 | # shown-ngram filter based on text 109 | showOrgNgramOfTextRegex=.* 110 | # show this number of ngrams in every cluster. <0, show nothing 111 | showNgramInCluster=0 112 | # show the average and standard deviation of tf in clusters. Not configurable, always true 113 | #showTfAvgSdInCluster=true 114 | # what percentage of ngrams get their details shown after ranking. it shows info for every ngram in the top ${showDetailRankPt} percent; <0: don't show details; 115 | showDetailRankPt=0 116 | # if a Ngram matches this filter (regex), its detailed information will be output to the console. 117 | debugFilterNgram=aaaaaaaaaaaaaaaaaa 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 
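current.deaf.properties above routes term extraction through MetaMap (MMenable, MMoptions, MMscoreThreshold). A minimal sketch against the MetaMap Java API shipped as libs/metamap-api-2.0.jar, assuming an mmserver is reachable on its defaults; that the threshold is compared against the absolute value of MetaMap's (negative) scores, as MMApi.scala presumably does, is an assumption:

```scala
import gov.nih.nlm.nls.metamap.{MetaMapApi, MetaMapApiImpl}
import scala.collection.JavaConverters._

object MMSketch {
  def main(args: Array[String]): Unit = {
    val api: MetaMapApi = new MetaMapApiImpl() // defaults to localhost:8066
    api.setOptions("--allow_concept_gaps -R CHV -y")
    val results = api.processCitationsFromString("renal insufficiency").asScala
    // walk utterances -> phrases -> mappings -> candidate concepts
    for (r   <- results;
         utt <- r.getUtteranceList.asScala;
         pcm <- utt.getPCMList.asScala;
         m   <- pcm.getMappingList.asScala;
         ev  <- m.getEvList.asScala
         if math.abs(ev.getScore) >= 800) {  // MMscoreThreshold
      println(s"${ev.getConceptId} ${ev.getPreferredName} score=${ev.getScore}")
    }
  }
}
```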
-------------------------------------------------------------------------------- /conf/current.init_index.properties: -------------------------------------------------------------------------------- 1 | # ############# UMLS term matching configuration ###################### 2 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 3 | # # for now, the table mrstr is necessary 4 | jdbcDriver=jdbc:mysql://127.0.0.1:3306/umls?useServerPrepStmts=false&rewriteBatchedStatements=true&user=root&password=root 5 | 6 | ################################# used for target UMLS terms ###################### 7 | # target term info in database 8 | sourceTermTbl=umls.mrconso 9 | targetTermTbl=_target_term_ 10 | targetTermTblDropAndCreate=true 11 | # 'a' and 'b' represent the self-joined mrconso table. 12 | sourceTermQueryOption= a.lat='ENG' and b.lat='ENG' limit 100 13 | # if true, use Solr for matching an ngram with target terms; else use a database query for matching 14 | targetTermUsingSolr=false 15 | 16 | 17 | ####################################################################### 18 | ################### NLP-related configuration ######################### 19 | # root dir of lvg. Using Stanford NLP is recommended. 20 | useStanfordNLP=true 21 | stanfordAnnotators=tokenize, ssplit, pos, lemma
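The init_index configuration above builds the target-term table from a self-join of mrconso, splicing sourceTermQueryOption into the WHERE clause. A hypothetical rendering of that query shape over JDBC; the projected columns (cui, str) are illustrative guesses, not the repo's actual schema:

```scala
import java.sql.DriverManager

object BuildTargetTerms {
  def main(args: Array[String]): Unit = {
    val url = "jdbc:mysql://127.0.0.1:3306/umls?user=root&password=root"
    val queryOption = "a.lat='ENG' and b.lat='ENG' limit 100" // sourceTermQueryOption
    val conn = DriverManager.getConnection(url)
    try {
      val st = conn.createStatement()
      st.executeUpdate("DROP TABLE IF EXISTS _target_term_") // targetTermTblDropAndCreate=true
      // hypothetical projection: one row per (CUI, string) pair of English atoms
      st.executeUpdate(
        s"""CREATE TABLE _target_term_ AS
           |SELECT DISTINCT a.cui, b.str
           |FROM umls.mrconso a JOIN umls.mrconso b ON a.cui = b.cui
           |WHERE $queryOption""".stripMargin)
    } finally conn.close()
  }
}
```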
-------------------------------------------------------------------------------- /conf/current.knn.properties: -------------------------------------------------------------------------------- 1 | # ############# UMLS term matching configuration ###################### 2 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 3 | # # for now, the table mrstr is necessary 4 | jdbcDriver=jdbc:mysql://somelab12.cci.fsu.edu:3306/umls?user=root&password=root 5 | 6 | useStanfordNLP=true 7 | #lvgdir=/data/ra/lvg2015/ 8 | 9 | 10 | #####*_*#### get the training data from a (previously saved) file; do not construct the Ngrams again. 11 | clusteringFromFile=true 12 | ngramSaveFile=C:/fsu/ra/data/ngram_yahoo_022117stanfordnlp.serd 13 | 14 | # # 0 - 100. if the similarity score for an ngram is greater than this threshold, the ngram will be considered a UMLS term 15 | umlsLikehoodLimit=80 16 | # # the window length used to fetch the context of an ngram 17 | 18 | ################# end Metamap configuration ####################### 19 | 20 | ####################################################################### 21 | ############### Ngram-related configuration ########################### 22 | # the threshold of tf when fetching ngrams in a partition 23 | partitionTfFilter=2 24 | # the threshold of tf when fetching ngrams in the first stage 25 | stag1TfFilter=2 26 | stag1CvalueFilter=1 27 | # the threshold of tf when fetching ngrams in the second stage 28 | stag2TfFilter=10 29 | stag2CvalueFilter=1 30 | # the threshold of the umls/chv score. no filter if it is -1 31 | stag2UmlsScoreFilter=-1 32 | stag2ChvScoreFilter=-1 33 | 34 | ####################################################################### 35 | ############# Clustering-related configuration ######################## 36 | # Nlp does not allow multi-threading, so you cannot use local[N] for generating Ngrams, but you can use it to run k-means 37 | sparkMaster=local[2] 38 | partitionNumber=8 39 | ########### only use chv terms as training data 40 | trainOnlyChv=true 41 | # filter the ngrams before running k-means (remove the matched item) 42 | trainedNgramFilterPosRegex=[^N]*PN 43 | # what percentage of the data is sampled as test data (for evaluation); <= 0 means nothing is tested 44 | testSample=30 45 | sampleRuns=1 46 | # number of ngrams for training. For test purposes. <0: no limit; 47 | trainNgramCnt=-1 48 | 49 | # PCA only. Compact the feature space matrix to an N-dimensional space using PCA. <=0, do nothing. 50 | pcaDimension=0 51 | 52 | # # the top semantic types we use as features; only for the 'getUmlsScore' function, not 'select' 53 | # # for chv paper 54 | semanticType=T204,T007,T200,T061,T109,T002,T121,T116,T033,T004,T201,T023,T028,T123,T047,T074,T037,T060,T126,T013,T129,T044,T170,T191,T029,T059,T043,T005,T012,T114,T015,T130,T058,T014,T030,T046,T081,T011,T019,T026,T131,T167,T097,T197,T024,T195,T025,T192,T073,T034,T040,T122,T203,T083,T042,T082,T045,T048,T184,T080,T169,T194,T168,T078,T079,T125,T098,T020,T039,T190,T093,T031,T196,T049,T067,T038,T127,T062,T171,T185,T041,T091,T032,T018,T054,T055,T070,T057,T077,T065,T090,T068,T089,T064,T022,T056,T092,T104,T052,T099,T063,T086,T101,T120,T087,T051,T017,T102,T066,T001,T008,T016,T100,T075,T050,T069,T096,T095,T053,T072,T094,T010,T103,T071,T085,T021,T088 55 | # # for clinical trials pattern paper 56 | # # filter the semantic types by a regular expression. 57 | #sabFilter=SNOMEDCT_US|CHV 58 | sabFilter=.* 59 | # save the above shown ngrams to a file 60 | saveNgram2file=C:/fsu/ra/data/orgGram.txt 61 | 62 | 63 | ####################################################################### 64 | ############### Output configuration ################################## 65 | # output normalized text for word2vec 66 | # show original ngrams before training 67 | showOrgNgramNum=100 68 | # shown-ngram filter based on N 69 | showOrgNgramOfN=1,2,3,4,5 70 | # shown-ngram filter based on POS tags 71 | showOrgNgramOfPosRegex=.* 72 | # shown-ngram filter based on text 73 | showOrgNgramOfTextRegex=.* 74 | # show this number of ngrams in every cluster. <0, show nothing 75 | showNgramInCluster=0 76 | # show the average and standard deviation of tf in clusters. Not configurable, always true 77 | #showTfAvgSdInCluster=true 78 | # what percentage of ngrams get their details shown after ranking. it shows info for every ngram in the top ${showDetailRankPt} percent; <0: don't show details; 79 | showDetailRankPt=0 80 | # if a Ngram matches this filter (regex), its detailed information will be output to the console. 81 | debugFilterNgram=aaaaaaaaaaaaaaaaaa 82 | 83 | 84 | 85 | 86 | 87 | 88 | 
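clusteringFromFile/ngramSaveFile above let a run reload previously constructed n-grams instead of rebuilding them. The .serd extension suggests plain Java serialization; a sketch under that assumption (the real payload is the repo's Ngram class from com.votors.ml, not the Seq[String] used here):

```scala
import java.io._

object NgramCache {
  def save[T <: Serializable](path: String, value: T): Unit = {
    val out = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(path)))
    try out.writeObject(value) finally out.close()
  }
  def load[T](path: String): T = {
    val in = new ObjectInputStream(new BufferedInputStream(new FileInputStream(path)))
    try in.readObject().asInstanceOf[T] finally in.close()
  }
}

// e.g. NgramCache.save("ngram_test.serd", Seq("breast cancer", "bone density"))
//      val ngrams = NgramCache.load[Seq[String]]("ngram_test.serd")
```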
-------------------------------------------------------------------------------- /conf/current.socialqa.botanical.properties: -------------------------------------------------------------------------------- 1 | # ############# UMLS term matching configuration ###################### 2 | # # jdbcDriver is the database URL used to fetch extra info for a term in UMLS, e.g. selecting the TUI by CUI from the table MRSTY. 3 | # # for now, the table mrstr is necessary 4 | jdbcDriver=jdbc:mysql://somelab12.cci.fsu.edu:3306/umls?user=root&password=root 5 | 6 | useStanfordNLP=true 7 | stanfordAnnotators=tokenize, ssplit, pos, lemma 8 | 9 | memcached=127.0.0.1:11211 10 | # default expiry time of memcached entries: 1 week. 11 | defaultExpireTime=604800 12 | ehCacheEntities=500000 13 | ##################################################################################### 14 | ################################# fuzzy matching configuration ###################### 15 | ##### You can use Solr or MySQL as the index search server. You have to initialize at least one of them. 16 | ##### Using the database is easier to configure, but it may be a little slower. 17 | # if true, use Solr for matching an ngram with target terms; else use a database query for matching 18 | targetTermUsingSolr=False 19 | targetTermTbl=_target_term_botanical_ 20 | 21 | # # 0 - 100. if the similarity score for an ngram is greater than this threshold, the ngram will be considered a UMLS term 22 | umlsLikehoodLimit=80 23 | # # the window length used to fetch the context of an ngram 24 | 25 | ################# Metamap configuration ########################## 26 | MMenable=false 27 | 28 | ################# end Metamap configuration ####################### 29 | 30 | ####################################################################### 31 | 32 | # whether to use semantic information. if this flag is false, all semantic functions are disabled. 33 | useSemanticeType=false 34 | # when there are multiple matched terms, reduce to one term by semantic type preference. 35 | reduceMatchedTermBySty=false 36 | # # the top semantic types we use as features; only for the 'getUmlsScore' function, not 'select' 37 | # # for chv paper 38 | #semanticType=T204 39 | # # for clinical trials pattern paper 40 | # # filter the semantic types by a regular expression. 41 | sabFilter=.* 42 | 43 | 44 | 45 | 46 | ####################################################################### 47 | ############### Output configuration ################################## 48 | # output normalized text for word2vec 49 | # show original ngrams before training 50 | showOrgNgramNum=1000000 51 | # shown-ngram filter based on N 52 | showOrgNgramOfN=1,2,3,4,5 53 | # shown-ngram filter based on POS tags 54 | showOrgNgramOfPosRegex=.* 55 | # shown-ngram filter based on text 56 | showOrgNgramOfTextRegex=.* 57 | # show this number of ngrams in every cluster. <0, show nothing 58 | showNgramInCluster=0 59 | # show the average and standard deviation of tf in clusters. Not configurable, always true 60 | #showTfAvgSdInCluster=true 61 | # what percentage of ngrams get their details shown after ranking. it shows info for every ngram in the top ${showDetailRankPt} percent; <0: don't show details; 62 | showDetailRankPt=0 63 | # if a Ngram matches this filter (regex), its detailed information will be output to the console. 
64 | debugFilterNgram=aaaaaaaaaaaaaaaaaa 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /conf/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.spark-project.jetty=WARN 10 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | log4j.logger.org.apache.parquet=ERROR 14 | log4j.logger.parquet=ERROR 15 | 16 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 17 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 18 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 19 | -------------------------------------------------------------------------------- /data/PennTreebankP.O.S.Tags.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Penn Treebank P.O.S. Tags 4 | 5 | 6 | 7 | 8 |

Alphabetical list of part-of-speech tags used in the Penn Treebank Project:

Number  Tag     Description
1. CC Coordinating conjunction
2. CD Cardinal number
3. DT Determiner
4. EX Existential there
5. FW Foreign word
6. IN Preposition or subordinating conjunction
7. JJ Adjective
8. JJR Adjective, comparative
9. JJS Adjective, superlative
10. LS List item marker
11. MD Modal
12. NN Noun, singular or mass
13. NNS Noun, plural
14. NNP Proper noun, singular
15. NNPS Proper noun, plural
16. PDT Predeterminer
17. POS Possessive ending
18. PRP Personal pronoun
19. PRP$ Possessive pronoun
20. RB Adverb
21. RBR Adverb, comparative
22. RBS Adverb, superlative
23. RP Particle
24. SYM Symbol
25. TO to
26. UH Interjection
27. VB Verb, base form
28. VBD Verb, past tense
29. VBG Verb, gerund or present participle
30. VBN Verb, past participle
31. VBP Verb, non-3rd person singular present
32. VBZ Verb, 3rd person singular present
33. WDT Wh-determiner
34. WP Wh-pronoun
35. WP$ Possessive wh-pronoun
36. WRB Wh-adverb
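The posFilterRegex values in the conf/ files above operate on a compressed POS string built from this tag set: data/pos-transformation.csv (further down in this listing) maps each Penn tag to a one-letter code (NN/NNS/NNP/NNPS to N, JJ* to A, IN to P, VBG to G, and so on). A sketch of that compression and filtering; the map is an excerpt of the CSV, and reading the config value as space-separated alternative patterns is an assumption:

```scala
object PosFilter {
  // excerpt of data/pos-transformation.csv; extend with the remaining rows
  val abbrev = Map(
    "CC" -> "C", "CD" -> "M", "DT" -> "D", "IN" -> "P",
    "JJ" -> "A", "JJR" -> "A", "JJS" -> "A",
    "NN" -> "N", "NNS" -> "N", "NNP" -> "N", "NNPS" -> "N",
    "RB" -> "R", "VB" -> "V", "VBG" -> "G", "VBN" -> "B")

  // true if the ngram's compressed POS sequence matches any removal pattern
  def removed(tags: Seq[String], posFilterRegex: String): Boolean = {
    val compressed = tags.map(t => abbrev.getOrElse(t, "O")).mkString
    posFilterRegex.split("\\s+").exists(compressed.matches)
  }
}

// e.g. PosFilter.removed(Seq("JJ", "NN"), "[^N]* [^N]*PN")
//      => false: "AN" survives both removal patterns, so the ngram is kept
```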
201 | 202 | 203 | 204 | 205 | -------------------------------------------------------------------------------- /data/SemGroups.txt: -------------------------------------------------------------------------------- 1 | ACTI|Activities & Behaviors|T052|Activity 2 | ACTI|Activities & Behaviors|T053|Behavior 3 | ACTI|Activities & Behaviors|T056|Daily or Recreational Activity 4 | ACTI|Activities & Behaviors|T051|Event 5 | ACTI|Activities & Behaviors|T064|Governmental or Regulatory Activity 6 | ACTI|Activities & Behaviors|T055|Individual Behavior 7 | ACTI|Activities & Behaviors|T066|Machine Activity 8 | ACTI|Activities & Behaviors|T057|Occupational Activity 9 | ACTI|Activities & Behaviors|T054|Social Behavior 10 | ANAT|Anatomy|T017|Anatomical Structure 11 | ANAT|Anatomy|T029|Body Location or Region 12 | ANAT|Anatomy|T023|Body Part, Organ, or Organ Component 13 | ANAT|Anatomy|T030|Body Space or Junction 14 | ANAT|Anatomy|T031|Body Substance 15 | ANAT|Anatomy|T022|Body System 16 | ANAT|Anatomy|T025|Cell 17 | ANAT|Anatomy|T026|Cell Component 18 | ANAT|Anatomy|T018|Embryonic Structure 19 | ANAT|Anatomy|T021|Fully Formed Anatomical Structure 20 | ANAT|Anatomy|T024|Tissue 21 | CHEM|Chemicals & Drugs|T116|Amino Acid, Peptide, or Protein 22 | CHEM|Chemicals & Drugs|T195|Antibiotic 23 | CHEM|Chemicals & Drugs|T123|Biologically Active Substance 24 | CHEM|Chemicals & Drugs|T122|Biomedical or Dental Material 25 | CHEM|Chemicals & Drugs|T118|Carbohydrate 26 | CHEM|Chemicals & Drugs|T103|Chemical 27 | CHEM|Chemicals & Drugs|T120|Chemical Viewed Functionally 28 | CHEM|Chemicals & Drugs|T104|Chemical Viewed Structurally 29 | CHEM|Chemicals & Drugs|T200|Clinical Drug 30 | CHEM|Chemicals & Drugs|T111|Eicosanoid 31 | CHEM|Chemicals & Drugs|T196|Element, Ion, or Isotope 32 | CHEM|Chemicals & Drugs|T126|Enzyme 33 | CHEM|Chemicals & Drugs|T131|Hazardous or Poisonous Substance 34 | CHEM|Chemicals & Drugs|T125|Hormone 35 | CHEM|Chemicals & Drugs|T129|Immunologic Factor 36 | CHEM|Chemicals & Drugs|T130|Indicator, Reagent, or Diagnostic Aid 37 | CHEM|Chemicals & Drugs|T197|Inorganic Chemical 38 | CHEM|Chemicals & Drugs|T119|Lipid 39 | CHEM|Chemicals & Drugs|T124|Neuroreactive Substance or Biogenic Amine 40 | CHEM|Chemicals & Drugs|T114|Nucleic Acid, Nucleoside, or Nucleotide 41 | CHEM|Chemicals & Drugs|T109|Organic Chemical 42 | CHEM|Chemicals & Drugs|T115|Organophosphorus Compound 43 | CHEM|Chemicals & Drugs|T121|Pharmacologic Substance 44 | CHEM|Chemicals & Drugs|T192|Receptor 45 | CHEM|Chemicals & Drugs|T110|Steroid 46 | CHEM|Chemicals & Drugs|T127|Vitamin 47 | CONC|Concepts & Ideas|T185|Classification 48 | CONC|Concepts & Ideas|T077|Conceptual Entity 49 | CONC|Concepts & Ideas|T169|Functional Concept 50 | CONC|Concepts & Ideas|T102|Group Attribute 51 | CONC|Concepts & Ideas|T078|Idea or Concept 52 | CONC|Concepts & Ideas|T170|Intellectual Product 53 | CONC|Concepts & Ideas|T171|Language 54 | CONC|Concepts & Ideas|T080|Qualitative Concept 55 | CONC|Concepts & Ideas|T081|Quantitative Concept 56 | CONC|Concepts & Ideas|T089|Regulation or Law 57 | CONC|Concepts & Ideas|T082|Spatial Concept 58 | CONC|Concepts & Ideas|T079|Temporal Concept 59 | DEVI|Devices|T203|Drug Delivery Device 60 | DEVI|Devices|T074|Medical Device 61 | DEVI|Devices|T075|Research Device 62 | DISO|Disorders|T020|Acquired Abnormality 63 | DISO|Disorders|T190|Anatomical Abnormality 64 | DISO|Disorders|T049|Cell or Molecular Dysfunction 65 | DISO|Disorders|T019|Congenital Abnormality 66 | DISO|Disorders|T047|Disease or Syndrome 67 | 
DISO|Disorders|T050|Experimental Model of Disease 68 | DISO|Disorders|T033|Finding 69 | DISO|Disorders|T037|Injury or Poisoning 70 | DISO|Disorders|T048|Mental or Behavioral Dysfunction 71 | DISO|Disorders|T191|Neoplastic Process 72 | DISO|Disorders|T046|Pathologic Function 73 | DISO|Disorders|T184|Sign or Symptom 74 | GENE|Genes & Molecular Sequences|T087|Amino Acid Sequence 75 | GENE|Genes & Molecular Sequences|T088|Carbohydrate Sequence 76 | GENE|Genes & Molecular Sequences|T028|Gene or Genome 77 | GENE|Genes & Molecular Sequences|T085|Molecular Sequence 78 | GENE|Genes & Molecular Sequences|T086|Nucleotide Sequence 79 | GEOG|Geographic Areas|T083|Geographic Area 80 | LIVB|Living Beings|T100|Age Group 81 | LIVB|Living Beings|T011|Amphibian 82 | LIVB|Living Beings|T008|Animal 83 | LIVB|Living Beings|T194|Archaeon 84 | LIVB|Living Beings|T007|Bacterium 85 | LIVB|Living Beings|T012|Bird 86 | LIVB|Living Beings|T204|Eukaryote 87 | LIVB|Living Beings|T099|Family Group 88 | LIVB|Living Beings|T013|Fish 89 | LIVB|Living Beings|T004|Fungus 90 | LIVB|Living Beings|T096|Group 91 | LIVB|Living Beings|T016|Human 92 | LIVB|Living Beings|T015|Mammal 93 | LIVB|Living Beings|T001|Organism 94 | LIVB|Living Beings|T101|Patient or Disabled Group 95 | LIVB|Living Beings|T002|Plant 96 | LIVB|Living Beings|T098|Population Group 97 | LIVB|Living Beings|T097|Professional or Occupational Group 98 | LIVB|Living Beings|T014|Reptile 99 | LIVB|Living Beings|T010|Vertebrate 100 | LIVB|Living Beings|T005|Virus 101 | OBJC|Objects|T071|Entity 102 | OBJC|Objects|T168|Food 103 | OBJC|Objects|T073|Manufactured Object 104 | OBJC|Objects|T072|Physical Object 105 | OBJC|Objects|T167|Substance 106 | OCCU|Occupations|T091|Biomedical Occupation or Discipline 107 | OCCU|Occupations|T090|Occupation or Discipline 108 | ORGA|Organizations|T093|Health Care Related Organization 109 | ORGA|Organizations|T092|Organization 110 | ORGA|Organizations|T094|Professional Society 111 | ORGA|Organizations|T095|Self-help or Relief Organization 112 | PHEN|Phenomena|T038|Biologic Function 113 | PHEN|Phenomena|T069|Environmental Effect of Humans 114 | PHEN|Phenomena|T068|Human-caused Phenomenon or Process 115 | PHEN|Phenomena|T034|Laboratory or Test Result 116 | PHEN|Phenomena|T070|Natural Phenomenon or Process 117 | PHEN|Phenomena|T067|Phenomenon or Process 118 | PHYS|Physiology|T043|Cell Function 119 | PHYS|Physiology|T201|Clinical Attribute 120 | PHYS|Physiology|T045|Genetic Function 121 | PHYS|Physiology|T041|Mental Process 122 | PHYS|Physiology|T044|Molecular Function 123 | PHYS|Physiology|T032|Organism Attribute 124 | PHYS|Physiology|T040|Organism Function 125 | PHYS|Physiology|T042|Organ or Tissue Function 126 | PHYS|Physiology|T039|Physiologic Function 127 | PROC|Procedures|T060|Diagnostic Procedure 128 | PROC|Procedures|T065|Educational Activity 129 | PROC|Procedures|T058|Health Care Activity 130 | PROC|Procedures|T059|Laboratory Procedure 131 | PROC|Procedures|T063|Molecular Biology Research Technique 132 | PROC|Procedures|T062|Research Activity 133 | PROC|Procedures|T061|Therapeutic or Preventive Procedure 134 | -------------------------------------------------------------------------------- /data/SemanticTypes_2013AA.txt: -------------------------------------------------------------------------------- 1 | aapp|T116|Amino Acid, Peptide, or Protein 2 | acab|T020|Acquired Abnormality 3 | acty|T052|Activity 4 | aggp|T100|Age Group 5 | amas|T087|Amino Acid Sequence 6 | amph|T011|Amphibian 7 | anab|T190|Anatomical Abnormality 8 | 
anim|T008|Animal 9 | anst|T017|Anatomical Structure 10 | antb|T195|Antibiotic 11 | arch|T194|Archaeon 12 | bacs|T123|Biologically Active Substance 13 | bact|T007|Bacterium 14 | bdsu|T031|Body Substance 15 | bdsy|T022|Body System 16 | bhvr|T053|Behavior 17 | biof|T038|Biologic Function 18 | bird|T012|Bird 19 | blor|T029|Body Location or Region 20 | bmod|T091|Biomedical Occupation or Discipline 21 | bodm|T122|Biomedical or Dental Material 22 | bpoc|T023|Body Part, Organ, or Organ Component 23 | bsoj|T030|Body Space or Junction 24 | carb|T118|Carbohydrate 25 | celc|T026|Cell Component 26 | celf|T043|Cell Function 27 | cell|T025|Cell 28 | cgab|T019|Congenital Abnormality 29 | chem|T103|Chemical 30 | chvf|T120|Chemical Viewed Functionally 31 | chvs|T104|Chemical Viewed Structurally 32 | clas|T185|Classification 33 | clna|T201|Clinical Attribute 34 | clnd|T200|Clinical Drug 35 | cnce|T077|Conceptual Entity 36 | comd|T049|Cell or Molecular Dysfunction 37 | crbs|T088|Carbohydrate Sequence 38 | diap|T060|Diagnostic Procedure 39 | dora|T056|Daily or Recreational Activity 40 | drdd|T203|Drug Delivery Device 41 | dsyn|T047|Disease or Syndrome 42 | edac|T065|Educational Activity 43 | eehu|T069|Environmental Effect of Humans 44 | eico|T111|Eicosanoid 45 | elii|T196|Element, Ion, or Isotope 46 | emod|T050|Experimental Model of Disease 47 | emst|T018|Embryonic Structure 48 | enty|T071|Entity 49 | enzy|T126|Enzyme 50 | euka|T204|Eukaryote 51 | evnt|T051|Event 52 | famg|T099|Family Group 53 | ffas|T021|Fully Formed Anatomical Structure 54 | fish|T013|Fish 55 | fndg|T033|Finding 56 | fngs|T004|Fungus 57 | food|T168|Food 58 | ftcn|T169|Functional Concept 59 | genf|T045|Genetic Function 60 | geoa|T083|Geographic Area 61 | gngm|T028|Gene or Genome 62 | gora|T064|Governmental or Regulatory Activity 63 | grpa|T102|Group Attribute 64 | grup|T096|Group 65 | hcpp|T068|Human-caused Phenomenon or Process 66 | hcro|T093|Health Care Related Organization 67 | hlca|T058|Health Care Activity 68 | hops|T131|Hazardous or Poisonous Substance 69 | horm|T125|Hormone 70 | humn|T016|Human 71 | idcn|T078|Idea or Concept 72 | imft|T129|Immunologic Factor 73 | inbe|T055|Individual Behavior 74 | inch|T197|Inorganic Chemical 75 | inpo|T037|Injury or Poisoning 76 | inpr|T170|Intellectual Product 77 | irda|T130|Indicator, Reagent, or Diagnostic Aid 78 | lang|T171|Language 79 | lbpr|T059|Laboratory Procedure 80 | lbtr|T034|Laboratory or Test Result 81 | lipd|T119|Lipid 82 | mamm|T015|Mammal 83 | mbrt|T063|Molecular Biology Research Technique 84 | mcha|T066|Machine Activity 85 | medd|T074|Medical Device 86 | menp|T041|Mental Process 87 | mnob|T073|Manufactured Object 88 | mobd|T048|Mental or Behavioral Dysfunction 89 | moft|T044|Molecular Function 90 | mosq|T085|Molecular Sequence 91 | neop|T191|Neoplastic Process 92 | nnon|T114|Nucleic Acid, Nucleoside, or Nucleotide 93 | npop|T070|Natural Phenomenon or Process 94 | nsba|T124|Neuroreactive Substance or Biogenic Amine 95 | nusq|T086|Nucleotide Sequence 96 | ocac|T057|Occupational Activity 97 | ocdi|T090|Occupation or Discipline 98 | opco|T115|Organophosphorus Compound 99 | orch|T109|Organic Chemical 100 | orga|T032|Organism Attribute 101 | orgf|T040|Organism Function 102 | orgm|T001|Organism 103 | orgt|T092|Organization 104 | ortf|T042|Organ or Tissue Function 105 | patf|T046|Pathologic Function 106 | phob|T072|Physical Object 107 | phpr|T067|Phenomenon or Process 108 | phsf|T039|Physiologic Function 109 | phsu|T121|Pharmacologic Substance 110 | plnt|T002|Plant 111 | podg|T101|Patient or 
Disabled Group 112 | popg|T098|Population Group 113 | prog|T097|Professional or Occupational Group 114 | pros|T094|Professional Society 115 | qlco|T080|Qualitative Concept 116 | qnco|T081|Quantitative Concept 117 | rcpt|T192|Receptor 118 | rept|T014|Reptile 119 | resa|T062|Research Activity 120 | resd|T075|Research Device 121 | rnlw|T089|Regulation or Law 122 | sbst|T167|Substance 123 | shro|T095|Self-help or Relief Organization 124 | socb|T054|Social Behavior 125 | sosy|T184|Sign or Symptom 126 | spco|T082|Spatial Concept 127 | strd|T110|Steroid 128 | tisu|T024|Tissue 129 | tmco|T079|Temporal Concept 130 | topp|T061|Therapeutic or Preventive Procedure 131 | virs|T005|Virus 132 | vita|T127|Vitamin 133 | vtbt|T010|Vertebrate 134 | -------------------------------------------------------------------------------- /data/clinical_text.txt: -------------------------------------------------------------------------------- 1 | Criteria 2 | Inclusion Criteria: 3 | 4 | female 5 | >= 65 years old 6 | postmenopausal for > 5 years (WHO definition of menopause) 7 | Exclusion Criteria: 8 | 9 | currently taking osteoporosis related medication (HRT, SERM, bisphosphonate, PTH, calcitonin, fluoride) 10 | had cancer in past 10 years, likely to metastasize to bone (ie: breast, lung) 11 | have intrinsic bone disease (ie: Paget's Disease, Cushings syndrome) 12 | have untreated malabsorption syndrome (ie: Celiac Disease) 13 | renal insufficiency (CrCl <30ml/min) 14 | hyperparathyroidism, hypoparathyroidism 15 | chronic systemic glucocorticosteroid use > 3mos, dose>2.5mg daily 16 | -------------------------------------------------------------------------------- /data/en-chunker.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/en-chunker.bin -------------------------------------------------------------------------------- /data/en-parser-chunking.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/en-parser-chunking.bin -------------------------------------------------------------------------------- /data/en-pos-maxent.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/en-pos-maxent.bin -------------------------------------------------------------------------------- /data/en-sent.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/en-sent.bin -------------------------------------------------------------------------------- /data/en-token.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/en-token.bin -------------------------------------------------------------------------------- /data/pos-transformation.csv: -------------------------------------------------------------------------------- 1 | Number,Tag,Description,abbr.,,,distinct abbr. 
2 | 1,CC,Coordinating conjunction,C,,,C 3 | 2,CD,Cardinal number,M,,,M 4 | 3,DT,Determiner,D,,,D 5 | 4,EX,Existential?there,E,,,E 6 | 5,FW,Foreign word,F,,,F 7 | 6,IN,Preposition or subordinating conjunction,P,,,P 8 | 7,JJ,Adjective,A,,,A 9 | 8,JJR,"Adjective, comparative",A,,,O 10 | 9,JJS,"Adjective, superlative",A,,,N 11 | 10,LS,List item marker,O,,,U 12 | 11,MD,Modal,O,,,R 13 | 12,NN,"Noun, singular or mass",N,,,T 14 | 13,NNS,"Noun, plural",N,,,V 15 | 14,NNP,"Proper noun, singular",N,,,G 16 | 15,NNPS,"Proper noun, plural",N,,,B 17 | 16,PDT,Predeterminer,D,,, 18 | 17,POS,Possessive ending,O,,, 19 | 18,PRP,Personal pronoun,U,,, 20 | 19,PRP$,Possessive pronoun,U,,, 21 | 20,RB,Adverb,R,,, 22 | 21,RBR,"Adverb, comparative",R,,, 23 | 22,RBS,"Adverb, superlative",R,,, 24 | 23,RP,Particle,O,,, 25 | 24,SYM,Symbol,O,,, 26 | 25,TO,to,T,,, 27 | 26,UH,Interjection,O,,, 28 | 27,VB,"Verb, base form",V,,, 29 | 28,VBD,"Verb, past tense",V,,, 30 | 29,VBG,"Verb, gerund or present participle",G,,, 31 | 30,VBN,"Verb, past participle",B,,, 32 | 31,VBP,"Verb, non-3rd person singular present",V,,, 33 | 32,VBZ,"Verb, 3rd person singular present",V,,, 34 | 33,WDT,Wh-determiner,D,,, 35 | 34,WP,Wh-pronoun,U,,, 36 | 35,WP$,Possessive wh-pronoun,U,,, 37 | 36,WRB,Wh-adverb,R,,, 38 | -------------------------------------------------------------------------------- /data/prefix.txt: -------------------------------------------------------------------------------- 1 | #Prefix 2 | an 3 | anti 4 | apo 5 | auto 6 | bi 7 | bio 8 | bis 9 | circum 10 | co 11 | counter 12 | cryo 13 | de 14 | di 15 | dis 16 | dys 17 | electro 18 | epi 19 | extra 20 | fore 21 | geo 22 | haemo 23 | hemi 24 | hemo 25 | hetero 26 | homo 27 | hydro 28 | hyper 29 | hypo 30 | in 31 | infra 32 | inter 33 | intra 34 | ir 35 | iso 36 | macro 37 | mega 38 | meta 39 | micro 40 | mid 41 | mini 42 | mis 43 | mono 44 | multi 45 | neo 46 | non 47 | ortho 48 | over 49 | pan 50 | para 51 | peri 52 | photo 53 | poly 54 | post 55 | pre 56 | pro 57 | proto 58 | pseudo 59 | pyro 60 | quasi 61 | re 62 | retro 63 | self 64 | semi 65 | socio 66 | sub 67 | super 68 | supra 69 | tele 70 | trans 71 | tri 72 | ultra 73 | un 74 | under 75 | uni -------------------------------------------------------------------------------- /data/stopwords-FOR-clustering.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | across 5 | after 6 | all 7 | almost 8 | also 9 | am 10 | among 11 | an 12 | and 13 | any 14 | are 15 | as 16 | at 17 | be 18 | because 19 | been 20 | but 21 | by 22 | can 23 | cannot 24 | could 25 | dear 26 | did 27 | do 28 | does 29 | either 30 | else 31 | ever 32 | every 33 | for 34 | from 35 | get 36 | got 37 | had 38 | has 39 | have 40 | he 41 | her 42 | hers 43 | him 44 | his 45 | how 46 | however 47 | i 48 | if 49 | in 50 | into 51 | is 52 | it 53 | its 54 | just 55 | #least 56 | let 57 | #like 58 | #likely 59 | may 60 | me 61 | might 62 | #most 63 | must 64 | my 65 | neither 66 | no 67 | nor 68 | not 69 | of 70 | off 71 | often 72 | on 73 | only 74 | or 75 | other 76 | our 77 | own 78 | rather 79 | said 80 | say 81 | says 82 | she 83 | should 84 | since 85 | so 86 | some 87 | than 88 | that 89 | the 90 | their 91 | them 92 | then 93 | there 94 | these 95 | they 96 | this 97 | tis 98 | to 99 | too 100 | twas 101 | us 102 | wants 103 | was 104 | we 105 | were 106 | what 107 | when 108 | where 109 | which 110 | while 111 | who 112 | whom 113 | why 114 | will 115 | with 116 | would 117 | yet 118 | you 119 | 
your 120 | above 121 | afterwards 122 | again 123 | against 124 | alone 125 | along 126 | already 127 | although 128 | always 129 | amongst 130 | amoungst 131 | amount 132 | another 133 | anyhow 134 | anyone 135 | anything 136 | anyway 137 | anywhere 138 | around 139 | back 140 | became 141 | become 142 | becomes 143 | becoming 144 | before 145 | beforehand 146 | behind 147 | being 148 | below 149 | beside 150 | besides 151 | between 152 | #beyond 153 | bill 154 | both 155 | #bottom 156 | call 157 | cant 158 | co 159 | #computer 160 | con 161 | couldnt 162 | cry 163 | de 164 | #describe 165 | #detail 166 | done 167 | down 168 | due 169 | during 170 | each 171 | eg 172 | eight 173 | eleven 174 | elsewhere 175 | empty 176 | enough 177 | etc 178 | even 179 | everyone 180 | everything 181 | everywhere 182 | except 183 | few 184 | fifteen 185 | fify 186 | fill 187 | find 188 | fire 189 | first 190 | five 191 | former 192 | formerly 193 | forty 194 | found 195 | four 196 | front 197 | full 198 | further 199 | give 200 | go 201 | hasnt 202 | hence 203 | here 204 | hereafter 205 | hereby 206 | herein 207 | hereupon 208 | herse 209 | himse 210 | hundred 211 | ie 212 | inc 213 | indeed 214 | interest 215 | itse 216 | keep 217 | last 218 | latter 219 | latterly 220 | less 221 | ltd 222 | made 223 | many 224 | meanwhile 225 | mill 226 | mine 227 | more 228 | moreover 229 | mostly 230 | move 231 | much 232 | myse 233 | name 234 | namely 235 | never 236 | nevertheless 237 | next 238 | nine 239 | #nobody 240 | none 241 | noone 242 | nothing 243 | now 244 | nowhere 245 | once 246 | one 247 | onto 248 | others 249 | otherwise 250 | ours 251 | ourselves 252 | out 253 | over 254 | part 255 | per 256 | perhaps 257 | please 258 | put 259 | re 260 | same 261 | see 262 | seem 263 | seemed 264 | seeming 265 | seems 266 | serious 267 | several 268 | show 269 | side 270 | sincere 271 | six 272 | sixty 273 | somehow 274 | someone 275 | something 276 | sometime 277 | sometimes 278 | somewhere 279 | still 280 | such 281 | system 282 | take 283 | ten 284 | themselves 285 | thence 286 | thereafter 287 | thereby 288 | therefore 289 | therein 290 | thereupon 291 | thick 292 | thin 293 | third 294 | those 295 | though 296 | three 297 | through 298 | throughout 299 | thru 300 | thus 301 | together 302 | top 303 | toward 304 | towards 305 | twelve 306 | twenty 307 | two 308 | un 309 | under 310 | until 311 | up 312 | upon 313 | very 314 | via 315 | well 316 | whatever 317 | whence 318 | whenever 319 | whereafter 320 | whereas 321 | whereby 322 | wherein 323 | whereupon 324 | wherever 325 | whether 326 | whither 327 | whoever 328 | whole 329 | whose 330 | within 331 | without 332 | yours 333 | yourself 334 | yourselves 335 | s 336 | ll 337 | d 338 | im -------------------------------------------------------------------------------- /data/stopwords-empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/data/stopwords-empty.txt -------------------------------------------------------------------------------- /data/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | across 5 | after 6 | all 7 | almost 8 | also 9 | am 10 | among 11 | an 12 | and 13 | any 14 | are 15 | as 16 | at 17 | be 18 | because 19 | been 20 | but 21 | by 22 | can 23 | cannot 24 | could 25 | dear 26 | did 27 | do 28 | does 29 | either 30 | else 31 | 
ever 32 | every 33 | for 34 | from 35 | get 36 | got 37 | had 38 | has 39 | have 40 | he 41 | her 42 | hers 43 | him 44 | his 45 | how 46 | however 47 | i 48 | if 49 | in 50 | into 51 | is 52 | it 53 | its 54 | just 55 | #least 56 | let 57 | #like 58 | #likely 59 | may 60 | me 61 | might 62 | #most 63 | must 64 | my 65 | neither 66 | no 67 | nor 68 | not 69 | of 70 | off 71 | often 72 | on 73 | only 74 | or 75 | other 76 | our 77 | own 78 | rather 79 | said 80 | say 81 | says 82 | she 83 | should 84 | since 85 | so 86 | some 87 | than 88 | that 89 | the 90 | their 91 | them 92 | then 93 | there 94 | these 95 | they 96 | this 97 | tis 98 | to 99 | too 100 | twas 101 | us 102 | wants 103 | was 104 | we 105 | were 106 | what 107 | when 108 | where 109 | which 110 | while 111 | who 112 | whom 113 | why 114 | will 115 | with 116 | would 117 | yet 118 | you 119 | your 120 | above 121 | afterwards 122 | again 123 | against 124 | alone 125 | along 126 | already 127 | although 128 | always 129 | amongst 130 | amoungst 131 | amount 132 | another 133 | anyhow 134 | anyone 135 | anything 136 | anyway 137 | anywhere 138 | around 139 | #back 140 | became 141 | become 142 | becomes 143 | becoming 144 | before 145 | beforehand 146 | behind 147 | being 148 | below 149 | beside 150 | besides 151 | between 152 | #beyond 153 | #bill 154 | both 155 | #bottom 156 | call 157 | cant 158 | co 159 | #computer 160 | con 161 | couldnt 162 | cry 163 | de 164 | #describe 165 | #detail 166 | done 167 | down 168 | due to 169 | during 170 | each 171 | eg 172 | eight 173 | eleven 174 | elsewhere 175 | #empty 176 | enough 177 | etc 178 | even 179 | everyone 180 | everything 181 | everywhere 182 | except 183 | few 184 | fifteen 185 | fify 186 | fill 187 | find 188 | fire 189 | first 190 | five 191 | former 192 | formerly 193 | forty 194 | #found 195 | four 196 | front 197 | full 198 | further 199 | give 200 | go 201 | hasnt 202 | hence 203 | here 204 | hereafter 205 | hereby 206 | herein 207 | hereupon 208 | herse 209 | himse 210 | hundred 211 | ie 212 | inc 213 | indeed 214 | interest 215 | itse 216 | #keep 217 | last 218 | latter 219 | latterly 220 | less 221 | ltd 222 | made 223 | many 224 | meanwhile 225 | mill 226 | mine 227 | more 228 | moreover 229 | mostly 230 | move 231 | much 232 | myse 233 | #name 234 | namely 235 | never 236 | nevertheless 237 | next 238 | nine 239 | #nobody 240 | none 241 | noone 242 | nothing 243 | now 244 | nowhere 245 | once 246 | one 247 | onto 248 | others 249 | otherwise 250 | ours 251 | ourselves 252 | out 253 | over 254 | part 255 | per 256 | perhaps 257 | please 258 | put 259 | re 260 | same 261 | see 262 | seem 263 | seemed 264 | seeming 265 | seems 266 | #serious 267 | several 268 | show 269 | side 270 | sincere 271 | six 272 | sixty 273 | somehow 274 | someone 275 | something 276 | sometime 277 | sometimes 278 | somewhere 279 | still 280 | such 281 | #system 282 | take 283 | ten 284 | themselves 285 | thence 286 | thereafter 287 | thereby 288 | therefore 289 | therein 290 | thereupon 291 | #thick 292 | thin 293 | third 294 | those 295 | though 296 | three 297 | through 298 | throughout 299 | thru 300 | thus 301 | together 302 | #top 303 | toward 304 | towards 305 | twelve 306 | twenty 307 | two 308 | un 309 | under 310 | until 311 | up 312 | upon 313 | very 314 | via 315 | well 316 | whatever 317 | whence 318 | whenever 319 | whereafter 320 | whereas 321 | whereby 322 | wherein 323 | whereupon 324 | wherever 325 | whether 326 | whither 327 | whoever 328 | whole 329 | whose 330 | 
within 331 | without 332 | yours 333 | yourself 334 | yourselves 335 | s 336 | ll 337 | d 338 | im 339 | 's -------------------------------------------------------------------------------- /data/stopwords_clustering.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | across 5 | after 6 | all 7 | almost 8 | also 9 | am 10 | among 11 | an 12 | and 13 | any 14 | are 15 | as 16 | at 17 | be 18 | because 19 | been 20 | but 21 | by 22 | can 23 | cannot 24 | could 25 | dear 26 | did 27 | do 28 | does 29 | either 30 | else 31 | ever 32 | every 33 | for 34 | from 35 | get 36 | got 37 | had 38 | has 39 | have 40 | he 41 | her 42 | hers 43 | him 44 | his 45 | how 46 | however 47 | i 48 | if 49 | in 50 | into 51 | is 52 | it 53 | its 54 | just 55 | #least 56 | let 57 | #like 58 | #likely 59 | may 60 | me 61 | might 62 | #most 63 | must 64 | my 65 | neither 66 | no 67 | nor 68 | not 69 | of 70 | off 71 | often 72 | on 73 | only 74 | or 75 | other 76 | our 77 | own 78 | rather 79 | said 80 | say 81 | says 82 | she 83 | should 84 | since 85 | so 86 | some 87 | than 88 | that 89 | the 90 | their 91 | them 92 | then 93 | there 94 | these 95 | they 96 | this 97 | tis 98 | to 99 | too 100 | twas 101 | us 102 | wants 103 | was 104 | we 105 | were 106 | what 107 | when 108 | where 109 | which 110 | while 111 | who 112 | whom 113 | why 114 | will 115 | with 116 | would 117 | yet 118 | you 119 | your 120 | above 121 | afterwards 122 | again 123 | against 124 | alone 125 | along 126 | already 127 | although 128 | always 129 | amongst 130 | amoungst 131 | amount 132 | another 133 | anyhow 134 | anyone 135 | anything 136 | anyway 137 | anywhere 138 | around 139 | back 140 | became 141 | become 142 | becomes 143 | becoming 144 | before 145 | beforehand 146 | behind 147 | being 148 | below 149 | beside 150 | besides 151 | between 152 | #beyond 153 | bill 154 | both 155 | #bottom 156 | call 157 | cant 158 | co 159 | #computer 160 | con 161 | couldnt 162 | cry 163 | de 164 | #describe 165 | #detail 166 | done 167 | down 168 | due 169 | during 170 | each 171 | eg 172 | eight 173 | eleven 174 | elsewhere 175 | empty 176 | enough 177 | etc 178 | even 179 | everyone 180 | everything 181 | everywhere 182 | except 183 | few 184 | fifteen 185 | fify 186 | fill 187 | find 188 | fire 189 | first 190 | five 191 | former 192 | formerly 193 | forty 194 | found 195 | four 196 | front 197 | full 198 | further 199 | give 200 | go 201 | hasnt 202 | hence 203 | here 204 | hereafter 205 | hereby 206 | herein 207 | hereupon 208 | herse 209 | himse 210 | hundred 211 | ie 212 | inc 213 | indeed 214 | interest 215 | itse 216 | keep 217 | last 218 | latter 219 | latterly 220 | less 221 | ltd 222 | made 223 | many 224 | meanwhile 225 | mill 226 | mine 227 | more 228 | moreover 229 | mostly 230 | move 231 | much 232 | myse 233 | name 234 | namely 235 | never 236 | nevertheless 237 | next 238 | nine 239 | #nobody 240 | none 241 | noone 242 | nothing 243 | now 244 | nowhere 245 | once 246 | one 247 | onto 248 | others 249 | otherwise 250 | ours 251 | ourselves 252 | out 253 | over 254 | part 255 | per 256 | perhaps 257 | please 258 | put 259 | re 260 | same 261 | see 262 | seem 263 | seemed 264 | seeming 265 | seems 266 | serious 267 | several 268 | show 269 | side 270 | sincere 271 | six 272 | sixty 273 | somehow 274 | someone 275 | something 276 | sometime 277 | sometimes 278 | somewhere 279 | still 280 | such 281 | system 282 | take 283 | ten 284 | themselves 285 | thence 286 | 
thereafter 287 | thereby 288 | therefore 289 | therein 290 | thereupon 291 | thick 292 | thin 293 | third 294 | those 295 | though 296 | three 297 | through 298 | throughout 299 | thru 300 | thus 301 | together 302 | top 303 | toward 304 | towards 305 | twelve 306 | twenty 307 | two 308 | un 309 | under 310 | until 311 | up 312 | upon 313 | very 314 | via 315 | well 316 | whatever 317 | whence 318 | whenever 319 | whereafter 320 | whereas 321 | whereby 322 | wherein 323 | whereupon 324 | wherever 325 | whether 326 | whither 327 | whoever 328 | whole 329 | whose 330 | within 331 | without 332 | yours 333 | yourself 334 | yourselves 335 | s 336 | ll 337 | d 338 | im -------------------------------------------------------------------------------- /data/suffix.txt: -------------------------------------------------------------------------------- 1 | Suffix 2 | able 3 | ably 4 | ad 5 | ade 6 | age 7 | agogy 8 | al 9 | al 10 | ality 11 | an 12 | ance 13 | ancy 14 | ant 15 | ar 16 | ard 17 | ary 18 | arch 19 | archy 20 | arium 21 | asia 22 | ate 23 | athlon 24 | ation 25 | ative 26 | atory 27 | bound 28 | coele 29 | coel 30 | cele 31 | centesis 32 | cephalic 33 | chondrion 34 | cide 35 | city 36 | cy 37 | cycle 38 | dom 39 | ectasis 40 | ectasia 41 | ectomy 42 | ed 43 | ee 44 | eer 45 | eme 46 | emia 47 | en 48 | enchyma 49 | ence 50 | ency 51 | ent 52 | eous 53 | er 54 | ergy 55 | ern 56 | ery 57 | esce 58 | ese 59 | esque 60 | ess 61 | esthesia 62 | esthesis 63 | etic 64 | ette 65 | fare 66 | ful 67 | gate 68 | gnosis 69 | gon 70 | graph 71 | gram 72 | gry 73 | hedron 74 | holic 75 | hood 76 | ia 77 | iable 78 | ial 79 | ian 80 | iant 81 | iary 82 | iasis 83 | iate 84 | ible 85 | ibly 86 | ic 87 | ical 88 | ics 89 | id 90 | iency 91 | ient 92 | ier 93 | fy 94 | ify 95 | ile 96 | illion 97 | ious 98 | ing 99 | ion 100 | ish 101 | ism 102 | ist 103 | ista 104 | ite 105 | itis 106 | itive 107 | itude 108 | ity 109 | ium 110 | ive 111 | isation 112 | ization 113 | ize 114 | ise 115 | izzle 116 | kinesis 117 | less 118 | let 119 | like 120 | ling 121 | ly 122 | man 123 | mancy 124 | mania 125 | ment 126 | meter 127 | metry 128 | mony 129 | morphism 130 | most 131 | ness 132 | nik 133 | ocracy 134 | ogram 135 | ography 136 | oid 137 | ologist 138 | ology 139 | oma 140 | ome 141 | omics 142 | onomy 143 | onym 144 | opsy 145 | or 146 | our 147 | ory 148 | ose 149 | osis 150 | ous 151 | phagia 152 | phagy 153 | philia 154 | phobia 155 | phone 156 | physeal 157 | phyte 158 | polis 159 | #s 160 | science 161 | scope 162 | script 163 | ship 164 | sion 165 | sis 166 | some 167 | stan 168 | ster 169 | eth 170 | #t 171 | th 172 | tion 173 | tome 174 | tom 175 | tropism 176 | ty 177 | uary 178 | ular 179 | ulent 180 | um 181 | uous 182 | ure 183 | us 184 | ville 185 | vorous 186 | vore 187 | wards 188 | ward 189 | ware 190 | ways 191 | wise 192 | wright 193 | #y -------------------------------------------------------------------------------- /data/test.text.txt: -------------------------------------------------------------------------------- 1 | I love Cy, because she is nice. She is so nice. 2 | I miss you, as you know. 
-------------------------------------------------------------------------------- /docs/dependency-package.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/dependency-package.jpg -------------------------------------------------------------------------------- /docs/figurs/conceptual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/conceptual.png -------------------------------------------------------------------------------- /docs/figurs/cui_duration_heatmap3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/cui_duration_heatmap3.png -------------------------------------------------------------------------------- /docs/figurs/evaluation_simiterm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/evaluation_simiterm.png -------------------------------------------------------------------------------- /docs/figurs/figure8_human_review.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/figure8_human_review.png -------------------------------------------------------------------------------- /docs/figurs/sty_distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/sty_distribution.png -------------------------------------------------------------------------------- /docs/figurs/work-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/docs/figurs/work-flow.png -------------------------------------------------------------------------------- /libs/bin/winutils.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/libs/bin/winutils.exe -------------------------------------------------------------------------------- /libs/metamap-api-2.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/libs/metamap-api-2.0.jar -------------------------------------------------------------------------------- /libs/prologbeans.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/libs/prologbeans.jar -------------------------------------------------------------------------------- /libs/stanford-corenlp.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/libs/stanford-corenlp.jar
--------------------------------------------------------------------------------
/libs/stanfordNlp-models-url.txt:
--------------------------------------------------------------------------------
1 | http://stanfordnlp.github.io/CoreNLP/download.html
2 | Add the downloaded file to the project dependency path.
--------------------------------------------------------------------------------
/py/get_ct.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Jason'
2 |
3 |
4 | import urllib2
5 | from urllib2 import urlopen
6 | from bs4 import BeautifulSoup
7 | import re
8 |
9 |
10 | def visible(element):  # keep only text nodes a reader would actually see on the page
11 |     if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
12 |         return False
13 |     # elif re.match("", str(element)):
14 |     #     return False
15 |     return True
16 |
17 | f = open("C:\\fsu\\ra\\data\\201612\\index_url.txt")
18 | for line in f.readlines():
19 |     if len(line)<10: continue  # skip blank or malformed lines
20 |     (index, url) = line.split('\t',1)
21 |     #url = "https://www.ncbi.nlm.nih.gov/pubmed/20482476"
22 |     # print (url)
23 |     try:
24 |         html = urllib2.urlopen(url).read()
25 |         soup = BeautifulSoup(html, 'html.parser')
26 |         texts = soup.findAll(text=True)
27 |         visible_texts = filter(visible, texts)
28 |         text = filter(lambda x: len(x.strip())>5, visible_texts)
29 |         text2 = " ".join(text)
30 |         match = re.match(".*(\\bNCT\\d{5,15}\\b).*", text2, re.MULTILINE+re.DOTALL+re.UNICODE)  # look for a ClinicalTrials.gov NCT identifier
31 |         if match is not None:
32 |             ct = match.group(1)
33 |             # print (text2.encode('utf-8'))
34 |             print("%s\t%s" % (index, ct))
35 |         #elif None != re.match(".*(\\bclinicaltrials\\.gov\\b).*", text2, re.MULTILINE+re.DOTALL+re.UNICODE+re.IGNORECASE):
36 |         elif None != re.match(".*(\\bclinicaltrials\\b).*", text2, re.MULTILINE+re.DOTALL+re.UNICODE+re.IGNORECASE):
37 |             print("%s\t%s" % (index, "clinicaltrials.gov"))
38 |         else:
39 |             print("%s\t%s" % (index, "None"))
40 |     except Exception:  # record fetch/parse failures and keep going
41 |         print("%s\t%s" % (index, "Error"))
42 |
--------------------------------------------------------------------------------
/py/pre_run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | '''
4 | Prepare the data for the project; for now, just unzip the files.
5 | I haven't found a way to determine the exact type of the files, but I know they can be decompressed by gzip.
6 | So I call the shell command "gzip -d -f" to do it. 7 | ''' 8 | import os 9 | import gzip 10 | 11 | def walk_dir(dir,topdown=True): 12 | for root, dirs, files in os.walk(dir, topdown): 13 | for name in files: 14 | print name 15 | if not name.endswith(".Z"): continue 16 | print("unzip file: " + os.path.join(root,name)) 17 | os.system("gzip -d -f " + os.path.join(root,name)) 18 | 19 | 20 | walk_dir("./data") 21 | 22 | 23 | -------------------------------------------------------------------------------- /py/preprocess_index.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Jason' 2 | 3 | 4 | import csv 5 | import sys,os,re 6 | 7 | 8 | with open(r'C:\fsu\ra\data\201708\Copy of Botanical_with_dsld_cat_termlist.csv', 'w+') as output: 9 | with open(r'C:\fsu\ra\data\201708\Copy of Botanical_with_dsld_cat.csv', 'rb') as csvfile: 10 | spamreader = csv.reader(csvfile, delimiter=',', quotechar='"') 11 | head = True 12 | aui = 0 13 | for row in spamreader: 14 | '''column: id, name, Scientific Name, category_DSLD''' 15 | if head: 16 | head = False 17 | continue 18 | terms = str(row[2]) 19 | terms = re.sub(r'\.\s*Family:',', ',terms) 20 | terms = terms.replace('/',', ') 21 | terms = terms.replace(';',', ') 22 | terms = terms.replace('synonyms','') 23 | terms = terms.replace('synonym','') 24 | 25 | res_list = [] 26 | terms_list = terms.split(', ') 27 | for term in terms_list: 28 | term = term.strip() 29 | term = term.strip('.,?!"\'\r') 30 | # extract (*) 31 | match = re.match(r'(.+?)\((.+?)\)(.*?)',term) 32 | if match == None: 33 | print(term) 34 | res_list.append(term) 35 | else: 36 | res_list.append(match.group(1).strip() + match.group(3).strip()) 37 | res_list.append(match.group(2).strip()) 38 | print(term, match.group(1)+match.group(3), match.group(2)) 39 | # print('\t'.join(res_list)) 40 | 41 | cui = row[0] 42 | sab = 'unknown' 43 | for term in res_list: 44 | aui += 1 45 | preStr = term 46 | output.write('\t'.join([cui,str(aui),sab,term,preStr]) + '\n') 47 | 48 | 49 | -------------------------------------------------------------------------------- /py/xsl2csv.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Jason' 2 | 3 | from openpyxl import load_workbook 4 | wb = load_workbook("C:\\Users\\Jason\\Desktop\\alldeaf_Health_08_20_2015.xlsx") 5 | 6 | ws = wb['Health'] 7 | 8 | print (ws['A2']) 9 | 10 | -------------------------------------------------------------------------------- /r/.Rhistory: -------------------------------------------------------------------------------- 1 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation.R') 2 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation.R') 3 | source('C:/fsu/ra/UmlsTagger/r/review-order-ranking.R') 4 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation-200to300.R') 5 | source('C:/fsu/ra/UmlsTagger/r/evaluation.R') 6 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation.R') 7 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation.R') 8 | source('C:/fsu/ra/UmlsTagger/r/cross-evaluation.R') 9 | source('C:/fsu/ra/UmlsTagger/r/pattern-heatmap.R') 10 | source('C:/fsu/ra/UmlsTagger/r/pattern-heatmap.R') 11 | source('C:/fsu/ra/UmlsTagger/r/pattern-heatmap.R') 12 | source('C:/fsu/ra/UmlsTagger/r/pattern-heatmap.R') 13 | -------------------------------------------------------------------------------- /r/README.md: -------------------------------------------------------------------------------- 1 | ## R code 2 | 3 | Most of the R code here is used to draw the figures for our 
publications.
4 |
5 | ### Directory `data`
6 | Stores the data used by the R scripts below.
7 |
8 | ### cross-evaluation.R
9 | Used to draw the precision/recall/f-score figures for the CHV paper, for the semiTerm features.
10 |
11 | ### cross-evaluation-bow.R
12 | Used to draw the precision/recall/f-score figures for the CHV paper, for the bag-of-words features.
13 |
14 | ### cross-evaluation-200to300.R
15 | Used to draw the precision/recall/f-score figures for the CHV paper, for the semiTerm features with the K-means parameter from 200 to 300.
16 | We do this because at the beginning we only processed parameter values 5 to 200.
17 |
18 | ### pattern-heatmap.R
19 | Used to draw the heat map figure for the CHV paper, and also for the BIBM 2016 paper.
20 |
21 | ### pca-draw.R
22 | Used to draw the 3-D scatter figure for the CHV paper, for the semiTerm features.
23 |
24 | ### review_result.R and review-order-ranking.R
25 | Used to draw the human review figure for the CHV paper.
26 |
27 | ### silhouette.R
28 | Draws the silhouette figure.
29 |
30 | ### ngram-distribution.R
31 | Draws the n-gram distribution figure.
32 |
33 | ### classify-rpart.R
34 | Classifies n-grams using rpart.
35 |
--------------------------------------------------------------------------------
/r/RVisualisation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/r/RVisualisation.pdf
--------------------------------------------------------------------------------
/r/classify-rpart.R:
--------------------------------------------------------------------------------
1 | library(rpart)
2 |
3 | #ngrams=read.table("C:\\fsu\\ra\\data\\ngram_vectors_all_0129.txt")
4 | ngrams=read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\ngram_vectors_all_0227.txt")
5 |
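# Each row of ngram_vectors_all_*.txt is the feature vector of one candidate
# n-gram. Judging from the column names assigned below, the 313 features are:
# frequency statistics (tfdf/tf/df/c-value), UMLS/CHV match scores and flags,
# POS-pattern flags (nn/an/pn/anpn), semantic-type indicators (stys),
# capitalization flags, window- and sentence-level UMLS/CHV context counts and
# distances, and prefix/suffix indicator features. Names encode "feature-index".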
colnames(ngrams)=c("tfdf-1","tf-2","df-3","cvalue-4","umls_score-5","chv_score-6","contain_umls-7","contain_chv-8","nn-9","an-10","pn-11","anpn-12","stys-13","stys-14","stys-15","stys-16","stys-17","stys-18","stys-19","stys-20","stys-21","stys-22","stys-23","stys-24","win_pos-25","win_pos-26","win_pos-27","win_pos-28","win_pos-29","win_pos-30","win_pos-31","win_pos-32","win_pos-33","win_pos-34","win_pos-35","win_pos-36","win_pos-37","capt_first-38","capt_all-39","capt_term-40","win_umls-41","win_chv-42","sent_umls-43","sent_chv-44","umls_dist-45","chv_dist-46","prefix-47","prefix-48","prefix-49","prefix-50","prefix-51","prefix-52","prefix-53","prefix-54","prefix-55","prefix-56","prefix-57","prefix-58","prefix-59","prefix-60","prefix-61","prefix-62","prefix-63","prefix-64","prefix-65","prefix-66","prefix-67","prefix-68","prefix-69","prefix-70","prefix-71","prefix-72","prefix-73","prefix-74","prefix-75","prefix-76","prefix-77","prefix-78","prefix-79","prefix-80","prefix-81","prefix-82","prefix-83","prefix-84","prefix-85","prefix-86","prefix-87","prefix-88","prefix-89","prefix-90","prefix-91","prefix-92","prefix-93","prefix-94","prefix-95","prefix-96","prefix-97","prefix-98","prefix-99","prefix-100","prefix-101","prefix-102","prefix-103","prefix-104","prefix-105","prefix-106","prefix-107","prefix-108","prefix-109","prefix-110","prefix-111","prefix-112","prefix-113","prefix-114","prefix-115","prefix-116","prefix-117","prefix-118","prefix-119","prefix-120","suffix-121","suffix-122","suffix-123","suffix-124","suffix-125","suffix-126","suffix-127","suffix-128","suffix-129","suffix-130","suffix-131","suffix-132","suffix-133","suffix-134","suffix-135","suffix-136","suffix-137","suffix-138","suffix-139","suffix-140","suffix-141","suffix-142","suffix-143","suffix-144","suffix-145","suffix-146","suffix-147","suffix-148","suffix-149","suffix-150","suffix-151","suffix-152","suffix-153","suffix-154","suffix-155","suffix-156","suffix-157","suffix-158","suffix-159","suffix-160","suffix-161","suffix-162","suffix-163","suffix-164","suffix-165","suffix-166","suffix-167","suffix-168","suffix-169","suffix-170","suffix-171","suffix-172","suffix-173","suffix-174","suffix-175","suffix-176","suffix-177","suffix-178","suffix-179","suffix-180","suffix-181","suffix-182","suffix-183","suffix-184","suffix-185","suffix-186","suffix-187","suffix-188","suffix-189","suffix-190","suffix-191","suffix-192","suffix-193","suffix-194","suffix-195","suffix-196","suffix-197","suffix-198","suffix-199","suffix-200","suffix-201","suffix-202","suffix-203","suffix-204","suffix-205","suffix-206","suffix-207","suffix-208","suffix-209","suffix-210","suffix-211","suffix-212","suffix-213","suffix-214","suffix-215","suffix-216","suffix-217","suffix-218","suffix-219","suffix-220","suffix-221","suffix-222","suffix-223","suffix-224","suffix-225","suffix-226","suffix-227","suffix-228","suffix-229","suffix-230","suffix-231","suffix-232","suffix-233","suffix-234","suffix-235","suffix-236","suffix-237","suffix-238","suffix-239","suffix-240","suffix-241","suffix-242","suffix-243","suffix-244","suffix-245","suffix-246","suffix-247","suffix-248","suffix-249","suffix-250","suffix-251","suffix-252","suffix-253","suffix-254","suffix-255","suffix-256","suffix-257","suffix-258","suffix-259","suffix-260","suffix-261","suffix-262","suffix-263","suffix-264","suffix-265","suffix-266","suffix-267","suffix-268","suffix-269","suffix-270","suffix-271","suffix-272","suffix-273","suffix-274","suffix-275","suffix-276","suffix-277","suffix-278","suffix-279","suffix-280"
,"suffix-281","suffix-282","suffix-283","suffix-284","suffix-285","suffix-286","suffix-287","suffix-288","suffix-289","suffix-290","suffix-291","suffix-292","suffix-293","suffix-294","suffix-295","suffix-296","suffix-297","suffix-298","suffix-299","suffix-300","suffix-301","suffix-302","suffix-303","suffix-304","suffix-305","suffix-306","suffix-307","suffix-308","suffix-309","suffix-310","suffix-311","suffix-312","suffix-313") 6 | v=ngrams 7 | v[,"chv_score-6"]=v[,"chv_score-6"]>0.3 8 | 9 | setwd("C:\\fsu\\ra\\UmlsTagger\\r\\data") 10 | tree <- rpart(`chv_score-6`~.,v,method="class", control=rpart.control(maxdepth = 30, minsplit = 5, minbucket = 2, cp = 0.0001)) 11 | summary(tree,file="rpart.summary") 12 | 13 | 14 | xmat=xpred.rpart(tree, xval = 10, return.all = FALSE) 15 | xerr=(xmat-1-v$`chv_score-6`)^2 16 | e2=apply(xerr, 2, sum)/nrow(v) # cross-validated error estimate 17 | e3=apply(xerr, 2, sum) 18 | printcp(tree) 19 | 20 | -------------------------------------------------------------------------------- /r/cross-evaluation-200to300.R: -------------------------------------------------------------------------------- 1 | library('base') 2 | 3 | # tf > 100, filter cluster < 3 4 | data = read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cross-evaluation-tf100-200to300.txt",sep='\t') 5 | rd_pc=25.5 6 | 7 | 8 | data.avg=aggregate(data[,1:ncol(data)], list(data[,1]),mean) 9 | 10 | # ev=read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\evaluation.txt", sep = '\t') 11 | ev=data.avg[order(data.avg$Group.1),] 12 | 13 | write.table(data.avg[,2:ncol(data.avg)], "C:\\fsu\\ra\\UmlsTagger\\r\\data\\tmp.txt", sep='\t',row.names = FALSE,col.names = FALSE) 14 | x = matrix(seq(5,200,5),ncol=1) * (100/100) 15 | 16 | # random baseline data 17 | y_rd_pc = rep(rd_pc,40) 18 | dim(y_rd_pc)=c(40,1) 19 | y_rd_rc = seq(5,200,5)*rd_pc/100 20 | dim(y_rd_rc)=c(40,1) 21 | y_rd_fs=(1+0.5^2)*(y_rd_pc*y_rd_rc)/((0.5^2*y_rd_pc+y_rd_rc))/100 22 | 23 | 24 | #precision 25 | startcol=11+40 26 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 27 | y = cbind(y,y_rd_pc) 28 | matplot(x,y,type=c('l'), 29 | #pch=c(1,2,3), 30 | lwd=1, 31 | lty=1, 32 | #add=TRUE, 33 | col=gray.colors(nrow(ev),0.9,0), 34 | xlab="top-N percent", ylab="precision (%)") 35 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 36 | y2 = cbind(y2,y_rd_pc) 37 | matplot(x,y2,type=c('o'), 38 | pch=c(1,5,6), 39 | lwd=1, 40 | lty=1, 41 | lend=3, 42 | add=TRUE, 43 | col=rainbow(3,start=1)) 44 | 45 | 46 | 47 | legend("topright",legend = c("tf", "c-value", "random", "k=5", "k=300"), 48 | col=c(rainbow(3,start=1), 49 | gray.colors(2,0.9,0)), 50 | pch=c(1,5,6,16,16)) # optiona 51 | 52 | 53 | #recall 54 | startcol=11+00 55 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 56 | y = cbind(y,y_rd_rc) 57 | #View(y) 58 | matplot(x,y,type=c('l'), 59 | #pch=c(1,2,3), 60 | lwd=1, 61 | lty=1, 62 | #add=TRUE, 63 | col=gray.colors(nrow(ev),0.9,0), 64 | xlab="top-N percent", ylab="recall (%)") 65 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 66 | y2 = cbind(y2,y_rd_rc) 67 | matplot(x,y2,type=c('o'), 68 | pch=c(1,5,6), 69 | lwd=1, 70 | lty=1, 71 | lend=3, 72 | add=TRUE, 73 | col=rainbow(3,start=1)) 74 | legend("topleft",legend = c("tf", "c-value", "random", "k=5", "k=300"), 75 | col=c(rainbow(3,start=1), 76 | gray.colors(2,0.9,0)), 77 | pch=c(1,5,6,16,16)) # optiona 78 | 79 | 80 | #f-score 81 | startcol=11+80 82 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 83 | y = cbind(y,y_rd_fs) 84 | #View(y) 85 | matplot(x,y,type=c('l'), 86 | #pch=c(1,2,3), 87 | lwd=1, 88 | lty=1, 89 | #add=TRUE, 90 | 
col=gray.colors(nrow(ev),0.9,0), 91 | xlab="top-N percent", ylab="f-score") 92 | 93 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 94 | y2 = cbind(y2,y_rd_fs) 95 | matplot(x,y2,type=c('o'), 96 | pch=c(1,5,6), 97 | lwd=1, 98 | lty=1, 99 | lend=3, 100 | add=TRUE, 101 | col=rainbow(3,start=1)) 102 | 103 | legend("topright",legend = c("tf", "c-value","random", "k=5", "k=300"), 104 | col=c(rainbow(3,start=1), 105 | gray.colors(3,0.9,0)), 106 | pch=c(1,5,6,16,16)) # optiona 107 | -------------------------------------------------------------------------------- /r/cross-evaluation-bow.R: -------------------------------------------------------------------------------- 1 | library('base') 2 | 3 | #tf > 100, filter cluster < 3 4 | data = read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cross-evaluation-bow.txt",sep='\t') 5 | rd_pc=25.5 6 | cnt=c(3541,1895) #(#_ngram_in_test, #_chv_in_test) 7 | tf=100 8 | 9 | 10 | data.avg=aggregate(data[,1:ncol(data)], list(data[,1]),mean) 11 | 12 | ev=data.avg[order(data.avg$Group.1),] 13 | 14 | write.table(data.avg[,2:ncol(data.avg)], "C:\\fsu\\ra\\UmlsTagger\\r\\data\\tmp.txt", sep='\t',row.names = FALSE,col.names = FALSE) 15 | x = matrix(seq(5,200,5),ncol=1) /100 * cnt[2] 16 | 17 | 18 | # random baseline data 19 | y_rd_pc = rep(rd_pc,40) 20 | dim(y_rd_pc)=c(40,1) 21 | y_rd_rc = seq(5,200,5)*rd_pc/100 22 | dim(y_rd_rc)=c(40,1) 23 | y_rd_fs=(1+0.5^2)*(y_rd_pc*y_rd_rc)/((0.5^2*y_rd_pc+y_rd_rc))/100 24 | 25 | 26 | #precision 27 | startcol=11+40 28 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 29 | y = cbind(y,y_rd_pc) 30 | matplot(x,y,type=c('l'), 31 | #pch=c(1,2,3), 32 | lwd=1, 33 | lty=1, 34 | #add=TRUE, 35 | col=gray.colors(nrow(ev),0.9,0), 36 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="precision (%)") 37 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 38 | y2 = cbind(y2,y_rd_pc) 39 | matplot(x,y2,type=c('o'), 40 | pch=c(1,5,6), 41 | lwd=1, 42 | lty=1, 43 | lend=3, 44 | add=TRUE, 45 | col=rainbow(3,start=1)) 46 | 47 | legend("topright",legend = c("tf", "c-value", "random", "BOW (k=5)", "BOW (k=300)"), 48 | col=c(rainbow(3,start=1), 49 | gray.colors(2,0.9,0)), 50 | pch=c(1,5,6,16,16)) # optiona 51 | 52 | 53 | #recall 54 | startcol=11+00 55 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 56 | y = cbind(y,y_rd_rc) 57 | #View(y) 58 | matplot(x,y,type=c('l'), 59 | #pch=c(1,2,3), 60 | lwd=1, 61 | lty=1, 62 | #add=TRUE, 63 | col=gray.colors(nrow(ev),0.9,0), 64 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="recall (%)") 65 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 66 | y2 = cbind(y2,y_rd_rc) 67 | matplot(x,y2,type=c('o'), 68 | pch=c(1,5,6), 69 | lwd=1, 70 | lty=1, 71 | lend=3, 72 | add=TRUE, 73 | col=rainbow(3,start=1)) 74 | legend("topleft",legend = c("tf", "c-value", "random", "BOW (k=5)", "BOW (k=300)"), 75 | col=c(rainbow(3,start=1), 76 | gray.colors(2,0.9,0)), 77 | pch=c(1,5,6,16,16)) # optiona 78 | 79 | 80 | #f-score 81 | startcol=11+80 82 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 83 | y = cbind(y,y_rd_fs) 84 | #View(y) 85 | matplot(x,y,type=c('l'), 86 | #pch=c(1,2,3), 87 | lwd=1, 88 | lty=1, 89 | #add=TRUE, 90 | col=gray.colors(nrow(ev),0.9,0), 91 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="f-score") 92 | 93 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 94 | y2 = cbind(y2,y_rd_fs) 95 | matplot(x,y2,type=c('o'), 96 | pch=c(1,5,6), 97 | lwd=1, 98 | lty=1, 99 | lend=3, 100 | add=TRUE, 101 | col=rainbow(3,start=1)) 102 | 103 | legend("bottomright",legend = c("tf", "c-value","random", "BOW (k=5)", "BOW (k=300)"), 104 | 
col=c(rainbow(3,start=1), 105 | gray.colors(2,0.9,0)), 106 | pch=c(1,5,6,16,16)) # optiona 107 | -------------------------------------------------------------------------------- /r/cross-evaluation.R: -------------------------------------------------------------------------------- 1 | library('base') 2 | 3 | # tf > 100, filter cluster < 3 4 | # data = read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cross-evaluation-tf100.txt",sep='\t') 5 | # rd_pc=25.5 6 | # cnt=c(3541,1895) #(#_ngram_in_test, #_chv_in_test) 7 | # tf=100 8 | 9 | # 10 | # # tf > 100, filter cluster < 3 -- cancer data 11 | data = read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cross-evaluation-tf100-cancer.txt",sep='\t') 12 | rd_pc=25.5 13 | cnt=c(4344,2280) 14 | tf=100 15 | 16 | 17 | #tf > 5 18 | # data = read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cross-evaluation-tf5.txt",sep='\t') 19 | # rd_pc=9.78 20 | # cnt=c(7374,2212) 21 | # tf=5 22 | 23 | data.avg=aggregate(data[,1:ncol(data)], list(data[,1]),mean) 24 | 25 | # ev=read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\evaluation.txt", sep = '\t') 26 | ev=data.avg[order(data.avg$Group.1),] 27 | 28 | write.table(data.avg[,2:ncol(data.avg)], "C:\\fsu\\ra\\UmlsTagger\\r\\data\\tmp.txt", sep='\t',row.names = FALSE,col.names = FALSE) 29 | x = matrix(seq(5,200,5),ncol=1) /100 * cnt[2] 30 | 31 | 32 | # random baseline data 33 | y_rd_pc = rep(rd_pc,40) 34 | dim(y_rd_pc)=c(40,1) 35 | y_rd_rc = seq(5,200,5)*rd_pc/100 36 | dim(y_rd_rc)=c(40,1) 37 | y_rd_fs=(1+0.5^2)*(y_rd_pc*y_rd_rc)/((0.5^2*y_rd_pc+y_rd_rc))/100 38 | 39 | 40 | #precision 41 | startcol=11+40 42 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 43 | y = cbind(y,y_rd_pc) 44 | matplot(x,y,type=c('l'), 45 | #pch=c(1,2,3), 46 | lwd=1, 47 | lty=1, 48 | #add=TRUE, 49 | col=gray.colors(nrow(ev),0.9,0), 50 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="precision (%)") 51 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 52 | y2 = cbind(y2,y_rd_pc) 53 | matplot(x,y2,type=c('o'), 54 | pch=c(1,5,6), 55 | lwd=1, 56 | lty=1, 57 | lend=3, 58 | add=TRUE, 59 | col=rainbow(3,start=1)) 60 | 61 | legend("topright",legend = c("tf", "c-value", "random", "simiTerm (k=5)", "simiTerm (k=300)"), 62 | col=c(rainbow(3,start=1), 63 | gray.colors(2,0.9,0)), 64 | pch=c(1,5,6,16,16)) # optiona 65 | 66 | 67 | #recall 68 | startcol=11+00 69 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 70 | y = cbind(y,y_rd_rc) 71 | #View(y) 72 | matplot(x,y,type=c('l'), 73 | #pch=c(1,2,3), 74 | lwd=1, 75 | lty=1, 76 | #add=TRUE, 77 | col=gray.colors(nrow(ev),0.9,0), 78 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="recall (%)") 79 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 80 | y2 = cbind(y2,y_rd_rc) 81 | matplot(x,y2,type=c('o'), 82 | pch=c(1,5,6), 83 | lwd=1, 84 | lty=1, 85 | lend=3, 86 | add=TRUE, 87 | col=rainbow(3,start=1)) 88 | legend("topleft",legend = c("tf", "c-value", "random", "simiTerm (k=5)", "simiTerm (k=300)"), 89 | col=c(rainbow(3,start=1), 90 | gray.colors(2,0.9,0)), 91 | pch=c(1,5,6,16,16)) # optiona 92 | 93 | 94 | #f-score 95 | startcol=11+80 96 | y = t(ev[1:(nrow(ev)),startcol:(startcol+40-1)]) 97 | y = cbind(y,y_rd_fs) 98 | #View(y) 99 | matplot(x,y,type=c('l'), 100 | #pch=c(1,2,3), 101 | lwd=1, 102 | lty=1, 103 | #add=TRUE, 104 | col=gray.colors(nrow(ev),0.9,0), 105 | xlab=sprintf("top-N of %d terms (tf>%d)",cnt[1],tf), ylab="f-score") 106 | 107 | y2 = t(ev[1:2,startcol:(startcol+40-1)]) 108 | y2 = cbind(y2,y_rd_fs) 109 | matplot(x,y2,type=c('o'), 110 | pch=c(1,5,6), 111 | lwd=1, 112 | lty=1, 113 | lend=3, 114 | add=TRUE, 
115 | col=rainbow(3,start=1)) 116 | 117 | legend("topright",legend = c("tf", "c-value","random", "simiTerm (k=5)", "simiTerm (k=300)"), 118 | col=c(rainbow(3,start=1), 119 | gray.colors(2,0.9,0)), 120 | pch=c(1,5,6,16,16)) # optiona 121 | -------------------------------------------------------------------------------- /r/data/human_review.txt: -------------------------------------------------------------------------------- 1 | 73 47 44 73 88 2 | 1 0 1 1 2 3 | 26 53 55 26 10 4 | -------------------------------------------------------------------------------- /r/ngram-distribution.R: -------------------------------------------------------------------------------- 1 | org =read.table("C:\\fsu\\ra\\UmlsTagger\\r\\data\\ngram_yahoo_tf5.txt",header=TRUE,sep='\t') 2 | x=seq(1,nrow(org)) 3 | gram1=subset(org,org['n']==1) 4 | gram2=subset(org,org['n']==2) 5 | gram3=subset(org,org['n']==3) 6 | gram4=subset(org,org['n']==4) 7 | gram5=subset(org,org['n']==5) 8 | gram6=subset(org,org['type']=="chv") 9 | gram7=subset(org,org['type']=="umls") 10 | gram8=subset(org,org['type']=="others") 11 | 12 | n=1 13 | matplot(seq(1,nrow(gram1)),log(gram1[,'tf']),type='l',pch=n,col=n, xlab="Index of ranked n-grams", ylab="log(term frequency)") 14 | matpoints(seq(1,nrow(gram1))[seq(1,nrow(gram1),500)],log(gram1[,'tf'])[seq(1,nrow(gram1),500)],pch=n,col=n) 15 | 16 | n=2 17 | matplot(seq(1,nrow(gram2)),log(gram2[,'tf']),type='l',pch=n,col=rainbow(5,start=0.2), add=TRUE) 18 | matpoints(seq(1,nrow(gram2))[seq(1,nrow(gram2),500)],log(gram2[,'tf'])[seq(1,nrow(gram2),500)],pch=n,col=n) 19 | 20 | n=3 21 | matplot(seq(1,nrow(gram3)),log(gram3[,'tf']),type='l',pch=n,col=rainbow(5,start=0.3), add=TRUE) 22 | matpoints(seq(1,nrow(gram3))[seq(1,nrow(gram3),500)],log(gram3[,'tf'])[seq(1,nrow(gram3),500)],pch=n,col=n) 23 | 24 | n=4 25 | matplot(seq(1,nrow(gram4)),log(gram4[,'tf']),type='l',pch=n,col=rainbow(5,start=0.4), add=TRUE) 26 | matpoints(seq(1,nrow(gram4))[seq(1,nrow(gram4),500)],log(gram4[,'tf'])[seq(1,nrow(gram4),500)],pch=n,col=n) 27 | 28 | n=5 29 | matplot(seq(1,nrow(gram5)),log(gram5[,'tf']),type='l',pch=n,col=rainbow(5,start=0.5), add=TRUE) 30 | matpoints(seq(1,nrow(gram5))[seq(1,nrow(gram5),500)],log(gram5[,'tf'])[seq(1,nrow(gram5),500)],pch=n,col=n) 31 | 32 | legend("topright",legend = c("1-gram","2-gram","3-gram","4-gram","5-gram"), col=1:n, pch=1:n) # optiona 33 | 34 | 35 | 36 | n=6 37 | matplot(seq(1,nrow(gram6)),log(gram6[,'tf']),type='l',pch=n,col=rainbow(n,start=0.1*n), xlab="Index of ranked n-grams", ylab="log(term frequency)") 38 | matpoints(seq(1,nrow(gram6))[seq(1,nrow(gram6),500)],log(gram6[,'tf'])[seq(1,nrow(gram6),500)],pch=n,col=n) 39 | 40 | n=7 41 | matplot(seq(1,nrow(gram7)),log(gram7[,'tf']),type='l',pch=n,col=rainbow(n,start=0.1*n), add=TRUE) 42 | matpoints(seq(1,nrow(gram7))[seq(1,nrow(gram7),500)],log(gram7[,'tf'])[seq(1,nrow(gram7),500)],pch=n,col=n) 43 | 44 | n=8 45 | matplot(seq(1,nrow(gram8)),log(gram8[,'tf']),type='l',pch=n,col=rainbow(n,start=0.1*n), add=TRUE) 46 | matpoints(seq(1,nrow(gram8))[seq(1,nrow(gram8),500)],log(gram8[,'tf'])[seq(1,nrow(gram8),500)],pch=n,col=n) 47 | 48 | legend("topright",legend = c("CHV terms","UMLS w/o CHV terms","other terms"), col=6:n, pch=6:n) # optiona 49 | 50 | -------------------------------------------------------------------------------- /r/pattern-heatmap.R: -------------------------------------------------------------------------------- 1 | ######################################################### 2 | ### A) Installing and loading required packages 3 | 
#########################################################
4 |
5 | if (!require("gplots")) {
6 | install.packages("gplots", dependencies = TRUE)
7 | library(gplots)
8 | }
9 | if (!require("RColorBrewer")) {
10 | install.packages("RColorBrewer", dependencies = TRUE)
11 | library(RColorBrewer)
12 | }
13 | if (!require("d3heatmap")) {
14 | install.packages("d3heatmap", dependencies = TRUE)
15 | library(d3heatmap)
16 | }
17 |
18 |
19 |
20 | df <- read.csv("C:\\fsu\\ra\\UmlsTagger\\r\\data\\cui-duration-freq.csv", sep=",",colClasses = "character")
21 | #df <- read.csv("C:\\fsu\\ra\\data\\201601\\split_criteria\\cui-duration-freq.csv", sep=",",colClasses = "character")
22 |
23 |
24 |
25 | cuis <- unique(df[,"cui"]) # already sorted by the sql
26 | durs <- sort(unique(df[,"month"]))
27 |
28 |
29 | topN = 50
30 | # (the original statements here were lost in extraction; the block below is a
31 | # reconstruction, assuming the csv has a frequency column named "freq")
32 | if (length(cuis) > topN) {
33 |   cuis <- cuis[1:topN]   # keep only the top-N CUIs
34 | } else {
35 |   topN = length(cuis)
36 | }
37 | mat <- matrix(0, nrow=length(cuis), ncol=length(durs), dimnames=list(cuis,durs))
38 | cuiStr <- rep("", length(cuis))
39 | for (r in 1:nrow(df)) {
40 |   cui = df[r,"cui"]
41 |   dur = df[r,"month"]
42 |   if (cui %in% cuis) {
43 |     mat[cui,dur] = mat[cui,dur] + as.integer(df[r,"freq"])
44 |     if (nchar(sprintf("%s(%s)",df[r,"cui_str"],df[r,"sty"])) > nchar(cuiStr[match(cui,cuis)])) {
45 | cuiStr[match(cui,cuis)] = sprintf("%s(%s)",df[r,"cui_str"],df[r,"sty"])
46 | }
47 | print(c(cui,dur,mat[cui,dur]))
48 | maxRow = r
49 | }
50 | }
51 |
52 | print(mat)
53 |
54 | # only keep the columns that contain a non-zero value
55 | print(dim(mat))
56 | print(colSums(mat))
57 | colFilter = colSums(mat)!=0
58 | mat <- mat[,colFilter]
59 | mat <- matrix(mat,nrow=topN,ncol=sum(colFilter))
60 | print(dim(mat))
61 |
62 | rownames(mat) <- cuiStr
63 | colnames(mat) <- as.character(sort(as.integer(durs)))[colFilter]
64 | # create our own color palette: grey for zero, then green through yellow and orange to red
65 | my_palette <- c(colorRampPalette(c("grey90"),1)(n = 1),
66 | colorRampPalette(c("light green", "yellow", "orange", "red"),0.2)(n = 99) )
67 |
68 | # (optional) defines the color breaks manually for a "skewed" color transition
69 | col_breaks = c(seq(0,0.1,length=1), # for grey
70 | seq(0.2,30,length=40), # for green
71 | seq(30.1,150,length=30), # for yellow
72 | seq(150.1,max(mat)+1,length=30)) # for red
73 |
74 | # creates a 5 x 5 inch image
75 | png("C:\\Users\\Jason\\Desktop\\cui_duration_heatmap.png",
76 | width = 5*400, # 5 x 400 pixels
77 | height = 5*400,
78 | res = 400, # 400 pixels per inch
79 | pointsize = 6) # smaller font size
80 |
81 |
82 | labels <- as.character(mat)
83 | labels[mat==0] = ""
84 | dim(labels) = dim(mat)
85 | heatmap.2(mat,
86 | cellnote = labels, # same data set for cell labels
87 | #main = "duration vs frequency for top N CUI", # heat map title
88 | notecol="black", # change font color of cell labels to black
89 | density.info="histogram", # draw a histogram inside the color legend
90 | key.par=list(mar=c(3.5,1,3,1)),
91 | key.title = "frequency to color mapping",
92 | key.xlab = "",
93 | key.ylab = "",
94 | #labCol = colnames(mat),
95 | #labRow = seq(nrow(mat)),#rownames(mat),
96 | xlab = "Number of months",
97 | cexRow = 1.4,
98 | cexCol = 6,
99 | srtRow = -23,
100 | trace="both", # draw trace lines inside the heat map
101 | tracecol="ghostwhite",
102 | margins =c(3.5,6), # widens margins around plot, col and row
103 | col=my_palette, # use our color palette defined earlier
104 | breaks=col_breaks, # enable color transition at specified limits
105 | dendrogram="none", # draw no dendrograms
106 | Colv="NA", # turn off column clustering
107 | lmat=rbind(c(5, 4, 2), c(6, 1, 3)),
108 | lhei=c(1.5, 9),
109 | lwid=c(0.01, 10,1.9)
110 | )
111 | #mtext("Number of months",side=1,line=3)
112 |
113 | #nba_heatmap <- heatmap(mat, Rowv=NA, Colv=NA, col = cm.colors(256), scale="column")
114 |
115 |
116 |
117 | #
118 | # library(d3heatmap)
119 | # url <- "http://datasets.flowingdata.com/ppg2008.csv"
120 | # nba_players <- read.csv(url, row.names =
1) 121 | # d3heatmap(nba_players, scale = "column",dendrogram = "none",color = "Blues") 122 | # 123 | # 124 | # install.packages("heatmaply") 125 | # library(heatmaply) 126 | # heatmaply(mtcars, k_col = 2, k_row = 3) %>% layout(margin = list(l = 130, b = 40)) 127 | 128 | 129 | dev.off() 130 | -------------------------------------------------------------------------------- /r/pca-draw.R: -------------------------------------------------------------------------------- 1 | library(rgl) 2 | 3 | ngramspca=read.table("C:\\fsu\\ra\\data\\pca.txt") 4 | plot(ngramspca[,1:3]) 5 | plot3d(ngramspca[,1:3]) 6 | apply(ngramspca[,1:10], 2, mean) 7 | apply(ngramspca[,1:10], 2, sd) 8 | 9 | 10 | ngramall=read.table("C:\\fsu\\ra\\data\\ngram_vectors_all_0227.txt") 11 | tmp=subset(ngramall,ngramall[,6]>0.3) 12 | #tmp=ngramall 13 | tmp[,6]=0 14 | p=prcomp(tmp,scale. = FALSE) 15 | plot(p) 16 | plot3d(p$x[,1:3]) 17 | plot(p$x[,1:3]) 18 | apply(p$x[,1:10], 2, mean) 19 | apply(p$x[,1:10], 2, sd) 20 | -------------------------------------------------------------------------------- /r/review-order-ranking.R: -------------------------------------------------------------------------------- 1 | 2 | #data =read.table("C:\\fsu\\ra\\data\\rank-review-ranking.csv",header=TRUE,sep=',') 3 | data =read.table("C:\\fsu\\ra\\data\\rank-review-ranking-30sample.csv",header=TRUE,sep=',') 4 | 5 | ds=data[order(-data[,"kTfAvg"],data[,"cost"]),] 6 | ds2 = cbind(ds,seq(1,nrow(ds),1)/nrow(ds)) 7 | chvCnt = sum(ds[,'type']=='chv') 8 | 9 | recall = rep(0,nrow(ds)) 10 | for (i in seq(1,nrow(ds2),1)) { 11 | recall[i]=sum(ds[1:i,'type']=='chv')/chvCnt 12 | } 13 | precision = rep(0,nrow(ds)) 14 | for (i in seq(1,nrow(ds2),1)) { 15 | precision[i]=sum(ds[1:i,'type']=='chv')/i 16 | } 17 | fscore = rep(0,nrow(ds)) 18 | for (i in seq(1,nrow(ds2),1)) { 19 | fscore[i]=(1+0.5^2)*(precision[i]*recall[i]/(0.5^2*precision[i]+recall[i])) 20 | } 21 | 22 | x = seq(1,nrow(ds),1)*100/chvCnt 23 | #precision 24 | y = cbind(recall,precision,fscore) 25 | matplot(x,y,type=c('l'), 26 | pch=c(1,5,6), 27 | lwd=1, 28 | lty=1, 29 | #add=TRUE, 30 | col=rainbow(3,start=1), 31 | xlab="top-N percent", ylab="recall/precision/F-score") 32 | 33 | pp=seq(1,nrow(ds),1) %% 200==0 34 | matpoints(x[pp], y[pp,], type = "p", lty = 1, lwd = 1, pch = c(1,5,6), 35 | col = rainbow(3,start=1)) 36 | 37 | legend("topright",legend = c("recall", "precision", "F-score"), 38 | col=rainbow(3,start=1), 39 | pch=c(1,5,6)) # optiona 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /r/review_result.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/r/review_result.R -------------------------------------------------------------------------------- /r/silhouette.R: -------------------------------------------------------------------------------- 1 | library(cluster) 2 | ngrams=read.table("data\\ngram_vectors_all_0227.txt") 3 | 
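# Silhouette analysis of the k-means clusterings: for each point i,
# s(i) = (b(i) - a(i)) / max(a(i), b(i)), where a(i) is the mean distance from
# i to the points of its own cluster and b(i) is the mean distance to the
# nearest other cluster. The loop below computes the mean s(i) for k = 2..20
# and plots it, so the k with the highest mean score can be chosen.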
colnames(ngrams)=c("tfdf-1","tf-2","df-3","cvalue-4","umls_score-5","chv_score-6","contain_umls-7","contain_chv-8","nn-9","an-10","pn-11","anpn-12","stys-13","stys-14","stys-15","stys-16","stys-17","stys-18","stys-19","stys-20","stys-21","stys-22","stys-23","stys-24","win_pos-25","win_pos-26","win_pos-27","win_pos-28","win_pos-29","win_pos-30","win_pos-31","win_pos-32","win_pos-33","win_pos-34","win_pos-35","win_pos-36","win_pos-37","capt_first-38","capt_all-39","capt_term-40","win_umls-41","win_chv-42","sent_umls-43","sent_chv-44","umls_dist-45","chv_dist-46","prefix-47","prefix-48","prefix-49","prefix-50","prefix-51","prefix-52","prefix-53","prefix-54","prefix-55","prefix-56","prefix-57","prefix-58","prefix-59","prefix-60","prefix-61","prefix-62","prefix-63","prefix-64","prefix-65","prefix-66","prefix-67","prefix-68","prefix-69","prefix-70","prefix-71","prefix-72","prefix-73","prefix-74","prefix-75","prefix-76","prefix-77","prefix-78","prefix-79","prefix-80","prefix-81","prefix-82","prefix-83","prefix-84","prefix-85","prefix-86","prefix-87","prefix-88","prefix-89","prefix-90","prefix-91","prefix-92","prefix-93","prefix-94","prefix-95","prefix-96","prefix-97","prefix-98","prefix-99","prefix-100","prefix-101","prefix-102","prefix-103","prefix-104","prefix-105","prefix-106","prefix-107","prefix-108","prefix-109","prefix-110","prefix-111","prefix-112","prefix-113","prefix-114","prefix-115","prefix-116","prefix-117","prefix-118","prefix-119","prefix-120","suffix-121","suffix-122","suffix-123","suffix-124","suffix-125","suffix-126","suffix-127","suffix-128","suffix-129","suffix-130","suffix-131","suffix-132","suffix-133","suffix-134","suffix-135","suffix-136","suffix-137","suffix-138","suffix-139","suffix-140","suffix-141","suffix-142","suffix-143","suffix-144","suffix-145","suffix-146","suffix-147","suffix-148","suffix-149","suffix-150","suffix-151","suffix-152","suffix-153","suffix-154","suffix-155","suffix-156","suffix-157","suffix-158","suffix-159","suffix-160","suffix-161","suffix-162","suffix-163","suffix-164","suffix-165","suffix-166","suffix-167","suffix-168","suffix-169","suffix-170","suffix-171","suffix-172","suffix-173","suffix-174","suffix-175","suffix-176","suffix-177","suffix-178","suffix-179","suffix-180","suffix-181","suffix-182","suffix-183","suffix-184","suffix-185","suffix-186","suffix-187","suffix-188","suffix-189","suffix-190","suffix-191","suffix-192","suffix-193","suffix-194","suffix-195","suffix-196","suffix-197","suffix-198","suffix-199","suffix-200","suffix-201","suffix-202","suffix-203","suffix-204","suffix-205","suffix-206","suffix-207","suffix-208","suffix-209","suffix-210","suffix-211","suffix-212","suffix-213","suffix-214","suffix-215","suffix-216","suffix-217","suffix-218","suffix-219","suffix-220","suffix-221","suffix-222","suffix-223","suffix-224","suffix-225","suffix-226","suffix-227","suffix-228","suffix-229","suffix-230","suffix-231","suffix-232","suffix-233","suffix-234","suffix-235","suffix-236","suffix-237","suffix-238","suffix-239","suffix-240","suffix-241","suffix-242","suffix-243","suffix-244","suffix-245","suffix-246","suffix-247","suffix-248","suffix-249","suffix-250","suffix-251","suffix-252","suffix-253","suffix-254","suffix-255","suffix-256","suffix-257","suffix-258","suffix-259","suffix-260","suffix-261","suffix-262","suffix-263","suffix-264","suffix-265","suffix-266","suffix-267","suffix-268","suffix-269","suffix-270","suffix-271","suffix-272","suffix-273","suffix-274","suffix-275","suffix-276","suffix-277","suffix-278","suffix-279","suffix-280"
,"suffix-281","suffix-282","suffix-283","suffix-284","suffix-285","suffix-286","suffix-287","suffix-288","suffix-289","suffix-290","suffix-291","suffix-292","suffix-293","suffix-294","suffix-295","suffix-296","suffix-297","suffix-298","suffix-299","suffix-300","suffix-301","suffix-302","suffix-303","suffix-304","suffix-305","suffix-306","suffix-307","suffix-308","suffix-309","suffix-310","suffix-311","suffix-312","suffix-313") 4 | v=subset(ngrams,`chv_score-6`>0.3) 5 | #v$`chv_score-6`=0 6 | dissE <- daisy(v) 7 | 8 | msk = c() 9 | kkk=seq(2,20,1) 10 | for (k in kkk) { 11 | km <- kmeans(v,k,iter.max = 1000,nstart=10) 12 | sk <- silhouette(km$cluster, dissE) 13 | #print(summary(sk)) 14 | print(mean(sk[,3])) 15 | msk = append(msk,mean(sk[,3])) 16 | #plot(sk) 17 | } 18 | 19 | #tiff("Plot2.tif", res = 300) 20 | 21 | plot(cbind(kkk,msk),type='b',xlab = "Number of clusters", ylab="Silhouette score") 22 | 23 | 24 | #sss=c(0.2643342,0.2798345,0.2909478,0.2932777,0.2771142,0.258752,0.2134653,0.217159,0.181204,0.1885766,0.1786644,0.1713124,0.206528,0.1611574,0.1845627,0.1712207,0.152299,0.1767387,0.157803,0.1702419,0.1557435,0.1296721,0.1368338,0.1490912,0.1171227,0.1211971,0.1221018,0.1244477,0.1108768,0.1215782,0.1197631,0.1129861,0.1221741,0.1015608,0.1007896,0.1075403) 25 | #cluster=c(seq(2,20,1),seq(20,100,5)) 26 | #plot(cluster,sss,type='b',xlab = "cluster number", ylab="Silhouette score") 27 | -------------------------------------------------------------------------------- /solr_Configuration.md: -------------------------------------------------------------------------------- 1 | It is tedious to configure Solr, that is why I change it to perform the matching task in Mysql even it is slower. 2 | I do not recommend to use Solr except you has a strong consideration. 3 | 4 | 4. **Download and Customize Solr** (only 4.6.1 is tested, later it will support solr 5.x) 5 | Solr is available for download [here](https://archive.apache.org/dist/lucene/solr/4.6.1/). 6 | After downloading you will need to expand it locally, then update the schema.xml and solrconfig.xml 7 | in the conf subdirectory as shown below: 8 | **(Tips: Instead of modify by yourself as following,, you can just copy the config files in 9 | ${project-root}/conf directory to ${solr-4.6.1}/example/solr/collection1/conf)** 10 | 11 | ``` 12 | tar xvzf solr-4.6.1.tgz 13 | cd solr-4.6.1/example/solr/collection1/conf 14 | ``` 15 | 16 | Update the schema.xml to replace the field definitions with our own. Our fields list and the definition 17 | of the field type "tag" (copied from the documentation of SolrTextTagger) is shown. The "id" field is 18 | just a integer sequence (unique key for Solr), the "cui" and "descr" comes from the CUI and 19 | STR fields from the UMLS database, and the descr_norm, descr_sorted, descr_stemmed are case/punctuation normalized, 20 | alpha sorted and stemmed versions of STR. The descr_tagged field is identical to descr_norm but is analyzed differently as specified below. 21 | (add the new fields to the beginning of the ): 22 | 23 | ``` 24 | 25 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 37 | 38 | 39 | 40 | ... 41 | 42 | ... 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | ... 53 | 54 | 55 | ``` 56 | We then add in the requestHandler definition for SolrTextTagger's tag service into the solrconfig.xml file (also in conf). 
66 | We then add the requestHandler definition for SolrTextTagger's tag service to the solrconfig.xml file (also in conf).
67 | The definition is shown below (add it above the first existing requestHandler). As with the schema, the XML tags were stripped from this page; the parameter names are restored from the SolrTextTagger v1.x documentation, keeping the original values:
68 | 
69 | ```
70 | <!-- reconstructed: parameter names per SolrTextTagger v1.x; the values are the originals -->
71 | <requestHandler name="/tag" class="org.opensextant.solrtexttagger.TaggerRequestHandler">
72 |   <str name="indexedField">descr_tagged</str>
73 |   <str name="storedField">descr_norm</str>
74 |   <bool name="partialMatches">false</bool>
75 |   <int name="valueMaxLen">5000</int>
76 |   <str name="cacheFile">taggerCache.dat</str>
77 | </requestHandler>
78 | ```
79 | Finally, we create a lib directory and copy the solr-text-tagger-1.3-SNAPSHOT.jar into it (this jar is built in step 6 below).
80 | Then go up to the example directory and start Solr. Solr will now be listening on port 8983 on localhost.
81 | 
82 | ```
83 | cd solr-4.6.1/example/solr/collection1
84 | mkdir lib
85 | cp ${SolrTextTagger-path}/SolrTextTagger/target/*jar lib/
86 | cd ../..
87 | java -jar start.jar
88 | ```
89 | 5. **Load Data and Build FST**
90 | We use the same cuistr1.csv file that we exported from our MySQL UMLS database. I guess I could have
91 | written custom code to load the data into the index, but I had started experimenting with SolrTextTagger using curl,
92 | so I just wrote some code that converted the (CUI,STR) CSV format into JSON,
93 | with additional fields created by our case/punctuation normalization, alphabetical sorting and stemming.
94 | I used the same Scala code since I already had the transformations coded up from last week.
95 | Once I generated the JSON file (cuistr1.json), I uploaded it into Solr and built the FST using the following curl commands.
96 | 
97 | ```
98 | cd solr-4.6.1/example/exampledocs
99 | java -Durl=http://localhost:8983/solr/update -Dtype=application/json \
100 |     -jar post.jar ${your-path}/cuistr1.json
101 | 
102 | curl "http://localhost:8983/solr/tag?build=true"   (or you can open this URL in a browser)
103 | ```
104 | 
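105 | For completeness, here is what that conversion step can look like. This is a minimal Python sketch, not the project's actual Scala code: it assumes a two-column, comma-separated (CUI,STR) file named cuistr1.csv, and it uses a simple regex plus NLTK's Porter stemmer as stand-ins for the LVG-based transformations:
106 | 
107 | ```
108 | import csv
109 | import json
110 | import re
111 | 
112 | from nltk.stem import PorterStemmer   # rough stand-in for the LVG-based normalization
113 | 
114 | stemmer = PorterStemmer()
115 | 
116 | def norm(s):
117 |     # case/punctuation normalization: lowercase, drop punctuation, squeeze whitespace
118 |     return re.sub(r"\s+", " ", re.sub(r"[^a-z0-9 ]", " ", s.lower())).strip()
119 | 
120 | docs = []
121 | with open("cuistr1.csv", newline="", encoding="utf-8") as f:
122 |     for i, (cui, descr) in enumerate(csv.reader(f)):
123 |         n = norm(descr)
124 |         words = n.split()
125 |         docs.append({
126 |             "id": i,                                  # integer sequence, the Solr unique key
127 |             "cui": cui,
128 |             "descr": descr,
129 |             "descr_norm": n,
130 |             "descr_sorted": " ".join(sorted(words)),  # alphabetically sorted words
131 |             "descr_stemmed": " ".join(stemmer.stem(w) for w in words),
132 |             "descr_tagged": n,                        # same text as descr_norm
133 |         })
134 | 
135 | with open("cuistr1.json", "w", encoding="utf-8") as f:
136 |     json.dump(docs, f, indent=2)
137 | ```
138 | Once the FST is built, you can sanity-check the tagger by POSTing plain text to the /tag handler and inspecting the tag offsets and matching concepts in the response.
139 | 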
140 | 6. **Download and Build SolrTextTagger**
141 | The code for SolrTextTagger resides on GitHub, so to download and build the custom Solr JAR,
142 | execute the following sequence of commands. This will create a solr-text-tagger-1.3-SNAPSHOT.jar
143 | file in the target subdirectory of the SolrTextTagger project (this is the jar copied into Solr's lib directory in step 4).
144 | 
145 | ```
146 | git clone https://github.com/OpenSextant/SolrTextTagger.git
147 | cd SolrTextTagger
148 | git checkout -b v1x --track origin/v1x
149 | mvn test
150 | mvn package
151 | ```
-------------------------------------------------------------------------------- /sql-script/0923-test.sql: --------------------------------------------------------------------------------
1 | use umls;
2 | /*add a new column for all rel+rela, then get the rel+rela as a string*/
3 | -- alter table umls.content_tag_diabetes_T047_unique2_output add (rel_all text default null);
4 | -- alter table ytex.content_tag_ytex_T047_unique_output add (rel_all text default null);
5 | /*
6 | update content_tag_diabetes_T047_unique2_output as ret set rel_all =
7 | (select GROUP_CONCAT(DISTINCT REL,' ',IFNULL(RELA,'null') SEPARATOR ',') from umls.MRREL as r
8 | where (r.CUI1=ret.cui COLLATE utf8_unicode_ci and r.CUI2 = ret.rel_cui COLLATE utf8_unicode_ci)
9 | or (r.CUI2=ret.cui COLLATE utf8_unicode_ci and r.CUI1=ret.rel_cui COLLATE utf8_unicode_ci)
10 | GROUP BY CUI1,CUI2 limit 1
11 | )
12 | where rel_all is null;
13 | */
14 | drop table if exists umls.tmp_pairs;
15 | create table umls.tmp_pairs as select DISTINCT CUI,REL_CUI FROM umls.content_tag_diabetes_T047_unique2_output where rel_all is null;
16 | 
17 | update umls.content_tag_diabetes_T047_unique2_output as t
18 | inner join
19 | (select r.CUI1,r.CUI2,GROUP_CONCAT(DISTINCT REL,' ',IFNULL(RELA,'null') SEPARATOR ',') as rel_all from
20 | tmp_pairs as ret
21 | inner join
22 | umls.MRREL as r
23 | on (r.CUI1=ret.cui COLLATE utf8_unicode_ci and r.CUI2 = ret.rel_cui COLLATE utf8_unicode_ci)
24 | or (r.CUI2=ret.cui COLLATE utf8_unicode_ci and r.CUI1=ret.rel_cui COLLATE utf8_unicode_ci)
25 | GROUP BY CUI1,CUI2
26 | ) as temp
27 | on (temp.CUI1=t.cui COLLATE utf8_unicode_ci and temp.CUI2 = t.rel_cui COLLATE utf8_unicode_ci)
28 | or (temp.CUI2=t.cui COLLATE utf8_unicode_ci and temp.CUI1=t.rel_cui COLLATE utf8_unicode_ci)
29 | set t.rel_all = temp.rel_all
30 | where t.rel_all is null
31 | ;
32 | 
33 | select count(*) from umls.content_tag_diabetes_T047_unique2_output where rel_all is null;
34 | select count(*) from tmp_pairs;
35 | 
36 | 
37 | 
38 | 
39 | drop table if exists ytex.tmp_pairs;
40 | create table ytex.tmp_pairs as select DISTINCT CUI,REL_CUI FROM ytex.content_tag_ytex_T047_unique_output where rel_all is null;
41 | update ytex.content_tag_ytex_T047_unique_output as t
42 | inner join
43 | (select r.CUI1,r.CUI2,GROUP_CONCAT(DISTINCT REL,' ',IFNULL(RELA,'null') SEPARATOR ',') as rel_all from umls.MRREL as r
44 | inner join ytex.tmp_pairs as ret
45 | on (r.CUI1=ret.cui COLLATE utf8_unicode_ci and r.CUI2 = ret.rel_cui COLLATE utf8_unicode_ci)
46 | or (r.CUI2=ret.cui COLLATE utf8_unicode_ci and r.CUI1=ret.rel_cui COLLATE utf8_unicode_ci)
47 | GROUP BY CUI1,CUI2
48 | ) as temp
49 | on (temp.CUI1=t.cui COLLATE utf8_unicode_ci and temp.CUI2 = t.rel_cui COLLATE utf8_unicode_ci)
50 | or (temp.CUI2=t.cui COLLATE utf8_unicode_ci and temp.CUI1=t.rel_cui COLLATE utf8_unicode_ci)
51 | set t.rel_all = temp.rel_all
52 | where t.rel_all is null
53 | ;
54 | 
55 | select * from ytex.content_tag_ytex_T047_unique_output where rel_all is not null;
56 | 
57 | use umls;
58 | select * from mrconso where SAB='SNOMEDCT_US' AND TTY = 'PT' AND CODE = '251314005';
59 | select * from mrconso where AUI='A3601659';
60 | 
-------------------------------------------------------------------------------- /sql-script/cancerqa_chv.sql: --------------------------------------------------------------------------------
1
| create database cancerqa char set utf8; 2 | use cancerqa; 3 | 4 | create index idx_qid on cancerqa_questions(qid); 5 | create index idx_nick on cancerqa_questions(chosenanswernick); 6 | create index idx_nick2 on cancerqa_answers(usernick); 7 | create index idx_qid2 on cancerqa_answers(qid); 8 | 9 | alter table qa_data add column id int not null auto_increment primary key; 10 | 11 | select * from cancerqa_questions; 12 | select * from cancerqa_answers; 13 | 14 | create table qa_data as select Q.qid, A.usernick, Q.subject, Q.content as question_content, A.content as answer_content from cancerqa_questions Q, cancerqa_answers A where Q.qid=A.qid and Q.chosenanswernick=A.usernick; 15 | 16 | select * from qa_data; 17 | 18 | select qid,usernick, count(*) as cnt from qa_data group by qid,usernick order by cnt desc; 19 | 20 | 21 | select Q.qid, A.usernick, Q.subject, Q.content as question_content, A.content as answer_content from cancerqa_questions Q, cancerqa_answers A where Q.qid=A.qid and Q.chosenanswernick=A.usernick; 22 | 23 | select count(*) from qa_data; 24 | 25 | 26 | select * from cancerqa_answers B join (select A.qid as qid, max(A.rating) as maxrating from cancerqa_answers A group by A.qid) M 27 | on B.qid=M.qid and B.rating=M.maxrating; 28 | 29 | create table qa_data2 as 30 | select Q.qid, A.usernick, Q.subject, Q.content as question_content, A.content as answer_content, M.maxrating 31 | from cancerqa_questions Q, cancerqa_answers A,(select qid as qid, max(rating) as maxrating from cancerqa_answers group by qid) M where Q.qid=A.qid and Q.qid=M.qid and Q.chosenanswernick=A.usernick and M.maxrating=A.rating; 32 | select nested, count(*) from -------------------------------------------------------------------------------- /sql-script/chv.sql: -------------------------------------------------------------------------------- 1 | create database chv char set utf8; 2 | use chv; 3 | create table cancer_ngram ( 4 | ngram varchar(100), 5 | train varchar(100), 6 | n int, 7 | tfdf int, 8 | tf int, 9 | df int, 10 | cvalue float, 11 | nest float, 12 | umls_score float, 13 | chv_score float, 14 | cui_umls varchar(100), 15 | cui_chv varchar(100), 16 | contain_umls varchar(100), 17 | contain_chv varchar(100), 18 | win_umls int, 19 | win_chv int, 20 | sent_umls int, 21 | sent_chv int, 22 | umls_dist int, 23 | chv_dist int, 24 | win_pos varchar(100), 25 | prefix varchar(100), 26 | suffix varchar(100), 27 | bow_total int, 28 | bow_words int, 29 | sytax varchar(100), 30 | nn varchar(100), 31 | an varchar(100), 32 | pn varchar(100), 33 | anpn varchar(100), 34 | isTrain varchar(100), 35 | capt_first varchar(100), 36 | capt_term varchar(100), 37 | capt_all varchar(100), 38 | stys varchar(100), 39 | text_org varchar(100), 40 | sentence text 41 | ); 42 | create table diabetes_ngram like cancer_ngram; 43 | load data local infile '/data/ra/data/ngram_cancer_tf5.txt' into table cancer_ngram fields terminated by '\t' enclosed by '"' lines terminated by '\n' ignore 1 lines; 44 | load data local infile '/data/ra/data/ngram_diabetes_tf5.txt' into table diabetes_ngram fields terminated by '\t' enclosed by '"' lines terminated by '\n' ignore 1 lines; 45 | 46 | select n,sum(length(cui_chv)>0) as chv,sum(length(cui_chv)=0 and length(cui_umls)>0) as `umls-chv`, sum(length(cui_umls)=0) as others, count(*) as cnt from diabetes_ngram group by n; 47 | select n,sum(length(cui_chv)>0) as chv,sum(length(cui_chv)=0 and length(cui_umls)>0) as `umls-chv`, sum(length(cui_umls)=0) as others, count(*) as cnt from cancer_ngram group by 
n; -------------------------------------------------------------------------------- /sql-script/cluster.sql: -------------------------------------------------------------------------------- 1 | create database cluster character set utf8; 2 | use cluster; 3 | 4 | create table k20all ( 5 | `k` int(8) DEFAULT NULL, 6 | `type` varchar(40) DEFAULT NULL, 7 | `ngram` varchar(200) DEFAULT NULL, 8 | `n` int(8) DEFAULT NULL, 9 | `tfdf` float DEFAULT NULL, 10 | `tf` int(8) DEFAULT NULL, 11 | `df` int(8) DEFAULT NULL, 12 | `cvalue` varchar(40) DEFAULT NULL, 13 | `nest` int(8) DEFAULT NULL, 14 | `nest_tf` int(8) DEFAULT NULL, 15 | `umls_score` int(8) DEFAULT NULL, 16 | `chv_score` int(8) DEFAULT NULL, 17 | `contain_umls` varchar(8) DEFAULT NULL, 18 | `contain_chv` varchar(8) DEFAULT NULL, 19 | `win_umls` int(8) DEFAULT NULL, 20 | `win_chv` int(8) DEFAULT NULL, 21 | `sent_umls` int(8) DEFAULT NULL, 22 | `sent_chv` int(8) DEFAULT NULL, 23 | `umls_dist` int(8) DEFAULT NULL, 24 | `chv_dist` int(8) DEFAULT NULL, 25 | `sytax` varchar(40) DEFAULT NULL, 26 | `nn` varchar(8) DEFAULT NULL, 27 | `an` varchar(8) DEFAULT NULL, 28 | `pn` varchar(8) DEFAULT NULL, 29 | `anpn` varchar(8) DEFAULT NULL 30 | ); 31 | 32 | load data local infile 'C:\\fsu\\ra\\data\\ra-cluster.txt' 33 | into table k20all 34 | fields terminated by '\t' 35 | -- enclosed by '"' 36 | lines terminated by '\r\n' 37 | ignore 1 lines; 38 | truncate table k20all; 39 | 40 | select k,type,count(ngram) from k20all group by k,type; 41 | select * from k20all where k=18; 42 | 43 | 44 | select * from umls.mrconso where cui='C1552861'; 45 | select tui from umls.mrsty where cui='C0018684'; 46 | 47 | select count(distinct blogId) from ytex.content_org; 48 | select distinct blogId from ytex.content_org_new; 49 | 50 | select * from umls.mrconso where str = 'help'; -------------------------------------------------------------------------------- /sql-script/data_process.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/sql-script/data_process.docx -------------------------------------------------------------------------------- /sql-script/data_process_0922.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zwChan/Clinical-Text-Mining/268caeeee8a0a57835a8c6263cf5d5d54cf85c6e/sql-script/data_process_0922.docx -------------------------------------------------------------------------------- /sql-script/deaf.sql: -------------------------------------------------------------------------------- 1 | 2 | create database deaf char set utf8; 3 | use deaf; 4 | drop table deaf.dataset_deaf; 5 | create table dataset_deaf ( 6 | `PostID` int(20), 7 | `UserID` int(20), 8 | `User` varchar(100), 9 | `Date` varchar(100), 10 | `Year` varchar(100), 11 | `Month` varchar(100), 12 | `Time` varchar(100), 13 | `AMPM` varchar(100), 14 | `Content` text, 15 | `ThreadTitle` varchar(500), 16 | `ThreadPath` varchar(500), 17 | `ThreadNumber` int(20), 18 | -- `LinkName` varchar(100), 19 | `Type` varchar(100) 20 | ) 21 | ; 22 | 23 | drop table dataset_autism; 24 | create table dataset_autism ( 25 | `link` varchar(256), 26 | `topic` varchar(256), 27 | `User` varchar(100), 28 | `userInfo` varchar(500), 29 | `Content` text 30 | -- `date` varchar(100) 31 | ); 32 | alter table dataset_autism add column PostID int primary key AUTO_INCREMENT first; 33 | 34 | alter table dataset_autism drop column 
PostID;
35 | create table deaf_cui like cancer.cancer_cui;
36 | create table noncui like cancer.noncui;
37 | create table deaf_metamap like cancer.cancer_metamap_cui;
38 | -- drop table forum_metamap;
39 | create table forum_metamap like deaf_metamap;
40 | create table qa8000_metamap like deaf_metamap;
41 | 
42 | load data local infile '/tmp/alldeaf_HealthFitness_08_20_2015_Final.txt' into table dataset_deaf fields terminated by '\t' enclosed by '"' lines terminated by '\r\n' ignore 1 lines;
43 | load data local infile '/tmp/autism_health_Final.txt' into table dataset_autism fields terminated by '\t' enclosed by '"' lines terminated by '\r\n' ignore 1 lines;
44 | 
45 | 
46 | select distinct PostID,Content,threadNumber from dataset_deaf
47 | into outfile '/tmp/deaf_dataset.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
48 | select distinct PostID,Content,link from dataset_autism
49 | into outfile '/tmp/autism_dataset.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
50 | 
51 | select distinct tui from umls.mrsty;
52 | 
53 | delete from dataset_deaf where length(PostID) <2;
54 | select count(distinct tid) from deaf_cui;
55 | select count(*) from deaf_metamap where task='autism';
56 | select count(*) from deaf_metamap where task='deaf';
57 | select distinct sab from umls.mrconso;
58 | 
59 | select splitType, count(*) from deaf_metamap group by splitType ;
60 | select * from deaf_metamap where length(sty)>6;
61 | alter table deaf_metamap add column sab varchar(100) after preferStr;
62 | -- truncate deaf_metamap;
63 | 
64 | select task, count(*)/count(distinct tid), count(distinct tid) from deaf_metamap group by task ;
65 | select * from deaf_metamap where task = 'autism';
66 | delete from deaf_metamap where sab='CHV' and sentLen > 51;
67 | 
68 | select count(*) from deaf_metamap where sab = 'SNOMEDCT_US'; -- 1077473
69 | select count(*) from deaf_metamap_no_thread_id where sab = 'SNOMEDCT_US'; -- 1140048
70 | 
71 | select count(*) from deaf_metamap where sab = 'SNOMEDCT_US' and task='deaf'; -- 475902
72 | select count(*) from deaf_metamap_no_thread_id where sab = 'SNOMEDCT_US' and task='deaf'; -- 494458
73 | 
74 | select count(*) from deaf_metamap where sab = 'SNOMEDCT_US' and task='autism';
75 | select count(*) from deaf_metamap_no_thread_id where sab = 'SNOMEDCT_US' and task='autism';
76 | 
77 | select count(*) from deaf_metamap where sentLen > 51 and sab='SNOMEDCT_US'; -- 0
78 | select count(*) from deaf_metamap where sentLen > 51 and sab='CHV'; -- 76004
79 | select sentLen,count(*) from deaf_metamap where sab='SNOMEDCT_US' group by sentLen;
80 | select sentLen,count(*) from deaf_metamap where sab='CHV' group by sentLen;
81 | 
82 | select * from qa8000_metamap;
83 | select * from umls.mrconso where cui = 'C1258068';
84 | 
85 | create index task on deaf_metamap_excluding_sty_distinct_sentences(task);
86 | create index sab on deaf_metamap_excluding_sty_distinct_sentences(sab);
87 | create index cui_str on deaf_metamap_excluding_sty_distinct_sentences(cui_str(10));
88 | 
89 | create index task on qa8000_metamap_excluding_sty_distinct_sentences(task);
90 | create index sab on qa8000_metamap_excluding_sty_distinct_sentences(sab);
91 | create index cui_str on qa8000_metamap_excluding_sty_distinct_sentences(cui_str(10));
92 | 
93 | 
94 | create table qa8000 like qa8000_metamap;
95 | ALTER TABLE `deaf`.`qa8000` 
96 | CHANGE COLUMN `threadId` `userId` VARCHAR(256) NULL DEFAULT NULL ;
97 | rename table qa8000_metamap to qa8000_metamap_without_userid;
98 | rename table qa8000 to qa8000_metamap;
99 | 
100 | -- select terms found in SNOMED but not in CHV
101 | select * from (
102 | select tid,org_str,sentence, count(*) as cnt, group_concat(distinct sab ) as gsab
103 | from autism_metamap_excluding_sty_distinct_sentences_user
104 | group by tid,org_str,sentence ) temp_table
105 | where gsab not like '%CHV%'
106 | order by cnt desc;
-------------------------------------------------------------------------------- /sql-script/import_0919.sql: --------------------------------------------------------------------------------
1 | CREATE TABLE `content_tag_compare_only_ytex` (
2 | `blogId` bigint(20) NOT NULL DEFAULT '0',
3 | `target` longtext,
4 | `CUI` varchar(20) DEFAULT NULL,
5 | `SAB` varchar(40) DEFAULT NULL,
6 | `umlsStr` longtext,
7 | `TUI` varchar(4) DEFAULT NULL,
8 | `styName` varchar(50) DEFAULT NULL, /* semantic type name*/
9 | `worldIndex` int(11) NOT NULL DEFAULT '0', /* the position of the target term in the blog content*/
10 | `sentence` longtext,
11 | `rel_cui` char(8) NOT NULL DEFAULT '', /* the cui that is relevant to the current target term*/
12 | `rel_str` varchar(1000) DEFAULT NULL, /* the preferred string of rel_cui that is relevant to the current target term*/
13 | `id` int(11) NOT NULL DEFAULT '0' /* the primary key of this table; it is unique.*/
14 | ) ;
15 | 
16 | CREATE TABLE `content_tag_compare_only_our` (
17 | `blogId` varchar(40) DEFAULT NULL,
18 | `target` varchar(300) DEFAULT NULL,
19 | `umlsFlag` varchar(10) DEFAULT NULL,
20 | `score` float DEFAULT NULL,
21 | `CUI` varchar(45) DEFAULT NULL,
22 | `SAB` varchar(45) DEFAULT NULL,
23 | `AUI` varchar(45) DEFAULT NULL,
24 | `umlsStr` varchar(1000) DEFAULT NULL,
25 | `TUI` varchar(45) DEFAULT NULL,
26 | `styName` varchar(45) DEFAULT NULL,
27 | `semName` varchar(100) DEFAULT NULL,
28 | `tagId` int(11) DEFAULT '0',
29 | `wordIndex` int(11) DEFAULT '0',
30 | `wordIndexInSentence` int(11) DEFAULT '0',
31 | `sentenceIndex` int(11) DEFAULT '0',
32 | `targetNorm` varchar(300) DEFAULT NULL,
33 | `tags` varchar(500) DEFAULT NULL,
34 | `sentence` varchar(1000) DEFAULT NULL,
35 | `cui1` char(8) NOT NULL,
36 | `cui2` char(8) NOT NULL,
37 | `aui1` varchar(9) DEFAULT NULL,
38 | `aui2` varchar(9) DEFAULT NULL,
39 | `REL` varchar(4) NOT NULL,
40 | `RELA` varchar(100) DEFAULT NULL,
41 | `rel_str` varchar(1000) DEFAULT NULL, /* the preferred string of rel_cui that is relevant to the current target term*/
42 | `id` int(11) NOT NULL DEFAULT '0',
43 | `rel_cui` varchar(45) DEFAULT NULL /* the cui that is relevant to the current target term*/
44 | ) ;
45 | 
46 | load data local infile 'csv_path'
47 | into table table_name
48 | fields terminated by ','
49 | enclosed by '"'
50 | lines terminated by '\n';
-------------------------------------------------------------------------------- /sql-script/import_0924.sql: --------------------------------------------------------------------------------
1 | USE ytex;
2 | drop table if exists TMP_ORG;
3 | CREATE TABLE TMP_ORG (
4 | `blogId` BIGINT(20) DEFAULT NULL, /*----blog id */
5 | `blog_name` varchar(200) DEFAULT NULL,
6 | `text_link_title` varchar(500) DEFAULT NULL,
7 | `text_content` varchar(10000) DEFAULT NULL,
8 | `photo_caption` varchar(300) DEFAULT NULL,
9 | `photo_link` varchar(300) DEFAULT NULL,
10 | `photo_source` varchar(300) DEFAULT NULL,
11 | `link_content` varchar(300) DEFAULT NULL,
12 | `post_likes` varchar(300) DEFAULT NULL,
13 | `post_reblogged` varchar(300) DEFAULT NULL,
14 | `post_hashtag` varchar(300) DEFAULT NULL
15 | );
16 | 
/*load result into the table*/
17 | load data local infile 'C:\\fsu\\ra\\UmlsTagger\\data\\raw_data_CHV_study2.csv.txt'
18 | into table TMP_ORG
19 | fields terminated by '\t'
20 | enclosed by '"'
21 | lines terminated by '`'
22 | ;
23 | 
24 | CREATE TABLE CONTENT_ORG_NEW AS
25 | SELECT o.blogId,o.post_hashtag,o.blog_name,o.text_link_title,o.text_content FROM TMP_ORG o;
26 | 
27 | select * from CONTENT_ORG_NEW into outfile 'C:\\fsu\\ra\\UmlsTagger\\data\\raw_data_CHV_study2.csv'
28 | fields terminated by ',' enclosed by '"' lines terminated by '\n';
29 | 
30 | select distinct blogId INSTANCE_ID from ytex.CONTENT_ORG_NEW ;
31 | select * from ytex.content_org_new where blogId = 0;
32 | 
33 | 
34 | 
35 | 
36 | 
-------------------------------------------------------------------------------- /sql-script/import_1004.sql: --------------------------------------------------------------------------------
1 | USE ytex;
2 | drop table if exists TMP_ORG;
3 | truncate TMP_ORG;
4 | CREATE TABLE TMP_ORG (
5 | `blogId` BIGINT(20) DEFAULT NULL, /*----blog id */
6 | `blog_name` varchar(200) DEFAULT NULL,
7 | `text_link_title` varchar(500) DEFAULT NULL,
8 | `text_content` text DEFAULT NULL,
9 | `photo_caption` varchar(300) DEFAULT NULL,
10 | `photo_link` varchar(300) DEFAULT NULL,
11 | `photo_source` varchar(300) DEFAULT NULL,
12 | `link_content` varchar(300) DEFAULT NULL,
13 | `post_likes` varchar(300) DEFAULT NULL,
14 | `post_reblogged` varchar(300) DEFAULT NULL,
15 | `post_hashtag` varchar(300) DEFAULT NULL
16 | );
17 | CREATE TABLE TMP_ORG (
18 | `blogId` BIGINT(20) DEFAULT NULL, /*----blog id */
19 | `post_hashtag` varchar(300) DEFAULT NULL,
20 | `blog_name` varchar(200) DEFAULT NULL,
21 | `text_link_title` varchar(500) DEFAULT NULL,
22 | `text_content` text DEFAULT NULL
23 | );
24 | -- first load into TMP_ORG, then insert into CONTENT_ORG_NEW, adding a 'disease' column.
25 | truncate TMP_ORG;
26 | /*load result into the table*/
27 | load data local infile 'C:\\fsu\\ra\\UmlsTagger\\data\\newdataset_chronic\\chronic_newdataset_obesity.csv'
28 | into table TMP_ORG
29 | fields terminated by ','
30 | enclosed by '"'
31 | lines terminated by '`'
32 | ignore 1 LINES
33 | ;
34 | 
35 | /*
36 | drop table if exists CONTENT_ORG_NEW;
37 | CREATE TABLE CONTENT_ORG_NEW (
38 | `blogId` BIGINT(20) DEFAULT NULL, -- blog id
39 | `post_hashtag` varchar(300) DEFAULT NULL,
40 | `blog_name` varchar(200) DEFAULT NULL,
41 | `text_link_title` varchar(500) DEFAULT NULL,
42 | `text_content` text DEFAULT NULL,
43 | `disease` varchar(100)
44 | );
45 | */
46 | INSERT ignore CONTENT_ORG_NEW (blogId,post_hashtag,blog_name,text_link_title,text_content,disease)
47 | SELECT distinct o.blogId,o.post_hashtag,o.blog_name,o.text_link_title,o.text_content,'obesity' FROM TMP_ORG o ;
48 | 
49 | select count(distinct disease,blogId) from CONTENT_ORG_NEW;
50 | 
51 | truncate table content_tag_ytex;
52 | /*
53 | drop table if exists content_tag_ytex;
54 | create table content_tag_ytex as
55 | select a.anno_text, d.instance_id, c.*, 'alzheimer' as disease from v_document_cui_sent c
56 | inner join v_annotation a on c.anno_base_id = a.anno_base_id
57 | inner join v_document d on d.document_id = c.document_id;
58 | */ /*,7230*/
59 | 
60 | alter table content_tag_ytex add `disease` varchar(100) default null ;
61 | insert content_tag_ytex
62 | select a.anno_text, d.instance_id, c.*, 'diabetes' as disease from v_document_cui_sent c
63 | inner join v_annotation a on c.anno_base_id = a.anno_base_id
64 | inner join v_document d on d.document_id = c.document_id;
65 | 
66 | select count(distinct code) from content_tag_ytex;
67 | show create table v_document_cui_sent;
68 | 
69 | 
70 | 
-------------------------------------------------------------------------------- /sql-script/import_tag_0916.sql: --------------------------------------------------------------------------------
1 | use umls;
2 | /*DELETE ALL non-English records*/
3 | -- delete from mrconso where lat <> 'ENG';
4 | drop table IF EXISTS CONTENT_TAG_UNIQUE_CUI;
5 | /*create table for the original terms from blogs.*/
6 | CREATE TABLE CONTENT_TAG_UNIQUE_CUI (
7 | `blogId` varchar(40) DEFAULT NULL, /*----blog id */
8 | `target` varchar(300) DEFAULT NULL, /* ----the term found in the content. It must also be found in UMLS; terms not found in UMLS are ignored. */
9 | `umlsFlag` varchar(10) DEFAULT NULL, /* ----whether it is found in UMLS. This column is used by hashTags.*/
10 | `score` float DEFAULT NULL, /* ----the similarity metric between the term in the content and the string in UMLS.*/
11 | `CUI` varchar(45) DEFAULT NULL, /*----- CUI of UMLS*/
12 | `SAB` varchar(45) DEFAULT NULL, /* ----- SAB of UMLS*/
13 | `AUI` varchar(45) DEFAULT NULL, /*----- AUI of UMLS */
14 | `umlsStr` varchar(1000) DEFAULT NULL, /* ---STR of UMLS mrconso table*/
15 | `TUI` varchar(45) DEFAULT NULL, /*------TUI of UMLS MRSTY table */
16 | `styName` varchar(45) DEFAULT NULL, /*------semantic name of UMLS MRSTY table*/
17 | `semName` varchar(100) DEFAULT NULL, /*---semantic group name on the SemGroup website*/
18 | `tagId` int default 0, /*----if the term matches a hash_tag of the blog, tagId is the index of the tag; if it matches no hash_tag, it is 0.*/
19 | `wordIndex` int default 0, /*----the position of the term in the content*/
20 | `wordIndexInSentence` int default 0, /*---- the position of the term in the sentence that it is found in.
*/
21 | `sentenceIndex` int default 0, /*-- the index of the sentence of the target*/
22 | `targetNorm` varchar(300) default NULL, /*--- the normalized string of the term.*/
23 | `tags` varchar(500) default NULL, /*---- all the hash_tags of the blog. */
24 | `sentence` varchar(1000) default NULL, /* The sentence that the target is found in*/
25 | `rel` varchar(4) , /*rel field in mrrel*/
26 | `rela` varchar(100) , /*rela field in mrrel*/
27 | `rel_str` varchar(1000) , /*str field in mrconso for the term relevant to current term */
28 | `id` int(11) /* auto_increment primary key for the result table*/
29 | );
30 | 
31 | /*load result into the table*/
32 | load data local infile 'C:\\fsu\\tag_diabetes_distinct.csv'
33 | into table CONTENT_TAG_UNIQUE_CUI
34 | fields terminated by ','
35 | enclosed by '"'
36 | lines terminated by '\n'
37 | ignore 1 lines;
38 | 
39 | select * from CONTENT_TAG_UNIQUE_CUI;
-------------------------------------------------------------------------------- /sql-script/linux-test.sql: --------------------------------------------------------------------------------
1 | update ret.content_tag_ytex_T047_unique as t
2 | inner join
3 | (select r.CUI1,r.CUI2,GROUP_CONCAT(DISTINCT REL,' ',RELA SEPARATOR ',') as rel_all from umls.MRREL as r
4 | inner join ret.content_tag_ytex_T047_unique as ret
5 | on (r.CUI1=ret.cui COLLATE utf8_unicode_ci and r.CUI2 = ret.rel_cui COLLATE utf8_unicode_ci)
6 | or (r.CUI2=ret.cui COLLATE utf8_unicode_ci and r.CUI1=ret.rel_cui COLLATE utf8_unicode_ci)
7 | GROUP BY CUI1,CUI2
8 | ) as temp
9 | on (temp.CUI1=t.cui COLLATE utf8_unicode_ci and temp.CUI2 = t.rel_cui COLLATE utf8_unicode_ci)
10 | or (temp.CUI2=t.cui COLLATE utf8_unicode_ci and temp.CUI1=t.rel_cui COLLATE utf8_unicode_ci)
11 | set t.rel_all = temp.rel_all
12 | ;
-------------------------------------------------------------------------------- /sql-script/minsook_1023.sql: --------------------------------------------------------------------------------
1 | use ytex;
2 | CREATE TABLE TMP_ORG_1027 (
3 | `blogId` BIGINT(20) DEFAULT NULL, /*----blog id */
4 | `text_content` text DEFAULT NULL
5 | );
6 | 
7 | drop table TMP_ORG_1027;
8 | /*load result into the table*/
9 | load data local infile 'C:\\fsu\\ra\\data\\content_raw_cleaned_ForZhiwei.csv'
10 | into table TMP_ORG_1027
11 | fields terminated by ','
12 | enclosed by '"'
13 | lines terminated by '`'
14 | ignore 1 LINES
15 | ;
16 | select count(distinct blogId) from TMP_ORG_1023 LIMIT 1;
17 | -- 50252
18 | select * from TMP_ORG_1027 where text_content like '%`%';
19 | 
20 | use ytex;
21 | select * from v_document_ontoanno;
22 | select * from v_corpus_group_class;
23 | 
24 | create table ytex.content_tag_ytex_1023 as
25 | select a.anno_text, d.instance_id, c.* from v_document_cui_sent c
26 | inner join v_annotation a on c.anno_base_id = a.anno_base_id
27 | inner join v_document d on d.document_id = c.document_id;
28 | 
29 | select * from ytex.content_tag_ytex_1023 order by instance_id, sentence_text
30 | into outfile 'C:\\fsu\\ra\\data\\content_tag_ytex_1023.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
31 | 
32 | select count(distinct instance_id) from content_tag_ytex_1023;
33 | 
34 | 
35 | show create table ytex.content_tag_ytex_1023;
36 | 
-------------------------------------------------------------------------------- /sql-script/minsook_1103.sql: --------------------------------------------------------------------------------
1 | use tumblr_db;
2 | 
3 | select count(distinct cui) from tem_tag_ids;
4 | -- 935
5 | 
select count(distinct blogId) from tem_tag_ids; 6 | -- 147 7 | select count(distinct code) from content_tag_ytex_noseedtag; 8 | -- 8227 9 | select count(distinct instance_id) from content_tag_ytex_noseedtag; 10 | -- 22171 11 | 12 | select count(*) from content_tag_ytex_noseedtag; 13 | -- 265599 14 | select count(*) from tem_tag_ids; 15 | -- 118729 16 | 17 | drop table tmp_pairs; 18 | create table tmp_pairs as 19 | select distinct c.blogId, c.cui,s.cui as cui_tag from ( 20 | select distinct instance_id as blogId, code as cui from content_tag_ytex_noseedtag) as c 21 | inner join (select distinct blogId,cui from tem_tag_ids) as s 22 | on c.cui<>s.cui and c.blogId = s.blogId 23 | ; 24 | -- 6059 25 | select count(distinct blogId) from tmp_pairs; 26 | -- 120 27 | create table tmp_pairs as 28 | select distinct c.cui,s.cui as cui_tag from ( 29 | select distinct code as cui from content_tag_ytex_noseedtag) as c 30 | inner join (select distinct cui from tem_tag_ids) as s 31 | on c.cui<>s.cui 32 | ; 33 | -- 7692072 34 | alter table content_tag_ytex_noseedtag add (rel_all text default null); 35 | alter table content_tag_ytex_noseedtag add (cui_tag char(8) default null); 36 | 37 | update content_tag_ytex_noseedtag set rel_all=null; 38 | 39 | update content_tag_ytex_noseedtag as t 40 | inner join 41 | (select ret.cui,ret.cui_tag,GROUP_CONCAT(DISTINCT REL,' ',IFNULL(RELA,'null') SEPARATOR ',') as rel_all from umls.mrrel as r 42 | inner join tmp_pairs as ret 43 | on (r.CUI1=ret.cui and r.CUI2 = ret.cui_tag ) 44 | or (r.CUI2=ret.cui and r.CUI1=ret.cui_tag ) 45 | GROUP BY cui,cui_tag 46 | ) as temp 47 | on (temp.cui=t.code and temp.blogId=t.instance_id) 48 | set t.rel_all = temp.rel_all, t.cui_tag= temp.cui_tag 49 | where t.rel_all is null 50 | ; 51 | 52 | update content_tag_ytex_noseedtag as t 53 | inner join 54 | (select ret.blogId, ret.cui,ret.cui_tag,GROUP_CONCAT(DISTINCT REL,' ',IFNULL(RELA,'null') SEPARATOR ',') as rel_all from umls.mrrel as r 55 | inner join tmp_pairs as ret 56 | on (r.CUI1=ret.cui and r.CUI2 = ret.cui_tag ) 57 | or (r.CUI2=ret.cui and r.CUI1=ret.cui_tag ) 58 | GROUP BY blogId,cui,cui_tag 59 | ) as temp 60 | on (temp.cui=t.code and temp.blogId=t.instance_id) 61 | set t.rel_all = temp.rel_all, t.cui_tag= temp.cui_tag 62 | where t.rel_all is null 63 | ; 64 | 65 | select * from content_tag_ytex_noseedtag where cui_tag is not null; 66 | -------------------------------------------------------------------------------- /sql-script/minsook_1229.sql: -------------------------------------------------------------------------------- 1 | select C.sab, C.code, group_concat(distinct C.str), count(*) 2 | 3 | from 4 | 5 | (select A.code, A.str, B.sab 6 | 7 | FROM 8 | 9 | (select distinct c.instance_id, c.sentence_text, c.code, m.str from ret1007.content_tag_ytex c, umls.mrconso m 10 | 11 | where c.code = m.cui and m.lat = 'ENG' and c.disambiguated=1 and c.disease='diabetes'and m.TS='P' AND m.stt='PF' AND m.ispref='Y') as A, 12 | 13 | (select distinct cui, sab from umls.mrconso where lat='ENG') AS B 14 | 15 | WHERE A.code = B.cui) as C 16 | 17 | group by C.sab, C.code 18 | 19 | order by C.sab, count(*) desc 20 | into outfile '/tmp/minsook_1230.ret' fields terminated by '\t' enclosed by '"' lines terminated by '\n'; 21 | 22 | 23 | 24 | select C.sab, C.code, group_concat(distinct C.str), count(*) 25 | 26 | from 27 | 28 | (select A.code, A.str, B.sab 29 | 30 | FROM 31 | 32 | (select distinct c.instance_id, c.sentence_text, c.code, m.str from retyahoo.content_tag_ytex_yahoo_question c, umls.mrconso m 33 | 
34 | where c.code = m.cui and m.lat = 'ENG' and c.disambiguated=1 and m.TS='P' AND m.stt='PF' AND m.ispref='Y') as A, 35 | 36 | (select distinct cui, sab from umls.mrconso where lat='ENG') AS B 37 | 38 | WHERE A.code = B.cui) as C 39 | 40 | group by C.sab, C.code 41 | 42 | order by C.sab, count(*) desc 43 | into outfile '/tmp/minsook_1230_question.ret' fields terminated by '\t' enclosed by '"' lines terminated by '\n'; 44 | 45 | 46 | 47 | select C.sab, C.code, group_concat(distinct C.str), count(*) 48 | 49 | from 50 | 51 | (select A.code, A.str, B.sab 52 | 53 | FROM 54 | 55 | (select distinct c.instance_id, c.sentence_text, c.code, m.str from retyahoo.content_tag_ytex_yahoo_answer c, umls.mrconso m 56 | 57 | where c.code = m.cui and m.lat = 'ENG' and c.disambiguated=1 and m.TS='P' AND m.stt='PF' AND m.ispref='Y') as A, 58 | 59 | (select distinct cui, sab from umls.mrconso where lat='ENG') AS B 60 | 61 | WHERE A.code = B.cui) as C 62 | 63 | group by C.sab, C.code 64 | 65 | order by C.sab, count(*) desc 66 | into outfile '/tmp/minsook_1230_answer.ret' fields terminated by '\t' enclosed by '"' lines terminated by '\n'; 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /sql-script/ner200.sql: -------------------------------------------------------------------------------- 1 | create database ner200 char set utf8; 2 | use ner200; 3 | drop table cancer_cui; 4 | create table cancer_cui like cancer.cancer_cui; 5 | 6 | drop table bioportal; 7 | drop table lvalue; 8 | drop table lvaluerake; 9 | drop table manual; 10 | CREATE TABLE bioportal ( 11 | `tid` varchar(30), 12 | `org_str` varchar(100), 13 | `sentence` text 14 | ); 15 | 16 | CREATE TABLE lvalue ( 17 | `tid` varchar(30), 18 | `org_str` varchar(100), 19 | `sentence` text 20 | ); 21 | 22 | CREATE TABLE lvaluerake ( 23 | `tid` varchar(30), 24 | `org_str` varchar(100), 25 | `sentence` text 26 | ); 27 | 28 | CREATE TABLE manual ( 29 | `tid` varchar(30), 30 | `major` varchar(100), 31 | `others` text, 32 | `durStr` varchar(100), 33 | `sentence` text 34 | ); 35 | 36 | 37 | 38 | load data local infile 'C:\\fsu\\ra\\data\\201601\\split_criteria\\1308_colorectal_trials_criteria_0413_ret.csv.cui' into table cancer_cui fields terminated by '\t' enclosed by '"' lines terminated by '\n' ignore 1 lines; 39 | load data local infile 'C:\\fsu\\ra\\data\\201601\\split_criteria\\1308_colorectal_trials_criteria_0413_ret.csv.mm.cui' into table cancer_mm_cui fields terminated by '\t' enclosed by '"' lines terminated by '\n' ignore 1 lines; 40 | 41 | load data local infile 'C:\\Users\\Jason\\Downloads\\bioportal.txt' into table bioportal fields terminated by '\t' enclosed by '"' lines terminated by '\r\n'; 42 | load data local infile 'C:\\Users\\Jason\\Downloads\\lvalue.txt' into table lvalue fields terminated by '\t' enclosed by '"' lines terminated by '\r\n'; 43 | load data local infile 'C:\\Users\\Jason\\Downloads\\lvaluerake.txt' into table lvaluerake fields terminated by '\t' enclosed by '"' lines terminated by '\r\n'; 44 | load data local infile 'C:\\Users\\Jason\\Downloads\\random_200_sentences_cancer_studies.txt' into table manual fields terminated by '\t' enclosed by '"' lines terminated by '\r\n' ignore 1 lines; 45 | select * from cancer_cui where length(cui)>0; 46 | select * from cancer_mm_cui; 47 | select * from bioportal; 48 | select * from lvalue; 49 | select * from lvaluerake; 50 | select * from manual where cui is not null; 51 | delete from manual where length(tid) < 1; 52 | delete from 
bioportal where length(tid) < 1; 53 | delete from lvalue where length(tid) < 1; 54 | delete from lvaluerake where length(tid) < 1; 55 | 56 | alter table manual add column `cui` varchar(20); 57 | alter table bioportal add column `cui` varchar(20); 58 | alter table lvalue add column `cui` varchar(20); 59 | alter table lvaluerake add column `cui` varchar(20); 60 | update manual m set cui=(select u.cui from umls.mrconso u,umls.mrsty s where u.str=m.major and u.cui=s.cui and s.tui in ("T200","T020","T190","T049","T019","T047","T050","T037","T048","T191","T046","T184","T060","T065","T058","T059","T063","T062","T061") limit 1); 61 | update bioportal m set cui=(select u.cui from umls.mrconso u,umls.mrsty s where u.str=m.org_str and u.cui=s.cui and s.tui in ("T200","T020","T190","T049","T019","T047","T050","T037","T048","T191","T046","T184","T060","T065","T058","T059","T063","T062","T061") limit 1); 62 | update lvalue m set cui=(select u.cui from umls.mrconso u,umls.mrsty s where u.str=m.org_str and u.cui=s.cui and s.tui in ("T200","T020","T190","T049","T019","T047","T050","T037","T048","T191","T046","T184","T060","T065","T058","T059","T063","T062","T061") limit 1); 63 | update lvaluerake m set cui=(select u.cui from umls.mrconso u,umls.mrsty s where u.str=m.org_str and u.cui=s.cui and s.tui in ("T200","T020","T190","T049","T019","T047","T050","T037","T048","T191","T046","T184","T060","T065","T058","T059","T063","T062","T061") limit 1); 64 | 65 | select distinct tid from cancer_cui where pattern!= 'CUI_ALL'; 66 | select distinct pattern from cancer_cui; 67 | 68 | select distinct c.tid,c.org_str,c.sentence from cancer_cui c, cancer_mm_cui m where c.org_str = m.org_str and c.tid = m.tid; 69 | select distinct c.tid,c.org_str,c.sentence from cancer_cui c, bioportal m where c.org_str = m.major and c.tid = m.tid; 70 | 71 | select distinct m.tid,m.major,m.sentence from cancer_cui c, manual m where c.tid = m.tid and length(m.cui)>0 and c.`group`='CUI_DISEASE_MAIN' and instr(c.org_str,m.major) > 0; 72 | select distinct m.tid,m.major,m.sentence from cancer_cui c, manual m where c.tid = m.tid and c.cui is not null and m.cui is not null and instr(c.org_str,m.major) > 0; 73 | select distinct m.tid,m.major,m.sentence from cancer_mm_cui c, manual m where c.tid = m.tid and c.cui is not null and m.cui is not null and instr(c.org_str,m.major) > 0; 74 | select distinct m.tid,m.major,m.sentence from bioportal c, manual m where c.tid = m.tid and c.cui is not null and m.cui is not null and instr(c.org_str,m.major) > 0; 75 | select distinct m.tid,m.major,m.sentence from lvalue c, manual m where c.tid = m.tid and c.cui is not null and m.cui is not null and instr(c.org_str,m.major) > 0; 76 | select distinct m.tid,m.major,m.sentence from lvaluerake c, manual m where c.tid = m.tid and c.cui is not null and m.cui is not null and instr(c.org_str,m.major) > 0; 77 | 78 | select * from cancer_cui; 79 | 80 | select SUM(`group` like '%_PT'),SUM(`group` like '%_AF') from cancer_cui; -------------------------------------------------------------------------------- /sql-script/pattern_all.sql: -------------------------------------------------------------------------------- 1 | use compact_092316; 2 | 3 | select D.Disease, count(*) as cnt from cancer_cui C join all_diseases_trials D 4 | on C.tid=D.tid and C.month>-1 and nested!='nesting' 5 | group by D.Disease order by cnt desc; -------------------------------------------------------------------------------- /sql-script/pattern_all_disease.sql: 
--------------------------------------------------------------------------------
1 | 
2 | SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='cancer_cui';
3 | 
4 | use compact_092316;
5 | 
6 | alter table cancer_cui add column var boolean after tags;
7 | 
8 | create table variable (var varchar(200));
9 | create index idx_var on variable(var) using hash;
10 | 
11 | create index idx_tid on all_diseases_trials(TID) using hash;
12 | create index idx_disease on all_diseases_trials(disease) using hash;
13 | create index idx_sty on cancer_cui(sty) using hash;
14 | create index idx_nested on cancer_cui(nested) using hash;
15 | 
16 | alter table meta add primary key (tid);
17 | create index idx_status on meta (overall_status);
18 | create index idx_min_age2 on meta (minimum_age_in_year);
19 | create index idx_max_age2 on meta (maximum_age_in_year);
20 | create index idx_phase on meta (phase);
21 | create index idx_int on meta (intervention_type);
22 | create index idx_std on meta (study_type);
23 | 
24 | select count(*) from cancer_cui; -- 6179540
25 | select count(*) from meta; -- 225364
26 | select count(*) from all_diseases_trials; -- 1016579
27 | 
28 | select tid, count(*) as cnt from meta group by tid order by cnt desc; -- one to one
29 | select maximum_age_in_year, count(*) as cnt from meta group by maximum_age_in_year order by cnt desc;
30 | 
31 | select count(*) from meta where study_type ='Interventional' and (STR_TO_DATE(start_date,'%M %Y') >= STR_TO_DATE('JANUARY 2000','%M %Y'))
32 | and (STR_TO_DATE(start_date,'%M %Y') <= STR_TO_DATE('SEPTEMBER 2016','%M %Y')); -- 172245
33 | select count(distinct tid) from all_diseases_trials where disease ='diabetes-mellitus-type-2'; -- 5000
34 | 
35 | CREATE TABLE T2DM_0100_0916 AS (select * from meta where study_type ='Interventional' and (STR_TO_DATE(start_date,'%M %Y') >= STR_TO_DATE('JANUARY 2000','%M %Y'))
36 | and (STR_TO_DATE(start_date,'%M %Y') <= STR_TO_DATE('SEPTEMBER 2016','%M %Y'))
37 | and tid in (select distinct tid from all_diseases_trials where disease ='diabetes-mellitus-type-2')); -- 4201
38 | create table T2DM_CUI as (select C.* from cancer_cui C join T2DM_0100_0916 T on C.tid=T.tid ); -- 121341
39 | 
40 | select var, count(*) from cancer_cui group by var;
41 | 
42 | -- Results to be reported:
43 | -- Rank of the diseases by the number of criteria with temporal constraints, (# of umls terms with temporal constraints)
44 | select T.disease, count(*) as cnt from cancer_cui C join all_diseases_trials T on C.tid=T.tid and C.month >= 0 and C.nested != 'nested' and C.var is null group by T.disease order by cnt desc;
45 | -- Rank of the diseases by the average number of criteria with temporal constraints per trial
46 | select T.disease, count(distinct tid) as cnt from all_diseases_trials T group by T.disease order by cnt desc;
47 | -- Distribution of semantic types (overall)
48 | select sty, count(*) as cnt from cancer_cui where nested != 'nested' and var is null group by sty order by cnt desc;
49 | -- Frequency of temporal patterns (overall)
50 | select month, count(*) as cnt from cancer_cui where nested != 'nested' and var is null group by month order by cnt desc;
51 | 
52 | 
53 | select * from cancer_cui where month<-1 ;
54 | 
55 | update cancer_cui d join compact_092316.variable v on (v.var=d.cui_str or v.var=d.org_str) set d.var = true;
56 | 
57 | 
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 
64 | 
-------------------------------------------------------------------------------- /sql-script/pattern_sty_prefer.sql:
--------------------------------------------------------------------------------
1 | use cancer;
2 | select * from cancer_cui where skipNum>0 and length(method)>0 and method != 'fullDep' and sentence not like '% or %';
3 | select distinct method from cancer_cui;
4 | select org_str, sentence from cancer_cui where sentence='Active gastrointestinal tract disease with malabsorption syndrome.';
5 | 
6 | select * from noncui order by freq desc;
7 | select distinct sentence from cancer_cui where splitType='No';
8 | 
9 | create database cancer_more_sty char set utf8;
10 | use cancer_more_sty;
11 | create table cancer_cui like cancer.cancer_cui;
12 | create table noncui like cancer.noncui;
13 | create table cancer_metamap_cui like cancer.cancer_metamap_cui;
14 | 
15 | use ner200;
16 | -- get the sty pairs such that a term belongs to both of them
17 | select A.sty,B.sty, count(*) cnt from cancer_cui A, cancer_cui B where length(A.cui)>0 and length(B.cui)>0 and A.tid=B.tid and A.criteriaId=B.criteriaId and A.sentId=B.sentId and A.org_str=B.org_str and A.sty > B.sty group by A.sty,B.sty order by cnt desc;
18 | 
19 | select count(*) from cancer_more_sty.cancer_cui;
20 | 
21 | select * from sty_prefer_cui where sty_prefer_cui.sty1=null ;
22 | select sty_prefer_cui.sty1=null from sty_prefer_cui;
23 | 
24 | alter table cancer_cui add column flag int after sty;
25 | create table sty_prefer_orgstr like sty_prefer_cui;
26 | 
27 | update sty_prefer_orgstr,sty_prefer_cui set sty_prefer_orgstr.prefer=sty_prefer_cui.prefer,sty_prefer_orgstr.reason=sty_prefer_cui.reason where sty_prefer_orgstr.sty1=sty_prefer_cui.sty1 and sty_prefer_orgstr.sty2=sty_prefer_cui.sty2;
28 | 
29 | select * from sty_prefer_orgstr;
30 | 
31 | select A.sty,B.sty, count(*) cnt from cancer_cui A, cancer_cui B where length(A.cui)>0 and length(B.cui)>0 and A.tid=B.tid and A.criteriaId=B.criteriaId and A.sentId=B.sentId and A.org_str=B.org_str and A.sty > B.sty group by A.sty,B.sty order by cnt desc;
32 | 
33 | update cancer_cui A,cancer_cui B,cancer_more_sty.sty_prefer_cui C set A.sty_ignored = true where length(A.cui)>0 and length(B.cui)>0 and A.tid=B.tid and A.criteriaId=B.criteriaId and A.sentId=B.sentId and A.org_str=B.org_str and A.sty > B.sty and C.prefer != A.sty and ((A.sty=C.sty1 and C.sty2=B.sty) or (A.sty=C.sty2 and C.sty1=B.sty));
34 | 
35 | select * from cancer_cui A where sty='T116';
36 | -- update cancer_cui set flag=null;
37 | select sty, count(*) cnt from cancer_cui group by sty order by cnt desc;
38 | create table sty_prefer_orgstr like cancer_more_sty.sty_prefer_orgstr;
39 | select * from sty_prefer_orgstr where length(prefer)>0;
40 | select * from cancer_more_sty.sty_prefer_orgstr;
41 | update sty_prefer_orgstr A, cancer_more_sty.sty_prefer_cui B set A.prefer=B.prefer,A.reason=B.reason where A.sty1=B.sty1 and A.sty2=B.sty2;
42 | 
43 | select * from cancer_cui where sty_ignored is null;
44 | select count(distinct org_str) from cancer_cui;
45 | 
-------------------------------------------------------------------------------- /sql-script/ret-yahoo.sql: --------------------------------------------------------------------------------
1 | create database if not exists retyahoo character set utf8;
2 | use retyahoo;
3 | 
4 | select * from content_tag_ytex_yahoo;
5 | 
6 | select count(distinct instance_id) from content_tag_ytex_yahoo;
7 | select count(distinct id) from org_yahoo;
8 | 
9 | drop table content_tag_ytex_yahoo;
10 | rename table retyahoo.org_yahoo to ytex.org_yahoo;
11 | 
12 | 
13 | use ytex;
14 | select * from ytex.org_yahoo;
15 | select
max(id) from (select id from org_yahoo where id>= 0 and id < 120753 order by id limit 5000) a; -- the max(id) queries below walk org_yahoo in 5000-row batches and record the last id of each batch
16 | -- 120753
17 | select max(id) from (select id from org_yahoo where id>= 120753 and id < 334572 order by id limit 5000) a;
18 | -- 334572
19 | select max(id) from (select id from org_yahoo where id>= 334572 and id < 640612 order by id limit 5000) a;
20 | -- 640612
21 | select max(id) from (select id from org_yahoo where id>= 640612 and id < 925081 order by id limit 5000) a;
22 | -- 925081
23 | select max(id) from (select id from org_yahoo where id>= 925081 and id < 1219340 order by id limit 5000) a;
24 | -- 1219340
25 | select max(id) from (select id from org_yahoo where id>= 1219340 and id < 1664699 order by id limit 5000) a;
26 | -- 1664699
27 | select max(id) from (select id from org_yahoo where id>= 1664699 and id < 1994240 order by id limit 5000) a;
28 | -- 1994240
29 | select max(id) from (select id from org_yahoo where id>= 1994240 and id < 2685340 order by id limit 5000) a;
30 | -- 2685340
31 | select max(id) from (select id from org_yahoo where id>= 2685340 and id < 3079989 order by id limit 5000) a;
32 | -- 3079989
33 | select max(id) from (select id from org_yahoo where id>= 3079989 and id < 3440146 order by id limit 5000) a;
34 | -- 3440146
35 | select max(id) from (select id from org_yahoo where id>= 3440146 and id < 3680908 order by id limit 5000) a;
36 | -- 3680908
37 | select max(id) from (select id from org_yahoo where id>= 3680908 and id < 1000000000 order by id limit 5000) a;
38 | -- 3822084
39 | 
40 | 
41 | select count(id) from org_yahoo where id>= 640612 and id < 925081 order by id;
42 | select count(id) from org_yahoo where id>= 3680908 and id < 1000000000;
43 | 
44 | select count(distinct document_id) from document;
45 | 
-------------------------------------------------------------------------------- /sql-script/sent_1213.sq..sql: --------------------------------------------------------------------------------
1 | use ytex;
2 | 
3 | /*
4 | sed -i -- 's/concat(subject, " . ", content)/chosenanswer/g' *.xml
5 | sed -i -- 's/question<\/string>/answer<\/string>/g' *.xml
6 | 
7 | */
8 | 
9 | create table yahootumblr (
10 | id bigint(20) default null,
11 | content text default null
12 | );
13 | 
14 | insert yahootumblr (id,content) select distinct blogId, text_content from content_org_new;
15 | insert yahootumblr (id,content) select distinct id, concat(subject, ". ", content, ". 
", chosenanswer) from org_yahoo; 16 | select * from yahootumblr; 17 | delete from yahootumblr where id is null; 18 | 19 | -- question 20 | select count(*) from anno_sentence; -- 267617 21 | select count(sentence_text) from v_document_cui_sent; -- 399076 22 | select count(distinct sentence_text) from v_document_cui_sent; -- 142802 23 | 24 | rename table retyahoo.org_yahoo to ytex.org_yahoo; 25 | select count(distinct id) from org_yahoo; 26 | select count(distinct blogId) from content_org_new; 27 | 28 | -- question 29 | select count(*) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'question'; -- 267617 30 | select count(distinct substr(`d`.`doc_text`,(`b`.`span_begin` + 1),(`b`.`span_end` - `b`.`span_begin`))) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'question'; -- 249013 31 | 32 | -- answer 33 | select count(*) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'answer'; -- 428881 34 | select count(distinct substr(`d`.`doc_text`,(`b`.`span_begin` + 1),(`b`.`span_end` - `b`.`span_begin`))) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'answer'; -- 348793 35 | 36 | -- blog 37 | select count(*) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'blog'; -- 52551 38 | select count(distinct substr(`d`.`doc_text`,(`b`.`span_begin` + 1),(`b`.`span_end` - `b`.`span_begin`))) from anno_sentence s, anno_base b,document d where s.anno_base_id = b.anno_base_id and b.document_id=d.document_id and d.analysis_batch = 'blog'; -- 47413 39 | 40 | select distinct blogId from content_org_new where blogId >=116473932370 order by blogId limit 1000; 41 | 42 | -------------------------------------------------------------------------------- /sql-script/smb.sql: -------------------------------------------------------------------------------- 1 | 2 | use ret1007; 3 | 4 | select * from content_tag_disease_ytex_T047_unique_output order by blogId,sentence 5 | into outfile '/tmp/ret-final-1007.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n'; 6 | select * from content_tag_ytex order by instance_id,sentence_text 7 | into outfile '/tmp/ret-basic-1007.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n'; 8 | 9 | 10 | 11 | select * from co_occur ; 12 | 13 | use ret1018; 14 | select * from content_tag_disease_ytex_unique_output; 15 | 16 | 17 | select * from ret1018.content_tag_disease_ytex_unique_output; 18 | 19 | alter table ret1007.content_tag_ytex add column `sab` varchar(200) after `cui`; 20 | 21 | update ret1007.content_tag_ytex as cty 22 | set cty.sab= ( 23 | select group_concat(distinct sab separator ',') from umls.mrconso as con 24 | where cty.cui = con.cui 25 | group by cty.cui 26 | ); 27 | select * from ret1007.content_tag_ytex; 28 | 29 | rename table ret.content_tag_increased_target to ret1018.content_tag_increased_target; 30 | -------------------------------------------------------------------------------- /sql-script/socialqa.sql: -------------------------------------------------------------------------------- 1 | 2 | use socialqa; 3 | 4 | select count(*) from socialqa.qdataH; -- 3822256 5 | select distinct top_level_category 
from socialqa.qdataH; -- 29
6 | select count(*) from socialqa.qdataH where top_level_category='Health'; -- 2820179
7 | 
8 | create table health_answers_for_8000_questions as (select a.qid,a.content,a.rating, a.userid,a.usernick from adataH a, health_questions_random_8000 q where a.qid=q.qid);
9 | select * from socialqa.qdataH;
10 | select count(*) from `health_questions_random_8000`;
11 | 
12 | -- pick 8000 questions and their answers.
13 | set group_concat_max_len=1024000;
14 | select q.qid, replace(group_concat(q.content,'\n',a.content),'\r','')
15 | from `health_questions_random_8000` q, `health_answers_for_8000_questions` a
16 | where a.qid=q.qid group by qid
17 | into outfile '/tmp/socialqa_8000.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
18 | select q.qid,q.content, q.userid
19 | from `health_questions_random_8000` q
20 | union all
21 | select a.qid,a.content,a.userid
22 | from `health_answers_for_8000_questions` a
23 | into outfile '/tmp/socialqa_8000_uid.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
24 | 
25 | 
26 | select U.userid, count(*) as cnt from
27 | (select userid from health_questions_random_8000
28 | union all
29 | select userid from health_answers_for_8000_questions ) as U
30 | group by U.userid
31 | order by cnt desc;
32 | select userid,count(*) as cnt from health_answers_for_8000_questions group by userid order by cnt desc;
33 | 
34 | -- split all answers into multiple files.
35 | select id, replace(concat(subject, ' ', content,' ',chosenanswer),'\r','') from socialqa.qdataH
36 | where id > 3822256/4*0 and id <= 3822256/4*1
37 | into outfile '/tmp/socialqa_dataset1.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
38 | select id, replace(concat(subject, ' ', content,' ',chosenanswer),'\r','') from socialqa.qdataH
39 | where id > 3822256/4*1 and id <= 3822256/4*2
40 | into outfile '/tmp/socialqa_dataset2.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
41 | select id, replace(concat(subject, ' ', content,' ',chosenanswer),'\r','') from socialqa.qdataH
42 | where id > 3822256/4*2 and id <= 3822256/4*3
43 | into outfile '/tmp/socialqa_dataset3.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
44 | select id, replace(concat(subject, ' ', content,' ',chosenanswer),'\r','') from socialqa.qdataH
45 | where id > 3822256/4*3 and id <= 3822256/4*4 + 4
46 | into outfile '/tmp/socialqa_dataset4.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
47 | 
48 | select count(*) from socialqa.qdataH;
49 | rename table umls._target_term_ to umls._target_term_botanical_;
50 | select * from umls._target_term_botanical_;
51 | 
52 | 
-------------------------------------------------------------------------------- /sql-script/somelab-sctGraph.sql: --------------------------------------------------------------------------------
1 | show databases;
2 | use ret;
3 | use umls;
4 | use gg;
5 | show tables;
6 | select * from content_tag_compare_same;
7 | select count(distinct cui) from umls.mrconso;
8 | 
9 | create database gg character set utf8;
10 | 
11 | drop table social;
12 | CREATE TABLE `social` (
13 | `stt` varchar(300) DEFAULT NULL,
14 | `sty` varchar(300) DEFAULT NULL,
15 | `ptr` varchar(300) DEFAULT NULL,
16 | `aui` varchar(9) DEFAULT NULL,
17 | `sid` bigint DEFAULT NULL,
18 | `fsn` varchar(500) DEFAULT NULL
19 | ) CHARSET=utf8;
20 | 
21 | load data local infile 'C:\\fsu\\ra\\data\\graph-group\\SNOMED_database\\SNOMEDCT_SOCIAL_CONTEXT_with_PATH.tsv'
22 | into table
social
23 | fields terminated by '\t'
24 | -- enclosed by '"'
25 | lines terminated by '\n'
26 | ignore 1 lines;
27 | 
28 | insert observe (stt,sty,ptr,aui) values ('aaa','bbb','111.222.333','444');
29 | insert observe (stt,sty,ptr,aui) values ('aaa','bbb','555.666.777','333');
30 | use gg;
31 | 
32 | ALTER TABLE observe add ( pt_aui varchar(30) default null);
33 | /*get the preferred term by SNOMED CT code from mrconso. Time-consuming: about 4 hours.*/
34 | update observe as o
35 | set o.pt_aui = (
36 | select aui from umls.mrconso where SAB='SNOMEDCT_US' AND TTY = 'PT' AND CODE = o.sid);
37 | ALTER TABLE social add ( pt_aui varchar(30) default null);
38 | UPDATE social AS s
39 | inner join umls.mrconso AS con
40 | ON con.SAB='SNOMEDCT_US' AND con.TTY = 'PT' AND con.CODE = s.sid
41 | SET s.pt_aui = con.AUI
42 | ;
43 | ALTER TABLE observe add ( pt_aui_str varchar(1000) default null);
44 | UPDATE observe AS s
45 | inner join umls.mrconso AS con
46 | ON s.pt_aui = con.AUI
47 | SET s.pt_aui_str = con.STR
48 | ;
49 | ALTER TABLE social add ( pt_aui_str varchar(1000) default null);
50 | UPDATE social AS s
51 | inner join umls.mrconso AS con
52 | ON s.pt_aui = con.AUI
53 | SET s.pt_aui_str = con.STR
54 | ;
55 | 
56 | set group_concat_max_len=102400000;
57 | select count(*) from observe group by stt,sty;
58 | 
59 | create table observe_group as
60 | select o1.stt, o1.sty, count(*) as cnt, group_concat(distinct o1.pt_aui, ' ', IFNULL(o2.pt_aui, 'null') separator ',co_occur') from
61 | (select stt,sty,ptr,pt_aui from observe o) o1
62 | left join observe o2
63 | on o1.stt=o2.stt and o1.sty=o2.sty
64 | and o1.ptr regexp concat('.*',o2.pt_aui,'$')
65 | group by o1.stt,o1.sty
66 | ;
67 | 
68 | /*create the grouped table of (AUI, parent AUI) pairs*/
69 | drop table if exists observe_group;
70 | create table observe_group as
71 | select o1.stt, o1.sty, count(distinct o1.pt_aui) as cnt_all, count(distinct o1.pt_aui,o2.pt_aui) as cnt_parent,
72 | group_concat(distinct o1.pt_aui, '\t', IFNULL(o2.pt_aui, 'null') separator '`') as pairs,
73 | group_concat(distinct o1.pt_aui, '\t', IFNULL(o1.pt_aui_str, 'null') separator '`') as pairs_str1,
74 | group_concat(distinct o2.pt_aui, '\t', IFNULL(o2.pt_aui_str, 'null') separator '`') as pairs_str2
75 | from observe o1
76 | left join observe o2
77 | on o1.stt=o2.stt and o1.sty=o2.sty
78 | and o1.ptr regexp concat('.*',o2.pt_aui,'$')
79 | group by o1.stt,o1.sty
80 | ;
81 | drop table if exists social_group;
82 | create table social_group as
83 | select o1.stt, o1.sty, count(distinct o1.pt_aui) as cnt_all, count(distinct o1.pt_aui,o2.pt_aui) as cnt_parent,
84 | group_concat(distinct o1.pt_aui, '\t', IFNULL(o2.pt_aui, 'null') separator '`') as pairs,
85 | group_concat(distinct o1.pt_aui, '\t', IFNULL(o1.pt_aui_str, 'null') separator '`') as pairs_str1,
86 | group_concat(distinct o2.pt_aui, '\t', IFNULL(o2.pt_aui_str, 'null') separator '`') as pairs_str2
87 | from social o1
88 | left join social o2
89 | on o1.stt=o2.stt and o1.sty=o2.sty
90 | and o1.ptr regexp concat('.*',o2.pt_aui,'$')
91 | group by o1.stt,o1.sty
92 | ;
93 | 
94 | select * from observe_group order by cnt_parent;
95 | select * from social_group order by cnt_parent;
96 | 
97 | select * from observe_group order by cnt_parent into outfile '/tmp/observe_group.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
98 | select * from social_group order by cnt_parent into outfile '/tmp/social_group.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n';
99 | 
100 | 
101 | use ret;
102 | 
103 | select * from content_tag_ytex_T047_unique
; 104 | 105 | 106 | select distinct A.target2 from (select * from co_occur where sab1 is not null and sab2 is null) A; 107 | select * from co_occur where sab1 is not null and sab2 is null; 108 | 109 | select count(distinct target) from content_tag_our_T047_unique; 110 | select count(distinct target) from content_tag_ytex_T047_unique; 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /sql-script/synonym.sql: -------------------------------------------------------------------------------- 1 | create database synonym char set utf8; 2 | use synonym; 3 | create table test_term_umls ( 4 | term varchar(100), 5 | cui varchar(10), 6 | synonym text 7 | ); 8 | 9 | create table wiki_ngram like chv.cancer_ngram; 10 | create table wiki_ngram_tf5 like wiki_ngram; 11 | 12 | rename table socialqa.wiki_ngram to synonym.wiki_ngram; 13 | 14 | load data local infile '/tmp/wiki.tf5.ngram.novector' into table wiki_ngram_tf5 fields terminated by '\t' enclosed by '"' lines terminated by '\r\n' ignore 1 lines; 15 | 16 | 17 | set group_concat_max_len=10240; 18 | select * from test_term_umls; 19 | -- synonym with at most 3 words 20 | truncate test_term_umls; 21 | insert into test_term_umls (term,cui) select distinct ngram, cui_umls from wiki_ngram; 22 | -- update test_term_umls t set synonym = (select GROUP_CONCAT(distinct s.descr SEPARATOR '|' ) from umls._target_term_ s where cui=t.cui and (length(descr)-length(replace(descr,' ', '')))<= 2); 23 | 24 | update test_term_umls t set synonym = (select GROUP_CONCAT(distinct s.str SEPARATOR '|' ) from umls.mrconso s where s.cui=t.cui and (s.sab like 'SNOMEDCT_US%' or s.sab like 'RXNORM%' OR s.sab like 'ICD%' OR s.sab like 'NCI%' OR s.sab like 'LOINC%') and (length(str)-length(replace(str,' ', '')))<= 3); 25 | 26 | 27 | select * from umls._target_term_ ; 28 | select * from umls.mrconso where cui='C0439234'; 29 | select count(*) from wiki_ngram; 30 | select * from test_term_umls 31 | into outfile '/tmp/freq_term.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n'; 32 | 33 | select distinct sab from umls.mrconso; 34 | select ngram from wiki_ngram_tf5 where n>1 35 | into outfile '/tmp/wiki_ngram_tf5.txt'; 36 | 37 | select cvalue, count(*) as cnt from wiki_ngram group by cvalue order by cvalue; 38 | select count(*) from wiki_ngram where tf>=100 and ngram like '%(A)%'; 39 | select count(*) from wiki_ngram_tf5 where tf>=100 and ngram like '%(A)%' order by tf; 40 | select * from test_term_umls where term like '%mother%'; 41 | -------------------------------------------------------------------------------- /sql-script/umls.sql: -------------------------------------------------------------------------------- 1 | use umls; 2 | 3 | select * from MRCONSO where cui = 'C1875802' limit 2000 ; 4 | select count(*) from mrsmap; 5 | select * from mrcols where col='STY'; 6 | 7 | select count(distinct cui) from mrconso; 8 | 9 | SHOW FULL PROCESSLIST; 10 | 11 | select * from mrsty; 12 | select * from mrrel where rel is null; 13 | 14 | select distinct sab from umls.mrconso; 15 | 16 | show processlist; 17 | 18 | select CUI, AUI, SAB, STR from MRCONSO where LAT = 'ENG' into outfile 'c:/fsu/all.csv' fields terminated by ',' enclosed by '"' lines terminated by '\n'; 19 | 20 | select * from mrconso where stt='PF'; 21 | 22 | select CUI, COUNT(*) FROM MRSTY GROUP BY CUI; 23 | 24 | desc mrsty; 25 | 26 | select sab, count(*) from mrconso group 
by sab; 27 | 28 | select count(*) from mrconso where str like '%diabetes%' or str like '%type 1 diabetes%' or str like '%type 2 diabetes%'; 29 | /*4021 result*/ 30 | 31 | select distinct cui from mrconso where str like '%diabetes%' or str like '%type 1 diabetes%' or str like '%type 2 diabetes%'; 32 | /*more than 1k*/ 33 | 34 | select * from mrconso where sab ='SNOMEDCT_US' AND ( str like '%diabetes%' or str like '%type 1 diabetes%' or str like '%type 2 diabetes%'); 35 | 36 | select distinct cui from mrconso where str = 'diabetes' or str = 'type 1 diabetes' or str = 'type 2 diabetes'; 37 | /*4 results*/ 38 | select count(*) from mrrel where cui1 in ('C0011847','C0011849','C0011854','C0011860') or cui2 in ('C0011847','C0011849','C0011854','C0011860') ; 39 | /*5994 result*/ 40 | 41 | create view rel_diabetes as select * from mrrel where cui1 in ('C0011847','C0011849','C0011854','C0011860') or cui2 in ('C0011847','C0011849','C0011854','C0011860') ; 42 | 43 | create view content_rel as select distinct c.cui from CONTENT_TAG c inner join rel_diabetes r 44 | on c.cui = r.cui1 COLLATE utf8_unicode_ci or c.cui = r.cui2 COLLATE utf8_unicode_ci; 45 | 46 | select count(distinct CUI) from MRCONSO; 47 | select * from tmp_rel_diabetes; 48 | select * from CONTENT_TAG; 49 | delete from content_tag where blogId='post_id'; 50 | 51 | 52 | 53 | drop table if exists CONTENT_ORG; 54 | CREATE TABLE CONTENT_ORG ( 55 | `blogId` BIGINT(20) DEFAULT NULL, /*----blog id */ 56 | `post_hashtag` varchar(300) DEFAULT NULL, 57 | `blog_name` varchar(200) DEFAULT NULL, 58 | `text_link_title` varchar(500) DEFAULT NULL, 59 | `text_content` varchar(10000) DEFAULT NULL 60 | ); 61 | 62 | truncate CONTENT_ORG; 63 | 64 | /*load result in to the table*/ 65 | load data local infile 'C:\\fsu\\ra\\UmlsTagger\\data\\data_content_tag_diabetes_0821.csv' 66 | into table CONTENT_ORG 67 | fields terminated by ',' 68 | enclosed by '"' 69 | lines terminated by '`' 70 | ; 71 | 72 | select *, length(text_content) AS len from content_org ; 73 | select count(distinct blogid) from content_org; 74 | desc content_org; 75 | 76 | select blogId instance_id from umls.CONTENT_ORG; 77 | -- select text_content note_text from umls.CONTENT_ORG where blogId = :instance_id; 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /sql-script/usuk.sql: -------------------------------------------------------------------------------- 1 | create database usuk char set utf8; 2 | use usuk; 3 | 4 | create table UK_T2DM_Trials AS 5 | (select tid, criteria from compact_092316.trials where 6 | (STR_TO_DATE(start_date,'%M %Y') >= STR_TO_DATE('JANUARY 2005','%M %Y')) 7 | and (STR_TO_DATE(start_date,'%M %Y') <= STR_TO_DATE('September 2016','%M %Y')) 8 | and study_type = 'Interventional' and tid in (select tid from compact_092316.all_diseases_trials where disease ='diabetes-mellitus-type-2') 9 | and tid in (select tid from compact_092316.authority where authority like '%United Kingdom%')); 10 | 11 | 12 | create table US_T2DM_Trials AS 13 | (select tid, criteria from compact_092316.trials where 14 | (STR_TO_DATE(start_date,'%M %Y') >= STR_TO_DATE('JANUARY 2005','%M %Y')) 15 | and (STR_TO_DATE(start_date,'%M %Y') <= STR_TO_DATE('September 2016','%M %Y')) 16 | and study_type = 'Interventional' and tid in (select tid from compact_092316.all_diseases_trials where disease ='diabetes-mellitus-type-2') 17 | and tid in (select tid from compact_092316.authority where authority like '%United States%')); 18 | 19 | select tid,criteria from 
US_T2DM_Trials; 20 | 21 | create table cancer_cui like cancer.cancer_cui; 22 | create table noncui like cancer.noncui; 23 | create table cancer_mm_cui like ner200.cancer_mm_cui; 24 | 25 | 26 | -------------------------------------------------------------------------------- /sql-script/yahoo.sql: -------------------------------------------------------------------------------- 1 | use ytex; 2 | drop table tmp_org_yahoo; 3 | CREATE TABLE TMP_ORG_yahoo ( 4 | `qid` varchar(50), 5 | `id` int(11), 6 | `category` varchar(50), 7 | `categoryId` int(11), 8 | `subject` text, 9 | `content` text, 10 | `day` varchar(22), 11 | `link` varchar(256), 12 | `userid` varchar(50), 13 | `usernick` varchar(200), 14 | `numanswers` int(10), 15 | `numcomments` int(10), 16 | `chosenanswer` text, 17 | `chosenanswererid` varchar(50), 18 | `chosenanswerernick` varchar(200), 19 | `chosenanswertimestamp` varchar(20) 20 | ); 21 | load data local infile 'C:\\fsu\\ra\\data\\qdataH.diabetes.all_58425.csv' 22 | into table TMP_ORG_yahoo 23 | fields terminated by ',' 24 | enclosed by '"' 25 | lines terminated by '\r\n' 26 | ignore 1 LINES 27 | ; 28 | 29 | select * from tmp_org_yahoo ; 30 | select count( distinct qid) from tmp_org_yahoo; 31 | 32 | -- ytex using sql: 33 | select distinct id INSTANCE_ID from ytex.TMP_ORG_yahoo where qid is not null; 34 | select concat(subject, ". ", content, ". ", chosenanswer) note_text from ytex.TMP_ORG_yahoo where id = :instance_id limit 1; 35 | select chosenanswer note_text from ytex.TMP_ORG_yahoo where id = :instance_id limit 1; 36 | 37 | drop table content_tag_ytex_yahoo_answer; 38 | create table ytex.content_tag_ytex_yahoo_answer as 39 | select yh.qid, a.anno_text, d.instance_id, c.* from v_document_cui_sent c 40 | inner join v_annotation a on c.anno_base_id = a.anno_base_id 41 | inner join v_document d on d.document_id = c.document_id 42 | inner join TMP_ORG_yahoo yh on yh.id = d.instance_id 43 | ; 44 | insert into ytex.content_tag_ytex_yahoo 45 | select yh.qid, a.anno_text, d.instance_id, c.* from v_document_cui_sent c 46 | inner join v_annotation a on c.anno_base_id = a.anno_base_id 47 | inner join v_document d on d.document_id = c.document_id 48 | inner join TMP_ORG_yahoo yh on yh.id = d.instance_id 49 | ; 50 | select count(distinct qid) from content_tag_ytex_yahoo where analysis_batch='answer'; 51 | -- q: 15378, a:25342, all:40720 52 | 53 | select distinct doc_text from document; 54 | select count(distinct anno_base_id) from anno_base; 55 | select * from anno_named_entity; 56 | select count(distinct code) from anno_ontology_concept; 57 | select count(distinct document_id) from content_tag_ytex_yahoo; 58 | select count(distinct qid) from content_tag_ytex_yahoo_answer; 59 | select count(distinct document_id) from v_document_cui_sent; 60 | 61 | select concat(subject, ". ", content, ". 
", chosenanswer) from TMP_ORG_yahoo; 62 | 63 | select * from ytex.TMP_ORG_yahoo where chosenanswer like '%article%'; 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /src/main/java/StanfordCoreNlpDemo.java: -------------------------------------------------------------------------------- 1 | 2 | import java.io.*; 3 | import java.util.*; 4 | 5 | import edu.stanford.nlp.coref.CorefCoreAnnotations; 6 | 7 | import edu.stanford.nlp.coref.data.CorefChain; 8 | import edu.stanford.nlp.io.*; 9 | import edu.stanford.nlp.ling.*; 10 | import edu.stanford.nlp.pipeline.*; 11 | import edu.stanford.nlp.semgraph.SemanticGraph; 12 | import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; 13 | import edu.stanford.nlp.sentiment.SentimentCoreAnnotations; 14 | import edu.stanford.nlp.trees.*; 15 | import edu.stanford.nlp.util.*; 16 | 17 | /** This class demonstrates building and using a Stanford CoreNLP pipeline. */ 18 | public class StanfordCoreNlpDemo { 19 | 20 | /** Usage: java -cp "*" StanfordCoreNlpDemo [inputFile [outputTextFile [outputXmlFile]]] */ 21 | public static void main(String[] args) throws IOException { 22 | // set up optional output files 23 | PrintWriter out; 24 | if (args.length > 1) { 25 | out = new PrintWriter(args[1]); 26 | } else { 27 | out = new PrintWriter(System.out); 28 | } 29 | PrintWriter xmlOut = null; 30 | if (args.length > 2) { 31 | xmlOut = new PrintWriter(args[2]); 32 | } 33 | 34 | // Create a CoreNLP pipeline. To build the default pipeline, you can just use: 35 | // StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 36 | // Here's a more complex setup example: 37 | // Properties props = new Properties(); 38 | // props.put("annotators", "tokenize, ssplit, pos, lemma, ner, depparse"); 39 | // props.put("ner.model", "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"); 40 | // props.put("ner.applyNumericClassifiers", "false"); 41 | // StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 42 | 43 | // Add in sentiment 44 | Properties props = new Properties(); 45 | props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment"); 46 | 47 | StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 48 | 49 | // Initialize an Annotation with some text to be annotated. The text is the argument to the constructor. 50 | Annotation annotation; 51 | if (args.length > 0) { 52 | annotation = new Annotation(IOUtils.slurpFileNoExceptions(args[0])); 53 | } else { 54 | annotation = new Annotation("Kosgi Santosh sent an email to Stanford University. He didn't get a reply."); 55 | } 56 | 57 | // run all the selected Annotators on this text 58 | pipeline.annotate(annotation); 59 | 60 | // this prints out the results of sentence analysis to file(s) in good formats 61 | pipeline.prettyPrint(annotation, out); 62 | if (xmlOut != null) { 63 | pipeline.xmlPrint(annotation, xmlOut); 64 | } 65 | 66 | // Access the Annotation in code 67 | // The toString() method on an Annotation just prints the text of the Annotation 68 | // But you can see what is in it with other methods like toShorterString() 69 | out.println(); 70 | out.println("The top level annotation"); 71 | out.println(annotation.toShorterString()); 72 | out.println(); 73 | 74 | // An Annotation is a Map with Class keys for the linguistic analysis types. 75 | // You can get and use the various analyses individually. 76 | // For instance, this gets the parse tree of the first sentence in the text. 
77 | List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); 78 | if (sentences != null && ! sentences.isEmpty()) { 79 | CoreMap sentence = sentences.get(0); 80 | out.println("The keys of the first sentence's CoreMap are:"); 81 | out.println(sentence.keySet()); 82 | out.println(); 83 | out.println("The first sentence is:"); 84 | out.println(sentence.toShorterString()); 85 | out.println(); 86 | out.println("The first sentence tokens are:"); 87 | for (CoreMap token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { 88 | out.println(token.toShorterString()); 89 | } 90 | Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class); 91 | out.println(); 92 | out.println("The first sentence parse tree is:"); 93 | tree.pennPrint(out); 94 | out.println(); 95 | out.println("The first sentence basic dependencies are:"); 96 | out.println(sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class).toString(SemanticGraph.OutputFormat.LIST)); 97 | out.println("The first sentence collapsed, CC-processed dependencies are:"); 98 | SemanticGraph graph = sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class); 99 | out.println(graph.toString(SemanticGraph.OutputFormat.LIST)); 100 | 101 | // Access coreference. In the coreference link graph, 102 | // each chain stores a set of mentions that co-refer with each other, 103 | // along with a method for getting the most representative mention. 104 | // Both sentence and token offsets start at 1! 105 | out.println("Coreference information"); 106 | Map<Integer, CorefChain> corefChains = 107 | annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class); 108 | if (corefChains == null) { return; } 109 | for (Map.Entry<Integer, CorefChain> entry: corefChains.entrySet()) { 110 | out.println("Chain " + entry.getKey()); 111 | for (CorefChain.CorefMention m : entry.getValue().getMentionsInTextualOrder()) { 112 | // We need to subtract one since the indices count from 1 but the Lists start from 0 113 | List<CoreLabel> tokens = sentences.get(m.sentNum - 1).get(CoreAnnotations.TokensAnnotation.class); 114 | // We subtract two for end: one for 0-based indexing, and one because we want last token of mention not one following. 115 | out.println(" " + m + ", i.e., 0-based character offsets [" + tokens.get(m.startIndex - 1).beginPosition() + 116 | ", " + tokens.get(m.endIndex - 2).endPosition() + ")"); 117 | } 118 | } 119 | out.println(); 120 | 121 | out.println("The first sentence overall sentiment rating is " + sentence.get(SentimentCoreAnnotations.SentimentClass.class)); 122 | } 123 | IOUtils.closeIgnoringExceptions(out); 124 | IOUtils.closeIgnoringExceptions(xmlOut); 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/com/votors/umls/graph/HelloJGraphT.java: -------------------------------------------------------------------------------- 1 | package com.votors.umls.graph; 2 | 3 | import java.net.*; 4 | import org.jgrapht.*; 5 | import org.jgrapht.graph.*; 6 | /** 7 | * A simple introduction to using JGraphT. 8 | * 9 | * @author Barak Naveh 10 | * @since Jul 27, 2003 11 | */ 12 | public final class HelloJGraphT 13 | { 14 | private HelloJGraphT() 15 | { 16 | } // ensure non-instantiability. 17 | 18 | /** 19 | * The starting point for the demo. 20 | * 21 | * @param args ignored.
22 | */ 23 | public static void main(String [] args) 24 | { 25 | UndirectedGraph<String, DefaultEdge> stringGraph = createStringGraph(); 26 | 27 | // note undirected edges are printed as: {<v1>,<v2>} 28 | System.out.println(stringGraph.toString()); 29 | 30 | // create a graph based on URL objects 31 | DirectedGraph<URL, DefaultEdge> hrefGraph = createHrefGraph(); 32 | 33 | // note directed edges are printed as: (<v1>,<v2>) 34 | System.out.println(hrefGraph.toString()); 35 | } 36 | 37 | /** 38 | * Creates a toy directed graph based on URL objects that represents link 39 | * structure. 40 | * 41 | * @return a graph based on URL objects. 42 | */ 43 | private static DirectedGraph<URL, DefaultEdge> createHrefGraph() 44 | { 45 | DirectedGraph<URL, DefaultEdge> g = 46 | new DefaultDirectedGraph<URL, DefaultEdge>(DefaultEdge.class); 47 | 48 | try { 49 | URL amazon = new URL("http://www.amazon.com"); 50 | URL yahoo = new URL("http://www.yahoo.com"); 51 | URL ebay = new URL("http://www.ebay.com"); 52 | 53 | // add the vertices 54 | g.addVertex(amazon); 55 | g.addVertex(yahoo); 56 | g.addVertex(ebay); 57 | 58 | // add edges to create linking structure 59 | g.addEdge(yahoo, amazon); 60 | g.addEdge(yahoo, ebay); 61 | } catch (MalformedURLException e) { 62 | e.printStackTrace(); 63 | } 64 | 65 | return g; 66 | } 67 | 68 | /** 69 | * Create a toy graph based on String objects. 70 | * 71 | * @return a graph based on String objects. 72 | */ 73 | private static UndirectedGraph<String, DefaultEdge> createStringGraph() 74 | { 75 | UndirectedGraph<String, DefaultEdge> g = 76 | new SimpleGraph<String, DefaultEdge>(DefaultEdge.class); 77 | 78 | String v1 = "v1"; 79 | String v2 = "v2"; 80 | String v3 = "v3"; 81 | String v4 = "v4"; 82 | 83 | // add the vertices 84 | g.addVertex(v1); 85 | g.addVertex(v2); 86 | g.addVertex(v3); 87 | g.addVertex(v4); 88 | 89 | // add edges to create a circuit 90 | g.addEdge(v1, v2); 91 | g.addEdge(v2, v3); 92 | g.addEdge(v3, v4); 93 | g.addEdge(v4, v1); 94 | 95 | return g; 96 | } 97 | } 98 | 99 | // End HelloJGraphT.java -------------------------------------------------------------------------------- /src/main/java/com/votors/umls/graph/IsaEdge.java: -------------------------------------------------------------------------------- 1 | package com.votors.umls.graph; 2 | 3 | import org.jgrapht.graph.*; 4 | 5 | import java.util.Objects; 6 | 7 | /** 8 | * Created by Jason on 2015/9/26 0026.
9 | */ 10 | public class IsaEdge extends DefaultEdge{ 11 | 12 | @Override public String toString () {return "";} 13 | @Override public boolean equals(Object obj) { 14 | if (obj instanceof IsaEdge && ((IsaEdge)obj).getSource().equals(this.getSource()) 15 | && ((IsaEdge)obj).getTarget().equals(this.getTarget())) { 16 | return true; 17 | } 18 | return false; 19 | } 20 | @Override public UmlsVertex getTarget() {return (UmlsVertex)super.getTarget();} 21 | @Override public UmlsVertex getSource() {return (UmlsVertex)super.getSource();} 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/com/votors/umls/graph/TestJava.java: -------------------------------------------------------------------------------- 1 | 2 | package com.votors.umls.graph; 3 | 4 | import java.io.*; 5 | import java.util.*; 6 | import java.text.*; 7 | import java.math.*; 8 | import java.util.regex.*; 9 | 10 | public class TestJava { 11 | 12 | public static void main(String[] args) { 13 | Scanner in = new Scanner(System.in); 14 | String time = in.next(); 15 | 16 | boolean pm = false; 17 | if (time.contains("PM"))pm=true; 18 | String[] t = time.substring(0,time.length()-2).split(":"); 19 | int h = Integer.parseInt(t[0]); 20 | int m = Integer.parseInt(t[1]); 21 | int s = Integer.parseInt(t[2]); 22 | if (pm && h < 12) h += 12; 23 | if (!pm && h == 12) h=0; 24 | System.out.println(String.format("%02d:%02d:%02d", h,m,s)); 25 | 26 | 27 | 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/votors/umls/graph/UmlsVertex.java: -------------------------------------------------------------------------------- 1 | package com.votors.umls.graph; 2 | 3 | import org.jgrapht.*; 4 | import org.jgrapht.graph.ListenableDirectedGraph; 5 | 6 | import java.io.Serializable; 7 | 8 | /** 9 | * Created by Jason on 2015/9/26 0026. 10 | */ 11 | public class UmlsVertex implements Serializable { 12 | private String aui = null; 13 | private String auiStr = null; 14 | /*status of the vertex: "root" or "child" */ 15 | public static final String ROOT = "root"; 16 | public static final String ROOT_NEW = "new-root"; 17 | public static final String CHILD = "child"; 18 | //public static final String RELAY = "relay"; 19 | public static final String COPY = "copy"; 20 | public String status = ROOT; 21 | public UmlsVertex root = this; // who is the root of this vertex 22 | public int groupId = 0; // which group this vertex belongs to; 0 means no group yet. 23 | public int layer = 0; // which layer is this vertex located in? used by method SctGraph.fix()
24 | public boolean fix = false; 25 | transient private ListenableDirectedGraph g = null; 26 | private static int copyCnt = 0; 27 | private static UmlsVertex NULL = null; 28 | 29 | public UmlsVertex(String aui) { 30 | this.aui = aui; 31 | } 32 | public UmlsVertex(UmlsVertex cp) { 33 | copyCnt++; 34 | aui = cp.aui + "-copy-"+copyCnt; 35 | root = cp.root; 36 | groupId = cp.groupId; 37 | status = UmlsVertex.COPY; 38 | layer = cp.layer; 39 | auiStr = cp.auiStr; 40 | g = cp.g; 41 | } 42 | public String getAui() { return aui;} 43 | public void setGraph(ListenableDirectedGraph graph) {g = graph;} 44 | public int getOutDegree() { if (g == null) return 0; else return g.outDegreeOf(this);} 45 | public int getInDegree() { if (g == null) return 0; else return g.inDegreeOf(this);} 46 | public void setAuiStr(String str) { auiStr = str;} 47 | public String getAuiStr() { return auiStr;} 48 | 49 | @Override public String toString () { 50 | if (auiStr == null) { 51 | return groupId + ":" + aui; 52 | } else { 53 | return groupId + ":" + aui + "\n" + auiStr; 54 | } 55 | } 56 | @Override public int hashCode() {return aui.hashCode();} 57 | @Override public boolean equals(Object obj) { 58 | if ((obj instanceof UmlsVertex) && aui.equals(((UmlsVertex)obj).aui)) { 59 | return true; 60 | } 61 | return false; 62 | } 63 | 64 | public static UmlsVertex getNULL () { 65 | if (NULL == null) { 66 | NULL = new UmlsVertex("null"); 67 | NULL.status = ROOT; 68 | } 69 | return NULL; 70 | } 71 | 72 | public String toString2() { 73 | return "Aui:" + aui + ",\tstatus: " + status + ",\tgroupId: " + groupId + ",\troot: " 74 | + root.getAui() + ",\tlayer: " + layer + ",\tout: " + getOutDegree() + ",\tin: " + getInDegree(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/com/votors/Test.scala: -------------------------------------------------------------------------------- 1 | package com.votors 2 | 3 | import java.time.Duration 4 | 5 | import com.votors.common.TimeX 6 | 7 | /** 8 | * Created by Jason on 2016/6/15 0015. 9 | */ 10 | object Test { 11 | def main(args:Array[String]) = { 12 | 13 | var splitType = "#" 14 | val criteria = "Fertile patients must use effective contraception during and for 3 months after study No other malignancy within the past 3 years No serious concurrent medical illness or active infection that would preclude study chemotherapy No allergy or sensitivity to imidazole antifungal medications (e.g., fluconazole, ketoconazole, miconazole, itraconazole, and clotrimazole)" 15 | criteria.split("#|\\n").flatMap(s=> { 16 | // if there are more than two ':' in a sentence, we should split on ':', because some clinical trials use ':' as a separator symbol.
17 | if (s.count(_ == ':') >= 3) { 18 | splitType = ":" 19 | s.split(":") 20 | } else if (s.count(_ == '-') >= 3) { 21 | splitType = "-" 22 | s.split(" - ") 23 | } else if (s.split("\\s").count(_=="No") >= 3) { 24 | // some sentences without any punctuation to separate 25 | splitType = "No" 26 | s.split("(?=\\sNo\\s)") 27 | } else if (s.split("\\s").count(s=> s.equals("OR") || s.equals("Or")) >= 3) { 28 | // some sentences without any punctuation to separate 29 | splitType = "or" 30 | s.split("Or|OR") 31 | } else { 32 | s :: Nil 33 | } 34 | }).filter(_.trim.size > 2).foreach(sent_org => { 35 | val sent = sent_org.trim.replaceAll("^[\\p{Punct}\\s]*", "") // the punctuation at the beginning of a sentence 36 | println(sent) 37 | }) 38 | 39 | } 40 | 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/votors/TokenizerDemo.scala: -------------------------------------------------------------------------------- 1 | package com.votors 2 | 3 | /** 4 | * Created by Jason on 2016/5/2 0002. 5 | */ 6 | import java.io.FileReader 7 | import java.io.IOException 8 | import java.util 9 | import java.util.List 10 | 11 | import scala.collection.JavaConversions.asScalaIterator 12 | import scala.collection.immutable.{List, Range} 13 | import scala.collection.mutable 14 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 15 | 16 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser 17 | 18 | import scala.collection.JavaConversions.asScalaIterator 19 | import scala.collection.immutable.{List, Range} 20 | import scala.collection.mutable 21 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 22 | import scala.io.Source 23 | import scala.io.Codec 24 | 25 | import edu.stanford.nlp.ling.{TaggedWord, CoreLabel, HasWord} 26 | import edu.stanford.nlp.process.CoreLabelTokenFactory 27 | import edu.stanford.nlp.process.DocumentPreprocessor 28 | import edu.stanford.nlp.process.PTBTokenizer 29 | 30 | object TokenizerDemo { 31 | 32 | def main(args: Array[String]) { 33 | for (arg <- args) { 34 | // option #1: By sentence. 35 | val dp = new DocumentPreprocessor(arg).iterator() 36 | val tokens = dp.map(_.toArray().map(_.toString)).flatMap(_.toSeq).toArray 37 | println(tokens.mkString(" ")) 38 | // option #2: By token 39 | val ptbt = new PTBTokenizer(new FileReader(arg), 40 | new CoreLabelTokenFactory(), ""); 41 | while (ptbt.hasNext()) { 42 | val label = ptbt.next(); 43 | System.out.println(label); 44 | } 45 | } 46 | // 47 | // set up grammar and options as appropriate 48 | // val lp = LexicalizedParser.loadModel(); 49 | // val sent3 = Array("I", "can", "do", "it", "." ) 50 | // // Parser gets tag of second "can" wrong without help 51 | // val tag3 = Array( "PRP", "MD", "VB", "PRP", "." 
) 52 | // val sentence3 = new util.ArrayList[TaggedWord]() 53 | // for (i <- 0 to (sent3.length-1)) { 54 | // sentence3.add(new TaggedWord(sent3(i), tag3(i))); 55 | // } 56 | // val sents = Array("I go to school at 9:00 tomorrow.") 57 | // val parse = lp.parse(sentence3); 58 | // //val parse = lp.parseStrings(sents); 59 | // parse.pennPrint(); 60 | 61 | } 62 | } -------------------------------------------------------------------------------- /src/main/scala/com/votors/umls/MMApi.scala: -------------------------------------------------------------------------------- 1 | package com.votors.umls 2 | 3 | import scala.collection.JavaConversions._ 4 | import java.io.{FileReader, FileWriter, PrintStream, PrintWriter} 5 | import java.util.concurrent.atomic.AtomicInteger 6 | import java.io._ 7 | import java.util 8 | 9 | import com.votors.common.{Conf, TimeX} 10 | import com.votors.common.Utils.Trace._ 11 | import com.votors.common.Utils._ 12 | import com.votors.ml.{Nlp, StanfordNLP} 13 | import edu.stanford.nlp.util.IntPair 14 | import gov.nih.nlm.nls.metamap.AcronymsAbbrevs 15 | import gov.nih.nlm.nls.metamap.MetaMapApi 16 | import gov.nih.nlm.nls.metamap.MetaMapApiImpl 17 | import gov.nih.nlm.nls.metamap.Result 18 | 19 | case class MMResult(cui:String, score:Int,orgStr:String,cuiStr:String,pfName:String, sent:String) { 20 | val sourceSet = new util.HashSet[String] 21 | val stySet = new util.HashSet[String] 22 | val span = new IntPair(-1,-1) 23 | var neg = -1 24 | var sentId = 0 25 | var termId = 0 26 | var matchType = 0; //how it matches our result: 1=same cui; 2=same orgStr; 3=both 27 | val matchDesc = new StringBuilder 28 | def shortDesc = { 29 | val sb: StringBuilder = new StringBuilder 30 | sb.append(cui + "|" 31 | + orgStr + "|" 32 | + score) 33 | sb.toString() 34 | } 35 | override def toString = { 36 | val sb: StringBuilder = new StringBuilder 37 | sb.append(cui + "|" 38 | + orgStr + "|" 39 | + cuiStr + "|" 40 | + score + "|" 41 | + span + "|" 42 | + stySet.mkString(" ") + "|" 43 | + sourceSet.mkString(" ") + "|" 44 | + sent) 45 | sb.toString() 46 | } 47 | } 48 | 49 | /** 50 | * Created by Jason on 2016/11/30 0030. 51 | */ 52 | object MMApi { 53 | var api: MetaMapApi = null 54 | 55 | /** 56 | * Given a string (a sentence), return the results from MetaMap. 57 | */ 58 | def process(terms: String, sentId:Int=0): Seq[MMResult] = { 59 | if (!Conf.MMenable) return Seq() 60 | init() 61 | // the character \031 will crash MetaMap, so strip non-printable characters first.
62 | val resultList: util.List[Result] = api.processCitationsFromString(terms.replaceAll("[^\\p{Graph}\\x20\\t\\r\\n]","")) 63 | val mmRets = new util.ArrayList[MMResult]() 64 | for (result <- resultList) { 65 | /** write result as: cui|score|semtypes|sources|utterance */ 66 | for (utterance <- result.getUtteranceList) { 67 | for (pcm <- utterance.getPCMList) { 68 | for (map <- pcm.getMappingList) { 69 | var termId = 0 70 | for (mapEv <- map.getEvList) { 71 | val mmRet = MMResult(mapEv.getConceptId, math.abs(mapEv.getScore), mapEv.getMatchedWords.mkString(" "), mapEv.getConceptName, mapEv.getPreferredName, terms) 72 | mmRet.sentId = sentId 73 | val sb: StringBuilder = new StringBuilder 74 | mmRet.sourceSet.addAll(mapEv.getSources.filter(sab => sab.matches(Conf.sabFilter))) 75 | mmRet.stySet.addAll(mapEv.getSemanticTypes.map(SemanticType.mapAbbr2sty.getOrElse(_,"None")).filter(sty => Conf.semanticType.indexOf(sty) >= 0)) 76 | if (mmRet.sourceSet.size > 0 77 | && mmRet.stySet.size > 0 78 | && mmRet.score >= Conf.MMscoreThreshold 79 | && !Nlp.checkStopword(mmRet.orgStr,true) 80 | && !mmRet.orgStr.matches(Conf.cuiStringFilterRegex) 81 | && !mmRets.exists(mm=>mm.cui.equals(mmRet.cui) && mm.orgStr.equals(mmRet.orgStr) && mm.score==mmRet.score) 82 | //&& !mmRets.exists(mm=>mm.orgStr.toLowerCase.contains(mmRet.orgStr.toLowerCase)) // not exactly what we mean 'overlap'. 83 | ) { 84 | mmRets.add(mmRet) 85 | for (p <- mapEv.getPositionalInfo) { 86 | if (mmRet.span.get(0) == -1 || p.getX < mmRet.span.get(0)) mmRet.span.set(0, p.getX) 87 | if (mmRet.span.get(1) == -1 || p.getX + p.getY > mmRet.span.get(1)) mmRet.span.set(1, p.getX + p.getY) 88 | } 89 | mmRet.neg = mapEv.getNegationStatus 90 | termId += 1 91 | mmRet.termId = termId 92 | println(mmRet.toString) 93 | } else { 94 | println(s"filter by sty:${mmRet.stySet.size}, sab:${mmRet.sourceSet.size}, ${mmRet.score}, ${mmRet.cui}, ${mmRet.orgStr}, or already exists.") 95 | } 96 | } 97 | } 98 | } 99 | } 100 | } 101 | return mmRets.to[Seq] 102 | } 103 | 104 | private def init():Unit = { 105 | if (api != null) return 106 | api = new MetaMapApiImpl 107 | if (Conf.MMhost.trim.size > 0)api.setHost(Conf.MMhost) 108 | if (Conf.MMport.trim.size > 0)api.setPort(Conf.MMport.toInt) 109 | val options: String = Conf.MMoptions 110 | api.setOptions(options) 111 | } 112 | 113 | def main(args: Array[String]) { 114 | init() 115 | var startTime = System.currentTimeMillis() 116 | process("People who don\u0019t smoke but who breathe the smoke of others also have a higher risk of lung cancer.") 117 | println(System.currentTimeMillis() - startTime) 118 | startTime = System.currentTimeMillis() 119 | process("People who don\u0019t smoke but who breathe the smoke of others also have a higher risk of lung cancer.") 120 | println(System.currentTimeMillis() - startTime) 121 | startTime = System.currentTimeMillis() 122 | process("My focus is on the code in word2vec.c for training the skip-gram architecture with negative sampling, so for now I have ignored the CBOW and Hierarchical Softmax code. I also haven't looked much at the testing code.\n\nBecause the code supports both models and both training approaches, I highly recommended viewing the code in an editor which allows you to collapse code blocks. 
The training code is much more readable when you hide the implementations that you aren't interested in..") 123 | println(System.currentTimeMillis() - startTime) 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/main/scala/com/votors/umls/SemanticType.scala: -------------------------------------------------------------------------------- 1 | package com.votors.umls 2 | 3 | import java.io._ 4 | import java.nio.charset.CodingErrorAction 5 | import java.util.regex.Pattern 6 | import java.util.{Date, Properties} 7 | 8 | import com.votors.common.{SqlUtils, Conf} 9 | import com.votors.ml.{Clustering, Nlp} 10 | import edu.stanford.nlp.util.IntPair 11 | import opennlp.tools.sentdetect.{SentenceDetectorME, SentenceModel} 12 | import org.apache.log4j.{Level, Logger} 13 | import org.apache.spark.{SparkContext, SparkConf} 14 | 15 | import scala.collection.JavaConversions.asScalaIterator 16 | import scala.collection.immutable.{List, Range} 17 | import scala.collection.mutable 18 | import scala.collection.mutable.{ListBuffer, ArrayBuffer} 19 | import scala.io.Source 20 | import scala.io.Codec 21 | /** 22 | * Created by Jason on 2016/12/2 0002. 23 | */ 24 | case class SemanticType(sty:String,var abbr:String="", var fullName:String="",var groupAbbr:String="", var groupName:String="") 25 | 26 | object SemanticType { 27 | val mapSty = new mutable.HashMap[String, SemanticType]() 28 | val mapAbbr2sty = new mutable.HashMap[String, String]() 29 | def init() = { 30 | val ftype=Source.fromFile(Conf.rootDir + "/data/SemanticTypes_2013AA.txt") 31 | for (line <- ftype.getLines() if line.trim.size > 5) { 32 | val tokens = line.split("\\|") 33 | val sty = mapSty.getOrElseUpdate(tokens(1),SemanticType(tokens(1))) 34 | mapAbbr2sty.getOrElseUpdate(tokens(0),tokens(1)) 35 | sty.abbr = tokens(0) 36 | sty.fullName = tokens(2) 37 | } 38 | val fgroup=Source.fromFile(Conf.rootDir + "/data/SemGroups.txt") 39 | for (line <- fgroup.getLines() if line.trim.size > 5) { 40 | val tokens = line.split("\\|") 41 | val sty = mapSty.getOrElseUpdate(tokens(2),SemanticType(tokens(2))) 42 | sty.groupAbbr = tokens(0) 43 | sty.groupName = tokens(1) 44 | sty.fullName = tokens(3) 45 | } 46 | } 47 | init() 48 | 49 | def main(args:Array[String]) = { 50 | init() 51 | println(mapSty) 52 | println(mapAbbr2sty) 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/votors/umls/TermIdentify.scala: -------------------------------------------------------------------------------- 1 | package com.votors.umls 2 | 3 | import java.io.{FileWriter, PrintWriter} 4 | import java.util.Date 5 | import java.util.concurrent.atomic.AtomicInteger 6 | 7 | import com.votors.common.Utils.Trace 8 | import com.votors.common.{Conf, MyCache, Utils} 9 | import com.votors.ml.{Ngram, Nlp, Sentence} 10 | import org.apache.log4j.{Level, Logger} 11 | import org.apache.spark.{SparkConf, SparkContext} 12 | 13 | import scala.collection.mutable.ListBuffer 14 | import scala.collection.JavaConversions.asScalaIterator 15 | import scala.collection.immutable.{List, Range} 16 | import scala.collection.mutable 17 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 18 | import scala.io.Source 19 | import scala.io.Codec 20 | /** 21 | * Created by Jason on 2016/4/1 0001. 22 | */ 23 | class TermIdentify { 24 | 25 | 26 | 27 | } 28 | 29 | 30 | /** 31 | * input: a csv file, but only identify the text in one column.
32 | */ 33 | object TermIdentify { 34 | 35 | def main(avgs: Array[String]): Unit = { 36 | println("the input args are:\n" + avgs.mkString("\n")) 37 | if (avgs.size < 2) { 38 | println(s"invalid inputs, should be: input-file output-file") 39 | sys.exit(1) 40 | } 41 | val inFile = avgs(0) 42 | val outFile = avgs(1) 43 | // record the start time 44 | val startTime = new Date() 45 | 46 | 47 | val records = Utils.readCsvFile(avgs(0)).toSeq 48 | val headSorted = Range(0,records.head.size()).map(index=>records.head.get(index)).zipWithIndex 49 | val head = headSorted.toMap 50 | var cnt = 0 51 | val tagger = new UmlsTagger2() 52 | 53 | var writer = new PrintWriter(new FileWriter(outFile)) 54 | writer.println("INDEX,"+headSorted.map(_._1).mkString(",")+",CUI,AUI,CODE,STRING") 55 | 56 | records.tail.foreach(rec =>{ 57 | cnt += 1 58 | if (true) { 59 | println(rec.toString) 60 | val hNgrams = mutable.LinkedHashMap[String,Ngram]() 61 | val sents = Nlp.generateSentence(cnt, rec.get(head("display_name")), null) 62 | val gramId = new AtomicInteger() 63 | Nlp.generateNgram(sents.toSeq, gramId, hNgrams) 64 | 65 | hNgrams.foreach(kv=>{ 66 | val key = kv._1 67 | val gram = kv._2 68 | val (umlsBestScore, stys) = tagger.getUmlsScore(gram.text) 69 | if (umlsBestScore._3 != null && umlsBestScore._3.score>Conf.umlsLikehoodLimit) { 70 | val ret = tagger.execQuery(s"select code from umls.mrconso where CUI='${umlsBestScore._3.cui}' and AUI='${umlsBestScore._3.aui}';") 71 | var code = "" 72 | while (ret.next) { 73 | code += ret.getString("code") + ':' 74 | } 75 | println(s"${key},${umlsBestScore._3.cui},${code.dropRight(1)},${umlsBestScore._3.descr}") 76 | writer.println(s"${cnt},"+"\""+headSorted.map(keyIndex=>rec.get(keyIndex._2)).mkString("\",\"")+"\","+s"${umlsBestScore._3.cui},${umlsBestScore._3.aui},${code.dropRight(1)},${umlsBestScore._3.descr}") 77 | 78 | } 79 | }) 80 | } 81 | }) 82 | 83 | System.out.println("### used time: "+(new Date().getTime()-startTime.getTime())+" ###") 84 | } 85 | 86 | } 87 | 88 | 89 | /** 90 | * input: a csv file, but only identify the text in one column.
91 | */ 92 | object TermIdentifySeq{ 93 | 94 | def main(avgs: Array[String]): Unit = { 95 | println("the input args are:\n" + avgs.mkString("\n")) 96 | if (avgs.size < 2) { 97 | println(s"invalid inputs, should be: input_file output-file") 98 | sys.exit(1) 99 | } 100 | val inFile = avgs(0) 101 | val outFile = avgs(1) 102 | // init spark 103 | val startTime = new Date() 104 | 105 | 106 | val records = Utils.readCsvFile(avgs(0)).toSeq 107 | val headSorted = Range(0,records.head.size()).map(index=>records.head.get(index)).zipWithIndex 108 | var cnt = 0 109 | val tagger = new UmlsTagger2() 110 | 111 | var writer = new PrintWriter(new FileWriter(outFile)) 112 | writer.println("qaID\ttermID\tterm\tcui\taui\tscore\tcuiStr\tsentLen\tsentence") 113 | 114 | records.foreach(rec =>{ 115 | cnt += 1 116 | if (true) { 117 | // println(rec.toString) 118 | val hNgrams = new ListBuffer[(Sentence,Ngram)]() 119 | val qaid = rec.get(0) 120 | print(s"${cnt}\t${qaid}\r") 121 | val sents = Nlp.generateSentence(cnt, rec.get(1), null) 122 | val gramId = new AtomicInteger() 123 | Nlp.generateNgramSeq(sents.toSeq, gramId, hNgrams) 124 | var termId = 0 125 | hNgrams.foreach(kv=>{ 126 | val sent = kv._1 127 | val gram = kv._2 128 | val key = gram.key 129 | // println(key) 130 | val (umlsBestScore, stys) = tagger.getUmlsScore(gram.text) 131 | if (umlsBestScore != null && umlsBestScore._3 != null && umlsBestScore._3.score > Conf.umlsLikehoodLimit) { 132 | termId += 1 133 | val sugg = umlsBestScore._3 134 | val outStr = f"${qaid}\t${termId}\t${gram.textOrg}\t${sugg.cui}\t${sugg.aui}\t${sugg.score}%.0f\t${sugg.descr}\t${sent.sentId}\t${sent.words.mkString(" ")}" 135 | println(outStr) 136 | writer.println(outStr) 137 | } 138 | }) 139 | if (cnt%100 == 0) writer.flush() 140 | } 141 | }) 142 | writer.close() 143 | MyCache.close() 144 | System.out.println("### used time: "+(new Date().getTime()-startTime.getTime())+" ###") 145 | } 146 | 147 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/mllib/clustering/MyKmean.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.clustering 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | import org.apache.spark.{SparkContext} 8 | import org.apache.spark.annotation.Experimental 9 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 10 | import org.apache.spark.mllib.linalg.BLAS.{axpy, scal} 11 | import org.apache.spark.mllib.util.MLUtils 12 | import org.apache.spark.rdd.RDD 13 | import org.apache.spark.storage.StorageLevel 14 | import org.apache.spark.util.Utils 15 | import org.apache.spark.util.random.XORShiftRandom 16 | 17 | /** 18 | * Created by Jason on 2015/12/4 0004. 19 | */ 20 | object MyKmean extends KMeans{ 21 | // def pointCost2(centers: TraversableOnce[VectorWithNorm], 22 | // point: Vector) = KMeans.pointCost(centers, new VectorWithNorm(point)) 23 | 24 | /** 25 | * Returns the index of the closest center to the given point, as well as the squared distance. 
26 | */ 27 | def findClosest(centers: TraversableOnce[Vector],p: Vector): (Int,Double) = { 28 | KMeans.findClosest(clusterCentersWithNorm(centers), new VectorWithNorm(p)) 29 | } 30 | def clusterCentersWithNorm(clusterCenters: TraversableOnce[Vector]): TraversableOnce[VectorWithNorm] = 31 | clusterCenters.map(new VectorWithNorm(_)) 32 | 33 | def fastSquaredDistance( v1: Vector, norm1:Double, v2: Vector, norm2:Double): Double = { 34 | KMeans.fastSquaredDistance(new VectorWithNorm(v1,norm1),new VectorWithNorm(v2,norm2)) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/test/com/votors/umls/UmlsTagger2Test.scala: -------------------------------------------------------------------------------- 1 | package com.votors.umls 2 | 3 | import java.io.{FileWriter, PrintWriter, File} 4 | import com.votors.common.Conf 5 | import com.votors.common.Utils.Trace 6 | import com.votors.common.Utils.Trace._ 7 | import org.junit.{AfterClass, Assert, Test} 8 | 9 | import scala.io.Source 10 | 11 | class UmlsTagger2Test { 12 | 13 | // The root config dir of the OpenNLP model and data files 14 | val rootDir = Conf.rootDir 15 | Trace.currLevel = ERROR 16 | 17 | if (! new File(rootDir).exists()) { 18 | println("Error! You have to configure a valid dataDir in class UmlsTagger2Test first") 19 | sys.exit(1) 20 | } 21 | 22 | @Test 23 | def testBuildIndexJson(): Unit = { 24 | val tagger = new UmlsTagger2("",rootDir) 25 | tagger.buildIndexJson( 26 | "C:\\fsu\\ra\\data\\201708\\Copy of Botanical_with_dsld_cat_termlist.csv", 27 | "C:\\fsu\\ra\\data\\201708\\Copy of Botanical_with_dsld_cat_termlist.txt") 28 | } 29 | @Test 30 | def testBuildIndexPlainText(): Unit = { 31 | val tagger = new UmlsTagger2("",rootDir) 32 | tagger.buildIndexPlainText( 33 | "C:\\fsu\\ra\\data\\201708\\Copy of Botanical_with_dsld_cat_termlist.csv", 34 | "C:\\fsu\\ra\\data\\201708\\Copy of Botanical_with_dsld_cat_termlist.txt") 35 | } 36 | /* 37 | @Test 38 | def testBuildIndexXml(): Unit = { 39 | val tagger = new UmlsTagger2("",rootDir) 40 | tagger.buildIndexCsv( 41 | new File("C:\\fsu\\target.terms.csv"), 42 | new File("C:\\fsu\\target.terms.ret.csv")) 43 | }*/ 44 | @Test 45 | def testBuildIndex2db(): Unit = { 46 | val tagger = new UmlsTagger2("",rootDir) 47 | tagger.buildIndex2db() 48 | } 49 | 50 | @Test 51 | def testGetFull(): Unit = { 52 | val tagger = new UmlsTagger2(Conf.solrServerUrl,rootDir) 53 | val phrases = List("age") 54 | phrases.foreach(phrase => { 55 | Console.println() 56 | Console.println("Query: %s".format(phrase)) 57 | val suggestions = tagger.select(phrase) 58 | suggestions match { 59 | case suggestion: Array[Suggestion] => { 60 | suggestion.foreach(s => Console.println(s.toString())) 61 | //Assert.assertNotNull(suggestion.cui) 62 | } 63 | case _ => 64 | Assert.fail("No results for [%s]".format(phrase)) 65 | } 66 | }) 67 | } 68 | 69 | 70 | @Test 71 | def testStermWord(): Unit = { 72 | val tagger = new UmlsTagger2(Conf.solrServerUrl,rootDir) 73 | 74 | val phrases = List("green tea") 75 | phrases.foreach(phrase => { 76 | Console.println(s"$phrase,${tagger.normalizeAll(phrase)}") 77 | // Console.println(s"$phrase,${tagger.normalizeCasePunct(phrase)}") 78 | 79 | }) 80 | 81 | } 82 | 83 | @Test 84 | def testAnnotateSentence() = { 85 | val tagger = new UmlsTagger2(Conf.solrServerUrl, rootDir) 86 | val sent = "I lost a tone of weight i was alway 130 or above , i eat fine but i cant have really big meals .
87 | val sugg=tagger.annotateSentence(sent,5) 88 | sugg.filter(_._2.size>0).foreach(s=>{ 89 | println(s"${s._1}\t${s._2.mkString(",")}") 90 | }) 91 | } 92 | 93 | 94 | @Test 95 | def testAnnotateFile(): Unit = { 96 | val tagger = new UmlsTagger2(Conf.solrServerUrl, rootDir) 97 | // tagger.annotateFile(s"C:/fsu/ra/data/201603/nsrr-canonical-data-dictionary.txt", 98 | // s"C:/fsu/ra/data/201603/ret-nsrr-canonical-data-dictionary.txt", 99 | tagger.annotateFile(s"C:/fsu/ra/data/201603/nsrr-canonical-data-dictionary.txt", 100 | s"C:/fsu/ra/data/201603/ret-nsrr-canonical-data-dictionary.txt", 101 | 2, 102 | 5, 103 | '\t','\n') 104 | } 105 | 106 | // find terms from dictionary for a string 107 | @Test 108 | def testAnnotateTag(): Unit = { 109 | val tagger = new UmlsTagger2(Conf.solrServerUrl, rootDir) 110 | //tagger.annotateTag(s"${rootDir}/data/taglist-zhiwei.txt",s"${rootDir}/data/taglist-zhiwei.csv") 111 | tagger.annotateTagAppend(s"C:/fsu/ra/data/201603/nsrr-canonical-data-dictionary.txt", 112 | s"C:/fsu/ra/data/201603/ret-nsrr-canonical-data-dictionary.txt",1) 113 | 114 | tagger.jdbcClose() 115 | } 116 | 117 | @Test 118 | def testPosFilter():Unit = { 119 | 120 | } 121 | 122 | @Test 123 | def testSql():Unit = { 124 | val tagger = new UmlsTagger2(Conf.solrServerUrl, rootDir) 125 | val rs = tagger.execQuery("select count(*) as cnt from umls.mrsty") 126 | 127 | while (rs.next) { 128 | println(rs.getString("cnt")) 129 | } 130 | 131 | tagger.jdbcClose() 132 | } 133 | 134 | 135 | // @AfterClass 136 | // def cleanup():Unit = { 137 | // 138 | // } 139 | } -------------------------------------------------------------------------------- /term_identification.md: -------------------------------------------------------------------------------- 1 | ## Overviw 2 | Given a list of terms T {(tid,term)} and some textual data set D {(did,text)}, identify any of the term in T occurs in data set D. 3 | 4 | ## Steps of method 5 | * Build the lockup table for the given terms T; 6 | * Convert the text into N-gram, and match the N-gram in the lookup table to see if an N-gram matches any of the terms. 7 | 8 | ## steps of operation 9 | * preparation: 10 | * Compile the project and get the Jar file of the project. 
11 | * Set the aliases for the tasks run-import-term and run-extract-term: 12 | ``` 13 | alias run-import-term='spark-submit --master spark://somelab12.cci.fsu.edu:7077 --jars /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar --driver-class-path /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar --conf 'spark.executor.extraJavaOptions=-DCTM_ROOT_PATH=/tmp/ctm_root' --driver-java-options=-DCTM_ROOT_PATH=/tmp/ctm_root --files /tmp/ctm_root/conf/default.properties --executor-memory 3g --class com.votors.umls.BuildTargetTerm /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar ' 14 | alias run-extract-term='spark-submit --master spark://somelab12.cci.fsu.edu:7077 --jars /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar --driver-class-path /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar --conf 'spark.executor.extraJavaOptions=-DCTM_ROOT_PATH=/tmp/ctm_root' --driver-java-options=-DCTM_ROOT_PATH=/tmp/ctm_root --files /tmp/ctm_root/conf/default.properties --executor-memory 3g --class com.votors.umls.IdentfyTargetTerm /data/ra/Clinical-Text-Mining/target/Clinical-Text-Mining-0.0.1-SNAPSHOT-jar-with-dependencies.jar ' 15 | ``` 16 | * Configure conf/default.properties properly. 17 | * Store your textual data set in MySQL. Make sure there is a unique integer id for every text. 18 | 19 | * execution: 20 | * Import the term list to build a lookup table. The format of the input should be one term per line: id [tab] term. 21 | You can prepare the list in Excel and save it as a tab-separated (*.txt) file. 22 | ``` 23 | run-import-term /tmp/supp_list.txt 24 | ``` 25 | * Configure conf/default.properties to tell the tool where to find your textual data set. 26 | ``` 27 | blogDbUrl=jdbc:mysql://[hostname or IP]:3306/[database name]?user=[username of MySQL]&password=[password of the user] 28 | blogTbl=the table name of your data set 29 | blogIdCol=the column name of the id in the table; it has to be an integer 30 | blogTextCol=the column name of the text in the table 31 | ``` 32 | * Run the identification command. Note that if the data set is large, it will take a long time, 33 | so you'd better run this command inside screen so that a network problem does not interrupt the processing. 34 | ``` 35 | run-extract-term /tmp/ret_list.csv 36 | ``` 37 | 38 | ## More configuration 39 | ### stop words list: data/stopwords.txt 40 | ### conf/default.properties 41 | * How the text for N-gram generation is fetched: blogId will be selected as distinct, and blogTextCol will be limited to 1 row. 42 | blogDbUrl=jdbc:mysql://localhost:3306/ytex?user=root&password=root 43 | blogTbl=tmp_org_yahoo 44 | blogIdCol=id 45 | blogTextCol=concat(subject, ". ", content, ". ", chosenanswer) 46 | 47 | * Limit the number of blogs to be analyzed, mainly for testing 48 | blogLimit=200 49 | * target term info in the database 50 | targetTermTbl=_target_term_ 51 | targetTermTblDropAndCreate=true 52 | * If true, use solr for matching an N-gram with the target terms; otherwise use a database query for matching 53 | targetTermUsingSolr=false 54 | ### Other configuration items may affect the result too! 55 | 56 | --------------------------------------------------------------------------------
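## Appendix: a minimal sketch of the matching step

To make the method in term_identification.md concrete, here is a minimal, self-contained Scala sketch of the lookup-table-plus-N-gram matching. It is an illustration only, not the project's actual implementation (the real pipeline builds the lookup table with run-import-term, matches through Solr or a MySQL query, and applies the stop-word lists); every name in it (NgramMatchSketch, normalize, the sample terms and document) is hypothetical.

```scala
import scala.collection.mutable

object NgramMatchSketch {

  // Hypothetical stand-in for the project's real normalization
  // (case folding, punctuation stripping, stop-word filtering):
  // lower-case, replace punctuation with spaces, collapse whitespace.
  def normalize(s: String): String =
    s.toLowerCase.replaceAll("\\p{Punct}", " ").trim.replaceAll("\\s+", " ")

  def main(args: Array[String]): Unit = {
    // Step 1: build the lookup table from the term list T {(tid, term)}.
    val terms = Seq((1, "type 2 diabetes"), (2, "lung cancer"), (3, "green tea"))
    val lookup = mutable.HashMap[String, Int]()
    terms.foreach { case (tid, term) => lookup(normalize(term)) = tid }
    // No N-gram longer than the longest term can ever match.
    val maxN = terms.map(t => normalize(t._2).split(" ").length).max

    // Step 2: slide N-gram windows (n = 1..maxN) over each text in D
    // and report every window that hits the lookup table.
    val docs = Seq((100, "My mother was diagnosed with Type 2 Diabetes last year."))
    for ((did, text) <- docs) {
      val tokens = normalize(text).split(" ")
      for (n <- 1 to maxN; window <- tokens.sliding(n) if window.length == n) {
        lookup.get(window.mkString(" ")).foreach { tid =>
          println(s"doc $did matched term $tid: '${window.mkString(" ")}'")
        }
      }
    }
  }
}
```

Because both the terms and the N-grams pass through the same normalize(), case and punctuation differences do not block a match; in the real tool the stop-word list (data/stopwords.txt) and the targetTermUsingSolr setting described above play the analogous roles.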