├── .gitattributes
├── .gitignore
├── CurationAPIs-UserGuide.pdf
├── Index
    ├── _a.cfe
    ├── _a.cfs
    ├── _a.si
    ├── segments.gen
    └── segments_b
├── IndexSentence
    ├── _2.fdt
    ├── _2.fdx
    ├── _2.fnm
    ├── _2.si
    ├── _2.tvd
    ├── _2.tvf
    ├── _2.tvx
    ├── _2_Lucene41_0.doc
    ├── _2_Lucene41_0.pay
    ├── _2_Lucene41_0.pos
    ├── _2_Lucene41_0.tim
    ├── _2_Lucene41_0.tip
    ├── _2_nrm.cfe
    ├── _2_nrm.cfs
    ├── segments.gen
    └── segments_3
├── LICENSE
├── README.md
├── Result.xml
├── Sentence_Index
    ├── _0.cfe
    ├── _0.cfs
    ├── _0.si
    ├── segments.gen
    └── segments_1
├── Stem.txt
├── TextClassification
    ├── Knn1248308.dat
    ├── Knn1793382.dat
    ├── Knn2053135.dat
    ├── Knn4725799.dat
    ├── Knn5442488.dat
    ├── Knn8341982.dat
    ├── Knn8586130.dat
    ├── Knn8814335.dat
    ├── NaiveBayes1567923.dat
    ├── NaiveBayes1902130.dat
    ├── NaiveBayes3099268.dat
    └── NaiveBayes5432235.dat
├── cosine.txt
├── cosineDoc
    ├── data.txt
    └── tweets.txt
├── data.txt
├── englishStopwords.txt
├── entity.txt
├── pom.xml
├── pos.txt
├── src
    └── main
    │   └── java
    │       └── unsw
    │           └── curation
    │               └── api
    │                   ├── classify
    │                       └── TextClassifier.java
    │                   ├── cosinesentence
    │                       ├── AllTermsSentence.java
    │                       ├── CosineSimilaritySentence.java
    │                       ├── DocVectorSentence.java
    │                       ├── IndexSentence.java
    │                       └── VectorGeneratorSentence.java
    │                   ├── cosinetext
    │                       ├── AllTerms.java
    │                       ├── CosineSimilarity.java
    │                       ├── DocVector.java
    │                       ├── Index.java
    │                       └── VectorGenerator.java
    │                   ├── domain
    │                       ├── Classification.java
    │                       ├── ExtractNamedEntity.java
    │                       ├── ExtractNumberSimilarity.java
    │                       ├── ExtractPosTag.java
    │                       ├── ExtractStem.java
    │                       ├── ExtractSynonym.java
    │                       ├── ExtractTextCosineSimilarity.java
    │                       ├── ExtractTextSimilarity.java
    │                       ├── ExtractTextTfidfSimilarity.java
    │                       ├── ExtractionKeyword.java
    │                       └── abstraction
    │                       │   ├── IClassificationTextDecisionTree.java
    │                       │   ├── IClassificationTextKNN.java
    │                       │   ├── IClassificationTextLogisticRegression.java
    │                       │   ├── IClassificationTextNaiveBays.java
    │                       │   ├── IClassificationTextNeuralNetwork.java
    │                       │   ├── IClassificationTextRandomForest.java
    │                       │   ├── IClassificationTextSVM.java
    │                       │   ├── IKeywordEx.java
    │                       │   ├── INamedEntity.java
    │                       │   ├── INumberCosineSimilarity.java
    │                       │   ├── INumberDiceSimilarity.java
    │                       │   ├── INumberEuclideanSimilarity.java
    │                       │   ├── INumberJaccardSimilarity.java
    │                       │   ├── IPosTag.java
    │                       │   ├── IStem.java
    │                       │   ├── ISynonym.java
    │                       │   ├── ITextCosineSimilarity.java
    │                       │   ├── ITextJaccardSimilarity.java
    │                       │   ├── ITextJaroSimilarity.java
    │                       │   ├── ITextLevenshtainSimilarity.java
    │                       │   ├── ITextQGramSimilarity.java
    │                       │   ├── ITextSoundexSimilarity.java
    │                       │   ├── ITextTfidfSimilarity.java
    │                       │   └── IUrlExtraction.java
    │                   ├── extractnamedentity
    │                       ├── ExtractEntityFile.java
    │                       ├── ExtractEntitySentence.java
    │                       ├── RegexClass.java
    │                       ├── curation.jpg
    │                       └── curation.ucls
    │                   ├── extractpostag
    │                       └── ExtractPosTagData.java
    │                   ├── extractsimilarity
    │                       ├── ExtractNumberCosineSimilarityImpl.java
    │                       ├── ExtractNumberDiceSimilarityImpl.java
    │                       ├── ExtractNumberEuclideanSimilarity.java
    │                       ├── ExtractNumberJaccardSimilarityImpl.java
    │                       ├── ExtractTextCosineSimilarityImpl.java
    │                       ├── ExtractTextJaccardSimilarityImpl.java
    │                       ├── ExtractTextJaroSimialrity.java
    │                       ├── ExtractTextLevenshtainImpl.java
    │                       ├── ExtractTextQGramSimilarity.java
    │                       ├── ExtractTextSoundexSimilarity.java
    │                       └── ExtractTextTfIdfSimilarityImpl.java
    │                   ├── extractstem
    │                       └── ExtractStemImpl.java
    │                   ├── extractsynonym
    │                       └── WordNetFile.java
    │                   ├── index
    │                       ├── DataSearch.java
    │                       ├── Index.java
    │                       └── SchIndData.java
    │                   ├── linking
    │                       ├── ConceptNet.java
    │                       ├── GoogleKnowledgeGraph.java
    │                       └── WikiData.java
    │                   ├── run
    │                       └── run.java
    │                   ├── textclassification
    │                       ├── EvaluateClassifier.java
    │                       ├── ExtractClassificationTextDecisionTreeImpl.java
    │                       ├── ExtractClassificationTextKNNImpl.java
    │                       ├── ExtractClassificationTextLogisticRegressionImpl.java
    │                       ├── ExtractClassificationTextNaiveBaysImpl.java
    │                       ├── ExtractClassificationTextNeuralNetworkImpl.java
    │                       ├── ExtractClassificationTextRandomForestImpl.java
    │                       ├── ExtractClassificationTextSVMImpl.java
    │                       └── TextClassifierImpl.java
    │                   ├── tfidf
    │                       ├── DataSearchSentence.java
    │                       ├── IndexSentence.java
    │                       └── ReadDataSentence.java
    │                   ├── tokenization
    │                       └── ExtractionKeywordImpl.java
    │                   ├── twitter
    │                       ├── KeywordExtraction.java
    │                       ├── MyStemExtraction.java
    │                       ├── NamedEntityExtraction.java
    │                       ├── Synonyms.java
    │                       ├── TweetInfo.java
    │                       ├── URLExtraction.java
    │                       └── XmlGenerator.java
    │                   ├── twitterdomain
    │                       ├── KeywordDomain.java
    │                       ├── NamedEntityDomain.java
    │                       ├── StemDomain.java
    │                       ├── SynonymDomain.java
    │                       ├── TweetInfoDomain.java
    │                       └── UrlDomain.java
    │                   └── url
    │                       ├── GetHTMLFile.java
    │                       └── GetURL.java
├── test.txt
├── text.txt
└── tweets.txt


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.jar filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | # Eclipse
 3 | .classpath
 4 | .project
 5 | .settings/
 6 | 
 7 | # Intellij
 8 | .idea/
 9 | *.iml
10 | *.iws
11 | 
12 | # Mac
13 | .DS_Store
14 | 
15 | # Maven
16 | log/
17 | target/
18 | 


--------------------------------------------------------------------------------
/CurationAPIs-UserGuide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/CurationAPIs-UserGuide.pdf


--------------------------------------------------------------------------------
/Index/_a.cfe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Index/_a.cfe


--------------------------------------------------------------------------------
/Index/_a.cfs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Index/_a.cfs


--------------------------------------------------------------------------------
/Index/_a.si:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Index/_a.si


--------------------------------------------------------------------------------
/Index/segments.gen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Index/segments.gen


--------------------------------------------------------------------------------
/Index/segments_b:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Index/segments_b


--------------------------------------------------------------------------------
/IndexSentence/_2.fdt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.fdt


--------------------------------------------------------------------------------
/IndexSentence/_2.fdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.fdx


--------------------------------------------------------------------------------
/IndexSentence/_2.fnm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.fnm


--------------------------------------------------------------------------------
/IndexSentence/_2.si:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.si


--------------------------------------------------------------------------------
/IndexSentence/_2.tvd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.tvd


--------------------------------------------------------------------------------
/IndexSentence/_2.tvf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.tvf


--------------------------------------------------------------------------------
/IndexSentence/_2.tvx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.tvx


--------------------------------------------------------------------------------
/IndexSentence/_2_Lucene41_0.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_Lucene41_0.doc


--------------------------------------------------------------------------------
/IndexSentence/_2_Lucene41_0.pay:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_Lucene41_0.pay


--------------------------------------------------------------------------------
/IndexSentence/_2_Lucene41_0.pos:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_Lucene41_0.pos


--------------------------------------------------------------------------------
/IndexSentence/_2_Lucene41_0.tim:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_Lucene41_0.tim


--------------------------------------------------------------------------------
/IndexSentence/_2_Lucene41_0.tip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_Lucene41_0.tip


--------------------------------------------------------------------------------
/IndexSentence/_2_nrm.cfe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_nrm.cfe


--------------------------------------------------------------------------------
/IndexSentence/_2_nrm.cfs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_nrm.cfs


--------------------------------------------------------------------------------
/IndexSentence/segments.gen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/segments.gen


--------------------------------------------------------------------------------
/IndexSentence/segments_3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/segments_3


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ----Data Curation APIs-------------
 2 | 
 3 | Understanding and analyzing big data is firmly recognized as a powerful and strategic priority. For deeper interpretation of and better intelligence with big data, it is important to transform raw data (unstructured, semi-structured and structured data sources, e.g., text, video, image data sets) into curated data: contextualized data and knowledge that is maintained and made available for use by end-users and applications. In particular, data curation acts as the glue between raw data and analytics, providing an abstraction layer that relieves users from time consuming, tedious and error prone curation tasks. In this context, the data curation process becomes a vital analytics asset for increasing added value and insights. 
 4 | 
 5 | We identify and implement a set of curation APIs and make them available (as an open source project on GitHub) to researchers and developers to assist them transforming their raw data into curated data. The curation APIs enable developers to easily add features - such as extracting keyword, part of speech, and named entities such as Persons, Locations, Organizations, Companies, Products, Diseases, Drugs, etc.; providing synonyms and stems for extracted information items leveraging lexical knowledge bases for the English language such as WordNet; linking extracted entities to external knowledge bases such as Google Knowledge Graph and Wikidata; discovering similarity among the extracted information items, such as calculating similarity between string, number, date and time data; classifying, sorting and categorizing data into various types, forms or any other distinct class; and indexing structured and unstructured data - into their applications.
 6 | 
 7 | 
 8 | Notice: 
 9 | 
10 | We encourage researchers/developers to cite our paper if you have used our APIs, libraries, tools or datasets. 
11 | 
12 | * Beheshti, Tabebordbar, Benatallah, Nouri: "On Automating Basic Data Curation Tasks", WWW 2017, Perth Australia.
13 | * Beheshti, Tabebordbar, Benatallah, Nouri: "Data Curation APIs", CoRR abs/1612.03277 (2016).
14 | 
15 | 
16 | You can find the technical report and user guide from the following link:
17 | https://arxiv.org/abs/1612.03277
18 | 
19 | Curation Services Rest APIs:
20 | http://d2dcrc.cse.unsw.edu.au:9091/ExtractionAPI-0.0.1-SNAPSHOT/
21 | 
22 | 
23 | 
24 | ----License-----------------------
25 | 
26 | License: This software is licensed under the Apache 2.0 license, quoted below.
27 | 
28 | Copyright 2016 UNSW.CSE.SOC Research Group
29 | <unsw.cse.soc@gmail.com>
30 | 
31 | You may not use these APIs except in compliance with the License. You may obtain a copy of
32 | the License at
33 | 
34 |     http://www.apache.org/licenses/LICENSE-2.0
35 | 
36 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
37 | 
38 | ----Contributors-----------------------
39 | 
40 | 
41 | Amin (SMR) Beheshti
42 | 
43 | Alireza Tabebordbar
44 | 
45 | Boualem Benatallah
46 | 
47 | Seyed Mohammad Reza Nouri
48 | 
49 | Service Oriented Computing (SOC) Research Group, School of Computer Sience and Engineering, The University of New South Wales, Sydney, Australia. This work is part of the Data Curation Foundry project stream, D2D CRC.
50 | 
51 | 


--------------------------------------------------------------------------------
/Result.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Result.xml


--------------------------------------------------------------------------------
/Sentence_Index/_0.cfe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Sentence_Index/_0.cfe


--------------------------------------------------------------------------------
/Sentence_Index/_0.cfs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Sentence_Index/_0.cfs


--------------------------------------------------------------------------------
/Sentence_Index/_0.si:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Sentence_Index/_0.si


--------------------------------------------------------------------------------
/Sentence_Index/segments.gen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Sentence_Index/segments.gen


--------------------------------------------------------------------------------
/Sentence_Index/segments_1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Sentence_Index/segments_1


--------------------------------------------------------------------------------
/Stem.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Stem.txt


--------------------------------------------------------------------------------
/TextClassification/Knn1248308.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn1248308.dat


--------------------------------------------------------------------------------
/TextClassification/Knn1793382.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn1793382.dat


--------------------------------------------------------------------------------
/TextClassification/Knn2053135.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn2053135.dat


--------------------------------------------------------------------------------
/TextClassification/Knn4725799.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn4725799.dat


--------------------------------------------------------------------------------
/TextClassification/Knn5442488.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn5442488.dat


--------------------------------------------------------------------------------
/TextClassification/Knn8341982.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn8341982.dat


--------------------------------------------------------------------------------
/TextClassification/Knn8586130.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn8586130.dat


--------------------------------------------------------------------------------
/TextClassification/Knn8814335.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn8814335.dat


--------------------------------------------------------------------------------
/TextClassification/NaiveBayes1567923.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/NaiveBayes1567923.dat


--------------------------------------------------------------------------------
/TextClassification/NaiveBayes1902130.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/NaiveBayes1902130.dat


--------------------------------------------------------------------------------
/TextClassification/NaiveBayes3099268.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/NaiveBayes3099268.dat


--------------------------------------------------------------------------------
/TextClassification/NaiveBayes5432235.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/NaiveBayes5432235.dat


--------------------------------------------------------------------------------
/cosine.txt:
--------------------------------------------------------------------------------
1 | 1,2,5,6
2 | 4,5,9,6
3 | 7,5,8,6
4 | 3,2,5,9
5 | 11,15,19,17
6 | 10,1,1,5


--------------------------------------------------------------------------------
/entity.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/entity.txt


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  2 |   <modelVersion>4.0.0</modelVersion>
  3 |   <groupId>TextAPI</groupId>
  4 |   <artifactId>TextAPI</artifactId>
  5 |   <version>0.0.1-SNAPSHOT</version>
  6 |   <dependencies>
  7 |      <dependency>
  8 |        <groupId>edu.stanford.nlp</groupId>
  9 |        <artifactId>stanford-corenlp</artifactId>
 10 |        <version>3.5.2</version>
 11 |      </dependency>
 12 |     
 13 |      <dependency>
 14 |        <groupId>edu.stanford.nlp</groupId>
 15 |        <artifactId>stanford-corenlp</artifactId>
 16 |        <version>3.5.2</version>
 17 |        <classifier>models</classifier>
 18 |      </dependency>
 19 |      
 20 |      <dependency>
 21 |       <groupId>edu.mit</groupId>
 22 |       <artifactId>jwi</artifactId>
 23 |       <version>2.2.3</version>
 24 |     </dependency>
 25 |     
 26 |     <dependency>
 27 |      <groupId>org.apache.opennlp</groupId>
 28 |        <artifactId>opennlp-tools</artifactId>
 29 |      <version>1.6.0</version>
 30 |     </dependency>
 31 |     
 32 |     <dependency>
 33 |      <groupId>commons-codec</groupId>
 34 |        <artifactId>commons-codec</artifactId>
 35 |      <version>1.9</version>
 36 |     </dependency>
 37 |    	
 38 |    	<dependency>
 39 |       <groupId>nz.ac.waikato.cms.weka</groupId>
 40 |        <artifactId>weka-dev</artifactId>
 41 |       <version>3.7.10</version>
 42 |     </dependency>
 43 |    	
 44 |      <dependency>
 45 |       <groupId>info.debatty</groupId>
 46 |         <artifactId>java-string-similarity</artifactId>
 47 |       <version>0.13</version>
 48 |     </dependency>
 49 |    <!--  <dependency>
 50 |     <groupId>org.apache.lucene</groupId>
 51 |     <artifactId>lucene-queryparser</artifactId>
 52 |     <version>4.1.0</version>
 53 |     </dependency> -->
 54 |     
 55 |     <dependency>
 56 |      <groupId>org.jsoup</groupId>
 57 |        <artifactId>jsoup</artifactId>
 58 |      <version>1.7.2</version>
 59 |    </dependency>
 60 |     
 61 |     <dependency>
 62 |     <groupId>org.apache.commons</groupId>
 63 |     <artifactId>commons-math3</artifactId>
 64 |     <version>3.2</version>
 65 |     </dependency>
 66 |    <!--  <dependency>
 67 |     <groupId>org.apache.lucene</groupId>
 68 |     <artifactId>lucene-core</artifactId>
 69 |     <version>4.1.0</version>
 70 |     </dependency>
 71 |     
 72 |     <dependency>
 73 |     <groupId>org.apache.lucene</groupId>
 74 |     <artifactId>lucene-analyzers-common</artifactId>
 75 |     <version>4.1.0</version>
 76 |     </dependency> -->
 77 |     
 78 |     <dependency>
 79 |   	 <groupId>org.apache.commons</groupId>
 80 |       <artifactId>commons-lang3</artifactId>
 81 |      <version>3.4</version>
 82 |    	</dependency>
 83 |    	
 84 |    	 <dependency>
 85 |     <groupId>org.twitter4j</groupId>
 86 |     <artifactId>twitter4j-stream</artifactId>
 87 |     <version>4.0.4</version>
 88 |     </dependency>  
 89 |       
 90 |     <dependency>
 91 |     <groupId>org.twitter4j</groupId>
 92 |     <artifactId>twitter4j-core</artifactId>
 93 |     <version>4.0.1</version>
 94 |     </dependency>
 95 |     
 96 |     <dependency>
 97 |      <groupId>org.json</groupId>
 98 |      <artifactId>json</artifactId>
 99 |      <version>20160810</version>
100 |     </dependency>
101 |        <dependency>
102 |             <groupId>org.apache.lucene</groupId>
103 |             <artifactId>lucene-core</artifactId>
104 |             <version>4.6.0</version>
105 |         </dependency>
106 |         
107 |         <dependency>
108 |             <groupId>org.apache.lucene</groupId>
109 |             <artifactId>lucene-analyzers-common</artifactId>
110 |             <version>4.6.0</version>
111 |        </dependency>
112 | 
113 |        <dependency>
114 |             <groupId>org.apache.lucene</groupId>
115 |             <artifactId>lucene-queryparser</artifactId>
116 |             <version>4.6.0</version>
117 |       </dependency>
118 |     
119 |   </dependencies>
120 | </project>


--------------------------------------------------------------------------------
/pos.txt:
--------------------------------------------------------------------------------
1 | Taylor Alison Swift (born December 13, 1989) is an American singer-songwriter. Throughout her career, she has become one of the most popular female contemporary singers. She is known for narrative songs about her personal life, which has received much media attention.
2 | Raised in Wyomissing, Pennsylvania, Swift moved to Nashville, Tennessee, at age 14 to pursue a career in country music. She signed with the independent label Big Machine Records and became the youngest artist ever signed by the Sony/ATV Music publishing house. The release of her self-titled debut album in 2006 marked the start of her career as a country music singer. The album's third single, "Our Song", made her the youngest person to single-handedly write and perform a number-one song on the Hot Country Songs chart. Swift's second album, Fearless, was released in 2008. Buoyed by the pop crossover success of the singles "Love Story" and "You Belong with Me", Fearless became the best-selling album of 2009 in the United States. The album won four Grammy Awards, with Swift becoming the youngest Album of the Year winner.
3 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/classify/TextClassifier.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.classify;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.BufferedWriter;
 5 | import java.io.File;
 6 | import java.io.FileInputStream;
 7 | import java.io.FileNotFoundException;
 8 | import java.io.FileReader;
 9 | import java.io.FileWriter;
10 | import java.io.IOException;
11 | import java.io.ObjectInputStream;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 | import weka.classifiers.meta.FilteredClassifier;
15 | import weka.core.Instances;
16 | import weka.core.converters.ArffLoader;
17 | 
18 | public class TextClassifier {
19 | 
20 |     private Instances test;
21 |     private FilteredClassifier classifier;
22 |     List<String> lstLoadData=new ArrayList<>();
23 |     public List<String> lstClassLabel=new ArrayList<>();
24 |     
25 |     public void LoadTestData(File ArffFileName) throws FileNotFoundException, IOException
26 |     {
27 |         BufferedReader bTestReader=new BufferedReader(
28 |                 new FileReader(ArffFileName));
29 |         ArffLoader.ArffReader myarff=new ArffLoader.ArffReader(bTestReader);
30 |         test=myarff.getData();
31 |         bTestReader.close();
32 |     }
33 |     public void loadModel(String ModelName) throws FileNotFoundException, IOException, ClassNotFoundException {
34 | 		
35 | 	ObjectInputStream in = new ObjectInputStream(new FileInputStream(ModelName));
36 |         Object tmp = in.readObject();			
37 |         classifier = (FilteredClassifier) tmp;
38 |         in.close();
39 |  	System.out.println("Model loaded: " + ModelName);
40 |      
41 | 	}
42 |     
43 | 	public void Predict(String OutPutFileName) throws Exception
44 |     {
45 |         List<String> lstLabels=new ArrayList<>();
46 |         test.setClassIndex(test.numAttributes()-1);
47 |         int numOfAttribute=test.numAttributes();
48 |         String createData;
49 |         for (int i = 0; i < test.numInstances(); i++) {
50 |            String FinalVal="";
51 |            double predict = classifier.classifyInstance(test.instance(i));
52 |            for(int k=0;k<numOfAttribute-1;k++)  {
53 |             createData= test.instance(i).stringValue(k);
54 |            FinalVal+= createData+ ",";
55 |            }
56 |            lstLabels.add(FinalVal+test.classAttribute().value((int) predict));
57 |             try (BufferedWriter writer = new BufferedWriter(
58 |                     new FileWriter(OutPutFileName))) {
59 |                 for(String label:lstLabels)
60 |                 {
61 |                     writer.write(label);
62 |                     writer.newLine();
63 |                 }
64 |                 writer.flush();
65 |             }
66 |          }
67 |     }
68 | }
69 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinesentence/AllTermsSentence.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinesentence;
 2 | 
 3 | 
 4 | import java.io.File;
 5 | import java.io.IOException;
 6 | import java.util.HashMap;
 7 | import java.util.Map;
 8 | import java.util.Map.Entry;
 9 | import org.apache.lucene.index.DirectoryReader;
10 | import org.apache.lucene.index.IndexReader;
11 | import org.apache.lucene.index.Terms;
12 | import org.apache.lucene.index.TermsEnum;
13 | import org.apache.lucene.store.FSDirectory;
14 | import org.apache.lucene.util.BytesRef;
15 | 
16 | 
17 | public class AllTermsSentence {
18 |     private Map<String,Integer> allTerms;
19 |     Integer totalNoOfDocumentInIndex;
20 |     IndexReader indexReader;
21 |     
22 |     public AllTermsSentence() throws IOException
23 |     {    
24 |         allTerms = new HashMap<>();
25 |         String current = System.getProperty("user.dir");
26 |          indexReader = DirectoryReader
27 |                 .open(FSDirectory.open(new File(current+"\\IndexSentence\\")));
28 |         totalNoOfDocumentInIndex = indexReader.maxDoc();
29 |     }
30 |         
31 |     public void initAllTerms() throws IOException
32 |     {
33 |         int pos = 0;
34 |         for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++) {
35 |             Terms vector = indexReader.getTermVector(docId, "contents");
36 |             TermsEnum termsEnum = null;
37 |             termsEnum = vector.iterator(termsEnum);
38 |             BytesRef text = null;
39 |             while ((text = termsEnum.next()) != null) {
40 |                 String term = text.utf8ToString();
41 |                 allTerms.put(term, pos++);
42 |             }
43 |         }       
44 |         pos = 0;
45 |         for(Entry<String,Integer> s : allTerms.entrySet())
46 |         {        
47 |            s.setValue(pos++);
48 |         }
49 |     }
50 |     public Map<String, Integer> getAllTerms() {
51 |         return allTerms;
52 |     }
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinesentence/CosineSimilaritySentence.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinesentence;
 2 | 
 3 | 
 4 | public class CosineSimilaritySentence {    
 5 |     public static double CosineSimilarity(DocVectorSentence d1,DocVectorSentence d2) {
 6 |         double cosinesimilarity;
 7 |         try {
 8 |             cosinesimilarity = (d1.vector.dotProduct(d2.vector))
 9 |                     / (d1.vector.getNorm() * d2.vector.getNorm());
10 |         } catch (Exception e) {
11 |             return 0.0;
12 |         }
13 |         return cosinesimilarity;
14 |     }
15 | }


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinesentence/DocVectorSentence.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinesentence;
 2 | 
 3 | import java.util.Map;
 4 | import org.apache.commons.math3.linear.ArrayRealVector;
 5 | import org.apache.commons.math3.linear.RealVector;
 6 | import org.apache.commons.math3.linear.RealVectorFormat;
 7 | 
 8 | public class DocVectorSentence 
 9 | {
10 | 
11 |     public Map<String, Integer> terms;
12 |     public RealVector vector;
13 |     public DocVectorSentence(Map<String, Integer> terms) {
14 |         this.terms = terms;
15 |         this.vector = new ArrayRealVector(terms.size());
16 |     }
17 | 
18 |     public void setEntry(String term, int freq) {
19 |         if (terms.containsKey(term)) {
20 |             int pos = terms.get(term);
21 |             vector.setEntry(pos, (double) freq);
22 |         }
23 |     }
24 | 
25 |     public void normalize() {
26 |         double sum = vector.getL1Norm();
27 |         vector = (RealVector) vector.mapDivide(sum);
28 |     }
29 | 
30 |     @Override
31 |     public String toString() {
32 |         RealVectorFormat formatter = new RealVectorFormat();
33 |         return formatter.format(vector);
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinesentence/IndexSentence.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinesentence;
 2 | 
 3 | import java.io.*;
 4 | import java.util.ArrayList;
 5 | import java.util.List;
 6 | import org.apache.lucene.analysis.Analyzer;
 7 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
 8 | import org.apache.lucene.document.Document;
 9 | import org.apache.lucene.document.Field;
10 | import org.apache.lucene.document.FieldType;
11 | import org.apache.lucene.index.CorruptIndexException;
12 | import org.apache.lucene.index.FieldInfo;
13 | import org.apache.lucene.index.IndexWriter;
14 | import org.apache.lucene.index.IndexWriterConfig;
15 | import org.apache.lucene.store.Directory;
16 | import org.apache.lucene.store.FSDirectory;
17 | import org.apache.lucene.store.LockObtainFailedException;
18 | import org.apache.lucene.util.Version;
19 | 
20 | 
21 | public class IndexSentence {
22 | 
23 |     private final File sourceFileName;
24 |     private final File indexDirectory;
25 |     private static String fieldName;
26 |     private final String QueryText;
27 | 
28 |     public IndexSentence(String fileName, String Query)
29 |     {
30 |        QueryText=Query;
31 |        String current = System.getProperty("user.dir");
32 |        this.sourceFileName = new File(fileName);
33 |        this.indexDirectory = new File(current+"\\IndexSentence\\");
34 |        fieldName="contents";
35 |     }
36 |     public void index() throws CorruptIndexException,
37 |         LockObtainFailedException, IOException {
38 |         Directory dir = FSDirectory.open(indexDirectory);
39 |         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_41,StandardAnalyzer.STOP_WORDS_SET);
40 |         IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_41, analyzer);
41 |         if (indexDirectory.exists()) {
42 |             iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
43 |         } else {
44 |             iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
45 |         }
46 |         IndexWriter writer = new IndexWriter(dir, iwc);
47 |         List<String> lstText=ExtractText(sourceFileName);
48 |         for (String f : lstText) 
49 |         {
50 |             System.out.println("Indexing Sentences...  ");//+f.getName());
51 |             Document doc = new Document();
52 |             FieldType fieldType = new FieldType();
53 |             fieldType.setIndexed(true);
54 |             fieldType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
55 |             fieldType.setStored(true);
56 |             fieldType.setStoreTermVectors(true);
57 |             fieldType.setTokenized(true);
58 |             Field contentField = new Field(fieldName, f, fieldType);
59 |             doc.add(contentField);
60 |             writer.addDocument(doc);
61 |         }
62 |             Document doc = new Document();
63 |             FieldType fieldType = new FieldType();
64 |             fieldType.setIndexed(true);
65 |             fieldType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
66 |             fieldType.setStored(true);
67 |             fieldType.setStoreTermVectors(true);
68 |             fieldType.setTokenized(true);
69 |             Field contentField = new Field(fieldName, QueryText, fieldType);
70 |             doc.add(contentField);
71 |             writer.addDocument(doc);
72 |         System.out.println("Indexing Finished...  ");
73 |         writer.close();
74 |     }
75 |     public List<String> ExtractText(File f) throws FileNotFoundException, IOException 
76 |     {
77 |         List<String> lstValues=new ArrayList<>();
78 |         BufferedReader reader=new BufferedReader(new FileReader(f));
79 |         String Line="";
80 |         while((Line=reader.readLine())!=null)
81 |         {
82 |             lstValues.add(Line);
83 |         }
84 |         return lstValues;
85 |     }
86 |     
87 | 
88 | }


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinesentence/VectorGeneratorSentence.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinesentence;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.util.ArrayList;
 6 | import java.util.HashMap;
 7 | import java.util.List;
 8 | import java.util.Map;
 9 | import org.apache.lucene.document.Document;
10 | import org.apache.lucene.index.DirectoryReader;
11 | import org.apache.lucene.index.IndexReader;
12 | import org.apache.lucene.index.Terms;
13 | import org.apache.lucene.index.TermsEnum;
14 | import org.apache.lucene.store.FSDirectory;
15 | import org.apache.lucene.util.BytesRef;
16 | 
17 | 
18 | public class VectorGeneratorSentence 
19 | {
20 | 
21 |     public int DocId;
22 |     public String DocName;
23 |     public VectorGeneratorSentence(int DocId,String DocName)
24 |     {
25 |         this.DocId=DocId;
26 |         this.DocName=DocName;
27 |     }
28 |     DocVectorSentence[] docVector;
29 |     private Map allterms;
30 |     Integer totalNoOfDocumentInIndex;
31 |     IndexReader indexReader;
32 |     
33 |     private List<VectorGeneratorSentence> lstData=new ArrayList<>();
34 |     public void setLstData(VectorGeneratorSentence VG)
35 |     {
36 |         lstData.add(new VectorGeneratorSentence(VG.DocId, VG.DocName));
37 |     }
38 |     public List<VectorGeneratorSentence> getLstData()
39 |     {
40 |         return lstData;
41 |     }
42 |     
43 |     public VectorGeneratorSentence() throws IOException
44 |     {
45 |        String current = System.getProperty("user.dir");
46 |        allterms = new HashMap<>();
47 |        indexReader=DirectoryReader.open(FSDirectory.open(new File(current+"\\IndexSentence\\")));
48 |        totalNoOfDocumentInIndex=indexReader.maxDoc();
49 |        docVector = new DocVectorSentence[totalNoOfDocumentInIndex];
50 |     }
51 |     
52 |     public void GetAllTerms() throws IOException
53 |     {
54 |         AllTermsSentence allTerms = new AllTermsSentence();
55 |         allTerms.initAllTerms();
56 |         allterms = allTerms.getAllTerms();
57 |     }
58 |     public void ExtractVectorsName(int i) throws IOException
59 |     {
60 |         
61 |         Document doc=indexReader.document(i);
62 |         String docName=doc.get("contents");
63 |         setLstData(new VectorGeneratorSentence(i, docName));
64 | 
65 |     }
66 |     
67 |     public DocVectorSentence[] GetDocumentVectors() throws IOException
68 |     {
69 |         for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++) 
70 |         {
71 |             Terms vector = indexReader.getTermVector(docId, "contents");
72 | //            Document doc=indexReader.document(docId);
73 | //            String FileName=doc.get("FileName");
74 | //            System.out.println(FileName+" "+docId);
75 |             ExtractVectorsName(docId);
76 |             TermsEnum termsEnum = null;
77 |             termsEnum = vector.iterator(termsEnum);
78 |             BytesRef text = null;            
79 |             docVector[docId] = new DocVectorSentence(allterms);            
80 |             while ((text = termsEnum.next()) != null) {
81 |                 String term = text.utf8ToString();
82 |                 int freq = (int) termsEnum.totalTermFreq();
83 |                 docVector[docId].setEntry(term, freq);
84 |             }
85 |             docVector[docId].normalize();
86 |         }
87 |         indexReader.close();
88 |         return docVector;
89 |     }
90 | }


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinetext/AllTerms.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinetext;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.util.HashMap;
 6 | import java.util.Map;
 7 | import java.util.Map.Entry;
 8 | import org.apache.lucene.index.DirectoryReader;
 9 | import org.apache.lucene.index.IndexReader;
10 | import org.apache.lucene.index.Terms;
11 | import org.apache.lucene.index.TermsEnum;
12 | import org.apache.lucene.store.FSDirectory;
13 | import org.apache.lucene.util.BytesRef;
14 | 
15 | 
16 | public class AllTerms {
17 |     private Map<String,Integer> allTerms;
18 |     Integer totalNoOfDocumentInIndex;
19 |     IndexReader indexReader;
20 |     
21 |     public AllTerms() throws IOException
22 |     {    
23 |         allTerms = new HashMap<>();
24 |         String current = System.getProperty("user.dir");
25 |          indexReader = DirectoryReader
26 |                 .open(FSDirectory.open(new File(current+"\\Index\\")));
27 |         totalNoOfDocumentInIndex = indexReader.maxDoc();
28 |     }
29 |         
30 |     public void initAllTerms() throws IOException
31 |     {
32 |         int pos = 0;
33 |         for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++) {
34 |             Terms vector = indexReader.getTermVector(docId, "contents");
35 |             TermsEnum termsEnum = null;
36 |             termsEnum = vector.iterator(termsEnum);
37 |             BytesRef text = null;
38 |             while ((text = termsEnum.next()) != null) {
39 |                 String term = text.utf8ToString();
40 |                 allTerms.put(term, pos++);
41 |             }
42 |         }       
43 |         pos = 0;
44 |         for(Entry<String,Integer> s : allTerms.entrySet())
45 |         {        
46 |            s.setValue(pos++);
47 |         }
48 |     }
49 |     public Map<String, Integer> getAllTerms() {
50 |         return allTerms;
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinetext/CosineSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinetext;
 2 | 
 3 | public class CosineSimilarity {    
 4 |     public static double CosineSimilarity(DocVector d1,DocVector d2) {
 5 |         double cosinesimilarity;
 6 |         try {
 7 |             cosinesimilarity = (d1.vector.dotProduct(d2.vector))
 8 |                     / (d1.vector.getNorm() * d2.vector.getNorm());
 9 |         } catch (Exception e) {
10 |             return 0.0;
11 |         }
12 |         return cosinesimilarity;
13 |     }
14 | }


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinetext/DocVector.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinetext;
 2 | 
 3 | import java.util.Map;
 4 | import org.apache.commons.math3.linear.ArrayRealVector;
 5 | import org.apache.commons.math3.linear.RealVector;
 6 | import org.apache.commons.math3.linear.RealVectorFormat;
 7 | 
 8 | public class DocVector {
 9 | 
10 |     public Map<String, Integer> terms;
11 |     public RealVector vector;
12 |     
13 |     public DocVector(Map<String, Integer> terms) {
14 |         this.terms = terms;
15 |         this.vector = new ArrayRealVector(terms.size());        
16 |     }
17 | 
18 |     public void setEntry(String term, int freq) {
19 |         if (terms.containsKey(term)) {
20 |             int pos = terms.get(term);
21 |             vector.setEntry(pos, (double) freq);
22 |         }
23 |     }
24 | 
25 |     public void normalize() {
26 |         double sum = vector.getL1Norm();
27 |         vector = (RealVector) vector.mapDivide(sum);
28 |     }
29 | 
30 |     @Override
31 |     public String toString() {
32 |         RealVectorFormat formatter = new RealVectorFormat();
33 |         return formatter.format(vector);
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinetext/Index.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinetext;
 2 | 
 3 | import java.io.*;
 4 | import java.nio.file.Files;
 5 | import java.nio.file.Paths;
 6 | import org.apache.lucene.analysis.Analyzer;
 7 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
 8 | import org.apache.lucene.document.Document;
 9 | import org.apache.lucene.document.Field;
10 | import org.apache.lucene.document.FieldType;
11 | import org.apache.lucene.document.TextField;
12 | import org.apache.lucene.index.CorruptIndexException;
13 | import org.apache.lucene.index.FieldInfo;
14 | import org.apache.lucene.index.IndexWriter;
15 | import org.apache.lucene.index.IndexWriterConfig;
16 | import org.apache.lucene.store.Directory;
17 | import org.apache.lucene.store.FSDirectory;
18 | import org.apache.lucene.store.LockObtainFailedException;
19 | import org.apache.lucene.util.Version;
20 | 
21 | 
22 | public class Index {
23 | 
24 |     private final File sourceDirectory;
25 |     private final File indexDirectory;
26 |     private static String fieldName;
27 | 
28 |     public Index(String DataDir)
29 |     {
30 |        String current = System.getProperty("user.dir");
31 |        this.sourceDirectory = new File(DataDir);
32 |        this.indexDirectory = new File(current+"\\Index\\");
33 |        fieldName="contents";
34 |     }
35 |     public void index() throws CorruptIndexException,
36 |         LockObtainFailedException, IOException {
37 |         Directory dir = FSDirectory.open(indexDirectory);
38 |         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_41,StandardAnalyzer.STOP_WORDS_SET);
39 |         IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_41, analyzer);
40 |         if (indexDirectory.exists()) {
41 |             iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
42 |         } else {
43 |             iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
44 |         }
45 |         IndexWriter writer = new IndexWriter(dir, iwc);
46 |         for (File f : sourceDirectory.listFiles()) {
47 |             System.out.println("Indexing Document  "+f.getName());
48 |             Document doc = new Document();
49 |             FieldType fieldType = new FieldType();
50 |             fieldType.setIndexed(true);
51 |             fieldType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
52 |             fieldType.setStored(true);
53 |             fieldType.setStoreTermVectors(true);
54 |             fieldType.setTokenized(true);
55 |             Field contentField = new Field(fieldName, ExtractText(f), fieldType);
56 |             doc.add(contentField);
57 |             doc.add(new TextField("FileName", f.getName(), Field.Store.YES));
58 |             doc.add(new TextField("FilePath",f.getCanonicalPath(),Field.Store.YES));
59 |             writer.addDocument(doc);
60 |         }
61 |         writer.close();
62 |     }
63 |     public String ExtractText(File f) throws FileNotFoundException, IOException 
64 |     {
65 |         String textFileContent = "";
66 |         for (String line : Files.readAllLines(Paths.get(f.getAbsolutePath())))
67 |         {
68 |             textFileContent += line;
69 |         }
70 |         return textFileContent;
71 |     }
72 |     
73 | 
74 | }


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinetext/VectorGenerator.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinetext;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.util.ArrayList;
 6 | import java.util.Arrays;
 7 | import java.util.Comparator;
 8 | import java.util.HashMap;
 9 | import java.util.List;
10 | import java.util.Map;
11 | import org.apache.lucene.document.Document;
12 | import org.apache.lucene.index.DirectoryReader;
13 | import org.apache.lucene.index.IndexReader;
14 | import org.apache.lucene.index.Terms;
15 | import org.apache.lucene.index.TermsEnum;
16 | import org.apache.lucene.store.FSDirectory;
17 | import org.apache.lucene.util.BytesRef;
18 | 
19 | 
20 | public class VectorGenerator 
21 | {
22 | 
23 |     public int DocId;
24 |     public String DocName;
25 |     public VectorGenerator(int DocId,String DocName)
26 |     {
27 |         this.DocId=DocId;
28 |         this.DocName=DocName;
29 |     }
30 |     DocVector[] docVector;
31 |     private Map allterms;
32 |     Integer totalNoOfDocumentInIndex;
33 |     IndexReader indexReader;
34 |     
35 |     private List<VectorGenerator> lstData=new ArrayList<>();
36 |     public void setLstData(VectorGenerator VG)
37 |     {
38 |         lstData.add(new VectorGenerator(VG.DocId, VG.DocName));
39 |     }
40 |     public List<VectorGenerator> getLstData()
41 |     {
42 |         return lstData;
43 |     }
44 |     
45 |     public VectorGenerator() throws IOException
46 |     {
47 |        String current = System.getProperty("user.dir");
48 |        allterms = new HashMap<>();
49 |        indexReader=DirectoryReader.open(FSDirectory.open(new File(current+"\\Index\\")));
50 |        totalNoOfDocumentInIndex=indexReader.maxDoc();
51 |        docVector = new DocVector[totalNoOfDocumentInIndex];
52 |     }
53 |     
54 |     public void GetAllTerms() throws IOException
55 |     {
56 |         AllTerms allTerms = new AllTerms();
57 |         allTerms.initAllTerms();
58 |         allterms = allTerms.getAllTerms();
59 |     }
60 |     public void ExtractVectorsName(int i) throws IOException
61 |     {
62 |         
63 |         Document doc=indexReader.document(i);
64 |         String docName=doc.get("FileName");
65 |         setLstData(new VectorGenerator(i, docName));
66 | 
67 |     }
68 |     
69 |     public DocVector[] GetDocumentVectors() throws IOException
70 |     {
71 |         for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++) 
72 |         {
73 |             Terms vector = indexReader.getTermVector(docId, "contents");
74 | //            Document doc=indexReader.document(docId);
75 | //            String FileName=doc.get("FileName");
76 | //            System.out.println(FileName+" "+docId);
77 |             ExtractVectorsName(docId);
78 |             TermsEnum termsEnum = null;
79 |             termsEnum = vector.iterator(termsEnum);
80 |             BytesRef text = null;            
81 |             docVector[docId] = new DocVector(allterms);            
82 |             while ((text = termsEnum.next()) != null) {
83 |                 String term = text.utf8ToString();
84 |                 int freq = (int) termsEnum.totalTermFreq();
85 |                 docVector[docId].setEntry(term, freq);
86 |             }
87 |             docVector[docId].normalize();
88 |         }
89 |         indexReader.close();
90 |         return docVector;
91 |     }
92 | }


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/Classification.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.domain;
  2 | 
  3 | 
  4 | 
  5 | 
  6 | public class Classification {
  7 | 
  8 | 	public Classification(){}
  9 | 	public Classification(double pre,double recall,double auc,double correct,double inCorrect, double errorRate,
 10 | 			double fn,double fp,double tn,double tp,double kappa,double MAbsError,
 11 | 			double numInstances,double relAbsErorr,double fMeasure)
 12 | 	{
 13 | 		this.precision=pre;
 14 | 		this.recall=recall;
 15 | 		this.auc=auc;
 16 | 		this.incorrect=inCorrect;
 17 | 		this.correct=correct;
 18 | 		this.errorRate=errorRate;
 19 | 		this.fn=fn;
 20 | 		this.fp=fp;
 21 | 		this.tn=tn;
 22 | 		this.tp=tp;
 23 | 		this.kappa=kappa;
 24 | 		this.meanAbsoluteError=MAbsError;
 25 | 		this.numInstances=numInstances;
 26 | 		this.relativeAbsoluteError=relAbsErorr;
 27 | 		this.fMeasure=fMeasure;
 28 | 	}
 29 | 	private double precision;
 30 | 	private double recall;
 31 | 	private double auc;
 32 | 	private double correct;
 33 | 	private double incorrect;
 34 | 	private double errorRate;
 35 | 	private double fn;
 36 | 	private double fp;
 37 | 	private double tn;
 38 | 	private double tp;
 39 | 	private double kappa;
 40 | 	private double meanAbsoluteError;
 41 | 	private double numInstances;
 42 | 	private double relativeAbsoluteError;
 43 | 	private double fMeasure;
 44 | 	
 45 | 	public void setInCorrect(double incorrect)
 46 | 	{
 47 | 		this.incorrect=incorrect;
 48 | 	}
 49 | 	public double getInCorrect()
 50 | 	{
 51 | 		return this.incorrect;
 52 | 	}
 53 | 	
 54 | 	public void setCorrect(double correct)
 55 | 	{
 56 | 		this.correct=correct;
 57 | 	}
 58 | 	public double getCorrect()
 59 | 	{
 60 | 		return this.correct;
 61 | 	}
 62 | 	public void setPrecision(double precision)
 63 | 	{
 64 | 		this.precision=precision;
 65 | 	}
 66 | 	public double getPrecision()
 67 | 	{
 68 | 		return this.precision;
 69 | 	}
 70 | 	public void setRecall(double recall)
 71 | 	{
 72 | 		this.recall=recall;
 73 | 	}
 74 | 	public double getRecall()
 75 | 	{
 76 | 		return this.recall;
 77 | 	}
 78 | 	public void setAuc(double auc)
 79 | 	{
 80 | 		this.auc=auc;
 81 | 	}
 82 | 	public double getAuc()
 83 | 	{
 84 | 		return this.auc;
 85 | 	}
 86 | 	public void setErrorRate(double errorRate)
 87 | 	{
 88 | 		this.errorRate=errorRate;
 89 | 	}
 90 | 	public double getErrorRate()
 91 | 	{
 92 | 		return this.errorRate;
 93 | 	}
 94 | 	public void setFn(double fn)
 95 | 	{
 96 | 		this.fn=fn;
 97 | 	}
 98 | 	public double getFn()
 99 | 	{
100 | 		return this.fn;
101 | 	}
102 | 	public void setFp(double fp)
103 | 	{
104 | 		this.fp=fp;
105 | 	}
106 | 	public double getFp()
107 | 	{
108 | 		return this.fp;
109 | 	}
110 | 	public void setTn(double tn)
111 | 	{
112 | 		this.tn=tn;
113 | 	}
114 | 	public double getTn()
115 | 	{
116 | 		return this.tn;
117 | 	}
118 | 	public void setTp(double tp)
119 | 	{
120 | 		this.tp=tp;
121 | 	}
122 | 	public double getTp()
123 | 	{
124 | 		return tp;
125 | 	}
126 | 	public void setKappa(double kappa)
127 | 	{
128 | 		this.kappa=kappa;
129 | 	}
130 | 	public double getKappa()
131 | 	{
132 | 		return this.kappa;
133 | 	}
134 | 	public void setMeanAbsoluteError(double meanAbsoluteError)
135 | 	{
136 | 		this.meanAbsoluteError=meanAbsoluteError;
137 | 	}
138 | 	public double getMeanAbsoluteError()
139 | 	{
140 | 		return this.meanAbsoluteError;
141 | 	}
142 | 	public void setNumInstances(double d)
143 | 	{
144 | 		this.numInstances=d;
145 | 	}
146 | 	public double getNumInstances()
147 | 	{
148 | 		return this.numInstances;
149 | 	}
150 | 	public void setRelativeAbsoluteError(double relativeAbsoluteError)
151 | 	{
152 | 		this.relativeAbsoluteError=relativeAbsoluteError;
153 | 	}
154 | 	public double getRelativeAbsoluteError()
155 | 	{
156 | 		return this.relativeAbsoluteError;
157 | 	}
158 | 	public void setFMeasure(double fMeasure)
159 | 	{
160 | 		this.fMeasure=fMeasure;
161 | 	}
162 | 	public double getFMeasure()
163 | 	{
164 | 		return this.fMeasure;
165 | 	}
166 | }
167 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractNamedEntity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractNamedEntity {
 4 | 
 5 | 	public ExtractNamedEntity()
 6 | 	{
 7 | 		
 8 | 	}
 9 | 	
10 | 	 public String word;
11 | 	 public String ner;
12 | 	 public int position;
13 | 	/*public ExtractNamedEntity(String word,String ner)
14 | 	{
15 | 		this.word=word;
16 | 		this.ner=ner;
17 | 	}*/
18 | 	public ExtractNamedEntity(String word,String ner,int position)
19 | 	{
20 | 		this.word=word;
21 | 		this.ner=ner;
22 | 		this.position=position;
23 | 	}
24 | 	 public int getPosition() {
25 | 		return position;
26 | 	}
27 | 	public void setPosition(int position) {
28 | 		this.position = position;
29 | 	}	 
30 | 	 public void setWord(String word)
31 | 	 {
32 | 		 this.word=word;
33 | 	 }
34 | 	 public String getWord()
35 | 	 {
36 | 		 return this.word;
37 | 	 }
38 | 	 
39 | 	 public void setNer(String ner)
40 | 	 {
41 | 		 this.ner=ner;
42 | 	 }
43 | 	 public String getNer()
44 | 	 {
45 | 		 return this.ner;
46 | 	 }
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractNumberSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractNumberSimilarity {
 4 | 
 5 | 	private String vector1;
 6 | 	private String vector2;
 7 | 	private double score;
 8 | 	
 9 | 	public ExtractNumberSimilarity(){}
10 | 	
11 | 	public ExtractNumberSimilarity(String vector1,String vector2,double score)
12 | 	{
13 | 		this.vector1=vector1;
14 | 		this.vector2=vector2;
15 | 		this.score=score;
16 | 	}
17 | 	
18 | 	public void setVector1(String vector1)
19 | 	{
20 | 		this.vector1=vector1;
21 | 	}
22 | 	public String getVector1()
23 | 	{
24 | 		return this.vector1;
25 | 	}
26 | 	
27 | 	public void setVecor2(String vector2)
28 | 	{
29 | 		this.vector2=vector2;
30 | 	}
31 | 	public String getVector2()
32 | 	{
33 | 		return this.vector2;
34 | 	}
35 | 	public void setScore(double score)
36 | 	{
37 | 		this.score=score;
38 | 	}
39 | 	public double getScore()
40 | 	{
41 | 		return this.score;
42 | 	}
43 | }
44 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractPosTag.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractPosTag {
 4 | 	 
 5 |   public ExtractPosTag(){}
 6 |   public ExtractPosTag(String wordPart,String tag)
 7 |   {
 8 | 	  this.wordPart=wordPart;
 9 | 	  this.tag=tag;
10 |   }
11 |   private String wordPart;
12 |   private String tag;
13 |   private int itemCount;
14 |   
15 |   public void setWordPart(String wordPart)
16 |   {
17 | 	  this.wordPart=wordPart;
18 |   }
19 |   public String getWordPart()
20 |   {
21 | 	  return this.wordPart;
22 |   }
23 |   public void setTag(String tag)
24 |   {
25 | 	  this.tag=tag;
26 |   }
27 |   public String getTag()
28 |   {
29 | 	  return this.tag;
30 |   }
31 |   public void setItemCount(int itemCount)
32 |   {
33 | 	  this.itemCount=itemCount;
34 |   }
35 |   public int getItemCount()
36 |   {
37 | 	  return this.itemCount;
38 |   }
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractStem.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractStem {
 4 | 
 5 | 	 private String word1;
 6 | 	    public void setWord1(String word)
 7 | 	    {
 8 | 	        this.word1=word;
 9 | 	    }
10 | 	    public String getWord1()
11 | 	    {
12 | 	        return word1;
13 | 	    }
14 | 	    private String derived1;
15 | 	    public void setDerived1(String derived)
16 | 	    {
17 | 	        this.derived1=derived;
18 | 	    }
19 | 	    public String getDerived1()
20 | 	    {
21 | 	    	return this.derived1;
22 | 	    }
23 | 	    private String word2;
24 | 	    public void setWord2(String word)
25 | 	    {
26 | 	        this.word2=word;
27 | 	    }
28 | 	    public String getWord2()
29 | 	    {
30 | 	        return word2;
31 | 	    }
32 | 	    private String derived2;
33 | 	    public void setDerived2(String derived)
34 | 	    {
35 | 	        this.derived2=derived;
36 | 	    }
37 | 	    public String getDerived2()
38 | 	    {
39 | 	        return this.derived2;
40 | 	    }
41 | 	    public ExtractStem(String word1,String derived1,String word2,String derived2)
42 | 	    {
43 | 	        this.word1=word1;
44 | 	        this.word2=word2;
45 | 	        this.derived1=derived1;
46 | 	        this.derived2=derived2;
47 | 	    }
48 | 		public ExtractStem() {
49 | 			// TODO Auto-generated constructor stub
50 | 		}
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractSynonym.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractSynonym {
 4 | 
 5 | 	public ExtractSynonym(){}
 6 | 	public ExtractSynonym(String word,String synset)
 7 | 	{
 8 | 		this.word=word;
 9 | 		this.synset=synset;
10 | 	}
11 | 	private String word;
12 | 	
13 | 	private String synset;
14 | 
15 | 	public String getWord() {
16 | 		return word;
17 | 	}
18 | 
19 | 	public void setWord(String word) {
20 | 		this.word = word;
21 | 	}
22 | 
23 | 	public String getSynset() {
24 | 		return synset;
25 | 	}
26 | 
27 | 	public void setSynset(String synset) {
28 | 		this.synset = synset;
29 | 	}
30 | 	
31 | }
32 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractTextCosineSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | 
 4 | public class ExtractTextCosineSimilarity {
 5 | 
 6 | 	public ExtractTextCosineSimilarity(){}
 7 |     public ExtractTextCosineSimilarity(String DocName, String DocCandidate,double Similarity)
 8 |     {
 9 |         this.DocName=DocName;
10 |         this.DocCandidate=DocCandidate;
11 |         this.Similarity=Similarity;
12 |     }
13 |     public String query;
14 |     public void setQuery(String query)
15 |     {
16 |     	this.query=query;
17 |     }
18 |     public String getQuery()
19 |     {
20 |     	return this.query;
21 |     }
22 |     public String DocName;
23 |     public void setDocName(String DocName)
24 |     {
25 |         this.DocName=DocName;
26 |     }
27 |     public String getDocName()
28 |     {
29 |         return DocName;
30 |     }
31 |     public String DocCandidate;
32 |     public void setDocCandidate(String DocCandidate)
33 |     {
34 |         this.DocCandidate=DocCandidate;
35 |     }
36 |     public String getDocCandidate()
37 |     {
38 |         return this.DocCandidate;
39 |     }
40 |     public double Similarity;
41 |     public void setSimilairty(double CosineSimilarity)
42 |     {
43 |         this.Similarity=CosineSimilarity;
44 |     }
45 |     public double getSimilarity()
46 |     {
47 |         return this.Similarity;
48 |     }
49 | 
50 | }
51 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractTextSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractTextSimilarity {
 4 | 
 5 | 	private String word;
 6 |     public void setWord(String word)
 7 |     {
 8 |         this.word=word;
 9 |     }
10 |     public String getWord()
11 |     {
12 |         return this.word;
13 |     }
14 |     private String candidate;
15 |     public void setCandidate(String candidate)
16 |     {
17 |         this.candidate=candidate;
18 |     }
19 |     public String getCandidate()
20 |     {
21 |         return this.candidate;
22 |     }
23 |     private double similarity;
24 |     public void setSimilarity(double similarity)
25 |     {
26 |         this.similarity=similarity;
27 |     }
28 |     public double getSimilarity()
29 |     {
30 |         return this.similarity;
31 |     }
32 |     public ExtractTextSimilarity(){}
33 |     public ExtractTextSimilarity (String Word, String Candidate, double Similarity)
34 |     {
35 |         this.word=Word;
36 |         this.candidate=Candidate;
37 |         this.similarity=Similarity;
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractTextTfidfSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | 
 4 | public class ExtractTextTfidfSimilarity {
 5 | 
 6 | 	private String query;
 7 | 
 8 | 	private String sentence;
 9 |     private String similarSentence;
10 |     private String score;
11 |     public ExtractTextTfidfSimilarity(){}
12 |     
13 |     public ExtractTextTfidfSimilarity(String searchText, String similarSentence, String score) {
14 | 		this.sentence=searchText;
15 | 		this.similarSentence=similarSentence;
16 | 		this.score=score;
17 | 	}
18 |     public void setQuery(String query)
19 |     {
20 |     	this.query=query;
21 |     }
22 |     public String getQuery()
23 |     {
24 |     	return this.query;
25 |     }
26 | 	public void setSentence(String sentence)
27 |     {
28 |     	this.sentence=sentence;
29 |     }
30 |     public String getSentence()
31 |     {
32 |     	return this.sentence;
33 |     }
34 |     
35 |     public void setSimilaritySentence(String similaritySentence)
36 |     {
37 |     	this.similarSentence=similaritySentence;
38 |     }
39 |     
40 |     public String getSimilaritySentence()
41 |     {
42 |     	return this.similarSentence;
43 |     }
44 |     
45 |     public void serScore(String score)
46 |     {
47 |     	this.score=score;
48 |     }
49 |     public String getScore()
50 |     {
51 |     	return this.score;
52 |     }
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractionKeyword.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | 
 4 | public class ExtractionKeyword {
 5 | 
 6 | 	public ExtractionKeyword(){}
 7 | 	public ExtractionKeyword(String tweet,String keyword)
 8 | 	{
 9 | 		this.tweet=tweet;
10 | 		this.keyword=keyword;
11 | 	}
12 | 	public ExtractionKeyword(String keyword)
13 | 	{
14 | 		this.keyword=keyword;
15 | 	}
16 | 	public String tweet;
17 | 	public String keyword;
18 | 	public String inputSentence;
19 | 	public String inputTweet;
20 | 
21 | 	
22 | 	public void setInputSentence(String inputSentence)
23 | 	{
24 | 		this.inputSentence=inputSentence;
25 | 	}
26 | 	
27 | 	public String getInputSentence()
28 | 	{
29 | 		return inputSentence;
30 | 	}
31 | 	public void setInputTweet(String inputTweet)
32 | 	{
33 | 	  this.inputTweet=inputTweet;
34 | 	}
35 | 	public String getInputTweet()
36 | 	{
37 | 		return inputTweet;
38 | 	}
39 | 	public void setTweet(String tweet)
40 | 	{
41 | 		this.tweet=tweet;
42 | 	}
43 | 	public String getTweet()
44 | 	{
45 | 		return tweet;
46 | 	}
47 | 	
48 | 	public void setKeyword(String keyword)
49 | 	{
50 | 		this.keyword=keyword;
51 | 	}
52 | 	public String getKeyword()
53 | 	{
54 | 		return keyword;
55 | 	}
56 | 	
57 | }
58 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextDecisionTree.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileNotFoundException;
 5 | import java.io.IOException;
 6 | import java.util.List;
 7 | 
 8 | import unsw.curation.api.domain.Classification;
 9 | 
10 | public interface IClassificationTextDecisionTree {
11 | 
12 | 	void LoadDataset(File arffFileName) throws IOException;
13 | 	List<Classification> EvaluateDecisionTree() throws Exception;
14 | 	void LearnDecisionTree() throws Exception;
15 | 	void SaveModel(String modelName) throws FileNotFoundException, IOException;
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextKNN.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileNotFoundException;
 5 | import java.io.IOException;
 6 | import java.util.List;
 7 | 
 8 | import unsw.curation.api.domain.Classification;
 9 | 
10 | public interface IClassificationTextKNN {
11 | 
12 | 	void LoadDataset(File arffFileName) throws IOException;
13 | 	List<Classification> EvaluateKNN() throws Exception;
14 | 	void LearnKNN() throws Exception;
15 | 	void SaveModel(String modelName) throws FileNotFoundException, IOException;
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextLogisticRegression.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileNotFoundException;
 5 | import java.io.IOException;
 6 | import java.util.List;
 7 | 
 8 | import unsw.curation.api.domain.Classification;
 9 | 
10 | public interface IClassificationTextLogisticRegression {
11 | 
12 | 	void LoadDataset(File arffFileName) throws IOException;
13 | 	List<Classification> EvaluateLogisticRegression() throws Exception;
14 | 	void LearnLogisticRegression() throws Exception;
15 | 	void SaveModel(String modelName) throws FileNotFoundException, IOException;
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextNaiveBays.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileNotFoundException;
 5 | import java.io.IOException;
 6 | import java.util.List;
 7 | 
 8 | import unsw.curation.api.domain.Classification;
 9 | 
10 | public interface IClassificationTextNaiveBays {
11 | 
12 | 	void LoadDataset(File arffFileName) throws IOException;
13 | 	List<Classification> EvaluateNaiveBays() throws Exception;
14 | 	void LearnNaiveBays() throws Exception;
15 | 	void SaveModel(String modelName) throws FileNotFoundException, IOException;
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextNeuralNetwork.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileNotFoundException;
 5 | import java.io.IOException;
 6 | import java.util.List;
 7 | 
 8 | import unsw.curation.api.domain.Classification;
 9 | 
10 | public interface IClassificationTextNeuralNetwork {
11 | 
12 | 	void LoadDataset(File arffFileName) throws IOException;
13 | 	List<Classification> EvaluateNeuralNetwork() throws Exception;
14 | 	void LearnNeuralNetwork() throws Exception;
15 | 	void SaveModel(String modelName) throws FileNotFoundException, IOException;
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextRandomForest.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileNotFoundException;
 5 | import java.io.IOException;
 6 | import java.util.List;
 7 | 
 8 | import unsw.curation.api.domain.Classification;
 9 | 
10 | public interface IClassificationTextRandomForest 
11 | {
12 | 	void LoadDataset(File arffFileName) throws IOException;
13 | 	List<Classification> EvaluateRandomForest() throws Exception;
14 | 	void LearnRandomForest() throws Exception;
15 | 	void SaveModel(String modelName) throws FileNotFoundException, IOException;
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextSVM.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileNotFoundException;
 5 | import java.io.IOException;
 6 | import java.util.List;
 7 | 
 8 | import unsw.curation.api.domain.Classification;
 9 | 
10 | public interface IClassificationTextSVM {
11 | 
12 | 	void LoadDataset(File arffFileName) throws IOException;
13 | 	List<Classification> EvaluateSVM() throws Exception;
14 | 	void LearnSVM() throws Exception;
15 | 	void SaveModel(String modelName) throws FileNotFoundException, IOException;
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/IKeywordEx.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | import java.io.File;
 3 | import java.io.FileNotFoundException;
 4 | import java.io.IOException;
 5 | import java.util.List;
 6 | 
 7 | import unsw.curation.api.domain.ExtractionKeyword;
 8 | 
 9 | public interface IKeywordEx {
10 | 
11 | 	String ExtractTweetKeyword(String inputTweet,File stopWordList) throws Exception;
12 | 	List<ExtractionKeyword> ExtractTweetKeywordFromFile(File fileName, File stopWordList) throws FileNotFoundException, IOException;
13 | 	String ExtractSentenceKeyword(String inputSentence, File stopWordList) throws Exception;
14 | 	//String ExtractSentenceKeyPhrase(String inputSentence,File stopWordList) throws Exception;
15 | 	String ExtractFileKeyword(File fileName, File stopWordList) throws FileNotFoundException, IOException;
16 | 	/*ExtractionKeyword ExtractSentenceKeywords(String inputSentence) throws Exception;
17 | 	ExtractionKeyword ExtractFileKeywords(String inputFilePath) throws Exception;*/
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/INamedEntity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileNotFoundException;
 5 | import java.io.IOException;
 6 | import java.net.URISyntaxException;
 7 | import java.util.List;
 8 | 
 9 | import unsw.curation.api.domain.ExtractNamedEntity;
10 | 
11 | public interface INamedEntity {
12 | 
13 | 	List<ExtractNamedEntity>ExtractNamedEntityFile(File filePath) throws Exception;
14 | 	//List<ExtractNamedEntity>ExtractNamedEntity(boolean useRegexNer,List<String> lstData) throws Exception;
15 | 	List<ExtractNamedEntity> ExtractNamedEntitySentence(String inputSentence) throws Exception;
16 | 	List<String> ExtractOrganization(String inputSentence) throws URISyntaxException, Exception;
17 | 	List<String> ExtractPerson(String inputSentence)throws URISyntaxException, Exception;
18 | 	List<String> ExtractLocation(String inputSentence)throws URISyntaxException, Exception;
19 | 	List<String> ExtractDate(String inputSentence)throws URISyntaxException, Exception;
20 | 	List<String> ExtractMoney(String inputSentence)throws URISyntaxException, Exception;
21 | 	List<String> ExtractCity(String inputSentence)throws URISyntaxException, Exception;
22 | 	List<String> ExtractState(String inputSentence)throws URISyntaxException, Exception;
23 | 	List<String> ExtractCountry(String inputSentence)throws URISyntaxException, FileNotFoundException, IOException, Exception;
24 | 	List<String> ExtractContinent(String inputSentence)throws URISyntaxException, Exception;
25 | 	List<String> ExtractCrime(String inputSentence)throws URISyntaxException, Exception;
26 | 	List<String> ExtractSport(String inputSentence)throws URISyntaxException, Exception;
27 | 	List<String> ExtractHoliday(String inputSentence)throws URISyntaxException, Exception;
28 | 	List<String> ExtractCompany(String inputSentence)throws URISyntaxException, Exception;
29 | 	List<String> ExtractNaturalDisaster(String inputSentence)throws URISyntaxException, Exception;
30 | 	List<String> ExtractDrug(String inputSentence)throws URISyntaxException, Exception;
31 | 	List<String> ExtractProduct(String inputSentence)throws URISyntaxException, Exception;
32 | 	//List<String> ExtractRadioProgram(String inputSentence)throws URISyntaxException, Exception;
33 | 	//List<String> ExtractRadioStation(String inputSentence)throws URISyntaxException, Exception;
34 | 	//List<String> ExtractTvShows(String inputSentence)throws URISyntaxException;
35 | 	List<String> ExtractMedia(String inputSentence)throws URISyntaxException, Exception;
36 | 	List<String> ExtractOperatingSystem(String inputSentence)throws URISyntaxException, Exception;
37 | 	List<String> ExtractDegree(String inputSentence)throws URISyntaxException, Exception;
38 | 	List<String> ExtractSportEvents(String inputSentence)throws URISyntaxException, Exception;
39 | 	//List<String> ExtractRegion(String inputSentence)throws URISyntaxException;
40 | 	//List<String> ExtractGeographicFeature(String inputSentence)throws URISyntaxException;
41 | 	List<String> ReadRawData(File filePath) throws Exception;
42 | 	
43 | 		
44 | 	
45 | }
46 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/INumberCosineSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.List;
 5 | 
 6 | import unsw.curation.api.domain.ExtractNumberSimilarity;
 7 | 
 8 | 
 9 | 
10 | public interface INumberCosineSimilarity {
11 | 
12 | 	double Cosine_Vector_Vector(double [] number1,double [] number2);
13 | 	List<ExtractNumberSimilarity> Cosine_Vector_VectorS(String filePath) throws IOException;
14 | 	List<ExtractNumberSimilarity> Cosine_Vector_VectorS(double [] vector,String filePath) throws IOException;
15 | 	
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/INumberDiceSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.List;
 5 | 
 6 | import unsw.curation.api.domain.ExtractNumberSimilarity;
 7 | 
 8 | 
 9 | public interface INumberDiceSimilarity {
10 | 
11 | 	double Dice_Vector_Vector(double [] number1,double [] number2);
12 | 	List<ExtractNumberSimilarity> Dice_Vector_VectorS(String filePath) throws IOException;
13 | 	List<ExtractNumberSimilarity> Dice_Vector_VectorS(Double [] vector,String filePath) throws IOException;
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/INumberEuclideanSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.List;
 5 | 
 6 | import unsw.curation.api.domain.ExtractNumberSimilarity;
 7 | 
 8 | 
 9 | 
10 | public interface INumberEuclideanSimilarity {
11 | 
12 | 	double Euclidean_Vector_Vector(double [] number1,double [] number2);
13 | 	List<ExtractNumberSimilarity> Euclidean_Vector_VectorS(String filePath) throws IOException;
14 | 	List<ExtractNumberSimilarity> Euclidean_Vector_VectorS(double [] vector,String filePath) throws IOException;
15 | }
16 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/INumberJaccardSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.List;
 5 | 
 6 | import unsw.curation.api.domain.ExtractNumberSimilarity;
 7 | 
 8 | 
 9 | 
10 | public interface INumberJaccardSimilarity {
11 | 
12 | 	double Jaccard_Vector_Vector(double [] number1,double [] number2);
13 | 	List<ExtractNumberSimilarity> Jaccard_Vector_VectorS(String filePath) throws IOException;
14 | 	List<ExtractNumberSimilarity> Jaccard_Vector_VectorS(Double [] vector,String filePath) throws IOException;
15 | }
16 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/IPosTag.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.util.List;
 5 | 
 6 | import unsw.curation.api.domain.ExtractPosTag;
 7 | 
 8 | public interface IPosTag {
 9 | 
10 | 	List<ExtractPosTag> ExtractNoun(String sentence);
11 | 	List<ExtractPosTag> ExtractAdjective(String sentence);
12 | 	List<ExtractPosTag> ExtractAdverb(String sentence);
13 | 	List<ExtractPosTag> ExtractVerb(String sentence);
14 | 	List<ExtractPosTag> ExtractQuotaion(String sentence);
15 | 	List<String> ExtractPhrase(String sentence);
16 | 	List<ExtractPosTag> ExtractNoun(File filePath)throws Exception;
17 | 	List<ExtractPosTag> ExtractAdjective(File filePath)throws Exception;
18 | 	List<ExtractPosTag> ExtractAdverb(File filePath) throws Exception;
19 | 	List<ExtractPosTag> ExtractVerb(File filePath) throws Exception;
20 | 	
21 | 	List<ExtractPosTag> ExtractPosTagsSentence(String sentence);
22 | 	List<ExtractPosTag> ExtractPosTagsSentenceNew(String sentence);
23 | 	List<ExtractPosTag> ExtractPosTagsFile(File filePath) throws Exception;
24 | 	
25 | 	List<String> ExtractData(File filePath) throws Exception;
26 | 	List<ExtractPosTag> ExtractPosTags(List<String> inputData);
27 | }
28 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/IStem.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileNotFoundException;
 5 | import java.io.IOException;
 6 | import java.net.URISyntaxException;
 7 | import java.util.List;
 8 | 
 9 | import unsw.curation.api.domain.ExtractStem;
10 | 
11 | public interface IStem {
12 | 
13 | 	void ReadDataset() throws FileNotFoundException, IOException, URISyntaxException;
14 | 	List<ExtractStem> FindWordDerivedForms(String word) throws FileNotFoundException, IOException, URISyntaxException;
15 | }
16 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/ISynonym.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.IOException;
 4 | import java.net.URISyntaxException;
 5 | import java.util.List;
 6 | 
 7 | public interface ISynonym {
 8 | 
 9 | 	List<String> ExtractSynonymWord(String word) throws URISyntaxException, IOException;
10 | 	List<String> ExtractHypernymWord(String word);
11 | }
12 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/ITextCosineSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.net.URISyntaxException;
 6 | import java.util.List;
 7 | 
 8 | import org.apache.lucene.store.LockObtainFailedException;
 9 | 
10 | import unsw.curation.api.domain.ExtractTextCosineSimilarity;
11 | 
12 | 
13 | public interface ITextCosineSimilarity {
14 | 
15 | 	List<ExtractTextCosineSimilarity> Cosine_Document_DocumentS(String QueryFilePath, String DataDirectoryPath) 
16 | 			throws LockObtainFailedException, IOException, URISyntaxException;
17 | 	//List<ExtractTextCosineSimilarity> Cosine_Sentence_Document(String Query, String FileName) throws LockObtainFailedException, IOException;
18 | 	//public List<String> ExtractListKeyword(List<String> lstSentence) throws Exception;
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/ITextJaccardSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.util.List;
 6 | 
 7 | import unsw.curation.api.domain.ExtractTextSimilarity;
 8 | 
 9 | 
10 | 
11 | public interface ITextJaccardSimilarity {
12 | 
13 | 	double Jaccard_Word_Word(String word1, String word2);
14 | 	List<ExtractTextSimilarity> Jaccard_Word_Document(String word, String filePath) throws IOException;
15 | 	double Jaccard_Document_Document(String file1,String file2) throws IOException;
16 | 	//List<ExtractTextSimilarity> Jaccard_Document_DocumentS(File filePath, String directoryPath) throws IOException;
17 | 	
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/ITextJaroSimilarity.java:
--------------------------------------------------------------------------------
1 | package unsw.curation.api.domain.abstraction;
2 | 
3 | public interface ITextJaroSimilarity {
4 | 
5 | 	public double ComputeJaroSimilarity(String Word1,String word2);
6 | }
7 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/ITextLevenshtainSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.List;
 5 | 
 6 | import unsw.curation.api.domain.ExtractTextSimilarity;
 7 | 
 8 | 
 9 | 
10 | public interface ITextLevenshtainSimilarity 
11 | {
12 | 
13 | 	List<ExtractTextSimilarity> Leveneshtain_Word_Document(String word1, String filePath) throws IOException;
14 | 	int Leveneshtain_Word_Word(String word1, String word2);
15 | }
16 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/ITextQGramSimilarity.java:
--------------------------------------------------------------------------------
1 | package unsw.curation.api.domain.abstraction;
2 | 
3 | public interface ITextQGramSimilarity {
4 | 
5 | 	double ComputeQGramSimilarity(String word1,String word2);
6 | }
7 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/ITextSoundexSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import org.apache.commons.codec.EncoderException;
 4 | 
 5 | public interface ITextSoundexSimilarity {
 6 | 
 7 | 	int SoundexDifference(String word1,String word2) throws EncoderException;
 8 | 	
 9 | }
10 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/ITextTfidfSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.util.List;
 6 | 
 7 | import org.apache.lucene.queryparser.classic.ParseException;
 8 | 
 9 | import unsw.curation.api.domain.ExtractTextTfidfSimilarity;
10 | 
11 | 
12 | public interface ITextTfidfSimilarity 
13 | {
14 | 	//List<ExtractTextTfidfSimilarity> SearchFile(String FilePath) throws IOException, ParseException;
15 | 	List<ExtractTextTfidfSimilarity> SearchText(String searchText) throws IOException, ParseException;
16 | 	void CreateIndex(String IndexFilePath) throws IOException, ParseException;
17 | 	void delete(File file) throws IOException;
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/abstraction/IUrlExtraction.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain.abstraction;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.util.List;
 6 | 
 7 | public interface IUrlExtraction {
 8 | 
 9 | 	String ExtractTitle(String url) throws IOException;
10 | 	List<String> ExtractHeadings(String url) throws IOException;
11 | 	List<String> ExtractHrefText(String url) throws IOException;
12 | 	List<String> ExtractParagraphes(String url) throws IOException;
13 | 	List<String> ExtractImageALTtext(String url) throws IOException;
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractnamedentity/curation.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/src/main/java/unsw/curation/api/extractnamedentity/curation.jpg


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractnamedentity/curation.ucls:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <class-diagram version="1.1.11" icons="true" automaticImage="JPEG" always-add-relationships="false" 
 3 |   generalizations="true" realizations="true" associations="true" dependencies="false" nesting-relationships="true" 
 4 |   router="FAN">  
 5 |   <class id="1" language="java" name="unsw.curation.api.extractstem.ExtractStemImpl" project="TextAPI" 
 6 |     file="/TextAPI/src/main/java/unsw/curation/api/extractstem/ExtractStemImpl.java" binary="false" 
 7 |     corner="BOTTOM_RIGHT">    
 8 |     <position height="135" width="265" x="453" y="202"/>    
 9 |     <display autosize="true" stereotype="true" package="true" initial-value="false" signature="true" 
10 |       sort-features="false" accessors="true" visibility="true">      
11 |       <attributes public="true" package="true" protected="true" private="true" static="true"/>      
12 |       <operations public="true" package="true" protected="true" private="true" static="true"/>    
13 |     </display>  
14 |   </class>  
15 |   <class id="2" language="java" name="unsw.curation.api.domain.ExtractStem" project="TextAPI" 
16 |     file="/TextAPI/src/main/java/unsw/curation/api/domain/ExtractStem.java" binary="false" corner="BOTTOM_RIGHT">    
17 |     <position height="315" width="213" x="212" y="-40"/>    
18 |     <display autosize="true" stereotype="true" package="true" initial-value="false" signature="true" 
19 |       sort-features="false" accessors="true" visibility="true">      
20 |       <attributes public="true" package="true" protected="true" private="true" static="true"/>      
21 |       <operations public="true" package="true" protected="true" private="true" static="true"/>    
22 |     </display>  
23 |   </class>  
24 |   <interface id="3" language="java" name="unsw.curation.api.domain.abstraction.IStem" project="TextAPI" 
25 |     file="/TextAPI/src/main/java/unsw/curation/api/domain/abstraction/IStem.java" binary="false" corner="BOTTOM_RIGHT">    
26 |     <position height="99" width="265" x="497" y="14"/>    
27 |     <display autosize="true" stereotype="true" package="true" initial-value="false" signature="true" 
28 |       sort-features="false" accessors="true" visibility="true">      
29 |       <attributes public="true" package="true" protected="true" private="true" static="true"/>      
30 |       <operations public="true" package="true" protected="true" private="true" static="true"/>    
31 |     </display>  
32 |   </interface>  
33 |   <realization id="4">    
34 |     <end type="SOURCE" refId="1"/>    
35 |     <end type="TARGET" refId="3"/>  
36 |   </realization>  
37 |   <association id="5">    
38 |     <end type="SOURCE" refId="1" navigable="false">      
39 |       <attribute id="6" name="lstValues"/>      
40 |       <multiplicity id="7" minimum="0" maximum="2147483647"/>    
41 |     </end>    
42 |     <end type="TARGET" refId="2" navigable="true"/>    
43 |     <display labels="true" multiplicity="true"/>  
44 |   </association>  
45 |   <classifier-display autosize="true" stereotype="true" package="true" initial-value="false" signature="true" 
46 |     sort-features="false" accessors="true" visibility="true">    
47 |     <attributes public="true" package="true" protected="true" private="true" static="true"/>    
48 |     <operations public="true" package="true" protected="true" private="true" static="true"/>  
49 |   </classifier-display>  
50 |   <association-display labels="true" multiplicity="true"/>
51 | </class-diagram>


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractsimilarity/ExtractNumberCosineSimilarityImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.extractsimilarity;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.FileNotFoundException;
  5 | import java.io.FileReader;
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.Arrays;
  9 | import java.util.Collections;
 10 | import java.util.Comparator;
 11 | import java.util.List;
 12 | import java.util.stream.Collectors;
 13 | 
 14 | import unsw.curation.api.domain.ExtractNumberSimilarity;
 15 | import unsw.curation.api.domain.abstraction.INumberCosineSimilarity;
 16 | 
 17 | 
 18 | 
 19 | 
 20 | public class ExtractNumberCosineSimilarityImpl implements INumberCosineSimilarity
 21 | {
 22 | 	@Override
 23 | 	public double Cosine_Vector_Vector(double [] vector1, double [] vector2)
 24 |     {
 25 |         double dotProduct=0.0;
 26 |         double vector1Len=0.0;
 27 |         double vector2Len=0.0;
 28 |         for(int i=0;i<vector1.length;i++)
 29 |         {
 30 |            dotProduct+=vector1[i]*vector2[i];
 31 |            vector1Len+=Math.pow(vector1[i],2);
 32 |            vector2Len+=Math.pow(vector2[i],2);       
 33 |         }
 34 |         return 1-(dotProduct/ (Math.sqrt(vector1Len)* Math.sqrt(vector2Len)));
 35 |     }
 36 |     @Override
 37 |     //dar meghdar bazgashti dobareh check kon
 38 |     public List<ExtractNumberSimilarity> Cosine_Vector_VectorS(double [] vector1, String fileName) throws IOException
 39 |     {
 40 |       List<ExtractNumberSimilarity> lstValues=new ArrayList<>();
 41 |       List<String[]> lstData=ReadData(fileName);
 42 |       for(String [] vector2:lstData)
 43 |       {
 44 |           
 45 |         double dotProduct=0.0;
 46 |         double vector1Len=0.0;
 47 |         double vector2Len=0.0;
 48 |         for(int i=0;i<vector1.length;i++)
 49 |         {
 50 |            dotProduct+=vector1[i]*Double.parseDouble(vector2[i]);
 51 |            vector1Len+=Math.pow(vector1[i],2);
 52 |            vector2Len+=Math.pow(Double.parseDouble(vector2[i]),2);       
 53 |         }
 54 |         double CosineSimilarity= 1-(dotProduct/ (Math.sqrt(vector1Len)* Math.sqrt(vector2Len)));
 55 |         lstValues.add(new ExtractNumberSimilarity(Arrays.toString(vector1),
 56 |                 Arrays.toString(vector2), 
 57 |                 CosineSimilarity));
 58 |       }
 59 |         Collections.sort(lstValues,new MyCosineComp());
 60 |         List<ExtractNumberSimilarity> lstTopRecords=lstValues.stream()
 61 |                 .limit(10)
 62 |                 .collect(Collectors.toList());
 63 |       return lstTopRecords;
 64 |     }
 65 |     @Override
 66 |   //dar meghdar bazgashti dobareh check kon
 67 |     public List<ExtractNumberSimilarity> Cosine_Vector_VectorS(String fileName) throws IOException
 68 |     {
 69 |       List<ExtractNumberSimilarity> lstValues=new ArrayList<>();
 70 |       List<String[]> lstData=ReadData(fileName);
 71 |       List<ExtractNumberSimilarity> lstTopRecords=new ArrayList<>();
 72 |       for(String [] vector1:lstData)
 73 |       {
 74 |        List<String[]> lstTempValues=new ArrayList<>();
 75 |        lstTempValues.addAll(lstData);
 76 |        lstTempValues.remove(vector1);
 77 |        for(String [] vector2:lstTempValues)
 78 |        {
 79 |         double dotProduct=0.0;
 80 |         double vector1Len=0.0;
 81 |         double vector2Len=0.0;
 82 |         for(int i=0;i<vector1.length;i++)
 83 |         {
 84 |            dotProduct+=Double.parseDouble(vector1[i])*Double.parseDouble(vector2[i]);
 85 |            vector1Len+=Math.pow(Double.parseDouble(vector1[i]),2);
 86 |            vector2Len+=Math.pow(Double.parseDouble(vector2[i]),2);       
 87 |         }
 88 |         double CosineSimilarity= 1-(dotProduct/ (Math.sqrt(vector1Len)* Math.sqrt(vector2Len)));
 89 |         lstValues.add(new ExtractNumberSimilarity(Arrays.toString(vector1),
 90 |         Arrays.toString(vector2), 
 91 |         CosineSimilarity));
 92 |        }
 93 |         Collections.sort(lstValues,new MyCosineComp());
 94 |         lstValues=lstValues.stream().limit(10).collect(Collectors.toList());
 95 |         lstTopRecords.addAll(lstValues);
 96 |         lstValues.clear();
 97 |       }
 98 |       return lstTopRecords;
 99 |     }
100 |     public List<String[]> ReadData(String FilePath) throws FileNotFoundException, IOException
101 |     {
102 |         List<String[]> lstValues=new ArrayList<>();
103 |         BufferedReader reader=new BufferedReader(new FileReader(FilePath));
104 |         String line="";
105 |         while((line=reader.readLine())!=null)
106 |         {
107 |            String [] arrLine=line.split(",");
108 |            lstValues.add(arrLine);
109 |         }
110 |         return lstValues;
111 |     }
112 |     public class MyCosineComp implements Comparator<ExtractNumberSimilarity>
113 |     {
114 |             @Override
115 |             public int compare(ExtractNumberSimilarity o1, ExtractNumberSimilarity o2) {
116 |               if(o1.getScore()<o2.getScore())
117 |                   return 1;
118 |               else
119 |                   return -1;
120 |             }
121 |     }
122 | 	
123 | }
124 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractsimilarity/ExtractNumberDiceSimilarityImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.extractsimilarity;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.FileNotFoundException;
  5 | import java.io.FileReader;
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.Arrays;
  9 | import java.util.Collections;
 10 | import java.util.Comparator;
 11 | import java.util.HashSet;
 12 | import java.util.List;
 13 | import java.util.stream.Collectors;
 14 | 
 15 | import unsw.curation.api.domain.ExtractNumberSimilarity;
 16 | import unsw.curation.api.domain.abstraction.INumberDiceSimilarity;
 17 | 
 18 | 
 19 | 
 20 | 
 21 | public class ExtractNumberDiceSimilarityImpl implements INumberDiceSimilarity{
 22 | 
 23 | 	@Override
 24 | 	public double Dice_Vector_Vector(double[] number1, double[] number2) {
 25 | 		List<String> lstarr1=new ArrayList<>();
 26 |         List<String> lstarr2=new ArrayList<>();
 27 |         for(double num:number1)
 28 |         {
 29 |             lstarr1.add(String.valueOf(num));
 30 |         }
 31 |         for(double num:number2)
 32 |         {
 33 |             lstarr2.add(String.valueOf(num));
 34 |         }
 35 |         List<String> lstUnique=new ArrayList<>();
 36 |         lstUnique.addAll(lstarr1);
 37 |         lstUnique.addAll(lstarr2);
 38 |         HashSet<String> lstIntersect=new HashSet<>();
 39 |         lstIntersect.addAll(lstarr1);
 40 |         lstIntersect.retainAll(lstarr2);
 41 |         double intersectSize=lstIntersect.size();
 42 |         double uniqueSize=lstUnique.size();
 43 |         double DiceSimlarity=(2*intersectSize)/uniqueSize;
 44 |         return DiceSimlarity;
 45 | 	}
 46 | 
 47 | 	@Override
 48 | 	public List<ExtractNumberSimilarity> Dice_Vector_VectorS(String filePath) throws IOException {
 49 | 		List<ExtractNumberSimilarity> lstValues=new ArrayList<>();
 50 |         List<ExtractNumberSimilarity> lstTopRecords=new ArrayList<>();
 51 |         List<String[]> lstarr=ReadData(filePath);
 52 |         for(String [] arrItem:lstarr)
 53 |         {
 54 |             List<Double> lstUniqueItems=new ArrayList<>();
 55 |             HashSet<Double> lstIntersect=new HashSet<>();
 56 |             for(String arrIte:arrItem)
 57 |             {
 58 |                 lstUniqueItems.add(Double.parseDouble(arrIte));
 59 |                 lstIntersect.add(Double.parseDouble(arrIte));
 60 |             }
 61 |             List<String []> lstTempData=new ArrayList<>();
 62 |             lstTempData.addAll(lstarr);
 63 |             int arrItemIndex=lstTempData.indexOf(arrItem);
 64 |             lstTempData.remove(arrItemIndex);
 65 |             for(String [] secArrItem:lstTempData)
 66 |             {
 67 |                 List<Double> lstforSecArrItem=new ArrayList<>();
 68 |                  for(String arrIt: secArrItem)
 69 |                  {
 70 |                     lstUniqueItems.add(Double.parseDouble(arrIt));
 71 |                     
 72 |                     lstforSecArrItem.add(Double.parseDouble(arrIt));
 73 |                  }
 74 |                 lstIntersect.retainAll(lstforSecArrItem);
 75 |                 double intersectSize=lstIntersect.size();
 76 |                 double uniqueSize=lstUniqueItems.size();
 77 |                 double DiceSimlarity=intersectSize/uniqueSize;
 78 |                 lstValues.add(new ExtractNumberSimilarity(Arrays.toString(arrItem),Arrays.toString(secArrItem)
 79 |                         ,DiceSimlarity));
 80 |             }
 81 |           Collections.sort(lstValues,new MyDiceComp());
 82 |             lstValues=lstValues.stream().limit(10).collect(Collectors.toList());
 83 |             lstTopRecords.addAll(lstValues);
 84 |             lstValues.clear();
 85 |         }
 86 |         return lstTopRecords;
 87 | 	}
 88 | 
 89 | 	@Override
 90 | 	public List<ExtractNumberSimilarity> Dice_Vector_VectorS(Double[] vector, String filePath) throws IOException {
 91 | 		List<ExtractNumberSimilarity> lstValues=new ArrayList<>();
 92 |         List<String[]> lstarr=ReadData(filePath);
 93 |         for(String [] arrItem:lstarr)
 94 |         {
 95 |             List<Double> lstUniqueItems=new ArrayList<>();
 96 |             HashSet<Double> lstIntersect=new HashSet<>();
 97 |             for(String dblVal:arrItem)
 98 |             {
 99 |                     lstUniqueItems.add(Double.parseDouble(dblVal));
100 |                     lstIntersect.add(Double.parseDouble(dblVal));
101 |             }
102 |             lstUniqueItems.addAll(Arrays.asList(vector));
103 |             lstIntersect.retainAll(Arrays.asList(vector));
104 |             double intersectSize=lstIntersect.size();
105 |             double uniqueSize=lstUniqueItems.size();
106 |             double DiceSimlarity=(2*intersectSize)/uniqueSize;
107 |             lstValues.add(new ExtractNumberSimilarity(Arrays.toString(arrItem),Arrays.toString(vector)
108 |                     ,DiceSimlarity));
109 |         }
110 |         Collections.sort(lstValues,new MyDiceComp());
111 |         List<ExtractNumberSimilarity> lstTopRecords=lstValues.stream()
112 |                 .limit(10)
113 |                 .collect(Collectors.toList());
114 |         return lstTopRecords;
115 | 	}
116 | 
117 | 	public List<String[]> ReadData(String FilePath) throws FileNotFoundException, IOException
118 |     {
119 |         List<String[]> lstValues=new ArrayList<>();
120 |         BufferedReader reader=new BufferedReader(new FileReader(FilePath));
121 |         String line="";
122 |         while((line=reader.readLine())!=null)
123 |         {
124 |            String [] arrLine=line.split(",");
125 |            lstValues.add(arrLine);
126 |         }
127 |         return lstValues;
128 |     }
129 | 	public class MyDiceComp implements Comparator<ExtractNumberSimilarity>
130 |     {
131 | 
132 |         @Override
133 |         public int compare(ExtractNumberSimilarity o1, ExtractNumberSimilarity o2) {
134 |            if(o1.getScore()<o2.getScore())
135 |                return 1;
136 |            else
137 |                return -1;
138 |         }
139 |         
140 |     }
141 | }
142 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractsimilarity/ExtractNumberEuclideanSimilarity.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.extractsimilarity;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.FileNotFoundException;
  5 | import java.io.FileReader;
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.Arrays;
  9 | import java.util.Collections;
 10 | import java.util.Comparator;
 11 | import java.util.List;
 12 | import java.util.stream.Collectors;
 13 | 
 14 | import unsw.curation.api.domain.ExtractNumberSimilarity;
 15 | import unsw.curation.api.domain.abstraction.INumberEuclideanSimilarity;
 16 | 
 17 | 
 18 | public class ExtractNumberEuclideanSimilarity implements INumberEuclideanSimilarity{
 19 | 	
 20 | 	@Override
 21 | 	public double Euclidean_Vector_Vector(double [] vector1,double [] vector2)
 22 |     {
 23 |         if(vector1.length==vector2.length)
 24 |         {
 25 |             double sum=0.0;
 26 |            for(int i=0;i<vector1.length;i++)
 27 |            {
 28 |               double tempValue= vector2[i]-vector1[i];
 29 |               tempValue=Math.pow(tempValue, 2);
 30 |               sum+=tempValue;
 31 |            }
 32 |            double EclideanDistance=Math.sqrt(sum);
 33 |            return EclideanDistance;
 34 |         }
 35 |         else
 36 |         {
 37 |             System.err.println("Error in Vectors length...");
 38 |             return 0.0;
 39 |         }
 40 |     }
 41 | 	@Override
 42 | 	public List<ExtractNumberSimilarity> Euclidean_Vector_VectorS(String filePath) throws IOException
 43 |     {
 44 |         List<ExtractNumberSimilarity> lstSimilarity=new ArrayList<>();
 45 |         List<ExtractNumberSimilarity> lstTopRecords=new ArrayList<>();
 46 |         List<String[]> lstValues=ReadData(filePath);
 47 |         for(String [] arrVal1: lstValues)
 48 |         {
 49 |             List<String[]> lstTempVal=new ArrayList<>();
 50 |             lstTempVal.addAll(lstValues);
 51 |             lstTempVal.remove(arrVal1);
 52 |             for(String [] arrVal2:lstTempVal)
 53 |             {
 54 |                 double sum=0.0;
 55 |                 for(int i=0;i<arrVal1.length;i++)
 56 |                 {
 57 |                   double tempValue= Double.parseDouble(arrVal2[i])-Double.parseDouble(arrVal1[i]);
 58 |                   tempValue=Math.pow(tempValue, 2);
 59 |                   sum+=tempValue;
 60 |                 }
 61 |                  double EclideanDistance=Math.sqrt(sum);
 62 |                 lstSimilarity.add(new ExtractNumberSimilarity(Arrays.toString(arrVal1),Arrays.toString(arrVal2)
 63 |                         ,EclideanDistance));
 64 |             }
 65 |             Collections.sort(lstSimilarity,new MyEuclideanComp());
 66 |             lstSimilarity=lstSimilarity.stream().limit(10).collect(Collectors.toList());
 67 |             lstTopRecords.addAll(lstSimilarity);
 68 |             lstSimilarity.clear();
 69 |         }
 70 |         return lstTopRecords;
 71 |     }
 72 | 	@Override
 73 | 	 // dar in method en mozo ra ke tool vector aval ba too; vector hay file yeki bashad ra dar nazar begir
 74 |     // mitavani tool yeki ra chek koni agar andazeh bood baad pardazesh koni.
 75 |     public List<ExtractNumberSimilarity> Euclidean_Vector_VectorS(double [] vector,String filePath) throws IOException
 76 |     {
 77 |         List<ExtractNumberSimilarity> lstSimilarity=new ArrayList<>();
 78 |         List<String[]> lstValues=ReadData(filePath);
 79 |         for(String [] arrVal2: lstValues)
 80 |         {
 81 |                 double sum=0.0;
 82 |                 for(int i=0;i<arrVal2.length;i++)
 83 |                 {
 84 |                   double tempValue= Double.parseDouble(arrVal2[i])-vector[i];
 85 |                   tempValue=Math.pow(tempValue, 2);
 86 |                   sum+=tempValue;
 87 |                 }
 88 |                  double EclideanDistance=Math.sqrt(sum);
 89 |                  lstSimilarity.add(new ExtractNumberSimilarity(Arrays.toString(arrVal2),Arrays.toString(vector)
 90 |                          ,EclideanDistance));
 91 |         }
 92 |         Collections.sort(lstSimilarity,new MyEuclideanComp());
 93 |         List<ExtractNumberSimilarity> lstTopRecords=lstSimilarity.stream()
 94 |                 .limit(10)
 95 |                 .collect(Collectors.toList());
 96 |         return lstTopRecords;
 97 |     }
 98 |     public List<String[]> ReadData(String FilePath) throws FileNotFoundException, IOException
 99 |     {
100 |         List<String[]> lstValues=new ArrayList<>();
101 |         BufferedReader reader=new BufferedReader(new FileReader(FilePath));
102 |         String line="";
103 |         while((line=reader.readLine())!=null)
104 |         {
105 |            String [] arrLine=line.split(",");
106 |            lstValues.add(arrLine);
107 |         }
108 |         return lstValues;
109 |     }
110 |     public class MyEuclideanComp implements Comparator<ExtractNumberSimilarity>
111 |     {
112 | 
113 |         @Override
114 |         public int compare(ExtractNumberSimilarity o1, ExtractNumberSimilarity o2) {
115 |             if(o1.getScore()>o2.getScore())
116 |                 return 1;
117 |             else
118 |                 return -1;
119 |         }
120 |         
121 |     }
122 | 
123 | }
124 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractsimilarity/ExtractNumberJaccardSimilarityImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.extractsimilarity;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.FileNotFoundException;
  5 | import java.io.FileReader;
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.Arrays;
  9 | import java.util.Collections;
 10 | import java.util.Comparator;
 11 | import java.util.HashSet;
 12 | import java.util.List;
 13 | import java.util.stream.Collectors;
 14 | 
 15 | import unsw.curation.api.domain.ExtractNumberSimilarity;
 16 | import unsw.curation.api.domain.abstraction.INumberJaccardSimilarity;
 17 | 
 18 | 
 19 | 
 20 | public class ExtractNumberJaccardSimilarityImpl implements INumberJaccardSimilarity{
 21 | 
 22 | 	@Override
 23 | 	public double Jaccard_Vector_Vector(double[] number1, double[] number2) {
 24 | 		List<String> lstarr1=new ArrayList<>();
 25 |         List<String> lstarr2=new ArrayList<>();
 26 |         for(double num:number1)
 27 |         {
 28 |             lstarr1.add(String.valueOf(num));
 29 |         }
 30 |         for(double num:number2)
 31 |         {
 32 |             lstarr2.add(String.valueOf(num));
 33 |         }
 34 |         HashSet<String> lstUnique=new HashSet<>();
 35 |         lstUnique.addAll(lstarr1);
 36 |         lstUnique.addAll(lstarr2);
 37 |         HashSet<String> lstIntersect=new HashSet<>();
 38 |         lstIntersect.addAll(lstarr1);
 39 |         lstIntersect.retainAll(lstarr2);
 40 |         double intersectSize=lstIntersect.size();
 41 |         double uniqueSize=lstUnique.size();
 42 |         double JaccardSimilarity=intersectSize/uniqueSize;
 43 |         return JaccardSimilarity;
 44 | 	}
 45 | 
 46 | 	@Override
 47 | 	public List<ExtractNumberSimilarity> Jaccard_Vector_VectorS(String filePath) throws IOException {
 48 | 		List<ExtractNumberSimilarity> lstValues=new ArrayList<>();
 49 |         List<ExtractNumberSimilarity> lstTopRecords=new ArrayList<>();
 50 |         List<String[]> lstarr=ReadData(filePath);
 51 |         for(String [] arrItem:lstarr)
 52 |         {
 53 |             HashSet<Double> lstUniqueItems=new HashSet<>();
 54 |             HashSet<Double> lstIntersect=new HashSet<>();
 55 |             for(String arrIte:arrItem)
 56 |             {
 57 |                 lstUniqueItems.add(Double.parseDouble(arrIte));
 58 |                 lstIntersect.add(Double.parseDouble(arrIte));
 59 |             }
 60 |             List<String []> lstTempData=new ArrayList<>();
 61 |             lstTempData.addAll(lstarr);
 62 |             int arrItemIndex=lstTempData.indexOf(arrItem);
 63 |             lstTempData.remove(arrItemIndex);
 64 |             for(String [] secArrItem:lstTempData)
 65 |             {
 66 |                 List<Double> lstforSecArrItem=new ArrayList<>();
 67 |                  for(String arrIt: secArrItem)
 68 |                  {
 69 |                     lstUniqueItems.add(Double.parseDouble(arrIt));
 70 |                     
 71 |                     lstforSecArrItem.add(Double.parseDouble(arrIt));
 72 |                  }
 73 |                 lstIntersect.retainAll(lstforSecArrItem);
 74 |                 double intersectSize=lstIntersect.size();
 75 |                 double uniqueSize=lstUniqueItems.size();
 76 |                 double JaccardSimlarity=intersectSize/uniqueSize;
 77 |                 lstValues.add(new ExtractNumberSimilarity(Arrays.toString(arrItem),Arrays.toString(secArrItem)
 78 |                         ,JaccardSimlarity));
 79 |             }
 80 |           Collections.sort(lstValues,new MyJaccardCoefficientComp());
 81 |             lstValues=lstValues.stream().limit(10).collect(Collectors.toList());
 82 |             lstTopRecords.addAll(lstValues);
 83 |             lstValues.clear();
 84 |         }
 85 |         return lstTopRecords;
 86 | 	}
 87 | 
 88 | 	@Override
 89 | 	public List<ExtractNumberSimilarity> Jaccard_Vector_VectorS(Double[] vector, String filePath) throws IOException {
 90 | 		List<ExtractNumberSimilarity> lstValues=new ArrayList<>();
 91 |         List<String[]> lstarr=ReadData(filePath);
 92 |         for(String [] arrItem:lstarr)
 93 |         {
 94 |            HashSet<Double> lstUniqueItems=new HashSet<>();
 95 |            HashSet<Double> lstIntersect=new HashSet<>();
 96 |             for(String arrIt: arrItem)
 97 |             {
 98 |                 lstUniqueItems.add(Double.parseDouble(arrIt));
 99 |                 lstIntersect.add(Double.parseDouble(arrIt));
100 |             }
101 |             lstUniqueItems.addAll(Arrays.asList(vector));
102 |             lstIntersect.retainAll(Arrays.asList(vector));
103 |             double intersectSize=lstIntersect.size();
104 |             double uniqueSize=lstUniqueItems.size();
105 |             double JaccardSimlarity=intersectSize/uniqueSize;
106 |                 lstValues.add(new ExtractNumberSimilarity(Arrays.toString(arrItem),Arrays.toString(vector)
107 |                         ,JaccardSimlarity));
108 |         }
109 |         Collections.sort(lstValues,new MyJaccardCoefficientComp());
110 |         List<ExtractNumberSimilarity> lstTopRecords=lstValues.stream()
111 |                 .limit(10)
112 |                 .collect(Collectors.toList());
113 |         return lstTopRecords;
114 | 	}
115 | 
116 | 	public List<String[]> ReadData(String FilePath) throws FileNotFoundException, IOException
117 |     {
118 |         List<String[]> lstValues=new ArrayList<>();
119 |         BufferedReader reader=new BufferedReader(new FileReader(FilePath));
120 |         String line="";
121 |         while((line=reader.readLine())!=null)
122 |         {
123 |            String [] arrLine=line.split(",");
124 |            lstValues.add(arrLine);
125 |         }
126 |         return lstValues;
127 |     }
128 |     public class MyJaccardCoefficientComp implements Comparator<ExtractNumberSimilarity>
129 |     {
130 |         @Override
131 |         public int compare(ExtractNumberSimilarity o1, ExtractNumberSimilarity o2) {
132 |             if(o1.getScore()<o2.getScore())
133 |                 return 1;
134 |             else
135 |                 return -1;
136 |         }
137 |         
138 |     }
139 | }
140 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractsimilarity/ExtractTextJaccardSimilarityImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.extractsimilarity;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.FileReader;
  7 | import java.io.IOException;
  8 | import java.util.ArrayList;
  9 | import java.util.Arrays;
 10 | import java.util.Collections;
 11 | import java.util.Comparator;
 12 | import java.util.HashSet;
 13 | import java.util.List;
 14 | import java.util.stream.Collectors;
 15 | 
 16 | import unsw.curation.api.domain.ExtractTextSimilarity;
 17 | import unsw.curation.api.domain.abstraction.ITextJaccardSimilarity;
 18 | 
 19 | 
 20 | 
 21 | public class ExtractTextJaccardSimilarityImpl implements ITextJaccardSimilarity
 22 | {
 23 | 	@Override
 24 | 	public double Jaccard_Word_Word(String word1, String word2)
 25 |     {
 26 |         String [] arrWord1=word1.toLowerCase().split("");
 27 |         String [] arrWord2=word2.toLowerCase().split("");
 28 |         HashSet<String> lstUnion=new HashSet<>();
 29 |         lstUnion.addAll(Arrays.asList(arrWord1));
 30 |         lstUnion.addAll(Arrays.asList(arrWord2));
 31 |         HashSet<String> lstIntersect=new HashSet<>();
 32 |         lstIntersect.addAll(Arrays.asList(arrWord1));
 33 |         lstIntersect.retainAll(Arrays.asList(arrWord2));
 34 |         double lstUnoinSize=(double)lstUnion.size();
 35 |         double lstIntersectSize=(double)lstIntersect.size();
 36 |         double JaccardSimilarity=lstIntersectSize/lstUnoinSize;
 37 |         return JaccardSimilarity;
 38 |     }
 39 | 	@Override
 40 |     public List<ExtractTextSimilarity> Jaccard_Word_Document(String word, String filePath) throws IOException
 41 |     {
 42 |         String [] arrWord=word.toLowerCase().split("");
 43 |         List<String> lstVal=ReadData(filePath);
 44 |         List<ExtractTextSimilarity> lstSimilarity=new ArrayList<>();
 45 |         HashSet<String> lstUniqueVal=new HashSet<>();
 46 |         lstUniqueVal.addAll(lstVal);
 47 |         for(String str:lstUniqueVal)
 48 |         {
 49 |             HashSet<String> lstUnion=new HashSet<>();
 50 |             HashSet<String> lstIntersect=new HashSet<>();
 51 |             String [] arrStr=str.toLowerCase().split("");
 52 |             lstUnion.addAll(Arrays.asList(arrWord));
 53 |             lstUnion.addAll(Arrays.asList(arrStr));
 54 |             lstIntersect.addAll(Arrays.asList(arrWord));
 55 |             lstIntersect.retainAll(Arrays.asList(arrStr));
 56 |             double lstUnoinSize=(double)lstUnion.size();
 57 |             double lstIntersectSize=(double)lstIntersect.size();
 58 |             double JaccardSimilarity=lstIntersectSize/lstUnoinSize;
 59 |             lstSimilarity.add(new ExtractTextSimilarity(word,str,JaccardSimilarity));
 60 |         }
 61 |         System.setProperty("java.util.Arrays.useLegacyMergeSort", "true");
 62 |         Collections.sort(lstSimilarity,new MyStringJaccardComp());
 63 |         List<ExtractTextSimilarity> lstTopRecords=lstSimilarity.stream()
 64 |                 .limit(20)
 65 |                 .collect(Collectors.toList());
 66 |         return lstTopRecords;
 67 |     }
 68 |     @Override
 69 |     public double Jaccard_Document_Document(String file1,String file2) throws IOException
 70 |     {
 71 |         List<String> lstWords1=ReadData(file1);
 72 |         List<String> lstWords2=ReadData(file2);
 73 |         HashSet<String> lstUniqueWords=new HashSet<>();
 74 |         HashSet<String> lstIntersectWords=new HashSet<>();
 75 |         lstUniqueWords.addAll(lstWords1);
 76 |         lstUniqueWords.addAll(lstWords2);
 77 |         lstIntersectWords.addAll(lstWords1);
 78 |         lstIntersectWords.retainAll(lstWords2);
 79 |         double lstIntersectSize=lstIntersectWords.size();
 80 |         double lstUniqueWordsSize=lstUniqueWords.size();
 81 |         double JaccardSimilarity=lstIntersectSize/lstUniqueWordsSize;
 82 |         return JaccardSimilarity;
 83 |     }
 84 |    /* @Override
 85 |     public List<ExtractTextSimilarity> Jaccard_Document_DocumentS(File filePath, String directoryPath) throws IOException
 86 |     {
 87 |         
 88 |         List<ExtractTextSimilarity> lstSimilarity=new ArrayList<>();
 89 |         List<String> lstWords1=ReadData(filePath.getName());
 90 |         HashSet<String> lstUniqueWords1=new HashSet<>();
 91 |         lstUniqueWords1.addAll(lstWords1);
 92 |         File[] files=new File(directoryPath).listFiles();
 93 |         for(File file:files)
 94 |         {
 95 |             List<String> lstWords2=ReadData(file.getPath());
 96 |             lstUniqueWords1.addAll(lstWords2);
 97 |             HashSet<String> lstIntersectWords=new HashSet<>();
 98 |             lstIntersectWords.addAll(lstWords1);
 99 |             lstIntersectWords.retainAll(lstWords2);
100 |             double lstUniqueWordsSize=lstUniqueWords1.size();
101 |             double lstIntersectSize=lstIntersectWords.size();
102 |             double JaccardSimilarity=lstIntersectSize/lstUniqueWordsSize;
103 |             lstSimilarity.add(new ExtractTextSimilarity(filePath.getName(),file.getName()
104 |                     ,JaccardSimilarity));    
105 |         }
106 |         System.setProperty("java.util.Arrays.useLegacyMergeSort", "true");
107 |         Collections.sort(lstSimilarity,new MyStringJaccardComp());
108 |         List<ExtractTextSimilarity> lstTopRecords=lstSimilarity.stream()
109 |                 .limit(10)
110 |                 .collect(Collectors.toList());
111 |         return lstTopRecords;
112 |     }*/
113 |     private List<String> ReadData(String FilePath) throws FileNotFoundException, IOException
114 |     {
115 |         List<String> lstValues=new ArrayList<>();
116 |         BufferedReader reader=new BufferedReader(new FileReader(FilePath));
117 |         String line="";
118 |         while((line=reader.readLine())!=null)
119 |         {
120 |             String [] arrLine=line.split(" ");
121 |             for(String str: arrLine)
122 |             {
123 |                 str=str.toLowerCase();
124 |                 str=str.trim();
125 |                 lstValues.add(str);
126 |             }
127 |         }
128 |         return lstValues;
129 |     }
130 |     public class MyStringJaccardComp implements Comparator<ExtractTextSimilarity>
131 |     {
132 |         @Override
133 |         public int compare(ExtractTextSimilarity o1, ExtractTextSimilarity o2) {
134 |           if(o1.getSimilarity() <o2.getSimilarity())
135 |               return 1;
136 |           else
137 |               return -1;
138 |         }
139 |     }
140 | }
141 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractsimilarity/ExtractTextJaroSimialrity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.extractsimilarity;
 2 | 
 3 | 
 4 | import org.apache.commons.lang3.StringUtils;
 5 | 
 6 | import unsw.curation.api.domain.abstraction.ITextJaroSimilarity;
 7 | 
 8 | public class ExtractTextJaroSimialrity implements ITextJaroSimilarity {
 9 | 
10 | 	@Override
11 | 	public double ComputeJaroSimilarity(String Word1, String word2) {
12 | 		double jaroSimilarity=StringUtils.getJaroWinklerDistance(Word1, word2);
13 | 		return jaroSimilarity;
14 | 	}
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractsimilarity/ExtractTextLevenshtainImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.extractsimilarity;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.FileNotFoundException;
  5 | import java.io.FileReader;
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.Collections;
  9 | import java.util.Comparator;
 10 | import java.util.List;
 11 | import java.util.stream.Collectors;
 12 | 
 13 | import unsw.curation.api.domain.ExtractTextSimilarity;
 14 | import unsw.curation.api.domain.abstraction.ITextLevenshtainSimilarity;
 15 | 
 16 | 
 17 | 
 18 | public class ExtractTextLevenshtainImpl implements ITextLevenshtainSimilarity {
 19 | 
 20 | 	public int Leveneshtain_Word_Word(String word1, String word2)
 21 |     {
 22 |       word1 = word1.toLowerCase();
 23 |       word2 = word2.toLowerCase();
 24 |       int[] costs = new int[word2.length() + 1];
 25 |       for (int j = 0; j < costs.length; j++)
 26 |           costs[j] = j;
 27 |       for (int i = 1; i <= word1.length(); i++)
 28 |       {
 29 |           costs[0] = i;
 30 |           int nw = i - 1;
 31 |           for (int j = 1; j <= word2.length(); j++)
 32 |           {
 33 |               int cj = Math.min(1 + Math.min(costs[j], costs[j - 1]),
 34 |                       word1.charAt(i - 1) == word2.charAt(j - 1) ? nw : nw + 1);
 35 |               nw = costs[j];
 36 |               costs[j] = cj;
 37 |           }
 38 |       }
 39 |       return costs[word2.length()];
 40 |     }
 41 |    public List<ExtractTextSimilarity> Leveneshtain_Word_Document(String word1, String filePath) throws IOException
 42 |     {
 43 |       List<ExtractTextSimilarity> lstVal=new ArrayList<>();
 44 |       word1 = word1.toLowerCase();
 45 |       List<String> lstValues=ReadData(filePath);
 46 |       for(String b:lstValues)
 47 |       {
 48 |           int[] costs = new int[b.length() + 1];
 49 |           for (int j = 0; j < costs.length; j++)
 50 |               costs[j] = j;
 51 |           for (int i = 1; i <= word1.length(); i++)
 52 |           {
 53 |               costs[0] = i;
 54 |               int nw = i - 1;
 55 |               for (int j = 1; j <= b.length(); j++)
 56 |               {
 57 |                   int cj = Math.min(1 + Math.min(costs[j], costs[j - 1]),
 58 |                           word1.charAt(i - 1) == b.charAt(j - 1) ? nw : nw + 1);
 59 |                   nw = costs[j];
 60 |                   costs[j] = cj;
 61 |               }
 62 |           }
 63 |           lstVal.add(new ExtractTextSimilarity(word1, b, costs[b.length()]));
 64 |       }
 65 |         System.setProperty("java.util.Arrays.useLegacyMergeSort", "true");
 66 |         Collections.sort(lstVal,new myLeveneshteinComp());
 67 |         List<ExtractTextSimilarity> lstTopRecords=lstVal.stream()
 68 |                 .limit(10)
 69 |                 .collect(Collectors.toList());
 70 |         return lstTopRecords;
 71 |     }
 72 |    
 73 |     private List<String> ReadData(String FilePath) throws FileNotFoundException, IOException
 74 |     {
 75 |         List<String> lstValues=new ArrayList<>();
 76 |         BufferedReader reader=new BufferedReader(new FileReader(FilePath));
 77 |         String line="";
 78 |         while((line=reader.readLine())!=null)
 79 |         {
 80 |             String [] arrLine=line.split(" ");
 81 |             for(String str: arrLine)
 82 |             {
 83 |                 str=str.toLowerCase();
 84 |                 str=str.trim();
 85 |                 lstValues.add(str);
 86 |             }
 87 |         }
 88 |         return lstValues;
 89 |     }
 90 |     public class myLeveneshteinComp implements Comparator<ExtractTextSimilarity>
 91 |     {
 92 |         @Override
 93 |         public int compare(ExtractTextSimilarity o1, ExtractTextSimilarity o2)
 94 |         {
 95 |             if(o1.getSimilarity() > o2.getSimilarity())
 96 |               return 1;
 97 |           else
 98 |               return -1;
 99 |         }   
100 |     }
101 | }
102 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractsimilarity/ExtractTextQGramSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.extractsimilarity;
 2 | 
 3 | 
 4 | import info.debatty.java.stringsimilarity.QGram;
 5 | import unsw.curation.api.domain.abstraction.ITextQGramSimilarity;
 6 | 
 7 | 
 8 | public class ExtractTextQGramSimilarity implements ITextQGramSimilarity {
 9 | 
10 | 	@Override
11 | 	public double ComputeQGramSimilarity(String word1, String word2) {
12 | 		QGram qG=new QGram();
13 | 		double qGramDistance=qG.distance(word1, word2);
14 | 		
15 | 		return qGramDistance;
16 | 	}
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractsimilarity/ExtractTextSoundexSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.extractsimilarity;
 2 | 
 3 | import org.apache.commons.codec.EncoderException;
 4 | import org.apache.commons.codec.language.Soundex;
 5 | 
 6 | import unsw.curation.api.domain.abstraction.ITextSoundexSimilarity;
 7 | 
 8 | 
 9 | 
10 | public class ExtractTextSoundexSimilarity implements ITextSoundexSimilarity {
11 | 
12 | 	@Override
13 | 	public int SoundexDifference(String word1, String word2) throws EncoderException {
14 | 		Soundex soundee=new Soundex();
15 | 		return soundee.difference(word1, word2);
16 | 	}
17 | 
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/extractstem/ExtractStemImpl.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.extractstem;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.File;
 5 | import java.io.FileNotFoundException;
 6 | import java.io.FileReader;
 7 | import java.io.IOException;
 8 | import java.net.URISyntaxException;
 9 | import java.util.ArrayList;
10 | import java.util.List;
11 | import java.util.stream.Collectors;
12 | 
13 | import unsw.curation.api.domain.ExtractStem;
14 | import unsw.curation.api.domain.abstraction.IStem;
15 | 
16 | 
17 | 
18 | public class ExtractStemImpl implements IStem {
19 | 
20 | 	private List<ExtractStem> lstValues=new ArrayList<>();
21 | 	@Override
22 | 	public List<ExtractStem> FindWordDerivedForms(String word) throws FileNotFoundException, IOException, URISyntaxException
23 | 	{
24 | 		String getWord=word.trim().toLowerCase();
25 | 		ReadDataset();
26 | 		List<ExtractStem> lstDerivedStems=lstValues.stream()
27 | 				.filter(s->s.getWord1().equalsIgnoreCase(getWord))
28 | 				.collect(Collectors.toList());
29 | 	     return lstDerivedStems;
30 | 	}
31 | 
32 | 	@Override
33 | 	public void ReadDataset() throws FileNotFoundException, IOException, URISyntaxException 
34 | 	{
35 | 		//java.net.URL url = getClass().getClassLoader().getResource("Stem.txt");
36 | 	    File file = new File("Stem.txt");
37 | 		BufferedReader reader=new BufferedReader(new FileReader(file));
38 |         String line="";
39 |         while((line=reader.readLine())!=null)
40 |         {
41 |             try
42 |             {
43 |             String [] lineValues=line.split("\\|");
44 |             String myWord1=lineValues[0].trim().toLowerCase();
45 |             String myDerived1=lineValues[1].trim().toLowerCase();
46 |             String myWord2=lineValues[3].trim().toLowerCase();
47 |             String myDerived2=lineValues[4].trim().toLowerCase();
48 |             lstValues.add(new ExtractStem(myWord1,myDerived1,myWord2,myDerived2));
49 |             }
50 |             catch(Exception ex)
51 |             {
52 |                 
53 |             }
54 |         }
55 | 	}
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/index/DataSearch.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package unsw.curation.api.index;
 7 | 
 8 | import java.io.File;
 9 | import java.io.IOException;
10 | 
11 | import org.apache.lucene.document.Document;
12 | import org.apache.lucene.index.CorruptIndexException;
13 | import org.apache.lucene.index.DirectoryReader;
14 | import org.apache.lucene.index.IndexReader;
15 | import org.apache.lucene.index.Term;
16 | import org.apache.lucene.queryparser.classic.ParseException;
17 | import org.apache.lucene.search.IndexSearcher;
18 | import org.apache.lucene.search.PhraseQuery;
19 | import org.apache.lucene.search.Query;
20 | import org.apache.lucene.search.ScoreDoc;
21 | import org.apache.lucene.search.TopDocs;
22 | import org.apache.lucene.store.FSDirectory;
23 | 
24 | 
25 | /**
26 |  *
27 |  * @author Alireza
28 |  */
29 | 
30 | public class DataSearch {
31 | 	  IndexReader reader;
32 | 	  IndexSearcher indSearch;
33 | 	  Query query;
34 | 	  public DataSearch(String IndexDir) throws IOException
35 | 	  {
36 | 	      reader=DirectoryReader.open(FSDirectory.open(new File(IndexDir)));
37 | 	      indSearch=new IndexSearcher(reader);
38 | 	  }
39 | 	  public TopDocs search(String searchText, int slop) throws IOException, ParseException
40 | 	  {
41 | 		  PhraseQuery query = new PhraseQuery();
42 | 	      query.setSlop(slop);
43 | 	      String [] searchTerms=searchText.split(" ");
44 | 	      for(String searchWord:searchTerms)
45 | 	      query.add(new Term("body",searchWord.toLowerCase()));
46 | 	      return indSearch.search(query, 100); 	     
47 | 	  }
48 | 	  public Document getDocument(ScoreDoc score) throws CorruptIndexException, IOException
49 | 	  {
50 | 		  return indSearch.doc(score.doc);
51 | 	  }
52 | 
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/index/Index.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.index;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | 
 6 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
 7 | import org.apache.lucene.document.Document;
 8 | import org.apache.lucene.document.TextField;
 9 | import org.apache.lucene.index.CorruptIndexException;
10 | import org.apache.lucene.index.IndexWriter;
11 | import org.apache.lucene.index.IndexWriterConfig;
12 | import org.apache.lucene.store.FSDirectory;
13 | import org.apache.lucene.util.Version;
14 | import org.apache.lucene.document.Field;
15 | 
16 | 
17 | 
18 | /**
19 |  *
20 |  * @author Alireza
21 |  */
22 | public class Index {
23 |     private IndexWriter writer;
24 |     private StandardAnalyzer Analyzer=new StandardAnalyzer(Version.LUCENE_46);
25 |     
26 |     public Index(String indexDirectory) throws IOException
27 |     {
28 |      FSDirectory indexDir=FSDirectory.open(new File(indexDirectory));
29 |      IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_46,Analyzer);
30 | 	 writer=new IndexWriter(indexDir,config);
31 |     }
32 |     public void Close() throws CorruptIndexException, IOException
33 |     {
34 | 	  writer.close();
35 |     }
36 |     
37 |     private Document ListDoc(String text) throws IOException
38 |     {
39 |       Document doc=new Document();
40 |       doc.add(new TextField("body",text,Field.Store.YES));
41 |       //doc.add(new TextField("tweet",inputMongo.getBody(), Field.Store.YES));
42 |       //doc.add(new TextField("description", inputMongo.getDescription(), Field.Store.YES));
43 |       //System.out.println("Name  "+inputMongo.getBody());
44 |      // doc.add(new TextField("displayName",inputMongo.getDisplayName(), Field.Store.YES));
45 |      //System.out.println("Indexing: "+inputMongo.getId()+" "+inputMongo.getBody()+" "+inputMongo.getDescription()+" "+inputMongo.getDisplayName());
46 |       return doc;
47 |     }
48 |     public void IndexDocuments(String getValues) throws IOException
49 |     {
50 |     // for(LuceneData<String, String> mongoVal:getLstMongoValues)
51 |          // {
52 |               try
53 |               {
54 |               Document document = ListDoc(getValues);
55 |               
56 |               writer.addDocument(document);
57 |               }
58 |               catch(Exception ex)
59 |               {         
60 |                   System.out.print(ex.getMessage());
61 |               }
62 |        //   }
63 |      
64 |     }
65 |      
66 | }
67 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/index/SchIndData.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * To change this license header, choose License Headers in Project Properties.
  3 |  * To change this template file, choose Tools | Templates
  4 |  * and open the template in the editor.
  5 |  */
  6 | package unsw.curation.api.index;
  7 | 
  8 | import java.io.File;
  9 | import java.io.IOException;
 10 | import java.util.ArrayList;
 11 | import java.util.List;
 12 | 
 13 | import org.apache.lucene.document.Document;
 14 | import org.apache.lucene.queryparser.classic.ParseException;
 15 | import org.apache.lucene.search.ScoreDoc;
 16 | import org.apache.lucene.search.TopDocs;
 17 | 
 18 | 
 19 | 
 20 | 
 21 | /**
 22 |  *
 23 |  * @author Alireza
 24 |  */
 25 | public class SchIndData 
 26 | {  
 27 |     static String Pattern="(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+";
 28 |     public SchIndData(){} 
 29 |     Index dInd;
 30 |     DataSearch DSch;
 31 |     String twitterData="";
 32 |     String current = System.getProperty("user.dir");
 33 |     String SchTxt="";
 34 |     public static void delete(File file)
 35 |     	throws IOException{
 36 |  
 37 |     	if(file.isDirectory())
 38 |         {
 39 |     		if(file.list().length==0){
 40 |     		   file.delete();
 41 |     		}else{
 42 |         	   String files[] = file.list();
 43 |      
 44 |         	   for (String temp : files) {
 45 |         	      File fileDelete = new File(file, temp);
 46 |         	     delete(fileDelete);
 47 |         	   }
 48 |         	   if(file.list().length==0){
 49 |            	     file.delete();
 50 |         	   }
 51 |     		}	
 52 |     	}
 53 |         else
 54 |         {
 55 |     		file.delete();
 56 |     	}
 57 |     }
 58 | /*    private List<String>ReadData(String filePath) throws IOException
 59 | 	{
 60 | 		List<String> lstTw=new ArrayList<>();
 61 | 		
 62 | 		BufferedReader reader=new BufferedReader(new FileReader(filePath));
 63 | 		String line="";
 64 | 		while((line=reader.readLine())!=null)
 65 | 		{
 66 | 		  lstTw.add(line);	
 67 | 		}
 68 | 		return lstTw;
 69 | 	}*/
 70 |      /*public void CreateIndex(String sentence) throws IOException, ParseException
 71 |      {
 72 |     	 //List<String>lstValues=ReadData(filePath);
 73 |          File fileCheck=new File(current+"\\File_Index");
 74 |          if(!fileCheck.exists())
 75 |          {
 76 |               fileCheck.mkdir();
 77 |               
 78 |               dInd = new Index(current+"\\File_Index\\");
 79 |               System.out.println("Start Indexing Data: "+System.currentTimeMillis());	
 80 |               for(String inputValues: lstValues)
 81 |               {
 82 |                 dInd.IndexDocuments(inputValues);
 83 |               }
 84 |               System.out.println("Finished Indexing Data: "+System.currentTimeMillis());
 85 |               dInd.Close();
 86 |          }
 87 |          else
 88 |              if(fileCheck.exists()&& fileCheck.listFiles().length>0)
 89 |              {
 90 |                  Scanner sc=new Scanner(System.in);
 91 |                  System.out.println("Index directory is exist; Do you want to index data again? (Y/N)");
 92 |                  String answer=sc.next();
 93 |                  if(answer.equalsIgnoreCase("y"))
 94 |                  {
 95 |                      delete(fileCheck);
 96 |                      System.out.println("All Index Files are deleted.");
 97 |                      fileCheck.mkdir();
 98 |                       dInd = new Index(current+"\\File_Index\\");
 99 |                       System.out.println("Start Indexing Data: "+System.currentTimeMillis());	
100 |                       for(String inputValues: lstValues)
101 |                       {
102 |                         dInd.IndexDocuments(inputValues);
103 |                       }
104 |                       System.out.println("Finished Indexing Data: "+System.currentTimeMillis());
105 |                       dInd.Close();
106 |                  }
107 |                  else
108 |                      if(answer.equalsIgnoreCase("n"))
109 |                      {
110 |                          System.out.println("Search "
111 |                                      + "Based on the previous Indexed files...");
112 |                      }
113 |              }
114 |      }*/
115 |    public List<String> search(String token, String indexDir, int slop) throws IOException, ParseException
116 |    {
117 | 	  List<String>lstSearch=new ArrayList<>();
118 |       DSch = new DataSearch(indexDir);
119 |       
120 |       
121 |           TopDocs hits = DSch.search(token, slop);
122 |           //System.out.println(searchSentence+" "+hits.totalHits);
123 |           for(ScoreDoc scoreDoc : hits.scoreDocs)
124 |           {
125 |         	 
126 |              Document doc = DSch.getDocument(scoreDoc);
127 |              lstSearch.add(doc.get("body"));
128 | /*           lso.setTweet(doc.get("body"));
129 |              lso.setNeType(searchSentence);*/
130 |              //System.out.println(lso.getSimilarSentence());
131 |              //System.out.println(lso.getScore());
132 |              //lso.setNeExistance(true);
133 |              //System.out.println(lso.getSimilarSentence());
134 |              //System.out.println(searchSentence);
135 |              //lso.setScore(String.valueOf(scoreDoc.score));
136 | 
137 |           }
138 |       return lstSearch;
139 |     }
140 |   /* private String PreProcessSentence(String inputSentence)
141 |    {
142 |        inputSentence=inputSentence.replaceAll(Pattern, "");
143 |              String [] arrSLine=inputSentence.split(" ");
144 |              String Line="";
145 |              for(String str:arrSLine)
146 |              {
147 |                      str=str.replace("'","");
148 |                      str=str.replace("(","");
149 |                      str=str.replace(")","");
150 |                      str=str.replace("!","");
151 |                      str=str.replace("[","");
152 |                      str=str.replace("]","");
153 |                      str=str.replace("{","");
154 |                      str=str.replace("}","");
155 |                      str=str.replace("\"","");
156 |                      str=str.replace("?","");
157 |                      str=str.replace(".","");
158 |                      Line+=str+" ";
159 |              }
160 |              return Line;
161 |    }*/
162 | }
163 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/linking/GoogleKnowledgeGraph.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.linking;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.BufferedWriter;
 5 | import java.io.FileReader;
 6 | import java.io.FileWriter;
 7 | import java.io.IOException;
 8 | import java.io.InputStreamReader;
 9 | import java.net.HttpURLConnection;
10 | import java.net.URL;
11 | import java.util.ArrayList;
12 | import java.util.List;
13 | 
14 | import org.json.JSONObject;
15 | 
16 | public class GoogleKnowledgeGraph {
17 | 	
18 | 	private final String USER_AGENT = "Mozilla/5.0";
19 | 	public void ParseGoogleKnowledgeGraph(List<String>lstEntity, String outputFileName) throws Exception {
20 | 
21 | 		BufferedWriter writer=new BufferedWriter(new FileWriter(outputFileName));
22 | 		for(String str:lstEntity)
23 | 		{
24 | 		str=str.trim();
25 | 		if(str.contains(" "))
26 | 			str=str.replace(" ", "+");
27 | 		
28 | 		String url = "https://kgsearch.googleapis.com/v1/entities:search?query="+str+"&key=AIzaSyA6u_gvGgeBjUx5ThGhc2hvg-MiIfuYBkk&limit=1&indent=True";
29 | 		//https://www.wikidata.org/w/api.php?action=wbsearchentities&search=lionel messi&language=en&format=json
30 | 		URL obj = new URL(url);
31 | 		HttpURLConnection con = (HttpURLConnection) obj.openConnection();
32 | 		con.setRequestMethod("GET");
33 | 		con.setRequestProperty("User-Agent", USER_AGENT);
34 | 		int responseCode = con.getResponseCode();
35 | 		BufferedReader in = new BufferedReader(
36 | 		        new InputStreamReader(con.getInputStream()));
37 | 		String inputLine;
38 | 		while ((inputLine = in.readLine()) != null)
39 | 		{
40 | 			//System.out.println("Fetching Data From Wikidata");
41 | 			System.out.println(inputLine);
42 | 			writer.write(inputLine);
43 | 			writer.newLine();
44 | 		}
45 | 		in.close();
46 | 		}
47 | 		writer.close();
48 | 	}
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/linking/WikiData.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.linking;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.BufferedWriter;
 5 | import java.io.FileWriter;
 6 | import java.io.InputStreamReader;
 7 | import java.net.HttpURLConnection;
 8 | import java.net.URL;
 9 | import java.util.List;
10 | 
11 | public class WikiData {
12 | 
13 | 	private final String USER_AGENT = "Mozilla/5.0";
14 | 	public void ParseWikiData(List<String>lstEntity, String outputFileName) throws Exception {
15 | 
16 | 		BufferedWriter writer=new BufferedWriter(new FileWriter(outputFileName));
17 | 		for(String str:lstEntity)
18 | 		{
19 | 		str=str.trim();
20 | 		if(str.contains(" "))
21 | 			str=str.replace(" ", "+");
22 | 		
23 | 		String url = "https://www.wikidata.org/w/api.php?action=wbsearchentities&search="+str+"&language=en&format=json";
24 | 		//https://www.wikidata.org/w/api.php?action=wbsearchentities&search=lionel messi&language=en&format=json
25 | 		URL obj = new URL(url);
26 | 		HttpURLConnection con = (HttpURLConnection) obj.openConnection();
27 | 		con.setRequestMethod("GET");
28 | 		con.setRequestProperty("User-Agent", USER_AGENT);
29 | 		int responseCode = con.getResponseCode();
30 | 		BufferedReader in = new BufferedReader(
31 | 		        new InputStreamReader(con.getInputStream()));
32 | 		String inputLine;
33 | 		while ((inputLine = in.readLine()) != null)
34 | 		{
35 | 			System.out.println("Fetching Data From Wikidata");
36 | 			System.out.println(inputLine);
37 | 			writer.write(inputLine);
38 | 			writer.newLine();
39 | 		}
40 | 		in.close();
41 | 		}
42 | 		writer.close();
43 | 	}
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/textclassification/EvaluateClassifier.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.textclassification;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileReader;
  6 | import java.io.IOException;
  7 | import java.util.ArrayList;
  8 | import java.util.List;
  9 | import java.util.stream.Collectors;
 10 | 
 11 | 
 12 | 
 13 | public class EvaluateClassifier {
 14 | 
 15 | 	/*public List<String> ComputePrecision(File actualValues, File predictedValues) throws IOException
 16 | 	{
 17 | 		List<String> lstActualFile=ReadTestData(actualValues);
 18 | 		List<String> lstPredictedFile=ReadTestData(predictedValues);
 19 | 		List<String> lstPrecisionValues=new ArrayList<>();
 20 | 		if(lstActualFile.size()!=lstPredictedFile.size())
 21 | 			System.err.println("The Length of Actual and Predicted Vectors are not Similar");
 22 | 		List<String> lstCategories=lstActualFile
 23 | 				.stream()
 24 | 				.distinct()
 25 | 				.collect(Collectors.toList());
 26 | 		for(String category: lstCategories)
 27 | 		{
 28 | 			double truePositive=0;
 29 | 			double falsePositive=0;
 30 | 			category=category.toLowerCase().trim();
 31 | 		   for(int i=0; i< lstPredictedFile.size();i++)
 32 | 		   {
 33 | 			 if(lstActualFile.get(i).equals(category)&& lstPredictedFile.get(i).equals(category))
 34 | 			 {
 35 | 				 truePositive++;
 36 | 			 }
 37 | 			 if(!lstActualFile.get(i).equals(category)&& lstPredictedFile.get(i).equals(category))
 38 | 			 {
 39 | 				 falsePositive++;
 40 | 			 }
 41 | 		   }
 42 | 		   double percision=truePositive/(falsePositive+truePositive);
 43 | 		   lstPrecisionValues.add("Precision: "+category+" is: "+String.valueOf(percision));
 44 | 		}
 45 | 		return lstPrecisionValues;
 46 | 	}*/
 47 | 	
 48 | 	public double ComputeAccuracy(File actualValues, File predictedValues) throws IOException
 49 | 	{
 50 | 		List<String> lstActualFile=ReadTestData(actualValues);
 51 | 		List<String> lstPredictedFile=ReadTestData(predictedValues);
 52 | 		
 53 | 		if(lstActualFile.size()!=lstPredictedFile.size())
 54 | 			System.err.println("The Length of Actual and Predicted Vectors are not Similar");
 55 | 		double positiveRate=0;
 56 | 		for(int i=0;i<lstActualFile.size();i++)
 57 | 		{
 58 | 			if(lstActualFile.get(i).equals(lstPredictedFile.get(i)))
 59 | 			{
 60 | 				positiveRate++;
 61 | 			}
 62 | 		}
 63 | 	    double accuracy=positiveRate/(double)lstActualFile.size();
 64 | 	    return accuracy;
 65 | 	}
 66 | 	
 67 | 	public List<String> ComputeRecall(File actualValues, File predictedValues) throws IOException
 68 | 	{
 69 | 		List<String> lstActualFile=ReadTestData(actualValues);
 70 | 		List<String> lstPredictedFile=ReadTestData(predictedValues);
 71 | 		List<String> lstRecall=new ArrayList<>();
 72 | 		List<String> lstCategories=lstActualFile
 73 | 				.stream()
 74 | 				.distinct()
 75 | 				.collect(Collectors.toList());
 76 | 		for(String category: lstCategories)
 77 | 		{
 78 | 			double truePositive=0;
 79 | 			double falseNegative=0;
 80 | 			category=category.toLowerCase().trim();
 81 | 		   for(int i=0; i< lstPredictedFile.size();i++)
 82 | 		   {
 83 | 			 if(lstActualFile.get(i).equals(category)&& lstPredictedFile.get(i).equals(category))
 84 | 			 {
 85 | 				 truePositive++;
 86 | 			 }
 87 | 			 if(lstActualFile.get(i).equals(category)&& !lstPredictedFile.get(i).equals(category))
 88 | 			 {
 89 | 				 falseNegative++;
 90 | 			 }
 91 | 		   }
 92 | 		   double recall=truePositive/(falseNegative+truePositive);
 93 | 		   lstRecall.add("Recall: "+category+" is: "+String.valueOf(recall));
 94 | 		}
 95 | 		return lstRecall;
 96 | 	}
 97 | 	
 98 | 	private List<String> ReadTestData(File inputLabels) throws IOException
 99 | 	{
100 | 		List<String> lstLabels=new ArrayList<>();
101 | 		BufferedReader reader=new BufferedReader(new FileReader(inputLabels));
102 | 		String line="";
103 | 		while((line=reader.readLine())!=null)
104 | 		{
105 | 			String [] arrLine=line.split(",");
106 | 			String label=arrLine[arrLine.length-1];
107 | 			lstLabels.add(label.trim().toLowerCase());
108 | 		}
109 | 		return lstLabels;
110 | 	}
111 | }
112 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextDecisionTreeImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.textclassification;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.FileOutputStream;
  7 | import java.io.FileReader;
  8 | import java.io.IOException;
  9 | import java.io.ObjectOutputStream;
 10 | import java.util.ArrayList;
 11 | import java.util.List;
 12 | import java.util.Random;
 13 | 
 14 | import unsw.curation.api.domain.Classification;
 15 | import unsw.curation.api.domain.abstraction.IClassificationTextDecisionTree;
 16 | import weka.classifiers.Evaluation;
 17 | import weka.classifiers.meta.FilteredClassifier;
 18 | import weka.classifiers.trees.J48;
 19 | import weka.core.Instances;
 20 | import weka.core.converters.ArffLoader;
 21 | import weka.filters.unsupervised.attribute.StringToWordVector;
 22 | 
 23 | 
 24 | public class ExtractClassificationTextDecisionTreeImpl implements IClassificationTextDecisionTree {
 25 | 
 26 | 	Instances trainedData;
 27 |     StringToWordVector filter;
 28 |     FilteredClassifier classifier;
 29 |     Classification cls=new Classification();
 30 | 	@Override
 31 | 	public void LoadDataset(File arffFileName) throws IOException 
 32 | 	{
 33 | 		BufferedReader bReader=new BufferedReader(
 34 |                 new FileReader(arffFileName));
 35 |         ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader);
 36 |         trainedData=arff.getData();
 37 |         bReader.close();		
 38 | 	}
 39 | 
 40 | 	@Override
 41 | 	public List<Classification> EvaluateDecisionTree() throws Exception 
 42 | 	{
 43 | 		List<Classification> lstEvaluationDetail=new ArrayList<>();
 44 | 		trainedData.setClassIndex(trainedData.numAttributes()-1);
 45 |         filter=new StringToWordVector();
 46 |         classifier=new FilteredClassifier();
 47 |         classifier.setFilter(filter);
 48 |         classifier.setClassifier(new J48());
 49 |         Evaluation eval=new Evaluation(trainedData);
 50 |         eval.crossValidateModel(classifier, trainedData, 4, new Random(1));
 51 |         /*try
 52 |         {
 53 |         for(int i=0;i<10000;i++)
 54 |         {
 55 |         	cls.setPrecision(eval.precision(i));
 56 |         	cls.setRecall(eval.recall(i));
 57 |         	cls.setAuc(eval.areaUnderPRC(i));
 58 |         	cls.setFMeasure(eval.fMeasure(i));
 59 |         	cls.setFn(eval.falseNegativeRate(i));
 60 |         	cls.setFp(eval.falsePositiveRate(i));
 61 |         	cls.setTn(eval.trueNegativeRate(i));
 62 |         	cls.setTp(eval.truePositiveRate(i));
 63 |         	cls.setMeanAbsoluteError(eval.meanAbsoluteError());
 64 |         	cls.setRelativeAbsoluteError(eval.relativeAbsoluteError());
 65 |         	cls.setCorrect(eval.correct());
 66 |         	cls.setKappa(eval.kappa());
 67 |         	cls.setNumInstances(eval.numInstances());
 68 |         	cls.setInCorrect(eval.incorrect());
 69 |         	lstEvaluationDetail.add(new Classification(cls.getPrecision(),
 70 |         			cls.getRecall(),
 71 |         			cls.getAuc(),
 72 |         			cls.getCorrect(),
 73 |         			cls.getInCorrect(),
 74 |         			cls.getErrorRate(),
 75 |         			cls.getFn(),
 76 |         			cls.getFp(),
 77 |         			cls.getTn(),
 78 |         			cls.getTp(),
 79 |         			cls.getKappa(),
 80 |         			cls.getMeanAbsoluteError(),
 81 |         			cls.getNumInstances(),
 82 |         			cls.getRelativeAbsoluteError(),
 83 |         			cls.getFMeasure()));
 84 |         }
 85 |         }
 86 |         catch(Exception ex)
 87 |         {
 88 |         	
 89 |         }*/
 90 |         return lstEvaluationDetail;
 91 | 	}
 92 | 
 93 | 	@Override
 94 | 	public void LearnDecisionTree() throws Exception 
 95 | 	{
 96 | 		trainedData.setClassIndex(trainedData.numAttributes()-1);
 97 |         filter=new StringToWordVector();
 98 |         classifier=new FilteredClassifier();
 99 |         classifier.setFilter(filter);
100 |         classifier.setClassifier(new J48());
101 |         classifier.buildClassifier(trainedData);
102 | 		
103 | 	}
104 | 
105 | 	@Override
106 | 	public void SaveModel(String modelName) throws FileNotFoundException, IOException 
107 | 	{
108 | 		ObjectOutputStream output=new ObjectOutputStream(
109 |                 new FileOutputStream(modelName));
110 |         output.writeObject(classifier);
111 |         output.close();
112 | 		
113 | 	}
114 | 
115 | }
116 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextKNNImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.textclassification;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.FileOutputStream;
  7 | import java.io.FileReader;
  8 | import java.io.IOException;
  9 | import java.io.ObjectOutputStream;
 10 | import java.util.ArrayList;
 11 | import java.util.List;
 12 | import java.util.Random;
 13 | 
 14 | import unsw.curation.api.domain.Classification;
 15 | import unsw.curation.api.domain.abstraction.IClassificationTextKNN;
 16 | import weka.classifiers.Evaluation;
 17 | import weka.classifiers.lazy.IBk;
 18 | import weka.classifiers.meta.FilteredClassifier;
 19 | import weka.core.Instances;
 20 | import weka.core.converters.ArffLoader;
 21 | import weka.filters.unsupervised.attribute.StringToWordVector;
 22 | 
 23 | 
 24 | public class ExtractClassificationTextKNNImpl implements IClassificationTextKNN {
 25 | 
 26 | 	Classification cls=new Classification();
 27 | 	Instances trainedData;
 28 |     StringToWordVector filter;
 29 |     FilteredClassifier classifier;
 30 | 	@Override
 31 | 	public void LoadDataset(File arffFileName) throws IOException {
 32 | 		BufferedReader bReader=new BufferedReader(new FileReader(arffFileName));
 33 |         ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader);
 34 |         trainedData=arff.getData();
 35 |         bReader.close();
 36 | 	}
 37 | 
 38 | 	@Override
 39 | 	public List<Classification> EvaluateKNN() throws Exception 
 40 | 	{
 41 | 		List<Classification> lstEvaluationDetail=new ArrayList<>();
 42 | 		trainedData.setClassIndex(trainedData.numAttributes()-1);
 43 |         filter=new StringToWordVector();
 44 |         classifier=new FilteredClassifier();
 45 |         classifier.setFilter(filter);
 46 |         classifier.setClassifier(new IBk());
 47 |         Evaluation eval=new Evaluation(trainedData);
 48 |         eval.crossValidateModel(classifier, trainedData, 4, new Random(1));
 49 |         /*try
 50 |         {
 51 |         for(int i=0;i<10000;i++)
 52 |         {
 53 |         	cls.setPrecision(eval.precision(i));
 54 |         	cls.setRecall(eval.recall(i));
 55 |         	cls.setAuc(eval.areaUnderPRC(i));
 56 |         	cls.setFMeasure(eval.fMeasure(i));
 57 |         	cls.setFn(eval.falseNegativeRate(i));
 58 |         	cls.setFp(eval.falsePositiveRate(i));
 59 |         	cls.setTn(eval.trueNegativeRate(i));
 60 |         	cls.setTp(eval.truePositiveRate(i));
 61 |         	cls.setMeanAbsoluteError(eval.meanAbsoluteError());
 62 |         	cls.setRelativeAbsoluteError(eval.relativeAbsoluteError());
 63 |         	cls.setCorrect(eval.correct());
 64 |         	cls.setKappa(eval.kappa());
 65 |         	cls.setNumInstances(eval.numInstances());
 66 |         	cls.setInCorrect(eval.incorrect());
 67 |         	lstEvaluationDetail.add(new Classification(cls.getPrecision(),
 68 |         			cls.getRecall(),
 69 |         			cls.getAuc(),
 70 |         			cls.getCorrect(),
 71 |         			cls.getInCorrect(),
 72 |         			cls.getErrorRate(),
 73 |         			cls.getFn(),
 74 |         			cls.getFp(),
 75 |         			cls.getTn(),
 76 |         			cls.getTp(),
 77 |         			cls.getKappa(),
 78 |         			cls.getMeanAbsoluteError(),
 79 |         			cls.getNumInstances(),
 80 |         			cls.getRelativeAbsoluteError(),
 81 |         			cls.getFMeasure()));
 82 |         }
 83 |         }
 84 |         catch(Exception ex)
 85 |         {
 86 |         	
 87 |         }*/
 88 |         return lstEvaluationDetail;
 89 | 	}
 90 | 
 91 | 	@Override
 92 | 	public void LearnKNN() throws Exception {
 93 | 		trainedData.setClassIndex(trainedData.numAttributes()-1);
 94 |         filter=new StringToWordVector();
 95 |         classifier=new FilteredClassifier();
 96 |         classifier.setFilter(filter);
 97 |         classifier.setClassifier(new IBk());
 98 |         classifier.buildClassifier(trainedData);
 99 | 		
100 | 	}
101 | 
102 | 	@Override
103 | 	public void SaveModel(String modelName) throws FileNotFoundException, IOException {
104 | 		ObjectOutputStream output=new ObjectOutputStream(
105 |                 new FileOutputStream(modelName));
106 |         output.writeObject(classifier);
107 |         output.close();
108 | 		
109 | 	}
110 | 
111 | }
112 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextLogisticRegressionImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.textclassification;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.FileOutputStream;
  7 | import java.io.FileReader;
  8 | import java.io.IOException;
  9 | import java.io.ObjectOutputStream;
 10 | import java.util.ArrayList;
 11 | import java.util.List;
 12 | import java.util.Random;
 13 | 
 14 | import unsw.curation.api.domain.Classification;
 15 | import unsw.curation.api.domain.abstraction.IClassificationTextLogisticRegression;
 16 | import weka.classifiers.Evaluation;
 17 | import weka.classifiers.functions.Logistic;
 18 | import weka.classifiers.meta.FilteredClassifier;
 19 | import weka.core.Instances;
 20 | import weka.core.converters.ArffLoader;
 21 | import weka.filters.unsupervised.attribute.StringToWordVector;
 22 | 
 23 | public class ExtractClassificationTextLogisticRegressionImpl implements IClassificationTextLogisticRegression 
 24 | {
 25 | 	Instances trainedData;
 26 |     StringToWordVector filter;
 27 |     FilteredClassifier classifier;
 28 |     Classification cls=new Classification();
 29 | 
 30 | 	@Override
 31 | 	public void LoadDataset(File arffFileName) throws IOException 
 32 | 	{
 33 | 		BufferedReader bReader=new BufferedReader(
 34 |                 new FileReader(arffFileName));
 35 |         ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader);
 36 |         trainedData=arff.getData();
 37 |         bReader.close();
 38 | 		
 39 | 	}
 40 | 	@Override
 41 | 	public List<Classification> EvaluateLogisticRegression() throws Exception
 42 | 	{
 43 | 		    List<Classification> lstEvaluationDetail=new ArrayList<>();
 44 | 		    trainedData.setClassIndex(trainedData.numAttributes()-1);
 45 | 	        filter=new StringToWordVector();
 46 | 	        classifier=new FilteredClassifier();
 47 | 	        classifier.setFilter(filter);
 48 | 	        classifier.setClassifier(new Logistic());
 49 | 	        Evaluation eval=new Evaluation(trainedData);
 50 | 	        eval.crossValidateModel(classifier, trainedData, 4, new Random(1));
 51 | 	        /*try
 52 | 	        {
 53 | 	        for(int i=0;i<10000;i++)
 54 | 	        {
 55 | 	        	cls.setPrecision(eval.precision(i));
 56 | 	        	cls.setRecall(eval.recall(i));
 57 | 	        	cls.setAuc(eval.areaUnderPRC(i));
 58 | 	        	cls.setFMeasure(eval.fMeasure(i));
 59 | 	        	cls.setFn(eval.falseNegativeRate(i));
 60 | 	        	cls.setFp(eval.falsePositiveRate(i));
 61 | 	        	cls.setTn(eval.trueNegativeRate(i));
 62 | 	        	cls.setTp(eval.truePositiveRate(i));
 63 | 	        	cls.setMeanAbsoluteError(eval.meanAbsoluteError());
 64 | 	        	cls.setRelativeAbsoluteError(eval.relativeAbsoluteError());
 65 | 	        	cls.setCorrect(eval.correct());
 66 | 	        	cls.setKappa(eval.kappa());
 67 | 	        	cls.setNumInstances(eval.numInstances());
 68 | 	        	cls.setInCorrect(eval.incorrect());
 69 | 	        	lstEvaluationDetail.add(new Classification(cls.getPrecision(),
 70 | 	        			cls.getRecall(),
 71 | 	        			cls.getAuc(),
 72 | 	        			cls.getCorrect(),
 73 | 	        			cls.getInCorrect(),
 74 | 	        			cls.getErrorRate(),
 75 | 	        			cls.getFn(),
 76 | 	        			cls.getFp(),
 77 | 	        			cls.getTn(),
 78 | 	        			cls.getTp(),
 79 | 	        			cls.getKappa(),
 80 | 	        			cls.getMeanAbsoluteError(),
 81 | 	        			cls.getNumInstances(),
 82 | 	        			cls.getRelativeAbsoluteError(),
 83 | 	        			cls.getFMeasure()));
 84 | 	        }
 85 | 	        }
 86 | 	        catch(Exception ex)
 87 | 	        {
 88 | 	        	
 89 | 	        }*/
 90 | 	        return lstEvaluationDetail;
 91 | 	}
 92 | 
 93 | 	@Override
 94 | 	public void LearnLogisticRegression() throws Exception 
 95 | 	{
 96 | 		trainedData.setClassIndex(trainedData.numAttributes()-1);
 97 |         filter=new StringToWordVector();
 98 |         classifier=new FilteredClassifier();
 99 |         classifier.setFilter(filter);
100 |         classifier.setClassifier(new Logistic());
101 |         classifier.buildClassifier(trainedData);
102 | 		
103 | 	}
104 | 	@Override
105 | 	public void SaveModel(String modelName) throws FileNotFoundException, IOException 
106 | 	{
107 | 		 ObjectOutputStream output=new ObjectOutputStream(
108 | 	                new FileOutputStream(modelName));
109 | 	        output.writeObject(classifier);
110 | 	        output.close();
111 | 		
112 | 	}
113 | 
114 | }
115 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextNaiveBaysImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.textclassification;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.FileOutputStream;
  7 | import java.io.FileReader;
  8 | import java.io.IOException;
  9 | import java.io.ObjectOutputStream;
 10 | import java.util.ArrayList;
 11 | import java.util.List;
 12 | import java.util.Random;
 13 | 
 14 | import unsw.curation.api.domain.Classification;
 15 | import unsw.curation.api.domain.abstraction.IClassificationTextNaiveBays;
 16 | import weka.classifiers.Evaluation;
 17 | import weka.classifiers.bayes.NaiveBayes;
 18 | import weka.classifiers.meta.FilteredClassifier;
 19 | import weka.core.Instances;
 20 | import weka.core.converters.ArffLoader;
 21 | import weka.filters.unsupervised.attribute.StringToWordVector;
 22 | 
 23 | 
 24 | public class ExtractClassificationTextNaiveBaysImpl implements IClassificationTextNaiveBays {
 25 | 
 26 | 	Classification cls=new Classification();
 27 | 	Instances trainedData;
 28 |     StringToWordVector filter;
 29 |     FilteredClassifier classifier;
 30 | 	@Override
 31 | 	public void LoadDataset(File arffFileName) throws IOException 
 32 | 	{	
 33 | 		BufferedReader bReader=new BufferedReader(new FileReader(arffFileName));
 34 |         ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader);
 35 |         trainedData=arff.getData();
 36 |         bReader.close();
 37 | 	}
 38 | 
 39 | 	@Override
 40 | 	public List<Classification> EvaluateNaiveBays() throws Exception 
 41 | 	{
 42 | 		List<Classification> lstEvaluationDetail=new ArrayList<>();
 43 | 		trainedData.setClassIndex(trainedData.numAttributes()-1);
 44 |         filter=new StringToWordVector();
 45 |         classifier=new FilteredClassifier();
 46 |         classifier.setFilter(filter);
 47 |         classifier.setClassifier(new NaiveBayes());
 48 |         Evaluation eval=new Evaluation(trainedData);
 49 |         eval.crossValidateModel(classifier, trainedData, 4, new Random(1));
 50 |         /*try
 51 |         {
 52 |         for(int i=0;i<10000;i++)
 53 |         {
 54 |         	cls.setPrecision(eval.precision(i));
 55 |         	cls.setRecall(eval.recall(i));
 56 |         	cls.setAuc(eval.areaUnderPRC(i));
 57 |         	cls.setFMeasure(eval.fMeasure(i));
 58 |         	cls.setFn(eval.falseNegativeRate(i));
 59 |         	cls.setFp(eval.falsePositiveRate(i));
 60 |         	cls.setTn(eval.trueNegativeRate(i));
 61 |         	cls.setTp(eval.truePositiveRate(i));
 62 |         	cls.setMeanAbsoluteError(eval.meanAbsoluteError());
 63 |         	cls.setRelativeAbsoluteError(eval.relativeAbsoluteError());
 64 |         	cls.setCorrect(eval.correct());
 65 |         	cls.setKappa(eval.kappa());
 66 |         	cls.setNumInstances(eval.numInstances());
 67 |         	cls.setInCorrect(eval.incorrect());
 68 |         	lstEvaluationDetail.add(new Classification(cls.getPrecision(),
 69 |         			cls.getRecall(),
 70 |         			cls.getAuc(),
 71 |         			cls.getCorrect(),
 72 |         			cls.getInCorrect(),
 73 |         			cls.getErrorRate(),
 74 |         			cls.getFn(),
 75 |         			cls.getFp(),
 76 |         			cls.getTn(),
 77 |         			cls.getTp(),
 78 |         			cls.getKappa(),
 79 |         			cls.getMeanAbsoluteError(),
 80 |         			cls.getNumInstances(),
 81 |         			cls.getRelativeAbsoluteError(),
 82 |         			cls.getFMeasure()));
 83 |         }
 84 |         }
 85 |         catch(Exception ex)
 86 |         {
 87 |         	
 88 |         }*/
 89 |         return lstEvaluationDetail;
 90 | 	}
 91 | 
 92 | 	@Override
 93 | 	public void LearnNaiveBays() throws Exception 
 94 | 	{
 95 | 		trainedData.setClassIndex(trainedData.numAttributes()-1);
 96 |         filter=new StringToWordVector();
 97 |         classifier=new FilteredClassifier();
 98 |         classifier.setFilter(filter);
 99 |         classifier.setClassifier(new NaiveBayes());
100 |         classifier.buildClassifier(trainedData);
101 | 	}
102 | 
103 | 	@Override
104 | 	public void SaveModel(String modelName) throws FileNotFoundException, IOException
105 | 	{
106 | 		ObjectOutputStream output=new ObjectOutputStream(
107 |                 new FileOutputStream(modelName));
108 |         output.writeObject(classifier);
109 |         output.close();
110 | 	}
111 | 
112 | 
113 | 
114 | }
115 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextNeuralNetworkImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.textclassification;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.FileOutputStream;
  7 | import java.io.FileReader;
  8 | import java.io.IOException;
  9 | import java.io.ObjectOutputStream;
 10 | import java.util.ArrayList;
 11 | import java.util.List;
 12 | import java.util.Random;
 13 | 
 14 | import unsw.curation.api.domain.Classification;
 15 | import unsw.curation.api.domain.abstraction.IClassificationTextNeuralNetwork;
 16 | import weka.classifiers.Evaluation;
 17 | import weka.classifiers.functions.MultilayerPerceptron;
 18 | import weka.classifiers.meta.FilteredClassifier;
 19 | import weka.core.Instances;
 20 | import weka.core.converters.ArffLoader;
 21 | import weka.filters.unsupervised.attribute.StringToWordVector;
 22 | 
 23 | 
 24 | public class ExtractClassificationTextNeuralNetworkImpl implements IClassificationTextNeuralNetwork 
 25 | {
 26 | 
 27 | 	Instances trainedData;
 28 |     StringToWordVector filter;
 29 |     FilteredClassifier classifier;
 30 |     Classification cls=new Classification();
 31 |     
 32 | 	@Override
 33 | 	public void LoadDataset(File arffFileName) throws IOException 
 34 | 	{
 35 | 		BufferedReader bReader=new BufferedReader(
 36 |                 new FileReader(arffFileName));
 37 |         ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader);
 38 |         trainedData=arff.getData();
 39 |         bReader.close();
 40 | 	}
 41 | 
 42 | 	@Override
 43 | 	public List<Classification> EvaluateNeuralNetwork() throws Exception
 44 | 	{
 45 | 		List<Classification> lstEvaluationDetail=new ArrayList<>();
 46 | 		trainedData.setClassIndex(trainedData.numAttributes()-1);
 47 |         filter=new StringToWordVector();
 48 |         classifier=new FilteredClassifier();
 49 |         classifier.setFilter(filter);
 50 |         classifier.setClassifier(new MultilayerPerceptron());
 51 |         Evaluation eval=new Evaluation(trainedData);
 52 |         eval.crossValidateModel(classifier, trainedData, 4, new Random(1));
 53 |         /*try
 54 |         {
 55 |         for(int i=0;i<10000;i++)
 56 |         {
 57 |         	cls.setPrecision(eval.precision(i));
 58 |         	cls.setRecall(eval.recall(i));
 59 |         	cls.setAuc(eval.areaUnderPRC(i));
 60 |         	cls.setFMeasure(eval.fMeasure(i));
 61 |         	cls.setFn(eval.falseNegativeRate(i));
 62 |         	cls.setFp(eval.falsePositiveRate(i));
 63 |         	cls.setTn(eval.trueNegativeRate(i));
 64 |         	cls.setTp(eval.truePositiveRate(i));
 65 |         	cls.setMeanAbsoluteError(eval.meanAbsoluteError());
 66 |         	cls.setRelativeAbsoluteError(eval.relativeAbsoluteError());
 67 |         	cls.setCorrect(eval.correct());
 68 |         	cls.setKappa(eval.kappa());
 69 |         	cls.setNumInstances(eval.numInstances());
 70 |         	cls.setInCorrect(eval.incorrect());
 71 |         	lstEvaluationDetail.add(new Classification(cls.getPrecision(),
 72 |         			cls.getRecall(),
 73 |         			cls.getAuc(),
 74 |         			cls.getCorrect(),
 75 |         			cls.getInCorrect(),
 76 |         			cls.getErrorRate(),
 77 |         			cls.getFn(),
 78 |         			cls.getFp(),
 79 |         			cls.getTn(),
 80 |         			cls.getTp(),
 81 |         			cls.getKappa(),
 82 |         			cls.getMeanAbsoluteError(),
 83 |         			cls.getNumInstances(),
 84 |         			cls.getRelativeAbsoluteError(),
 85 |         			cls.getFMeasure()));
 86 |         }
 87 |         }
 88 |         catch(Exception ex)
 89 |         {
 90 |         	
 91 |         }*/
 92 |         return lstEvaluationDetail;
 93 | 	}
 94 | 
 95 | 	@Override
 96 | 	public void LearnNeuralNetwork() throws Exception 
 97 | 	{
 98 | 		   trainedData.setClassIndex(trainedData.numAttributes()-1);
 99 | 	        filter=new StringToWordVector();
100 | 	        classifier=new FilteredClassifier();
101 | 	        classifier.setFilter(filter);
102 | 	        classifier.setClassifier(new MultilayerPerceptron());
103 | 	        classifier.buildClassifier(trainedData);
104 | 	}
105 | 
106 | 	@Override
107 | 	public void SaveModel(String modelName) throws FileNotFoundException, IOException 
108 | 	{
109 | 		ObjectOutputStream output=new ObjectOutputStream(
110 |                 new FileOutputStream(modelName));
111 |         output.writeObject(classifier);
112 |         output.close();
113 | 	}
114 | 
115 | }
116 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextRandomForestImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.textclassification;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.FileOutputStream;
  7 | import java.io.FileReader;
  8 | import java.io.IOException;
  9 | import java.io.ObjectOutputStream;
 10 | import java.util.ArrayList;
 11 | import java.util.List;
 12 | import java.util.Random;
 13 | 
 14 | import unsw.curation.api.domain.Classification;
 15 | import unsw.curation.api.domain.abstraction.IClassificationTextRandomForest;
 16 | import weka.classifiers.Evaluation;
 17 | import weka.classifiers.meta.FilteredClassifier;
 18 | import weka.classifiers.trees.RandomForest;
 19 | import weka.core.Instances;
 20 | import weka.core.converters.ArffLoader;
 21 | import weka.filters.unsupervised.attribute.StringToWordVector;
 22 | 
 23 | 
 24 | public class ExtractClassificationTextRandomForestImpl implements IClassificationTextRandomForest 
 25 | {
 26 | 	Instances trainedData;
 27 |     StringToWordVector filter;
 28 |     FilteredClassifier classifier;
 29 |     Classification cls=new Classification();
 30 |     
 31 | 	@Override
 32 | 	public void LoadDataset(File arffFileName) throws IOException 
 33 | 	{
 34 | 		BufferedReader bReader=new BufferedReader(
 35 |                 new FileReader(arffFileName));
 36 |         ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader);
 37 |         trainedData=arff.getData();
 38 |         bReader.close();
 39 | 	}
 40 | 
 41 | 	@Override
 42 | 	public List<Classification> EvaluateRandomForest() throws Exception 
 43 | 	{
 44 | 		List<Classification> lstEvaluationDetail=new ArrayList<>();
 45 | 		trainedData.setClassIndex(trainedData.numAttributes()-1);
 46 |         filter=new StringToWordVector();
 47 |         classifier=new FilteredClassifier();
 48 |         classifier.setFilter(filter);
 49 |         classifier.setClassifier(new RandomForest());
 50 |         Evaluation eval=new Evaluation(trainedData);
 51 |         eval.crossValidateModel(classifier, trainedData, 4, new Random(1));
 52 |         /*try
 53 |         {
 54 |         for(int i=0;i<10000;i++)
 55 |         {
 56 |         	cls.setPrecision(eval.precision(i));
 57 |         	cls.setRecall(eval.recall(i));
 58 |         	cls.setAuc(eval.areaUnderPRC(i));
 59 |         	cls.setFMeasure(eval.fMeasure(i));
 60 |         	cls.setFn(eval.falseNegativeRate(i));
 61 |         	cls.setFp(eval.falsePositiveRate(i));
 62 |         	cls.setTn(eval.trueNegativeRate(i));
 63 |         	cls.setTp(eval.truePositiveRate(i));
 64 |         	cls.setMeanAbsoluteError(eval.meanAbsoluteError());
 65 |         	cls.setRelativeAbsoluteError(eval.relativeAbsoluteError());
 66 |         	cls.setCorrect(eval.correct());
 67 |         	cls.setKappa(eval.kappa());
 68 |         	cls.setNumInstances(eval.numInstances());
 69 |         	cls.setInCorrect(eval.incorrect());
 70 |         	lstEvaluationDetail.add(new Classification(cls.getPrecision(),
 71 |         			cls.getRecall(),
 72 |         			cls.getAuc(),
 73 |         			cls.getCorrect(),
 74 |         			cls.getInCorrect(),
 75 |         			cls.getErrorRate(),
 76 |         			cls.getFn(),
 77 |         			cls.getFp(),
 78 |         			cls.getTn(),
 79 |         			cls.getTp(),
 80 |         			cls.getKappa(),
 81 |         			cls.getMeanAbsoluteError(),
 82 |         			cls.getNumInstances(),
 83 |         			cls.getRelativeAbsoluteError(),
 84 |         			cls.getFMeasure()));
 85 |         }
 86 |         }
 87 |         catch(Exception ex)
 88 |         {
 89 |         	
 90 |         }*/
 91 |         return lstEvaluationDetail;
 92 | 	}
 93 | 
 94 | 	@Override
 95 | 	public void LearnRandomForest() throws Exception 
 96 | 	{
 97 | 		   trainedData.setClassIndex(trainedData.numAttributes()-1);
 98 | 	        filter=new StringToWordVector();
 99 | 	        classifier=new FilteredClassifier();
100 | 	        classifier.setFilter(filter);
101 | 	        classifier.setClassifier(new RandomForest());
102 | 	        classifier.buildClassifier(trainedData);
103 | 	}
104 | 
105 | 	@Override
106 | 	public void SaveModel(String modelName) throws FileNotFoundException, IOException 
107 | 	{
108 | 		ObjectOutputStream output=new ObjectOutputStream(
109 |                 new FileOutputStream(modelName));
110 |         output.writeObject(classifier);
111 |         output.close();
112 | 		
113 | 	}
114 | 
115 | }
116 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextSVMImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.textclassification;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.FileOutputStream;
  7 | import java.io.FileReader;
  8 | import java.io.IOException;
  9 | import java.io.ObjectOutputStream;
 10 | import java.util.ArrayList;
 11 | import java.util.List;
 12 | import java.util.Random;
 13 | 
 14 | import unsw.curation.api.domain.Classification;
 15 | import unsw.curation.api.domain.abstraction.IClassificationTextSVM;
 16 | import weka.classifiers.Evaluation;
 17 | import weka.classifiers.functions.SMO;
 18 | import weka.classifiers.meta.FilteredClassifier;
 19 | import weka.core.Instances;
 20 | import weka.core.converters.ArffLoader;
 21 | import weka.filters.unsupervised.attribute.StringToWordVector;
 22 | 
 23 | 
 24 | public class ExtractClassificationTextSVMImpl implements IClassificationTextSVM {
 25 | 
 26 | 	Instances trainedData;
 27 |     StringToWordVector filter;
 28 |     FilteredClassifier classifier;
 29 |     Classification cls=new Classification();
 30 |     
 31 | 	@Override
 32 | 	public void LoadDataset(File arffFileName) throws IOException {
 33 | 		BufferedReader bReader=new BufferedReader(
 34 |                 new FileReader(arffFileName));
 35 |         ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader);
 36 |         trainedData=arff.getData();
 37 |         bReader.close();
 38 | 		
 39 | 	}
 40 | 
 41 | 	@Override
 42 | 	public List<Classification> EvaluateSVM() throws Exception 
 43 | 	{
 44 | 		List<Classification> lstEvaluationDetail=new ArrayList<>();
 45 | 		trainedData.setClassIndex(trainedData.numAttributes()-1);
 46 |         filter=new StringToWordVector();
 47 |         classifier=new FilteredClassifier();
 48 |         classifier.setFilter(filter);
 49 |         classifier.setClassifier(new SMO());
 50 |         Evaluation eval=new Evaluation(trainedData);
 51 |         eval.crossValidateModel(classifier, trainedData, 4, new Random(1));
 52 |         /*try
 53 |         {
 54 |         for(int i=0;i<10000;i++)
 55 |         {
 56 |         	cls.setPrecision(eval.precision(i));
 57 |         	cls.setRecall(eval.recall(i));
 58 |         	cls.setAuc(eval.areaUnderPRC(i));
 59 |         	cls.setFMeasure(eval.fMeasure(i));
 60 |         	cls.setFn(eval.falseNegativeRate(i));
 61 |         	cls.setFp(eval.falsePositiveRate(i));
 62 |         	cls.setTn(eval.trueNegativeRate(i));
 63 |         	cls.setTp(eval.truePositiveRate(i));
 64 |         	cls.setMeanAbsoluteError(eval.meanAbsoluteError());
 65 |         	cls.setRelativeAbsoluteError(eval.relativeAbsoluteError());
 66 |         	cls.setCorrect(eval.correct());
 67 |         	cls.setKappa(eval.kappa());
 68 |         	cls.setNumInstances(eval.numInstances());
 69 |         	cls.setInCorrect(eval.incorrect());
 70 |         	lstEvaluationDetail.add(new Classification(cls.getPrecision(),
 71 |         			cls.getRecall(),
 72 |         			cls.getAuc(),
 73 |         			cls.getCorrect(),
 74 |         			cls.getInCorrect(),
 75 |         			cls.getErrorRate(),
 76 |         			cls.getFn(),
 77 |         			cls.getFp(),
 78 |         			cls.getTn(),
 79 |         			cls.getTp(),
 80 |         			cls.getKappa(),
 81 |         			cls.getMeanAbsoluteError(),
 82 |         			cls.getNumInstances(),
 83 |         			cls.getRelativeAbsoluteError(),
 84 |         			cls.getFMeasure()));
 85 |         }
 86 |         }
 87 |         catch(Exception ex)
 88 |         {
 89 |         	
 90 |         }*/
 91 |         return lstEvaluationDetail;
 92 | 	}
 93 | 
 94 | 	@Override
 95 | 	public void LearnSVM() throws Exception 
 96 | 	{
 97 | 		trainedData.setClassIndex(trainedData.numAttributes()-1);
 98 |         filter=new StringToWordVector();
 99 |         classifier=new FilteredClassifier();
100 |         classifier.setFilter(filter);
101 |         classifier.setClassifier(new SMO());
102 |         classifier.buildClassifier(trainedData);
103 | 		
104 | 	}
105 | 
106 | 	@Override
107 | 	public void SaveModel(String modelName) throws FileNotFoundException, IOException {
108 | 		 ObjectOutputStream output=new ObjectOutputStream(
109 | 	                new FileOutputStream(modelName));
110 | 	        output.writeObject(classifier);
111 | 	        output.close();
112 | 		
113 | 	}
114 | 
115 | }
116 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/textclassification/TextClassifierImpl.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.textclassification;
  2 | 
  3 | import java.io.File;
  4 | import java.io.FileNotFoundException;
  5 | import java.io.IOException;
  6 | import java.util.List;
  7 | import java.util.concurrent.ThreadLocalRandom;
  8 | 
  9 | import unsw.curation.api.classify.TextClassifier;
 10 | import unsw.curation.api.domain.Classification;
 11 | 
 12 | public class TextClassifierImpl extends TextClassifier {
 13 | 
 14 | 	public void TKnn(File trainFile, File testFile, File result) throws Exception
 15 | 	{
 16 | 		   long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999);
 17 | 		   String current = System.getProperty("user.dir");
 18 |            File Textdir= new File(current+"\\TextClassification");
 19 |             if(!Textdir.exists())
 20 |                Textdir.mkdir();
 21 |             
 22 | 		   ExtractClassificationTextKNNImpl knn=new ExtractClassificationTextKNNImpl();
 23 | 		   knn.LoadDataset(trainFile);
 24 | 		   knn.EvaluateKNN();
 25 | 		   knn.LearnKNN();
 26 | 		   knn.SaveModel(Textdir+"\\Knn"+fileNumber+".dat");
 27 | 		   
 28 | 		   LoadTestData(testFile);
 29 | 		   loadModel(Textdir+"\\Knn"+fileNumber+".dat");
 30 | 		   Predict(result.getAbsolutePath());
 31 | 	}
 32 | 	public void TNaiveBayes(File trainFile, File testFile, File result) throws Exception
 33 | 	{
 34 | 		long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999);
 35 | 		String current = System.getProperty("user.dir");
 36 |         File Textdir= new File(current+"\\TextClassification");
 37 |          if(!Textdir.exists())
 38 |             Textdir.mkdir();
 39 |          ExtractClassificationTextNaiveBaysImpl naiveBayes=new ExtractClassificationTextNaiveBaysImpl();
 40 |          naiveBayes.LoadDataset(trainFile);
 41 |          naiveBayes.EvaluateNaiveBays();
 42 |          naiveBayes.LearnNaiveBays();
 43 |          naiveBayes.SaveModel(Textdir+"\\NaiveBayes"+fileNumber+".dat");
 44 |          
 45 |          LoadTestData(testFile);
 46 | 		 loadModel(Textdir+"\\NaiveBayes"+fileNumber+".dat");
 47 | 		 Predict(result.getAbsolutePath());
 48 | 	}
 49 | 	public void TLogisticRegression(File trainFile, File testFile, File result) throws Exception
 50 | 	{
 51 | 		long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999);
 52 | 		String current = System.getProperty("user.dir");
 53 |         File Textdir= new File(current+"\\TextClassification");
 54 |          if(!Textdir.exists())
 55 |             Textdir.mkdir();
 56 |          ExtractClassificationTextLogisticRegressionImpl glm=new ExtractClassificationTextLogisticRegressionImpl();
 57 |          glm.LoadDataset(trainFile);
 58 |          glm.EvaluateLogisticRegression();
 59 |          glm.LearnLogisticRegression();
 60 |          glm.SaveModel(Textdir+"\\Logistic"+fileNumber+".dat");
 61 |          
 62 |          LoadTestData(testFile);
 63 |          loadModel(Textdir+"\\Logistic"+fileNumber+".dat");
 64 |          Predict(result.getAbsolutePath());
 65 | 	}
 66 | 	public void TDecisionTree(File trainFile, File testFile, File result) throws Exception
 67 | 	{
 68 | 		long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999);
 69 | 		String current = System.getProperty("user.dir");
 70 |         File Textdir= new File(current+"\\TextClassification");
 71 |          if(!Textdir.exists())
 72 |             Textdir.mkdir();
 73 |          ExtractClassificationTextDecisionTreeImpl j48=new ExtractClassificationTextDecisionTreeImpl();
 74 |          j48.LoadDataset(trainFile);
 75 |          j48.EvaluateDecisionTree();
 76 |          j48.LearnDecisionTree();
 77 |          j48.SaveModel(Textdir+"\\DecisionTree"+fileNumber+".dat");
 78 |          
 79 |          LoadTestData(testFile);
 80 |          loadModel(Textdir+"\\DecisionTree"+fileNumber+".dat");
 81 |          Predict(result.getAbsolutePath());
 82 | 	}
 83 | 	public void TRandomForest(File trainFile, File testFile, File result) throws Exception
 84 | 	{
 85 | 		long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999);
 86 | 		String current = System.getProperty("user.dir");
 87 |         File Textdir= new File(current+"\\TextClassification");
 88 |          if(!Textdir.exists())
 89 |             Textdir.mkdir();
 90 |         ExtractClassificationTextRandomForestImpl rf=new ExtractClassificationTextRandomForestImpl();
 91 |         rf.LoadDataset(trainFile);
 92 |         rf.EvaluateRandomForest();
 93 |         rf.LearnRandomForest();
 94 |         rf.SaveModel(Textdir+"\\RandomForest"+fileNumber+".dat");
 95 |         
 96 |         LoadTestData(testFile);
 97 |         loadModel(Textdir+"\\RandomForest"+fileNumber+".dat");
 98 |         Predict(result.getAbsolutePath());
 99 | 	}
100 | 	
101 | 	public void TSvm(File trainFile, File testFile, File result) throws Exception
102 | 	{
103 | 		long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999);
104 | 		String current = System.getProperty("user.dir");
105 |         File Textdir= new File(current+"\\TextClassification");
106 |          if(!Textdir.exists())
107 |             Textdir.mkdir();
108 |          
109 |          ExtractClassificationTextSVMImpl svm=new ExtractClassificationTextSVMImpl();
110 |          svm.LoadDataset(trainFile);
111 |          svm.EvaluateSVM();
112 |          svm.LearnSVM();
113 |          svm.SaveModel(Textdir+"\\SVM"+fileNumber+".dat");
114 |          
115 |          LoadTestData(testFile);
116 |          loadModel(Textdir+"\\SVM"+fileNumber+".dat");
117 |          Predict(result.getAbsolutePath());
118 | 	}
119 | 	public void TNeuralNetwork(File trainFile, File testFile, File result) throws Exception
120 | 	{
121 | 		long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999);
122 | 		String current = System.getProperty("user.dir");
123 |         File Textdir= new File(current+"\\TextClassification");
124 |          if(!Textdir.exists())
125 |             Textdir.mkdir();
126 |          ExtractClassificationTextNeuralNetworkImpl neural =new ExtractClassificationTextNeuralNetworkImpl();
127 |          neural.LoadDataset(trainFile);
128 |          neural.EvaluateNeuralNetwork();
129 |          neural.LearnNeuralNetwork();
130 |          neural.SaveModel(Textdir+"\\Neural"+fileNumber+".dat");
131 |          
132 |          LoadTestData(testFile);
133 |          loadModel(Textdir+"\\Neural"+fileNumber+".dat");
134 |          Predict(result.getAbsolutePath());
135 |          
136 | 	}
137 | }
138 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/tfidf/DataSearchSentence.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package unsw.curation.api.tfidf;
 7 | 
 8 | import java.io.File;
 9 | import java.io.IOException;
10 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
11 | import org.apache.lucene.document.Document;
12 | import org.apache.lucene.index.CorruptIndexException;
13 | import org.apache.lucene.index.DirectoryReader;
14 | import org.apache.lucene.index.IndexReader;
15 | import org.apache.lucene.queryparser.classic.ParseException;
16 | import org.apache.lucene.queryparser.classic.QueryParser;
17 | import org.apache.lucene.search.IndexSearcher;
18 | import org.apache.lucene.search.Query;
19 | import org.apache.lucene.search.ScoreDoc;
20 | import org.apache.lucene.search.TopDocs;
21 | import org.apache.lucene.store.FSDirectory;
22 | import org.apache.lucene.util.Version;
23 | 
24 | /**
25 |  *
26 |  * @author Alireza
27 |  */
28 | 
29 | public class DataSearchSentence {
30 |   IndexReader reader;
31 |   IndexSearcher indSearch;
32 |   Query query;
33 |   public DataSearchSentence(String IndexDir) throws IOException
34 |   {
35 |       reader=DirectoryReader.open(FSDirectory.open(new File(IndexDir)));
36 |       indSearch=new IndexSearcher(reader);
37 |   }
38 |   public TopDocs search(String searchText) throws IOException, ParseException
39 |   {
40 |       Query q2=new QueryParser(Version.LUCENE_41,"Content", 
41 |     		  new StandardAnalyzer(Version.LUCENE_41))
42 |     		  .parse(searchText);
43 |       return indSearch.search(q2, 10); 
44 |   }
45 |   public Document getDocument(ScoreDoc score) throws CorruptIndexException, IOException
46 |   {
47 | 	  return indSearch.doc(score.doc);
48 |   }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/tfidf/IndexSentence.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package unsw.curation.api.tfidf;
 7 | 
 8 | import java.io.File;
 9 | import java.io.IOException;
10 | import java.util.List;
11 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
12 | import org.apache.lucene.document.Document;
13 | import org.apache.lucene.document.Field;
14 | import org.apache.lucene.document.TextField;
15 | import org.apache.lucene.index.CorruptIndexException;
16 | import org.apache.lucene.index.IndexWriter;
17 | import org.apache.lucene.index.IndexWriterConfig;
18 | import org.apache.lucene.store.FSDirectory;
19 | import org.apache.lucene.util.Version;
20 | 
21 | /**
22 |  *
23 |  * @author Alireza
24 |  */
25 | public class IndexSentence {
26 |     private IndexWriter writer;
27 |     private StandardAnalyzer Analyzer=new StandardAnalyzer(Version.LUCENE_41);
28 |     public IndexSentence(String indexDirectory) throws IOException
29 |     {
30 |       FSDirectory indexDir=FSDirectory.open(new File(indexDirectory));
31 |          IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_41,Analyzer);
32 | 	 writer=new IndexWriter(indexDir,config);
33 |     }
34 |     public void Close() throws CorruptIndexException, IOException
35 |     {
36 | 	  writer.close();
37 |     }
38 |     
39 |     private Document ListDoc(String  tweet) throws IOException
40 |     {
41 |         Document doc=new Document();
42 | //        Field fileContent=new Field("Content",new FileReader("FileName Must Be Provided"));
43 | //        doc.add(fileContent);
44 |     doc.add(new TextField("Content",tweet, Field.Store.YES));
45 |     //doc.add(new TextField("FilePath",file.getCanonicalPath(),Field.Store.YES));
46 |     return doc;
47 |     }
48 |       private void IndexDocuments(String tweetFilePath) throws IOException{
49 |       System.out.println("Indexing Sentences  ");
50 |       List<String> lstProcessedData=ReadDataSentence.ReadPreProcessedData(tweetFilePath);
51 |       for(String tweet:lstProcessedData)
52 |           {
53 |               try
54 |               {
55 |               Document document = ListDoc(tweet);
56 |               writer.addDocument(document);
57 |               }
58 |               catch(Exception ex)
59 |               {         
60 |                   System.out.print(ex.getMessage());
61 |               }
62 |           }
63 |       }
64 |       public boolean IndexTweets(String tweetFilePath) 
65 |       throws IOException{
66 |       //File[] files = new File(dataDir).listFiles();
67 |      //File f=new File(tweetFilePath);
68 |       //for (File file : files) {
69 |         // if(!tweetFilePath.isDirectory()&& tweetFilePath.exists())
70 |          //{
71 |             IndexDocuments(new File(tweetFilePath).getPath());
72 |          //}
73 |         /* else
74 |          {
75 |              return false;
76 |          }*/
77 |      // }
78 |       //return writer.numDocs();
79 |       return true;
80 |    }
81 |      
82 | }
83 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/tfidf/ReadDataSentence.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package unsw.curation.api.tfidf;
 7 | 
 8 | import java.io.BufferedReader;
 9 | import java.io.FileNotFoundException;
10 | import java.io.FileReader;
11 | import java.io.IOException;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 | 
15 | /**
16 |  *
17 |  * @author Alireza
18 |  */
19 | public class ReadDataSentence {
20 |     
21 |     static String Pattern="(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+";
22 |     public static List<String> ReadPreProcessedData(String filePath) throws FileNotFoundException, IOException{
23 |         List<String> lstData=new ArrayList<>();
24 |         BufferedReader sr;
25 |         String sLine;
26 |         sr = new BufferedReader(new FileReader(filePath));
27 |          while ((sLine = sr.readLine()) != null) {
28 |         	 if(sLine.split(" ").length<1)
29 |          	   {
30 |          		continue;
31 |          	   }
32 |              sLine=sLine.replaceAll(Pattern, "");
33 |              String [] arrSLine=sLine.split(" ");
34 |              String Line="";
35 |              for(String str:arrSLine)
36 |              {
37 |                      str=str.replace("'","");
38 |                      str=str.replace("(","");
39 |                      str=str.replace(")","");
40 |                      str=str.replace("!","");
41 |                      str=str.replace("[","");
42 |                      str=str.replace("]","");
43 |                      str=str.replace("{","");
44 |                      str=str.replace("}","");
45 |                      str=str.replace("\"","");
46 |                      str=str.replace("?","");
47 |                      str=str.replace(".","");
48 |                      Line+=str+" ";
49 |              }
50 |              
51 |          lstData.add(Line.trim());
52 |          }
53 |          return lstData;
54 |     }
55 |     public static List<String> ReadRawData(String filePath) throws FileNotFoundException, IOException
56 |     {
57 |         List<String> lstData=new ArrayList<>();
58 |         BufferedReader sr;
59 |         String sLine;
60 |         sr = new BufferedReader(new FileReader(filePath));
61 |          while ((sLine = sr.readLine()) != null) {
62 |         	 if(sLine.split(" ").length<1)
63 |          	   {
64 |          		continue;
65 |          	   }
66 |          lstData.add(sLine);
67 |          }
68 |          return lstData;
69 |     }
70 |     public static void ReadFromMySql()
71 |     {
72 |         
73 |     }
74 | }
75 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/tokenization/ExtractionKeywordImpl.java:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/src/main/java/unsw/curation/api/tokenization/ExtractionKeywordImpl.java


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitter/KeywordExtraction.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.twitter;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.FileReader;
  7 | import java.io.IOException;
  8 | import java.io.StringReader;
  9 | import java.util.ArrayList;
 10 | import java.util.List;
 11 | import org.apache.lucene.analysis.TokenStream;
 12 | import org.apache.lucene.analysis.core.StopFilter;
 13 | import org.apache.lucene.analysis.standard.StandardTokenizer;
 14 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 15 | import org.apache.lucene.analysis.util.CharArraySet;
 16 | 
 17 | /**
 18 |  *
 19 |  * @author Alireza
 20 |  */
 21 | public class KeywordExtraction {
 22 |     
 23 |     public KeywordExtraction()
 24 |     {
 25 |         
 26 |     }
 27 |     private String preProcessTweet(String tweet)
 28 |     {
 29 |         String Pattern="(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+";
 30 |         tweet=tweet.replaceAll(Pattern, "");
 31 |         String Line="";
 32 |         if(tweet.toCharArray().length<141)
 33 |         {
 34 |             String [] arrSLine=tweet.split(" ");
 35 |             for(String str:arrSLine)
 36 |             {
 37 |                     str=str.replace("'","");
 38 |                     str=str.replace("(","");
 39 |                     str=str.replace(")","");
 40 |                     str=str.replace("!","");
 41 |                     str=str.replace("[","");
 42 |                     str=str.replace("]","");
 43 |                     str=str.replace("{","");
 44 |                     str=str.replace("}","");
 45 |                     str=str.replace("\"","");
 46 |                     str=str.replace("?","");
 47 |                     str=str.replace(".","");
 48 |                     str=str.replace("#","");
 49 |                     str=str.replace("@","");
 50 |                     Line+=str.trim()+" ";
 51 |             }
 52 |         }
 53 |             return Line;
 54 |     }
 55 |     
 56 |          String Pattern="(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+";
 57 | 	 private List<String> lstStopWords=new ArrayList<>();
 58 |         public String ExtractTweetKeyword(String inputTweet, File stopwordList) throws Exception 
 59 | 	{
 60 | 	lstStopWords=ReadRawData(stopwordList);
 61 | 	String trimmedText=inputTweet.replaceAll(Pattern, "");
 62 |         trimmedText=trimmedText.replaceAll("\\d", "");
 63 |         String values=preProcessTweet(trimmedText);
 64 |         CharArraySet stopWords=new CharArraySet(org.apache.lucene.util.Version.LUCENE_41,lstStopWords,true);
 65 |         TokenStream tokenStreamer = new 
 66 |     		StandardTokenizer(org.apache.lucene.util.Version.LUCENE_41, new StringReader(values));
 67 |         tokenStreamer = new StopFilter(org.apache.lucene.util.Version.LUCENE_41, tokenStreamer, stopWords);
 68 |         StringBuilder sb = new StringBuilder();
 69 |         CharTermAttribute charTermAttribute = tokenStreamer.addAttribute(CharTermAttribute.class);
 70 |         tokenStreamer.reset();
 71 |         while (tokenStreamer.incrementToken()) 
 72 |         {
 73 |            String term = charTermAttribute.toString();
 74 |            sb.append(term).append(",");
 75 |         }
 76 |         return sb.toString();
 77 | 	}
 78 |         
 79 |         
 80 |         public String ExtractKeyword(String inputTweet, File stopwordList) throws Exception 
 81 | 	{
 82 | 	lstStopWords=ReadRawData(stopwordList);
 83 | 	String trimmedText=inputTweet.replaceAll(Pattern, "");
 84 |         trimmedText=trimmedText.replaceAll("\\d", "");
 85 |         CharArraySet stopWords=new CharArraySet(org.apache.lucene.util.Version.LUCENE_41,lstStopWords,true);
 86 |         TokenStream tokenStreamer = new 
 87 |     		StandardTokenizer(org.apache.lucene.util.Version.LUCENE_41, new StringReader(trimmedText));
 88 |         tokenStreamer = new StopFilter(org.apache.lucene.util.Version.LUCENE_41, tokenStreamer, stopWords);
 89 |         StringBuilder sb = new StringBuilder();
 90 |         CharTermAttribute charTermAttribute = tokenStreamer.addAttribute(CharTermAttribute.class);
 91 |         tokenStreamer.reset();
 92 |         while (tokenStreamer.incrementToken()) 
 93 |         {
 94 |            String term = charTermAttribute.toString();
 95 |            sb.append(term).append(",");
 96 |         }
 97 |         return sb.toString();
 98 | 	}
 99 |         
100 |        public String ExtractKeywordsList(List<String> lstData) throws Exception
101 |        {
102 |         StringBuilder sb = new StringBuilder();
103 |         for(String str:lstData)
104 |         {
105 |             String trimmedText=str.replaceAll(Pattern, "");
106 |             trimmedText=trimmedText.replaceAll("\\d", "");
107 |         CharArraySet stopWords=new 
108 |             CharArraySet(org.apache.lucene.util.Version.LUCENE_41,lstStopWords,true);
109 |         TokenStream tokenStreamer = new 
110 |             StandardTokenizer(org.apache.lucene.util.Version.LUCENE_41, 
111 |                     new StringReader(trimmedText.trim()));
112 |         tokenStreamer = new 
113 |             StopFilter(org.apache.lucene.util.Version.LUCENE_41, tokenStreamer, stopWords);
114 |         
115 |         CharTermAttribute charTermAttribute = tokenStreamer.addAttribute(CharTermAttribute.class);
116 |         tokenStreamer.reset();
117 |         while (tokenStreamer.incrementToken()) {
118 |             String term = charTermAttribute.toString();
119 |             sb.append(term).append(",");
120 |         }
121 |         }
122 |         return sb.toString();
123 |        }
124 |         
125 |         
126 |         private static List<String> ReadRawData(File filePath) throws FileNotFoundException, IOException 
127 |         {
128 |         List<String> lstData=new ArrayList<>();
129 |         String sLine;
130 |         BufferedReader sr = new BufferedReader(new FileReader(filePath));
131 |          while ((sLine = sr.readLine()) != null) {
132 |          lstData.add(sLine);
133 |          }
134 |          return lstData;
135 |         }
136 | }
137 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitter/MyStemExtraction.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package unsw.curation.api.twitter;
 7 | 
 8 | import java.io.BufferedReader;
 9 | import java.io.File;
10 | import java.io.FileNotFoundException;
11 | import java.io.FileReader;
12 | import java.io.IOException;
13 | import java.util.ArrayList;
14 | import java.util.List;
15 | import unsw.curation.api.twitterdomain.StemDomain;
16 | 
17 | /**
18 |  *
19 |  * @author Alireza
20 |  */
21 | public class MyStemExtraction {
22 |     
23 |     public List<StemDomain> ReadData(File stemFilePath) throws FileNotFoundException, IOException
24 |     {
25 |         List<StemDomain> lstValues=new ArrayList<>();
26 |         BufferedReader reader=new BufferedReader(new FileReader(stemFilePath));
27 |         String line="";
28 |         while((line=reader.readLine())!=null)
29 |         {
30 |             try
31 |             {
32 |             String [] lineValues=line.split("\\|");
33 |            
34 |             String myWord1=lineValues[0].trim().toLowerCase();
35 |             String myDerived1=lineValues[1].trim().toLowerCase();
36 |             String myWord2=lineValues[3].trim().toLowerCase();
37 |             String myDerived2=lineValues[4].trim().toLowerCase();
38 |             lstValues.add(new StemDomain(myWord1,myDerived1,myWord2,myDerived2));
39 |             }
40 |             catch(Exception ex)
41 |             {
42 |                 
43 |             }
44 |         }
45 |        
46 |         
47 |        return lstValues; 
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitter/NamedEntityExtraction.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * To change this license header, choose License Headers in Project Properties.
  3 |  * To change this template file, choose Tools | Templates
  4 |  * and open the template in the editor.
  5 |  */
  6 | package unsw.curation.api.twitter;
  7 | 
  8 | import edu.stanford.nlp.ling.CoreAnnotations;
  9 | import edu.stanford.nlp.ling.CoreLabel;
 10 | import edu.stanford.nlp.pipeline.Annotation;
 11 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 12 | import edu.stanford.nlp.util.CoreMap;
 13 | import java.io.IOException;
 14 | import java.util.ArrayList;
 15 | import java.util.List;
 16 | import java.util.Properties;
 17 | import unsw.curation.api.twitterdomain.NamedEntityDomain;
 18 | /**
 19 |  *
 20 |  * @author Alireza
 21 |  */
 22 | public class NamedEntityExtraction {
 23 | 
 24 |   public List<NamedEntityDomain> ExtractTweetNamedEntities(String tweet) throws IOException, Exception
 25 |      {
 26 | 
 27 |         List<NamedEntityDomain> lstEntityList=new ArrayList<>();
 28 |         Properties props = new Properties();
 29 |         boolean useRegexner = true;
 30 |         if (useRegexner) {
 31 |           props.put("annotators", "tokenize, ssplit, pos, lemma, ner,regexner");
 32 |           props.put("regexner.mapping", "data.txt");
 33 |           
 34 |         } else {
 35 |           props.put("annotators", "tokenize, ssplit, pos,lemma, ner");
 36 |         }
 37 |           String values=preProcessTweet(tweet);
 38 |           StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
 39 |           Annotation document = new Annotation(values);
 40 |           pipeline.annotate(document);
 41 |           List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
 42 |            for (CoreMap sentence : sentences) 
 43 |            {
 44 |                for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) 
 45 |                {
 46 |                    String tToken = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
 47 |                    String word = token.get(CoreAnnotations.TextAnnotation.class);
 48 |                    if(tToken.equalsIgnoreCase("O"))
 49 |                    {
 50 |                            continue;
 51 |                    }
 52 |                    lstEntityList.add(new NamedEntityDomain(word,tToken));
 53 |                }
 54 |            }
 55 |         return lstEntityList;
 56 |      }
 57 |   
 58 |   public static String preProcessTweet(String tweet)
 59 |      {
 60 |          String Pattern="(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+";
 61 |          tweet=tweet.replaceAll(Pattern, "");
 62 |          String Line="";
 63 |          if(tweet.toCharArray().length<141)
 64 |          {
 65 |              String [] arrSLine=tweet.split(" ");
 66 |              for(String str:arrSLine)
 67 |              {
 68 |                      str=str.replace("'","");
 69 |                      str=str.replace("(","");
 70 |                      str=str.replace(")","");
 71 |                      str=str.replace("!","");
 72 |                      str=str.replace("[","");
 73 |                      str=str.replace("]","");
 74 |                      str=str.replace("{","");
 75 |                      str=str.replace("}","");
 76 |                      str=str.replace("\"","");
 77 |                      str=str.replace("?","");
 78 |                      str=str.replace(".","");
 79 |                      str=str.replace("#","");
 80 |                      str=str.replace("@","");
 81 |                      Line+=str.trim()+" ";
 82 |              }
 83 |          }
 84 |              return Line;
 85 |      }
 86 |   
 87 |    public List<NamedEntityDomain> ExtractTweetEntities(String tweet) throws IOException, Exception
 88 |      {
 89 | 
 90 |         List<NamedEntityDomain> lstEntityList=new ArrayList<>();
 91 |         Properties props = new Properties();
 92 |         boolean useRegexner = true;
 93 |         if (useRegexner) {
 94 |           props.put("annotators", "tokenize, ssplit, pos, lemma, ner,regexner");
 95 |           props.put("regexner.mapping", "data.txt");
 96 |           
 97 |         } else {
 98 |           props.put("annotators", "tokenize, ssplit, pos,lemma, ner");
 99 |         }
100 |           //String values=preProcessTweet(tweet);
101 |           StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
102 |           Annotation document = new Annotation(tweet);
103 |           pipeline.annotate(document);
104 |           List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
105 |            for (CoreMap sentence : sentences) 
106 |            {
107 |                for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) 
108 |                {
109 |                    String tToken = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
110 |                    String word = token.get(CoreAnnotations.TextAnnotation.class);
111 |                    if(tToken.equalsIgnoreCase("O"))
112 |                    {
113 |                            continue;
114 |                    }
115 |                    lstEntityList.add(new NamedEntityDomain(word,tToken));
116 |                }
117 |            }
118 |         return lstEntityList;
119 |      }
120 | }
121 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitter/Synonyms.java:
--------------------------------------------------------------------------------
 1 | 
 2 | package unsw.curation.api.twitter;
 3 | 
 4 | import edu.mit.jwi.Dictionary;
 5 | import edu.mit.jwi.IDictionary;
 6 | import edu.mit.jwi.item.IIndexWord;
 7 | import edu.mit.jwi.item.ISynset;
 8 | import edu.mit.jwi.item.IWord;
 9 | import edu.mit.jwi.item.IWordID;
10 | import edu.mit.jwi.item.POS;
11 | import edu.mit.jwi.morph.WordnetStemmer;
12 | import java.io.File;
13 | import java.io.IOException;
14 | import java.util.ArrayList;
15 | import java.util.List;
16 | import unsw.curation.api.twitterdomain.SynonymDomain;
17 | 
18 | /**
19 |  *
20 |  * @author Alireza
21 |  */
22 | public class Synonyms {
23 |     KeywordExtraction EX;
24 |     private String path="C:\\Program Files (x86)\\WordNet\\2.1\\dict\\";
25 |     public Synonyms() throws IOException
26 |     {
27 |       EX=new KeywordExtraction();
28 |     }
29 |     public Synonyms(String dictionaryFilePath) throws IOException
30 |     {
31 |       path=dictionaryFilePath;  
32 |       EX=new KeywordExtraction();
33 |     }
34 |     
35 |     
36 |     public List<SynonymDomain> ExtractSynsetsSentence(String Sentence,File englishStopwordsFilePath) throws IOException, Exception
37 |     {
38 |         List<SynonymDomain> lstSynset=new ArrayList<>();
39 |         String sentenceKeyWords=EX.ExtractKeyword(Sentence, englishStopwordsFilePath);
40 |         for(String str:sentenceKeyWords.split(","))
41 |         {
42 |         String strSynset="";
43 |         File dicFile=new File(path);
44 |         IDictionary dict=new Dictionary(dicFile);
45 |         dict.open();
46 |         WordnetStemmer stemmer=new WordnetStemmer(dict);
47 |          try
48 |          {
49 |             List<String> lstStem=stemmer.findStems(str, POS.NOUN);
50 |             IIndexWord idxWord = dict . getIndexWord (lstStem.get(0), POS.NOUN);
51 |             IWordID wordID = idxWord . getWordIDs ().get(0);
52 |             IWord word = dict.getWord(wordID);
53 |             ISynset sen=word.getSynset();
54 |              for(IWord w:sen.getWords())
55 |              {
56 |                strSynset+=w.getLemma()+",";
57 |                
58 |              }
59 |             lstSynset.add(new SynonymDomain(str, strSynset));
60 |          }
61 |          catch(Exception ex)
62 |          {
63 |             
64 |          }
65 |         }
66 |         return lstSynset;
67 |     }
68 | }
69 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitter/URLExtraction.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.twitter;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.IOException;
  5 | import java.io.InputStreamReader;
  6 | import java.net.HttpURLConnection;
  7 | import java.net.InetSocketAddress;
  8 | import java.net.Proxy;
  9 | import java.net.URL;
 10 | import java.util.ArrayList;
 11 | import java.util.List;
 12 | import java.util.regex.Matcher;
 13 | import java.util.regex.Pattern;
 14 | import org.jsoup.Jsoup;
 15 | import org.jsoup.nodes.Document;
 16 | import org.jsoup.nodes.Element;
 17 | import org.jsoup.select.Elements;
 18 | import unsw.curation.api.twitterdomain.UrlDomain;
 19 | /**
 20 |  *
 21 |  * @author Alireza
 22 |  */
 23 | public class URLExtraction {
 24 |     
 25 |     private Document docPub;
 26 |     UrlDomain urlDomain;
 27 | 
 28 |     public URLExtraction() {
 29 |         urlDomain=new UrlDomain();
 30 | //                System.setProperty("http.proxyHost", "127.0.0.1");
 31 | //                System.setProperty("http.proxyPort", "8580");
 32 | //                System.setProperty("https.proxyHost", "127.0.0.1");
 33 | //                System.setProperty("https.proxyPort", "8580");
 34 |     }
 35 |     
 36 |     
 37 |     public void Extract(String Url) throws IOException
 38 |     {
 39 | //                System.setProperty("http.proxyHost", "127.0.0.1");
 40 | //                System.setProperty("http.proxyPort", "8580");
 41 | //                System.setProperty("https.proxyHost", "127.0.0.1");
 42 | //                System.setProperty("https.proxyPort", "8580");
 43 | //        URL url = new URL(Url);
 44 | //        Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 8580)); 
 45 | //        HttpURLConnection uc = (HttpURLConnection)url.openConnection(proxy);
 46 | //        uc.connect();
 47 | //        String line = null;
 48 | //        StringBuffer tmp = new StringBuffer();
 49 | //        BufferedReader in = new BufferedReader(new InputStreamReader(uc.getInputStream()));
 50 | //        while ((line = in.readLine()) != null) {
 51 | //          tmp.append(line);
 52 | //        }
 53 | //        Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 8080)); // or whatever your proxy is
 54 | //        HttpURLConnection uc = (HttpURLConnection)url.openConnection(proxy);
 55 |          docPub=Jsoup.connect(Url).timeout(10000).get();    
 56 |       //  docPub=Jsoup.parse(String.valueOf(tmp));
 57 |     }
 58 |     
 59 |     private String O_ExtractTitle() throws IOException
 60 |     {
 61 |         if(docPub==null)
 62 |          {
 63 |            throw new IOException("No Page To Download");
 64 |          }
 65 |         String Title=docPub.title();
 66 |         return Title;
 67 |     }
 68 | 
 69 |      
 70 |     private List<String> ExtractHyperLink(String Sentence)
 71 |     {
 72 |      List<String> Links=new ArrayList<>();
 73 |      String [] splitedStr=Sentence.split(" ");
 74 |      for(String st:splitedStr){
 75 |         Pattern p=Pattern.compile("(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+");
 76 |         Matcher m=p.matcher(st);
 77 |         while(m.find())
 78 |         {
 79 |            Links.add(m.group());
 80 |         }
 81 |      }
 82 |      return Links;
 83 |     }
 84 |     
 85 |     List<String>lstTweetLinkList=new ArrayList<>();
 86 |     public List<String> ExtractLinkInfo(String tweet)
 87 |     {
 88 |         lstTweetLinkList=ExtractHyperLink(tweet);
 89 |         List<String> lstPargraphList=new ArrayList<>();
 90 |         if(lstTweetLinkList.size()>0)
 91 |         {
 92 |                 for(String pageLink:lstTweetLinkList)
 93 |                 {
 94 |                     try
 95 |                     {
 96 |                     List<String> lsttemp=new ArrayList<>();
 97 |                     Extract(pageLink);
 98 |                     lsttemp=O_ExtractParagraphes();
 99 |                     lstPargraphList.addAll(lsttemp);
100 |                     }
101 |                     catch(Exception ex)
102 |                     {
103 |                         System.err.println(ex.getMessage());
104 |                     }
105 |                 }
106 |         }
107 |           return lstPargraphList;
108 |     }
109 |     
110 |     private List<String> O_ExtractParagraphes() throws IOException
111 |     {
112 |         List<String> lstParagraphes=new ArrayList<>();
113 |          if(docPub==null)
114 |          {
115 |            throw new IOException("No Page To Download");
116 |          }
117 |         Elements Paragraphes=docPub.select("p");
118 |         for(Element Paragraph:Paragraphes)
119 |         {
120 |             if(Paragraph.text().length()>1)
121 |             lstParagraphes.add(Paragraph.text());
122 |         }
123 |         System.out.println("Downloading Page Content...");
124 |         return lstParagraphes;
125 |     }
126 |     
127 |     List<String>lstTweetTitle=new ArrayList<>();
128 |     public List<String> ExtractTitle(String tweet)
129 |     {
130 |         lstTweetTitle=ExtractHyperLink(tweet);
131 |         List<String> lstTitleList=new ArrayList<>();
132 |         if(lstTweetTitle.size()>0)
133 |         {
134 |                 for(String pageLink:lstTweetTitle)
135 |                 {
136 |                     try
137 |                     {
138 |                     String title="";
139 |                     Extract(pageLink);
140 |                     title=O_ExtractTitle();
141 |                     lstTitleList.add(title);
142 |                     }
143 |                     catch(Exception ex)
144 |                     {
145 |                         System.err.println(ex.getMessage());
146 |                     }
147 |                 }
148 |         }
149 |           return lstTitleList;
150 |     }
151 | }
152 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitterdomain/KeywordDomain.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package unsw.curation.api.twitterdomain;
 7 | 
 8 | /**
 9 |  *
10 |  * @author Alireza
11 |  */
12 | public class KeywordDomain {
13 |     
14 |     public KeywordDomain(){}
15 | 	public KeywordDomain(String tweet,String keyword)
16 | 	{
17 | 		this.tweet=tweet;
18 | 		this.keyword=keyword;
19 | 	}
20 | 	public KeywordDomain(String keyword)
21 | 	{
22 | 		this.keyword=keyword;
23 | 	}
24 | 	public String tweet;
25 | 	public String keyword;
26 | 	public String inputSentence;
27 | 	public String inputTweet;
28 | 	
29 | 	public void setInputSentence(String inputSentence)
30 | 	{
31 | 	  this.inputSentence=inputSentence;
32 | 	}
33 | 	
34 | 	public String getInputSentence()
35 | 	{
36 | 	  return inputSentence;
37 | 	}
38 | 	public void setInputTweet(String inputTweet)
39 | 	{
40 | 	  this.inputTweet=inputTweet;
41 | 	}
42 | 	public String getInputTweet()
43 | 	{
44 | 		return inputTweet;
45 | 	}
46 | 	public void setTweet(String tweet)
47 | 	{
48 | 		this.tweet=tweet;
49 | 	}
50 | 	public String getTweet()
51 | 	{
52 | 		return tweet;
53 | 	}
54 | 	
55 | 	public void setKeyword(String keyword)
56 | 	{
57 | 		this.keyword=keyword;
58 | 	}
59 | 	public String getKeyword()
60 | 	{
61 | 		return keyword;
62 | 	}
63 | }
64 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitterdomain/NamedEntityDomain.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package unsw.curation.api.twitterdomain;
 7 | 
 8 | /**
 9 |  *
10 |  * @author Alireza
11 |  */
12 | public class NamedEntityDomain {
13 |     
14 |     public String _Word;
15 |     public String _Ner;
16 |     
17 |     public void setWord(String _Word)
18 |     {
19 |         this._Word=_Word;
20 |     }
21 |     public String getWord()
22 |     {
23 |         return _Word;
24 |     }
25 |     
26 |     public void setNer(String _Ner)
27 |     {
28 |         this._Ner=_Ner;
29 |     }
30 |     public String getNer()
31 |     {
32 |         return _Ner;
33 |     }
34 | //    private String HashTag;
35 | //    private String Text;
36 | //    private String Domain;
37 | //    private String KeyWords;
38 | //    private String Synonyms;
39 | //    private String Links;
40 | //    private String Entities;
41 |     public NamedEntityDomain(){}
42 |     
43 |     public NamedEntityDomain(String word,String Ner)
44 |     {
45 |         _Word=word;
46 |         _Ner=Ner;
47 |     }
48 | //    public NamedEntityDomain(String Text,String Entities,String HashTag,String Domain, String KeyWords,String Synonyms,String Links)
49 | //    {
50 | //        this.HashTag=HashTag;
51 | //        this.Text=Text;
52 | //        this.Entities=Entities;
53 | //        this.Domain=Domain;
54 | //        this.KeyWords=KeyWords;
55 | //        this.Synonyms=Synonyms;
56 | //        this.Links=Links;
57 | //    }
58 | }
59 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitterdomain/StemDomain.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package unsw.curation.api.twitterdomain;
 7 | 
 8 | /**
 9 |  *
10 |  * @author Alireza
11 |  */
12 | public class StemDomain {
13 |     public StemDomain(){}
14 |     private String word1;
15 |     public void setWord1(String word)
16 |     {
17 |         this.word1=word;
18 |     }
19 |     public String getWord1()
20 |     {
21 |         return word1;
22 |     }
23 |     private String derived1;
24 |     public void setDerived1(String derived)
25 |     {
26 |         this.derived1=derived;
27 |     }
28 |     public String getDerived1()
29 |     {
30 |         return this.derived1;
31 |     }
32 |     
33 |     private String word2;
34 |     public void setWord2(String word)
35 |     {
36 |         this.word2=word;
37 |     }
38 |     public String getWord2()
39 |     {
40 |         return word2;
41 |     }
42 |     private String derived2;
43 |     public void setDerived2(String derived)
44 |     {
45 |         this.derived2=derived;
46 |     }
47 |     public String getDerived2()
48 |     {
49 |         return this.derived2;
50 |     }
51 |     public StemDomain(String word1,String derived1,String word2,String derived2)
52 |     {
53 |         this.word1=word1;
54 |         this.word2=word2;
55 |         this.derived1=derived1;
56 |         this.derived2=derived2;
57 |     }
58 |     
59 | //    public StemDomain(StemDomain Domain)
60 | //    {
61 | //        this.word1=Domain.getWord1();
62 | //        this.word2=Domain.getWord2();
63 | //        this.derived1=Domain.getDerived1();
64 | //        this.derived2=Domain.getDerived2();
65 | //    }
66 | }
67 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitterdomain/SynonymDomain.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package unsw.curation.api.twitterdomain;
 7 | 
 8 | /**
 9 |  *
10 |  * @author Alireza
11 |  */
12 | public class SynonymDomain {
13 |     
14 |     public SynonymDomain(){}
15 |     public SynonymDomain(String word, String synset)
16 |     {
17 |         this.word=word;
18 |         this.synset=synset;
19 |     }
20 |     public String word;
21 |     public String synset;
22 |     
23 |     public void setWord(String word)
24 |     {
25 |         this.word=word;
26 |     }
27 |     public String getWord()
28 |     {
29 |         return this.word;
30 |     }
31 |     public void setSynset(String synset)
32 |     {
33 |         this.synset=synset;
34 |     }
35 |     public String getSynset()
36 |     {
37 |         return synset;
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitterdomain/UrlDomain.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * To change this license header, choose License Headers in Project Properties.
 3 |  * To change this template file, choose Tools | Templates
 4 |  * and open the template in the editor.
 5 |  */
 6 | package unsw.curation.api.twitterdomain;
 7 | 
 8 | /**
 9 |  *
10 |  * @author Alireza
11 |  */
12 | public class UrlDomain 
13 | {
14 |     private String pageTitle;
15 |     
16 |     public void setPageTitle(String pageTitle)
17 |     {
18 |         this.pageTitle=pageTitle;
19 |     }
20 |     public String getPageTitle()
21 |     {
22 |         return this.pageTitle;
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/url/GetHTMLFile.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.url;
  2 | 
  3 | import java.io.File;
  4 | import java.io.IOException;
  5 | import java.util.ArrayList;
  6 | import java.util.List;
  7 | import org.jsoup.Jsoup;
  8 | import org.jsoup.nodes.Document;
  9 | import org.jsoup.nodes.Element;
 10 | import org.jsoup.select.Elements;
 11 | 
 12 | /**
 13 |  *
 14 |  * @author Alireza
 15 |  */
 16 | public class GetHTMLFile {
 17 |     
 18 |     private Document docPub;
 19 |     /**
 20 |      * 
 21 |      * @param FilePath
 22 |      * @throws IOException 
 23 |      */
 24 |     public void ExtractLocal(String FilePath) throws IOException
 25 |     {
 26 |        docPub=Jsoup.parse(new File(FilePath), "UTF-8");
 27 |     }
 28 |     /**
 29 |      * 
 30 |      * @return
 31 |      * @throws IOException 
 32 |      */
 33 |     public String L_ExtractTitle() throws IOException
 34 |     {
 35 |         if(docPub==null)
 36 |          {
 37 |            throw new IOException("No Page To Parse");
 38 |          }
 39 |         String Title=docPub.title();
 40 |         return Title;
 41 |     }
 42 |     /**
 43 |      * 
 44 |      * @param Url
 45 |      * @return
 46 |      * @throws IOException 
 47 |      */
 48 |     public List<String> L_ExtractHeadings() throws IOException
 49 |     {
 50 |         List<String> lstHeadings=new ArrayList<>();
 51 |          if(docPub==null)
 52 |          {
 53 |            throw new IOException("No Page To Parse");
 54 |          }
 55 |         Elements H1=docPub.select("h1");
 56 |         Elements H2=docPub.select("h2");
 57 |         Elements H3=docPub.select("H3");
 58 |         Elements H4=docPub.select("H4");
 59 |         for(Element H:H1)
 60 |         {
 61 |             lstHeadings.add(H.text());
 62 |         }
 63 |         for(Element H:H2)
 64 |         {
 65 |             lstHeadings.add(H.text());
 66 |         }
 67 |         for(Element H:H3)
 68 |         {
 69 |             lstHeadings.add(H.text());
 70 |         }
 71 |         for(Element H:H4)
 72 |         {
 73 |             lstHeadings.add(H.text());
 74 |         }
 75 |         return lstHeadings;
 76 |     }
 77 |     /**
 78 |      * 
 79 |      * @return
 80 |      * @throws IOException 
 81 |      */
 82 |     public List<String> L_ExtractHrefText() throws IOException
 83 |     {
 84 |         List<String> lstHref=new ArrayList<>();
 85 |         if(docPub==null)
 86 |          {
 87 |            throw new IOException("No Page To Parse");
 88 |          }
 89 |         Elements Hrefs=docPub.select("a[href]");
 90 |         for(Element Href:Hrefs)
 91 |         {
 92 |             lstHref.add(Href.text());
 93 |         }
 94 |         return lstHref;        
 95 |     }
 96 |     /**
 97 |      * 
 98 |      * @return
 99 |      * @throws IOException 
100 |      */
101 |     public List<String> L_ExtractParagraphes() throws IOException
102 |     {
103 |         List<String> lstParagraphes=new ArrayList<>();
104 |          if(docPub==null)
105 |          {
106 |            throw new IOException("No Page To Parse");
107 |          }
108 |         Elements Paragraphes=docPub.select("p");
109 |         for(Element Paragraph:Paragraphes)
110 |         {
111 |             if(Paragraph.text().length()>1)
112 |             lstParagraphes.add(Paragraph.text());
113 |         }
114 |         return lstParagraphes;
115 |     }
116 |     /**
117 |      * 
118 |      * @param Position
119 |      * @return
120 |      * @throws IOException 
121 |      */
122 |     public String L_ExtractParagraphByPosition(int Position) throws IOException
123 |     {
124 |         if(docPub==null)
125 |         {
126 |            throw new IOException("No Page To Parse");
127 |         }
128 |         try{
129 |         List<String> lstParagraphes=new ArrayList<>();
130 |         Elements Paragraphes=docPub.select("p");
131 |         for(Element Paragraph:Paragraphes)
132 |         {
133 |             if(Paragraph.text().length()>1)
134 |             lstParagraphes.add(Paragraph.text());
135 |         }
136 |         
137 |         if(lstParagraphes.size()<Position)
138 |         {
139 |             System.out.println(String.format("There are %s Paragraphes in the Current Document ",
140 |                     lstParagraphes.size()));
141 |            return "Incorrect Paragraph Position";
142 |         }
143 |         return lstParagraphes.get(Position-1);
144 |         }
145 |         catch(Exception ex)
146 |         {
147 |             return ex.getMessage();
148 |         }
149 |     }
150 |     /**
151 |      * 
152 |      * @return 
153 |      */
154 |     public List<String> L_ExtractImageALTtext()
155 |     {
156 |         List<String> lstImage=new ArrayList<>();
157 |          Elements src=docPub.select("img[src]");
158 | 	 for(Element el:src)
159 | 	 {
160 |              if(el.attr("alt").length()>1)
161 |                lstImage.add(el.attr("alt"));
162 | 	 }
163 |         return lstImage;
164 |     }
165 |     /**
166 |      * 
167 |      * @return 
168 |      */
169 |     public List<String> L_ExtractListTexts()
170 |     {
171 |         List<String> lstUl=new ArrayList<>();
172 |         Elements Ul=docPub.select("ul");
173 |         for(Element e:Ul)
174 |         {
175 |             lstUl.add(e.text());
176 |         }
177 |         return lstUl;
178 |     }
179 | //    public void SaveResult(List<String> LstData,String Path) throws IOException
180 | //    {
181 | //      BufferedWriter buf=new BufferedWriter(new FileWriter(Path));
182 | //      for(String Values:LstData)
183 | //      {
184 | //          buf.write(Values);
185 | //          buf.newLine();
186 | //      }
187 | //      buf.close();
188 | //    }
189 | //    public void SaveResult(String Values,String Path) throws IOException
190 | //    {
191 | //      BufferedWriter buf=new BufferedWriter(new FileWriter(Path));
192 | //          buf.write(Values);
193 | //          buf.newLine();
194 | //         buf.close();
195 | //    }
196 | }
197 | 


--------------------------------------------------------------------------------
/test.txt:
--------------------------------------------------------------------------------
1 | Dont censore the web ask the congress.


--------------------------------------------------------------------------------
/text.txt:
--------------------------------------------------------------------------------
1 | {"searchinfo":{"search":"taylor swift"},"search":[{"id":"Q26876","concepturi":"http://www.wikidata.org/entity/Q26876","url":"//www.wikidata.org/wiki/Q26876","title":"Q26876","pageid":30291,"label":"Taylor Swift","description":"singer-songwriter from the United States","match":{"type":"label","language":"en","text":"Taylor Swift"}},{"id":"Q845783","concepturi":"http://www.wikidata.org/entity/Q845783","url":"//www.wikidata.org/wiki/Q845783","title":"Q845783","pageid":797842,"label":"Taylor Swift","description":"Eponymous debut studio album by Taylor Swift","match":{"type":"label","language":"en","text":"Taylor Swift"}},{"id":"Q276736","concepturi":"http://www.wikidata.org/entity/Q276736","url":"//www.wikidata.org/wiki/Q276736","title":"Q276736","pageid":267594,"label":"Taylor Swift discography","description":"discography","match":{"type":"label","language":"en","text":"Taylor Swift discography"}},{"id":"Q20734198","concepturi":"http://www.wikidata.org/entity/Q20734198","url":"//www.wikidata.org/wiki/Q20734198","title":"Q20734198","pageid":22479565,"label":"Taylor Swift breaks Vevo record","match":{"type":"label","language":"en","text":"Taylor Swift breaks Vevo record"}},{"id":"Q27076640","concepturi":"http://www.wikidata.org/entity/Q27076640","url":"//www.wikidata.org/wiki/Q27076640","title":"Q27076640","pageid":28932752,"label":"Taylor Swift videography","match":{"type":"label","language":"en","text":"Taylor Swift videography"}},{"id":"Q22814294","concepturi":"http://www.wikidata.org/entity/Q22814294","url":"//www.wikidata.org/wiki/Q22814294","title":"Q22814294","pageid":24835551,"label":"Taylor Swift's 1989 wins Grammy's Record of the year; Bad Blood wins the Best Music Video","match":{"type":"label","language":"en","text":"Taylor Swift's 1989 wins Grammy's Record of the year; Bad Blood wins the Best Music Video"}},{"id":"Q7690142","concepturi":"http://www.wikidata.org/entity/Q7690142","url":"//www.wikidata.org/wiki/Q7690142","title":"Q7690142","pageid":7615766,"label":"Taylor Swift and Def Leppard","match":{"type":"label","language":"en","text":"Taylor Swift and Def Leppard"}}],"success":1}
2 | {"searchinfo":{"search":"toyota"},"search":[{"id":"Q53268","concepturi":"http://www.wikidata.org/entity/Q53268","url":"//www.wikidata.org/wiki/Q53268","title":"Q53268","pageid":55718,"label":"Toyota","description":"automotive brand manufacturer","match":{"type":"label","language":"en","text":"Toyota"}},{"id":"Q201117","concepturi":"http://www.wikidata.org/entity/Q201117","url":"//www.wikidata.org/wiki/Q201117","title":"Q201117","pageid":197802,"label":"Toyota","description":"city in Aichi Prefecture, Japan","match":{"type":"label","language":"en","text":"Toyota"}},{"id":"Q236526","concepturi":"http://www.wikidata.org/entity/Q236526","url":"//www.wikidata.org/wiki/Q236526","title":"Q236526","pageid":229982,"label":"Toyota","description":"Wikipedia disambiguation page","match":{"type":"label","language":"en","text":"Toyota"}},{"id":"Q17669963","concepturi":"http://www.wikidata.org/entity/Q17669963","url":"//www.wikidata.org/wiki/Q17669963","title":"Q17669963","pageid":19265719,"label":"Toyota","match":{"type":"label","language":"en","text":"Toyota"}},{"id":"Q22341651","concepturi":"http://www.wikidata.org/entity/Q22341651","url":"//www.wikidata.org/wiki/Q22341651","title":"Q22341651","pageid":24368174,"label":"Mathunny Mathews","description":"Indian businessperson","match":{"type":"alias","language":"en","text":"Toyota Sunny"},"aliases":["Toyota Sunny"]},{"id":"Q10700769","concepturi":"http://www.wikidata.org/entity/Q10700769","url":"//www.wikidata.org/wiki/Q10700769","title":"Q10700769","pageid":11976587,"label":"Toyotahallen","description":"Wikimedia disambiguation page","match":{"type":"label","language":"en","text":"Toyotahallen"}},{"id":"Q182473","concepturi":"http://www.wikidata.org/entity/Q182473","url":"//www.wikidata.org/wiki/Q182473","title":"Q182473","pageid":181568,"label":"Intercontinental Cup","description":"international association football tournament for clubs","match":{"type":"alias","language":"en","text":"Toyota Cup"},"aliases":["Toyota Cup"]}],"search-continue":7,"success":1}
3 | {"searchinfo":{"search":"lionel messi"},"search":[{"id":"Q615","concepturi":"http://www.wikidata.org/entity/Q615","url":"//www.wikidata.org/wiki/Q615","title":"Q615","pageid":899,"label":"Lionel Messi","description":"Argentine footballer","match":{"type":"label","language":"en","text":"Lionel Messi"}},{"id":"Q16301083","concepturi":"http://www.wikidata.org/entity/Q16301083","url":"//www.wikidata.org/wiki/Q16301083","title":"Q16301083","pageid":17928471,"label":"Lionel Messi Nyamsi","description":"Cameroonian footballer","match":{"type":"label","language":"en","text":"Lionel Messi Nyamsi"}}],"success":1}
4 | 


--------------------------------------------------------------------------------