├── .gitattributes
├── .gitignore
├── CurationAPIs-UserGuide.pdf
├── Index
│   ├── _a.cfe
│   ├── _a.cfs
│   ├── _a.si
│   ├── segments.gen
│   └── segments_b
├── IndexSentence
│   ├── _2.fdt
│   ├── _2.fdx
│   ├── _2.fnm
│   ├── _2.si
│   ├── _2.tvd
│   ├── _2.tvf
│   ├── _2.tvx
│   ├── _2_Lucene41_0.doc
│   ├── _2_Lucene41_0.pay
│   ├── _2_Lucene41_0.pos
│   ├── _2_Lucene41_0.tim
│   ├── _2_Lucene41_0.tip
│   ├── _2_nrm.cfe
│   ├── _2_nrm.cfs
│   ├── segments.gen
│   └── segments_3
├── LICENSE
├── README.md
├── Result.xml
├── Sentence_Index
│   ├── _0.cfe
│   ├── _0.cfs
│   ├── _0.si
│   ├── segments.gen
│   └── segments_1
├── Stem.txt
├── TextClassification
│   ├── Knn1248308.dat
│   ├── Knn1793382.dat
│   ├── Knn2053135.dat
│   ├── Knn4725799.dat
│   ├── Knn5442488.dat
│   ├── Knn8341982.dat
│   ├── Knn8586130.dat
│   ├── Knn8814335.dat
│   ├── NaiveBayes1567923.dat
│   ├── NaiveBayes1902130.dat
│   ├── NaiveBayes3099268.dat
│   └── NaiveBayes5432235.dat
├── cosine.txt
├── cosineDoc
│   ├── data.txt
│   └── tweets.txt
├── data.txt
├── englishStopwords.txt
├── entity.txt
├── pom.xml
├── pos.txt
├── src
│   └── main
│       └── java
│           └── unsw
│               └── curation
│                   └── api
│                       ├── classify
│                       │   └── TextClassifier.java
│                       ├── cosinesentence
│                       │   ├── AllTermsSentence.java
│                       │   ├── CosineSimilaritySentence.java
│                       │   ├── DocVectorSentence.java
│                       │   ├── IndexSentence.java
│                       │   └── VectorGeneratorSentence.java
│                       ├── cosinetext
│                       │   ├── AllTerms.java
│                       │   ├── CosineSimilarity.java
│                       │   ├── DocVector.java
│                       │   ├── Index.java
│                       │   └── VectorGenerator.java
│                       ├── domain
│                       │   ├── Classification.java
│                       │   ├── ExtractNamedEntity.java
│                       │   ├── ExtractNumberSimilarity.java
│                       │   ├── ExtractPosTag.java
│                       │   ├── ExtractStem.java
│                       │   ├── ExtractSynonym.java
│                       │   ├── ExtractTextCosineSimilarity.java
│                       │   ├── ExtractTextSimilarity.java
│                       │   ├── ExtractTextTfidfSimilarity.java
│                       │   ├── ExtractionKeyword.java
│                       │   └── abstraction
│                       │       ├── IClassificationTextDecisionTree.java
│                       │       ├── IClassificationTextKNN.java
│                       │       ├── IClassificationTextLogisticRegression.java
│                       │       ├── IClassificationTextNaiveBays.java
│                       │       ├── IClassificationTextNeuralNetwork.java
│                       │       ├── IClassificationTextRandomForest.java
│                       │       ├── IClassificationTextSVM.java
│                       │       ├── IKeywordEx.java
│                       │       ├── INamedEntity.java
│                       │       ├── INumberCosineSimilarity.java
│                       │       ├── INumberDiceSimilarity.java
│                       │       ├── INumberEuclideanSimilarity.java
│                       │       ├── INumberJaccardSimilarity.java
│                       │       ├── IPosTag.java
│                       │       ├── IStem.java
│                       │       ├── ISynonym.java
│                       │       ├── ITextCosineSimilarity.java
│                       │       ├── ITextJaccardSimilarity.java
│                       │       ├── ITextJaroSimilarity.java
│                       │       ├── ITextLevenshtainSimilarity.java
│                       │       ├── ITextQGramSimilarity.java
│                       │       ├── ITextSoundexSimilarity.java
│                       │       ├── ITextTfidfSimilarity.java
│                       │       └── IUrlExtraction.java
│                       ├── extractnamedentity
│                       │   ├── ExtractEntityFile.java
│                       │   ├── ExtractEntitySentence.java
│                       │   ├── RegexClass.java
│                       │   ├── curation.jpg
│                       │   └── curation.ucls
│                       ├── extractpostag
│                       │   └── ExtractPosTagData.java
│                       ├── extractsimilarity
│                       │   ├── ExtractNumberCosineSimilarityImpl.java
│                       │   ├── ExtractNumberDiceSimilarityImpl.java
│                       │   ├── ExtractNumberEuclideanSimilarity.java
│                       │   ├── ExtractNumberJaccardSimilarityImpl.java
│                       │   ├── ExtractTextCosineSimilarityImpl.java
│                       │   ├── ExtractTextJaccardSimilarityImpl.java
│                       │   ├── ExtractTextJaroSimialrity.java
│                       │   ├── ExtractTextLevenshtainImpl.java
│                       │   ├── ExtractTextQGramSimilarity.java
│                       │   ├── ExtractTextSoundexSimilarity.java
│                       │   └── ExtractTextTfIdfSimilarityImpl.java
│                       ├── extractstem
│                       │   └── ExtractStemImpl.java
│                       ├── extractsynonym
│                       │   └── WordNetFile.java
│                       ├── index
│                       │   ├── DataSearch.java
│                       │   ├── Index.java
│                       │   └── SchIndData.java
│                       ├── linking
│                       │   ├── ConceptNet.java
│                       │   ├── GoogleKnowledgeGraph.java
│                       │   └── WikiData.java
│                       ├── run
│                       │   └── run.java
│                       ├── textclassification
│                       │   ├── EvaluateClassifier.java
│                       │   ├── ExtractClassificationTextDecisionTreeImpl.java
│                       │   ├── ExtractClassificationTextKNNImpl.java
│                       │   ├── ExtractClassificationTextLogisticRegressionImpl.java
│                       │   ├── ExtractClassificationTextNaiveBaysImpl.java
│                       │   ├── ExtractClassificationTextNeuralNetworkImpl.java
│                       │   ├── ExtractClassificationTextRandomForestImpl.java
│                       │   ├── ExtractClassificationTextSVMImpl.java
│                       │   └── TextClassifierImpl.java
│                       ├── tfidf
│                       │   ├── DataSearchSentence.java
│                       │   ├── IndexSentence.java
│                       │   └── ReadDataSentence.java
│                       ├── tokenization
│                       │   └── ExtractionKeywordImpl.java
│                       ├── twitter
│                       │   ├── KeywordExtraction.java
│                       │   ├── MyStemExtraction.java
│                       │   ├── NamedEntityExtraction.java
│                       │   ├── Synonyms.java
│                       │   ├── TweetInfo.java
│                       │   ├── URLExtraction.java
│                       │   └── XmlGenerator.java
│                       ├── twitterdomain
│                       │   ├── KeywordDomain.java
│                       │   ├── NamedEntityDomain.java
│                       │   ├── StemDomain.java
│                       │   ├── SynonymDomain.java
│                       │   ├── TweetInfoDomain.java
│                       │   └── UrlDomain.java
│                       └── url
│                           ├── GetHTMLFile.java
│                           └── GetURL.java
├── test.txt
├── text.txt
└── tweets.txt
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.jar filter=lfs diff=lfs merge=lfs -text
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | # Eclipse
 3 | .classpath
 4 | .project
 5 | .settings/
 6 | 
 7 | # Intellij
 8 | .idea/
 9 | *.iml
10 | *.iws
11 | 
12 | # Mac
13 | .DS_Store
14 | 
15 | # Maven
16 | log/
17 | target/
--------------------------------------------------------------------------------
/CurationAPIs-UserGuide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/CurationAPIs-UserGuide.pdf
--------------------------------------------------------------------------------
/Index/_a.cfe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Index/_a.cfe
--------------------------------------------------------------------------------
/Index/_a.cfs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Index/_a.cfs
--------------------------------------------------------------------------------
/Index/_a.si:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Index/_a.si
--------------------------------------------------------------------------------
/Index/segments.gen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Index/segments.gen
--------------------------------------------------------------------------------
/Index/segments_b:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Index/segments_b
--------------------------------------------------------------------------------
/IndexSentence/_2.fdt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.fdt
--------------------------------------------------------------------------------
/IndexSentence/_2.fdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.fdx
--------------------------------------------------------------------------------
/IndexSentence/_2.fnm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.fnm
--------------------------------------------------------------------------------
/IndexSentence/_2.si:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.si
--------------------------------------------------------------------------------
/IndexSentence/_2.tvd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.tvd
--------------------------------------------------------------------------------
/IndexSentence/_2.tvf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.tvf
--------------------------------------------------------------------------------
/IndexSentence/_2.tvx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2.tvx
--------------------------------------------------------------------------------
/IndexSentence/_2_Lucene41_0.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_Lucene41_0.doc
--------------------------------------------------------------------------------
/IndexSentence/_2_Lucene41_0.pay:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_Lucene41_0.pay
--------------------------------------------------------------------------------
/IndexSentence/_2_Lucene41_0.pos:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_Lucene41_0.pos
--------------------------------------------------------------------------------
/IndexSentence/_2_Lucene41_0.tim:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_Lucene41_0.tim
--------------------------------------------------------------------------------
/IndexSentence/_2_Lucene41_0.tip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_Lucene41_0.tip
--------------------------------------------------------------------------------
/IndexSentence/_2_nrm.cfe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_nrm.cfe
--------------------------------------------------------------------------------
/IndexSentence/_2_nrm.cfs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/_2_nrm.cfs
--------------------------------------------------------------------------------
/IndexSentence/segments.gen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/segments.gen
--------------------------------------------------------------------------------
/IndexSentence/segments_3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/IndexSentence/segments_3
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ----Data Curation APIs-------------
 2 | 
 3 | Understanding and analyzing big data is firmly recognized as a powerful and strategic priority. For deeper interpretation of and better intelligence with big data, it is important to transform raw data (unstructured, semi-structured and structured data sources, e.g., text, video, image data sets) into curated data: contextualized data and knowledge that is maintained and made available for use by end-users and applications. In particular, data curation acts as the glue between raw data and analytics, providing an abstraction layer that relieves users from time-consuming, tedious and error-prone curation tasks. In this context, the data curation process becomes a vital analytics asset for increasing added value and insights.
 4 | 
 5 | We identify and implement a set of curation APIs and make them available (as an open source project on GitHub) to researchers and developers to assist them in transforming their raw data into curated data. The curation APIs enable developers to easily add features into their applications - such as extracting keywords, parts of speech, and named entities such as Persons, Locations, Organizations, Companies, Products, Diseases, Drugs, etc.; providing synonyms and stems for extracted information items, leveraging lexical knowledge bases for the English language such as WordNet; linking extracted entities to external knowledge bases such as Google Knowledge Graph and Wikidata; discovering similarity among the extracted information items, such as calculating similarity between string, number, date and time data; classifying, sorting and categorizing data into various types, forms or any other distinct class; and indexing structured and unstructured data.
 6 | 
 7 | 
 8 | Notice:
 9 | 
10 | We encourage researchers/developers to cite our papers if they have used our APIs, libraries, tools or datasets.
11 | 
12 | * Beheshti, Tabebordbar, Benatallah, Nouri: "On Automating Basic Data Curation Tasks", WWW 2017, Perth, Australia.
13 | * Beheshti, Tabebordbar, Benatallah, Nouri: "Data Curation APIs", CoRR abs/1612.03277 (2016).
14 | 
15 | 
16 | You can find the technical report and user guide at the following link:
17 | https://arxiv.org/abs/1612.03277
18 | 
19 | Curation Services Rest APIs:
20 | http://d2dcrc.cse.unsw.edu.au:9091/ExtractionAPI-0.0.1-SNAPSHOT/
21 | 
22 | 
23 | 
24 | ----License-----------------------
25 | 
26 | License: This software is licensed under the Apache 2.0 license, quoted below.
27 | 
28 | Copyright 2016 UNSW.CSE.SOC Research Group
29 | 
30 | 
31 | You may not use these APIs except in compliance with the License. You may obtain a copy of
32 | the License at
33 | 
34 | http://www.apache.org/licenses/LICENSE-2.0
35 | 
36 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
37 | 
38 | ----Contributors-----------------------
39 | 
40 | 
41 | Amin (SMR) Beheshti
42 | 
43 | Alireza Tabebordbar
44 | 
45 | Boualem Benatallah
46 | 
47 | Seyed Mohammad Reza Nouri
48 | 
49 | Service Oriented Computing (SOC) Research Group, School of Computer Science and Engineering, The University of New South Wales, Sydney, Australia. This work is part of the Data Curation Foundry project stream, D2D CRC.
50 | 
51 | 
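As a concrete taste of the similarity feature described in the README, here is a minimal sketch built on the cosinetext classes shown later in this dump. The bundled cosineDoc folder of plain-text files is used as input; everything else follows the constructors and method signatures of Index, VectorGenerator, DocVector and CosineSimilarity as they appear below (this is an illustrative driver, not part of the repository):

import java.util.List;
import unsw.curation.api.cosinetext.CosineSimilarity;
import unsw.curation.api.cosinetext.DocVector;
import unsw.curation.api.cosinetext.Index;
import unsw.curation.api.cosinetext.VectorGenerator;

public class DocumentSimilarityDemo {
    public static void main(String[] args) throws Exception {
        new Index("cosineDoc").index();          // build a Lucene index of the folder under ./Index
        VectorGenerator gen = new VectorGenerator();
        gen.GetAllTerms();                       // collect the global term -> position map
        DocVector[] vectors = gen.GetDocumentVectors();
        List<VectorGenerator> names = gen.getLstData();  // file names, filled while vectors are built
        for (int i = 0; i < vectors.length; i++)
            for (int j = i + 1; j < vectors.length; j++)
                System.out.println(names.get(i).DocName + " ~ " + names.get(j).DocName + " = "
                        + CosineSimilarity.CosineSimilarity(vectors[i], vectors[j]));
    }
}

Note that the index classes build their paths with Windows-style separators ("\\Index\\"), so this sketch assumes a Windows working directory.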
--------------------------------------------------------------------------------
/Result.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Result.xml
--------------------------------------------------------------------------------
/Sentence_Index/_0.cfe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Sentence_Index/_0.cfe
--------------------------------------------------------------------------------
/Sentence_Index/_0.cfs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Sentence_Index/_0.cfs
--------------------------------------------------------------------------------
/Sentence_Index/_0.si:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Sentence_Index/_0.si
--------------------------------------------------------------------------------
/Sentence_Index/segments.gen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Sentence_Index/segments.gen
--------------------------------------------------------------------------------
/Sentence_Index/segments_1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Sentence_Index/segments_1
--------------------------------------------------------------------------------
/Stem.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/Stem.txt
--------------------------------------------------------------------------------
/TextClassification/Knn1248308.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn1248308.dat
--------------------------------------------------------------------------------
/TextClassification/Knn1793382.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn1793382.dat
--------------------------------------------------------------------------------
/TextClassification/Knn2053135.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn2053135.dat
--------------------------------------------------------------------------------
/TextClassification/Knn4725799.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn4725799.dat
--------------------------------------------------------------------------------
/TextClassification/Knn5442488.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn5442488.dat
--------------------------------------------------------------------------------
/TextClassification/Knn8341982.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn8341982.dat
--------------------------------------------------------------------------------
/TextClassification/Knn8586130.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn8586130.dat
--------------------------------------------------------------------------------
/TextClassification/Knn8814335.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/Knn8814335.dat
--------------------------------------------------------------------------------
/TextClassification/NaiveBayes1567923.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/NaiveBayes1567923.dat
--------------------------------------------------------------------------------
/TextClassification/NaiveBayes1902130.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/NaiveBayes1902130.dat
--------------------------------------------------------------------------------
/TextClassification/NaiveBayes3099268.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/NaiveBayes3099268.dat
--------------------------------------------------------------------------------
/TextClassification/NaiveBayes5432235.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/TextClassification/NaiveBayes5432235.dat
--------------------------------------------------------------------------------
/cosine.txt:
--------------------------------------------------------------------------------
1 | 1,2,5,6
2 | 4,5,9,6
3 | 7,5,8,6
4 | 3,2,5,9
5 | 11,15,19,17
6 | 10,1,1,5
--------------------------------------------------------------------------------
/entity.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/entity.txt
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0">
  2 |   <modelVersion>4.0.0</modelVersion>
  3 |   <groupId>TextAPI</groupId>
  4 |   <artifactId>TextAPI</artifactId>
  5 |   <version>0.0.1-SNAPSHOT</version>
  6 |   <dependencies>
  7 |     <dependency>
  8 |       <groupId>edu.stanford.nlp</groupId>
  9 |       <artifactId>stanford-corenlp</artifactId>
 10 |       <version>3.5.2</version>
 11 |     </dependency>
 12 | 
 13 |     <dependency>
 14 |       <groupId>edu.stanford.nlp</groupId>
 15 |       <artifactId>stanford-corenlp</artifactId>
 16 |       <version>3.5.2</version>
 17 |       <classifier>models</classifier>
 18 |     </dependency>
 19 | 
 20 |     <dependency>
 21 |       <groupId>edu.mit</groupId>
 22 |       <artifactId>jwi</artifactId>
 23 |       <version>2.2.3</version>
 24 |     </dependency>
 25 | 
 26 |     <dependency>
 27 |       <groupId>org.apache.opennlp</groupId>
 28 |       <artifactId>opennlp-tools</artifactId>
 29 |       <version>1.6.0</version>
 30 |     </dependency>
 31 | 
 32 |     <dependency>
 33 |       <groupId>commons-codec</groupId>
 34 |       <artifactId>commons-codec</artifactId>
 35 |       <version>1.9</version>
 36 |     </dependency>
 37 | 
 38 |     <dependency>
 39 |       <groupId>nz.ac.waikato.cms.weka</groupId>
 40 |       <artifactId>weka-dev</artifactId>
 41 |       <version>3.7.10</version>
 42 |     </dependency>
 43 | 
 44 |     <dependency>
 45 |       <groupId>info.debatty</groupId>
 46 |       <artifactId>java-string-similarity</artifactId>
 47 |       <version>0.13</version>
 48 |     </dependency>
 49 | …
 54 | 
 55 |     <dependency>
 56 |       <groupId>org.jsoup</groupId>
 57 |       <artifactId>jsoup</artifactId>
 58 |       <version>1.7.2</version>
 59 |     </dependency>
 60 | 
 61 |     <dependency>
 62 |       <groupId>org.apache.commons</groupId>
 63 |       <artifactId>commons-math3</artifactId>
 64 |       <version>3.2</version>
 65 |     </dependency>
 66 | …
 77 | 
 78 |     <dependency>
 79 |       <groupId>org.apache.commons</groupId>
 80 |       <artifactId>commons-lang3</artifactId>
 81 |       <version>3.4</version>
 82 |     </dependency>
 83 | 
 84 |     <dependency>
 85 |       <groupId>org.twitter4j</groupId>
 86 |       <artifactId>twitter4j-stream</artifactId>
 87 |       <version>4.0.4</version>
 88 |     </dependency>
 89 | 
 90 |     <dependency>
 91 |       <groupId>org.twitter4j</groupId>
 92 |       <artifactId>twitter4j-core</artifactId>
 93 |       <version>4.0.1</version>
 94 |     </dependency>
 95 | 
 96 |     <dependency>
 97 |       <groupId>org.json</groupId>
 98 |       <artifactId>json</artifactId>
 99 |       <version>20160810</version>
100 |     </dependency>
101 |     <dependency>
102 |       <groupId>org.apache.lucene</groupId>
103 |       <artifactId>lucene-core</artifactId>
104 |       <version>4.6.0</version>
105 |     </dependency>
106 | 
107 |     <dependency>
108 |       <groupId>org.apache.lucene</groupId>
109 |       <artifactId>lucene-analyzers-common</artifactId>
110 |       <version>4.6.0</version>
111 |     </dependency>
112 | 
113 |     <dependency>
114 |       <groupId>org.apache.lucene</groupId>
115 |       <artifactId>lucene-queryparser</artifactId>
116 |       <version>4.6.0</version>
117 |     </dependency>
118 |   </dependencies>
119 | </project>
120 | 
--------------------------------------------------------------------------------
/pos.txt:
--------------------------------------------------------------------------------
1 | Taylor Alison Swift (born December 13, 1989) is an American singer-songwriter. Throughout her career, she has become one of the most popular female contemporary singers. She is known for narrative songs about her personal life, which has received much media attention.
2 | Raised in Wyomissing, Pennsylvania, Swift moved to Nashville, Tennessee, at age 14 to pursue a career in country music. She signed with the independent label Big Machine Records and became the youngest artist ever signed by the Sony/ATV Music publishing house. The release of her self-titled debut album in 2006 marked the start of her career as a country music singer. The album's third single, "Our Song", made her the youngest person to single-handedly write and perform a number-one song on the Hot Country Songs chart. Swift's second album, Fearless, was released in 2008. Buoyed by the pop crossover success of the singles "Love Story" and "You Belong with Me", Fearless became the best-selling album of 2009 in the United States. The album won four Grammy Awards, with Swift becoming the youngest Album of the Year winner.
3 | 
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/classify/TextClassifier.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.classify;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.BufferedWriter;
 5 | import java.io.File;
 6 | import java.io.FileInputStream;
 7 | import java.io.FileNotFoundException;
 8 | import java.io.FileReader;
 9 | import java.io.FileWriter;
10 | import java.io.IOException;
11 | import java.io.ObjectInputStream;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 | import weka.classifiers.meta.FilteredClassifier;
15 | import weka.core.Instances;
16 | import weka.core.converters.ArffLoader;
17 | 
18 | public class TextClassifier {
19 | 
20 |     private Instances test;
21 |     private FilteredClassifier classifier;
22 |     List<String> lstLoadData=new ArrayList<>();
23 |     public List<String> lstClassLabel=new ArrayList<>();
24 | 
25 |     public void LoadTestData(File ArffFileName) throws FileNotFoundException, IOException
26 |     {
27 |         BufferedReader bTestReader=new BufferedReader(
28 |                 new FileReader(ArffFileName));
29 |         ArffLoader.ArffReader myarff=new ArffLoader.ArffReader(bTestReader);
30 |         test=myarff.getData();
31 |         bTestReader.close();
32 |     }
33 |     public void loadModel(String ModelName) throws FileNotFoundException, IOException, ClassNotFoundException {
34 | 
35 |         ObjectInputStream in = new ObjectInputStream(new FileInputStream(ModelName));
36 |         Object tmp = in.readObject();
37 |         classifier = (FilteredClassifier) tmp;
38 |         in.close();
39 |         System.out.println("Model loaded: " + ModelName);
40 | 
41 |     }
42 | 
43 |     public void Predict(String OutPutFileName) throws Exception
44 |     {
45 |         List<String> lstLabels=new ArrayList<>();
46 |         test.setClassIndex(test.numAttributes()-1);
47 |         int numOfAttribute=test.numAttributes();
48 |         String createData;
49 |         for (int i = 0; i < test.numInstances(); i++) {
50 |             String FinalVal="";
51 |             double predict = classifier.classifyInstance(test.instance(i));
52 |             for(int k=0;k<…
…
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinesentence/AllTermsSentence.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinesentence;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.util.HashMap;
 6 | import java.util.Map;
 7 | import java.util.Map.Entry;
 8 | import org.apache.lucene.index.DirectoryReader;
 9 | import org.apache.lucene.index.IndexReader;
10 | import org.apache.lucene.index.Terms;
11 | import org.apache.lucene.index.TermsEnum;
12 | import org.apache.lucene.store.FSDirectory;
13 | import org.apache.lucene.util.BytesRef;
14 | 
15 | 
16 | public class AllTermsSentence
17 | {
18 |     private Map<String, Integer> allTerms;
19 |     Integer totalNoOfDocumentInIndex;
20 |     IndexReader indexReader;
21 | 
22 |     public AllTermsSentence() throws IOException
23 |     {
24 |         allTerms = new HashMap<>();
25 |         String current = System.getProperty("user.dir");
26 |         indexReader = DirectoryReader
27 |                 .open(FSDirectory.open(new File(current+"\\IndexSentence\\")));
28 |         totalNoOfDocumentInIndex = indexReader.maxDoc();
29 |     }
30 | 
31 |     public void initAllTerms() throws IOException
32 |     {
33 |         int pos = 0;
34 |         for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++) {
35 |             Terms vector = indexReader.getTermVector(docId, "contents");
36 |             TermsEnum termsEnum = null;
37 |             termsEnum = vector.iterator(termsEnum);
38 |             BytesRef text = null;
39 |             while ((text = termsEnum.next()) != null) {
40 |                 String term = text.utf8ToString();
41 |                 allTerms.put(term, pos++);
42 |             }
43 |         }
44 |         pos = 0;
45 |         for(Entry<String, Integer> s : allTerms.entrySet())
46 |         {
47 |             s.setValue(pos++);
48 |         }
49 |     }
50 |     public Map<String, Integer> getAllTerms() {
51 |         return allTerms;
52 |     }
53 | }
54 | 
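A minimal driver for TextClassifier might look like the following sketch. The ARFF test file and the output path are assumptions; the model file is one of the serialized classifiers actually shipped under TextClassification/ in this repository:

import java.io.File;
import unsw.curation.api.classify.TextClassifier;

public class PredictDemo {
    public static void main(String[] args) throws Exception {
        TextClassifier clf = new TextClassifier();
        // hypothetical ARFF file whose attributes match the training data
        clf.LoadTestData(new File("test.arff"));
        // one of the serialized FilteredClassifier models bundled with the repository
        clf.loadModel("TextClassification/NaiveBayes1567923.dat");
        // hypothetical output file that receives the predicted label per instance
        clf.Predict("predictions.txt");
    }
}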
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinesentence/CosineSimilaritySentence.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinesentence;
 2 | 
 3 | 
 4 | public class CosineSimilaritySentence {
 5 |     public static double CosineSimilarity(DocVectorSentence d1,DocVectorSentence d2) {
 6 |         double cosinesimilarity;
 7 |         try {
 8 |             cosinesimilarity = (d1.vector.dotProduct(d2.vector))
 9 |                     / (d1.vector.getNorm() * d2.vector.getNorm());
10 |         } catch (Exception e) {
11 |             return 0.0;
12 |         }
13 |         return cosinesimilarity;
14 |     }
15 | }
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinesentence/DocVectorSentence.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinesentence;
 2 | 
 3 | import java.util.Map;
 4 | import org.apache.commons.math3.linear.ArrayRealVector;
 5 | import org.apache.commons.math3.linear.RealVector;
 6 | import org.apache.commons.math3.linear.RealVectorFormat;
 7 | 
 8 | public class DocVectorSentence
 9 | {
10 | 
11 |     public Map<String, Integer> terms;
12 |     public RealVector vector;
13 |     public DocVectorSentence(Map<String, Integer> terms) {
14 |         this.terms = terms;
15 |         this.vector = new ArrayRealVector(terms.size());
16 |     }
17 | 
18 |     public void setEntry(String term, int freq) {
19 |         if (terms.containsKey(term)) {
20 |             int pos = terms.get(term);
21 |             vector.setEntry(pos, (double) freq);
22 |         }
23 |     }
24 | 
25 |     public void normalize() {
26 |         double sum = vector.getL1Norm();
27 |         vector = (RealVector) vector.mapDivide(sum);
28 |     }
29 | 
30 |     @Override
31 |     public String toString() {
32 |         RealVectorFormat formatter = new RealVectorFormat();
33 |         return formatter.format(vector);
34 |     }
35 | }
36 | 
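CosineSimilaritySentence and DocVectorSentence can be exercised without any Lucene index. The following sketch builds two vectors over a hand-made term-position map (all terms and frequencies here are illustrative, standing in for what AllTermsSentence would normally produce) and prints their cosine score:

import java.util.HashMap;
import java.util.Map;
import unsw.curation.api.cosinesentence.CosineSimilaritySentence;
import unsw.curation.api.cosinesentence.DocVectorSentence;

public class CosineSketch {
    public static void main(String[] args) {
        // shared term -> vector-position map; both vectors must use the same map
        Map<String, Integer> positions = new HashMap<>();
        positions.put("data", 0);
        positions.put("curation", 1);
        positions.put("api", 2);

        DocVectorSentence a = new DocVectorSentence(positions);
        a.setEntry("data", 2);      // term frequencies of the first sentence
        a.setEntry("curation", 1);

        DocVectorSentence b = new DocVectorSentence(positions);
        b.setEntry("data", 1);      // term frequencies of the second sentence
        b.setEntry("api", 3);

        // dot(a,b) / (|a| * |b|); normalize() is optional since cosine is scale-invariant
        System.out.println(CosineSimilaritySentence.CosineSimilarity(a, b));
    }
}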
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinesentence/IndexSentence.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinesentence;
 2 | 
 3 | import java.io.*;
 4 | import java.util.ArrayList;
 5 | import java.util.List;
 6 | import org.apache.lucene.analysis.Analyzer;
 7 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
 8 | import org.apache.lucene.document.Document;
 9 | import org.apache.lucene.document.Field;
10 | import org.apache.lucene.document.FieldType;
11 | import org.apache.lucene.index.CorruptIndexException;
12 | import org.apache.lucene.index.FieldInfo;
13 | import org.apache.lucene.index.IndexWriter;
14 | import org.apache.lucene.index.IndexWriterConfig;
15 | import org.apache.lucene.store.Directory;
16 | import org.apache.lucene.store.FSDirectory;
17 | import org.apache.lucene.store.LockObtainFailedException;
18 | import org.apache.lucene.util.Version;
19 | 
20 | 
21 | public class IndexSentence {
22 | 
23 |     private final File sourceFileName;
24 |     private final File indexDirectory;
25 |     private static String fieldName;
26 |     private final String QueryText;
27 | 
28 |     public IndexSentence(String fileName, String Query)
29 |     {
30 |         QueryText=Query;
31 |         String current = System.getProperty("user.dir");
32 |         this.sourceFileName = new File(fileName);
33 |         this.indexDirectory = new File(current+"\\IndexSentence\\");
34 |         fieldName="contents";
35 |     }
36 |     public void index() throws CorruptIndexException,
37 |             LockObtainFailedException, IOException {
38 |         Directory dir = FSDirectory.open(indexDirectory);
39 |         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_41,StandardAnalyzer.STOP_WORDS_SET);
40 |         IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_41, analyzer);
41 |         if (indexDirectory.exists()) {
42 |             iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
43 |         } else {
44 |             iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
45 |         }
46 |         IndexWriter writer = new IndexWriter(dir, iwc);
47 |         List<String> lstText=ExtractText(sourceFileName);
48 |         for (String f : lstText)
49 |         {
50 |             System.out.println("Indexing Sentences... ");//+f.getName());
51 |             Document doc = new Document();
52 |             FieldType fieldType = new FieldType();
53 |             fieldType.setIndexed(true);
54 |             fieldType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
55 |             fieldType.setStored(true);
56 |             fieldType.setStoreTermVectors(true);
57 |             fieldType.setTokenized(true);
58 |             Field contentField = new Field(fieldName, f, fieldType);
59 |             doc.add(contentField);
60 |             writer.addDocument(doc);
61 |         }
62 |         Document doc = new Document();
63 |         FieldType fieldType = new FieldType();
64 |         fieldType.setIndexed(true);
65 |         fieldType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
66 |         fieldType.setStored(true);
67 |         fieldType.setStoreTermVectors(true);
68 |         fieldType.setTokenized(true);
69 |         Field contentField = new Field(fieldName, QueryText, fieldType);
70 |         doc.add(contentField);
71 |         writer.addDocument(doc);
72 |         System.out.println("Indexing Finished... ");
73 |         writer.close();
74 |     }
75 |     public List<String> ExtractText(File f) throws FileNotFoundException, IOException
76 |     {
77 |         List<String> lstValues=new ArrayList<>();
78 |         BufferedReader reader=new BufferedReader(new FileReader(f));
79 |         String Line="";
80 |         while((Line=reader.readLine())!=null)
81 |         {
82 |             lstValues.add(Line);
83 |         }
84 |         return lstValues;
85 |     }
86 | 
87 | 
88 | }
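Putting IndexSentence together with VectorGeneratorSentence (shown next) and the cosine classes above gives a small query-against-sentences search. Note that index() appends the query as the last indexed document, so ranking the sentences amounts to comparing every vector against the last one. The input file and query string in this sketch are assumptions:

import java.util.List;
import unsw.curation.api.cosinesentence.CosineSimilaritySentence;
import unsw.curation.api.cosinesentence.DocVectorSentence;
import unsw.curation.api.cosinesentence.IndexSentence;
import unsw.curation.api.cosinesentence.VectorGeneratorSentence;

public class SentenceSimilarityDemo {
    public static void main(String[] args) throws Exception {
        String query = "country music singer";           // assumed query text
        new IndexSentence("pos.txt", query).index();     // index each line of pos.txt plus the query
        VectorGeneratorSentence gen = new VectorGeneratorSentence();
        gen.GetAllTerms();
        DocVectorSentence[] vectors = gen.GetDocumentVectors();
        List<VectorGeneratorSentence> sentences = gen.getLstData();
        int q = vectors.length - 1;                      // the query is the last document
        for (int i = 0; i < q; i++)
            System.out.println(
                    CosineSimilaritySentence.CosineSimilarity(vectors[i], vectors[q])
                    + "  " + sentences.get(i).DocName);
    }
}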
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinesentence/VectorGeneratorSentence.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinesentence;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.util.ArrayList;
 6 | import java.util.HashMap;
 7 | import java.util.List;
 8 | import java.util.Map;
 9 | import org.apache.lucene.document.Document;
10 | import org.apache.lucene.index.DirectoryReader;
11 | import org.apache.lucene.index.IndexReader;
12 | import org.apache.lucene.index.Terms;
13 | import org.apache.lucene.index.TermsEnum;
14 | import org.apache.lucene.store.FSDirectory;
15 | import org.apache.lucene.util.BytesRef;
16 | 
17 | 
18 | public class VectorGeneratorSentence
19 | {
20 | 
21 |     public int DocId;
22 |     public String DocName;
23 |     public VectorGeneratorSentence(int DocId,String DocName)
24 |     {
25 |         this.DocId=DocId;
26 |         this.DocName=DocName;
27 |     }
28 |     DocVectorSentence[] docVector;
29 |     private Map<String, Integer> allterms;
30 |     Integer totalNoOfDocumentInIndex;
31 |     IndexReader indexReader;
32 | 
33 |     private List<VectorGeneratorSentence> lstData=new ArrayList<>();
34 |     public void setLstData(VectorGeneratorSentence VG)
35 |     {
36 |         lstData.add(new VectorGeneratorSentence(VG.DocId, VG.DocName));
37 |     }
38 |     public List<VectorGeneratorSentence> getLstData()
39 |     {
40 |         return lstData;
41 |     }
42 | 
43 |     public VectorGeneratorSentence() throws IOException
44 |     {
45 |         String current = System.getProperty("user.dir");
46 |         allterms = new HashMap<>();
47 |         indexReader=DirectoryReader.open(FSDirectory.open(new File(current+"\\IndexSentence\\")));
48 |         totalNoOfDocumentInIndex=indexReader.maxDoc();
49 |         docVector = new DocVectorSentence[totalNoOfDocumentInIndex];
50 |     }
51 | 
52 |     public void GetAllTerms() throws IOException
53 |     {
54 |         AllTermsSentence allTerms = new AllTermsSentence();
55 |         allTerms.initAllTerms();
56 |         allterms = allTerms.getAllTerms();
57 |     }
58 |     public void ExtractVectorsName(int i) throws IOException
59 |     {
60 | 
61 |         Document doc=indexReader.document(i);
62 |         String docName=doc.get("contents");
63 |         setLstData(new VectorGeneratorSentence(i, docName));
64 | 
65 |     }
66 | 
67 |     public DocVectorSentence[] GetDocumentVectors() throws IOException
68 |     {
69 |         for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++)
70 |         {
71 |             Terms vector = indexReader.getTermVector(docId, "contents");
72 |             // Document doc=indexReader.document(docId);
73 |             // String FileName=doc.get("FileName");
74 |             // System.out.println(FileName+" "+docId);
75 |             ExtractVectorsName(docId);
76 |             TermsEnum termsEnum = null;
77 |             termsEnum = vector.iterator(termsEnum);
78 |             BytesRef text = null;
79 |             docVector[docId] = new DocVectorSentence(allterms);
80 |             while ((text = termsEnum.next()) != null) {
81 |                 String term = text.utf8ToString();
82 |                 int freq = (int) termsEnum.totalTermFreq();
83 |                 docVector[docId].setEntry(term, freq);
84 |             }
85 |             docVector[docId].normalize();
86 |         }
87 |         indexReader.close();
88 |         return docVector;
89 |     }
90 | }
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinetext/AllTerms.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinetext;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.util.HashMap;
 6 | import java.util.Map;
 7 | import java.util.Map.Entry;
 8 | import org.apache.lucene.index.DirectoryReader;
 9 | import org.apache.lucene.index.IndexReader;
10 | import org.apache.lucene.index.Terms;
11 | import org.apache.lucene.index.TermsEnum;
12 | import org.apache.lucene.store.FSDirectory;
13 | import org.apache.lucene.util.BytesRef;
14 | 
15 | 
16 | public class AllTerms {
17 |     private Map<String, Integer> allTerms;
18 |     Integer totalNoOfDocumentInIndex;
19 |     IndexReader indexReader;
20 | 
21 |     public AllTerms() throws IOException
22 |     {
23 |         allTerms = new HashMap<>();
24 |         String current = System.getProperty("user.dir");
25 |         indexReader = DirectoryReader
26 |                 .open(FSDirectory.open(new File(current+"\\Index\\")));
27 |         totalNoOfDocumentInIndex = indexReader.maxDoc();
28 |     }
29 | 
30 |     public void initAllTerms() throws IOException
31 |     {
32 |         int pos = 0;
33 |         for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++) {
34 |             Terms vector = indexReader.getTermVector(docId, "contents");
35 |             TermsEnum termsEnum = null;
36 |             termsEnum = vector.iterator(termsEnum);
37 |             BytesRef text = null;
38 |             while ((text = termsEnum.next()) != null) {
39 |                 String term = text.utf8ToString();
40 |                 allTerms.put(term, pos++);
41 |             }
42 |         }
43 |         pos = 0;
44 |         for(Entry<String, Integer> s : allTerms.entrySet())
45 |         {
46 |             s.setValue(pos++);
47 |         }
48 |     }
49 |     public Map<String, Integer> getAllTerms() {
50 |         return allTerms;
51 |     }
52 | }
53 | 
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinetext/CosineSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinetext;
 2 | 
 3 | public class CosineSimilarity {
 4 |     public static double CosineSimilarity(DocVector d1,DocVector d2) {
 5 |         double cosinesimilarity;
 6 |         try {
 7 |             cosinesimilarity = (d1.vector.dotProduct(d2.vector))
 8 |                     / (d1.vector.getNorm() * d2.vector.getNorm());
 9 |         } catch (Exception e) {
10 |             return 0.0;
11 |         }
12 |         return cosinesimilarity;
13 |     }
14 | }
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinetext/DocVector.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinetext;
 2 | 
 3 | import java.util.Map;
 4 | import org.apache.commons.math3.linear.ArrayRealVector;
 5 | import org.apache.commons.math3.linear.RealVector;
 6 | import org.apache.commons.math3.linear.RealVectorFormat;
 7 | 
 8 | public class DocVector {
 9 | 
10 |     public Map<String, Integer> terms;
11 |     public RealVector vector;
12 | 
13 |     public DocVector(Map<String, Integer> terms) {
14 |         this.terms = terms;
15 |         this.vector = new ArrayRealVector(terms.size());
16 |     }
17 | 
18 |     public void setEntry(String term, int freq) {
19 |         if (terms.containsKey(term)) {
20 |             int pos = terms.get(term);
21 |             vector.setEntry(pos, (double) freq);
22 |         }
23 |     }
24 | 
25 |     public void normalize() {
26 |         double sum = vector.getL1Norm();
27 |         vector = (RealVector) vector.mapDivide(sum);
28 |     }
29 | 
30 |     @Override
31 |     public String toString() {
32 |         RealVectorFormat formatter = new RealVectorFormat();
33 |         return formatter.format(vector);
34 |     }
35 | }
36 | 
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinetext/Index.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinetext;
 2 | 
 3 | import java.io.*;
 4 | import java.nio.file.Files;
 5 | import java.nio.file.Paths;
 6 | import org.apache.lucene.analysis.Analyzer;
 7 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
 8 | import org.apache.lucene.document.Document;
 9 | import org.apache.lucene.document.Field;
10 | import org.apache.lucene.document.FieldType;
11 | import org.apache.lucene.document.TextField;
12 | import org.apache.lucene.index.CorruptIndexException;
13 | import org.apache.lucene.index.FieldInfo;
14 | import org.apache.lucene.index.IndexWriter;
15 | import org.apache.lucene.index.IndexWriterConfig;
16 | import org.apache.lucene.store.Directory;
17 | import org.apache.lucene.store.FSDirectory;
18 | import org.apache.lucene.store.LockObtainFailedException;
19 | import org.apache.lucene.util.Version;
20 | 
21 | 
22 | public class Index {
23 | 
24 |     private final File sourceDirectory;
25 |     private final File indexDirectory;
26 |     private static String fieldName;
27 | 
28 |     public Index(String DataDir)
29 |     {
30 |         String current = System.getProperty("user.dir");
31 |         this.sourceDirectory = new File(DataDir);
32 |         this.indexDirectory = new File(current+"\\Index\\");
33 |         fieldName="contents";
34 |     }
35 |     public void index() throws CorruptIndexException,
36 |             LockObtainFailedException, IOException {
37 |         Directory dir = FSDirectory.open(indexDirectory);
38 |         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_41,StandardAnalyzer.STOP_WORDS_SET);
39 |         IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_41, analyzer);
40 |         if (indexDirectory.exists()) {
41 |             iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
42 |         } else {
43 |             iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
44 |         }
45 |         IndexWriter writer = new IndexWriter(dir, iwc);
46 |         for (File f : sourceDirectory.listFiles()) {
47 |             System.out.println("Indexing Document "+f.getName());
48 |             Document doc = new Document();
49 |             FieldType fieldType = new FieldType();
50 |             fieldType.setIndexed(true);
51 |             fieldType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
52 |             fieldType.setStored(true);
53 |             fieldType.setStoreTermVectors(true);
54 |             fieldType.setTokenized(true);
55 |             Field contentField = new Field(fieldName, ExtractText(f), fieldType);
56 |             doc.add(contentField);
57 |             doc.add(new TextField("FileName", f.getName(), Field.Store.YES));
58 |             doc.add(new TextField("FilePath",f.getCanonicalPath(),Field.Store.YES));
59 |             writer.addDocument(doc);
60 |         }
61 |         writer.close();
62 |     }
63 |     public String ExtractText(File f) throws FileNotFoundException, IOException
64 |     {
65 |         String textFileContent = "";
66 |         for (String line : Files.readAllLines(Paths.get(f.getAbsolutePath())))
67 |         {
68 |             textFileContent += line;
69 |         }
70 |         return textFileContent;
71 |     }
72 | 
73 | 
74 | }
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/cosinetext/VectorGenerator.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.cosinetext;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.util.ArrayList;
 6 | import java.util.Arrays;
 7 | import java.util.Comparator;
 8 | import java.util.HashMap;
 9 | import java.util.List;
10 | import java.util.Map;
11 | import org.apache.lucene.document.Document;
12 | import org.apache.lucene.index.DirectoryReader;
13 | import org.apache.lucene.index.IndexReader;
14 | import org.apache.lucene.index.Terms;
15 | import org.apache.lucene.index.TermsEnum;
16 | import org.apache.lucene.store.FSDirectory;
17 | import org.apache.lucene.util.BytesRef;
18 | 
19 | 
20 | public class VectorGenerator
21 | {
22 | 
23 |     public int DocId;
24 |     public String DocName;
25 |     public VectorGenerator(int DocId,String DocName)
26 |     {
27 |         this.DocId=DocId;
28 |         this.DocName=DocName;
29 |     }
30 |     DocVector[] docVector;
31 |     private Map<String, Integer> allterms;
32 |     Integer totalNoOfDocumentInIndex;
33 |     IndexReader indexReader;
34 | 
35 |     private List<VectorGenerator> lstData=new ArrayList<>();
36 |     public void setLstData(VectorGenerator VG)
37 |     {
38 |         lstData.add(new VectorGenerator(VG.DocId, VG.DocName));
39 |     }
40 |     public List<VectorGenerator> getLstData()
41 |     {
42 |         return lstData;
43 |     }
44 | 
45 |     public VectorGenerator() throws IOException
46 |     {
47 |         String current = System.getProperty("user.dir");
48 |         allterms = new HashMap<>();
49 |         indexReader=DirectoryReader.open(FSDirectory.open(new File(current+"\\Index\\")));
50 |         totalNoOfDocumentInIndex=indexReader.maxDoc();
51 |         docVector = new DocVector[totalNoOfDocumentInIndex];
52 |     }
53 | 
54 |     public void GetAllTerms() throws IOException
55 |     {
56 |         AllTerms allTerms = new AllTerms();
57 |         allTerms.initAllTerms();
58 |         allterms = allTerms.getAllTerms();
59 |     }
60 |     public void ExtractVectorsName(int i) throws IOException
61 |     {
62 | 
63 |         Document doc=indexReader.document(i);
64 |         String docName=doc.get("FileName");
65 |         setLstData(new VectorGenerator(i, docName));
66 | 
67 |     }
68 | 
69 |     public DocVector[] GetDocumentVectors() throws IOException
70 |     {
71 |         for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++)
72 |         {
73 |             Terms vector = indexReader.getTermVector(docId, "contents");
74 |             // Document doc=indexReader.document(docId);
75 |             // String FileName=doc.get("FileName");
76 |             // System.out.println(FileName+" "+docId);
77 |             ExtractVectorsName(docId);
78 |             TermsEnum termsEnum = null;
79 |             termsEnum = vector.iterator(termsEnum);
80 |             BytesRef text = null;
81 |             docVector[docId] = new DocVector(allterms);
82 |             while ((text = termsEnum.next()) != null) {
83 |                 String term = text.utf8ToString();
84 |                 int freq = (int) termsEnum.totalTermFreq();
85 |                 docVector[docId].setEntry(term, freq);
86 |             }
87 |             docVector[docId].normalize();
88 |         }
89 |         indexReader.close();
90 |         return docVector;
91 |     }
92 | }
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/Classification.java:
--------------------------------------------------------------------------------
  1 | package unsw.curation.api.domain;
  2 | 
  3 | 
  4 | 
  5 | 
  6 | public class Classification {
  7 | 
  8 |     public Classification(){}
  9 |     public Classification(double pre,double recall,double auc,double correct,double inCorrect, double errorRate,
 10 |             double fn,double fp,double tn,double tp,double kappa,double MAbsError,
 11 |             double numInstances,double relAbsError,double fMeasure)
 12 |     {
 13 |         this.precision=pre;
 14 |         this.recall=recall;
 15 |         this.auc=auc;
 16 |         this.incorrect=inCorrect;
 17 |         this.correct=correct;
 18 |         this.errorRate=errorRate;
 19 |         this.fn=fn;
 20 |         this.fp=fp;
 21 |         this.tn=tn;
 22 |         this.tp=tp;
 23 |         this.kappa=kappa;
 24 |         this.meanAbsoluteError=MAbsError;
 25 |         this.numInstances=numInstances;
 26 |         this.relativeAbsoluteError=relAbsError;
 27 |         this.fMeasure=fMeasure;
 28 |     }
 29 |     private double precision;
 30 |     private double recall;
 31 |     private double auc;
 32 |     private double correct;
 33 |     private double incorrect;
 34 |     private double errorRate;
 35 |     private double fn;
 36 |     private double fp;
 37 |     private double tn;
 38 |     private double tp;
 39 |     private double kappa;
 40 |     private double meanAbsoluteError;
 41 |     private double numInstances;
 42 |     private double relativeAbsoluteError;
 43 |     private double fMeasure;
 44 | 
 45 |     public void setInCorrect(double incorrect)
 46 |     {
 47 |         this.incorrect=incorrect;
 48 |     }
 49 |     public double getInCorrect()
 50 |     {
 51 |         return this.incorrect;
 52 |     }
 53 | 
 54 |     public void setCorrect(double correct)
 55 |     {
 56 |         this.correct=correct;
 57 |     }
 58 |     public double getCorrect()
 59 |     {
 60 |         return this.correct;
 61 |     }
 62 |     public void setPrecision(double precision)
 63 |     {
 64 |         this.precision=precision;
 65 |     }
 66 |     public double getPrecision()
 67 |     {
 68 |         return this.precision;
 69 |     }
 70 |     public void setRecall(double recall)
 71 |     {
 72 |         this.recall=recall;
 73 |     }
 74 |     public double getRecall()
 75 |     {
 76 |         return this.recall;
 77 |     }
 78 |     public void setAuc(double auc)
 79 |     {
 80 |         this.auc=auc;
 81 |     }
 82 |     public double getAuc()
 83 |     {
 84 |         return this.auc;
 85 |     }
 86 |     public void setErrorRate(double errorRate)
 87 |     {
 88 |         this.errorRate=errorRate;
 89 |     }
 90 |     public double getErrorRate()
 91 |     {
 92 |         return this.errorRate;
 93 |     }
 94 |     public void setFn(double fn)
 95 |     {
 96 |         this.fn=fn;
 97 |     }
 98 |     public double getFn()
 99 |     {
100 |         return this.fn;
101 |     }
102 |     public void setFp(double fp)
103 |     {
104 |         this.fp=fp;
105 |     }
106 |     public double getFp()
107 |     {
108 |         return this.fp;
109 |     }
110 |     public void setTn(double tn)
111 |     {
112 |         this.tn=tn;
113 |     }
114 |     public double getTn()
115 |     {
116 |         return this.tn;
117 |     }
118 |     public void setTp(double tp)
119 |     {
120 |         this.tp=tp;
121 |     }
122 |     public double getTp()
123 |     {
124 |         return tp;
125 |     }
126 |     public void setKappa(double kappa)
127 |     {
128 |         this.kappa=kappa;
129 |     }
130 |     public double getKappa()
131 |     {
132 |         return this.kappa;
133 |     }
134 |     public void setMeanAbsoluteError(double meanAbsoluteError)
135 |     {
136 |         this.meanAbsoluteError=meanAbsoluteError;
137 |     }
138 |     public double getMeanAbsoluteError()
139 |     {
140 |         return this.meanAbsoluteError;
141 |     }
142 |     public void setNumInstances(double d)
143 |     {
144 |         this.numInstances=d;
145 |     }
146 |     public double getNumInstances()
147 |     {
148 |         return this.numInstances;
149 |     }
150 |     public void setRelativeAbsoluteError(double relativeAbsoluteError)
151 |     {
152 |         this.relativeAbsoluteError=relativeAbsoluteError;
153 |     }
154 |     public double getRelativeAbsoluteError()
155 |     {
156 |         return this.relativeAbsoluteError;
157 |     }
158 |     public void setFMeasure(double fMeasure)
159 |     {
160 |         this.fMeasure=fMeasure;
161 |     }
162 |     public double getFMeasure()
163 |     {
164 |         return this.fMeasure;
165 |     }
166 | }
167 | 
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractNamedEntity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractNamedEntity {
 4 | 
 5 |     public ExtractNamedEntity()
 6 |     {
 7 | 
 8 |     }
 9 | 
10 |     public String word;
11 |     public String ner;
12 |     public int position;
13 |     /*public ExtractNamedEntity(String word,String ner)
14 |     {
15 |         this.word=word;
16 |         this.ner=ner;
17 |     }*/
18 |     public ExtractNamedEntity(String word,String ner,int position)
19 |     {
20 |         this.word=word;
21 |         this.ner=ner;
22 |         this.position=position;
23 |     }
24 |     public int getPosition() {
25 |         return position;
26 |     }
27 |     public void setPosition(int position) {
28 |         this.position = position;
29 |     }
30 |     public void setWord(String word)
31 |     {
32 |         this.word=word;
33 |     }
34 |     public String getWord()
35 |     {
36 |         return this.word;
37 |     }
38 | 
39 |     public void setNer(String ner)
40 |     {
41 |         this.ner=ner;
42 |     }
43 |     public String getNer()
44 |     {
45 |         return this.ner;
46 |     }
47 | }
48 | 
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractNumberSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractNumberSimilarity {
 4 | 
 5 |     private String vector1;
 6 |     private String vector2;
 7 |     private double score;
 8 | 
 9 |     public ExtractNumberSimilarity(){}
10 | 
11 |     public ExtractNumberSimilarity(String vector1,String vector2,double score)
12 |     {
13 |         this.vector1=vector1;
14 |         this.vector2=vector2;
15 |         this.score=score;
16 |     }
17 | 
18 |     public void setVector1(String vector1)
19 |     {
20 |         this.vector1=vector1;
21 |     }
22 |     public String getVector1()
23 |     {
24 |         return this.vector1;
25 |     }
26 | 
27 |     public void setVector2(String vector2)
28 |     {
29 |         this.vector2=vector2;
30 |     }
31 |     public String getVector2()
32 |     {
33 |         return this.vector2;
34 |     }
35 |     public void setScore(double score)
36 |     {
37 |         this.score=score;
38 |     }
39 |     public double getScore()
40 |     {
41 |         return this.score;
42 |     }
43 | }
44 | 
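The vectors in cosine.txt above are the kind of input this number-similarity bean carries. A worked sketch follows; the cosine arithmetic is inlined here because the corresponding Impl classes under extractsimilarity are not shown in this excerpt:

import unsw.curation.api.domain.ExtractNumberSimilarity;

public class NumberCosineSketch {
    // cosine similarity of two equal-length numeric vectors: dot(a,b) / (|a| * |b|)
    static double cosine(double[] a, double[] b) {
        double dot = 0, na = 0, nb = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            na += a[i] * a[i];
            nb += b[i] * b[i];
        }
        return dot / (Math.sqrt(na) * Math.sqrt(nb));
    }

    public static void main(String[] args) {
        double[] v1 = {1, 2, 5, 6};    // first line of cosine.txt
        double[] v2 = {4, 5, 9, 6};    // second line of cosine.txt
        double score = cosine(v1, v2); // = 95 / (sqrt(66) * sqrt(158)) ≈ 0.930
        ExtractNumberSimilarity result = new ExtractNumberSimilarity("1,2,5,6", "4,5,9,6", score);
        System.out.println(result.getVector1() + " vs " + result.getVector2() + " -> " + result.getScore());
    }
}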
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractPosTag.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractPosTag {
 4 | 
 5 |     public ExtractPosTag(){}
 6 |     public ExtractPosTag(String wordPart,String tag)
 7 |     {
 8 |         this.wordPart=wordPart;
 9 |         this.tag=tag;
10 |     }
11 |     private String wordPart;
12 |     private String tag;
13 |     private int itemCount;
14 | 
15 |     public void setWordPart(String wordPart)
16 |     {
17 |         this.wordPart=wordPart;
18 |     }
19 |     public String getWordPart()
20 |     {
21 |         return this.wordPart;
22 |     }
23 |     public void setTag(String tag)
24 |     {
25 |         this.tag=tag;
26 |     }
27 |     public String getTag()
28 |     {
29 |         return this.tag;
30 |     }
31 |     public void setItemCount(int itemCount)
32 |     {
33 |         this.itemCount=itemCount;
34 |     }
35 |     public int getItemCount()
36 |     {
37 |         return this.itemCount;
38 |     }
39 | }
40 | 
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractStem.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractStem {
 4 | 
 5 |     private String word1;
 6 |     public void setWord1(String word)
 7 |     {
 8 |         this.word1=word;
 9 |     }
10 |     public String getWord1()
11 |     {
12 |         return word1;
13 |     }
14 |     private String derived1;
15 |     public void setDerived1(String derived)
16 |     {
17 |         this.derived1=derived;
18 |     }
19 |     public String getDerived1()
20 |     {
21 |         return this.derived1;
22 |     }
23 |     private String word2;
24 |     public void setWord2(String word)
25 |     {
26 |         this.word2=word;
27 |     }
28 |     public String getWord2()
29 |     {
30 |         return word2;
31 |     }
32 |     private String derived2;
33 |     public void setDerived2(String derived)
34 |     {
35 |         this.derived2=derived;
36 |     }
37 |     public String getDerived2()
38 |     {
39 |         return this.derived2;
40 |     }
41 |     public ExtractStem(String word1,String derived1,String word2,String derived2)
42 |     {
43 |         this.word1=word1;
44 |         this.word2=word2;
45 |         this.derived1=derived1;
46 |         this.derived2=derived2;
47 |     }
48 |     public ExtractStem() {
49 |         // TODO Auto-generated constructor stub
50 |     }
51 | }
52 | 
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractSynonym.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractSynonym {
 4 | 
 5 |     public ExtractSynonym(){}
 6 |     public ExtractSynonym(String word,String synset)
 7 |     {
 8 |         this.word=word;
 9 |         this.synset=synset;
10 |     }
11 |     private String word;
12 | 
13 |     private String synset;
14 | 
15 |     public String getWord() {
16 |         return word;
17 |     }
18 | 
19 |     public void setWord(String word) {
20 |         this.word = word;
21 |     }
22 | 
23 |     public String getSynset() {
24 |         return synset;
25 |     }
26 | 
27 |     public void setSynset(String synset) {
28 |         this.synset = synset;
29 |     }
30 | 
31 | }
32 | 
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractTextCosineSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | 
 4 | public class ExtractTextCosineSimilarity {
 5 | 
 6 |     public ExtractTextCosineSimilarity(){}
 7 |     public ExtractTextCosineSimilarity(String DocName, String DocCandidate,double Similarity)
 8 |     {
 9 |         this.DocName=DocName;
10 |         this.DocCandidate=DocCandidate;
11 |         this.Similarity=Similarity;
12 |     }
13 |     public String query;
14 |     public void setQuery(String query)
15 |     {
16 |         this.query=query;
17 |     }
18 |     public String getQuery()
19 |     {
20 |         return this.query;
21 |     }
22 |     public String DocName;
23 |     public void setDocName(String DocName)
24 |     {
25 |         this.DocName=DocName;
26 |     }
27 |     public String getDocName()
28 |     {
29 |         return DocName;
30 |     }
31 |     public String DocCandidate;
32 |     public void setDocCandidate(String DocCandidate)
33 |     {
34 |         this.DocCandidate=DocCandidate;
35 |     }
36 |     public String getDocCandidate()
37 |     {
38 |         return this.DocCandidate;
39 |     }
40 |     public double Similarity;
41 |     public void setSimilarity(double CosineSimilarity)
42 |     {
43 |         this.Similarity=CosineSimilarity;
44 |     }
45 |     public double getSimilarity()
46 |     {
47 |         return this.Similarity;
48 |     }
49 | 
50 | }
51 | 
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractTextSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | public class ExtractTextSimilarity {
 4 | 
 5 |     private String word;
 6 |     public void setWord(String word)
 7 |     {
 8 |         this.word=word;
 9 |     }
10 |     public String getWord()
11 |     {
12 |         return this.word;
13 |     }
14 |     private String candidate;
15 |     public void setCandidate(String candidate)
16 |     {
17 |         this.candidate=candidate;
18 |     }
19 |     public String getCandidate()
20 |     {
21 |         return this.candidate;
22 |     }
23 |     private double similarity;
24 |     public void setSimilarity(double similarity)
25 |     {
26 |         this.similarity=similarity;
27 |     }
28 |     public double getSimilarity()
29 |     {
30 |         return this.similarity;
31 |     }
32 |     public ExtractTextSimilarity(){}
33 |     public ExtractTextSimilarity (String Word, String Candidate, double Similarity)
34 |     {
35 |         this.word=Word;
36 |         this.candidate=Candidate;
37 |         this.similarity=Similarity;
38 |     }
39 | }
40 | 
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/domain/ExtractTextTfidfSimilarity.java:
--------------------------------------------------------------------------------
 1 | package unsw.curation.api.domain;
 2 | 
 3 | 
 4 | public class ExtractTextTfidfSimilarity {
 5 | 
 6 |     private String query;
 7 | 
 8 |     private String sentence;
 9 |     private String similarSentence;
10 |     private String score;
11 |     public ExtractTextTfidfSimilarity(){}
12 | 
13 |     public ExtractTextTfidfSimilarity(String searchText, String similarSentence, String score) {
14 |         this.sentence=searchText;
15 |         this.similarSentence=similarSentence;
16 |         this.score=score;
17 |     }
18 |     public void setQuery(String query)
19 |     {
20 |         this.query=query;
21 |     }
22 |     public String getQuery()
23 |     {
24 |         return this.query;
25 |     }
26 |     public void setSentence(String sentence)
27 |     {
28 |         this.sentence=sentence;
29 |     }
30 |     public String getSentence()
31 |     {
32 |         return this.sentence;
33 |     }
34 | 
35 |     public void setSimilaritySentence(String similaritySentence)
36 |     {
37 |         this.similarSentence=similaritySentence;
38 |     }
39 | 
40 |     public String getSimilaritySentence()
41 |     {
42 |         return this.similarSentence;
43 |     }
44 | 
45 |     public void setScore(String score)
46 |     {
47 |         this.score=score;
48 |     }
49 |     public String getScore()
50 |     {
51 |         return this.score;
52 |     }
53 | }
54 | 
inputSentence; 19 | public String inputTweet; 20 | 21 | 22 | public void setInputSentence(String inputSentence) 23 | { 24 | this.inputSentence=inputSentence; 25 | } 26 | 27 | public String getInputSentence() 28 | { 29 | return inputSentence; 30 | } 31 | public void setInputTweet(String inputTweet) 32 | { 33 | this.inputTweet=inputTweet; 34 | } 35 | public String getInputTweet() 36 | { 37 | return inputTweet; 38 | } 39 | public void setTweet(String tweet) 40 | { 41 | this.tweet=tweet; 42 | } 43 | public String getTweet() 44 | { 45 | return tweet; 46 | } 47 | 48 | public void setKeyword(String keyword) 49 | { 50 | this.keyword=keyword; 51 | } 52 | public String getKeyword() 53 | { 54 | return keyword; 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextDecisionTree.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | import unsw.curation.api.domain.Classification; 9 | 10 | public interface IClassificationTextDecisionTree { 11 | 12 | void LoadDataset(File arffFileName) throws IOException; 13 | List EvaluateDecisionTree() throws Exception; 14 | void LearnDecisionTree() throws Exception; 15 | void SaveModel(String modelName) throws FileNotFoundException, IOException; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextKNN.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | import unsw.curation.api.domain.Classification; 9 | 10 | public interface IClassificationTextKNN { 11 | 12 | void LoadDataset(File arffFileName) throws IOException; 13 | List EvaluateKNN() throws Exception; 14 | void LearnKNN() throws Exception; 15 | void SaveModel(String modelName) throws FileNotFoundException, IOException; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextLogisticRegression.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | import unsw.curation.api.domain.Classification; 9 | 10 | public interface IClassificationTextLogisticRegression { 11 | 12 | void LoadDataset(File arffFileName) throws IOException; 13 | List EvaluateLogisticRegression() throws Exception; 14 | void LearnLogisticRegression() throws Exception; 15 | void SaveModel(String modelName) throws FileNotFoundException, IOException; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextNaiveBays.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import 
java.util.List; 7 | 8 | import unsw.curation.api.domain.Classification; 9 | 10 | public interface IClassificationTextNaiveBays { 11 | 12 | void LoadDataset(File arffFileName) throws IOException; 13 | List EvaluateNaiveBays() throws Exception; 14 | void LearnNaiveBays() throws Exception; 15 | void SaveModel(String modelName) throws FileNotFoundException, IOException; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextNeuralNetwork.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | import unsw.curation.api.domain.Classification; 9 | 10 | public interface IClassificationTextNeuralNetwork { 11 | 12 | void LoadDataset(File arffFileName) throws IOException; 13 | List EvaluateNeuralNetwork() throws Exception; 14 | void LearnNeuralNetwork() throws Exception; 15 | void SaveModel(String modelName) throws FileNotFoundException, IOException; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextRandomForest.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | import unsw.curation.api.domain.Classification; 9 | 10 | public interface IClassificationTextRandomForest 11 | { 12 | void LoadDataset(File arffFileName) throws IOException; 13 | List EvaluateRandomForest() throws Exception; 14 | void LearnRandomForest() throws Exception; 15 | void SaveModel(String modelName) throws FileNotFoundException, IOException; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/IClassificationTextSVM.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.util.List; 7 | 8 | import unsw.curation.api.domain.Classification; 9 | 10 | public interface IClassificationTextSVM { 11 | 12 | void LoadDataset(File arffFileName) throws IOException; 13 | List EvaluateSVM() throws Exception; 14 | void LearnSVM() throws Exception; 15 | void SaveModel(String modelName) throws FileNotFoundException, IOException; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/IKeywordEx.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | import java.io.File; 3 | import java.io.FileNotFoundException; 4 | import java.io.IOException; 5 | import java.util.List; 6 | 7 | import unsw.curation.api.domain.ExtractionKeyword; 8 | 9 | public interface IKeywordEx { 10 | 11 | String ExtractTweetKeyword(String inputTweet,File stopWordList) throws Exception; 12 | List ExtractTweetKeywordFromFile(File fileName, File stopWordList) throws FileNotFoundException, IOException; 13 | String ExtractSentenceKeyword(String inputSentence, File stopWordList) 
throws Exception; 14 | //String ExtractSentenceKeyPhrase(String inputSentence,File stopWordList) throws Exception; 15 | String ExtractFileKeyword(File fileName, File stopWordList) throws FileNotFoundException, IOException; 16 | /*ExtractionKeyword ExtractSentenceKeywords(String inputSentence) throws Exception; 17 | ExtractionKeyword ExtractFileKeywords(String inputFilePath) throws Exception;*/ 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/INamedEntity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.net.URISyntaxException; 7 | import java.util.List; 8 | 9 | import unsw.curation.api.domain.ExtractNamedEntity; 10 | 11 | public interface INamedEntity { 12 | 13 | ListExtractNamedEntityFile(File filePath) throws Exception; 14 | //ListExtractNamedEntity(boolean useRegexNer,List lstData) throws Exception; 15 | List ExtractNamedEntitySentence(String inputSentence) throws Exception; 16 | List ExtractOrganization(String inputSentence) throws URISyntaxException, Exception; 17 | List ExtractPerson(String inputSentence)throws URISyntaxException, Exception; 18 | List ExtractLocation(String inputSentence)throws URISyntaxException, Exception; 19 | List ExtractDate(String inputSentence)throws URISyntaxException, Exception; 20 | List ExtractMoney(String inputSentence)throws URISyntaxException, Exception; 21 | List ExtractCity(String inputSentence)throws URISyntaxException, Exception; 22 | List ExtractState(String inputSentence)throws URISyntaxException, Exception; 23 | List ExtractCountry(String inputSentence)throws URISyntaxException, FileNotFoundException, IOException, Exception; 24 | List ExtractContinent(String inputSentence)throws URISyntaxException, Exception; 25 | List ExtractCrime(String inputSentence)throws URISyntaxException, Exception; 26 | List ExtractSport(String inputSentence)throws URISyntaxException, Exception; 27 | List ExtractHoliday(String inputSentence)throws URISyntaxException, Exception; 28 | List ExtractCompany(String inputSentence)throws URISyntaxException, Exception; 29 | List ExtractNaturalDisaster(String inputSentence)throws URISyntaxException, Exception; 30 | List ExtractDrug(String inputSentence)throws URISyntaxException, Exception; 31 | List ExtractProduct(String inputSentence)throws URISyntaxException, Exception; 32 | //List ExtractRadioProgram(String inputSentence)throws URISyntaxException, Exception; 33 | //List ExtractRadioStation(String inputSentence)throws URISyntaxException, Exception; 34 | //List ExtractTvShows(String inputSentence)throws URISyntaxException; 35 | List ExtractMedia(String inputSentence)throws URISyntaxException, Exception; 36 | List ExtractOperatingSystem(String inputSentence)throws URISyntaxException, Exception; 37 | List ExtractDegree(String inputSentence)throws URISyntaxException, Exception; 38 | List ExtractSportEvents(String inputSentence)throws URISyntaxException, Exception; 39 | //List ExtractRegion(String inputSentence)throws URISyntaxException; 40 | //List ExtractGeographicFeature(String inputSentence)throws URISyntaxException; 41 | List ReadRawData(File filePath) throws Exception; 42 | 43 | 44 | 45 | } 46 | -------------------------------------------------------------------------------- 
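A short usage sketch may help here (illustrative only: ExtractEntitySentence is assumed to be one of the repository's implementations of this interface with a no-arg constructor, and the raw List return types in the listing above are taken to be List<ExtractNamedEntity>):

    // Hypothetical caller of the INamedEntity contract defined above.
    // The Extract* methods declare "throws Exception", so a real caller
    // would declare or catch it.
    INamedEntity ner = new ExtractEntitySentence(); // assumed implementation
    String text = "Alan Turing worked at Bletchley Park in 1942.";
    List<ExtractNamedEntity> persons = ner.ExtractPerson(text);     // e.g. "Alan Turing"
    List<ExtractNamedEntity> locations = ner.ExtractLocation(text); // e.g. "Bletchley Park"
    List<ExtractNamedEntity> dates = ner.ExtractDate(text);         // e.g. "1942"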
/src/main/java/unsw/curation/api/domain/abstraction/INumberCosineSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import unsw.curation.api.domain.ExtractNumberSimilarity; 7 | 8 | 9 | 10 | public interface INumberCosineSimilarity { 11 | 12 | double Cosine_Vector_Vector(double [] number1,double [] number2); 13 | List Cosine_Vector_VectorS(String filePath) throws IOException; 14 | List Cosine_Vector_VectorS(double [] vector,String filePath) throws IOException; 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/INumberDiceSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import unsw.curation.api.domain.ExtractNumberSimilarity; 7 | 8 | 9 | public interface INumberDiceSimilarity { 10 | 11 | double Dice_Vector_Vector(double [] number1,double [] number2); 12 | List Dice_Vector_VectorS(String filePath) throws IOException; 13 | List Dice_Vector_VectorS(Double [] vector,String filePath) throws IOException; 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/INumberEuclideanSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import unsw.curation.api.domain.ExtractNumberSimilarity; 7 | 8 | 9 | 10 | public interface INumberEuclideanSimilarity { 11 | 12 | double Euclidean_Vector_Vector(double [] number1,double [] number2); 13 | List Euclidean_Vector_VectorS(String filePath) throws IOException; 14 | List Euclidean_Vector_VectorS(double [] vector,String filePath) throws IOException; 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/INumberJaccardSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import unsw.curation.api.domain.ExtractNumberSimilarity; 7 | 8 | 9 | 10 | public interface INumberJaccardSimilarity { 11 | 12 | double Jaccard_Vector_Vector(double [] number1,double [] number2); 13 | List Jaccard_Vector_VectorS(String filePath) throws IOException; 14 | List Jaccard_Vector_VectorS(Double [] vector,String filePath) throws IOException; 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/IPosTag.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.util.List; 5 | 6 | import unsw.curation.api.domain.ExtractPosTag; 7 | 8 | public interface IPosTag { 9 | 10 | List ExtractNoun(String sentence); 11 | List ExtractAdjective(String sentence); 12 | List ExtractAdverb(String sentence); 13 | List ExtractVerb(String sentence); 14 | List ExtractQuotaion(String sentence); 15 | List ExtractPhrase(String sentence); 16 | List ExtractNoun(File filePath)throws 
Exception; 17 | List ExtractAdjective(File filePath)throws Exception; 18 | List ExtractAdverb(File filePath) throws Exception; 19 | List ExtractVerb(File filePath) throws Exception; 20 | 21 | List ExtractPosTagsSentence(String sentence); 22 | List ExtractPosTagsSentenceNew(String sentence); 23 | List ExtractPosTagsFile(File filePath) throws Exception; 24 | 25 | List ExtractData(File filePath) throws Exception; 26 | List ExtractPosTags(List inputData); 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/IStem.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.net.URISyntaxException; 7 | import java.util.List; 8 | 9 | import unsw.curation.api.domain.ExtractStem; 10 | 11 | public interface IStem { 12 | 13 | void ReadDataset() throws FileNotFoundException, IOException, URISyntaxException; 14 | List FindWordDerivedForms(String word) throws FileNotFoundException, IOException, URISyntaxException; 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/ISynonym.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.IOException; 4 | import java.net.URISyntaxException; 5 | import java.util.List; 6 | 7 | public interface ISynonym { 8 | 9 | List ExtractSynonymWord(String word) throws URISyntaxException, IOException; 10 | List ExtractHypernymWord(String word); 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/ITextCosineSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.net.URISyntaxException; 6 | import java.util.List; 7 | 8 | import org.apache.lucene.store.LockObtainFailedException; 9 | 10 | import unsw.curation.api.domain.ExtractTextCosineSimilarity; 11 | 12 | 13 | public interface ITextCosineSimilarity { 14 | 15 | List Cosine_Document_DocumentS(String QueryFilePath, String DataDirectoryPath) 16 | throws LockObtainFailedException, IOException, URISyntaxException; 17 | //List Cosine_Sentence_Document(String Query, String FileName) throws LockObtainFailedException, IOException; 18 | //public List ExtractListKeyword(List lstSentence) throws Exception; 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/ITextJaccardSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.List; 6 | 7 | import unsw.curation.api.domain.ExtractTextSimilarity; 8 | 9 | 10 | 11 | public interface ITextJaccardSimilarity { 12 | 13 | double Jaccard_Word_Word(String word1, String word2); 14 | List Jaccard_Word_Document(String word, String filePath) throws IOException; 15 | double Jaccard_Document_Document(String file1,String file2) throws IOException; 16 | //List Jaccard_Document_DocumentS(File filePath, String 
directoryPath) throws IOException; 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/ITextJaroSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | public interface ITextJaroSimilarity { 4 | 5 | public double ComputeJaroSimilarity(String Word1,String word2); 6 | } 7 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/ITextLevenshtainSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import unsw.curation.api.domain.ExtractTextSimilarity; 7 | 8 | 9 | 10 | public interface ITextLevenshtainSimilarity 11 | { 12 | 13 | List Leveneshtain_Word_Document(String word1, String filePath) throws IOException; 14 | int Leveneshtain_Word_Word(String word1, String word2); 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/ITextQGramSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | public interface ITextQGramSimilarity { 4 | 5 | double ComputeQGramSimilarity(String word1,String word2); 6 | } 7 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/ITextSoundexSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import org.apache.commons.codec.EncoderException; 4 | 5 | public interface ITextSoundexSimilarity { 6 | 7 | int SoundexDifference(String word1,String word2) throws EncoderException; 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/ITextTfidfSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.List; 6 | 7 | import org.apache.lucene.queryparser.classic.ParseException; 8 | 9 | import unsw.curation.api.domain.ExtractTextTfidfSimilarity; 10 | 11 | 12 | public interface ITextTfidfSimilarity 13 | { 14 | //List SearchFile(String FilePath) throws IOException, ParseException; 15 | List SearchText(String searchText) throws IOException, ParseException; 16 | void CreateIndex(String IndexFilePath) throws IOException, ParseException; 17 | void delete(File file) throws IOException; 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/domain/abstraction/IUrlExtraction.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.domain.abstraction; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.List; 6 | 7 | public interface IUrlExtraction { 8 | 9 | String ExtractTitle(String url) throws IOException; 10 | List ExtractHeadings(String url) throws IOException; 11 | List ExtractHrefText(String url) throws IOException; 12 | List ExtractParagraphes(String url) throws 
IOException; 13 | List ExtractImageALTtext(String url) throws IOException; 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/extractnamedentity/curation.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/src/main/java/unsw/curation/api/extractnamedentity/curation.jpg -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/extractnamedentity/curation.ucls: -------------------------------------------------------------------------------- (Eclipse UML class-diagram definition in .ucls XML; the markup itself did not survive in this listing, so only this placeholder is kept.) -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/extractsimilarity/ExtractNumberCosineSimilarityImpl.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.extractsimilarity; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileNotFoundException; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | import java.util.Collections; 10 | import java.util.Comparator; 11 | import java.util.List; 12 | import java.util.stream.Collectors; 13 | 14 | import unsw.curation.api.domain.ExtractNumberSimilarity; 15 | import unsw.curation.api.domain.abstraction.INumberCosineSimilarity; 16 | 17 | 18 | 19 | 20 | public class ExtractNumberCosineSimilarityImpl implements INumberCosineSimilarity 21 | { 22 | @Override 23 | public double Cosine_Vector_Vector(double [] vector1, double [] vector2) 24 | { 25 | double dotProduct=0.0; 26 | double vector1Len=0.0; 27 | double vector2Len=0.0; 28 | for(int i=0;i<vector1.length;i++) 29 | { 30 | dotProduct+=vector1[i]*vector2[i]; 31 | vector1Len+=vector1[i]*vector1[i]; 32 | vector2Len+=vector2[i]*vector2[i]; 33 | } 34 | return dotProduct/(Math.sqrt(vector1Len)*Math.sqrt(vector2Len)); 35 | } 36 | 37 | @Override 38 | public List<ExtractNumberSimilarity> Cosine_Vector_VectorS(double [] vector1, String fileName) throws IOException 39 | { 40 | List<ExtractNumberSimilarity> lstValues=new ArrayList<>(); 41 | List<String []> lstData=ReadData(fileName); 42 | for(String [] vector2:lstData) 43 | { 44 | 45 | double dotProduct=0.0; 46 | double vector1Len=0.0; 47 | double vector2Len=0.0; 48 | for(int i=0;i<vector1.length;i++) 49 | { 50 | dotProduct+=vector1[i]*Double.parseDouble(vector2[i]); 51 | vector1Len+=vector1[i]*vector1[i]; 52 | vector2Len+=Double.parseDouble(vector2[i])*Double.parseDouble(vector2[i]); 53 | } 54 | double cosineSimilarity=dotProduct/(Math.sqrt(vector1Len)*Math.sqrt(vector2Len)); 55 | lstValues.add(new ExtractNumberSimilarity(Arrays.toString(vector1),Arrays.toString(vector2) 56 | ,cosineSimilarity)); 57 | } 58 | 59 | Collections.sort(lstValues,new MyCosineComp()); 60 | List<ExtractNumberSimilarity> lstTopRecords=lstValues.stream() 61 | .limit(10) 62 | .collect(Collectors.toList()); 63 | return lstTopRecords; 64 | } 65 | @Override 66 | //TODO: double-check the value returned here 67 | public List<ExtractNumberSimilarity> Cosine_Vector_VectorS(String fileName) throws IOException 68 | { 69 | List<ExtractNumberSimilarity> lstValues=new ArrayList<>(); 70 | List<String []> lstData=ReadData(fileName); 71 | List<ExtractNumberSimilarity> lstTopRecords=new ArrayList<>(); 72 | for(String [] vector1:lstData) 73 | { 74 | List<String []> lstTempValues=new ArrayList<>(); 75 | lstTempValues.addAll(lstData); 76 | lstTempValues.remove(vector1); 77 | for(String [] vector2:lstTempValues) 78 | { 79 | double dotProduct=0.0; 80 | double vector1Len=0.0; 81 | double vector2Len=0.0; 82 | for(int i=0;i<vector1.length;i++) 83 | { 84 | dotProduct+=Double.parseDouble(vector1[i])*Double.parseDouble(vector2[i]); 85 | vector1Len+=Double.parseDouble(vector1[i])*Double.parseDouble(vector1[i]); 86 | vector2Len+=Double.parseDouble(vector2[i])*Double.parseDouble(vector2[i]); 87 | } 88 | double cosineSimilarity=dotProduct/(Math.sqrt(vector1Len)*Math.sqrt(vector2Len)); 89 | lstValues.add(new ExtractNumberSimilarity(Arrays.toString(vector1),Arrays.toString(vector2) 90 | ,cosineSimilarity)); 91 | } 92 | Collections.sort(lstValues,new MyCosineComp()); 93 | lstValues=lstValues.stream().limit(10).collect(Collectors.toList()); 94 | lstTopRecords.addAll(lstValues); 95 | lstValues.clear(); 96 | } 97 | return lstTopRecords; 98 | } 99 | 100 | public List<String []> ReadData(String FilePath) throws FileNotFoundException, IOException 101 | { 102 | List<String []> lstValues=new ArrayList<>(); 103 | BufferedReader reader=new BufferedReader(new FileReader(FilePath)); 104 | String line=""; 105 | while((line=reader.readLine())!=null) 106 | { 107 | String [] arrLine=line.split(","); 108 | lstValues.add(arrLine); 109 | } 110 | return lstValues; 111 | } 112 | public class MyCosineComp implements Comparator<ExtractNumberSimilarity> 113 | { 114 | @Override 115 | public int compare(ExtractNumberSimilarity o1, ExtractNumberSimilarity o2)
{ 116 | if(o1.getScore()<o2.getScore()) 117 | return 1; 118 | else 119 | return -1; 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/extractsimilarity/ExtractNumberDiceSimilarityImpl.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.extractsimilarity; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileNotFoundException; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | import java.util.Collections; 10 | import java.util.Comparator; 11 | import java.util.HashSet; 12 | import java.util.List; 13 | import java.util.stream.Collectors; 14 | 15 | import unsw.curation.api.domain.ExtractNumberSimilarity; 16 | import unsw.curation.api.domain.abstraction.INumberDiceSimilarity; 17 | 18 | 19 | 20 | public class ExtractNumberDiceSimilarityImpl implements INumberDiceSimilarity { 21 | 22 | @Override 23 | public double Dice_Vector_Vector(double[] number1, double[] number2) 24 | { 25 | List<String> lstarr1=new ArrayList<>(); 26 | List<String> lstarr2=new ArrayList<>(); 27 | for(double num:number1) 28 | { 29 | lstarr1.add(String.valueOf(num)); 30 | } 31 | for(double num:number2) 32 | { 33 | lstarr2.add(String.valueOf(num)); 34 | } 35 | List<String> lstUnique=new ArrayList<>(); 36 | lstUnique.addAll(lstarr1); 37 | lstUnique.addAll(lstarr2); 38 | HashSet<String> lstIntersect=new HashSet<>(); 39 | lstIntersect.addAll(lstarr1); 40 | lstIntersect.retainAll(lstarr2); 41 | double intersectSize=lstIntersect.size(); 42 | double uniqueSize=lstUnique.size(); 43 | double DiceSimilarity=(2*intersectSize)/uniqueSize; 44 | return DiceSimilarity; 45 | } 46 | 47 | @Override 48 | public List<ExtractNumberSimilarity> Dice_Vector_VectorS(String filePath) throws IOException { 49 | List<ExtractNumberSimilarity> lstValues=new ArrayList<>(); 50 | List<ExtractNumberSimilarity> lstTopRecords=new ArrayList<>(); 51 | List<String []> lstarr=ReadData(filePath); 52 | for(String [] arrItem:lstarr) 53 | { 54 | List<Double> lstUniqueItems=new ArrayList<>(); 55 | HashSet<Double> lstIntersect=new HashSet<>(); 56 | for(String arrIte:arrItem) 57 | { 58 | lstUniqueItems.add(Double.parseDouble(arrIte)); 59 | lstIntersect.add(Double.parseDouble(arrIte)); 60 | } 61 | List<String []> lstTempData=new ArrayList<>(); 62 | lstTempData.addAll(lstarr); 63 | int arrItemIndex=lstTempData.indexOf(arrItem); 64 | lstTempData.remove(arrItemIndex); 65 | for(String [] secArrItem:lstTempData) 66 | { 67 | List<Double> lstforSecArrItem=new ArrayList<>(); 68 | for(String arrIt: secArrItem) 69 | { 70 | lstUniqueItems.add(Double.parseDouble(arrIt)); 71 | 72 | lstforSecArrItem.add(Double.parseDouble(arrIt)); 73 | } 74 | lstIntersect.retainAll(lstforSecArrItem); 75 | double intersectSize=lstIntersect.size(); 76 | double uniqueSize=lstUniqueItems.size(); 77 | double DiceSimilarity=(2*intersectSize)/uniqueSize; 78 | lstValues.add(new ExtractNumberSimilarity(Arrays.toString(arrItem),Arrays.toString(secArrItem) 79 | ,DiceSimilarity)); 80 | } 81 | Collections.sort(lstValues,new MyDiceComp()); 82 | lstValues=lstValues.stream().limit(10).collect(Collectors.toList()); 83 | lstTopRecords.addAll(lstValues); 84 | lstValues.clear(); 85 | } 86 | return lstTopRecords; 87 | } 88 | 89 | @Override 90 | public List<ExtractNumberSimilarity> Dice_Vector_VectorS(Double[] vector, String filePath) throws IOException { 91 | List<ExtractNumberSimilarity> lstValues=new ArrayList<>(); 92 | List<String []> lstarr=ReadData(filePath); 93 | for(String [] arrItem:lstarr) 94 | { 95 | List<Double> lstUniqueItems=new ArrayList<>(); 96 | HashSet<Double> lstIntersect=new HashSet<>(); 97 | for(String dblVal:arrItem) 98 | { 99 | lstUniqueItems.add(Double.parseDouble(dblVal)); 100 | lstIntersect.add(Double.parseDouble(dblVal)); 101 | } 102 | lstUniqueItems.addAll(Arrays.asList(vector)); 103 | lstIntersect.retainAll(Arrays.asList(vector)); 104 | double intersectSize=lstIntersect.size(); 105 | double uniqueSize=lstUniqueItems.size(); 106 | double DiceSimilarity=(2*intersectSize)/uniqueSize; 107 | lstValues.add(new ExtractNumberSimilarity(Arrays.toString(arrItem),Arrays.toString(vector) 108 | ,DiceSimilarity)); 109 | } 110 | Collections.sort(lstValues,new MyDiceComp()); 111 | List<ExtractNumberSimilarity> lstTopRecords=lstValues.stream() 112 | .limit(10) 113 | .collect(Collectors.toList()); 114 | return lstTopRecords; 115 | } 116 | 117 | public List<String []> ReadData(String FilePath) throws FileNotFoundException, IOException 118 | { 119 | List<String []> lstValues=new ArrayList<>(); 120 | BufferedReader reader=new BufferedReader(new FileReader(FilePath)); 121 | String line=""; 122 | while((line=reader.readLine())!=null) 123 | { 124 | String [] arrLine=line.split(","); 125 | lstValues.add(arrLine); 126 | } 127 | return lstValues; 128 | }
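// Like MyCosineComp above, MyDiceComp below is expected to order ExtractNumberSimilarity
// records by descending score, so the stream().limit(10) calls in the methods above keep
// the ten most similar vectors.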
129 | public class MyDiceComp implements Comparator 130 | { 131 | 132 | @Override 133 | public int compare(ExtractNumberSimilarity o1, ExtractNumberSimilarity o2) { 134 | if(o1.getScore() Euclidean_Vector_VectorS(String filePath) throws IOException 43 | { 44 | List lstSimilarity=new ArrayList<>(); 45 | List lstTopRecords=new ArrayList<>(); 46 | List lstValues=ReadData(filePath); 47 | for(String [] arrVal1: lstValues) 48 | { 49 | List lstTempVal=new ArrayList<>(); 50 | lstTempVal.addAll(lstValues); 51 | lstTempVal.remove(arrVal1); 52 | for(String [] arrVal2:lstTempVal) 53 | { 54 | double sum=0.0; 55 | for(int i=0;i Euclidean_Vector_VectorS(double [] vector,String filePath) throws IOException 76 | { 77 | List lstSimilarity=new ArrayList<>(); 78 | List lstValues=ReadData(filePath); 79 | for(String [] arrVal2: lstValues) 80 | { 81 | double sum=0.0; 82 | for(int i=0;i lstTopRecords=lstSimilarity.stream() 94 | .limit(10) 95 | .collect(Collectors.toList()); 96 | return lstTopRecords; 97 | } 98 | public List ReadData(String FilePath) throws FileNotFoundException, IOException 99 | { 100 | List lstValues=new ArrayList<>(); 101 | BufferedReader reader=new BufferedReader(new FileReader(FilePath)); 102 | String line=""; 103 | while((line=reader.readLine())!=null) 104 | { 105 | String [] arrLine=line.split(","); 106 | lstValues.add(arrLine); 107 | } 108 | return lstValues; 109 | } 110 | public class MyEuclideanComp implements Comparator 111 | { 112 | 113 | @Override 114 | public int compare(ExtractNumberSimilarity o1, ExtractNumberSimilarity o2) { 115 | if(o1.getScore()>o2.getScore()) 116 | return 1; 117 | else 118 | return -1; 119 | } 120 | 121 | } 122 | 123 | } 124 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/extractsimilarity/ExtractNumberJaccardSimilarityImpl.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.extractsimilarity; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileNotFoundException; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | import java.util.Collections; 10 | import java.util.Comparator; 11 | import java.util.HashSet; 12 | import java.util.List; 13 | import java.util.stream.Collectors; 14 | 15 | import unsw.curation.api.domain.ExtractNumberSimilarity; 16 | import unsw.curation.api.domain.abstraction.INumberJaccardSimilarity; 17 | 18 | 19 | 20 | public class ExtractNumberJaccardSimilarityImpl implements INumberJaccardSimilarity{ 21 | 22 | @Override 23 | public double Jaccard_Vector_Vector(double[] number1, double[] number2) { 24 | List lstarr1=new ArrayList<>(); 25 | List lstarr2=new ArrayList<>(); 26 | for(double num:number1) 27 | { 28 | lstarr1.add(String.valueOf(num)); 29 | } 30 | for(double num:number2) 31 | { 32 | lstarr2.add(String.valueOf(num)); 33 | } 34 | HashSet lstUnique=new HashSet<>(); 35 | lstUnique.addAll(lstarr1); 36 | lstUnique.addAll(lstarr2); 37 | HashSet lstIntersect=new HashSet<>(); 38 | lstIntersect.addAll(lstarr1); 39 | lstIntersect.retainAll(lstarr2); 40 | double intersectSize=lstIntersect.size(); 41 | double uniqueSize=lstUnique.size(); 42 | double JaccardSimilarity=intersectSize/uniqueSize; 43 | return JaccardSimilarity; 44 | } 45 | 46 | @Override 47 | public List Jaccard_Vector_VectorS(String filePath) throws IOException { 48 | List lstValues=new ArrayList<>(); 49 | List lstTopRecords=new ArrayList<>(); 50 | List 
lstarr=ReadData(filePath); 51 | for(String [] arrItem:lstarr) 52 | { 53 | HashSet lstUniqueItems=new HashSet<>(); 54 | HashSet lstIntersect=new HashSet<>(); 55 | for(String arrIte:arrItem) 56 | { 57 | lstUniqueItems.add(Double.parseDouble(arrIte)); 58 | lstIntersect.add(Double.parseDouble(arrIte)); 59 | } 60 | List lstTempData=new ArrayList<>(); 61 | lstTempData.addAll(lstarr); 62 | int arrItemIndex=lstTempData.indexOf(arrItem); 63 | lstTempData.remove(arrItemIndex); 64 | for(String [] secArrItem:lstTempData) 65 | { 66 | List lstforSecArrItem=new ArrayList<>(); 67 | for(String arrIt: secArrItem) 68 | { 69 | lstUniqueItems.add(Double.parseDouble(arrIt)); 70 | 71 | lstforSecArrItem.add(Double.parseDouble(arrIt)); 72 | } 73 | lstIntersect.retainAll(lstforSecArrItem); 74 | double intersectSize=lstIntersect.size(); 75 | double uniqueSize=lstUniqueItems.size(); 76 | double JaccardSimlarity=intersectSize/uniqueSize; 77 | lstValues.add(new ExtractNumberSimilarity(Arrays.toString(arrItem),Arrays.toString(secArrItem) 78 | ,JaccardSimlarity)); 79 | } 80 | Collections.sort(lstValues,new MyJaccardCoefficientComp()); 81 | lstValues=lstValues.stream().limit(10).collect(Collectors.toList()); 82 | lstTopRecords.addAll(lstValues); 83 | lstValues.clear(); 84 | } 85 | return lstTopRecords; 86 | } 87 | 88 | @Override 89 | public List Jaccard_Vector_VectorS(Double[] vector, String filePath) throws IOException { 90 | List lstValues=new ArrayList<>(); 91 | List lstarr=ReadData(filePath); 92 | for(String [] arrItem:lstarr) 93 | { 94 | HashSet lstUniqueItems=new HashSet<>(); 95 | HashSet lstIntersect=new HashSet<>(); 96 | for(String arrIt: arrItem) 97 | { 98 | lstUniqueItems.add(Double.parseDouble(arrIt)); 99 | lstIntersect.add(Double.parseDouble(arrIt)); 100 | } 101 | lstUniqueItems.addAll(Arrays.asList(vector)); 102 | lstIntersect.retainAll(Arrays.asList(vector)); 103 | double intersectSize=lstIntersect.size(); 104 | double uniqueSize=lstUniqueItems.size(); 105 | double JaccardSimlarity=intersectSize/uniqueSize; 106 | lstValues.add(new ExtractNumberSimilarity(Arrays.toString(arrItem),Arrays.toString(vector) 107 | ,JaccardSimlarity)); 108 | } 109 | Collections.sort(lstValues,new MyJaccardCoefficientComp()); 110 | List lstTopRecords=lstValues.stream() 111 | .limit(10) 112 | .collect(Collectors.toList()); 113 | return lstTopRecords; 114 | } 115 | 116 | public List ReadData(String FilePath) throws FileNotFoundException, IOException 117 | { 118 | List lstValues=new ArrayList<>(); 119 | BufferedReader reader=new BufferedReader(new FileReader(FilePath)); 120 | String line=""; 121 | while((line=reader.readLine())!=null) 122 | { 123 | String [] arrLine=line.split(","); 124 | lstValues.add(arrLine); 125 | } 126 | return lstValues; 127 | } 128 | public class MyJaccardCoefficientComp implements Comparator 129 | { 130 | @Override 131 | public int compare(ExtractNumberSimilarity o1, ExtractNumberSimilarity o2) { 132 | if(o1.getScore() lstUnion=new HashSet<>(); 29 | lstUnion.addAll(Arrays.asList(arrWord1)); 30 | lstUnion.addAll(Arrays.asList(arrWord2)); 31 | HashSet lstIntersect=new HashSet<>(); 32 | lstIntersect.addAll(Arrays.asList(arrWord1)); 33 | lstIntersect.retainAll(Arrays.asList(arrWord2)); 34 | double lstUnoinSize=(double)lstUnion.size(); 35 | double lstIntersectSize=(double)lstIntersect.size(); 36 | double JaccardSimilarity=lstIntersectSize/lstUnoinSize; 37 | return JaccardSimilarity; 38 | } 39 | @Override 40 | public List Jaccard_Word_Document(String word, String filePath) throws IOException 41 | { 42 | String [] 
arrWord=word.toLowerCase().split(""); 43 | List lstVal=ReadData(filePath); 44 | List lstSimilarity=new ArrayList<>(); 45 | HashSet lstUniqueVal=new HashSet<>(); 46 | lstUniqueVal.addAll(lstVal); 47 | for(String str:lstUniqueVal) 48 | { 49 | HashSet lstUnion=new HashSet<>(); 50 | HashSet lstIntersect=new HashSet<>(); 51 | String [] arrStr=str.toLowerCase().split(""); 52 | lstUnion.addAll(Arrays.asList(arrWord)); 53 | lstUnion.addAll(Arrays.asList(arrStr)); 54 | lstIntersect.addAll(Arrays.asList(arrWord)); 55 | lstIntersect.retainAll(Arrays.asList(arrStr)); 56 | double lstUnoinSize=(double)lstUnion.size(); 57 | double lstIntersectSize=(double)lstIntersect.size(); 58 | double JaccardSimilarity=lstIntersectSize/lstUnoinSize; 59 | lstSimilarity.add(new ExtractTextSimilarity(word,str,JaccardSimilarity)); 60 | } 61 | System.setProperty("java.util.Arrays.useLegacyMergeSort", "true"); 62 | Collections.sort(lstSimilarity,new MyStringJaccardComp()); 63 | List lstTopRecords=lstSimilarity.stream() 64 | .limit(20) 65 | .collect(Collectors.toList()); 66 | return lstTopRecords; 67 | } 68 | @Override 69 | public double Jaccard_Document_Document(String file1,String file2) throws IOException 70 | { 71 | List lstWords1=ReadData(file1); 72 | List lstWords2=ReadData(file2); 73 | HashSet lstUniqueWords=new HashSet<>(); 74 | HashSet lstIntersectWords=new HashSet<>(); 75 | lstUniqueWords.addAll(lstWords1); 76 | lstUniqueWords.addAll(lstWords2); 77 | lstIntersectWords.addAll(lstWords1); 78 | lstIntersectWords.retainAll(lstWords2); 79 | double lstIntersectSize=lstIntersectWords.size(); 80 | double lstUniqueWordsSize=lstUniqueWords.size(); 81 | double JaccardSimilarity=lstIntersectSize/lstUniqueWordsSize; 82 | return JaccardSimilarity; 83 | } 84 | /* @Override 85 | public List Jaccard_Document_DocumentS(File filePath, String directoryPath) throws IOException 86 | { 87 | 88 | List lstSimilarity=new ArrayList<>(); 89 | List lstWords1=ReadData(filePath.getName()); 90 | HashSet lstUniqueWords1=new HashSet<>(); 91 | lstUniqueWords1.addAll(lstWords1); 92 | File[] files=new File(directoryPath).listFiles(); 93 | for(File file:files) 94 | { 95 | List lstWords2=ReadData(file.getPath()); 96 | lstUniqueWords1.addAll(lstWords2); 97 | HashSet lstIntersectWords=new HashSet<>(); 98 | lstIntersectWords.addAll(lstWords1); 99 | lstIntersectWords.retainAll(lstWords2); 100 | double lstUniqueWordsSize=lstUniqueWords1.size(); 101 | double lstIntersectSize=lstIntersectWords.size(); 102 | double JaccardSimilarity=lstIntersectSize/lstUniqueWordsSize; 103 | lstSimilarity.add(new ExtractTextSimilarity(filePath.getName(),file.getName() 104 | ,JaccardSimilarity)); 105 | } 106 | System.setProperty("java.util.Arrays.useLegacyMergeSort", "true"); 107 | Collections.sort(lstSimilarity,new MyStringJaccardComp()); 108 | List lstTopRecords=lstSimilarity.stream() 109 | .limit(10) 110 | .collect(Collectors.toList()); 111 | return lstTopRecords; 112 | }*/ 113 | private List ReadData(String FilePath) throws FileNotFoundException, IOException 114 | { 115 | List lstValues=new ArrayList<>(); 116 | BufferedReader reader=new BufferedReader(new FileReader(FilePath)); 117 | String line=""; 118 | while((line=reader.readLine())!=null) 119 | { 120 | String [] arrLine=line.split(" "); 121 | for(String str: arrLine) 122 | { 123 | str=str.toLowerCase(); 124 | str=str.trim(); 125 | lstValues.add(str); 126 | } 127 | } 128 | return lstValues; 129 | } 130 | public class MyStringJaccardComp implements Comparator 131 | { 132 | @Override 133 | public int 
compare(ExtractTextSimilarity o1, ExtractTextSimilarity o2) { 134 | if(o1.getSimilarity() Leveneshtain_Word_Document(String word1, String filePath) throws IOException 42 | { 43 | List lstVal=new ArrayList<>(); 44 | word1 = word1.toLowerCase(); 45 | List lstValues=ReadData(filePath); 46 | for(String b:lstValues) 47 | { 48 | int[] costs = new int[b.length() + 1]; 49 | for (int j = 0; j < costs.length; j++) 50 | costs[j] = j; 51 | for (int i = 1; i <= word1.length(); i++) 52 | { 53 | costs[0] = i; 54 | int nw = i - 1; 55 | for (int j = 1; j <= b.length(); j++) 56 | { 57 | int cj = Math.min(1 + Math.min(costs[j], costs[j - 1]), 58 | word1.charAt(i - 1) == b.charAt(j - 1) ? nw : nw + 1); 59 | nw = costs[j]; 60 | costs[j] = cj; 61 | } 62 | } 63 | lstVal.add(new ExtractTextSimilarity(word1, b, costs[b.length()])); 64 | } 65 | System.setProperty("java.util.Arrays.useLegacyMergeSort", "true"); 66 | Collections.sort(lstVal,new myLeveneshteinComp()); 67 | List lstTopRecords=lstVal.stream() 68 | .limit(10) 69 | .collect(Collectors.toList()); 70 | return lstTopRecords; 71 | } 72 | 73 | private List ReadData(String FilePath) throws FileNotFoundException, IOException 74 | { 75 | List lstValues=new ArrayList<>(); 76 | BufferedReader reader=new BufferedReader(new FileReader(FilePath)); 77 | String line=""; 78 | while((line=reader.readLine())!=null) 79 | { 80 | String [] arrLine=line.split(" "); 81 | for(String str: arrLine) 82 | { 83 | str=str.toLowerCase(); 84 | str=str.trim(); 85 | lstValues.add(str); 86 | } 87 | } 88 | return lstValues; 89 | } 90 | public class myLeveneshteinComp implements Comparator 91 | { 92 | @Override 93 | public int compare(ExtractTextSimilarity o1, ExtractTextSimilarity o2) 94 | { 95 | if(o1.getSimilarity() > o2.getSimilarity()) 96 | return 1; 97 | else 98 | return -1; 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/extractsimilarity/ExtractTextQGramSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.extractsimilarity; 2 | 3 | 4 | import info.debatty.java.stringsimilarity.QGram; 5 | import unsw.curation.api.domain.abstraction.ITextQGramSimilarity; 6 | 7 | 8 | public class ExtractTextQGramSimilarity implements ITextQGramSimilarity { 9 | 10 | @Override 11 | public double ComputeQGramSimilarity(String word1, String word2) { 12 | QGram qG=new QGram(); 13 | double qGramDistance=qG.distance(word1, word2); 14 | 15 | return qGramDistance; 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/extractsimilarity/ExtractTextSoundexSimilarity.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.extractsimilarity; 2 | 3 | import org.apache.commons.codec.EncoderException; 4 | import org.apache.commons.codec.language.Soundex; 5 | 6 | import unsw.curation.api.domain.abstraction.ITextSoundexSimilarity; 7 | 8 | 9 | 10 | public class ExtractTextSoundexSimilarity implements ITextSoundexSimilarity { 11 | 12 | @Override 13 | public int SoundexDifference(String word1, String word2) throws EncoderException { 14 | Soundex soundee=new Soundex(); 15 | return soundee.difference(word1, word2); 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/extractstem/ExtractStemImpl.java: 
-------------------------------------------------------------------------------- 1 | package unsw.curation.api.extractstem; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.net.URISyntaxException; 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | import java.util.stream.Collectors; 12 | 13 | import unsw.curation.api.domain.ExtractStem; 14 | import unsw.curation.api.domain.abstraction.IStem; 15 | 16 | 17 | 18 | public class ExtractStemImpl implements IStem { 19 | 20 | private List<ExtractStem> lstValues=new ArrayList<>(); 21 | @Override 22 | public List<ExtractStem> FindWordDerivedForms(String word) throws FileNotFoundException, IOException, URISyntaxException 23 | { 24 | String getWord=word.trim().toLowerCase(); 25 | ReadDataset(); 26 | List<ExtractStem> lstDerivedStems=lstValues.stream() 27 | .filter(s->s.getWord1().equalsIgnoreCase(getWord)) 28 | .collect(Collectors.toList()); 29 | return lstDerivedStems; 30 | } 31 | 32 | @Override 33 | public void ReadDataset() throws FileNotFoundException, IOException, URISyntaxException 34 | { 35 | //java.net.URL url = getClass().getClassLoader().getResource("Stem.txt"); 36 | File file = new File("Stem.txt"); 37 | BufferedReader reader=new BufferedReader(new FileReader(file)); 38 | String line=""; 39 | while((line=reader.readLine())!=null) 40 | { 41 | try 42 | { 43 | String [] lineValues=line.split("\\|"); 44 | String myWord1=lineValues[0].trim().toLowerCase(); 45 | String myDerived1=lineValues[1].trim().toLowerCase(); 46 | String myWord2=lineValues[3].trim().toLowerCase(); 47 | String myDerived2=lineValues[4].trim().toLowerCase(); 48 | lstValues.add(new ExtractStem(myWord1,myDerived1,myWord2,myDerived2)); 49 | } 50 | catch(Exception ex) 51 | { 52 | //skip lines of Stem.txt that do not have the expected five-field format 53 | } 54 | } 55 | reader.close(); 56 | } 57 | } -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/index/DataSearch.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor.
5 | */ 6 | package unsw.curation.api.index; 7 | 8 | import java.io.File; 9 | import java.io.IOException; 10 | 11 | import org.apache.lucene.document.Document; 12 | import org.apache.lucene.index.CorruptIndexException; 13 | import org.apache.lucene.index.DirectoryReader; 14 | import org.apache.lucene.index.IndexReader; 15 | import org.apache.lucene.index.Term; 16 | import org.apache.lucene.queryparser.classic.ParseException; 17 | import org.apache.lucene.search.IndexSearcher; 18 | import org.apache.lucene.search.PhraseQuery; 19 | import org.apache.lucene.search.Query; 20 | import org.apache.lucene.search.ScoreDoc; 21 | import org.apache.lucene.search.TopDocs; 22 | import org.apache.lucene.store.FSDirectory; 23 | 24 | 25 | /** 26 | * 27 | * @author Alireza 28 | */ 29 | 30 | public class DataSearch { 31 | IndexReader reader; 32 | IndexSearcher indSearch; 33 | Query query; 34 | public DataSearch(String IndexDir) throws IOException 35 | { 36 | reader=DirectoryReader.open(FSDirectory.open(new File(IndexDir))); 37 | indSearch=new IndexSearcher(reader); 38 | } 39 | public TopDocs search(String searchText, int slop) throws IOException, ParseException 40 | { 41 | PhraseQuery query = new PhraseQuery(); 42 | query.setSlop(slop); 43 | String [] searchTerms=searchText.split(" "); 44 | for(String searchWord:searchTerms) 45 | query.add(new Term("body",searchWord.toLowerCase())); 46 | return indSearch.search(query, 100); 47 | } 48 | public Document getDocument(ScoreDoc score) throws CorruptIndexException, IOException 49 | { 50 | return indSearch.doc(score.doc); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/index/Index.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.index; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | 6 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 7 | import org.apache.lucene.document.Document; 8 | import org.apache.lucene.document.TextField; 9 | import org.apache.lucene.index.CorruptIndexException; 10 | import org.apache.lucene.index.IndexWriter; 11 | import org.apache.lucene.index.IndexWriterConfig; 12 | import org.apache.lucene.store.FSDirectory; 13 | import org.apache.lucene.util.Version; 14 | import org.apache.lucene.document.Field; 15 | 16 | 17 | 18 | /** 19 | * 20 | * @author Alireza 21 | */ 22 | public class Index { 23 | private IndexWriter writer; 24 | private StandardAnalyzer Analyzer=new StandardAnalyzer(Version.LUCENE_46); 25 | 26 | public Index(String indexDirectory) throws IOException 27 | { 28 | FSDirectory indexDir=FSDirectory.open(new File(indexDirectory)); 29 | IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_46,Analyzer); 30 | writer=new IndexWriter(indexDir,config); 31 | } 32 | public void Close() throws CorruptIndexException, IOException 33 | { 34 | writer.close(); 35 | } 36 | 37 | private Document ListDoc(String text) throws IOException 38 | { 39 | Document doc=new Document(); 40 | doc.add(new TextField("body",text,Field.Store.YES)); 41 | //doc.add(new TextField("tweet",inputMongo.getBody(), Field.Store.YES)); 42 | //doc.add(new TextField("description", inputMongo.getDescription(), Field.Store.YES)); 43 | //System.out.println("Name "+inputMongo.getBody()); 44 | // doc.add(new TextField("displayName",inputMongo.getDisplayName(), Field.Store.YES)); 45 | //System.out.println("Indexing: "+inputMongo.getId()+" "+inputMongo.getBody()+" "+inputMongo.getDescription()+" 
"+inputMongo.getDisplayName()); 46 | return doc; 47 | } 48 | public void IndexDocuments(String getValues) throws IOException 49 | { 50 | // for(LuceneData mongoVal:getLstMongoValues) 51 | // { 52 | try 53 | { 54 | Document document = ListDoc(getValues); 55 | 56 | writer.addDocument(document); 57 | } 58 | catch(Exception ex) 59 | { 60 | System.out.print(ex.getMessage()); 61 | } 62 | // } 63 | 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/index/SchIndData.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package unsw.curation.api.index; 7 | 8 | import java.io.File; 9 | import java.io.IOException; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | import org.apache.lucene.document.Document; 14 | import org.apache.lucene.queryparser.classic.ParseException; 15 | import org.apache.lucene.search.ScoreDoc; 16 | import org.apache.lucene.search.TopDocs; 17 | 18 | 19 | 20 | 21 | /** 22 | * 23 | * @author Alireza 24 | */ 25 | public class SchIndData 26 | { 27 | static String Pattern="(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+"; 28 | public SchIndData(){} 29 | Index dInd; 30 | DataSearch DSch; 31 | String twitterData=""; 32 | String current = System.getProperty("user.dir"); 33 | String SchTxt=""; 34 | public static void delete(File file) 35 | throws IOException{ 36 | 37 | if(file.isDirectory()) 38 | { 39 | if(file.list().length==0){ 40 | file.delete(); 41 | }else{ 42 | String files[] = file.list(); 43 | 44 | for (String temp : files) { 45 | File fileDelete = new File(file, temp); 46 | delete(fileDelete); 47 | } 48 | if(file.list().length==0){ 49 | file.delete(); 50 | } 51 | } 52 | } 53 | else 54 | { 55 | file.delete(); 56 | } 57 | } 58 | /* private ListReadData(String filePath) throws IOException 59 | { 60 | List lstTw=new ArrayList<>(); 61 | 62 | BufferedReader reader=new BufferedReader(new FileReader(filePath)); 63 | String line=""; 64 | while((line=reader.readLine())!=null) 65 | { 66 | lstTw.add(line); 67 | } 68 | return lstTw; 69 | }*/ 70 | /*public void CreateIndex(String sentence) throws IOException, ParseException 71 | { 72 | //ListlstValues=ReadData(filePath); 73 | File fileCheck=new File(current+"\\File_Index"); 74 | if(!fileCheck.exists()) 75 | { 76 | fileCheck.mkdir(); 77 | 78 | dInd = new Index(current+"\\File_Index\\"); 79 | System.out.println("Start Indexing Data: "+System.currentTimeMillis()); 80 | for(String inputValues: lstValues) 81 | { 82 | dInd.IndexDocuments(inputValues); 83 | } 84 | System.out.println("Finished Indexing Data: "+System.currentTimeMillis()); 85 | dInd.Close(); 86 | } 87 | else 88 | if(fileCheck.exists()&& fileCheck.listFiles().length>0) 89 | { 90 | Scanner sc=new Scanner(System.in); 91 | System.out.println("Index directory is exist; Do you want to index data again? 
(Y/N)"); 92 | String answer=sc.next(); 93 | if(answer.equalsIgnoreCase("y")) 94 | { 95 | delete(fileCheck); 96 | System.out.println("All Index Files are deleted."); 97 | fileCheck.mkdir(); 98 | dInd = new Index(current+"\\File_Index\\"); 99 | System.out.println("Start Indexing Data: "+System.currentTimeMillis()); 100 | for(String inputValues: lstValues) 101 | { 102 | dInd.IndexDocuments(inputValues); 103 | } 104 | System.out.println("Finished Indexing Data: "+System.currentTimeMillis()); 105 | dInd.Close(); 106 | } 107 | else 108 | if(answer.equalsIgnoreCase("n")) 109 | { 110 | System.out.println("Search " 111 | + "Based on the previous Indexed files..."); 112 | } 113 | } 114 | }*/ 115 | public List search(String token, String indexDir, int slop) throws IOException, ParseException 116 | { 117 | ListlstSearch=new ArrayList<>(); 118 | DSch = new DataSearch(indexDir); 119 | 120 | 121 | TopDocs hits = DSch.search(token, slop); 122 | //System.out.println(searchSentence+" "+hits.totalHits); 123 | for(ScoreDoc scoreDoc : hits.scoreDocs) 124 | { 125 | 126 | Document doc = DSch.getDocument(scoreDoc); 127 | lstSearch.add(doc.get("body")); 128 | /* lso.setTweet(doc.get("body")); 129 | lso.setNeType(searchSentence);*/ 130 | //System.out.println(lso.getSimilarSentence()); 131 | //System.out.println(lso.getScore()); 132 | //lso.setNeExistance(true); 133 | //System.out.println(lso.getSimilarSentence()); 134 | //System.out.println(searchSentence); 135 | //lso.setScore(String.valueOf(scoreDoc.score)); 136 | 137 | } 138 | return lstSearch; 139 | } 140 | /* private String PreProcessSentence(String inputSentence) 141 | { 142 | inputSentence=inputSentence.replaceAll(Pattern, ""); 143 | String [] arrSLine=inputSentence.split(" "); 144 | String Line=""; 145 | for(String str:arrSLine) 146 | { 147 | str=str.replace("'",""); 148 | str=str.replace("(",""); 149 | str=str.replace(")",""); 150 | str=str.replace("!",""); 151 | str=str.replace("[",""); 152 | str=str.replace("]",""); 153 | str=str.replace("{",""); 154 | str=str.replace("}",""); 155 | str=str.replace("\"",""); 156 | str=str.replace("?",""); 157 | str=str.replace(".",""); 158 | Line+=str+" "; 159 | } 160 | return Line; 161 | }*/ 162 | } 163 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/linking/GoogleKnowledgeGraph.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.linking; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileReader; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.io.InputStreamReader; 9 | import java.net.HttpURLConnection; 10 | import java.net.URL; 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | import org.json.JSONObject; 15 | 16 | public class GoogleKnowledgeGraph { 17 | 18 | private final String USER_AGENT = "Mozilla/5.0"; 19 | public void ParseGoogleKnowledgeGraph(ListlstEntity, String outputFileName) throws Exception { 20 | 21 | BufferedWriter writer=new BufferedWriter(new FileWriter(outputFileName)); 22 | for(String str:lstEntity) 23 | { 24 | str=str.trim(); 25 | if(str.contains(" ")) 26 | str=str.replace(" ", "+"); 27 | 28 | String url = "https://kgsearch.googleapis.com/v1/entities:search?query="+str+"&key=AIzaSyA6u_gvGgeBjUx5ThGhc2hvg-MiIfuYBkk&limit=1&indent=True"; 29 | //https://www.wikidata.org/w/api.php?action=wbsearchentities&search=lionel messi&language=en&format=json 30 | URL obj = new URL(url); 
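// NOTE: the request URL above embeds a Google API key directly in the source.
// In practice the key should come from configuration (e.g. an environment
// variable) rather than being hardcoded and committed to the repository.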
31 | HttpURLConnection con = (HttpURLConnection) obj.openConnection(); 32 | con.setRequestMethod("GET"); 33 | con.setRequestProperty("User-Agent", USER_AGENT); 34 | int responseCode = con.getResponseCode(); 35 | BufferedReader in = new BufferedReader( 36 | new InputStreamReader(con.getInputStream())); 37 | String inputLine; 38 | while ((inputLine = in.readLine()) != null) 39 | { 40 | //System.out.println("Fetching Data From Wikidata"); 41 | System.out.println(inputLine); 42 | writer.write(inputLine); 43 | writer.newLine(); 44 | } 45 | in.close(); 46 | } 47 | writer.close(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/linking/WikiData.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.linking; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileWriter; 6 | import java.io.InputStreamReader; 7 | import java.net.HttpURLConnection; 8 | import java.net.URL; 9 | import java.util.List; 10 | 11 | public class WikiData { 12 | 13 | private final String USER_AGENT = "Mozilla/5.0"; 14 | public void ParseWikiData(List<String> lstEntity, String outputFileName) throws Exception { 15 | 16 | BufferedWriter writer=new BufferedWriter(new FileWriter(outputFileName)); 17 | for(String str:lstEntity) 18 | { 19 | str=str.trim(); 20 | if(str.contains(" ")) 21 | str=str.replace(" ", "+"); 22 | 23 | String url = "https://www.wikidata.org/w/api.php?action=wbsearchentities&search="+str+"&language=en&format=json"; 24 | //https://www.wikidata.org/w/api.php?action=wbsearchentities&search=lionel messi&language=en&format=json 25 | URL obj = new URL(url); 26 | HttpURLConnection con = (HttpURLConnection) obj.openConnection(); 27 | con.setRequestMethod("GET"); 28 | con.setRequestProperty("User-Agent", USER_AGENT); 29 | int responseCode = con.getResponseCode(); 30 | BufferedReader in = new BufferedReader( 31 | new InputStreamReader(con.getInputStream())); 32 | String inputLine; 33 | while ((inputLine = in.readLine()) != null) 34 | { 35 | System.out.println("Fetching Data From Wikidata"); 36 | System.out.println(inputLine); 37 | writer.write(inputLine); 38 | writer.newLine(); 39 | } 40 | in.close(); 41 | } 42 | writer.close(); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/textclassification/EvaluateClassifier.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.textclassification; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.stream.Collectors; 10 | 11 | 12 | 13 | public class EvaluateClassifier { 14 | 15 | /*public List<String> ComputePrecision(File actualValues, File predictedValues) throws IOException 16 | { 17 | List<String> lstActualFile=ReadTestData(actualValues); 18 | List<String> lstPredictedFile=ReadTestData(predictedValues); 19 | List<String> lstPrecisionValues=new ArrayList<>(); 20 | if(lstActualFile.size()!=lstPredictedFile.size()) 21 | System.err.println("The lengths of the actual and predicted vectors differ"); 22 | List<String> lstCategories=lstActualFile 23 | .stream() 24 | .distinct() 25 | .collect(Collectors.toList()); 26 | for(String category: lstCategories) 27 | { 28 | double truePositive=0; 29 | double falsePositive=0; 30 | category=category.toLowerCase().trim(); 31 | for(int i=0; i< lstPredictedFile.size();i++) 32 | { 33 | if(lstActualFile.get(i).equals(category)&& lstPredictedFile.get(i).equals(category)) 34 | { 35 | truePositive++; 36 | } 37 | if(!lstActualFile.get(i).equals(category)&& lstPredictedFile.get(i).equals(category)) 38 | { 39 | falsePositive++; 40 | } 41 | } 42 | double precision=truePositive/(falsePositive+truePositive); 43 | lstPrecisionValues.add("Precision: "+category+" is: "+String.valueOf(precision)); 44 | } 45 | return lstPrecisionValues; 46 | }*/ 47 | 48 | public double ComputeAccuracy(File actualValues, File predictedValues) throws IOException 49 | { 50 | List<String> lstActualFile=ReadTestData(actualValues); 51 | List<String> lstPredictedFile=ReadTestData(predictedValues); 52 | 53 | if(lstActualFile.size()!=lstPredictedFile.size()) 54 | System.err.println("The lengths of the actual and predicted vectors differ"); 55 | double positiveRate=0; 56 | for(int i=0;i<lstActualFile.size();i++) 57 | { 58 | if(lstActualFile.get(i).equals(lstPredictedFile.get(i))) 59 | { 60 | positiveRate++; 61 | } 62 | } 63 | // exact-match accuracy over all instances 64 | return positiveRate/lstActualFile.size(); 65 | } 66 | 67 | public List<String> ComputeRecall(File actualValues, File predictedValues) throws IOException 68 | { 69 | List<String> lstActualFile=ReadTestData(actualValues); 70 | List<String> lstPredictedFile=ReadTestData(predictedValues); 71 | List<String> lstRecall=new ArrayList<>(); 72 | List<String> lstCategories=lstActualFile 73 | .stream() 74 | .distinct() 75 | .collect(Collectors.toList()); 76 | for(String category: lstCategories) 77 | { 78 | double truePositive=0; 79 | double falseNegative=0; 80 | category=category.toLowerCase().trim(); 81 | for(int i=0; i< lstPredictedFile.size();i++) 82 | { 83 | if(lstActualFile.get(i).equals(category)&& lstPredictedFile.get(i).equals(category)) 84 | { 85 | truePositive++; 86 | } 87 | if(lstActualFile.get(i).equals(category)&& !lstPredictedFile.get(i).equals(category)) 88 | { 89 | falseNegative++; 90 | } 91 | } 92 | double recall=truePositive/(falseNegative+truePositive); 93 | lstRecall.add("Recall: "+category+" is: "+String.valueOf(recall)); 94 | } 95 | return lstRecall; 96 | } 97 | 98 | private List<String> ReadTestData(File inputLabels) throws IOException 99 | { 100 | List<String> lstLabels=new ArrayList<>(); 101 | BufferedReader reader=new BufferedReader(new FileReader(inputLabels)); 102 | String line=""; 103 | while((line=reader.readLine())!=null) 104 | { 105 | String [] arrLine=line.split(","); 106 | String label=arrLine[arrLine.length-1]; 107 | lstLabels.add(label.trim().toLowerCase()); 108 | } 109 | return lstLabels; 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextDecisionTreeImpl.java: --------------------------------------------------------------------------------
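// The seven ExtractClassificationText*Impl classes that follow share one template:
// load an ARFF training file, wrap a Weka base learner in a FilteredClassifier with
// a StringToWordVector filter (so the free-text attribute is tokenized into word
// features), cross-validate it, train it on the full set, and serialize the model.
// A minimal usage sketch, assuming a hypothetical train.arff whose class attribute
// comes last:
//
//   ExtractClassificationTextDecisionTreeImpl dt = new ExtractClassificationTextDecisionTreeImpl();
//   dt.LoadDataset(new File("train.arff")); // hypothetical path
//   dt.EvaluateDecisionTree();              // 4-fold cross-validation
//   dt.LearnDecisionTree();                 // fits a J48 tree on all instances
//   dt.SaveModel("DecisionTree.dat");       // Java-serialized FilteredClassifier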
1 | package unsw.curation.api.textclassification; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.FileReader; 8 | import java.io.IOException; 9 | import java.io.ObjectOutputStream; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.Random; 13 | 14 | import unsw.curation.api.domain.Classification; 15 | import unsw.curation.api.domain.abstraction.IClassificationTextDecisionTree; 16 | import weka.classifiers.Evaluation; 17 | import weka.classifiers.meta.FilteredClassifier; 18 | import weka.classifiers.trees.J48; 19 | import weka.core.Instances; 20 | import weka.core.converters.ArffLoader; 21 | import weka.filters.unsupervised.attribute.StringToWordVector; 22 | 23 | 24 | public class ExtractClassificationTextDecisionTreeImpl implements IClassificationTextDecisionTree { 25 | 26 | Instances trainedData; 27 | StringToWordVector filter; 28 | FilteredClassifier classifier; 29 | Classification cls=new Classification(); 30 | @Override 31 | public void LoadDataset(File arffFileName) throws IOException 32 | { 33 | BufferedReader bReader=new BufferedReader( 34 | new FileReader(arffFileName)); 35 | ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader); 36 | trainedData=arff.getData(); 37 | bReader.close(); 38 | } 39 | 40 | @Override 41 | public List<Classification> EvaluateDecisionTree() throws Exception 42 | { 43 | List<Classification> lstEvaluationDetail=new ArrayList<>(); 44 | trainedData.setClassIndex(trainedData.numAttributes()-1); 45 | filter=new StringToWordVector(); 46 | classifier=new FilteredClassifier(); 47 | classifier.setFilter(filter); 48 | classifier.setClassifier(new J48()); 49 | Evaluation eval=new Evaluation(trainedData); 50 | eval.crossValidateModel(classifier, trainedData, 4, new Random(1)); 51 | /*try 52 | { 53 | for(int i=0;i<10000;i++) 54 | { 55 | cls.setPrecision(eval.precision(i)); 56 | cls.setRecall(eval.recall(i)); 57 | cls.setAuc(eval.areaUnderPRC(i)); 58 | cls.setFMeasure(eval.fMeasure(i)); 59 | cls.setFn(eval.falseNegativeRate(i)); 60 | cls.setFp(eval.falsePositiveRate(i)); 61 | cls.setTn(eval.trueNegativeRate(i)); 62 | cls.setTp(eval.truePositiveRate(i)); 63 | cls.setMeanAbsoluteError(eval.meanAbsoluteError()); 64 | cls.setRelativeAbsoluteError(eval.relativeAbsoluteError()); 65 | cls.setCorrect(eval.correct()); 66 | cls.setKappa(eval.kappa()); 67 | cls.setNumInstances(eval.numInstances()); 68 | cls.setInCorrect(eval.incorrect()); 69 | lstEvaluationDetail.add(new Classification(cls.getPrecision(), 70 | cls.getRecall(), 71 | cls.getAuc(), 72 | cls.getCorrect(), 73 | cls.getInCorrect(), 74 | cls.getErrorRate(), 75 | cls.getFn(), 76 | cls.getFp(), 77 | cls.getTn(), 78 | cls.getTp(), 79 | cls.getKappa(), 80 | cls.getMeanAbsoluteError(), 81 | cls.getNumInstances(), 82 | cls.getRelativeAbsoluteError(), 83 | cls.getFMeasure())); 84 | } 85 | } 86 | catch(Exception ex) 87 | { 88 | 89 | }*/ 90 | return lstEvaluationDetail; 91 | } 92 | 93 | @Override 94 | public void LearnDecisionTree() throws Exception 95 | { 96 | trainedData.setClassIndex(trainedData.numAttributes()-1); 97 | filter=new StringToWordVector(); 98 | classifier=new FilteredClassifier(); 99 | classifier.setFilter(filter); 100 | classifier.setClassifier(new J48()); 101 | classifier.buildClassifier(trainedData); 102 | 103 | } 104 | 105 | @Override 106 | public void SaveModel(String modelName) throws FileNotFoundException, IOException 107 | { 108 | ObjectOutputStream output=new ObjectOutputStream( 109 | new FileOutputStream(modelName)); 110 | output.writeObject(classifier); 111 | output.close(); 112 | 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextKNNImpl.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.textclassification; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.FileReader; 8 | import java.io.IOException; 9 | import java.io.ObjectOutputStream; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.Random; 13 | 14 | import unsw.curation.api.domain.Classification; 15 | import unsw.curation.api.domain.abstraction.IClassificationTextKNN; 16 | import weka.classifiers.Evaluation; 17 | import weka.classifiers.lazy.IBk; 18 | import weka.classifiers.meta.FilteredClassifier; 19 | import weka.core.Instances; 20 | import weka.core.converters.ArffLoader; 21 | import weka.filters.unsupervised.attribute.StringToWordVector; 22 | 23 | 24 | public class ExtractClassificationTextKNNImpl implements IClassificationTextKNN { 25 | 26 | Classification cls=new Classification(); 27 | Instances trainedData; 28 | StringToWordVector filter; 29 | FilteredClassifier classifier; 30 | @Override 31 | public void LoadDataset(File arffFileName) throws IOException { 32 | BufferedReader bReader=new BufferedReader(new FileReader(arffFileName)); 33 | ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader); 34 | trainedData=arff.getData(); 35 | bReader.close(); 36 | } 37 | 38 | @Override 39 | public List<Classification> EvaluateKNN() throws Exception 40 | { 41 | List<Classification> lstEvaluationDetail=new ArrayList<>(); 42 | trainedData.setClassIndex(trainedData.numAttributes()-1); 43 | filter=new StringToWordVector(); 44 | classifier=new FilteredClassifier(); 45 | classifier.setFilter(filter); 46 | classifier.setClassifier(new IBk()); 47 | Evaluation eval=new Evaluation(trainedData); 48 | eval.crossValidateModel(classifier, trainedData, 4, new Random(1)); 49 | /*try 50 | { 51 | for(int i=0;i<10000;i++) 52 | { 53 | cls.setPrecision(eval.precision(i)); 54 | cls.setRecall(eval.recall(i)); 55 | cls.setAuc(eval.areaUnderPRC(i)); 56 | cls.setFMeasure(eval.fMeasure(i)); 57 | cls.setFn(eval.falseNegativeRate(i)); 58 | cls.setFp(eval.falsePositiveRate(i)); 59 | cls.setTn(eval.trueNegativeRate(i)); 60 | cls.setTp(eval.truePositiveRate(i)); 61 | cls.setMeanAbsoluteError(eval.meanAbsoluteError()); 62 | cls.setRelativeAbsoluteError(eval.relativeAbsoluteError()); 63 | cls.setCorrect(eval.correct()); 64 | cls.setKappa(eval.kappa()); 65 | cls.setNumInstances(eval.numInstances()); 66 | cls.setInCorrect(eval.incorrect()); 67 | lstEvaluationDetail.add(new Classification(cls.getPrecision(), 68 | cls.getRecall(), 69 | cls.getAuc(), 70 | cls.getCorrect(), 71 | cls.getInCorrect(), 72 | cls.getErrorRate(), 73 | cls.getFn(), 74 | cls.getFp(), 75 | cls.getTn(), 76 | cls.getTp(), 77 | cls.getKappa(), 78 | cls.getMeanAbsoluteError(), 79 | cls.getNumInstances(), 80 | cls.getRelativeAbsoluteError(), 81 | cls.getFMeasure())); 82 | } 83 | } 84 | catch(Exception ex) 85 | { 86 | 87 | }*/ 88 | return lstEvaluationDetail; 89 | } 90 | 91 | @Override 92 | public void LearnKNN() throws Exception { 93 | trainedData.setClassIndex(trainedData.numAttributes()-1); 94 | filter=new StringToWordVector(); 95 | classifier=new FilteredClassifier(); 96 | classifier.setFilter(filter); 97 | classifier.setClassifier(new IBk()); 98 | classifier.buildClassifier(trainedData); 99 | 100 | } 101 | 102 | @Override 103 | public void SaveModel(String modelName) throws FileNotFoundException, IOException { 104 | ObjectOutputStream output=new ObjectOutputStream( 105 | new FileOutputStream(modelName)); 106 | output.writeObject(classifier); 107 | output.close(); 108 | 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextLogisticRegressionImpl.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.textclassification; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.FileReader; 8 | import java.io.IOException; 9 | import java.io.ObjectOutputStream; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.Random; 13 | 14 | import unsw.curation.api.domain.Classification; 15 | import unsw.curation.api.domain.abstraction.IClassificationTextLogisticRegression; 16 | import weka.classifiers.Evaluation; 17 | import weka.classifiers.functions.Logistic; 18 | import weka.classifiers.meta.FilteredClassifier; 19 | import weka.core.Instances; 20 | import weka.core.converters.ArffLoader; 21 | import weka.filters.unsupervised.attribute.StringToWordVector; 22 | 23 | public class ExtractClassificationTextLogisticRegressionImpl implements IClassificationTextLogisticRegression 24 | { 25 | Instances trainedData; 26 | StringToWordVector filter; 27 | FilteredClassifier classifier; 28 | Classification cls=new Classification(); 29 | 30 | @Override 31 | public void LoadDataset(File arffFileName) throws IOException 32 | { 33 | BufferedReader bReader=new BufferedReader( 34 | new FileReader(arffFileName)); 35 | ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader); 36 | trainedData=arff.getData(); 37 | bReader.close(); 38 | 39 | } 40 | @Override 41 | public List<Classification> EvaluateLogisticRegression() throws Exception 42 | { 43 | List<Classification> lstEvaluationDetail=new ArrayList<>(); 44 | trainedData.setClassIndex(trainedData.numAttributes()-1); 45 | filter=new StringToWordVector(); 46 | classifier=new FilteredClassifier(); 47 | classifier.setFilter(filter); 48 | classifier.setClassifier(new Logistic()); 49 | Evaluation eval=new Evaluation(trainedData); 50 | eval.crossValidateModel(classifier, trainedData, 4, new Random(1)); 51 | /*try 52 | { 53 | for(int i=0;i<10000;i++) 54 | { 55 | cls.setPrecision(eval.precision(i)); 56 | cls.setRecall(eval.recall(i)); 57 | cls.setAuc(eval.areaUnderPRC(i)); 58 | cls.setFMeasure(eval.fMeasure(i)); 59 | cls.setFn(eval.falseNegativeRate(i)); 60 | cls.setFp(eval.falsePositiveRate(i)); 61 | cls.setTn(eval.trueNegativeRate(i)); 62 | cls.setTp(eval.truePositiveRate(i)); 63 | cls.setMeanAbsoluteError(eval.meanAbsoluteError()); 64 | cls.setRelativeAbsoluteError(eval.relativeAbsoluteError()); 65 | cls.setCorrect(eval.correct()); 66 | cls.setKappa(eval.kappa()); 67 | cls.setNumInstances(eval.numInstances()); 68 | cls.setInCorrect(eval.incorrect()); 69 | lstEvaluationDetail.add(new Classification(cls.getPrecision(), 70 | cls.getRecall(), 71 | cls.getAuc(), 72 | cls.getCorrect(), 73 | cls.getInCorrect(), 74 | cls.getErrorRate(), 75 | cls.getFn(), 76 | cls.getFp(), 77 | cls.getTn(), 78 | cls.getTp(), 79 | cls.getKappa(), 80 | cls.getMeanAbsoluteError(), 81 | cls.getNumInstances(), 82 | cls.getRelativeAbsoluteError(), 83 | cls.getFMeasure())); 84 | } 85 | } 86 | catch(Exception ex) 87 | { 88 | 89 | }*/ 90 | return lstEvaluationDetail; 91 | } 92 | 93 | @Override 94 | public void LearnLogisticRegression() throws Exception 95 | { 96 | trainedData.setClassIndex(trainedData.numAttributes()-1); 97 | filter=new StringToWordVector(); 98 | classifier=new FilteredClassifier(); 99 | classifier.setFilter(filter); 100 | classifier.setClassifier(new Logistic()); 101 | classifier.buildClassifier(trainedData); 102 | 103 | } 104 | @Override 105 | public void SaveModel(String modelName) throws FileNotFoundException, IOException 106 | { 107 | ObjectOutputStream output=new ObjectOutputStream( 108 | new FileOutputStream(modelName)); 109 | output.writeObject(classifier); 110 | output.close(); 111 | 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextNaiveBaysImpl.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.textclassification; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.FileReader; 8 | import java.io.IOException; 9 | import java.io.ObjectOutputStream; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.Random; 13 | 14 | import unsw.curation.api.domain.Classification; 15 | import unsw.curation.api.domain.abstraction.IClassificationTextNaiveBays; 16 | import weka.classifiers.Evaluation; 17 | import weka.classifiers.bayes.NaiveBayes; 18 | import weka.classifiers.meta.FilteredClassifier; 19 | import weka.core.Instances; 20 | import weka.core.converters.ArffLoader; 21 | import weka.filters.unsupervised.attribute.StringToWordVector; 22 | 23 | 24 | public class ExtractClassificationTextNaiveBaysImpl implements IClassificationTextNaiveBays { 25 | 26 | Classification cls=new Classification(); 27 | Instances trainedData; 28 | StringToWordVector filter; 29 | FilteredClassifier classifier; 30 | @Override 31 | public void LoadDataset(File arffFileName) throws IOException 32 | { 33 | BufferedReader bReader=new BufferedReader(new FileReader(arffFileName)); 34 | ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader); 35 | trainedData=arff.getData(); 36 | bReader.close(); 37 | } 38 | 39 | @Override 40 | public List<Classification> EvaluateNaiveBays() throws Exception 41 | { 42 | List<Classification> lstEvaluationDetail=new ArrayList<>(); 43 | trainedData.setClassIndex(trainedData.numAttributes()-1); 44 | filter=new StringToWordVector(); 45 | classifier=new FilteredClassifier(); 46 | classifier.setFilter(filter); 47 | classifier.setClassifier(new NaiveBayes()); 48 | Evaluation eval=new Evaluation(trainedData); 49 | eval.crossValidateModel(classifier, trainedData, 4, new Random(1)); 50 | /*try 51 | { 52 | for(int i=0;i<10000;i++) 53 | { 54 | cls.setPrecision(eval.precision(i)); 55 | cls.setRecall(eval.recall(i)); 56 | cls.setAuc(eval.areaUnderPRC(i)); 57 | cls.setFMeasure(eval.fMeasure(i)); 58 | cls.setFn(eval.falseNegativeRate(i)); 59 | cls.setFp(eval.falsePositiveRate(i)); 60 | cls.setTn(eval.trueNegativeRate(i)); 61 | cls.setTp(eval.truePositiveRate(i)); 62 | cls.setMeanAbsoluteError(eval.meanAbsoluteError()); 63 | cls.setRelativeAbsoluteError(eval.relativeAbsoluteError()); 64 | cls.setCorrect(eval.correct()); 65 | cls.setKappa(eval.kappa()); 66 | cls.setNumInstances(eval.numInstances()); 67 | cls.setInCorrect(eval.incorrect()); 68 | lstEvaluationDetail.add(new Classification(cls.getPrecision(), 69 | cls.getRecall(), 70 | cls.getAuc(), 71 | cls.getCorrect(), 72 | cls.getInCorrect(), 73 | cls.getErrorRate(), 74 | cls.getFn(), 75 | cls.getFp(), 76 | cls.getTn(), 77 | cls.getTp(), 78 | cls.getKappa(), 79 | cls.getMeanAbsoluteError(), 80 | cls.getNumInstances(), 81 | cls.getRelativeAbsoluteError(), 82 | cls.getFMeasure())); 83 | } 84 | } 85 | catch(Exception ex) 86 | { 87 | 88 | }*/ 89 | return lstEvaluationDetail; 90 | } 91 | 92 | @Override 93 | public void LearnNaiveBays() throws Exception 94 | { 95 | trainedData.setClassIndex(trainedData.numAttributes()-1); 96 | filter=new StringToWordVector(); 97 | classifier=new FilteredClassifier(); 98 | classifier.setFilter(filter); 99 | classifier.setClassifier(new NaiveBayes()); 100 | classifier.buildClassifier(trainedData); 101 | } 102 | 103 | @Override 104 | public void SaveModel(String modelName) throws FileNotFoundException, IOException 105 | { 106 | ObjectOutputStream output=new ObjectOutputStream( 107 | new FileOutputStream(modelName)); 108 | output.writeObject(classifier); 109 | output.close(); 110 | } 111 | 112 | 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextNeuralNetworkImpl.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.textclassification; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.FileReader; 8 | import java.io.IOException; 9 | import java.io.ObjectOutputStream; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.Random; 13 | 14 | import unsw.curation.api.domain.Classification; 15 | import unsw.curation.api.domain.abstraction.IClassificationTextNeuralNetwork; 16 | import weka.classifiers.Evaluation; 17 | import weka.classifiers.functions.MultilayerPerceptron; 18 | import weka.classifiers.meta.FilteredClassifier; 19 | import weka.core.Instances; 20 | import weka.core.converters.ArffLoader; 21 | import weka.filters.unsupervised.attribute.StringToWordVector; 22 | 23 | 24 | public class ExtractClassificationTextNeuralNetworkImpl implements IClassificationTextNeuralNetwork 25 | { 26 | 27 | Instances trainedData; 28 | StringToWordVector filter; 29 | FilteredClassifier classifier; 30 | Classification cls=new Classification(); 31 | 32 | @Override 33 | public void LoadDataset(File arffFileName) throws IOException 34 | { 35 | BufferedReader bReader=new BufferedReader( 36 | new FileReader(arffFileName)); 37 | ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader); 38 | trainedData=arff.getData(); 39 | bReader.close(); 40 | } 41 | 42 | @Override 43 | public List<Classification> EvaluateNeuralNetwork() throws Exception 44 | { 45 | List<Classification> lstEvaluationDetail=new ArrayList<>(); 46 | trainedData.setClassIndex(trainedData.numAttributes()-1); 47 | filter=new StringToWordVector(); 48 | classifier=new FilteredClassifier(); 49 | classifier.setFilter(filter); 50 | classifier.setClassifier(new MultilayerPerceptron()); 51 | Evaluation eval=new Evaluation(trainedData); 52 | eval.crossValidateModel(classifier, trainedData, 4, new Random(1)); 53 | /*try 54 | { 55 | for(int i=0;i<10000;i++) 56 | { 57 | cls.setPrecision(eval.precision(i)); 58 | cls.setRecall(eval.recall(i)); 59 | cls.setAuc(eval.areaUnderPRC(i)); 60 | cls.setFMeasure(eval.fMeasure(i)); 61 | cls.setFn(eval.falseNegativeRate(i)); 62 | cls.setFp(eval.falsePositiveRate(i)); 63 | cls.setTn(eval.trueNegativeRate(i)); 64 | cls.setTp(eval.truePositiveRate(i)); 65 | cls.setMeanAbsoluteError(eval.meanAbsoluteError()); 66 | cls.setRelativeAbsoluteError(eval.relativeAbsoluteError()); 67 | cls.setCorrect(eval.correct()); 68 | cls.setKappa(eval.kappa()); 69 | cls.setNumInstances(eval.numInstances()); 70 | cls.setInCorrect(eval.incorrect()); 71 | lstEvaluationDetail.add(new Classification(cls.getPrecision(), 72 | cls.getRecall(), 73 | cls.getAuc(), 74 | cls.getCorrect(), 75 | cls.getInCorrect(), 76 | cls.getErrorRate(), 77 | cls.getFn(), 78 | cls.getFp(), 79 | cls.getTn(), 80 | cls.getTp(), 81 | cls.getKappa(), 82 | cls.getMeanAbsoluteError(), 83 | cls.getNumInstances(), 84 | cls.getRelativeAbsoluteError(), 85 | cls.getFMeasure())); 86 | } 87 | } 88 | catch(Exception ex) 89 | { 90 | 91 | }*/ 92 | return lstEvaluationDetail; 93 | } 94 | 95 | @Override 96 | public void LearnNeuralNetwork() throws Exception 97 | { 98 | trainedData.setClassIndex(trainedData.numAttributes()-1); 99 | filter=new StringToWordVector(); 100 | classifier=new FilteredClassifier(); 101 | classifier.setFilter(filter); 102 | classifier.setClassifier(new MultilayerPerceptron()); 103 | classifier.buildClassifier(trainedData); 104 | } 105 | 106 | @Override 107 | public void SaveModel(String modelName) throws FileNotFoundException, IOException 108 | { 109 | ObjectOutputStream output=new ObjectOutputStream( 110 | new FileOutputStream(modelName)); 111 | output.writeObject(classifier); 112 | output.close(); 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextRandomForestImpl.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.textclassification; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.FileReader; 8 | import java.io.IOException; 9 | import java.io.ObjectOutputStream; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.Random; 13 | 14 | import unsw.curation.api.domain.Classification; 15 | import unsw.curation.api.domain.abstraction.IClassificationTextRandomForest; 16 | import weka.classifiers.Evaluation; 17 | import weka.classifiers.meta.FilteredClassifier; 18 | import weka.classifiers.trees.RandomForest; 19 | import weka.core.Instances; 20 | import weka.core.converters.ArffLoader; 21 | import weka.filters.unsupervised.attribute.StringToWordVector; 22 | 23 | 24 | public class ExtractClassificationTextRandomForestImpl implements IClassificationTextRandomForest 25 | { 26 | Instances trainedData; 27 | StringToWordVector filter; 28 | FilteredClassifier classifier; 29 | Classification cls=new Classification(); 30 | 31 | @Override 32 | public void LoadDataset(File arffFileName) throws IOException 33 | { 34 | BufferedReader bReader=new BufferedReader( 35 | new FileReader(arffFileName)); 36 | ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader); 37 | trainedData=arff.getData(); 38 | bReader.close(); 39 | } 40 | 41 | @Override 42 | public List<Classification> EvaluateRandomForest() throws Exception 43 | { 44 | List<Classification> lstEvaluationDetail=new ArrayList<>(); 45 | trainedData.setClassIndex(trainedData.numAttributes()-1); 46 | filter=new StringToWordVector(); 47 | classifier=new FilteredClassifier(); 48 | classifier.setFilter(filter); 49 | classifier.setClassifier(new RandomForest()); 50 | Evaluation eval=new Evaluation(trainedData); 51 | eval.crossValidateModel(classifier, trainedData, 4, new Random(1)); 52 | /*try 53 | { 54 | for(int i=0;i<10000;i++) 55 | { 56 | cls.setPrecision(eval.precision(i)); 57 | cls.setRecall(eval.recall(i)); 58 | cls.setAuc(eval.areaUnderPRC(i)); 59 | cls.setFMeasure(eval.fMeasure(i)); 60 | cls.setFn(eval.falseNegativeRate(i)); 61 | cls.setFp(eval.falsePositiveRate(i)); 62 | cls.setTn(eval.trueNegativeRate(i)); 63 | cls.setTp(eval.truePositiveRate(i)); 64 | cls.setMeanAbsoluteError(eval.meanAbsoluteError()); 65 | cls.setRelativeAbsoluteError(eval.relativeAbsoluteError()); 66 | cls.setCorrect(eval.correct()); 67 | cls.setKappa(eval.kappa()); 68 | cls.setNumInstances(eval.numInstances()); 69 | cls.setInCorrect(eval.incorrect()); 70 | lstEvaluationDetail.add(new Classification(cls.getPrecision(), 71 | cls.getRecall(), 72 | cls.getAuc(), 73 | cls.getCorrect(), 74 | cls.getInCorrect(), 75 | cls.getErrorRate(), 76 | cls.getFn(), 77 | cls.getFp(), 78 | cls.getTn(), 79 | cls.getTp(), 80 | cls.getKappa(), 81 | cls.getMeanAbsoluteError(), 82 | cls.getNumInstances(), 83 | cls.getRelativeAbsoluteError(), 84 | cls.getFMeasure())); 85 | } 86 | } 87 | catch(Exception ex) 88 | { 89 | 90 | }*/ 91 | return lstEvaluationDetail; 92 | } 93 | 94 | @Override 95 | public void LearnRandomForest() throws Exception 96 | { 97 | trainedData.setClassIndex(trainedData.numAttributes()-1); 98 | filter=new StringToWordVector(); 99 | classifier=new FilteredClassifier(); 100 | classifier.setFilter(filter); 101 | classifier.setClassifier(new RandomForest()); 102 | classifier.buildClassifier(trainedData); 103 | } 104 | 105 | @Override 106 | public void SaveModel(String modelName) throws FileNotFoundException, IOException 107 | { 108 | ObjectOutputStream output=new ObjectOutputStream( 109 | new FileOutputStream(modelName)); 110 | output.writeObject(classifier); 111 | output.close(); 112 | 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/textclassification/ExtractClassificationTextSVMImpl.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.textclassification; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.FileReader; 8 | import java.io.IOException; 9 | import java.io.ObjectOutputStream; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.Random; 13 | 14 | import unsw.curation.api.domain.Classification; 15 | import unsw.curation.api.domain.abstraction.IClassificationTextSVM; 16 | import weka.classifiers.Evaluation; 17 | import weka.classifiers.functions.SMO; 18 | import weka.classifiers.meta.FilteredClassifier; 19 | import weka.core.Instances; 20 | import weka.core.converters.ArffLoader; 21 | import weka.filters.unsupervised.attribute.StringToWordVector; 22 | 23 | 24 | public class ExtractClassificationTextSVMImpl implements IClassificationTextSVM { 25 | 26 | Instances trainedData; 27 | StringToWordVector filter; 28 | FilteredClassifier classifier; 29 | Classification cls=new Classification(); 30 | 31 | @Override 32 | public void LoadDataset(File arffFileName) throws IOException { 33 | BufferedReader bReader=new BufferedReader( 34 | new FileReader(arffFileName)); 35 | ArffLoader.ArffReader arff=new ArffLoader.ArffReader(bReader); 36 | trainedData=arff.getData(); 37 | bReader.close(); 38 | 39 | } 40 | 41 | @Override 42 | public List<Classification> EvaluateSVM() throws Exception 43 | { 44 | List<Classification> lstEvaluationDetail=new ArrayList<>(); 45 | trainedData.setClassIndex(trainedData.numAttributes()-1); 46 | filter=new StringToWordVector(); 47 | classifier=new FilteredClassifier(); 48 | classifier.setFilter(filter); 49 | classifier.setClassifier(new SMO()); 50 | Evaluation eval=new Evaluation(trainedData); 51 | eval.crossValidateModel(classifier, trainedData, 4, new Random(1)); 52 | /*try 53 | { 54 | for(int i=0;i<10000;i++) 55 | { 56 | cls.setPrecision(eval.precision(i)); 57 | cls.setRecall(eval.recall(i)); 58 |
cls.setAuc(eval.areaUnderPRC(i)); 59 | cls.setFMeasure(eval.fMeasure(i)); 60 | cls.setFn(eval.falseNegativeRate(i)); 61 | cls.setFp(eval.falsePositiveRate(i)); 62 | cls.setTn(eval.trueNegativeRate(i)); 63 | cls.setTp(eval.truePositiveRate(i)); 64 | cls.setMeanAbsoluteError(eval.meanAbsoluteError()); 65 | cls.setRelativeAbsoluteError(eval.relativeAbsoluteError()); 66 | cls.setCorrect(eval.correct()); 67 | cls.setKappa(eval.kappa()); 68 | cls.setNumInstances(eval.numInstances()); 69 | cls.setInCorrect(eval.incorrect()); 70 | lstEvaluationDetail.add(new Classification(cls.getPrecision(), 71 | cls.getRecall(), 72 | cls.getAuc(), 73 | cls.getCorrect(), 74 | cls.getInCorrect(), 75 | cls.getErrorRate(), 76 | cls.getFn(), 77 | cls.getFp(), 78 | cls.getTn(), 79 | cls.getTp(), 80 | cls.getKappa(), 81 | cls.getMeanAbsoluteError(), 82 | cls.getNumInstances(), 83 | cls.getRelativeAbsoluteError(), 84 | cls.getFMeasure())); 85 | } 86 | } 87 | catch(Exception ex) 88 | { 89 | 90 | }*/ 91 | return lstEvaluationDetail; 92 | } 93 | 94 | @Override 95 | public void LearnSVM() throws Exception 96 | { 97 | trainedData.setClassIndex(trainedData.numAttributes()-1); 98 | filter=new StringToWordVector(); 99 | classifier=new FilteredClassifier(); 100 | classifier.setFilter(filter); 101 | classifier.setClassifier(new SMO()); 102 | classifier.buildClassifier(trainedData); 103 | 104 | } 105 | 106 | @Override 107 | public void SaveModel(String modelName) throws FileNotFoundException, IOException { 108 | ObjectOutputStream output=new ObjectOutputStream( 109 | new FileOutputStream(modelName)); 110 | output.writeObject(classifier); 111 | output.close(); 112 | 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/textclassification/TextClassifierImpl.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.textclassification; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.util.List; 7 | import java.util.concurrent.ThreadLocalRandom; 8 | 9 | import unsw.curation.api.classify.TextClassifier; 10 | import unsw.curation.api.domain.Classification; 11 | 12 | public class TextClassifierImpl extends TextClassifier { 13 | 14 | public void TKnn(File trainFile, File testFile, File result) throws Exception 15 | { 16 | long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999); 17 | String current = System.getProperty("user.dir"); 18 | File Textdir= new File(current+"\\TextClassification"); 19 | if(!Textdir.exists()) 20 | Textdir.mkdir(); 21 | 22 | ExtractClassificationTextKNNImpl knn=new ExtractClassificationTextKNNImpl(); 23 | knn.LoadDataset(trainFile); 24 | knn.EvaluateKNN(); 25 | knn.LearnKNN(); 26 | knn.SaveModel(Textdir+"\\Knn"+fileNumber+".dat"); 27 | 28 | LoadTestData(testFile); 29 | loadModel(Textdir+"\\Knn"+fileNumber+".dat"); 30 | Predict(result.getAbsolutePath()); 31 | } 32 | public void TNaiveBayes(File trainFile, File testFile, File result) throws Exception 33 | { 34 | long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999); 35 | String current = System.getProperty("user.dir"); 36 | File Textdir= new File(current+"\\TextClassification"); 37 | if(!Textdir.exists()) 38 | Textdir.mkdir(); 39 | ExtractClassificationTextNaiveBaysImpl naiveBayes=new ExtractClassificationTextNaiveBaysImpl(); 40 | naiveBayes.LoadDataset(trainFile); 41 | naiveBayes.EvaluateNaiveBays(); 42 | naiveBayes.LearnNaiveBays(); 43 
| naiveBayes.SaveModel(Textdir+"\\NaiveBayes"+fileNumber+".dat"); 44 | 45 | LoadTestData(testFile); 46 | loadModel(Textdir+"\\NaiveBayes"+fileNumber+".dat"); 47 | Predict(result.getAbsolutePath()); 48 | } 49 | public void TLogisticRegression(File trainFile, File testFile, File result) throws Exception 50 | { 51 | long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999); 52 | String current = System.getProperty("user.dir"); 53 | File Textdir= new File(current+"\\TextClassification"); 54 | if(!Textdir.exists()) 55 | Textdir.mkdir(); 56 | ExtractClassificationTextLogisticRegressionImpl glm=new ExtractClassificationTextLogisticRegressionImpl(); 57 | glm.LoadDataset(trainFile); 58 | glm.EvaluateLogisticRegression(); 59 | glm.LearnLogisticRegression(); 60 | glm.SaveModel(Textdir+"\\Logistic"+fileNumber+".dat"); 61 | 62 | LoadTestData(testFile); 63 | loadModel(Textdir+"\\Logistic"+fileNumber+".dat"); 64 | Predict(result.getAbsolutePath()); 65 | } 66 | public void TDecisionTree(File trainFile, File testFile, File result) throws Exception 67 | { 68 | long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999); 69 | String current = System.getProperty("user.dir"); 70 | File Textdir= new File(current+"\\TextClassification"); 71 | if(!Textdir.exists()) 72 | Textdir.mkdir(); 73 | ExtractClassificationTextDecisionTreeImpl j48=new ExtractClassificationTextDecisionTreeImpl(); 74 | j48.LoadDataset(trainFile); 75 | j48.EvaluateDecisionTree(); 76 | j48.LearnDecisionTree(); 77 | j48.SaveModel(Textdir+"\\DecisionTree"+fileNumber+".dat"); 78 | 79 | LoadTestData(testFile); 80 | loadModel(Textdir+"\\DecisionTree"+fileNumber+".dat"); 81 | Predict(result.getAbsolutePath()); 82 | } 83 | public void TRandomForest(File trainFile, File testFile, File result) throws Exception 84 | { 85 | long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999); 86 | String current = System.getProperty("user.dir"); 87 | File Textdir= new File(current+"\\TextClassification"); 88 | if(!Textdir.exists()) 89 | Textdir.mkdir(); 90 | ExtractClassificationTextRandomForestImpl rf=new ExtractClassificationTextRandomForestImpl(); 91 | rf.LoadDataset(trainFile); 92 | rf.EvaluateRandomForest(); 93 | rf.LearnRandomForest(); 94 | rf.SaveModel(Textdir+"\\RandomForest"+fileNumber+".dat"); 95 | 96 | LoadTestData(testFile); 97 | loadModel(Textdir+"\\RandomForest"+fileNumber+".dat"); 98 | Predict(result.getAbsolutePath()); 99 | } 100 | 101 | public void TSvm(File trainFile, File testFile, File result) throws Exception 102 | { 103 | long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999); 104 | String current = System.getProperty("user.dir"); 105 | File Textdir= new File(current+"\\TextClassification"); 106 | if(!Textdir.exists()) 107 | Textdir.mkdir(); 108 | 109 | ExtractClassificationTextSVMImpl svm=new ExtractClassificationTextSVMImpl(); 110 | svm.LoadDataset(trainFile); 111 | svm.EvaluateSVM(); 112 | svm.LearnSVM(); 113 | svm.SaveModel(Textdir+"\\SVM"+fileNumber+".dat"); 114 | 115 | LoadTestData(testFile); 116 | loadModel(Textdir+"\\SVM"+fileNumber+".dat"); 117 | Predict(result.getAbsolutePath()); 118 | } 119 | public void TNeuralNetwork(File trainFile, File testFile, File result) throws Exception 120 | { 121 | long fileNumber=ThreadLocalRandom.current().nextInt(1, 9999999); 122 | String current = System.getProperty("user.dir"); 123 | File Textdir= new File(current+"\\TextClassification"); 124 | if(!Textdir.exists()) 125 | Textdir.mkdir(); 126 | ExtractClassificationTextNeuralNetworkImpl neural =new 
ExtractClassificationTextNeuralNetworkImpl(); 127 | neural.LoadDataset(trainFile); 128 | neural.EvaluateNeuralNetwork(); 129 | neural.LearnNeuralNetwork(); 130 | neural.SaveModel(Textdir+"\\Neural"+fileNumber+".dat"); 131 | 132 | LoadTestData(testFile); 133 | loadModel(Textdir+"\\Neural"+fileNumber+".dat"); 134 | Predict(result.getAbsolutePath()); 135 | 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/tfidf/DataSearchSentence.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package unsw.curation.api.tfidf; 7 | 8 | import java.io.File; 9 | import java.io.IOException; 10 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 11 | import org.apache.lucene.document.Document; 12 | import org.apache.lucene.index.CorruptIndexException; 13 | import org.apache.lucene.index.DirectoryReader; 14 | import org.apache.lucene.index.IndexReader; 15 | import org.apache.lucene.queryparser.classic.ParseException; 16 | import org.apache.lucene.queryparser.classic.QueryParser; 17 | import org.apache.lucene.search.IndexSearcher; 18 | import org.apache.lucene.search.Query; 19 | import org.apache.lucene.search.ScoreDoc; 20 | import org.apache.lucene.search.TopDocs; 21 | import org.apache.lucene.store.FSDirectory; 22 | import org.apache.lucene.util.Version; 23 | 24 | /** 25 | * 26 | * @author Alireza 27 | */ 28 | 29 | public class DataSearchSentence { 30 | IndexReader reader; 31 | IndexSearcher indSearch; 32 | Query query; 33 | public DataSearchSentence(String IndexDir) throws IOException 34 | { 35 | reader=DirectoryReader.open(FSDirectory.open(new File(IndexDir))); 36 | indSearch=new IndexSearcher(reader); 37 | } 38 | public TopDocs search(String searchText) throws IOException, ParseException 39 | { 40 | Query q2=new QueryParser(Version.LUCENE_41,"Content", 41 | new StandardAnalyzer(Version.LUCENE_41)) 42 | .parse(searchText); 43 | return indSearch.search(q2, 10); 44 | } 45 | public Document getDocument(ScoreDoc score) throws CorruptIndexException, IOException 46 | { 47 | return indSearch.doc(score.doc); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/tfidf/IndexSentence.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 
5 | */ 6 | package unsw.curation.api.tfidf; 7 | 8 | import java.io.File; 9 | import java.io.IOException; 10 | import java.util.List; 11 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 12 | import org.apache.lucene.document.Document; 13 | import org.apache.lucene.document.Field; 14 | import org.apache.lucene.document.TextField; 15 | import org.apache.lucene.index.CorruptIndexException; 16 | import org.apache.lucene.index.IndexWriter; 17 | import org.apache.lucene.index.IndexWriterConfig; 18 | import org.apache.lucene.store.FSDirectory; 19 | import org.apache.lucene.util.Version; 20 | 21 | /** 22 | * 23 | * @author Alireza 24 | */ 25 | public class IndexSentence { 26 | private IndexWriter writer; 27 | private StandardAnalyzer Analyzer=new StandardAnalyzer(Version.LUCENE_41); 28 | public IndexSentence(String indexDirectory) throws IOException 29 | { 30 | FSDirectory indexDir=FSDirectory.open(new File(indexDirectory)); 31 | IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_41,Analyzer); 32 | writer=new IndexWriter(indexDir,config); 33 | } 34 | public void Close() throws CorruptIndexException, IOException 35 | { 36 | writer.close(); 37 | } 38 | 39 | private Document ListDoc(String tweet) throws IOException 40 | { 41 | Document doc=new Document(); 42 | // Field fileContent=new Field("Content",new FileReader("FileName Must Be Provided")); 43 | // doc.add(fileContent); 44 | doc.add(new TextField("Content",tweet, Field.Store.YES)); 45 | //doc.add(new TextField("FilePath",file.getCanonicalPath(),Field.Store.YES)); 46 | return doc; 47 | } 48 | private void IndexDocuments(String tweetFilePath) throws IOException{ 49 | System.out.println("Indexing Sentences "); 50 | List<String> lstProcessedData=ReadDataSentence.ReadPreProcessedData(tweetFilePath); 51 | for(String tweet:lstProcessedData) 52 | { 53 | try 54 | { 55 | Document document = ListDoc(tweet); 56 | writer.addDocument(document); 57 | } 58 | catch(Exception ex) 59 | { 60 | System.out.print(ex.getMessage()); 61 | } 62 | } 63 | } 64 | public boolean IndexTweets(String tweetFilePath) 65 | throws IOException{ 66 | //File[] files = new File(dataDir).listFiles(); 67 | //File f=new File(tweetFilePath); 68 | //for (File file : files) { 69 | // if(!tweetFilePath.isDirectory()&& tweetFilePath.exists()) 70 | //{ 71 | IndexDocuments(new File(tweetFilePath).getPath()); 72 | //} 73 | /* else 74 | { 75 | return false; 76 | }*/ 77 | // } 78 | //return writer.numDocs(); 79 | return true; 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/tfidf/ReadDataSentence.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package unsw.curation.api.tfidf; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.FileNotFoundException; 10 | import java.io.FileReader; 11 | import java.io.IOException; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | /** 16 | * 17 | * @author Alireza 18 | */ 19 | public class ReadDataSentence { 20 | 21 | static String Pattern="(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+"; 22 | public static List<String> ReadPreProcessedData(String filePath) throws FileNotFoundException, IOException{ 23 | List<String> lstData=new ArrayList<>(); 24 | BufferedReader sr; 25 | String sLine; 26 | sr = new BufferedReader(new FileReader(filePath)); 27 | while ((sLine = sr.readLine()) != null) { 28 | if(sLine.split(" ").length<1) 29 | { 30 | continue; 31 | } 32 | sLine=sLine.replaceAll(Pattern, ""); 33 | String [] arrSLine=sLine.split(" "); 34 | String Line=""; 35 | for(String str:arrSLine) 36 | { 37 | str=str.replace("'",""); 38 | str=str.replace("(",""); 39 | str=str.replace(")",""); 40 | str=str.replace("!",""); 41 | str=str.replace("[",""); 42 | str=str.replace("]",""); 43 | str=str.replace("{",""); 44 | str=str.replace("}",""); 45 | str=str.replace("\"",""); 46 | str=str.replace("?",""); 47 | str=str.replace(".",""); 48 | Line+=str+" "; 49 | } 50 | 51 | lstData.add(Line.trim()); 52 | } 53 | return lstData; 54 | } 55 | public static List<String> ReadRawData(String filePath) throws FileNotFoundException, IOException 56 | { 57 | List<String> lstData=new ArrayList<>(); 58 | BufferedReader sr; 59 | String sLine; 60 | sr = new BufferedReader(new FileReader(filePath)); 61 | while ((sLine = sr.readLine()) != null) { 62 | if(sLine.split(" ").length<1) 63 | { 64 | continue; 65 | } 66 | lstData.add(sLine); 67 | } 68 | return lstData; 69 | } 70 | public static void ReadFromMySql() 71 | { 72 | 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/tokenization/ExtractionKeywordImpl.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unsw-cse-soc/Data-curation-API/39473752816255e18f587203907bcd7d5783208e/src/main/java/unsw/curation/api/tokenization/ExtractionKeywordImpl.java -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/twitter/KeywordExtraction.java: --------------------------------------------------------------------------------
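// KeywordExtraction (below) tokenizes a tweet with Lucene's StandardTokenizer and
// drops stop words with a StopFilter built from a user-supplied word list, returning
// the surviving terms as one comma-separated string. A minimal usage sketch (file
// name and tweet text are made up; note that only stop-word matching is
// case-insensitive, no lower-casing filter is applied):
//
//   KeywordExtraction ke = new KeywordExtraction();
//   String keywords = ke.ExtractTweetKeyword("Learning Java at UNSW!",
//           new File("stopwords.txt"));
//   // -> e.g. "Learning,Java,UNSW,"  (trailing comma as produced by the loop)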
1 | package unsw.curation.api.twitter; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.io.StringReader; 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | import org.apache.lucene.analysis.TokenStream; 12 | import org.apache.lucene.analysis.core.StopFilter; 13 | import org.apache.lucene.analysis.standard.StandardTokenizer; 14 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 15 | import org.apache.lucene.analysis.util.CharArraySet; 16 | 17 | /** 18 | * 19 | * @author Alireza 20 | */ 21 | public class KeywordExtraction { 22 | 23 | public KeywordExtraction() 24 | { 25 | 26 | } 27 | private String preProcessTweet(String tweet) 28 | { 29 | String Pattern="(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+"; 30 | tweet=tweet.replaceAll(Pattern, ""); 31 | String Line=""; 32 | if(tweet.toCharArray().length<141) 33 | { 34 | String [] arrSLine=tweet.split(" "); 35 | for(String str:arrSLine) 36 | { 37 | str=str.replace("'",""); 38 | str=str.replace("(",""); 39 | str=str.replace(")",""); 40 | str=str.replace("!",""); 41 | str=str.replace("[",""); 42 | str=str.replace("]",""); 43 | str=str.replace("{",""); 44 | str=str.replace("}",""); 45 | str=str.replace("\"",""); 46 | str=str.replace("?",""); 47 | str=str.replace(".",""); 48 | str=str.replace("#",""); 49 | str=str.replace("@",""); 50 | Line+=str.trim()+" "; 51 | } 52 | } 53 | return Line; 54 | } 55 | 56 | String Pattern="(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+"; 57 | private List<String> lstStopWords=new ArrayList<>(); 58 | public String ExtractTweetKeyword(String inputTweet, File stopwordList) throws Exception 59 | { 60 | lstStopWords=ReadRawData(stopwordList); 61 | String trimmedText=inputTweet.replaceAll(Pattern, ""); 62 | trimmedText=trimmedText.replaceAll("\\d", ""); 63 | String values=preProcessTweet(trimmedText); 64 | CharArraySet stopWords=new CharArraySet(org.apache.lucene.util.Version.LUCENE_41,lstStopWords,true); 65 | TokenStream tokenStreamer = new 66 | StandardTokenizer(org.apache.lucene.util.Version.LUCENE_41, new StringReader(values)); 67 | tokenStreamer = new StopFilter(org.apache.lucene.util.Version.LUCENE_41, tokenStreamer, stopWords); 68 | StringBuilder sb = new StringBuilder(); 69 | CharTermAttribute charTermAttribute = tokenStreamer.addAttribute(CharTermAttribute.class); 70 | tokenStreamer.reset(); 71 | while (tokenStreamer.incrementToken()) 72 | { 73 | String term = charTermAttribute.toString(); 74 | sb.append(term).append(","); 75 | } 76 | return sb.toString(); 77 | } 78 | 79 | 80 | public String ExtractKeyword(String inputTweet, File stopwordList) throws Exception 81 | { 82 | lstStopWords=ReadRawData(stopwordList); 83 | String trimmedText=inputTweet.replaceAll(Pattern, ""); 84 | trimmedText=trimmedText.replaceAll("\\d", ""); 85 | CharArraySet stopWords=new CharArraySet(org.apache.lucene.util.Version.LUCENE_41,lstStopWords,true); 86 | TokenStream tokenStreamer = new 87 | StandardTokenizer(org.apache.lucene.util.Version.LUCENE_41, new StringReader(trimmedText)); 88 | tokenStreamer = new StopFilter(org.apache.lucene.util.Version.LUCENE_41, tokenStreamer, stopWords); 89 | StringBuilder sb = new StringBuilder(); 90 | CharTermAttribute charTermAttribute = tokenStreamer.addAttribute(CharTermAttribute.class); 91 | tokenStreamer.reset(); 92 | while (tokenStreamer.incrementToken()) 93 | { 94 | String term = charTermAttribute.toString(); 95 | sb.append(term).append(","); 96 | } 97 | return sb.toString(); 98 | } 99 | 100 | public String ExtractKeywordsList(List<String> lstData) throws Exception 101 | { 102 | StringBuilder sb = new StringBuilder(); 103 | for(String str:lstData) 104 | { 105 | String trimmedText=str.replaceAll(Pattern, ""); 106 | trimmedText=trimmedText.replaceAll("\\d", ""); 107 | CharArraySet stopWords=new 108 | CharArraySet(org.apache.lucene.util.Version.LUCENE_41,lstStopWords,true); 109 | TokenStream tokenStreamer = new 110 | StandardTokenizer(org.apache.lucene.util.Version.LUCENE_41, 111 | new StringReader(trimmedText.trim())); 112 | tokenStreamer = new 113 | StopFilter(org.apache.lucene.util.Version.LUCENE_41, tokenStreamer, stopWords); 114 | 115 | CharTermAttribute charTermAttribute = tokenStreamer.addAttribute(CharTermAttribute.class); 116 | tokenStreamer.reset(); 117 | while (tokenStreamer.incrementToken()) { 118 | String term = charTermAttribute.toString(); 119 | sb.append(term).append(","); 120 | } 121 | } 122 | return sb.toString(); 123 | } 124 | 125 | 126 | private static List<String> ReadRawData(File filePath) throws FileNotFoundException, IOException 127 | { 128 | List<String> lstData=new ArrayList<>(); 129 | String sLine; 130 | BufferedReader sr = new BufferedReader(new FileReader(filePath)); 131 | while ((sLine = sr.readLine()) != null) { 132 | lstData.add(sLine); 133 | } 134 | return lstData; 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/twitter/MyStemExtraction.java: -------------------------------------------------------------------------------- 1 | /* 2 | * To change this license header, choose License Headers in Project Properties. 3 | * To change this template file, choose Tools | Templates 4 | * and open the template in the editor. 5 | */ 6 | package unsw.curation.api.twitter; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.File; 10 | import java.io.FileNotFoundException; 11 | import java.io.FileReader; 12 | import java.io.IOException; 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | import unsw.curation.api.twitterdomain.StemDomain; 16 | 17 | /** 18 | * 19 | * @author Alireza 20 | */ 21 | public class MyStemExtraction { 22 | 23 | public List<StemDomain> ReadData(File stemFilePath) throws FileNotFoundException, IOException 24 | { 25 | List<StemDomain> lstValues=new ArrayList<>(); 26 | BufferedReader reader=new BufferedReader(new FileReader(stemFilePath)); 27 | String line=""; 28 | while((line=reader.readLine())!=null) 29 | { 30 | try 31 | { 32 | String [] lineValues=line.split("\\|"); 33 | 34 | String myWord1=lineValues[0].trim().toLowerCase(); 35 | String myDerived1=lineValues[1].trim().toLowerCase(); 36 | String myWord2=lineValues[3].trim().toLowerCase(); 37 | String myDerived2=lineValues[4].trim().toLowerCase(); 38 | lstValues.add(new StemDomain(myWord1,myDerived1,myWord2,myDerived2)); 39 | } 40 | catch(Exception ex) 41 | { 42 | 43 | } 44 | } 45 | 46 | 47 | return lstValues; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/twitter/NamedEntityExtraction.java: --------------------------------------------------------------------------------
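// NamedEntityExtraction (below) runs a Stanford CoreNLP pipeline (tokenize, ssplit,
// pos, lemma, ner, plus regexner with the mapping file data.txt) over a tweet and
// keeps every token whose NER tag is not "O". Note that a new StanfordCoreNLP
// pipeline is built on every call, which is expensive; caching one instance in a
// field would be the usual optimization. Usage sketch (the tweet text is made up):
//
//   NamedEntityExtraction nee = new NamedEntityExtraction();
//   for (NamedEntityDomain e : nee.ExtractTweetNamedEntities("Messi visited Sydney"))
//       System.out.println(e.getWord() + " -> " + e.getNer());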
5 | */ 6 | package unsw.curation.api.twitter; 7 | 8 | import edu.stanford.nlp.ling.CoreAnnotations; 9 | import edu.stanford.nlp.ling.CoreLabel; 10 | import edu.stanford.nlp.pipeline.Annotation; 11 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 12 | import edu.stanford.nlp.util.CoreMap; 13 | import java.io.IOException; 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | import java.util.Properties; 17 | import unsw.curation.api.twitterdomain.NamedEntityDomain; 18 | /** 19 | * 20 | * @author Alireza 21 | */ 22 | public class NamedEntityExtraction { 23 | 24 | public List ExtractTweetNamedEntities(String tweet) throws IOException, Exception 25 | { 26 | 27 | List lstEntityList=new ArrayList<>(); 28 | Properties props = new Properties(); 29 | boolean useRegexner = true; 30 | if (useRegexner) { 31 | props.put("annotators", "tokenize, ssplit, pos, lemma, ner,regexner"); 32 | props.put("regexner.mapping", "data.txt"); 33 | 34 | } else { 35 | props.put("annotators", "tokenize, ssplit, pos,lemma, ner"); 36 | } 37 | String values=preProcessTweet(tweet); 38 | StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 39 | Annotation document = new Annotation(values); 40 | pipeline.annotate(document); 41 | List sentences = document.get(CoreAnnotations.SentencesAnnotation.class); 42 | for (CoreMap sentence : sentences) 43 | { 44 | for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) 45 | { 46 | String tToken = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); 47 | String word = token.get(CoreAnnotations.TextAnnotation.class); 48 | if(tToken.equalsIgnoreCase("O")) 49 | { 50 | continue; 51 | } 52 | lstEntityList.add(new NamedEntityDomain(word,tToken)); 53 | } 54 | } 55 | return lstEntityList; 56 | } 57 | 58 | public static String preProcessTweet(String tweet) 59 | { 60 | String Pattern="(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+"; 61 | tweet=tweet.replaceAll(Pattern, ""); 62 | String Line=""; 63 | if(tweet.toCharArray().length<141) 64 | { 65 | String [] arrSLine=tweet.split(" "); 66 | for(String str:arrSLine) 67 | { 68 | str=str.replace("'",""); 69 | str=str.replace("(",""); 70 | str=str.replace(")",""); 71 | str=str.replace("!",""); 72 | str=str.replace("[",""); 73 | str=str.replace("]",""); 74 | str=str.replace("{",""); 75 | str=str.replace("}",""); 76 | str=str.replace("\"",""); 77 | str=str.replace("?",""); 78 | str=str.replace(".",""); 79 | str=str.replace("#",""); 80 | str=str.replace("@",""); 81 | Line+=str.trim()+" "; 82 | } 83 | } 84 | return Line; 85 | } 86 | 87 | public List ExtractTweetEntities(String tweet) throws IOException, Exception 88 | { 89 | 90 | List lstEntityList=new ArrayList<>(); 91 | Properties props = new Properties(); 92 | boolean useRegexner = true; 93 | if (useRegexner) { 94 | props.put("annotators", "tokenize, ssplit, pos, lemma, ner,regexner"); 95 | props.put("regexner.mapping", "data.txt"); 96 | 97 | } else { 98 | props.put("annotators", "tokenize, ssplit, pos,lemma, ner"); 99 | } 100 | //String values=preProcessTweet(tweet); 101 | StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 102 | Annotation document = new Annotation(tweet); 103 | pipeline.annotate(document); 104 | List sentences = document.get(CoreAnnotations.SentencesAnnotation.class); 105 | for (CoreMap sentence : sentences) 106 | { 107 | for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) 108 | { 109 | String tToken = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); 110 | String word = 
token.get(CoreAnnotations.TextAnnotation.class); 111 | if(tToken.equalsIgnoreCase("O")) 112 | { 113 | continue; 114 | } 115 | lstEntityList.add(new NamedEntityDomain(word,tToken)); 116 | } 117 | } 118 | return lstEntityList; 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/twitter/Synonyms.java: -------------------------------------------------------------------------------- 1 | 2 | package unsw.curation.api.twitter; 3 | 4 | import edu.mit.jwi.Dictionary; 5 | import edu.mit.jwi.IDictionary; 6 | import edu.mit.jwi.item.IIndexWord; 7 | import edu.mit.jwi.item.ISynset; 8 | import edu.mit.jwi.item.IWord; 9 | import edu.mit.jwi.item.IWordID; 10 | import edu.mit.jwi.item.POS; 11 | import edu.mit.jwi.morph.WordnetStemmer; 12 | import java.io.File; 13 | import java.io.IOException; 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | import unsw.curation.api.twitterdomain.SynonymDomain; 17 | 18 | /** 19 | * 20 | * @author Alireza 21 | */ 22 | public class Synonyms { 23 | KeywordExtraction EX; 24 | private String path="C:\\Program Files (x86)\\WordNet\\2.1\\dict\\"; 25 | public Synonyms() throws IOException 26 | { 27 | EX=new KeywordExtraction(); 28 | } 29 | public Synonyms(String dictionaryFilePath) throws IOException 30 | { 31 | path=dictionaryFilePath; 32 | EX=new KeywordExtraction(); 33 | } 34 | 35 | 36 | public List ExtractSynsetsSentence(String Sentence,File englishStopwordsFilePath) throws IOException, Exception 37 | { 38 | List lstSynset=new ArrayList<>(); 39 | String sentenceKeyWords=EX.ExtractKeyword(Sentence, englishStopwordsFilePath); 40 | for(String str:sentenceKeyWords.split(",")) 41 | { 42 | String strSynset=""; 43 | File dicFile=new File(path); 44 | IDictionary dict=new Dictionary(dicFile); 45 | dict.open(); 46 | WordnetStemmer stemmer=new WordnetStemmer(dict); 47 | try 48 | { 49 | List lstStem=stemmer.findStems(str, POS.NOUN); 50 | IIndexWord idxWord = dict . getIndexWord (lstStem.get(0), POS.NOUN); 51 | IWordID wordID = idxWord . 
getWordIDs ().get(0); 52 | IWord word = dict.getWord(wordID); 53 | ISynset sen=word.getSynset(); 54 | for(IWord w:sen.getWords()) 55 | { 56 | strSynset+=w.getLemma()+","; 57 | 58 | } 59 | lstSynset.add(new SynonymDomain(str, strSynset)); 60 | } 61 | catch(Exception ex) 62 | { 63 | 64 | } 65 | } 66 | return lstSynset; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/unsw/curation/api/twitter/URLExtraction.java: -------------------------------------------------------------------------------- 1 | package unsw.curation.api.twitter; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | import java.net.HttpURLConnection; 7 | import java.net.InetSocketAddress; 8 | import java.net.Proxy; 9 | import java.net.URL; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.regex.Matcher; 13 | import java.util.regex.Pattern; 14 | import org.jsoup.Jsoup; 15 | import org.jsoup.nodes.Document; 16 | import org.jsoup.nodes.Element; 17 | import org.jsoup.select.Elements; 18 | import unsw.curation.api.twitterdomain.UrlDomain; 19 | /** 20 | * 21 | * @author Alireza 22 | */ 23 | public class URLExtraction { 24 | 25 | private Document docPub; 26 | UrlDomain urlDomain; 27 | 28 | public URLExtraction() { 29 | urlDomain=new UrlDomain(); 30 | // System.setProperty("http.proxyHost", "127.0.0.1"); 31 | // System.setProperty("http.proxyPort", "8580"); 32 | // System.setProperty("https.proxyHost", "127.0.0.1"); 33 | // System.setProperty("https.proxyPort", "8580"); 34 | } 35 | 36 | 37 | public void Extract(String Url) throws IOException 38 | { 39 | // System.setProperty("http.proxyHost", "127.0.0.1"); 40 | // System.setProperty("http.proxyPort", "8580"); 41 | // System.setProperty("https.proxyHost", "127.0.0.1"); 42 | // System.setProperty("https.proxyPort", "8580"); 43 | // URL url = new URL(Url); 44 | // Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 8580)); 45 | // HttpURLConnection uc = (HttpURLConnection)url.openConnection(proxy); 46 | // uc.connect(); 47 | // String line = null; 48 | // StringBuffer tmp = new StringBuffer(); 49 | // BufferedReader in = new BufferedReader(new InputStreamReader(uc.getInputStream())); 50 | // while ((line = in.readLine()) != null) { 51 | // tmp.append(line); 52 | // } 53 | // Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress("127.0.0.1", 8080)); // or whatever your proxy is 54 | // HttpURLConnection uc = (HttpURLConnection)url.openConnection(proxy); 55 | docPub=Jsoup.connect(Url).timeout(10000).get(); 56 | // docPub=Jsoup.parse(String.valueOf(tmp)); 57 | } 58 | 59 | private String O_ExtractTitle() throws IOException 60 | { 61 | if(docPub==null) 62 | { 63 | throw new IOException("No Page To Download"); 64 | } 65 | String Title=docPub.title(); 66 | return Title; 67 | } 68 | 69 | 70 | private List ExtractHyperLink(String Sentence) 71 | { 72 | List Links=new ArrayList<>(); 73 | String [] splitedStr=Sentence.split(" "); 74 | for(String st:splitedStr){ 75 | Pattern p=Pattern.compile("(http:|https:|ftp)[:-_?\\a-zA-Z\\d.*//]+"); 76 | Matcher m=p.matcher(st); 77 | while(m.find()) 78 | { 79 | Links.add(m.group()); 80 | } 81 | } 82 | return Links; 83 | } 84 | 85 | ListlstTweetLinkList=new ArrayList<>(); 86 | public List ExtractLinkInfo(String tweet) 87 | { 88 | lstTweetLinkList=ExtractHyperLink(tweet); 89 | List lstPargraphList=new ArrayList<>(); 90 | if(lstTweetLinkList.size()>0) 91 | { 92 | for(String 
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitterdomain/KeywordDomain.java:
--------------------------------------------------------------------------------
package unsw.curation.api.twitterdomain;

/**
 *
 * @author Alireza
 */
public class KeywordDomain {

    public String tweet;
    public String keyword;
    public String inputSentence;
    public String inputTweet;

    public KeywordDomain(){}

    public KeywordDomain(String tweet, String keyword)
    {
        this.tweet = tweet;
        this.keyword = keyword;
    }

    public KeywordDomain(String keyword)
    {
        this.keyword = keyword;
    }

    public void setInputSentence(String inputSentence)
    {
        this.inputSentence = inputSentence;
    }
    public String getInputSentence()
    {
        return inputSentence;
    }

    public void setInputTweet(String inputTweet)
    {
        this.inputTweet = inputTweet;
    }
    public String getInputTweet()
    {
        return inputTweet;
    }

    public void setTweet(String tweet)
    {
        this.tweet = tweet;
    }
    public String getTweet()
    {
        return tweet;
    }

    public void setKeyword(String keyword)
    {
        this.keyword = keyword;
    }
    public String getKeyword()
    {
        return keyword;
    }
}
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitterdomain/NamedEntityDomain.java:
--------------------------------------------------------------------------------
package unsw.curation.api.twitterdomain;

/**
 *
 * @author Alireza
 */
public class NamedEntityDomain {

    public String _Word;
    public String _Ner;

    public NamedEntityDomain(){}

    public NamedEntityDomain(String word, String Ner)
    {
        _Word = word;
        _Ner = Ner;
    }

    public void setWord(String _Word)
    {
        this._Word = _Word;
    }
    public String getWord()
    {
        return _Word;
    }

    public void setNer(String _Ner)
    {
        this._Ner = _Ner;
    }
    public String getNer()
    {
        return _Ner;
    }
}
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitterdomain/StemDomain.java:
--------------------------------------------------------------------------------
package unsw.curation.api.twitterdomain;

/**
 *
 * @author Alireza
 */
public class StemDomain {

    private String word1;
    private String derived1;
    private String word2;
    private String derived2;

    public StemDomain(){}

    public StemDomain(String word1, String derived1, String word2, String derived2)
    {
        this.word1 = word1;
        this.word2 = word2;
        this.derived1 = derived1;
        this.derived2 = derived2;
    }

    public void setWord1(String word)
    {
        this.word1 = word;
    }
    public String getWord1()
    {
        return word1;
    }

    public void setDerived1(String derived)
    {
        this.derived1 = derived;
    }
    public String getDerived1()
    {
        return derived1;
    }

    public void setWord2(String word)
    {
        this.word2 = word;
    }
    public String getWord2()
    {
        return word2;
    }

    public void setDerived2(String derived)
    {
        this.derived2 = derived;
    }
    public String getDerived2()
    {
        return derived2;
    }
}
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitterdomain/SynonymDomain.java:
--------------------------------------------------------------------------------
package unsw.curation.api.twitterdomain;

/**
 *
 * @author Alireza
 */
public class SynonymDomain {

    public String word;
    public String synset;

    public SynonymDomain(){}

    public SynonymDomain(String word, String synset)
    {
        this.word = word;
        this.synset = synset;
    }

    public void setWord(String word)
    {
        this.word = word;
    }
    public String getWord()
    {
        return word;
    }

    public void setSynset(String synset)
    {
        this.synset = synset;
    }
    public String getSynset()
    {
        return synset;
    }
}
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/twitterdomain/UrlDomain.java:
--------------------------------------------------------------------------------
package unsw.curation.api.twitterdomain;

/**
 *
 * @author Alireza
 */
public class UrlDomain
{
    private String pageTitle;

    public void setPageTitle(String pageTitle)
    {
        this.pageTitle = pageTitle;
    }
    public String getPageTitle()
    {
        return pageTitle;
    }
}
--------------------------------------------------------------------------------
/src/main/java/unsw/curation/api/url/GetHTMLFile.java:
--------------------------------------------------------------------------------
package unsw.curation.api.url;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Parses a locally saved HTML file and extracts its title, headings,
 * link texts, paragraphs, image alt texts and list texts.
 *
 * @author Alireza
 */
public class GetHTMLFile {

    private Document docPub;

    /**
     * @param FilePath path of the local HTML file to parse
     * @throws IOException
     */
    public void ExtractLocal(String FilePath) throws IOException
    {
        docPub = Jsoup.parse(new File(FilePath), "UTF-8");
    }

    /**
     * @return the page title
     * @throws IOException
     */
    public String L_ExtractTitle() throws IOException
    {
        if (docPub == null)
        {
            throw new IOException("No Page To Parse");
        }
        return docPub.title();
    }

    /**
     * @return the texts of all h1-h4 headings
     * @throws IOException
     */
    public List<String> L_ExtractHeadings() throws IOException
    {
        if (docPub == null)
        {
            throw new IOException("No Page To Parse");
        }
        List<String> lstHeadings = new ArrayList<>();
        Elements H1 = docPub.select("h1");
        Elements H2 = docPub.select("h2");
        Elements H3 = docPub.select("h3");
        Elements H4 = docPub.select("h4");
        for (Element H : H1)
        {
            lstHeadings.add(H.text());
        }
        for (Element H : H2)
        {
            lstHeadings.add(H.text());
        }
        for (Element H : H3)
        {
            lstHeadings.add(H.text());
        }
        for (Element H : H4)
        {
            lstHeadings.add(H.text());
        }
        return lstHeadings;
    }

    /**
     * @return the anchor texts of all hyperlinks
     * @throws IOException
     */
    public List<String> L_ExtractHrefText() throws IOException
    {
        if (docPub == null)
        {
            throw new IOException("No Page To Parse");
        }
        List<String> lstHref = new ArrayList<>();
        Elements Hrefs = docPub.select("a[href]");
        for (Element Href : Hrefs)
        {
            lstHref.add(Href.text());
        }
        return lstHref;
    }

    /**
     * @return all paragraph texts longer than one character
     * @throws IOException
     */
    public List<String> L_ExtractParagraphes() throws IOException
    {
        if (docPub == null)
        {
            throw new IOException("No Page To Parse");
        }
        List<String> lstParagraphes = new ArrayList<>();
        Elements Paragraphes = docPub.select("p");
        for (Element Paragraph : Paragraphes)
        {
            if (Paragraph.text().length() > 1)
                lstParagraphes.add(Paragraph.text());
        }
        return lstParagraphes;
    }

    /**
     * @param Position zero-based index of the paragraph to return
     * @return the paragraph text at the given position
     * @throws IOException if no page is loaded or the position does not exist
     */
    public String L_ExtractParagraphByPosition(int Position) throws IOException
    {
        if (docPub == null)
        {
            throw new IOException("No Page To Parse");
        }
        List<String> lstParagraphes = new ArrayList<>();
        Elements Paragraphes = docPub.select("p");
        for (Element Paragraph : Paragraphes)
        {
            if (Paragraph.text().length() > 1)
                lstParagraphes.add(Paragraph.text());
        }
        if (Position < 0 || lstParagraphes.size() <= Position)
        {
            throw new IOException("Paragraph Position Does Not Exist");
        }
        return lstParagraphes.get(Position);
    }

    /**
     * @return the alt texts of all images
     */
    public List<String> L_ExtractImageALTtext()
    {
        List<String> lstImage = new ArrayList<>();
        Elements src = docPub.select("img[src]");
        for (Element el : src)
        {
            if (el.attr("alt").length() > 1)
                lstImage.add(el.attr("alt"));
        }
        return lstImage;
    }

    /**
     * @return the texts of all unordered lists
     */
    public List<String> L_ExtractListTexts()
    {
        List<String> lstUl = new ArrayList<>();
        Elements Ul = docPub.select("ul");
        for (Element e : Ul)
        {
            lstUl.add(e.text());
        }
        return lstUl;
    }
}
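A minimal usage sketch for GetHTMLFile; the local file path is an assumption:

    GetHTMLFile parser = new GetHTMLFile();
    parser.ExtractLocal("page.html");           // hypothetical saved HTML file
    System.out.println(parser.L_ExtractTitle());
    for (String heading : parser.L_ExtractHeadings())
        System.out.println(heading);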
--------------------------------------------------------------------------------
/test.txt:
--------------------------------------------------------------------------------
Dont censore the web ask the congress.
--------------------------------------------------------------------------------
/text.txt:
--------------------------------------------------------------------------------
{"searchinfo":{"search":"taylor swift"},"search":[{"id":"Q26876","concepturi":"http://www.wikidata.org/entity/Q26876","url":"//www.wikidata.org/wiki/Q26876","title":"Q26876","pageid":30291,"label":"Taylor Swift","description":"singer-songwriter from the United States","match":{"type":"label","language":"en","text":"Taylor Swift"}},{"id":"Q845783","concepturi":"http://www.wikidata.org/entity/Q845783","url":"//www.wikidata.org/wiki/Q845783","title":"Q845783","pageid":797842,"label":"Taylor Swift","description":"Eponymous debut studio album by Taylor Swift","match":{"type":"label","language":"en","text":"Taylor Swift"}},{"id":"Q276736","concepturi":"http://www.wikidata.org/entity/Q276736","url":"//www.wikidata.org/wiki/Q276736","title":"Q276736","pageid":267594,"label":"Taylor Swift discography","description":"discography","match":{"type":"label","language":"en","text":"Taylor Swift discography"}},{"id":"Q20734198","concepturi":"http://www.wikidata.org/entity/Q20734198","url":"//www.wikidata.org/wiki/Q20734198","title":"Q20734198","pageid":22479565,"label":"Taylor Swift breaks Vevo record","match":{"type":"label","language":"en","text":"Taylor Swift breaks Vevo record"}},{"id":"Q27076640","concepturi":"http://www.wikidata.org/entity/Q27076640","url":"//www.wikidata.org/wiki/Q27076640","title":"Q27076640","pageid":28932752,"label":"Taylor Swift videography","match":{"type":"label","language":"en","text":"Taylor Swift videography"}},{"id":"Q22814294","concepturi":"http://www.wikidata.org/entity/Q22814294","url":"//www.wikidata.org/wiki/Q22814294","title":"Q22814294","pageid":24835551,"label":"Taylor Swift's 1989 wins Grammy's Record of the year; Bad Blood wins the Best Music Video","match":{"type":"label","language":"en","text":"Taylor Swift's 1989 wins Grammy's Record of the year; Bad Blood wins the Best Music Video"}},{"id":"Q7690142","concepturi":"http://www.wikidata.org/entity/Q7690142","url":"//www.wikidata.org/wiki/Q7690142","title":"Q7690142","pageid":7615766,"label":"Taylor Swift and Def Leppard","match":{"type":"label","language":"en","text":"Taylor Swift and Def Leppard"}}],"success":1}
{"searchinfo":{"search":"toyota"},"search":[{"id":"Q53268","concepturi":"http://www.wikidata.org/entity/Q53268","url":"//www.wikidata.org/wiki/Q53268","title":"Q53268","pageid":55718,"label":"Toyota","description":"automotive brand manufacturer","match":{"type":"label","language":"en","text":"Toyota"}},{"id":"Q201117","concepturi":"http://www.wikidata.org/entity/Q201117","url":"//www.wikidata.org/wiki/Q201117","title":"Q201117","pageid":197802,"label":"Toyota","description":"city in Aichi Prefecture, Japan","match":{"type":"label","language":"en","text":"Toyota"}},{"id":"Q236526","concepturi":"http://www.wikidata.org/entity/Q236526","url":"//www.wikidata.org/wiki/Q236526","title":"Q236526","pageid":229982,"label":"Toyota","description":"Wikipedia disambiguation page","match":{"type":"label","language":"en","text":"Toyota"}},{"id":"Q17669963","concepturi":"http://www.wikidata.org/entity/Q17669963","url":"//www.wikidata.org/wiki/Q17669963","title":"Q17669963","pageid":19265719,"label":"Toyota","match":{"type":"label","language":"en","text":"Toyota"}},{"id":"Q22341651","concepturi":"http://www.wikidata.org/entity/Q22341651","url":"//www.wikidata.org/wiki/Q22341651","title":"Q22341651","pageid":24368174,"label":"Mathunny Mathews","description":"Indian businessperson","match":{"type":"alias","language":"en","text":"Toyota Sunny"},"aliases":["Toyota Sunny"]},{"id":"Q10700769","concepturi":"http://www.wikidata.org/entity/Q10700769","url":"//www.wikidata.org/wiki/Q10700769","title":"Q10700769","pageid":11976587,"label":"Toyotahallen","description":"Wikimedia disambiguation page","match":{"type":"label","language":"en","text":"Toyotahallen"}},{"id":"Q182473","concepturi":"http://www.wikidata.org/entity/Q182473","url":"//www.wikidata.org/wiki/Q182473","title":"Q182473","pageid":181568,"label":"Intercontinental Cup","description":"international association football tournament for clubs","match":{"type":"alias","language":"en","text":"Toyota Cup"},"aliases":["Toyota Cup"]}],"search-continue":7,"success":1}
Mathews","description":"Indian businessperson","match":{"type":"alias","language":"en","text":"Toyota Sunny"},"aliases":["Toyota Sunny"]},{"id":"Q10700769","concepturi":"http://www.wikidata.org/entity/Q10700769","url":"//www.wikidata.org/wiki/Q10700769","title":"Q10700769","pageid":11976587,"label":"Toyotahallen","description":"Wikimedia disambiguation page","match":{"type":"label","language":"en","text":"Toyotahallen"}},{"id":"Q182473","concepturi":"http://www.wikidata.org/entity/Q182473","url":"//www.wikidata.org/wiki/Q182473","title":"Q182473","pageid":181568,"label":"Intercontinental Cup","description":"international association football tournament for clubs","match":{"type":"alias","language":"en","text":"Toyota Cup"},"aliases":["Toyota Cup"]}],"search-continue":7,"success":1} 3 | {"searchinfo":{"search":"lionel messi"},"search":[{"id":"Q615","concepturi":"http://www.wikidata.org/entity/Q615","url":"//www.wikidata.org/wiki/Q615","title":"Q615","pageid":899,"label":"Lionel Messi","description":"Argentine footballer","match":{"type":"label","language":"en","text":"Lionel Messi"}},{"id":"Q16301083","concepturi":"http://www.wikidata.org/entity/Q16301083","url":"//www.wikidata.org/wiki/Q16301083","title":"Q16301083","pageid":17928471,"label":"Lionel Messi Nyamsi","description":"Cameroonian footballer","match":{"type":"label","language":"en","text":"Lionel Messi Nyamsi"}}],"success":1} 4 | --------------------------------------------------------------------------------