├── .classpath ├── .project ├── .settings └── org.eclipse.jdt.core.prefs ├── README.md ├── bin ├── edu │ └── smu │ │ ├── data │ │ ├── Alphabet.class │ │ ├── DataList.class │ │ ├── Instance.class │ │ ├── InstanceList.class │ │ ├── Lattice.class │ │ ├── Node.class │ │ ├── Sequence.class │ │ ├── SequenceList.class │ │ └── SparseVector.class │ │ └── util │ │ ├── DataFormat.class │ │ ├── FileUtil.class │ │ ├── MatrixOps.class │ │ ├── RemoveIllegalChar.class │ │ ├── SequeceFeaturePatternExtrator.class │ │ └── StringUtil.class └── tem │ ├── com │ ├── ComUtil$1.class │ ├── ComUtil.class │ ├── FileUtil$1.class │ ├── FileUtil$2.class │ ├── FileUtil.class │ ├── JC.class │ ├── MathUtil.class │ ├── MatrixUtil.class │ ├── POStags.class │ ├── Sorting.class │ ├── Stopwords.class │ ├── ValueComparator.class │ └── wordFreq.class │ ├── conf │ ├── ConstantConfig.class │ └── PathConfig.class │ ├── linkas │ ├── ID.class │ ├── PR.class │ ├── TEPR.class │ └── TSPR.class │ ├── main │ ├── Documents$Document.class │ ├── Documents.class │ ├── FGMM.class │ ├── LdaGibbsSampling$modelparameters.class │ ├── LdaGibbsSampling$parameters.class │ ├── LdaGibbsSampling.class │ ├── LdaModel$TwordsComparable.class │ ├── LdaModel.class │ ├── ModelComFunc.class │ ├── SimpleEvaluate$TwordsComparable.class │ ├── SimpleEvaluate.class │ ├── TEMModel$TwordsComparable.class │ ├── TEMModel.class │ ├── TEMModel1$TwordsComparable.class │ ├── TEMModel1.class │ ├── TEMModelSampling$modelparameters.class │ ├── TEMModelSampling$parameters.class │ ├── TEMModelSampling.class │ ├── TEMResPaperVisual.class │ ├── TEMResPro$Post.class │ ├── TEMResPro.class │ └── TEMResProUserRecMergeU.class │ ├── parser │ ├── NewString.class │ ├── Porter.class │ └── StanfordTokenizer.class │ ├── script │ ├── DBConnection.class │ ├── ExportExpCorpusFromDB.class │ ├── ExportGraphMatrix.class │ ├── ExportTagsFromDB.class │ ├── ExportTestDataForRank.class │ ├── HandleTagTest.class │ ├── JAMATest.class │ ├── MergeUser10.class │ ├── 
PageRank2.class │ ├── PageRankYL.class │ ├── SimilarQuestionPAexport.class │ ├── SortByValueDemo.class │ └── ValueComparator.class │ └── uqa │ ├── UQAModel$Pair.class │ ├── UQAModel.class │ ├── UQAModelRes.class │ └── UQAModelSampling.class └── src ├── edu └── smu │ ├── data │ ├── Alphabet.java │ ├── DataList.java │ ├── Instance.java │ ├── InstanceList.java │ ├── Lattice.java │ ├── Node.java │ ├── Sequence.java │ ├── SequenceList.java │ └── SparseVector.java │ └── util │ ├── DataFormat.java │ ├── FileUtil.java │ ├── MatrixOps.java │ ├── RemoveIllegalChar.java │ ├── SequeceFeaturePatternExtrator.java │ └── StringUtil.java └── tem ├── com ├── ComUtil.java ├── FileUtil.java ├── JC.java ├── MathUtil.java ├── MatrixUtil.java ├── POStags.java ├── Sorting.java ├── Stopwords.java ├── ValueComparator.java └── wordFreq.java ├── conf ├── ConstantConfig.java └── PathConfig.java ├── linkas ├── ID.java ├── PR.java ├── TEPR.java └── TSPR.java ├── main ├── Documents.java ├── FGMM.java ├── LdaGibbsSampling.java ├── LdaModel.java ├── ModelComFunc.java ├── SimpleEvaluate.java ├── TEMModel.java ├── TEMModel1.java ├── TEMModelSampling.java ├── TEMResPaperVisual.java ├── TEMResPro.java └── TEMResProUserRecMergeU.java ├── parser ├── Porter.java └── StanfordTokenizer.java ├── script ├── DBConnection.java ├── ExportExpCorpusFromDB.java ├── ExportGraphMatrix.java ├── ExportTagsFromDB.java ├── ExportTestDataForRank.java ├── HandleTagTest.java ├── JAMATest.java ├── MergeUser10.java ├── PageRank2.java ├── PageRankYL.java ├── SimilarQuestionPAexport.java ├── SortByValueDemo.java └── ValueComparator.java └── uqa ├── UQAModel.java ├── UQAModelRes.java └── UQAModelSampling.java /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | 
NLPTEM 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.6 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.6 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | TopicExpertiseModel 2 | =================== 3 | 4 | /** 5 | Copyright (C) 2013 by 6 | SMU Text Mining Group/Singapore Management University/Peking University 7 | 8 | TopicExpertiseModel is distributed for research purpose, but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | 12 | If you use this code, please cite the following paper: 13 | 14 | Liu Yang, Minghui Qiu, Swapna Gottipati, Feida Zhu, Jing Jiang, Huiping Sun and Zhong Chen. CQARank: Jointly Model Topics and Expertise in Community Question Answering. In Proceedings of the 22nd ACM International Conference on Information and Knowledge Management (CIKM 2013). 
(http://dl.acm.org/citation.cfm?id=2505720) 15 | 16 | Feel free to contact the following people if you find any 17 | problems in the package. 18 | lyang@cs.umass.edu or yangliuyx@gmail.com * */ 19 | 20 | Brief Introduction 21 | =================== 22 | 23 | 1. Community Question Answering (CQA) websites, where people share expertise on open platforms, have become large repositories of valuable knowledge. To bring the best value out of these knowledge repositories, it is critically important for CQA services to know how to find the right experts, retrieve archived similar questions and recommend best answers to new questions. To tackle this cluster of closely related problems in a principled way, we proposed the Topic Expertise Model (TEM), a novel probabilistic generative model with GMM hybrid, to jointly model topics and expertise by integrating a textual content model with link structure analysis. Based on TEM results, we proposed CQARank to measure user interests and expertise score under different topics. Leveraging the question answering history based on long-term community reviews and voting, our method could find experts with both similar topical preference and high topical expertise. 24 | 25 | 2. This package implements Gibbs sampling for the Topic Expertise Model for jointly modeling topics and expertise in question answering communities. More details of our model are described in the following paper: 26 | 27 | Liu Yang, Minghui Qiu, Swapna Gottipati, Feida Zhu, Jing Jiang, Huiping Sun and Zhong Chen. CQARank: Jointly Model Topics and Expertise in Community Question Answering. In Proceedings of the 22nd ACM International Conference on Information and Knowledge Management (CIKM 2013). (http://dl.acm.org/citation.cfm?id=2505720) 28 | 29 | 3. I didn't upload the data under the ./data folder since the total size is too large. However, I have uploaded some of the experimental data we used to a Dropbox folder. You can find the experimental data here. 
[Download](https://www.dropbox.com/sh/42vei96g0vf56dy/AAATUsvDMq7uXkkPsDF87K5pa?dl=0). 30 | 31 | 4. I am happy that many readers of our CIKM'13 paper have emailed me with questions about the paper and code since the paper was published. I am always trying my best to reply to those emails. My latest email address is lyang@cs.umass.edu / yangliuyx@gmail.com. I encourage you to use the "Issues" function in GitHub (https://github.com/yangliuy/TopicExpertiseModel/issues) so that there are QA threads which can be referred to by future readers. 32 | -------------------------------------------------------------------------------- /bin/edu/smu/data/Alphabet.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/Alphabet.class -------------------------------------------------------------------------------- /bin/edu/smu/data/DataList.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/DataList.class -------------------------------------------------------------------------------- /bin/edu/smu/data/Instance.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/Instance.class -------------------------------------------------------------------------------- /bin/edu/smu/data/InstanceList.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/InstanceList.class -------------------------------------------------------------------------------- /bin/edu/smu/data/Lattice.class: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/Lattice.class -------------------------------------------------------------------------------- /bin/edu/smu/data/Node.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/Node.class -------------------------------------------------------------------------------- /bin/edu/smu/data/Sequence.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/Sequence.class -------------------------------------------------------------------------------- /bin/edu/smu/data/SequenceList.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/SequenceList.class -------------------------------------------------------------------------------- /bin/edu/smu/data/SparseVector.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/SparseVector.class -------------------------------------------------------------------------------- /bin/edu/smu/util/DataFormat.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/DataFormat.class -------------------------------------------------------------------------------- 
/bin/edu/smu/util/FileUtil.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/FileUtil.class -------------------------------------------------------------------------------- /bin/edu/smu/util/MatrixOps.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/MatrixOps.class -------------------------------------------------------------------------------- /bin/edu/smu/util/RemoveIllegalChar.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/RemoveIllegalChar.class -------------------------------------------------------------------------------- /bin/edu/smu/util/SequeceFeaturePatternExtrator.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/SequeceFeaturePatternExtrator.class -------------------------------------------------------------------------------- /bin/edu/smu/util/StringUtil.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/StringUtil.class -------------------------------------------------------------------------------- /bin/tem/com/ComUtil$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/ComUtil$1.class 
-------------------------------------------------------------------------------- /bin/tem/com/ComUtil.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/ComUtil.class -------------------------------------------------------------------------------- /bin/tem/com/FileUtil$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/FileUtil$1.class -------------------------------------------------------------------------------- /bin/tem/com/FileUtil$2.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/FileUtil$2.class -------------------------------------------------------------------------------- /bin/tem/com/FileUtil.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/FileUtil.class -------------------------------------------------------------------------------- /bin/tem/com/JC.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/JC.class -------------------------------------------------------------------------------- /bin/tem/com/MathUtil.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/MathUtil.class 
-------------------------------------------------------------------------------- /bin/tem/com/MatrixUtil.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/MatrixUtil.class -------------------------------------------------------------------------------- /bin/tem/com/POStags.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/POStags.class -------------------------------------------------------------------------------- /bin/tem/com/Sorting.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/Sorting.class -------------------------------------------------------------------------------- /bin/tem/com/Stopwords.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/Stopwords.class -------------------------------------------------------------------------------- /bin/tem/com/ValueComparator.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/ValueComparator.class -------------------------------------------------------------------------------- /bin/tem/com/wordFreq.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/wordFreq.class 
-------------------------------------------------------------------------------- /bin/tem/conf/ConstantConfig.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/conf/ConstantConfig.class -------------------------------------------------------------------------------- /bin/tem/conf/PathConfig.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/conf/PathConfig.class -------------------------------------------------------------------------------- /bin/tem/linkas/ID.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/linkas/ID.class -------------------------------------------------------------------------------- /bin/tem/linkas/PR.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/linkas/PR.class -------------------------------------------------------------------------------- /bin/tem/linkas/TEPR.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/linkas/TEPR.class -------------------------------------------------------------------------------- /bin/tem/linkas/TSPR.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/linkas/TSPR.class 
-------------------------------------------------------------------------------- /bin/tem/main/Documents$Document.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/Documents$Document.class -------------------------------------------------------------------------------- /bin/tem/main/Documents.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/Documents.class -------------------------------------------------------------------------------- /bin/tem/main/FGMM.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/FGMM.class -------------------------------------------------------------------------------- /bin/tem/main/LdaGibbsSampling$modelparameters.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/LdaGibbsSampling$modelparameters.class -------------------------------------------------------------------------------- /bin/tem/main/LdaGibbsSampling$parameters.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/LdaGibbsSampling$parameters.class -------------------------------------------------------------------------------- /bin/tem/main/LdaGibbsSampling.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/LdaGibbsSampling.class -------------------------------------------------------------------------------- /bin/tem/main/LdaModel$TwordsComparable.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/LdaModel$TwordsComparable.class -------------------------------------------------------------------------------- /bin/tem/main/LdaModel.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/LdaModel.class -------------------------------------------------------------------------------- /bin/tem/main/ModelComFunc.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/ModelComFunc.class -------------------------------------------------------------------------------- /bin/tem/main/SimpleEvaluate$TwordsComparable.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/SimpleEvaluate$TwordsComparable.class -------------------------------------------------------------------------------- /bin/tem/main/SimpleEvaluate.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/SimpleEvaluate.class -------------------------------------------------------------------------------- /bin/tem/main/TEMModel$TwordsComparable.class: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModel$TwordsComparable.class -------------------------------------------------------------------------------- /bin/tem/main/TEMModel.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModel.class -------------------------------------------------------------------------------- /bin/tem/main/TEMModel1$TwordsComparable.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModel1$TwordsComparable.class -------------------------------------------------------------------------------- /bin/tem/main/TEMModel1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModel1.class -------------------------------------------------------------------------------- /bin/tem/main/TEMModelSampling$modelparameters.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModelSampling$modelparameters.class -------------------------------------------------------------------------------- /bin/tem/main/TEMModelSampling$parameters.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModelSampling$parameters.class 
-------------------------------------------------------------------------------- /bin/tem/main/TEMModelSampling.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModelSampling.class -------------------------------------------------------------------------------- /bin/tem/main/TEMResPaperVisual.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMResPaperVisual.class -------------------------------------------------------------------------------- /bin/tem/main/TEMResPro$Post.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMResPro$Post.class -------------------------------------------------------------------------------- /bin/tem/main/TEMResPro.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMResPro.class -------------------------------------------------------------------------------- /bin/tem/main/TEMResProUserRecMergeU.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMResProUserRecMergeU.class -------------------------------------------------------------------------------- /bin/tem/parser/NewString.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/parser/NewString.class -------------------------------------------------------------------------------- /bin/tem/parser/Porter.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/parser/Porter.class -------------------------------------------------------------------------------- /bin/tem/parser/StanfordTokenizer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/parser/StanfordTokenizer.class -------------------------------------------------------------------------------- /bin/tem/script/DBConnection.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/DBConnection.class -------------------------------------------------------------------------------- /bin/tem/script/ExportExpCorpusFromDB.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/ExportExpCorpusFromDB.class -------------------------------------------------------------------------------- /bin/tem/script/ExportGraphMatrix.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/ExportGraphMatrix.class -------------------------------------------------------------------------------- /bin/tem/script/ExportTagsFromDB.class: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/ExportTagsFromDB.class -------------------------------------------------------------------------------- /bin/tem/script/ExportTestDataForRank.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/ExportTestDataForRank.class -------------------------------------------------------------------------------- /bin/tem/script/HandleTagTest.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/HandleTagTest.class -------------------------------------------------------------------------------- /bin/tem/script/JAMATest.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/JAMATest.class -------------------------------------------------------------------------------- /bin/tem/script/MergeUser10.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/MergeUser10.class -------------------------------------------------------------------------------- /bin/tem/script/PageRank2.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/PageRank2.class -------------------------------------------------------------------------------- 
/bin/tem/script/PageRankYL.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/PageRankYL.class -------------------------------------------------------------------------------- /bin/tem/script/SimilarQuestionPAexport.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/SimilarQuestionPAexport.class -------------------------------------------------------------------------------- /bin/tem/script/SortByValueDemo.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/SortByValueDemo.class -------------------------------------------------------------------------------- /bin/tem/script/ValueComparator.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/ValueComparator.class -------------------------------------------------------------------------------- /bin/tem/uqa/UQAModel$Pair.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/uqa/UQAModel$Pair.class -------------------------------------------------------------------------------- /bin/tem/uqa/UQAModel.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/uqa/UQAModel.class 
/**
 * An Alphabet stores the mapping between symbols (Strings) and integer
 * indices. It can be used, for example, to map feature strings to feature
 * indices or class labels to class indices.
 *
 * <p>A symbol can never be removed once it has been added. Indices are handed
 * out sequentially starting from 0, in order of first insertion. Adding
 * {@code "a", "b", "z", "a", "c"} yields the mapping
 * {@code a -> 0, b -> 1, z -> 2, c -> 3}.
 */
public class Alphabet {

    /** Maps every stored symbol to its index. */
    private HashMap<String, Integer> indices;

    /** Stored symbols; the element at position i is the symbol with index i. */
    private ArrayList<String> symbols;

    /**
     * Constructs a new Alphabet object with no symbol stored.
     */
    public Alphabet() {
        indices = new HashMap<String, Integer>();
        symbols = new ArrayList<String>();
    }

    /**
     * Constructs an Alphabet pre-filled with the given symbols, in array order.
     *
     * @param symbols the symbols to add; duplicates and null entries are ignored
     */
    public Alphabet(String[] symbols) {
        indices = new HashMap<String, Integer>();
        this.symbols = new ArrayList<String>();
        addSymbols(symbols);
    }

    /**
     * Adds a symbol and returns the index assigned to it. If the symbol is
     * already stored, no new index is assigned and the existing one is returned.
     *
     * @param sym the symbol to add
     * @return the index of the symbol, or -1 if {@code sym} is null
     */
    public int addSymbol(String sym) {
        if (sym == null) {
            return -1;
        }
        Integer known = indices.get(sym);
        if (known != null) {
            return known.intValue();
        }
        int index = indices.size();
        indices.put(sym, Integer.valueOf(index));
        symbols.add(sym);
        return index;
    }

    /**
     * Returns the index associated with the symbol.
     *
     * @param sym the symbol to look up
     * @return the index of the symbol, or -1 if it is not stored in the Alphabet
     */
    public int getIndex(String sym) {
        Integer known = indices.get(sym);
        return known == null ? -1 : known.intValue();
    }

    /**
     * Returns the symbol at the given index position.
     *
     * @param index the index position to look up
     * @return the symbol at that position, or null if the index is out of range
     *         (index < 0 || index >= size())
     */
    public String getSymbol(int index) {
        if (index >= 0 && index < symbols.size()) {
            return symbols.get(index);
        }
        return null;
    }

    /**
     * Returns the number of symbols stored in the Alphabet.
     *
     * @return the size of this Alphabet
     */
    public int size() {
        return indices.size();
    }

    /**
     * Adds an array of symbols to this Alphabet, in array order. An empty
     * array is a no-op.
     *
     * @param symbols the symbols to add
     */
    public void addSymbols(String[] symbols) {
        for (int i = 0; i < symbols.length; i++) {
            addSymbol(symbols[i]);
        }
    }

    /**
     * Prints the stored indices (in map iteration order), then the symbol
     * count in brackets, then all symbols in index order, to standard output.
     */
    public void display() {
        for (Integer index : indices.values()) {
            System.out.print(index + " ");
        }
        System.out.println("\n" + "[" + symbols.size() + "]");
        for (int i = 0; i < symbols.size(); i++) {
            System.out.print(symbols.get(i) + " ");
        }
    }

    /**
     * Writes the vocabulary to a file, one {@code "symbol index"} pair per
     * line, in index order.
     *
     * @param file path of the file to write
     * @throws IOException if the file cannot be created or written
     */
    public void saveVocab(String file) throws IOException {
        BufferedWriter out = new BufferedWriter(new FileWriter(new File(file)));
        try {
            // Index order makes the output deterministic, unlike HashMap iteration order.
            for (int id = 0; id < symbols.size(); id++) {
                out.write(symbols.get(id) + " " + id + "\n");
            }
            out.flush();
        } finally {
            // Close even when a write fails so the file handle is not leaked.
            out.close();
        }
    }
}
16 | length = dataSet.size(); 17 | iter = dataSet.iterator(); 18 | } 19 | 20 | public DataList( ArrayList dataSet){ 21 | this.dataSet = new ArrayList(dataSet); 22 | length = dataSet.size(); 23 | iter = dataSet.iterator(); 24 | } 25 | public int size(){ 26 | return length; 27 | } 28 | public Iterator iterator() { 29 | return iter; 30 | } 31 | 32 | public T get(int index){ 33 | assert(index < length && index >= 0); 34 | return dataSet.get(index); 35 | } 36 | 37 | public DataList[] split (double[] proportions) { 38 | return split (new java.util.Random(System.currentTimeMillis()), proportions); 39 | } 40 | 41 | public DataList deepClone () { 42 | DataList ret = new DataList( dataSet ); 43 | return ret; 44 | } 45 | public void shuffle (java.util.Random r) { 46 | Collections.shuffle (dataSet, r); 47 | } 48 | /** 49 | * Randomly permute the specified InstanceList using the specified source of randomness. And then split it into a array of InstanceList 50 | * @param r 51 | * @param proportions 52 | * @return 53 | */ 54 | public DataList[] split (java.util.Random r, double[] proportions) { 55 | DataList shuffled = this.deepClone(); 56 | shuffled.shuffle (r); 57 | return shuffled.splitInOrder(proportions); 58 | } 59 | /** 60 | * 61 | * @param A array of proportions to divide the whole instance list 62 | * @return A array of InstanceList 63 | */ 64 | public DataList[] splitInOrder (double[] proportions) { 65 | DataList[] ret = new DataList[proportions.length]; 66 | double maxind[] = proportions.clone(); 67 | MatrixOps.normalize(maxind); 68 | for (int i = 0; i < maxind.length; i++) { 69 | ret[i] = new DataList(); 70 | if (i > 0) 71 | maxind[i] += maxind[i-1]; 72 | } 73 | for (int i = 0; i < maxind.length; i++) { 74 | // Fill maxind[] with the highest instance index to go in each corresponding returned InstanceList 75 | maxind[i] = Math.rint (maxind[i] * this.size()); 76 | } 77 | for (int i = 0, j = 0; i < size(); i++) { 78 | // This gives a slight bias toward putting an extra 
instance in the last InstanceList. 79 | while (i >= maxind[j] && j < ret.length) 80 | j++; 81 | ret[j].add(dataSet.get(i)); 82 | } 83 | return ret; 84 | } 85 | /** 86 | * Add an instance to current list 87 | * @param an instance to be added in the instance list 88 | */ 89 | public void add(T instance) { 90 | assert(instance != null); 91 | dataSet.add(instance); 92 | length = dataSet.size(); 93 | } 94 | 95 | //Iterable 96 | protected Iterator iter; 97 | //Storing the instance lists 98 | protected ArrayList dataSet; 99 | //The size of dataset 100 | protected int length; 101 | protected Alphabet labelSet; 102 | protected Alphabet featSet; 103 | } 104 | -------------------------------------------------------------------------------- /src/edu/smu/data/Instance.java: -------------------------------------------------------------------------------- 1 | package edu.smu.data; 2 | 3 | import java.util.*; 4 | import java.io.*; 5 | 6 | /** 7 | * An Instance object stores a sparse vector that represents an observation 8 | * together with a label for this observation. 
9 | */ 10 | public class Instance { 11 | 12 | public Instance(SparseVector featVec, int label) { 13 | this.featVec = featVec; 14 | this.label = label; 15 | id = "UNKNOWN"; 16 | predictLabel = -1; 17 | } 18 | 19 | public Instance(SparseVector featVec, int label, String id) { 20 | this.featVec = featVec; 21 | this.label = label; 22 | this.id = id; 23 | predictLabel = -1; 24 | } 25 | 26 | public void setFeaVector( SparseVector featVec ){ 27 | this.featVec = featVec; 28 | } 29 | 30 | public void setLabel(int label){ 31 | this.label = label; 32 | } 33 | 34 | 35 | public SparseVector getFeatureVector() { 36 | return featVec; 37 | } 38 | 39 | public int getLabel() { 40 | return label; 41 | } 42 | 43 | public int getPredictLabel(){ 44 | return predictLabel; 45 | } 46 | 47 | public void setPredictLabel(int l){ 48 | predictLabel = l; 49 | } 50 | 51 | public String getID() { 52 | return id; 53 | } 54 | 55 | public void display() { 56 | System.out.println("--------------------------------------------"); 57 | System.out.println("Id=" + id ); 58 | System.out.println("Label="+ label ); 59 | featVec.display(); 60 | System.out.println("--------------------------------------------"); 61 | } 62 | // The feature vector that represents this Instance object. 63 | protected SparseVector featVec; 64 | 65 | // The class label of this Instance object. The label ranges from 0 to (C-1) 66 | // where C is the total number of classes. If label is set to -1, it means 67 | // this Instance in unlabeled. 68 | protected int label; 69 | 70 | protected int predictLabel; 71 | // A String that can be used to identify this Instance if needed. E.g. if the 72 | // Instance object is a document, the id can be the document ID. It is not 73 | // necessary to set this id. 
74 | protected String id; 75 | 76 | } -------------------------------------------------------------------------------- /src/edu/smu/data/InstanceList.java: -------------------------------------------------------------------------------- 1 | package edu.smu.data; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.Iterator; 6 | 7 | import edu.smu.util.MatrixOps; 8 | /** 9 | * A class manipulates the collection of instances 10 | */ 11 | public class InstanceList implements Iterable { 12 | 13 | public InstanceList(){ 14 | this.dataSet = new ArrayList(); 15 | length = dataSet.size(); 16 | iter = dataSet.iterator(); 17 | } 18 | 19 | public InstanceList( ArrayList dataSet){ 20 | this.dataSet = new ArrayList(dataSet); 21 | length = dataSet.size(); 22 | iter = dataSet.iterator(); 23 | } 24 | public int size(){ 25 | return length; 26 | } 27 | public Iterator iterator() { 28 | return iter; 29 | } 30 | 31 | public Instance get(int index){ 32 | assert(index < length && index >= 0); 33 | return dataSet.get(index); 34 | } 35 | 36 | public InstanceList[] split (double[] proportions) { 37 | return split (new java.util.Random(System.currentTimeMillis()), proportions); 38 | } 39 | 40 | public InstanceList deepClone () { 41 | InstanceList ret = new InstanceList( dataSet ); 42 | return ret; 43 | } 44 | public void shuffle (java.util.Random r) { 45 | Collections.shuffle (dataSet, r); 46 | } 47 | /** 48 | * Randomly permute the specified InstanceList using the specified source of randomness. 
And then split it into a array of InstanceList 49 | * @param r 50 | * @param proportions 51 | * @return 52 | */ 53 | public InstanceList[] split (java.util.Random r, double[] proportions) { 54 | InstanceList shuffled = this.deepClone(); 55 | shuffled.shuffle (r); 56 | return shuffled.splitInOrder(proportions); 57 | } 58 | /** 59 | * 60 | * @param A array of proportions to divide the whole instance list 61 | * @return A array of InstanceList 62 | */ 63 | public InstanceList[] splitInOrder (double[] proportions) { 64 | InstanceList[] ret = new InstanceList[proportions.length]; 65 | double maxind[] = proportions.clone(); 66 | MatrixOps.normalize(maxind); 67 | for (int i = 0; i < maxind.length; i++) { 68 | ret[i] = new InstanceList(); 69 | if (i > 0) 70 | maxind[i] += maxind[i-1]; 71 | } 72 | for (int i = 0; i < maxind.length; i++) { 73 | // Fill maxind[] with the highest instance index to go in each corresponding returned InstanceList 74 | maxind[i] = Math.rint (maxind[i] * this.size()); 75 | } 76 | for (int i = 0, j = 0; i < size(); i++) { 77 | // This gives a slight bias toward putting an extra instance in the last InstanceList. 
78 | while (i >= maxind[j] && j < ret.length) 79 | j++; 80 | ret[j].add(dataSet.get(i)); 81 | } 82 | return ret; 83 | } 84 | /** 85 | * Add an instance to current list 86 | * @param an instance to be added in the instance list 87 | */ 88 | public void add(Instance instance) { 89 | assert(instance != null); 90 | dataSet.add(instance); 91 | length = dataSet.size(); 92 | } 93 | public void display(){ 94 | System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"); 95 | for(int i = 0; i < length; i++ ){ 96 | dataSet.get(i).display(); 97 | } 98 | System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"); 99 | } 100 | //Iterable 101 | private Iterator iter; 102 | //Storing the instance lists 103 | private ArrayList dataSet; 104 | //The size of dataset 105 | private int length; 106 | } 107 | -------------------------------------------------------------------------------- /src/edu/smu/data/Lattice.java: -------------------------------------------------------------------------------- 1 | package edu.smu.data; 2 | //To make the program run fast, we don't use this class right now! 
3 | public class Lattice { 4 | public Lattice(int row, int col, int numLabels, Node[][] net, double[][] cost){ 5 | this.row = row; 6 | this.col = col; 7 | this.numLabels = numLabels; 8 | this.net = net; 9 | this.cost = cost; 10 | } 11 | public double getCostOf(int x, int y){ 12 | return cost[x][y]; 13 | } 14 | public void setBestLabel(int x, int y, int l ){ 15 | net[x][y].setLabel(l); 16 | } 17 | public int getPrevNodeOf(int x, int y){ 18 | return net[x][y].getPrevNode(); 19 | } 20 | public void readPrevProbsOf(int x, int y, double[] probs){ 21 | net[x][y].getPrevProbs(probs); 22 | } 23 | public void setCurProbsOf(int x, int y, double[] probs){ 24 | net[x][y].setCurProbs(probs); 25 | } 26 | public int getRow(){ 27 | return row; 28 | } 29 | public int getCol(){ 30 | return col; 31 | } 32 | public int getNumFeatures(){ 33 | return numLabels; 34 | } 35 | private double[][] cost; 36 | private int row; 37 | private int col; 38 | private int numLabels; 39 | private Node[][] net; 40 | } 41 | -------------------------------------------------------------------------------- /src/edu/smu/data/Node.java: -------------------------------------------------------------------------------- 1 | package edu.smu.data; 2 | //To make the program run fast, we don't use this class right now! 
3 | import java.util.Vector; 4 | 5 | import edu.smu.util.MatrixOps; 6 | 7 | public class Node { 8 | public Node(int numLabels){ 9 | this.numLabels = numLabels; 10 | prevProbs = new double[numLabels]; 11 | curProbs = new double[numLabels]; 12 | MatrixOps.setAll(prevProbs, 0); 13 | MatrixOps.setAll(curProbs, 0); 14 | bestLabel = -1; 15 | } 16 | public Node(int numLabels, double[] prevProbs){ 17 | this.numLabels = numLabels; 18 | this.prevProbs = new double[numLabels]; 19 | this.curProbs = new double[numLabels]; 20 | MatrixOps.set(this.prevProbs, prevProbs); 21 | MatrixOps.setAll(curProbs, 0); 22 | bestLabel = -1; 23 | } 24 | public void getPrevProbs(double[] probs){ 25 | MatrixOps.set(probs, prevProbs); 26 | } 27 | public void getCurProbs(double[] probs){ 28 | MatrixOps.set(probs, curProbs); 29 | } 30 | public void setCurProbs(double[] probs){ 31 | MatrixOps.set(curProbs, probs); 32 | } 33 | public int getBestLabel(){ 34 | return bestLabel; 35 | } 36 | public void setLabel(int label){ 37 | bestLabel = label; 38 | } 39 | public int getPrevNode(){ 40 | return prevNode; 41 | } 42 | private int numLabels; 43 | private int prevNode; 44 | private int bestLabel; 45 | double[] prevProbs; 46 | double[] curProbs; 47 | } 48 | -------------------------------------------------------------------------------- /src/edu/smu/data/Sequence.java: -------------------------------------------------------------------------------- 1 | package edu.smu.data; 2 | 3 | 4 | import java.util.ArrayList; 5 | import java.util.Collections; 6 | import java.util.Iterator; 7 | 8 | 9 | /** 10 | * A class manipulates the sequence 11 | */ 12 | public class Sequence implements Iterable , Comparable { 13 | 14 | public Sequence(){ 15 | this.dataSet = new ArrayList(); 16 | length = dataSet.size(); 17 | iter = dataSet.iterator(); 18 | } 19 | 20 | public Sequence( ArrayList dataSet){ 21 | this.dataSet = new ArrayList(dataSet); 22 | iter = dataSet.iterator(); 23 | } 24 | 25 | public Instance getInstance(int idx){ 
26 | assert(idx >= 0 && idx < length); 27 | return dataSet.get(idx); 28 | } 29 | 30 | public void addInstance(Instance inst){ 31 | dataSet.add(inst); 32 | length = dataSet.size(); 33 | } 34 | public int size(){ 35 | return length;// = dataSet.size(); 36 | } 37 | public Iterator iterator() { 38 | return iter; 39 | } 40 | 41 | public Instance get(int index){ 42 | assert(index < length && index >= 0); 43 | return dataSet.get(index); 44 | } 45 | 46 | public InstanceList deepClone () { 47 | InstanceList ret = new InstanceList( dataSet ); 48 | return ret; 49 | } 50 | 51 | public void display(){ 52 | //System.out.println(dataSet.size()); 53 | System.out.println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"); 54 | for(int i = 0; i < length; i++ ){ 55 | dataSet.get(i).display(); 56 | if( i != length - 1) 57 | System.out.println("=>"); 58 | } 59 | System.out.println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"); 60 | } 61 | public int compareTo(Object o) { 62 | Sequence B = (Sequence)o; 63 | double bp = B.getProb(); 64 | if( prob < bp ) 65 | return 1; 66 | else if( prob > bp ){ 67 | return -1; 68 | } else if( prob == bp ){ 69 | return 0; 70 | } 71 | return 0; 72 | } 73 | public double getProb(){ 74 | return prob; 75 | } 76 | public void setProb(double prob){ 77 | this.prob = prob; 78 | } 79 | public boolean isLabelIn(int label){ 80 | for(int i = 0; i < dataSet.size(); i++ ){ 81 | Instance inst = dataSet.get(i); 82 | int id = inst.getPredictLabel(); 83 | if( id == label ){ 84 | return true; 85 | } 86 | } 87 | return false; 88 | } 89 | // 90 | private double prob = -1.0; 91 | //Iterable 92 | private Iterator iter; 93 | //Storing the instance lists 94 | private ArrayList dataSet; 95 | //The size of dataset 96 | private int length; 97 | } 98 | -------------------------------------------------------------------------------- /src/edu/smu/data/SequenceList.java: -------------------------------------------------------------------------------- 1 | package 
edu.smu.data; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Iterator; 5 | 6 | public class SequenceList { 7 | 8 | public SequenceList(Alphabet labelSet){ 9 | this.labelSet = labelSet; 10 | 11 | dataSet = new ArrayList(); 12 | iter = dataSet.iterator(); 13 | } 14 | 15 | public SequenceList(Alphabet labelSet, ArrayList arr ){ 16 | //System.out.println(arr.size()); 17 | this.labelSet = labelSet; 18 | this.dataSet = new ArrayList(arr); 19 | iter = dataSet.iterator(); 20 | //System.out.println("->" + dataSet.size()); 21 | length = this.dataSet.size(); 22 | } 23 | 24 | public void addSequence(Sequence seq){ 25 | dataSet.add(seq); 26 | length = dataSet.size(); 27 | } 28 | public int size() { 29 | return length; 30 | } 31 | 32 | public Sequence getSequence(int idx){ 33 | assert(idx >= 0 && idx < length); 34 | return dataSet.get(idx); 35 | } 36 | 37 | public void getArrayList(ArrayList arrSeq){ 38 | for(int i = 0; i < length; i++ ){ 39 | arrSeq.add(dataSet.get(i)); 40 | } 41 | } 42 | public void getInstanceList(ArrayList arrSeq){ 43 | for(int i = 0; i < length; i++ ){ 44 | Sequence seq = dataSet.get(i); 45 | for(int j = 0; j < seq.size(); j++ ){ 46 | arrSeq.add(seq.get(j)); 47 | } 48 | } 49 | } 50 | /*public SequenceList deepClone () { 51 | ArrayList ret = new ArrayList( dataSet ); 52 | return ret; 53 | }*/ 54 | public Iterator iterator() { 55 | return iter; 56 | } 57 | public InstanceList[] splitByPreviousLabel(){ 58 | //Iterator iterSequence = this.iterator(); 59 | //while( iterSequence.hasNext() ){ 60 | //Sequence seq = iterSequence.next(); 61 | /*Iterator iterInst = seq.iterator(); 62 | int prev = -1; 63 | while( iterInst.hasNext() ){ 64 | Instance inst = iterInst.next(); 65 | if( prev != -1 ){ 66 | instList[prev].add(inst); 67 | } 68 | prev = inst.getLabel(); 69 | }*/ 70 | instList = new InstanceList[labelSet.size()]; 71 | for(int i = 0; i < labelSet.size(); i++){ 72 | instList[i] = new InstanceList(); 73 | } 74 | for(int s = 0; s < length; s++ ){ 75 | 
Sequence seq = dataSet.get(s); 76 | //dataSet.r 77 | int prev = -1; 78 | for(int i = 0; i < seq.size(); i++ ){ 79 | Instance inst = seq.get(i); 80 | if( prev != -1 ){ 81 | //System.out.println("prev=" + prev); 82 | instList[prev].add(inst); 83 | } 84 | prev = inst.getLabel(); 85 | } 86 | } 87 | //System.out.println( labelSet.size() ); 88 | return instList; 89 | } 90 | 91 | public void display(){ 92 | System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"); 93 | //System.out.println(dataSet.size()); 94 | for(int i = 0; i < dataSet.size(); i++ ){ 95 | dataSet.get(i).display(); 96 | } 97 | System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"); 98 | } 99 | protected Iterator iter; 100 | //Storing the instance lists 101 | protected ArrayList dataSet; 102 | //The size of dataset 103 | protected int length; 104 | protected Alphabet labelSet; 105 | protected Alphabet featSet; 106 | private InstanceList[] instList; 107 | } 108 | -------------------------------------------------------------------------------- /src/edu/smu/data/SparseVector.java: -------------------------------------------------------------------------------- 1 | package edu.smu.data; 2 | 3 | import java.util.*; 4 | import java.io.*; 5 | 6 | /** 7 | * A SparseVector object represents a sparse vector. It stores the indices of 8 | * the features that have non-zero values and their corresponding feature 9 | * values. 10 | */ 11 | 12 | // To be completed! 13 | 14 | public class SparseVector { 15 | 16 | // The constructor can be changed to take in different types of parameters. 17 | /** 18 | * Note that right now SparseVector doesn't support add or remove operations which will be time-consuming in current position. 
/**
 * A SparseVector stores the indices of the features that have non-zero values
 * together with those values. It is fixed once constructed: entries cannot be
 * added or removed afterwards.
 */
public class SparseVector {

    /** Feature indices of the stored entries, in input order; never null. */
    private int[] indices;
    /** Feature values of the stored entries, parallel to {@link #indices}; never null. */
    private double[] values;
    /** Number of stored entries. */
    private int length;

    /**
     * Builds a vector from parallel index/value arrays. Entries whose value
     * is exactly 0.0 are dropped; the remaining entries keep their input
     * order. The input arrays are copied, not retained.
     *
     * @param indices feature indices
     * @param values feature values, expected to have the same length as {@code indices}
     */
    public SparseVector(int[] indices, double[] values) {
        assert (indices.length == values.length);

        int[] keptIndices = new int[indices.length];
        double[] keptValues = new double[indices.length];
        int cnt = 0;
        for (int i = 0; i < indices.length; i++) {
            if (values[i] != 0.0) {
                keptIndices[cnt] = indices[i];
                keptValues[cnt] = values[i];
                cnt++;
            }
        }
        // Trim to the exact entry count so no unused slot can be read by mistake.
        this.indices = Arrays.copyOf(keptIndices, cnt);
        this.values = Arrays.copyOf(keptValues, cnt);
        this.length = cnt;
    }

    /**
     * Returns the number of entries (non-zero features) stored in this
     * SparseVector.
     *
     * @return the number of entries
     */
    public int numEntries() {
        return length;
    }

    /**
     * Returns the feature index of the i'th stored entry. For a vector holding
     * the entries (2, 1.5), (5, 0.5), (9, 1.0), {@code getFeatureIndexAt(0)}
     * returns 2 and {@code getFeatureIndexAt(2)} returns 9.
     *
     * @param i position of the entry, expected in {@code [0, numEntries())}
     * @return the feature index stored in that entry
     */
    public int getFeatureIndexAt(int i) {
        assert (i >= 0 && i < length);
        return indices[i];
    }

    /**
     * Returns the feature value of the i'th stored entry. For a vector holding
     * the entries (2, 1.5), (5, 0.5), (9, 1.0), {@code getFeatureValueAt(0)}
     * returns 1.5 and {@code getFeatureValueAt(2)} returns 1.0.
     *
     * @param i position of the entry, expected in {@code [0, numEntries())}
     * @return the feature value stored in that entry
     */
    public double getFeatureValueAt(int i) {
        assert (i >= 0 && i < length);
        return values[i];
    }

    /** @return the number of stored entries; same as {@link #numEntries()} */
    public int size() {
        return length;
    }

    /**
     * Prints the entries as {@code (index,value)} pairs separated by " , " and
     * ends the line. Prints nothing for an empty vector.
     */
    public void display() {
        for (int i = 0; i < length; i++) {
            System.out.print("(" + indices[i] + "," + values[i] + ")");
            if (i != length - 1) {
                System.out.print(" , ");
            } else {
                System.out.println();
            }
        }
    }
}
/**
 * Line-oriented helpers for reading and writing text files. All methods are
 * best-effort: I/O failures are reported with a stack trace on standard error
 * and are not propagated to the caller.
 */
public class FileUtil {

    /**
     * Appends every line of {@code file} to {@code lines}.
     *
     * @param file path of the file to read
     * @param lines output list that receives the lines, in file order
     */
    public static void readLines(String file, ArrayList<String> lines) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(file)));
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } catch (IOException e) { // also covers FileNotFoundException
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Reads a file made of blocks of non-empty lines separated by one or more
     * empty lines. Each block is appended to {@code lines} as one string, its
     * lines joined with {@code '@'}.
     *
     * @param file path of the file to read
     * @param lines output list that receives one joined string per block
     */
    public static void readLinesBySequence(String file, ArrayList<String> lines) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(file)));
            // A builder avoids re-copying the whole block for every appended line.
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() > 0) {
                    if (content.length() > 0) {
                        content.append('@');
                    }
                    content.append(line);
                } else if (content.length() > 0) {
                    // An empty line ends the current block.
                    lines.add(content.toString());
                    content.setLength(0);
                }
            }
            // The last block need not be followed by an empty line.
            if (content.length() > 0) {
                lines.add(content.toString());
            }
        } catch (IOException e) { // also covers FileNotFoundException
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Writes every element of {@code lines} to {@code file}, each followed by
     * {@code "\n"}. An existing file is overwritten.
     *
     * @param file path of the file to write
     * @param lines the lines to write
     */
    public static void writeLines(String file, ArrayList<String> lines) {
        BufferedWriter writer = null;
        try {
            writer = new BufferedWriter(new FileWriter(new File(file)));
            for (int i = 0; i < lines.size(); i++) {
                writer.write(lines.get(i) + "\n");
            }
        } catch (IOException e) { // also covers FileNotFoundException
            e.printStackTrace();
        } finally {
            closeQuietly(writer);
        }
    }

    /** Closes {@code resource} if it is non-null, reporting (not throwing) any failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Filters a whitespace-tokenized text file, keeping only the tokens that
 * contain an underscore.
 */
public class RemoveIllegalChar {

    /** Input path used by {@link #main()}. */
    private static final String DEFAULT_INPUT =
            "C:\\cygwin\\home\\xzhao\\opinion_mining\\data\\hotel.txt";
    /** Output path used by {@link #main()}. */
    private static final String DEFAULT_OUTPUT =
            "C:\\cygwin\\home\\xzhao\\opinion_mining\\data\\hotel.good.txt";

    /**
     * Runs the filter on the built-in input and output paths.
     *
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void main() throws IOException {
        process(DEFAULT_INPUT, DEFAULT_OUTPUT);
    }

    /**
     * Copies {@code inputFile} to {@code outputFile} line by line. Within each
     * line, tokens without an underscore are dropped; every kept token is
     * followed by a single space, and every input line produces one output
     * line terminated by {@code "\n"}.
     *
     * @param inputFile path of the file to read
     * @param outputFile path of the file to write; overwritten if it exists
     * @throws IOException if the input cannot be read or the output cannot be written
     */
    public static void process(String inputFile, String outputFile) throws IOException {
        BufferedReader in = new BufferedReader(new FileReader(new File(inputFile)));
        try {
            BufferedWriter out = new BufferedWriter(new FileWriter(new File(outputFile)));
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    StringTokenizer st = new StringTokenizer(line);
                    while (st.hasMoreTokens()) {
                        String word = st.nextToken();
                        if (word.indexOf("_") == -1) {
                            continue; // no underscore: drop the token
                        }
                        out.write(word + " ");
                    }
                    out.write("\n");
                }
            } finally {
                // Closing flushes the buffer; without it the tail of the output could be lost.
                out.close();
            }
        } finally {
            in.close();
        }
    }
}
-------------------------------------------------------------------------------- /src/edu/smu/util/SequeceFeaturePatternExtrator.java: -------------------------------------------------------------------------------- 1 | package edu.smu.util; 2 | 3 | import java.util.ArrayList; 4 | 5 | import edu.smu.data.Alphabet; 6 | import edu.smu.data.Instance; 7 | import edu.smu.data.Sequence; 8 | import edu.smu.data.SequenceList; 9 | import edu.smu.data.SparseVector; 10 | 11 | public class SequeceFeaturePatternExtrator { 12 | public static SequenceList getTrainSeqListFromFileWithPreviousLabel(String dataFile, String templateFile, Alphabet featSet, Alphabet labelSet){ 13 | ArrayList dataLines = new ArrayList();// 14 | FileUtil.readLinesBySequence(dataFile, dataLines); 15 | 16 | ArrayList templateLines = new ArrayList();// 17 | FileUtil.readLines(templateFile, templateLines); 18 | 19 | //System.out.println(dataLines.size()); 20 | 21 | int[] x = new int[templateLines.size()]; 22 | int[] y = new int[templateLines.size()]; 23 | for(int i = 0; i < templateLines.size(); i++){ 24 | String[] pat = templateLines.get(i).split(","); 25 | x[i] = new Integer(pat[0]); 26 | y[i] = new Integer(pat[1]); 27 | } 28 | 29 | SequenceList seqList = new SequenceList(labelSet); 30 | featSet.addSymbol("ME_BIAS"); 31 | 32 | for(int i = 0; i < dataLines.size(); i++){ 33 | //System.out.println(dataLines.get(i)); 34 | seqList.addSequence( str2Seq(dataLines.get(i), x, y, featSet, labelSet, Integer.MAX_VALUE ) ); 35 | } 36 | return seqList; 37 | } 38 | 39 | public static SequenceList[] getTestSeqListFromFileWithPreviousLabel(String dataFile, String templateFile, Alphabet featSet, Alphabet labelSet){ 40 | ArrayList dataLines = new ArrayList();// 41 | FileUtil.readLinesBySequence(dataFile, dataLines); 42 | 43 | ArrayList templateLines = new ArrayList();// 44 | FileUtil.readLines(templateFile, templateLines); 45 | 46 | //System.out.println(dataLines.size()); 47 | 48 | int[] x = new int[templateLines.size()]; 49 
| int[] y = new int[templateLines.size()]; 50 | for(int i = 0; i < templateLines.size(); i++){ 51 | String[] pat = templateLines.get(i).split(","); 52 | x[i] = new Integer(pat[0]); 53 | y[i] = new Integer(pat[1]); 54 | } 55 | 56 | SequenceList[] seqList = new SequenceList[labelSet.size()]; 57 | for(int i = 0; i < seqList.length; i++ ){ 58 | seqList[i] = new SequenceList(labelSet); 59 | } 60 | featSet.addSymbol("ME_BIAS"); 61 | 62 | for(int i = 0; i < dataLines.size(); i++){ 63 | //System.out.println(dataLines.get(i)); 64 | for(int l = 0; l < labelSet.size(); l++ ){ 65 | seqList[l].addSequence( str2Seq(dataLines.get(i), x, y, featSet, labelSet, l ) ); 66 | } 67 | } 68 | return seqList; 69 | } 70 | 71 | public static SequenceList getSeqListFromFile(String dataFile, String templateFile, Alphabet featSet, Alphabet labelSet){ 72 | ArrayList dataLines = new ArrayList();// 73 | FileUtil.readLinesBySequence(dataFile, dataLines); 74 | 75 | ArrayList templateLines = new ArrayList();// 76 | FileUtil.readLines(templateFile, templateLines); 77 | 78 | //System.out.println(dataLines.size()); 79 | 80 | int[] x = new int[templateLines.size()]; 81 | int[] y = new int[templateLines.size()]; 82 | for(int i = 0; i < templateLines.size(); i++){ 83 | String[] pat = templateLines.get(i).split(","); 84 | x[i] = new Integer(pat[0]); 85 | y[i] = new Integer(pat[1]); 86 | } 87 | 88 | SequenceList seqList = new SequenceList(labelSet); 89 | featSet.addSymbol("ME_BIAS"); 90 | 91 | for(int i = 0; i < dataLines.size(); i++){ 92 | //System.out.println(dataLines.get(i)); 93 | seqList.addSequence( str2Seq(dataLines.get(i), x, y, featSet, labelSet, -1 )); 94 | } 95 | return seqList; 96 | } 97 | public static Sequence str2Seq(String line, int[] x, int[] y, Alphabet featSet, Alphabet labelSet, int prevLabel){ 98 | //System.out.println(line); 99 | String[] str = line.split("@"); 100 | String[][] item = new String[str.length][]; 101 | for(int i = 0; i < item.length; i++ ){ 102 | 
//System.out.println(str[i]); 103 | item[i] = str[i].split(" "); 104 | } 105 | 106 | /*for(int i = 0; i < item.length; i++){ 107 | for(int j = 0; j < item[i].length; j++){ 108 | System.out.println( item[i][j] + "\t"); 109 | } 110 | System.out.println(); 111 | }*/ 112 | 113 | int row = item.length; 114 | int col = item[0].length; 115 | 116 | //System.out.println(row); 117 | //System.out.println(col); 118 | 119 | Sequence seq = new Sequence(); 120 | 121 | for(int r = 0; r < row; r++){ 122 | //int label = labelSet.addSymbol(item[r][col-1]); 123 | int label = labelSet.addSymbol(item[r][col-1].substring(0,1)); 124 | 125 | ArrayList arrInd = new ArrayList(); 126 | ArrayList arrValue = new ArrayList(); 127 | 128 | String post = ""; 129 | 130 | /** 131 | * Adding the set prev_label; 132 | */ 133 | if( prevLabel >= 0 && prevLabel < labelSet.size() && r >= 1){ 134 | //if( prev){ 135 | 136 | //} 137 | //System.out.println("prev=_" + labelSet.getSymbol(prevLabel)); 138 | //arrInd.add( featSet.addSymbol("prev=_" + labelSet.getSymbol(prevLabel)+"_"+item[r-1][1])); 139 | //arrValue.add( 1.0 ); 140 | //arrInd.add( featSet.addSymbol("prev=_" + labelSet.getSymbol(prevLabel))); 141 | //arrValue.add( 1.0 ); 142 | post = "_prev=_" + labelSet.getSymbol(prevLabel); 143 | }/** 144 | * Adding the previous label; 145 | */ 146 | else if( prevLabel == Integer.MAX_VALUE && r >= 1 ){ 147 | //arrInd.add( featSet.addSymbol("prev=_" + item[r-1][col-1])); 148 | //System.out.println("prev=_" + item[r-1][col-1]); 149 | //arrValue.add( 1.0 ); 150 | post = "_prev=_" + item[r-1][col-1] ; 151 | } 152 | 153 | for(int i = 0; i < x.length; i++){ 154 | int tx = x[i] + r; 155 | int ty = y[i]; 156 | if( tx >= 0 && tx < row && ty >= 0 && ty < col ){ 157 | String fea = item[tx][ty] + "_x["+ new Integer(x[i])+"," + new Integer(y[i]) + "]" + post; 158 | //System.out.println(fea); 159 | int fId = featSet.addSymbol(fea); 160 | double v = 1.0; 161 | arrInd.add(fId); 162 | arrValue.add(v); 163 | 164 | 165 | fea = 
item[tx][ty] + "_x["+ new Integer(x[i])+"," + new Integer(y[i]) + "]"; 166 | arrInd.add(featSet.addSymbol(fea)); 167 | arrValue.add(v); 168 | 169 | } 170 | } 171 | int[] inds = new int[arrInd.size()]; 172 | MatrixOps.arrayListToArray(arrInd, inds); 173 | double[] values = new double[arrValue.size()]; 174 | MatrixOps.arrayListToArray(arrValue, values); 175 | 176 | //System.out.println(inds.length + " " + values.length); 177 | Instance inst = new Instance(new SparseVector(inds, values), label); 178 | //inst.display(); 179 | seq.addInstance(inst); 180 | } 181 | return seq; 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /src/edu/smu/util/StringUtil.java: -------------------------------------------------------------------------------- 1 | package edu.smu.util; 2 | 3 | import java.util.*; 4 | 5 | public class StringUtil { 6 | 7 | /** 8 | * Splits the given String into tokens. 9 | * 10 | * @param line The String to be tokenized. 11 | * @param tokens The ArrayList to store the tokens. 12 | */ 13 | public static void tokenize(String line, ArrayList tokens) { 14 | StringTokenizer strTok = new StringTokenizer(line); 15 | while(strTok.hasMoreTokens()) { 16 | String token = strTok.nextToken(); 17 | tokens.add(token); 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /src/tem/com/JC.java: -------------------------------------------------------------------------------- 1 | package tem.com; 2 | import java.util.ArrayList; 3 | import jargs.gnu.CmdLineParser; 4 | 5 | /* **************************** JC ************************************* 6 | * This is a class for input parameters. A demo usage is as follows. 7 | * Relative Path: path in System.getProperty("user.dir") 8 | * Usage: 9 | * 1. 
JC.setInputOptions(Descry, directory, options, args, property, int i); 10 | * i = 1 for input options; others for specified directory; 11 | * i = 1 -> property.charAt[i] = 0 means input is not required; 1 required; 12 | * i = 0 -> property.charAt[i] = 0 means Relative Path; 1 means absolute path 13 | * 14 | * Demo: 15 | * String [] descry = {"Filelist ", "TagMap ", "File direction ", "OuptDir "}; 16 | * String [] directory = {"/filelist_data.txt", "/TagMap.txt","/sentence/","/output/"}; 17 | * char [] options = {'f','t','i','o'}; 18 | * String property = "1111"; 19 | * new JC(); 20 | * JC.setInputOptions(Descry, directory, options, args, property, 1); 21 | * String fileName = JC.getARG(0); 22 | * String TagList = JC.getARG(1); 23 | * String dataDir = JC.getARG(2); 24 | * String outputFileName = JC.getARG(3); 25 | * JC.close(); 26 | * 27 | * JC.setInputOptions(Descry, directory, options, args, property, 1); 28 | * Call the func: java -jar [name].jar -f filelist -t tagmap -i dir -o outputDir 29 | * 30 | * JC.setInputOptions(Descry, directory, options, args, property, 0); 31 | * Just execute the program ! 
32 | * 33 | * ************************************************************************/ 34 | 35 | public class JC { 36 | 37 | public static CmdLineParser clp; 38 | 39 | public static String CD; 40 | 41 | public static ArrayList Argums; // 1: description 2: option (-f) 42 | 43 | public JC() { 44 | CD = System.getProperty("user.dir"); 45 | clp = new CmdLineParser(); 46 | Argums = new ArrayList(); 47 | } 48 | 49 | public static ArrayList getArgums() { 50 | return Argums; 51 | } 52 | 53 | public static void setArgums(ArrayList argums) { 54 | Argums = argums; 55 | } 56 | 57 | public static void setSinArgums(String argums) { 58 | Argums.add(argums); 59 | } 60 | 61 | public static String getCD() { 62 | return CD; 63 | } 64 | 65 | public void setCD(String cD) { 66 | CD = cD; 67 | } 68 | 69 | public static void close() { 70 | 71 | for (int i = 0; i < Argums.size(); i += 2) 72 | System.err.println(Argums.get(i) + " is: " 73 | + Argums.get(i + 1)); 74 | Argums.clear(); 75 | } 76 | 77 | static void setOption(char [] options) { 78 | 79 | for(int i = 0; i < options.length; i++) { 80 | clp.addStringOption(options[i], options[i]+""); 81 | } 82 | } 83 | 84 | public static String getARG(int i) { 85 | return Argums.get(2*i+1); 86 | } 87 | 88 | private static void printHelp(String[] descrp, char[] options) { 89 | System.err.println("\nPlease run this file in the following way:"); 90 | System.err.println("java -jar [name].jar -" + options[0] + " " + descrp[0]); 91 | for(int m = 1; m < descrp.length; m++) { 92 | System.err.println(" -" + options[m] + " " + descrp[m]); 93 | } 94 | } 95 | 96 | public static void setInputOptions(String[] descrp, String[] directory, 97 | char[] options, String[] args, String string, int i) { 98 | if( i == 1) 99 | setInputOptions(descrp, options, args, string); 100 | else 101 | setInputOptions(descrp, directory, string); 102 | } 103 | 104 | 105 | public static void setInputOptions(String[] descrp, char[] options, 106 | String[] args, String property) { 107 | 
if(descrp.length != options.length | descrp.length != property.length()) { 108 | System.err.println("\n Length of input parameters is not equal ! "); 109 | System.exit(1); 110 | } else { 111 | setParemeter(descrp, options, args, property); 112 | } 113 | } 114 | 115 | private static void setParemeter(String[] descrp, char[] options, 116 | String[] a, String property) { 117 | 118 | setOption(options); 119 | try { 120 | clp.parse(a); 121 | } catch (CmdLineParser.OptionException e) { 122 | System.err.println(e.getMessage()); e.printStackTrace(); 123 | printHelp(descrp, options); 124 | System.exit(1); 125 | } 126 | for(int i = 0; i < descrp.length; i++) { 127 | CmdLineParser.Option tmp = clp.addStringOption(options[i], options[i]+""); 128 | Argums.add(descrp[i]); 129 | Argums.add((String)clp.getOptionValue(tmp)); 130 | if(Integer.parseInt(property.charAt(i)+"") == 0 && 131 | Argums.get(Argums.size()-1) == null) { 132 | System.err.print("-" + options[i] + " option is missing !"); 133 | printHelp(descrp, options); 134 | System.exit(1); 135 | } 136 | } 137 | } 138 | 139 | public static void setInputOptions(String[] descrp, String[] directory, 140 | String property) { 141 | if(descrp.length != directory.length | descrp.length != property.length()) { 142 | System.err.println("\n Length of input parameters is not equal ! 
"); 143 | System.exit(1); 144 | } else { 145 | for(int i = 0; i < descrp.length; i++) { 146 | setParemeter(descrp[i], directory[i], 147 | Integer.parseInt(property.charAt(i)+"")); 148 | } 149 | } 150 | } 151 | 152 | public static void setParemeter(String p1, String p2, int i) { 153 | 154 | // i = 1 means p2 is absolute path, others relative path 155 | if (i == 1) { 156 | setSinArgums(p1); 157 | setSinArgums(p2); 158 | } else { 159 | setSinArgums(p1); 160 | setSinArgums(getCD() + p2); 161 | } 162 | } 163 | 164 | } 165 | -------------------------------------------------------------------------------- /src/tem/com/MathUtil.java: -------------------------------------------------------------------------------- 1 | package tem.com; 2 | /** 3 | * Math Util for Gaussian distribution 4 | * 5 | * @author Minghui 6 | */ 7 | 8 | public class MathUtil { 9 | 10 | // return phi(x) = standard Gaussian pdf 11 | public static double phi(double x) { 12 | return Math.exp(-x*x / 2) / Math.sqrt(2 * Math.PI); 13 | } 14 | 15 | // return phi(x, mu, signma) = Gaussian pdf with mean mu and stddev sigma 16 | public static double phi(double x, double mu, double sigma) { 17 | return phi((x - mu) / sigma) / sigma; 18 | } 19 | 20 | // return Phi(z) = standard Gaussian cdf using Taylor approximation 21 | public static double Phi(double z) { 22 | if (z < -8.0) return 0.0; 23 | if (z > 8.0) return 1.0; 24 | double sum = 0.0, term = z; 25 | for (int i = 3; sum + term != sum; i += 2) { 26 | sum = sum + term; 27 | term = term * z * z / i; 28 | } 29 | return 0.5 + sum * phi(z); 30 | } 31 | 32 | // return Phi(z, mu, sigma) = Gaussian cdf with mean mu and stddev sigma 33 | public static double Phi(double z, double mu, double sigma) { 34 | return Phi((z - mu) / sigma); 35 | } 36 | 37 | // Compute z such that Phi(z) = y via bisection search 38 | public static double PhiInverse(double y) { 39 | return PhiInverse(y, .00000001, -8, 8); 40 | } 41 | 42 | // bisection search 43 | private static double 
/**
 * Small helpers for printing, summing and comparing primitive arrays.
 */
public class MatrixUtil {

    /** Returns a fixed ragged sample array and prints each element on its own line. */
    public static int[][] getArray() {
        int[][] num = { { 1, 2, 3 }, { 4, 5 }, { 2 } };
        for (int[] row : num) {
            for (int value : row) {
                System.out.println(value);
            }
        }
        return num;
    }

    /** Prints one row per line, each element followed by a tab. */
    public static void printArray(int[][] num) {
        for (int[] row : num) {
            for (int value : row) {
                System.out.print(value + "\t");
            }
            System.out.println();
        }
    }

    /** Prints one row per line, each element followed by a tab. */
    public static void printArray(short[][] num) {
        for (short[] row : num) {
            for (short value : row) {
                System.out.print(value + "\t");
            }
            System.out.println();
        }
    }

    /** Prints the elements on one line, each followed by a tab. */
    public static void printArray(int[] num) {
        for (int value : num) {
            System.out.print(value + "\t");
        }
        System.out.println();
    }

    /** Prints the elements on one line, each followed by a tab. */
    public static void printArray(long[] num) {
        for (long value : num) {
            System.out.print(value + "\t");
        }
        System.out.println();
    }

    /** Prints the elements on one line, each followed by a tab. */
    public static void printArray(double[] num) {
        for (double value : num) {
            System.out.print(value + "\t");
        }
        System.out.println();
    }

    /** Prints one row per line, {@code true} as 1 and {@code false} as 0. */
    public static void printArray(boolean[][] bs) {
        for (boolean[] row : bs) {
            for (boolean flag : row) {
                System.out.print(flag ? "1\t" : "0\t");
            }
            System.out.println();
        }
    }

    /**
     * Sums column {@code u} over all rows of {@code data}.
     *
     * <p>The loop runs over the number of rows. The previous bound,
     * {@code data[u].length}, was the length of row {@code u} and therefore
     * wrong for any non-square matrix.
     *
     * @throws ArrayIndexOutOfBoundsException if a row has no column {@code u}
     */
    public static double sumCol(float[][] data, int u) {
        double total = 0.0D;
        for (int m = 0; m < data.length; m++) {
            total += data[m][u];
        }
        return total;
    }

    /** Sums row {@code u} of {@code matrix}. */
    public static double sumRow(int[][] matrix, int u) {
        double total = 0.0D;
        for (int value : matrix[u]) {
            total += value;
        }
        return total;
    }

    /** Sum of all elements; 0 for an empty array. */
    public static double sum(double[] a2) {
        double total = 0.0D;
        for (double value : a2) {
            total += value;
        }
        return total;
    }

    /** Sum of all elements as a double; 0 for an empty array. */
    public static double sum(int[] a2) {
        double total = 0;
        for (int value : a2) {
            total += value;
        }
        return total;
    }

    /** Largest element; the array must not be empty. */
    public static int max(int[] flag) {
        int max = flag[0];
        for (int i = 1; i < flag.length; i++) {
            if (flag[i] > max) {
                max = flag[i];
            }
        }
        return max;
    }

    /** Largest element; the array must not be empty. */
    public static double max(double[] flag) {
        double max = flag[0];
        for (int i = 1; i < flag.length; i++) {
            if (flag[i] > max) {
                max = flag[i];
            }
        }
        return max;
    }

    /** Smallest element; the array must not be empty. */
    public static double min(double[] flag) {
        double min = flag[0];
        for (int i = 1; i < flag.length; i++) {
            if (flag[i] < min) {
                min = flag[i];
            }
        }
        return min;
    }

    /**
     * Writes {@code m} distinct values drawn uniformly from
     * {@code 0 .. vector_n - 1} into the first {@code m} slots of {@code set}.
     *
     * @throws IndexOutOfBoundsException if {@code m > vector_n}
     */
    public static void randperm(int[] set, int vector_n, int m) {
        List<Integer> list = new ArrayList<Integer>(Math.max(vector_n, 0));
        for (int i = 0; i < vector_n; i++) {
            list.add(i);
        }
        Collections.shuffle(list);

        for (int i = 0; i < m; i++) {
            set[i] = list.get(i);
        }
    }

    /**
     * Distance between two vectors; only {@code "Euclidean"} is supported.
     * The length of {@code ds} determines how many components are compared.
     *
     * @throws IllegalArgumentException for any other metric name (previously
     *         an accidental {@code NullPointerException} from unboxing null)
     */
    public static double dist(double[] ds, double[] ds2, String distDesp) {
        requireEuclidean(distDesp);
        double squared = 0d;
        for (int i = 0; i < ds.length; i++) {
            squared += Math.pow(ds[i] - ds2[i], 2);
        }
        return Math.sqrt(squared);
    }

    /**
     * Distance between two scalars; only {@code "Euclidean"} is supported.
     *
     * @throws IllegalArgumentException for any other metric name
     */
    public static double dist(double ds, double ds2, String distDesp) {
        requireEuclidean(distDesp);
        return Math.sqrt(Math.pow(ds - ds2, 2));
    }

    /** Rejects every metric name except "Euclidean". */
    private static void requireEuclidean(String distDesp) {
        if (!"Euclidean".equals(distDesp)) {
            throw new IllegalArgumentException("Unsupported distance: " + distDesp);
        }
    }

    /**
     * Kullback-Leibler divergence sum of {@code fs[v] * ln(fs[v] / fs2[v])}.
     * Components where either value is not positive are skipped.
     */
    public static double KL(double[] fs, double[] fs2) {
        double klScore = 0.0D;
        for (int v = 0; v < fs.length; v++) {
            double p = fs[v];
            double q = fs2[v];
            if (p > 0 && q > 0) {
                klScore += p * Math.log(p / q);
            }
        }
        return klScore;
    }

    /** Jensen-Shannon divergence: mean of the two KL divergences to the average vector. */
    public static double JS(double[] fs, double[] fs2) {
        double[] avg = new double[fs.length];
        for (int v = 0; v < fs.length; v++) {
            avg[v] = (fs[v] + fs2[v]) / 2;
        }
        return (KL(fs, avg) + KL(fs2, avg)) / 2;
    }

    /**
     * Divides every element by the sum of all elements, in place.
     * An array whose sum is 0 is left unchanged rather than being filled with
     * NaN/Infinity.
     */
    public static void norm1(double[] thetaD) {
        double sum = sum(thetaD);
        if (sum == 0) {
            return;
        }
        for (int i = 0; i < thetaD.length; i++) {
            thetaD[i] = thetaD[i] / sum;
        }
    }
}
| import java.util.HashMap; 5 | 6 | 7 | public class POStags { 8 | 9 | public HashMap gTagMap; 10 | 11 | public POStags() { 12 | String map = "CD ADJ" + "\t" + 13 | "JJ ADJ" + "\t" + 14 | "JJR ADJ" + "\t" + 15 | "JJS ADJ" + "\t" + 16 | "VB V" + "\t" + 17 | "VBD V" + "\t" + 18 | "VBG V" + "\t" + 19 | "VBN V" + "\t" + 20 | "VBP V" + "\t" + 21 | "VBZ V" + "\t" + 22 | "MD V" + "\t" + 23 | "NN N" + "\t" + 24 | "NNS N" + "\t" + 25 | "NNP N" + "\t" + 26 | "NNPS N" + "\t" + 27 | "RB ADV" + "\t" + 28 | "RBR ADV" + "\t" + 29 | "RBS ADV" + "\t" + 30 | "RP ADV" + "\t" + 31 | "WRB ADV" + "\t" + 32 | "DT DET" + "\t" + 33 | "PDT DET" + "\t" + 34 | "WDT DET" + "\t" + 35 | "POS DET" + "\t" + 36 | "PRP PRP" + "\t" + 37 | "WP PRP" + "\t" + 38 | "PRP$ PRP$" + "\t" + 39 | "WP$ PRP$" + "\t" + 40 | "TO PREP" + "\t" + 41 | "IN PREP" + "\t" + 42 | "CC CONJ" + "\t" + 43 | "EX OTHER" + "\t" + 44 | "FW OTHER" + "\t" + 45 | "SYM OTHER" + "\t" + 46 | "UH OTHER" + "\t" + 47 | "LS OTHER" + "\t"; 48 | 49 | gTagMap = new HashMap(); 50 | String [] maps = map.split("\t"); 51 | ArrayList tokens = new ArrayList (); 52 | for(int i = 0; i < maps.length; i++) { 53 | tokens.clear(); 54 | FileUtil.tokenize(maps[i], tokens); 55 | if(tokens.size() != 2) { 56 | System.err.println(maps[i]); 57 | } else { 58 | gTagMap.put(tokens.get(0).toLowerCase().trim(), 59 | tokens.get(1).toLowerCase().trim()); 60 | } 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/tem/com/Sorting.java: -------------------------------------------------------------------------------- 1 | package tem.com; 2 | import java.util.Comparator; 3 | 4 | 5 | public class Sorting implements Comparator { 6 | public int compare(wordFreq o1, wordFreq o2) { 7 | return Long.valueOf(o2.getNo()).compareTo(Long.valueOf((o1.getNo()))); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/tem/com/ValueComparator.java: 
/**
 * Orders map keys by their mapped value, largest value first.
 *
 * <p>Keys with equal values are ordered by the keys' natural order, and a key
 * compared with itself yields 0. The previous implementation returned -1 for
 * equal values in both directions and never returned 0, which violates the
 * {@link Comparator} contract: sorting could throw and a {@code TreeMap} built
 * with the comparator could not find its own keys.
 *
 * <p>Every compared key must be present in the backing map with a non-null
 * value; otherwise {@link #compare} throws {@code NullPointerException}.
 */
public class ValueComparator implements Comparator<String> {
    Map<String, ? extends Number> baseMap;

    /**
     * @param base map whose values define the order; it is read on every
     *             comparison, not copied
     */
    public ValueComparator(Map<String, ? extends Number> base) {
        this.baseMap = base;
    }

    @Override
    public int compare(String o1, String o2) {
        // Arguments swapped on purpose: descending by value.
        int byValue = Double.compare(baseMap.get(o2).doubleValue(),
                baseMap.get(o1).doubleValue());
        // Tie-break on the key so distinct keys never compare as equal.
        return byValue != 0 ? byValue : o1.compareTo(o2);
    }

}
public static String originalDataPath = "data/originalData/ThreeM09/"; 8 | 9 | public static String testDataPath = "data/originalData/TestData/"; 10 | 11 | public static String modelParamsPath = "data/modelParams/"; 12 | 13 | public static String modelResPath = "data/modelRes/ThreeM09/"; 14 | 15 | public static String minPostNum = "50"; 16 | 17 | public static String UQAPath = "data/modelRes/ThreeM09/UQA/"; 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/tem/linkas/ID.java: -------------------------------------------------------------------------------- 1 | package tem.linkas; 2 | 3 | /**InDegree Algorithm for expert finding (CIKM 12 & KDD08) 4 | * @author yangliu 5 | * @blog http://blog.csdn.net/yangliuy 6 | * @mail yangliuyx@gmail.com 7 | */ 8 | 9 | 10 | import java.io.IOException; 11 | import java.io.PrintWriter; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | 16 | import Jama.Matrix; 17 | 18 | import tem.com.FileUtil; 19 | import tem.com.MatrixUtil; 20 | import tem.conf.PathConfig; 21 | import tem.main.TEMModel; 22 | 23 | public class ID { 24 | 25 | private static int NODENUM; // Node number 26 | private static Matrix U; // Matrix with all 1 27 | private static Matrix graphAdjM; //Adjancy matrix of graph 28 | private static Matrix transM;//Transition probability matrix 29 | 30 | public static void main(String[] args) throws IOException, ClassNotFoundException { 31 | String minPostNum = "80"; 32 | String modelName = "ID"; 33 | int K = 15; 34 | //Meature user interests and expertise by in degree of user node 35 | //i.e. 
The total number of answers the user provides or total number of votes the user gets 36 | //Compute the sum of each column 37 | String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userAnswerNumWeighted.QAgraph"; 38 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userVoteWeighted.QAgraph"; 39 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/test.QAgraph"; 40 | //String TransMFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".transM"; 41 | String finalPRALLFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".finalPRAll"; 42 | ArrayList finalPRallLines = new ArrayList(); 43 | 44 | PrintWriter pw; 45 | readQAGraph(graphDataFile); 46 | NODENUM = graphAdjM.getRowDimension(); 47 | System.out.println("NODENUM : " + NODENUM); 48 | 49 | double[] nodeScoreArray = new double[NODENUM]; 50 | for(int i = 0; i < NODENUM; i++){ 51 | for(int j = 0; j < NODENUM; j++){ 52 | nodeScoreArray[i] += graphAdjM.get(j, i); 53 | } 54 | } 55 | MatrixUtil.norm1(nodeScoreArray); 56 | 57 | for(int z = 0; z < K; z++){ 58 | System.out.println("now topic = " + z); 59 | 60 | Matrix pageRank = new Matrix(nodeScoreArray, 1); 61 | 62 | System.out.println("Final PageRank is :"); 63 | pageRank.print(4, 4); 64 | 65 | String PRLine = ""; 66 | for(int i = 0; i < NODENUM; i++){ 67 | PRLine += pageRank.get(0, i) + "\t"; 68 | } 69 | finalPRallLines.add(PRLine); 70 | } 71 | FileUtil.writeLines(finalPRALLFile, finalPRallLines); 72 | } 73 | 74 | private static void readQAGraph( 75 | String graphDataFile) { 76 | // TODO Auto-generated method stub 77 | ArrayList graphLines = new ArrayList(); 78 | FileUtil.readLines(graphDataFile, graphLines); 79 | double[][] graphMatrix = new double[graphLines.size()][]; 80 | double minNumber = 1000; 81 | for(int i = 0; i < graphLines.size(); i++){ 82 | String[] glineTokens = graphLines.get(i).split("\t"); 83 | graphMatrix[i] = new 
double[glineTokens.length]; 84 | for(int j = 0; j < glineTokens.length; j++){ 85 | double d = Double.valueOf(glineTokens[j]); 86 | if(d < 0){ 87 | graphMatrix[i][j] = 0; 88 | } else { 89 | graphMatrix[i][j] = d; 90 | } 91 | if(minNumber > graphMatrix[i][j]) { 92 | minNumber = graphMatrix[i][j]; 93 | } 94 | } 95 | } 96 | //If there is negative number is matrix, find the min one x. Then all number add |x| 97 | System.out.println("minNumber " + minNumber); 98 | /*if (minNumber < 0){ 99 | for(int i = 0; i < graphMatrix.length; i++){ 100 | for(int j = 0; j < graphMatrix[i].length; j++){ 101 | graphMatrix[i][j] += (0 - minNumber); 102 | } 103 | } 104 | }*/ 105 | graphAdjM = new Matrix(graphMatrix); 106 | } 107 | 108 | public static void printMatrix(List> m) { 109 | for (int i = 0; i < m.size(); i++) { 110 | for (int j = 0; j < m.get(i).size(); j++) { 111 | System.out.print(m.get(i).get(j) + "\t"); 112 | } 113 | System.out.println(); 114 | } 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/tem/linkas/PR.java: -------------------------------------------------------------------------------- 1 | package tem.linkas; 2 | 3 | /**Standard PageRank Algorithm (CIKM 12 PR for Expert finding) 4 | * @author yangliu 5 | * @blog http://blog.csdn.net/yangliuy 6 | * @mail yangliuyx@gmail.com 7 | */ 8 | 9 | import java.io.FileInputStream; 10 | import java.io.FileWriter; 11 | import java.io.IOException; 12 | import java.io.ObjectInputStream; 13 | import java.io.PrintWriter; 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | import java.util.Random; 17 | 18 | import Jama.Matrix; 19 | 20 | import tem.com.FileUtil; 21 | import tem.conf.PathConfig; 22 | import tem.main.TEMModel; 23 | 24 | public class PR { 25 | private static double LAMBDA = 0.2; 26 | private static double THRESHOLD = 0.0000001; 27 | 28 | private static int NODENUM; // Node number 29 | private static Matrix U; // Matrix with all 1 30 | private static 
Matrix graphAdjM; //Adjancy matrix of graph 31 | private static Matrix transM;//Transition probability matrix 32 | 33 | public static void main(String[] args) throws IOException, ClassNotFoundException { 34 | String minPostNum = "80"; 35 | String modelName = "PR"; 36 | int K = 15; 37 | String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userAnswerNumWeighted.QAgraph"; 38 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userVoteWeighted.QAgraph"; 39 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/test.QAgraph"; 40 | //String TransMFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".transM"; 41 | String finalPRALLFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".finalPRAll"; 42 | ArrayList finalPRallLines = new ArrayList(); 43 | 44 | PrintWriter pw; 45 | readQAGraph(graphDataFile); 46 | NODENUM = graphAdjM.getRowDimension(); 47 | System.out.println("NODENUM : " + NODENUM); 48 | //1. Init PR state vector and Matrix U 49 | //Both randomly initialise or set all 1 are OK 50 | Matrix PR0 = initPRStateVector(); 51 | initU(); 52 | System.out.println("Initial state vector PR0 is:"); 53 | PR0.print(4, 4); 54 | 55 | 56 | for(int z = 0; z < K; z++){ 57 | System.out.println("now topic = " + z); 58 | //String newPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".newPR"; 59 | //String finalPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".finalPR"; 60 | 61 | //2. Compute transition probability matrix 62 | computeTransM(z); 63 | //pw = new PrintWriter(new FileWriter(TransMFile)); 64 | //transM.print(pw, 4, 4); 65 | 66 | //3. Compute newPR update matrix 67 | Matrix newPR = computeNewPR(z); 68 | //Normalized newPR matrix 69 | normal(newPR); 70 | //pw = new PrintWriter(new FileWriter(newPRFile)); 71 | //newPR.print(pw, 4, 4); 72 | 73 | //4. 
Iteratively update PR state vector 74 | Matrix pageRank = calPageRank(PR0, newPR); 75 | 76 | //5. Print final PageRank score 77 | System.out.println("Final PageRank is :"); 78 | pageRank.print(4, 4); 79 | 80 | String PRLine = ""; 81 | for(int i = 0; i < NODENUM; i++){ 82 | PRLine += pageRank.get(0, i) + "\t"; 83 | } 84 | finalPRallLines.add(PRLine); 85 | //saveFinalPR(finalPRFile, pageRank); 86 | } 87 | FileUtil.writeLines(finalPRALLFile, finalPRallLines); 88 | } 89 | 90 | private static void normal(Matrix newPR) { 91 | // TODO Auto-generated method stub 92 | for(int i = 0; i < NODENUM; i++){ 93 | double sum = 0; 94 | for(int j = 0; j < NODENUM; j++){ 95 | sum += newPR.get(i, j); 96 | } 97 | if(sum != 0){ 98 | for(int j = 0; j < NODENUM; j++){ 99 | newPR.set(i, j, newPR.get(i, j) / sum); 100 | } 101 | } 102 | } 103 | } 104 | 105 | private static void saveFinalPR(String finalPRFile, Matrix pageRank) { 106 | // TODO Auto-generated method stub 107 | ArrayList lines = new ArrayList(); 108 | for(int i = 0; i < pageRank.getRowDimension(); i++){ 109 | String line = ""; 110 | for(int j = 0; j < pageRank.getColumnDimension(); j++){ 111 | line += pageRank.get(i, j) + "\t"; 112 | } 113 | lines.add(line); 114 | } 115 | FileUtil.writeLines(finalPRFile, lines); 116 | } 117 | 118 | //Matrix with all 1 119 | private static void initU() { 120 | // TODO Auto-generated method stub 121 | double[][] u = new double[NODENUM][NODENUM]; 122 | for(int i = 0; i < NODENUM; i++){ 123 | for(int j = 0; j < NODENUM; j++){ 124 | u[i][j] = 1; 125 | } 126 | } 127 | U = new Matrix(u); 128 | } 129 | 130 | //Compute transition matrix 131 | private static void computeTransM(int z) { 132 | // TODO Auto-generated method stub 133 | double[][] transm = new double[NODENUM][NODENUM]; 134 | for(int i = 0; i < NODENUM; i++){ 135 | double rowSum = 0; 136 | for(int j = 0; j < NODENUM; j++){ 137 | rowSum += graphAdjM.get(i, j); 138 | } 139 | if(rowSum == 0){ 140 | for(int j = 0; j < NODENUM; j++){ 141 | transm 
[i][j] = 0; 142 | } 143 | } else { 144 | for(int j = 0; j < NODENUM; j++){ 145 | double norWeight = graphAdjM.get(i, j) / 146 | rowSum; 147 | transm [i][j] = norWeight; 148 | } 149 | } 150 | } 151 | transM = new Matrix(transm); 152 | } 153 | 154 | private static double sim(float f, float g) { 155 | // TODO Auto-generated method stub 156 | return 1 - Math.abs(f - g); 157 | } 158 | 159 | //compute pagerank 160 | public static Matrix calPageRank(Matrix PR0, Matrix newPR) { 161 | Matrix PR; 162 | while (true) { 163 | PR = PR0.times(newPR); 164 | double dis = calDistance(PR, PR0);//PR0 store PR vector after last iteration 165 | System.out.println("distance:" + dis); 166 | if (dis <= THRESHOLD) { 167 | System.out.println("PR:"); 168 | PR.print(4, 4); 169 | break; 170 | } 171 | PR0 = PR; 172 | } 173 | return PR; 174 | } 175 | 176 | private static Matrix initPRStateVector() { 177 | // TODO Auto-generated method stub 178 | double[] pr0M = new double[NODENUM]; 179 | for(int i = 0; i < NODENUM; i++){ 180 | pr0M[i] = 1; 181 | } 182 | return new Matrix(pr0M, 1); 183 | } 184 | 185 | private static void readQAGraph( 186 | String graphDataFile) { 187 | // TODO Auto-generated method stub 188 | ArrayList graphLines = new ArrayList(); 189 | FileUtil.readLines(graphDataFile, graphLines); 190 | double[][] graphMatrix = new double[graphLines.size()][]; 191 | double minNumber = 1000; 192 | for(int i = 0; i < graphLines.size(); i++){ 193 | String[] glineTokens = graphLines.get(i).split("\t"); 194 | graphMatrix[i] = new double[glineTokens.length]; 195 | for(int j = 0; j < glineTokens.length; j++){ 196 | double d = Double.valueOf(glineTokens[j]); 197 | if(d < 0){ 198 | graphMatrix[i][j] = 0; 199 | } else { 200 | graphMatrix[i][j] = d; 201 | } 202 | if(minNumber > graphMatrix[i][j]) { 203 | minNumber = graphMatrix[i][j]; 204 | } 205 | } 206 | } 207 | //If there is negative number is matrix, find the min one x. 
Then all number add |x| 208 | System.out.println("minNumber " + minNumber); 209 | /*if (minNumber < 0){ 210 | for(int i = 0; i < graphMatrix.length; i++){ 211 | for(int j = 0; j < graphMatrix[i].length; j++){ 212 | graphMatrix[i][j] += (0 - minNumber); 213 | } 214 | } 215 | }*/ 216 | graphAdjM = new Matrix(graphMatrix); 217 | } 218 | 219 | public static void printMatrix(List> m) { 220 | for (int i = 0; i < m.size(); i++) { 221 | for (int j = 0; j < m.get(i).size(); j++) { 222 | System.out.print(m.get(i).get(j) + "\t"); 223 | } 224 | System.out.println(); 225 | } 226 | } 227 | 228 | public static void printVec(List v) { 229 | for (int i = 0; i < v.size(); i++) { 230 | System.out.print(v.get(i) + "\t"); 231 | } 232 | System.out.println(); 233 | } 234 | 235 | /** 236 | * Randomly Initialise state vector PR0 237 | * 238 | * @param n 239 | * dimension of vector PR0 240 | * @return A random vector, each dimension is 0-5 241 | */ 242 | public static List randomInitPR0(int n) { 243 | Random random = new Random(); 244 | List q = new ArrayList(); 245 | for (int i = 0; i < n; i++) { 246 | q.add(new Double(5 * random.nextDouble())); 247 | } 248 | return q; 249 | } 250 | 251 | /** 252 | * Compute Euclidean Distance 253 | * 254 | * @param q1 255 | * 256 | * @param q2 257 | * 258 | * @return distance 259 | */ 260 | public static double calDistance(Matrix q1, Matrix q2) { 261 | double sum = 0; 262 | 263 | if (q1.getColumnDimension() != q2.getColumnDimension() ) { 264 | return -1; 265 | } 266 | 267 | for (int i = 0; i < q1.getColumnDimension() ; i++) { 268 | sum += Math.pow(q1.get(0, i) - q2.get(0, i), 269 | 2); 270 | } 271 | return Math.sqrt(sum); 272 | } 273 | 274 | /** 275 | * compute NEWPR matrix 276 | * 277 | * @return NEWPR matrix 278 | */ 279 | public static Matrix computeNewPR(int z) { 280 | Matrix add1 = transM.times(LAMBDA); 281 | //In new PR matrix, the larger values are in the c-th column, the larger score node c tends to get. 
282 | /*double [][] newU = new double[NODENUM][NODENUM]; 283 | for(int i = 0; i < NODENUM; i++){ 284 | double userPreference = model.theta[i][z]; 285 | double tspr = userPreference; 286 | for(int k = 0; k < NODENUM; k++){ 287 | newU[k][i] = tspr; 288 | } 289 | } 290 | U = new Matrix(newU);*/ 291 | 292 | Matrix add2 = U.times( (1 - LAMBDA) / NODENUM); 293 | Matrix newPR = add1.plus(add2); 294 | return newPR; 295 | } 296 | } 297 | -------------------------------------------------------------------------------- /src/tem/linkas/TEPR.java: -------------------------------------------------------------------------------- 1 | package tem.linkas; 2 | 3 | /**Topic Expertise PageRank Algorithm 4 | * @author yangliu 5 | * @blog http://blog.csdn.net/yangliuy 6 | * @mail yangliuyx@gmail.com 7 | */ 8 | 9 | import java.io.FileInputStream; 10 | import java.io.FileWriter; 11 | import java.io.IOException; 12 | import java.io.ObjectInputStream; 13 | import java.io.PrintWriter; 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | import java.util.Random; 17 | 18 | import Jama.Matrix; 19 | 20 | import tem.com.FileUtil; 21 | import tem.conf.PathConfig; 22 | import tem.main.TEMModel; 23 | 24 | public class TEPR { 25 | private static double LAMBDA = 0.2; 26 | private static double THRESHOLD = 0.0000001; 27 | 28 | private static int NODENUM; // Node number 29 | private static Matrix U; // Matrix with all 1 30 | private static Matrix graphAdjM; //Adjancy matrix of graph 31 | private static Matrix transM;//Transition probability matrix 32 | 33 | public static void main(String[] args) throws IOException, ClassNotFoundException { 34 | String minPostNum = "80"; 35 | String modelName = "TEPR"; 36 | String userIDFile = PathConfig.originalDataPath + "USER" + minPostNum + "/user.IDs"; 37 | String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userAnswerNumWeighted.QAgraph"; 38 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + 
"/userVoteWeighted.QAgraph"; 39 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/test.QAgraph"; 40 | //String TransMFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".transM"; 41 | 42 | ArrayList finalPRallLines = new ArrayList(); 43 | TEMModel model = new TEMModel(); 44 | //String[] ENums = {"11", "13", "14"}; 45 | String[] TNums = {"10", "12", "14", "16", "18", "20", "22", "24", "26", "28", "30"}; 46 | readQAGraph(graphDataFile); 47 | 48 | NODENUM = graphAdjM.getRowDimension(); 49 | System.out.println("NODENUM : " + NODENUM); 50 | for(String T : TNums){ 51 | //for(String E : ENums){ 52 | // load model 53 | //String modelFile = PathConfig.modelOutPath + "/USER80/Model_E" + E + "_T15.model"; 54 | String modelFile = PathConfig.modelResPath + "/USER" + PathConfig.minPostNum + "/Model_E10_T" + T + ".model"; 55 | System.out.println("reading a class from : " + modelFile); 56 | FileInputStream fis = new FileInputStream(modelFile); 57 | ObjectInputStream ois = new ObjectInputStream(fis); 58 | model = (TEMModel) ois.readObject(); 59 | ois.close(); 60 | System.out.println("TopicNum" + model.K); 61 | System.out.println("ENum" + model.ENum); 62 | 63 | String finalPRALLFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".E" + model.ENum + "T" + model.K + "finalPRAll"; 64 | finalPRallLines.clear(); 65 | 66 | PrintWriter pw; 67 | 68 | //1. Init PR state vector and Matrix U 69 | //Both randomly initialise or set all 1 are OK 70 | Matrix PR0 = initPRStateVector(); 71 | initU(); 72 | System.out.println("Initial state vector PR0 is:"); 73 | PR0.print(4, 4); 74 | 75 | for(int z = 0; z < model.K; z++){ 76 | System.out.println("now topic = " + z); 77 | //String newPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".newPR"; 78 | //String finalPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".finalPR"; 79 | 80 | //2. 
Compute transition probability matrix 81 | computeTransM(model, z); 82 | //pw = new PrintWriter(new FileWriter(TransMFile)); 83 | //transM.print(pw, 4, 4); 84 | 85 | //3. Compute newPR update matrix 86 | Matrix newPR = computeNewPR(model, z); 87 | //Normalized newPR matrix 88 | normal(newPR); 89 | //pw = new PrintWriter(new FileWriter(newPRFile)); 90 | //newPR.print(pw, 4, 4); 91 | 92 | //4. Iteratively update PR state vector 93 | Matrix pageRank = calPageRank(PR0, newPR); 94 | 95 | //5. Print final PageRank score 96 | System.out.println("Final PageRank is :"); 97 | pageRank.print(4, 4); 98 | 99 | String PRLine = ""; 100 | for(int i = 0; i < NODENUM; i++){ 101 | PRLine += pageRank.get(0, i) + "\t"; 102 | } 103 | finalPRallLines.add(PRLine); 104 | //saveFinalPR(finalPRFile, pageRank); 105 | } 106 | FileUtil.writeLines(finalPRALLFile, finalPRallLines); 107 | } 108 | 109 | } 110 | 111 | private static void normal(Matrix newPR) { 112 | // TODO Auto-generated method stub 113 | for(int i = 0; i < NODENUM; i++){ 114 | double sum = 0; 115 | for(int j = 0; j < NODENUM; j++){ 116 | sum += newPR.get(i, j); 117 | } 118 | if(sum != 0){ 119 | for(int j = 0; j < NODENUM; j++){ 120 | newPR.set(i, j, newPR.get(i, j) / sum); 121 | } 122 | } 123 | } 124 | } 125 | 126 | private static void saveFinalPR(String finalPRFile, Matrix pageRank) { 127 | // TODO Auto-generated method stub 128 | ArrayList lines = new ArrayList(); 129 | for(int i = 0; i < pageRank.getRowDimension(); i++){ 130 | String line = ""; 131 | for(int j = 0; j < pageRank.getColumnDimension(); j++){ 132 | line += pageRank.get(i, j) + "\t"; 133 | } 134 | lines.add(line); 135 | } 136 | FileUtil.writeLines(finalPRFile, lines); 137 | } 138 | 139 | //Matrix with all 1 140 | private static void initU() { 141 | // TODO Auto-generated method stub 142 | double[][] u = new double[NODENUM][NODENUM]; 143 | for(int i = 0; i < NODENUM; i++){ 144 | for(int j = 0; j < NODENUM; j++){ 145 | u[i][j] = 1; 146 | } 147 | } 148 | U = new 
Matrix(u); 149 | } 150 | 151 | //Compute transition matrix 152 | private static void computeTransM(TEMModel model, int z) { 153 | // TODO Auto-generated method stub 154 | double[][] transm = new double[NODENUM][NODENUM]; 155 | for(int i = 0; i < NODENUM; i++){ 156 | double rowSum = 0; 157 | for(int j = 0; j < NODENUM; j++){ 158 | rowSum += graphAdjM.get(i, j) * sim(model.theta[i][z], model.theta[j][z]); 159 | } 160 | if(rowSum == 0){ 161 | for(int j = 0; j < NODENUM; j++){ 162 | transm [i][j] = 0; 163 | } 164 | } else { 165 | for(int j = 0; j < NODENUM; j++){ 166 | double norWeight = graphAdjM.get(i, j) * sim(model.theta[i][z], model.theta[j][z]) / 167 | rowSum; 168 | transm [i][j] = norWeight; 169 | } 170 | } 171 | } 172 | transM = new Matrix(transm); 173 | } 174 | 175 | private static double sim(float f, float g) { 176 | // TODO Auto-generated method stub 177 | return 1 - Math.abs(f - g); 178 | } 179 | 180 | //compute pagerank 181 | public static Matrix calPageRank(Matrix PR0, Matrix newPR) { 182 | Matrix PR; 183 | while (true) { 184 | PR = PR0.times(newPR); 185 | double dis = calDistance(PR, PR0);//PR0 store PR vector after last iteration 186 | System.out.println("distance:" + dis); 187 | if (dis <= THRESHOLD) { 188 | System.out.println("PR:"); 189 | PR.print(4, 4); 190 | break; 191 | } 192 | PR0 = PR; 193 | } 194 | return PR; 195 | } 196 | 197 | private static Matrix initPRStateVector() { 198 | // TODO Auto-generated method stub 199 | double[] pr0M = new double[NODENUM]; 200 | for(int i = 0; i < NODENUM; i++){ 201 | pr0M[i] = 1; 202 | } 203 | return new Matrix(pr0M, 1); 204 | } 205 | 206 | private static void readQAGraph( 207 | String graphDataFile) { 208 | // TODO Auto-generated method stub 209 | ArrayList graphLines = new ArrayList(); 210 | FileUtil.readLines(graphDataFile, graphLines); 211 | double[][] graphMatrix = new double[graphLines.size()][]; 212 | double minNumber = 1000; 213 | for(int i = 0; i < graphLines.size(); i++){ 214 | String[] glineTokens = 
graphLines.get(i).split("\t"); 215 | graphMatrix[i] = new double[glineTokens.length]; 216 | for(int j = 0; j < glineTokens.length; j++){ 217 | double d = Double.valueOf(glineTokens[j]); 218 | if(d < 0){ 219 | graphMatrix[i][j] = 0; 220 | } else { 221 | graphMatrix[i][j] = d; 222 | } 223 | if(minNumber > graphMatrix[i][j]) { 224 | minNumber = graphMatrix[i][j]; 225 | } 226 | } 227 | } 228 | //If there is negative number is matrix, find the min one x. Then all number add |x| 229 | System.out.println("minNumber " + minNumber); 230 | /*if (minNumber < 0){ 231 | for(int i = 0; i < graphMatrix.length; i++){ 232 | for(int j = 0; j < graphMatrix[i].length; j++){ 233 | graphMatrix[i][j] += (0 - minNumber); 234 | } 235 | } 236 | }*/ 237 | graphAdjM = new Matrix(graphMatrix); 238 | } 239 | 240 | public static void printMatrix(List> m) { 241 | for (int i = 0; i < m.size(); i++) { 242 | for (int j = 0; j < m.get(i).size(); j++) { 243 | System.out.print(m.get(i).get(j) + "\t"); 244 | } 245 | System.out.println(); 246 | } 247 | } 248 | 249 | public static void printVec(List v) { 250 | for (int i = 0; i < v.size(); i++) { 251 | System.out.print(v.get(i) + "\t"); 252 | } 253 | System.out.println(); 254 | } 255 | 256 | /** 257 | * Randomly Initialise state vector PR0 258 | * 259 | * @param n 260 | * dimension of vector PR0 261 | * @return A random vector, each dimension is 0-5 262 | */ 263 | public static List randomInitPR0(int n) { 264 | Random random = new Random(); 265 | List q = new ArrayList(); 266 | for (int i = 0; i < n; i++) { 267 | q.add(new Double(5 * random.nextDouble())); 268 | } 269 | return q; 270 | } 271 | 272 | /** 273 | * Compute Euclidean Distance 274 | * 275 | * @param q1 276 | * 277 | * @param q2 278 | * 279 | * @return distance 280 | */ 281 | public static double calDistance(Matrix q1, Matrix q2) { 282 | double sum = 0; 283 | 284 | if (q1.getColumnDimension() != q2.getColumnDimension() ) { 285 | return -1; 286 | } 287 | 288 | for (int i = 0; i < 
q1.getColumnDimension() ; i++) { 289 | sum += Math.pow(q1.get(0, i) - q2.get(0, i), 290 | 2); 291 | } 292 | return Math.sqrt(sum); 293 | } 294 | 295 | /** 296 | * compute NEWPR matrix 297 | * 298 | * @return NEWPR matrix 299 | */ 300 | public static Matrix computeNewPR(TEMModel model, int z) { 301 | Matrix add1 = transM.times(LAMBDA); 302 | //In new PR matrix, the larger values are in the c-th column, the larger score node c tends to get. 303 | //Add user topic preference score and user topic expertise score in Matrix U 304 | double [][] newU = new double[NODENUM][NODENUM]; 305 | for(int i = 0; i < NODENUM; i++){ 306 | double userPreference = model.theta[i][z]; 307 | double userExpertise = 0d; 308 | for (int e = 0; e < model.ENum; e++) { 309 | userExpertise += model.phi[z][i][e] * model.fgmm.p_mu[e][0]; 310 | } 311 | double tepr = userPreference * userExpertise; 312 | for(int k = 0; k < NODENUM; k++){ 313 | newU[k][i] = tepr; 314 | } 315 | } 316 | U = new Matrix(newU); 317 | 318 | Matrix add2 = U.times( (1 - LAMBDA)); 319 | Matrix newPR = add1.plus(add2); 320 | return newPR; 321 | } 322 | } 323 | -------------------------------------------------------------------------------- /src/tem/linkas/TSPR.java: -------------------------------------------------------------------------------- 1 | package tem.linkas; 2 | 3 | /**Topic Sensitive PageRank Algorithm (CIKM 12 TSPR for Expert finding) 4 | * @author yangliu 5 | * @blog http://blog.csdn.net/yangliuy 6 | * @mail yangliuyx@gmail.com 7 | */ 8 | 9 | import java.io.FileInputStream; 10 | import java.io.FileWriter; 11 | import java.io.IOException; 12 | import java.io.ObjectInputStream; 13 | import java.io.PrintWriter; 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | import java.util.Random; 17 | 18 | import Jama.Matrix; 19 | 20 | import tem.com.FileUtil; 21 | import tem.conf.PathConfig; 22 | import tem.main.TEMModel; 23 | 24 | public class TSPR { 25 | private static double LAMBDA = 0.2; 26 | private static 
double THRESHOLD = 0.0000001; 27 | 28 | private static int NODENUM; // Node number 29 | private static Matrix U; // Matrix with all 1 30 | private static Matrix graphAdjM; //Adjancy matrix of graph 31 | private static Matrix transM;//Transition probability matrix 32 | 33 | public static void main(String[] args) throws IOException, ClassNotFoundException { 34 | String minPostNum = "80"; 35 | String modelName = "TSPR"; 36 | String userIDFile = PathConfig.originalDataPath + "USER" + minPostNum + "/user.IDs"; 37 | String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userAnswerNumWeighted.QAgraph"; 38 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userVoteWeighted.QAgraph"; 39 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/test.QAgraph"; 40 | //String TransMFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".transM"; 41 | String finalPRALLFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".finalPRAll"; 42 | ArrayList finalPRallLines = new ArrayList(); 43 | //TEMModel model = new TEMModel(); 44 | // load model 45 | //String modelFile = PathConfig.modelOutPath + "/USER80/ModelFile.model"; 46 | //System.out.println("reading a class from : " + modelFile); 47 | //FileInputStream fis = new FileInputStream(modelFile); 48 | //ObjectInputStream ois = new ObjectInputStream(fis); 49 | //model = (TEMModel) ois.readObject(); 50 | //ois.close(); 51 | //System.out.println(model.K); 52 | //System.out.println(model.theta.length); 53 | 54 | PrintWriter pw; 55 | readQAGraph(graphDataFile); 56 | NODENUM = graphAdjM.getRowDimension(); 57 | System.out.println("NODENUM : " + NODENUM); 58 | 59 | String LDAThetaFile = PathConfig.modelResPath + "LDA/lda_500.theta"; 60 | double [][] theta = FileUtil.read2DArray(LDAThetaFile); 61 | int K = theta[0].length; 62 | 63 | //1. 
Init PR state vector and Matrix U 64 | //Both randomly initialise or set all 1 are OK 65 | Matrix PR0 = initPRStateVector(); 66 | initU(); 67 | System.out.println("Initial state vector PR0 is:"); 68 | PR0.print(4, 4); 69 | 70 | 71 | for(int z = 0; z < K; z++){ 72 | System.out.println("now topic = " + z); 73 | //String newPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".newPR"; 74 | //String finalPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".finalPR"; 75 | 76 | //2. Compute transition probability matrix 77 | computeTransM(theta, z); 78 | //pw = new PrintWriter(new FileWriter(TransMFile)); 79 | //transM.print(pw, 4, 4); 80 | 81 | //3. Compute newPR update matrix 82 | Matrix newPR = computeNewPR(theta, z); 83 | //Normalized newPR matrix 84 | normal(newPR); 85 | //pw = new PrintWriter(new FileWriter(newPRFile)); 86 | //newPR.print(pw, 4, 4); 87 | 88 | //4. Iteratively update PR state vector 89 | Matrix pageRank = calPageRank(PR0, newPR); 90 | 91 | //5. 
Print final PageRank score 92 | System.out.println("Final PageRank is :"); 93 | pageRank.print(4, 4); 94 | 95 | String PRLine = ""; 96 | for(int i = 0; i < NODENUM; i++){ 97 | PRLine += pageRank.get(0, i) + "\t"; 98 | } 99 | finalPRallLines.add(PRLine); 100 | //saveFinalPR(finalPRFile, pageRank); 101 | } 102 | FileUtil.writeLines(finalPRALLFile, finalPRallLines); 103 | } 104 | 105 | private static void normal(Matrix newPR) { 106 | // TODO Auto-generated method stub 107 | for(int i = 0; i < NODENUM; i++){ 108 | double sum = 0; 109 | for(int j = 0; j < NODENUM; j++){ 110 | sum += newPR.get(i, j); 111 | } 112 | if(sum != 0){ 113 | for(int j = 0; j < NODENUM; j++){ 114 | newPR.set(i, j, newPR.get(i, j) / sum); 115 | } 116 | } 117 | } 118 | } 119 | 120 | private static void saveFinalPR(String finalPRFile, Matrix pageRank) { 121 | // TODO Auto-generated method stub 122 | ArrayList lines = new ArrayList(); 123 | for(int i = 0; i < pageRank.getRowDimension(); i++){ 124 | String line = ""; 125 | for(int j = 0; j < pageRank.getColumnDimension(); j++){ 126 | line += pageRank.get(i, j) + "\t"; 127 | } 128 | lines.add(line); 129 | } 130 | FileUtil.writeLines(finalPRFile, lines); 131 | } 132 | 133 | //Matrix with all 1 134 | private static void initU() { 135 | // TODO Auto-generated method stub 136 | double[][] u = new double[NODENUM][NODENUM]; 137 | for(int i = 0; i < NODENUM; i++){ 138 | for(int j = 0; j < NODENUM; j++){ 139 | u[i][j] = 1; 140 | } 141 | } 142 | U = new Matrix(u); 143 | } 144 | 145 | //Compute transition matrix 146 | private static void computeTransM(double[][] theta, int z) { 147 | // TODO Auto-generated method stub 148 | double[][] transm = new double[NODENUM][NODENUM]; 149 | for(int i = 0; i < NODENUM; i++){ 150 | double rowSum = 0; 151 | for(int j = 0; j < NODENUM; j++){ 152 | rowSum += graphAdjM.get(i, j) * sim(theta[i][z], theta[j][z]); 153 | } 154 | if(rowSum == 0){ 155 | for(int j = 0; j < NODENUM; j++){ 156 | transm [i][j] = 0; 157 | } 158 | } else { 
159 | for(int j = 0; j < NODENUM; j++){ 160 | double norWeight = graphAdjM.get(i, j) * sim(theta[i][z], theta[j][z]) / 161 | rowSum; 162 | transm [i][j] = norWeight; 163 | } 164 | } 165 | } 166 | transM = new Matrix(transm); 167 | } 168 | 169 | private static double sim(double f, double g) { 170 | // TODO Auto-generated method stub 171 | return 1 - Math.abs(f - g); 172 | } 173 | 174 | //compute pagerank 175 | public static Matrix calPageRank(Matrix PR0, Matrix newPR) { 176 | Matrix PR; 177 | while (true) { 178 | PR = PR0.times(newPR); 179 | double dis = calDistance(PR, PR0);//PR0 store PR vector after last iteration 180 | System.out.println("distance:" + dis); 181 | if (dis <= THRESHOLD) { 182 | System.out.println("PR:"); 183 | PR.print(4, 4); 184 | break; 185 | } 186 | PR0 = PR; 187 | } 188 | return PR; 189 | } 190 | 191 | private static Matrix initPRStateVector() { 192 | // TODO Auto-generated method stub 193 | double[] pr0M = new double[NODENUM]; 194 | for(int i = 0; i < NODENUM; i++){ 195 | pr0M[i] = 1; 196 | } 197 | return new Matrix(pr0M, 1); 198 | } 199 | 200 | private static void readQAGraph( 201 | String graphDataFile) { 202 | // TODO Auto-generated method stub 203 | ArrayList graphLines = new ArrayList(); 204 | FileUtil.readLines(graphDataFile, graphLines); 205 | double[][] graphMatrix = new double[graphLines.size()][]; 206 | double minNumber = 1000; 207 | for(int i = 0; i < graphLines.size(); i++){ 208 | String[] glineTokens = graphLines.get(i).split("\t"); 209 | graphMatrix[i] = new double[glineTokens.length]; 210 | for(int j = 0; j < glineTokens.length; j++){ 211 | double d = Double.valueOf(glineTokens[j]); 212 | if(d < 0){ 213 | graphMatrix[i][j] = 0; 214 | } else { 215 | graphMatrix[i][j] = d; 216 | } 217 | if(minNumber > graphMatrix[i][j]) { 218 | minNumber = graphMatrix[i][j]; 219 | } 220 | } 221 | } 222 | //If there is negative number is matrix, find the min one x. 
Then all number add |x| 223 | System.out.println("minNumber " + minNumber); 224 | /*if (minNumber < 0){ 225 | for(int i = 0; i < graphMatrix.length; i++){ 226 | for(int j = 0; j < graphMatrix[i].length; j++){ 227 | graphMatrix[i][j] += (0 - minNumber); 228 | } 229 | } 230 | }*/ 231 | graphAdjM = new Matrix(graphMatrix); 232 | } 233 | 234 | public static void printMatrix(List> m) { 235 | for (int i = 0; i < m.size(); i++) { 236 | for (int j = 0; j < m.get(i).size(); j++) { 237 | System.out.print(m.get(i).get(j) + "\t"); 238 | } 239 | System.out.println(); 240 | } 241 | } 242 | 243 | public static void printVec(List v) { 244 | for (int i = 0; i < v.size(); i++) { 245 | System.out.print(v.get(i) + "\t"); 246 | } 247 | System.out.println(); 248 | } 249 | 250 | /** 251 | * Randomly Initialise state vector PR0 252 | * 253 | * @param n 254 | * dimension of vector PR0 255 | * @return A random vector, each dimension is 0-5 256 | */ 257 | public static List randomInitPR0(int n) { 258 | Random random = new Random(); 259 | List q = new ArrayList(); 260 | for (int i = 0; i < n; i++) { 261 | q.add(new Double(5 * random.nextDouble())); 262 | } 263 | return q; 264 | } 265 | 266 | /** 267 | * Compute Euclidean Distance 268 | * 269 | * @param q1 270 | * 271 | * @param q2 272 | * 273 | * @return distance 274 | */ 275 | public static double calDistance(Matrix q1, Matrix q2) { 276 | double sum = 0; 277 | 278 | if (q1.getColumnDimension() != q2.getColumnDimension() ) { 279 | return -1; 280 | } 281 | 282 | for (int i = 0; i < q1.getColumnDimension() ; i++) { 283 | sum += Math.pow(q1.get(0, i) - q2.get(0, i), 284 | 2); 285 | } 286 | return Math.sqrt(sum); 287 | } 288 | 289 | /** 290 | * compute NEWPR matrix 291 | * 292 | * @return NEWPR matrix 293 | */ 294 | public static Matrix computeNewPR(double[][] theta, int z) { 295 | Matrix add1 = transM.times(LAMBDA); 296 | //In new PR matrix, the larger values are in the c-th column, the larger score node c tends to get. 
297 | double [][] newU = new double[NODENUM][NODENUM]; 298 | for(int i = 0; i < NODENUM; i++){ 299 | double userPreference = theta[i][z]; 300 | double tspr = userPreference; 301 | for(int k = 0; k < NODENUM; k++){ 302 | newU[k][i] = tspr; 303 | } 304 | } 305 | U = new Matrix(newU); 306 | 307 | Matrix add2 = U.times( (1 - LAMBDA)); 308 | Matrix newPR = add1.plus(add2); 309 | return newPR; 310 | } 311 | } 312 | -------------------------------------------------------------------------------- /src/tem/main/LdaGibbsSampling.java: -------------------------------------------------------------------------------- 1 | package tem.main; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | 7 | import tem.com.FileUtil; 8 | import tem.com.JC; 9 | import tem.conf.ConstantConfig; 10 | import tem.conf.PathConfig; 11 | import tem.main.Documents; 12 | import tem.main.TEMModelSampling.modelparameters; 13 | 14 | /**Liu Yang's implementation of Gibbs Sampling of LDA 15 | * @author yangliu 16 | * @blog http://blog.csdn.net/yangliuy 17 | * @mail yangliuyx@gmail.com 18 | */ 19 | 20 | public class LdaGibbsSampling { 21 | 22 | public static class modelparameters { 23 | float alpha = 1f; //usual value is 50 / K 24 | float beta = 0.1f;//usual value is 0.1 25 | int topicNum = 15; 26 | int iteration = 500; 27 | int saveStep = 20; 28 | int beginSaveIters = 440; 29 | } 30 | 31 | /**Get parameters from configuring file. If the 32 | * configuring file has value in it, use the value. 
33 | * Else the default value in program will be used 34 | * @param ldaparameters 35 | * @param parameterFile 36 | * @return void 37 | */ 38 | private static void getParametersFromFile(modelparameters ldaparameters, 39 | String parameterFile) { 40 | // TODO Auto-generated method stub 41 | ArrayList paramLines = new ArrayList(); 42 | FileUtil.readLines(parameterFile, paramLines); 43 | for(String line : paramLines){ 44 | String[] lineParts = line.split("\t"); 45 | switch(parameters.valueOf(lineParts[0])){ 46 | case alpha: 47 | ldaparameters.alpha = Float.valueOf(lineParts[1]); 48 | break; 49 | case beta: 50 | ldaparameters.beta = Float.valueOf(lineParts[1]); 51 | break; 52 | case topicNum: 53 | ldaparameters.topicNum = Integer.valueOf(lineParts[1]); 54 | break; 55 | case iteration: 56 | ldaparameters.iteration = Integer.valueOf(lineParts[1]); 57 | break; 58 | case saveStep: 59 | ldaparameters.saveStep = Integer.valueOf(lineParts[1]); 60 | break; 61 | case beginSaveIters: 62 | ldaparameters.beginSaveIters = Integer.valueOf(lineParts[1]); 63 | break; 64 | } 65 | } 66 | } 67 | 68 | public enum parameters{ 69 | alpha, beta, topicNum, iteration, saveStep, beginSaveIters; 70 | } 71 | 72 | /** 73 | * @param args 74 | * @throws IOException 75 | * @throws ClassNotFoundException 76 | */ 77 | public static void main(String[] args) throws IOException, ClassNotFoundException { 78 | // TODO Auto-generated method stub 79 | String dataPath = PathConfig.modelResPath + "USER80/"; 80 | String minPostNum = PathConfig.minPostNum; 81 | Documents docSet = new Documents(); 82 | String docfile = dataPath + "USER" + minPostNum + ".data"; 83 | docSet = FileUtil.loadClass(docSet, docfile); 84 | 85 | System.out.println("indexToTermMap size : " 86 | + docSet.indexToTermMap.size()); 87 | // System.out.println("indexToTermMap : " + docSet.indexToTermMap); 88 | System.out.println("indexToTagMap size : " 89 | + docSet.indexToTagMap.size()); 90 | System.out.println("indexToVoteMap size : " 91 | + 
docSet.indexToVoteMap.size()); 92 | 93 | modelparameters ldaparameters = new modelparameters(); 94 | System.out.println("Topic Num : " + ldaparameters.topicNum); 95 | LdaModel model = new LdaModel(ldaparameters); 96 | System.out.println("1 Initialize the model ..."); 97 | model.initializeModel(docSet); 98 | System.out.println("2 Learning and Saving the model ..."); 99 | model.inferenceModel(docSet); 100 | System.out.println("3 Output the final model ..."); 101 | model.saveIteratedModel(ldaparameters.iteration, docSet); 102 | System.out.println("Done!"); 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/tem/main/LdaModel.java: -------------------------------------------------------------------------------- 1 | package tem.main; 2 | 3 | /**Class for Lda model 4 | * @author yangliu 5 | * @blog http://blog.csdn.net/yangliuy 6 | * @mail yangliuyx@gmail.com 7 | */ 8 | import java.io.BufferedWriter; 9 | import java.io.FileWriter; 10 | import java.io.IOException; 11 | import java.util.ArrayList; 12 | import java.util.Collections; 13 | import java.util.Comparator; 14 | import java.util.List; 15 | 16 | import tem.com.FileUtil; 17 | import tem.conf.PathConfig; 18 | import tem.main.Documents; 19 | 20 | public class LdaModel { 21 | 22 | int [][] doc;//word index array 23 | int V, K, M;//vocabulary size, topic number, document number 24 | int [][] z;//topic label array 25 | float alpha; //doc-topic dirichlet prior parameter 26 | float beta; //topic-word dirichlet prior parameter 27 | int [][] nmk;//given document m, count times of topic k. M*K 28 | int [][] nkt;//given topic k, count times of term t. 
K*V 29 | int [] nmkSum;//Sum for each row in nmk 30 | int [] nktSum;//Sum for each row in nkt 31 | double [][] phi;//Parameters for topic-word distribution K*V 32 | double [][] theta;//Parameters for doc-topic distribution M*K 33 | int iterations;//Times of iterations 34 | int saveStep;//The number of iterations between two saving 35 | int beginSaveIters;//Begin save model at this iteration 36 | 37 | public LdaModel(LdaGibbsSampling.modelparameters modelparam) { 38 | // TODO Auto-generated constructor stub 39 | alpha = modelparam.alpha; 40 | beta = modelparam.beta; 41 | iterations = modelparam.iteration; 42 | K = modelparam.topicNum; 43 | saveStep = modelparam.saveStep; 44 | beginSaveIters = modelparam.beginSaveIters; 45 | } 46 | 47 | public void initializeModel(Documents docSet) { 48 | // TODO Auto-generated method stub 49 | M = docSet.docs.size(); 50 | V = docSet.termToIndexMap.size(); 51 | nmk = new int [M][K]; 52 | nkt = new int[K][V]; 53 | nmkSum = new int[M]; 54 | nktSum = new int[K]; 55 | phi = new double[K][V]; 56 | theta = new double[M][K]; 57 | 58 | //initialize documents index array 59 | doc = new int[M][]; 60 | for(int m = 0; m < M; m++){ 61 | //Notice the limit of memory 62 | int N = 0; 63 | for(int i = 0; i < docSet.docs.get(m).docWords.length; i++){ 64 | for(int j = 0; j < docSet.docs.get(m).docWords[i].length; j++){ 65 | N++; 66 | } 67 | } 68 | doc[m] = new int[N]; 69 | int n = 0; 70 | for(int i = 0; i < docSet.docs.get(m).docWords.length; i++){ 71 | for(int j = 0; j < docSet.docs.get(m).docWords[i].length; j++){ 72 | doc[m][n] = docSet.docs.get(m).docWords[i][j]; 73 | n++; 74 | } 75 | } 76 | } 77 | 78 | //initialize topic lable z for each word 79 | z = new int[M][]; 80 | for(int m = 0; m < M; m++){ 81 | int N = doc[m].length; 82 | z[m] = new int[N]; 83 | for(int n = 0; n < N; n++){ 84 | int initTopic = (int)(Math.random() * K);// From 0 to K - 1 85 | z[m][n] = initTopic; 86 | //number of words in doc m assigned to topic initTopic add 1 87 | 
nmk[m][initTopic]++; 88 | //number of terms doc[m][n] assigned to topic initTopic add 1 89 | nkt[initTopic][doc[m][n]]++; 90 | // total number of words assigned to topic initTopic add 1 91 | nktSum[initTopic]++; 92 | } 93 | // total number of words in document m is N 94 | nmkSum[m] = N; 95 | } 96 | } 97 | 98 | public void inferenceModel(Documents docSet) throws IOException { 99 | // TODO Auto-generated method stub 100 | if(iterations < saveStep + beginSaveIters){ 101 | System.err.println("Error: the number of iterations should be larger than " + (saveStep + beginSaveIters)); 102 | System.exit(0); 103 | } 104 | for(int i = 0; i < iterations; i++){ 105 | System.out.println("Iteration " + i); 106 | if((i >= beginSaveIters) && (((i - beginSaveIters) % saveStep) == 0)){ 107 | //Saving the model 108 | System.out.println("Saving model at iteration " + i +" ... "); 109 | //Firstly update parameters 110 | updateEstimatedParameters(); 111 | //Secondly print model variables 112 | saveIteratedModel(i, docSet); 113 | } 114 | 115 | //Use Gibbs Sampling to update z[][] 116 | for(int m = 0; m < M; m++){ 117 | int N = doc[m].length; 118 | for(int n = 0; n < N; n++){ 119 | // Sample from p(z_i|z_-i, w) 120 | int newTopic = sampleTopicZ(m, n); 121 | z[m][n] = newTopic; 122 | } 123 | } 124 | } 125 | } 126 | 127 | private void updateEstimatedParameters() { 128 | // TODO Auto-generated method stub 129 | for(int k = 0; k < K; k++){ 130 | for(int t = 0; t < V; t++){ 131 | phi[k][t] = (nkt[k][t] + beta) / (nktSum[k] + V * beta); 132 | } 133 | } 134 | 135 | for(int m = 0; m < M; m++){ 136 | for(int k = 0; k < K; k++){ 137 | theta[m][k] = (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha); 138 | } 139 | } 140 | } 141 | 142 | private int sampleTopicZ(int m, int n) { 143 | // TODO Auto-generated method stub 144 | // Sample from p(z_i|z_-i, w) using Gibbs upde rule 145 | 146 | //Remove topic label for w_{m,n} 147 | int oldTopic = z[m][n]; 148 | nmk[m][oldTopic]--; 149 | nkt[oldTopic][doc[m][n]]--; 
150 | nmkSum[m]--; 151 | nktSum[oldTopic]--; 152 | 153 | //Compute p(z_i = k|z_-i, w) 154 | double [] p = new double[K]; 155 | for(int k = 0; k < K; k++){ 156 | p[k] = (nkt[k][doc[m][n]] + beta) / (nktSum[k] + V * beta) * (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha); 157 | } 158 | 159 | //Sample a new topic label for w_{m, n} like roulette 160 | //Compute cumulated probability for p 161 | for(int k = 1; k < K; k++){ 162 | p[k] += p[k - 1]; 163 | } 164 | double u = Math.random() * p[K - 1]; //p[] is unnormalised 165 | int newTopic; 166 | for(newTopic = 0; newTopic < K; newTopic++){ 167 | if(u < p[newTopic]){ 168 | break; 169 | } 170 | } 171 | 172 | //Add new topic label for w_{m, n} 173 | nmk[m][newTopic]++; 174 | nkt[newTopic][doc[m][n]]++; 175 | nmkSum[m]++; 176 | nktSum[newTopic]++; 177 | return newTopic; 178 | } 179 | 180 | public void saveIteratedModel(int iters, Documents docSet) throws IOException { 181 | // TODO Auto-generated method stub 182 | //lda.params lda.phi lda.theta lda.tassign lda.twords 183 | //lda.params 184 | String resPath = PathConfig.modelResPath + "LDA/"; 185 | String modelName = "lda_" + iters; 186 | ArrayList lines = new ArrayList(); 187 | lines.add("alpha = " + alpha); 188 | lines.add("beta = " + beta); 189 | lines.add("topicNum = " + K); 190 | lines.add("docNum = " + M); 191 | lines.add("termNum = " + V); 192 | lines.add("iterations = " + iterations); 193 | lines.add("saveStep = " + saveStep); 194 | lines.add("beginSaveIters = " + beginSaveIters); 195 | FileUtil.writeLines(resPath + modelName + ".params", lines); 196 | 197 | //lda.phi K*V 198 | BufferedWriter writer = new BufferedWriter(new FileWriter(resPath + modelName + ".phi")); 199 | for (int i = 0; i < K; i++){ 200 | for (int j = 0; j < V; j++){ 201 | writer.write(phi[i][j] + "\t"); 202 | } 203 | writer.write("\n"); 204 | } 205 | writer.close(); 206 | 207 | //lda.theta M*K 208 | writer = new BufferedWriter(new FileWriter(resPath + modelName + ".theta")); 209 | for(int i = 0; i < 
M; i++){ 210 | for(int j = 0; j < K; j++){ 211 | writer.write(theta[i][j] + "\t"); 212 | } 213 | writer.write("\n"); 214 | } 215 | writer.close(); 216 | 217 | //lda.tassign 218 | writer = new BufferedWriter(new FileWriter(resPath + modelName + ".tassign")); 219 | for(int m = 0; m < M; m++){ 220 | for(int n = 0; n < doc[m].length; n++){ 221 | writer.write(doc[m][n] + ":" + z[m][n] + "\t"); 222 | } 223 | writer.write("\n"); 224 | } 225 | writer.close(); 226 | 227 | //lda.twords phi[][] K*V 228 | writer = new BufferedWriter(new FileWriter(resPath + modelName + ".twords")); 229 | int topNum = 20; //Find the top 20 topic words in each topic 230 | for(int i = 0; i < K; i++){ 231 | List tWordsIndexArray = new ArrayList(); 232 | for(int j = 0; j < V; j++){ 233 | tWordsIndexArray.add(new Integer(j)); 234 | } 235 | Collections.sort(tWordsIndexArray, new LdaModel.TwordsComparable(phi[i])); 236 | writer.write("topic " + i + ":\t"); 237 | for(int t = 0; t < topNum; t++){ 238 | writer.write(docSet.indexToTermMap.get(tWordsIndexArray.get(t)) + "\t"); 239 | } 240 | writer.write("\n"); 241 | } 242 | writer.close(); 243 | } 244 | 245 | public class TwordsComparable implements Comparator { 246 | public double [] sortProb; // Store probability of each word in topic k 247 | 248 | public TwordsComparable (double[] sortProb){ 249 | this.sortProb = sortProb; 250 | } 251 | 252 | @Override 253 | public int compare(Integer o1, Integer o2) { 254 | // TODO Auto-generated method stub 255 | //Sort topic word index according to the probability of each word in topic k 256 | if(sortProb[o1] > sortProb[o2]) return -1; 257 | else if(sortProb[o1] < sortProb[o2]) return 1; 258 | else return 0; 259 | } 260 | } 261 | } 262 | -------------------------------------------------------------------------------- /src/tem/main/ModelComFunc.java: -------------------------------------------------------------------------------- 1 | package tem.main; 2 | 3 | import java.io.BufferedWriter; 4 | import 
java.io.PrintWriter; 5 | import java.util.ArrayList; 6 | 7 | import tem.com.MatrixUtil; 8 | 9 | public class ModelComFunc { 10 | 11 | public static void writeData(float[] array, ArrayList strings, 12 | ArrayList rankList, BufferedWriter writer, String prefix) { 13 | PrintWriter writer2 = new PrintWriter(writer); 14 | for (int row = 0; row < rankList.size(); row++) { 15 | writer2.printf("%s\t%s\t%f\n", prefix, 16 | strings.get(rankList.get(row)), array[rankList.get(row)]); 17 | // writer2.printf(prefix + "\t", 18 | // strings.get(rankList.get(row)) + "\t" + array[rankList.get(row)] 19 | // + "\n"); 20 | } 21 | } 22 | 23 | // public static void writeData(ArrayList[] cNP2, BufferedWriter 24 | // writer) { 25 | // PrintWriter writer2 = new PrintWriter(writer); 26 | // writer2 = new PrintWriter(writer); 27 | // for (int i = 0; i < cNP2.length; i++) { 28 | // // writer2.printf("%d-th topic:\n", i); 29 | // for (int j = 0; j < cNP2[i].size(); j++) { 30 | // // writer2.printf("%s,\t", Doc.getNps().get(cNP2[i].get(j))); 31 | // } 32 | // writer2.print("\n\n"); 33 | // } 34 | // } 35 | 36 | public static void writeData(float[] pi, BufferedWriter writer) { 37 | PrintWriter writer2 = new PrintWriter(writer); 38 | for (int row = 0; row < pi.length; row++) { 39 | writer2.printf("\t%f", pi[row]); 40 | } 41 | } 42 | 43 | public static void writeData(int[][] phi2, PrintWriter writer2) { 44 | for (int row = 0; row < phi2.length; row++) { 45 | // writer2.printf("%d", row); 46 | for (int col = 0; col < phi2[row].length; col++) { 47 | writer2.printf("%d\t", phi2[row][col]); 48 | // writer2.printf(phi2[row][col] + "\t"); 49 | } 50 | writer2.print("\n"); 51 | } 52 | } 53 | 54 | public static void writeData(float[][] array, BufferedWriter writer) { 55 | PrintWriter writer2 = new PrintWriter(writer); 56 | for (int row = 0; row < array.length; row++) { 57 | // writer2.printf("%d\t", row); 58 | for (int col = 0; col < array[row].length; col++) { 59 | writer2.printf("%f\t", array[row][col]); 
60 | // writer2.printf(array[row][col] + "\t"); 61 | } 62 | writer2.print("\n"); 63 | } 64 | } 65 | 66 | public static void writeData(double[][] vph2, BufferedWriter writer) { 67 | PrintWriter writer2 = new PrintWriter(writer); 68 | for (int row = 0; row < vph2.length; row++) { 69 | // writer2.printf("%d", row); 70 | for (int col = 0; col < vph2[row].length; col++) { 71 | writer2.printf("\t%f", vph2[row][col]); 72 | // writer2.printf("\t" + vph2[row][col]); 73 | } 74 | writer2.print("\n"); 75 | } 76 | } 77 | 78 | public static void writeData(float[][][] a, BufferedWriter writer) { 79 | PrintWriter writer2 = new PrintWriter(writer); 80 | for (int i = 0; i < a.length; i++) 81 | for (int row = 0; row < a[i].length; row++) { 82 | writer2.printf("%d\t%d", i, row); 83 | for (int col = 0; col < a[i][row].length; col++) { 84 | writer2.printf("\t%f", a[i][row][col]); 85 | } 86 | writer2.print("\n"); 87 | } 88 | } 89 | 90 | public static void writeData(float[][][][] data, BufferedWriter writer) { 91 | PrintWriter writer2 = new PrintWriter(writer); 92 | for (int d = 0; d < data.length; d++) 93 | for (int a = 0; a < data[d].length; a++) 94 | for (int s = 0; s < data[d][a].length; s++) { 95 | writer2.printf("%d\t%d\t%d", d, a, s); 96 | for (int w = 0; w < data[d][a][s].length; w++) { 97 | writer2.printf("\t%f", data[d][a][s][w]); 98 | } 99 | writer2.print("\n"); 100 | } 101 | } 102 | 103 | public static void writeData(ArrayList> rankLists, 104 | ArrayList> probs, ArrayList uniWordMap, 105 | ArrayList names, BufferedWriter writer, String string) 106 | throws Exception { 107 | // string: "\t" 108 | // names.get(0) names.get(1) ... 109 | // w11:probs11 w12:probs12 ... 110 | // w21:probs21 w21:probs22 ... 
111 | // rankLists.get(0) rankLists.get(1) 112 | int maxsize = rankLists.get(0).size(); 113 | for (int i = 0; i < rankLists.size(); i++) { 114 | // get max size 115 | if (rankLists.get(i).size() > maxsize) 116 | maxsize = rankLists.get(i).size(); 117 | } 118 | for (int i = 0; i < names.size(); i++) { 119 | writer.write(names.get(i) + string + string); 120 | } 121 | writer.write("\n"); 122 | 123 | for (int j = 0; j < maxsize; j++) { 124 | for (int i = 0; i < rankLists.size(); i++) { 125 | if (rankLists.get(i).size() > j && probs.get(i).size() > j) { 126 | writer.write(uniWordMap.get(rankLists.get(i).get(j)) 127 | + string + probs.get(i).get(j) + string); 128 | } else 129 | writer.write("null" + string + "0" + string); 130 | } 131 | writer.write("\n"); 132 | } 133 | } 134 | 135 | public static boolean checkEqual(int[][][][] a, int[][][] b, String string) { 136 | for (int i = 0; i < a.length; i++) { 137 | for (int j = 0; j < a[i].length; j++) { 138 | for (int k = 0; k < a[i][j].length; k++) { 139 | if (IsLessThanZero(a[i][j][k])) 140 | return false; 141 | } 142 | } 143 | } 144 | for (int i = 0; i < b.length; i++) { 145 | for (int j = 0; j < b[i].length; j++) { 146 | if (IsLessThanZero(b[i][j])) 147 | return false; 148 | } 149 | } 150 | for (int k = 0; k < a.length; k++) { 151 | for (int i = 0; i < a[k].length; i++) { 152 | for (int j = 0; j < a[k][i].length; j++) { 153 | double c = MatrixUtil.sumRow(a[k][i], j); 154 | if (c != b[k][i][j]) { 155 | System.out.println(string + "\t" + c + "\t" + b[i]); 156 | return false; 157 | } 158 | } 159 | } 160 | } 161 | return true; 162 | } 163 | 164 | public static boolean checkEqual(int[][][] a, int[][] b, String string) { 165 | for (int i = 0; i < a.length; i++) { 166 | for (int j = 0; j < a[i].length; j++) { 167 | if (IsLessThanZero(a[i][j])) 168 | return false; 169 | } 170 | } 171 | for (int i = 0; i < b.length; i++) { 172 | if (IsLessThanZero(b[i])) 173 | return false; 174 | } 175 | for (int i = 0; i < a.length; i++) { 176 | 
for (int j = 0; j < a[i].length; j++) { 177 | double c = MatrixUtil.sumRow(a[i], j); 178 | if (c != b[i][j]) { 179 | System.out.println(string + "\t" + c + "\t" + b[i]); 180 | return false; 181 | } 182 | } 183 | } 184 | return true; 185 | } 186 | 187 | static boolean checkEqual(int[][] a, int[] b, String string) { 188 | 189 | for (int i = 0; i < a.length; i++) { 190 | if (IsLessThanZero(a[i])) 191 | return false; 192 | } 193 | if (IsLessThanZero(b)) 194 | return false; 195 | for (int i = 0; i < a.length; i++) { 196 | double c = MatrixUtil.sumRow(a, i); 197 | if (c != b[i]) { 198 | System.out.println(string + "\t" + c + "\t" + b[i]); 199 | return false; 200 | } 201 | } 202 | return true; 203 | } 204 | 205 | private boolean checkEqual(double a, double b, String string) { 206 | if (a < 0 || b < 0) 207 | return false; 208 | if (a != b) { 209 | System.out.println(string + "\t" + a + "\t" + b); 210 | return false; 211 | } else { 212 | return true; 213 | } 214 | } 215 | 216 | public static boolean checkEqual(int[] a, int b, String string) { 217 | if (IsLessThanZero(a) || b < 0) 218 | return false; 219 | double c = MatrixUtil.sum(a); 220 | if (c != b) { 221 | System.out.println(string + "\t" + c + "\t" + b); 222 | return false; 223 | } 224 | return true; 225 | } 226 | 227 | private static boolean IsLessThanZero(int[] b) { 228 | for (int i = 0; i < b.length; i++) { 229 | if (b[i] < 0) 230 | return true; 231 | } 232 | return false; 233 | } 234 | 235 | private static boolean IsLessThanZero(double[] b) { 236 | for (int i = 0; i < b.length; i++) { 237 | if (b[i] < 0) 238 | return true; 239 | } 240 | return false; 241 | } 242 | 243 | protected static double checkDoubleOverflow(double probs, int pos, 244 | int[] countP) { 245 | if (probs < 0) { 246 | System.err.println(probs + "\t" + pos); 247 | for (int i = 0; i < countP.length; i++) 248 | System.err.print(countP[i] + " "); 249 | throw new IndexOutOfBoundsException("p is negative!!"); 250 | } 251 | if (probs > 1e+150d) { 252 | 
// System.out.println("p is too large for double type (> 2e+150d)."); 253 | countP[pos]++; 254 | return (probs / 1e+150); 255 | } 256 | if (probs < 1e-150d) { 257 | // System.out.println("p is too small for double type (< 5e-150d)."); 258 | countP[pos]--; 259 | return (probs * 1e+150); 260 | } 261 | return probs; 262 | } 263 | 264 | static void reAssignP(double[] p, int[] countP) { 265 | // p and countP should be the same length 266 | int maxV = countP[0]; 267 | for (int i = 0; i < countP.length; i++) { 268 | // System.out.print(p[i] + ":" + countP[i] + "\t"); 269 | if (countP[i] > maxV) { 270 | maxV = countP[i]; 271 | } 272 | } 273 | // System.out.println("\t max:" + maxV); 274 | for (int i = 0; i < p.length; i++) { 275 | p[i] *= Math.pow(1e+150, countP[i] - maxV); 276 | // System.out.print(p[i] + "\t"); 277 | } 278 | // System.out.println(); 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /src/tem/main/SimpleEvaluate.java: -------------------------------------------------------------------------------- 1 | package tem.main; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Collections; 6 | import java.util.Comparator; 7 | import java.util.List; 8 | 9 | import tem.com.FileUtil; 10 | import tem.conf.PathConfig; 11 | import tem.main.Documents.Document; 12 | 13 | /**Simple evaluation for TEM result 14 | * Compute utopics and kuExpertiseScore file 15 | * kuExpertiseScore Matrix is also used in 16 | * Topic Expertise PageRank 17 | * @author yangliu 18 | * @blog http://blog.csdn.net/yangliuy 19 | * @mail yangliuyx@gmail.com 20 | */ 21 | public class SimpleEvaluate { 22 | 23 | static int K = 15; 24 | static int E = 4; 25 | 26 | /** 27 | * @param args 28 | * @throws ClassNotFoundException 29 | * @throws IOException 30 | */ 31 | public static void main(String[] args) throws IOException, ClassNotFoundException { 32 | // TODO Auto-generated method stub 33 | String minPostNum = "80"; 
34 | String trainDocfile = PathConfig.modelResPath + "USER" + minPostNum + "/USER" + minPostNum + ".data"; 35 | Documents trainDocSet = new Documents(); 36 | trainDocSet = FileUtil.loadClass(trainDocSet, trainDocfile); 37 | System.out.println("train terms: " + trainDocSet.termCountMap.size()); 38 | 39 | String testDataFolder = PathConfig.testDataPath; 40 | Documents testDocSet = new Documents(); 41 | 42 | 43 | testDocSet.readQATestDocs(testDataFolder, trainDocSet); 44 | String testDocfile = testDataFolder + "QATest.data"; 45 | FileUtil.saveClass(testDocSet, testDocfile); 46 | 47 | //Document questionDoc = testDocSet.docs.get(0); 48 | 49 | System.out.println(testDocSet.termCountMap.size()); 50 | System.out.println(testDocSet.tagCountMap.size()); 51 | System.out.println(testDocSet.voteCountMap.size()); 52 | System.out.println(testDocSet.docs.size()); 53 | 54 | //trainDocSet.copyTrainDocVocals(testDocSet); 55 | //FileUtil.saveClass(trainDocSet, trainDocfile); 56 | 57 | /*String userIDFile = PathConfig.scriptDataPath + "USERID" + minPostNum; 58 | String resPath = PathConfig.modelResPath + "USER" + minPostNum + "/model_" + 400; 59 | String resultPath = PathConfig.modelResPath + "USER" + minPostNum + "/"; 60 | ArrayList userIDs = new ArrayList(); 61 | FileUtil.readLines(userIDFile, userIDs); 62 | Documents docSet = new Documents(); 63 | String docfile = resultPath + "USER" + minPostNum + ".data"; 64 | docSet = FileUtil.loadClass(docSet, docfile); 65 | int U = userIDs.size(); 66 | float [][] theta = new float[U][K]; 67 | float [][][] phi = new float[K][U][E]; 68 | float [][] tau = new float[E][docSet.voteToIndexMap.size()]; 69 | float [] expertiseMean = new float[E]; 70 | readTheta(theta, resPath + ".theta"); 71 | readPhi(phi, resPath + ".phi"); 72 | readTau(tau, resPath + ".tau"); 73 | SimpleEvaluate se = new SimpleEvaluate(); 74 | se.printUtopics(theta, U, resPath, userIDs); 75 | se.computeExpertiseMean(tau, docSet, expertiseMean); 76 | se.printKUExpertiseScore(phi, 
expertiseMean, U, resPath);*/ 77 | } 78 | 79 | private void computeExpertiseMean(float[][] tau, Documents docSet, 80 | float[] expertiseMean) { 81 | // TODO Auto-generated method stub 82 | for(int i = 0; i < E; i++){ 83 | float mean = 0; 84 | for(int j = 0; j < docSet.indexToVoteMap.size(); j++){ 85 | mean += Float.valueOf(docSet.indexToVoteMap.get(j)) * tau[i][j]; 86 | } 87 | expertiseMean[i] = mean; 88 | System.out.println("expertise " + i + " mean : " + mean); 89 | } 90 | } 91 | 92 | private void printKUExpertiseScore(float[][][] phi, float[] expertiseMean, int U, String resPath) { 93 | // TODO Auto-generated method stub 94 | ArrayList KUEMeanLines = new ArrayList(); 95 | for(int k = 0; k < K; k++){ 96 | String line = ""; 97 | for(int u = 0; u < U; u++){ 98 | float expertiseScore = 0; 99 | for(int e = 0; e < E; e++){ 100 | expertiseScore += expertiseMean[e] * phi[k][u][e]; 101 | } 102 | line += expertiseScore + "\t"; 103 | } 104 | KUEMeanLines.add(line); 105 | } 106 | FileUtil.writeLines(resPath + ".KUexpertiseScore", KUEMeanLines); 107 | } 108 | 109 | private void printUtopics(float[][] theta, int U, String resPath, ArrayList userIDs) { 110 | // TODO Auto-generated method stub 111 | //model.utopics 112 | ArrayList utopicsLines = new ArrayList(); 113 | for(int i = 0; i < U; i++){ 114 | List tWordsIndexArray = new ArrayList(); 115 | for(int t = 0; t < K; t++){ 116 | tWordsIndexArray.add(new Integer(t)); 117 | } 118 | Collections.sort(tWordsIndexArray, new SimpleEvaluate.TwordsComparable(theta[i])); 119 | String line = "UserID = " + userIDs.get(i) + "\t"; 120 | for(int t = 0; t < K; t++){ 121 | line += tWordsIndexArray.get(t) + "\t"; 122 | } 123 | utopicsLines.add(line); 124 | } 125 | FileUtil.writeLines(resPath + ".utopics", utopicsLines); 126 | } 127 | 128 | private static void readTau(float[][] tau, String file) { 129 | // TODO Auto-generated method stub 130 | ArrayList lines = new ArrayList(); 131 | FileUtil.readLines(file, lines); 132 | for(int i = 0; i < 
tau.length; i++){ 133 | String[] tokens = lines.get(i).split("\t"); 134 | for(int j = 0 ; j < tau[i].length; j++){ 135 | tau[i][j] = Float.valueOf(tokens[j]); 136 | } 137 | } 138 | } 139 | 140 | private static void readPhi(float[][][] phi, String file) { 141 | // TODO Auto-generated method stub 142 | ArrayList lines = new ArrayList(); 143 | FileUtil.readLines(file, lines); 144 | for(String line : lines){ 145 | String[] tokens = line.split("\t"); 146 | int i = Integer.valueOf(tokens[0]); 147 | int j = Integer.valueOf(tokens[1]); 148 | int k = Integer.valueOf(tokens[2]); 149 | phi[i][j][k] = Float.valueOf(tokens[3]); 150 | } 151 | } 152 | 153 | private static void readTheta(float[][] theta, String file) { 154 | // TODO Auto-generated method stub 155 | ArrayList lines = new ArrayList(); 156 | FileUtil.readLines(file, lines); 157 | for(int i = 0; i < theta.length; i++){ 158 | String[] tokens = lines.get(i).split("\t"); 159 | for(int j = 0 ; j < theta[i].length; j++){ 160 | theta[i][j] = Float.valueOf(tokens[j]); 161 | } 162 | } 163 | } 164 | 165 | public class TwordsComparable implements Comparator { 166 | public float [] sortProb; // Store probability of each word in topic k 167 | 168 | public TwordsComparable (float[] sortProb){ 169 | this.sortProb = sortProb; 170 | } 171 | 172 | @Override 173 | public int compare(Integer o1, Integer o2) { 174 | // TODO Auto-generated method stub 175 | //Sort topic word index according to the probability of each word in topic k 176 | if(sortProb[o1] > sortProb[o2]) return -1; 177 | else if(sortProb[o1] < sortProb[o2]) return 1; 178 | else return 0; 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/tem/main/TEMModelSampling.java: -------------------------------------------------------------------------------- 1 | package tem.main; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.File; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import 
java.util.ArrayList; 8 | 9 | import tem.com.ComUtil; 10 | import tem.com.FileUtil; 11 | import tem.com.JC; 12 | import tem.com.MatrixUtil; 13 | import tem.conf.ConstantConfig; 14 | import tem.conf.PathConfig; 15 | import tem.main.Documents; 16 | import tem.main.Documents.Document; 17 | import tem.main.TEMModel; 18 | 19 | /** 20 | * Gibbs Sampling of Topic Expertise Model 21 | * 22 | * @author yangliu 23 | * @blog http://blog.csdn.net/yangliuy 24 | * @mail yangliuyx@gmail.com 25 | */ 26 | 27 | public class TEMModelSampling { 28 | 29 | public static class modelparameters { 30 | float alpha = 0.5f;// usual value is 50 / K 31 | float beta = 0.01f; 32 | float gamma = 0.01f; 33 | float eta = 0.1f;// usual value is 0.1 34 | float xi = 0.01f; 35 | int topicNum = 20; 36 | int expertiseNum = 3; 37 | 38 | int iteration = 300; 39 | int saveStep = 20; 40 | int beginSaveIters = 5; 41 | } 42 | 43 | /** 44 | * Get parameters from configuring file. If the configuring file has value 45 | * in it, use the value. 
Else the default value in program will be used 46 | * 47 | * @param ldaparameters 48 | * @param parameterFile 49 | * @return void 50 | */ 51 | private static void getParametersFromFile(modelparameters ldaparameters, 52 | String parameterFile) { 53 | // TODO Auto-generated method stub 54 | ArrayList paramLines = new ArrayList(); 55 | FileUtil.readLines(parameterFile, paramLines); 56 | for (String line : paramLines) { 57 | String[] lineParts = line.split("\t"); 58 | switch (parameters.valueOf(lineParts[0])) { 59 | case alpha: 60 | ldaparameters.alpha = Float.valueOf(lineParts[1]); 61 | break; 62 | case beta: 63 | ldaparameters.beta = Float.valueOf(lineParts[1]); 64 | break; 65 | case gamma: 66 | ldaparameters.gamma = Float.valueOf(lineParts[1]); 67 | break; 68 | case eta: 69 | ldaparameters.eta = Float.valueOf(lineParts[1]); 70 | break; 71 | case xi: 72 | ldaparameters.xi = Float.valueOf(lineParts[1]); 73 | break; 74 | case topicNum: 75 | ldaparameters.topicNum = Integer.valueOf(lineParts[1]); 76 | break; 77 | case expertiseNum: 78 | ldaparameters.expertiseNum = Integer.valueOf(lineParts[1]); 79 | break; 80 | case iteration: 81 | ldaparameters.iteration = Integer.valueOf(lineParts[1]); 82 | break; 83 | case saveStep: 84 | ldaparameters.saveStep = Integer.valueOf(lineParts[1]); 85 | break; 86 | case beginSaveIters: 87 | ldaparameters.beginSaveIters = Integer.valueOf(lineParts[1]); 88 | break; 89 | } 90 | } 91 | } 92 | 93 | public enum parameters { 94 | alpha, beta, gamma, eta, xi, topicNum, expertiseNum, iteration, saveStep, beginSaveIters; 95 | } 96 | 97 | /** 98 | * @param args 99 | * @throws IOException 100 | * @throws ClassNotFoundException 101 | */ 102 | public static void main(String[] args) throws IOException, 103 | ClassNotFoundException { 104 | /*boolean local = true; // run on local machine 105 | //local = !local; // run on server 106 | 107 | new JC(); 108 | String[] descrp = { "ParamsPath", "ResPath", "modelOutPath", 109 | "minPostNum" }; 110 | String[] 
directory = { "data/modelParams/", "data/modelRes/ThreeM09/", 111 | "data/modelRes/ThreeM09/TMM3/", "80" }; 112 | char[] options = { 'p', 'i', 'o', 'n' }; 113 | if (local) 114 | JC.setInputOptions(descrp, directory, options, args, "1111", 0); 115 | else 116 | JC.setInputOptions(descrp, directory, options, args, "0000", 1); 117 | PathConfig.modelParamsPath = JC.getARG(0); 118 | PathConfig.modelResPath = JC.getARG(1); 119 | PathConfig.modelOutPath = JC.getARG(2); 120 | PathConfig.minPostNum = JC.getARG(3); 121 | JC.close();*/ 122 | 123 | String minPostNum = PathConfig.minPostNum; 124 | // data/originalData/USER80/posts/ 125 | String originalDocsPath = PathConfig.originalDataPath + "USER" 126 | + minPostNum + "/posts/"; 127 | //data/modelRes/ThreeM09/USER80 128 | String resultPath = PathConfig.modelResPath + "USER" + minPostNum + "/"; 129 | String parameterFile = ConstantConfig.LDAPARAMETERFILE; 130 | 131 | modelparameters modelparam = new modelparameters(); 132 | getParametersFromFile(modelparam, parameterFile); 133 | Documents docSet = new Documents(); 134 | 135 | String docfile = resultPath + "USER" + minPostNum + ".data"; 136 | // docSet.readDocs(originalDocsPath, minPostNum); 137 | 138 | // Save Serialized data 139 | docSet = FileUtil.loadClass(docSet, docfile); 140 | // FileUtil.saveClass(docSet, docfile); 141 | // Delete terms that appear only n times 142 | // docSet.deleteRareTerms(3); 143 | System.out.println("indexToTermMap size : " 144 | + docSet.indexToTermMap.size()); 145 | // System.out.println("indexToTermMap : " + docSet.indexToTermMap); 146 | System.out.println("indexToTagMap size : " 147 | + docSet.indexToTagMap.size()); 148 | System.out.println("indexToVoteMap size : " 149 | + docSet.indexToVoteMap.size()); 150 | 151 | // // test(); 152 | // testGMM(); 153 | //if (local) 154 | //removeData(docSet, 10); 155 | 156 | // try { 157 | // getVotes(docSet, PathConfig.votePath); 158 | // } catch (Exception e) { 159 | // e.printStackTrace(); 160 | // } 161 | 
// 162 | // for (int d = 0; d < 1; d++) { 163 | // Document doc = docSet.docs.get(d); 164 | // System.out.println(doc.docName); 165 | // // System.out.println("tags" + doc.tags); 166 | // System.out.println("title" + doc.title); 167 | // for (int n = 0; n < docSet.docs.get(d).docWords.length; n++) { 168 | // System.out.println("post vote: " 169 | // + docSet.indexToVoteMap.get(doc.votes[n])); 170 | // System.out.println("post tag: " 171 | // + docSet.indexToTagMap.get(doc.tags[n])); 172 | // for (int l = 0; l < docSet.docs.get(d).docWords[n].length; l++) { 173 | // System.out.print(doc.docWords[n][l] + " "); 174 | // // System.out.print("vote_" + 175 | // // docSet.indexToVoteMap.get(doc.votes[n]) 176 | // // + " "); 177 | // // System.out.print("tag_" + 178 | // // docSet.indexToTagMap.get(doc.tags[n]) + 179 | // // " "); 180 | // } 181 | // System.out.println(); 182 | // } 183 | // } 184 | 185 | // System.out.println("indexToTagMap" + docSet.indexToTagMap); 186 | // System.out.println("indexToVoteMap" + docSet.indexToVoteMap); 187 | // System.out.println("indexToTermMap" + docSet.indexToTermMap); 188 | // System.out.println("tagCountMap"); 189 | // // tagCountMap 190 | // for (String tag : docSet.tagCountMap.keySet()) { 191 | // System.out.println(tag + "\t" + docSet.tagCountMap.get(tag)); 192 | // } 193 | // 194 | // System.out.println("voteCountMap"); 195 | // // voteCountMap 196 | // for (String vote : docSet.voteCountMap.keySet()) { 197 | // System.out.println(vote + "\t" + docSet.voteCountMap.get(vote)); 198 | // } 199 | 200 | //Count quesions and answers 201 | int questionCount = 0; 202 | int answerCount = 0; 203 | for (int d = 0; d < docSet.docs.size(); d++) { 204 | Document doc = docSet.docs.get(d); 205 | for(int n = 0; n < doc.docWords.length; n++){ 206 | if(doc.postTypeID[n] == 1){ 207 | questionCount++; 208 | } else { 209 | answerCount++; 210 | } 211 | } 212 | } 213 | System.out.println("quesionsCount: " + questionCount); 214 | 
System.out.println("answerCount: " + answerCount); 215 | 216 | TEMModel model = new TEMModel(modelparam); 217 | System.out.println("1 Initialize the model ..."); 218 | model.initializeModel(docSet); 219 | System.out.println("2 Learning and Saving the model ..."); 220 | model.inferenceModel(docSet, minPostNum); 221 | System.out.println("3 Output the final model ..."); 222 | model.saveIteratedModel(modelparam.iteration, docSet, minPostNum); 223 | 224 | // save model in serialized data 225 | String modelName = "E_" + model.ENum + "_T_" + model.K; 226 | FileUtil.saveClass(model, PathConfig.modelResPath + "USER" + minPostNum 227 | + "/" + modelName + ".model"); 228 | System.out.println("Done!"); 229 | } 230 | 231 | private static void testGMM() { 232 | String testGMM = "data/modelRes/testGMM.txt"; 233 | 234 | double alpha = 10; 235 | 236 | float[][] GMMData = null; 237 | GMMData = FileUtil.readArray(testGMM); 238 | FGMM fgmm = new FGMM(); // 239 | int ksize = 4; 240 | int[] clusterids = new int[GMMData.length]; 241 | // random assign clusterID 242 | for (int n = 0; n < GMMData.length; n++) { 243 | int id = (int) (Math.floor(Math.random() * ksize)); 244 | clusterids[n] = id; 245 | } 246 | fgmm.init2(GMMData, ksize, clusterids); 247 | // fgmm.learn2(GMMData, 500);// get GMM data index 248 | 249 | for (int iter = 0; iter < 500; iter++) { 250 | if (iter % 10 == 0) { 251 | System.out.print("Iteration " + iter + "\t"); 252 | for (int i = 0; i < ksize; i++) 253 | System.out.print(fgmm.clusterDataIndex.get(i).size() + " "); 254 | System.out.println(); 255 | System.out.println("lambda:"); 256 | for (int k = 0; k < ksize; k++) 257 | ComUtil.print(fgmm.p_lambda[k], " ", "\n"); 258 | System.out.println("mu:"); 259 | for (int k = 0; k < ksize; k++) 260 | ComUtil.print(fgmm.p_mu[k], " ", "\n"); 261 | } 262 | for (int n = 0; n < GMMData.length; n++) { 263 | double[] probsGMM = fgmm.LearnProbs(GMMData, n); 264 | double[] p = new double[ksize]; 265 | 266 | for (int i = 0; i < ksize; 
i++) { 267 | p[i] = (fgmm.clusterDataIndex.get(i).size() + alpha) 268 | / (fgmm.vector_n + ksize * alpha); 269 | p[i] *= probsGMM[i]; 270 | } 271 | 272 | int newNo = ComUtil.sample(p, p.length); 273 | clusterids[n] = newNo; 274 | 275 | // update new mu and lambda 276 | fgmm.UpdateProbs(GMMData, n, newNo); 277 | } 278 | } 279 | System.out.println("done"); 280 | System.exit(0); 281 | } 282 | 283 | private static void removeData(Documents docSet, int r) { 284 | for (int d = r; d < docSet.docs.size(); d++) { 285 | docSet.docs.remove(d); 286 | d--; 287 | } 288 | System.out.println("doc size: " + docSet.docs.size()); 289 | } 290 | 291 | private static void test() { 292 | double[] set = new double[5]; 293 | ComUtil.print(set, " ", "\n"); 294 | changeset(set); 295 | ComUtil.print(set, " ", "\n"); 296 | System.exit(0); 297 | } 298 | 299 | private static void changeset(double[] set) { 300 | for (int i = 0; i < set.length; i++) 301 | set[i] += 1; 302 | } 303 | 304 | private static void getVotes(Documents docSet, String votePath) 305 | throws Exception { 306 | BufferedWriter writer = new BufferedWriter(new FileWriter(new File( 307 | votePath))); 308 | 309 | for (int d = 0; d < docSet.docs.size(); d++) { 310 | Document doc = docSet.docs.get(d); 311 | // System.out.println(doc.docName); 312 | // //System.out.println("tags" + doc.tags); 313 | // System.out.println("title" + doc.title); 314 | for (int n = 0; n < docSet.docs.get(d).docWords.length; n++) { 315 | // System.out.println(d + "\t" + n + "\t" + 316 | // docSet.indexToVoteMap.get(doc.votes[n])); 317 | writer.write(d + "\t" + n + "\t" 318 | + docSet.indexToVoteMap.get(doc.votes[n]) + "\n"); 319 | // System.out.println("post vote: " 320 | // + docSet.indexToVoteMap.get(doc.votes[n])); 321 | // System.out.println("post tag: " + 322 | // docSet.indexToTagMap.get(doc.tags[n])); 323 | // for(int l = 0; l < docSet.docs.get(d).docWords[n].length; 324 | // l++){ 325 | // System.out.print(doc.docWords[n][l] + " "); 326 | // 
//System.out.print("vote_" + 327 | // docSet.indexToVoteMap.get(doc.votes[n]) + " "); 328 | // //System.out.print("tag_" + 329 | // docSet.indexToTagMap.get(doc.tags[n]) + " "); 330 | // } 331 | // System.out.println(); 332 | } 333 | writer.flush(); 334 | } 335 | writer.close(); 336 | } 337 | } 338 | -------------------------------------------------------------------------------- /src/tem/main/TEMResPaperVisual.java: -------------------------------------------------------------------------------- 1 | package tem.main; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.io.ObjectInputStream; 6 | 7 | import tem.conf.PathConfig; 8 | 9 | public class TEMResPaperVisual { 10 | 11 | /** 12 | * @param args 13 | * @throws IOException 14 | * @throws Exception 15 | */ 16 | public static void main(String[] args) throws IOException, Exception { 17 | // TODO Auto-generated method stub 18 | String modelFile = PathConfig.modelResPath + "ServerTEMRes/Model_E10_T15.model"; 19 | 20 | //Get TEM model result 21 | TEMModel model = new TEMModel(); 22 | // load model 23 | System.out.println("reading a class from : " + modelFile); 24 | FileInputStream fis = new FileInputStream(modelFile); 25 | ObjectInputStream ois = new ObjectInputStream(fis); 26 | model = (TEMModel) ois.readObject(); 27 | ois.close(); 28 | System.out.println(model.K); 29 | System.out.println(model.ENum); 30 | System.out.println("mu"); 31 | for(int e = 0; e < model.ENum; e++){ 32 | System.out.println(model.fgmm.p_mu[e][0]); 33 | } 34 | System.out.println("lambda"); 35 | for(int e = 0; e < model.ENum; e++){ 36 | System.out.println(model.fgmm.p_lambda[e][0]); 37 | } 38 | 39 | 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/tem/main/TEMResProUserRecMergeU.java: -------------------------------------------------------------------------------- 1 | package tem.main; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | 
import java.util.ArrayList; 6 | import java.util.HashSet; 7 | import java.util.Map; 8 | import java.util.Set; 9 | import java.util.TreeMap; 10 | import java.util.TreeSet; 11 | 12 | import tem.com.FileUtil; 13 | import tem.conf.PathConfig; 14 | 15 | /** 16 | * User Rec 17 | * Merge answers with the same user with one 18 | */ 19 | 20 | public class TEMResProUserRecMergeU { 21 | 22 | /** 23 | * @param args 24 | * @throws IOException 25 | */ 26 | public static void main(String[] args) throws IOException { 27 | // TODO Auto-generated method stub 28 | String ModelFileVoteResFolder = PathConfig.modelResPath + "USER" + PathConfig.minPostNum + ""; 29 | ArrayList resLines = new ArrayList(); 30 | ArrayList mergeLines = new ArrayList(); 31 | Set QidAUseridSet = new TreeSet (); 32 | Map IDPairScoreMap = new TreeMap(); 33 | 34 | for(File modelFVRfile : new File(ModelFileVoteResFolder).listFiles()){ 35 | if(modelFVRfile.getName().contains("ModelFileVoteRes")){ 36 | String mergeFileName = ModelFileVoteResFolder + "/MergeFiles/" + modelFVRfile.getName() + ".merge"; 37 | System.out.println("mergeFileName " + mergeFileName); 38 | if(new File(mergeFileName).exists()){ 39 | System.out.println(mergeFileName + "is existed! 
" ); 40 | continue; 41 | } 42 | resLines.clear(); 43 | QidAUseridSet.clear(); 44 | IDPairScoreMap.clear(); 45 | mergeLines.clear();; 46 | FileUtil.readLines(modelFVRfile.getAbsolutePath(), resLines); 47 | for(int i = 0; i < resLines.size(); i++){ 48 | String[] tokens = resLines.get(i).split("\t"); 49 | QidAUseridSet.add(tokens[0] + "\t" + tokens[1]); 50 | IDPairScoreMap.put(tokens[0] + "\t" + tokens[1], tokens[3] + "\t" + tokens[4] + "\t" + tokens[5]); 51 | } 52 | System.out.println("QidAUseridSet size: " + QidAUseridSet.size()); 53 | for(String idPair : QidAUseridSet){ 54 | double sum = 0; 55 | double count = 0; 56 | for(String resLine : resLines){ 57 | String[] tokens = resLine.split("\t"); 58 | String pairKey = tokens[0] + "\t" + tokens[1]; 59 | if(idPair.equals(pairKey)){ 60 | sum += Double.valueOf(tokens[2]); 61 | count ++; 62 | } 63 | } 64 | double averageVote = sum / count; 65 | mergeLines.add(idPair + "\t" + averageVote + "\t" + IDPairScoreMap.get(idPair)); 66 | } 67 | FileUtil.writeLines(mergeFileName , mergeLines); 68 | mergeLines.clear(); 69 | } 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/tem/parser/Porter.java: -------------------------------------------------------------------------------- 1 | package tem.parser; 2 | 3 | import java.io.*; 4 | 5 | /* author: Fotis Lazarinis (actually I translated from C to Java) 6 | date: June 1997 7 | address: Psilovraxou 12, Agrinio, 30100 8 | 9 | comments: Compile it, import the Porter class into you program and create an instance. 10 | Then use the stripAffixes method of this method which takes a String as 11 | input and returns the stem of this String again as a String. 
12 | 13 | */ 14 | 15 | class NewString { 16 | public String str; 17 | 18 | NewString() { 19 | str = ""; 20 | } 21 | } 22 | 23 | public class Porter { 24 | 25 | private String Clean( String str ) { 26 | int last = str.length(); 27 | 28 | Character ch = new Character( str.charAt(0) ); 29 | String temp = ""; 30 | 31 | for ( int i=0; i < last; i++ ) { 32 | if ( ch.isLetterOrDigit( str.charAt(i) ) ) 33 | temp += str.charAt(i); 34 | } 35 | 36 | return temp; 37 | } //clean 38 | 39 | private boolean hasSuffix( String word, String suffix, NewString stem ) { 40 | 41 | String tmp = ""; 42 | 43 | if ( word.length() <= suffix.length() ) 44 | return false; 45 | if (suffix.length() > 1) 46 | if ( word.charAt( word.length()-2 ) != suffix.charAt( suffix.length()-2 ) ) 47 | return false; 48 | 49 | stem.str = ""; 50 | 51 | for ( int i=0; i 0 ) { 92 | if ( vowel(stem.charAt(i),stem.charAt(i-1)) ) 93 | break; 94 | } 95 | else { 96 | if ( vowel(stem.charAt(i),'a') ) 97 | break; 98 | } 99 | } 100 | 101 | for ( i++ ; i < length ; i++ ) { 102 | if ( i > 0 ) { 103 | if ( !vowel(stem.charAt(i),stem.charAt(i-1)) ) 104 | break; 105 | } 106 | else { 107 | if ( !vowel(stem.charAt(i),'?') ) 108 | break; 109 | } 110 | } 111 | if ( i < length ) { 112 | count++; 113 | i++; 114 | } 115 | } //while 116 | 117 | return(count); 118 | } 119 | 120 | private boolean containsVowel( String word ) { 121 | 122 | for (int i=0 ; i < word.length(); i++ ) 123 | if ( i > 0 ) { 124 | if ( vowel(word.charAt(i),word.charAt(i-1)) ) 125 | return true; 126 | } 127 | else { 128 | if ( vowel(word.charAt(0),'a') ) 129 | return true; 130 | } 131 | 132 | return false; 133 | } 134 | 135 | private boolean cvc( String str ) { 136 | int length=str.length(); 137 | 138 | if ( length < 3 ) 139 | return false; 140 | 141 | if ( (!vowel(str.charAt(length-1),str.charAt(length-2)) ) 142 | && (str.charAt(length-1) != 'w') && (str.charAt(length-1) != 'x') && (str.charAt(length-1) != 'y') 143 | && 
(vowel(str.charAt(length-2),str.charAt(length-3))) ) { 144 | 145 | if (length == 3) { 146 | if (!vowel(str.charAt(0),'?')) 147 | return true; 148 | else 149 | return false; 150 | } 151 | else { 152 | if (!vowel(str.charAt(length-3),str.charAt(length-4)) ) 153 | return true; 154 | else 155 | return false; 156 | } 157 | } 158 | 159 | return false; 160 | } 161 | 162 | private String step1( String str ) { 163 | 164 | NewString stem = new NewString(); 165 | 166 | if ( str.charAt( str.length()-1 ) == 's' ) { 167 | if ( (hasSuffix( str, "sses", stem )) || (hasSuffix( str, "ies", stem)) ){ 168 | String tmp = ""; 169 | for (int i=0; i 0 ) { 189 | String tmp = ""; 190 | for (int i=0; i 0 ) { 270 | str = stem.str + suffixes[index][1]; 271 | return str; 272 | } 273 | } 274 | } 275 | 276 | return str; 277 | } 278 | 279 | private String step3( String str ) { 280 | 281 | String[][] suffixes = { { "icate", "ic" }, 282 | { "ative", "" }, 283 | { "alize", "al" }, 284 | { "alise", "al" }, 285 | { "iciti", "ic" }, 286 | { "ical", "ic" }, 287 | { "ful", "" }, 288 | { "ness", "" }}; 289 | NewString stem = new NewString(); 290 | 291 | for ( int index = 0 ; index 0 ) { 294 | str = stem.str + suffixes[index][1]; 295 | return str; 296 | } 297 | } 298 | return str; 299 | } 300 | 301 | private String step4( String str ) { 302 | 303 | String[] suffixes = { "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent", "sion", "tion", 304 | "ou", "ism", "ate", "iti", "ous", "ive", "ize", "ise"}; 305 | 306 | NewString stem = new NewString(); 307 | 308 | for ( int index = 0 ; index 1 ) { 312 | str = stem.str; 313 | return str; 314 | } 315 | } 316 | } 317 | return str; 318 | } 319 | 320 | private String step5( String str ) { 321 | 322 | if ( str.charAt(str.length()-1) == 'e' ) { 323 | if ( measure(str) > 1 ) {/* measure(str)==measure(stem) if ends in vowel */ 324 | String tmp = ""; 325 | for ( int i=0; i 1) ) 343 | if ( measure(str) > 1 ) {/* measure(str)==measure(stem) if ends 
in vowel */ 344 | String tmp = ""; 345 | for ( int i=0; i= 1 ) 374 | str = step2( str ); 375 | if ( str.length() >= 1 ) 376 | str = step3( str ); 377 | if ( str.length() >= 1 ) 378 | str = step4( str ); 379 | if ( str.length() >= 1 ) 380 | str = step5( str ); 381 | 382 | return str; 383 | } 384 | 385 | 386 | public String stripAffixes( String str ) { 387 | 388 | str = str.toLowerCase(); 389 | str = Clean(str); 390 | 391 | if (( str != "" ) && (str.length() > 2)) { 392 | str = stripPrefixes(str); 393 | 394 | if (str != "" ) 395 | str = stripSuffixes(str); 396 | 397 | } 398 | 399 | return str; 400 | } //stripAffixes 401 | 402 | } //class 403 | -------------------------------------------------------------------------------- /src/tem/parser/StanfordTokenizer.java: -------------------------------------------------------------------------------- 1 | package tem.parser; 2 | 3 | import java.io.File; 4 | import java.io.FileReader; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | import java.io.Reader; 8 | import java.io.StringReader; 9 | import java.util.Iterator; 10 | import java.util.LinkedList; 11 | import java.util.List; 12 | import java.util.regex.Matcher; 13 | import java.util.regex.Pattern; 14 | 15 | import edu.stanford.nlp.ling.CoreLabel; 16 | import edu.stanford.nlp.ling.HasWord; 17 | import edu.stanford.nlp.process.CoreLabelTokenFactory; 18 | import edu.stanford.nlp.process.DocumentPreprocessor; 19 | import edu.stanford.nlp.process.PTBTokenizer; 20 | 21 | /**Token sentences in a file or a String sentences 22 | * @author liuyang 23 | * @mail yangliuyx@gmail.com 24 | */ 25 | public class StanfordTokenizer { 26 | 27 | public static File tokenizeFile(File file) throws IOException { 28 | String tokenizedFileName = file.getAbsolutePath() + "_tokenized"; 29 | FileWriter tokenizedFileWriter = new FileWriter(tokenizedFileName); 30 | DocumentPreprocessor dp = new DocumentPreprocessor(file.getAbsolutePath()); 31 | int CurrentSentIndex = 1; 32 | int 
tokenizedSentCounter = 1; 33 | for (List sentence : dp) { 34 | for(int i = 0; i < sentence.size(); i++){ 35 | if(i == 0){ 36 | Pattern p = Pattern.compile("[0-9]+"); 37 | if(p.matcher(sentence.get(i).toString()).matches()){ 38 | tokenizedFileWriter.append(tokenizedSentCounter + "\t" + sentence.get(i) + "\t"); 39 | CurrentSentIndex = Integer.valueOf(sentence.get(i).toString()); 40 | } else { 41 | System.out.println(tokenizedSentCounter + "\t" + CurrentSentIndex +"\t" + sentence.get(i) + " "); 42 | tokenizedFileWriter.append(tokenizedSentCounter + "\t" + CurrentSentIndex +"\t" + sentence.get(i) + " "); 43 | } 44 | } else { 45 | tokenizedFileWriter.append(sentence.get(i) + " "); 46 | } 47 | } 48 | tokenizedSentCounter++; 49 | tokenizedFileWriter.append("\n"); 50 | tokenizedFileWriter.flush(); 51 | } 52 | return new File(tokenizedFileName); 53 | } 54 | 55 | public static List tokenizeSents(String sents){ 56 | Reader reader = new StringReader(sents); 57 | DocumentPreprocessor dp = new DocumentPreprocessor(reader); 58 | 59 | List sentenceList = new LinkedList(); 60 | Iterator> it = dp.iterator(); 61 | while (it.hasNext()) { 62 | StringBuilder sentenceSb = new StringBuilder(); 63 | List sentence = it.next(); 64 | for (HasWord token : sentence) { 65 | if(sentenceSb.length()>1) { 66 | sentenceSb.append(" "); 67 | } 68 | sentenceSb.append(token); 69 | } 70 | sentenceList.add(sentenceSb.toString()); 71 | } 72 | return sentenceList; 73 | } 74 | } -------------------------------------------------------------------------------- /src/tem/script/DBConnection.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/src/tem/script/DBConnection.java -------------------------------------------------------------------------------- /src/tem/script/ExportExpCorpusFromDB.java: -------------------------------------------------------------------------------- 
1 | package tem.script; 2 | 3 | import java.io.File; 4 | import java.sql.ResultSet; 5 | import java.sql.SQLException; 6 | import java.util.ArrayList; 7 | 8 | import tem.com.FileUtil; 9 | import tem.conf.PathConfig; 10 | import tem.script.DBConnection; 11 | 12 | /**Export users and posts data from stackoverflow database 13 | * @author yangliu 14 | * @blog http://blog.csdn.net/yangliuy 15 | * @mail yangliuyx@gmail.com 16 | */ 17 | public class ExportExpCorpusFromDB { 18 | 19 | /** 20 | * @param args 21 | * @throws SQLException 22 | */ 23 | public static void main(String[] args) throws SQLException { 24 | // TODO Auto-generated method stub 25 | String[] minPostNums = {"30"}; 26 | final DBConnection db = new DBConnection(); 27 | db.getConn(); 28 | for(String minPostNum : minPostNums){ 29 | String userIDFile = PathConfig.scriptDataPath + "USERID" + minPostNum; 30 | /*String sql = "select owneruserid from (select owneruserid, count(posts.id) as postNum from posts" 31 | + " where posts.creationdate > '2009-05-01 00:00:00' and posts.creationdate < '2009-08-01 00:00:00' group by owneruserid) as newt where newt.postNum > " + minPostNum + ";"; 32 | ResultSet rs = db.executeQuery(sql); 33 | ArrayList userIDs = new ArrayList(); 34 | while(rs.next()){ 35 | int userID = rs.getInt("owneruserid"); 36 | if(userID != 0){ 37 | userIDs.add(String.valueOf(userID)); 38 | } 39 | } 40 | System.out.println("userIDs size : " + userIDs.size()); 41 | FileUtil.writeLines(userIDFile, userIDs);*/ 42 | String sql = ""; 43 | ArrayList userIDs = new ArrayList(); 44 | FileUtil.readLines(userIDFile, userIDs); 45 | String oriDataFolder = PathConfig.originalDataPath + "USER" + minPostNum; 46 | if(!new File(oriDataFolder).exists()){ 47 | new File(oriDataFolder).mkdir(); 48 | } 49 | String oriDataUserIDFile = oriDataFolder + "/user.IDs"; 50 | FileUtil.writeLines(oriDataUserIDFile, userIDs); 51 | 52 | String oriDataUserInforFile = oriDataFolder + "/user.Infors"; 53 | ArrayList userInforLines = new 
ArrayList(); 54 | String postFolder = oriDataFolder + "/posts"; 55 | ArrayList postsLines = new ArrayList(); 56 | if(!new File(postFolder).exists()){ 57 | new File(postFolder).mkdir(); 58 | } 59 | 60 | for(String userID : userIDs){ 61 | String userPostsFile = postFolder + "/" + userID +".posts"; 62 | if(new File(userPostsFile).exists()){ 63 | System.out.println(userPostsFile + " is existed! "); 64 | continue; 65 | } 66 | sql = "select * from posts where owneruserid = '" + userID + "' and posts.creationdate > '2009-05-01 00:00:00' and posts.creationdate < '2009-08-01 00:00:00';"; 67 | ResultSet rs = db.executeQuery(sql); 68 | while(rs.next()){ 69 | String postsLine = rs.getInt("ID") + "\t" + rs.getInt("POSTTYPEID") 70 | + "\t" + rs.getInt("PARENTID") + "\t" + rs.getInt("ACCEPTEDANSWERID") + "\t" + rs.getString("CREATIONDATE") 71 | + "\t" + rs.getInt("SCORE") + "\t" + rs.getInt("VIEWCOUNT") + "\t" + (rs.getString("BODY") == null ? "null": rs.getString("BODY").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs.getInt("OWNERUSERID") 72 | + "\t" + rs.getInt("LASTEDITORUSERID") + "\t" + rs.getString("LASTEDITORDISPLAYNAME") + "\t" + rs.getString("LASTEDITDATE") 73 | + "\t" + rs.getString("LASTACTIVITYDATE") + "\t" + rs.getString("COMMUNITYOWNEDDATE") + "\t" + rs.getString("CLOSEDDATE") 74 | + "\t" + (rs.getString("TITLE") == null?"null":rs.getString("TITLE".replaceAll("[\n-\r-\t]", " ")))+ "\t" + rs.getString("TAGS") + "\t" + rs.getInt("ANSWERCOUNT") 75 | + "\t" + rs.getInt("COMMENTCOUNT") + "\t" + rs.getInt("FAVORITECOUNT"); 76 | postsLines.add(postsLine); 77 | } 78 | 79 | FileUtil.writeLines(userPostsFile, postsLines); 80 | postsLines.clear(); 81 | sql = "select * from users where id = '" + userID + "';"; 82 | rs = db.executeQuery(sql); 83 | while(rs.next()){ 84 | String userInforLine = rs.getInt("ID") + "\t" + rs.getInt("REPUTATION") + "\t" + rs.getString("CREATIONDATE") 85 | + "\t" + rs.getString("DISPLAYNAME") + "\t" + rs.getString("EMAILHASH") 86 | + "\t" + 
rs.getString("LASTACCESSDATE") + "\t" + rs.getString("WEBSITEURL") + "\t" + rs.getString("LOCATION") 87 | + "\t" + rs.getInt("AGE") + "\t" + (rs.getString("ABOUTME") == null?"null":rs.getString("ABOUTME").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs.getInt("VIEWS") 88 | + "\t" + rs.getInt("UPVOTES") + "\t" + rs.getInt("DOWNVOTES"); 89 | //System.out.println("userInforLine: " + userInforLine); 90 | userInforLines.add(userInforLine); 91 | } 92 | rs.close(); 93 | } 94 | FileUtil.writeLines(oriDataUserInforFile, userInforLines); 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/tem/script/ExportGraphMatrix.java: -------------------------------------------------------------------------------- 1 | package tem.script; 2 | 3 | import java.sql.ResultSet; 4 | import java.sql.SQLException; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | 9 | import tem.com.FileUtil; 10 | import tem.conf.PathConfig; 11 | 12 | public class ExportGraphMatrix { 13 | private static int[][] QAGraph; 14 | private static int userNum; 15 | private static ArrayList indexToUserIDMap; 16 | private static Map userIDToIndexMap; 17 | 18 | /** 19 | * @param args 20 | * @throws SQLException 21 | */ 22 | public static void main(String[] args) throws SQLException { 23 | // TODO Auto-generated method stub 24 | final DBConnection db = new DBConnection(); 25 | String minPostNum = "80"; 26 | db.getConn(); 27 | String sql = ""; 28 | String userIDFile = PathConfig.originalDataPath + "USER" + minPostNum + "/user.IDs"; 29 | String postFolder = PathConfig.originalDataPath + "USER" + minPostNum + "/posts/"; 30 | String askerFolder = PathConfig.originalDataPath + "USER" + minPostNum + "/askers/"; 31 | String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userVoteWeighted.QAgraph"; 32 | ArrayList postLines = new ArrayList(); 33 | 34 | ArrayList userIDs = new ArrayList(); 35 | 
FileUtil.readLines(userIDFile, userIDs); 36 | buildIndexUserID(userIDs); 37 | QAGraph = new int[userNum][userNum]; 38 | ArrayList askerLines = new ArrayList(); 39 | 40 | for(int i = 0; i < userNum; i++){ 41 | System.out.println("i = " + i); 42 | String postFile = postFolder + userIDs.get(i) + ".posts"; 43 | //String askerFile = askerFolder + userIDs.get(i) + ".askers"; 44 | //if(new File(askerFile).exists()){ 45 | //System.out.println(askerFile + "is exists!"); 46 | //continue; 47 | //} 48 | postLines.clear(); 49 | FileUtil.readLines(postFile, postLines); 50 | 51 | askerLines.clear(); 52 | //System.out.println("after clear, askerLines size: " + askerLines.size()); 53 | for(String postLine : postLines){ 54 | String [] postTokens = postLine.split("\t"); 55 | if(postTokens[1].equals("2")){ 56 | String parentID = postTokens[2]; 57 | String askerID = getAuthorIDbyPostID(parentID, db); 58 | String vote = postTokens[5]; 59 | //System.out.println("vote " + vote); 60 | //askerLines.add(askerID); 61 | //System.out.println("add, askerLines size: " + askerLines.size()); 62 | 63 | //Answer count weighted graph 64 | if(userIDToIndexMap.containsKey(askerID)){ 65 | QAGraph[Integer.valueOf(userIDToIndexMap.get(askerID))][Integer.valueOf(userIDToIndexMap.get(userIDs.get(i)))] += Integer.valueOf(vote); 66 | } 67 | } else { 68 | //askerLines.add("self"); 69 | } 70 | } 71 | //FileUtil.writeLines(askerFile, askerLines); 72 | //System.out.println("before clear, askerLines size: " + askerLines.size()); 73 | 74 | //System.out.println("after clear, askerLines size: " + askerLines.size()); 75 | } 76 | printQAGraph(graphDataFile); 77 | db.close(); 78 | } 79 | 80 | private static void printQAGraph(String graphDataFile) { 81 | // TODO Auto-generated method stub 82 | ArrayList QAGLines = new ArrayList(); 83 | for(int i = 0; i < QAGraph.length; i++){ 84 | String line = ""; 85 | for(int j = 0; j < QAGraph[i].length; j++){ 86 | line += QAGraph[i][j] + "\t"; 87 | } 88 | QAGLines.add(line); 89 | } 90 
| FileUtil.writeLines(graphDataFile, QAGLines); 91 | } 92 | 93 | private static String getAuthorIDbyPostID(String postID, DBConnection db) throws SQLException { 94 | // TODO Auto-generated method stub 95 | String sql = "select * from posts where id = "+ postID; 96 | ResultSet rs = db.executeQuery(sql); 97 | String authorID = ""; 98 | while(rs.next()){ 99 | authorID = rs.getString(9); 100 | } 101 | rs.close(); 102 | return authorID; 103 | } 104 | 105 | private static void buildIndexUserID(ArrayList userIDs) { 106 | // TODO Auto-generated method stub 107 | indexToUserIDMap = new ArrayList(); 108 | userIDToIndexMap = new HashMap(); 109 | 110 | for(int i = 0; i < userIDs.size(); i++){ 111 | indexToUserIDMap.add(userIDs.get(i)); 112 | userIDToIndexMap.put(userIDs.get(i), String.valueOf(i)); 113 | } 114 | userNum = userIDs.size(); 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/tem/script/ExportTagsFromDB.java: -------------------------------------------------------------------------------- 1 | package tem.script; 2 | 3 | import java.io.File; 4 | import java.sql.ResultSet; 5 | import java.sql.SQLException; 6 | import java.util.ArrayList; 7 | 8 | import tem.com.FileUtil; 9 | import tem.conf.PathConfig; 10 | import tem.script.DBConnection; 11 | 12 | /**Export tags for each post from stackoverflow database 13 | * @author yangliu 14 | * @blog http://blog.csdn.net/yangliuy 15 | * @mail yangliuyx@gmail.com 16 | */ 17 | public class ExportTagsFromDB { 18 | 19 | /** 20 | * @param args 21 | * @throws SQLException 22 | */ 23 | public static void main(String[] args) throws SQLException { 24 | // TODO Auto-generated method stub 25 | String[] minPostNums = {"30"}; 26 | final DBConnection db = new DBConnection(); 27 | //ResultSet rs; 28 | db.getConn(); 29 | String sql = ""; 30 | StringBuffer sb = new StringBuffer(); 31 | for(String minPostNum : minPostNums){ 32 | String userIDFile = PathConfig.scriptDataPath + "USERID" + 
minPostNum; 33 | sql = ""; 34 | ArrayList userIDs = new ArrayList(); 35 | FileUtil.readLines(userIDFile, userIDs); 36 | String oriDataFolder = PathConfig.originalDataPath + "USER" + minPostNum; 37 | String postFolder = oriDataFolder + "/posts"; 38 | String tagFolder = oriDataFolder + "/tags"; 39 | ArrayList postsLines = new ArrayList(); 40 | ArrayList tagsLines = new ArrayList(); 41 | if(!new File(tagFolder).exists()){ 42 | new File(tagFolder).mkdir(); 43 | } 44 | 45 | for(String userID : userIDs){ 46 | String userTagsFile = tagFolder + "/" + userID + ".tags"; 47 | System.out.println("Now tag file is: " + userTagsFile); 48 | if(new File(userTagsFile).exists()){ 49 | System.out.println(userTagsFile + "is existed!"); 50 | continue; 51 | } 52 | 53 | String userPostsFile = postFolder + "/" + userID +".posts"; 54 | FileUtil.readLines(userPostsFile, postsLines); 55 | for(String postLine : postsLines){ 56 | String[] postLineTokens = postLine.split("\t"); 57 | if(postLineTokens.length != 20){ 58 | System.err.println("format error : " + postLine); 59 | tagsLines.add(postLineTokens[0] + "\t" + "null"); 60 | continue; 61 | } 62 | String postTypeID = postLineTokens[1]; 63 | if(postTypeID.equals("1")){ 64 | tagsLines.add(postLineTokens[0] + "\t" + postLineTokens[16]); 65 | } else { 66 | String parentID = postLineTokens[2]; 67 | //Use StringBuffer instead of add Strings 68 | sb.delete(0, sb.length()); 69 | sb.append("select * from posts where id = '"); 70 | sb.append(parentID); 71 | sb.append("';"); 72 | sql = sb.toString(); 73 | //System.out.println("sql builder: " + sql); 74 | ResultSet rs = db.executeQuery(sql); 75 | while(rs.next()){ 76 | tagsLines.add(postLineTokens[0] + "\t" + rs.getString("TAGS")); 77 | } 78 | rs.close(); 79 | } 80 | } 81 | 82 | FileUtil.writeLines(userTagsFile, tagsLines); 83 | postsLines.clear(); 84 | tagsLines.clear(); 85 | } 86 | } 87 | db.close(); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- 
/src/tem/script/ExportTestDataForRank.java: -------------------------------------------------------------------------------- 1 | package tem.script; 2 | 3 | import java.sql.ResultSet; 4 | import java.sql.SQLException; 5 | import java.util.ArrayList; 6 | 7 | import tem.com.FileUtil; 8 | import tem.conf.PathConfig; 9 | 10 | /**Export Test Data for Rank answers/experts 11 | * @author yangliu 12 | * @blog http://blog.csdn.net/yangliuy 13 | * @mail yangliuyx@gmail.com 14 | */ 15 | 16 | public class ExportTestDataForRank { 17 | 18 | /* @param args 19 | * @throws SQLException 20 | */ 21 | public static void main(String[] args) throws SQLException { 22 | // TODO Auto-generated method stub 23 | final DBConnection db = new DBConnection(); 24 | String minPostNum = "80"; 25 | db.getConn(); 26 | String sql = ""; 27 | String userIDFile = PathConfig.scriptDataPath + "USERID" + minPostNum; 28 | ArrayList userIDs = new ArrayList(); 29 | FileUtil.readLines(userIDFile, userIDs); 30 | String testDataFolder = PathConfig.testDataPath; 31 | ArrayList questionLines = new ArrayList(); 32 | ArrayList answerLines = new ArrayList(); 33 | ArrayList questionIDs = new ArrayList(); 34 | String questionFile = testDataFolder + "testData.questions"; 35 | String questionIDFile = testDataFolder + "testDataQuestions.id"; 36 | FileUtil.readLines(questionIDFile, questionIDs); 37 | 38 | for(String questionIDLine : questionIDs){ 39 | String questionID = questionIDLine.split("\t")[1]; 40 | sql = "select * from posts where id = "+ questionID; 41 | ResultSet rs = db.executeQuery(sql); 42 | while(rs.next()){ 43 | questionLines.add(rs.getInt("ID") + "\t" + rs.getInt("POSTTYPEID") 44 | + "\t" + rs.getInt("PARENTID") + "\t" + rs.getInt("ACCEPTEDANSWERID") + "\t" + rs.getString("CREATIONDATE") 45 | + "\t" + rs.getInt("SCORE") + "\t" + rs.getInt("VIEWCOUNT") + "\t" + (rs.getString("BODY") == null ? 
"null": rs.getString("BODY").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs.getInt("OWNERUSERID") 46 | + "\t" + rs.getInt("LASTEDITORUSERID") + "\t" + rs.getString("LASTEDITORDISPLAYNAME") + "\t" + rs.getString("LASTEDITDATE") 47 | + "\t" + rs.getString("LASTACTIVITYDATE") + "\t" + rs.getString("COMMUNITYOWNEDDATE") + "\t" + rs.getString("CLOSEDDATE") 48 | + "\t" + (rs.getString("TITLE") == null?"null":rs.getString("TITLE".replaceAll("[\n-\r-\t]", " ")))+ "\t" + rs.getString("TAGS") + "\t" + rs.getInt("ANSWERCOUNT") 49 | + "\t" + rs.getInt("COMMENTCOUNT") + "\t" + rs.getInt("FAVORITECOUNT")); 50 | } 51 | 52 | /*for(String userID : userIDs){ 53 | sql = "select * from posts where posts.creationdate > '2009-08-01 00:00:00'" + 54 | " and posts.creationdate < '2009-11-01 00:00:00' and posts.posttypeid = 1 " + 55 | "and answercount > 5 and owneruserid = " + userID; 56 | ResultSet rs = db.executeQuery(sql); 57 | while(rs.next()){ 58 | System.out.println("userID: " + userID + 59 | " question id: " + rs.getInt(1) + 60 | " answercount: " + rs.getInt(18) + 61 | "question tag: " + rs.getString(17) + 62 | " question title: " + rs.getString(16) ); 63 | questionLines.add(rs.getInt("ID") + "\t" + rs.getInt("POSTTYPEID") 64 | + "\t" + rs.getInt("PARENTID") + "\t" + rs.getInt("ACCEPTEDANSWERID") + "\t" + rs.getString("CREATIONDATE") 65 | + "\t" + rs.getInt("SCORE") + "\t" + rs.getInt("VIEWCOUNT") + "\t" + (rs.getString("BODY") == null ? 
"null": rs.getString("BODY").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs.getInt("OWNERUSERID") 66 | + "\t" + rs.getInt("LASTEDITORUSERID") + "\t" + rs.getString("LASTEDITORDISPLAYNAME") + "\t" + rs.getString("LASTEDITDATE") 67 | + "\t" + rs.getString("LASTACTIVITYDATE") + "\t" + rs.getString("COMMUNITYOWNEDDATE") + "\t" + rs.getString("CLOSEDDATE") 68 | + "\t" + (rs.getString("TITLE") == null?"null":rs.getString("TITLE".replaceAll("[\n-\r-\t]", " ")))+ "\t" + rs.getString("TAGS") + "\t" + rs.getInt("ANSWERCOUNT") 69 | + "\t" + rs.getInt("COMMENTCOUNT") + "\t" + rs.getInt("FAVORITECOUNT"));*/ 70 | System.out.println("questionID " + questionID); 71 | String answerFile = testDataFolder + questionID + ".answers"; 72 | sql = "select * from posts where posts.posttypeid = 2 and parentid = "+ questionID; 73 | ResultSet rs2 = db.executeQuery(sql); 74 | while(rs2.next()){ 75 | answerLines.add(rs2.getInt("ID") + "\t" + rs2.getInt("POSTTYPEID") 76 | + "\t" + rs2.getInt("PARENTID") + "\t" + rs2.getInt("ACCEPTEDANSWERID") + "\t" + rs2.getString("CREATIONDATE") 77 | + "\t" + rs2.getInt("SCORE") + "\t" + rs2.getInt("VIEWCOUNT") + "\t" + (rs2.getString("BODY") == null ? 
"null": rs2.getString("BODY").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs2.getInt("OWNERUSERID") 78 | + "\t" + rs2.getInt("LASTEDITORUSERID") + "\t" + rs2.getString("LASTEDITORDISPLAYNAME") + "\t" + rs2.getString("LASTEDITDATE") 79 | + "\t" + rs2.getString("LASTACTIVITYDATE") + "\t" + rs2.getString("COMMUNITYOWNEDDATE") + "\t" + rs2.getString("CLOSEDDATE") 80 | + "\t" + (rs2.getString("TITLE") == null?"null":rs2.getString("TITLE".replaceAll("[\n-\r-\t]", " ")))+ "\t" + rs2.getString("TAGS") + "\t" + rs2.getInt("ANSWERCOUNT") 81 | + "\t" + rs2.getInt("COMMENTCOUNT") + "\t" + rs2.getInt("FAVORITECOUNT")); 82 | } 83 | FileUtil.writeLines(answerFile, answerLines); 84 | answerLines.clear(); 85 | } 86 | FileUtil.writeLines(questionFile, questionLines); 87 | db.close(); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/tem/script/HandleTagTest.java: -------------------------------------------------------------------------------- 1 | package tem.script; 2 | 3 | public class HandleTagTest { 4 | 5 | /** 6 | * @param args 7 | */ 8 | public static void main(String[] args) { 9 | // TODO Auto-generated method stub 10 | String tags1 = ""; 11 | String tags2 = ""; 12 | String[] tags = tags1.replaceAll("[<>]", " ").split(" "); 13 | System.out.println(tags.length); 14 | 15 | for(String tag : tags){ 16 | System.out.println(tag.replace(" ", "")); 17 | } 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/tem/script/JAMATest.java: -------------------------------------------------------------------------------- 1 | package tem.script; 2 | 3 | /**Standard PageRank Algorithm 4 | * @author yangliu 5 | * @blog http://blog.csdn.net/yangliuy 6 | * @mail yangliuyx@gmail.com 7 | */ 8 | 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | import java.util.Random; 12 | 13 | import Jama.Matrix; 14 | 15 | public class JAMATest { 16 | private static final double LAMBDA = 0.5; 17 
| private static final double THRESHOLD = 0.0000001; 18 | 19 | public static void main(String[] args) { 20 | System.out.println("lambda is " + LAMBDA); 21 | //Both randomly initialise or set all 1 are OK 22 | //PR0 = getInitPR0(3); 23 | double[] PR0Array = new double[3]; 24 | for(int i = 0; i < 3; i++){ 25 | PR0Array[i] = 1; 26 | } 27 | 28 | Matrix PR0 = new Matrix (PR0Array, 1); 29 | System.out.println("Initial state vector PR0 is:"); 30 | PR0.print(3, 3); 31 | 32 | System.out.println("Page Rank Update Matrix newPR:"); 33 | getNewPR(LAMBDA).print(3, 3); 34 | 35 | Matrix pageRank = calPageRank(PR0, LAMBDA); 36 | System.out.println("Final PageRank is:"); 37 | pageRank.print(3, 3); 38 | System.out.println(); 39 | } 40 | 41 | /** 42 | * Randomly Initialise state vector PR0 43 | * 44 | * @param n 45 | * dimension of vector PR0 46 | * @return A random vector, each dimension is 0-5 47 | */ 48 | public static List getInitPR0(int n) { 49 | Random random = new Random(); 50 | List q = new ArrayList(); 51 | for (int i = 0; i < n; i++) { 52 | q.add(new Double(5 * random.nextDouble())); 53 | } 54 | return q; 55 | } 56 | 57 | /** 58 | * Compute Euclidean Distance 59 | * 60 | * @param q1 61 | * 62 | * @param q2 63 | * 64 | * @return distance 65 | */ 66 | public static double calDistance(Matrix q1, Matrix q2) { 67 | double sum = 0; 68 | 69 | if (q1.getColumnDimension() != q2.getColumnDimension() ) { 70 | return -1; 71 | } 72 | 73 | for (int i = 0; i < q1.getColumnDimension() ; i++) { 74 | sum += Math.pow(q1.get(0, i) - q2.get(0, i), 75 | 2); 76 | } 77 | return Math.sqrt(sum); 78 | } 79 | 80 | /** 81 | * compute pagerank 82 | * 83 | * @param PR0 84 | * Initialise state vector 85 | * @param lambda 86 | * lambda 87 | * @return pagerank result 88 | */ 89 | public static Matrix calPageRank(Matrix PR0, double lambda) { 90 | 91 | Matrix newPR = getNewPR(lambda); 92 | Matrix PR; 93 | while (true) { 94 | PR = PR0.times(newPR); 95 | double dis = calDistance(PR, PR0);//PR0 store PR vector 
after last iteration 96 | System.out.println("distance:" + dis); 97 | if (dis <= THRESHOLD) { 98 | System.out.println("PR:"); 99 | PR.print(3, 3); 100 | break; 101 | } 102 | PR0 = PR; 103 | } 104 | return PR; 105 | } 106 | 107 | /** 108 | * compute NEWPR matrix 109 | * 110 | * @param lambda 111 | * 112 | * @return NEWPR matrix 113 | */ 114 | public static Matrix getNewPR(double lambda) { 115 | 116 | int V = getM().getColumnDimension(); 117 | Matrix add1 = getM().times(lambda); 118 | Matrix add2 = getU().times((1 - lambda) / V); 119 | Matrix newPR = add1.plus(add2); 120 | return newPR; 121 | } 122 | 123 | /** 124 | * Initialise transition matrix M 125 | * 126 | * @return M 127 | */ 128 | public static Matrix getM() { 129 | double[][] m = new double[3][3]; 130 | 131 | m[0][0] = 0; 132 | m[0][1] = 1; 133 | m[0][2] = 0; 134 | 135 | m[1][0] = 0.5; 136 | m[1][1] = 0; 137 | m[1][2] = 0.5; 138 | 139 | m[2][0] = 0; 140 | m[2][1] = 1; 141 | m[2][2] = 0; 142 | 143 | Matrix M = new Matrix(m); 144 | 145 | return M; 146 | } 147 | 148 | /** 149 | * Initialise Matrix U 150 | * 151 | * @return U 152 | */ 153 | public static Matrix getU() { 154 | 155 | double[][] u = new double[3][3]; 156 | 157 | u[0][0] = 1; 158 | u[0][1] = 1; 159 | u[0][2] = 1; 160 | 161 | u[1][0] = 1; 162 | u[1][1] = 1; 163 | u[1][2] = 1; 164 | 165 | u[2][0] = 1; 166 | u[2][1] = 1; 167 | u[2][2] = 1; 168 | 169 | Matrix U = new Matrix(u); 170 | 171 | return U; 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/tem/script/MergeUser10.java: -------------------------------------------------------------------------------- 1 | package tem.script; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Set; 5 | import java.util.TreeSet; 6 | 7 | import tem.com.FileUtil; 8 | 9 | public class MergeUser10 { 10 | 11 | /** 12 | * @param args 13 | */ 14 | public static void main(String[] args) { 15 | // TODO Auto-generated method stub 16 | String data10Path = 
"data/originalData/ThreeM09/User80MergeUser10/similarQ/User10/"; 17 | String data80Path = "data/originalData/ThreeM09/User80MergeUser10/"; 18 | ArrayList data10IDs = new ArrayList(); 19 | ArrayList data80IDs = new ArrayList(); 20 | ArrayList allIds = new ArrayList(); 21 | ArrayList overLapIDLines = new ArrayList(); 22 | FileUtil.readLines(data10Path + "users.IDs", data10IDs); 23 | FileUtil.readLines(data80Path + "user.IDs", data80IDs); 24 | allIds.addAll(data80IDs); 25 | 26 | //Find overlap userIDs 27 | for(String userID10 : data10IDs){ 28 | String data10PathPost = data10Path + "posts/" + userID10 + ".posts"; 29 | if(data80IDs.contains(userID10.trim())){ 30 | //overlap 31 | System.out.println("voerlap id: " + userID10); 32 | overLapIDLines.add(userID10); 33 | } else{ 34 | allIds.add(userID10); 35 | String newData80PathPost = data80Path + "posts/" + userID10 + ".posts"; 36 | FileUtil.copyFile(data10PathPost, newData80PathPost); 37 | } 38 | } 39 | FileUtil.writeLines(data80Path + "overlapIDs", overLapIDLines); 40 | 41 | FileUtil.writeLines(data80Path + "allUserIDs", allIds); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/tem/script/PageRank2.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/src/tem/script/PageRank2.java -------------------------------------------------------------------------------- /src/tem/script/PageRankYL.java: -------------------------------------------------------------------------------- 1 | package tem.script; 2 | 3 | /**Standard PageRank Algorithm 4 | * @author yangliu 5 | * @blog http://blog.csdn.net/yangliuy 6 | * @mail yangliuyx@gmail.com 7 | */ 8 | 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | import java.util.Random; 12 | 13 | public class PageRankYL { 14 | private static final double LAMBDA = 0.5; 15 | private static 
final double THRESHOLD = 0.0000001; 16 | 17 | public static void main(String[] args) { 18 | System.out.println("lambda is " + LAMBDA); 19 | List PR0 = new ArrayList(); 20 | //Both randomly initialise or set all 1 are OK 21 | //PR0 = getInitPR0(3); 22 | PR0.add(new Double(1)); 23 | PR0.add(new Double(1)); 24 | PR0.add(new Double(1)); 25 | System.out.println("Initial state vector PR0 is:"); 26 | printVec(PR0); 27 | System.out.println("Page Rank Update Matrix newPR:"); 28 | printMatrix(getNewPR(LAMBDA)); 29 | List pageRank = calPageRank(PR0, LAMBDA); 30 | System.out.println("Final PageRank is:"); 31 | printVec(pageRank); 32 | System.out.println(); 33 | } 34 | 35 | public static void printMatrix(List> m) { 36 | for (int i = 0; i < m.size(); i++) { 37 | for (int j = 0; j < m.get(i).size(); j++) { 38 | System.out.print(m.get(i).get(j) + ", "); 39 | } 40 | System.out.println(); 41 | } 42 | } 43 | 44 | 45 | public static void printVec(List v) { 46 | for (int i = 0; i < v.size(); i++) { 47 | System.out.print(v.get(i) + ", "); 48 | } 49 | System.out.println(); 50 | } 51 | 52 | /** 53 | * Randomly Initialise state vector PR0 54 | * 55 | * @param n 56 | * dimension of vector PR0 57 | * @return A random vector, each dimension is 0-5 58 | */ 59 | public static List getInitPR0(int n) { 60 | Random random = new Random(); 61 | List q = new ArrayList(); 62 | for (int i = 0; i < n; i++) { 63 | q.add(new Double(5 * random.nextDouble())); 64 | } 65 | return q; 66 | } 67 | 68 | /** 69 | * Compute Euclidean Distance 70 | * 71 | * @param q1 72 | * 73 | * @param q2 74 | * 75 | * @return distance 76 | */ 77 | public static double calDistance(List q1, List q2) { 78 | double sum = 0; 79 | 80 | if (q1.size() != q2.size()) { 81 | return -1; 82 | } 83 | 84 | for (int i = 0; i < q1.size(); i++) { 85 | sum += Math.pow(q1.get(i).doubleValue() - q2.get(i).doubleValue(), 86 | 2); 87 | } 88 | return Math.sqrt(sum); 89 | } 90 | 91 | /** 92 | * compute pagerank 93 | * 94 | * @param PR0 95 | * Initialise 
state vector 96 | * @param lambda 97 | * lambda 98 | * @return pagerank result 99 | */ 100 | public static List calPageRank(List PR0, double lambda) { 101 | 102 | List> newPR = getNewPR(lambda); 103 | List PR = null; 104 | while (true) { 105 | PR = vectorMulMatrix(PR0, newPR); 106 | double dis = calDistance(PR, PR0);//PR0 store PR vector after last iteration 107 | System.out.println("distance:" + dis); 108 | if (dis <= THRESHOLD) { 109 | System.out.println("PR0:"); 110 | printVec(PR0); 111 | System.out.println("PR:"); 112 | printVec(PR); 113 | break; 114 | } 115 | PR0 = PR; 116 | } 117 | return PR; 118 | } 119 | 120 | /** 121 | * compute NEWPR matrix 122 | * 123 | * @param lambda 124 | * 125 | * @return NEWPR matrix 126 | */ 127 | public static List> getNewPR(double lambda) { 128 | 129 | int V = getM().size(); 130 | List> add1 = numberMulMatrix(getM(), lambda); 131 | List> add2 = numberMulMatrix(getU(), (1 - lambda) / V); 132 | List> newPR = addMatrix(add1, add2); 133 | return newPR; 134 | } 135 | 136 | /** 137 | * Compute the product of a vector and a matrix 138 | * 139 | * @param v 140 | * a vector 141 | * @param m 142 | * a matrix 143 | * @return a new vector 144 | */ 145 | public static List vectorMulMatrix(List v, List> m 146 | ) { 147 | 148 | if (m == null || v == null || m.size() <= 0 149 | || m.get(0).size() != v.size()) { 150 | return null; 151 | } 152 | 153 | List list = new ArrayList(); 154 | for(int i = 0; i < v.size(); i++){ 155 | double sum = 0; 156 | for(int j = 0; j < m.size(); j++){ 157 | sum += v.get(j) * m.get(j).get(i); 158 | } 159 | list.add(sum); 160 | } 161 | 162 | return list; 163 | } 164 | 165 | 166 | public static List> addMatrix(List> list1, 167 | List> list2) { 168 | List> list = new ArrayList>(); 169 | if (list1.size() != list2.size() || list1.size() <= 0 170 | || list2.size() <= 0) { 171 | return null; 172 | } 173 | for (int i = 0; i < list1.size(); i++) { 174 | list.add(new ArrayList()); 175 | for (int j = 0; j < list1.get(i).size(); 
j++) { 176 | double temp = list1.get(i).get(j).doubleValue() 177 | + list2.get(i).get(j).doubleValue(); 178 | list.get(i).add(new Double(temp)); 179 | } 180 | } 181 | return list; 182 | } 183 | 184 | 185 | public static List> numberMulMatrix(List> s, 186 | double a) { 187 | List> list = new ArrayList>(); 188 | 189 | for (int i = 0; i < s.size(); i++) { 190 | list.add(new ArrayList()); 191 | for (int j = 0; j < s.get(i).size(); j++) { 192 | double temp = a * s.get(i).get(j).doubleValue(); 193 | list.get(i).add(new Double(temp)); 194 | } 195 | } 196 | return list; 197 | } 198 | 199 | /** 200 | * Initialise transition matrix M 201 | * 202 | * @return M 203 | */ 204 | public static List> getM() { 205 | List row1 = new ArrayList(); 206 | row1.add(new Double(0)); 207 | row1.add(new Double(1)); 208 | row1.add(new Double(0)); 209 | List row2 = new ArrayList(); 210 | row2.add(new Double(0.5)); 211 | row2.add(new Double(0)); 212 | row2.add(new Double(0.5)); 213 | List row3 = new ArrayList(); 214 | row3.add(new Double(0)); 215 | row3.add(new Double(1)); 216 | row3.add(new Double(0)); 217 | 218 | List> M = new ArrayList>(); 219 | M.add(row1); 220 | M.add(row2); 221 | M.add(row3); 222 | 223 | return M; 224 | } 225 | 226 | /** 227 | * Initialise Matrix U 228 | * 229 | * @return U 230 | */ 231 | public static List> getU() { 232 | List row1 = new ArrayList(); 233 | row1.add(new Double(1)); 234 | row1.add(new Double(1)); 235 | row1.add(new Double(1)); 236 | List row2 = new ArrayList(); 237 | row2.add(new Double(1)); 238 | row2.add(new Double(1)); 239 | row2.add(new Double(1)); 240 | List row3 = new ArrayList(); 241 | row3.add(new Double(1)); 242 | row3.add(new Double(1)); 243 | row3.add(new Double(1)); 244 | 245 | List> u = new ArrayList>(); 246 | u.add(row1); 247 | u.add(row2); 248 | u.add(row3); 249 | 250 | return u; 251 | } 252 | } 253 | -------------------------------------------------------------------------------- /src/tem/script/SimilarQuestionPAexport.java: 
-------------------------------------------------------------------------------- 1 | package tem.script; 2 | 3 | import java.io.File; 4 | import java.sql.ResultSet; 5 | import java.sql.SQLException; 6 | import java.util.ArrayList; 7 | 8 | import tem.com.FileUtil; 9 | import tem.conf.PathConfig; 10 | import tem.script.DBConnection; 11 | 12 | /**Export similar questions and authors 13 | * @author yangliu 14 | * @blog http://blog.csdn.net/yangliuy 15 | * @mail yangliuyx@gmail.com 16 | */ 17 | public class SimilarQuestionPAexport { 18 | 19 | /** 20 | * @param args 21 | * @throws SQLException 22 | */ 23 | public static void main(String[] args) throws SQLException { 24 | // TODO Auto-generated method stub 25 | final DBConnection db = new DBConnection(); 26 | db.getConn(); 27 | String path = "data/scriptData"; 28 | String QuestionIDFileName = path + "/SimilarQ/TestQuestionLiu.txt"; 29 | String QuestionPostFileName = path + "/SimilarQ/TestQuestionLiu.posts"; 30 | String QuestionAskerFileName = path + "/SimilarQ/TestQuestionLiu.askers"; 31 | ArrayList qIDs = new ArrayList(); 32 | FileUtil.readLines(QuestionIDFileName, qIDs); 33 | ArrayList postLines = new ArrayList(); 34 | ArrayList askerLies = new ArrayList(); 35 | for(String qid : qIDs){ 36 | int askerID = -100; 37 | String sql = "select * from posts where id = " + qid; 38 | ResultSet rs = db.executeQuery(sql); 39 | while(rs.next()){ 40 | String postsLine = rs.getInt("ID") + "\t" + rs.getInt("POSTTYPEID") 41 | + "\t" + rs.getInt("PARENTID") + "\t" + rs.getInt("ACCEPTEDANSWERID") + "\t" + rs.getString("CREATIONDATE") 42 | + "\t" + rs.getInt("SCORE") + "\t" + rs.getInt("VIEWCOUNT") + "\t" + (rs.getString("BODY") == null ? 
"null": rs.getString("BODY").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs.getInt("OWNERUSERID") 43 | + "\t" + rs.getInt("LASTEDITORUSERID") + "\t" + rs.getString("LASTEDITORDISPLAYNAME") + "\t" + rs.getString("LASTEDITDATE") 44 | + "\t" + rs.getString("LASTACTIVITYDATE") + "\t" + rs.getString("COMMUNITYOWNEDDATE") + "\t" + rs.getString("CLOSEDDATE") 45 | + "\t" + (rs.getString("TITLE") == null?"null":rs.getString("TITLE".replaceAll("[\n-\r-\t]", " ")))+ "\t" + rs.getString("TAGS") + "\t" + rs.getInt("ANSWERCOUNT") 46 | + "\t" + rs.getInt("COMMENTCOUNT") + "\t" + rs.getInt("FAVORITECOUNT"); 47 | postLines.add(postsLine); 48 | askerID = rs.getInt("OWNERUSERID"); 49 | } 50 | sql = "select * from users where id = " + askerID; 51 | rs = db.executeQuery(sql); 52 | while(rs.next()){ 53 | String userInforLine = rs.getInt("ID") + "\t" + rs.getInt("REPUTATION") + "\t" + rs.getString("CREATIONDATE") 54 | + "\t" + rs.getString("DISPLAYNAME") + "\t" + rs.getString("EMAILHASH") 55 | + "\t" + rs.getString("LASTACCESSDATE") + "\t" + rs.getString("WEBSITEURL") + "\t" + rs.getString("LOCATION") 56 | + "\t" + rs.getInt("AGE") + "\t" + (rs.getString("ABOUTME") == null?"null":rs.getString("ABOUTME").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs.getInt("VIEWS") 57 | + "\t" + rs.getInt("UPVOTES") + "\t" + rs.getInt("DOWNVOTES"); 58 | //System.out.println("userInforLine: " + userInforLine); 59 | askerLies.add(qid + "\t" + userInforLine); 60 | } 61 | rs.close(); 62 | } 63 | FileUtil.writeLines(QuestionAskerFileName, askerLies); 64 | FileUtil.writeLines(QuestionPostFileName, postLines); 65 | db.close(); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/tem/script/SortByValueDemo.java: -------------------------------------------------------------------------------- 1 | package tem.script; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.sql.SQLException; 7 | import 
/**
 * Orders map keys by their value in a backing map, largest value first.
 *
 * Keys with equal values are ordered by the keys themselves, and a key
 * compares equal to itself. The earlier version returned -1 for equal values
 * in both directions and never returned 0, which breaks the
 * {@link Comparator} contract: a {@code TreeMap} built on it could not find
 * any key again via {@code get} or {@code containsKey}.
 *
 * Both compared keys must be present in the backing map; a missing key
 * causes a {@link NullPointerException}.
 */
public class ValueComparator implements Comparator<String> {
    Map<String, Integer> baseMap;

    /**
     * @param base
     *            map supplying the value for every key that will be compared
     */
    public ValueComparator(Map<String, Integer> base) {
        this.baseMap = base;
    }

    @Override
    public int compare(String o1, String o2) {
        // Arguments are swapped on purpose: larger values sort first.
        int byValue = baseMap.get(o2).compareTo(baseMap.get(o1));
        if (byValue != 0) {
            return byValue;
        }
        // Tie-break on the key so distinct keys with equal values stay
        // distinct and identical keys compare as equal.
        return o1.compareTo(o2);
    }

}
tem.conf.ConstantConfig; 11 | import tem.conf.PathConfig; 12 | import tem.main.Documents; 13 | import tem.main.TEMModel; 14 | import tem.main.TEMModelSampling.modelparameters; 15 | 16 | public class UQAModelRes implements java.io.Serializable { 17 | 18 | private static final long serialVersionUID = 2L; 19 | 20 | //term map 21 | public Map termToIndexMap; 22 | public Map indexToTermMap; 23 | 24 | //tag map 25 | public Map tagToIndexMap; 26 | public Map indexToTagMap; 27 | 28 | //user map 29 | public Map userToIndexMap; 30 | public Map indexToUserMap; 31 | 32 | //theta User topic distribution 33 | public double [][] theta; 34 | 35 | //phi topic word distribution 36 | public double [][] phi; 37 | 38 | //psi topic tag distribution 39 | public double [][] psi; 40 | 41 | public UQAModelRes(){ 42 | termToIndexMap = new HashMap(); 43 | indexToTermMap = new HashMap(); 44 | 45 | tagToIndexMap = new HashMap(); 46 | indexToTagMap = new HashMap(); 47 | 48 | userToIndexMap = new HashMap(); 49 | indexToUserMap = new HashMap(); 50 | } 51 | 52 | public UQAModelRes(String resPath){ 53 | termToIndexMap = new HashMap(); 54 | indexToTermMap = new HashMap(); 55 | 56 | tagToIndexMap = new HashMap(); 57 | indexToTagMap = new HashMap(); 58 | 59 | userToIndexMap = new HashMap(); 60 | indexToUserMap = new HashMap(); 61 | 62 | readMap((resPath + "termMap"), termToIndexMap, indexToTermMap); 63 | readMap((resPath + "tagMap"), tagToIndexMap, indexToTagMap); 64 | readMapForOneCol((resPath + "userMap"), userToIndexMap, indexToUserMap); 65 | 66 | 67 | theta = FileUtil.read2DArray(resPath + "thetaUT"); 68 | phi = FileUtil.read2DArray(resPath + "phiTV"); 69 | psi = FileUtil.read2DArray(resPath + "psiTC"); 70 | } 71 | 72 | private void readMapForOneCol(String fileName, 73 | Map termToIndexMap2, 74 | Map indexToTermMap2) { 75 | // TODO Auto-generated method stub 76 | //Build index from 0 77 | System.out.println("read map from " + fileName); 78 | ArrayList lines = new ArrayList(); 79 | 
FileUtil.readLines(fileName, lines); 80 | for(int i = 0; i < lines.size(); i++){ 81 | termToIndexMap2.put(lines.get(i).trim(), new Integer(i)); 82 | indexToTermMap2.put(new Integer(i), lines.get(i).trim()); 83 | } 84 | } 85 | 86 | private void readMap(String fileName, Map termToIndexMap2, 87 | Map indexToTermMap2) { 88 | // TODO Auto-generated method stub 89 | System.out.println("read map from " + fileName); 90 | ArrayList lines = new ArrayList(); 91 | FileUtil.readLines(fileName, lines); 92 | for(String line : lines){ 93 | String [] tokens = line.split("\t"); 94 | int index = Integer.parseInt(tokens[1]); 95 | termToIndexMap2.put(tokens[0], new Integer(index)); 96 | indexToTermMap2.put(new Integer(index), tokens[0]); 97 | } 98 | } 99 | 100 | public static void main(String[] args) throws IOException, 101 | ClassNotFoundException { 102 | 103 | String UQAPath = PathConfig.UQAPath; 104 | //UQAModelRes uqaRes = new UQAModelRes(UQAPath); 105 | UQAModelRes uqaRes = new UQAModelRes(); 106 | 107 | String dataFile = UQAPath + "UQAModelRes.data"; 108 | 109 | uqaRes = FileUtil.loadClass(uqaRes, dataFile); 110 | //FileUtil.saveClass(uqaRes, dataFile); 111 | System.out.println(uqaRes.indexToTagMap.size()); 112 | System.out.println(uqaRes.indexToTermMap.size()); 113 | System.out.println(uqaRes.indexToUserMap.size()); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/tem/uqa/UQAModelSampling.java: -------------------------------------------------------------------------------- 1 | package tem.uqa; 2 | 3 | import java.io.IOException; 4 | import edu.smu.data.Alphabet; 5 | import edu.smu.util.DataFormat; 6 | import java.io.BufferedReader; 7 | import java.io.File; 8 | import java.io.FileReader; 9 | import java.util.ArrayList; 10 | import java.util.Collections; 11 | import java.util.HashMap; 12 | import java.util.StringTokenizer; 13 | 14 | public class UQAModelSampling { 15 | 16 | /** 17 | * @param args 18 | * @throws 
IOException 19 | */ 20 | public static void main(String[] args) throws IOException { 21 | // TODO Auto-generated method stub 22 | String base = "C:\\PhD\\2013\\FindExpertCQA Stack\\Results\\LDAUQA\\"; 23 | String dataFile = base + "/ITO/dataEntitiesAll_bigrams.txt"; 24 | //String dataFile = "./data/nyt.ftm";//format.txt";//kitch.txt";//"data_res.txt";//"data_res.txt";// 25 | //String dataFile = "C:/PhD/2012/ActionKnowledge/Action/src/data/datafilePER.txt";//format.txt";//kitch.txt";//"data_res.txt";//"data_res.txt";// 26 | //String dataFile = "C:/PhD/2012/ActionKnowledge/Action/src/data/datafileAdj.txt"; //for adj 27 | //String resFile = "C:/PhD/2012/ActionKnowledge/Action/src/data/res_50topics_ETAdj.txt"; 28 | String resFile = base+"res_LDA.txt"; 29 | String stopFile = "C:/PhD/2012/ActionKnowledge/Action/src/data/stopwords.txt"; 30 | String betaFile = base+"betaFileLDA.txt"; 31 | String alphaFile = base+"alphaFileLDA.txt"; 32 | String gammaFile = base+"gammaFileLDA.txt"; 33 | String topcatFile = base+"topcatFileLDA.txt"; 34 | 35 | String vocabFileSave = base+"vocabLDA.txt"; 36 | 37 | Alphabet vocab = new Alphabet(); //DataFormat.loadAlphabet(vocabFile);// 38 | Alphabet stopwords = DataFormat.loadAlphabet(stopFile); 39 | String originalDataPath="C:\\PhD\\2013\\FindExpertCQA Stack\\Raw data and Analysis\\Raw Data\\ThreeM\\"; 40 | String minPostNum="100"; 41 | String originalDocsPath = originalDataPath + "USER" + minPostNum + "/posts/"; 42 | getTags(originalDocsPath); 43 | 44 | /* Documents docSet = new Documents(); 45 | docSet.readDocs(originalDocsPath, minPostNum); 46 | 47 | int[][][] w = new int[docSet.docs.size()][][]; 48 | 49 | for (int u = 0; u < docSet.docs.size(); u++) { 50 | w[u] = new int[docSet.docs.get(u).docWords.length][]; //no of posts for each user 51 | for (int n = 0; n < docSet.docs.get(u).docWords.length; n++) { 52 | w[u][n] = new int[docSet.docs.get(u).docWords[n].length]; // no of words in each post 53 | for (int l = 0; l < 
docSet.docs.get(u).docWords[n].length; l++) { 54 | int term = docSet.docs.get(u).docWords[n][l]; 55 | w[u][n][l] = term; 56 | //System.out.println(term); 57 | } 58 | } 59 | } 60 | 61 | int[][] tags=new int[docSet.docs.size()][]; 62 | for (int u = 0; u < docSet.docs.size(); u++) { 63 | tags[u] = new int[docSet.docs.get(u).tags.length]; //no of tags for each user 64 | for (int n = 0; n < docSet.docs.get(u).tags.length; n++) { 65 | int term = docSet.docs.get(u).tags[n]; 66 | tags[u][n] = term; 67 | //remember each post has only one tag in the current document implementation 68 | if (u==0) System.out.println(docSet.indexToTagMap.get(term) + ":" ); 69 | 70 | } 71 | System.out.println(); 72 | //System.out.println("tags:" + tags[u][0]); 73 | } 74 | 75 | //int[][][] words = DataFormat.getWordsFromFile(dataFile,vocab ,stopwords, -1, "ET" ); 76 | 77 | 78 | int numTokens = 0, numEntites=0; 79 | for(int d = 0; d < w.length; d++){ 80 | for(int s = 0; s < w[d].length; s++){ 81 | numTokens += w[d][s].length; 82 | } 83 | } 84 | System.out.println(w.length); 85 | 86 | 87 | System.out.println("Totally " + numTokens + " tokens."); 88 | /** 89 | * 90 | */ 91 | /** 92 | * running up 93 | */ 94 | //ATSenLDA ATlda = new ATSenLDA(50, vocab, words); 95 | //ETModelLDA ATlda = new ETModelLDA(20, vocab, entity, words, entities); 96 | 97 | //System.out.println(docSet.termToIndexMap); 98 | System.out.println("index term"); 99 | //System.out.println(docSet.indexToTermMap); 100 | 101 | //LDA ATlda = new LDA(20, docSet.termToIndexMap, docSet.indexToTermMap, w); 102 | // UQAModel ATlda = new UQAModel(15, docSet.termToIndexMap, docSet.indexToTermMap, docSet.tagToIndexMap, w, tags); 103 | //LDAUQA ATlda = new LDAUQA(15, docSet.termToIndexMap, docSet.indexToTermMap, w, tags); 104 | 105 | /** 106 | * 107 | */ 108 | /* ATlda.saveVocab(vocabFileSave); 109 | 110 | System.out.println("==============================NEW ITERATION =============================="); 111 | ATlda.run(1000); 112 | 113 | 
ATlda.printTopics(20, resFile+"1000", true); //print n topic words 114 | 115 | ATlda.saveBeta(betaFile); 116 | ATlda.saveAlpha(alphaFile); 117 | ATlda.saveBeta(gammaFile); 118 | 119 | ATlda.printTopicsCategories(20, topcatFile+"1000", true);*/ 120 | 121 | 122 | } 123 | 124 | public static void getTags(String docsPath){ 125 | double tags[][]=new double[618][2]; 126 | try{ 127 | BufferedReader bf; 128 | ArrayList docLines = new ArrayList(); 129 | System.out.println("tags"); 130 | String line, tag; 131 | HashMap hm=new HashMap(); 132 | StringTokenizer st; 133 | int cnt=0; 134 | int usercnt=0; 135 | for(File docFile : new File(docsPath+ "USER100\\tags\\").listFiles()){ 136 | bf=new BufferedReader(new FileReader(docFile)); 137 | while ((line=bf.readLine())!=null) { 138 | tag=line.substring(line.indexOf(" ")+1); 139 | st=new StringTokenizer(line, ">"); 140 | while(st.hasMoreTokens()){ 141 | tag=st.nextToken(); tag=tag.replace("<", ""); 142 | if(hm.containsKey(tag)){ 143 | cnt=hm.get(tag); 144 | hm.remove(tag); 145 | hm.put(tag, ++cnt); 146 | } 147 | else 148 | hm.put(tag, 1); 149 | } 150 | } 151 | 152 | //tags[usercnt][0]=hm. 153 | } 154 | }catch(Exception e){ 155 | 156 | } 157 | } 158 | } 159 | --------------------------------------------------------------------------------