├── .classpath
├── .project
├── .settings
└── org.eclipse.jdt.core.prefs
├── README.md
├── bin
├── edu
│ └── smu
│ │ ├── data
│ │ ├── Alphabet.class
│ │ ├── DataList.class
│ │ ├── Instance.class
│ │ ├── InstanceList.class
│ │ ├── Lattice.class
│ │ ├── Node.class
│ │ ├── Sequence.class
│ │ ├── SequenceList.class
│ │ └── SparseVector.class
│ │ └── util
│ │ ├── DataFormat.class
│ │ ├── FileUtil.class
│ │ ├── MatrixOps.class
│ │ ├── RemoveIllegalChar.class
│ │ ├── SequeceFeaturePatternExtrator.class
│ │ └── StringUtil.class
└── tem
│ ├── com
│ ├── ComUtil$1.class
│ ├── ComUtil.class
│ ├── FileUtil$1.class
│ ├── FileUtil$2.class
│ ├── FileUtil.class
│ ├── JC.class
│ ├── MathUtil.class
│ ├── MatrixUtil.class
│ ├── POStags.class
│ ├── Sorting.class
│ ├── Stopwords.class
│ ├── ValueComparator.class
│ └── wordFreq.class
│ ├── conf
│ ├── ConstantConfig.class
│ └── PathConfig.class
│ ├── linkas
│ ├── ID.class
│ ├── PR.class
│ ├── TEPR.class
│ └── TSPR.class
│ ├── main
│ ├── Documents$Document.class
│ ├── Documents.class
│ ├── FGMM.class
│ ├── LdaGibbsSampling$modelparameters.class
│ ├── LdaGibbsSampling$parameters.class
│ ├── LdaGibbsSampling.class
│ ├── LdaModel$TwordsComparable.class
│ ├── LdaModel.class
│ ├── ModelComFunc.class
│ ├── SimpleEvaluate$TwordsComparable.class
│ ├── SimpleEvaluate.class
│ ├── TEMModel$TwordsComparable.class
│ ├── TEMModel.class
│ ├── TEMModel1$TwordsComparable.class
│ ├── TEMModel1.class
│ ├── TEMModelSampling$modelparameters.class
│ ├── TEMModelSampling$parameters.class
│ ├── TEMModelSampling.class
│ ├── TEMResPaperVisual.class
│ ├── TEMResPro$Post.class
│ ├── TEMResPro.class
│ └── TEMResProUserRecMergeU.class
│ ├── parser
│ ├── NewString.class
│ ├── Porter.class
│ └── StanfordTokenizer.class
│ ├── script
│ ├── DBConnection.class
│ ├── ExportExpCorpusFromDB.class
│ ├── ExportGraphMatrix.class
│ ├── ExportTagsFromDB.class
│ ├── ExportTestDataForRank.class
│ ├── HandleTagTest.class
│ ├── JAMATest.class
│ ├── MergeUser10.class
│ ├── PageRank2.class
│ ├── PageRankYL.class
│ ├── SimilarQuestionPAexport.class
│ ├── SortByValueDemo.class
│ └── ValueComparator.class
│ └── uqa
│ ├── UQAModel$Pair.class
│ ├── UQAModel.class
│ ├── UQAModelRes.class
│ └── UQAModelSampling.class
└── src
├── edu
└── smu
│ ├── data
│ ├── Alphabet.java
│ ├── DataList.java
│ ├── Instance.java
│ ├── InstanceList.java
│ ├── Lattice.java
│ ├── Node.java
│ ├── Sequence.java
│ ├── SequenceList.java
│ └── SparseVector.java
│ └── util
│ ├── DataFormat.java
│ ├── FileUtil.java
│ ├── MatrixOps.java
│ ├── RemoveIllegalChar.java
│ ├── SequeceFeaturePatternExtrator.java
│ └── StringUtil.java
└── tem
├── com
├── ComUtil.java
├── FileUtil.java
├── JC.java
├── MathUtil.java
├── MatrixUtil.java
├── POStags.java
├── Sorting.java
├── Stopwords.java
├── ValueComparator.java
└── wordFreq.java
├── conf
├── ConstantConfig.java
└── PathConfig.java
├── linkas
├── ID.java
├── PR.java
├── TEPR.java
└── TSPR.java
├── main
├── Documents.java
├── FGMM.java
├── LdaGibbsSampling.java
├── LdaModel.java
├── ModelComFunc.java
├── SimpleEvaluate.java
├── TEMModel.java
├── TEMModel1.java
├── TEMModelSampling.java
├── TEMResPaperVisual.java
├── TEMResPro.java
└── TEMResProUserRecMergeU.java
├── parser
├── Porter.java
└── StanfordTokenizer.java
├── script
├── DBConnection.java
├── ExportExpCorpusFromDB.java
├── ExportGraphMatrix.java
├── ExportTagsFromDB.java
├── ExportTestDataForRank.java
├── HandleTagTest.java
├── JAMATest.java
├── MergeUser10.java
├── PageRank2.java
├── PageRankYL.java
├── SimilarQuestionPAexport.java
├── SortByValueDemo.java
└── ValueComparator.java
└── uqa
├── UQAModel.java
├── UQAModelRes.java
└── UQAModelSampling.java
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | NLPTEM
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 |
15 | org.eclipse.jdt.core.javanature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.6
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.source=1.6
12 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | TopicExpertiseModel
2 | ===================
3 |
4 | /**
5 | Copyright (C) 2013 by
6 | SMU Text Mining Group/Singapore Management University/Peking University
7 |
8 | TopicExpertiseModel is distributed for research purpose, but
9 | WITHOUT ANY WARRANTY; without even the implied warranty of
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 |
12 | If you use this code, please cite the following paper:
13 |
14 | Liu Yang, Minghui Qiu, Swapna Gottipati, Feida Zhu, Jing Jiang, Huiping Sun and Zhong Chen. CQARank: Jointly Model Topics and Expertise in Community Question Answering. In Proceedings of the 22nd ACM International Conference on Information and Knowledge Management (CIKM 2013). (http://dl.acm.org/citation.cfm?id=2505720)
15 |
16 | Feel free to contact the following people if you find any
17 | problems in the package.
18 | lyang@cs.umass.edu or yangliuyx@gmail.com * */
19 |
20 | Brief Introduction
21 | ===================
22 |
23 | 1. Community Question Answering (CQA) websites, where people share expertise on open platforms, have become large repositories of valuable knowledge. To bring the best value out of these knowledge repositories, it is critically important for CQA services to know how to find the right experts, retrieve archived similar questions and recommend best answers to new questions. To tackle this cluster of closely related problems in a principled approach, we proposed Topic Expertise Model (TEM), a novel probabilistic generative model with GMM hybrid, to jointly model topics and expertise by integrating textual content model and link structure analysis. Based on TEM results, we proposed CQARank to measure user interests and expertise score under different topics. Leveraging the question answering history based on long-term community reviews and voting, our method could find experts with both similar topical preference and high topical expertise.
24 |
25 | 2. This package implements Gibbs sampling for the Topic Expertise Model, which jointly models topics and expertise in question answering communities. More details of our model are described in the following paper:
26 |
27 | Liu Yang, Minghui Qiu, Swapna Gottipati, Feida Zhu, Jing Jiang, Huiping Sun and Zhong Chen. CQARank: Jointly Model Topics and Expertise in Community Question Answering. In Proceedings of the 22nd ACM International Conference on Information and Knowledge Management (CIKM 2013). (http://dl.acm.org/citation.cfm?id=2505720)
28 |
29 | 3. I didn't upload the data under the ./data folder since the total size is too large. Instead, I uploaded some of the experimental data to a Dropbox folder. You can find the experimental data here: [Download](https://www.dropbox.com/sh/42vei96g0vf56dy/AAATUsvDMq7uXkkPsDF87K5pa?dl=0).
30 |
31 | 4. I am happy that many readers of our CIKM'13 paper have sent me emails with questions about the paper and code since the paper was published. I always try my best to reply to those emails. My latest email address is lyang@cs.umass.edu / yangliuyx@gmail.com. I encourage you to use the "Issues" function on GitHub (https://github.com/yangliuy/TopicExpertiseModel/issues) so that there are QA threads which can be referred to by future readers.
32 |
--------------------------------------------------------------------------------
/bin/edu/smu/data/Alphabet.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/Alphabet.class
--------------------------------------------------------------------------------
/bin/edu/smu/data/DataList.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/DataList.class
--------------------------------------------------------------------------------
/bin/edu/smu/data/Instance.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/Instance.class
--------------------------------------------------------------------------------
/bin/edu/smu/data/InstanceList.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/InstanceList.class
--------------------------------------------------------------------------------
/bin/edu/smu/data/Lattice.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/Lattice.class
--------------------------------------------------------------------------------
/bin/edu/smu/data/Node.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/Node.class
--------------------------------------------------------------------------------
/bin/edu/smu/data/Sequence.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/Sequence.class
--------------------------------------------------------------------------------
/bin/edu/smu/data/SequenceList.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/SequenceList.class
--------------------------------------------------------------------------------
/bin/edu/smu/data/SparseVector.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/data/SparseVector.class
--------------------------------------------------------------------------------
/bin/edu/smu/util/DataFormat.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/DataFormat.class
--------------------------------------------------------------------------------
/bin/edu/smu/util/FileUtil.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/FileUtil.class
--------------------------------------------------------------------------------
/bin/edu/smu/util/MatrixOps.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/MatrixOps.class
--------------------------------------------------------------------------------
/bin/edu/smu/util/RemoveIllegalChar.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/RemoveIllegalChar.class
--------------------------------------------------------------------------------
/bin/edu/smu/util/SequeceFeaturePatternExtrator.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/SequeceFeaturePatternExtrator.class
--------------------------------------------------------------------------------
/bin/edu/smu/util/StringUtil.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/edu/smu/util/StringUtil.class
--------------------------------------------------------------------------------
/bin/tem/com/ComUtil$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/ComUtil$1.class
--------------------------------------------------------------------------------
/bin/tem/com/ComUtil.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/ComUtil.class
--------------------------------------------------------------------------------
/bin/tem/com/FileUtil$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/FileUtil$1.class
--------------------------------------------------------------------------------
/bin/tem/com/FileUtil$2.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/FileUtil$2.class
--------------------------------------------------------------------------------
/bin/tem/com/FileUtil.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/FileUtil.class
--------------------------------------------------------------------------------
/bin/tem/com/JC.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/JC.class
--------------------------------------------------------------------------------
/bin/tem/com/MathUtil.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/MathUtil.class
--------------------------------------------------------------------------------
/bin/tem/com/MatrixUtil.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/MatrixUtil.class
--------------------------------------------------------------------------------
/bin/tem/com/POStags.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/POStags.class
--------------------------------------------------------------------------------
/bin/tem/com/Sorting.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/Sorting.class
--------------------------------------------------------------------------------
/bin/tem/com/Stopwords.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/Stopwords.class
--------------------------------------------------------------------------------
/bin/tem/com/ValueComparator.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/ValueComparator.class
--------------------------------------------------------------------------------
/bin/tem/com/wordFreq.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/com/wordFreq.class
--------------------------------------------------------------------------------
/bin/tem/conf/ConstantConfig.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/conf/ConstantConfig.class
--------------------------------------------------------------------------------
/bin/tem/conf/PathConfig.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/conf/PathConfig.class
--------------------------------------------------------------------------------
/bin/tem/linkas/ID.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/linkas/ID.class
--------------------------------------------------------------------------------
/bin/tem/linkas/PR.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/linkas/PR.class
--------------------------------------------------------------------------------
/bin/tem/linkas/TEPR.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/linkas/TEPR.class
--------------------------------------------------------------------------------
/bin/tem/linkas/TSPR.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/linkas/TSPR.class
--------------------------------------------------------------------------------
/bin/tem/main/Documents$Document.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/Documents$Document.class
--------------------------------------------------------------------------------
/bin/tem/main/Documents.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/Documents.class
--------------------------------------------------------------------------------
/bin/tem/main/FGMM.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/FGMM.class
--------------------------------------------------------------------------------
/bin/tem/main/LdaGibbsSampling$modelparameters.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/LdaGibbsSampling$modelparameters.class
--------------------------------------------------------------------------------
/bin/tem/main/LdaGibbsSampling$parameters.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/LdaGibbsSampling$parameters.class
--------------------------------------------------------------------------------
/bin/tem/main/LdaGibbsSampling.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/LdaGibbsSampling.class
--------------------------------------------------------------------------------
/bin/tem/main/LdaModel$TwordsComparable.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/LdaModel$TwordsComparable.class
--------------------------------------------------------------------------------
/bin/tem/main/LdaModel.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/LdaModel.class
--------------------------------------------------------------------------------
/bin/tem/main/ModelComFunc.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/ModelComFunc.class
--------------------------------------------------------------------------------
/bin/tem/main/SimpleEvaluate$TwordsComparable.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/SimpleEvaluate$TwordsComparable.class
--------------------------------------------------------------------------------
/bin/tem/main/SimpleEvaluate.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/SimpleEvaluate.class
--------------------------------------------------------------------------------
/bin/tem/main/TEMModel$TwordsComparable.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModel$TwordsComparable.class
--------------------------------------------------------------------------------
/bin/tem/main/TEMModel.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModel.class
--------------------------------------------------------------------------------
/bin/tem/main/TEMModel1$TwordsComparable.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModel1$TwordsComparable.class
--------------------------------------------------------------------------------
/bin/tem/main/TEMModel1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModel1.class
--------------------------------------------------------------------------------
/bin/tem/main/TEMModelSampling$modelparameters.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModelSampling$modelparameters.class
--------------------------------------------------------------------------------
/bin/tem/main/TEMModelSampling$parameters.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModelSampling$parameters.class
--------------------------------------------------------------------------------
/bin/tem/main/TEMModelSampling.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMModelSampling.class
--------------------------------------------------------------------------------
/bin/tem/main/TEMResPaperVisual.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMResPaperVisual.class
--------------------------------------------------------------------------------
/bin/tem/main/TEMResPro$Post.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMResPro$Post.class
--------------------------------------------------------------------------------
/bin/tem/main/TEMResPro.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMResPro.class
--------------------------------------------------------------------------------
/bin/tem/main/TEMResProUserRecMergeU.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/main/TEMResProUserRecMergeU.class
--------------------------------------------------------------------------------
/bin/tem/parser/NewString.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/parser/NewString.class
--------------------------------------------------------------------------------
/bin/tem/parser/Porter.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/parser/Porter.class
--------------------------------------------------------------------------------
/bin/tem/parser/StanfordTokenizer.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/parser/StanfordTokenizer.class
--------------------------------------------------------------------------------
/bin/tem/script/DBConnection.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/DBConnection.class
--------------------------------------------------------------------------------
/bin/tem/script/ExportExpCorpusFromDB.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/ExportExpCorpusFromDB.class
--------------------------------------------------------------------------------
/bin/tem/script/ExportGraphMatrix.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/ExportGraphMatrix.class
--------------------------------------------------------------------------------
/bin/tem/script/ExportTagsFromDB.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/ExportTagsFromDB.class
--------------------------------------------------------------------------------
/bin/tem/script/ExportTestDataForRank.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/ExportTestDataForRank.class
--------------------------------------------------------------------------------
/bin/tem/script/HandleTagTest.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/HandleTagTest.class
--------------------------------------------------------------------------------
/bin/tem/script/JAMATest.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/JAMATest.class
--------------------------------------------------------------------------------
/bin/tem/script/MergeUser10.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/MergeUser10.class
--------------------------------------------------------------------------------
/bin/tem/script/PageRank2.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/PageRank2.class
--------------------------------------------------------------------------------
/bin/tem/script/PageRankYL.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/PageRankYL.class
--------------------------------------------------------------------------------
/bin/tem/script/SimilarQuestionPAexport.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/SimilarQuestionPAexport.class
--------------------------------------------------------------------------------
/bin/tem/script/SortByValueDemo.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/SortByValueDemo.class
--------------------------------------------------------------------------------
/bin/tem/script/ValueComparator.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/script/ValueComparator.class
--------------------------------------------------------------------------------
/bin/tem/uqa/UQAModel$Pair.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/uqa/UQAModel$Pair.class
--------------------------------------------------------------------------------
/bin/tem/uqa/UQAModel.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/uqa/UQAModel.class
--------------------------------------------------------------------------------
/bin/tem/uqa/UQAModelRes.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/uqa/UQAModelRes.class
--------------------------------------------------------------------------------
/bin/tem/uqa/UQAModelSampling.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/bin/tem/uqa/UQAModelSampling.class
--------------------------------------------------------------------------------
/src/edu/smu/data/Alphabet.java:
--------------------------------------------------------------------------------
1 | package edu.smu.data;
2 |
3 | import java.util.*;
4 | import java.io.*;
5 |
6 | /**
7 | * An Alphabet object stores the mapping between symbols (represented by
8 | * String objects) and integers (represented by Integer objects). It can be
9 | * used to map feature strings to feature indices, for example, or to map
10 | * class labels to class indices.
11 | *
12 | * A symbol can never be deleted from an Alphabet object once it has been
13 | * added. Integers are assigned to symbols sequentially, starting from 0.
14 | * For example, suppose we have the following code to insert symbols into an
15 | * Alphabet object:
16 | *
17 | * Alphabet alpha = new Alphabet();
18 | * alpha.addSymbol("a");
19 | * alpha.addSymbol("b");
20 | * alpha.addSymbol("z");
21 | * alpha.addSymbol("a");
22 | * alpha.addSymbol("c");
23 | *
24 | * Then internally the following mapping is stored:
25 | *
26 | * a -- 0
27 | * b -- 1
28 | * z -- 2
29 | * c -- 3
30 | */
31 |
public class Alphabet {

    /**
     * Constructs a new Alphabet object with no symbol stored.
     */
    public Alphabet() {
        indices = new HashMap<String, Integer>();
        symbols = new ArrayList<String>();
    }

    /**
     * Constructs an Alphabet and inserts the given symbols in array order.
     * @param symbols The symbols to add; duplicates and null entries do not
     * receive a new index
     */
    public Alphabet(String[] symbols) {
        indices = new HashMap<String, Integer>();
        this.symbols = new ArrayList<String>();
        addSymbols(symbols);
    }

    /**
     * Adds a new symbol into the Alphabet object, and returns the integer
     * assigned to this symbol. If this symbol is already stored in the Alphabet
     * then no new integer is assigned to it and the old integer assigned to it
     * is returned.
     * @param sym A symbol to be added
     * @return The index assigned to the symbol, or -1 if sym is null
     */
    public int addSymbol(String sym) {
        if (sym == null) {
            return -1;
        }
        Integer existing = indices.get(sym);
        if (existing != null) {
            return existing.intValue();
        }
        // Indices are dense and sequential, so the next index is the current size.
        int index = indices.size();
        indices.put(sym, Integer.valueOf(index));
        symbols.add(sym);
        return index;
    }

    /**
     * Returns the index associated with the symbol.
     * @param sym A symbol of which the index is to be returned
     * @return The index associated with the given symbol or -1 if the symbol is
     * not stored in the Alphabet
     */
    public int getIndex(String sym) {
        Integer index = indices.get(sym);
        return index == null ? -1 : index.intValue();
    }

    /**
     * Returns the symbol at the given index position.
     * @param index The index position at which the symbol is to be returned
     * @return The symbol at the given index position or null if the index is
     * out of range (index < 0 || index >= size())
     */
    public String getSymbol(int index) {
        if (index >= 0 && index < symbols.size()) {
            return symbols.get(index);
        }
        return null;
    }

    /**
     * Returns the size of the Alphabet.
     * @return The size of this Alphabet object, i.e. the number of symbols
     * stored in the Alphabet.
     */
    public int size() {
        return indices.size();
    }

    /**
     * Adds an array of symbols to this Alphabet, in array order.
     * @param symbols An array of Strings; asserted to be non-empty
     */
    public void addSymbols(String[] symbols) {
        assert (symbols.length > 0);
        for (int i = 0; i < symbols.length; i++) {
            addSymbol(symbols[i]);
        }
    }

    /**
     * Prints the stored indices (in the map's iteration order), then the
     * number of symbols in brackets, then the symbols in index order.
     */
    public void display() {
        for (String key : indices.keySet()) {
            System.out.print(indices.get(key) + " ");
        }
        System.out.println("\n" + "[" + symbols.size() + "]");
        for (int i = 0; i < symbols.size(); i++) {
            System.out.print(symbols.get(i) + " ");
        }
    }

    /**
     * Writes the vocabulary to a file, one "symbol index" pair per line.
     * Lines are written in index order so the output is deterministic, and
     * the writer is closed even when a write fails.
     * @param file Path of the file to (over)write
     * @throws IOException If the file cannot be opened or written
     */
    public void saveVocab(String file) throws IOException {
        BufferedWriter out = new BufferedWriter(
                new FileWriter(new File(file)));
        try {
            for (int id = 0; id < symbols.size(); id++) {
                out.write(symbols.get(id) + " " + id + "\n");
            }
            out.flush();
        } finally {
            out.close();
        }
    }

    // symbol -> index
    private HashMap<String, Integer> indices;
    // index -> symbol; always holds the same symbols as "indices"
    private ArrayList<String> symbols;

}
--------------------------------------------------------------------------------
/src/edu/smu/data/DataList.java:
--------------------------------------------------------------------------------
1 | package edu.smu.data;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.Iterator;
6 |
7 | import edu.smu.util.MatrixOps;
8 |
9 | /**
10 | * A class that manages a collection of instances.
11 | */
12 | public class DataList implements Iterable {
13 |
14 | public DataList(){
15 | this.dataSet = new ArrayList();
16 | length = dataSet.size();
17 | iter = dataSet.iterator();
18 | }
19 |
20 | public DataList( ArrayList dataSet){
21 | this.dataSet = new ArrayList(dataSet);
22 | length = dataSet.size();
23 | iter = dataSet.iterator();
24 | }
25 | public int size(){
26 | return length;
27 | }
28 | public Iterator iterator() {
29 | return iter;
30 | }
31 |
32 | public T get(int index){
33 | assert(index < length && index >= 0);
34 | return dataSet.get(index);
35 | }
36 |
37 | public DataList[] split (double[] proportions) {
38 | return split (new java.util.Random(System.currentTimeMillis()), proportions);
39 | }
40 |
41 | public DataList deepClone () {
42 | DataList ret = new DataList( dataSet );
43 | return ret;
44 | }
45 | public void shuffle (java.util.Random r) {
46 | Collections.shuffle (dataSet, r);
47 | }
48 | /**
49 | * Randomly permute the specified InstanceList using the specified source of randomness. And then split it into a array of InstanceList
50 | * @param r
51 | * @param proportions
52 | * @return
53 | */
54 | public DataList[] split (java.util.Random r, double[] proportions) {
55 | DataList shuffled = this.deepClone();
56 | shuffled.shuffle (r);
57 | return shuffled.splitInOrder(proportions);
58 | }
59 | /**
60 | *
61 | * @param A array of proportions to divide the whole instance list
62 | * @return A array of InstanceList
63 | */
64 | public DataList[] splitInOrder (double[] proportions) {
65 | DataList[] ret = new DataList[proportions.length];
66 | double maxind[] = proportions.clone();
67 | MatrixOps.normalize(maxind);
68 | for (int i = 0; i < maxind.length; i++) {
69 | ret[i] = new DataList();
70 | if (i > 0)
71 | maxind[i] += maxind[i-1];
72 | }
73 | for (int i = 0; i < maxind.length; i++) {
74 | // Fill maxind[] with the highest instance index to go in each corresponding returned InstanceList
75 | maxind[i] = Math.rint (maxind[i] * this.size());
76 | }
77 | for (int i = 0, j = 0; i < size(); i++) {
78 | // This gives a slight bias toward putting an extra instance in the last InstanceList.
79 | while (i >= maxind[j] && j < ret.length)
80 | j++;
81 | ret[j].add(dataSet.get(i));
82 | }
83 | return ret;
84 | }
85 | /**
86 | * Add an instance to current list
87 | * @param an instance to be added in the instance list
88 | */
89 | public void add(T instance) {
90 | assert(instance != null);
91 | dataSet.add(instance);
92 | length = dataSet.size();
93 | }
94 |
95 | //Iterable
96 | protected Iterator iter;
97 | //Storing the instance lists
98 | protected ArrayList dataSet;
99 | //The size of dataset
100 | protected int length;
101 | protected Alphabet labelSet;
102 | protected Alphabet featSet;
103 | }
104 |
--------------------------------------------------------------------------------
/src/edu/smu/data/Instance.java:
--------------------------------------------------------------------------------
1 | package edu.smu.data;
2 |
3 | import java.util.*;
4 | import java.io.*;
5 |
6 | /**
7 | * An Instance object stores a sparse vector that represents an observation
8 | * together with a label for this observation.
9 | */
10 | public class Instance { // Holds a feature vector, a label, a predicted label and an id.
11 |
12 | public Instance(SparseVector featVec, int label) { // No id given: id is set to "UNKNOWN".
13 | this.featVec = featVec;
14 | this.label = label;
15 | id = "UNKNOWN";
16 | predictLabel = -1; // stays -1 until setPredictLabel is called
17 | }
18 |
19 | public Instance(SparseVector featVec, int label, String id) { // Same as above, with a caller-supplied id.
20 | this.featVec = featVec;
21 | this.label = label;
22 | this.id = id;
23 | predictLabel = -1; // stays -1 until setPredictLabel is called
24 | }
25 |
26 | public void setFeaVector( SparseVector featVec ){ // Replaces the stored feature vector reference.
27 | this.featVec = featVec;
28 | }
29 |
30 | public void setLabel(int label){ // Replaces the stored label.
31 | this.label = label;
32 | }
33 |
34 |
35 | public SparseVector getFeatureVector() { // Returns the stored reference, not a copy.
36 | return featVec;
37 | }
38 |
39 | public int getLabel() {
40 | return label;
41 | }
42 |
43 | public int getPredictLabel(){ // Returns -1 if setPredictLabel has not been called.
44 | return predictLabel;
45 | }
46 |
47 | public void setPredictLabel(int l){
48 | predictLabel = l;
49 | }
50 |
51 | public String getID() {
52 | return id;
53 | }
54 |
55 | public void display() { // Prints the id, the label and the feature vector to standard output.
56 | System.out.println("--------------------------------------------");
57 | System.out.println("Id=" + id );
58 | System.out.println("Label="+ label );
59 | featVec.display(); // throws NullPointerException if featVec is null
60 | System.out.println("--------------------------------------------");
61 | }
62 | // The feature vector that represents this Instance object.
63 | protected SparseVector featVec;
64 |
65 | // The class label of this Instance object. The label ranges from 0 to (C-1)
66 | // where C is the total number of classes. If label is set to -1, it means
67 | // this Instance is unlabeled.
68 | protected int label;
69 |
70 | protected int predictLabel; // Predicted label; both constructors initialise it to -1.
71 | // A String that can be used to identify this Instance if needed. E.g. if the
72 | // Instance object is a document, the id can be the document ID. It is not
73 | // necessary to set this id.
74 | protected String id;
75 |
76 | }
--------------------------------------------------------------------------------
/src/edu/smu/data/InstanceList.java:
--------------------------------------------------------------------------------
1 | package edu.smu.data;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.Iterator;
6 |
7 | import edu.smu.util.MatrixOps;
8 | /**
9 | * A class that manages a collection of instances.
10 | */
11 | public class InstanceList implements Iterable {
12 |
13 | public InstanceList(){
14 | this.dataSet = new ArrayList();
15 | length = dataSet.size();
16 | iter = dataSet.iterator();
17 | }
18 |
19 | public InstanceList( ArrayList dataSet){
20 | this.dataSet = new ArrayList(dataSet);
21 | length = dataSet.size();
22 | iter = dataSet.iterator();
23 | }
24 | public int size(){
25 | return length;
26 | }
27 | public Iterator iterator() {
28 | return iter;
29 | }
30 |
31 | public Instance get(int index){
32 | assert(index < length && index >= 0);
33 | return dataSet.get(index);
34 | }
35 |
36 | public InstanceList[] split (double[] proportions) {
37 | return split (new java.util.Random(System.currentTimeMillis()), proportions);
38 | }
39 |
40 | public InstanceList deepClone () {
41 | InstanceList ret = new InstanceList( dataSet );
42 | return ret;
43 | }
44 | public void shuffle (java.util.Random r) {
45 | Collections.shuffle (dataSet, r);
46 | }
47 | /**
48 | * Randomly permute the specified InstanceList using the specified source of randomness. And then split it into a array of InstanceList
49 | * @param r
50 | * @param proportions
51 | * @return
52 | */
53 | public InstanceList[] split (java.util.Random r, double[] proportions) {
54 | InstanceList shuffled = this.deepClone();
55 | shuffled.shuffle (r);
56 | return shuffled.splitInOrder(proportions);
57 | }
58 | /**
59 | *
60 | * @param A array of proportions to divide the whole instance list
61 | * @return A array of InstanceList
62 | */
63 | public InstanceList[] splitInOrder (double[] proportions) {
64 | InstanceList[] ret = new InstanceList[proportions.length];
65 | double maxind[] = proportions.clone();
66 | MatrixOps.normalize(maxind);
67 | for (int i = 0; i < maxind.length; i++) {
68 | ret[i] = new InstanceList();
69 | if (i > 0)
70 | maxind[i] += maxind[i-1];
71 | }
72 | for (int i = 0; i < maxind.length; i++) {
73 | // Fill maxind[] with the highest instance index to go in each corresponding returned InstanceList
74 | maxind[i] = Math.rint (maxind[i] * this.size());
75 | }
76 | for (int i = 0, j = 0; i < size(); i++) {
77 | // This gives a slight bias toward putting an extra instance in the last InstanceList.
78 | while (i >= maxind[j] && j < ret.length)
79 | j++;
80 | ret[j].add(dataSet.get(i));
81 | }
82 | return ret;
83 | }
84 | /**
85 | * Add an instance to current list
86 | * @param an instance to be added in the instance list
87 | */
88 | public void add(Instance instance) {
89 | assert(instance != null);
90 | dataSet.add(instance);
91 | length = dataSet.size();
92 | }
93 | public void display(){
94 | System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
95 | for(int i = 0; i < length; i++ ){
96 | dataSet.get(i).display();
97 | }
98 | System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
99 | }
100 | //Iterable
101 | private Iterator iter;
102 | //Storing the instance lists
103 | private ArrayList dataSet;
104 | //The size of dataset
105 | private int length;
106 | }
107 |
--------------------------------------------------------------------------------
/src/edu/smu/data/Lattice.java:
--------------------------------------------------------------------------------
1 | package edu.smu.data;
2 | //To make the program run fast, we don't use this class right now!
3 | public class Lattice { // Wraps a 2-D array of Node cells and a 2-D cost array supplied by the caller.
4 | public Lattice(int row, int col, int numLabels, Node[][] net, double[][] cost){ // Stores the given references; nothing is copied or validated.
5 | this.row = row;
6 | this.col = col;
7 | this.numLabels = numLabels;
8 | this.net = net;
9 | this.cost = cost;
10 | }
11 | public double getCostOf(int x, int y){ // Returns cost[x][y]; indices are not range-checked here.
12 | return cost[x][y];
13 | }
14 | public void setBestLabel(int x, int y, int l ){ // Delegates to Node.setLabel on cell (x, y).
15 | net[x][y].setLabel(l);
16 | }
17 | public int getPrevNodeOf(int x, int y){ // Delegates to Node.getPrevNode on cell (x, y).
18 | return net[x][y].getPrevNode();
19 | }
20 | public void readPrevProbsOf(int x, int y, double[] probs){ // Passes probs to Node.getPrevProbs; presumably filled as an output buffer.
21 | net[x][y].getPrevProbs(probs);
22 | }
23 | public void setCurProbsOf(int x, int y, double[] probs){ // Delegates to Node.setCurProbs on cell (x, y).
24 | net[x][y].setCurProbs(probs);
25 | }
26 | public int getRow(){
27 | return row;
28 | }
29 | public int getCol(){
30 | return col;
31 | }
32 | public int getNumFeatures(){ // NOTE(review): returns numLabels despite the method name - confirm intent.
33 | return numLabels;
34 | }
35 | private double[][] cost; // Reference passed to the constructor.
36 | private int row; // Value passed to the constructor; not checked against net or cost.
37 | private int col; // Value passed to the constructor; not checked against net or cost.
38 | private int numLabels;
39 | private Node[][] net; // Reference passed to the constructor.
40 | }
41 |
--------------------------------------------------------------------------------
/src/edu/smu/data/Node.java:
--------------------------------------------------------------------------------
1 | package edu.smu.data;
2 | //To make the program run fast, we don't use this class right now!
3 | import java.util.Vector;
4 |
5 | import edu.smu.util.MatrixOps;
6 |
7 | public class Node { // One lattice cell: two probability arrays of numLabels slots plus a best label.
8 | public Node(int numLabels){ // Allocates both arrays with numLabels slots; bestLabel starts at -1.
9 | this.numLabels = numLabels;
10 | prevProbs = new double[numLabels];
11 | curProbs = new double[numLabels];
12 | MatrixOps.setAll(prevProbs, 0); // presumably fills the array with 0; MatrixOps is not shown here
13 | MatrixOps.setAll(curProbs, 0);
14 | bestLabel = -1;
15 | }
16 | public Node(int numLabels, double[] prevProbs){ // As above, but also passes the given prevProbs to MatrixOps.set.
17 | this.numLabels = numLabels;
18 | this.prevProbs = new double[numLabels];
19 | this.curProbs = new double[numLabels];
20 | MatrixOps.set(this.prevProbs, prevProbs); // presumably copies second argument into first - TODO confirm against MatrixOps
21 | MatrixOps.setAll(curProbs, 0);
22 | bestLabel = -1;
23 | }
24 | public void getPrevProbs(double[] probs){ // Output-parameter style: probs is handed to MatrixOps.set as the first argument.
25 | MatrixOps.set(probs, prevProbs);
26 | }
27 | public void getCurProbs(double[] probs){ // Output-parameter style, as in getPrevProbs.
28 | MatrixOps.set(probs, curProbs);
29 | }
30 | public void setCurProbs(double[] probs){ // curProbs is the first argument here, probs the second.
31 | MatrixOps.set(curProbs, probs);
32 | }
33 | public int getBestLabel(){ // Returns -1 until setLabel is called.
34 | return bestLabel;
35 | }
36 | public void setLabel(int label){
37 | bestLabel = label;
38 | }
39 | public int getPrevNode(){ // prevNode is never assigned in this class, so this returns the int default 0.
40 | return prevNode;
41 | }
42 | private int numLabels; // Stored by the constructors; not read by any method in this class.
43 | private int prevNode;
44 | private int bestLabel;
45 | double[] prevProbs; // package-private (no access modifier)
46 | double[] curProbs; // package-private (no access modifier)
47 | }
48 |
--------------------------------------------------------------------------------
/src/edu/smu/data/Sequence.java:
--------------------------------------------------------------------------------
1 | package edu.smu.data;
2 |
3 |
4 | import java.util.ArrayList;
5 | import java.util.Collections;
6 | import java.util.Iterator;
7 |
8 |
9 | /**
10 | * A class that manages a sequence of instances.
11 | */
12 | public class Sequence implements Iterable , Comparable {
13 |
14 | public Sequence(){
15 | this.dataSet = new ArrayList();
16 | length = dataSet.size();
17 | iter = dataSet.iterator();
18 | }
19 |
20 | public Sequence( ArrayList dataSet){
21 | this.dataSet = new ArrayList(dataSet);
22 | iter = dataSet.iterator();
23 | }
24 |
25 | public Instance getInstance(int idx){
26 | assert(idx >= 0 && idx < length);
27 | return dataSet.get(idx);
28 | }
29 |
30 | public void addInstance(Instance inst){
31 | dataSet.add(inst);
32 | length = dataSet.size();
33 | }
34 | public int size(){
35 | return length;// = dataSet.size();
36 | }
37 | public Iterator iterator() {
38 | return iter;
39 | }
40 |
41 | public Instance get(int index){
42 | assert(index < length && index >= 0);
43 | return dataSet.get(index);
44 | }
45 |
46 | public InstanceList deepClone () {
47 | InstanceList ret = new InstanceList( dataSet );
48 | return ret;
49 | }
50 |
51 | public void display(){
52 | //System.out.println(dataSet.size());
53 | System.out.println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^");
54 | for(int i = 0; i < length; i++ ){
55 | dataSet.get(i).display();
56 | if( i != length - 1)
57 | System.out.println("=>");
58 | }
59 | System.out.println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^");
60 | }
61 | public int compareTo(Object o) {
62 | Sequence B = (Sequence)o;
63 | double bp = B.getProb();
64 | if( prob < bp )
65 | return 1;
66 | else if( prob > bp ){
67 | return -1;
68 | } else if( prob == bp ){
69 | return 0;
70 | }
71 | return 0;
72 | }
73 | public double getProb(){
74 | return prob;
75 | }
76 | public void setProb(double prob){
77 | this.prob = prob;
78 | }
79 | public boolean isLabelIn(int label){
80 | for(int i = 0; i < dataSet.size(); i++ ){
81 | Instance inst = dataSet.get(i);
82 | int id = inst.getPredictLabel();
83 | if( id == label ){
84 | return true;
85 | }
86 | }
87 | return false;
88 | }
89 | //
90 | private double prob = -1.0;
91 | //Iterable
92 | private Iterator iter;
93 | //Storing the instance lists
94 | private ArrayList dataSet;
95 | //The size of dataset
96 | private int length;
97 | }
98 |
--------------------------------------------------------------------------------
/src/edu/smu/data/SequenceList.java:
--------------------------------------------------------------------------------
1 | package edu.smu.data;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Iterator;
5 |
6 | public class SequenceList {
7 |
8 | public SequenceList(Alphabet labelSet){
9 | this.labelSet = labelSet;
10 |
11 | dataSet = new ArrayList();
12 | iter = dataSet.iterator();
13 | }
14 |
15 | public SequenceList(Alphabet labelSet, ArrayList arr ){
16 | //System.out.println(arr.size());
17 | this.labelSet = labelSet;
18 | this.dataSet = new ArrayList(arr);
19 | iter = dataSet.iterator();
20 | //System.out.println("->" + dataSet.size());
21 | length = this.dataSet.size();
22 | }
23 |
24 | public void addSequence(Sequence seq){
25 | dataSet.add(seq);
26 | length = dataSet.size();
27 | }
28 | public int size() {
29 | return length;
30 | }
31 |
32 | public Sequence getSequence(int idx){
33 | assert(idx >= 0 && idx < length);
34 | return dataSet.get(idx);
35 | }
36 |
37 | public void getArrayList(ArrayList arrSeq){
38 | for(int i = 0; i < length; i++ ){
39 | arrSeq.add(dataSet.get(i));
40 | }
41 | }
42 | public void getInstanceList(ArrayList arrSeq){
43 | for(int i = 0; i < length; i++ ){
44 | Sequence seq = dataSet.get(i);
45 | for(int j = 0; j < seq.size(); j++ ){
46 | arrSeq.add(seq.get(j));
47 | }
48 | }
49 | }
50 | /*public SequenceList deepClone () {
51 | ArrayList ret = new ArrayList( dataSet );
52 | return ret;
53 | }*/
54 | public Iterator iterator() {
55 | return iter;
56 | }
57 | public InstanceList[] splitByPreviousLabel(){
58 | //Iterator iterSequence = this.iterator();
59 | //while( iterSequence.hasNext() ){
60 | //Sequence seq = iterSequence.next();
61 | /*Iterator iterInst = seq.iterator();
62 | int prev = -1;
63 | while( iterInst.hasNext() ){
64 | Instance inst = iterInst.next();
65 | if( prev != -1 ){
66 | instList[prev].add(inst);
67 | }
68 | prev = inst.getLabel();
69 | }*/
70 | instList = new InstanceList[labelSet.size()];
71 | for(int i = 0; i < labelSet.size(); i++){
72 | instList[i] = new InstanceList();
73 | }
74 | for(int s = 0; s < length; s++ ){
75 | Sequence seq = dataSet.get(s);
76 | //dataSet.r
77 | int prev = -1;
78 | for(int i = 0; i < seq.size(); i++ ){
79 | Instance inst = seq.get(i);
80 | if( prev != -1 ){
81 | //System.out.println("prev=" + prev);
82 | instList[prev].add(inst);
83 | }
84 | prev = inst.getLabel();
85 | }
86 | }
87 | //System.out.println( labelSet.size() );
88 | return instList;
89 | }
90 |
91 | public void display(){
92 | System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
93 | //System.out.println(dataSet.size());
94 | for(int i = 0; i < dataSet.size(); i++ ){
95 | dataSet.get(i).display();
96 | }
97 | System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
98 | }
99 | protected Iterator iter;
100 | //Storing the instance lists
101 | protected ArrayList dataSet;
102 | //The size of dataset
103 | protected int length;
104 | protected Alphabet labelSet;
105 | protected Alphabet featSet;
106 | private InstanceList[] instList;
107 | }
108 |
--------------------------------------------------------------------------------
/src/edu/smu/data/SparseVector.java:
--------------------------------------------------------------------------------
1 | package edu.smu.data;
2 |
3 | import java.util.*;
4 | import java.io.*;
5 |
6 | /**
7 | * A SparseVector object represents a sparse vector. It stores the indices of
8 | * the features that have non-zero values and their corresponding feature
9 | * values.
10 | */
11 |
12 | // To be completed!
13 |
public class SparseVector {

    /**
     * Builds a sparse vector from parallel arrays of feature indices and
     * feature values. Entries whose value is exactly 0.0 are dropped; the
     * remaining entries keep their input order.
     *
     * Note that right now SparseVector doesn't support add or remove
     * operations, which would be time-consuming with this layout.
     * @param indices Feature indices
     * @param values Feature values, one per index
     * @throws IllegalArgumentException If the two arrays differ in length
     */
    public SparseVector(int[] indices, double[] values) {
        if (indices.length != values.length) {
            throw new IllegalArgumentException(
                    "indices.length (" + indices.length + ") != values.length (" + values.length + ")");
        }

        capacity = indices.length + 1;
        this.indices = new int[capacity];
        this.values = new double[capacity];

        int cnt = 0;
        for (int i = 0; i < indices.length; i++) {
            if (values[i] != 0.0) {
                this.indices[cnt] = indices[i];
                this.values[cnt] = values[i];
                cnt++;
            }
        }
        length = cnt;
    }

    /**
     * Returns the number of entries (non-zero features) stored in this
     * SparseVector object.
     * @return The number of entries in this SparseVector.
     */
    public int numEntries() {
        return length;
    }

    /**
     * Returns the index of the i'th feature stored in this SparseVector. For
     * example, suppose a SparseVector has the following feature indices and
     * feature values:
     *
     * 2 1.5
     * 5 0.5
     * 9 1.0
     *
     * Then calling getFeatureIndexAt(0) returns 2 and calling
     * getFeatureIndexAt(2) returns 9.
     * @param i The location of the entry from which a feature index is to be
     * returned.
     * @return The feature index stored in the specified entry.
     * @throws IndexOutOfBoundsException If i is not in [0, numEntries())
     */
    public int getFeatureIndexAt(int i) {
        checkPosition(i);
        return indices[i];
    }

    /**
     * Returns the value of the i'th feature stored in this SparseVector. For
     * example, suppose a SparseVector has the following feature indices and
     * feature values:
     *
     * 2 1.5
     * 5 0.5
     * 9 1.0
     *
     * Then calling getFeatureValueAt(0) returns 1.5 and calling
     * getFeatureValueAt(2) returns 1.0.
     * @param i The location of the entry from which a feature value is to be
     * returned.
     * @return The feature value stored in the specified entry.
     * @throws IndexOutOfBoundsException If i is not in [0, numEntries())
     */
    public double getFeatureValueAt(int i) {
        checkPosition(i);
        return values[i];
    }

    /**
     * Same as numEntries.
     * @return The number of entries in this SparseVector.
     */
    public int size() {
        return length;
    }

    /**
     * Prints the entries to standard output as "(index,value)" pairs
     * separated by " , ", ending with a line break. Prints nothing for an
     * empty vector.
     */
    public void display() {
        for (int i = 0; i < length; i++) {
            System.out.print("(" + indices[i] + "," + values[i] + ")");
            if (i != length - 1) {
                System.out.print(" , ");
            } else {
                System.out.println();
            }
        }
    }

    // Rejects positions outside the stored entries. The backing arrays are
    // larger than "length", so a plain array access would not catch these.
    private void checkPosition(int i) {
        if (i < 0 || i >= length) {
            throw new IndexOutOfBoundsException(
                    "position " + i + " is outside [0, " + length + ")");
        }
    }

    // The following attributes are possible ways to implement this class but
    // other data structures are also possible.

    private int[] indices; // The indices of the features that have non-zero values.
    private double[] values; // The values corresponding to the features in "indices".
    private int length; // Number of slots in use in both arrays.
    private int capacity; // Allocated size of both arrays.
}
--------------------------------------------------------------------------------
/src/edu/smu/util/FileUtil.java:
--------------------------------------------------------------------------------
1 | package edu.smu.util;
2 |
3 | import java.util.*;
4 | import java.io.*;
5 |
public class FileUtil {

    /**
     * Reads a text file and appends each of its lines to the given list.
     * I/O failures are reported with printStackTrace and otherwise ignored;
     * lines read before the failure stay in the list.
     * @param file Path of the file to read
     * @param lines Destination list
     */
    public static void readLines(String file, ArrayList<String> lines) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(file)));
            String line = null;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException, which is an IOException.
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Reads a text file in which blank lines separate groups of lines. Each
     * group is appended to the given list as one string, with its lines
     * joined by "@". Runs of blank lines produce no empty entries. I/O
     * failures are reported with printStackTrace and otherwise ignored.
     * @param file Path of the file to read
     * @param lines Destination list, one entry per group
     */
    public static void readLinesBySequence(String file, ArrayList<String> lines) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(file)));
            // A builder avoids re-copying the group text for every line.
            StringBuilder content = new StringBuilder();
            String line = null;
            while ((line = reader.readLine()) != null) {
                if (line.length() > 0) {
                    if (content.length() > 0) {
                        content.append('@');
                    }
                    content.append(line);
                } else if (content.length() > 0) {
                    lines.add(content.toString());
                    content.setLength(0);
                }
            }
            // The last group is not necessarily followed by a blank line.
            if (content.length() > 0) {
                lines.add(content.toString());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Writes the given lines to a file, each followed by "\n". An existing
     * file is overwritten. I/O failures are reported with printStackTrace
     * and otherwise ignored.
     * @param file Path of the file to write
     * @param lines The lines to write
     */
    public static void writeLines(String file, ArrayList<String> lines) {
        BufferedWriter writer = null;
        try {
            writer = new BufferedWriter(new FileWriter(new File(file)));
            for (int i = 0; i < lines.size(); i++) {
                writer.write(lines.get(i) + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(writer);
        }
    }

    // Closes the stream if it was opened; a failure to close is only printed.
    private static void closeQuietly(Closeable stream) {
        if (stream != null) {
            try {
                stream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}
--------------------------------------------------------------------------------
/src/edu/smu/util/RemoveIllegalChar.java:
--------------------------------------------------------------------------------
1 | package edu.smu.util;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.BufferedWriter;
5 | import java.io.File;
6 | import java.io.FileNotFoundException;
7 | import java.io.FileReader;
8 | import java.io.FileWriter;
9 | import java.io.IOException;
10 | import java.util.StringTokenizer;
11 |
public class RemoveIllegalChar {

    // Locations used by main().
    private static final String DEFAULT_INPUT =
            "C:\\cygwin\\home\\xzhao\\opinion_mining\\data\\hotel.txt";
    private static final String DEFAULT_OUTPUT =
            "C:\\cygwin\\home\\xzhao\\opinion_mining\\data\\hotel.good.txt";

    /**
     * Filters the default input file into the default output file.
     * @throws IOException If either file cannot be opened, read or written
     */
    public static void main() throws IOException {
        filterFile(DEFAULT_INPUT, DEFAULT_OUTPUT);
    }

    /**
     * Copies a text file line by line, keeping only the whitespace-separated
     * tokens that contain an underscore. Each kept token is followed by a
     * single space and each input line yields exactly one output line, which
     * may be empty. Both streams are closed before returning, so buffered
     * output reaches the file.
     * @param inputPath Path of the file to read
     * @param outputPath Path of the file to (over)write
     * @throws IOException If either file cannot be opened, read or written
     */
    public static void filterFile(String inputPath, String outputPath) throws IOException {
        BufferedReader in = new BufferedReader(new FileReader(new File(inputPath)));
        try {
            BufferedWriter out = new BufferedWriter(new FileWriter(new File(outputPath)));
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    StringTokenizer st = new StringTokenizer(line);
                    while (st.hasMoreTokens()) {
                        String word = st.nextToken();
                        // Tokens without an underscore are dropped.
                        if (word.indexOf("_") == -1) {
                            continue;
                        }
                        out.write(word + " ");
                    }
                    out.write("\n");
                }
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
    }
}
33 |
--------------------------------------------------------------------------------
/src/edu/smu/util/SequeceFeaturePatternExtrator.java:
--------------------------------------------------------------------------------
1 | package edu.smu.util;
2 |
3 | import java.util.ArrayList;
4 |
5 | import edu.smu.data.Alphabet;
6 | import edu.smu.data.Instance;
7 | import edu.smu.data.Sequence;
8 | import edu.smu.data.SequenceList;
9 | import edu.smu.data.SparseVector;
10 |
11 | public class SequeceFeaturePatternExtrator {
12 | public static SequenceList getTrainSeqListFromFileWithPreviousLabel(String dataFile, String templateFile, Alphabet featSet, Alphabet labelSet){
13 | ArrayList dataLines = new ArrayList();//
14 | FileUtil.readLinesBySequence(dataFile, dataLines);
15 |
16 | ArrayList templateLines = new ArrayList();//
17 | FileUtil.readLines(templateFile, templateLines);
18 |
19 | //System.out.println(dataLines.size());
20 |
21 | int[] x = new int[templateLines.size()];
22 | int[] y = new int[templateLines.size()];
23 | for(int i = 0; i < templateLines.size(); i++){
24 | String[] pat = templateLines.get(i).split(",");
25 | x[i] = new Integer(pat[0]);
26 | y[i] = new Integer(pat[1]);
27 | }
28 |
29 | SequenceList seqList = new SequenceList(labelSet);
30 | featSet.addSymbol("ME_BIAS");
31 |
32 | for(int i = 0; i < dataLines.size(); i++){
33 | //System.out.println(dataLines.get(i));
34 | seqList.addSequence( str2Seq(dataLines.get(i), x, y, featSet, labelSet, Integer.MAX_VALUE ) );
35 | }
36 | return seqList;
37 | }
38 |
39 | public static SequenceList[] getTestSeqListFromFileWithPreviousLabel(String dataFile, String templateFile, Alphabet featSet, Alphabet labelSet){
40 | ArrayList dataLines = new ArrayList();//
41 | FileUtil.readLinesBySequence(dataFile, dataLines);
42 |
43 | ArrayList templateLines = new ArrayList();//
44 | FileUtil.readLines(templateFile, templateLines);
45 |
46 | //System.out.println(dataLines.size());
47 |
48 | int[] x = new int[templateLines.size()];
49 | int[] y = new int[templateLines.size()];
50 | for(int i = 0; i < templateLines.size(); i++){
51 | String[] pat = templateLines.get(i).split(",");
52 | x[i] = new Integer(pat[0]);
53 | y[i] = new Integer(pat[1]);
54 | }
55 |
56 | SequenceList[] seqList = new SequenceList[labelSet.size()];
57 | for(int i = 0; i < seqList.length; i++ ){
58 | seqList[i] = new SequenceList(labelSet);
59 | }
60 | featSet.addSymbol("ME_BIAS");
61 |
62 | for(int i = 0; i < dataLines.size(); i++){
63 | //System.out.println(dataLines.get(i));
64 | for(int l = 0; l < labelSet.size(); l++ ){
65 | seqList[l].addSequence( str2Seq(dataLines.get(i), x, y, featSet, labelSet, l ) );
66 | }
67 | }
68 | return seqList;
69 | }
70 |
71 | public static SequenceList getSeqListFromFile(String dataFile, String templateFile, Alphabet featSet, Alphabet labelSet){
72 | ArrayList dataLines = new ArrayList();//
73 | FileUtil.readLinesBySequence(dataFile, dataLines);
74 |
75 | ArrayList templateLines = new ArrayList();//
76 | FileUtil.readLines(templateFile, templateLines);
77 |
78 | //System.out.println(dataLines.size());
79 |
80 | int[] x = new int[templateLines.size()];
81 | int[] y = new int[templateLines.size()];
82 | for(int i = 0; i < templateLines.size(); i++){
83 | String[] pat = templateLines.get(i).split(",");
84 | x[i] = new Integer(pat[0]);
85 | y[i] = new Integer(pat[1]);
86 | }
87 |
88 | SequenceList seqList = new SequenceList(labelSet);
89 | featSet.addSymbol("ME_BIAS");
90 |
91 | for(int i = 0; i < dataLines.size(); i++){
92 | //System.out.println(dataLines.get(i));
93 | seqList.addSequence( str2Seq(dataLines.get(i), x, y, featSet, labelSet, -1 ));
94 | }
95 | return seqList;
96 | }
97 | public static Sequence str2Seq(String line, int[] x, int[] y, Alphabet featSet, Alphabet labelSet, int prevLabel){
98 | //System.out.println(line);
99 | String[] str = line.split("@");
100 | String[][] item = new String[str.length][];
101 | for(int i = 0; i < item.length; i++ ){
102 | //System.out.println(str[i]);
103 | item[i] = str[i].split(" ");
104 | }
105 |
106 | /*for(int i = 0; i < item.length; i++){
107 | for(int j = 0; j < item[i].length; j++){
108 | System.out.println( item[i][j] + "\t");
109 | }
110 | System.out.println();
111 | }*/
112 |
113 | int row = item.length;
114 | int col = item[0].length;
115 |
116 | //System.out.println(row);
117 | //System.out.println(col);
118 |
119 | Sequence seq = new Sequence();
120 |
121 | for(int r = 0; r < row; r++){
122 | //int label = labelSet.addSymbol(item[r][col-1]);
123 | int label = labelSet.addSymbol(item[r][col-1].substring(0,1));
124 |
125 | ArrayList arrInd = new ArrayList();
126 | ArrayList arrValue = new ArrayList();
127 |
128 | String post = "";
129 |
130 | /**
131 | * Adding the set prev_label;
132 | */
133 | if( prevLabel >= 0 && prevLabel < labelSet.size() && r >= 1){
134 | //if( prev){
135 |
136 | //}
137 | //System.out.println("prev=_" + labelSet.getSymbol(prevLabel));
138 | //arrInd.add( featSet.addSymbol("prev=_" + labelSet.getSymbol(prevLabel)+"_"+item[r-1][1]));
139 | //arrValue.add( 1.0 );
140 | //arrInd.add( featSet.addSymbol("prev=_" + labelSet.getSymbol(prevLabel)));
141 | //arrValue.add( 1.0 );
142 | post = "_prev=_" + labelSet.getSymbol(prevLabel);
143 | }/**
144 | * Adding the previous label;
145 | */
146 | else if( prevLabel == Integer.MAX_VALUE && r >= 1 ){
147 | //arrInd.add( featSet.addSymbol("prev=_" + item[r-1][col-1]));
148 | //System.out.println("prev=_" + item[r-1][col-1]);
149 | //arrValue.add( 1.0 );
150 | post = "_prev=_" + item[r-1][col-1] ;
151 | }
152 |
153 | for(int i = 0; i < x.length; i++){
154 | int tx = x[i] + r;
155 | int ty = y[i];
156 | if( tx >= 0 && tx < row && ty >= 0 && ty < col ){
157 | String fea = item[tx][ty] + "_x["+ new Integer(x[i])+"," + new Integer(y[i]) + "]" + post;
158 | //System.out.println(fea);
159 | int fId = featSet.addSymbol(fea);
160 | double v = 1.0;
161 | arrInd.add(fId);
162 | arrValue.add(v);
163 |
164 |
165 | fea = item[tx][ty] + "_x["+ new Integer(x[i])+"," + new Integer(y[i]) + "]";
166 | arrInd.add(featSet.addSymbol(fea));
167 | arrValue.add(v);
168 |
169 | }
170 | }
171 | int[] inds = new int[arrInd.size()];
172 | MatrixOps.arrayListToArray(arrInd, inds);
173 | double[] values = new double[arrValue.size()];
174 | MatrixOps.arrayListToArray(arrValue, values);
175 |
176 | //System.out.println(inds.length + " " + values.length);
177 | Instance inst = new Instance(new SparseVector(inds, values), label);
178 | //inst.display();
179 | seq.addInstance(inst);
180 | }
181 | return seq;
182 | }
183 | }
184 |
--------------------------------------------------------------------------------
/src/edu/smu/util/StringUtil.java:
--------------------------------------------------------------------------------
1 | package edu.smu.util;
2 |
3 | import java.util.*;
4 |
/** String helpers. */
public class StringUtil {

    /**
     * Splits the given {@code String} into tokens and appends them to
     * {@code tokens}.
     *
     * Tokens are separated by the default {@link StringTokenizer} delimiters
     * (whitespace). Existing elements of {@code tokens} are kept.
     *
     * @param line   the {@code String} to be tokenized
     * @param tokens the {@code ArrayList} that receives the tokens
     */
    public static void tokenize(String line, ArrayList<? super String> tokens) {
        StringTokenizer strTok = new StringTokenizer(line);
        while (strTok.hasMoreTokens()) {
            tokens.add(strTok.nextToken());
        }
    }
}
--------------------------------------------------------------------------------
/src/tem/com/JC.java:
--------------------------------------------------------------------------------
1 | package tem.com;
2 | import java.util.ArrayList;
3 | import jargs.gnu.CmdLineParser;
4 |
5 | /* **************************** JC *************************************
6 | * This is a class for input parameters. A demo usage is as follows.
7 | * Relative Path: path in System.getProperty("user.dir")
8 | * Usage:
9 | * 1. JC.setInputOptions(Descry, directory, options, args, property, int i);
10 | * i = 1 for input options; others for specified directory;
11 | * i = 1 -> property.charAt[i] = 0 means input is not required; 1 required;
12 | * i = 0 -> property.charAt[i] = 0 means Relative Path; 1 means absolute path
13 | *
14 | * Demo:
15 | * String [] descry = {"Filelist ", "TagMap ", "File direction ", "OuptDir "};
16 | * String [] directory = {"/filelist_data.txt", "/TagMap.txt","/sentence/","/output/"};
17 | * char [] options = {'f','t','i','o'};
18 | * String property = "1111";
19 | * new JC();
20 | * JC.setInputOptions(Descry, directory, options, args, property, 1);
21 | * String fileName = JC.getARG(0);
22 | * String TagList = JC.getARG(1);
23 | * String dataDir = JC.getARG(2);
24 | * String outputFileName = JC.getARG(3);
25 | * JC.close();
26 | *
27 | * JC.setInputOptions(Descry, directory, options, args, property, 1);
28 | * Call the func: java -jar [name].jar -f filelist -t tagmap -i dir -o outputDir
29 | *
30 | * JC.setInputOptions(Descry, directory, options, args, property, 0);
31 | * Just execute the program !
32 | *
33 | * ************************************************************************/
34 |
35 | public class JC {
36 |
37 | public static CmdLineParser clp;
38 |
39 | public static String CD;
40 |
41 | public static ArrayList Argums; // 1: description 2: option (-f)
42 |
43 | public JC() {
44 | CD = System.getProperty("user.dir");
45 | clp = new CmdLineParser();
46 | Argums = new ArrayList();
47 | }
48 |
49 | public static ArrayList getArgums() {
50 | return Argums;
51 | }
52 |
53 | public static void setArgums(ArrayList argums) {
54 | Argums = argums;
55 | }
56 |
57 | public static void setSinArgums(String argums) {
58 | Argums.add(argums);
59 | }
60 |
61 | public static String getCD() {
62 | return CD;
63 | }
64 |
65 | public void setCD(String cD) {
66 | CD = cD;
67 | }
68 |
69 | public static void close() {
70 |
71 | for (int i = 0; i < Argums.size(); i += 2)
72 | System.err.println(Argums.get(i) + " is: "
73 | + Argums.get(i + 1));
74 | Argums.clear();
75 | }
76 |
77 | static void setOption(char [] options) {
78 |
79 | for(int i = 0; i < options.length; i++) {
80 | clp.addStringOption(options[i], options[i]+"");
81 | }
82 | }
83 |
84 | public static String getARG(int i) {
85 | return Argums.get(2*i+1);
86 | }
87 |
88 | private static void printHelp(String[] descrp, char[] options) {
89 | System.err.println("\nPlease run this file in the following way:");
90 | System.err.println("java -jar [name].jar -" + options[0] + " " + descrp[0]);
91 | for(int m = 1; m < descrp.length; m++) {
92 | System.err.println(" -" + options[m] + " " + descrp[m]);
93 | }
94 | }
95 |
96 | public static void setInputOptions(String[] descrp, String[] directory,
97 | char[] options, String[] args, String string, int i) {
98 | if( i == 1)
99 | setInputOptions(descrp, options, args, string);
100 | else
101 | setInputOptions(descrp, directory, string);
102 | }
103 |
104 |
105 | public static void setInputOptions(String[] descrp, char[] options,
106 | String[] args, String property) {
107 | if(descrp.length != options.length | descrp.length != property.length()) {
108 | System.err.println("\n Length of input parameters is not equal ! ");
109 | System.exit(1);
110 | } else {
111 | setParemeter(descrp, options, args, property);
112 | }
113 | }
114 |
115 | private static void setParemeter(String[] descrp, char[] options,
116 | String[] a, String property) {
117 |
118 | setOption(options);
119 | try {
120 | clp.parse(a);
121 | } catch (CmdLineParser.OptionException e) {
122 | System.err.println(e.getMessage()); e.printStackTrace();
123 | printHelp(descrp, options);
124 | System.exit(1);
125 | }
126 | for(int i = 0; i < descrp.length; i++) {
127 | CmdLineParser.Option tmp = clp.addStringOption(options[i], options[i]+"");
128 | Argums.add(descrp[i]);
129 | Argums.add((String)clp.getOptionValue(tmp));
130 | if(Integer.parseInt(property.charAt(i)+"") == 0 &&
131 | Argums.get(Argums.size()-1) == null) {
132 | System.err.print("-" + options[i] + " option is missing !");
133 | printHelp(descrp, options);
134 | System.exit(1);
135 | }
136 | }
137 | }
138 |
139 | public static void setInputOptions(String[] descrp, String[] directory,
140 | String property) {
141 | if(descrp.length != directory.length | descrp.length != property.length()) {
142 | System.err.println("\n Length of input parameters is not equal ! ");
143 | System.exit(1);
144 | } else {
145 | for(int i = 0; i < descrp.length; i++) {
146 | setParemeter(descrp[i], directory[i],
147 | Integer.parseInt(property.charAt(i)+""));
148 | }
149 | }
150 | }
151 |
152 | public static void setParemeter(String p1, String p2, int i) {
153 |
154 | // i = 1 means p2 is absolute path, others relative path
155 | if (i == 1) {
156 | setSinArgums(p1);
157 | setSinArgums(p2);
158 | } else {
159 | setSinArgums(p1);
160 | setSinArgums(getCD() + p2);
161 | }
162 | }
163 |
164 | }
165 |
--------------------------------------------------------------------------------
/src/tem/com/MathUtil.java:
--------------------------------------------------------------------------------
1 | package tem.com;
2 | /**
3 | * Math Util for Gaussian distribution
4 | *
5 | * @author Minghui
6 | */
7 |
/**
 * Gaussian (normal) distribution helpers: density, cumulative distribution
 * and its inverse.
 */
public class MathUtil {

    /** Beyond this many standard deviations Phi returns exactly 0 or 1. */
    private static final double TAIL_CUTOFF = 8.0;

    /** Interval width at which the bisection in PhiInverse stops. */
    private static final double INVERSE_TOLERANCE = .00000001;

    /**
     * Standard Gaussian pdf.
     *
     * @param x evaluation point
     * @return exp(-x^2 / 2) / sqrt(2 * pi)
     */
    public static double phi(double x) {
        return Math.exp(-x * x / 2) / Math.sqrt(2 * Math.PI);
    }

    /**
     * Gaussian pdf with mean mu and standard deviation sigma.
     *
     * @param x     evaluation point
     * @param mu    mean
     * @param sigma standard deviation; not validated
     * @return the density at x
     */
    public static double phi(double x, double mu, double sigma) {
        return phi((x - mu) / sigma) / sigma;
    }

    /**
     * Standard Gaussian cdf, computed with a Taylor series.
     *
     * @param z evaluation point
     * @return 0 for z below -8, 1 for z above 8, NaN for NaN, otherwise the
     *         series value
     */
    public static double Phi(double z) {
        // NaN never satisfies the series' stop condition, so it must be
        // rejected up front or the loop below runs forever.
        if (Double.isNaN(z)) {
            return Double.NaN;
        }
        if (z < -TAIL_CUTOFF) {
            return 0.0;
        }
        if (z > TAIL_CUTOFF) {
            return 1.0;
        }
        double sum = 0.0;
        double term = z;
        // Stop once the next term no longer changes the partial sum.
        for (int i = 3; sum + term != sum; i += 2) {
            sum = sum + term;
            term = term * z * z / i;
        }
        return 0.5 + sum * phi(z);
    }

    /**
     * Gaussian cdf with mean mu and standard deviation sigma.
     *
     * @param z     evaluation point
     * @param mu    mean
     * @param sigma standard deviation; not validated
     * @return the cdf at z
     */
    public static double Phi(double z, double mu, double sigma) {
        return Phi((z - mu) / sigma);
    }

    /**
     * Computes z such that Phi(z) = y by bisection on [-8, 8].
     *
     * @param y target probability
     * @return the located z, or NaN when y is NaN
     */
    public static double PhiInverse(double y) {
        if (Double.isNaN(y)) {
            return Double.NaN;
        }
        return PhiInverse(y, INVERSE_TOLERANCE, -TAIL_CUTOFF, TAIL_CUTOFF);
    }

    /** Bisection search for Phi(z) = y on [lo, hi], stopping when the interval is narrower than delta. */
    private static double PhiInverse(double y, double delta, double lo, double hi) {
        while (true) {
            double mid = lo + (hi - lo) / 2;
            if (hi - lo < delta) {
                return mid;
            }
            if (Phi(mid) > y) {
                hi = mid;
            } else {
                lo = mid;
            }
        }
    }

    /**
     * Test client: prints Phi(z, mu, sigma) and then PhiInverse(Phi(z)).
     *
     * @param args z, mu and sigma as decimal numbers
     */
    public static void main(String[] args) {
        if (args.length < 3) {
            System.err.println("usage: MathUtil <z> <mu> <sigma>");
            return;
        }
        double z = Double.parseDouble(args[0]);
        double mu = Double.parseDouble(args[1]);
        double sigma = Double.parseDouble(args[2]);
        System.out.println(Phi(z, mu, sigma));
        double y = Phi(z);
        System.out.println(PhiInverse(y));
    }

}
--------------------------------------------------------------------------------
/src/tem/com/MatrixUtil.java:
--------------------------------------------------------------------------------
1 | package tem.com;
2 |
3 | import java.util.*;
4 | import java.util.regex.Matcher;
5 | import java.util.regex.Pattern;
6 | import java.io.*;
7 |
/**
 * Small array and matrix helpers: printing, sums, extrema, random sampling,
 * distances and divergences.
 */
public class MatrixUtil {

    /**
     * Builds a fixed demo ragged array, prints each element on its own line
     * and returns it.
     *
     * @return {{1, 2, 3}, {4, 5}, {2}}
     */
    public static int[][] getArray() {
        int[][] num = { { 1, 2, 3 }, { 4, 5 }, { 2 } };
        for (int[] row : num) {
            for (int value : row) {
                System.out.println(value);
            }
        }
        return num;
    }

    /** Prints the matrix, one row per line, each value followed by a tab. */
    public static void printArray(int[][] num) {
        for (int[] row : num) {
            for (int value : row) {
                System.out.print(value + "\t");
            }
            System.out.println();
        }
    }

    /** Prints the matrix, one row per line, each value followed by a tab. */
    public static void printArray(short[][] num) {
        for (short[] row : num) {
            for (short value : row) {
                System.out.print(value + "\t");
            }
            System.out.println();
        }
    }

    /** Prints the values on one line, each followed by a tab. */
    public static void printArray(int[] num) {
        for (int value : num) {
            System.out.print(value + "\t");
        }
        System.out.println();
    }

    /** Prints the values on one line, each followed by a tab. */
    public static void printArray(long[] num) {
        for (long value : num) {
            System.out.print(value + "\t");
        }
        System.out.println();
    }

    /** Prints the values on one line, each followed by a tab. */
    public static void printArray(double[] num) {
        for (double value : num) {
            System.out.print(value + "\t");
        }
        System.out.println();
    }

    /** Prints the matrix, one row per line, true as "1" and false as "0", each followed by a tab. */
    public static void printArray(boolean[][] bs) {
        for (boolean[] row : bs) {
            for (boolean flag : row) {
                System.out.print(flag ? "1\t" : "0\t");
            }
            System.out.println();
        }
    }

    /**
     * Sums column u over all rows.
     *
     * @param data matrix, possibly ragged
     * @param u    column index
     * @return the sum of data[m][u] over every row m that has a column u
     */
    public static double sumCol(float[][] data, int u) {
        double a = 0.0D;
        // Iterate over rows (data.length); rows too short to have column u
        // contribute nothing.
        for (int m = 0; m < data.length; m++) {
            if (u < data[m].length) {
                a += data[m][u];
            }
        }
        return a;
    }

    /**
     * Sums row u.
     *
     * @param matrix matrix, possibly ragged
     * @param u      row index
     * @return the sum of the elements of row u
     */
    public static double sumRow(int[][] matrix, int u) {
        double a = 0.0D;
        for (int value : matrix[u]) {
            a += value;
        }
        return a;
    }

    /** Returns the sum of all elements. */
    public static double sum(double[] a2) {
        double a = 0.0;
        for (double value : a2) {
            a += value;
        }
        return a;
    }

    /** Returns the sum of all elements as a double. */
    public static double sum(int[] a2) {
        double a = 0;
        for (int value : a2) {
            a += value;
        }
        return a;
    }

    /** Returns the largest element; the array must not be empty. */
    public static int max(int[] flag) {
        int max = flag[0];
        for (int i = 1; i < flag.length; i++) {
            if (flag[i] > max) {
                max = flag[i];
            }
        }
        return max;
    }

    /** Returns the largest element; the array must not be empty. */
    public static double max(double[] flag) {
        double max = flag[0];
        for (int i = 1; i < flag.length; i++) {
            if (flag[i] > max) {
                max = flag[i];
            }
        }
        return max;
    }

    /** Returns the smallest element; the array must not be empty. */
    public static double min(double[] flag) {
        double min = flag[0];
        for (int i = 1; i < flag.length; i++) {
            if (flag[i] < min) {
                min = flag[i];
            }
        }
        return min;
    }

    /**
     * Samples m distinct values from 0 .. vector_n - 1 in random order.
     *
     * @param set      receives the sample in set[0 .. m - 1]
     * @param vector_n size of the range to sample from
     * @param m        number of values to draw; must not exceed vector_n or
     *                 set.length
     */
    public static void randperm(int[] set, int vector_n, int m) {
        List<Integer> list = new ArrayList<Integer>(vector_n);
        for (int i = 0; i < vector_n; i++) {
            list.add(i);
        }
        Collections.shuffle(list);

        for (int i = 0; i < m; i++) {
            set[i] = list.get(i);
        }
    }

    /**
     * Distance between two vectors.
     *
     * @param ds       first vector
     * @param ds2      second vector, at least as long as ds
     * @param distDesp metric name; only "Euclidean" is supported
     * @return the distance
     * @throws IllegalArgumentException for any other metric name
     */
    public static double dist(double[] ds, double[] ds2, String distDesp) {
        requireEuclidean(distDesp);
        double squared = 0d;
        for (int i = 0; i < ds.length; i++) {
            double diff = ds[i] - ds2[i];
            squared += diff * diff;
        }
        return Math.sqrt(squared);
    }

    /**
     * Distance between two scalars.
     *
     * @param ds       first value
     * @param ds2      second value
     * @param distDesp metric name; only "Euclidean" is supported
     * @return the absolute difference
     * @throws IllegalArgumentException for any other metric name
     */
    public static double dist(double ds, double ds2, String distDesp) {
        requireEuclidean(distDesp);
        return Math.abs(ds - ds2);
    }

    /** Rejects unsupported metric names with a descriptive exception. */
    private static void requireEuclidean(String distDesp) {
        if (!distDesp.equals("Euclidean")) {
            throw new IllegalArgumentException("unsupported distance: " + distDesp);
        }
    }

    /**
     * Kullback-Leibler divergence KL(fs || fs2).
     *
     * Positions where either value is not strictly positive are skipped.
     *
     * @param fs  first distribution
     * @param fs2 second distribution, at least as long as fs
     * @return the sum of fs[v] * ln(fs[v] / fs2[v]) over the kept positions
     */
    public static double KL(double[] fs, double[] fs2) {
        double klScore = 0.0;
        for (int v = 0; v < fs.length; v++) {
            double p = fs[v];
            double q = fs2[v];
            if (p > 0 && q > 0) {
                klScore += p * Math.log(p / q);
            }
        }
        return klScore;
    }

    /**
     * Jensen-Shannon divergence: the mean of KL(fs || avg) and
     * KL(fs2 || avg), where avg is the element-wise average.
     *
     * @param fs  first distribution
     * @param fs2 second distribution, at least as long as fs
     * @return the divergence
     */
    public static double JS(double[] fs, double[] fs2) {
        double[] avg = new double[fs.length];
        for (int v = 0; v < fs.length; v++) {
            avg[v] = (fs[v] + fs2[v]) / 2;
        }
        return (KL(fs, avg) + KL(fs2, avg)) / 2;
    }

    /**
     * Divides every element by the element sum, in place.
     *
     * A vector whose sum is zero is left unchanged instead of being turned
     * into NaN values.
     *
     * @param thetaD vector to normalise
     */
    public static void norm1(double[] thetaD) {
        double sum = sum(thetaD);
        if (sum == 0) {
            return;
        }
        for (int i = 0; i < thetaD.length; i++) {
            thetaD[i] = thetaD[i] / sum;
        }
    }
}
193 |
--------------------------------------------------------------------------------
/src/tem/com/POStags.java:
--------------------------------------------------------------------------------
1 | package tem.com;
2 |
3 | import java.util.ArrayList;
4 | import java.util.HashMap;
5 |
6 |
7 | public class POStags {
8 |
9 | public HashMap gTagMap;
10 |
11 | public POStags() {
12 | String map = "CD ADJ" + "\t" +
13 | "JJ ADJ" + "\t" +
14 | "JJR ADJ" + "\t" +
15 | "JJS ADJ" + "\t" +
16 | "VB V" + "\t" +
17 | "VBD V" + "\t" +
18 | "VBG V" + "\t" +
19 | "VBN V" + "\t" +
20 | "VBP V" + "\t" +
21 | "VBZ V" + "\t" +
22 | "MD V" + "\t" +
23 | "NN N" + "\t" +
24 | "NNS N" + "\t" +
25 | "NNP N" + "\t" +
26 | "NNPS N" + "\t" +
27 | "RB ADV" + "\t" +
28 | "RBR ADV" + "\t" +
29 | "RBS ADV" + "\t" +
30 | "RP ADV" + "\t" +
31 | "WRB ADV" + "\t" +
32 | "DT DET" + "\t" +
33 | "PDT DET" + "\t" +
34 | "WDT DET" + "\t" +
35 | "POS DET" + "\t" +
36 | "PRP PRP" + "\t" +
37 | "WP PRP" + "\t" +
38 | "PRP$ PRP$" + "\t" +
39 | "WP$ PRP$" + "\t" +
40 | "TO PREP" + "\t" +
41 | "IN PREP" + "\t" +
42 | "CC CONJ" + "\t" +
43 | "EX OTHER" + "\t" +
44 | "FW OTHER" + "\t" +
45 | "SYM OTHER" + "\t" +
46 | "UH OTHER" + "\t" +
47 | "LS OTHER" + "\t";
48 |
49 | gTagMap = new HashMap();
50 | String [] maps = map.split("\t");
51 | ArrayList tokens = new ArrayList ();
52 | for(int i = 0; i < maps.length; i++) {
53 | tokens.clear();
54 | FileUtil.tokenize(maps[i], tokens);
55 | if(tokens.size() != 2) {
56 | System.err.println(maps[i]);
57 | } else {
58 | gTagMap.put(tokens.get(0).toLowerCase().trim(),
59 | tokens.get(1).toLowerCase().trim());
60 | }
61 | }
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/tem/com/Sorting.java:
--------------------------------------------------------------------------------
1 | package tem.com;
2 | import java.util.Comparator;
3 |
4 |
5 | public class Sorting implements Comparator {
6 | public int compare(wordFreq o1, wordFreq o2) {
7 | return Long.valueOf(o2.getNo()).compareTo(Long.valueOf((o1.getNo())));
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/src/tem/com/ValueComparator.java:
--------------------------------------------------------------------------------
1 | package tem.com;
2 |
3 | import java.util.Comparator;
4 | import java.util.Map;
5 |
/**
 * Orders map keys by their associated value, largest value first.
 *
 * Keys with equal values are ordered by natural key order. The comparator
 * therefore returns 0 only for equal keys, so it honours the
 * {@link Comparator} contract and a {@code TreeMap} built with it neither
 * merges distinct keys nor fails lookups.
 */
public class ValueComparator implements Comparator<String> {

    /** Backing map that supplies the value of each key; every compared key must be present. */
    Map<String, ? extends Number> baseMap;

    /**
     * @param base map from key to the numeric value used for ordering
     */
    public ValueComparator(Map<String, ? extends Number> base) {
        this.baseMap = base;
    }

    /**
     * @throws NullPointerException if either key has no value in the backing map
     */
    @Override
    public int compare(String o1, String o2) {
        // Arguments swapped on purpose: larger values sort first.
        int byValue = Double.compare(baseMap.get(o2).doubleValue(),
                baseMap.get(o1).doubleValue());
        if (byValue != 0) {
            return byValue;
        }
        return o1.compareTo(o2);
    }

}
24 |
--------------------------------------------------------------------------------
/src/tem/com/wordFreq.java:
--------------------------------------------------------------------------------
1 | package tem.com;
2 |
3 | import java.util.ArrayList;
4 |
5 |
/** Mutable record of a word with a count and a probability. */
public class wordFreq {

    /** The word itself. */
    public String word;

    /** Integer count stored for the word. */
    public int No;

    /** Probability stored for the word. */
    public double prob;

    // ---- word ----

    public void setWord(String word) {
        this.word = word;
    }

    public String getWord() {
        return this.word;
    }

    // ---- No ----

    public void setNo(int no) {
        this.No = no;
    }

    public int getNo() {
        return this.No;
    }

    // ---- prob ----

    public void setProb(double prob) {
        this.prob = prob;
    }

    public double getProb() {
        return this.prob;
    }

}
39 |
--------------------------------------------------------------------------------
/src/tem/conf/ConstantConfig.java:
--------------------------------------------------------------------------------
1 | package tem.conf;
2 |
/** Project-wide constant settings. */
public class ConstantConfig {

    /** Relative path of the model parameter file. */
    public static String LDAPARAMETERFILE =
            "data/modelParams/temParams.txt";

}
8 |
--------------------------------------------------------------------------------
/src/tem/conf/PathConfig.java:
--------------------------------------------------------------------------------
1 | package tem.conf;
2 |
/** Relative data and result locations, plus the post-count setting kept alongside them. */
public class PathConfig {

    // ---- input data ----

    /** Directory of the original data set. */
    public static String originalDataPath = "data/originalData/ThreeM09/";

    /** Directory of the test data set. */
    public static String testDataPath = "data/originalData/TestData/";

    /** Directory of script data. */
    public static String scriptDataPath = "data/scriptData/ThreeM09/";

    // ---- model parameters and results ----

    /** Directory of model parameter files. */
    public static String modelParamsPath = "data/modelParams/";

    /** Directory of model results. */
    public static String modelResPath = "data/modelRes/ThreeM09/";

    /** Directory of UQA results. */
    public static String UQAPath = "data/modelRes/ThreeM09/UQA/";

    // ---- data selection ----

    /** Post-count setting, stored as text. */
    public static String minPostNum = "50";

}
20 |
--------------------------------------------------------------------------------
/src/tem/linkas/ID.java:
--------------------------------------------------------------------------------
1 | package tem.linkas;
2 |
3 | /**InDegree Algorithm for expert finding (CIKM 12 & KDD08)
4 | * @author yangliu
5 | * @blog http://blog.csdn.net/yangliuy
6 | * @mail yangliuyx@gmail.com
7 | */
8 |
9 |
10 | import java.io.IOException;
11 | import java.io.PrintWriter;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 |
15 |
16 | import Jama.Matrix;
17 |
18 | import tem.com.FileUtil;
19 | import tem.com.MatrixUtil;
20 | import tem.conf.PathConfig;
21 | import tem.main.TEMModel;
22 |
23 | public class ID {
24 |
25 | private static int NODENUM; // Node number
26 | private static Matrix U; // Matrix with all 1
27 | private static Matrix graphAdjM; //Adjancy matrix of graph
28 | private static Matrix transM;//Transition probability matrix
29 |
30 | public static void main(String[] args) throws IOException, ClassNotFoundException {
31 | String minPostNum = "80";
32 | String modelName = "ID";
33 | int K = 15;
34 | //Meature user interests and expertise by in degree of user node
35 | //i.e. The total number of answers the user provides or total number of votes the user gets
36 | //Compute the sum of each column
37 | String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userAnswerNumWeighted.QAgraph";
38 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userVoteWeighted.QAgraph";
39 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/test.QAgraph";
40 | //String TransMFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".transM";
41 | String finalPRALLFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".finalPRAll";
42 | ArrayList finalPRallLines = new ArrayList();
43 |
44 | PrintWriter pw;
45 | readQAGraph(graphDataFile);
46 | NODENUM = graphAdjM.getRowDimension();
47 | System.out.println("NODENUM : " + NODENUM);
48 |
49 | double[] nodeScoreArray = new double[NODENUM];
50 | for(int i = 0; i < NODENUM; i++){
51 | for(int j = 0; j < NODENUM; j++){
52 | nodeScoreArray[i] += graphAdjM.get(j, i);
53 | }
54 | }
55 | MatrixUtil.norm1(nodeScoreArray);
56 |
57 | for(int z = 0; z < K; z++){
58 | System.out.println("now topic = " + z);
59 |
60 | Matrix pageRank = new Matrix(nodeScoreArray, 1);
61 |
62 | System.out.println("Final PageRank is :");
63 | pageRank.print(4, 4);
64 |
65 | String PRLine = "";
66 | for(int i = 0; i < NODENUM; i++){
67 | PRLine += pageRank.get(0, i) + "\t";
68 | }
69 | finalPRallLines.add(PRLine);
70 | }
71 | FileUtil.writeLines(finalPRALLFile, finalPRallLines);
72 | }
73 |
74 | private static void readQAGraph(
75 | String graphDataFile) {
76 | // TODO Auto-generated method stub
77 | ArrayList graphLines = new ArrayList();
78 | FileUtil.readLines(graphDataFile, graphLines);
79 | double[][] graphMatrix = new double[graphLines.size()][];
80 | double minNumber = 1000;
81 | for(int i = 0; i < graphLines.size(); i++){
82 | String[] glineTokens = graphLines.get(i).split("\t");
83 | graphMatrix[i] = new double[glineTokens.length];
84 | for(int j = 0; j < glineTokens.length; j++){
85 | double d = Double.valueOf(glineTokens[j]);
86 | if(d < 0){
87 | graphMatrix[i][j] = 0;
88 | } else {
89 | graphMatrix[i][j] = d;
90 | }
91 | if(minNumber > graphMatrix[i][j]) {
92 | minNumber = graphMatrix[i][j];
93 | }
94 | }
95 | }
96 | //If there is negative number is matrix, find the min one x. Then all number add |x|
97 | System.out.println("minNumber " + minNumber);
98 | /*if (minNumber < 0){
99 | for(int i = 0; i < graphMatrix.length; i++){
100 | for(int j = 0; j < graphMatrix[i].length; j++){
101 | graphMatrix[i][j] += (0 - minNumber);
102 | }
103 | }
104 | }*/
105 | graphAdjM = new Matrix(graphMatrix);
106 | }
107 |
108 | public static void printMatrix(List> m) {
109 | for (int i = 0; i < m.size(); i++) {
110 | for (int j = 0; j < m.get(i).size(); j++) {
111 | System.out.print(m.get(i).get(j) + "\t");
112 | }
113 | System.out.println();
114 | }
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/src/tem/linkas/PR.java:
--------------------------------------------------------------------------------
1 | package tem.linkas;
2 |
3 | /**Standard PageRank Algorithm (CIKM 12 PR for Expert finding)
4 | * @author yangliu
5 | * @blog http://blog.csdn.net/yangliuy
6 | * @mail yangliuyx@gmail.com
7 | */
8 |
9 | import java.io.FileInputStream;
10 | import java.io.FileWriter;
11 | import java.io.IOException;
12 | import java.io.ObjectInputStream;
13 | import java.io.PrintWriter;
14 | import java.util.ArrayList;
15 | import java.util.List;
16 | import java.util.Random;
17 |
18 | import Jama.Matrix;
19 |
20 | import tem.com.FileUtil;
21 | import tem.conf.PathConfig;
22 | import tem.main.TEMModel;
23 |
24 | public class PR {
25 | private static double LAMBDA = 0.2;
26 | private static double THRESHOLD = 0.0000001;
27 |
28 | private static int NODENUM; // Node number
29 | private static Matrix U; // Matrix with all 1
30 | private static Matrix graphAdjM; //Adjancy matrix of graph
31 | private static Matrix transM;//Transition probability matrix
32 |
33 | public static void main(String[] args) throws IOException, ClassNotFoundException {
34 | String minPostNum = "80";
35 | String modelName = "PR";
36 | int K = 15;
37 | String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userAnswerNumWeighted.QAgraph";
38 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userVoteWeighted.QAgraph";
39 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/test.QAgraph";
40 | //String TransMFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".transM";
41 | String finalPRALLFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".finalPRAll";
42 | ArrayList finalPRallLines = new ArrayList();
43 |
44 | PrintWriter pw;
45 | readQAGraph(graphDataFile);
46 | NODENUM = graphAdjM.getRowDimension();
47 | System.out.println("NODENUM : " + NODENUM);
48 | //1. Init PR state vector and Matrix U
49 | //Both randomly initialise or set all 1 are OK
50 | Matrix PR0 = initPRStateVector();
51 | initU();
52 | System.out.println("Initial state vector PR0 is:");
53 | PR0.print(4, 4);
54 |
55 |
56 | for(int z = 0; z < K; z++){
57 | System.out.println("now topic = " + z);
58 | //String newPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".newPR";
59 | //String finalPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".finalPR";
60 |
61 | //2. Compute transition probability matrix
62 | computeTransM(z);
63 | //pw = new PrintWriter(new FileWriter(TransMFile));
64 | //transM.print(pw, 4, 4);
65 |
66 | //3. Compute newPR update matrix
67 | Matrix newPR = computeNewPR(z);
68 | //Normalized newPR matrix
69 | normal(newPR);
70 | //pw = new PrintWriter(new FileWriter(newPRFile));
71 | //newPR.print(pw, 4, 4);
72 |
73 | //4. Iteratively update PR state vector
74 | Matrix pageRank = calPageRank(PR0, newPR);
75 |
76 | //5. Print final PageRank score
77 | System.out.println("Final PageRank is :");
78 | pageRank.print(4, 4);
79 |
80 | String PRLine = "";
81 | for(int i = 0; i < NODENUM; i++){
82 | PRLine += pageRank.get(0, i) + "\t";
83 | }
84 | finalPRallLines.add(PRLine);
85 | //saveFinalPR(finalPRFile, pageRank);
86 | }
87 | FileUtil.writeLines(finalPRALLFile, finalPRallLines);
88 | }
89 |
90 | private static void normal(Matrix newPR) {
91 | // TODO Auto-generated method stub
92 | for(int i = 0; i < NODENUM; i++){
93 | double sum = 0;
94 | for(int j = 0; j < NODENUM; j++){
95 | sum += newPR.get(i, j);
96 | }
97 | if(sum != 0){
98 | for(int j = 0; j < NODENUM; j++){
99 | newPR.set(i, j, newPR.get(i, j) / sum);
100 | }
101 | }
102 | }
103 | }
104 |
105 | private static void saveFinalPR(String finalPRFile, Matrix pageRank) {
106 | // TODO Auto-generated method stub
107 | ArrayList lines = new ArrayList();
108 | for(int i = 0; i < pageRank.getRowDimension(); i++){
109 | String line = "";
110 | for(int j = 0; j < pageRank.getColumnDimension(); j++){
111 | line += pageRank.get(i, j) + "\t";
112 | }
113 | lines.add(line);
114 | }
115 | FileUtil.writeLines(finalPRFile, lines);
116 | }
117 |
118 | //Matrix with all 1
119 | private static void initU() {
120 | // TODO Auto-generated method stub
121 | double[][] u = new double[NODENUM][NODENUM];
122 | for(int i = 0; i < NODENUM; i++){
123 | for(int j = 0; j < NODENUM; j++){
124 | u[i][j] = 1;
125 | }
126 | }
127 | U = new Matrix(u);
128 | }
129 |
130 | //Compute transition matrix
131 | private static void computeTransM(int z) {
132 | // TODO Auto-generated method stub
133 | double[][] transm = new double[NODENUM][NODENUM];
134 | for(int i = 0; i < NODENUM; i++){
135 | double rowSum = 0;
136 | for(int j = 0; j < NODENUM; j++){
137 | rowSum += graphAdjM.get(i, j);
138 | }
139 | if(rowSum == 0){
140 | for(int j = 0; j < NODENUM; j++){
141 | transm [i][j] = 0;
142 | }
143 | } else {
144 | for(int j = 0; j < NODENUM; j++){
145 | double norWeight = graphAdjM.get(i, j) /
146 | rowSum;
147 | transm [i][j] = norWeight;
148 | }
149 | }
150 | }
151 | transM = new Matrix(transm);
152 | }
153 |
154 | private static double sim(float f, float g) {
155 | // TODO Auto-generated method stub
156 | return 1 - Math.abs(f - g);
157 | }
158 |
159 | //compute pagerank
160 | public static Matrix calPageRank(Matrix PR0, Matrix newPR) {
161 | Matrix PR;
162 | while (true) {
163 | PR = PR0.times(newPR);
164 | double dis = calDistance(PR, PR0);//PR0 store PR vector after last iteration
165 | System.out.println("distance:" + dis);
166 | if (dis <= THRESHOLD) {
167 | System.out.println("PR:");
168 | PR.print(4, 4);
169 | break;
170 | }
171 | PR0 = PR;
172 | }
173 | return PR;
174 | }
175 |
176 | private static Matrix initPRStateVector() {
177 | // TODO Auto-generated method stub
178 | double[] pr0M = new double[NODENUM];
179 | for(int i = 0; i < NODENUM; i++){
180 | pr0M[i] = 1;
181 | }
182 | return new Matrix(pr0M, 1);
183 | }
184 |
185 | private static void readQAGraph(
186 | String graphDataFile) {
187 | // TODO Auto-generated method stub
188 | ArrayList graphLines = new ArrayList();
189 | FileUtil.readLines(graphDataFile, graphLines);
190 | double[][] graphMatrix = new double[graphLines.size()][];
191 | double minNumber = 1000;
192 | for(int i = 0; i < graphLines.size(); i++){
193 | String[] glineTokens = graphLines.get(i).split("\t");
194 | graphMatrix[i] = new double[glineTokens.length];
195 | for(int j = 0; j < glineTokens.length; j++){
196 | double d = Double.valueOf(glineTokens[j]);
197 | if(d < 0){
198 | graphMatrix[i][j] = 0;
199 | } else {
200 | graphMatrix[i][j] = d;
201 | }
202 | if(minNumber > graphMatrix[i][j]) {
203 | minNumber = graphMatrix[i][j];
204 | }
205 | }
206 | }
207 | //If there is negative number is matrix, find the min one x. Then all number add |x|
208 | System.out.println("minNumber " + minNumber);
209 | /*if (minNumber < 0){
210 | for(int i = 0; i < graphMatrix.length; i++){
211 | for(int j = 0; j < graphMatrix[i].length; j++){
212 | graphMatrix[i][j] += (0 - minNumber);
213 | }
214 | }
215 | }*/
216 | graphAdjM = new Matrix(graphMatrix);
217 | }
218 |
219 | public static void printMatrix(List> m) {
220 | for (int i = 0; i < m.size(); i++) {
221 | for (int j = 0; j < m.get(i).size(); j++) {
222 | System.out.print(m.get(i).get(j) + "\t");
223 | }
224 | System.out.println();
225 | }
226 | }
227 |
228 | public static void printVec(List v) {
229 | for (int i = 0; i < v.size(); i++) {
230 | System.out.print(v.get(i) + "\t");
231 | }
232 | System.out.println();
233 | }
234 |
235 | /**
236 | * Randomly Initialise state vector PR0
237 | *
238 | * @param n
239 | * dimension of vector PR0
240 | * @return A random vector, each dimension is 0-5
241 | */
242 | public static List randomInitPR0(int n) {
243 | Random random = new Random();
244 | List q = new ArrayList();
245 | for (int i = 0; i < n; i++) {
246 | q.add(new Double(5 * random.nextDouble()));
247 | }
248 | return q;
249 | }
250 |
251 | /**
252 | * Compute Euclidean Distance
253 | *
254 | * @param q1
255 | *
256 | * @param q2
257 | *
258 | * @return distance
259 | */
260 | public static double calDistance(Matrix q1, Matrix q2) {
261 | double sum = 0;
262 |
263 | if (q1.getColumnDimension() != q2.getColumnDimension() ) {
264 | return -1;
265 | }
266 |
267 | for (int i = 0; i < q1.getColumnDimension() ; i++) {
268 | sum += Math.pow(q1.get(0, i) - q2.get(0, i),
269 | 2);
270 | }
271 | return Math.sqrt(sum);
272 | }
273 |
274 | /**
275 | * compute NEWPR matrix
276 | *
277 | * @return NEWPR matrix
278 | */
279 | public static Matrix computeNewPR(int z) {
280 | Matrix add1 = transM.times(LAMBDA);
281 | //In new PR matrix, the larger values are in the c-th column, the larger score node c tends to get.
282 | /*double [][] newU = new double[NODENUM][NODENUM];
283 | for(int i = 0; i < NODENUM; i++){
284 | double userPreference = model.theta[i][z];
285 | double tspr = userPreference;
286 | for(int k = 0; k < NODENUM; k++){
287 | newU[k][i] = tspr;
288 | }
289 | }
290 | U = new Matrix(newU);*/
291 |
292 | Matrix add2 = U.times( (1 - LAMBDA) / NODENUM);
293 | Matrix newPR = add1.plus(add2);
294 | return newPR;
295 | }
296 | }
297 |
--------------------------------------------------------------------------------
/src/tem/linkas/TEPR.java:
--------------------------------------------------------------------------------
1 | package tem.linkas;
2 |
3 | /**Topic Expertise PageRank Algorithm
4 | * @author yangliu
5 | * @blog http://blog.csdn.net/yangliuy
6 | * @mail yangliuyx@gmail.com
7 | */
8 |
9 | import java.io.FileInputStream;
10 | import java.io.FileWriter;
11 | import java.io.IOException;
12 | import java.io.ObjectInputStream;
13 | import java.io.PrintWriter;
14 | import java.util.ArrayList;
15 | import java.util.List;
16 | import java.util.Random;
17 |
18 | import Jama.Matrix;
19 |
20 | import tem.com.FileUtil;
21 | import tem.conf.PathConfig;
22 | import tem.main.TEMModel;
23 |
24 | public class TEPR {
25 | private static double LAMBDA = 0.2;
26 | private static double THRESHOLD = 0.0000001;
27 |
28 | private static int NODENUM; // Node number
29 | private static Matrix U; // Matrix with all 1
30 | private static Matrix graphAdjM; //Adjancy matrix of graph
31 | private static Matrix transM;//Transition probability matrix
32 |
33 | public static void main(String[] args) throws IOException, ClassNotFoundException {
34 | String minPostNum = "80";
35 | String modelName = "TEPR";
36 | String userIDFile = PathConfig.originalDataPath + "USER" + minPostNum + "/user.IDs";
37 | String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userAnswerNumWeighted.QAgraph";
38 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userVoteWeighted.QAgraph";
39 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/test.QAgraph";
40 | //String TransMFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".transM";
41 |
42 | ArrayList finalPRallLines = new ArrayList();
43 | TEMModel model = new TEMModel();
44 | //String[] ENums = {"11", "13", "14"};
45 | String[] TNums = {"10", "12", "14", "16", "18", "20", "22", "24", "26", "28", "30"};
46 | readQAGraph(graphDataFile);
47 |
48 | NODENUM = graphAdjM.getRowDimension();
49 | System.out.println("NODENUM : " + NODENUM);
50 | for(String T : TNums){
51 | //for(String E : ENums){
52 | // load model
53 | //String modelFile = PathConfig.modelOutPath + "/USER80/Model_E" + E + "_T15.model";
54 | String modelFile = PathConfig.modelResPath + "/USER" + PathConfig.minPostNum + "/Model_E10_T" + T + ".model";
55 | System.out.println("reading a class from : " + modelFile);
56 | FileInputStream fis = new FileInputStream(modelFile);
57 | ObjectInputStream ois = new ObjectInputStream(fis);
58 | model = (TEMModel) ois.readObject();
59 | ois.close();
60 | System.out.println("TopicNum" + model.K);
61 | System.out.println("ENum" + model.ENum);
62 |
63 | String finalPRALLFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".E" + model.ENum + "T" + model.K + "finalPRAll";
64 | finalPRallLines.clear();
65 |
66 | PrintWriter pw;
67 |
68 | //1. Init PR state vector and Matrix U
69 | //Both randomly initialise or set all 1 are OK
70 | Matrix PR0 = initPRStateVector();
71 | initU();
72 | System.out.println("Initial state vector PR0 is:");
73 | PR0.print(4, 4);
74 |
75 | for(int z = 0; z < model.K; z++){
76 | System.out.println("now topic = " + z);
77 | //String newPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".newPR";
78 | //String finalPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".finalPR";
79 |
80 | //2. Compute transition probability matrix
81 | computeTransM(model, z);
82 | //pw = new PrintWriter(new FileWriter(TransMFile));
83 | //transM.print(pw, 4, 4);
84 |
85 | //3. Compute newPR update matrix
86 | Matrix newPR = computeNewPR(model, z);
87 | //Normalized newPR matrix
88 | normal(newPR);
89 | //pw = new PrintWriter(new FileWriter(newPRFile));
90 | //newPR.print(pw, 4, 4);
91 |
92 | //4. Iteratively update PR state vector
93 | Matrix pageRank = calPageRank(PR0, newPR);
94 |
95 | //5. Print final PageRank score
96 | System.out.println("Final PageRank is :");
97 | pageRank.print(4, 4);
98 |
99 | String PRLine = "";
100 | for(int i = 0; i < NODENUM; i++){
101 | PRLine += pageRank.get(0, i) + "\t";
102 | }
103 | finalPRallLines.add(PRLine);
104 | //saveFinalPR(finalPRFile, pageRank);
105 | }
106 | FileUtil.writeLines(finalPRALLFile, finalPRallLines);
107 | }
108 |
109 | }
110 |
111 | private static void normal(Matrix newPR) {
112 | // TODO Auto-generated method stub
113 | for(int i = 0; i < NODENUM; i++){
114 | double sum = 0;
115 | for(int j = 0; j < NODENUM; j++){
116 | sum += newPR.get(i, j);
117 | }
118 | if(sum != 0){
119 | for(int j = 0; j < NODENUM; j++){
120 | newPR.set(i, j, newPR.get(i, j) / sum);
121 | }
122 | }
123 | }
124 | }
125 |
126 | private static void saveFinalPR(String finalPRFile, Matrix pageRank) {
127 | // TODO Auto-generated method stub
128 | ArrayList lines = new ArrayList();
129 | for(int i = 0; i < pageRank.getRowDimension(); i++){
130 | String line = "";
131 | for(int j = 0; j < pageRank.getColumnDimension(); j++){
132 | line += pageRank.get(i, j) + "\t";
133 | }
134 | lines.add(line);
135 | }
136 | FileUtil.writeLines(finalPRFile, lines);
137 | }
138 |
139 | //Matrix with all 1
140 | private static void initU() {
141 | // TODO Auto-generated method stub
142 | double[][] u = new double[NODENUM][NODENUM];
143 | for(int i = 0; i < NODENUM; i++){
144 | for(int j = 0; j < NODENUM; j++){
145 | u[i][j] = 1;
146 | }
147 | }
148 | U = new Matrix(u);
149 | }
150 |
151 | //Compute transition matrix
152 | private static void computeTransM(TEMModel model, int z) {
153 | // TODO Auto-generated method stub
154 | double[][] transm = new double[NODENUM][NODENUM];
155 | for(int i = 0; i < NODENUM; i++){
156 | double rowSum = 0;
157 | for(int j = 0; j < NODENUM; j++){
158 | rowSum += graphAdjM.get(i, j) * sim(model.theta[i][z], model.theta[j][z]);
159 | }
160 | if(rowSum == 0){
161 | for(int j = 0; j < NODENUM; j++){
162 | transm [i][j] = 0;
163 | }
164 | } else {
165 | for(int j = 0; j < NODENUM; j++){
166 | double norWeight = graphAdjM.get(i, j) * sim(model.theta[i][z], model.theta[j][z]) /
167 | rowSum;
168 | transm [i][j] = norWeight;
169 | }
170 | }
171 | }
172 | transM = new Matrix(transm);
173 | }
174 |
175 | private static double sim(float f, float g) {
176 | // TODO Auto-generated method stub
177 | return 1 - Math.abs(f - g);
178 | }
179 |
180 | //compute pagerank
181 | public static Matrix calPageRank(Matrix PR0, Matrix newPR) {
182 | Matrix PR;
183 | while (true) {
184 | PR = PR0.times(newPR);
185 | double dis = calDistance(PR, PR0);//PR0 store PR vector after last iteration
186 | System.out.println("distance:" + dis);
187 | if (dis <= THRESHOLD) {
188 | System.out.println("PR:");
189 | PR.print(4, 4);
190 | break;
191 | }
192 | PR0 = PR;
193 | }
194 | return PR;
195 | }
196 |
197 | private static Matrix initPRStateVector() {
198 | // TODO Auto-generated method stub
199 | double[] pr0M = new double[NODENUM];
200 | for(int i = 0; i < NODENUM; i++){
201 | pr0M[i] = 1;
202 | }
203 | return new Matrix(pr0M, 1);
204 | }
205 |
206 | private static void readQAGraph(
207 | String graphDataFile) {
208 | // TODO Auto-generated method stub
209 | ArrayList graphLines = new ArrayList();
210 | FileUtil.readLines(graphDataFile, graphLines);
211 | double[][] graphMatrix = new double[graphLines.size()][];
212 | double minNumber = 1000;
213 | for(int i = 0; i < graphLines.size(); i++){
214 | String[] glineTokens = graphLines.get(i).split("\t");
215 | graphMatrix[i] = new double[glineTokens.length];
216 | for(int j = 0; j < glineTokens.length; j++){
217 | double d = Double.valueOf(glineTokens[j]);
218 | if(d < 0){
219 | graphMatrix[i][j] = 0;
220 | } else {
221 | graphMatrix[i][j] = d;
222 | }
223 | if(minNumber > graphMatrix[i][j]) {
224 | minNumber = graphMatrix[i][j];
225 | }
226 | }
227 | }
228 | //If there is negative number is matrix, find the min one x. Then all number add |x|
229 | System.out.println("minNumber " + minNumber);
230 | /*if (minNumber < 0){
231 | for(int i = 0; i < graphMatrix.length; i++){
232 | for(int j = 0; j < graphMatrix[i].length; j++){
233 | graphMatrix[i][j] += (0 - minNumber);
234 | }
235 | }
236 | }*/
237 | graphAdjM = new Matrix(graphMatrix);
238 | }
239 |
240 | public static void printMatrix(List> m) {
241 | for (int i = 0; i < m.size(); i++) {
242 | for (int j = 0; j < m.get(i).size(); j++) {
243 | System.out.print(m.get(i).get(j) + "\t");
244 | }
245 | System.out.println();
246 | }
247 | }
248 |
249 | public static void printVec(List v) {
250 | for (int i = 0; i < v.size(); i++) {
251 | System.out.print(v.get(i) + "\t");
252 | }
253 | System.out.println();
254 | }
255 |
256 | /**
257 | * Randomly Initialise state vector PR0
258 | *
259 | * @param n
260 | * dimension of vector PR0
261 | * @return A random vector, each dimension is 0-5
262 | */
263 | public static List randomInitPR0(int n) {
264 | Random random = new Random();
265 | List q = new ArrayList();
266 | for (int i = 0; i < n; i++) {
267 | q.add(new Double(5 * random.nextDouble()));
268 | }
269 | return q;
270 | }
271 |
272 | /**
273 | * Compute Euclidean Distance
274 | *
275 | * @param q1
276 | *
277 | * @param q2
278 | *
279 | * @return distance
280 | */
281 | public static double calDistance(Matrix q1, Matrix q2) {
282 | double sum = 0;
283 |
284 | if (q1.getColumnDimension() != q2.getColumnDimension() ) {
285 | return -1;
286 | }
287 |
288 | for (int i = 0; i < q1.getColumnDimension() ; i++) {
289 | sum += Math.pow(q1.get(0, i) - q2.get(0, i),
290 | 2);
291 | }
292 | return Math.sqrt(sum);
293 | }
294 |
295 | /**
296 | * compute NEWPR matrix
297 | *
298 | * @return NEWPR matrix
299 | */
300 | public static Matrix computeNewPR(TEMModel model, int z) {
301 | Matrix add1 = transM.times(LAMBDA);
302 | //In new PR matrix, the larger values are in the c-th column, the larger score node c tends to get.
303 | //Add user topic preference score and user topic expertise score in Matrix U
304 | double [][] newU = new double[NODENUM][NODENUM];
305 | for(int i = 0; i < NODENUM; i++){
306 | double userPreference = model.theta[i][z];
307 | double userExpertise = 0d;
308 | for (int e = 0; e < model.ENum; e++) {
309 | userExpertise += model.phi[z][i][e] * model.fgmm.p_mu[e][0];
310 | }
311 | double tepr = userPreference * userExpertise;
312 | for(int k = 0; k < NODENUM; k++){
313 | newU[k][i] = tepr;
314 | }
315 | }
316 | U = new Matrix(newU);
317 |
318 | Matrix add2 = U.times( (1 - LAMBDA));
319 | Matrix newPR = add1.plus(add2);
320 | return newPR;
321 | }
322 | }
323 |
--------------------------------------------------------------------------------
/src/tem/linkas/TSPR.java:
--------------------------------------------------------------------------------
1 | package tem.linkas;
2 |
3 | /**Topic Sensitive PageRank Algorithm (CIKM 12 TSPR for Expert finding)
4 | * @author yangliu
5 | * @blog http://blog.csdn.net/yangliuy
6 | * @mail yangliuyx@gmail.com
7 | */
8 |
9 | import java.io.FileInputStream;
10 | import java.io.FileWriter;
11 | import java.io.IOException;
12 | import java.io.ObjectInputStream;
13 | import java.io.PrintWriter;
14 | import java.util.ArrayList;
15 | import java.util.List;
16 | import java.util.Random;
17 |
18 | import Jama.Matrix;
19 |
20 | import tem.com.FileUtil;
21 | import tem.conf.PathConfig;
22 | import tem.main.TEMModel;
23 |
24 | public class TSPR {
25 | private static double LAMBDA = 0.2;
26 | private static double THRESHOLD = 0.0000001;
27 |
28 | private static int NODENUM; // Node number
29 | private static Matrix U; // Matrix with all 1
30 | private static Matrix graphAdjM; //Adjancy matrix of graph
31 | private static Matrix transM;//Transition probability matrix
32 |
33 | public static void main(String[] args) throws IOException, ClassNotFoundException {
34 | String minPostNum = "80";
35 | String modelName = "TSPR";
36 | String userIDFile = PathConfig.originalDataPath + "USER" + minPostNum + "/user.IDs";
37 | String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userAnswerNumWeighted.QAgraph";
38 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userVoteWeighted.QAgraph";
39 | //String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/test.QAgraph";
40 | //String TransMFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".transM";
41 | String finalPRALLFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".finalPRAll";
42 | ArrayList finalPRallLines = new ArrayList();
43 | //TEMModel model = new TEMModel();
44 | // load model
45 | //String modelFile = PathConfig.modelOutPath + "/USER80/ModelFile.model";
46 | //System.out.println("reading a class from : " + modelFile);
47 | //FileInputStream fis = new FileInputStream(modelFile);
48 | //ObjectInputStream ois = new ObjectInputStream(fis);
49 | //model = (TEMModel) ois.readObject();
50 | //ois.close();
51 | //System.out.println(model.K);
52 | //System.out.println(model.theta.length);
53 |
54 | PrintWriter pw;
55 | readQAGraph(graphDataFile);
56 | NODENUM = graphAdjM.getRowDimension();
57 | System.out.println("NODENUM : " + NODENUM);
58 |
59 | String LDAThetaFile = PathConfig.modelResPath + "LDA/lda_500.theta";
60 | double [][] theta = FileUtil.read2DArray(LDAThetaFile);
61 | int K = theta[0].length;
62 |
63 | //1. Init PR state vector and Matrix U
64 | //Both randomly initialise or set all 1 are OK
65 | Matrix PR0 = initPRStateVector();
66 | initU();
67 | System.out.println("Initial state vector PR0 is:");
68 | PR0.print(4, 4);
69 |
70 |
71 | for(int z = 0; z < K; z++){
72 | System.out.println("now topic = " + z);
73 | //String newPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".newPR";
74 | //String finalPRFile = PathConfig.modelResPath + "USER" + minPostNum + "/" + modelName + ".T" + z + ".finalPR";
75 |
76 | //2. Compute transition probability matrix
77 | computeTransM(theta, z);
78 | //pw = new PrintWriter(new FileWriter(TransMFile));
79 | //transM.print(pw, 4, 4);
80 |
81 | //3. Compute newPR update matrix
82 | Matrix newPR = computeNewPR(theta, z);
83 | //Normalized newPR matrix
84 | normal(newPR);
85 | //pw = new PrintWriter(new FileWriter(newPRFile));
86 | //newPR.print(pw, 4, 4);
87 |
88 | //4. Iteratively update PR state vector
89 | Matrix pageRank = calPageRank(PR0, newPR);
90 |
91 | //5. Print final PageRank score
92 | System.out.println("Final PageRank is :");
93 | pageRank.print(4, 4);
94 |
95 | String PRLine = "";
96 | for(int i = 0; i < NODENUM; i++){
97 | PRLine += pageRank.get(0, i) + "\t";
98 | }
99 | finalPRallLines.add(PRLine);
100 | //saveFinalPR(finalPRFile, pageRank);
101 | }
102 | FileUtil.writeLines(finalPRALLFile, finalPRallLines);
103 | }
104 |
105 | private static void normal(Matrix newPR) {
106 | // TODO Auto-generated method stub
107 | for(int i = 0; i < NODENUM; i++){
108 | double sum = 0;
109 | for(int j = 0; j < NODENUM; j++){
110 | sum += newPR.get(i, j);
111 | }
112 | if(sum != 0){
113 | for(int j = 0; j < NODENUM; j++){
114 | newPR.set(i, j, newPR.get(i, j) / sum);
115 | }
116 | }
117 | }
118 | }
119 |
120 | private static void saveFinalPR(String finalPRFile, Matrix pageRank) {
121 | // TODO Auto-generated method stub
122 | ArrayList lines = new ArrayList();
123 | for(int i = 0; i < pageRank.getRowDimension(); i++){
124 | String line = "";
125 | for(int j = 0; j < pageRank.getColumnDimension(); j++){
126 | line += pageRank.get(i, j) + "\t";
127 | }
128 | lines.add(line);
129 | }
130 | FileUtil.writeLines(finalPRFile, lines);
131 | }
132 |
133 | //Matrix with all 1
134 | private static void initU() {
135 | // TODO Auto-generated method stub
136 | double[][] u = new double[NODENUM][NODENUM];
137 | for(int i = 0; i < NODENUM; i++){
138 | for(int j = 0; j < NODENUM; j++){
139 | u[i][j] = 1;
140 | }
141 | }
142 | U = new Matrix(u);
143 | }
144 |
145 | //Compute transition matrix
146 | private static void computeTransM(double[][] theta, int z) {
147 | // TODO Auto-generated method stub
148 | double[][] transm = new double[NODENUM][NODENUM];
149 | for(int i = 0; i < NODENUM; i++){
150 | double rowSum = 0;
151 | for(int j = 0; j < NODENUM; j++){
152 | rowSum += graphAdjM.get(i, j) * sim(theta[i][z], theta[j][z]);
153 | }
154 | if(rowSum == 0){
155 | for(int j = 0; j < NODENUM; j++){
156 | transm [i][j] = 0;
157 | }
158 | } else {
159 | for(int j = 0; j < NODENUM; j++){
160 | double norWeight = graphAdjM.get(i, j) * sim(theta[i][z], theta[j][z]) /
161 | rowSum;
162 | transm [i][j] = norWeight;
163 | }
164 | }
165 | }
166 | transM = new Matrix(transm);
167 | }
168 |
169 | private static double sim(double f, double g) {
170 | // TODO Auto-generated method stub
171 | return 1 - Math.abs(f - g);
172 | }
173 |
174 | //compute pagerank
175 | public static Matrix calPageRank(Matrix PR0, Matrix newPR) {
176 | Matrix PR;
177 | while (true) {
178 | PR = PR0.times(newPR);
179 | double dis = calDistance(PR, PR0);//PR0 store PR vector after last iteration
180 | System.out.println("distance:" + dis);
181 | if (dis <= THRESHOLD) {
182 | System.out.println("PR:");
183 | PR.print(4, 4);
184 | break;
185 | }
186 | PR0 = PR;
187 | }
188 | return PR;
189 | }
190 |
191 | private static Matrix initPRStateVector() {
192 | // TODO Auto-generated method stub
193 | double[] pr0M = new double[NODENUM];
194 | for(int i = 0; i < NODENUM; i++){
195 | pr0M[i] = 1;
196 | }
197 | return new Matrix(pr0M, 1);
198 | }
199 |
200 | private static void readQAGraph(
201 | String graphDataFile) {
202 | // TODO Auto-generated method stub
203 | ArrayList graphLines = new ArrayList();
204 | FileUtil.readLines(graphDataFile, graphLines);
205 | double[][] graphMatrix = new double[graphLines.size()][];
206 | double minNumber = 1000;
207 | for(int i = 0; i < graphLines.size(); i++){
208 | String[] glineTokens = graphLines.get(i).split("\t");
209 | graphMatrix[i] = new double[glineTokens.length];
210 | for(int j = 0; j < glineTokens.length; j++){
211 | double d = Double.valueOf(glineTokens[j]);
212 | if(d < 0){
213 | graphMatrix[i][j] = 0;
214 | } else {
215 | graphMatrix[i][j] = d;
216 | }
217 | if(minNumber > graphMatrix[i][j]) {
218 | minNumber = graphMatrix[i][j];
219 | }
220 | }
221 | }
222 | //If there is negative number is matrix, find the min one x. Then all number add |x|
223 | System.out.println("minNumber " + minNumber);
224 | /*if (minNumber < 0){
225 | for(int i = 0; i < graphMatrix.length; i++){
226 | for(int j = 0; j < graphMatrix[i].length; j++){
227 | graphMatrix[i][j] += (0 - minNumber);
228 | }
229 | }
230 | }*/
231 | graphAdjM = new Matrix(graphMatrix);
232 | }
233 |
234 | public static void printMatrix(List> m) {
235 | for (int i = 0; i < m.size(); i++) {
236 | for (int j = 0; j < m.get(i).size(); j++) {
237 | System.out.print(m.get(i).get(j) + "\t");
238 | }
239 | System.out.println();
240 | }
241 | }
242 |
243 | public static void printVec(List v) {
244 | for (int i = 0; i < v.size(); i++) {
245 | System.out.print(v.get(i) + "\t");
246 | }
247 | System.out.println();
248 | }
249 |
250 | /**
251 | * Randomly Initialise state vector PR0
252 | *
253 | * @param n
254 | * dimension of vector PR0
255 | * @return A random vector, each dimension is 0-5
256 | */
257 | public static List randomInitPR0(int n) {
258 | Random random = new Random();
259 | List q = new ArrayList();
260 | for (int i = 0; i < n; i++) {
261 | q.add(new Double(5 * random.nextDouble()));
262 | }
263 | return q;
264 | }
265 |
266 | /**
267 | * Compute Euclidean Distance
268 | *
269 | * @param q1
270 | *
271 | * @param q2
272 | *
273 | * @return distance
274 | */
275 | public static double calDistance(Matrix q1, Matrix q2) {
276 | double sum = 0;
277 |
278 | if (q1.getColumnDimension() != q2.getColumnDimension() ) {
279 | return -1;
280 | }
281 |
282 | for (int i = 0; i < q1.getColumnDimension() ; i++) {
283 | sum += Math.pow(q1.get(0, i) - q2.get(0, i),
284 | 2);
285 | }
286 | return Math.sqrt(sum);
287 | }
288 |
289 | /**
290 | * compute NEWPR matrix
291 | *
292 | * @return NEWPR matrix
293 | */
294 | public static Matrix computeNewPR(double[][] theta, int z) {
295 | Matrix add1 = transM.times(LAMBDA);
296 | //In new PR matrix, the larger values are in the c-th column, the larger score node c tends to get.
297 | double [][] newU = new double[NODENUM][NODENUM];
298 | for(int i = 0; i < NODENUM; i++){
299 | double userPreference = theta[i][z];
300 | double tspr = userPreference;
301 | for(int k = 0; k < NODENUM; k++){
302 | newU[k][i] = tspr;
303 | }
304 | }
305 | U = new Matrix(newU);
306 |
307 | Matrix add2 = U.times( (1 - LAMBDA));
308 | Matrix newPR = add1.plus(add2);
309 | return newPR;
310 | }
311 | }
312 |
--------------------------------------------------------------------------------
/src/tem/main/LdaGibbsSampling.java:
--------------------------------------------------------------------------------
1 | package tem.main;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.ArrayList;
6 |
7 | import tem.com.FileUtil;
8 | import tem.com.JC;
9 | import tem.conf.ConstantConfig;
10 | import tem.conf.PathConfig;
11 | import tem.main.Documents;
12 | import tem.main.TEMModelSampling.modelparameters;
13 |
14 | /**Liu Yang's implementation of Gibbs Sampling of LDA
15 | * @author yangliu
16 | * @blog http://blog.csdn.net/yangliuy
17 | * @mail yangliuyx@gmail.com
18 | */
19 |
20 | public class LdaGibbsSampling {
21 |
22 | public static class modelparameters {
23 | float alpha = 1f; //usual value is 50 / K
24 | float beta = 0.1f;//usual value is 0.1
25 | int topicNum = 15;
26 | int iteration = 500;
27 | int saveStep = 20;
28 | int beginSaveIters = 440;
29 | }
30 |
31 | /**Get parameters from configuring file. If the
32 | * configuring file has value in it, use the value.
33 | * Else the default value in program will be used
34 | * @param ldaparameters
35 | * @param parameterFile
36 | * @return void
37 | */
38 | private static void getParametersFromFile(modelparameters ldaparameters,
39 | String parameterFile) {
40 | // TODO Auto-generated method stub
41 | ArrayList paramLines = new ArrayList();
42 | FileUtil.readLines(parameterFile, paramLines);
43 | for(String line : paramLines){
44 | String[] lineParts = line.split("\t");
45 | switch(parameters.valueOf(lineParts[0])){
46 | case alpha:
47 | ldaparameters.alpha = Float.valueOf(lineParts[1]);
48 | break;
49 | case beta:
50 | ldaparameters.beta = Float.valueOf(lineParts[1]);
51 | break;
52 | case topicNum:
53 | ldaparameters.topicNum = Integer.valueOf(lineParts[1]);
54 | break;
55 | case iteration:
56 | ldaparameters.iteration = Integer.valueOf(lineParts[1]);
57 | break;
58 | case saveStep:
59 | ldaparameters.saveStep = Integer.valueOf(lineParts[1]);
60 | break;
61 | case beginSaveIters:
62 | ldaparameters.beginSaveIters = Integer.valueOf(lineParts[1]);
63 | break;
64 | }
65 | }
66 | }
67 |
68 | public enum parameters{
69 | alpha, beta, topicNum, iteration, saveStep, beginSaveIters;
70 | }
71 |
72 | /**
73 | * @param args
74 | * @throws IOException
75 | * @throws ClassNotFoundException
76 | */
77 | public static void main(String[] args) throws IOException, ClassNotFoundException {
78 | // TODO Auto-generated method stub
79 | String dataPath = PathConfig.modelResPath + "USER80/";
80 | String minPostNum = PathConfig.minPostNum;
81 | Documents docSet = new Documents();
82 | String docfile = dataPath + "USER" + minPostNum + ".data";
83 | docSet = FileUtil.loadClass(docSet, docfile);
84 |
85 | System.out.println("indexToTermMap size : "
86 | + docSet.indexToTermMap.size());
87 | // System.out.println("indexToTermMap : " + docSet.indexToTermMap);
88 | System.out.println("indexToTagMap size : "
89 | + docSet.indexToTagMap.size());
90 | System.out.println("indexToVoteMap size : "
91 | + docSet.indexToVoteMap.size());
92 |
93 | modelparameters ldaparameters = new modelparameters();
94 | System.out.println("Topic Num : " + ldaparameters.topicNum);
95 | LdaModel model = new LdaModel(ldaparameters);
96 | System.out.println("1 Initialize the model ...");
97 | model.initializeModel(docSet);
98 | System.out.println("2 Learning and Saving the model ...");
99 | model.inferenceModel(docSet);
100 | System.out.println("3 Output the final model ...");
101 | model.saveIteratedModel(ldaparameters.iteration, docSet);
102 | System.out.println("Done!");
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/src/tem/main/LdaModel.java:
--------------------------------------------------------------------------------
1 | package tem.main;
2 |
3 | /**Class for Lda model
4 | * @author yangliu
5 | * @blog http://blog.csdn.net/yangliuy
6 | * @mail yangliuyx@gmail.com
7 | */
8 | import java.io.BufferedWriter;
9 | import java.io.FileWriter;
10 | import java.io.IOException;
11 | import java.util.ArrayList;
12 | import java.util.Collections;
13 | import java.util.Comparator;
14 | import java.util.List;
15 |
16 | import tem.com.FileUtil;
17 | import tem.conf.PathConfig;
18 | import tem.main.Documents;
19 |
20 | public class LdaModel {
21 |
22 | int [][] doc;//word index array
23 | int V, K, M;//vocabulary size, topic number, document number
24 | int [][] z;//topic label array
25 | float alpha; //doc-topic dirichlet prior parameter
26 | float beta; //topic-word dirichlet prior parameter
27 | int [][] nmk;//given document m, count times of topic k. M*K
28 | int [][] nkt;//given topic k, count times of term t. K*V
29 | int [] nmkSum;//Sum for each row in nmk
30 | int [] nktSum;//Sum for each row in nkt
31 | double [][] phi;//Parameters for topic-word distribution K*V
32 | double [][] theta;//Parameters for doc-topic distribution M*K
33 | int iterations;//Times of iterations
34 | int saveStep;//The number of iterations between two saving
35 | int beginSaveIters;//Begin save model at this iteration
36 |
37 | public LdaModel(LdaGibbsSampling.modelparameters modelparam) {
38 | // TODO Auto-generated constructor stub
39 | alpha = modelparam.alpha;
40 | beta = modelparam.beta;
41 | iterations = modelparam.iteration;
42 | K = modelparam.topicNum;
43 | saveStep = modelparam.saveStep;
44 | beginSaveIters = modelparam.beginSaveIters;
45 | }
46 |
47 | public void initializeModel(Documents docSet) {
48 | // TODO Auto-generated method stub
49 | M = docSet.docs.size();
50 | V = docSet.termToIndexMap.size();
51 | nmk = new int [M][K];
52 | nkt = new int[K][V];
53 | nmkSum = new int[M];
54 | nktSum = new int[K];
55 | phi = new double[K][V];
56 | theta = new double[M][K];
57 |
58 | //initialize documents index array
59 | doc = new int[M][];
60 | for(int m = 0; m < M; m++){
61 | //Notice the limit of memory
62 | int N = 0;
63 | for(int i = 0; i < docSet.docs.get(m).docWords.length; i++){
64 | for(int j = 0; j < docSet.docs.get(m).docWords[i].length; j++){
65 | N++;
66 | }
67 | }
68 | doc[m] = new int[N];
69 | int n = 0;
70 | for(int i = 0; i < docSet.docs.get(m).docWords.length; i++){
71 | for(int j = 0; j < docSet.docs.get(m).docWords[i].length; j++){
72 | doc[m][n] = docSet.docs.get(m).docWords[i][j];
73 | n++;
74 | }
75 | }
76 | }
77 |
78 | //initialize topic lable z for each word
79 | z = new int[M][];
80 | for(int m = 0; m < M; m++){
81 | int N = doc[m].length;
82 | z[m] = new int[N];
83 | for(int n = 0; n < N; n++){
84 | int initTopic = (int)(Math.random() * K);// From 0 to K - 1
85 | z[m][n] = initTopic;
86 | //number of words in doc m assigned to topic initTopic add 1
87 | nmk[m][initTopic]++;
88 | //number of terms doc[m][n] assigned to topic initTopic add 1
89 | nkt[initTopic][doc[m][n]]++;
90 | // total number of words assigned to topic initTopic add 1
91 | nktSum[initTopic]++;
92 | }
93 | // total number of words in document m is N
94 | nmkSum[m] = N;
95 | }
96 | }
97 |
98 | public void inferenceModel(Documents docSet) throws IOException {
99 | // TODO Auto-generated method stub
100 | if(iterations < saveStep + beginSaveIters){
101 | System.err.println("Error: the number of iterations should be larger than " + (saveStep + beginSaveIters));
102 | System.exit(0);
103 | }
104 | for(int i = 0; i < iterations; i++){
105 | System.out.println("Iteration " + i);
106 | if((i >= beginSaveIters) && (((i - beginSaveIters) % saveStep) == 0)){
107 | //Saving the model
108 | System.out.println("Saving model at iteration " + i +" ... ");
109 | //Firstly update parameters
110 | updateEstimatedParameters();
111 | //Secondly print model variables
112 | saveIteratedModel(i, docSet);
113 | }
114 |
115 | //Use Gibbs Sampling to update z[][]
116 | for(int m = 0; m < M; m++){
117 | int N = doc[m].length;
118 | for(int n = 0; n < N; n++){
119 | // Sample from p(z_i|z_-i, w)
120 | int newTopic = sampleTopicZ(m, n);
121 | z[m][n] = newTopic;
122 | }
123 | }
124 | }
125 | }
126 |
127 | private void updateEstimatedParameters() {
128 | // TODO Auto-generated method stub
129 | for(int k = 0; k < K; k++){
130 | for(int t = 0; t < V; t++){
131 | phi[k][t] = (nkt[k][t] + beta) / (nktSum[k] + V * beta);
132 | }
133 | }
134 |
135 | for(int m = 0; m < M; m++){
136 | for(int k = 0; k < K; k++){
137 | theta[m][k] = (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);
138 | }
139 | }
140 | }
141 |
142 | private int sampleTopicZ(int m, int n) {
143 | // TODO Auto-generated method stub
144 | // Sample from p(z_i|z_-i, w) using Gibbs upde rule
145 |
146 | //Remove topic label for w_{m,n}
147 | int oldTopic = z[m][n];
148 | nmk[m][oldTopic]--;
149 | nkt[oldTopic][doc[m][n]]--;
150 | nmkSum[m]--;
151 | nktSum[oldTopic]--;
152 |
153 | //Compute p(z_i = k|z_-i, w)
154 | double [] p = new double[K];
155 | for(int k = 0; k < K; k++){
156 | p[k] = (nkt[k][doc[m][n]] + beta) / (nktSum[k] + V * beta) * (nmk[m][k] + alpha) / (nmkSum[m] + K * alpha);
157 | }
158 |
159 | //Sample a new topic label for w_{m, n} like roulette
160 | //Compute cumulated probability for p
161 | for(int k = 1; k < K; k++){
162 | p[k] += p[k - 1];
163 | }
164 | double u = Math.random() * p[K - 1]; //p[] is unnormalised
165 | int newTopic;
166 | for(newTopic = 0; newTopic < K; newTopic++){
167 | if(u < p[newTopic]){
168 | break;
169 | }
170 | }
171 |
172 | //Add new topic label for w_{m, n}
173 | nmk[m][newTopic]++;
174 | nkt[newTopic][doc[m][n]]++;
175 | nmkSum[m]++;
176 | nktSum[newTopic]++;
177 | return newTopic;
178 | }
179 |
180 | public void saveIteratedModel(int iters, Documents docSet) throws IOException {
181 | // TODO Auto-generated method stub
182 | //lda.params lda.phi lda.theta lda.tassign lda.twords
183 | //lda.params
184 | String resPath = PathConfig.modelResPath + "LDA/";
185 | String modelName = "lda_" + iters;
186 | ArrayList lines = new ArrayList();
187 | lines.add("alpha = " + alpha);
188 | lines.add("beta = " + beta);
189 | lines.add("topicNum = " + K);
190 | lines.add("docNum = " + M);
191 | lines.add("termNum = " + V);
192 | lines.add("iterations = " + iterations);
193 | lines.add("saveStep = " + saveStep);
194 | lines.add("beginSaveIters = " + beginSaveIters);
195 | FileUtil.writeLines(resPath + modelName + ".params", lines);
196 |
197 | //lda.phi K*V
198 | BufferedWriter writer = new BufferedWriter(new FileWriter(resPath + modelName + ".phi"));
199 | for (int i = 0; i < K; i++){
200 | for (int j = 0; j < V; j++){
201 | writer.write(phi[i][j] + "\t");
202 | }
203 | writer.write("\n");
204 | }
205 | writer.close();
206 |
207 | //lda.theta M*K
208 | writer = new BufferedWriter(new FileWriter(resPath + modelName + ".theta"));
209 | for(int i = 0; i < M; i++){
210 | for(int j = 0; j < K; j++){
211 | writer.write(theta[i][j] + "\t");
212 | }
213 | writer.write("\n");
214 | }
215 | writer.close();
216 |
217 | //lda.tassign
218 | writer = new BufferedWriter(new FileWriter(resPath + modelName + ".tassign"));
219 | for(int m = 0; m < M; m++){
220 | for(int n = 0; n < doc[m].length; n++){
221 | writer.write(doc[m][n] + ":" + z[m][n] + "\t");
222 | }
223 | writer.write("\n");
224 | }
225 | writer.close();
226 |
227 | //lda.twords phi[][] K*V
228 | writer = new BufferedWriter(new FileWriter(resPath + modelName + ".twords"));
229 | int topNum = 20; //Find the top 20 topic words in each topic
230 | for(int i = 0; i < K; i++){
231 | List tWordsIndexArray = new ArrayList();
232 | for(int j = 0; j < V; j++){
233 | tWordsIndexArray.add(new Integer(j));
234 | }
235 | Collections.sort(tWordsIndexArray, new LdaModel.TwordsComparable(phi[i]));
236 | writer.write("topic " + i + ":\t");
237 | for(int t = 0; t < topNum; t++){
238 | writer.write(docSet.indexToTermMap.get(tWordsIndexArray.get(t)) + "\t");
239 | }
240 | writer.write("\n");
241 | }
242 | writer.close();
243 | }
244 |
245 | public class TwordsComparable implements Comparator {
246 | public double [] sortProb; // Store probability of each word in topic k
247 |
248 | public TwordsComparable (double[] sortProb){
249 | this.sortProb = sortProb;
250 | }
251 |
252 | @Override
253 | public int compare(Integer o1, Integer o2) {
254 | // TODO Auto-generated method stub
255 | //Sort topic word index according to the probability of each word in topic k
256 | if(sortProb[o1] > sortProb[o2]) return -1;
257 | else if(sortProb[o1] < sortProb[o2]) return 1;
258 | else return 0;
259 | }
260 | }
261 | }
262 |
--------------------------------------------------------------------------------
/src/tem/main/ModelComFunc.java:
--------------------------------------------------------------------------------
1 | package tem.main;
2 |
3 | import java.io.BufferedWriter;
4 | import java.io.PrintWriter;
5 | import java.util.ArrayList;
6 |
7 | import tem.com.MatrixUtil;
8 |
9 | public class ModelComFunc {
10 |
11 | public static void writeData(float[] array, ArrayList strings,
12 | ArrayList rankList, BufferedWriter writer, String prefix) {
13 | PrintWriter writer2 = new PrintWriter(writer);
14 | for (int row = 0; row < rankList.size(); row++) {
15 | writer2.printf("%s\t%s\t%f\n", prefix,
16 | strings.get(rankList.get(row)), array[rankList.get(row)]);
17 | // writer2.printf(prefix + "\t",
18 | // strings.get(rankList.get(row)) + "\t" + array[rankList.get(row)]
19 | // + "\n");
20 | }
21 | }
22 |
23 | // public static void writeData(ArrayList[] cNP2, BufferedWriter
24 | // writer) {
25 | // PrintWriter writer2 = new PrintWriter(writer);
26 | // writer2 = new PrintWriter(writer);
27 | // for (int i = 0; i < cNP2.length; i++) {
28 | // // writer2.printf("%d-th topic:\n", i);
29 | // for (int j = 0; j < cNP2[i].size(); j++) {
30 | // // writer2.printf("%s,\t", Doc.getNps().get(cNP2[i].get(j)));
31 | // }
32 | // writer2.print("\n\n");
33 | // }
34 | // }
35 |
36 | public static void writeData(float[] pi, BufferedWriter writer) {
37 | PrintWriter writer2 = new PrintWriter(writer);
38 | for (int row = 0; row < pi.length; row++) {
39 | writer2.printf("\t%f", pi[row]);
40 | }
41 | }
42 |
43 | public static void writeData(int[][] phi2, PrintWriter writer2) {
44 | for (int row = 0; row < phi2.length; row++) {
45 | // writer2.printf("%d", row);
46 | for (int col = 0; col < phi2[row].length; col++) {
47 | writer2.printf("%d\t", phi2[row][col]);
48 | // writer2.printf(phi2[row][col] + "\t");
49 | }
50 | writer2.print("\n");
51 | }
52 | }
53 |
54 | public static void writeData(float[][] array, BufferedWriter writer) {
55 | PrintWriter writer2 = new PrintWriter(writer);
56 | for (int row = 0; row < array.length; row++) {
57 | // writer2.printf("%d\t", row);
58 | for (int col = 0; col < array[row].length; col++) {
59 | writer2.printf("%f\t", array[row][col]);
60 | // writer2.printf(array[row][col] + "\t");
61 | }
62 | writer2.print("\n");
63 | }
64 | }
65 |
66 | public static void writeData(double[][] vph2, BufferedWriter writer) {
67 | PrintWriter writer2 = new PrintWriter(writer);
68 | for (int row = 0; row < vph2.length; row++) {
69 | // writer2.printf("%d", row);
70 | for (int col = 0; col < vph2[row].length; col++) {
71 | writer2.printf("\t%f", vph2[row][col]);
72 | // writer2.printf("\t" + vph2[row][col]);
73 | }
74 | writer2.print("\n");
75 | }
76 | }
77 |
78 | public static void writeData(float[][][] a, BufferedWriter writer) {
79 | PrintWriter writer2 = new PrintWriter(writer);
80 | for (int i = 0; i < a.length; i++)
81 | for (int row = 0; row < a[i].length; row++) {
82 | writer2.printf("%d\t%d", i, row);
83 | for (int col = 0; col < a[i][row].length; col++) {
84 | writer2.printf("\t%f", a[i][row][col]);
85 | }
86 | writer2.print("\n");
87 | }
88 | }
89 |
90 | public static void writeData(float[][][][] data, BufferedWriter writer) {
91 | PrintWriter writer2 = new PrintWriter(writer);
92 | for (int d = 0; d < data.length; d++)
93 | for (int a = 0; a < data[d].length; a++)
94 | for (int s = 0; s < data[d][a].length; s++) {
95 | writer2.printf("%d\t%d\t%d", d, a, s);
96 | for (int w = 0; w < data[d][a][s].length; w++) {
97 | writer2.printf("\t%f", data[d][a][s][w]);
98 | }
99 | writer2.print("\n");
100 | }
101 | }
102 |
103 | public static void writeData(ArrayList> rankLists,
104 | ArrayList> probs, ArrayList uniWordMap,
105 | ArrayList names, BufferedWriter writer, String string)
106 | throws Exception {
107 | // string: "\t"
108 | // names.get(0) names.get(1) ...
109 | // w11:probs11 w12:probs12 ...
110 | // w21:probs21 w21:probs22 ...
111 | // rankLists.get(0) rankLists.get(1)
112 | int maxsize = rankLists.get(0).size();
113 | for (int i = 0; i < rankLists.size(); i++) {
114 | // get max size
115 | if (rankLists.get(i).size() > maxsize)
116 | maxsize = rankLists.get(i).size();
117 | }
118 | for (int i = 0; i < names.size(); i++) {
119 | writer.write(names.get(i) + string + string);
120 | }
121 | writer.write("\n");
122 |
123 | for (int j = 0; j < maxsize; j++) {
124 | for (int i = 0; i < rankLists.size(); i++) {
125 | if (rankLists.get(i).size() > j && probs.get(i).size() > j) {
126 | writer.write(uniWordMap.get(rankLists.get(i).get(j))
127 | + string + probs.get(i).get(j) + string);
128 | } else
129 | writer.write("null" + string + "0" + string);
130 | }
131 | writer.write("\n");
132 | }
133 | }
134 |
135 | public static boolean checkEqual(int[][][][] a, int[][][] b, String string) {
136 | for (int i = 0; i < a.length; i++) {
137 | for (int j = 0; j < a[i].length; j++) {
138 | for (int k = 0; k < a[i][j].length; k++) {
139 | if (IsLessThanZero(a[i][j][k]))
140 | return false;
141 | }
142 | }
143 | }
144 | for (int i = 0; i < b.length; i++) {
145 | for (int j = 0; j < b[i].length; j++) {
146 | if (IsLessThanZero(b[i][j]))
147 | return false;
148 | }
149 | }
150 | for (int k = 0; k < a.length; k++) {
151 | for (int i = 0; i < a[k].length; i++) {
152 | for (int j = 0; j < a[k][i].length; j++) {
153 | double c = MatrixUtil.sumRow(a[k][i], j);
154 | if (c != b[k][i][j]) {
155 | System.out.println(string + "\t" + c + "\t" + b[i]);
156 | return false;
157 | }
158 | }
159 | }
160 | }
161 | return true;
162 | }
163 |
164 | public static boolean checkEqual(int[][][] a, int[][] b, String string) {
165 | for (int i = 0; i < a.length; i++) {
166 | for (int j = 0; j < a[i].length; j++) {
167 | if (IsLessThanZero(a[i][j]))
168 | return false;
169 | }
170 | }
171 | for (int i = 0; i < b.length; i++) {
172 | if (IsLessThanZero(b[i]))
173 | return false;
174 | }
175 | for (int i = 0; i < a.length; i++) {
176 | for (int j = 0; j < a[i].length; j++) {
177 | double c = MatrixUtil.sumRow(a[i], j);
178 | if (c != b[i][j]) {
179 | System.out.println(string + "\t" + c + "\t" + b[i]);
180 | return false;
181 | }
182 | }
183 | }
184 | return true;
185 | }
186 |
187 | static boolean checkEqual(int[][] a, int[] b, String string) {
188 |
189 | for (int i = 0; i < a.length; i++) {
190 | if (IsLessThanZero(a[i]))
191 | return false;
192 | }
193 | if (IsLessThanZero(b))
194 | return false;
195 | for (int i = 0; i < a.length; i++) {
196 | double c = MatrixUtil.sumRow(a, i);
197 | if (c != b[i]) {
198 | System.out.println(string + "\t" + c + "\t" + b[i]);
199 | return false;
200 | }
201 | }
202 | return true;
203 | }
204 |
205 | private boolean checkEqual(double a, double b, String string) {
206 | if (a < 0 || b < 0)
207 | return false;
208 | if (a != b) {
209 | System.out.println(string + "\t" + a + "\t" + b);
210 | return false;
211 | } else {
212 | return true;
213 | }
214 | }
215 |
216 | public static boolean checkEqual(int[] a, int b, String string) {
217 | if (IsLessThanZero(a) || b < 0)
218 | return false;
219 | double c = MatrixUtil.sum(a);
220 | if (c != b) {
221 | System.out.println(string + "\t" + c + "\t" + b);
222 | return false;
223 | }
224 | return true;
225 | }
226 |
227 | private static boolean IsLessThanZero(int[] b) {
228 | for (int i = 0; i < b.length; i++) {
229 | if (b[i] < 0)
230 | return true;
231 | }
232 | return false;
233 | }
234 |
235 | private static boolean IsLessThanZero(double[] b) {
236 | for (int i = 0; i < b.length; i++) {
237 | if (b[i] < 0)
238 | return true;
239 | }
240 | return false;
241 | }
242 |
243 | protected static double checkDoubleOverflow(double probs, int pos,
244 | int[] countP) {
245 | if (probs < 0) {
246 | System.err.println(probs + "\t" + pos);
247 | for (int i = 0; i < countP.length; i++)
248 | System.err.print(countP[i] + " ");
249 | throw new IndexOutOfBoundsException("p is negative!!");
250 | }
251 | if (probs > 1e+150d) {
252 | // System.out.println("p is too large for double type (> 2e+150d).");
253 | countP[pos]++;
254 | return (probs / 1e+150);
255 | }
256 | if (probs < 1e-150d) {
257 | // System.out.println("p is too small for double type (< 5e-150d).");
258 | countP[pos]--;
259 | return (probs * 1e+150);
260 | }
261 | return probs;
262 | }
263 |
264 | static void reAssignP(double[] p, int[] countP) {
265 | // p and countP should be the same length
266 | int maxV = countP[0];
267 | for (int i = 0; i < countP.length; i++) {
268 | // System.out.print(p[i] + ":" + countP[i] + "\t");
269 | if (countP[i] > maxV) {
270 | maxV = countP[i];
271 | }
272 | }
273 | // System.out.println("\t max:" + maxV);
274 | for (int i = 0; i < p.length; i++) {
275 | p[i] *= Math.pow(1e+150, countP[i] - maxV);
276 | // System.out.print(p[i] + "\t");
277 | }
278 | // System.out.println();
279 | }
280 | }
281 |
--------------------------------------------------------------------------------
/src/tem/main/SimpleEvaluate.java:
--------------------------------------------------------------------------------
1 | package tem.main;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.Collections;
6 | import java.util.Comparator;
7 | import java.util.List;
8 |
9 | import tem.com.FileUtil;
10 | import tem.conf.PathConfig;
11 | import tem.main.Documents.Document;
12 |
13 | /**Simple evaluation for TEM result
14 | * Compute utopics and kuExpertiseScore file
15 | * kuExpertiseScore Matrix is also used in
16 | * Topic Expertise PageRank
17 | * @author yangliu
18 | * @blog http://blog.csdn.net/yangliuy
19 | * @mail yangliuyx@gmail.com
20 | */
21 | public class SimpleEvaluate {
22 |
23 | static int K = 15;
24 | static int E = 4;
25 |
26 | /**
27 | * @param args
28 | * @throws ClassNotFoundException
29 | * @throws IOException
30 | */
31 | public static void main(String[] args) throws IOException, ClassNotFoundException {
32 | // TODO Auto-generated method stub
33 | String minPostNum = "80";
34 | String trainDocfile = PathConfig.modelResPath + "USER" + minPostNum + "/USER" + minPostNum + ".data";
35 | Documents trainDocSet = new Documents();
36 | trainDocSet = FileUtil.loadClass(trainDocSet, trainDocfile);
37 | System.out.println("train terms: " + trainDocSet.termCountMap.size());
38 |
39 | String testDataFolder = PathConfig.testDataPath;
40 | Documents testDocSet = new Documents();
41 |
42 |
43 | testDocSet.readQATestDocs(testDataFolder, trainDocSet);
44 | String testDocfile = testDataFolder + "QATest.data";
45 | FileUtil.saveClass(testDocSet, testDocfile);
46 |
47 | //Document questionDoc = testDocSet.docs.get(0);
48 |
49 | System.out.println(testDocSet.termCountMap.size());
50 | System.out.println(testDocSet.tagCountMap.size());
51 | System.out.println(testDocSet.voteCountMap.size());
52 | System.out.println(testDocSet.docs.size());
53 |
54 | //trainDocSet.copyTrainDocVocals(testDocSet);
55 | //FileUtil.saveClass(trainDocSet, trainDocfile);
56 |
57 | /*String userIDFile = PathConfig.scriptDataPath + "USERID" + minPostNum;
58 | String resPath = PathConfig.modelResPath + "USER" + minPostNum + "/model_" + 400;
59 | String resultPath = PathConfig.modelResPath + "USER" + minPostNum + "/";
60 | ArrayList userIDs = new ArrayList();
61 | FileUtil.readLines(userIDFile, userIDs);
62 | Documents docSet = new Documents();
63 | String docfile = resultPath + "USER" + minPostNum + ".data";
64 | docSet = FileUtil.loadClass(docSet, docfile);
65 | int U = userIDs.size();
66 | float [][] theta = new float[U][K];
67 | float [][][] phi = new float[K][U][E];
68 | float [][] tau = new float[E][docSet.voteToIndexMap.size()];
69 | float [] expertiseMean = new float[E];
70 | readTheta(theta, resPath + ".theta");
71 | readPhi(phi, resPath + ".phi");
72 | readTau(tau, resPath + ".tau");
73 | SimpleEvaluate se = new SimpleEvaluate();
74 | se.printUtopics(theta, U, resPath, userIDs);
75 | se.computeExpertiseMean(tau, docSet, expertiseMean);
76 | se.printKUExpertiseScore(phi, expertiseMean, U, resPath);*/
77 | }
78 |
79 | private void computeExpertiseMean(float[][] tau, Documents docSet,
80 | float[] expertiseMean) {
81 | // TODO Auto-generated method stub
82 | for(int i = 0; i < E; i++){
83 | float mean = 0;
84 | for(int j = 0; j < docSet.indexToVoteMap.size(); j++){
85 | mean += Float.valueOf(docSet.indexToVoteMap.get(j)) * tau[i][j];
86 | }
87 | expertiseMean[i] = mean;
88 | System.out.println("expertise " + i + " mean : " + mean);
89 | }
90 | }
91 |
92 | private void printKUExpertiseScore(float[][][] phi, float[] expertiseMean, int U, String resPath) {
93 | // TODO Auto-generated method stub
94 | ArrayList KUEMeanLines = new ArrayList();
95 | for(int k = 0; k < K; k++){
96 | String line = "";
97 | for(int u = 0; u < U; u++){
98 | float expertiseScore = 0;
99 | for(int e = 0; e < E; e++){
100 | expertiseScore += expertiseMean[e] * phi[k][u][e];
101 | }
102 | line += expertiseScore + "\t";
103 | }
104 | KUEMeanLines.add(line);
105 | }
106 | FileUtil.writeLines(resPath + ".KUexpertiseScore", KUEMeanLines);
107 | }
108 |
109 | private void printUtopics(float[][] theta, int U, String resPath, ArrayList userIDs) {
110 | // TODO Auto-generated method stub
111 | //model.utopics
112 | ArrayList utopicsLines = new ArrayList();
113 | for(int i = 0; i < U; i++){
114 | List tWordsIndexArray = new ArrayList();
115 | for(int t = 0; t < K; t++){
116 | tWordsIndexArray.add(new Integer(t));
117 | }
118 | Collections.sort(tWordsIndexArray, new SimpleEvaluate.TwordsComparable(theta[i]));
119 | String line = "UserID = " + userIDs.get(i) + "\t";
120 | for(int t = 0; t < K; t++){
121 | line += tWordsIndexArray.get(t) + "\t";
122 | }
123 | utopicsLines.add(line);
124 | }
125 | FileUtil.writeLines(resPath + ".utopics", utopicsLines);
126 | }
127 |
128 | private static void readTau(float[][] tau, String file) {
129 | // TODO Auto-generated method stub
130 | ArrayList lines = new ArrayList();
131 | FileUtil.readLines(file, lines);
132 | for(int i = 0; i < tau.length; i++){
133 | String[] tokens = lines.get(i).split("\t");
134 | for(int j = 0 ; j < tau[i].length; j++){
135 | tau[i][j] = Float.valueOf(tokens[j]);
136 | }
137 | }
138 | }
139 |
140 | private static void readPhi(float[][][] phi, String file) {
141 | // TODO Auto-generated method stub
142 | ArrayList lines = new ArrayList();
143 | FileUtil.readLines(file, lines);
144 | for(String line : lines){
145 | String[] tokens = line.split("\t");
146 | int i = Integer.valueOf(tokens[0]);
147 | int j = Integer.valueOf(tokens[1]);
148 | int k = Integer.valueOf(tokens[2]);
149 | phi[i][j][k] = Float.valueOf(tokens[3]);
150 | }
151 | }
152 |
153 | private static void readTheta(float[][] theta, String file) {
154 | // TODO Auto-generated method stub
155 | ArrayList lines = new ArrayList();
156 | FileUtil.readLines(file, lines);
157 | for(int i = 0; i < theta.length; i++){
158 | String[] tokens = lines.get(i).split("\t");
159 | for(int j = 0 ; j < theta[i].length; j++){
160 | theta[i][j] = Float.valueOf(tokens[j]);
161 | }
162 | }
163 | }
164 |
165 | public class TwordsComparable implements Comparator {
166 | public float [] sortProb; // Store probability of each word in topic k
167 |
168 | public TwordsComparable (float[] sortProb){
169 | this.sortProb = sortProb;
170 | }
171 |
172 | @Override
173 | public int compare(Integer o1, Integer o2) {
174 | // TODO Auto-generated method stub
175 | //Sort topic word index according to the probability of each word in topic k
176 | if(sortProb[o1] > sortProb[o2]) return -1;
177 | else if(sortProb[o1] < sortProb[o2]) return 1;
178 | else return 0;
179 | }
180 | }
181 | }
182 |
--------------------------------------------------------------------------------
/src/tem/main/TEMModelSampling.java:
--------------------------------------------------------------------------------
1 | package tem.main;
2 |
3 | import java.io.BufferedWriter;
4 | import java.io.File;
5 | import java.io.FileWriter;
6 | import java.io.IOException;
7 | import java.util.ArrayList;
8 |
9 | import tem.com.ComUtil;
10 | import tem.com.FileUtil;
11 | import tem.com.JC;
12 | import tem.com.MatrixUtil;
13 | import tem.conf.ConstantConfig;
14 | import tem.conf.PathConfig;
15 | import tem.main.Documents;
16 | import tem.main.Documents.Document;
17 | import tem.main.TEMModel;
18 |
19 | /**
20 | * Gibbs Sampling of Topic Expertise Model
21 | *
22 | * @author yangliu
23 | * @blog http://blog.csdn.net/yangliuy
24 | * @mail yangliuyx@gmail.com
25 | */
26 |
27 | public class TEMModelSampling {
28 |
29 | public static class modelparameters {
30 | float alpha = 0.5f;// usual value is 50 / K
31 | float beta = 0.01f;
32 | float gamma = 0.01f;
33 | float eta = 0.1f;// usual value is 0.1
34 | float xi = 0.01f;
35 | int topicNum = 20;
36 | int expertiseNum = 3;
37 |
38 | int iteration = 300;
39 | int saveStep = 20;
40 | int beginSaveIters = 5;
41 | }
42 |
43 | /**
44 | * Get parameters from configuring file. If the configuring file has value
45 | * in it, use the value. Else the default value in program will be used
46 | *
47 | * @param ldaparameters
48 | * @param parameterFile
49 | * @return void
50 | */
51 | private static void getParametersFromFile(modelparameters ldaparameters,
52 | String parameterFile) {
53 | // TODO Auto-generated method stub
54 | ArrayList paramLines = new ArrayList();
55 | FileUtil.readLines(parameterFile, paramLines);
56 | for (String line : paramLines) {
57 | String[] lineParts = line.split("\t");
58 | switch (parameters.valueOf(lineParts[0])) {
59 | case alpha:
60 | ldaparameters.alpha = Float.valueOf(lineParts[1]);
61 | break;
62 | case beta:
63 | ldaparameters.beta = Float.valueOf(lineParts[1]);
64 | break;
65 | case gamma:
66 | ldaparameters.gamma = Float.valueOf(lineParts[1]);
67 | break;
68 | case eta:
69 | ldaparameters.eta = Float.valueOf(lineParts[1]);
70 | break;
71 | case xi:
72 | ldaparameters.xi = Float.valueOf(lineParts[1]);
73 | break;
74 | case topicNum:
75 | ldaparameters.topicNum = Integer.valueOf(lineParts[1]);
76 | break;
77 | case expertiseNum:
78 | ldaparameters.expertiseNum = Integer.valueOf(lineParts[1]);
79 | break;
80 | case iteration:
81 | ldaparameters.iteration = Integer.valueOf(lineParts[1]);
82 | break;
83 | case saveStep:
84 | ldaparameters.saveStep = Integer.valueOf(lineParts[1]);
85 | break;
86 | case beginSaveIters:
87 | ldaparameters.beginSaveIters = Integer.valueOf(lineParts[1]);
88 | break;
89 | }
90 | }
91 | }
92 |
/** Setting names that may appear in the first column of the parameter file. */
public enum parameters {
    alpha,
    beta,
    gamma,
    eta,
    xi,
    topicNum,
    expertiseNum,
    iteration,
    saveStep,
    beginSaveIters
}
96 |
97 | /**
98 | * @param args
99 | * @throws IOException
100 | * @throws ClassNotFoundException
101 | */
102 | public static void main(String[] args) throws IOException,
103 | ClassNotFoundException {
104 | /*boolean local = true; // run on local machine
105 | //local = !local; // run on server
106 |
107 | new JC();
108 | String[] descrp = { "ParamsPath", "ResPath", "modelOutPath",
109 | "minPostNum" };
110 | String[] directory = { "data/modelParams/", "data/modelRes/ThreeM09/",
111 | "data/modelRes/ThreeM09/TMM3/", "80" };
112 | char[] options = { 'p', 'i', 'o', 'n' };
113 | if (local)
114 | JC.setInputOptions(descrp, directory, options, args, "1111", 0);
115 | else
116 | JC.setInputOptions(descrp, directory, options, args, "0000", 1);
117 | PathConfig.modelParamsPath = JC.getARG(0);
118 | PathConfig.modelResPath = JC.getARG(1);
119 | PathConfig.modelOutPath = JC.getARG(2);
120 | PathConfig.minPostNum = JC.getARG(3);
121 | JC.close();*/
122 |
123 | String minPostNum = PathConfig.minPostNum;
124 | // data/originalData/USER80/posts/
125 | String originalDocsPath = PathConfig.originalDataPath + "USER"
126 | + minPostNum + "/posts/";
127 | //data/modelRes/ThreeM09/USER80
128 | String resultPath = PathConfig.modelResPath + "USER" + minPostNum + "/";
129 | String parameterFile = ConstantConfig.LDAPARAMETERFILE;
130 |
131 | modelparameters modelparam = new modelparameters();
132 | getParametersFromFile(modelparam, parameterFile);
133 | Documents docSet = new Documents();
134 |
135 | String docfile = resultPath + "USER" + minPostNum + ".data";
136 | // docSet.readDocs(originalDocsPath, minPostNum);
137 |
138 | // Save Serialized data
139 | docSet = FileUtil.loadClass(docSet, docfile);
140 | // FileUtil.saveClass(docSet, docfile);
141 | // Delete terms that appear only n times
142 | // docSet.deleteRareTerms(3);
143 | System.out.println("indexToTermMap size : "
144 | + docSet.indexToTermMap.size());
145 | // System.out.println("indexToTermMap : " + docSet.indexToTermMap);
146 | System.out.println("indexToTagMap size : "
147 | + docSet.indexToTagMap.size());
148 | System.out.println("indexToVoteMap size : "
149 | + docSet.indexToVoteMap.size());
150 |
151 | // // test();
152 | // testGMM();
153 | //if (local)
154 | //removeData(docSet, 10);
155 |
156 | // try {
157 | // getVotes(docSet, PathConfig.votePath);
158 | // } catch (Exception e) {
159 | // e.printStackTrace();
160 | // }
161 | //
162 | // for (int d = 0; d < 1; d++) {
163 | // Document doc = docSet.docs.get(d);
164 | // System.out.println(doc.docName);
165 | // // System.out.println("tags" + doc.tags);
166 | // System.out.println("title" + doc.title);
167 | // for (int n = 0; n < docSet.docs.get(d).docWords.length; n++) {
168 | // System.out.println("post vote: "
169 | // + docSet.indexToVoteMap.get(doc.votes[n]));
170 | // System.out.println("post tag: "
171 | // + docSet.indexToTagMap.get(doc.tags[n]));
172 | // for (int l = 0; l < docSet.docs.get(d).docWords[n].length; l++) {
173 | // System.out.print(doc.docWords[n][l] + " ");
174 | // // System.out.print("vote_" +
175 | // // docSet.indexToVoteMap.get(doc.votes[n])
176 | // // + " ");
177 | // // System.out.print("tag_" +
178 | // // docSet.indexToTagMap.get(doc.tags[n]) +
179 | // // " ");
180 | // }
181 | // System.out.println();
182 | // }
183 | // }
184 |
185 | // System.out.println("indexToTagMap" + docSet.indexToTagMap);
186 | // System.out.println("indexToVoteMap" + docSet.indexToVoteMap);
187 | // System.out.println("indexToTermMap" + docSet.indexToTermMap);
188 | // System.out.println("tagCountMap");
189 | // // tagCountMap
190 | // for (String tag : docSet.tagCountMap.keySet()) {
191 | // System.out.println(tag + "\t" + docSet.tagCountMap.get(tag));
192 | // }
193 | //
194 | // System.out.println("voteCountMap");
195 | // // voteCountMap
196 | // for (String vote : docSet.voteCountMap.keySet()) {
197 | // System.out.println(vote + "\t" + docSet.voteCountMap.get(vote));
198 | // }
199 |
200 | //Count quesions and answers
201 | int questionCount = 0;
202 | int answerCount = 0;
203 | for (int d = 0; d < docSet.docs.size(); d++) {
204 | Document doc = docSet.docs.get(d);
205 | for(int n = 0; n < doc.docWords.length; n++){
206 | if(doc.postTypeID[n] == 1){
207 | questionCount++;
208 | } else {
209 | answerCount++;
210 | }
211 | }
212 | }
213 | System.out.println("quesionsCount: " + questionCount);
214 | System.out.println("answerCount: " + answerCount);
215 |
216 | TEMModel model = new TEMModel(modelparam);
217 | System.out.println("1 Initialize the model ...");
218 | model.initializeModel(docSet);
219 | System.out.println("2 Learning and Saving the model ...");
220 | model.inferenceModel(docSet, minPostNum);
221 | System.out.println("3 Output the final model ...");
222 | model.saveIteratedModel(modelparam.iteration, docSet, minPostNum);
223 |
224 | // save model in serialized data
225 | String modelName = "E_" + model.ENum + "_T_" + model.K;
226 | FileUtil.saveClass(model, PathConfig.modelResPath + "USER" + minPostNum
227 | + "/" + modelName + ".model");
228 | System.out.println("Done!");
229 | }
230 |
231 | private static void testGMM() {
232 | String testGMM = "data/modelRes/testGMM.txt";
233 |
234 | double alpha = 10;
235 |
236 | float[][] GMMData = null;
237 | GMMData = FileUtil.readArray(testGMM);
238 | FGMM fgmm = new FGMM(); //
239 | int ksize = 4;
240 | int[] clusterids = new int[GMMData.length];
241 | // random assign clusterID
242 | for (int n = 0; n < GMMData.length; n++) {
243 | int id = (int) (Math.floor(Math.random() * ksize));
244 | clusterids[n] = id;
245 | }
246 | fgmm.init2(GMMData, ksize, clusterids);
247 | // fgmm.learn2(GMMData, 500);// get GMM data index
248 |
249 | for (int iter = 0; iter < 500; iter++) {
250 | if (iter % 10 == 0) {
251 | System.out.print("Iteration " + iter + "\t");
252 | for (int i = 0; i < ksize; i++)
253 | System.out.print(fgmm.clusterDataIndex.get(i).size() + " ");
254 | System.out.println();
255 | System.out.println("lambda:");
256 | for (int k = 0; k < ksize; k++)
257 | ComUtil.print(fgmm.p_lambda[k], " ", "\n");
258 | System.out.println("mu:");
259 | for (int k = 0; k < ksize; k++)
260 | ComUtil.print(fgmm.p_mu[k], " ", "\n");
261 | }
262 | for (int n = 0; n < GMMData.length; n++) {
263 | double[] probsGMM = fgmm.LearnProbs(GMMData, n);
264 | double[] p = new double[ksize];
265 |
266 | for (int i = 0; i < ksize; i++) {
267 | p[i] = (fgmm.clusterDataIndex.get(i).size() + alpha)
268 | / (fgmm.vector_n + ksize * alpha);
269 | p[i] *= probsGMM[i];
270 | }
271 |
272 | int newNo = ComUtil.sample(p, p.length);
273 | clusterids[n] = newNo;
274 |
275 | // update new mu and lambda
276 | fgmm.UpdateProbs(GMMData, n, newNo);
277 | }
278 | }
279 | System.out.println("done");
280 | System.exit(0);
281 | }
282 |
283 | private static void removeData(Documents docSet, int r) {
284 | for (int d = r; d < docSet.docs.size(); d++) {
285 | docSet.docs.remove(d);
286 | d--;
287 | }
288 | System.out.println("doc size: " + docSet.docs.size());
289 | }
290 |
291 | private static void test() {
292 | double[] set = new double[5];
293 | ComUtil.print(set, " ", "\n");
294 | changeset(set);
295 | ComUtil.print(set, " ", "\n");
296 | System.exit(0);
297 | }
298 |
299 | private static void changeset(double[] set) {
300 | for (int i = 0; i < set.length; i++)
301 | set[i] += 1;
302 | }
303 |
304 | private static void getVotes(Documents docSet, String votePath)
305 | throws Exception {
306 | BufferedWriter writer = new BufferedWriter(new FileWriter(new File(
307 | votePath)));
308 |
309 | for (int d = 0; d < docSet.docs.size(); d++) {
310 | Document doc = docSet.docs.get(d);
311 | // System.out.println(doc.docName);
312 | // //System.out.println("tags" + doc.tags);
313 | // System.out.println("title" + doc.title);
314 | for (int n = 0; n < docSet.docs.get(d).docWords.length; n++) {
315 | // System.out.println(d + "\t" + n + "\t" +
316 | // docSet.indexToVoteMap.get(doc.votes[n]));
317 | writer.write(d + "\t" + n + "\t"
318 | + docSet.indexToVoteMap.get(doc.votes[n]) + "\n");
319 | // System.out.println("post vote: "
320 | // + docSet.indexToVoteMap.get(doc.votes[n]));
321 | // System.out.println("post tag: " +
322 | // docSet.indexToTagMap.get(doc.tags[n]));
323 | // for(int l = 0; l < docSet.docs.get(d).docWords[n].length;
324 | // l++){
325 | // System.out.print(doc.docWords[n][l] + " ");
326 | // //System.out.print("vote_" +
327 | // docSet.indexToVoteMap.get(doc.votes[n]) + " ");
328 | // //System.out.print("tag_" +
329 | // docSet.indexToTagMap.get(doc.tags[n]) + " ");
330 | // }
331 | // System.out.println();
332 | }
333 | writer.flush();
334 | }
335 | writer.close();
336 | }
337 | }
338 |
--------------------------------------------------------------------------------
/src/tem/main/TEMResPaperVisual.java:
--------------------------------------------------------------------------------
1 | package tem.main;
2 |
3 | import java.io.FileInputStream;
4 | import java.io.IOException;
5 | import java.io.ObjectInputStream;
6 |
7 | import tem.conf.PathConfig;
8 |
9 | public class TEMResPaperVisual {
10 |
11 | /**
12 | * @param args
13 | * @throws IOException
14 | * @throws Exception
15 | */
16 | public static void main(String[] args) throws IOException, Exception {
17 | // TODO Auto-generated method stub
18 | String modelFile = PathConfig.modelResPath + "ServerTEMRes/Model_E10_T15.model";
19 |
20 | //Get TEM model result
21 | TEMModel model = new TEMModel();
22 | // load model
23 | System.out.println("reading a class from : " + modelFile);
24 | FileInputStream fis = new FileInputStream(modelFile);
25 | ObjectInputStream ois = new ObjectInputStream(fis);
26 | model = (TEMModel) ois.readObject();
27 | ois.close();
28 | System.out.println(model.K);
29 | System.out.println(model.ENum);
30 | System.out.println("mu");
31 | for(int e = 0; e < model.ENum; e++){
32 | System.out.println(model.fgmm.p_mu[e][0]);
33 | }
34 | System.out.println("lambda");
35 | for(int e = 0; e < model.ENum; e++){
36 | System.out.println(model.fgmm.p_lambda[e][0]);
37 | }
38 |
39 |
40 | }
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/src/tem/main/TEMResProUserRecMergeU.java:
--------------------------------------------------------------------------------
1 | package tem.main;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.ArrayList;
6 | import java.util.HashSet;
7 | import java.util.Map;
8 | import java.util.Set;
9 | import java.util.TreeMap;
10 | import java.util.TreeSet;
11 |
12 | import tem.com.FileUtil;
13 | import tem.conf.PathConfig;
14 |
15 | /**
16 | * User Rec
17 | * Merge answers with the same user with one
18 | */
19 |
20 | public class TEMResProUserRecMergeU {
21 |
22 | /**
23 | * @param args
24 | * @throws IOException
25 | */
26 | public static void main(String[] args) throws IOException {
27 | // TODO Auto-generated method stub
28 | String ModelFileVoteResFolder = PathConfig.modelResPath + "USER" + PathConfig.minPostNum + "";
29 | ArrayList resLines = new ArrayList();
30 | ArrayList mergeLines = new ArrayList();
31 | Set QidAUseridSet = new TreeSet ();
32 | Map IDPairScoreMap = new TreeMap();
33 |
34 | for(File modelFVRfile : new File(ModelFileVoteResFolder).listFiles()){
35 | if(modelFVRfile.getName().contains("ModelFileVoteRes")){
36 | String mergeFileName = ModelFileVoteResFolder + "/MergeFiles/" + modelFVRfile.getName() + ".merge";
37 | System.out.println("mergeFileName " + mergeFileName);
38 | if(new File(mergeFileName).exists()){
39 | System.out.println(mergeFileName + "is existed! " );
40 | continue;
41 | }
42 | resLines.clear();
43 | QidAUseridSet.clear();
44 | IDPairScoreMap.clear();
45 | mergeLines.clear();;
46 | FileUtil.readLines(modelFVRfile.getAbsolutePath(), resLines);
47 | for(int i = 0; i < resLines.size(); i++){
48 | String[] tokens = resLines.get(i).split("\t");
49 | QidAUseridSet.add(tokens[0] + "\t" + tokens[1]);
50 | IDPairScoreMap.put(tokens[0] + "\t" + tokens[1], tokens[3] + "\t" + tokens[4] + "\t" + tokens[5]);
51 | }
52 | System.out.println("QidAUseridSet size: " + QidAUseridSet.size());
53 | for(String idPair : QidAUseridSet){
54 | double sum = 0;
55 | double count = 0;
56 | for(String resLine : resLines){
57 | String[] tokens = resLine.split("\t");
58 | String pairKey = tokens[0] + "\t" + tokens[1];
59 | if(idPair.equals(pairKey)){
60 | sum += Double.valueOf(tokens[2]);
61 | count ++;
62 | }
63 | }
64 | double averageVote = sum / count;
65 | mergeLines.add(idPair + "\t" + averageVote + "\t" + IDPairScoreMap.get(idPair));
66 | }
67 | FileUtil.writeLines(mergeFileName , mergeLines);
68 | mergeLines.clear();
69 | }
70 | }
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/src/tem/parser/Porter.java:
--------------------------------------------------------------------------------
1 | package tem.parser;
2 |
3 | import java.io.*;
4 |
5 | /* author: Fotis Lazarinis (actually I translated from C to Java)
6 | date: June 1997
7 | address: Psilovraxou 12, Agrinio, 30100
8 |
9 | comments: Compile it, import the Porter class into your program and create an instance.
10 | Then use the stripAffixes method of this class, which takes a String as
11 | input and returns the stem of this String, again as a String.
12 |
13 | */
14 |
/**
 * Mutable holder for a single string; {@code str} starts out empty.
 * The stemmer passes instances of it to helper methods that need to hand
 * a string back to their caller.
 */
class NewString {
    public String str = "";

    NewString() {
        // str is initialised by its field initializer.
    }
}
22 |
23 | public class Porter {
24 |
25 | private String Clean( String str ) {
26 | int last = str.length();
27 |
28 | Character ch = new Character( str.charAt(0) );
29 | String temp = "";
30 |
31 | for ( int i=0; i < last; i++ ) {
32 | if ( ch.isLetterOrDigit( str.charAt(i) ) )
33 | temp += str.charAt(i);
34 | }
35 |
36 | return temp;
37 | } //clean
38 |
39 | private boolean hasSuffix( String word, String suffix, NewString stem ) {
40 |
41 | String tmp = "";
42 |
43 | if ( word.length() <= suffix.length() )
44 | return false;
45 | if (suffix.length() > 1)
46 | if ( word.charAt( word.length()-2 ) != suffix.charAt( suffix.length()-2 ) )
47 | return false;
48 |
49 | stem.str = "";
50 |
51 | for ( int i=0; i 0 ) {
92 | if ( vowel(stem.charAt(i),stem.charAt(i-1)) )
93 | break;
94 | }
95 | else {
96 | if ( vowel(stem.charAt(i),'a') )
97 | break;
98 | }
99 | }
100 |
101 | for ( i++ ; i < length ; i++ ) {
102 | if ( i > 0 ) {
103 | if ( !vowel(stem.charAt(i),stem.charAt(i-1)) )
104 | break;
105 | }
106 | else {
107 | if ( !vowel(stem.charAt(i),'?') )
108 | break;
109 | }
110 | }
111 | if ( i < length ) {
112 | count++;
113 | i++;
114 | }
115 | } //while
116 |
117 | return(count);
118 | }
119 |
120 | private boolean containsVowel( String word ) {
121 |
122 | for (int i=0 ; i < word.length(); i++ )
123 | if ( i > 0 ) {
124 | if ( vowel(word.charAt(i),word.charAt(i-1)) )
125 | return true;
126 | }
127 | else {
128 | if ( vowel(word.charAt(0),'a') )
129 | return true;
130 | }
131 |
132 | return false;
133 | }
134 |
135 | private boolean cvc( String str ) {
136 | int length=str.length();
137 |
138 | if ( length < 3 )
139 | return false;
140 |
141 | if ( (!vowel(str.charAt(length-1),str.charAt(length-2)) )
142 | && (str.charAt(length-1) != 'w') && (str.charAt(length-1) != 'x') && (str.charAt(length-1) != 'y')
143 | && (vowel(str.charAt(length-2),str.charAt(length-3))) ) {
144 |
145 | if (length == 3) {
146 | if (!vowel(str.charAt(0),'?'))
147 | return true;
148 | else
149 | return false;
150 | }
151 | else {
152 | if (!vowel(str.charAt(length-3),str.charAt(length-4)) )
153 | return true;
154 | else
155 | return false;
156 | }
157 | }
158 |
159 | return false;
160 | }
161 |
162 | private String step1( String str ) {
163 |
164 | NewString stem = new NewString();
165 |
166 | if ( str.charAt( str.length()-1 ) == 's' ) {
167 | if ( (hasSuffix( str, "sses", stem )) || (hasSuffix( str, "ies", stem)) ){
168 | String tmp = "";
169 | for (int i=0; i 0 ) {
189 | String tmp = "";
190 | for (int i=0; i 0 ) {
270 | str = stem.str + suffixes[index][1];
271 | return str;
272 | }
273 | }
274 | }
275 |
276 | return str;
277 | }
278 |
279 | private String step3( String str ) {
280 |
281 | String[][] suffixes = { { "icate", "ic" },
282 | { "ative", "" },
283 | { "alize", "al" },
284 | { "alise", "al" },
285 | { "iciti", "ic" },
286 | { "ical", "ic" },
287 | { "ful", "" },
288 | { "ness", "" }};
289 | NewString stem = new NewString();
290 |
291 | for ( int index = 0 ; index 0 ) {
294 | str = stem.str + suffixes[index][1];
295 | return str;
296 | }
297 | }
298 | return str;
299 | }
300 |
301 | private String step4( String str ) {
302 |
303 | String[] suffixes = { "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent", "sion", "tion",
304 | "ou", "ism", "ate", "iti", "ous", "ive", "ize", "ise"};
305 |
306 | NewString stem = new NewString();
307 |
308 | for ( int index = 0 ; index 1 ) {
312 | str = stem.str;
313 | return str;
314 | }
315 | }
316 | }
317 | return str;
318 | }
319 |
320 | private String step5( String str ) {
321 |
322 | if ( str.charAt(str.length()-1) == 'e' ) {
323 | if ( measure(str) > 1 ) {/* measure(str)==measure(stem) if ends in vowel */
324 | String tmp = "";
325 | for ( int i=0; i 1) )
343 | if ( measure(str) > 1 ) {/* measure(str)==measure(stem) if ends in vowel */
344 | String tmp = "";
345 | for ( int i=0; i= 1 )
374 | str = step2( str );
375 | if ( str.length() >= 1 )
376 | str = step3( str );
377 | if ( str.length() >= 1 )
378 | str = step4( str );
379 | if ( str.length() >= 1 )
380 | str = step5( str );
381 |
382 | return str;
383 | }
384 |
385 |
386 | public String stripAffixes( String str ) {
387 |
388 | str = str.toLowerCase();
389 | str = Clean(str);
390 |
391 | if (( str != "" ) && (str.length() > 2)) {
392 | str = stripPrefixes(str);
393 |
394 | if (str != "" )
395 | str = stripSuffixes(str);
396 |
397 | }
398 |
399 | return str;
400 | } //stripAffixes
401 |
402 | } //class
403 |
--------------------------------------------------------------------------------
/src/tem/parser/StanfordTokenizer.java:
--------------------------------------------------------------------------------
1 | package tem.parser;
2 |
3 | import java.io.File;
4 | import java.io.FileReader;
5 | import java.io.FileWriter;
6 | import java.io.IOException;
7 | import java.io.Reader;
8 | import java.io.StringReader;
9 | import java.util.Iterator;
10 | import java.util.LinkedList;
11 | import java.util.List;
12 | import java.util.regex.Matcher;
13 | import java.util.regex.Pattern;
14 |
15 | import edu.stanford.nlp.ling.CoreLabel;
16 | import edu.stanford.nlp.ling.HasWord;
17 | import edu.stanford.nlp.process.CoreLabelTokenFactory;
18 | import edu.stanford.nlp.process.DocumentPreprocessor;
19 | import edu.stanford.nlp.process.PTBTokenizer;
20 |
21 | /**Token sentences in a file or a String sentences
22 | * @author liuyang
23 | * @mail yangliuyx@gmail.com
24 | */
25 | public class StanfordTokenizer {
26 |
27 | public static File tokenizeFile(File file) throws IOException {
28 | String tokenizedFileName = file.getAbsolutePath() + "_tokenized";
29 | FileWriter tokenizedFileWriter = new FileWriter(tokenizedFileName);
30 | DocumentPreprocessor dp = new DocumentPreprocessor(file.getAbsolutePath());
31 | int CurrentSentIndex = 1;
32 | int tokenizedSentCounter = 1;
33 | for (List sentence : dp) {
34 | for(int i = 0; i < sentence.size(); i++){
35 | if(i == 0){
36 | Pattern p = Pattern.compile("[0-9]+");
37 | if(p.matcher(sentence.get(i).toString()).matches()){
38 | tokenizedFileWriter.append(tokenizedSentCounter + "\t" + sentence.get(i) + "\t");
39 | CurrentSentIndex = Integer.valueOf(sentence.get(i).toString());
40 | } else {
41 | System.out.println(tokenizedSentCounter + "\t" + CurrentSentIndex +"\t" + sentence.get(i) + " ");
42 | tokenizedFileWriter.append(tokenizedSentCounter + "\t" + CurrentSentIndex +"\t" + sentence.get(i) + " ");
43 | }
44 | } else {
45 | tokenizedFileWriter.append(sentence.get(i) + " ");
46 | }
47 | }
48 | tokenizedSentCounter++;
49 | tokenizedFileWriter.append("\n");
50 | tokenizedFileWriter.flush();
51 | }
52 | return new File(tokenizedFileName);
53 | }
54 |
55 | public static List tokenizeSents(String sents){
56 | Reader reader = new StringReader(sents);
57 | DocumentPreprocessor dp = new DocumentPreprocessor(reader);
58 |
59 | List sentenceList = new LinkedList();
60 | Iterator> it = dp.iterator();
61 | while (it.hasNext()) {
62 | StringBuilder sentenceSb = new StringBuilder();
63 | List sentence = it.next();
64 | for (HasWord token : sentence) {
65 | if(sentenceSb.length()>1) {
66 | sentenceSb.append(" ");
67 | }
68 | sentenceSb.append(token);
69 | }
70 | sentenceList.add(sentenceSb.toString());
71 | }
72 | return sentenceList;
73 | }
74 | }
--------------------------------------------------------------------------------
/src/tem/script/DBConnection.java:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/src/tem/script/DBConnection.java
--------------------------------------------------------------------------------
/src/tem/script/ExportExpCorpusFromDB.java:
--------------------------------------------------------------------------------
1 | package tem.script;
2 |
3 | import java.io.File;
4 | import java.sql.ResultSet;
5 | import java.sql.SQLException;
6 | import java.util.ArrayList;
7 |
8 | import tem.com.FileUtil;
9 | import tem.conf.PathConfig;
10 | import tem.script.DBConnection;
11 |
12 | /**Export users and posts data from stackoverflow database
13 | * @author yangliu
14 | * @blog http://blog.csdn.net/yangliuy
15 | * @mail yangliuyx@gmail.com
16 | */
17 | public class ExportExpCorpusFromDB {
18 |
19 | /**
20 | * @param args
21 | * @throws SQLException
22 | */
23 | public static void main(String[] args) throws SQLException {
24 | // TODO Auto-generated method stub
25 | String[] minPostNums = {"30"};
26 | final DBConnection db = new DBConnection();
27 | db.getConn();
28 | for(String minPostNum : minPostNums){
29 | String userIDFile = PathConfig.scriptDataPath + "USERID" + minPostNum;
30 | /*String sql = "select owneruserid from (select owneruserid, count(posts.id) as postNum from posts"
31 | + " where posts.creationdate > '2009-05-01 00:00:00' and posts.creationdate < '2009-08-01 00:00:00' group by owneruserid) as newt where newt.postNum > " + minPostNum + ";";
32 | ResultSet rs = db.executeQuery(sql);
33 | ArrayList userIDs = new ArrayList();
34 | while(rs.next()){
35 | int userID = rs.getInt("owneruserid");
36 | if(userID != 0){
37 | userIDs.add(String.valueOf(userID));
38 | }
39 | }
40 | System.out.println("userIDs size : " + userIDs.size());
41 | FileUtil.writeLines(userIDFile, userIDs);*/
42 | String sql = "";
43 | ArrayList userIDs = new ArrayList();
44 | FileUtil.readLines(userIDFile, userIDs);
45 | String oriDataFolder = PathConfig.originalDataPath + "USER" + minPostNum;
46 | if(!new File(oriDataFolder).exists()){
47 | new File(oriDataFolder).mkdir();
48 | }
49 | String oriDataUserIDFile = oriDataFolder + "/user.IDs";
50 | FileUtil.writeLines(oriDataUserIDFile, userIDs);
51 |
52 | String oriDataUserInforFile = oriDataFolder + "/user.Infors";
53 | ArrayList userInforLines = new ArrayList();
54 | String postFolder = oriDataFolder + "/posts";
55 | ArrayList postsLines = new ArrayList();
56 | if(!new File(postFolder).exists()){
57 | new File(postFolder).mkdir();
58 | }
59 |
60 | for(String userID : userIDs){
61 | String userPostsFile = postFolder + "/" + userID +".posts";
62 | if(new File(userPostsFile).exists()){
63 | System.out.println(userPostsFile + " is existed! ");
64 | continue;
65 | }
66 | sql = "select * from posts where owneruserid = '" + userID + "' and posts.creationdate > '2009-05-01 00:00:00' and posts.creationdate < '2009-08-01 00:00:00';";
67 | ResultSet rs = db.executeQuery(sql);
68 | while(rs.next()){
69 | String postsLine = rs.getInt("ID") + "\t" + rs.getInt("POSTTYPEID")
70 | + "\t" + rs.getInt("PARENTID") + "\t" + rs.getInt("ACCEPTEDANSWERID") + "\t" + rs.getString("CREATIONDATE")
71 | + "\t" + rs.getInt("SCORE") + "\t" + rs.getInt("VIEWCOUNT") + "\t" + (rs.getString("BODY") == null ? "null": rs.getString("BODY").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs.getInt("OWNERUSERID")
72 | + "\t" + rs.getInt("LASTEDITORUSERID") + "\t" + rs.getString("LASTEDITORDISPLAYNAME") + "\t" + rs.getString("LASTEDITDATE")
73 | + "\t" + rs.getString("LASTACTIVITYDATE") + "\t" + rs.getString("COMMUNITYOWNEDDATE") + "\t" + rs.getString("CLOSEDDATE")
74 | + "\t" + (rs.getString("TITLE") == null?"null":rs.getString("TITLE".replaceAll("[\n-\r-\t]", " ")))+ "\t" + rs.getString("TAGS") + "\t" + rs.getInt("ANSWERCOUNT")
75 | + "\t" + rs.getInt("COMMENTCOUNT") + "\t" + rs.getInt("FAVORITECOUNT");
76 | postsLines.add(postsLine);
77 | }
78 |
79 | FileUtil.writeLines(userPostsFile, postsLines);
80 | postsLines.clear();
81 | sql = "select * from users where id = '" + userID + "';";
82 | rs = db.executeQuery(sql);
83 | while(rs.next()){
84 | String userInforLine = rs.getInt("ID") + "\t" + rs.getInt("REPUTATION") + "\t" + rs.getString("CREATIONDATE")
85 | + "\t" + rs.getString("DISPLAYNAME") + "\t" + rs.getString("EMAILHASH")
86 | + "\t" + rs.getString("LASTACCESSDATE") + "\t" + rs.getString("WEBSITEURL") + "\t" + rs.getString("LOCATION")
87 | + "\t" + rs.getInt("AGE") + "\t" + (rs.getString("ABOUTME") == null?"null":rs.getString("ABOUTME").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs.getInt("VIEWS")
88 | + "\t" + rs.getInt("UPVOTES") + "\t" + rs.getInt("DOWNVOTES");
89 | //System.out.println("userInforLine: " + userInforLine);
90 | userInforLines.add(userInforLine);
91 | }
92 | rs.close();
93 | }
94 | FileUtil.writeLines(oriDataUserInforFile, userInforLines);
95 | }
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/src/tem/script/ExportGraphMatrix.java:
--------------------------------------------------------------------------------
1 | package tem.script;
2 |
3 | import java.sql.ResultSet;
4 | import java.sql.SQLException;
5 | import java.util.ArrayList;
6 | import java.util.HashMap;
7 | import java.util.Map;
8 |
9 | import tem.com.FileUtil;
10 | import tem.conf.PathConfig;
11 |
12 | public class ExportGraphMatrix {
13 | private static int[][] QAGraph;
14 | private static int userNum;
15 | private static ArrayList indexToUserIDMap;
16 | private static Map userIDToIndexMap;
17 |
18 | /**
19 | * @param args
20 | * @throws SQLException
21 | */
22 | public static void main(String[] args) throws SQLException {
23 | // TODO Auto-generated method stub
24 | final DBConnection db = new DBConnection();
25 | String minPostNum = "80";
26 | db.getConn();
27 | String sql = "";
28 | String userIDFile = PathConfig.originalDataPath + "USER" + minPostNum + "/user.IDs";
29 | String postFolder = PathConfig.originalDataPath + "USER" + minPostNum + "/posts/";
30 | String askerFolder = PathConfig.originalDataPath + "USER" + minPostNum + "/askers/";
31 | String graphDataFile = PathConfig.originalDataPath + "USER" + minPostNum + "/userVoteWeighted.QAgraph";
32 | ArrayList postLines = new ArrayList();
33 |
34 | ArrayList userIDs = new ArrayList();
35 | FileUtil.readLines(userIDFile, userIDs);
36 | buildIndexUserID(userIDs);
37 | QAGraph = new int[userNum][userNum];
38 | ArrayList askerLines = new ArrayList();
39 |
40 | for(int i = 0; i < userNum; i++){
41 | System.out.println("i = " + i);
42 | String postFile = postFolder + userIDs.get(i) + ".posts";
43 | //String askerFile = askerFolder + userIDs.get(i) + ".askers";
44 | //if(new File(askerFile).exists()){
45 | //System.out.println(askerFile + "is exists!");
46 | //continue;
47 | //}
48 | postLines.clear();
49 | FileUtil.readLines(postFile, postLines);
50 |
51 | askerLines.clear();
52 | //System.out.println("after clear, askerLines size: " + askerLines.size());
53 | for(String postLine : postLines){
54 | String [] postTokens = postLine.split("\t");
55 | if(postTokens[1].equals("2")){
56 | String parentID = postTokens[2];
57 | String askerID = getAuthorIDbyPostID(parentID, db);
58 | String vote = postTokens[5];
59 | //System.out.println("vote " + vote);
60 | //askerLines.add(askerID);
61 | //System.out.println("add, askerLines size: " + askerLines.size());
62 |
63 | //Answer count weighted graph
64 | if(userIDToIndexMap.containsKey(askerID)){
65 | QAGraph[Integer.valueOf(userIDToIndexMap.get(askerID))][Integer.valueOf(userIDToIndexMap.get(userIDs.get(i)))] += Integer.valueOf(vote);
66 | }
67 | } else {
68 | //askerLines.add("self");
69 | }
70 | }
71 | //FileUtil.writeLines(askerFile, askerLines);
72 | //System.out.println("before clear, askerLines size: " + askerLines.size());
73 |
74 | //System.out.println("after clear, askerLines size: " + askerLines.size());
75 | }
76 | printQAGraph(graphDataFile);
77 | db.close();
78 | }
79 |
80 | private static void printQAGraph(String graphDataFile) {
81 | // TODO Auto-generated method stub
82 | ArrayList QAGLines = new ArrayList();
83 | for(int i = 0; i < QAGraph.length; i++){
84 | String line = "";
85 | for(int j = 0; j < QAGraph[i].length; j++){
86 | line += QAGraph[i][j] + "\t";
87 | }
88 | QAGLines.add(line);
89 | }
90 | FileUtil.writeLines(graphDataFile, QAGLines);
91 | }
92 |
93 | private static String getAuthorIDbyPostID(String postID, DBConnection db) throws SQLException {
94 | // TODO Auto-generated method stub
95 | String sql = "select * from posts where id = "+ postID;
96 | ResultSet rs = db.executeQuery(sql);
97 | String authorID = "";
98 | while(rs.next()){
99 | authorID = rs.getString(9);
100 | }
101 | rs.close();
102 | return authorID;
103 | }
104 |
105 | private static void buildIndexUserID(ArrayList userIDs) {
106 | // TODO Auto-generated method stub
107 | indexToUserIDMap = new ArrayList();
108 | userIDToIndexMap = new HashMap();
109 |
110 | for(int i = 0; i < userIDs.size(); i++){
111 | indexToUserIDMap.add(userIDs.get(i));
112 | userIDToIndexMap.put(userIDs.get(i), String.valueOf(i));
113 | }
114 | userNum = userIDs.size();
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/src/tem/script/ExportTagsFromDB.java:
--------------------------------------------------------------------------------
1 | package tem.script;
2 |
3 | import java.io.File;
4 | import java.sql.ResultSet;
5 | import java.sql.SQLException;
6 | import java.util.ArrayList;
7 |
8 | import tem.com.FileUtil;
9 | import tem.conf.PathConfig;
10 | import tem.script.DBConnection;
11 |
12 | /**Export tags for each post from stackoverflow database
13 | * @author yangliu
14 | * @blog http://blog.csdn.net/yangliuy
15 | * @mail yangliuyx@gmail.com
16 | */
17 | public class ExportTagsFromDB {
18 |
19 | /**
20 | * @param args
21 | * @throws SQLException
22 | */
23 | public static void main(String[] args) throws SQLException {
24 | // TODO Auto-generated method stub
25 | String[] minPostNums = {"30"};
26 | final DBConnection db = new DBConnection();
27 | //ResultSet rs;
28 | db.getConn();
29 | String sql = "";
30 | StringBuffer sb = new StringBuffer();
31 | for(String minPostNum : minPostNums){
32 | String userIDFile = PathConfig.scriptDataPath + "USERID" + minPostNum;
33 | sql = "";
34 | ArrayList userIDs = new ArrayList();
35 | FileUtil.readLines(userIDFile, userIDs);
36 | String oriDataFolder = PathConfig.originalDataPath + "USER" + minPostNum;
37 | String postFolder = oriDataFolder + "/posts";
38 | String tagFolder = oriDataFolder + "/tags";
39 | ArrayList postsLines = new ArrayList();
40 | ArrayList tagsLines = new ArrayList();
41 | if(!new File(tagFolder).exists()){
42 | new File(tagFolder).mkdir();
43 | }
44 |
45 | for(String userID : userIDs){
46 | String userTagsFile = tagFolder + "/" + userID + ".tags";
47 | System.out.println("Now tag file is: " + userTagsFile);
48 | if(new File(userTagsFile).exists()){
49 | System.out.println(userTagsFile + "is existed!");
50 | continue;
51 | }
52 |
53 | String userPostsFile = postFolder + "/" + userID +".posts";
54 | FileUtil.readLines(userPostsFile, postsLines);
55 | for(String postLine : postsLines){
56 | String[] postLineTokens = postLine.split("\t");
57 | if(postLineTokens.length != 20){
58 | System.err.println("format error : " + postLine);
59 | tagsLines.add(postLineTokens[0] + "\t" + "null");
60 | continue;
61 | }
62 | String postTypeID = postLineTokens[1];
63 | if(postTypeID.equals("1")){
64 | tagsLines.add(postLineTokens[0] + "\t" + postLineTokens[16]);
65 | } else {
66 | String parentID = postLineTokens[2];
67 | //Use StringBuffer instead of add Strings
68 | sb.delete(0, sb.length());
69 | sb.append("select * from posts where id = '");
70 | sb.append(parentID);
71 | sb.append("';");
72 | sql = sb.toString();
73 | //System.out.println("sql builder: " + sql);
74 | ResultSet rs = db.executeQuery(sql);
75 | while(rs.next()){
76 | tagsLines.add(postLineTokens[0] + "\t" + rs.getString("TAGS"));
77 | }
78 | rs.close();
79 | }
80 | }
81 |
82 | FileUtil.writeLines(userTagsFile, tagsLines);
83 | postsLines.clear();
84 | tagsLines.clear();
85 | }
86 | }
87 | db.close();
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/src/tem/script/ExportTestDataForRank.java:
--------------------------------------------------------------------------------
1 | package tem.script;
2 |
3 | import java.sql.ResultSet;
4 | import java.sql.SQLException;
5 | import java.util.ArrayList;
6 |
7 | import tem.com.FileUtil;
8 | import tem.conf.PathConfig;
9 |
10 | /**Export Test Data for Rank answers/experts
11 | * @author yangliu
12 | * @blog http://blog.csdn.net/yangliuy
13 | * @mail yangliuyx@gmail.com
14 | */
15 |
16 | public class ExportTestDataForRank {
17 |
18 | /* @param args
19 | * @throws SQLException
20 | */
21 | public static void main(String[] args) throws SQLException {
22 | // TODO Auto-generated method stub
23 | final DBConnection db = new DBConnection();
24 | String minPostNum = "80";
25 | db.getConn();
26 | String sql = "";
27 | String userIDFile = PathConfig.scriptDataPath + "USERID" + minPostNum;
28 | ArrayList userIDs = new ArrayList();
29 | FileUtil.readLines(userIDFile, userIDs);
30 | String testDataFolder = PathConfig.testDataPath;
31 | ArrayList questionLines = new ArrayList();
32 | ArrayList answerLines = new ArrayList();
33 | ArrayList questionIDs = new ArrayList();
34 | String questionFile = testDataFolder + "testData.questions";
35 | String questionIDFile = testDataFolder + "testDataQuestions.id";
36 | FileUtil.readLines(questionIDFile, questionIDs);
37 |
38 | for(String questionIDLine : questionIDs){
39 | String questionID = questionIDLine.split("\t")[1];
40 | sql = "select * from posts where id = "+ questionID;
41 | ResultSet rs = db.executeQuery(sql);
42 | while(rs.next()){
43 | questionLines.add(rs.getInt("ID") + "\t" + rs.getInt("POSTTYPEID")
44 | + "\t" + rs.getInt("PARENTID") + "\t" + rs.getInt("ACCEPTEDANSWERID") + "\t" + rs.getString("CREATIONDATE")
45 | + "\t" + rs.getInt("SCORE") + "\t" + rs.getInt("VIEWCOUNT") + "\t" + (rs.getString("BODY") == null ? "null": rs.getString("BODY").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs.getInt("OWNERUSERID")
46 | + "\t" + rs.getInt("LASTEDITORUSERID") + "\t" + rs.getString("LASTEDITORDISPLAYNAME") + "\t" + rs.getString("LASTEDITDATE")
47 | + "\t" + rs.getString("LASTACTIVITYDATE") + "\t" + rs.getString("COMMUNITYOWNEDDATE") + "\t" + rs.getString("CLOSEDDATE")
48 | + "\t" + (rs.getString("TITLE") == null?"null":rs.getString("TITLE".replaceAll("[\n-\r-\t]", " ")))+ "\t" + rs.getString("TAGS") + "\t" + rs.getInt("ANSWERCOUNT")
49 | + "\t" + rs.getInt("COMMENTCOUNT") + "\t" + rs.getInt("FAVORITECOUNT"));
50 | }
51 |
52 | /*for(String userID : userIDs){
53 | sql = "select * from posts where posts.creationdate > '2009-08-01 00:00:00'" +
54 | " and posts.creationdate < '2009-11-01 00:00:00' and posts.posttypeid = 1 " +
55 | "and answercount > 5 and owneruserid = " + userID;
56 | ResultSet rs = db.executeQuery(sql);
57 | while(rs.next()){
58 | System.out.println("userID: " + userID +
59 | " question id: " + rs.getInt(1) +
60 | " answercount: " + rs.getInt(18) +
61 | "question tag: " + rs.getString(17) +
62 | " question title: " + rs.getString(16) );
63 | questionLines.add(rs.getInt("ID") + "\t" + rs.getInt("POSTTYPEID")
64 | + "\t" + rs.getInt("PARENTID") + "\t" + rs.getInt("ACCEPTEDANSWERID") + "\t" + rs.getString("CREATIONDATE")
65 | + "\t" + rs.getInt("SCORE") + "\t" + rs.getInt("VIEWCOUNT") + "\t" + (rs.getString("BODY") == null ? "null": rs.getString("BODY").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs.getInt("OWNERUSERID")
66 | + "\t" + rs.getInt("LASTEDITORUSERID") + "\t" + rs.getString("LASTEDITORDISPLAYNAME") + "\t" + rs.getString("LASTEDITDATE")
67 | + "\t" + rs.getString("LASTACTIVITYDATE") + "\t" + rs.getString("COMMUNITYOWNEDDATE") + "\t" + rs.getString("CLOSEDDATE")
68 | + "\t" + (rs.getString("TITLE") == null?"null":rs.getString("TITLE".replaceAll("[\n-\r-\t]", " ")))+ "\t" + rs.getString("TAGS") + "\t" + rs.getInt("ANSWERCOUNT")
69 | + "\t" + rs.getInt("COMMENTCOUNT") + "\t" + rs.getInt("FAVORITECOUNT"));*/
70 | System.out.println("questionID " + questionID);
71 | String answerFile = testDataFolder + questionID + ".answers";
72 | sql = "select * from posts where posts.posttypeid = 2 and parentid = "+ questionID;
73 | ResultSet rs2 = db.executeQuery(sql);
74 | while(rs2.next()){
75 | answerLines.add(rs2.getInt("ID") + "\t" + rs2.getInt("POSTTYPEID")
76 | + "\t" + rs2.getInt("PARENTID") + "\t" + rs2.getInt("ACCEPTEDANSWERID") + "\t" + rs2.getString("CREATIONDATE")
77 | + "\t" + rs2.getInt("SCORE") + "\t" + rs2.getInt("VIEWCOUNT") + "\t" + (rs2.getString("BODY") == null ? "null": rs2.getString("BODY").replaceAll("[\n-\r-\t]", " ")) + "\t" + rs2.getInt("OWNERUSERID")
78 | + "\t" + rs2.getInt("LASTEDITORUSERID") + "\t" + rs2.getString("LASTEDITORDISPLAYNAME") + "\t" + rs2.getString("LASTEDITDATE")
79 | + "\t" + rs2.getString("LASTACTIVITYDATE") + "\t" + rs2.getString("COMMUNITYOWNEDDATE") + "\t" + rs2.getString("CLOSEDDATE")
80 | + "\t" + (rs2.getString("TITLE") == null?"null":rs2.getString("TITLE".replaceAll("[\n-\r-\t]", " ")))+ "\t" + rs2.getString("TAGS") + "\t" + rs2.getInt("ANSWERCOUNT")
81 | + "\t" + rs2.getInt("COMMENTCOUNT") + "\t" + rs2.getInt("FAVORITECOUNT"));
82 | }
83 | FileUtil.writeLines(answerFile, answerLines);
84 | answerLines.clear();
85 | }
86 | FileUtil.writeLines(questionFile, questionLines);
87 | db.close();
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/src/tem/script/HandleTagTest.java:
--------------------------------------------------------------------------------
1 | package tem.script;
2 |
public class HandleTagTest {

    /**
     * Small scratch program for tag-string handling: replaces every
     * {@code <} and {@code >} in a tag string with a space, splits the result
     * on single spaces, prints the number of pieces and then each piece with
     * any remaining spaces removed.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        String tags1 = "";
        String tags2 = "";

        // Turn the angle-bracket delimiters into plain spaces before splitting.
        String spaced = tags1.replaceAll("[<>]", " ");
        String[] tags = spaced.split(" ");
        System.out.println(tags.length);

        for (int i = 0; i < tags.length; i++) {
            String cleaned = tags[i].replace(" ", "");
            System.out.println(cleaned);
        }
    }

}
21 |
--------------------------------------------------------------------------------
/src/tem/script/JAMATest.java:
--------------------------------------------------------------------------------
1 | package tem.script;
2 |
3 | /**Standard PageRank Algorithm
4 | * @author yangliu
5 | * @blog http://blog.csdn.net/yangliuy
6 | * @mail yangliuyx@gmail.com
7 | */
8 |
9 | import java.util.ArrayList;
10 | import java.util.List;
11 | import java.util.Random;
12 |
13 | import Jama.Matrix;
14 |
15 | public class JAMATest {
16 | private static final double LAMBDA = 0.5;
17 | private static final double THRESHOLD = 0.0000001;
18 |
19 | public static void main(String[] args) {
20 | System.out.println("lambda is " + LAMBDA);
21 | //Both randomly initialise or set all 1 are OK
22 | //PR0 = getInitPR0(3);
23 | double[] PR0Array = new double[3];
24 | for(int i = 0; i < 3; i++){
25 | PR0Array[i] = 1;
26 | }
27 |
28 | Matrix PR0 = new Matrix (PR0Array, 1);
29 | System.out.println("Initial state vector PR0 is:");
30 | PR0.print(3, 3);
31 |
32 | System.out.println("Page Rank Update Matrix newPR:");
33 | getNewPR(LAMBDA).print(3, 3);
34 |
35 | Matrix pageRank = calPageRank(PR0, LAMBDA);
36 | System.out.println("Final PageRank is:");
37 | pageRank.print(3, 3);
38 | System.out.println();
39 | }
40 |
41 | /**
42 | * Randomly Initialise state vector PR0
43 | *
44 | * @param n
45 | * dimension of vector PR0
46 | * @return A random vector, each dimension is 0-5
47 | */
48 | public static List getInitPR0(int n) {
49 | Random random = new Random();
50 | List q = new ArrayList();
51 | for (int i = 0; i < n; i++) {
52 | q.add(new Double(5 * random.nextDouble()));
53 | }
54 | return q;
55 | }
56 |
57 | /**
58 | * Compute Euclidean Distance
59 | *
60 | * @param q1
61 | *
62 | * @param q2
63 | *
64 | * @return distance
65 | */
66 | public static double calDistance(Matrix q1, Matrix q2) {
67 | double sum = 0;
68 |
69 | if (q1.getColumnDimension() != q2.getColumnDimension() ) {
70 | return -1;
71 | }
72 |
73 | for (int i = 0; i < q1.getColumnDimension() ; i++) {
74 | sum += Math.pow(q1.get(0, i) - q2.get(0, i),
75 | 2);
76 | }
77 | return Math.sqrt(sum);
78 | }
79 |
80 | /**
81 | * compute pagerank
82 | *
83 | * @param PR0
84 | * Initialise state vector
85 | * @param lambda
86 | * lambda
87 | * @return pagerank result
88 | */
89 | public static Matrix calPageRank(Matrix PR0, double lambda) {
90 |
91 | Matrix newPR = getNewPR(lambda);
92 | Matrix PR;
93 | while (true) {
94 | PR = PR0.times(newPR);
95 | double dis = calDistance(PR, PR0);//PR0 store PR vector after last iteration
96 | System.out.println("distance:" + dis);
97 | if (dis <= THRESHOLD) {
98 | System.out.println("PR:");
99 | PR.print(3, 3);
100 | break;
101 | }
102 | PR0 = PR;
103 | }
104 | return PR;
105 | }
106 |
107 | /**
108 | * compute NEWPR matrix
109 | *
110 | * @param lambda
111 | *
112 | * @return NEWPR matrix
113 | */
114 | public static Matrix getNewPR(double lambda) {
115 |
116 | int V = getM().getColumnDimension();
117 | Matrix add1 = getM().times(lambda);
118 | Matrix add2 = getU().times((1 - lambda) / V);
119 | Matrix newPR = add1.plus(add2);
120 | return newPR;
121 | }
122 |
123 | /**
124 | * Initialise transition matrix M
125 | *
126 | * @return M
127 | */
128 | public static Matrix getM() {
129 | double[][] m = new double[3][3];
130 |
131 | m[0][0] = 0;
132 | m[0][1] = 1;
133 | m[0][2] = 0;
134 |
135 | m[1][0] = 0.5;
136 | m[1][1] = 0;
137 | m[1][2] = 0.5;
138 |
139 | m[2][0] = 0;
140 | m[2][1] = 1;
141 | m[2][2] = 0;
142 |
143 | Matrix M = new Matrix(m);
144 |
145 | return M;
146 | }
147 |
148 | /**
149 | * Initialise Matrix U
150 | *
151 | * @return U
152 | */
153 | public static Matrix getU() {
154 |
155 | double[][] u = new double[3][3];
156 |
157 | u[0][0] = 1;
158 | u[0][1] = 1;
159 | u[0][2] = 1;
160 |
161 | u[1][0] = 1;
162 | u[1][1] = 1;
163 | u[1][2] = 1;
164 |
165 | u[2][0] = 1;
166 | u[2][1] = 1;
167 | u[2][2] = 1;
168 |
169 | Matrix U = new Matrix(u);
170 |
171 | return U;
172 | }
173 | }
174 |
--------------------------------------------------------------------------------
/src/tem/script/MergeUser10.java:
--------------------------------------------------------------------------------
1 | package tem.script;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Set;
5 | import java.util.TreeSet;
6 |
7 | import tem.com.FileUtil;
8 |
9 | public class MergeUser10 {
10 |
11 | /**
12 | * @param args
13 | */
14 | public static void main(String[] args) {
15 | // TODO Auto-generated method stub
16 | String data10Path = "data/originalData/ThreeM09/User80MergeUser10/similarQ/User10/";
17 | String data80Path = "data/originalData/ThreeM09/User80MergeUser10/";
18 | ArrayList data10IDs = new ArrayList();
19 | ArrayList data80IDs = new ArrayList();
20 | ArrayList allIds = new ArrayList();
21 | ArrayList overLapIDLines = new ArrayList();
22 | FileUtil.readLines(data10Path + "users.IDs", data10IDs);
23 | FileUtil.readLines(data80Path + "user.IDs", data80IDs);
24 | allIds.addAll(data80IDs);
25 |
26 | //Find overlap userIDs
27 | for(String userID10 : data10IDs){
28 | String data10PathPost = data10Path + "posts/" + userID10 + ".posts";
29 | if(data80IDs.contains(userID10.trim())){
30 | //overlap
31 | System.out.println("voerlap id: " + userID10);
32 | overLapIDLines.add(userID10);
33 | } else{
34 | allIds.add(userID10);
35 | String newData80PathPost = data80Path + "posts/" + userID10 + ".posts";
36 | FileUtil.copyFile(data10PathPost, newData80PathPost);
37 | }
38 | }
39 | FileUtil.writeLines(data80Path + "overlapIDs", overLapIDLines);
40 |
41 | FileUtil.writeLines(data80Path + "allUserIDs", allIds);
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/tem/script/PageRank2.java:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yangliuy/TopicExpertiseModel/0f918d78a420b9cf03834576d6595060e64f8a56/src/tem/script/PageRank2.java
--------------------------------------------------------------------------------
/src/tem/script/PageRankYL.java:
--------------------------------------------------------------------------------
1 | package tem.script;
2 |
3 | /**Standard PageRank Algorithm
4 | * @author yangliu
5 | * @blog http://blog.csdn.net/yangliuy
6 | * @mail yangliuyx@gmail.com
7 | */
8 |
9 | import java.util.ArrayList;
10 | import java.util.List;
11 | import java.util.Random;
12 |
13 | public class PageRankYL {
14 | private static final double LAMBDA = 0.5;
15 | private static final double THRESHOLD = 0.0000001;
16 |
17 | public static void main(String[] args) {
18 | System.out.println("lambda is " + LAMBDA);
19 | List PR0 = new ArrayList();
20 | //Both randomly initialise or set all 1 are OK
21 | //PR0 = getInitPR0(3);
22 | PR0.add(new Double(1));
23 | PR0.add(new Double(1));
24 | PR0.add(new Double(1));
25 | System.out.println("Initial state vector PR0 is:");
26 | printVec(PR0);
27 | System.out.println("Page Rank Update Matrix newPR:");
28 | printMatrix(getNewPR(LAMBDA));
29 | List pageRank = calPageRank(PR0, LAMBDA);
30 | System.out.println("Final PageRank is:");
31 | printVec(pageRank);
32 | System.out.println();
33 | }
34 |
35 | public static void printMatrix(List> m) {
36 | for (int i = 0; i < m.size(); i++) {
37 | for (int j = 0; j < m.get(i).size(); j++) {
38 | System.out.print(m.get(i).get(j) + ", ");
39 | }
40 | System.out.println();
41 | }
42 | }
43 |
44 |
45 | public static void printVec(List v) {
46 | for (int i = 0; i < v.size(); i++) {
47 | System.out.print(v.get(i) + ", ");
48 | }
49 | System.out.println();
50 | }
51 |
52 | /**
53 | * Randomly Initialise state vector PR0
54 | *
55 | * @param n
56 | * dimension of vector PR0
57 | * @return A random vector, each dimension is 0-5
58 | */
59 | public static List getInitPR0(int n) {
60 | Random random = new Random();
61 | List q = new ArrayList();
62 | for (int i = 0; i < n; i++) {
63 | q.add(new Double(5 * random.nextDouble()));
64 | }
65 | return q;
66 | }
67 |
68 | /**
69 | * Compute Euclidean Distance
70 | *
71 | * @param q1
72 | *
73 | * @param q2
74 | *
75 | * @return distance
76 | */
77 | public static double calDistance(List q1, List q2) {
78 | double sum = 0;
79 |
80 | if (q1.size() != q2.size()) {
81 | return -1;
82 | }
83 |
84 | for (int i = 0; i < q1.size(); i++) {
85 | sum += Math.pow(q1.get(i).doubleValue() - q2.get(i).doubleValue(),
86 | 2);
87 | }
88 | return Math.sqrt(sum);
89 | }
90 |
91 | /**
92 | * compute pagerank
93 | *
94 | * @param PR0
95 | * Initialise state vector
96 | * @param lambda
97 | * lambda
98 | * @return pagerank result
99 | */
100 | public static List calPageRank(List PR0, double lambda) {
101 |
102 | List> newPR = getNewPR(lambda);
103 | List PR = null;
104 | while (true) {
105 | PR = vectorMulMatrix(PR0, newPR);
106 | double dis = calDistance(PR, PR0);//PR0 store PR vector after last iteration
107 | System.out.println("distance:" + dis);
108 | if (dis <= THRESHOLD) {
109 | System.out.println("PR0:");
110 | printVec(PR0);
111 | System.out.println("PR:");
112 | printVec(PR);
113 | break;
114 | }
115 | PR0 = PR;
116 | }
117 | return PR;
118 | }
119 |
120 | /**
121 | * compute NEWPR matrix
122 | *
123 | * @param lambda
124 | *
125 | * @return NEWPR matrix
126 | */
127 | public static List> getNewPR(double lambda) {
128 |
129 | int V = getM().size();
130 | List> add1 = numberMulMatrix(getM(), lambda);
131 | List> add2 = numberMulMatrix(getU(), (1 - lambda) / V);
132 | List> newPR = addMatrix(add1, add2);
133 | return newPR;
134 | }
135 |
136 | /**
137 | * Compute the product of a vector and a matrix
138 | *
139 | * @param v
140 | * a vector
141 | * @param m
142 | * a matrix
143 | * @return a new vector
144 | */
145 | public static List vectorMulMatrix(List v, List> m
146 | ) {
147 |
148 | if (m == null || v == null || m.size() <= 0
149 | || m.get(0).size() != v.size()) {
150 | return null;
151 | }
152 |
153 | List list = new ArrayList