├── .gitignore
├── LDAdata
├── model-final.others
├── model-final.phi
├── model-final.tassign
├── model-final.theta
├── model-final.twords
├── newdocs.dat
└── wordmap.txt
├── README.md
├── StopWordTable.txt
├── bin
├── application
│ ├── Controller1.class
│ ├── Main.class
│ ├── Scene1.fxml
│ └── application.css
└── com
│ └── sxu
│ ├── Crawler
│ ├── GetUrl.class
│ ├── InfoSpider.class
│ ├── Url.class
│ └── spider.class
│ ├── Similarity
│ └── Similarity.class
│ ├── UserData
│ └── User.class
│ └── Vector
│ ├── LDA
│ └── LDA.class
│ ├── Segmentation
│ ├── Filepro.class
│ ├── JieBa.class
│ └── StopWords.class
│ ├── Word2Vec
│ └── Word2Vec.class
│ └── main
│ └── Process.class
├── build.fxbuild
├── data
└── data.txt
├── lib
├── LDA.jar
├── Word2Vec.jar
├── jieba.jar
└── jsoup-1.8.2.jar
├── result
├── vector.txt
├── vector1000.txt
└── 分词结果.txt
├── src
├── application
│ ├── Controller1.java
│ ├── Main.java
│ ├── Scene1.fxml
│ └── application.css
└── com
│ └── sxu
│ ├── Similarity
│ └── Similarity.java
│ ├── UserData
│ └── User.java
│ └── Vector
│ ├── LDA
│ └── LDA.java
│ ├── Segmentation
│ ├── Filepro.java
│ ├── JieBa.java
│ └── StopWords.java
│ ├── Word2Vec
│ └── Word2Vec.java
│ └── main
│ └── Process.java
├── temp
├── tempCorpus303848893241311564.txt
├── tempCorpus4113479871682855437.txt
└── tempCorpus460574311849048332.txt
└── 网络水军识别系统使用手册.docx
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled class file
2 | *.class
3 |
4 | # Log file
5 | *.log
6 |
7 | # BlueJ files
8 | *.ctxt
9 |
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 |
13 | # Package Files #
14 | *.jar
15 | *.war
16 | *.nar
17 | *.ear
18 | *.zip
19 | *.tar.gz
20 | *.rar
21 |
22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
23 | hs_err_pid*
24 |
--------------------------------------------------------------------------------
/LDAdata/model-final.others:
--------------------------------------------------------------------------------
1 | alpha=0.5
2 | beta=0.1
3 | ntopics=100
4 | ndocs=3065
5 | nwords=9735
6 | liters=49
7 |
--------------------------------------------------------------------------------
/LDAdata/newdocs.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/LDAdata/newdocs.dat
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spamer-Detect-System
2 | 对汽车之家论坛里的评论数据处理和分析,利用用户潜在行为数据得出用户行为特征,采用LDA主题模型得出用户评论的主题特征,采用Word2Vec词向量模型得出用户评论的文本内容特征,采用K-Means聚类得出水军文本类别,结合用户行为特征,最终实现了对网络水军的识别。
3 |
--------------------------------------------------------------------------------
/StopWordTable.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/StopWordTable.txt
--------------------------------------------------------------------------------
/bin/application/Controller1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/application/Controller1.class
--------------------------------------------------------------------------------
/bin/application/Main.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/application/Main.class
--------------------------------------------------------------------------------
/bin/application/Scene1.fxml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/bin/application/application.css:
--------------------------------------------------------------------------------
1 | /* JavaFX CSS - Leave this comment until you have at least create one rule which uses -fx-Property */
--------------------------------------------------------------------------------
/bin/com/sxu/Crawler/GetUrl.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/Crawler/GetUrl.class
--------------------------------------------------------------------------------
/bin/com/sxu/Crawler/InfoSpider.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/Crawler/InfoSpider.class
--------------------------------------------------------------------------------
/bin/com/sxu/Crawler/Url.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/Crawler/Url.class
--------------------------------------------------------------------------------
/bin/com/sxu/Crawler/spider.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/Crawler/spider.class
--------------------------------------------------------------------------------
/bin/com/sxu/Similarity/Similarity.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/Similarity/Similarity.class
--------------------------------------------------------------------------------
/bin/com/sxu/UserData/User.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/UserData/User.class
--------------------------------------------------------------------------------
/bin/com/sxu/Vector/LDA/LDA.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/Vector/LDA/LDA.class
--------------------------------------------------------------------------------
/bin/com/sxu/Vector/Segmentation/Filepro.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/Vector/Segmentation/Filepro.class
--------------------------------------------------------------------------------
/bin/com/sxu/Vector/Segmentation/JieBa.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/Vector/Segmentation/JieBa.class
--------------------------------------------------------------------------------
/bin/com/sxu/Vector/Segmentation/StopWords.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/Vector/Segmentation/StopWords.class
--------------------------------------------------------------------------------
/bin/com/sxu/Vector/Word2Vec/Word2Vec.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/Vector/Word2Vec/Word2Vec.class
--------------------------------------------------------------------------------
/bin/com/sxu/Vector/main/Process.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/bin/com/sxu/Vector/main/Process.class
--------------------------------------------------------------------------------
/build.fxbuild:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/data/data.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/data/data.txt
--------------------------------------------------------------------------------
/lib/LDA.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/lib/LDA.jar
--------------------------------------------------------------------------------
/lib/Word2Vec.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/lib/Word2Vec.jar
--------------------------------------------------------------------------------
/lib/jieba.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/lib/jieba.jar
--------------------------------------------------------------------------------
/lib/jsoup-1.8.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/lib/jsoup-1.8.2.jar
--------------------------------------------------------------------------------
/result/分词结果.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Wiskia/Spamer-Detect-System/dd79aa5bf90ff35c7a2f5124a1de3c1ad2a575ce/result/分词结果.txt
--------------------------------------------------------------------------------
/src/application/Controller1.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @author SXU.ZSC
3 | *
4 | */
5 | package application;
6 |
7 | import java.io.BufferedReader;
8 | import java.io.File;
9 | import java.io.FileInputStream;
10 | import java.io.IOException;
11 | import java.io.InputStreamReader;
12 | import java.net.URL;
13 | import java.util.ArrayList;
14 | import java.util.ResourceBundle;
15 |
16 | import javafx.event.ActionEvent;
17 | import javafx.fxml.FXML;
18 | import javafx.fxml.Initializable;
19 | import javafx.scene.Group;
20 | import javafx.scene.Scene;
21 | import javafx.scene.control.Alert;
22 | import javafx.scene.control.Button;
23 | import javafx.scene.control.TextArea;
24 | import javafx.scene.control.TextField;
25 | import javafx.stage.Stage;
26 | import com.sxu.Vector.Segmentation.Filepro;
27 | import com.sxu.Vector.main.Process;
28 | import com.sxu.Crawler.InfoSpider;
29 | import com.sxu.Similarity.Similarity;
30 | import com.sxu.UserData.User;;
31 |
32 | public class Controller1 implements Initializable {
33 | /**
34 | *事务响应配置,
35 | *开始分析、导入文件、预处理、分析用户等步骤实现,以及结果展示
36 | **/
37 | //开始分析
38 | @FXML
39 | private Button start;
40 |
41 | //导入文件
42 | @FXML
43 | private Button input;
44 |
45 | //预处理
46 | @FXML
47 | private Button pro;
48 |
49 | //结果显示区
50 | @FXML
51 | private TextArea textarea;
52 |
53 | //文件路径输入框
54 | @FXML
55 | private TextField address;
56 |
57 | //文件路径
58 | public static String Path = "";
59 | //用户
60 | public static ArrayList user = new ArrayList();
61 | //用户名
62 | public static ArrayList username = new ArrayList();
63 |
64 | // 提醒窗口
65 | public void AlertWindow(String p_header, String p_message,String p_content){
66 | Alert _alert = new Alert(Alert.AlertType.INFORMATION);
67 | Group root = new Group();
68 | Scene scene = new Scene(root,200.0,200.0);
69 | _alert.setTitle(p_header);
70 | _alert.setHeaderText(p_message);
71 | _alert.setContentText(p_content);
72 | Stage d_stage = new Stage();
73 | d_stage.setHeight(200);
74 | d_stage.setWidth(200);
75 | d_stage.setScene(scene);
76 | _alert.initOwner(d_stage);
77 | _alert.show();
78 | }
79 |
80 | //导入文件事件
81 | public void InputAction(ActionEvent evt) throws IOException, Exception {
82 |
83 | Path = address.getText();
84 |
85 | AlertWindow("文件导入","文件导入成功!","文件路径为:"+Path);
86 | }
87 |
88 | //预处理事件
89 | public void ProAction(ActionEvent evt) throws Exception {
90 | if(Path != ""){
91 | String encoding="gbk";
92 | File file=new File(Path);
93 | InputStreamReader read = new InputStreamReader(
94 | new FileInputStream(file),encoding);
95 | BufferedReader bufferedReader = new BufferedReader(read);
96 | String str= "";
97 | str = bufferedReader.readLine();
98 | while (str != null){
99 | System.out.println(str);
100 | String[] all = str.split(";");
101 | if(username.contains(all[0])==false){
102 | username.add(all[0]);
103 | User newuser = new User();
104 | newuser.text.add(Filepro.pro(all[2]));
105 | newuser.setName(all[0]);
106 | newuser.setPd(InfoSpider.demo(all[1]));
107 | user.add(newuser);
108 | }
109 | else {
110 | for(int i = 0;i0.48) user.get(j).pt++;
142 | }
143 | user.get(j).pt = user.get(j).pt/user.get(j).text.size();
144 | if(user.get(j).pt>0.49) {
145 | if(user.get(j).pd<2){
146 | user.get(j).property = "水军用户";
147 | }
148 | else user.get(j).property = "正常用户";
149 | }
150 | else user.get(j).property = "正常用户";
151 | vector = vector+""+user.get(j).name+"用户属性:"+user.get(j).property+"\n"+"p值:"+user.get(j).pt+"\n"+"q值:"+user.get(j).pd+"\n";
152 | textarea.setText(vector);
153 | }
154 | }
155 |
156 | @Override
157 | public void initialize(URL location, ResourceBundle resources) {
158 | // TODO Auto-generated method stub
159 | }
160 | }
161 |
--------------------------------------------------------------------------------
/src/application/Main.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author SXU.ZSC
6 | *
7 | */
8 | package application;
9 |
10 | import javafx.application.Application;
11 | import javafx.fxml.FXMLLoader;
12 | import javafx.stage.Stage;
13 | import javafx.scene.Parent;
14 | import javafx.scene.Scene;
15 |
16 | public class Main extends Application {
17 | /**
18 | *javaFx用户界面主程序,
19 | *打开Scene1.fxml面板
20 | **/
21 | @Override
22 | public void start(Stage primaryStage) {
23 | try {
24 | // BorderPane root = new BorderPane();
25 | // Scene scene = new Scene(root,400,400);
26 | // scene.getStylesheets().add(getClass().getResource("application.css").toExternalForm());
27 | // primaryStage.setScene(scene);
28 | // primaryStage.show();
29 | Parent root = FXMLLoader.load(getClass()
30 | .getResource("/application/Scene1.fxml"));
31 |
32 | primaryStage.setTitle("网络水军识别系统");
33 | primaryStage.setScene(new Scene(root));
34 | primaryStage.show();
35 | } catch(Exception e) {
36 | e.printStackTrace();
37 | }
38 | }
39 |
40 | public static void main(String[] args) {
41 | launch(args);
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/application/Scene1.fxml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/src/application/application.css:
--------------------------------------------------------------------------------
1 | /* JavaFX CSS - Leave this comment until you have created at least one rule which uses -fx-Property */
--------------------------------------------------------------------------------
/src/com/sxu/Similarity/Similarity.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | /**
5 | * @author SXU.ZSC
6 | *
7 | */
8 | package com.sxu.Similarity;
9 |
10 | public class Similarity{
11 | /**
12 | *欧式距离算法,
13 | *计算测试点到各个聚类中心的距离
14 | *维度大小:200
15 | **/
16 | public static float[] Sim(float [] a,float [] b ,float [] c){
17 |
18 | float sumXYdiff[]= new float[2];
19 | float[][]diff=new float[2][a.length];
20 | for(int i=0;i text = new ArrayList();
18 | public double pd = 0;
19 | public double pt = 0;
20 | public String getUrl() {
21 | return url;
22 | }
23 | public void setUrl(String url) {
24 | this.url = url;
25 | }
26 |
27 | public String getName() {
28 | return name;
29 | }
30 | public void setName(String name) {
31 | this.name = name;
32 | }
33 | public ArrayList getText() {
34 | return text;
35 | }
36 | public void setText(ArrayList text) {
37 | this.text = text;
38 | }
39 | public double getPd() {
40 | return pd;
41 | }
42 | public void setPd(double pd) {
43 | this.pd = pd;
44 | }
45 | public double getPt() {
46 | return pt;
47 | }
48 | public void setPt(double pt) {
49 | this.pt = pt;
50 | }
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/src/com/sxu/Vector/LDA/LDA.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @author SXU.ZSC
3 | *
4 | */
5 | package com.sxu.Vector.LDA;
6 |
7 | import java.io.BufferedReader;
8 | import java.io.File;
9 | import java.io.FileInputStream;
10 | import java.io.IOException;
11 | import java.io.InputStreamReader;
12 | import java.io.RandomAccessFile;
13 | import org.kohsuke.args4j.CmdLineException;
14 | import org.kohsuke.args4j.CmdLineParser;
15 | import jgibblda.Inferencer;
16 | import jgibblda.LDACmdOption;
17 | import jgibblda.Model;
18 |
19 | public class LDA {
20 | /**
21 | *调用LDAjar包,实现LDA主题表示算法
22 | *输入:经分词去糙后的一条评论文本
23 | *输出:LDA主题分布
24 | *维度大小:100
25 | **/
26 | public static void test() throws CmdLineException{
27 | //这里定义了一个option类的对象,同时实例化了LDACmdOption里所有的变量
28 | LDACmdOption option = new LDACmdOption();
29 | option.setEst(false);
30 | option.setEstc(true);
31 | option.setDir("D:\\javashit\\SDS\\LDAdata\\");
32 | option.setDfile("newdocs.dat");
33 | option.setModelName("model-final");
34 | CmdLineParser parser = new CmdLineParser(option);
35 | //以下是对option这个对象进行各种操作
36 | parser.parseArgument();
37 | Inferencer inferencer = new Inferencer();
38 | inferencer.init(option);
39 | Model newModel = inferencer.inference();
40 | // for (int i = 0; i < newModel.phi.length; ++i){//输出结果
41 | // //phi: K * V
42 | // System.out.println("-----------------------\ntopic" + i + " : ");
43 | // for (int j = 0; j < 10; ++j){
44 | // System.out.println(inferencer.globalDict.id2word.get(j) + "\t" + newModel.phi[i][j]);
45 | // }
46 | // }
47 | }
48 |
49 | public static void showHelp(CmdLineParser parser){
50 | System.out.println("LDA [options ...] [arguments...]");//输出
51 | parser.printUsage(System.out);
52 | }
53 |
54 | public static void writer(String Str,String path) throws IOException{
55 | RandomAccessFile randomFile = new RandomAccessFile(path, "rw");
56 | // 文件长度,字节数
57 | long fileLength = randomFile.length();
58 | // 将写文件指针移到文件尾。
59 | randomFile.seek(fileLength);
60 | randomFile.write(("\r\n"+Str).getBytes("GB2312"));
61 | randomFile.close();
62 | }
63 |
64 | public static void WriteFirstLine(String Str,String path) throws IOException{
65 | RandomAccessFile randomFile = new RandomAccessFile(path, "rw");
66 | // 文件长度,字节数
67 | long fileLength = randomFile.length();
68 | // 将写文件指针移到文件尾。
69 | randomFile.seek(fileLength);
70 | randomFile.write((Str).getBytes("GB2312"));
71 | randomFile.close();
72 | }
73 |
74 | public static String readlastline(String path) throws IOException{
75 | String str1 = null;
76 | String str2 = null;
77 | try {
78 | String encoding="GBK";
79 | File file=new File(path);
80 | if(file.isFile() && file.exists()){
81 | InputStreamReader read = new InputStreamReader(
82 | new FileInputStream(file),encoding);
83 | BufferedReader bufferedReader = new BufferedReader(read);
84 | while ((str1 = bufferedReader.readLine()) != null){
85 | str2 = str1;
86 | }
87 | read.close();
88 |
89 | }
90 | } catch (Exception e) {
91 | e.printStackTrace();
92 | }
93 | return str2;
94 | }
95 |
96 | public static void main(String[] args) throws CmdLineException{
97 | LDA.test();
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/src/com/sxu/Vector/Segmentation/Filepro.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @author SXU.ZSC
3 | *
4 | */
5 | package com.sxu.Vector.Segmentation;
6 |
7 | import test.java.com.huaban.analysis.jieba.Demo;
8 |
9 | public class Filepro {
10 | /**
11 | *文本预处理程序,
12 | *对一条文本进行分词去糙处理
13 | **/
14 | public static String pro(String str) throws Exception{
15 |
16 | String str1 = Demo.demo(str);
17 | String str2 = StopWords.test(str1);
18 | // LDA.writer(str2, ProPath);
19 | return str2;
20 |
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/com/sxu/Vector/Segmentation/JieBa.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @author SXU.ZSC
3 | *
4 | */
5 | package com.sxu.Vector.Segmentation;
6 |
7 | import test.java.com.huaban.analysis.jieba.Demo;
8 |
9 | public class JieBa {
10 | /**
11 | *调用jiaba.jar包,测试分词算法,
12 | *对一条文本数据进行分词,以" "为间隔符
13 | **/
14 | public static void main(String[] args) throws Exception {
15 | String a = Demo.demo("");
16 | System.out.println(a);
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/src/com/sxu/Vector/Segmentation/StopWords.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @author SXU.ZSC
3 | *
4 | */
5 | package com.sxu.Vector.Segmentation;
6 |
7 | import java.io.BufferedReader;
8 | import java.io.File;
9 | import java.io.FileInputStream;
10 | import java.io.FileNotFoundException;
11 | import java.io.FileWriter;
12 | import java.io.IOException;
13 | import java.io.InputStreamReader;
14 | import java.util.HashSet;
15 | import java.util.Set;
16 | import java.io.BufferedWriter;
17 |
18 | public class StopWords {
19 | /**
20 | *去停用词算法,
21 | *采用字符匹配,对照停用词表,去除停用词
22 | **/
23 | public static final String stopWordTable = "StopWordTable.txt";
24 | public static Set stopWordSet ;
25 | String ifilePath;
26 | String ofilePath;
27 | String str = null;
28 |
29 | public void setStr(String str) {
30 | this.str = str;
31 | }
32 |
33 | public void writer(String Str) throws IOException{
34 | FileWriter filewriter = new FileWriter(ofilePath);
35 | BufferedWriter writer = new BufferedWriter(filewriter);
36 | writer.write(Str);
37 | writer.flush();
38 | writer.close();
39 | }
40 |
41 | public static void StopWords(){
42 | System.out.println("");
43 | try{
44 | //load stop words table.
45 | BufferedReader StopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(stopWordTable)), "GBK"));
46 | stopWordSet= new HashSet();
47 | String stopWord = null;
48 | for (; (stopWord = StopWordFileBr.readLine()) != null;) {
49 | stopWordSet.add(stopWord);
50 | }
51 | // for(Object str:stopWordSet.toArray()){
52 | // System.out.println((String)str);
53 | // }
54 | }catch (FileNotFoundException e) {
55 | e.printStackTrace();
56 | }catch (Exception e) {
57 | e.printStackTrace();
58 | }
59 |
60 | }
61 | public static String excludeStopWords(String[] text) {
62 | try{
63 | for (int i = 0; i < text.length; i++) {
64 | if (stopWordSet.contains(text[i].trim())) {
65 | text[i]=null;
66 | }
67 | }
68 | StringBuffer finalStr = new StringBuffer();
69 | for (int i = 0; i < text.length; i++) {
70 | if (text[i] != null) {
71 | finalStr = finalStr.append(text[i].trim()).append(" ");
72 | }
73 | }
74 | // System.out.println(finalStr);
75 | return finalStr.toString();
76 | } catch (Exception e) {
77 | e.printStackTrace();
78 | }
79 | return null;
80 | }
81 |
82 | public static void main(String argv[]) throws Exception{
83 |
84 | StopWords clean = new StopWords();
85 | clean.setStr("我 爱 中国 , / 钓鱼 岛 是 中国 的");
86 | System.out.println(clean.str);
87 | String[] s = clean.str.split(" ");
88 | // for(int i=0;i w2vMap = W2Vmodel.word2Vector(str, length, 0);
25 | // Map w2vMap = W2Vmodel.fileword2Vector(path1, length, 0);
26 | for ( Map.EntryMyWord2Vector: w2vMap.entrySet()){
27 | // String word1=MyWord2Vector.getKey();
28 | // float[] Vector1=MyWord2Vector.getValue();
29 | sum++;
30 | }
31 | // System.out.println(sum);
32 | float[][]b=new float[sum][];
33 | int i=0;
34 |
35 | for ( Map.EntryMyWord2Vector: w2vMap.entrySet()){
36 | String word2=MyWord2Vector.getKey();
37 | float[] Vector2=MyWord2Vector.getValue();
38 | b[i]=new float[Vector2.length];
39 | for(int j=0;j