├── README.md └── venv ├── Kaggle-Ensemble-Guide ├── README.md ├── requirements.txt ├── samples │ ├── _w2_method2.csv │ ├── _w2_method3.csv │ ├── _w3_method1.csv │ ├── kaggle_avg.csv │ ├── kaggle_geomean.csv │ ├── kaggle_rankavg.csv │ ├── kaggle_vote.csv │ ├── kaggle_vote_weighted.csv │ ├── method1.csv │ ├── method2.csv │ └── method3.csv ├── src │ ├── blend_proba.py │ ├── correlations.py │ ├── kaggle_avg.py │ ├── kaggle_geomean.py │ ├── kaggle_rankavg.py │ └── kaggle_vote.py └── stacking │ └── ensemble_stacking.py ├── bdci ├── merge.py ├── read_data.py ├── snownlp1.py └── split_word.py ├── datafountain ├── guangfudianzhan │ ├── __pycache__ │ │ └── read_data.cpython-36.pyc │ ├── dnn_model.py │ ├── draw.py │ ├── find_base_feature.py │ ├── find_best_feature.py │ ├── model.py │ ├── read_data.py │ ├── rnn_model.py │ └── tensor_forest.py └── taocan │ ├── base.ipynb │ ├── baseline.py │ ├── ml_models.py │ ├── tensorflow_modle.py │ └── tf_model │ ├── checkpoint │ ├── stock.model.data-00000-of-00001 │ ├── stock.model.index │ ├── stock.model.max.data-00000-of-00001 │ ├── stock.model.max.index │ ├── stock.model.max.meta │ └── stock.model.meta ├── dc └── guangfu │ └── github │ ├── README.md │ └── baseline.py ├── deep_learning ├── embedding │ └── word2vec.py └── yucemoxing │ ├── PricePredictor.py │ └── chargeInfo.txt ├── dwb ├── baseline.py ├── fasttext │ ├── __pycache__ │ │ └── fasttext.cpython-36.pyc │ ├── fasttext.py │ └── p4_cnn_sentence_classification.py ├── github_model │ ├── a01_FastText │ │ ├── __pycache__ │ │ │ └── p5_fastTextB_model.cpython-36.pyc │ │ ├── p5_fastTextB_model.py │ │ ├── p5_fastTextB_predict.py │ │ ├── p5_fastTextB_predict_multilabel.py │ │ ├── p5_fastTextB_train.py │ │ ├── p6_fastTextB_model_multilabel.py │ │ └── p6_fastTextB_train_multilabel.py │ ├── a02_TextCNN │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── data_util.cpython-36.pyc │ │ │ └── p7_TextCNN_model.cpython-36.pyc │ │ ├── data_util.py │ │ ├── other_experiement │ │ │ ├── __init__.py │ │ │ ├── data_util_zhihu.py │ │ │ ├── p7_TextCNN_predict_ensemble.py │ │ │ ├── p7_TextCNN_predict_exp.py │ │ │ ├── p7_TextCNN_predict_exp512.py │ │ │ ├── p7_TextCNN_predict_exp512_0609.py │ │ │ ├── p7_TextCNN_predict_exp512_simple.py │ │ │ ├── p7_TextCNN_train_exp.py │ │ │ ├── p7_TextCNN_train_exp512.py │ │ │ ├── p7_TextCNN_train_exp_512_0609.py │ │ │ └── p8_TextCNN_predict_exp.py │ │ ├── p7_TextCNN_model.py │ │ ├── p7_TextCNN_model_multilayers.py │ │ ├── p7_TextCNN_predict.py │ │ └── p7_TextCNN_train.py │ └── a03_TextRNN │ │ ├── __pycache__ │ │ └── p8_TextRNN_model.cpython-36.pyc │ │ ├── p8_TextRNN_model.py │ │ ├── p8_TextRNN_model_multi_layers.py │ │ ├── p8_TextRNN_predict.py │ │ ├── p8_TextRNN_train.py │ │ └── result_rnn.csv ├── jieba1 │ ├── merge.py │ └── tjieba.py ├── merge.py ├── par.py └── testcnn │ ├── __pycache__ │ ├── data_helpers.cpython-36.pyc │ └── text_cnn.cpython-36.pyc │ ├── data_helpers.py │ ├── eval.py │ ├── text_cnn.py │ └── train.py ├── pachong └── iqiyi │ ├── fcxd.py │ └── xiangmicc.py └── regress_baseline ├── cwd.py └── regress_baseline.py /README.md: -------------------------------------------------------------------------------- 1 | # deep_learning 2 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/README.md: -------------------------------------------------------------------------------- 1 | Kaggle-Ensemble-Guide 2 | ===================== 3 | 4 | A combination of Model Ensembling methods that is extremely useful for increasing accuracy of Kaggle's 
submission. 5 | For more information: http://mlwave.com/kaggle-ensembling-guide/ 6 | 7 | ## Installation: 8 | 9 | $ pip install -r requirements.txt 10 | 11 | ## Example: 12 | 13 | $ python ./src/correlations.py ./samples/method1.csv ./samples/method2.csv 14 | Finding correlation between: ./samples/method1.csv and ./samples/method2.csv 15 | Column to be measured: Label 16 | Pearson's correlation score: 0.67898 17 | Kendall's correlation score: 0.66667 18 | Spearman's correlation score: 0.71053 19 | 20 | $ python ./src/kaggle_vote.py "./samples/method*.csv" "./samples/kaggle_vote.csv" 21 | parsing: ./samples/method1.csv 22 | parsing: ./samples/method2.csv 23 | parsing: ./samples/method3.csv 24 | wrote to ./samples/kaggle_vote.csv 25 | 26 | 27 | $ python ./src/kaggle_vote.py "./samples/_*.csv" "./samples/kaggle_vote_weighted.csv" "weighted" 28 | parsing: ./samples/_w3_method1.csv 29 | Using weight: 3 30 | parsing: ./samples/_w2_method2.csv 31 | Using weight: 2 32 | parsing: ./samples/_w2_method3.csv 33 | Using weight: 2 34 | wrote to ./samples/kaggle_vote_weighted.csv 35 | 36 | $ python ./src/kaggle_rankavg.py "./samples/method*.csv" "./samples/kaggle_rankavg.csv" 37 | parsing: ./samples/method1.csv 38 | parsing: ./samples/method2.csv 39 | parsing: ./samples/method3.csv 40 | wrote to ./samples/kaggle_rankavg.csv 41 | 42 | $ python ./src/kaggle_avg.py "./samples/method*.csv" "./samples/kaggle_avg.csv" 43 | parsing: ./samples/method1.csv 44 | parsing: ./samples/method2.csv 45 | parsing: ./samples/method3.csv 46 | wrote to ./samples/kaggle_avg.csv 47 | 48 | $ python ./src/kaggle_geomean.py "./samples/method*.csv" "./samples/kaggle_geomean.csv" 49 | parsing: ./samples/method1.csv 50 | parsing: ./samples/method2.csv 51 | parsing: ./samples/method3.csv 52 | wrote to ./samples/kaggle_geomean.csv 53 | 54 | ## Result: 55 | 56 | ==> ./samples/method1.csv <== 57 | ImageId,Label 58 | 1,1 59 | 2,0 60 | 3,9 61 | 4,9 62 | 5,3 63 | 64 | ==> ./samples/method2.csv <== 65 | ImageId,Label 66 | 1,2 67 | 2,0 68 | 3,6 69 | 4,2 70 | 5,3 71 | 72 | ==> ./samples/method3.csv <== 73 | ImageId,Label 74 | 1,2 75 | 2,0 76 | 3,9 77 | 4,2 78 | 5,3 79 | 80 | ==> ./samples/kaggle_avg.csv <== 81 | ImageId,Label 82 | 1,1.666667 83 | 2,0.000000 84 | 3,8.000000 85 | 4,4.333333 86 | 5,3.000000 87 | 88 | ==> ./samples/kaggle_rankavg.csv <== 89 | ImageId,Label 90 | 1,0.25 91 | 2,0.0 92 | 3,1.0 93 | 4,0.5 94 | 5,0.75 95 | 96 | ==> ./samples/kaggle_vote.csv <== 97 | ImageId,Label 98 | 1,2 99 | 2,0 100 | 3,9 101 | 4,2 102 | 5,3 103 | 104 | ==> ./samples/kaggle_geomean.csv <== 105 | ImageId,Label 106 | 1,1.587401 107 | 2,0.000000 108 | 3,7.862224 109 | 4,3.301927 110 | 5,3.000000 111 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | sklearn 3 | numpy 4 | scipy -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/_w2_method2.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,3 3 | 4,2 4 | 3,6 5 | 5,3 6 | 2,0 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/_w2_method3.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,2 3 | 3,9 4 | 2,0 5 | 5,3 6 | 4,2 7 | 
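The weighted vote shown in the README above follows a simple filename convention: a sample file named `_w3_method1.csv` casts three votes per ImageId, the `_w2_*` files cast two, and the most common label wins. As a rough pandas sketch of what `src/kaggle_vote.py` computes for these sample files (an illustration only, not part of the repository):

    import re
    from collections import Counter, defaultdict
    from glob import glob
    import pandas as pd

    votes = defaultdict(Counter)
    for path in glob("./samples/_w*_method*.csv"):
        match = re.search(r"_[wW](\d+)_", path)       # weight encoded in the filename
        weight = int(match.group(1)) if match else 1
        for image_id, label in pd.read_csv(path).itertuples(index=False):
            votes[image_id][label] += weight          # each file casts `weight` votes
    ensemble = {k: counts.most_common(1)[0][0] for k, counts in sorted(votes.items())}
    print(ensemble)  # {1: 1, 2: 0, 3: 5, 4: 2, 5: 3}, matching samples/kaggle_vote_weighted.csv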
-------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/_w3_method1.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 5,3 3 | 2,0 4 | 3,5 5 | 4,9 6 | 1,1 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/kaggle_avg.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,1.666667 3 | 2,0.000000 4 | 3,8.000000 5 | 4,4.333333 6 | 5,3.000000 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/kaggle_geomean.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,1.587401 3 | 2,0.000000 4 | 3,7.862224 5 | 4,3.301927 6 | 5,3.000000 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/kaggle_rankavg.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,0.25 3 | 2,0.0 4 | 3,1.0 5 | 4,0.5 6 | 5,0.75 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/kaggle_vote.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,2 3 | 2,0 4 | 3,9 5 | 4,2 6 | 5,3 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/kaggle_vote_weighted.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,1 3 | 2,0 4 | 3,5 5 | 4,2 6 | 5,3 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/method1.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 5,3 3 | 2,0 4 | 3,9 5 | 4,9 6 | 1,1 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/method2.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,2 3 | 4,2 4 | 3,6 5 | 5,3 6 | 2,0 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/method3.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,2 3 | 3,9 4 | 2,0 5 | 5,3 6 | 4,2 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/blend_proba.py: -------------------------------------------------------------------------------- 1 | from sklearn import cross_validation 2 | from sklearn.metrics import log_loss, accuracy_score 3 | import numpy as np 4 | import pandas as pd 5 | import random 6 | import md5 7 | import json 8 | 9 | def blend_proba(clf, X_train, y, X_test, nfolds=5, save_preds="", 10 | save_test_only="", seed=300373, save_params="", 11 | clf_name="XX", generalizers_params=[], minimal_loss=0, 12 | return_score=False, minimizer="log_loss"): 13 | print("\nBlending with classifier:\n\t{}".format(clf)) 14 | folds = list(cross_validation.StratifiedKFold(y, nfolds,shuffle=True,random_state=seed)) 15 | print(X_train.shape) 16 | dataset_blend_train = np.zeros((X_train.shape[0],np.unique(y).shape[0])) 17 | 18 | #iterate through train set and train - predict folds 19 | loss = 0 20 
| for i, (train_index, test_index) in enumerate( folds ): 21 | print("Train Fold {}/{}".format(i+1,nfolds)) 22 | fold_X_train = X_train[train_index] 23 | fold_y_train = y[train_index] 24 | fold_X_test = X_train[test_index] 25 | fold_y_test = y[test_index] 26 | clf.fit(fold_X_train, fold_y_train) 27 | 28 | fold_preds = clf.predict_proba(fold_X_test) 29 | print("Logistic loss: {}".format(log_loss(fold_y_test,fold_preds))) 30 | dataset_blend_train[test_index] = fold_preds 31 | if minimizer == "log_loss": 32 | loss += log_loss(fold_y_test,fold_preds) 33 | if minimizer == "accuracy": 34 | fold_preds_a = np.argmax(fold_preds, axis=1) 35 | loss += accuracy_score(fold_y_test,fold_preds_a) 36 | #fold_preds = clf.predict(fold_X_test) 37 | 38 | #loss += accuracy_score(fold_y_test,fold_preds) 39 | 40 | if minimal_loss > 0 and loss > minimal_loss and i == 0: 41 | return False, False 42 | fold_preds = np.argmax(fold_preds, axis=1) 43 | print("Accuracy: {}".format(accuracy_score(fold_y_test,fold_preds))) 44 | avg_loss = loss / float(i+1) 45 | print("\nAverage:\t{}\n".format(avg_loss)) 46 | #predict test set (better to take average on all folds, but this is quicker) 47 | print("Test Fold 1/1") 48 | clf.fit(X_train, y) 49 | dataset_blend_test = clf.predict_proba(X_test) 50 | 51 | if clf_name == "XX": 52 | clf_name = str(clf)[1:3] 53 | 54 | if len(save_preds)>0: 55 | id = md5.new("{}".format(clf.get_params())).hexdigest() 56 | print("storing meta predictions at: {}".format(save_preds)) 57 | np.save("{}_{}_{}_{}_train.npy".format(save_preds,clf_name,avg_loss,id),dataset_blend_train) 58 | np.save("{}_{}_{}_{}_test.npy".format(save_preds,clf_name,avg_loss,id),dataset_blend_test) 59 | 60 | if len(save_test_only)>0: 61 | id = md5.new("{}".format(clf.get_params())).hexdigest() 62 | print("storing meta predictions at: {}".format(save_test_only)) 63 | 64 | dataset_blend_test = clf.predict(X_test) 65 | np.savetxt("{}_{}_{}_{}_test.txt".format(save_test_only,clf_name,avg_loss,id),dataset_blend_test) 66 | d = {} 67 | d["stacker"] = clf.get_params() 68 | d["generalizers"] = generalizers_params 69 | with open("{}_{}_{}_{}_params.json".format(save_test_only,clf_name,avg_loss,id), 'wb') as f: 70 | json.dump(d, f) 71 | 72 | if len(save_params)>0: 73 | id = md5.new("{}".format(clf.get_params())).hexdigest() 74 | d = {} 75 | d["name"] = clf_name 76 | d["params"] = { k:(v.get_params() if "\n" in str(v) or "<" in str(v) else v) for k,v in clf.get_params().items()} 77 | d["generalizers"] = generalizers_params 78 | with open("{}_{}_{}_{}_params.json".format(save_params,clf_name,avg_loss,id), 'wb') as f: 79 | json.dump(d, f) 80 | 81 | if np.unique(y).shape[0] == 2: # when binary classification only return positive class proba 82 | if return_score: 83 | return dataset_blend_train[:,1], dataset_blend_test[:,1], avg_loss 84 | else: 85 | return dataset_blend_train[:,1], dataset_blend_test[:,1] 86 | else: 87 | if return_score: 88 | return dataset_blend_train, dataset_blend_test, avg_loss 89 | else: 90 | return dataset_blend_train, dataset_blend_test -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/correlations.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | 4 | first_file = sys.argv[1] 5 | second_file = sys.argv[2] 6 | 7 | def corr(first_file, second_file): 8 | first_df = pd.read_csv(first_file,index_col=0) 9 | second_df = pd.read_csv(second_file,index_col=0) 10 | # assuming first column is `prediction_id` and second
column is `prediction` 11 | prediction = first_df.columns[0] 12 | # correlation 13 | print("Finding correlation between: {} and {}".format(first_file,second_file)) 14 | print("Column to be measured: {}".format(prediction)) 15 | print("Pearson's correlation score: {}".format(first_df[prediction].corr(second_df[prediction],method='pearson'))) 16 | print("Kendall's correlation score: {}".format(first_df[prediction].corr(second_df[prediction],method='kendall'))) 17 | print("Spearman's correlation score: {}".format(first_df[prediction].corr(second_df[prediction],method='spearman'))) 18 | 19 | corr(first_file, second_file) 20 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/kaggle_avg.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from glob import glob 3 | import sys 4 | 5 | glob_files = sys.argv[1] 6 | loc_outfile = sys.argv[2] 7 | 8 | def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"): 9 | if method == "average": 10 | scores = defaultdict(float) 11 | with open(loc_outfile,"w") as outfile: 12 | for i, glob_file in enumerate( glob(glob_files) ): 13 | print("parsing: {}".format(glob_file)) 14 | # sort glob_file by first column, ignoring the first line 15 | lines = open(glob_file).readlines() 16 | lines = [lines[0]] + sorted(lines[1:]) 17 | for e, line in enumerate( lines ): 18 | if i == 0 and e == 0: 19 | outfile.write(line) 20 | if e > 0: 21 | row = line.strip().split(",") 22 | scores[(e,row[0])] += float(row[1]) 23 | for j,k in sorted(scores): 24 | outfile.write("%s,%f\n"%(k,scores[(j,k)]/(i+1))) 25 | print("wrote to {}".format(loc_outfile)) 26 | 27 | kaggle_bag(glob_files, loc_outfile) -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/kaggle_geomean.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import defaultdict 3 | from glob import glob 4 | import sys 5 | import math 6 | 7 | glob_files = sys.argv[1] 8 | loc_outfile = sys.argv[2] 9 | 10 | def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"): 11 | if method == "average": 12 | scores = defaultdict(float) 13 | with open(loc_outfile,"w") as outfile: 14 | for i, glob_file in enumerate( glob(glob_files) ): 15 | print("parsing: {}".format(glob_file)) 16 | # sort glob_file by first column, ignoring the first line 17 | lines = open(glob_file).readlines() 18 | lines = [lines[0]] + sorted(lines[1:]) 19 | for e, line in enumerate( lines ): 20 | if i == 0 and e == 0: 21 | outfile.write(line) 22 | if e > 0: 23 | row = line.strip().split(",") 24 | if scores[(e,row[0])] == 0: 25 | scores[(e,row[0])] = 1 26 | scores[(e,row[0])] *= float(row[1]) 27 | for j,k in sorted(scores): 28 | outfile.write("%s,%f\n"%(k,math.pow(scores[(j,k)],1/(i+1)))) 29 | print("wrote to {}".format(loc_outfile)) 30 | 31 | kaggle_bag(glob_files, loc_outfile) 32 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/kaggle_rankavg.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import defaultdict 3 | from glob import glob 4 | import sys 5 | 6 | glob_files = sys.argv[1] 7 | loc_outfile = sys.argv[2] 8 | 9 | def kaggle_bag(glob_files, loc_outfile): 10 | with open(loc_outfile,"w") as 
outfile: 11 | all_ranks = defaultdict(list) 12 | for i, glob_file in enumerate( glob(glob_files) ): 13 | file_ranks = [] 14 | print("parsing: {}".format(glob_file)) 15 | # sort glob_file by first column, ignoring the first line 16 | lines = open(glob_file).readlines() 17 | lines = [lines[0]] + sorted(lines[1:]) 18 | for e, line in enumerate( lines ): 19 | if e == 0 and i == 0: 20 | outfile.write( line ) 21 | elif e > 0: 22 | r = line.strip().split(",") 23 | file_ranks.append( (float(r[1]), e, r[0]) ) 24 | for rank, item in enumerate( sorted(file_ranks) ): 25 | all_ranks[(item[1],item[2])].append(rank) 26 | average_ranks = [] 27 | for k in sorted(all_ranks): 28 | average_ranks.append((sum(all_ranks[k])/len(all_ranks[k]),k)) 29 | ranked_ranks = [] 30 | for rank, k in enumerate(sorted(average_ranks)): 31 | ranked_ranks.append((k[1][0],k[1][1],rank/(len(average_ranks)-1))) 32 | for k in sorted(ranked_ranks): 33 | outfile.write("%s,%s\n"%(k[1],k[2])) 34 | print("wrote to {}".format(loc_outfile)) 35 | 36 | kaggle_bag(glob_files, loc_outfile) -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/kaggle_vote.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from glob import glob 3 | import sys 4 | import re 5 | 6 | glob_files = sys.argv[1] 7 | loc_outfile = sys.argv[2] 8 | weights_strategy = "uniform" 9 | if len(sys.argv) == 4: 10 | weights_strategy = sys.argv[3] 11 | 12 | def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"): 13 | pattern = re.compile(r"(.)*_[w|W](\d*)_[.]*") 14 | if method == "average": 15 | scores = defaultdict(list) 16 | with open(loc_outfile,"w") as outfile: 17 | #weight_list may be usefull using a different method 18 | weight_list = [1]*len(glob(glob_files)) 19 | for i, glob_file in enumerate( glob(glob_files) ): 20 | print("parsing: {}".format(glob_file)) 21 | if weights == "weighted": 22 | weight = pattern.match(glob_file) 23 | if weight and weight.group(2): 24 | print("Using weight: {}".format(weight.group(2))) 25 | weight_list[i] = weight_list[i]*int(weight.group(2)) 26 | else: 27 | print("Using weight: 1") 28 | # sort glob_file by first column, ignoring the first line 29 | lines = open(glob_file).readlines() 30 | lines = [lines[0]] + sorted(lines[1:]) 31 | for e, line in enumerate( lines ): 32 | if i == 0 and e == 0: 33 | outfile.write(line) 34 | if e > 0: 35 | row = line.strip().split(",") 36 | for l in range(1,weight_list[i]+1): 37 | scores[(e,row[0])].append(row[1]) 38 | for j,k in sorted(scores): 39 | outfile.write("%s,%s\n"%(k,Counter(scores[(j,k)]).most_common(1)[0][0])) 40 | print("wrote to {}".format(loc_outfile)) 41 | 42 | kaggle_bag(glob_files, loc_outfile, weights=weights_strategy) 43 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/stacking/ensemble_stacking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/19 下午4:51 4 | # @Author :hwwu 5 | # @File :ensemble_stacking.py 6 | 7 | from sklearn import datasets 8 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier 9 | from sklearn.cross_validation import train_test_split 10 | from sklearn.cross_validation import StratifiedKFold 11 | import numpy as np 12 | from sklearn.metrics import roc_auc_score 13 | from 
sklearn.datasets.samples_generator import make_blobs 14 | 15 | '''创建训练的数据集''' 16 | data, target = make_blobs(n_samples=50000, centers=2, random_state=0, cluster_std=0.60) 17 | 18 | '''模型融合中使用到的各个单模型''' 19 | clfs = [RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'), 20 | RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'), 21 | ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'), 22 | ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'), 23 | GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)] 24 | 25 | '''切分一部分数据作为测试集''' 26 | X, X_predict, y, y_predict = train_test_split(data, target, test_size=0.33, random_state=2017) 27 | 28 | 29 | dataset_blend_train = np.zeros((X.shape[0], len(clfs))) 30 | dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs))) 31 | 32 | '''5折stacking''' 33 | n_folds = 5 34 | skf = list(StratifiedKFold(y, n_folds)) 35 | for j, clf in enumerate(clfs): 36 | '''依次训练各个单模型''' 37 | # print(j, clf) 38 | dataset_blend_test_j = np.zeros((X_predict.shape[0], len(skf))) 39 | for i, (train, test) in enumerate(skf): 40 | '''使用第i个部分作为预测,剩余的部分来训练模型,获得其预测的输出作为第i部分的新特征。''' 41 | # print("Fold", i) 42 | X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test] 43 | clf.fit(X_train, y_train) 44 | y_submission = clf.predict_proba(X_test)[:, 1] 45 | dataset_blend_train[test, j] = y_submission 46 | dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1] 47 | '''对于测试集,直接用这k个模型的预测值均值作为新的特征。''' 48 | dataset_blend_test[:, j] = dataset_blend_test_j.mean(1) 49 | print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j])) 50 | # clf = LogisticRegression() 51 | clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30) 52 | clf.fit(dataset_blend_train, y) 53 | y_submission = clf.predict_proba(dataset_blend_test)[:, 1] 54 | 55 | print("Linear stretch of predictions to [0,1]") 56 | y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min()) 57 | print("blend result") 58 | print("val auc Score: %f" % (roc_auc_score(y_predict, y_submission))) -------------------------------------------------------------------------------- /venv/bdci/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/12 下午3:48 4 | # @Author :hwwu 5 | # @File :merge.py 6 | 7 | import pandas as pd, numpy as np 8 | 9 | path = '/Users/liyangyang/Downloads/bdci/' 10 | 11 | def write_result(id, predictions): 12 | r_id = [] 13 | r_predictions = [] 14 | for i in range(len(id)): 15 | r_id.append(str(id[i])) 16 | r_predictions.append(int(predictions[i])) 17 | 18 | english_column = pd.Series(r_id, name='content_id') 19 | number_column = pd.Series(r_predictions, name='sentiment_value') 20 | predictions = pd.concat([english_column, number_column], axis=1) 21 | predictions.to_csv(path + 'merge_result_data_sentiment_value.csv', index=0, sep=',', columns=['content_id', 'sentiment_value']) 22 | 23 | 24 | # r75 = pd.read_csv(path+'MultinomialNB.csv')['sentiment_value'] 25 | # rcnn = pd.read_csv(path+'LinearSVC.csv')['sentiment_value'] 26 | # rrnn = pd.read_csv(path+'RandomForestClassifier.csv')['sentiment_value'] 27 | # print('r75.shape',r75.shape) 28 | # print('rcnn.shape',rcnn.shape) 29 | # print('rrnn.shape',rrnn.shape) 30 | # 31 | # id = pd.read_csv(path+'MultinomialNB.csv')['content_id'] 32 | # predictions =[] 33 | 
# for i in range(len(r75)): 34 | # # id.append(r75['content_id'][i]) 35 | # if (rcnn[i]==rrnn[i]): 36 | # predictions.append(rcnn[i]) 37 | # else: 38 | # predictions.append(r75[i]) 39 | # 40 | # write_result(id,predictions) 41 | 42 | sentiment_value = pd.read_csv(path+'merge_result_data_sentiment_value.csv') 43 | subject = pd.read_csv(path+'merge_result_data_subject.csv') 44 | content = pd.read_csv(path+'train.csv') 45 | 46 | subject.loc[subject['subject'] == 0, 'subject'] = '动力' 47 | subject.loc[subject['subject'] == 1, 'subject'] = '价格' 48 | subject.loc[subject['subject'] == 2, 'subject'] = '内饰' 49 | subject.loc[subject['subject'] == 3, 'subject'] = '配置' 50 | subject.loc[subject['subject'] == 4, 'subject'] = '安全性' 51 | subject.loc[subject['subject'] == 5, 'subject'] = '外观' 52 | subject.loc[subject['subject'] == 6, 'subject'] = '操控' 53 | subject.loc[subject['subject'] == 7, 'subject'] = '油耗' 54 | subject.loc[subject['subject'] == 8, 'subject'] = '空间' 55 | subject.loc[subject['subject'] == 9, 'subject'] = '舒适性' 56 | 57 | # df = pd.DataFrame({"content_id": sentiment_value['content_id'], "subject": subject['subject'],'sentiment_value':sentiment_value['sentiment_value'].astype(int),'sentiment_word':''}) 58 | # df.to_csv(path+'result.csv', index = False, header=True,encoding='UTF-8') 59 | 60 | content_id = sentiment_value['content_id'] 61 | subject = subject['subject'] 62 | sentiment_value = sentiment_value['sentiment_value'].astype(int) 63 | sentiment_word = content['sentiment_word'][:len(content_id)] 64 | print('sentiment_value',sentiment_value.shape) 65 | predictions = pd.concat([content_id, subject,sentiment_value,sentiment_word], axis=1) 66 | predictions.to_csv(path + 'result.csv', index=0, sep=',', columns=['content_id', 'subject','sentiment_value','sentiment_word'],encoding='UTF-8') -------------------------------------------------------------------------------- /venv/bdci/read_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/12 下午1:48 4 | # @Author :hwwu 5 | # @File :read_data.py 6 | 7 | path = '/Users/liyangyang/Downloads/bdci/' 8 | 9 | import pandas as pd, numpy as np 10 | 11 | train = pd.read_csv(path + 'train.csv')[:2000] 12 | test = pd.read_csv(path + 'test_public.csv')[:1000] 13 | 14 | # y_train = train['sentiment_value'].astype(int) 15 | train.loc[train['subject'] == '动力', 'subject'] = 0 16 | train.loc[train['subject'] == '价格', 'subject'] = 1 17 | train.loc[train['subject'] == '内饰', 'subject'] = 2 18 | train.loc[train['subject'] == '配置', 'subject'] = 3 19 | train.loc[train['subject'] == '安全性', 'subject'] = 4 20 | train.loc[train['subject'] == '外观', 'subject'] = 5 21 | train.loc[train['subject'] == '操控', 'subject'] = 6 22 | train.loc[train['subject'] == '油耗', 'subject'] = 7 23 | train.loc[train['subject'] == '空间', 'subject'] = 8 24 | train.loc[train['subject'] == '舒适性', 'subject'] = 9 25 | y_train = train['subject'] 26 | 27 | 28 | print(train.shape) 29 | print(test.shape) 30 | 31 | stopword_path = '/Users/liyangyang/Downloads/stopwords/stopwords1893.txt' 32 | import jieba 33 | 34 | 35 | def stopwordslist(): 36 | stopwords = [line.strip() for line in open(stopword_path, 'r', encoding='utf-8').readlines()] 37 | # stopwords = [',', '。', '、', '...', '“', '”', '《', '》', ':', ';'] 38 | return stopwords 39 | 40 | 41 | def split_word(line): 42 | result = [] 43 | for i in range(len(line)): 44 | result.append(line[i:i + 1]) 45 | return result 46 | 47 | import codecs 48 | f = 
codecs.open(path+'train_no_lable.txt', 'a', 'utf8') 49 | train_doc_list = [] 50 | for i in range(len(train)): 51 | sentence_seged = jieba.cut(train['content'][i].strip()) 52 | # sentence_seged = split_word(train['content'][i].strip()) 53 | stopwords = stopwordslist() 54 | outstr = '' 55 | for word in sentence_seged: 56 | if word not in stopwords: 57 | if (word != '\t') & (word.strip() != ''): 58 | outstr += word 59 | # outstr += '\t' 60 | outstr += ' ' 61 | # if (outstr == ''): 62 | # outstr = 'NaN' 63 | # outstr +='__myprefix__' 64 | # outstr +=str(y_train[i]) 65 | f.write(outstr+'\n') 66 | train_doc_list.append(outstr) 67 | 68 | train_doc_list = np.array(train_doc_list) 69 | print(train_doc_list.shape) 70 | 71 | test_doc_list = [] 72 | for i in range(len(test)): 73 | sentence_seged = jieba.cut(test['content'][i].strip()) 74 | # sentence_seged = split_word(test['content'][i].strip()) 75 | stopwords = stopwordslist() 76 | outstr = '' 77 | for word in sentence_seged: 78 | if word not in stopwords: 79 | if word != '\t': 80 | outstr += word 81 | outstr += '\t' 82 | if (outstr == ''): 83 | outstr = 'NaN' 84 | test_doc_list.append(outstr) 85 | test_doc_list = np.array(test_doc_list) 86 | print(test_doc_list.shape) 87 | # 88 | from sklearn.feature_extraction.text import CountVectorizer 89 | from sklearn.feature_extraction.text import TfidfVectorizer 90 | # 91 | # count_vec = CountVectorizer(analyzer='word') 92 | # data_train_count = count_vec.fit_transform(train_doc_list) 93 | # data_test_count = count_vec.transform(test_doc_list).toarray() 94 | # #词汇表 95 | # print('\nvocabulary list:\n\n',count_vec.get_feature_names()) 96 | # print( '\nvocabulary dic :\n\n',count_vec.vocabulary_) 97 | # print ('vocabulary:\n\n') 98 | # for key,value in count_vec.vocabulary_.items(): 99 | # print(key,value) 100 | # print('.............') 101 | # print(data_train_count) 102 | 103 | tfidf = TfidfVectorizer( 104 | ngram_range=(1, 1), # 二元文法模型 105 | use_idf=1, 106 | # analyzer='char', 107 | smooth_idf=1) 108 | 109 | data_train_count_tf = tfidf.fit_transform(train_doc_list) 110 | data_test_count_tf = tfidf.transform(test_doc_list) 111 | 112 | print('\nvocabulary list:\n\n',tfidf.get_feature_names()) 113 | print( '\nvocabulary dic :\n\n',tfidf.vocabulary_) 114 | print ('vocabulary:\n\n') 115 | for key,value in tfidf.vocabulary_.items(): 116 | print(key,value) 117 | print('.............') 118 | print(type(data_train_count_tf)) 119 | # 120 | # from sklearn.naive_bayes import MultinomialNB 121 | # from sklearn.model_selection import cross_val_score 122 | # 123 | # clf = MultinomialNB() 124 | # clf.fit(data_train_count, y_train) 125 | # print("多项式贝叶斯分类器20折交叉验证得分: ", np.mean(cross_val_score(clf, data_train_count, y_train, cv=10, scoring='accuracy'))) 126 | # clf.fit(data_train_count_tf, y_train) 127 | # print("多项式贝叶斯分类器TFIDF,20折交叉验证得分: ", 128 | # np.mean(cross_val_score(clf, data_train_count_tf, y_train, cv=10, scoring='accuracy'))) 129 | # # clf_pred = clf.predict(data_test_count_tf) 130 | # # df = pd.DataFrame({"content_id": test['content_id'], "sentiment_value": clf_pred}) 131 | # # df.to_csv(path+'MultinomialNB.csv', index = False, header=True) 132 | # # 133 | # from sklearn import svm 134 | # 135 | # lin_clf = svm.LinearSVC(class_weight='balanced') 136 | # lin_clf.fit(data_train_count, y_train) 137 | # print("svm分类器20折交叉验证得分: ", np.mean(cross_val_score(lin_clf, data_train_count, y_train, cv=10, scoring='accuracy'))) 138 | # lin_clf.fit(data_train_count_tf, y_train) 139 | # print("svm分类器TFIDF,20折交叉验证得分: ", 140 | # 
np.mean(cross_val_score(lin_clf, data_train_count_tf, y_train, cv=10, scoring='accuracy'))) 141 | # # lin_clf_pred = lin_clf.predict(data_test_count_tf) 142 | # # df = pd.DataFrame({"content_id": test['content_id'], "sentiment_value": lin_clf_pred}) 143 | # # df.to_csv(path+'LinearSVC.csv', index = False, header=True) 144 | # 145 | # from sklearn.ensemble import RandomForestClassifier 146 | # 147 | # lin_forest = RandomForestClassifier(n_estimators=10, random_state=1, class_weight='balanced') 148 | # lin_forest.fit(data_train_count, y_train) 149 | # print("RandomForestClassifier分类器20折交叉验证得分: ", 150 | # np.mean(cross_val_score(lin_forest, data_train_count, y_train, cv=10, scoring='accuracy'))) 151 | # lin_forest.fit(data_train_count_tf, y_train) 152 | # print("RandomForestClassifier分类器TFIDF,20折交叉验证得分: ", 153 | # np.mean(cross_val_score(lin_forest, data_train_count_tf, y_train, cv=10, scoring='accuracy'))) 154 | # # lin_forest_pred = lin_forest.predict(data_test_count_tf) 155 | # # df = pd.DataFrame({"content_id": test['content_id'], "sentiment_value": lin_forest_pred}) 156 | # # df.to_csv(path+'RandomForestClassifier.csv', index = False, header=True) 157 | # 158 | # 159 | # import xgboost as xgb 160 | # 161 | # model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=0.0468) 162 | # model_xgb.fit(data_train_count, y_train) 163 | # print("model_xgb分类器20折交叉验证得分: ", 164 | # np.mean(cross_val_score(model_xgb, data_train_count, y_train, cv=10, scoring='accuracy'))) 165 | # model_xgb.fit(data_train_count_tf, y_train) 166 | # print("model_xgb分类器TFIDF,20折交叉验证得分: ", 167 | # np.mean(cross_val_score(model_xgb, data_train_count_tf, y_train, cv=10, scoring='accuracy'))) 168 | # # model_xgb_pred = model_xgb.predict(data_test_count_tf) 169 | # # df = pd.DataFrame({"content_id": test['content_id'], "sentiment_value": model_xgb_pred}) 170 | # # df.to_csv(path+'XGBClassifier.csv', index = False, header=True) 171 | -------------------------------------------------------------------------------- /venv/bdci/snownlp1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/12 下午6:39 4 | # @Author :hwwu 5 | # @File :snownlp1.py 6 | import pickle 7 | import numpy as np 8 | 9 | 10 | def readdumpobj(path): 11 | file = open(path, "rb") 12 | bunch = pickle.load(file) 13 | file.close() 14 | return bunch 15 | 16 | 17 | def outemotionword(path): 18 | emotionset = [] 19 | with open(path, "rb") as fp: 20 | for word in fp: 21 | if not word.isspace(): 22 | word = word.decode("utf-8") 23 | emotionset.append(word.strip()) 24 | return emotionset 25 | 26 | 27 | def loadDataSet(path): # path是为了读入将情感词典 28 | postingList = readdumpobj("D:\linguistic-corpus\postingList\postingList.dat") 29 | classVec = readdumpobj("D:\linguistic-corpus\postingList\classVec.dat") 30 | emotionset = outemotionword(path) 31 | return postingList, classVec, emotionset 32 | 33 | 34 | class NBayes(object): 35 | def __init__(self): 36 | self.vocabulary = [] # 词典,文本set表 37 | self.idf = 0 # 词典的idf权值向量 38 | self.tf = 0 # 训练集的权值矩阵 39 | self.tdm = 0 # P(x|yi) 40 | self.Pcates = {} # P(yi)--是个类别字典 41 | self.labels = [] # 对应每个文本的分类,是个外部导入的列表[0,1,0,1,0,1] 42 | self.doclength = 0 # 训练集文本数,训练文本长度 43 | self.vocablen = 0 # 词典词长,self.vocabulary长度 44 | self.testset = 0 # 测试集 45 | 46 | # 加载训练集并生成词典,以及tf, idf值 47 | def train_set(self, trainset, classVec, emotionset): 48 | self.cate_prob(classVec) # 计算每个分类在数据集中的概率:P(yi) 49 | self.doclength = len(trainset) 50 | tempset = 
set() 51 | [tempset.add(word) for word in emotionset] # 生成词典 52 | self.vocabulary = list(tempset) 53 | self.vocablen = len(self.vocabulary) 54 | # self.calc_wordfreq(trainset) 55 | self.calc_tfidf(trainset) # 生成tf-idf权值 56 | self.build_tdm() # 按分类累计向量空间的每维值:P(x|yi) 57 | 58 | # 生成 tf-idf 59 | def calc_tfidf(self, trainset): 60 | self.idf = np.zeros([1, self.vocablen]) 61 | self.tf = np.zeros([self.doclength, self.vocablen]) 62 | for indx in range(self.doclength): 63 | for word in trainset[indx]: 64 | if word in self.vocabulary: 65 | self.tf[indx, self.vocabulary.index(word)] += 1 66 | # 消除不同句长导致的偏差 67 | self.tf[indx] = self.tf[indx] / float(len(trainset[indx])) 68 | for signleword in set(trainset[indx]): 69 | if signleword in self.vocabulary: 70 | self.idf[0, self.vocabulary.index(signleword)] += 1 71 | self.idf = np.log(float(self.doclength) / (self.idf + 1)) # 防止该词语不在语料中,就会导致分母为零 72 | self.tf = np.multiply(self.tf, self.idf) # 矩阵与向量的点乘 73 | 74 | # 生成普通的词频向量 75 | def calc_wordfreq(self, trainset): 76 | self.idf = np.zeros([1, self.vocablen]) # 1*词典数 77 | self.tf = np.zeros([self.doclength, self.vocablen]) # 训练集文件数*词典数 78 | for indx in range(self.doclength): # 遍历所有的文本 79 | for word in trainset[indx]: # 遍历文本中的每个词 80 | if word in self.vocabulary: 81 | self.tf[indx, self.vocabulary.index(word)] += 1 # 找到文本的词在字典中的位置+1 82 | for signleword in set(trainset[indx]): 83 | if signleword in self.vocabulary: 84 | self.idf[0, self.vocabulary.index(signleword)] += 1 85 | 86 | # 计算每个分类在数据集中的概率:P(yi) 87 | def cate_prob(self, classVec): 88 | self.labels = classVec 89 | labeltemps = set(self.labels) # 获取全部分类 90 | for labeltemp in labeltemps: 91 | # 统计列表中重复的值:self.labels.count(labeltemp) 92 | self.Pcates[labeltemp] = float(self.labels.count(labeltemp)) / float(len(self.labels)) 93 | 94 | # 按分类累计向量空间的每维值:P(x|yi) 95 | def build_tdm(self): 96 | self.tdm = np.zeros([len(self.Pcates), self.vocablen]) # 类别行*词典列 97 | sumlist = np.zeros([len(self.Pcates), 1]) # 统计每个分类的总值 98 | for indx in range(self.doclength): 99 | self.tdm[self.labels[indx]] += self.tf[indx] # 将同一类别的词向量空间值加总 100 | sumlist[self.labels[indx]] = np.sum(self.tdm[self.labels[indx]]) # 统计每个分类的总值--是个标量 101 | self.tdm = self.tdm / sumlist # P(x|yi) 102 | 103 | # 测试集映射到当前词典 104 | def map2vocab(self, testdata): 105 | self.testset = np.zeros([1, self.vocablen]) 106 | # 删除测试集中词不在训练集中 107 | for word in testdata: 108 | if word in self.vocabulary: 109 | self.testset[0, self.vocabulary.index(word)] += 1 110 | 111 | # 输出分类类别 112 | def predict(self, testset): 113 | if np.shape(testset)[1] != self.vocablen: 114 | print("输入错误") 115 | exit(0) 116 | predvalue = 0 117 | predclass = "" 118 | for tdm_vect, keyclass in zip(self.tdm, self.Pcates): 119 | # P(x|yi)P(yi) 120 | temp = np.sum(testset * tdm_vect * self.Pcates[keyclass]) 121 | if temp > predvalue: 122 | predvalue = temp 123 | predclass = keyclass 124 | return predclass 125 | 126 | 127 | if __name__ == "__main__": 128 | postingList, classVec, emotionset = loadDataSet("D:\sentiment-word\emotionword.txt") 129 | testset = postingList[119] 130 | nb = NBayes() # 类的实例化 131 | nb.train_set(postingList, classVec, emotionset) # 训练数据集 132 | nb.map2vocab(testset) # 随机选择一个测试句,这里2表示文本中的第三句话,不是脏话,应输出0。 133 | print(nb.predict(nb.testset)) # 输出分类结果0表示消极,1表示积极 134 | print("分类结束") 135 | -------------------------------------------------------------------------------- /venv/bdci/split_word.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME 
:2018/9/20 下午3:25 4 | # @Author :hwwu 5 | # @File :split_word.py 6 | 7 | path = '/Users/liyangyang/Downloads/bdci/' 8 | 9 | import pandas as pd, numpy as np 10 | 11 | train = pd.read_csv(path + 'train.csv') 12 | print(train.shape) 13 | stopword_path = '/Users/liyangyang/Downloads/stopwords/stopwords1893.txt' 14 | import jieba 15 | import fool 16 | def stopwordslist(): 17 | stopwords = [line.strip() for line in open(stopword_path, 'r', encoding='utf-8').readlines()] 18 | # stopwords = [',', '。', '、', '...', '“', '”', '《', '》', ':', ';'] 19 | return stopwords 20 | 21 | import codecs 22 | f = codecs.open(path+'train_no_lable.txt', 'a', 'utf8') 23 | train_doc_list = [] 24 | for i in range(100): 25 | print(train['content'][i].strip()) 26 | print('..........') 27 | sentence_seged = jieba.cut(train['content'][i].strip()) 28 | outstr = '' 29 | for word in sentence_seged: 30 | if (word != '\t') & (word.strip() != ''): 31 | outstr += word 32 | outstr += ' ' 33 | print(outstr) 34 | print('..........') 35 | 36 | sentence_seged_fool = fool.cut(train['content'][i].strip()) 37 | print(sentence_seged_fool) 38 | print('***********') 39 | -------------------------------------------------------------------------------- /venv/datafountain/guangfudianzhan/__pycache__/read_data.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/guangfudianzhan/__pycache__/read_data.cpython-36.pyc -------------------------------------------------------------------------------- /venv/datafountain/guangfudianzhan/dnn_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/7 下午1:21 4 | # @Author :hwwu 5 | # @File :dnn_model.py 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import sys 10 | 11 | path = '/Users/liyangyang/PycharmProjects/mypy/venv/datafountain/guangfudianzhan/' 12 | sys.path.append(path) 13 | import read_data 14 | 15 | dis = [1, 190, 379, 567, 755, 940, 1123, 1314, 1503, 1505, 1694, 1879, 16 | 2070, 2257, 2444, 2632, 2823, 3013, 3202, 3379, 3567, 3746, 3927, 4089, 17 | 4278, 4459, 4648, 4652, 4821, 5010, 5013, 5017, 5059, 5061, 5069, 5074, 18 | 5077, 5281, 5285, 5287, 5292, 5508, 5703, 5911, 5913, 5916, 5918, 6121, 19 | 6337, 6524, 6528, 6531, 6534, 6723, 6923, 7116, 7326, 7535, 7740, 7937, 20 | 8146, 8245, 8258, 8310, 8488, 8705, 8711, 8878, 9088, 9296, 9505, 9719, 21 | 9916, 10124, 10335, 10544, 10736, 10914, 10917, 11119, 11331, 11540, 22 | 11753, 11963, 12170, 12381, 12592, 12802, 13009, 13214, 13426, 13617, 23 | 13830, 14032, 14243, 14457, 14666, 14882, 15091, 15299, 15508, 15719, 24 | 15937, 16144, 16348, 16540, 16747, 16925, 17133, 17342, 25 | 17527, 17543, 17745, 17876] 26 | 27 | dic = [22, 135, 591, 592, 593, 594, 595, 737, 948, 1070, 1173, 1175, 1286, 28 | 1362, 1451, 1519, 1565, 1666, 1717, 1894, 2137, 2223, 2271, 2414, 29 | 2579, 2797, 2875, 2916, 2986, 2684, 3723, 3597, 3599, 3603, 3605, 30 | 3607, 3610, 3601, 3602, 3421, 3393, 3538, 3539, 3540, 5521, 6016, 31 | 7437, 11832, 16437, 15355, 3152, 3612,3611] 32 | 33 | 34 | def load_train_data(): 35 | train_ = read_data.read_result_data('public.train.csv') 36 | train_x = train_[:, 2:21] 37 | train_y = train_[:, 21] 38 | train_z = train_[:, 1] 39 | 40 | train_len = len(train_y) 41 | train_y.shape = (1, train_len) 42 | train_y = np.transpose(train_y) 43 | 44 | x, y = [], [] 45 | for i in 
range(train_len): 46 | if ((round(train_x[i][0], 2) != 0.01) | (round(train_x[i][1], 1) != 0.1)): 47 | 48 | id = 0.0 49 | for j in range(len(dis)): 50 | if (train_z[i] < dis[j]): 51 | id = 0.5 - np.abs((int(train_z[i]) - dis[j - 1]) / (dis[j] - dis[j - 1]) - 0.5) 52 | break 53 | 54 | if (train_z[i] not in dic): 55 | x.append([ 56 | train_x[i][1], 57 | train_x[i][2], 58 | train_x[i][0], 59 | id, 60 | train_x[i][3], 61 | train_x[i][4], train_x[i][5], train_x[i][6], 62 | train_x[i][7], train_x[i][8], train_x[i][9], 63 | train_x[i][7] / (train_x[i][10] + 0.1), train_x[i][8] / (train_x[i][11] + 0.1), 64 | train_x[i][9] / (train_x[i][12] + 0.1), 65 | train_x[i][10], train_x[i][11], train_x[i][12], 66 | train_x[i][13], train_x[i][14], train_x[i][15], 67 | train_x[i][4] * train_x[i][13], train_x[i][5] * train_x[i][14], train_x[i][6] * train_x[i][15], 68 | train_x[i][18], 69 | train_x[i][17], 70 | train_x[i][16] 71 | ]) 72 | # x.append(train_x[i]) 73 | y.append(abs(train_y[i])) 74 | print(len(x)) 75 | # for i in range(10): 76 | # print(x[i]) 77 | return x, y 78 | 79 | 80 | def load_test_data(): 81 | # train_ = read_data.read_result_data('test_data_all.csv') 82 | train_ = read_data.read_result_data('public.test.csv') 83 | train_x = train_[:, 2:21] 84 | train_y = train_[:, 1] 85 | 86 | train_len = len(train_y) 87 | train_y.shape = (1, train_len) 88 | train_y = np.transpose(train_y) 89 | 90 | x, y = [], [] 91 | for i in range(train_len): 92 | if ((round(train_x[i][0], 2) != 0.01) | (round(train_x[i][1], 1) != 0.1)): 93 | 94 | id = 0.0 95 | for j in range(len(dis)): 96 | if (train_y[i] < dis[j]): 97 | id = 0.5 - np.abs((int(train_y[i]) - dis[j - 1]) / (dis[j] - dis[j - 1]) - 0.5) 98 | break 99 | 100 | if (train_y[i] not in dic): 101 | x.append([ 102 | train_x[i][1], 103 | train_x[i][2], 104 | train_x[i][3], 105 | train_x[i][4], 106 | train_x[i][0], 107 | id, 108 | train_x[i][5], 109 | train_x[i][6], 110 | # train_x[i][7], 111 | # train_x[i][8], 112 | # train_x[i][9], 113 | train_x[i][10], train_x[i][11], train_x[i][12], 114 | train_x[i][13], train_x[i][14], 115 | train_x[i][15], 116 | train_x[i][17], 117 | train_x[i][18], 118 | train_x[i][16] 119 | ]) 120 | # x.append(train_x[i]) 121 | y.append(train_y[i]) 122 | print(len(x)) 123 | return x, y 124 | 125 | 126 | x, y = load_train_data() 127 | 128 | train_x = np.reshape(x[1::1], (-1, 26)) 129 | train_y = np.reshape(y[1::1], (-1, 1)) 130 | test_x = np.reshape(x[1::1], (-1, 26)) 131 | test_y = np.reshape(y[1::1], (-1, 1)) 132 | # 133 | # x1, y1 = load_test_data() 134 | # test_x = np.reshape(x1, (-1, 17)) 135 | # test_y = np.reshape(y1, (-1, 1)) 136 | 137 | feature_columns = [tf.contrib.layers.real_valued_column("", dimension=17)] 138 | classifier = tf.contrib.learn.DNNRegressor(feature_columns=feature_columns, 139 | hidden_units=[1], 140 | optimizer=tf.train.AdamOptimizer( 141 | learning_rate=0.0001 142 | ), 143 | activation_fn=tf.nn.leaky_relu) 144 | # classifier = tf.contrib.learn.DNNLinearCombinedRegressor(dnn_feature_columns=feature_columns, 145 | # dnn_hidden_units=[1], 146 | # dnn_optimizer=tf.train.AdamOptimizer( 147 | # learning_rate=0.001 148 | # )) 149 | classifier.fit(x=train_x, 150 | y=train_y, 151 | max_steps=40000) 152 | 153 | print(classifier.evaluate(x=train_x, y=train_y)) 154 | 155 | y = classifier.predict(test_x) 156 | y_=[] 157 | for i in y: 158 | y_.append([i]) 159 | 160 | # r = [] 161 | # for i in range(8337): 162 | # id = test_y[i][0] 163 | # p = y_[i][0] 164 | # r.append([id, p]) 165 | # 
np.savetxt('/Users/liyangyang/Downloads/datafountain/guangdianfute/test_data_3', r) 166 | 167 | 168 | error = [] 169 | for i in range(len(test_y)): 170 | if((test_y[i] - y_[i]) * (test_y[i] - y_[i]) > 1): 171 | print(test_x[i], test_y[i], y_[i]) 172 | error.append(test_y[i] - y_[i]) 173 | 174 | squaredError = [] 175 | absError = [] 176 | for val in error: 177 | squaredError.append(val * val) # target-prediction之差平方 178 | 179 | print("Square Error: ", sorted(squaredError, reverse=True)) 180 | 181 | print("MSE = ", sum(squaredError) / len(squaredError)) # 均方误差MSE 182 | from math import sqrt 183 | 184 | print("RMSE = ", sqrt(sum(squaredError) / len(squaredError))) # 均方根误差RMSE 185 | -------------------------------------------------------------------------------- /venv/datafountain/guangfudianzhan/draw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # --coding:utf8 -- 3 | # @TIME :2018/8/1 上午10:28 4 | # @Author :hwwu 5 | # @File :draw.py 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | path = '/Users/liyangyang/Downloads/datafountain/guangdianfute/' 14 | 15 | 16 | dic = [22, 135, 591, 592, 593, 594, 595, 737, 948, 1070, 1173, 1175, 1286, 17 | 1362, 1451, 1519, 1565, 1666, 1717, 1894, 2137, 2223, 2271, 2414, 18 | 2579, 2797, 2875, 2916, 2986, 2684, 3723, 3597, 3599, 3603, 3605, 19 | 3607, 3610, 3601, 3602, 3421, 3393, 3538, 3539, 3540, 5521, 6016, 20 | 7437, 11832, 16437, 15355, 3152, 3612, 3611] 21 | 22 | # 板温 现场温度 光照强度 转换效率 转换效率A 转换效率B 转换效率C 电压A 电压B 电压C 23 | # 电流A 电流B 电流C 功率A 功率B 功率C 平均功率 风速 风向 发电量 24 | def draw_data(file='public.train.csv'): 25 | data = pd.read_csv(path + file) 26 | print(data.std()) 27 | # data = data[(data['平均功率'] < 10000.0)] 28 | # data = data[(data['现场温度'] > -1000.0)] 29 | # data = data[(data['转换效率'] < 2000.0)] 30 | data = data[~data['ID'].isin(dic)] 31 | # data = data[(data['电流A'] > 200.0)] 32 | # print(len(data)) 33 | # plt.hist(data['风向']) 34 | # 板温 光照强度 35 | # xs = data['电压C']/data['电流C'] 36 | xs = (data['光照强度']*data['转换效率'])/100/12.5 37 | # xs = data['现场温度'] 38 | ys = data['发电量'] 39 | plt.scatter(xs, ys) 40 | # x = [i for i in range(100)] 41 | # for i in range(1,11): 42 | # strat=4000+i*100 43 | # plt.plot(x, (data['平均功率']/1000*2)[strat:strat+100], color='r', label='yuce') 44 | # plt.plot(x, data['发电量'][strat:strat+100], color='y', label='shiji') 45 | # plt.show() 46 | # strat = 2200 47 | # plt.plot(x, (data['平均功率'] / 1000 * 2)[strat:strat + 200], color='r', label='yuce') 48 | # plt.plot(x, data['发电量'][strat:strat + 200], color='y', label='shiji') 49 | plt.show() 50 | # print(data.head()) 51 | 52 | import seaborn as sns 53 | def neighborhood(file='public.train.csv'): 54 | train = pd.read_csv(path + file) 55 | train.rename(columns={ 56 | '板温': 'a', '现场温度': 'b', '光照强度': 'c', '转换效率': 'd', '转换效率A': 'e', 57 | '转换效率B': 'f', '转换效率C': 'g', '电压A': 'h', '电压B': 'i', '电压C': 'j', 58 | '电流A': 'k', '电流B': 'l', '电流C': 'm', '功率A': 'n', '功率B': 'o', 59 | '功率C': 'p', '平均功率': 'q', '风速': 'r', '风向': 's', '发电量': 't' 60 | }, inplace=True) 61 | k = 10 # number of variables for heatmap 62 | corrmat = train.corr() 63 | cols = corrmat.nlargest(k, 't')['t'].index 64 | cm = np.corrcoef(train[cols].values.T) 65 | sns.set(font_scale=1.25) 66 | hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, 67 | xticklabels=cols.values) 68 | plt.show() 69 | # neighborhood() 70 | 71 | file='public.train.csv' 72 | def t(): 73 
| train = pd.read_csv(path + file) 74 | y = train['发电量'] 75 | train_labels = y.values.copy 76 | print(y.describe()) 77 | sns.distplot(y) 78 | print('Skewness: %f' % y.skew()) 79 | print('Kurtosis: %f' % y.kurt()) 80 | # 得到训练集的数值特征和类别特征 81 | 82 | from scipy.stats import skew 83 | # log transform the target use log(1+x) 84 | train["发电量"] = np.log1p(train["发电量"]) 85 | sns.distplot(train['发电量']) 86 | print("Skewness: %f" % train['发电量'].skew()) 87 | print("Kurtosis: %f" % train['发电量'].kurt()) 88 | # t() 89 | 90 | draw_data() 91 | # draw_data('public.test.csv') 92 | 93 | 94 | from sklearn.feature_selection import SelectKBest 95 | import sklearn 96 | 97 | 98 | import sys 99 | 100 | path = '/Users/liyangyang/PycharmProjects/mypy/venv/datafountain/guangfudianzhan/' 101 | sys.path.append(path) 102 | import read_data 103 | 104 | dis = [1,190,379,567,755,940,1123,1314,1503,1505,1694,1879, 105 | 2070,2257,2444,2632,2823,3013,3202,3379,3567,3746,3927,4089, 106 | 4278,4459,4648,4652,4821,5010,5013,5017,5059,5061,5069,5074, 107 | 5077,5281,5285,5287,5292,5508,5703,5911,5913,5916,5918,6121, 108 | 6337,6524,6528,6531,6534,6723,6923,7116,7326,7535,7740,7937, 109 | 8146,8245,8258,8310,8488,8705,8711,8878,9088,9296,9505,9719, 110 | 9916,10124,10335,10544,10736,10914,10917,11119,11331,11540, 111 | 11753,11963,12170,12381,12592,12802,13009,13214,13426,13617, 112 | 13830,14032,14243,14457,14666,14882,15091,15299,15508,15719, 113 | 15937,16144,16348,16540,16747,16925,17133,17342, 114 | 17527,17543,17745,17876] 115 | 116 | 117 | def load_train_data(): 118 | train_ = read_data.read_result_data('public.train.csv') 119 | train_x = train_[:, 2:21] 120 | train_y = train_[:, 21] 121 | train_z = train_[:, 1] 122 | 123 | train_len = len(train_y) 124 | train_y.shape = (1, train_len) 125 | train_y = np.transpose(train_y) 126 | 127 | x, y = [], [] 128 | for i in range(train_len): 129 | if ((round(train_x[i][0], 2) != 0.01) | (round(train_x[i][1], 1) != 0.1)): 130 | 131 | id = 0.0 132 | for j in range(len(dis)): 133 | if (train_z[i] -1000.0)] 27 | data = data[~((data['板温'] == 0.01) & (data['现场温度'] == 0.1))] 28 | data = data[~(data['ID'].isin(dic))] 29 | 30 | print(data.max()) 31 | 32 | feature_name = [i for i in data.columns if i!='发电量'] 33 | feature_name = [i for i in feature_name if i!='ID'] 34 | train_data = data[feature_name] 35 | train_label = data['发电量'] 36 | 37 | # from sklearn import preprocessing 38 | # min_max_scaler = preprocessing.MinMaxScaler() 39 | # train_data = min_max_scaler.fit_transform(train_data) 40 | # print(feature_name) 41 | 42 | #方差选择法 43 | # from sklearn.feature_selection import VarianceThreshold 44 | # print(VarianceThreshold(threshold=0.03).fit_transform(train_data)[0]) 45 | 46 | #相关系数法 47 | # from sklearn.feature_selection import SelectKBest 48 | # from scipy.stats import pearsonr 49 | # print(feature_name) 50 | # print(train_data[0]) 51 | # from sklearn.feature_selection import f_regression,mutual_info_regression 52 | # for i in range(1,20): 53 | # print(SelectKBest(f_regression, k=i).fit_transform(train_data, train_label)[0]) 54 | # print(SelectKBest(mutual_info_regression, k=i).fit_transform(train_data, train_label)[0]) 55 | 56 | #Pearson相关系数 57 | # from scipy.stats import pearsonr 58 | # for i in range(0,19): 59 | # print(i,pearsonr(train_data[:,i], train_label)) 60 | 61 | from sklearn.feature_selection import RFE 62 | from sklearn.linear_model import LinearRegression 63 | print(train_data.head()) 64 | train_data = np.array(train_data) 65 | train_label = np.array(train_label) 66 | 
print(RFE(estimator=LinearRegression(), n_features_to_select=10).fit_transform(train_data, train_label)[0]) 67 | 68 | -------------------------------------------------------------------------------- /venv/datafountain/guangfudianzhan/find_best_feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/14 上午10:17 4 | # @Author :hwwu 5 | # @File :find_best_feature.py 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from pandas import DataFrame as DF 10 | import xgboost as xgb 11 | 12 | import warnings 13 | 14 | warnings.filterwarnings("ignore") 15 | 16 | dic = [22, 135, 591, 592, 593, 594, 595, 737, 948, 1070, 1173, 1175, 1286, 17 | 1362, 1451, 1519, 1565, 1666, 1717, 1894, 2137, 2223, 2271, 2414, 18 | 2579, 2797, 2875, 2916, 2986, 2684, 3723, 3597, 3599, 3603, 3605, 19 | 3607, 3610, 3601, 3602, 3421, 3393, 3538, 3539, 3540, 5521, 6016, 20 | 7437, 11832, 15355, 3152, 3612, 3611] 21 | 22 | path = '/Users/liyangyang/Downloads/datafountain/guangdianfute/' 23 | file = 'public.train.csv' 24 | data = pd.read_csv(path + file) 25 | data = data[(data['平均功率'] < 10000.0)] 26 | data = data[(data['现场温度'] > -1000.0)] 27 | data = data[(data['转换效率'] < 500.0)] 28 | data = data[~((data['板温'] == 0.01) & (data['现场温度'] == 0.1))] 29 | data = data[~(data['ID'].isin(dic))] 30 | 31 | train = data[::1] 32 | test = data[::1] 33 | 34 | feature_name = [i for i in data.columns if i != '发电量'] 35 | feature_name = [i for i in feature_name if i != 'ID'] 36 | feature_name = [i for i in feature_name if i != '现场温度'] 37 | # feature_name = [i for i in feature_name if i != '转换效率'] 38 | # feature_name = [i for i in feature_name if i != '功率A'] 39 | # feature_name = [i for i in feature_name if i != '功率B'] 40 | # feature_name = [i for i in feature_name if i != '功率C'] 41 | 42 | train_data = train[feature_name] 43 | train_label = train['发电量'] 44 | test_data = test[feature_name] 45 | train_label = np.array(train_label) 46 | 47 | from sklearn import preprocessing 48 | min_max_scaler = preprocessing.MinMaxScaler() 49 | train_data = min_max_scaler.fit_transform(train_data) 50 | train_data = DF(train_data, columns=(i for i in feature_name)) 51 | print(train_data.head()) 52 | 53 | xgb_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 54 | learning_rate=0.005, max_depth=23, 55 | max_delta_step=100000, 56 | min_child_weight=1.7817, n_estimators=2200, 57 | reg_alpha=0.4640, reg_lambda=0.8571, 58 | subsample=0.5213, silent=1, 59 | random_state=7, nthread=-1) 60 | 61 | 62 | def rmse_my(y_test, y_): 63 | error = [] 64 | for i in range(len(y_test)): 65 | error.append(y_test[i] - y_[i]) 66 | 67 | squaredError = [] 68 | for val in error: 69 | squaredError.append(val * val) # target-prediction之差平方 70 | from math import sqrt 71 | RMSE = sqrt(sum(squaredError) / len(squaredError)) 72 | print("RMSE = ", RMSE) # 均方根误差RMSE 73 | return RMSE 74 | 75 | 76 | def get_division_feature(data, feature_name): 77 | new_feature = [] 78 | new_feature_name = [] 79 | for i in range(len(data[feature_name].columns) - 1): 80 | for j in range(i + 1, len(data[feature_name].columns)): 81 | new_feature_name.append(data[feature_name].columns[i] + '/' + data[feature_name].columns[j]) 82 | new_feature_name.append(data[feature_name].columns[i] + '*' + data[feature_name].columns[j]) 83 | new_feature_name.append(data[feature_name].columns[i] + '+' + data[feature_name].columns[j]) 84 | new_feature_name.append(data[feature_name].columns[i] + '-' + 
data[feature_name].columns[j]) 85 | new_feature.append(data[data[feature_name].columns[i]] / data[data[feature_name].columns[j]]) 86 | new_feature.append(data[data[feature_name].columns[i]] * data[data[feature_name].columns[j]]) 87 | new_feature.append(data[data[feature_name].columns[i]] + data[data[feature_name].columns[j]]) 88 | new_feature.append(data[data[feature_name].columns[i]] - data[data[feature_name].columns[j]]) 89 | 90 | temp_data = DF(pd.concat(new_feature, axis=1)) 91 | temp_data.columns = new_feature_name 92 | data = pd.concat([temp_data], axis=1).reset_index(drop=True) 93 | # print(data.shape) 94 | return data.reset_index(drop=True) 95 | 96 | 97 | def get_square_feature(data, feature_name): 98 | new_feature = [] 99 | new_feature_name = [] 100 | for i in range(len(data[feature_name].columns)): 101 | new_feature_name.append(data[feature_name].columns[i] + '**2') 102 | new_feature_name.append(data[feature_name].columns[i] + '**1/2') 103 | new_feature.append(data[data[feature_name].columns[i]] ** 2) 104 | new_feature.append(data[data[feature_name].columns[i]] ** (1 / 2)) 105 | temp_data = DF(pd.concat(new_feature, axis=1)) 106 | temp_data.columns = new_feature_name 107 | data = pd.concat([temp_data], axis=1).reset_index(drop=True) 108 | # print(data.shape) 109 | return data.reset_index(drop=True) 110 | 111 | 112 | def find_best_feature(feature_name): 113 | get_ans_face = feature_name 114 | xgb_model.fit(train_data[get_ans_face], train_label) 115 | y_ = xgb_model.predict(train_data[get_ans_face]) 116 | m = rmse_my(train_label, y_) 117 | return m 118 | 119 | 120 | train_datatrain_d = get_square_feature(train_data, feature_name) 121 | train_data_division = get_division_feature(train_data, feature_name) 122 | train_data = pd.concat([train_datatrain_d, train_data_division, train_data], axis=1) 123 | feature_name = [i for i in train_data.columns] 124 | print(train_data.shape) 125 | 126 | print(feature_name) 127 | 128 | now_feature = [] 129 | # check = 0.05416978387299058 130 | # d = [1,2,3,5,6,7,8,9,10,21,22,27,31,32,33,34,35,36,37,38,39,40,42,43,44,46,47,48,49,55,56,60,61,65,66,78,79,80,82, 131 | # 103,104,108,109,110,111,112,128,129,130,131,214,215,221,222,243,247,248,251,252] 132 | # 133 | # for i in d: 134 | # now_feature.append(feature_name[i-1]) 135 | # for i in range(354,len(feature_name)): 136 | # check = 0.05878801229207516 137 | d = [1, 2, 3, 4, 5, 6, 7, 9, 10, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34, 82, 83, 84, 85, 86, 87, 138 | 89, 90, 94, 95, 96, 97, 98, 122, 123, 124, 125, 126, 127, 128, 129, 132, 133, 136, 137, 163, 164, 167, 168, 170, 139 | 171, 172, 176, 177, 179, 180, 214] 140 | 141 | # for i in d: 142 | # now_feature.append(feature_name[i - 1]) 143 | # for i in range(324, len(feature_name)): 144 | # now_feature.append(feature_name[i]) 145 | # jj = find_best_feature(now_feature) 146 | # if jj < check: 147 | # print('目前特征长度为', len(now_feature), ' 目前帅气的RSME为值是', jj, ' 成功加入第', i + 1, '个', 'RSME降低', check - jj) 148 | # check = jj 149 | # else: 150 | # print('尝试加入第', i + 1, '个特征失败') 151 | # now_feature.pop() 152 | # print(now_feature) 153 | # 154 | now_feature2 = [] 155 | check = 100 156 | for i in range(len(feature_name)): 157 | now_feature2.append(feature_name[len(feature_name)-i-1]) 158 | jj = find_best_feature(now_feature2) 159 | if jj -1000.0)] 17 | # data = data[(data['转换效率'] < 3000.0)] 18 | data = data[~((data['板温']==0.01)&(data['现场温度']==0.1))] 19 | 20 | return data 21 | 22 | 23 | def load_original_data(file='public.train.csv'): 24 | train 
= read_original_data(file) 25 | return train.reset_index() 26 | 27 | 28 | def read_result_data(file='public.train.csv'): 29 | train = load_original_data(file) 30 | result = np.array(train) 31 | print(result.shape) 32 | return result 33 | 34 | 35 | def write_test_result1(): 36 | train_ = read_result_data('public.test.csv') 37 | train_x = train_[:, 2:21] 38 | train_y = train_[:, 1] 39 | 40 | res = [] 41 | 42 | train_len = len(train_y) 43 | train_y.shape = (1, train_len) 44 | train_y = np.transpose(train_y) 45 | 46 | for i in range(train_len): 47 | if ((round(train_x[i][0], 2) == 0.01) & (round(train_x[i][1], 1) == 0.1)): 48 | res.append([train_y[i], 0.379993053]) 49 | 50 | print(len(res)) 51 | np.savetxt(path + 'test_data_1', res) 52 | 53 | 54 | def write_test_result(): 55 | train_1 = read_result_data('public.test.csv') 56 | train_2 = read_result_data('public.train.csv') 57 | train_x = train_1[:, 1:21] 58 | train_y = train_2[:, 1:21] 59 | 60 | train = np.vstack([train_x, train_y]) 61 | 62 | train_a = train[:, ::-1].T 63 | train_a2 = np.lexsort(train_a) 64 | train = train[train_a2] 65 | 66 | np.savetxt(path + 'test_data_all.csv', train, fmt="%.2f", delimiter=',') 67 | 68 | 69 | # write_test_result() 70 | 71 | 72 | def write_result(): 73 | x1 = np.loadtxt(path + 'test_data_1') 74 | x2 = np.loadtxt(path + 'test_data_3') 75 | 76 | user_id = [] 77 | price = [] 78 | for i in range(len(x1)): 79 | user_id.append(int(x1[i][0])) 80 | # price.append(round(x1[i][1],1)) 81 | price.append(round(x1[i][1], 7)) 82 | for i in range(len(x2)): 83 | user_id.append(int(x2[i][0])) 84 | price.append(round(x2[i][1], 7)) 85 | english_column = pd.Series(user_id) 86 | number_column = pd.Series(price) 87 | predictions = pd.concat([english_column, number_column], axis=1) 88 | # another way to handle 89 | # save = pd.DataFrame({'user_id': user_id, 'prediction_pay_price': price}) 90 | predictions.to_csv(path + 'result_data.csv', index=0, sep=',') 91 | 92 | 93 | # write_result() 94 | 95 | def write_result2(): 96 | t = read_result_data('public.test.csv') 97 | x1 = np.loadtxt(path + 'test_data_all_2') 98 | x2 = np.loadtxt(path + 'test_data_3') 99 | t1 = t[:, 1] 100 | map = {} 101 | r = [] 102 | map2 = {} 103 | for i in range(len(x2)): 104 | map2[int(x2[i][0])] = x2[i][1] 105 | for i in range(8409): 106 | if ((round(t[i][0], 2) != 0.01) | (round(t[i][1], 1) != 0.1)): 107 | map[int(t1[i])] = 0 108 | for i in range(len(x1)): 109 | a1 = int(x1[i][0]) 110 | a2 = x1[i][2] 111 | if (a1==16437): 112 | r.append([a1,9.911484700000000814e+00]) 113 | print(a1) 114 | elif (a1 in map2.keys()): 115 | r.append([a1, map2[a1]]) 116 | elif (a1 in map.keys()): 117 | r.append([a1, a2]) 118 | # else: 119 | # r.append([a1,a2]) 120 | 121 | np.savetxt('/Users/liyangyang/Downloads/datafountain/guangdianfute/test_data_2', r) 122 | 123 | 124 | # write_result2() 125 | 126 | 127 | def mid_merge_r(): 128 | x1 = np.loadtxt(path + 'test_data_all_1') 129 | t = read_result_data('public.train.csv') 130 | t1 = t[:, 1] 131 | t2 = t[:, 21] 132 | map = {} 133 | r = [] 134 | for i in range(9000): 135 | if ((round(t[i][0], 2) != 0.01) | (round(t[i][1], 1) != 0.1)): 136 | map[int(t1[i])] = t2[i] 137 | for i in range(len(x1)): 138 | a1 = int(x1[i][0]) 139 | a2 = x1[i][1] 140 | if (a1 in map.keys()): 141 | r.append([a1, a2, round(map[a1], 7)]) 142 | else: 143 | r.append([a1, a2, 0.0]) 144 | 145 | np.savetxt('/Users/liyangyang/Downloads/datafountain/guangdianfute/test_data_all_2', r) 146 | 147 | 148 | # mid_merge_r() 149 | 150 | import matplotlib.pyplot as plt 151 | 152 
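# --- illustrative aside (not part of the original script): the row sort in
# write_test_result() above relies on np.lexsort, which treats the *last* key as
# the primary one; reversing the columns and transposing therefore makes column 0
# the primary sort key, with the remaining columns breaking ties in order.
# A minimal, verifiable sketch of the idiom:
#
#   import numpy as np
#   rows = np.array([[2, 9], [1, 5], [2, 1]])
#   order = np.lexsort(rows[:, ::-1].T)   # sort by column 0, then column 1
#   print(rows[order])                    # [[1 5] [2 1] [2 9]]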
| 153 | def plot(): 154 | x1 = np.loadtxt(path + 'test_data_all_2') 155 | s = 90*100 156 | b = 100 157 | e = s + b 158 | x = [i for i in range(s, e)] 159 | # 以折线图表示结果 160 | plt.figure() 161 | plt.plot(x, x1[s:e,1], color='r', label='yuce') 162 | plt.plot(x, x1[s:e, 2], color='y', label='shiji') 163 | plt.xlabel("Time(s)") # X轴标签 164 | plt.ylabel("Value") # Y轴标签 165 | plt.show() 166 | 167 | # plot() 168 | 169 | if __name__ == '__main__': 170 | pass -------------------------------------------------------------------------------- /venv/datafountain/guangfudianzhan/rnn_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/6/21 下午1:27 4 | # @Author :hwwu 5 | # @File :PricePredictor.py 6 | 7 | import codecs 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import tensorflow as tf 11 | import pandas as pd 12 | 13 | 14 | class PricePredictor: 15 | # lstm param 16 | timeStep = 19 17 | hiddenUnitSize = 38 # 隐藏层神经元数量 18 | batchSize = 88 # 每一批次训练多少个样例 19 | inputSize = 19 # 输入维度 20 | outputSize = 1 # 输出维度 21 | lr = 0.0001 # 学习率 22 | train_x, train_y = [], [] # 训练数据集 23 | dataFile = '/Users/liyangyang/Downloads/datafountain/guangdianfute/public.train.csv' 24 | testFile = '/Users/liyangyang/Downloads/datafountain/guangdianfute/public.test.csv' 25 | train_data = [] 26 | X = tf.placeholder(tf.float32, [None, timeStep, inputSize]) 27 | Y = tf.placeholder(tf.float32, [None, timeStep]) 28 | # Y = tf.placeholder(tf.float32, [None, timeStep, outputSize]) 29 | weights = { 30 | 'in': tf.Variable(tf.random_normal([inputSize, hiddenUnitSize])), 31 | 'out': tf.Variable(tf.random_normal([hiddenUnitSize, 1])) 32 | } 33 | 34 | biases = { 35 | 'in': tf.Variable(tf.constant(0.1, shape=[hiddenUnitSize, ])), 36 | 'out': tf.Variable(tf.constant(0.1, shape=[1, ])) 37 | } 38 | 39 | savePath = '/Users/liyangyang/PycharmProjects/mypy/venv/datafountain/guangfudianzhan/model/stock.train.model' 40 | 41 | def loadData(self): 42 | data = pd.read_csv(self.dataFile) 43 | data = np.array(data) 44 | train_len = len(data) 45 | train = [] 46 | for i in range(train_len): 47 | if ((round(data[i][1], 2) != 0.01) | (round(data[i][2], 1) != 0.1)): 48 | if (data[i][2] < -1000): 49 | print(data[i][2]) 50 | data[i][2] = -6.0 51 | if (data[i][19] > 360): 52 | data[i][19] -= 360 53 | if (data[i][20] < 0): 54 | data[i][20] = -data[i][20] 55 | train.append(data[i]) 56 | print(len(train)) 57 | self.train_data = np.array(train) 58 | 59 | # 构造数据 60 | def buildTrainDataSet(self): 61 | x_ = self.train_data[:, 1:20] 62 | y_ = self.train_data[:, 20] 63 | for i in range(len(self.train_data) - self.timeStep - 1): 64 | x = x_[i:i + self.timeStep] 65 | y = y_[i:i + self.timeStep] 66 | self.train_x.append(x.tolist()) 67 | self.train_y.append(y.tolist()) 68 | 69 | # lstm算法定义 70 | def lstm(self, batchSize=None): 71 | if batchSize is None: 72 | batchSize = self.batchSize 73 | weightIn = self.weights['in'] 74 | biasesIn = self.biases['in'] 75 | input = tf.reshape(self.X, [-1, self.inputSize]) 76 | inputRnn = tf.matmul(input, weightIn) + biasesIn 77 | inputRnn = tf.reshape(inputRnn, [-1, self.timeStep, self.hiddenUnitSize]) # 将tensor转成3维,作为lstm cell的输入 78 | # cell=tf.nn.rnn_cell.BasicLSTMCell(self.hiddenUnitSize, reuse=True) 79 | # initState=cell.zero_state(batchSize,dtype=tf.float32) 80 | # output_rnn,final_states=tf.nn.dynamic_rnn(cell, inputRnn,initial_state=initState, dtype=tf.float32) #output_rnn是记录lstm每个输出节点的结果,final_states是最后一个cell的结果 81 | 
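# --- illustrative note (an assumption about TF 1.x behaviour, not original code):
# step 4 below builds the stack with `[lstm_cell] * 5`, i.e. five references to
# the *same* cell object, so the layers end up sharing one set of weights (and
# newer TF 1.x releases reject the reuse outright). A sketch of the usual fix is
# to construct a fresh cell per layer:
#
#   def make_cell():
#       cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hiddenUnitSize,
#                                           forget_bias=1.0, state_is_tuple=True)
#       return tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.5)
#
#   mlstm_cell = tf.nn.rnn_cell.MultiRNNCell([make_cell() for _ in range(5)],
#                                            state_is_tuple=True)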
82 | # **步骤2:定义一层 LSTM_cell,只需要说明 hidden_size, 它会自动匹配输入的 X 的维度 83 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hiddenUnitSize, forget_bias=1.0, state_is_tuple=True) 84 | # **步骤3:添加 dropout layer, 一般只设置 output_keep_prob 85 | 86 | # 运行test的时候注释掉这段,不能dropout 87 | lstm_cell = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_cell, input_keep_prob=1.0, output_keep_prob=0.5) 88 | # **步骤4:调用 MultiRNNCell 来实现多层 LSTM 89 | mlstm_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * 5, state_is_tuple=True) 90 | # **步骤5:用全零来初始化state 91 | init_state = mlstm_cell.zero_state(batchSize, dtype=tf.float32) 92 | output_rnn, final_states = tf.nn.dynamic_rnn(mlstm_cell, inputRnn, initial_state=init_state, 93 | dtype=tf.float32) # output_rnn是记录lstm每个输出节点的结果,final_states是最后一个cell的结果 94 | 95 | output = tf.reshape(output_rnn, [-1, self.hiddenUnitSize]) # 作为输出层的输入 96 | w_out = self.weights['out'] 97 | b_out = self.biases['out'] 98 | pred = tf.matmul(output, w_out) + b_out 99 | return pred, final_states 100 | 101 | # 训练模型 102 | def trainLstm(self): 103 | pred, _ = self.lstm() 104 | # 定义损失函数 105 | loss = tf.sqrt(tf.reduce_mean(tf.square(tf.reshape(pred, [-1]) - tf.reshape(self.Y, [-1])))) 106 | # 定义训练模型 107 | train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) 108 | saver = tf.train.Saver(tf.global_variables()) 109 | with tf.Session() as sess: 110 | # sess.run(tf.global_variables_initializer()) 111 | saver.restore(sess,self.savePath) 112 | # 重复训练100次,训练是一个耗时的过程 113 | for i in range(1000): 114 | step = 0 115 | start = 0 116 | end = start + self.batchSize 117 | while end < len(self.train_x): 118 | _, loss_ = sess.run([train_op, loss], feed_dict={self.X: self.train_x[start:end], 119 | self.Y: self.train_y[start:end]}) 120 | # start += 1 121 | start += self.batchSize 122 | end = start + self.batchSize 123 | # 每10步保存一次参数 124 | if step % 500 == 0: 125 | print('test loss is :', i, loss_) 126 | if (i % 10 == 0) & (step % 500 == 0): 127 | print("保存模型") 128 | saver.save(sess, self.savePath) 129 | step += 1 130 | 131 | def prediction(self): 132 | pred, _ = self.lstm() # 预测时只输入[1,time_step,inputSize]的测试数据 133 | saver = tf.train.Saver(tf.global_variables()) 134 | with tf.Session() as sess: 135 | # 参数恢复 136 | saver.restore(sess, self.savePath) 137 | # 取训练集最后一行为测试样本. 
shape=[1,time_step,inputSize] 138 | result = [] 139 | start = 20 140 | end = start + self.batchSize 141 | # while end < len(self.train_x): 142 | pred = sess.run([pred], feed_dict={self.X: self.train_x[start:end] 143 | }) 144 | # 以折线图表示结果 145 | p = np.reshape(pred, [self.batchSize, -1]) 146 | s = 0 147 | b = self.timeStep 148 | x = [i for i in range(s, b*19)] 149 | # 以折线图表示结果 150 | plt.figure() 151 | plt.plot(x, p[0], color='r', label='yuce') 152 | plt.plot(x, self.train_y[s:b], color='y', label='shiji') 153 | plt.xlabel("Time(s)") # X轴标签 154 | plt.ylabel("Value") # Y轴标签 155 | plt.show() 156 | 157 | 158 | predictor = PricePredictor() 159 | predictor.loadData() 160 | 161 | # 构建训练数据 162 | predictor.buildTrainDataSet() 163 | 164 | # # 模型训练 165 | predictor.trainLstm() 166 | # 167 | # # 预测-预测前需要先完成模型训练 168 | # predictor.prediction() 169 | -------------------------------------------------------------------------------- /venv/datafountain/taocan/ml_models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/10/15 上午11:28 4 | # @Author :hwwu 5 | # @File :ml_models.py 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | path = '/Users/liyangyang/Downloads/datafountain/taocan/' 11 | 12 | 13 | ###service_type,is_mix_service,online_time,1_total_fee,2_total_fee,3_total_fee,4_total_fee, 14 | # month_traffic,many_over_bill,contract_type,contract_time,is_promise_low_consume,net_service, 15 | # pay_times,pay_num,last_month_traffic,local_trafffic_month,local_caller_time,service1_caller_time, 16 | # service2_caller_time,gender,age,complaint_level,former_complaint_num,former_complaint_fee, 17 | # current_service,user_id 18 | def getdata(data, f=True): 19 | # data = pd.read_csv(path + 'train_all.csv') 20 | if f: 21 | data.loc[data['current_service'] == 90063345, 'current_service'] = 0 22 | data.loc[data['current_service'] == 89950166, 'current_service'] = 1 23 | data.loc[data['current_service'] == 89950167, 'current_service'] = 2 24 | data.loc[data['current_service'] == 99999828, 'current_service'] = 3 25 | data.loc[data['current_service'] == 90109916, 'current_service'] = 4 26 | data.loc[data['current_service'] == 89950168, 'current_service'] = 5 27 | data.loc[data['current_service'] == 99999827, 'current_service'] = 6 28 | data.loc[data['current_service'] == 99999826, 'current_service'] = 7 29 | data.loc[data['current_service'] == 90155946, 'current_service'] = 8 30 | data.loc[data['current_service'] == 99999830, 'current_service'] = 9 31 | data.loc[data['current_service'] == 99999825, 'current_service'] = 10 32 | data.loc[data['age'] == '\\N', 'age'] = 0 33 | data.loc[data['gender'] == '\\N', 'gender'] = 0 34 | 35 | data['age'] = data['age'].astype('int64') 36 | data.loc[data['age'] < 20, 'age'] = 0 37 | data.loc[(data['age'] >= 20) & (data['age'] < 30), 'age'] = 1 38 | data.loc[(data['age'] >= 30) & (data['age'] < 40), 'age'] = 2 39 | data.loc[(data['age'] >= 40) & (data['age'] < 50), 'age'] = 3 40 | data.loc[data['age'] >= 50, 'age'] = 4 41 | 42 | data['gender'] = data['gender'].astype('int64') 43 | 44 | data.loc[data['2_total_fee'] == '\\N', '2_total_fee'] = 0.0 45 | data.loc[data['3_total_fee'] == '\\N', '3_total_fee'] = 0.0 46 | data['2_total_fee'] = data['2_total_fee'].astype('float64') 47 | data['3_total_fee'] = data['3_total_fee'].astype('float64') 48 | data.loc[data['1_total_fee'] > 500.0, '1_total_fee'] = 500.0 49 | data.loc[data['2_total_fee'] > 500.0, '2_total_fee'] = 500.0 50 | 
data.loc[data['3_total_fee'] > 500.0, '3_total_fee'] = 500.0 51 | data.loc[data['4_total_fee'] > 500.0, '4_total_fee'] = 500.0 52 | 53 | data['total_fee'] = 0 54 | data.loc[data['1_total_fee'] < .0, 'total_fee'] = 1 55 | data.loc[data['2_total_fee'] < .0, 'total_fee'] = 1 56 | data.loc[data['3_total_fee'] < .0, 'total_fee'] = 1 57 | data.loc[data['4_total_fee'] < .0, 'total_fee'] = 1 58 | data.loc[data['1_total_fee'] > 499.0, 'total_fee'] = 2 59 | data.loc[data['2_total_fee'] > 499.0, 'total_fee'] = 2 60 | data.loc[data['3_total_fee'] > 499.0, 'total_fee'] = 2 61 | data.loc[data['4_total_fee'] > 499.0, 'total_fee'] = 2 62 | 63 | data['month_traffic_0'] = 0 64 | data.loc[(data['month_traffic'] > 0) & (data['month_traffic'] < 1024), 'month_traffic_0'] = 1 65 | data.loc[data['month_traffic'] == 1024.0, 'month_traffic_0'] = 2 66 | data.loc[data['month_traffic'] > 1024, 'month_traffic_0'] = 3 67 | 68 | data.loc[data['online_time'] > 140, 'online_time'] = 140 69 | 70 | data['pay_ave'] = data['pay_num'] / data['pay_times'] 71 | data.loc[data['pay_times'] > 10, 'pay_times'] = 10 72 | 73 | data['my_traffic'] = data['last_month_traffic'].apply(lambda x: parse_traffic(x)) 74 | 75 | data = data.drop(['local_trafffic_month'], axis=1) 76 | data = data.drop(['last_month_traffic'], axis=1) 77 | data = data.drop(['month_traffic'], axis=1) 78 | 79 | data.loc[data['local_caller_time'] == 0.0, 'local_caller_time'] = 0 80 | data.loc[(data['local_caller_time'] > 0) & (data['local_caller_time'] < 10), 'local_caller_time'] = 1 81 | data.loc[(data['local_caller_time'] >= 10) & (data['local_caller_time'] < 100), 'local_caller_time'] = 2 82 | data.loc[data['local_caller_time'] >= 100, 'local_caller_time'] = 3 83 | 84 | data.loc[data['service1_caller_time'] == 0.0, 'service1_caller_time'] = 0 85 | data.loc[(data['service1_caller_time'] > 0) & (data['service1_caller_time'] < 10), 'service1_caller_time'] = 1 86 | data.loc[(data['service1_caller_time'] >= 10) & (data['service1_caller_time'] < 100), 'service1_caller_time'] = 2 87 | data.loc[data['service1_caller_time'] >= 100, 'service1_caller_time'] = 3 88 | 89 | data.loc[data['service2_caller_time'] == 0.0, 'service2_caller_time'] = 0 90 | data.loc[(data['service2_caller_time'] > 0) & (data['service2_caller_time'] < 10), 'service2_caller_time'] = 1 91 | data.loc[(data['service2_caller_time'] >= 10) & (data['service2_caller_time'] < 100), 'service2_caller_time'] = 2 92 | data.loc[data['service2_caller_time'] >= 100, 'service2_caller_time'] = 3 93 | 94 | data['complaint_num'] = 0 95 | data.loc[data['former_complaint_num'] > 0, 'complaint_num'] = 1 96 | 97 | data['complaint_fee'] = 0 98 | data.loc[data['former_complaint_fee'] > 0, 'complaint_fee'] = 1 99 | 100 | return data 101 | 102 | 103 | def parse_traffic(x): 104 | m = x / 1024.0 105 | if m == 0.0: 106 | return 0 107 | elif m < 1.0: 108 | return 0.5 109 | elif m == 1.0: 110 | return 1 111 | elif m < 2.0: 112 | return 1.5 113 | elif m == 2.0: 114 | return 2 115 | elif m < 3.0: 116 | return 2.5 117 | elif m == 3.0: 118 | return 3 119 | elif m < 4.0: 120 | return 3.5 121 | elif m == 4.0: 122 | return 4 123 | else: 124 | return 5 125 | 126 | 127 | data = pd.read_csv(path + 'train_all.csv') 128 | data = getdata(data) 129 | train_data = data 130 | train_x = train_data.drop(['user_id', 'current_service'], axis=1) 131 | train_y = train_data['current_service'] 132 | 133 | ####### test数据 134 | republish_test_data = pd.read_csv(path + 'republish_test.csv') 135 | republish_test_data = getdata(republish_test_data, f=False) 136 | # 
print('republish_test_data: ', republish_test_data.shape) 137 | 138 | user_id = republish_test_data['user_id'] 139 | republish_test = republish_test_data.drop(['user_id'], axis=1) 140 | 141 | from sklearn.model_selection import train_test_split 142 | 143 | Y_CAT = pd.Categorical(train_y) 144 | X_train, X_test, y_train, y_test = train_test_split(train_x, Y_CAT.codes, test_size=0.05, random_state=666) 145 | 146 | y_test = np.array(y_test) 147 | 148 | 149 | def score(y_pred): 150 | y_pred = [list(x).index(max(x)) for x in y_pred] 151 | count = 0 152 | for i in range(len(y_pred)): 153 | # print(test_y[i:i+1][0]) 154 | if (y_pred[i] == y_test[i:i + 1][0]): 155 | # print(y_pred[i], test_y[i:i + 1][0]) 156 | count += 1 157 | print(count, len(y_pred), count / len(y_pred)) 158 | 159 | 160 | from sklearn.naive_bayes import MultinomialNB 161 | from sklearn.model_selection import cross_val_score 162 | 163 | # clf = MultinomialNB() 164 | # clf.fit(X_train, y_train) 165 | # print("多项式贝叶斯分类器20折交叉验证得分: ", np.mean(cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy'))) 166 | # score(clf.predict(X_test)) 167 | # 168 | from sklearn import svm 169 | 170 | lin_clf = svm.LinearSVC(class_weight='balanced') 171 | lin_clf.fit(X_train, y_train) 172 | print("svm分类器20折交叉验证得分: ", np.mean(cross_val_score(lin_clf, X_train, y_train, cv=5, scoring='accuracy'))) 173 | score(lin_clf.predict(X_test)) 174 | 175 | from sklearn.ensemble import RandomForestClassifier 176 | 177 | lin_forest = RandomForestClassifier(n_estimators=10, random_state=1, class_weight='balanced') 178 | lin_forest.fit(X_train, y_train) 179 | print("RandomForestClassifier分类器20折交叉验证得分: ", 180 | np.mean(cross_val_score(lin_forest, X_train, y_train, cv=5, scoring='accuracy'))) 181 | score(lin_forest.predict(X_test)) 182 | 183 | import xgboost as xgb 184 | 185 | model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=0.0468) 186 | model_xgb.fit(X_train, y_train) 187 | print("model_xgb分类器20折交叉验证得分: ", 188 | np.mean(cross_val_score(model_xgb, X_train, y_train, cv=5, scoring='accuracy'))) 189 | score(model_xgb.predict(X_test)) 190 | -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "/Users/liyangyang/PycharmProjects/mypy/deep_learning/venv/datafountain/taocan/tf_model/stock.model" 2 | all_model_checkpoint_paths: "/Users/liyangyang/PycharmProjects/mypy/deep_learning/venv/datafountain/taocan/tf_model/stock.model.max" 3 | all_model_checkpoint_paths: "/Users/liyangyang/PycharmProjects/mypy/deep_learning/venv/datafountain/taocan/tf_model/stock.model" 4 | -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.data-00000-of-00001 -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.index 
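A side note on the hand-rolled score() helper in ml_models.py above: it argmaxes each element (`list(x).index(max(x))`), so it expects rows of per-class scores, while it is fed the label vectors that predict() returns. A minimal sketch of the same held-out check using sklearn.metrics (the names `evaluate` and `model` are illustrative assumptions, not part of the original code):

    from sklearn.metrics import accuracy_score, f1_score

    def evaluate(model, X_test, y_test):
        # predict() already returns class indices, so no argmax step is needed
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        macro_f1 = f1_score(y_test, y_pred, average='macro')
        print('accuracy:', round(acc, 4), 'macro-F1:', round(macro_f1, 4))
        return acc, macro_f1

    # e.g. evaluate(lin_clf, X_test, y_test) after the LinearSVC fit above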
-------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.max.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.max.data-00000-of-00001 -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.max.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.max.index -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.max.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.max.meta -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.meta -------------------------------------------------------------------------------- /venv/dc/guangfu/github/README.md: -------------------------------------------------------------------------------- 1 | 基础特征做的baseline 线上1.65左右 2 | 3 | 提分点: 4 | 1、好好做下特征工程 5 | 2、模型融合 6 | 可以参考下这个 https://mp.weixin.qq.com/s/Yix0xVp2SiqaAcuS6Q049g 7 | 8 | 9 | -------------------------------------------------------------------------------- /venv/dc/guangfu/github/baseline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/10/25 上午11:07 4 | # @Author :hwwu 5 | # @File :baseline.py 6 | 7 | import pandas as pd 8 | from sklearn.model_selection import train_test_split 9 | import lightgbm as lgb 10 | from sklearn.preprocessing import PolynomialFeatures 11 | 12 | path = './dc/guangfu/' 13 | 14 | 15 | def get_hour(x): 16 | h = int(x[11:13]) 17 | m = int(x[14:16]) 18 | if m in [14, 29, 44]: 19 | m += 1 20 | if m == 59: 21 | m = 0 22 | h += 1 23 | if h == 24: 24 | h = 0 25 | return h * 60 + m 26 | 27 | 28 | def add_poly_features(data, column_names): 29 | features = data[column_names] 30 | rest_features = data.drop(column_names, axis=1) 31 | poly_transformer = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False) 32 | poly_features = pd.DataFrame(poly_transformer.fit_transform(features), 33 | columns=poly_transformer.get_feature_names(column_names)) 34 | 35 | for col in poly_features.columns: 36 | rest_features.insert(1, col, poly_features[col]) 37 | return rest_features 38 | 39 | 40 | train_x_old = pd.read_csv(path + 'train_1.csv') 41 | test = pd.read_csv(path + 'test_1.csv') 42 | train_x_old['month'] = train_x_old['时间'].apply(lambda x: x[5:7]).astype('int32') 43 | train_x_old['day'] = train_x_old['时间'].apply(lambda x: x[8:10]).astype('int32') 44 | train_x_old['hour'] = train_x_old['时间'].apply(lambda x: get_hour(x)).astype('int32') 45 | test['month'] = test['时间'].apply(lambda x: x[5:7]).astype('int32') 46 | test['day'] = 
test['时间'].apply(lambda x: x[8:10]).astype('int32') 47 | test['hour'] = test['时间'].apply(lambda x: get_hour(x)).astype('int32') 48 | 49 | train_y = train_x_old['实际功率'] 50 | train_x = train_x_old.drop(['实发辐照度', '实际功率'], axis=1) 51 | train_x['dis2peak'] = train_x['hour'].apply(lambda x: (810 - abs(810 - x)) / 810) 52 | train_x = add_poly_features(train_x, ['风速', '风向']) 53 | train_x = add_poly_features(train_x, ['温度', '压强', '湿度']) 54 | 55 | id = test['id'] 56 | del_id = test[test['辐照度'].isin([-1.0])]['id'] 57 | test = test.drop(['id'], axis=1) 58 | test['dis2peak'] = test['hour'].apply(lambda x: (810 - abs(810 - x)) / 810) 59 | test = add_poly_features(test, ['风速', '风向']) 60 | test = add_poly_features(test, ['温度', '压强', '湿度']) 61 | 62 | train_x = train_x.drop(['时间'], axis=1) 63 | test = test.drop(['时间'], axis=1) 64 | print('train_x.shape,test_1.shape : ', train_x.shape, test.shape) 65 | 66 | X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.1, random_state=678) 67 | 68 | params = { 69 | "objective": "regression", 70 | "metric": "mse", 71 | "num_leaves": 30, 72 | "min_child_samples": 100, 73 | "learning_rate": 0.03, 74 | "bagging_fraction": 0.7, 75 | "feature_fraction": 0.5, 76 | "bagging_frequency": 5, 77 | "bagging_seed": 666, 78 | "verbosity": -1 79 | } 80 | 81 | 82 | def lgb_train(): 83 | lgb_train = lgb.Dataset(X_train, label=y_train) 84 | lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) 85 | print('begin train') 86 | gbm = lgb.train(params, 87 | lgb_train, 88 | num_boost_round=50000, 89 | valid_sets=lgb_eval, 90 | early_stopping_rounds=100, 91 | verbose_eval=100) 92 | # y_pred = gbm.predict(X_test) 93 | ##write result 94 | republish_pred = gbm.predict(test) 95 | republish_pred = pd.DataFrame(republish_pred) 96 | sub = pd.concat([id, republish_pred], axis=1) 97 | print(sub.shape) 98 | sub.columns = ['id', 'predicition'] 99 | sub.loc[sub['id'].isin(del_id), 'predicition'] = 0.0 100 | sub.to_csv(path + '/baseline1.csv', index=False, sep=',', encoding='UTF-8') 101 | 102 | 103 | lgb_train() 104 | -------------------------------------------------------------------------------- /venv/deep_learning/yucemoxing/PricePredictor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/6/21 下午1:27 4 | # @Author :hwwu 5 | # @File :PricePredictor.py 6 | 7 | import codecs 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | class PricePredictor: 13 | # lstm param 14 | timeStep = 20 15 | hiddenUnitSize = 10 #隐藏层神经元数量 16 | batchSize = 60 #每一批次训练多少个样例 17 | inputSize = 1 #输入维度 18 | outputSize=1 #输出维度 19 | lr = 0.0006 #学习率 20 | train_x, train_y = [],[] #训练数据集 21 | sortedChargeList = [] #排序的训练数据集 22 | normalizeData = [] #归一化的数据 23 | dataFile = '/Users/liyangyang/PycharmProjects/mypy/venv/deep_learning/yucemoxing/chargeInfo.txt' 24 | date2Price = {} #日期-每平米的价格映射 25 | chargeList = [] #交易价格 26 | date2Charge = {} #日期-交易价格映射 27 | meanPrice = 0 #均价 28 | stdPrice = 0 29 | X = tf.placeholder(tf.float32, [None, timeStep, inputSize]) 30 | Y = tf.placeholder(tf.float32, [None, timeStep, outputSize]) 31 | weights = { 32 | 'in': tf.Variable(tf.random_normal([inputSize, hiddenUnitSize])), 33 | 'out': tf.Variable(tf.random_normal([hiddenUnitSize, 1])) 34 | } 35 | 36 | biases = { 37 | 'in': tf.Variable(tf.constant(0.1, shape=[hiddenUnitSize, ])), 38 | 'out': tf.Variable(tf.constant(0.1, shape=[1, ])) 39 | } 40 | 41 | def loadData(self): 42 | 
fp = codecs.open(self.dataFile, 'r', 'utf-8') 43 | line = fp.readline() 44 | 45 | # parse line to data 46 | while line: 47 | line = fp.readline() 48 | data = line.split(" ") 49 | if len(data) < 7: 50 | continue 51 | area = float(data[5].replace("平米", "")) 52 | price = float(data[2]) 53 | pricePerSquare = price / area 54 | charge = [str(data[1]), data[6].replace('\n', ''), data[3], pricePerSquare] 55 | self.chargeList.append(charge) 56 | self.date2Charge[str(data[1])] = charge # date: {name:price} 57 | self.date2Price[str(data[1])] = pricePerSquare 58 | 59 | self.sortedChargeList = sorted(self.chargeList, key=predictor.getKey, reverse=False) 60 | 61 | def getKey(self, item): 62 | return item[1] 63 | 64 | # 构造数据 65 | def buildTrainDataSet(self): 66 | data = [] 67 | for price in self.sortedChargeList: 68 | data.append(price[3]) 69 | 70 | self.meanPrice = np.mean(data); 71 | self.stdPrice = np.std(data) 72 | self.normalizeData = (data - self.meanPrice) / self.stdPrice #标准化 73 | 74 | self.normalizeData = self.normalizeData[:,np.newaxis] #增加维度 75 | for i in range(len(self.normalizeData)-self.timeStep-1): 76 | x=self.normalizeData[i:i+self.timeStep] 77 | y=self.normalizeData[i+1:i+self.timeStep+1] 78 | self.train_x.append(x.tolist()) 79 | self.train_y.append(y.tolist()) 80 | 81 | # lstm算法定义 82 | def lstm(self, batchSize = None): 83 | if batchSize is None : 84 | batchSize = self.batchSize 85 | weightIn = self.weights['in'] 86 | biasesIn = self.biases['in'] 87 | input = tf.reshape(self.X, [-1,self.inputSize]) 88 | inputRnn=tf.matmul(input,weightIn)+biasesIn 89 | inputRnn=tf.reshape(inputRnn,[-1,self.timeStep,self.hiddenUnitSize]) #将tensor转成3维,作为lstm cell的输入 90 | cell=tf.nn.rnn_cell.BasicLSTMCell(self.hiddenUnitSize, reuse=True) 91 | initState=cell.zero_state(batchSize,dtype=tf.float32) 92 | output_rnn,final_states=tf.nn.dynamic_rnn(cell, inputRnn,initial_state=initState, dtype=tf.float32) #output_rnn是记录lstm每个输出节点的结果,final_states是最后一个cell的结果 93 | output=tf.reshape(output_rnn,[-1,self.hiddenUnitSize]) #作为输出层的输入 94 | w_out=self.weights['out'] 95 | b_out=self.biases['out'] 96 | pred=tf.matmul(output,w_out)+b_out 97 | return pred,final_states 98 | 99 | # 训练模型 100 | def trainLstm(self) : 101 | pred,_ = self.lstm() 102 | #定义损失函数 103 | loss = tf.reduce_mean(tf.square(tf.reshape(pred, [-1]) - tf.reshape(self.Y, [-1]))) 104 | #定义训练模型 105 | train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) 106 | saver = tf.train.Saver(tf.global_variables()) 107 | with tf.Session() as sess: 108 | sess.run(tf.global_variables_initializer()) 109 | # 重复训练100次,训练是一个耗时的过程 110 | for i in range(100): 111 | step = 0 112 | start = 0 113 | end = start + self.batchSize 114 | while end < len(self.train_x): 115 | _, loss_ = sess.run([train_op, loss], feed_dict={self.X: self.train_x[start:end], self.Y: self.train_y[start:end]}) 116 | start += self.batchSize 117 | end = start + self.batchSize 118 | # 每10步保存一次参数 119 | if step % 10 == 0: 120 | print(i, step, loss_) 121 | print("保存模型:", saver.save(sess, '/Users/liyangyang/PycharmProjects/mypy/venv/deep_learning/yucemoxing/model/stock.model')) 122 | step += 1 123 | 124 | 125 | def prediction(self): 126 | pred, _ = self.lstm(1) # 预测时只输入[1,time_step,inputSize]的测试数据 127 | saver = tf.train.Saver(tf.global_variables()) 128 | with tf.Session() as sess: 129 | # 参数恢复 130 | module_file = tf.train.latest_checkpoint('/Users/liyangyang/PycharmProjects/mypy/venv/deep_learning/yucemoxing/model/') 131 | saver.restore(sess, module_file) 132 | # 取训练集最后一行为测试样本. 
shape=[1,time_step,inputSize] 133 | prev_seq = self.train_x[-1] 134 | predict = [] 135 | # 得到之后100个预测结果 136 | for i in range(100): 137 | next_seq = sess.run(pred, feed_dict={self.X: [prev_seq]}) 138 | predict.append(next_seq[-1]) 139 | # 每次得到最后一个时间步的预测结果,与之前的数据加在一起,形成新的测试样本 140 | prev_seq = np.vstack((prev_seq[1:], next_seq[-1])) 141 | # 以折线图表示结果 142 | plt.figure() 143 | true_price = self.stdPrice**predict 144 | true_price = [price + self.meanPrice for price in true_price] 145 | plt.plot(list(range(len(self.normalizeData), len(self.normalizeData) + len(predict))), true_price, color='r') 146 | plt.show() 147 | 148 | predictor = PricePredictor() 149 | predictor.loadData() 150 | 151 | # print('sortedChargeList:') 152 | # print(predictor.sortedChargeList) 153 | # print('chargeList') 154 | # print(predictor.chargeList) 155 | # print('date2Charge') 156 | # print(predictor.date2Charge) 157 | # print('date2Price') 158 | # print(predictor.date2Price) 159 | 160 | # 构建训练数据 161 | predictor.buildTrainDataSet() 162 | # print(predictor.train_x[0:10]) 163 | # print(predictor.train_y[0:10]) 164 | 165 | # # 模型训练 166 | # predictor.trainLstm() 167 | # 168 | # # 预测-预测前需要先完成模型训练 169 | predictor.prediction() 170 | -------------------------------------------------------------------------------- /venv/dwb/baseline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/20 上午11:28 4 | # @Author :hwwu 5 | # @File :baseline.py 6 | import pandas as pd, numpy as np 7 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 8 | from sklearn import svm 9 | from sklearn.ensemble import RandomForestClassifier 10 | from tensorflow.contrib import learn 11 | 12 | path = '/Users/liyangyang/Downloads/dwb/new_data/' 13 | # column = "word_seg" 14 | # train = pd.read_csv(path+'train_set.csv') 15 | test = pd.read_csv(path+'test_set.csv') 16 | test_id = test["id"].copy() 17 | vec = TfidfVectorizer(ngram_range=(3,4),min_df=3, max_df=0.9,use_idf=1,smooth_idf=1, sublinear_tf=1) 18 | # 19 | # train = np.array(train[column]) 20 | # test = np.array(test[column]) 21 | # 22 | # vocab_processor_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn/vocab-5000/' 23 | # vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_processor_path) 24 | # train = np.array(list(vocab_processor.transform(train))) 25 | # test = np.array(list(vocab_processor.transform(test))) 26 | # # 27 | # # train = pd.DataFrame(train) 28 | # # test = pd.DataFrame(test) 29 | # 30 | # np.save(path+'vocab/vocab_train',train) 31 | # np.save(path+'vocab/vocab_test',test) 32 | 33 | 34 | # t1 = np.load(path+'vocab/vocab_train.npy') 35 | # t2 = np.load(path+'vocab/vocab_test.npy') 36 | # 37 | # train=[] 38 | # test=[] 39 | # for i in range(len(t1)): 40 | # row = str(t1[i][0]) 41 | # for j in range(1,len(t1[i])): 42 | # s = str(t1[i][j]) 43 | # if (s!='0'): 44 | # row = row + '\t' + s 45 | # train.append(row) 46 | # print(train[0]) 47 | # 48 | # for i in range(len(t2)): 49 | # row = str(t2[i][0]) 50 | # for j in range(1,len(t2[i])): 51 | # s = str(t2[i][j]) 52 | # if (s != '0'): 53 | # row = row + '\t' + s 54 | # test.append(row) 55 | # print(test[0]) 56 | # 57 | # train = np.array(train) 58 | # test = np.array(test) 59 | # print(train.shape) 60 | # print(test.shape) 61 | # 62 | # np.save(path+'vocab/vocab_train_1',train) 63 | # np.save(path+'vocab/vocab_test_1',test) 64 | 65 | t1 = np.load(path+'vocab/vocab_train_1.npy') 66 | train 
= np.array(t1) 67 | 68 | print('start tf-idf fit') 69 | trn_term_doc = vec.fit_transform(train) 70 | np.savetxt(path+'tf-idf/train_data',trn_term_doc) 71 | 72 | print('start tf-idf transform') 73 | t2 = np.load(path+'vocab/vocab_test_1.npy') 74 | test = np.array(t2) 75 | test_term_doc = vec.transform(test) 76 | print('tf-idf transform done') 77 | 78 | fid0=open(path+'baseline_time.csv','w') 79 | np.savetxt(path+'tf-idf/test_data',test_term_doc) 80 | print('save data done') 81 | 82 | y=(train["class"]-1).astype(int) 83 | print('start fit') 84 | lin_clf = svm.LinearSVC() 85 | lin_clf.fit(trn_term_doc[:80000],y[:80000]) 86 | # preds = lin_clf.predict(test_term_doc) 87 | # lin_forest = RandomForestClassifier(n_estimators=100, random_state=1) 88 | # lin_forest.fit(trn_term_doc,y) 89 | print('fit done') 90 | print('start predict') 91 | pred = lin_clf.score(trn_term_doc[80000:],y[80000:]) 92 | print('predict done') 93 | print(pred) 94 | 95 | preds = lin_clf.predict(test_term_doc) 96 | i=0 97 | fid0.write("id,class"+"\n") 98 | for item in preds: 99 | fid0.write(str(i)+","+str(item+1)+"\n") 100 | i=i+1 101 | fid0.close() 102 | 103 | -------------------------------------------------------------------------------- /venv/dwb/fasttext/__pycache__/fasttext.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/fasttext/__pycache__/fasttext.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/fasttext/fasttext.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/22 下午1:37 4 | # @Author :hwwu 5 | # @File :fasttext.py 6 | 7 | import pandas as pd, numpy as np 8 | import fastText 9 | 10 | path = '/Users/liyangyang/Downloads/dwb/new_data/' 11 | # column = "word_seg" 12 | column = "article" 13 | 14 | 15 | def write_train_data(): 16 | train = pd.read_csv(path + 'train_set.csv') 17 | f = open(path + 't_train_set.txt', 'a') 18 | for i in range(80000): 19 | row = str(train[column][i]) + '\t' + '__myprefix__' + str(train['class'][i]) 20 | f.write(row + '\n') 21 | f.close() 22 | f1 = open(path + 't_test_set.txt', 'a') 23 | for i in range(80000, len(train)): 24 | row = str(train[column][i]) + '\t' + '__myprefix__' + str(train['class'][i]) 25 | f1.write(row + '\n') 26 | f1.close() 27 | # f2 = open(path + 'test_set1.txt', 'a') 28 | # for i in range(100000, len(train)): 29 | # row = str(train[column][i]) 30 | # f2.write(row + '\n') 31 | # f2.close() 32 | 33 | 34 | # write_train_data() 35 | 36 | model_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/fasttext/' 37 | 38 | 39 | def model(): 40 | # model = fastText.train_supervised(path + 'train_set.txt', label='__myprefix__',bucket=400000 41 | # ,wordNgrams=2,minCount=3,lr=1,lrUpdateRate=0) 42 | model = fastText.train_supervised(path + 't_train_set.txt', label='__myprefix__', bucket=39759 43 | , wordNgrams=3, minCount=3, lr=1, lrUpdateRate=200 44 | ,dim=128) 45 | result = model.test(path + 't_test_set.txt') 46 | print(result) 47 | # model.save_model(model_path + 'model') 48 | 49 | true_labels = [] 50 | all_words = [] 51 | f = open(path + 't_test_set.txt', 'r') 52 | for line in f: 53 | words, labels = model.get_line(line.strip()) 54 | if len(labels) == 0: 55 | continue 56 | all_words.append(" ".join(words)) 57 | true_labels += [labels] 58 | predictions, _ = 
model.predict(all_words) 59 | 60 | n = 0 61 | for i in range(len(true_labels)): 62 | if (predictions[i]==true_labels[i]): 63 | n+=1 64 | print(n/len(true_labels)) 65 | 66 | # model = fastText.load_model(model_path + 'model') 67 | # id, all_words = get_test_words(model) 68 | # print('start predict data') 69 | # predictions, _ = model.predict(all_words) 70 | # print('predict data done') 71 | # write_result(id, predictions) 72 | 73 | model() 74 | 75 | def get_test_words(model): 76 | all_words = [] 77 | id = [] 78 | print('start read test set data') 79 | test = pd.read_csv(path + 'test_set.csv') 80 | for i in range(len(test)): 81 | words, _ = model.get_line(test[column][i].strip()) 82 | all_words.append(" ".join(words)) 83 | id.append(test['id'][i]) 84 | print('read test set data done') 85 | return id, all_words 86 | 87 | 88 | def write_result(id, predictions): 89 | r_id = [] 90 | r_predictions = [] 91 | for i in range(len(id)): 92 | r_id.append(int(id[i])) 93 | # price.append(round(x1[i][1],1)) 94 | r_predictions.append(int(tostr(predictions[i]))) 95 | 96 | english_column = pd.Series(r_id, name='id') 97 | number_column = pd.Series(r_predictions, name='class') 98 | predictions = pd.concat([english_column, number_column], axis=1) 99 | # another way to handle 100 | # save = pd.DataFrame({'user_id': user_id, 'prediction_pay_price': price}) 101 | predictions.to_csv(path + 'result_data.csv', index=0, sep=',', columns=['id', 'class']) 102 | 103 | 104 | def tostr(s): 105 | s = str(s).replace('__myprefix__', '') 106 | s = s.replace('[', '') 107 | s = s.replace(']', '') 108 | s = s.replace('\'', '') 109 | return s 110 | 111 | # model() 112 | -------------------------------------------------------------------------------- /venv/dwb/github_model/a01_FastText/__pycache__/p5_fastTextB_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a01_FastText/__pycache__/p5_fastTextB_model.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/github_model/a01_FastText/p5_fastTextB_model.py: -------------------------------------------------------------------------------- 1 | # fast text. using: very simple model;n-gram to captrue location information;h-softmax to speed up training/inference 2 | # for the n-gram you can use data_util to generate. 
see method process_one_sentence_to_get_ui_bi_tri_gram under aa1_data_util/data_util_zhihu.py 3 | print("started...") 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | class fastTextB: 8 | def __init__(self, label_size, learning_rate, batch_size, decay_steps, decay_rate,num_sampled,sentence_len,vocab_size,embed_size,is_training): 9 | """init all hyperparameter here""" 10 | # set hyperparamter 11 | self.label_size = label_size 12 | self.batch_size = batch_size 13 | self.num_sampled = num_sampled 14 | self.sentence_len=sentence_len 15 | self.vocab_size=vocab_size 16 | self.embed_size=embed_size 17 | self.is_training=is_training 18 | self.learning_rate=learning_rate 19 | 20 | # add placeholder (X,label) 21 | self.sentence = tf.placeholder(tf.int32, [None, self.sentence_len], name="sentence") # X 22 | self.labels = tf.placeholder(tf.int32, [None], name="Labels") # y 23 | 24 | self.global_step = tf.Variable(0, trainable=False, name="Global_Step") 25 | self.epoch_step=tf.Variable(0,trainable=False,name="Epoch_Step") 26 | self.epoch_increment=tf.assign(self.epoch_step,tf.add(self.epoch_step,tf.constant(1))) 27 | self.decay_steps, self.decay_rate = decay_steps, decay_rate 28 | 29 | self.epoch_step = tf.Variable(0, trainable=False, name="Epoch_Step") 30 | self.instantiate_weights() 31 | self.logits = self.inference() #[None, self.label_size] 32 | if not is_training: 33 | return 34 | self.loss_val = self.loss() 35 | self.train_op = self.train() 36 | self.predictions = tf.argmax(self.logits, axis=1, name="predictions") # shape:[None,] 37 | correct_prediction = tf.equal(tf.cast(self.predictions,tf.int32), self.labels) #tf.argmax(self.logits, 1)-->[batch_size] 38 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="Accuracy") # shape=() 39 | 40 | def instantiate_weights(self): 41 | """define all weights here""" 42 | # embedding matrix 43 | self.Embedding = tf.get_variable("Embedding", [self.vocab_size, self.embed_size]) 44 | self.W = tf.get_variable("W", [self.embed_size, self.label_size]) 45 | self.b = tf.get_variable("b", [self.label_size]) 46 | 47 | def inference(self): 48 | """main computation graph here: 1.embedding-->2.average-->3.linear classifier""" 49 | # 1.get emebedding of words in the sentence 50 | sentence_embeddings = tf.nn.embedding_lookup(self.Embedding,self.sentence) # [None,self.sentence_len,self.embed_size] 51 | 52 | sentence_embeddings = tf.nn.dropout(sentence_embeddings,keep_prob=0.8) 53 | # 2.average vectors, to get representation of the sentence 54 | self.sentence_embeddings = tf.reduce_mean(sentence_embeddings, axis=1) # [None,self.embed_size] 55 | 56 | # 3.linear classifier layer 57 | logits = tf.matmul(self.sentence_embeddings, self.W) + self.b #[None, self.label_size]==tf.matmul([None,self.embed_size],[self.embed_size,self.label_size]) 58 | return logits 59 | 60 | def loss(self,l2_lambda=0.01): #0.0001-->0.001 61 | """calculate loss using (NCE)cross entropy here""" 62 | # Compute the average NCE loss for the batch. 63 | # tf.nce_loss automatically draws a new sample of the negative labels each 64 | # time we evaluate the loss. 65 | if not self.is_training: #training 66 | labels=tf.reshape(self.labels,[-1]) #[batch_size,1]------>[batch_size,] 67 | labels=tf.expand_dims(labels,1) #[batch_size,]----->[batch_size,1] 68 | loss = tf.reduce_mean( #inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 69 | tf.nn.nce_loss(weights=tf.transpose(self.W), #[embed_size, label_size]--->[label_size,embed_size]. 
nce_weights:A `Tensor` of shape `[num_classes, dim].O.K. 70 | biases=self.b, #[label_size]. nce_biases:A `Tensor` of shape `[num_classes]`. 71 | labels=labels, #[batch_size,1]. train_labels, # A `Tensor` of type `int64` and shape `[batch_size,num_true]`. The target classes. 72 | inputs=self.sentence_embeddings,# [None,self.embed_size] #A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 73 | num_sampled=self.num_sampled, #scalar. 100 74 | num_classes=self.label_size,partition_strategy="div")) #scalar. 1999 75 | else:#eval/inference 76 | #logits = tf.matmul(self.sentence_embeddings, tf.transpose(self.W)) #matmul([None,self.embed_size])---> 77 | #logits = tf.nn.bias_add(logits, self.b) 78 | labels_one_hot = tf.one_hot(self.labels, self.label_size) #[batch_size]---->[batch_size,label_size] 79 | #sigmoid_cross_entropy_with_logits:Computes sigmoid cross entropy given `logits`.Measures the probability error in discrete classification tasks in which each class is independent and not mutually exclusive. For instance, one could perform multilabel classification where a picture can contain both an elephant and a dog at the same time. 80 | # labels = tf.expand_dims(self.labels, 1) 81 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_one_hot,logits=self.logits) #labels:[batch_size,label_size];logits:[batch, label_size] 82 | print("loss0:", loss) #shape=(?, 1999) 83 | loss = tf.reduce_mean(loss) 84 | print("loss1:",loss) #shape=(?,) 85 | l2_losses = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name]) * l2_lambda 86 | return loss+l2_losses 87 | 88 | def train(self): 89 | """based on the loss, use SGD to update parameter""" 90 | learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps,self.decay_rate, staircase=True) 91 | train_op = tf.contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step,learning_rate=learning_rate, optimizer="adam") 92 | 93 | return train_op 94 | 95 | #test started 96 | def test(): 97 | #below is a function test; if you use this for text classifiction, you need to tranform sentence to indices of vocabulary first. then feed data to the graph. 98 | num_classes=19 99 | learning_rate=0.01 100 | batch_size=8 101 | decay_steps=1000 102 | decay_rate=0.9 103 | sequence_length=5 104 | vocab_size=10000 105 | embed_size=100 106 | is_training=True 107 | dropout_keep_prob=1 108 | fastText=fastTextB(num_classes, learning_rate, batch_size, decay_steps, decay_rate,5,sequence_length,vocab_size,embed_size,is_training) 109 | with tf.Session() as sess: 110 | sess.run(tf.global_variables_initializer()) 111 | for i in range(100): 112 | input_x=np.zeros((batch_size,sequence_length),dtype=np.int32) #[None, self.sequence_length] 113 | input_y=input_y=np.array([1,0,1,1,1,2,1,1],dtype=np.int32) #np.zeros((batch_size),dtype=np.int32) #[None, self.sequence_length] 114 | loss,acc,predict,_=sess.run([fastText.loss_val,fastText.accuracy,fastText.predictions,fastText.train_op], 115 | feed_dict={fastText.sentence:input_x,fastText.labels:input_y}) 116 | print("loss:",loss,"acc:",acc,"label:",input_y,"prediction:",predict) 117 | #test() 118 | print("ended...") 119 | -------------------------------------------------------------------------------- /venv/dwb/github_model/a01_FastText/p5_fastTextB_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 
3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.predict 4 | try: 5 | reload # Python 2 6 | except NameError: 7 | from importlib import reload # Python 3 8 | import sys 9 | reload(sys) 10 | sys.setdefaultencoding('utf8') 11 | import tensorflow as tf 12 | import numpy as np 13 | from p5_fastTextB_model import fastTextB as fastText 14 | # from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 15 | from tflearn.data_utils import to_categorical, pad_sequences 16 | import os 17 | import codecs 18 | 19 | #configuration 20 | FLAGS=tf.app.flags.FLAGS 21 | tf.app.flags.DEFINE_integer("label_size",19,"number of label") 22 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 23 | tf.app.flags.DEFINE_integer("batch_size", 512, "Batch size for training/evaluating.") #批处理的大小 32-->128 24 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 25 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 26 | tf.app.flags.DEFINE_integer("num_sampled",100,"number of noise sampling") 27 | tf.app.flags.DEFINE_string("ckpt_dir","fast_text_checkpoint/","checkpoint location for the model") 28 | tf.app.flags.DEFINE_integer("sentence_len",300,"max sentence length") 29 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 30 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 31 | tf.app.flags.DEFINE_integer("num_epochs",15,"embedding size") 32 | tf.app.flags.DEFINE_integer("validate_every", 3, "Validate every validate_every epochs.") #每10轮做一次验证 33 | tf.app.flags.DEFINE_string("predict_target_file","fast_text_checkpoint/zhihu_result_ftB2.csv","target file path for final prediction") 34 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-v4only-title.txt',"target file path for final prediction") 35 | 36 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.training (5.validation) ,(6.prediction) 37 | def main(_): 38 | # 1.load data with vocabulary of words and labels 39 | vocabulary_word2index, vocabulary_index2word = create_voabulary() 40 | vocab_size = len(vocabulary_word2index) 41 | vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label() 42 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 43 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 44 | testX=[] 45 | question_id_list=[] 46 | for tuple in test: 47 | question_id,question_string_list=tuple 48 | question_id_list.append(question_id) 49 | testX.append(question_string_list) 50 | 51 | # 2.Data preprocessing: Sequence padding 52 | print("start padding....") 53 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 54 | print("end padding...") 55 | 56 | # 3.create session. 
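# --- illustrative aside (not original code): the padding step above brings every
# id list to sentence_len, filling with `value` (tflearn's pad_sequences defaults
# to padding='post', so shorter lists are padded at the end). A tiny example:
#
#   from tflearn.data_utils import pad_sequences
#   pad_sequences([[3, 7, 2], [5]], maxlen=4, value=0.)
#   # -> [[3 7 2 0]
#   #     [5 0 0 0]]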
57 | config=tf.ConfigProto() 58 | config.gpu_options.allow_growth=True 59 | with tf.Session(config=config) as sess: 60 | # 4.Instantiate Model 61 | fast_text=fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.num_sampled,FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 62 | saver=tf.train.Saver() 63 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 64 | print("Restoring Variables from Checkpoint") 65 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 66 | else: 67 | print("Can't find the checkpoint.going to stop") 68 | return 69 | # 5.feed data, to get logits 70 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 71 | batch_size=1 72 | index=0 73 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 74 | for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data+1, batch_size)): 75 | logits=sess.run(fast_text.logits,feed_dict={fast_text.sentence:testX2[start:end]}) #'shape of logits:', ( 1, 1999) 76 | # 6. get lable using logtis 77 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 78 | # 7. write question id and labels to file system. 79 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 80 | index=index+1 81 | predict_target_file_f.close() 82 | 83 | # get label using logits 84 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 85 | # test 86 | #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=np.argsort(logits)[-top_number:] 88 | index_list=index_list[::-1] 89 | label_list=[] 90 | for index in index_list: 91 | label=vocabulary_index2word_label[index] 92 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 93 | return label_list 94 | 95 | # write question id and labels to file system. 96 | def write_question_id_with_labels(question_id,labels_list,f): 97 | labels_string=",".join(labels_list) 98 | f.write(question_id+","+labels_string+"\n") 99 | 100 | if __name__ == "__main__": 101 | tf.app.run() 102 | -------------------------------------------------------------------------------- /venv/dwb/github_model/a01_FastText/p5_fastTextB_predict_multilabel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | from p5_fastTextB_model import fastTextB as fastText 10 | # from p4_zhihu_load_data import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import to_categorical, pad_sequences 12 | import os 13 | import codecs 14 | 15 | #configuration 16 | FLAGS=tf.app.flags.FLAGS 17 | tf.app.flags.DEFINE_integer("label_size",19,"number of label") 18 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 19 | tf.app.flags.DEFINE_integer("batch_size", 512, "Batch size for training/evaluating.") #批处理的大小 32-->128 20 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 22 | tf.app.flags.DEFINE_integer("num_sampled",100,"number of noise sampling") 23 | tf.app.flags.DEFINE_string("ckpt_dir","fast_text_checkpoint_multi/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",300,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"embedding size") 28 | tf.app.flags.DEFINE_integer("validate_every", 10, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","fast_text_checkpoint_multi/zhihu_result_ftB_multilabel.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-v4only-title.txt',"target file path for final prediction") 31 | 32 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.training (5.validation) ,(6.prediction) 33 | def main(_): 34 | # 1.load data with vocabulary of words and labels 35 | vocabulary_word2index, vocabulary_index2word = create_voabulary() 36 | vocab_size = len(vocabulary_word2index) 37 | print("vocab_size:",vocab_size) 38 | #iii=0 39 | #iii/0 40 | vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label() 41 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) #TODO 42 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) #TODO 43 | testX=[] 44 | question_id_list=[] 45 | for tuple in test: 46 | question_id,question_string_list=tuple 47 | question_id_list.append(question_id) 48 | testX.append(question_string_list) 49 | 50 | # 2.Data preprocessing: Sequence padding 51 | print("start padding....") 52 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 53 | print("end padding...") 54 | 55 | # 3.create session. 
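# --- illustrative aside (not original code): the feed loop below pairs batch
# offsets with zip(range(0, n, bs), range(bs, n + 1, bs)); a tiny check of the
# idiom:
#
#   list(zip(range(0, 5, 2), range(2, 6, 2)))   # -> [(0, 2), (2, 4)]
#
# a trailing partial batch is dropped unless n is a multiple of bs; with the
# batch_size=1 used in this script every test row is still covered.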
56 | config=tf.ConfigProto() 57 | config.gpu_options.allow_growth=True 58 | with tf.Session(config=config) as sess: 59 | # 4.Instantiate Model 60 | fast_text=fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.num_sampled,FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 61 | saver=tf.train.Saver() 62 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 63 | print("Restoring Variables from Checkpoint") 64 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 65 | else: 66 | print("Can't find the checkpoint.going to stop") 67 | return 68 | # 5.feed data, to get logits 69 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 70 | batch_size=1 71 | index=0 72 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 73 | for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data+1, batch_size)): 74 | logits=sess.run(fast_text.logits,feed_dict={fast_text.sentence:testX2[start:end]}) #'shape of logits:', ( 1, 1999) 75 | # 6. get lable using logtis 76 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 77 | # 7. write question id and labels to file system. 78 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 79 | index=index+1 80 | predict_target_file_f.close() 81 | 82 | # get label using logits 83 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 84 | index_list=np.argsort(logits)[-top_number:] 85 | index_list=index_list[::-1] 86 | label_list=[] 87 | for index in index_list: 88 | label=vocabulary_index2word_label[index] 89 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 90 | return label_list 91 | 92 | # write question id and labels to file system. 93 | def write_question_id_with_labels(question_id,labels_list,f): 94 | labels_string=",".join(labels_list) 95 | f.write(question_id+","+labels_string+"\n") 96 | 97 | if __name__ == "__main__": 98 | tf.app.run() -------------------------------------------------------------------------------- /venv/dwb/github_model/a01_FastText/p6_fastTextB_model_multilabel.py: -------------------------------------------------------------------------------- 1 | # autor:xul 2 | # fast text. 
using: very simple model;n-gram to capture location information;h-softmax to speed up training/inference 3 | print("started...") 4 | import tensorflow as tf 5 | 6 | class fastTextB: 7 | def __init__(self, label_size, learning_rate, batch_size, decay_steps, decay_rate,num_sampled,sentence_len,vocab_size,embed_size,is_training,max_label_per_example=5): 8 | """init all hyperparameters here""" 9 | # 1.set hyper-parameters 10 | self.label_size = label_size #e.g.1999 11 | self.batch_size = batch_size 12 | self.num_sampled = num_sampled 13 | self.sentence_len=sentence_len 14 | self.vocab_size=vocab_size 15 | self.embed_size=embed_size 16 | self.is_training=is_training 17 | self.learning_rate=learning_rate 18 | self.max_label_per_example=max_label_per_example 19 | 20 | # 2.add placeholder (X,label) 21 | self.sentence = tf.placeholder(tf.int32, [None, self.sentence_len], name="sentence") #X 22 | self.labels = tf.placeholder(tf.int64, [None,self.max_label_per_example], name="Labels") #y [1,2,3,3,3] 23 | self.labels_l1999=tf.placeholder(tf.int64,[None,self.label_size]) 24 | #3.set some variables 25 | self.global_step = tf.Variable(0, trainable=False, name="Global_Step") 26 | self.epoch_step=tf.Variable(0, trainable=False,name="Epoch_Step") 27 | self.epoch_increment=tf.assign(self.epoch_step,tf.add(self.epoch_step,tf.constant(1))) 28 | self.decay_steps, self.decay_rate = decay_steps, decay_rate 29 | # self.epoch_step = tf.Variable(0, trainable=False, name="Epoch_Step") # duplicate of the Epoch_Step variable created above; re-creating it would detach epoch_increment from self.epoch_step 30 | 31 | #4.init weights 32 | self.instantiate_weights() 33 | #5.main graph: inference 34 | self.logits = self.inference() #[None, self.label_size] 35 | #6.calculate loss 36 | self.loss_val = self.loss() 37 | #7.start training by updating parameters according to the loss 38 | self.train_op = self.train() 39 | 40 | #8.calculate accuracy 41 | # correct_prediction = tf.equal(tf.argmax(self.logits, 1), self.labels) #2.TODO tf.argmax(self.logits, 1)-->[batch_size] 42 | # self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="Accuracy") #TODO 43 | 44 | def instantiate_weights(self): 45 | """define all weights here""" 46 | # embedding matrix 47 | self.Embedding = tf.get_variable("Embedding", [self.vocab_size, self.embed_size]) 48 | self.W = tf.get_variable("W", [self.embed_size, self.label_size]) 49 | self.b = tf.get_variable("b", [self.label_size]) 50 | 51 | def inference(self): 52 | """main computation graph here: 1.embedding-->2.average-->3.linear classifier""" 53 | # 1.get embedding of words in the sentence 54 | sentence_embeddings = tf.nn.embedding_lookup(self.Embedding,self.sentence) # [None,self.sentence_len,self.embed_size] 55 | 56 | # 2.average vectors, to get representation of the sentence 57 | self.sentence_embeddings = tf.reduce_mean(sentence_embeddings, axis=1) # [None,self.embed_size] 58 | 59 | # 3.linear classifier layer 60 | logits = tf.matmul(self.sentence_embeddings, self.W) + self.b #[None, self.label_size]==tf.matmul([None,self.embed_size],[self.embed_size,self.label_size]) 61 | return logits 62 | 63 | 64 | def loss(self,l2_lambda=0.0001): 65 | """calculate loss using (NCE)cross entropy here""" 66 | # Compute the average NCE loss for the batch. 67 | # tf.nce_loss automatically draws a new sample of the negative labels each 68 | # time we evaluate the loss.
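The inference graph above is only lookup, mean and an affine layer; a NumPy sketch of the same computation (random weights and toy sizes, not part of the original model):

    import numpy as np

    vocab_size, embed_size, label_size, sentence_len = 1000, 100, 1999, 300
    Embedding = np.random.randn(vocab_size, embed_size)
    W = np.random.randn(embed_size, label_size)
    b = np.zeros(label_size)

    sentence = np.random.randint(0, vocab_size, size=(1, sentence_len))  # one padded sentence of word ids
    sentence_embeddings = Embedding[sentence].mean(axis=1)               # [1, embed_size]: average word vector
    logits = sentence_embeddings @ W + b                                 # [1, label_size]
    print(logits.shape)                                                  # (1, 1999)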
69 | if self.is_training:#training 70 | #labels=tf.reshape(self.labels,[-1]) #3.[batch_size,max_label_per_example]------>[batch_size*max_label_per_example,] 71 | #labels=tf.expand_dims(labels,1) #[batch_size*max_label_per_example,]----->[batch_size*max_label_per_example,1] 72 | #nce_loss: notice-->for now, if you have a variable number of target classes, you can pad them out to a constant number by either repeating them or by padding with an otherwise unused class. 73 | loss = tf.reduce_mean(#inputs's SHAPE should be: [batch_size, dim] 74 | tf.nn.nce_loss(weights=tf.transpose(self.W), #[embed_size, label_size]--->[label_size,embed_size]. nce_weights:A `Tensor` of shape `[num_classes, dim].O.K. 75 | biases=self.b, #[label_size]. nce_biases:A `Tensor` of shape `[num_classes]`. 76 | labels=self.labels, #4.[batch_size,max_label_per_example]. train_labels, # A `Tensor` of type `int64` and shape `[batch_size,num_true]`. The target classes. 77 | inputs=self.sentence_embeddings,#TODO [None,self.embed_size] #A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 78 | num_sampled=self.num_sampled, # scalar. 100 79 | num_true=self.max_label_per_example, 80 | num_classes=self.label_size,partition_strategy="div")) #scalar. 1999 81 | else:#eval(/inference) 82 | labels_multi_hot = self.labels_l1999 #[batch_size,label_size] 83 | #sigmoid_cross_entropy_with_logits:Computes sigmoid cross entropy given `logits`.Measures the probability error in discrete classification tasks in which each class is independent and not mutually exclusive. For instance, one could perform multilabel classification where a picture can contain both an elephant and a dog at the same time. 84 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_multi_hot,logits=self.logits) #labels:[batch_size,label_size];logits:[batch, label_size] 85 | loss = tf.reduce_sum(loss, axis=1) 86 | 87 | # add regularization result in not converge 88 | l2_losses = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name]) * l2_lambda 89 | loss=loss+l2_losses 90 | return loss 91 | 92 | def train(self): 93 | """based on the loss, use SGD to update parameter""" 94 | learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps, 95 | self.decay_rate, staircase=True) 96 | train_op = tf.contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step, 97 | learning_rate=learning_rate, optimizer="Adam") 98 | return train_op 99 | 100 | 101 | print("ended...") -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a02_TextCNN/__init__.py -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/__pycache__/data_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a02_TextCNN/__pycache__/data_util.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/__pycache__/p7_TextCNN_model.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a02_TextCNN/__pycache__/p7_TextCNN_model.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/data_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import codecs 3 | import random 4 | import numpy as np 5 | from tflearn.data_utils import pad_sequences 6 | from collections import Counter 7 | import os 8 | import pickle 9 | 10 | local_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/github_model/a02_TextCNN/' 11 | 12 | 13 | def load_data_multilabel(traning_data_path,vocab_word2index, vocab_label2index,sentence_len,training_portion=0.9): 14 | """ 15 | convert data as indexes using word2index dicts. 16 | :param traning_data_path: 17 | :param vocab_word2index: 18 | :param vocab_label2index: 19 | :return: 20 | """ 21 | file_object = codecs.open(traning_data_path, mode='r', encoding='utf-8') 22 | lines = file_object.readlines() 23 | random.shuffle(lines) 24 | label_size=len(vocab_label2index) 25 | X = [] 26 | Y = [] 27 | for i,line in enumerate(lines): 28 | raw_list = line.strip().split("__myprefix__") 29 | input_list = raw_list[0].strip().split(" ") 30 | input_list = [x.strip().replace(" ", "") for x in input_list if x != ''] 31 | x=[vocab_word2index.get(x,0) for x in input_list] 32 | label_list = raw_list[1:] 33 | label_list=[l.strip().replace(" ", "") for l in label_list if l != ''] 34 | label_list=[vocab_label2index[label] for label in label_list] 35 | # y=transform_multilabel_as_multihot(label_list,label_size) 36 | y=label_list 37 | X.append(x) 38 | Y.append(y) 39 | Y = np.array(Y).reshape(-1) 40 | X = pad_sequences(X, maxlen=sentence_len, value=0.) # padding to max length 41 | number_examples = len(lines) 42 | training_number=int(training_portion* number_examples) 43 | train = (X[0:training_number], Y[0:training_number]) 44 | valid_number=number_examples-training_number 45 | test = (X[training_number+ 1:training_number+valid_number+1], Y[training_number + 1:training_number+valid_number+1]) 46 | return train,test 47 | 48 | 49 | def transform_multilabel_as_multihot(label_list,label_size): 50 | """ 51 | convert to multi-hot style 52 | :param label_list: e.g.[0,1,4], here 4 means in the 4th position it is true value(as indicate by'1') 53 | :param label_size: e.g.199 54 | :return:e.g.[1,1,0,1,0,0,........] 55 | """ 56 | result=np.zeros(label_size) 57 | #set those location as 1, all else place as 0. 58 | result[label_list] = 1 59 | return result 60 | 61 | #use pretrained word embedding to get word vocabulary and labels, and its relationship with index 62 | def create_vocabulary(training_data_path,vocab_size,name_scope='cnn'): 63 | """ 64 | create vocabulary 65 | :param training_data_path: 66 | :param vocab_size: 67 | :param name_scope: 68 | :return: 69 | """ 70 | 71 | cache_vocabulary_label_pik=local_path+'cache'+"_"+name_scope # path to save cache 72 | if not os.path.isdir(cache_vocabulary_label_pik): # create folder if not exists. 73 | os.makedirs(cache_vocabulary_label_pik) 74 | 75 | # if cache exists. load it; otherwise create it. 
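The cache logic that follows is a load-or-build pattern; reduced to its core it looks roughly like this (a sketch with a hypothetical build_fn; the original appends with mode 'ab', 'wb' is shown here for simplicity):

    import os
    import pickle

    def load_or_build(cache_path, build_fn):
        # reuse a previously pickled vocabulary if it exists
        if os.path.exists(cache_path):
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        vocab = build_fn()                 # e.g. the Counter-based construction below
        with open(cache_path, 'wb') as f:
            pickle.dump(vocab, f)
        return vocab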
76 | cache_path =cache_vocabulary_label_pik+"/"+'vocab_label.pik' 77 | print("cache_path:",cache_path,"file_exists:",os.path.exists(cache_path)) 78 | if os.path.exists(cache_path): 79 | with open(cache_path, 'rb') as data_f: 80 | return pickle.load(data_f) 81 | else: 82 | vocabulary_word2index={} 83 | vocabulary_index2word={} 84 | # vocabulary_word2index[_PAD]=PAD_ID 85 | # vocabulary_index2word[PAD_ID]=_PAD 86 | # vocabulary_word2index[_UNK]=UNK_ID 87 | # vocabulary_index2word[UNK_ID]=_UNK 88 | 89 | vocabulary_label2index={} 90 | vocabulary_index2label={} 91 | 92 | #1.load raw data 93 | file_object = codecs.open(training_data_path, mode='r', encoding='utf-8') 94 | lines=file_object.readlines() 95 | #2.loop each line,put to counter 96 | c_inputs=Counter() 97 | c_labels=Counter() 98 | for line in lines: 99 | raw_list=line.strip().split("__myprefix__") 100 | 101 | input_list = raw_list[0].strip().split(" ") 102 | input_list = [x.strip().replace(" ", "") for x in input_list if x != ''] 103 | label_list=[l.strip().replace(" ","") for l in raw_list[1:] if l!=''] 104 | c_inputs.update(input_list) 105 | c_labels.update(label_list) 106 | #return most frequency words 107 | vocab_list=c_inputs.most_common(vocab_size) 108 | label_list=c_labels.most_common() 109 | #put those words to dict 110 | for i,tuplee in enumerate(vocab_list): 111 | word,_=tuplee 112 | vocabulary_word2index[word]=i+1 113 | vocabulary_index2word[i+1]=word 114 | 115 | for i,tuplee in enumerate(label_list): 116 | label,_=tuplee;label=str(label) 117 | vocabulary_label2index[label]=i 118 | vocabulary_index2label[i]=label 119 | 120 | #save to file system if vocabulary of words not exists. 121 | if not os.path.exists(cache_path): 122 | with open(cache_path, 'ab') as data_f: 123 | pickle.dump((vocabulary_word2index,vocabulary_index2word,vocabulary_label2index,vocabulary_index2label), data_f) 124 | return vocabulary_word2index,vocabulary_index2word,vocabulary_label2index,vocabulary_index2label 125 | 126 | def get_target_label_short(eval_y): 127 | eval_y_short=[] #will be like:[22,642,1391] 128 | for index,label in enumerate(eval_y): 129 | if label>0: 130 | eval_y_short.append(index) 131 | return eval_y_short 132 | 133 | # training_data_path = '/Users/liyangyang/Downloads/bdci/train.txt' 134 | # vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label= \ 135 | # create_vocabulary(training_data_path,17259,name_scope='cnn') 136 | # vocab_size = len(vocabulary_word2index);print("cnn_model.vocab_size:",vocab_size);num_classes=len(vocabulary_index2label);print("num_classes:",num_classes) 137 | # print(vocabulary_index2label) 138 | # train, test= load_data_multilabel(training_data_path,vocabulary_word2index, vocabulary_label2index,200) 139 | # trainX, trainY = train 140 | # testX, testY = test 141 | # #print some message for debug purpose 142 | # print("length of training data:",len(trainX),";length of validation data:",len(testX)) 143 | # print("trainX[0]:", trainX[1]); 144 | # print("trainY[0]:", trainY[1]) 145 | # # train_y_short = get_target_label_short(trainY[1]) 146 | # # print("train_y_short:", train_y_short) 147 | # for i in range(1,100): 148 | # print(vocabulary_index2word[i]) -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a02_TextCNN/other_experiement/__init__.py -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/p7_TextCNN_predict_ensemble.py: -------------------------------------------------------------------------------- 1 | from p7_TextCNN_predict import get_logits_with_value_by_input 2 | from p7_TextCNN_predict_exp import get_logits_with_value_by_input_exp 3 | import tensorflow as tf 4 | def main(_): 5 | for start in range(217360): 6 | end=start+1 7 | label_list,p_list=get_logits_with_value_by_input(start,end) 8 | label_list_exp, p_list_exp=get_logits_with_value_by_input_exp(start,end) 9 | 10 | if start<5: 11 | print("----------------------------------------------------") 12 | print(start,"label_list0:",label_list,"p_list0:",p_list) 13 | print(start,"label_list1:", label_list_exp, "p_list1:", p_list_exp) 14 | else: 15 | break 16 | 17 | 18 | 19 | if __name__ == "__main__": 20 | tf.app.run() -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/p7_TextCNN_predict_exp512.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512/zhihu_result_cnn_multilabel_v7_exp512_20170616.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | 
tf.app.flags.DEFINE_integer("num_filters", 600, "number of filters") #128-->512 33 | tf.app.flags.DEFINE_string("ckpt_dir2","text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 56 | print("end padding...") 57 | # 3.create session. 58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 
80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/p7_TextCNN_predict_exp512_0609.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","text_cnn_title_desc_checkpoint_exp512_0609/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","text_cnn_title_desc_checkpoint_exp512_0609/zhihu_result_cnn_multilabel_exp512_0609.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters") #128 33 | tf.app.flags.DEFINE_string("ckpt_dir2","text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[2,3,5,6,7,8] #[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 56 | print("end padding...") 57 | # 3.create session. 
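get_label_using_logits(), defined further down in this script, picks the top-scoring classes by sorting the logits; the same selection as a standalone sketch with toy numbers:

    import numpy as np

    logits = np.array([0.1, 2.3, -0.4, 1.7, 0.9])        # one row of model scores (toy values)
    top_number = 3
    index_list = np.argsort(logits)[-top_number:][::-1]  # indices of the largest logits, best first
    print(index_list)                                    # [1 3 4]
    index2label = {i: "label_%d" % i for i in range(5)}  # hypothetical id-to-label mapping
    print([index2label[i] for i in index_list])          # ['label_1', 'label_3', 'label_4']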
58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/p7_TextCNN_predict_exp512_simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512_simple/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512_simple/zhihu_result_cnn_multilabel_exp512_simple.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters") #128 33 | tf.app.flags.DEFINE_string("ckpt_dir2","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[7] #[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) 
# padding to max length 56 | print("end padding...") 57 | # 3.create session. 58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) 116 | -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/p8_TextCNN_predict_exp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 
2.create session. 3.feed data. 4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","text_cnn_title_desc_checkpoint_exp/zhihu_result_cnn_multilabel_v6_exp.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters") #128 33 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 34 | 35 | ################################################################################################################################## 36 | filter_sizes=[3,4,5,7,15,20,25] 37 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.training (5.validation) ,(6.prediction) 38 | # 1.load data with vocabulary of words and labels 39 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple', 40 | word2vec_model_path=FLAGS.word2vec_model_path, 41 | name_scope="cnn2") 42 | vocab_size = len(vocabulary_word2index) 43 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 44 | questionid_question_lists = load_final_test_data(FLAGS.predict_source_file) 45 | test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists) 46 | testX = [] 47 | question_id_list = [] 48 | for tuple in test: 49 | question_id, question_string_list = tuple 50 | question_id_list.append(question_id) 51 | testX.append(question_string_list) 52 | # 2.Data preprocessing: Sequence padding 53 | print("start padding....") 54 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 55 | print("end padding...") 56 | # 3.create session. 
57 | config = tf.ConfigProto() 58 | config.gpu_options.allow_growth = True 59 | sess=tf.Session(config=config) 60 | # 4.Instantiate Model 61 | textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, 62 | FLAGS.decay_steps, FLAGS.decay_rate, 63 | FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training) 64 | saver = tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir + "checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | #return 71 | # 5.feed data, to get logits 72 | number_of_training_data = len(testX2); 73 | print("number_of_training_data:", number_of_training_data) 74 | #index = 0 75 | #predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 76 | ################################################################################################################################## 77 | def get_logits_by_input_exp(start,end): 78 | x=testX2[start:end] 79 | logits = sess.run(textCNN.logits, feed_dict={textCNN.input_x: x, textCNN.dropout_keep_prob: 1}) 80 | predicted_labels,value_labels = get_label_using_logits_with_value(logits[0], vocabulary_index2word_label) 81 | value_labels_exp= np.exp(value_labels) 82 | p_labels=value_labels_exp/np.sum(value_labels_exp) 83 | return predicted_labels,p_labels 84 | 85 | def main(_): 86 | # 1.load data with vocabulary of words and labels 87 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 88 | vocab_size = len(vocabulary_word2index) 89 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 90 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 91 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 92 | testX=[] 93 | question_id_list=[] 94 | for tuple in test: 95 | question_id,question_string_list=tuple 96 | question_id_list.append(question_id) 97 | testX.append(question_string_list) 98 | # 2.Data preprocessing: Sequence padding 99 | print("start padding....") 100 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 101 | print("end padding...") 102 | # 3.create session. 
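get_logits_by_input_exp() above turns the top-5 logit values into a small probability distribution by taking a softmax over just those five values; the arithmetic in isolation (toy numbers):

    import numpy as np

    value_labels = np.array([3.0, 1.0, 0.5, 0.2, -1.0])    # logit values of the top-5 labels (toy)
    value_labels_exp = np.exp(value_labels)
    p_labels = value_labels_exp / np.sum(value_labels_exp)
    print(p_labels)  # approx. [0.77 0.10 0.06 0.05 0.01], sums to 1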
103 | config=tf.ConfigProto() 104 | config.gpu_options.allow_growth=True 105 | with tf.Session(config=config) as sess: 106 | # 4.Instantiate Model 107 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 108 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 109 | saver=tf.train.Saver() 110 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 111 | print("Restoring Variables from Checkpoint") 112 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 113 | else: 114 | print("Can't find the checkpoint.going to stop") 115 | return 116 | # 5.feed data, to get logits 117 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 118 | index=0 119 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 120 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 121 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 122 | # 6. get lable using logtis 123 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 124 | # 7. write question id and labels to file system. 125 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 126 | index=index+1 127 | predict_target_file_f.close() 128 | 129 | # get label using logits 130 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 131 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 132 | index_list=index_list[::-1] 133 | label_list=[] 134 | for index in index_list: 135 | label=vocabulary_index2word_label[index] 136 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 137 | return label_list 138 | 139 | # get label using logits 140 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 141 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 142 | index_list=index_list[::-1] 143 | value_list=[] 144 | label_list=[] 145 | for index in index_list: 146 | label=vocabulary_index2word_label[index] 147 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 148 | value_list.append(logits[index]) 149 | return label_list,value_list 150 | 151 | # write question id and labels to file system. 
152 | def write_question_id_with_labels(question_id,labels_list,f): 153 | labels_string=",".join(labels_list) 154 | f.write(question_id+","+labels_string+"\n") 155 | 156 | if __name__ == "__main__": 157 | #tf.app.run() 158 | labels,list_value=get_logits_by_input_exp(0, 1) 159 | print("labels:",labels) 160 | print("list_value:", list_value) -------------------------------------------------------------------------------- /venv/dwb/github_model/a03_TextRNN/__pycache__/p8_TextRNN_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a03_TextRNN/__pycache__/p8_TextRNN_model.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/github_model/a03_TextRNN/p8_TextRNN_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #TextRNN: 1. embeddding layer, 2.Bi-LSTM layer, 3.concat output, 4.FC layer, 5.softmax 3 | import tensorflow as tf 4 | from tensorflow.contrib import rnn 5 | import numpy as np 6 | 7 | class TextRNN: 8 | def __init__(self,num_classes, learning_rate, batch_size, decay_steps, decay_rate,sequence_length, 9 | vocab_size,embed_size,is_training,initializer=tf.random_normal_initializer(stddev=0.1)): 10 | """init all hyperparameter here""" 11 | # set hyperparamter 12 | self.num_classes = num_classes 13 | self.batch_size = batch_size 14 | self.sequence_length=sequence_length 15 | self.vocab_size=vocab_size 16 | self.embed_size=embed_size 17 | self.hidden_size=embed_size 18 | self.is_training=is_training 19 | self.learning_rate=learning_rate 20 | self.initializer=initializer 21 | self.num_sampled=20 22 | 23 | # add placeholder (X,label) 24 | self.input_x = tf.placeholder(tf.int32, [None, self.sequence_length], name="input_x") # X 25 | self.input_y = tf.placeholder(tf.int32,[None], name="input_y") # y [None,num_classes] 26 | self.dropout_keep_prob=tf.placeholder(tf.float32,name="dropout_keep_prob") 27 | 28 | self.global_step = tf.Variable(0, trainable=False, name="Global_Step") 29 | self.epoch_step=tf.Variable(0,trainable=False,name="Epoch_Step") 30 | self.epoch_increment=tf.assign(self.epoch_step,tf.add(self.epoch_step,tf.constant(1))) 31 | self.decay_steps, self.decay_rate = decay_steps, decay_rate 32 | 33 | self.instantiate_weights() 34 | self.logits = self.inference() #[None, self.label_size]. main computation graph is here. 
35 | if not is_training: 36 | return 37 | self.loss_val = self.loss() #-->self.loss_nce() 38 | self.train_op = self.train() 39 | self.predictions = tf.argmax(self.logits, axis=1, name="predictions") # shape:[None,] 40 | correct_prediction = tf.equal(tf.cast(self.predictions,tf.int32), self.input_y) #tf.argmax(self.logits, 1)-->[batch_size] 41 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="Accuracy") # shape=() 42 | def instantiate_weights(self): 43 | """define all weights here""" 44 | with tf.name_scope("embedding"): # embedding matrix 45 | self.Embedding = tf.get_variable("Embedding",shape=[self.vocab_size, self.embed_size],initializer=self.initializer) #[vocab_size,embed_size] tf.random_uniform([self.vocab_size, self.embed_size],-1.0,1.0) 46 | self.W_projection = tf.get_variable("W_projection",shape=[self.hidden_size*2, self.num_classes],initializer=self.initializer) #[embed_size,label_size] 47 | self.b_projection = tf.get_variable("b_projection",shape=[self.num_classes]) #[label_size] 48 | 49 | def inference(self): 50 | """main computation graph here: 1. embeddding layer, 2.Bi-LSTM layer, 3.concat, 4.FC layer 5.softmax """ 51 | #1.get emebedding of words in the sentence 52 | self.embedded_words = tf.nn.embedding_lookup(self.Embedding,self.input_x) #shape:[None,sentence_length,embed_size] 53 | #2. Bi-lstm layer 54 | # define lstm cess:get lstm cell output 55 | lstm_fw_cell=rnn.BasicLSTMCell(self.hidden_size) #forward direction cell 56 | lstm_bw_cell=rnn.BasicLSTMCell(self.hidden_size) #backward direction cell 57 | if self.dropout_keep_prob is not None: 58 | lstm_fw_cell=rnn.DropoutWrapper(lstm_fw_cell,output_keep_prob=self.dropout_keep_prob) 59 | lstm_bw_cell=rnn.DropoutWrapper(lstm_bw_cell,output_keep_prob=self.dropout_keep_prob) 60 | # bidirectional_dynamic_rnn: input: [batch_size, max_time, input_size] 61 | # output: A tuple (outputs, output_states) 62 | # where:outputs: A tuple (output_fw, output_bw) containing the forward and the backward rnn output `Tensor`. 63 | outputs,_=tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,lstm_bw_cell,self.embedded_words,dtype=tf.float32) #[batch_size,sequence_length,hidden_size] #creates a dynamic bidirectional recurrent neural network 64 | print("outputs:===>",outputs) #outputs:(, )) 65 | #3. concat output 66 | output_rnn=tf.concat(outputs,axis=2) #[batch_size,sequence_length,hidden_size*2] 67 | #self.output_rnn_last=tf.reduce_mean(output_rnn,axis=1) #[batch_size,hidden_size*2] 68 | self.output_rnn_last=output_rnn[:,-1,:] ##[batch_size,hidden_size*2] #TODO 69 | print("output_rnn_last:", self.output_rnn_last) # 70 | #4. logits(use linear layer) 71 | with tf.name_scope("output"): #inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 72 | logits = tf.matmul(self.output_rnn_last, self.W_projection) + self.b_projection # [batch_size,num_classes] 73 | return logits 74 | 75 | def loss(self,l2_lambda=0.0001): 76 | with tf.name_scope("loss"): 77 | #input: `logits` and `labels` must have the same shape `[batch_size, num_classes]` 78 | #output: A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the softmax cross entropy loss. 
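The shape bookkeeping in inference() above, spelled out with NumPy stand-ins (a sketch of the tensor shapes only; toy sizes, not the actual LSTM computation):

    import numpy as np

    batch_size, sequence_length, hidden_size, num_classes = 8, 5, 100, 10
    output_fw = np.random.randn(batch_size, sequence_length, hidden_size)  # forward LSTM outputs
    output_bw = np.random.randn(batch_size, sequence_length, hidden_size)  # backward LSTM outputs
    output_rnn = np.concatenate([output_fw, output_bw], axis=2)            # [batch, seq_len, hidden*2]
    output_rnn_last = output_rnn[:, -1, :]                                 # last time step only
    W_projection = np.random.randn(hidden_size * 2, num_classes)
    logits = output_rnn_last @ W_projection                                # [batch, num_classes]
    print(output_rnn.shape, output_rnn_last.shape, logits.shape)           # (8, 5, 200) (8, 200) (8, 10)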
79 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.input_y, logits=self.logits);#sigmoid_cross_entropy_with_logits.#losses=tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y,logits=self.logits) 80 | #print("1.sparse_softmax_cross_entropy_with_logits.losses:",losses) # shape=(?,) 81 | loss=tf.reduce_mean(losses)#print("2.loss.loss:", loss) #shape=() 82 | l2_losses = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name]) * l2_lambda 83 | loss=loss+l2_losses 84 | return loss 85 | 86 | def loss_nce(self,l2_lambda=0.0001): #0.0001-->0.001 87 | """calculate loss using (NCE)cross entropy here""" 88 | # Compute the average NCE loss for the batch. 89 | # tf.nce_loss automatically draws a new sample of the negative labels each 90 | # time we evaluate the loss. 91 | if self.is_training: #training 92 | #labels=tf.reshape(self.input_y,[-1]) #[batch_size,1]------>[batch_size,] 93 | labels=tf.expand_dims(self.input_y,1) #[batch_size,]----->[batch_size,1] 94 | loss = tf.reduce_mean( #inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 95 | tf.nn.nce_loss(weights=tf.transpose(self.W_projection),#[hidden_size*2, num_classes]--->[num_classes,hidden_size*2]. nce_weights:A `Tensor` of shape `[num_classes, dim].O.K. 96 | biases=self.b_projection, #[label_size]. nce_biases:A `Tensor` of shape `[num_classes]`. 97 | labels=labels, #[batch_size,1]. train_labels, # A `Tensor` of type `int64` and shape `[batch_size,num_true]`. The target classes. 98 | inputs=self.output_rnn_last,# [batch_size,hidden_size*2] #A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 99 | num_sampled=self.num_sampled, #scalar. 100 100 | num_classes=self.num_classes,partition_strategy="div")) #scalar. 1999 101 | l2_losses = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name]) * l2_lambda 102 | loss = loss + l2_losses 103 | return loss 104 | 105 | def train(self): 106 | """based on the loss, use SGD to update parameter""" 107 | learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps,self.decay_rate, staircase=True) 108 | train_op = tf.contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step,learning_rate=learning_rate, optimizer="Adam") 109 | return train_op 110 | 111 | #test started 112 | def test(): 113 | #below is a function test; if you use this for text classifiction, you need to tranform sentence to indices of vocabulary first. then feed data to the graph. 
114 | num_classes=10 115 | learning_rate=0.01 116 | batch_size=8 117 | decay_steps=1000 118 | decay_rate=0.9 119 | sequence_length=5 120 | vocab_size=10000 121 | embed_size=100 122 | is_training=True 123 | dropout_keep_prob=1#0.5 124 | textRNN=TextRNN(num_classes, learning_rate, batch_size, decay_steps, decay_rate,sequence_length,vocab_size,embed_size,is_training) 125 | with tf.Session() as sess: 126 | sess.run(tf.global_variables_initializer()) 127 | for i in range(100): 128 | input_x=np.zeros((batch_size,sequence_length)) #[None, self.sequence_length] 129 | input_y=input_y=np.array([1,0,1,1,1,2,1,1]) #np.zeros((batch_size),dtype=np.int32) #[None, self.sequence_length] 130 | loss,acc,predict,_=sess.run([textRNN.loss_val,textRNN.accuracy,textRNN.predictions,textRNN.train_op],feed_dict={textRNN.input_x:input_x,textRNN.input_y:input_y,textRNN.dropout_keep_prob:dropout_keep_prob}) 131 | print("loss:",loss,"acc:",acc,"label:",input_y,"prediction:",predict) 132 | # test() 133 | -------------------------------------------------------------------------------- /venv/dwb/github_model/a03_TextRNN/p8_TextRNN_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.predict 4 | import sys 5 | sys.path.append("/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn") 6 | import data_helpers 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | sys.path.append("/Users/liyangyang/PycharmProjects/mypy/venv/dwb/github_model/a03_TextRNN") 12 | from p8_TextRNN_model import TextRNN 13 | import os 14 | import pickle 15 | from tensorflow.contrib import learn 16 | import codecs 17 | 18 | #configuration 19 | FLAGS=tf.app.flags.FLAGS 20 | tf.app.flags.DEFINE_integer("num_classes",19,"number of label") 21 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 22 | tf.app.flags.DEFINE_integer("batch_size", 128, "Batch size for training/evaluating.") #批处理的大小 32-->128 23 | tf.app.flags.DEFINE_integer("decay_steps", 12000, "how many steps before decay learning rate.") #批处理的大小 32-->128 24 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 25 | tf.app.flags.DEFINE_string("ckpt_dir", "/Users/liyangyang/PycharmProjects/mypy/venv/dwb/github_model/a03_TextRNN/text_rnn_checkpoint/", "checkpoint location for the model") 26 | tf.app.flags.DEFINE_integer("sequence_length",2000,"max sentence length") 27 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 28 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 29 | tf.app.flags.DEFINE_string("data_file", "test_set.csv", "Data source for the positive data.") 30 | tf.app.flags.DEFINE_string("predict_target_file","/Users/liyangyang/PycharmProjects/mypy/venv/dwb/github_model/a03_TextRNN/result_rnn.csv","target file path for final prediction") 31 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.training (5.validation) ,(6.prediction) 32 | def main(_): 33 | # 1.load data with vocabulary of words and labels 34 | vocab_processor_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn/vocab' 35 | # print("end padding & transform to one hot...") 36 | x_train, y = data_helpers.load_data_and_labels(FLAGS.data_file) 37 | print('y.shape',y.shape) 38 | 39 | # vocab_processor = learn.preprocessing.VocabularyProcessor(2000,min_frequency=2) 40 | # x = np.array(list(vocab_processor.fit_transform(x_train))) 41 | # vocab_processor.save(vocab_processor_path) 42 | 43 | vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_processor_path) 44 | testX2 = np.array(list(vocab_processor.transform(x_train))) 45 | vocab_size = len(vocab_processor.vocabulary_) 46 | print("end padding...") 47 | # 3.create session. 48 | config=tf.ConfigProto() 49 | config.gpu_options.allow_growth=True 50 | with tf.Session(config=config) as sess: 51 | # 4.Instantiate Model 52 | textRNN=TextRNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length, 53 | vocab_size, FLAGS.embed_size, FLAGS.is_training) 54 | saver=tf.train.Saver() 55 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 56 | print("Restoring Variables from Checkpoint for TextRNN") 57 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 58 | else: 59 | print("Can't find the checkpoint.going to stop") 60 | return 61 | # 5.feed data, to get logits 62 | number_of_training_data=len(testX2) 63 | print("number_of_training_data:",number_of_training_data) 64 | index=0 65 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 66 | #for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 67 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 68 | logits=sess.run(textRNN.logits,feed_dict={textRNN.input_x:testX2[start:end],textRNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 69 | # 6. get lable using logtis 70 | #predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) #logits[0] 71 | # 7. write question id and labels to file system. 72 | #write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 73 | ############################################################################################################# 74 | print("start:",start,";end:",end) 75 | question_id_sublist=y[start:end] 76 | get_label_using_logits_batch(question_id_sublist, logits, predict_target_file_f) 77 | ######################################################################################################## 78 | index=index+1 79 | predict_target_file_f.close() 80 | 81 | # get label using logits 82 | def get_label_using_logits_batch(question_id_sublist,logits_batch,f): 83 | print("get_label_using_logits.shape:", logits_batch.shape) # (10, 1999))=[batch_size,num_labels]===>需要(10,5) 84 | for i,logits in enumerate(logits_batch): 85 | lable = int(np.argmax(logits))+1 # print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 86 | # print(question_id_sublist[i],lable) 87 | write_question_id_with_labels(question_id_sublist[i], lable, f) 88 | f.flush() 89 | # write question id and labels to file system. 
90 | def write_question_id_with_labels(question_id,lable,f): 91 | f.write(str(question_id)+","+str(lable)+"\n") 92 | 93 | if __name__ == "__main__": 94 | tf.app.run() -------------------------------------------------------------------------------- /venv/dwb/jieba1/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/10 下午6:49 4 | # @Author :hwwu 5 | # @File :merge.py 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | path = '/Users/liyangyang/Downloads/jieba/data/' 11 | 12 | result = pd.read_csv(path + 'result.csv') 13 | kw = pd.read_csv(path + 'train_docs_keywords.txt', sep='\t', header=None) 14 | kw.columns = ['id', 'lw'] 15 | # print(kw) 16 | result['label1']=result['label1'].replace('','nan') 17 | result['label2']=result['label2'].replace('','nan') 18 | id = [] 19 | label1 = [] 20 | label2 = [] 21 | print(len(result)) 22 | for i in range(len(result)): 23 | m_id = str(result['id'][i]) 24 | if (m_id=='D101107'): 25 | print(m_id,str(result['label1'][i])) 26 | print(m_id,str(result['label2'][i])) 27 | id.append(m_id) 28 | l1 = '' 29 | l2 = '' 30 | for j in range(len(kw)): 31 | if (m_id == kw['id'][j]): 32 | words = str(kw['lw'][j]).split(',') 33 | if(len(words)>1): 34 | l1 = words[0] 35 | l2 = words[1] 36 | else: 37 | l1 = words[0] 38 | l2 = 'nan' 39 | print(m_id,l1,l2) 40 | if (l1 != ''): 41 | label1.append(l1.replace(',','')) 42 | label2.append(l2.replace(',','')) 43 | else: 44 | label1.append(str(result['label1'][i]).replace(',','')) 45 | label2.append(str(result['label2'][i]).replace(',','')) 46 | 47 | id_column = pd.Series(id, name='id') 48 | label1_column = pd.Series(label1, name='label1') 49 | label2_column = pd.Series(label2, name='label2') 50 | predictions = pd.concat([id_column, label1_column, label2_column], axis=1) 51 | predictions.to_csv(path + 'merge_result_data.csv', index=0, sep=',', columns=['id', 'label1', 'label2'],encoding='UTF-8') 52 | 53 | -------------------------------------------------------------------------------- /venv/dwb/jieba1/tjieba.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/10 下午1:42 4 | # @Author :hwwu 5 | # @File :jieba1.py 6 | 7 | path = '/Users/liyangyang/Downloads/jieba/data/' 8 | file = 'all_docs.txt' 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | data = pd.read_csv(path+file,sep='\001',header=None) 14 | data.columns = ['id','title','doc'] 15 | # # print(data['doc']) 16 | # new_data = data['title']+data['doc'] 17 | # print(new_data) 18 | # 19 | # regex = analyse.extract_tags(new_data,topK=4,withWeight=False,allowPOS=()) 20 | # 21 | # print(regex) 22 | 23 | print(len(data)) 24 | 25 | import codecs 26 | import os 27 | 28 | import jieba 29 | import jieba.analyse as analyse 30 | import numpy as np 31 | import pandas as pd 32 | from sklearn.feature_extraction.text import TfidfVectorizer 33 | 34 | base_path = path + file 35 | seg_path = path + 'all_docs_seg.txt' 36 | 37 | stopword_path='/Users/liyangyang/Downloads/stopwords/CNENstopwords.txt' 38 | 39 | def stopwordslist(): 40 | stopwords = [line.strip() for line in open(stopword_path, 'r', encoding='utf-8').readlines()] 41 | return stopwords 42 | 43 | 44 | def segment(): 45 | """word segment""" 46 | fw = codecs.open(seg_path, 'w', 'utf-8') 47 | doc_list=[] 48 | for i in range(len(data)): 49 | title = str(data['title'][i]) 50 | doc = str(data['doc'][i]) 51 | # row 
= line.split('\001') 52 | # seg_list = jieba.cut(line.strip()) 53 | sentence_seged = jieba.cut((title+ '。' + doc).strip()) 54 | stopwords = stopwordslist() 55 | outstr = '' 56 | for word in sentence_seged: 57 | if word not in stopwords: 58 | if word != '\t': 59 | outstr += word 60 | outstr +='\t' 61 | l = outstr 62 | doc_list.append(l) 63 | fw.write(l) 64 | fw.write('\n') 65 | fw.flush() 66 | fw.close() 67 | return doc_list 68 | 69 | def tfidf_top(trade_list, doc_list, max_df, topn): 70 | vectorizer = TfidfVectorizer(max_df=max_df,min_df=1,use_idf=1,smooth_idf=1, sublinear_tf=1) 71 | matrix = vectorizer.fit_transform(doc_list) 72 | feature_dict = {v: k for k, v in vectorizer.vocabulary_.items()} # index -> feature_name 73 | top_n_matrix = np.argsort(-matrix.todense())[:, :topn] # top tf-idf words for each row 74 | df = pd.DataFrame(np.vectorize(feature_dict.get)(top_n_matrix), index=trade_list) # convert matrix to df 75 | return df 76 | 77 | 78 | # dl = segment() 79 | # print('first') 80 | # tl = data['id'] 81 | # tdf = tfidf_top(tl, dl, max_df=0.5, topn=2) 82 | # print('second') 83 | # tdf.to_csv(path+'resilt.csv', header=False, encoding='utf-8') 84 | # print('done') 85 | 86 | 87 | 88 | # fw = codecs.open(path+'result.csv', 'w', 'utf-8') 89 | # 90 | # fw.write("id,label1,label2"+"\n") 91 | # 92 | # 93 | # def textrank(): 94 | # n = 0 95 | # fw = codecs.open(seg_path, 'w', 'utf-8') 96 | # # doc_list = [] 97 | # # for i in range(len(data)): 98 | # for i in range(100): 99 | # # n+=1 100 | # # title = str(data['title'][i]) 101 | # # doc = str(data['doc'][i]) 102 | # row = line.split('\001') 103 | # seg_list = jieba.cut(line.strip()) 104 | # # sentence_seged = jieba.cut((title + '。' + doc).strip()) 105 | # # stopwords = stopwordslist() 106 | # # outstr = '' 107 | # # for word in sentence_seged: 108 | # # if word not in stopwords: 109 | # if word != '\t': 110 | # outstr += word 111 | # outstr += '\t' 112 | # keywords = analyse.textrank(outstr, topK=2, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) 113 | # if (len(keywords)==0): 114 | # fw.write(str(data['id'][i]) + "," + str('') + "," + str('') + "\n") 115 | # elif (len(keywords)==1): 116 | # fw.write(str(data['id'][i]) + "," + str(keywords[0]) + "," + str('') + "\n") 117 | # else: 118 | # fw.write(str(data['id'][i]) + "," + str(keywords[0]) + "," + str(keywords[1]) + "\n") 119 | # 120 | # if(n%1000==0): 121 | # print('flush',n/1000) 122 | # fw.flush() 123 | # textrank() 124 | -------------------------------------------------------------------------------- /venv/dwb/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/5 上午11:17 4 | # @Author :hwwu 5 | # @File :merge.py 6 | 7 | import pandas as pd, numpy as np 8 | 9 | path = '/Users/liyangyang/Downloads/dwb/new_data/' 10 | 11 | def write_result(id, predictions): 12 | r_id = [] 13 | r_predictions = [] 14 | for i in range(len(id)): 15 | r_id.append(int(id[i])) 16 | r_predictions.append(int(predictions[i])) 17 | 18 | english_column = pd.Series(r_id, name='id') 19 | number_column = pd.Series(r_predictions, name='class') 20 | predictions = pd.concat([english_column, number_column], axis=1) 21 | predictions.to_csv(path + 'merge_result_data.csv', index=0, sep=',', columns=['id', 'class']) 22 | 23 | 24 | r75 = pd.read_csv(path+'result_data.csv')['class'] 25 | rcnn = pd.read_csv(path+'p4_cnn_result_data.csv')['class'] 26 | rrnn = pd.read_csv(path+'result_rnn.csv')['class'] 27 | 28 | id = [] 
29 | predictions =[] 30 | for i in range(len(r75)): 31 | id.append(i) 32 | if (rcnn[i]==rrnn[i]): 33 | predictions.append(rcnn[i]) 34 | else: 35 | predictions.append(r75[i]) 36 | 37 | write_result(id,predictions) 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /venv/dwb/par.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/5 下午2:15 4 | # @Author :hwwu 5 | # @File :par.py 6 | 7 | import pandas as pd, numpy as np 8 | 9 | # path = '/Users/liyangyang/Downloads/dwb/new_data/' 10 | path = '/Users/liyangyang/Downloads/bdci/' 11 | 12 | # train = pd.read_csv(path + 'train_set.csv')['word_seg'] 13 | 14 | from gensim.models import word2vec 15 | 16 | # train = np.load(path + 'vocab/vocab_train.npy') 17 | # test = np.load(path + 'vocab/vocab_test.npy') 18 | # 19 | # test = np.array(test) 20 | # 21 | # total = np.append(train, test, axis=0) 22 | # 23 | # t =[] 24 | # for i in range(len(train)): 25 | # row = train[i] 26 | # r = [] 27 | # for j in range(len(row)): 28 | # r.append(str(row[j])) 29 | # r = np.reshape(r,[len(row),1]) 30 | # t.append(r) 31 | # t = np.array(t) 32 | # print(train.shape) 33 | # 34 | # sentences = word2vec.PathLineSentences(path+'train_no_lable.txt') 35 | # model = word2vec.Word2Vec(sentences,size=128, min_count=1, iter=10,workers=10) 36 | # 37 | # model.save(path+'word2vec/model') 38 | # 39 | # 40 | model = word2vec.Word2Vec.load(path+'word2vec/model') 41 | # 42 | # print(train[0]) 43 | print(model.wv['系统']) 44 | 45 | similarities=model.wv.most_similar('系统',topn=20) 46 | 47 | for word , score in similarities: 48 | print(word , score) 49 | 50 | # y1 = model.similarity('1', '2') 51 | # print(y1) 52 | # 53 | # y2 = model.similarity('1', '3') 54 | # print(y2) 55 | 56 | # word2vec.word2vec('/Users/liyangyang/Downloads/word2vec-0.10.2/README.md',path+'word2vec/model.bin',size=128,iter_=10,threads=10,min_count=1) 57 | -------------------------------------------------------------------------------- /venv/dwb/testcnn/__pycache__/data_helpers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/testcnn/__pycache__/data_helpers.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/testcnn/__pycache__/text_cnn.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/testcnn/__pycache__/text_cnn.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/testcnn/data_helpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/20 下午4:16 4 | # @Author :hwwu 5 | # @File :data_helpers.py 6 | 7 | import numpy as np 8 | import pandas as pd 9 | path = '/Users/liyangyang/Downloads/dwb/new_data/' 10 | column = "word_seg" 11 | import random 12 | 13 | 14 | stopword_path = '/Users/liyangyang/Downloads/stopwords/stopwords1893.txt' 15 | import jieba 16 | 17 | 18 | def stopwordslist(): 19 | # stopwords = [line.strip() for line in open(stopword_path, 'r', encoding='utf-8').readlines()] 20 | stopwords = [',', '。', '、', '...', '“', '”', '《', '》', ':', 
';'] 21 | return stopwords 22 | 23 | def dense_to_one_hot(labels_dense, num_classes): 24 | """Convert class labels from scalars to one-hot vectors.""" 25 | num_labels = labels_dense.shape[0] 26 | index_offset = np.arange(num_labels) * num_classes 27 | labels_one_hot = np.zeros((num_labels, num_classes)) 28 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 29 | return labels_one_hot 30 | 31 | def load_data_and_labels(data_file): 32 | """ 33 | Loads the labelled training data, segments the text with jieba and generates subject labels. 34 | Returns segmented sentences and labels. 35 | """ 36 | # Load data from files 37 | # text = pd.read_csv(path + data_file) 38 | # x_text = np.array(text[column]) 39 | # 40 | # # y=[] 41 | # # for i in range(len(text['class'])): 42 | # # print(text['class']) 43 | # # y = dense_to_one_hot(text['class'],19) 44 | # y = (text["class"]-1).astype(int) 45 | # y = np.array(text["id"]) 46 | 47 | train = pd.read_csv(path + 'train.csv') 48 | train = train.sample(frac=1).reset_index(drop=True) # shuffle rows; random.shuffle() cannot shuffle a DataFrame 49 | train_doc_list = [] 50 | for i in range(len(train)): 51 | sentence_seged = jieba.cut(train['content'][i].strip()) 52 | # sentence_seged = split_word(train['content'][i].strip()) 53 | stopwords = stopwordslist() 54 | outstr = '' 55 | for word in sentence_seged: 56 | if word not in stopwords: 57 | if (word != '\t') & (word.strip() != ''): 58 | outstr += word 59 | outstr += '\t' 60 | if (outstr == ''): 61 | outstr = 'NaN' 62 | train_doc_list.append(outstr) 63 | x_train = np.array(train_doc_list) 64 | 65 | train.loc[train['subject'] == '动力', 'subject'] = 0 66 | train.loc[train['subject'] == '价格', 'subject'] = 1 67 | train.loc[train['subject'] == '内饰', 'subject'] = 2 68 | train.loc[train['subject'] == '配置', 'subject'] = 3 69 | train.loc[train['subject'] == '安全性', 'subject'] = 4 70 | train.loc[train['subject'] == '外观', 'subject'] = 5 71 | train.loc[train['subject'] == '操控', 'subject'] = 6 72 | train.loc[train['subject'] == '油耗', 'subject'] = 7 73 | train.loc[train['subject'] == '空间', 'subject'] = 8 74 | train.loc[train['subject'] == '舒适性', 'subject'] = 9 75 | y_train = train['subject'].astype(int) # note: the TextCNN train.py feeds input_y of shape [None, num_classes], so apply dense_to_one_hot(y_train.values, 10) before using it there 76 | return x_train, y_train 77 | 78 | def load_dev_data_and_labels(data_file): 79 | """ 80 | Loads the full train_set.csv for evaluation. 81 | Returns raw texts and class labels. 82 | """ 83 | # Load data from files 84 | text = pd.read_csv(path + 'train_set.csv') 85 | x_text = np.array(text[column]) 86 | # y=[] 87 | # for i in range(len(text['class'])): 88 | # print(text['class']) 89 | y = np.array(text['class']) 90 | return x_text, y 91 | 92 | 93 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 94 | """ 95 | Generates a batch iterator for a dataset.
96 | """ 97 | data = np.array(data) 98 | data_size = len(data) 99 | num_batches_per_epoch = int((len(data)-1)/batch_size) + 1 100 | for epoch in range(num_epochs): 101 | # Shuffle the data at each epoch 102 | if shuffle: 103 | shuffle_indices = np.random.permutation(np.arange(data_size)) 104 | shuffled_data = data[shuffle_indices] 105 | else: 106 | shuffled_data = data 107 | for batch_num in range(num_batches_per_epoch): 108 | start_index = batch_num * batch_size 109 | end_index = min((batch_num + 1) * batch_size, data_size) 110 | yield shuffled_data[start_index:end_index] 111 | 112 | from tensorflow.contrib import learn 113 | 114 | def test(): 115 | x_text, y = load_data_and_labels('train_set.csv') 116 | vocab_processor = learn.preprocessing.VocabularyProcessor(2000,min_frequency=3) 117 | x = np.array(list(vocab_processor.fit_transform(x_text))) 118 | 119 | vocab_size = len(vocab_processor.vocabulary_) 120 | print(vocab_size) 121 | 122 | # test() 123 | 124 | -------------------------------------------------------------------------------- /venv/dwb/testcnn/eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/20 下午4:17 4 | # @Author :hwwu 5 | # @File :eval.py 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import os 10 | import time 11 | import datetime 12 | import sys 13 | sys.path.append("/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn") 14 | import data_helpers 15 | from tensorflow.contrib import learn 16 | import csv 17 | 18 | # Parameters 19 | # ================================================== 20 | 21 | # Data Parameters 22 | tf.flags.DEFINE_string("data_file", "train_set.csv", "Data source for the positive data.") 23 | 24 | # Eval Parameters 25 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 26 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 27 | tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data") 28 | 29 | # Misc Parameters 30 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 31 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 32 | 33 | 34 | FLAGS = tf.flags.FLAGS 35 | FLAGS._parse_flags() 36 | print("\nParameters:") 37 | for attr, value in sorted(FLAGS.__flags.items()): 38 | print("{}={}".format(attr.upper(), value)) 39 | print("") 40 | 41 | # CHANGE THIS: Load data. 
Load your own data here 42 | if FLAGS.eval_train: 43 | x_raw, y_test = data_helpers.load_dev_data_and_labels(FLAGS.data_file) 44 | x_raw= x_raw[10000:12000] 45 | y_test= y_test[10000:12000] 46 | # y_test = np.argmax(y_test, axis=1) 47 | else: 48 | x_raw = ["a masterpiece four years in the making", "everything is off."] 49 | y_test = [1, 0] 50 | 51 | # Map data into vocabulary 52 | vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") 53 | vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path) 54 | x_test = np.array(list(vocab_processor.transform(x_raw))) 55 | 56 | print("\nEvaluating...\n") 57 | 58 | # Evaluation 59 | # ================================================== 60 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 61 | graph = tf.Graph() 62 | with graph.as_default(): 63 | session_conf = tf.ConfigProto( 64 | allow_soft_placement=FLAGS.allow_soft_placement, 65 | log_device_placement=FLAGS.log_device_placement) 66 | sess = tf.Session(config=session_conf) 67 | with sess.as_default(): 68 | # Load the saved meta graph and restore variables 69 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 70 | saver.restore(sess, checkpoint_file) 71 | 72 | # Get the placeholders from the graph by name 73 | input_x = graph.get_operation_by_name("input_x").outputs[0] 74 | # input_y = graph.get_operation_by_name("input_y").outputs[0] 75 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 76 | 77 | # Tensors we want to evaluate 78 | predictions = graph.get_operation_by_name("output/predictions").outputs[0] 79 | 80 | # Generate batches for one epoch 81 | batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False) 82 | 83 | # Collect the predictions here 84 | all_predictions = [] 85 | 86 | for x_test_batch in batches: 87 | batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0}) 88 | all_predictions = np.concatenate([all_predictions, batch_predictions]) 89 | 90 | # Print accuracy if y_test is defined 91 | if y_test is not None: 92 | correct_predictions = float(sum(all_predictions == y_test)) 93 | print("Total number of test examples: {}".format(len(y_test))) 94 | print("Accuracy: {:g}".format(correct_predictions/float(len(y_test)))) 95 | 96 | # Save the evaluation to a csv 97 | predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions)) 98 | out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv") 99 | print("Saving evaluation to {0}".format(out_path)) 100 | with open(out_path, 'w') as f: 101 | csv.writer(f).writerows(predictions_human_readable) -------------------------------------------------------------------------------- /venv/dwb/testcnn/text_cnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/20 下午4:09 4 | # @Author :hwwu 5 | # @File :text_cnn.py 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | 11 | class TextCNN(object): 12 | """ 13 | A CNN for text classification. 14 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 
15 | """ 16 | def __init__( 17 | self, sequence_length, num_classes, vocab_size, 18 | embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0): 19 | 20 | # Placeholders for input, output and dropout 21 | self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") 22 | self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") 23 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 24 | 25 | # Keeping track of l2 regularization loss (optional) 26 | l2_loss = tf.constant(0.0) 27 | 28 | # Embedding layer 29 | with tf.device('/cpu:0'), tf.name_scope("embedding"): 30 | self.W = tf.Variable( 31 | tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), 32 | name="W") 33 | self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) 34 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) 35 | 36 | # Create a convolution + maxpool layer for each filter size 37 | pooled_outputs = [] 38 | for i, filter_size in enumerate(filter_sizes): 39 | with tf.name_scope("conv-maxpool-%s" % filter_size): 40 | # Convolution Layer 41 | filter_shape = [filter_size, embedding_size, 1, num_filters] 42 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 43 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") 44 | conv = tf.nn.conv2d( 45 | self.embedded_chars_expanded, 46 | W, 47 | strides=[1, 1, 1, 1], 48 | padding="VALID", 49 | name="conv") 50 | # Apply nonlinearity 51 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 52 | # Maxpooling over the outputs 53 | pooled = tf.nn.max_pool( 54 | h, 55 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 56 | strides=[1, 1, 1, 1], 57 | padding='VALID', 58 | name="pool") 59 | pooled_outputs.append(pooled) 60 | 61 | # Combine all the pooled features 62 | num_filters_total = num_filters * len(filter_sizes) 63 | self.h_pool = tf.concat(pooled_outputs, 3) 64 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total]) 65 | 66 | # Add dropout 67 | with tf.name_scope("dropout"): 68 | self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob) 69 | 70 | # Final (unnormalized) scores and predictions 71 | with tf.name_scope("output"): 72 | W = tf.get_variable( 73 | "W", 74 | shape=[num_filters_total, num_classes], 75 | initializer=tf.contrib.layers.xavier_initializer()) 76 | b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b") 77 | l2_loss += tf.nn.l2_loss(W) 78 | l2_loss += tf.nn.l2_loss(b) 79 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") 80 | self.predictions = tf.argmax(self.scores, 1, name="predictions") 81 | 82 | # Calculate mean cross-entropy loss 83 | with tf.name_scope("loss"): 84 | losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y) 85 | self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss 86 | 87 | # Accuracy 88 | with tf.name_scope("accuracy"): 89 | correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 90 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") -------------------------------------------------------------------------------- /venv/dwb/testcnn/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/20 下午4:12 4 | # @Author :hwwu 5 | # @File :train.py 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import os 10 | import time 11 | import datetime 12 | 
import sys 13 | sys.path.append("/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn") 14 | import data_helpers 15 | from text_cnn import TextCNN 16 | from tensorflow.contrib import learn 17 | 18 | # Parameters 19 | # ================================================== 20 | 21 | # Data loading params 22 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 23 | tf.flags.DEFINE_string("data_file", "train_set.csv", "Data source for the positive data.") 24 | 25 | # Model Hyperparameters 26 | tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)") 27 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')") 28 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 29 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 30 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)") 31 | 32 | # Training parameters 33 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)") 34 | tf.flags.DEFINE_integer("num_epochs", 2000, "Number of training epochs (default: 200)") 35 | tf.flags.DEFINE_integer("evaluate_every", 10, "Evaluate model on dev set after this many steps (default: 100)") 36 | tf.flags.DEFINE_integer("checkpoint_every", 10, "Save model after this many steps (default: 100)") 37 | tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)") 38 | # Misc Parameters 39 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 40 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 41 | 42 | FLAGS = tf.flags.FLAGS 43 | # FLAGS._parse_flags() 44 | # print("\nParameters:") 45 | # for attr, value in sorted(FLAGS.__flags.items()): 46 | # print("{}={}".format(attr.upper(), value)) 47 | # print("") 48 | 49 | def preprocess(): 50 | # Data Preparation 51 | # ================================================== 52 | print("Loading data...") 53 | x_text, y = data_helpers.load_data_and_labels(FLAGS.data_file) 54 | vocab_processor = learn.preprocessing.VocabularyProcessor(100) 55 | x = np.array(list(vocab_processor.fit_transform(x_text))) 56 | 57 | x_train = x[:8000] 58 | x_dev = x[8000:] 59 | y_train = y[:8000] 60 | y_dev = y[8000:] 61 | 62 | print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_))) 63 | print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) 64 | return x_train, y_train, vocab_processor, x_dev, y_dev 65 | 66 | def train(x_train, y_train, vocab_processor, x_dev, y_dev): 67 | # Training 68 | # ================================================== 69 | 70 | with tf.Graph().as_default(): 71 | session_conf = tf.ConfigProto( 72 | allow_soft_placement=FLAGS.allow_soft_placement, 73 | log_device_placement=FLAGS.log_device_placement) 74 | sess = tf.Session(config=session_conf) 75 | with sess.as_default(): 76 | cnn = TextCNN( 77 | sequence_length=x_train.shape[1], 78 | num_classes=y_train.shape[1], 79 | vocab_size=len(vocab_processor.vocabulary_), 80 | embedding_size=FLAGS.embedding_dim, 81 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 82 | num_filters=FLAGS.num_filters, 83 | l2_reg_lambda=FLAGS.l2_reg_lambda) 84 | 85 | # Define Training procedure 86 | global_step = tf.Variable(0, name="global_step", trainable=False) 87 | optimizer = tf.train.AdamOptimizer(1e-3) 88 | grads_and_vars = 
optimizer.compute_gradients(cnn.loss) 89 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 90 | 91 | # Keep track of gradient values and sparsity (optional) 92 | grad_summaries = [] 93 | for g, v in grads_and_vars: 94 | if g is not None: 95 | grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) 96 | sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 97 | grad_summaries.append(grad_hist_summary) 98 | grad_summaries.append(sparsity_summary) 99 | grad_summaries_merged = tf.summary.merge(grad_summaries) 100 | 101 | # Output directory for models and summaries 102 | timestamp = str(int(time.time())) 103 | out_dir = os.path.abspath(os.path.join('/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn/model/', "runs", timestamp)) 104 | print("Writing to {}\n".format(out_dir)) 105 | 106 | # Summaries for loss and accuracy 107 | loss_summary = tf.summary.scalar("loss", cnn.loss) 108 | acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) 109 | 110 | # Train Summaries 111 | train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged]) 112 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 113 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 114 | 115 | # Dev summaries 116 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) 117 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 118 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) 119 | 120 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it 121 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 122 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 123 | if not os.path.exists(checkpoint_dir): 124 | os.makedirs(checkpoint_dir) 125 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) 126 | 127 | # Write vocabulary 128 | vocab_processor.save(os.path.join(out_dir, "vocab")) 129 | 130 | # Initialize all variables 131 | sess.run(tf.global_variables_initializer()) 132 | 133 | def train_step(x_batch, y_batch): 134 | """ 135 | A single training step 136 | """ 137 | feed_dict = { 138 | cnn.input_x: x_batch, 139 | cnn.input_y: y_batch, 140 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob 141 | } 142 | _, step, summaries, loss, accuracy = sess.run( 143 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], 144 | feed_dict) 145 | time_str = datetime.datetime.now().isoformat() 146 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 147 | train_summary_writer.add_summary(summaries, step) 148 | 149 | def dev_step(x_batch, y_batch, writer=None): 150 | """ 151 | Evaluates model on a dev set 152 | """ 153 | feed_dict = { 154 | cnn.input_x: x_batch, 155 | cnn.input_y: y_batch, 156 | cnn.dropout_keep_prob: 1.0 157 | } 158 | step, summaries, loss, accuracy = sess.run( 159 | [global_step, dev_summary_op, cnn.loss, cnn.accuracy], 160 | feed_dict) 161 | time_str = datetime.datetime.now().isoformat() 162 | # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 163 | print('loss',loss) 164 | print('accuracy',accuracy) 165 | if writer: 166 | writer.add_summary(summaries, step) 167 | 168 | # Generate batches 169 | batches = data_helpers.batch_iter( 170 | list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) 171 | # Training loop. For each batch... 
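# Each `batch` yielded by batch_iter is a list of (x, y) pairs, so zip(*batch) below
# transposes it into one tuple of inputs and one tuple of labels, e.g.
#   zip(*[(x1, y1), (x2, y2)])  ->  (x1, x2), (y1, y2)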
172 | for batch in batches: 173 | x_batch, y_batch = zip(*batch) 174 | train_step(x_batch, y_batch) 175 | current_step = tf.train.global_step(sess, global_step) 176 | # if current_step % FLAGS.evaluate_every == 0: 177 | # print("\nEvaluation:") 178 | # dev_step(x_dev, y_dev, writer=dev_summary_writer) 179 | # print("..") 180 | if current_step % FLAGS.checkpoint_every == 0: 181 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 182 | print("Saved model checkpoint to {}\n".format(path)) 183 | 184 | def main(argv=None): 185 | x_train, y_train, vocab_processor, x_dev, y_dev = preprocess() 186 | train(x_train, y_train, vocab_processor, x_dev, y_dev) 187 | 188 | if __name__ == '__main__': 189 | tf.app.run() -------------------------------------------------------------------------------- /venv/pachong/iqiyi/fcxd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/11/30 下午1:43 4 | # @Author :hwwu 5 | # @File :fcxd.py 6 | 7 | import requests 8 | import re 9 | from bs4 import BeautifulSoup 10 | import os 11 | import shutil 12 | import json 13 | from urllib import parse 14 | 15 | # https://2wk.com/vip.php?url= 16 | # 这个网址能解析的视频都可以通过这个下载 17 | 18 | headers = { 19 | # 'Access-Control-Allow-Credentials': 'true', 20 | # 'Cache-Control': 'max-age=900', 21 | # 'Content-Encoding': 'gzip', 22 | # 'Content-Language': 'zh-CN', 23 | # 'Content-Type': 'text/html; charset=UTF-8', 24 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36', 25 | # 'Upgrade-Insecure-Requests': '1' 26 | } 27 | 28 | y_url2 = 'http://www.iqiyi.com/lib/m_209926914.html?src=search' 29 | y_target2 = requests.get(url=y_url2).text 30 | y_soup2 = BeautifulSoup(y_target2, 'html.parser') 31 | y_returnsoup2 = y_soup2.find_all('div', attrs={'class': 'site-piclist_pic'}) 32 | 33 | # 用正则表达式获取剧集链接 34 | y_result2 = re.findall('(?<=href=\").*?(?=\")', str(y_returnsoup2)) 35 | # 用正则表达式获取剧集名称 36 | title2 = re.findall('(?<=title=\").*?(?=\">)', str(y_returnsoup2)) 37 | j = len(title2) 38 | # 输出爬取结果 39 | for i in range(2, j - 2): 40 | str1 = '第' + str(i + 1) + '集' 41 | print(y_result2[i]) 42 | print(str1, title2[i]) 43 | xm_url = 'http://aikan-tv.com/?url=' + y_result2[i] 44 | req = requests.get(xm_url, headers=headers) 45 | 46 | soup1 = BeautifulSoup(req.text, 'html.parser') 47 | returnsoup1 = soup1.find_all('iframe') 48 | result1 = re.findall('(?<=src=\").*?(?=\")', str(returnsoup1)) 49 | 50 | req = requests.get(result1[0], headers=headers) 51 | req_json = re.findall('"api1.php", (.+),', req.text)[0] 52 | info = json.loads(req_json) 53 | 54 | data = {'time': info['time'], 'key': info['key'], 55 | 'url': info['url'], 'type': info['type'], 'referer': info['referer']} 56 | req = requests.post('https://yun.odflv.com/odflv2/api1.php', headers=headers, data=data) 57 | info = json.loads(req.text) 58 | url = info['url'] 59 | 60 | url1 = parse.unquote(url) 61 | 62 | req = requests.get(url1, headers=headers) 63 | result2 = re.findall('(?<=/).*?(?=.m3u8)', str(req.text)) 64 | 65 | req = requests.get('https://acfun.iqiyi-kuyun.com/' + result2[0] + '.m3u8', headers=headers) 66 | text = req.text 67 | tl = text.split('\n') 68 | new_index = [] 69 | for l in tl: 70 | if l.find('.ts') > 0: 71 | new_index.append(l) 72 | print(len(new_index), new_index) 73 | file_path = '/Users/liyangyang/Downloads/pachong/iqiyi/mid/fcxd/' + str1 74 | os.makedirs(file_path, exist_ok=True) 75 | 
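# The loop below fetches every HLS ".ts" segment listed in the m3u8 playlist into its own
# numbered file; the per-episode segments are then concatenated with a shell `cat` call into a
# single .ts (see the os.system call further down). A minimal sketch of the same idea that
# streams straight into one output file (hypothetical names `segment_urls` and `out_path`,
# not part of this script):
#
#   with open(out_path, 'wb') as out:
#       for seg_url in segment_urls:
#           out.write(requests.get(seg_url, headers=headers).content)
#
# Writing one file directly would avoid the temporary per-segment files and the `cat` step.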
for ii, ni in enumerate(new_index): 76 | url = 'https://acfun.iqiyi-kuyun.com' + ni 77 | r = requests.get(url, headers=headers) 78 | # print(url) 79 | content_length = int(r.headers['Content-Length']) 80 | path = file_path + '/' + str(ii) + '.ts' 81 | with open(path, 'ab') as file: 82 | file.write(r.content) 83 | file.flush() 84 | print(ni, 'receive data,file size : %d' % (content_length)) 85 | 86 | new_path = '/Users/liyangyang/Downloads/pachong/iqiyi/result/fcxd/' + str1 87 | os.makedirs(new_path, exist_ok=True) 88 | exec_str = "cat " + file_path + '/*.ts > ' + new_path + '/' + title2[i] + '.ts' 89 | print(exec_str) 90 | os.system(exec_str) 91 | shutil.rmtree(file_path) 92 | # sec_str = 'ffmpeg -y -i ' + new_path + '/new.ts -c:v libx264 -c:a copy -bsf:a aac_adtstoasc ' + new_path + '/new.mp4' 93 | # print(sec_str) 94 | # os.system(sec_str) 95 | -------------------------------------------------------------------------------- /venv/pachong/iqiyi/xiangmicc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/11/30 下午1:43 4 | # @Author :hwwu 5 | # @File :xiangmicc.py 6 | 7 | import requests 8 | import re 9 | from bs4 import BeautifulSoup 10 | import os 11 | import shutil 12 | headers = { 13 | 'Access-Control-Allow-Credentials': 'true', 14 | 'Cache-Control': 'max-age=900', 15 | 'Content-Encoding': 'gzip', 16 | 'Content-Language': 'zh-CN', 17 | 'Content-Type': 'text/html; charset=UTF-8', 18 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36', 19 | 'Upgrade-Insecure-Requests': '1' 20 | } 21 | 22 | y_url2 = 'http://www.iqiyi.com/a_19rrh9km3x.html#vfrm=2-4-0-1' 23 | y_target2 = requests.get(url=y_url2).text 24 | y_soup2 = BeautifulSoup(y_target2, 'html.parser') 25 | y_returnsoup2 = y_soup2.find_all('div', attrs={'class': 'site-piclist_pic'}) 26 | 27 | # use a regex to get the episode links 28 | y_result2 = re.findall('(?<=href=\").*?(?=\")', str(y_returnsoup2)) 29 | # use a regex to get the episode titles 30 | title2 = re.findall('(?<=title=\").*?(?=\">)', str(y_returnsoup2)) 31 | j = len(title2) 32 | # print the crawl results 33 | for i in range(7, j - 2): 34 | str1 = '第' + str(i) + '集' 35 | print(y_result2[i]) 36 | print(str1, title2[i]) 37 | xm_url = 'https://2wk.com/vip.php?url=' + y_result2[i] 38 | req = requests.get(xm_url, headers=headers) 39 | soup1 = BeautifulSoup(req.text, 'html.parser') 40 | returnsoup1 = soup1.find_all('iframe') 41 | result1 = re.findall('(?<=src=\").*?(?=\")', str(returnsoup1)) 42 | 43 | req = requests.get(result1[0], headers=headers) 44 | result2 = re.findall('(?<=src=\').*?(?=\')', str(req.text)) 45 | 46 | req = requests.get('https:' + result2[0], headers=headers) 47 | soup2 = BeautifulSoup(req.text, 'html.parser') 48 | returnsoup2 = soup2.find_all('title') 49 | title = re.findall('(?<=<title>).*?(?=</title>)', str(req.text)) 50 | body = soup2.find_all('body') 51 | m3u8Url = re.findall('(?<=m3u8Url = \").*?(?=\")', str(req.text)) 52 | 53 | www = re.findall('(?<=//).*?(?=/)', str(result2[0])) 54 | index = 'https://' + www[0] + m3u8Url[0] 55 | 56 | index = index.replace('index.m3u8', '1000kb/hls/index.m3u8') 57 | req = requests.get(index, headers=headers) 58 | text = req.text 59 | tl = text.split('\n') 60 | new_index = [] 61 | for l in tl: 62 | if l.find('.ts') > 0: 63 | new_index.append(l) 64 | 65 | file_path = '/Users/liyangyang/Downloads/pachong/iqiyi/mid/xmcc/' + str1 66 | os.makedirs(file_path, exist_ok=True) 67 | for ni in new_index: 68 | url = index.replace('index.m3u8', ni)
69 | r = requests.get(url, headers=headers) 70 | # print(url) 71 | # content_length = int(r.headers['Content-Length']) 72 | path = file_path + '/' + ni 73 | with open(path, 'ab') as file: 74 | file.write(r.content) 75 | file.flush() 76 | # print(ni, 'receive data,file size : %d total size:%d' % (os.path.getsize(path), content_length)) 77 | 78 | new_path = '/Users/liyangyang/Downloads/pachong/iqiyi/result/xmcc/' + str1 79 | os.makedirs(new_path, exist_ok=True) 80 | exec_str = "cat " + file_path + '/*.ts > ' + new_path + '/' + title2[i] + '.ts' 81 | print(exec_str) 82 | os.system(exec_str) 83 | shutil.rmtree(file_path)  # remove the temp segment dir; os.defpath is a string constant, not a function 84 | # sec_str = 'ffmpeg -y -i ' + new_path + '/new.ts -c:v libx264 -c:a copy -bsf:a aac_adtstoasc ' + new_path + '/new.mp4' 85 | # print(sec_str) 86 | # os.system(sec_str) 87 | -------------------------------------------------------------------------------- /venv/regress_baseline/cwd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/10/10 下午5:32 4 | # @Author :hwwu 5 | # @File :cwd.py 6 | import os 7 | 8 | print(os.getcwd()) --------------------------------------------------------------------------------