├── README.md └── venv ├── Kaggle-Ensemble-Guide ├── README.md ├── requirements.txt ├── samples │ ├── _w2_method2.csv │ ├── _w2_method3.csv │ ├── _w3_method1.csv │ ├── kaggle_avg.csv │ ├── kaggle_geomean.csv │ ├── kaggle_rankavg.csv │ ├── kaggle_vote.csv │ ├── kaggle_vote_weighted.csv │ ├── method1.csv │ ├── method2.csv │ └── method3.csv ├── src │ ├── blend_proba.py │ ├── correlations.py │ ├── kaggle_avg.py │ ├── kaggle_geomean.py │ ├── kaggle_rankavg.py │ └── kaggle_vote.py └── stacking │ └── ensemble_stacking.py ├── bdci ├── merge.py ├── read_data.py ├── snownlp1.py └── split_word.py ├── datafountain ├── guangfudianzhan │ ├── __pycache__ │ │ └── read_data.cpython-36.pyc │ ├── dnn_model.py │ ├── draw.py │ ├── find_base_feature.py │ ├── find_best_feature.py │ ├── model.py │ ├── read_data.py │ ├── rnn_model.py │ └── tensor_forest.py └── taocan │ ├── base.ipynb │ ├── baseline.py │ ├── ml_models.py │ ├── tensorflow_modle.py │ └── tf_model │ ├── checkpoint │ ├── stock.model.data-00000-of-00001 │ ├── stock.model.index │ ├── stock.model.max.data-00000-of-00001 │ ├── stock.model.max.index │ ├── stock.model.max.meta │ └── stock.model.meta ├── dc └── guangfu │ └── github │ ├── README.md │ └── baseline.py ├── deep_learning ├── embedding │ └── word2vec.py └── yucemoxing │ ├── PricePredictor.py │ └── chargeInfo.txt ├── dwb ├── baseline.py ├── fasttext │ ├── __pycache__ │ │ └── fasttext.cpython-36.pyc │ ├── fasttext.py │ └── p4_cnn_sentence_classification.py ├── github_model │ ├── a01_FastText │ │ ├── __pycache__ │ │ │ └── p5_fastTextB_model.cpython-36.pyc │ │ ├── p5_fastTextB_model.py │ │ ├── p5_fastTextB_predict.py │ │ ├── p5_fastTextB_predict_multilabel.py │ │ ├── p5_fastTextB_train.py │ │ ├── p6_fastTextB_model_multilabel.py │ │ └── p6_fastTextB_train_multilabel.py │ ├── a02_TextCNN │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── data_util.cpython-36.pyc │ │ │ └── p7_TextCNN_model.cpython-36.pyc │ │ ├── data_util.py │ │ ├── other_experiement │ │ │ ├── __init__.py │ │ │ ├── data_util_zhihu.py │ │ │ ├── p7_TextCNN_predict_ensemble.py │ │ │ ├── p7_TextCNN_predict_exp.py │ │ │ ├── p7_TextCNN_predict_exp512.py │ │ │ ├── p7_TextCNN_predict_exp512_0609.py │ │ │ ├── p7_TextCNN_predict_exp512_simple.py │ │ │ ├── p7_TextCNN_train_exp.py │ │ │ ├── p7_TextCNN_train_exp512.py │ │ │ ├── p7_TextCNN_train_exp_512_0609.py │ │ │ └── p8_TextCNN_predict_exp.py │ │ ├── p7_TextCNN_model.py │ │ ├── p7_TextCNN_model_multilayers.py │ │ ├── p7_TextCNN_predict.py │ │ └── p7_TextCNN_train.py │ └── a03_TextRNN │ │ ├── __pycache__ │ │ └── p8_TextRNN_model.cpython-36.pyc │ │ ├── p8_TextRNN_model.py │ │ ├── p8_TextRNN_model_multi_layers.py │ │ ├── p8_TextRNN_predict.py │ │ ├── p8_TextRNN_train.py │ │ └── result_rnn.csv ├── jieba1 │ ├── merge.py │ └── tjieba.py ├── merge.py ├── par.py └── testcnn │ ├── __pycache__ │ ├── data_helpers.cpython-36.pyc │ └── text_cnn.cpython-36.pyc │ ├── data_helpers.py │ ├── eval.py │ ├── text_cnn.py │ └── train.py ├── pachong └── iqiyi │ ├── fcxd.py │ └── xiangmicc.py └── regress_baseline ├── cwd.py └── regress_baseline.py /README.md: -------------------------------------------------------------------------------- 1 | # deep_learning 2 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/README.md: -------------------------------------------------------------------------------- 1 | Kaggle-Ensemble-Guide 2 | ===================== 3 | 4 | A combination of Model Ensembling methods that is extremely useful for increasing accuracy of Kaggle's 
submission. 5 | For more information: http://mlwave.com/kaggle-ensembling-guide/ 6 | 7 | ## Installation: 8 | 9 | $ pip install -r requirements.txt 10 | 11 | ## Example: 12 | 13 | $ python ./src/correlations.py ./samples/method1.csv ./samples/method2.csv 14 | Finding correlation between: ./samples/method1.csv and ./samples/method2.csv 15 | Column to be measured: Label 16 | Pearson's correlation score: 0.67898 17 | Kendall's correlation score: 0.66667 18 | Spearman's correlation score: 0.71053 19 | 20 | $ python ./src/kaggle_vote.py "./samples/method*.csv" "./samples/kaggle_vote.csv" 21 | parsing: ./samples/method1.csv 22 | parsing: ./samples/method2.csv 23 | parsing: ./samples/method3.csv 24 | wrote to ./samples/kaggle_vote.csv 25 | 26 | 27 | $ python ./src/kaggle_vote.py "./samples/_*.csv" "./samples/kaggle_vote_weighted.csv" "weighted" 28 | parsing: ./samples/_w3_method1.csv 29 | Using weight: 3 30 | parsing: ./samples/_w2_method2.csv 31 | Using weight: 2 32 | parsing: ./samples/_w2_method3.csv 33 | Using weight: 2 34 | wrote to ./samples/kaggle_vote_weighted.csv 35 | 36 | $ python ./src/kaggle_rankavg.py "./samples/method*.csv" "./samples/kaggle_rankavg.csv" 37 | parsing: ./samples/method1.csv 38 | parsing: ./samples/method2.csv 39 | parsing: ./samples/method3.csv 40 | wrote to ./samples/kaggle_rankavg.csv 41 | 42 | $ python ./src/kaggle_avg.py "./samples/method*.csv" "./samples/kaggle_avg.csv" 43 | parsing: ./samples/method1.csv 44 | parsing: ./samples/method2.csv 45 | parsing: ./samples/method3.csv 46 | wrote to ./samples/kaggle_avg.csv 47 | 48 | $ python ./src/kaggle_geomean.py "./samples/method*.csv" "./samples/kaggle_geomean.csv" 49 | parsing: ./samples/method1.csv 50 | parsing: ./samples/method2.csv 51 | parsing: ./samples/method3.csv 52 | wrote to ./samples/kaggle_geomean.csv 53 | 54 | ## Result: 55 | 56 | ==> ./samples/method1.csv <== 57 | ImageId,Label 58 | 1,1 59 | 2,0 60 | 3,9 61 | 4,9 62 | 5,3 63 | 64 | ==> ./samples/method2.csv <== 65 | ImageId,Label 66 | 1,2 67 | 2,0 68 | 3,6 69 | 4,2 70 | 5,3 71 | 72 | ==> ./samples/method3.csv <== 73 | ImageId,Label 74 | 1,2 75 | 2,0 76 | 3,9 77 | 4,2 78 | 5,3 79 | 80 | ==> ./samples/kaggle_avg.csv <== 81 | ImageId,Label 82 | 1,1.666667 83 | 2,0.000000 84 | 3,8.000000 85 | 4,4.333333 86 | 5,3.000000 87 | 88 | ==> ./samples/kaggle_rankavg.csv <== 89 | ImageId,Label 90 | 1,0.25 91 | 2,0.0 92 | 3,1.0 93 | 4,0.5 94 | 5,0.75 95 | 96 | ==> ./samples/kaggle_vote.csv <== 97 | ImageId,Label 98 | 1,2 99 | 2,0 100 | 3,9 101 | 4,2 102 | 5,3 103 | 104 | ==> ./samples/kaggle_geomean.csv <== 105 | ImageId,Label 106 | 1,1.587401 107 | 2,0.000000 108 | 3,7.862224 109 | 4,3.301927 110 | 5,3.000000 111 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | sklearn 3 | numpy 4 | scipy -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/_w2_method2.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,3 3 | 4,2 4 | 3,6 5 | 5,3 6 | 2,0 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/_w2_method3.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,2 3 | 3,9 4 | 2,0 5 | 5,3 6 | 4,2 7 | 
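The weighted vote shown in the README above follows a simple filename convention: a sample file named `_w3_method1.csv` casts three votes per ImageId, the `_w2_*` files cast two, and the most common label wins. As a rough pandas sketch of what `src/kaggle_vote.py` computes for these sample files (an illustration only, not part of the repository):

    import re
    from collections import Counter, defaultdict
    from glob import glob
    import pandas as pd

    votes = defaultdict(Counter)
    for path in glob("./samples/_w*_method*.csv"):
        match = re.search(r"_[wW](\d+)_", path)       # weight encoded in the filename
        weight = int(match.group(1)) if match else 1
        for image_id, label in pd.read_csv(path).itertuples(index=False):
            votes[image_id][label] += weight          # each file casts `weight` votes
    ensemble = {k: counts.most_common(1)[0][0] for k, counts in sorted(votes.items())}
    print(ensemble)  # {1: 1, 2: 0, 3: 5, 4: 2, 5: 3}, matching samples/kaggle_vote_weighted.csv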
-------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/_w3_method1.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 5,3 3 | 2,0 4 | 3,5 5 | 4,9 6 | 1,1 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/kaggle_avg.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,1.666667 3 | 2,0.000000 4 | 3,8.000000 5 | 4,4.333333 6 | 5,3.000000 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/kaggle_geomean.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,1.587401 3 | 2,0.000000 4 | 3,7.862224 5 | 4,3.301927 6 | 5,3.000000 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/kaggle_rankavg.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,0.25 3 | 2,0.0 4 | 3,1.0 5 | 4,0.5 6 | 5,0.75 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/kaggle_vote.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,2 3 | 2,0 4 | 3,9 5 | 4,2 6 | 5,3 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/kaggle_vote_weighted.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,1 3 | 2,0 4 | 3,5 5 | 4,2 6 | 5,3 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/method1.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 5,3 3 | 2,0 4 | 3,9 5 | 4,9 6 | 1,1 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/method2.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,2 3 | 4,2 4 | 3,6 5 | 5,3 6 | 2,0 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/samples/method3.csv: -------------------------------------------------------------------------------- 1 | ImageId,Label 2 | 1,2 3 | 3,9 4 | 2,0 5 | 5,3 6 | 4,2 7 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/blend_proba.py: -------------------------------------------------------------------------------- 1 | from sklearn import cross_validation 2 | from sklearn.metrics import log_loss, accuracy_score 3 | import numpy as np 4 | import pandas as pd 5 | import random 6 | import md5 7 | import json 8 | 9 | def blend_proba(clf, X_train, y, X_test, nfolds=5, save_preds="", 10 | save_test_only="", seed=300373, save_params="", 11 | clf_name="XX", generalizers_params=[], minimal_loss=0, 12 | return_score=False, minimizer="log_loss"): 13 | print("\nBlending with classifier:\n\t{}".format(clf)) 14 | folds = list(cross_validation.StratifiedKFold(y, nfolds,shuffle=True,random_state=seed)) 15 | print(X_train.shape) 16 | dataset_blend_train = np.zeros((X_train.shape[0],np.unique(y).shape[0])) 17 | 18 | #iterate through train set and train - predict folds 19 | loss = 0 20 
| for i, (train_index, test_index) in enumerate( folds ): 21 | print("Train Fold {}/{}".format(i+1,nfolds)) 22 | fold_X_train = X_train[train_index] 23 | fold_y_train = y[train_index] 24 | fold_X_test = X_train[test_index] 25 | fold_y_test = y[test_index] 26 | clf.fit(fold_X_train, fold_y_train) 27 | 28 | fold_preds = clf.predict_proba(fold_X_test) 29 | print("Logistic loss: {}".format(log_loss(fold_y_test,fold_preds))) 30 | dataset_blend_train[test_index] = fold_preds 31 | if minimizer == "log_loss": 32 | loss += log_loss(fold_y_test,fold_preds) 33 | if minimizer == "accuracy": 34 | fold_preds_a = np.argmax(fold_preds, axis=1) 35 | loss += accuracy_score(fold_y_test,fold_preds_a) 36 | #fold_preds = clf.predict(fold_X_test) 37 | 38 | #loss += accuracy_score(fold_y_test,fold_preds) 39 | 40 | if minimal_loss > 0 and loss > minimal_loss and i == 0: 41 | return False, False 42 | fold_preds = np.argmax(fold_preds, axis=1) 43 | print("Accuracy: {}".format(accuracy_score(fold_y_test,fold_preds))) 44 | avg_loss = loss / float(i+1) 45 | print("\nAverage:\t{}\n".format(avg_loss)) 46 | #predict test set (better to take average on all folds, but this is quicker) 47 | print("Test Fold 1/1") 48 | clf.fit(X_train, y) 49 | dataset_blend_test = clf.predict_proba(X_test) 50 | 51 | if clf_name == "XX": 52 | clf_name = str(clf)[1:3] 53 | 54 | if len(save_preds)>0: 55 | id = md5.new("{}".format(clf.get_params())).hexdigest() 56 | print("storing meta predictions at: {}".format(save_preds)) 57 | np.save("{}_{}_{}_{}_train.npy".format(save_preds,clf_name,avg_loss,id),dataset_blend_train) 58 | np.save("{}_{}_{}_{}_test.npy".format(save_preds,clf_name,avg_loss,id),dataset_blend_test) 59 | 60 | if len(save_test_only)>0: 61 | id = md5.new("{}".format(clf.get_params())).hexdigest() 62 | print("storing meta predictions at: {}".format(save_test_only)) 63 | 64 | dataset_blend_test = clf.predict(X_test) 65 | np.savetxt("{}_{}_{}_{}_test.txt".format(save_test_only,clf_name,avg_loss,id),dataset_blend_test) 66 | d = {} 67 | d["stacker"] = clf.get_params() 68 | d["generalizers"] = generalizers_params 69 | with open("{}_{}_{}_{}_params.json".format(save_test_only,clf_name,avg_loss,id), 'wb') as f: 70 | json.dump(d, f) 71 | 72 | if len(save_params)>0: 73 | id = md5.new("{}".format(clf.get_params())).hexdigest() 74 | d = {} 75 | d["name"] = clf_name 76 | d["params"] = { k:(v.get_params() if "\n" in str(v) or "<" in str(v) else v) for k,v in clf.get_params().items()} 77 | d["generalizers"] = generalizers_params 78 | with open("{}_{}_{}_{}_params.json".format(save_params,clf_name,avg_loss,id), 'wb') as f: 79 | json.dump(d, f) 80 | 81 | if np.unique(y).shape[0] == 2: # when binary classification only return positive class proba 82 | if return_score: 83 | return dataset_blend_train[:,1], dataset_blend_test[:,1], avg_loss 84 | else: 85 | return dataset_blend_train[:,1], dataset_blend_test[:,1] 86 | else: 87 | if return_score: 88 | return dataset_blend_train, dataset_blend_test, avg_loss 89 | else: 90 | return dataset_blend_train, dataset_blend_test -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/correlations.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | 4 | first_file = sys.argv[1] 5 | second_file = sys.argv[2] 6 | 7 | def corr(first_file, second_file): 8 | first_df = pd.read_csv(first_file,index_col=0) 9 | second_df = pd.read_csv(second_file,index_col=0) 10 | # assuming first column is `prediction_id` and second
column is `prediction` 11 | prediction = first_df.columns[0] 12 | # correlation 13 | print("Finding correlation between: {} and {}".format(first_file,second_file)) 14 | print("Column to be measured: {}".format(prediction)) 15 | print("Pearson's correlation score: {}".format(first_df[prediction].corr(second_df[prediction],method='pearson'))) 16 | print("Kendall's correlation score: {}".format(first_df[prediction].corr(second_df[prediction],method='kendall'))) 17 | print("Spearman's correlation score: {}".format(first_df[prediction].corr(second_df[prediction],method='spearman'))) 18 | 19 | corr(first_file, second_file) 20 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/kaggle_avg.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from glob import glob 3 | import sys 4 | 5 | glob_files = sys.argv[1] 6 | loc_outfile = sys.argv[2] 7 | 8 | def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"): 9 | if method == "average": 10 | scores = defaultdict(float) 11 | with open(loc_outfile,"w") as outfile: 12 | for i, glob_file in enumerate( glob(glob_files) ): 13 | print("parsing: {}".format(glob_file)) 14 | # sort glob_file by first column, ignoring the first line 15 | lines = open(glob_file).readlines() 16 | lines = [lines[0]] + sorted(lines[1:]) 17 | for e, line in enumerate( lines ): 18 | if i == 0 and e == 0: 19 | outfile.write(line) 20 | if e > 0: 21 | row = line.strip().split(",") 22 | scores[(e,row[0])] += float(row[1]) 23 | for j,k in sorted(scores): 24 | outfile.write("%s,%f\n"%(k,scores[(j,k)]/(i+1))) 25 | print("wrote to {}".format(loc_outfile)) 26 | 27 | kaggle_bag(glob_files, loc_outfile) -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/kaggle_geomean.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import defaultdict 3 | from glob import glob 4 | import sys 5 | import math 6 | 7 | glob_files = sys.argv[1] 8 | loc_outfile = sys.argv[2] 9 | 10 | def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"): 11 | if method == "average": 12 | scores = defaultdict(float) 13 | with open(loc_outfile,"w") as outfile: 14 | for i, glob_file in enumerate( glob(glob_files) ): 15 | print("parsing: {}".format(glob_file)) 16 | # sort glob_file by first column, ignoring the first line 17 | lines = open(glob_file).readlines() 18 | lines = [lines[0]] + sorted(lines[1:]) 19 | for e, line in enumerate( lines ): 20 | if i == 0 and e == 0: 21 | outfile.write(line) 22 | if e > 0: 23 | row = line.strip().split(",") 24 | if scores[(e,row[0])] == 0: 25 | scores[(e,row[0])] = 1 26 | scores[(e,row[0])] *= float(row[1]) 27 | for j,k in sorted(scores): 28 | outfile.write("%s,%f\n"%(k,math.pow(scores[(j,k)],1/(i+1)))) 29 | print("wrote to {}".format(loc_outfile)) 30 | 31 | kaggle_bag(glob_files, loc_outfile) 32 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/kaggle_rankavg.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from collections import defaultdict 3 | from glob import glob 4 | import sys 5 | 6 | glob_files = sys.argv[1] 7 | loc_outfile = sys.argv[2] 8 | 9 | def kaggle_bag(glob_files, loc_outfile): 10 | with open(loc_outfile,"w") as 
outfile: 11 | all_ranks = defaultdict(list) 12 | for i, glob_file in enumerate( glob(glob_files) ): 13 | file_ranks = [] 14 | print("parsing: {}".format(glob_file)) 15 | # sort glob_file by first column, ignoring the first line 16 | lines = open(glob_file).readlines() 17 | lines = [lines[0]] + sorted(lines[1:]) 18 | for e, line in enumerate( lines ): 19 | if e == 0 and i == 0: 20 | outfile.write( line ) 21 | elif e > 0: 22 | r = line.strip().split(",") 23 | file_ranks.append( (float(r[1]), e, r[0]) ) 24 | for rank, item in enumerate( sorted(file_ranks) ): 25 | all_ranks[(item[1],item[2])].append(rank) 26 | average_ranks = [] 27 | for k in sorted(all_ranks): 28 | average_ranks.append((sum(all_ranks[k])/len(all_ranks[k]),k)) 29 | ranked_ranks = [] 30 | for rank, k in enumerate(sorted(average_ranks)): 31 | ranked_ranks.append((k[1][0],k[1][1],rank/(len(average_ranks)-1))) 32 | for k in sorted(ranked_ranks): 33 | outfile.write("%s,%s\n"%(k[1],k[2])) 34 | print("wrote to {}".format(loc_outfile)) 35 | 36 | kaggle_bag(glob_files, loc_outfile) -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/src/kaggle_vote.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from glob import glob 3 | import sys 4 | import re 5 | 6 | glob_files = sys.argv[1] 7 | loc_outfile = sys.argv[2] 8 | weights_strategy = "uniform" 9 | if len(sys.argv) == 4: 10 | weights_strategy = sys.argv[3] 11 | 12 | def kaggle_bag(glob_files, loc_outfile, method="average", weights="uniform"): 13 | pattern = re.compile(r"(.)*_[w|W](\d*)_[.]*") 14 | if method == "average": 15 | scores = defaultdict(list) 16 | with open(loc_outfile,"w") as outfile: 17 | #weight_list may be usefull using a different method 18 | weight_list = [1]*len(glob(glob_files)) 19 | for i, glob_file in enumerate( glob(glob_files) ): 20 | print("parsing: {}".format(glob_file)) 21 | if weights == "weighted": 22 | weight = pattern.match(glob_file) 23 | if weight and weight.group(2): 24 | print("Using weight: {}".format(weight.group(2))) 25 | weight_list[i] = weight_list[i]*int(weight.group(2)) 26 | else: 27 | print("Using weight: 1") 28 | # sort glob_file by first column, ignoring the first line 29 | lines = open(glob_file).readlines() 30 | lines = [lines[0]] + sorted(lines[1:]) 31 | for e, line in enumerate( lines ): 32 | if i == 0 and e == 0: 33 | outfile.write(line) 34 | if e > 0: 35 | row = line.strip().split(",") 36 | for l in range(1,weight_list[i]+1): 37 | scores[(e,row[0])].append(row[1]) 38 | for j,k in sorted(scores): 39 | outfile.write("%s,%s\n"%(k,Counter(scores[(j,k)]).most_common(1)[0][0])) 40 | print("wrote to {}".format(loc_outfile)) 41 | 42 | kaggle_bag(glob_files, loc_outfile, weights=weights_strategy) 43 | -------------------------------------------------------------------------------- /venv/Kaggle-Ensemble-Guide/stacking/ensemble_stacking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/19 下午4:51 4 | # @Author :hwwu 5 | # @File :ensemble_stacking.py 6 | 7 | from sklearn import datasets 8 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier 9 | from sklearn.cross_validation import train_test_split 10 | from sklearn.cross_validation import StratifiedKFold 11 | import numpy as np 12 | from sklearn.metrics import roc_auc_score 13 | from 
sklearn.datasets.samples_generator import make_blobs 14 | 15 | '''创建训练的数据集''' 16 | data, target = make_blobs(n_samples=50000, centers=2, random_state=0, cluster_std=0.60) 17 | 18 | '''模型融合中使用到的各个单模型''' 19 | clfs = [RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'), 20 | RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'), 21 | ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'), 22 | ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'), 23 | GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)] 24 | 25 | '''切分一部分数据作为测试集''' 26 | X, X_predict, y, y_predict = train_test_split(data, target, test_size=0.33, random_state=2017) 27 | 28 | 29 | dataset_blend_train = np.zeros((X.shape[0], len(clfs))) 30 | dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs))) 31 | 32 | '''5折stacking''' 33 | n_folds = 5 34 | skf = list(StratifiedKFold(y, n_folds)) 35 | for j, clf in enumerate(clfs): 36 | '''依次训练各个单模型''' 37 | # print(j, clf) 38 | dataset_blend_test_j = np.zeros((X_predict.shape[0], len(skf))) 39 | for i, (train, test) in enumerate(skf): 40 | '''使用第i个部分作为预测,剩余的部分来训练模型,获得其预测的输出作为第i部分的新特征。''' 41 | # print("Fold", i) 42 | X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test] 43 | clf.fit(X_train, y_train) 44 | y_submission = clf.predict_proba(X_test)[:, 1] 45 | dataset_blend_train[test, j] = y_submission 46 | dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1] 47 | '''对于测试集,直接用这k个模型的预测值均值作为新的特征。''' 48 | dataset_blend_test[:, j] = dataset_blend_test_j.mean(1) 49 | print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j])) 50 | # clf = LogisticRegression() 51 | clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30) 52 | clf.fit(dataset_blend_train, y) 53 | y_submission = clf.predict_proba(dataset_blend_test)[:, 1] 54 | 55 | print("Linear stretch of predictions to [0,1]") 56 | y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min()) 57 | print("blend result") 58 | print("val auc Score: %f" % (roc_auc_score(y_predict, y_submission))) -------------------------------------------------------------------------------- /venv/bdci/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/12 下午3:48 4 | # @Author :hwwu 5 | # @File :merge.py 6 | 7 | import pandas as pd, numpy as np 8 | 9 | path = '/Users/liyangyang/Downloads/bdci/' 10 | 11 | def write_result(id, predictions): 12 | r_id = [] 13 | r_predictions = [] 14 | for i in range(len(id)): 15 | r_id.append(str(id[i])) 16 | r_predictions.append(int(predictions[i])) 17 | 18 | english_column = pd.Series(r_id, name='content_id') 19 | number_column = pd.Series(r_predictions, name='sentiment_value') 20 | predictions = pd.concat([english_column, number_column], axis=1) 21 | predictions.to_csv(path + 'merge_result_data_sentiment_value.csv', index=0, sep=',', columns=['content_id', 'sentiment_value']) 22 | 23 | 24 | # r75 = pd.read_csv(path+'MultinomialNB.csv')['sentiment_value'] 25 | # rcnn = pd.read_csv(path+'LinearSVC.csv')['sentiment_value'] 26 | # rrnn = pd.read_csv(path+'RandomForestClassifier.csv')['sentiment_value'] 27 | # print('r75.shape',r75.shape) 28 | # print('rcnn.shape',rcnn.shape) 29 | # print('rrnn.shape',rrnn.shape) 30 | # 31 | # id = pd.read_csv(path+'MultinomialNB.csv')['content_id'] 32 | # predictions =[] 33 | 
# for i in range(len(r75)): 34 | # # id.append(r75['content_id'][i]) 35 | # if (rcnn[i]==rrnn[i]): 36 | # predictions.append(rcnn[i]) 37 | # else: 38 | # predictions.append(r75[i]) 39 | # 40 | # write_result(id,predictions) 41 | 42 | sentiment_value = pd.read_csv(path+'merge_result_data_sentiment_value.csv') 43 | subject = pd.read_csv(path+'merge_result_data_subject.csv') 44 | content = pd.read_csv(path+'train.csv') 45 | 46 | subject.loc[subject['subject'] == 0, 'subject'] = '动力' 47 | subject.loc[subject['subject'] == 1, 'subject'] = '价格' 48 | subject.loc[subject['subject'] == 2, 'subject'] = '内饰' 49 | subject.loc[subject['subject'] == 3, 'subject'] = '配置' 50 | subject.loc[subject['subject'] == 4, 'subject'] = '安全性' 51 | subject.loc[subject['subject'] == 5, 'subject'] = '外观' 52 | subject.loc[subject['subject'] == 6, 'subject'] = '操控' 53 | subject.loc[subject['subject'] == 7, 'subject'] = '油耗' 54 | subject.loc[subject['subject'] == 8, 'subject'] = '空间' 55 | subject.loc[subject['subject'] == 9, 'subject'] = '舒适性' 56 | 57 | # df = pd.DataFrame({"content_id": sentiment_value['content_id'], "subject": subject['subject'],'sentiment_value':sentiment_value['sentiment_value'].astype(int),'sentiment_word':''}) 58 | # df.to_csv(path+'result.csv', index = False, header=True,encoding='UTF-8') 59 | 60 | content_id = sentiment_value['content_id'] 61 | subject = subject['subject'] 62 | sentiment_value = sentiment_value['sentiment_value'].astype(int) 63 | sentiment_word = content['sentiment_word'][:len(content_id)] 64 | print('sentiment_value',sentiment_value.shape) 65 | predictions = pd.concat([content_id, subject,sentiment_value,sentiment_word], axis=1) 66 | predictions.to_csv(path + 'result.csv', index=0, sep=',', columns=['content_id', 'subject','sentiment_value','sentiment_word'],encoding='UTF-8') -------------------------------------------------------------------------------- /venv/bdci/read_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/12 下午1:48 4 | # @Author :hwwu 5 | # @File :read_data.py 6 | 7 | path = '/Users/liyangyang/Downloads/bdci/' 8 | 9 | import pandas as pd, numpy as np 10 | 11 | train = pd.read_csv(path + 'train.csv')[:2000] 12 | test = pd.read_csv(path + 'test_public.csv')[:1000] 13 | 14 | # y_train = train['sentiment_value'].astype(int) 15 | train.loc[train['subject'] == '动力', 'subject'] = 0 16 | train.loc[train['subject'] == '价格', 'subject'] = 1 17 | train.loc[train['subject'] == '内饰', 'subject'] = 2 18 | train.loc[train['subject'] == '配置', 'subject'] = 3 19 | train.loc[train['subject'] == '安全性', 'subject'] = 4 20 | train.loc[train['subject'] == '外观', 'subject'] = 5 21 | train.loc[train['subject'] == '操控', 'subject'] = 6 22 | train.loc[train['subject'] == '油耗', 'subject'] = 7 23 | train.loc[train['subject'] == '空间', 'subject'] = 8 24 | train.loc[train['subject'] == '舒适性', 'subject'] = 9 25 | y_train = train['subject'] 26 | 27 | 28 | print(train.shape) 29 | print(test.shape) 30 | 31 | stopword_path = '/Users/liyangyang/Downloads/stopwords/stopwords1893.txt' 32 | import jieba 33 | 34 | 35 | def stopwordslist(): 36 | stopwords = [line.strip() for line in open(stopword_path, 'r', encoding='utf-8').readlines()] 37 | # stopwords = [',', '。', '、', '...', '“', '”', '《', '》', ':', ';'] 38 | return stopwords 39 | 40 | 41 | def split_word(line): 42 | result = [] 43 | for i in range(len(line)): 44 | result.append(line[i:i + 1]) 45 | return result 46 | 47 | import codecs 48 | f = 
codecs.open(path+'train_no_lable.txt', 'a', 'utf8') 49 | train_doc_list = [] 50 | for i in range(len(train)): 51 | sentence_seged = jieba.cut(train['content'][i].strip()) 52 | # sentence_seged = split_word(train['content'][i].strip()) 53 | stopwords = stopwordslist() 54 | outstr = '' 55 | for word in sentence_seged: 56 | if word not in stopwords: 57 | if (word != '\t') & (word.strip() != ''): 58 | outstr += word 59 | # outstr += '\t' 60 | outstr += ' ' 61 | # if (outstr == ''): 62 | # outstr = 'NaN' 63 | # outstr +='__myprefix__' 64 | # outstr +=str(y_train[i]) 65 | f.write(outstr+'\n') 66 | train_doc_list.append(outstr) 67 | 68 | train_doc_list = np.array(train_doc_list) 69 | print(train_doc_list.shape) 70 | 71 | test_doc_list = [] 72 | for i in range(len(test)): 73 | sentence_seged = jieba.cut(test['content'][i].strip()) 74 | # sentence_seged = split_word(test['content'][i].strip()) 75 | stopwords = stopwordslist() 76 | outstr = '' 77 | for word in sentence_seged: 78 | if word not in stopwords: 79 | if word != '\t': 80 | outstr += word 81 | outstr += '\t' 82 | if (outstr == ''): 83 | outstr = 'NaN' 84 | test_doc_list.append(outstr) 85 | test_doc_list = np.array(test_doc_list) 86 | print(test_doc_list.shape) 87 | # 88 | from sklearn.feature_extraction.text import CountVectorizer 89 | from sklearn.feature_extraction.text import TfidfVectorizer 90 | # 91 | # count_vec = CountVectorizer(analyzer='word') 92 | # data_train_count = count_vec.fit_transform(train_doc_list) 93 | # data_test_count = count_vec.transform(test_doc_list).toarray() 94 | # #词汇表 95 | # print('\nvocabulary list:\n\n',count_vec.get_feature_names()) 96 | # print( '\nvocabulary dic :\n\n',count_vec.vocabulary_) 97 | # print ('vocabulary:\n\n') 98 | # for key,value in count_vec.vocabulary_.items(): 99 | # print(key,value) 100 | # print('.............') 101 | # print(data_train_count) 102 | 103 | tfidf = TfidfVectorizer( 104 | ngram_range=(1, 1), # 二元文法模型 105 | use_idf=1, 106 | # analyzer='char', 107 | smooth_idf=1) 108 | 109 | data_train_count_tf = tfidf.fit_transform(train_doc_list) 110 | data_test_count_tf = tfidf.transform(test_doc_list) 111 | 112 | print('\nvocabulary list:\n\n',tfidf.get_feature_names()) 113 | print( '\nvocabulary dic :\n\n',tfidf.vocabulary_) 114 | print ('vocabulary:\n\n') 115 | for key,value in tfidf.vocabulary_.items(): 116 | print(key,value) 117 | print('.............') 118 | print(type(data_train_count_tf)) 119 | # 120 | # from sklearn.naive_bayes import MultinomialNB 121 | # from sklearn.model_selection import cross_val_score 122 | # 123 | # clf = MultinomialNB() 124 | # clf.fit(data_train_count, y_train) 125 | # print("多项式贝叶斯分类器20折交叉验证得分: ", np.mean(cross_val_score(clf, data_train_count, y_train, cv=10, scoring='accuracy'))) 126 | # clf.fit(data_train_count_tf, y_train) 127 | # print("多项式贝叶斯分类器TFIDF,20折交叉验证得分: ", 128 | # np.mean(cross_val_score(clf, data_train_count_tf, y_train, cv=10, scoring='accuracy'))) 129 | # # clf_pred = clf.predict(data_test_count_tf) 130 | # # df = pd.DataFrame({"content_id": test['content_id'], "sentiment_value": clf_pred}) 131 | # # df.to_csv(path+'MultinomialNB.csv', index = False, header=True) 132 | # # 133 | # from sklearn import svm 134 | # 135 | # lin_clf = svm.LinearSVC(class_weight='balanced') 136 | # lin_clf.fit(data_train_count, y_train) 137 | # print("svm分类器20折交叉验证得分: ", np.mean(cross_val_score(lin_clf, data_train_count, y_train, cv=10, scoring='accuracy'))) 138 | # lin_clf.fit(data_train_count_tf, y_train) 139 | # print("svm分类器TFIDF,20折交叉验证得分: ", 140 | # 
np.mean(cross_val_score(lin_clf, data_train_count_tf, y_train, cv=10, scoring='accuracy'))) 141 | # # lin_clf_pred = lin_clf.predict(data_test_count_tf) 142 | # # df = pd.DataFrame({"content_id": test['content_id'], "sentiment_value": lin_clf_pred}) 143 | # # df.to_csv(path+'LinearSVC.csv', index = False, header=True) 144 | # 145 | # from sklearn.ensemble import RandomForestClassifier 146 | # 147 | # lin_forest = RandomForestClassifier(n_estimators=10, random_state=1, class_weight='balanced') 148 | # lin_forest.fit(data_train_count, y_train) 149 | # print("RandomForestClassifier分类器20折交叉验证得分: ", 150 | # np.mean(cross_val_score(lin_forest, data_train_count, y_train, cv=10, scoring='accuracy'))) 151 | # lin_forest.fit(data_train_count_tf, y_train) 152 | # print("RandomForestClassifier分类器TFIDF,20折交叉验证得分: ", 153 | # np.mean(cross_val_score(lin_forest, data_train_count_tf, y_train, cv=10, scoring='accuracy'))) 154 | # # lin_forest_pred = lin_forest.predict(data_test_count_tf) 155 | # # df = pd.DataFrame({"content_id": test['content_id'], "sentiment_value": lin_forest_pred}) 156 | # # df.to_csv(path+'RandomForestClassifier.csv', index = False, header=True) 157 | # 158 | # 159 | # import xgboost as xgb 160 | # 161 | # model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=0.0468) 162 | # model_xgb.fit(data_train_count, y_train) 163 | # print("model_xgb分类器20折交叉验证得分: ", 164 | # np.mean(cross_val_score(model_xgb, data_train_count, y_train, cv=10, scoring='accuracy'))) 165 | # model_xgb.fit(data_train_count_tf, y_train) 166 | # print("model_xgb分类器TFIDF,20折交叉验证得分: ", 167 | # np.mean(cross_val_score(model_xgb, data_train_count_tf, y_train, cv=10, scoring='accuracy'))) 168 | # # model_xgb_pred = model_xgb.predict(data_test_count_tf) 169 | # # df = pd.DataFrame({"content_id": test['content_id'], "sentiment_value": model_xgb_pred}) 170 | # # df.to_csv(path+'XGBClassifier.csv', index = False, header=True) 171 | -------------------------------------------------------------------------------- /venv/bdci/snownlp1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/12 下午6:39 4 | # @Author :hwwu 5 | # @File :snownlp1.py 6 | import pickle 7 | import numpy as np 8 | 9 | 10 | def readdumpobj(path): 11 | file = open(path, "rb") 12 | bunch = pickle.load(file) 13 | file.close() 14 | return bunch 15 | 16 | 17 | def outemotionword(path): 18 | emotionset = [] 19 | with open(path, "rb") as fp: 20 | for word in fp: 21 | if not word.isspace(): 22 | word = word.decode("utf-8") 23 | emotionset.append(word.strip()) 24 | return emotionset 25 | 26 | 27 | def loadDataSet(path): # path是为了读入将情感词典 28 | postingList = readdumpobj("D:\linguistic-corpus\postingList\postingList.dat") 29 | classVec = readdumpobj("D:\linguistic-corpus\postingList\classVec.dat") 30 | emotionset = outemotionword(path) 31 | return postingList, classVec, emotionset 32 | 33 | 34 | class NBayes(object): 35 | def __init__(self): 36 | self.vocabulary = [] # 词典,文本set表 37 | self.idf = 0 # 词典的idf权值向量 38 | self.tf = 0 # 训练集的权值矩阵 39 | self.tdm = 0 # P(x|yi) 40 | self.Pcates = {} # P(yi)--是个类别字典 41 | self.labels = [] # 对应每个文本的分类,是个外部导入的列表[0,1,0,1,0,1] 42 | self.doclength = 0 # 训练集文本数,训练文本长度 43 | self.vocablen = 0 # 词典词长,self.vocabulary长度 44 | self.testset = 0 # 测试集 45 | 46 | # 加载训练集并生成词典,以及tf, idf值 47 | def train_set(self, trainset, classVec, emotionset): 48 | self.cate_prob(classVec) # 计算每个分类在数据集中的概率:P(yi) 49 | self.doclength = len(trainset) 50 | tempset = 
set() 51 | [tempset.add(word) for word in emotionset] # 生成词典 52 | self.vocabulary = list(tempset) 53 | self.vocablen = len(self.vocabulary) 54 | # self.calc_wordfreq(trainset) 55 | self.calc_tfidf(trainset) # 生成tf-idf权值 56 | self.build_tdm() # 按分类累计向量空间的每维值:P(x|yi) 57 | 58 | # 生成 tf-idf 59 | def calc_tfidf(self, trainset): 60 | self.idf = np.zeros([1, self.vocablen]) 61 | self.tf = np.zeros([self.doclength, self.vocablen]) 62 | for indx in range(self.doclength): 63 | for word in trainset[indx]: 64 | if word in self.vocabulary: 65 | self.tf[indx, self.vocabulary.index(word)] += 1 66 | # 消除不同句长导致的偏差 67 | self.tf[indx] = self.tf[indx] / float(len(trainset[indx])) 68 | for signleword in set(trainset[indx]): 69 | if signleword in self.vocabulary: 70 | self.idf[0, self.vocabulary.index(signleword)] += 1 71 | self.idf = np.log(float(self.doclength) / (self.idf + 1)) # 防止该词语不在语料中,就会导致分母为零 72 | self.tf = np.multiply(self.tf, self.idf) # 矩阵与向量的点乘 73 | 74 | # 生成普通的词频向量 75 | def calc_wordfreq(self, trainset): 76 | self.idf = np.zeros([1, self.vocablen]) # 1*词典数 77 | self.tf = np.zeros([self.doclength, self.vocablen]) # 训练集文件数*词典数 78 | for indx in range(self.doclength): # 遍历所有的文本 79 | for word in trainset[indx]: # 遍历文本中的每个词 80 | if word in self.vocabulary: 81 | self.tf[indx, self.vocabulary.index(word)] += 1 # 找到文本的词在字典中的位置+1 82 | for signleword in set(trainset[indx]): 83 | if signleword in self.vocabulary: 84 | self.idf[0, self.vocabulary.index(signleword)] += 1 85 | 86 | # 计算每个分类在数据集中的概率:P(yi) 87 | def cate_prob(self, classVec): 88 | self.labels = classVec 89 | labeltemps = set(self.labels) # 获取全部分类 90 | for labeltemp in labeltemps: 91 | # 统计列表中重复的值:self.labels.count(labeltemp) 92 | self.Pcates[labeltemp] = float(self.labels.count(labeltemp)) / float(len(self.labels)) 93 | 94 | # 按分类累计向量空间的每维值:P(x|yi) 95 | def build_tdm(self): 96 | self.tdm = np.zeros([len(self.Pcates), self.vocablen]) # 类别行*词典列 97 | sumlist = np.zeros([len(self.Pcates), 1]) # 统计每个分类的总值 98 | for indx in range(self.doclength): 99 | self.tdm[self.labels[indx]] += self.tf[indx] # 将同一类别的词向量空间值加总 100 | sumlist[self.labels[indx]] = np.sum(self.tdm[self.labels[indx]]) # 统计每个分类的总值--是个标量 101 | self.tdm = self.tdm / sumlist # P(x|yi) 102 | 103 | # 测试集映射到当前词典 104 | def map2vocab(self, testdata): 105 | self.testset = np.zeros([1, self.vocablen]) 106 | # 删除测试集中词不在训练集中 107 | for word in testdata: 108 | if word in self.vocabulary: 109 | self.testset[0, self.vocabulary.index(word)] += 1 110 | 111 | # 输出分类类别 112 | def predict(self, testset): 113 | if np.shape(testset)[1] != self.vocablen: 114 | print("输入错误") 115 | exit(0) 116 | predvalue = 0 117 | predclass = "" 118 | for tdm_vect, keyclass in zip(self.tdm, self.Pcates): 119 | # P(x|yi)P(yi) 120 | temp = np.sum(testset * tdm_vect * self.Pcates[keyclass]) 121 | if temp > predvalue: 122 | predvalue = temp 123 | predclass = keyclass 124 | return predclass 125 | 126 | 127 | if __name__ == "__main__": 128 | postingList, classVec, emotionset = loadDataSet("D:\sentiment-word\emotionword.txt") 129 | testset = postingList[119] 130 | nb = NBayes() # 类的实例化 131 | nb.train_set(postingList, classVec, emotionset) # 训练数据集 132 | nb.map2vocab(testset) # 随机选择一个测试句,这里2表示文本中的第三句话,不是脏话,应输出0。 133 | print(nb.predict(nb.testset)) # 输出分类结果0表示消极,1表示积极 134 | print("分类结束") 135 | -------------------------------------------------------------------------------- /venv/bdci/split_word.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME 
:2018/9/20 下午3:25 4 | # @Author :hwwu 5 | # @File :split_word.py 6 | 7 | path = '/Users/liyangyang/Downloads/bdci/' 8 | 9 | import pandas as pd, numpy as np 10 | 11 | train = pd.read_csv(path + 'train.csv') 12 | print(train.shape) 13 | stopword_path = '/Users/liyangyang/Downloads/stopwords/stopwords1893.txt' 14 | import jieba 15 | import fool 16 | def stopwordslist(): 17 | stopwords = [line.strip() for line in open(stopword_path, 'r', encoding='utf-8').readlines()] 18 | # stopwords = [',', '。', '、', '...', '“', '”', '《', '》', ':', ';'] 19 | return stopwords 20 | 21 | import codecs 22 | f = codecs.open(path+'train_no_lable.txt', 'a', 'utf8') 23 | train_doc_list = [] 24 | for i in range(100): 25 | print(train['content'][i].strip()) 26 | print('..........') 27 | sentence_seged = jieba.cut(train['content'][i].strip()) 28 | outstr = '' 29 | for word in sentence_seged: 30 | if (word != '\t') & (word.strip() != ''): 31 | outstr += word 32 | outstr += ' ' 33 | print(outstr) 34 | print('..........') 35 | 36 | sentence_seged_fool = fool.cut(train['content'][i].strip()) 37 | print(sentence_seged_fool) 38 | print('***********') 39 | -------------------------------------------------------------------------------- /venv/datafountain/guangfudianzhan/__pycache__/read_data.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/guangfudianzhan/__pycache__/read_data.cpython-36.pyc -------------------------------------------------------------------------------- /venv/datafountain/guangfudianzhan/dnn_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/7 下午1:21 4 | # @Author :hwwu 5 | # @File :dnn_model.py 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import sys 10 | 11 | path = '/Users/liyangyang/PycharmProjects/mypy/venv/datafountain/guangfudianzhan/' 12 | sys.path.append(path) 13 | import read_data 14 | 15 | dis = [1, 190, 379, 567, 755, 940, 1123, 1314, 1503, 1505, 1694, 1879, 16 | 2070, 2257, 2444, 2632, 2823, 3013, 3202, 3379, 3567, 3746, 3927, 4089, 17 | 4278, 4459, 4648, 4652, 4821, 5010, 5013, 5017, 5059, 5061, 5069, 5074, 18 | 5077, 5281, 5285, 5287, 5292, 5508, 5703, 5911, 5913, 5916, 5918, 6121, 19 | 6337, 6524, 6528, 6531, 6534, 6723, 6923, 7116, 7326, 7535, 7740, 7937, 20 | 8146, 8245, 8258, 8310, 8488, 8705, 8711, 8878, 9088, 9296, 9505, 9719, 21 | 9916, 10124, 10335, 10544, 10736, 10914, 10917, 11119, 11331, 11540, 22 | 11753, 11963, 12170, 12381, 12592, 12802, 13009, 13214, 13426, 13617, 23 | 13830, 14032, 14243, 14457, 14666, 14882, 15091, 15299, 15508, 15719, 24 | 15937, 16144, 16348, 16540, 16747, 16925, 17133, 17342, 25 | 17527, 17543, 17745, 17876] 26 | 27 | dic = [22, 135, 591, 592, 593, 594, 595, 737, 948, 1070, 1173, 1175, 1286, 28 | 1362, 1451, 1519, 1565, 1666, 1717, 1894, 2137, 2223, 2271, 2414, 29 | 2579, 2797, 2875, 2916, 2986, 2684, 3723, 3597, 3599, 3603, 3605, 30 | 3607, 3610, 3601, 3602, 3421, 3393, 3538, 3539, 3540, 5521, 6016, 31 | 7437, 11832, 16437, 15355, 3152, 3612,3611] 32 | 33 | 34 | def load_train_data(): 35 | train_ = read_data.read_result_data('public.train.csv') 36 | train_x = train_[:, 2:21] 37 | train_y = train_[:, 21] 38 | train_z = train_[:, 1] 39 | 40 | train_len = len(train_y) 41 | train_y.shape = (1, train_len) 42 | train_y = np.transpose(train_y) 43 | 44 | x, y = [], [] 45 | for i in 
range(train_len): 46 | if ((round(train_x[i][0], 2) != 0.01) | (round(train_x[i][1], 1) != 0.1)): 47 | 48 | id = 0.0 49 | for j in range(len(dis)): 50 | if (train_z[i] < dis[j]): 51 | id = 0.5 - np.abs((int(train_z[i]) - dis[j - 1]) / (dis[j] - dis[j - 1]) - 0.5) 52 | break 53 | 54 | if (train_z[i] not in dic): 55 | x.append([ 56 | train_x[i][1], 57 | train_x[i][2], 58 | train_x[i][0], 59 | id, 60 | train_x[i][3], 61 | train_x[i][4], train_x[i][5], train_x[i][6], 62 | train_x[i][7], train_x[i][8], train_x[i][9], 63 | train_x[i][7] / (train_x[i][10] + 0.1), train_x[i][8] / (train_x[i][11] + 0.1), 64 | train_x[i][9] / (train_x[i][12] + 0.1), 65 | train_x[i][10], train_x[i][11], train_x[i][12], 66 | train_x[i][13], train_x[i][14], train_x[i][15], 67 | train_x[i][4] * train_x[i][13], train_x[i][5] * train_x[i][14], train_x[i][6] * train_x[i][15], 68 | train_x[i][18], 69 | train_x[i][17], 70 | train_x[i][16] 71 | ]) 72 | # x.append(train_x[i]) 73 | y.append(abs(train_y[i])) 74 | print(len(x)) 75 | # for i in range(10): 76 | # print(x[i]) 77 | return x, y 78 | 79 | 80 | def load_test_data(): 81 | # train_ = read_data.read_result_data('test_data_all.csv') 82 | train_ = read_data.read_result_data('public.test.csv') 83 | train_x = train_[:, 2:21] 84 | train_y = train_[:, 1] 85 | 86 | train_len = len(train_y) 87 | train_y.shape = (1, train_len) 88 | train_y = np.transpose(train_y) 89 | 90 | x, y = [], [] 91 | for i in range(train_len): 92 | if ((round(train_x[i][0], 2) != 0.01) | (round(train_x[i][1], 1) != 0.1)): 93 | 94 | id = 0.0 95 | for j in range(len(dis)): 96 | if (train_y[i] < dis[j]): 97 | id = 0.5 - np.abs((int(train_y[i]) - dis[j - 1]) / (dis[j] - dis[j - 1]) - 0.5) 98 | break 99 | 100 | if (train_y[i] not in dic): 101 | x.append([ 102 | train_x[i][1], 103 | train_x[i][2], 104 | train_x[i][3], 105 | train_x[i][4], 106 | train_x[i][0], 107 | id, 108 | train_x[i][5], 109 | train_x[i][6], 110 | # train_x[i][7], 111 | # train_x[i][8], 112 | # train_x[i][9], 113 | train_x[i][10], train_x[i][11], train_x[i][12], 114 | train_x[i][13], train_x[i][14], 115 | train_x[i][15], 116 | train_x[i][17], 117 | train_x[i][18], 118 | train_x[i][16] 119 | ]) 120 | # x.append(train_x[i]) 121 | y.append(train_y[i]) 122 | print(len(x)) 123 | return x, y 124 | 125 | 126 | x, y = load_train_data() 127 | 128 | train_x = np.reshape(x[1::1], (-1, 26)) 129 | train_y = np.reshape(y[1::1], (-1, 1)) 130 | test_x = np.reshape(x[1::1], (-1, 26)) 131 | test_y = np.reshape(y[1::1], (-1, 1)) 132 | # 133 | # x1, y1 = load_test_data() 134 | # test_x = np.reshape(x1, (-1, 17)) 135 | # test_y = np.reshape(y1, (-1, 1)) 136 | 137 | feature_columns = [tf.contrib.layers.real_valued_column("", dimension=17)] 138 | classifier = tf.contrib.learn.DNNRegressor(feature_columns=feature_columns, 139 | hidden_units=[1], 140 | optimizer=tf.train.AdamOptimizer( 141 | learning_rate=0.0001 142 | ), 143 | activation_fn=tf.nn.leaky_relu) 144 | # classifier = tf.contrib.learn.DNNLinearCombinedRegressor(dnn_feature_columns=feature_columns, 145 | # dnn_hidden_units=[1], 146 | # dnn_optimizer=tf.train.AdamOptimizer( 147 | # learning_rate=0.001 148 | # )) 149 | classifier.fit(x=train_x, 150 | y=train_y, 151 | max_steps=40000) 152 | 153 | print(classifier.evaluate(x=train_x, y=train_y)) 154 | 155 | y = classifier.predict(test_x) 156 | y_=[] 157 | for i in y: 158 | y_.append([i]) 159 | 160 | # r = [] 161 | # for i in range(8337): 162 | # id = test_y[i][0] 163 | # p = y_[i][0] 164 | # r.append([id, p]) 165 | # 
np.savetxt('/Users/liyangyang/Downloads/datafountain/guangdianfute/test_data_3', r) 166 | 167 | 168 | error = [] 169 | for i in range(len(test_y)): 170 | if((test_y[i] - y_[i]) * (test_y[i] - y_[i]) > 1): 171 | print(test_x[i], test_y[i], y_[i]) 172 | error.append(test_y[i] - y_[i]) 173 | 174 | squaredError = [] 175 | absError = [] 176 | for val in error: 177 | squaredError.append(val * val) # target-prediction之差平方 178 | 179 | print("Square Error: ", sorted(squaredError, reverse=True)) 180 | 181 | print("MSE = ", sum(squaredError) / len(squaredError)) # 均方误差MSE 182 | from math import sqrt 183 | 184 | print("RMSE = ", sqrt(sum(squaredError) / len(squaredError))) # 均方根误差RMSE 185 | -------------------------------------------------------------------------------- /venv/datafountain/guangfudianzhan/draw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # --coding:utf8 -- 3 | # @TIME :2018/8/1 上午10:28 4 | # @Author :hwwu 5 | # @File :draw.py 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | path = '/Users/liyangyang/Downloads/datafountain/guangdianfute/' 14 | 15 | 16 | dic = [22, 135, 591, 592, 593, 594, 595, 737, 948, 1070, 1173, 1175, 1286, 17 | 1362, 1451, 1519, 1565, 1666, 1717, 1894, 2137, 2223, 2271, 2414, 18 | 2579, 2797, 2875, 2916, 2986, 2684, 3723, 3597, 3599, 3603, 3605, 19 | 3607, 3610, 3601, 3602, 3421, 3393, 3538, 3539, 3540, 5521, 6016, 20 | 7437, 11832, 16437, 15355, 3152, 3612, 3611] 21 | 22 | # 板温 现场温度 光照强度 转换效率 转换效率A 转换效率B 转换效率C 电压A 电压B 电压C 23 | # 电流A 电流B 电流C 功率A 功率B 功率C 平均功率 风速 风向 发电量 24 | def draw_data(file='public.train.csv'): 25 | data = pd.read_csv(path + file) 26 | print(data.std()) 27 | # data = data[(data['平均功率'] < 10000.0)] 28 | # data = data[(data['现场温度'] > -1000.0)] 29 | # data = data[(data['转换效率'] < 2000.0)] 30 | data = data[~data['ID'].isin(dic)] 31 | # data = data[(data['电流A'] > 200.0)] 32 | # print(len(data)) 33 | # plt.hist(data['风向']) 34 | # 板温 光照强度 35 | # xs = data['电压C']/data['电流C'] 36 | xs = (data['光照强度']*data['转换效率'])/100/12.5 37 | # xs = data['现场温度'] 38 | ys = data['发电量'] 39 | plt.scatter(xs, ys) 40 | # x = [i for i in range(100)] 41 | # for i in range(1,11): 42 | # strat=4000+i*100 43 | # plt.plot(x, (data['平均功率']/1000*2)[strat:strat+100], color='r', label='yuce') 44 | # plt.plot(x, data['发电量'][strat:strat+100], color='y', label='shiji') 45 | # plt.show() 46 | # strat = 2200 47 | # plt.plot(x, (data['平均功率'] / 1000 * 2)[strat:strat + 200], color='r', label='yuce') 48 | # plt.plot(x, data['发电量'][strat:strat + 200], color='y', label='shiji') 49 | plt.show() 50 | # print(data.head()) 51 | 52 | import seaborn as sns 53 | def neighborhood(file='public.train.csv'): 54 | train = pd.read_csv(path + file) 55 | train.rename(columns={ 56 | '板温': 'a', '现场温度': 'b', '光照强度': 'c', '转换效率': 'd', '转换效率A': 'e', 57 | '转换效率B': 'f', '转换效率C': 'g', '电压A': 'h', '电压B': 'i', '电压C': 'j', 58 | '电流A': 'k', '电流B': 'l', '电流C': 'm', '功率A': 'n', '功率B': 'o', 59 | '功率C': 'p', '平均功率': 'q', '风速': 'r', '风向': 's', '发电量': 't' 60 | }, inplace=True) 61 | k = 10 # number of variables for heatmap 62 | corrmat = train.corr() 63 | cols = corrmat.nlargest(k, 't')['t'].index 64 | cm = np.corrcoef(train[cols].values.T) 65 | sns.set(font_scale=1.25) 66 | hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, 67 | xticklabels=cols.values) 68 | plt.show() 69 | # neighborhood() 70 | 71 | file='public.train.csv' 72 | def t(): 73 
| train = pd.read_csv(path + file) 74 | y = train['发电量'] 75 | train_labels = y.values.copy 76 | print(y.describe()) 77 | sns.distplot(y) 78 | print('Skewness: %f' % y.skew()) 79 | print('Kurtosis: %f' % y.kurt()) 80 | # 得到训练集的数值特征和类别特征 81 | 82 | from scipy.stats import skew 83 | # log transform the target use log(1+x) 84 | train["发电量"] = np.log1p(train["发电量"]) 85 | sns.distplot(train['发电量']) 86 | print("Skewness: %f" % train['发电量'].skew()) 87 | print("Kurtosis: %f" % train['发电量'].kurt()) 88 | # t() 89 | 90 | draw_data() 91 | # draw_data('public.test.csv') 92 | 93 | 94 | from sklearn.feature_selection import SelectKBest 95 | import sklearn 96 | 97 | 98 | import sys 99 | 100 | path = '/Users/liyangyang/PycharmProjects/mypy/venv/datafountain/guangfudianzhan/' 101 | sys.path.append(path) 102 | import read_data 103 | 104 | dis = [1,190,379,567,755,940,1123,1314,1503,1505,1694,1879, 105 | 2070,2257,2444,2632,2823,3013,3202,3379,3567,3746,3927,4089, 106 | 4278,4459,4648,4652,4821,5010,5013,5017,5059,5061,5069,5074, 107 | 5077,5281,5285,5287,5292,5508,5703,5911,5913,5916,5918,6121, 108 | 6337,6524,6528,6531,6534,6723,6923,7116,7326,7535,7740,7937, 109 | 8146,8245,8258,8310,8488,8705,8711,8878,9088,9296,9505,9719, 110 | 9916,10124,10335,10544,10736,10914,10917,11119,11331,11540, 111 | 11753,11963,12170,12381,12592,12802,13009,13214,13426,13617, 112 | 13830,14032,14243,14457,14666,14882,15091,15299,15508,15719, 113 | 15937,16144,16348,16540,16747,16925,17133,17342, 114 | 17527,17543,17745,17876] 115 | 116 | 117 | def load_train_data(): 118 | train_ = read_data.read_result_data('public.train.csv') 119 | train_x = train_[:, 2:21] 120 | train_y = train_[:, 21] 121 | train_z = train_[:, 1] 122 | 123 | train_len = len(train_y) 124 | train_y.shape = (1, train_len) 125 | train_y = np.transpose(train_y) 126 | 127 | x, y = [], [] 128 | for i in range(train_len): 129 | if ((round(train_x[i][0], 2) != 0.01) | (round(train_x[i][1], 1) != 0.1)): 130 | 131 | id = 0.0 132 | for j in range(len(dis)): 133 | if (train_z[i] -1000.0)] 27 | data = data[~((data['板温'] == 0.01) & (data['现场温度'] == 0.1))] 28 | data = data[~(data['ID'].isin(dic))] 29 | 30 | print(data.max()) 31 | 32 | feature_name = [i for i in data.columns if i!='发电量'] 33 | feature_name = [i for i in feature_name if i!='ID'] 34 | train_data = data[feature_name] 35 | train_label = data['发电量'] 36 | 37 | # from sklearn import preprocessing 38 | # min_max_scaler = preprocessing.MinMaxScaler() 39 | # train_data = min_max_scaler.fit_transform(train_data) 40 | # print(feature_name) 41 | 42 | #方差选择法 43 | # from sklearn.feature_selection import VarianceThreshold 44 | # print(VarianceThreshold(threshold=0.03).fit_transform(train_data)[0]) 45 | 46 | #相关系数法 47 | # from sklearn.feature_selection import SelectKBest 48 | # from scipy.stats import pearsonr 49 | # print(feature_name) 50 | # print(train_data[0]) 51 | # from sklearn.feature_selection import f_regression,mutual_info_regression 52 | # for i in range(1,20): 53 | # print(SelectKBest(f_regression, k=i).fit_transform(train_data, train_label)[0]) 54 | # print(SelectKBest(mutual_info_regression, k=i).fit_transform(train_data, train_label)[0]) 55 | 56 | #Pearson相关系数 57 | # from scipy.stats import pearsonr 58 | # for i in range(0,19): 59 | # print(i,pearsonr(train_data[:,i], train_label)) 60 | 61 | from sklearn.feature_selection import RFE 62 | from sklearn.linear_model import LinearRegression 63 | print(train_data.head()) 64 | train_data = np.array(train_data) 65 | train_label = np.array(train_label) 66 | 
print(RFE(estimator=LinearRegression(), n_features_to_select=10).fit_transform(train_data, train_label)[0]) 67 | 68 | -------------------------------------------------------------------------------- /venv/datafountain/guangfudianzhan/find_best_feature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/14 上午10:17 4 | # @Author :hwwu 5 | # @File :find_best_feature.py 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from pandas import DataFrame as DF 10 | import xgboost as xgb 11 | 12 | import warnings 13 | 14 | warnings.filterwarnings("ignore") 15 | 16 | dic = [22, 135, 591, 592, 593, 594, 595, 737, 948, 1070, 1173, 1175, 1286, 17 | 1362, 1451, 1519, 1565, 1666, 1717, 1894, 2137, 2223, 2271, 2414, 18 | 2579, 2797, 2875, 2916, 2986, 2684, 3723, 3597, 3599, 3603, 3605, 19 | 3607, 3610, 3601, 3602, 3421, 3393, 3538, 3539, 3540, 5521, 6016, 20 | 7437, 11832, 15355, 3152, 3612, 3611] 21 | 22 | path = '/Users/liyangyang/Downloads/datafountain/guangdianfute/' 23 | file = 'public.train.csv' 24 | data = pd.read_csv(path + file) 25 | data = data[(data['平均功率'] < 10000.0)] 26 | data = data[(data['现场温度'] > -1000.0)] 27 | data = data[(data['转换效率'] < 500.0)] 28 | data = data[~((data['板温'] == 0.01) & (data['现场温度'] == 0.1))] 29 | data = data[~(data['ID'].isin(dic))] 30 | 31 | train = data[::1] 32 | test = data[::1] 33 | 34 | feature_name = [i for i in data.columns if i != '发电量'] 35 | feature_name = [i for i in feature_name if i != 'ID'] 36 | feature_name = [i for i in feature_name if i != '现场温度'] 37 | # feature_name = [i for i in feature_name if i != '转换效率'] 38 | # feature_name = [i for i in feature_name if i != '功率A'] 39 | # feature_name = [i for i in feature_name if i != '功率B'] 40 | # feature_name = [i for i in feature_name if i != '功率C'] 41 | 42 | train_data = train[feature_name] 43 | train_label = train['发电量'] 44 | test_data = test[feature_name] 45 | train_label = np.array(train_label) 46 | 47 | from sklearn import preprocessing 48 | min_max_scaler = preprocessing.MinMaxScaler() 49 | train_data = min_max_scaler.fit_transform(train_data) 50 | train_data = DF(train_data, columns=(i for i in feature_name)) 51 | print(train_data.head()) 52 | 53 | xgb_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 54 | learning_rate=0.005, max_depth=23, 55 | max_delta_step=100000, 56 | min_child_weight=1.7817, n_estimators=2200, 57 | reg_alpha=0.4640, reg_lambda=0.8571, 58 | subsample=0.5213, silent=1, 59 | random_state=7, nthread=-1) 60 | 61 | 62 | def rmse_my(y_test, y_): 63 | error = [] 64 | for i in range(len(y_test)): 65 | error.append(y_test[i] - y_[i]) 66 | 67 | squaredError = [] 68 | for val in error: 69 | squaredError.append(val * val) # target-prediction之差平方 70 | from math import sqrt 71 | RMSE = sqrt(sum(squaredError) / len(squaredError)) 72 | print("RMSE = ", RMSE) # 均方根误差RMSE 73 | return RMSE 74 | 75 | 76 | def get_division_feature(data, feature_name): 77 | new_feature = [] 78 | new_feature_name = [] 79 | for i in range(len(data[feature_name].columns) - 1): 80 | for j in range(i + 1, len(data[feature_name].columns)): 81 | new_feature_name.append(data[feature_name].columns[i] + '/' + data[feature_name].columns[j]) 82 | new_feature_name.append(data[feature_name].columns[i] + '*' + data[feature_name].columns[j]) 83 | new_feature_name.append(data[feature_name].columns[i] + '+' + data[feature_name].columns[j]) 84 | new_feature_name.append(data[feature_name].columns[i] + '-' + 
data[feature_name].columns[j]) 85 | new_feature.append(data[data[feature_name].columns[i]] / data[data[feature_name].columns[j]]) 86 | new_feature.append(data[data[feature_name].columns[i]] * data[data[feature_name].columns[j]]) 87 | new_feature.append(data[data[feature_name].columns[i]] + data[data[feature_name].columns[j]]) 88 | new_feature.append(data[data[feature_name].columns[i]] - data[data[feature_name].columns[j]]) 89 | 90 | temp_data = DF(pd.concat(new_feature, axis=1)) 91 | temp_data.columns = new_feature_name 92 | data = pd.concat([temp_data], axis=1).reset_index(drop=True) 93 | # print(data.shape) 94 | return data.reset_index(drop=True) 95 | 96 | 97 | def get_square_feature(data, feature_name): 98 | new_feature = [] 99 | new_feature_name = [] 100 | for i in range(len(data[feature_name].columns)): 101 | new_feature_name.append(data[feature_name].columns[i] + '**2') 102 | new_feature_name.append(data[feature_name].columns[i] + '**1/2') 103 | new_feature.append(data[data[feature_name].columns[i]] ** 2) 104 | new_feature.append(data[data[feature_name].columns[i]] ** (1 / 2)) 105 | temp_data = DF(pd.concat(new_feature, axis=1)) 106 | temp_data.columns = new_feature_name 107 | data = pd.concat([temp_data], axis=1).reset_index(drop=True) 108 | # print(data.shape) 109 | return data.reset_index(drop=True) 110 | 111 | 112 | def find_best_feature(feature_name): 113 | get_ans_face = feature_name 114 | xgb_model.fit(train_data[get_ans_face], train_label) 115 | y_ = xgb_model.predict(train_data[get_ans_face]) 116 | m = rmse_my(train_label, y_) 117 | return m 118 | 119 | 120 | train_datatrain_d = get_square_feature(train_data, feature_name) 121 | train_data_division = get_division_feature(train_data, feature_name) 122 | train_data = pd.concat([train_datatrain_d, train_data_division, train_data], axis=1) 123 | feature_name = [i for i in train_data.columns] 124 | print(train_data.shape) 125 | 126 | print(feature_name) 127 | 128 | now_feature = [] 129 | # check = 0.05416978387299058 130 | # d = [1,2,3,5,6,7,8,9,10,21,22,27,31,32,33,34,35,36,37,38,39,40,42,43,44,46,47,48,49,55,56,60,61,65,66,78,79,80,82, 131 | # 103,104,108,109,110,111,112,128,129,130,131,214,215,221,222,243,247,248,251,252] 132 | # 133 | # for i in d: 134 | # now_feature.append(feature_name[i-1]) 135 | # for i in range(354,len(feature_name)): 136 | # check = 0.05878801229207516 137 | d = [1, 2, 3, 4, 5, 6, 7, 9, 10, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34, 82, 83, 84, 85, 86, 87, 138 | 89, 90, 94, 95, 96, 97, 98, 122, 123, 124, 125, 126, 127, 128, 129, 132, 133, 136, 137, 163, 164, 167, 168, 170, 139 | 171, 172, 176, 177, 179, 180, 214] 140 | 141 | # for i in d: 142 | # now_feature.append(feature_name[i - 1]) 143 | # for i in range(324, len(feature_name)): 144 | # now_feature.append(feature_name[i]) 145 | # jj = find_best_feature(now_feature) 146 | # if jj < check: 147 | # print('目前特征长度为', len(now_feature), ' 目前帅气的RSME为值是', jj, ' 成功加入第', i + 1, '个', 'RSME降低', check - jj) 148 | # check = jj 149 | # else: 150 | # print('尝试加入第', i + 1, '个特征失败') 151 | # now_feature.pop() 152 | # print(now_feature) 153 | # 154 | now_feature2 = [] 155 | check = 100 156 | for i in range(len(feature_name)): 157 | now_feature2.append(feature_name[len(feature_name)-i-1]) 158 | jj = find_best_feature(now_feature2) 159 | if jj -1000.0)] 17 | # data = data[(data['转换效率'] < 3000.0)] 18 | data = data[~((data['板温']==0.01)&(data['现场温度']==0.1))] 19 | 20 | return data 21 | 22 | 23 | def load_original_data(file='public.train.csv'): 24 | train 
= read_original_data(file) 25 | return train.reset_index() 26 | 27 | 28 | def read_result_data(file='public.train.csv'): 29 | train = load_original_data(file) 30 | result = np.array(train) 31 | print(result.shape) 32 | return result 33 | 34 | 35 | def write_test_result1(): 36 | train_ = read_result_data('public.test.csv') 37 | train_x = train_[:, 2:21] 38 | train_y = train_[:, 1] 39 | 40 | res = [] 41 | 42 | train_len = len(train_y) 43 | train_y.shape = (1, train_len) 44 | train_y = np.transpose(train_y) 45 | 46 | for i in range(train_len): 47 | if ((round(train_x[i][0], 2) == 0.01) & (round(train_x[i][1], 1) == 0.1)): 48 | res.append([train_y[i], 0.379993053]) 49 | 50 | print(len(res)) 51 | np.savetxt(path + 'test_data_1', res) 52 | 53 | 54 | def write_test_result(): 55 | train_1 = read_result_data('public.test.csv') 56 | train_2 = read_result_data('public.train.csv') 57 | train_x = train_1[:, 1:21] 58 | train_y = train_2[:, 1:21] 59 | 60 | train = np.vstack([train_x, train_y]) 61 | 62 | train_a = train[:, ::-1].T 63 | train_a2 = np.lexsort(train_a) 64 | train = train[train_a2] 65 | 66 | np.savetxt(path + 'test_data_all.csv', train, fmt="%.2f", delimiter=',') 67 | 68 | 69 | # write_test_result() 70 | 71 | 72 | def write_result(): 73 | x1 = np.loadtxt(path + 'test_data_1') 74 | x2 = np.loadtxt(path + 'test_data_3') 75 | 76 | user_id = [] 77 | price = [] 78 | for i in range(len(x1)): 79 | user_id.append(int(x1[i][0])) 80 | # price.append(round(x1[i][1],1)) 81 | price.append(round(x1[i][1], 7)) 82 | for i in range(len(x2)): 83 | user_id.append(int(x2[i][0])) 84 | price.append(round(x2[i][1], 7)) 85 | english_column = pd.Series(user_id) 86 | number_column = pd.Series(price) 87 | predictions = pd.concat([english_column, number_column], axis=1) 88 | # another way to handle 89 | # save = pd.DataFrame({'user_id': user_id, 'prediction_pay_price': price}) 90 | predictions.to_csv(path + 'result_data.csv', index=0, sep=',') 91 | 92 | 93 | # write_result() 94 | 95 | def write_result2(): 96 | t = read_result_data('public.test.csv') 97 | x1 = np.loadtxt(path + 'test_data_all_2') 98 | x2 = np.loadtxt(path + 'test_data_3') 99 | t1 = t[:, 1] 100 | map = {} 101 | r = [] 102 | map2 = {} 103 | for i in range(len(x2)): 104 | map2[int(x2[i][0])] = x2[i][1] 105 | for i in range(8409): 106 | if ((round(t[i][0], 2) != 0.01) | (round(t[i][1], 1) != 0.1)): 107 | map[int(t1[i])] = 0 108 | for i in range(len(x1)): 109 | a1 = int(x1[i][0]) 110 | a2 = x1[i][2] 111 | if (a1==16437): 112 | r.append([a1,9.911484700000000814e+00]) 113 | print(a1) 114 | elif (a1 in map2.keys()): 115 | r.append([a1, map2[a1]]) 116 | elif (a1 in map.keys()): 117 | r.append([a1, a2]) 118 | # else: 119 | # r.append([a1,a2]) 120 | 121 | np.savetxt('/Users/liyangyang/Downloads/datafountain/guangdianfute/test_data_2', r) 122 | 123 | 124 | # write_result2() 125 | 126 | 127 | def mid_merge_r(): 128 | x1 = np.loadtxt(path + 'test_data_all_1') 129 | t = read_result_data('public.train.csv') 130 | t1 = t[:, 1] 131 | t2 = t[:, 21] 132 | map = {} 133 | r = [] 134 | for i in range(9000): 135 | if ((round(t[i][0], 2) != 0.01) | (round(t[i][1], 1) != 0.1)): 136 | map[int(t1[i])] = t2[i] 137 | for i in range(len(x1)): 138 | a1 = int(x1[i][0]) 139 | a2 = x1[i][1] 140 | if (a1 in map.keys()): 141 | r.append([a1, a2, round(map[a1], 7)]) 142 | else: 143 | r.append([a1, a2, 0.0]) 144 | 145 | np.savetxt('/Users/liyangyang/Downloads/datafountain/guangdianfute/test_data_all_2', r) 146 | 147 | 148 | # mid_merge_r() 149 | 150 | import matplotlib.pyplot as plt 151 | 152 
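# --- illustrative aside (not part of the original script): the row sort in
# write_test_result() above relies on np.lexsort, which treats the *last* key as
# the primary one; reversing the columns and transposing therefore makes column 0
# the primary sort key, with the remaining columns breaking ties in order.
# A minimal, verifiable sketch of the idiom:
#
#   import numpy as np
#   rows = np.array([[2, 9], [1, 5], [2, 1]])
#   order = np.lexsort(rows[:, ::-1].T)   # sort by column 0, then column 1
#   print(rows[order])                    # [[1 5] [2 1] [2 9]]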
| 153 | def plot(): 154 | x1 = np.loadtxt(path + 'test_data_all_2') 155 | s = 90*100 156 | b = 100 157 | e = s + b 158 | x = [i for i in range(s, e)] 159 | # 以折线图表示结果 160 | plt.figure() 161 | plt.plot(x, x1[s:e,1], color='r', label='yuce') 162 | plt.plot(x, x1[s:e, 2], color='y', label='shiji') 163 | plt.xlabel("Time(s)") # X轴标签 164 | plt.ylabel("Value") # Y轴标签 165 | plt.show() 166 | 167 | # plot() 168 | 169 | if __name__ == '__main__': 170 | pass -------------------------------------------------------------------------------- /venv/datafountain/guangfudianzhan/rnn_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/6/21 下午1:27 4 | # @Author :hwwu 5 | # @File :PricePredictor.py 6 | 7 | import codecs 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import tensorflow as tf 11 | import pandas as pd 12 | 13 | 14 | class PricePredictor: 15 | # lstm param 16 | timeStep = 19 17 | hiddenUnitSize = 38 # 隐藏层神经元数量 18 | batchSize = 88 # 每一批次训练多少个样例 19 | inputSize = 19 # 输入维度 20 | outputSize = 1 # 输出维度 21 | lr = 0.0001 # 学习率 22 | train_x, train_y = [], [] # 训练数据集 23 | dataFile = '/Users/liyangyang/Downloads/datafountain/guangdianfute/public.train.csv' 24 | testFile = '/Users/liyangyang/Downloads/datafountain/guangdianfute/public.test.csv' 25 | train_data = [] 26 | X = tf.placeholder(tf.float32, [None, timeStep, inputSize]) 27 | Y = tf.placeholder(tf.float32, [None, timeStep]) 28 | # Y = tf.placeholder(tf.float32, [None, timeStep, outputSize]) 29 | weights = { 30 | 'in': tf.Variable(tf.random_normal([inputSize, hiddenUnitSize])), 31 | 'out': tf.Variable(tf.random_normal([hiddenUnitSize, 1])) 32 | } 33 | 34 | biases = { 35 | 'in': tf.Variable(tf.constant(0.1, shape=[hiddenUnitSize, ])), 36 | 'out': tf.Variable(tf.constant(0.1, shape=[1, ])) 37 | } 38 | 39 | savePath = '/Users/liyangyang/PycharmProjects/mypy/venv/datafountain/guangfudianzhan/model/stock.train.model' 40 | 41 | def loadData(self): 42 | data = pd.read_csv(self.dataFile) 43 | data = np.array(data) 44 | train_len = len(data) 45 | train = [] 46 | for i in range(train_len): 47 | if ((round(data[i][1], 2) != 0.01) | (round(data[i][2], 1) != 0.1)): 48 | if (data[i][2] < -1000): 49 | print(data[i][2]) 50 | data[i][2] = -6.0 51 | if (data[i][19] > 360): 52 | data[i][19] -= 360 53 | if (data[i][20] < 0): 54 | data[i][20] = -data[i][20] 55 | train.append(data[i]) 56 | print(len(train)) 57 | self.train_data = np.array(train) 58 | 59 | # 构造数据 60 | def buildTrainDataSet(self): 61 | x_ = self.train_data[:, 1:20] 62 | y_ = self.train_data[:, 20] 63 | for i in range(len(self.train_data) - self.timeStep - 1): 64 | x = x_[i:i + self.timeStep] 65 | y = y_[i:i + self.timeStep] 66 | self.train_x.append(x.tolist()) 67 | self.train_y.append(y.tolist()) 68 | 69 | # lstm算法定义 70 | def lstm(self, batchSize=None): 71 | if batchSize is None: 72 | batchSize = self.batchSize 73 | weightIn = self.weights['in'] 74 | biasesIn = self.biases['in'] 75 | input = tf.reshape(self.X, [-1, self.inputSize]) 76 | inputRnn = tf.matmul(input, weightIn) + biasesIn 77 | inputRnn = tf.reshape(inputRnn, [-1, self.timeStep, self.hiddenUnitSize]) # 将tensor转成3维,作为lstm cell的输入 78 | # cell=tf.nn.rnn_cell.BasicLSTMCell(self.hiddenUnitSize, reuse=True) 79 | # initState=cell.zero_state(batchSize,dtype=tf.float32) 80 | # output_rnn,final_states=tf.nn.dynamic_rnn(cell, inputRnn,initial_state=initState, dtype=tf.float32) #output_rnn是记录lstm每个输出节点的结果,final_states是最后一个cell的结果 81 | 
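# --- illustrative note (an assumption about TF 1.x behaviour, not original code):
# step 4 below builds the stack with `[lstm_cell] * 5`, i.e. five references to
# the *same* cell object, so the layers end up sharing one set of weights (and
# newer TF 1.x releases reject the reuse outright). A sketch of the usual fix is
# to construct a fresh cell per layer:
#
#   def make_cell():
#       cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hiddenUnitSize,
#                                           forget_bias=1.0, state_is_tuple=True)
#       return tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.5)
#
#   mlstm_cell = tf.nn.rnn_cell.MultiRNNCell([make_cell() for _ in range(5)],
#                                            state_is_tuple=True)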
82 | # **步骤2:定义一层 LSTM_cell,只需要说明 hidden_size, 它会自动匹配输入的 X 的维度 83 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.hiddenUnitSize, forget_bias=1.0, state_is_tuple=True) 84 | # **步骤3:添加 dropout layer, 一般只设置 output_keep_prob 85 | 86 | # 运行test的时候注释掉这段,不能dropout 87 | lstm_cell = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_cell, input_keep_prob=1.0, output_keep_prob=0.5) 88 | # **步骤4:调用 MultiRNNCell 来实现多层 LSTM 89 | mlstm_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * 5, state_is_tuple=True) 90 | # **步骤5:用全零来初始化state 91 | init_state = mlstm_cell.zero_state(batchSize, dtype=tf.float32) 92 | output_rnn, final_states = tf.nn.dynamic_rnn(mlstm_cell, inputRnn, initial_state=init_state, 93 | dtype=tf.float32) # output_rnn是记录lstm每个输出节点的结果,final_states是最后一个cell的结果 94 | 95 | output = tf.reshape(output_rnn, [-1, self.hiddenUnitSize]) # 作为输出层的输入 96 | w_out = self.weights['out'] 97 | b_out = self.biases['out'] 98 | pred = tf.matmul(output, w_out) + b_out 99 | return pred, final_states 100 | 101 | # 训练模型 102 | def trainLstm(self): 103 | pred, _ = self.lstm() 104 | # 定义损失函数 105 | loss = tf.sqrt(tf.reduce_mean(tf.square(tf.reshape(pred, [-1]) - tf.reshape(self.Y, [-1])))) 106 | # 定义训练模型 107 | train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) 108 | saver = tf.train.Saver(tf.global_variables()) 109 | with tf.Session() as sess: 110 | # sess.run(tf.global_variables_initializer()) 111 | saver.restore(sess,self.savePath) 112 | # 重复训练100次,训练是一个耗时的过程 113 | for i in range(1000): 114 | step = 0 115 | start = 0 116 | end = start + self.batchSize 117 | while end < len(self.train_x): 118 | _, loss_ = sess.run([train_op, loss], feed_dict={self.X: self.train_x[start:end], 119 | self.Y: self.train_y[start:end]}) 120 | # start += 1 121 | start += self.batchSize 122 | end = start + self.batchSize 123 | # 每10步保存一次参数 124 | if step % 500 == 0: 125 | print('test loss is :', i, loss_) 126 | if (i % 10 == 0) & (step % 500 == 0): 127 | print("保存模型") 128 | saver.save(sess, self.savePath) 129 | step += 1 130 | 131 | def prediction(self): 132 | pred, _ = self.lstm() # 预测时只输入[1,time_step,inputSize]的测试数据 133 | saver = tf.train.Saver(tf.global_variables()) 134 | with tf.Session() as sess: 135 | # 参数恢复 136 | saver.restore(sess, self.savePath) 137 | # 取训练集最后一行为测试样本. 
shape=[1,time_step,inputSize] 138 | result = [] 139 | start = 20 140 | end = start + self.batchSize 141 | # while end < len(self.train_x): 142 | pred = sess.run([pred], feed_dict={self.X: self.train_x[start:end] 143 | }) 144 | # 以折线图表示结果 145 | p = np.reshape(pred, [self.batchSize, -1]) 146 | s = 0 147 | b = self.timeStep 148 | x = [i for i in range(s, b*19)] 149 | # 以折线图表示结果 150 | plt.figure() 151 | plt.plot(x, p[0], color='r', label='yuce') 152 | plt.plot(x, self.train_y[s:b], color='y', label='shiji') 153 | plt.xlabel("Time(s)") # X轴标签 154 | plt.ylabel("Value") # Y轴标签 155 | plt.show() 156 | 157 | 158 | predictor = PricePredictor() 159 | predictor.loadData() 160 | 161 | # 构建训练数据 162 | predictor.buildTrainDataSet() 163 | 164 | # # 模型训练 165 | predictor.trainLstm() 166 | # 167 | # # 预测-预测前需要先完成模型训练 168 | # predictor.prediction() 169 | -------------------------------------------------------------------------------- /venv/datafountain/taocan/ml_models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/10/15 上午11:28 4 | # @Author :hwwu 5 | # @File :ml_models.py 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | path = '/Users/liyangyang/Downloads/datafountain/taocan/' 11 | 12 | 13 | ###service_type,is_mix_service,online_time,1_total_fee,2_total_fee,3_total_fee,4_total_fee, 14 | # month_traffic,many_over_bill,contract_type,contract_time,is_promise_low_consume,net_service, 15 | # pay_times,pay_num,last_month_traffic,local_trafffic_month,local_caller_time,service1_caller_time, 16 | # service2_caller_time,gender,age,complaint_level,former_complaint_num,former_complaint_fee, 17 | # current_service,user_id 18 | def getdata(data, f=True): 19 | # data = pd.read_csv(path + 'train_all.csv') 20 | if f: 21 | data.loc[data['current_service'] == 90063345, 'current_service'] = 0 22 | data.loc[data['current_service'] == 89950166, 'current_service'] = 1 23 | data.loc[data['current_service'] == 89950167, 'current_service'] = 2 24 | data.loc[data['current_service'] == 99999828, 'current_service'] = 3 25 | data.loc[data['current_service'] == 90109916, 'current_service'] = 4 26 | data.loc[data['current_service'] == 89950168, 'current_service'] = 5 27 | data.loc[data['current_service'] == 99999827, 'current_service'] = 6 28 | data.loc[data['current_service'] == 99999826, 'current_service'] = 7 29 | data.loc[data['current_service'] == 90155946, 'current_service'] = 8 30 | data.loc[data['current_service'] == 99999830, 'current_service'] = 9 31 | data.loc[data['current_service'] == 99999825, 'current_service'] = 10 32 | data.loc[data['age'] == '\\N', 'age'] = 0 33 | data.loc[data['gender'] == '\\N', 'gender'] = 0 34 | 35 | data['age'] = data['age'].astype('int64') 36 | data.loc[data['age'] < 20, 'age'] = 0 37 | data.loc[(data['age'] >= 20) & (data['age'] < 30), 'age'] = 1 38 | data.loc[(data['age'] >= 30) & (data['age'] < 40), 'age'] = 2 39 | data.loc[(data['age'] >= 40) & (data['age'] < 50), 'age'] = 3 40 | data.loc[data['age'] >= 50, 'age'] = 4 41 | 42 | data['gender'] = data['gender'].astype('int64') 43 | 44 | data.loc[data['2_total_fee'] == '\\N', '2_total_fee'] = 0.0 45 | data.loc[data['3_total_fee'] == '\\N', '3_total_fee'] = 0.0 46 | data['2_total_fee'] = data['2_total_fee'].astype('float64') 47 | data['3_total_fee'] = data['3_total_fee'].astype('float64') 48 | data.loc[data['1_total_fee'] > 500.0, '1_total_fee'] = 500.0 49 | data.loc[data['2_total_fee'] > 500.0, '2_total_fee'] = 500.0 50 | 
data.loc[data['3_total_fee'] > 500.0, '3_total_fee'] = 500.0 51 | data.loc[data['4_total_fee'] > 500.0, '4_total_fee'] = 500.0 52 | 53 | data['total_fee'] = 0 54 | data.loc[data['1_total_fee'] < .0, 'total_fee'] = 1 55 | data.loc[data['2_total_fee'] < .0, 'total_fee'] = 1 56 | data.loc[data['3_total_fee'] < .0, 'total_fee'] = 1 57 | data.loc[data['4_total_fee'] < .0, 'total_fee'] = 1 58 | data.loc[data['1_total_fee'] > 499.0, 'total_fee'] = 2 59 | data.loc[data['2_total_fee'] > 499.0, 'total_fee'] = 2 60 | data.loc[data['3_total_fee'] > 499.0, 'total_fee'] = 2 61 | data.loc[data['4_total_fee'] > 499.0, 'total_fee'] = 2 62 | 63 | data['month_traffic_0'] = 0 64 | data.loc[(data['month_traffic'] > 0) & (data['month_traffic'] < 1024), 'month_traffic_0'] = 1 65 | data.loc[data['month_traffic'] == 1024.0, 'month_traffic_0'] = 2 66 | data.loc[data['month_traffic'] > 1024, 'month_traffic_0'] = 3 67 | 68 | data.loc[data['online_time'] > 140, 'online_time'] = 140 69 | 70 | data['pay_ave'] = data['pay_num'] / data['pay_times'] 71 | data.loc[data['pay_times'] > 10, 'pay_times'] = 10 72 | 73 | data['my_traffic'] = data['last_month_traffic'].apply(lambda x: parse_traffic(x)) 74 | 75 | data = data.drop(['local_trafffic_month'], axis=1) 76 | data = data.drop(['last_month_traffic'], axis=1) 77 | data = data.drop(['month_traffic'], axis=1) 78 | 79 | data.loc[data['local_caller_time'] == 0.0, 'local_caller_time'] = 0 80 | data.loc[(data['local_caller_time'] > 0) & (data['local_caller_time'] < 10), 'local_caller_time'] = 1 81 | data.loc[(data['local_caller_time'] >= 10) & (data['local_caller_time'] < 100), 'local_caller_time'] = 2 82 | data.loc[data['local_caller_time'] >= 100, 'local_caller_time'] = 3 83 | 84 | data.loc[data['service1_caller_time'] == 0.0, 'service1_caller_time'] = 0 85 | data.loc[(data['service1_caller_time'] > 0) & (data['service1_caller_time'] < 10), 'service1_caller_time'] = 1 86 | data.loc[(data['service1_caller_time'] >= 10) & (data['service1_caller_time'] < 100), 'service1_caller_time'] = 2 87 | data.loc[data['service1_caller_time'] >= 100, 'service1_caller_time'] = 3 88 | 89 | data.loc[data['service2_caller_time'] == 0.0, 'service2_caller_time'] = 0 90 | data.loc[(data['service2_caller_time'] > 0) & (data['service2_caller_time'] < 10), 'service2_caller_time'] = 1 91 | data.loc[(data['service2_caller_time'] >= 10) & (data['service2_caller_time'] < 100), 'service2_caller_time'] = 2 92 | data.loc[data['service2_caller_time'] >= 100, 'service2_caller_time'] = 3 93 | 94 | data['complaint_num'] = 0 95 | data.loc[data['former_complaint_num'] > 0, 'complaint_num'] = 1 96 | 97 | data['complaint_fee'] = 0 98 | data.loc[data['former_complaint_fee'] > 0, 'complaint_fee'] = 1 99 | 100 | return data 101 | 102 | 103 | def parse_traffic(x): 104 | m = x / 1024.0 105 | if m == 0.0: 106 | return 0 107 | elif m < 1.0: 108 | return 0.5 109 | elif m == 1.0: 110 | return 1 111 | elif m < 2.0: 112 | return 1.5 113 | elif m == 2.0: 114 | return 2 115 | elif m < 3.0: 116 | return 2.5 117 | elif m == 3.0: 118 | return 3 119 | elif m < 4.0: 120 | return 3.5 121 | elif m == 4.0: 122 | return 4 123 | else: 124 | return 5 125 | 126 | 127 | data = pd.read_csv(path + 'train_all.csv') 128 | data = getdata(data) 129 | train_data = data 130 | train_x = train_data.drop(['user_id', 'current_service'], axis=1) 131 | train_y = train_data['current_service'] 132 | 133 | ####### test数据 134 | republish_test_data = pd.read_csv(path + 'republish_test.csv') 135 | republish_test_data = getdata(republish_test_data, f=False) 136 | # 
print('republish_test_data: ', republish_test_data.shape) 137 | 138 | user_id = republish_test_data['user_id'] 139 | republish_test = republish_test_data.drop(['user_id'], axis=1) 140 | 141 | from sklearn.model_selection import train_test_split 142 | 143 | Y_CAT = pd.Categorical(train_y) 144 | X_train, X_test, y_train, y_test = train_test_split(train_x, Y_CAT.codes, test_size=0.05, random_state=666) 145 | 146 | y_test = np.array(y_test) 147 | 148 | 149 | def score(y_pred): 150 | y_pred = [list(x).index(max(x)) for x in y_pred] 151 | count = 0 152 | for i in range(len(y_pred)): 153 | # print(test_y[i:i+1][0]) 154 | if (y_pred[i] == y_test[i:i + 1][0]): 155 | # print(y_pred[i], test_y[i:i + 1][0]) 156 | count += 1 157 | print(count, len(y_pred), count / len(y_pred)) 158 | 159 | 160 | from sklearn.naive_bayes import MultinomialNB 161 | from sklearn.model_selection import cross_val_score 162 | 163 | # clf = MultinomialNB() 164 | # clf.fit(X_train, y_train) 165 | # print("多项式贝叶斯分类器20折交叉验证得分: ", np.mean(cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy'))) 166 | # score(clf.predict(X_test)) 167 | # 168 | from sklearn import svm 169 | 170 | lin_clf = svm.LinearSVC(class_weight='balanced') 171 | lin_clf.fit(X_train, y_train) 172 | print("svm分类器20折交叉验证得分: ", np.mean(cross_val_score(lin_clf, X_train, y_train, cv=5, scoring='accuracy'))) 173 | score(lin_clf.predict(X_test)) 174 | 175 | from sklearn.ensemble import RandomForestClassifier 176 | 177 | lin_forest = RandomForestClassifier(n_estimators=10, random_state=1, class_weight='balanced') 178 | lin_forest.fit(X_train, y_train) 179 | print("RandomForestClassifier分类器20折交叉验证得分: ", 180 | np.mean(cross_val_score(lin_forest, X_train, y_train, cv=5, scoring='accuracy'))) 181 | score(lin_forest.predict(X_test)) 182 | 183 | import xgboost as xgb 184 | 185 | model_xgb = xgb.XGBClassifier(colsample_bytree=0.4603, gamma=0.0468) 186 | model_xgb.fit(X_train, y_train) 187 | print("model_xgb分类器20折交叉验证得分: ", 188 | np.mean(cross_val_score(model_xgb, X_train, y_train, cv=5, scoring='accuracy'))) 189 | score(model_xgb.predict(X_test)) 190 | -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "/Users/liyangyang/PycharmProjects/mypy/deep_learning/venv/datafountain/taocan/tf_model/stock.model" 2 | all_model_checkpoint_paths: "/Users/liyangyang/PycharmProjects/mypy/deep_learning/venv/datafountain/taocan/tf_model/stock.model.max" 3 | all_model_checkpoint_paths: "/Users/liyangyang/PycharmProjects/mypy/deep_learning/venv/datafountain/taocan/tf_model/stock.model" 4 | -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.data-00000-of-00001 -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.index 
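A side note on the hand-rolled score() helper in ml_models.py above: it argmaxes each element (`list(x).index(max(x))`), so it expects rows of per-class scores, while it is fed the label vectors that predict() returns. A minimal sketch of the same held-out check using sklearn.metrics (the names `evaluate` and `model` are illustrative assumptions, not part of the original code):

    from sklearn.metrics import accuracy_score, f1_score

    def evaluate(model, X_test, y_test):
        # predict() already returns class indices, so no argmax step is needed
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        macro_f1 = f1_score(y_test, y_pred, average='macro')
        print('accuracy:', round(acc, 4), 'macro-F1:', round(macro_f1, 4))
        return acc, macro_f1

    # e.g. evaluate(lin_clf, X_test, y_test) after the LinearSVC fit above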
-------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.max.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.max.data-00000-of-00001 -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.max.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.max.index -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.max.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.max.meta -------------------------------------------------------------------------------- /venv/datafountain/taocan/tf_model/stock.model.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/datafountain/taocan/tf_model/stock.model.meta -------------------------------------------------------------------------------- /venv/dc/guangfu/github/README.md: -------------------------------------------------------------------------------- 1 | 基础特征做的baseline 线上1.65左右 2 | 3 | 提分点: 4 | 1、好好做下特征工程 5 | 2、模型融合 6 | 可以参考下这个 https://mp.weixin.qq.com/s/Yix0xVp2SiqaAcuS6Q049g 7 | 8 | 9 | -------------------------------------------------------------------------------- /venv/dc/guangfu/github/baseline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/10/25 上午11:07 4 | # @Author :hwwu 5 | # @File :baseline.py 6 | 7 | import pandas as pd 8 | from sklearn.model_selection import train_test_split 9 | import lightgbm as lgb 10 | from sklearn.preprocessing import PolynomialFeatures 11 | 12 | path = './dc/guangfu/' 13 | 14 | 15 | def get_hour(x): 16 | h = int(x[11:13]) 17 | m = int(x[14:16]) 18 | if m in [14, 29, 44]: 19 | m += 1 20 | if m == 59: 21 | m = 0 22 | h += 1 23 | if h == 24: 24 | h = 0 25 | return h * 60 + m 26 | 27 | 28 | def add_poly_features(data, column_names): 29 | features = data[column_names] 30 | rest_features = data.drop(column_names, axis=1) 31 | poly_transformer = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False) 32 | poly_features = pd.DataFrame(poly_transformer.fit_transform(features), 33 | columns=poly_transformer.get_feature_names(column_names)) 34 | 35 | for col in poly_features.columns: 36 | rest_features.insert(1, col, poly_features[col]) 37 | return rest_features 38 | 39 | 40 | train_x_old = pd.read_csv(path + 'train_1.csv') 41 | test = pd.read_csv(path + 'test_1.csv') 42 | train_x_old['month'] = train_x_old['时间'].apply(lambda x: x[5:7]).astype('int32') 43 | train_x_old['day'] = train_x_old['时间'].apply(lambda x: x[8:10]).astype('int32') 44 | train_x_old['hour'] = train_x_old['时间'].apply(lambda x: get_hour(x)).astype('int32') 45 | test['month'] = test['时间'].apply(lambda x: x[5:7]).astype('int32') 46 | test['day'] = 
test['时间'].apply(lambda x: x[8:10]).astype('int32') 47 | test['hour'] = test['时间'].apply(lambda x: get_hour(x)).astype('int32') 48 | 49 | train_y = train_x_old['实际功率'] 50 | train_x = train_x_old.drop(['实发辐照度', '实际功率'], axis=1) 51 | train_x['dis2peak'] = train_x['hour'].apply(lambda x: (810 - abs(810 - x)) / 810) 52 | train_x = add_poly_features(train_x, ['风速', '风向']) 53 | train_x = add_poly_features(train_x, ['温度', '压强', '湿度']) 54 | 55 | id = test['id'] 56 | del_id = test[test['辐照度'].isin([-1.0])]['id'] 57 | test = test.drop(['id'], axis=1) 58 | test['dis2peak'] = test['hour'].apply(lambda x: (810 - abs(810 - x)) / 810) 59 | test = add_poly_features(test, ['风速', '风向']) 60 | test = add_poly_features(test, ['温度', '压强', '湿度']) 61 | 62 | train_x = train_x.drop(['时间'], axis=1) 63 | test = test.drop(['时间'], axis=1) 64 | print('train_x.shape,test_1.shape : ', train_x.shape, test.shape) 65 | 66 | X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.1, random_state=678) 67 | 68 | params = { 69 | "objective": "regression", 70 | "metric": "mse", 71 | "num_leaves": 30, 72 | "min_child_samples": 100, 73 | "learning_rate": 0.03, 74 | "bagging_fraction": 0.7, 75 | "feature_fraction": 0.5, 76 | "bagging_frequency": 5, 77 | "bagging_seed": 666, 78 | "verbosity": -1 79 | } 80 | 81 | 82 | def lgb_train(): 83 | lgb_train = lgb.Dataset(X_train, label=y_train) 84 | lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) 85 | print('begin train') 86 | gbm = lgb.train(params, 87 | lgb_train, 88 | num_boost_round=50000, 89 | valid_sets=lgb_eval, 90 | early_stopping_rounds=100, 91 | verbose_eval=100) 92 | # y_pred = gbm.predict(X_test) 93 | ##write result 94 | republish_pred = gbm.predict(test) 95 | republish_pred = pd.DataFrame(republish_pred) 96 | sub = pd.concat([id, republish_pred], axis=1) 97 | print(sub.shape) 98 | sub.columns = ['id', 'predicition'] 99 | sub.loc[sub['id'].isin(del_id), 'predicition'] = 0.0 100 | sub.to_csv(path + '/baseline1.csv', index=False, sep=',', encoding='UTF-8') 101 | 102 | 103 | lgb_train() 104 | -------------------------------------------------------------------------------- /venv/deep_learning/yucemoxing/PricePredictor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/6/21 下午1:27 4 | # @Author :hwwu 5 | # @File :PricePredictor.py 6 | 7 | import codecs 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | class PricePredictor: 13 | # lstm param 14 | timeStep = 20 15 | hiddenUnitSize = 10 #隐藏层神经元数量 16 | batchSize = 60 #每一批次训练多少个样例 17 | inputSize = 1 #输入维度 18 | outputSize=1 #输出维度 19 | lr = 0.0006 #学习率 20 | train_x, train_y = [],[] #训练数据集 21 | sortedChargeList = [] #排序的训练数据集 22 | normalizeData = [] #归一化的数据 23 | dataFile = '/Users/liyangyang/PycharmProjects/mypy/venv/deep_learning/yucemoxing/chargeInfo.txt' 24 | date2Price = {} #日期-每平米的价格映射 25 | chargeList = [] #交易价格 26 | date2Charge = {} #日期-交易价格映射 27 | meanPrice = 0 #均价 28 | stdPrice = 0 29 | X = tf.placeholder(tf.float32, [None, timeStep, inputSize]) 30 | Y = tf.placeholder(tf.float32, [None, timeStep, outputSize]) 31 | weights = { 32 | 'in': tf.Variable(tf.random_normal([inputSize, hiddenUnitSize])), 33 | 'out': tf.Variable(tf.random_normal([hiddenUnitSize, 1])) 34 | } 35 | 36 | biases = { 37 | 'in': tf.Variable(tf.constant(0.1, shape=[hiddenUnitSize, ])), 38 | 'out': tf.Variable(tf.constant(0.1, shape=[1, ])) 39 | } 40 | 41 | def loadData(self): 42 | 
fp = codecs.open(self.dataFile, 'r', 'utf-8') 43 | line = fp.readline() 44 | 45 | # parse line to data 46 | while line: 47 | line = fp.readline() 48 | data = line.split(" ") 49 | if len(data) < 7: 50 | continue 51 | area = float(data[5].replace("平米", "")) 52 | price = float(data[2]) 53 | pricePerSquare = price / area 54 | charge = [str(data[1]), data[6].replace('\n', ''), data[3], pricePerSquare] 55 | self.chargeList.append(charge) 56 | self.date2Charge[str(data[1])] = charge # date: {name:price} 57 | self.date2Price[str(data[1])] = pricePerSquare 58 | 59 | self.sortedChargeList = sorted(self.chargeList, key=predictor.getKey, reverse=False) 60 | 61 | def getKey(self, item): 62 | return item[1] 63 | 64 | # 构造数据 65 | def buildTrainDataSet(self): 66 | data = [] 67 | for price in self.sortedChargeList: 68 | data.append(price[3]) 69 | 70 | self.meanPrice = np.mean(data); 71 | self.stdPrice = np.std(data) 72 | self.normalizeData = (data - self.meanPrice) / self.stdPrice #标准化 73 | 74 | self.normalizeData = self.normalizeData[:,np.newaxis] #增加维度 75 | for i in range(len(self.normalizeData)-self.timeStep-1): 76 | x=self.normalizeData[i:i+self.timeStep] 77 | y=self.normalizeData[i+1:i+self.timeStep+1] 78 | self.train_x.append(x.tolist()) 79 | self.train_y.append(y.tolist()) 80 | 81 | # lstm算法定义 82 | def lstm(self, batchSize = None): 83 | if batchSize is None : 84 | batchSize = self.batchSize 85 | weightIn = self.weights['in'] 86 | biasesIn = self.biases['in'] 87 | input = tf.reshape(self.X, [-1,self.inputSize]) 88 | inputRnn=tf.matmul(input,weightIn)+biasesIn 89 | inputRnn=tf.reshape(inputRnn,[-1,self.timeStep,self.hiddenUnitSize]) #将tensor转成3维,作为lstm cell的输入 90 | cell=tf.nn.rnn_cell.BasicLSTMCell(self.hiddenUnitSize, reuse=True) 91 | initState=cell.zero_state(batchSize,dtype=tf.float32) 92 | output_rnn,final_states=tf.nn.dynamic_rnn(cell, inputRnn,initial_state=initState, dtype=tf.float32) #output_rnn是记录lstm每个输出节点的结果,final_states是最后一个cell的结果 93 | output=tf.reshape(output_rnn,[-1,self.hiddenUnitSize]) #作为输出层的输入 94 | w_out=self.weights['out'] 95 | b_out=self.biases['out'] 96 | pred=tf.matmul(output,w_out)+b_out 97 | return pred,final_states 98 | 99 | # 训练模型 100 | def trainLstm(self) : 101 | pred,_ = self.lstm() 102 | #定义损失函数 103 | loss = tf.reduce_mean(tf.square(tf.reshape(pred, [-1]) - tf.reshape(self.Y, [-1]))) 104 | #定义训练模型 105 | train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) 106 | saver = tf.train.Saver(tf.global_variables()) 107 | with tf.Session() as sess: 108 | sess.run(tf.global_variables_initializer()) 109 | # 重复训练100次,训练是一个耗时的过程 110 | for i in range(100): 111 | step = 0 112 | start = 0 113 | end = start + self.batchSize 114 | while end < len(self.train_x): 115 | _, loss_ = sess.run([train_op, loss], feed_dict={self.X: self.train_x[start:end], self.Y: self.train_y[start:end]}) 116 | start += self.batchSize 117 | end = start + self.batchSize 118 | # 每10步保存一次参数 119 | if step % 10 == 0: 120 | print(i, step, loss_) 121 | print("保存模型:", saver.save(sess, '/Users/liyangyang/PycharmProjects/mypy/venv/deep_learning/yucemoxing/model/stock.model')) 122 | step += 1 123 | 124 | 125 | def prediction(self): 126 | pred, _ = self.lstm(1) # 预测时只输入[1,time_step,inputSize]的测试数据 127 | saver = tf.train.Saver(tf.global_variables()) 128 | with tf.Session() as sess: 129 | # 参数恢复 130 | module_file = tf.train.latest_checkpoint('/Users/liyangyang/PycharmProjects/mypy/venv/deep_learning/yucemoxing/model/') 131 | saver.restore(sess, module_file) 132 | # 取训练集最后一行为测试样本. 
shape=[1,time_step,inputSize] 133 | prev_seq = self.train_x[-1] 134 | predict = [] 135 | # 得到之后100个预测结果 136 | for i in range(100): 137 | next_seq = sess.run(pred, feed_dict={self.X: [prev_seq]}) 138 | predict.append(next_seq[-1]) 139 | # 每次得到最后一个时间步的预测结果,与之前的数据加在一起,形成新的测试样本 140 | prev_seq = np.vstack((prev_seq[1:], next_seq[-1])) 141 | # 以折线图表示结果 142 | plt.figure() 143 | true_price = self.stdPrice**predict 144 | true_price = [price + self.meanPrice for price in true_price] 145 | plt.plot(list(range(len(self.normalizeData), len(self.normalizeData) + len(predict))), true_price, color='r') 146 | plt.show() 147 | 148 | predictor = PricePredictor() 149 | predictor.loadData() 150 | 151 | # print('sortedChargeList:') 152 | # print(predictor.sortedChargeList) 153 | # print('chargeList') 154 | # print(predictor.chargeList) 155 | # print('date2Charge') 156 | # print(predictor.date2Charge) 157 | # print('date2Price') 158 | # print(predictor.date2Price) 159 | 160 | # 构建训练数据 161 | predictor.buildTrainDataSet() 162 | # print(predictor.train_x[0:10]) 163 | # print(predictor.train_y[0:10]) 164 | 165 | # # 模型训练 166 | # predictor.trainLstm() 167 | # 168 | # # 预测-预测前需要先完成模型训练 169 | predictor.prediction() 170 | -------------------------------------------------------------------------------- /venv/dwb/baseline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/20 上午11:28 4 | # @Author :hwwu 5 | # @File :baseline.py 6 | import pandas as pd, numpy as np 7 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 8 | from sklearn import svm 9 | from sklearn.ensemble import RandomForestClassifier 10 | from tensorflow.contrib import learn 11 | 12 | path = '/Users/liyangyang/Downloads/dwb/new_data/' 13 | # column = "word_seg" 14 | # train = pd.read_csv(path+'train_set.csv') 15 | test = pd.read_csv(path+'test_set.csv') 16 | test_id = test["id"].copy() 17 | vec = TfidfVectorizer(ngram_range=(3,4),min_df=3, max_df=0.9,use_idf=1,smooth_idf=1, sublinear_tf=1) 18 | # 19 | # train = np.array(train[column]) 20 | # test = np.array(test[column]) 21 | # 22 | # vocab_processor_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn/vocab-5000/' 23 | # vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_processor_path) 24 | # train = np.array(list(vocab_processor.transform(train))) 25 | # test = np.array(list(vocab_processor.transform(test))) 26 | # # 27 | # # train = pd.DataFrame(train) 28 | # # test = pd.DataFrame(test) 29 | # 30 | # np.save(path+'vocab/vocab_train',train) 31 | # np.save(path+'vocab/vocab_test',test) 32 | 33 | 34 | # t1 = np.load(path+'vocab/vocab_train.npy') 35 | # t2 = np.load(path+'vocab/vocab_test.npy') 36 | # 37 | # train=[] 38 | # test=[] 39 | # for i in range(len(t1)): 40 | # row = str(t1[i][0]) 41 | # for j in range(1,len(t1[i])): 42 | # s = str(t1[i][j]) 43 | # if (s!='0'): 44 | # row = row + '\t' + s 45 | # train.append(row) 46 | # print(train[0]) 47 | # 48 | # for i in range(len(t2)): 49 | # row = str(t2[i][0]) 50 | # for j in range(1,len(t2[i])): 51 | # s = str(t2[i][j]) 52 | # if (s != '0'): 53 | # row = row + '\t' + s 54 | # test.append(row) 55 | # print(test[0]) 56 | # 57 | # train = np.array(train) 58 | # test = np.array(test) 59 | # print(train.shape) 60 | # print(test.shape) 61 | # 62 | # np.save(path+'vocab/vocab_train_1',train) 63 | # np.save(path+'vocab/vocab_test_1',test) 64 | 65 | t1 = np.load(path+'vocab/vocab_train_1.npy') 66 | train 
= np.array(t1) 67 | 68 | print('start tf-idf fit') 69 | trn_term_doc = vec.fit_transform(train) 70 | np.savetxt(path+'tf-idf/train_data',trn_term_doc) 71 | 72 | print('start tf-idf transform') 73 | t2 = np.load(path+'vocab/vocab_test_1.npy') 74 | test = np.array(t2) 75 | test_term_doc = vec.transform(test) 76 | print('tf-idf transform done') 77 | 78 | fid0=open(path+'baseline_time.csv','w') 79 | np.savetxt(path+'tf-idf/test_data',test_term_doc) 80 | print('save data done') 81 | 82 | y=(train["class"]-1).astype(int) 83 | print('start fit') 84 | lin_clf = svm.LinearSVC() 85 | lin_clf.fit(trn_term_doc[:80000],y[:80000]) 86 | # preds = lin_clf.predict(test_term_doc) 87 | # lin_forest = RandomForestClassifier(n_estimators=100, random_state=1) 88 | # lin_forest.fit(trn_term_doc,y) 89 | print('fit done') 90 | print('start predict') 91 | pred = lin_clf.score(trn_term_doc[80000:],y[80000:]) 92 | print('predict done') 93 | print(pred) 94 | 95 | preds = lin_clf.predict(test_term_doc) 96 | i=0 97 | fid0.write("id,class"+"\n") 98 | for item in preds: 99 | fid0.write(str(i)+","+str(item+1)+"\n") 100 | i=i+1 101 | fid0.close() 102 | 103 | -------------------------------------------------------------------------------- /venv/dwb/fasttext/__pycache__/fasttext.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/fasttext/__pycache__/fasttext.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/fasttext/fasttext.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/22 下午1:37 4 | # @Author :hwwu 5 | # @File :fasttext.py 6 | 7 | import pandas as pd, numpy as np 8 | import fastText 9 | 10 | path = '/Users/liyangyang/Downloads/dwb/new_data/' 11 | # column = "word_seg" 12 | column = "article" 13 | 14 | 15 | def write_train_data(): 16 | train = pd.read_csv(path + 'train_set.csv') 17 | f = open(path + 't_train_set.txt', 'a') 18 | for i in range(80000): 19 | row = str(train[column][i]) + '\t' + '__myprefix__' + str(train['class'][i]) 20 | f.write(row + '\n') 21 | f.close() 22 | f1 = open(path + 't_test_set.txt', 'a') 23 | for i in range(80000, len(train)): 24 | row = str(train[column][i]) + '\t' + '__myprefix__' + str(train['class'][i]) 25 | f1.write(row + '\n') 26 | f1.close() 27 | # f2 = open(path + 'test_set1.txt', 'a') 28 | # for i in range(100000, len(train)): 29 | # row = str(train[column][i]) 30 | # f2.write(row + '\n') 31 | # f2.close() 32 | 33 | 34 | # write_train_data() 35 | 36 | model_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/fasttext/' 37 | 38 | 39 | def model(): 40 | # model = fastText.train_supervised(path + 'train_set.txt', label='__myprefix__',bucket=400000 41 | # ,wordNgrams=2,minCount=3,lr=1,lrUpdateRate=0) 42 | model = fastText.train_supervised(path + 't_train_set.txt', label='__myprefix__', bucket=39759 43 | , wordNgrams=3, minCount=3, lr=1, lrUpdateRate=200 44 | ,dim=128) 45 | result = model.test(path + 't_test_set.txt') 46 | print(result) 47 | # model.save_model(model_path + 'model') 48 | 49 | true_labels = [] 50 | all_words = [] 51 | f = open(path + 't_test_set.txt', 'r') 52 | for line in f: 53 | words, labels = model.get_line(line.strip()) 54 | if len(labels) == 0: 55 | continue 56 | all_words.append(" ".join(words)) 57 | true_labels += [labels] 58 | predictions, _ = 
model.predict(all_words) 59 | 60 | n = 0 61 | for i in range(len(true_labels)): 62 | if (predictions[i]==true_labels[i]): 63 | n+=1 64 | print(n/len(true_labels)) 65 | 66 | # model = fastText.load_model(model_path + 'model') 67 | # id, all_words = get_test_words(model) 68 | # print('start predict data') 69 | # predictions, _ = model.predict(all_words) 70 | # print('predict data done') 71 | # write_result(id, predictions) 72 | 73 | model() 74 | 75 | def get_test_words(model): 76 | all_words = [] 77 | id = [] 78 | print('start read test set data') 79 | test = pd.read_csv(path + 'test_set.csv') 80 | for i in range(len(test)): 81 | words, _ = model.get_line(test[column][i].strip()) 82 | all_words.append(" ".join(words)) 83 | id.append(test['id'][i]) 84 | print('read test set data done') 85 | return id, all_words 86 | 87 | 88 | def write_result(id, predictions): 89 | r_id = [] 90 | r_predictions = [] 91 | for i in range(len(id)): 92 | r_id.append(int(id[i])) 93 | # price.append(round(x1[i][1],1)) 94 | r_predictions.append(int(tostr(predictions[i]))) 95 | 96 | english_column = pd.Series(r_id, name='id') 97 | number_column = pd.Series(r_predictions, name='class') 98 | predictions = pd.concat([english_column, number_column], axis=1) 99 | # another way to handle 100 | # save = pd.DataFrame({'user_id': user_id, 'prediction_pay_price': price}) 101 | predictions.to_csv(path + 'result_data.csv', index=0, sep=',', columns=['id', 'class']) 102 | 103 | 104 | def tostr(s): 105 | s = str(s).replace('__myprefix__', '') 106 | s = s.replace('[', '') 107 | s = s.replace(']', '') 108 | s = s.replace('\'', '') 109 | return s 110 | 111 | # model() 112 | -------------------------------------------------------------------------------- /venv/dwb/github_model/a01_FastText/__pycache__/p5_fastTextB_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a01_FastText/__pycache__/p5_fastTextB_model.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/github_model/a01_FastText/p5_fastTextB_model.py: -------------------------------------------------------------------------------- 1 | # fast text. using: very simple model;n-gram to captrue location information;h-softmax to speed up training/inference 2 | # for the n-gram you can use data_util to generate. 
see method process_one_sentence_to_get_ui_bi_tri_gram under aa1_data_util/data_util_zhihu.py 3 | print("started...") 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | class fastTextB: 8 | def __init__(self, label_size, learning_rate, batch_size, decay_steps, decay_rate,num_sampled,sentence_len,vocab_size,embed_size,is_training): 9 | """init all hyperparameter here""" 10 | # set hyperparamter 11 | self.label_size = label_size 12 | self.batch_size = batch_size 13 | self.num_sampled = num_sampled 14 | self.sentence_len=sentence_len 15 | self.vocab_size=vocab_size 16 | self.embed_size=embed_size 17 | self.is_training=is_training 18 | self.learning_rate=learning_rate 19 | 20 | # add placeholder (X,label) 21 | self.sentence = tf.placeholder(tf.int32, [None, self.sentence_len], name="sentence") # X 22 | self.labels = tf.placeholder(tf.int32, [None], name="Labels") # y 23 | 24 | self.global_step = tf.Variable(0, trainable=False, name="Global_Step") 25 | self.epoch_step=tf.Variable(0,trainable=False,name="Epoch_Step") 26 | self.epoch_increment=tf.assign(self.epoch_step,tf.add(self.epoch_step,tf.constant(1))) 27 | self.decay_steps, self.decay_rate = decay_steps, decay_rate 28 | 29 | self.epoch_step = tf.Variable(0, trainable=False, name="Epoch_Step") 30 | self.instantiate_weights() 31 | self.logits = self.inference() #[None, self.label_size] 32 | if not is_training: 33 | return 34 | self.loss_val = self.loss() 35 | self.train_op = self.train() 36 | self.predictions = tf.argmax(self.logits, axis=1, name="predictions") # shape:[None,] 37 | correct_prediction = tf.equal(tf.cast(self.predictions,tf.int32), self.labels) #tf.argmax(self.logits, 1)-->[batch_size] 38 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="Accuracy") # shape=() 39 | 40 | def instantiate_weights(self): 41 | """define all weights here""" 42 | # embedding matrix 43 | self.Embedding = tf.get_variable("Embedding", [self.vocab_size, self.embed_size]) 44 | self.W = tf.get_variable("W", [self.embed_size, self.label_size]) 45 | self.b = tf.get_variable("b", [self.label_size]) 46 | 47 | def inference(self): 48 | """main computation graph here: 1.embedding-->2.average-->3.linear classifier""" 49 | # 1.get emebedding of words in the sentence 50 | sentence_embeddings = tf.nn.embedding_lookup(self.Embedding,self.sentence) # [None,self.sentence_len,self.embed_size] 51 | 52 | sentence_embeddings = tf.nn.dropout(sentence_embeddings,keep_prob=0.8) 53 | # 2.average vectors, to get representation of the sentence 54 | self.sentence_embeddings = tf.reduce_mean(sentence_embeddings, axis=1) # [None,self.embed_size] 55 | 56 | # 3.linear classifier layer 57 | logits = tf.matmul(self.sentence_embeddings, self.W) + self.b #[None, self.label_size]==tf.matmul([None,self.embed_size],[self.embed_size,self.label_size]) 58 | return logits 59 | 60 | def loss(self,l2_lambda=0.01): #0.0001-->0.001 61 | """calculate loss using (NCE)cross entropy here""" 62 | # Compute the average NCE loss for the batch. 63 | # tf.nce_loss automatically draws a new sample of the negative labels each 64 | # time we evaluate the loss. 65 | if not self.is_training: #training 66 | labels=tf.reshape(self.labels,[-1]) #[batch_size,1]------>[batch_size,] 67 | labels=tf.expand_dims(labels,1) #[batch_size,]----->[batch_size,1] 68 | loss = tf.reduce_mean( #inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 69 | tf.nn.nce_loss(weights=tf.transpose(self.W), #[embed_size, label_size]--->[label_size,embed_size]. 
nce_weights:A `Tensor` of shape `[num_classes, dim].O.K. 70 | biases=self.b, #[label_size]. nce_biases:A `Tensor` of shape `[num_classes]`. 71 | labels=labels, #[batch_size,1]. train_labels, # A `Tensor` of type `int64` and shape `[batch_size,num_true]`. The target classes. 72 | inputs=self.sentence_embeddings,# [None,self.embed_size] #A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 73 | num_sampled=self.num_sampled, #scalar. 100 74 | num_classes=self.label_size,partition_strategy="div")) #scalar. 1999 75 | else:#eval/inference 76 | #logits = tf.matmul(self.sentence_embeddings, tf.transpose(self.W)) #matmul([None,self.embed_size])---> 77 | #logits = tf.nn.bias_add(logits, self.b) 78 | labels_one_hot = tf.one_hot(self.labels, self.label_size) #[batch_size]---->[batch_size,label_size] 79 | #sigmoid_cross_entropy_with_logits:Computes sigmoid cross entropy given `logits`.Measures the probability error in discrete classification tasks in which each class is independent and not mutually exclusive. For instance, one could perform multilabel classification where a picture can contain both an elephant and a dog at the same time. 80 | # labels = tf.expand_dims(self.labels, 1) 81 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_one_hot,logits=self.logits) #labels:[batch_size,label_size];logits:[batch, label_size] 82 | print("loss0:", loss) #shape=(?, 1999) 83 | loss = tf.reduce_mean(loss) 84 | print("loss1:",loss) #shape=(?,) 85 | l2_losses = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name]) * l2_lambda 86 | return loss+l2_losses 87 | 88 | def train(self): 89 | """based on the loss, use SGD to update parameter""" 90 | learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps,self.decay_rate, staircase=True) 91 | train_op = tf.contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step,learning_rate=learning_rate, optimizer="adam") 92 | 93 | return train_op 94 | 95 | #test started 96 | def test(): 97 | #below is a function test; if you use this for text classifiction, you need to tranform sentence to indices of vocabulary first. then feed data to the graph. 98 | num_classes=19 99 | learning_rate=0.01 100 | batch_size=8 101 | decay_steps=1000 102 | decay_rate=0.9 103 | sequence_length=5 104 | vocab_size=10000 105 | embed_size=100 106 | is_training=True 107 | dropout_keep_prob=1 108 | fastText=fastTextB(num_classes, learning_rate, batch_size, decay_steps, decay_rate,5,sequence_length,vocab_size,embed_size,is_training) 109 | with tf.Session() as sess: 110 | sess.run(tf.global_variables_initializer()) 111 | for i in range(100): 112 | input_x=np.zeros((batch_size,sequence_length),dtype=np.int32) #[None, self.sequence_length] 113 | input_y=input_y=np.array([1,0,1,1,1,2,1,1],dtype=np.int32) #np.zeros((batch_size),dtype=np.int32) #[None, self.sequence_length] 114 | loss,acc,predict,_=sess.run([fastText.loss_val,fastText.accuracy,fastText.predictions,fastText.train_op], 115 | feed_dict={fastText.sentence:input_x,fastText.labels:input_y}) 116 | print("loss:",loss,"acc:",acc,"label:",input_y,"prediction:",predict) 117 | #test() 118 | print("ended...") 119 | -------------------------------------------------------------------------------- /venv/dwb/github_model/a01_FastText/p5_fastTextB_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 
3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.predict 4 | try: 5 | reload # Python 2 6 | except NameError: 7 | from importlib import reload # Python 3 8 | import sys 9 | reload(sys) 10 | sys.setdefaultencoding('utf8') 11 | import tensorflow as tf 12 | import numpy as np 13 | from p5_fastTextB_model import fastTextB as fastText 14 | # from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 15 | from tflearn.data_utils import to_categorical, pad_sequences 16 | import os 17 | import codecs 18 | 19 | #configuration 20 | FLAGS=tf.app.flags.FLAGS 21 | tf.app.flags.DEFINE_integer("label_size",19,"number of label") 22 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 23 | tf.app.flags.DEFINE_integer("batch_size", 512, "Batch size for training/evaluating.") #批处理的大小 32-->128 24 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 25 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 26 | tf.app.flags.DEFINE_integer("num_sampled",100,"number of noise sampling") 27 | tf.app.flags.DEFINE_string("ckpt_dir","fast_text_checkpoint/","checkpoint location for the model") 28 | tf.app.flags.DEFINE_integer("sentence_len",300,"max sentence length") 29 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 30 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 31 | tf.app.flags.DEFINE_integer("num_epochs",15,"embedding size") 32 | tf.app.flags.DEFINE_integer("validate_every", 3, "Validate every validate_every epochs.") #每10轮做一次验证 33 | tf.app.flags.DEFINE_string("predict_target_file","fast_text_checkpoint/zhihu_result_ftB2.csv","target file path for final prediction") 34 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-v4only-title.txt',"target file path for final prediction") 35 | 36 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.training (5.validation) ,(6.prediction) 37 | def main(_): 38 | # 1.load data with vocabulary of words and labels 39 | vocabulary_word2index, vocabulary_index2word = create_voabulary() 40 | vocab_size = len(vocabulary_word2index) 41 | vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label() 42 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 43 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 44 | testX=[] 45 | question_id_list=[] 46 | for tuple in test: 47 | question_id,question_string_list=tuple 48 | question_id_list.append(question_id) 49 | testX.append(question_string_list) 50 | 51 | # 2.Data preprocessing: Sequence padding 52 | print("start padding....") 53 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 54 | print("end padding...") 55 | 56 | # 3.create session. 
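# --- illustrative aside (not original code): the padding step above brings every
# id list to sentence_len, filling with `value` (tflearn's pad_sequences defaults
# to padding='post', so shorter lists are padded at the end). A tiny example:
#
#   from tflearn.data_utils import pad_sequences
#   pad_sequences([[3, 7, 2], [5]], maxlen=4, value=0.)
#   # -> [[3 7 2 0]
#   #     [5 0 0 0]]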
57 | config=tf.ConfigProto() 58 | config.gpu_options.allow_growth=True 59 | with tf.Session(config=config) as sess: 60 | # 4.Instantiate Model 61 | fast_text=fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.num_sampled,FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 62 | saver=tf.train.Saver() 63 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 64 | print("Restoring Variables from Checkpoint") 65 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 66 | else: 67 | print("Can't find the checkpoint.going to stop") 68 | return 69 | # 5.feed data, to get logits 70 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 71 | batch_size=1 72 | index=0 73 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 74 | for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data+1, batch_size)): 75 | logits=sess.run(fast_text.logits,feed_dict={fast_text.sentence:testX2[start:end]}) #'shape of logits:', ( 1, 1999) 76 | # 6. get lable using logtis 77 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 78 | # 7. write question id and labels to file system. 79 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 80 | index=index+1 81 | predict_target_file_f.close() 82 | 83 | # get label using logits 84 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 85 | # test 86 | #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=np.argsort(logits)[-top_number:] 88 | index_list=index_list[::-1] 89 | label_list=[] 90 | for index in index_list: 91 | label=vocabulary_index2word_label[index] 92 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 93 | return label_list 94 | 95 | # write question id and labels to file system. 96 | def write_question_id_with_labels(question_id,labels_list,f): 97 | labels_string=",".join(labels_list) 98 | f.write(question_id+","+labels_string+"\n") 99 | 100 | if __name__ == "__main__": 101 | tf.app.run() 102 | -------------------------------------------------------------------------------- /venv/dwb/github_model/a01_FastText/p5_fastTextB_predict_multilabel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | from p5_fastTextB_model import fastTextB as fastText 10 | # from p4_zhihu_load_data import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import to_categorical, pad_sequences 12 | import os 13 | import codecs 14 | 15 | #configuration 16 | FLAGS=tf.app.flags.FLAGS 17 | tf.app.flags.DEFINE_integer("label_size",19,"number of label") 18 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 19 | tf.app.flags.DEFINE_integer("batch_size", 512, "Batch size for training/evaluating.") #批处理的大小 32-->128 20 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 22 | tf.app.flags.DEFINE_integer("num_sampled",100,"number of noise sampling") 23 | tf.app.flags.DEFINE_string("ckpt_dir","fast_text_checkpoint_multi/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",300,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"embedding size") 28 | tf.app.flags.DEFINE_integer("validate_every", 10, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","fast_text_checkpoint_multi/zhihu_result_ftB_multilabel.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-v4only-title.txt',"target file path for final prediction") 31 | 32 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.training (5.validation) ,(6.prediction) 33 | def main(_): 34 | # 1.load data with vocabulary of words and labels 35 | vocabulary_word2index, vocabulary_index2word = create_voabulary() 36 | vocab_size = len(vocabulary_word2index) 37 | print("vocab_size:",vocab_size) 38 | #iii=0 39 | #iii/0 40 | vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label() 41 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) #TODO 42 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) #TODO 43 | testX=[] 44 | question_id_list=[] 45 | for tuple in test: 46 | question_id,question_string_list=tuple 47 | question_id_list.append(question_id) 48 | testX.append(question_string_list) 49 | 50 | # 2.Data preprocessing: Sequence padding 51 | print("start padding....") 52 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 53 | print("end padding...") 54 | 55 | # 3.create session. 
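# --- illustrative aside (not original code): the feed loop below pairs batch
# offsets with zip(range(0, n, bs), range(bs, n + 1, bs)); a tiny check of the
# idiom:
#
#   list(zip(range(0, 5, 2), range(2, 6, 2)))   # -> [(0, 2), (2, 4)]
#
# a trailing partial batch is dropped unless n is a multiple of bs; with the
# batch_size=1 used in this script every test row is still covered.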
56 | config=tf.ConfigProto() 57 | config.gpu_options.allow_growth=True 58 | with tf.Session(config=config) as sess: 59 | # 4.Instantiate Model 60 | fast_text=fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.num_sampled,FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 61 | saver=tf.train.Saver() 62 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 63 | print("Restoring Variables from Checkpoint") 64 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 65 | else: 66 | print("Can't find the checkpoint.going to stop") 67 | return 68 | # 5.feed data, to get logits 69 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 70 | batch_size=1 71 | index=0 72 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 73 | for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data+1, batch_size)): 74 | logits=sess.run(fast_text.logits,feed_dict={fast_text.sentence:testX2[start:end]}) #'shape of logits:', ( 1, 1999) 75 | # 6. get lable using logtis 76 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 77 | # 7. write question id and labels to file system. 78 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 79 | index=index+1 80 | predict_target_file_f.close() 81 | 82 | # get label using logits 83 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 84 | index_list=np.argsort(logits)[-top_number:] 85 | index_list=index_list[::-1] 86 | label_list=[] 87 | for index in index_list: 88 | label=vocabulary_index2word_label[index] 89 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 90 | return label_list 91 | 92 | # write question id and labels to file system. 93 | def write_question_id_with_labels(question_id,labels_list,f): 94 | labels_string=",".join(labels_list) 95 | f.write(question_id+","+labels_string+"\n") 96 | 97 | if __name__ == "__main__": 98 | tf.app.run() -------------------------------------------------------------------------------- /venv/dwb/github_model/a01_FastText/p6_fastTextB_model_multilabel.py: -------------------------------------------------------------------------------- 1 | # autor:xul 2 | # fast text. 
using: very simple model;n-gram to capture location information;h-softmax to speed up training/inference 3 | print("started...") 4 | import tensorflow as tf 5 | 6 | class fastTextB: 7 | def __init__(self, label_size, learning_rate, batch_size, decay_steps, decay_rate,num_sampled,sentence_len,vocab_size,embed_size,is_training,max_label_per_example=5): 8 | """init all hyperparameters here""" 9 | # 1.set hyper-parameters 10 | self.label_size = label_size #e.g.1999 11 | self.batch_size = batch_size 12 | self.num_sampled = num_sampled 13 | self.sentence_len=sentence_len 14 | self.vocab_size=vocab_size 15 | self.embed_size=embed_size 16 | self.is_training=is_training 17 | self.learning_rate=learning_rate 18 | self.max_label_per_example=max_label_per_example 19 | 20 | # 2.add placeholder (X,label) 21 | self.sentence = tf.placeholder(tf.int32, [None, self.sentence_len], name="sentence") #X 22 | self.labels = tf.placeholder(tf.int64, [None,self.max_label_per_example], name="Labels") #y [1,2,3,3,3] 23 | self.labels_l1999=tf.placeholder(tf.int64,[None,self.label_size]) 24 | #3.set some variables 25 | self.global_step = tf.Variable(0, trainable=False, name="Global_Step") 26 | self.epoch_step=tf.Variable(0, trainable=False,name="Epoch_Step") 27 | self.epoch_increment=tf.assign(self.epoch_step,tf.add(self.epoch_step,tf.constant(1))) 28 | self.decay_steps, self.decay_rate = decay_steps, decay_rate 29 | # self.epoch_step = tf.Variable(0, trainable=False, name="Epoch_Step") # duplicate of the Epoch_Step variable created above; re-creating it would detach epoch_increment from self.epoch_step 30 | 31 | #4.init weights 32 | self.instantiate_weights() 33 | #5.main graph: inference 34 | self.logits = self.inference() #[None, self.label_size] 35 | #6.calculate loss 36 | self.loss_val = self.loss() 37 | #7.start training by updating parameters according to the loss 38 | self.train_op = self.train() 39 | 40 | #8.calculate accuracy 41 | # correct_prediction = tf.equal(tf.argmax(self.logits, 1), self.labels) #2.TODO tf.argmax(self.logits, 1)-->[batch_size] 42 | # self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="Accuracy") #TODO 43 | 44 | def instantiate_weights(self): 45 | """define all weights here""" 46 | # embedding matrix 47 | self.Embedding = tf.get_variable("Embedding", [self.vocab_size, self.embed_size]) 48 | self.W = tf.get_variable("W", [self.embed_size, self.label_size]) 49 | self.b = tf.get_variable("b", [self.label_size]) 50 | 51 | def inference(self): 52 | """main computation graph here: 1.embedding-->2.average-->3.linear classifier""" 53 | # 1.get embedding of words in the sentence 54 | sentence_embeddings = tf.nn.embedding_lookup(self.Embedding,self.sentence) # [None,self.sentence_len,self.embed_size] 55 | 56 | # 2.average vectors, to get representation of the sentence 57 | self.sentence_embeddings = tf.reduce_mean(sentence_embeddings, axis=1) # [None,self.embed_size] 58 | 59 | # 3.linear classifier layer 60 | logits = tf.matmul(self.sentence_embeddings, self.W) + self.b #[None, self.label_size]==tf.matmul([None,self.embed_size],[self.embed_size,self.label_size]) 61 | return logits 62 | 63 | 64 | def loss(self,l2_lambda=0.0001): 65 | """calculate loss using (NCE)cross entropy here""" 66 | # Compute the average NCE loss for the batch. 67 | # tf.nce_loss automatically draws a new sample of the negative labels each 68 | # time we evaluate the loss.
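The inference graph above is only lookup, mean and an affine layer; a NumPy sketch of the same computation (random weights and toy sizes, not part of the original model):

    import numpy as np

    vocab_size, embed_size, label_size, sentence_len = 1000, 100, 1999, 300
    Embedding = np.random.randn(vocab_size, embed_size)
    W = np.random.randn(embed_size, label_size)
    b = np.zeros(label_size)

    sentence = np.random.randint(0, vocab_size, size=(1, sentence_len))  # one padded sentence of word ids
    sentence_embeddings = Embedding[sentence].mean(axis=1)               # [1, embed_size]: average word vector
    logits = sentence_embeddings @ W + b                                 # [1, label_size]
    print(logits.shape)                                                  # (1, 1999)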
69 | if self.is_training:#training 70 | #labels=tf.reshape(self.labels,[-1]) #3.[batch_size,max_label_per_example]------>[batch_size*max_label_per_example,] 71 | #labels=tf.expand_dims(labels,1) #[batch_size*max_label_per_example,]----->[batch_size*max_label_per_example,1] 72 | #nce_loss: notice-->for now, if you have a variable number of target classes, you can pad them out to a constant number by either repeating them or by padding with an otherwise unused class. 73 | loss = tf.reduce_mean(#inputs's SHAPE should be: [batch_size, dim] 74 | tf.nn.nce_loss(weights=tf.transpose(self.W), #[embed_size, label_size]--->[label_size,embed_size]. nce_weights:A `Tensor` of shape `[num_classes, dim].O.K. 75 | biases=self.b, #[label_size]. nce_biases:A `Tensor` of shape `[num_classes]`. 76 | labels=self.labels, #4.[batch_size,max_label_per_example]. train_labels, # A `Tensor` of type `int64` and shape `[batch_size,num_true]`. The target classes. 77 | inputs=self.sentence_embeddings,#TODO [None,self.embed_size] #A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 78 | num_sampled=self.num_sampled, # scalar. 100 79 | num_true=self.max_label_per_example, 80 | num_classes=self.label_size,partition_strategy="div")) #scalar. 1999 81 | else:#eval(/inference) 82 | labels_multi_hot = self.labels_l1999 #[batch_size,label_size] 83 | #sigmoid_cross_entropy_with_logits:Computes sigmoid cross entropy given `logits`.Measures the probability error in discrete classification tasks in which each class is independent and not mutually exclusive. For instance, one could perform multilabel classification where a picture can contain both an elephant and a dog at the same time. 84 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_multi_hot,logits=self.logits) #labels:[batch_size,label_size];logits:[batch, label_size] 85 | loss = tf.reduce_sum(loss, axis=1) 86 | 87 | # add regularization result in not converge 88 | l2_losses = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name]) * l2_lambda 89 | loss=loss+l2_losses 90 | return loss 91 | 92 | def train(self): 93 | """based on the loss, use SGD to update parameter""" 94 | learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps, 95 | self.decay_rate, staircase=True) 96 | train_op = tf.contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step, 97 | learning_rate=learning_rate, optimizer="Adam") 98 | return train_op 99 | 100 | 101 | print("ended...") -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a02_TextCNN/__init__.py -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/__pycache__/data_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a02_TextCNN/__pycache__/data_util.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/__pycache__/p7_TextCNN_model.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a02_TextCNN/__pycache__/p7_TextCNN_model.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/data_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import codecs 3 | import random 4 | import numpy as np 5 | from tflearn.data_utils import pad_sequences 6 | from collections import Counter 7 | import os 8 | import pickle 9 | 10 | local_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/github_model/a02_TextCNN/' 11 | 12 | 13 | def load_data_multilabel(traning_data_path,vocab_word2index, vocab_label2index,sentence_len,training_portion=0.9): 14 | """ 15 | convert data as indexes using word2index dicts. 16 | :param traning_data_path: 17 | :param vocab_word2index: 18 | :param vocab_label2index: 19 | :return: 20 | """ 21 | file_object = codecs.open(traning_data_path, mode='r', encoding='utf-8') 22 | lines = file_object.readlines() 23 | random.shuffle(lines) 24 | label_size=len(vocab_label2index) 25 | X = [] 26 | Y = [] 27 | for i,line in enumerate(lines): 28 | raw_list = line.strip().split("__myprefix__") 29 | input_list = raw_list[0].strip().split(" ") 30 | input_list = [x.strip().replace(" ", "") for x in input_list if x != ''] 31 | x=[vocab_word2index.get(x,0) for x in input_list] 32 | label_list = raw_list[1:] 33 | label_list=[l.strip().replace(" ", "") for l in label_list if l != ''] 34 | label_list=[vocab_label2index[label] for label in label_list] 35 | # y=transform_multilabel_as_multihot(label_list,label_size) 36 | y=label_list 37 | X.append(x) 38 | Y.append(y) 39 | Y = np.array(Y).reshape(-1) 40 | X = pad_sequences(X, maxlen=sentence_len, value=0.) # padding to max length 41 | number_examples = len(lines) 42 | training_number=int(training_portion* number_examples) 43 | train = (X[0:training_number], Y[0:training_number]) 44 | valid_number=number_examples-training_number 45 | test = (X[training_number+ 1:training_number+valid_number+1], Y[training_number + 1:training_number+valid_number+1]) 46 | return train,test 47 | 48 | 49 | def transform_multilabel_as_multihot(label_list,label_size): 50 | """ 51 | convert to multi-hot style 52 | :param label_list: e.g.[0,1,4], here 4 means in the 4th position it is true value(as indicate by'1') 53 | :param label_size: e.g.199 54 | :return:e.g.[1,1,0,1,0,0,........] 55 | """ 56 | result=np.zeros(label_size) 57 | #set those location as 1, all else place as 0. 58 | result[label_list] = 1 59 | return result 60 | 61 | #use pretrained word embedding to get word vocabulary and labels, and its relationship with index 62 | def create_vocabulary(training_data_path,vocab_size,name_scope='cnn'): 63 | """ 64 | create vocabulary 65 | :param training_data_path: 66 | :param vocab_size: 67 | :param name_scope: 68 | :return: 69 | """ 70 | 71 | cache_vocabulary_label_pik=local_path+'cache'+"_"+name_scope # path to save cache 72 | if not os.path.isdir(cache_vocabulary_label_pik): # create folder if not exists. 73 | os.makedirs(cache_vocabulary_label_pik) 74 | 75 | # if cache exists. load it; otherwise create it. 
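The cache logic that follows is a load-or-build pattern; reduced to its core it looks roughly like this (a sketch with a hypothetical build_fn; the original appends with mode 'ab', 'wb' is shown here for simplicity):

    import os
    import pickle

    def load_or_build(cache_path, build_fn):
        # reuse a previously pickled vocabulary if it exists
        if os.path.exists(cache_path):
            with open(cache_path, 'rb') as f:
                return pickle.load(f)
        vocab = build_fn()                 # e.g. the Counter-based construction below
        with open(cache_path, 'wb') as f:
            pickle.dump(vocab, f)
        return vocab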
76 | cache_path =cache_vocabulary_label_pik+"/"+'vocab_label.pik' 77 | print("cache_path:",cache_path,"file_exists:",os.path.exists(cache_path)) 78 | if os.path.exists(cache_path): 79 | with open(cache_path, 'rb') as data_f: 80 | return pickle.load(data_f) 81 | else: 82 | vocabulary_word2index={} 83 | vocabulary_index2word={} 84 | # vocabulary_word2index[_PAD]=PAD_ID 85 | # vocabulary_index2word[PAD_ID]=_PAD 86 | # vocabulary_word2index[_UNK]=UNK_ID 87 | # vocabulary_index2word[UNK_ID]=_UNK 88 | 89 | vocabulary_label2index={} 90 | vocabulary_index2label={} 91 | 92 | #1.load raw data 93 | file_object = codecs.open(training_data_path, mode='r', encoding='utf-8') 94 | lines=file_object.readlines() 95 | #2.loop each line,put to counter 96 | c_inputs=Counter() 97 | c_labels=Counter() 98 | for line in lines: 99 | raw_list=line.strip().split("__myprefix__") 100 | 101 | input_list = raw_list[0].strip().split(" ") 102 | input_list = [x.strip().replace(" ", "") for x in input_list if x != ''] 103 | label_list=[l.strip().replace(" ","") for l in raw_list[1:] if l!=''] 104 | c_inputs.update(input_list) 105 | c_labels.update(label_list) 106 | #return most frequency words 107 | vocab_list=c_inputs.most_common(vocab_size) 108 | label_list=c_labels.most_common() 109 | #put those words to dict 110 | for i,tuplee in enumerate(vocab_list): 111 | word,_=tuplee 112 | vocabulary_word2index[word]=i+1 113 | vocabulary_index2word[i+1]=word 114 | 115 | for i,tuplee in enumerate(label_list): 116 | label,_=tuplee;label=str(label) 117 | vocabulary_label2index[label]=i 118 | vocabulary_index2label[i]=label 119 | 120 | #save to file system if vocabulary of words not exists. 121 | if not os.path.exists(cache_path): 122 | with open(cache_path, 'ab') as data_f: 123 | pickle.dump((vocabulary_word2index,vocabulary_index2word,vocabulary_label2index,vocabulary_index2label), data_f) 124 | return vocabulary_word2index,vocabulary_index2word,vocabulary_label2index,vocabulary_index2label 125 | 126 | def get_target_label_short(eval_y): 127 | eval_y_short=[] #will be like:[22,642,1391] 128 | for index,label in enumerate(eval_y): 129 | if label>0: 130 | eval_y_short.append(index) 131 | return eval_y_short 132 | 133 | # training_data_path = '/Users/liyangyang/Downloads/bdci/train.txt' 134 | # vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label= \ 135 | # create_vocabulary(training_data_path,17259,name_scope='cnn') 136 | # vocab_size = len(vocabulary_word2index);print("cnn_model.vocab_size:",vocab_size);num_classes=len(vocabulary_index2label);print("num_classes:",num_classes) 137 | # print(vocabulary_index2label) 138 | # train, test= load_data_multilabel(training_data_path,vocabulary_word2index, vocabulary_label2index,200) 139 | # trainX, trainY = train 140 | # testX, testY = test 141 | # #print some message for debug purpose 142 | # print("length of training data:",len(trainX),";length of validation data:",len(testX)) 143 | # print("trainX[0]:", trainX[1]); 144 | # print("trainY[0]:", trainY[1]) 145 | # # train_y_short = get_target_label_short(trainY[1]) 146 | # # print("train_y_short:", train_y_short) 147 | # for i in range(1,100): 148 | # print(vocabulary_index2word[i]) -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a02_TextCNN/other_experiement/__init__.py -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/p7_TextCNN_predict_ensemble.py: -------------------------------------------------------------------------------- 1 | from p7_TextCNN_predict import get_logits_with_value_by_input 2 | from p7_TextCNN_predict_exp import get_logits_with_value_by_input_exp 3 | import tensorflow as tf 4 | def main(_): 5 | for start in range(217360): 6 | end=start+1 7 | label_list,p_list=get_logits_with_value_by_input(start,end) 8 | label_list_exp, p_list_exp=get_logits_with_value_by_input_exp(start,end) 9 | 10 | if start<5: 11 | print("----------------------------------------------------") 12 | print(start,"label_list0:",label_list,"p_list0:",p_list) 13 | print(start,"label_list1:", label_list_exp, "p_list1:", p_list_exp) 14 | else: 15 | break 16 | 17 | 18 | 19 | if __name__ == "__main__": 20 | tf.app.run() -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/p7_TextCNN_predict_exp512.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512/zhihu_result_cnn_multilabel_v7_exp512_20170616.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | 
tf.app.flags.DEFINE_integer("num_filters", 600, "number of filters") #128-->512 33 | tf.app.flags.DEFINE_string("ckpt_dir2","text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 56 | print("end padding...") 57 | # 3.create session. 58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 
80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/p7_TextCNN_predict_exp512_0609.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","text_cnn_title_desc_checkpoint_exp512_0609/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","text_cnn_title_desc_checkpoint_exp512_0609/zhihu_result_cnn_multilabel_exp512_0609.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters") #128 33 | tf.app.flags.DEFINE_string("ckpt_dir2","text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[2,3,5,6,7,8] #[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 56 | print("end padding...") 57 | # 3.create session. 
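get_label_using_logits(), defined further down in this script, picks the top-scoring classes by sorting the logits; the same selection as a standalone sketch with toy numbers:

    import numpy as np

    logits = np.array([0.1, 2.3, -0.4, 1.7, 0.9])        # one row of model scores (toy values)
    top_number = 3
    index_list = np.argsort(logits)[-top_number:][::-1]  # indices of the largest logits, best first
    print(index_list)                                    # [1 3 4]
    index2label = {i: "label_%d" % i for i in range(5)}  # hypothetical id-to-label mapping
    print([index2label[i] for i in index_list])          # ['label_1', 'label_3', 'label_4']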
58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/p7_TextCNN_predict_exp512_simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512_simple/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp512_simple/zhihu_result_cnn_multilabel_exp512_simple.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters") #128 33 | tf.app.flags.DEFINE_string("ckpt_dir2","checkpoint_text_cnn/text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 34 | 35 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 36 | 37 | ############################################################################################################################################## 38 | filter_sizes=[7] #[3,4,5,7,10,15,20,25]#[1,2,3,4,5,6,7] 39 | 40 | def main(_): 41 | # 1.load data with vocabulary of words and labels 42 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 43 | vocab_size = len(vocabulary_word2index) 44 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 45 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 46 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 47 | testX=[] 48 | question_id_list=[] 49 | for tuple in test: 50 | question_id,question_string_list=tuple 51 | question_id_list.append(question_id) 52 | testX.append(question_string_list) 53 | # 2.Data preprocessing: Sequence padding 54 | print("start padding....") 55 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) 
# padding to max length 56 | print("end padding...") 57 | # 3.create session. 58 | config=tf.ConfigProto() 59 | config.gpu_options.allow_growth=True 60 | with tf.Session(config=config) as sess: 61 | # 4.Instantiate Model 62 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 63 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 64 | saver=tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | return 71 | # 5.feed data, to get logits 72 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 73 | index=0 74 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 75 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 76 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 77 | # 6. get lable using logtis 78 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 79 | # 7. write question id and labels to file system. 80 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 81 | index=index+1 82 | predict_target_file_f.close() 83 | 84 | # get label using logits 85 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 86 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 87 | index_list=index_list[::-1] 88 | label_list=[] 89 | for index in index_list: 90 | label=vocabulary_index2word_label[index] 91 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 92 | return label_list 93 | 94 | # get label using logits 95 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 96 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 97 | index_list=index_list[::-1] 98 | value_list=[] 99 | label_list=[] 100 | for index in index_list: 101 | label=vocabulary_index2word_label[index] 102 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 103 | value_list.append(logits[index]) 104 | return label_list,value_list 105 | 106 | # write question id and labels to file system. 107 | def write_question_id_with_labels(question_id,labels_list,f): 108 | labels_string=",".join(labels_list) 109 | f.write(question_id+","+labels_string+"\n") 110 | 111 | if __name__ == "__main__": 112 | tf.app.run() 113 | #labels,list_value=get_logits_with_value_by_input(0, 1) 114 | #print("labels:",labels) 115 | #print("list_value:", list_value) 116 | -------------------------------------------------------------------------------- /venv/dwb/github_model/a02_TextCNN/other_experiement/p8_TextCNN_predict_exp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 
2.create session. 3.feed data. 4.predict 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | import tensorflow as tf 8 | import numpy as np 9 | #from p5_fastTextB_model import fastTextB as fastText 10 | from data_util_zhihu import load_data_predict,load_final_test_data,create_voabulary,create_voabulary_label 11 | from tflearn.data_utils import pad_sequences #to_categorical 12 | import os 13 | import codecs 14 | from p7_TextCNN_model import TextCNN 15 | 16 | #configuration 17 | FLAGS=tf.app.flags.FLAGS 18 | tf.app.flags.DEFINE_integer("num_classes",1999,"number of label") 19 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 20 | tf.app.flags.DEFINE_integer("batch_size", 1, "Batch size for training/evaluating.") #批处理的大小 32-->128 21 | tf.app.flags.DEFINE_integer("decay_steps", 5000, "how many steps before decay learning rate.") #批处理的大小 32-->128 22 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 23 | tf.app.flags.DEFINE_string("ckpt_dir","text_cnn_title_desc_checkpoint_exp/","checkpoint location for the model") 24 | tf.app.flags.DEFINE_integer("sentence_len",100,"max sentence length") 25 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 26 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 27 | tf.app.flags.DEFINE_integer("num_epochs",15,"number of epochs.") 28 | tf.app.flags.DEFINE_integer("validate_every", 1, "Validate every validate_every epochs.") #每10轮做一次验证 29 | tf.app.flags.DEFINE_string("predict_target_file","text_cnn_title_desc_checkpoint_exp/zhihu_result_cnn_multilabel_v6_exp.csv","target file path for final prediction") 30 | tf.app.flags.DEFINE_string("predict_source_file",'test-zhihu-forpredict-title-desc-v6.txt',"target file path for final prediction") #test-zhihu-forpredict-v4only-title.txt 31 | tf.app.flags.DEFINE_string("word2vec_model_path","zhihu-word2vec-title-desc.bin-100","word2vec's vocabulary and vectors") #zhihu-word2vec.bin-100 32 | tf.app.flags.DEFINE_integer("num_filters", 256, "number of filters") #128 33 | #tf.app.flags.DEFINE_boolean("multi_label_flag",True,"use multi label or single label.") 34 | 35 | ################################################################################################################################## 36 | filter_sizes=[3,4,5,7,15,20,25] 37 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.training (5.validation) ,(6.prediction) 38 | # 1.load data with vocabulary of words and labels 39 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple', 40 | word2vec_model_path=FLAGS.word2vec_model_path, 41 | name_scope="cnn2") 42 | vocab_size = len(vocabulary_word2index) 43 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 44 | questionid_question_lists = load_final_test_data(FLAGS.predict_source_file) 45 | test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists) 46 | testX = [] 47 | question_id_list = [] 48 | for tuple in test: 49 | question_id, question_string_list = tuple 50 | question_id_list.append(question_id) 51 | testX.append(question_string_list) 52 | # 2.Data preprocessing: Sequence padding 53 | print("start padding....") 54 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 55 | print("end padding...") 56 | # 3.create session. 
57 | config = tf.ConfigProto() 58 | config.gpu_options.allow_growth = True 59 | sess=tf.Session(config=config) 60 | # 4.Instantiate Model 61 | textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, 62 | FLAGS.decay_steps, FLAGS.decay_rate, 63 | FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training) 64 | saver = tf.train.Saver() 65 | if os.path.exists(FLAGS.ckpt_dir + "checkpoint"): 66 | print("Restoring Variables from Checkpoint") 67 | saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 68 | else: 69 | print("Can't find the checkpoint.going to stop") 70 | #return 71 | # 5.feed data, to get logits 72 | number_of_training_data = len(testX2); 73 | print("number_of_training_data:", number_of_training_data) 74 | #index = 0 75 | #predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 76 | ################################################################################################################################## 77 | def get_logits_by_input_exp(start,end): 78 | x=testX2[start:end] 79 | logits = sess.run(textCNN.logits, feed_dict={textCNN.input_x: x, textCNN.dropout_keep_prob: 1}) 80 | predicted_labels,value_labels = get_label_using_logits_with_value(logits[0], vocabulary_index2word_label) 81 | value_labels_exp= np.exp(value_labels) 82 | p_labels=value_labels_exp/np.sum(value_labels_exp) 83 | return predicted_labels,p_labels 84 | 85 | def main(_): 86 | # 1.load data with vocabulary of words and labels 87 | vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',word2vec_model_path=FLAGS.word2vec_model_path,name_scope="cnn2") 88 | vocab_size = len(vocabulary_word2index) 89 | vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2") 90 | questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) 91 | test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) 92 | testX=[] 93 | question_id_list=[] 94 | for tuple in test: 95 | question_id,question_string_list=tuple 96 | question_id_list.append(question_id) 97 | testX.append(question_string_list) 98 | # 2.Data preprocessing: Sequence padding 99 | print("start padding....") 100 | testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.) # padding to max length 101 | print("end padding...") 102 | # 3.create session. 
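get_logits_by_input_exp() above turns the top-5 logit values into a small probability distribution by taking a softmax over just those five values; the arithmetic in isolation (toy numbers):

    import numpy as np

    value_labels = np.array([3.0, 1.0, 0.5, 0.2, -1.0])    # logit values of the top-5 labels (toy)
    value_labels_exp = np.exp(value_labels)
    p_labels = value_labels_exp / np.sum(value_labels_exp)
    print(p_labels)  # approx. [0.77 0.10 0.06 0.05 0.01], sums to 1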
103 | config=tf.ConfigProto() 104 | config.gpu_options.allow_growth=True 105 | with tf.Session(config=config) as sess: 106 | # 4.Instantiate Model 107 | textCNN=TextCNN(filter_sizes,FLAGS.num_filters,FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,FLAGS.decay_rate, 108 | FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training) 109 | saver=tf.train.Saver() 110 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 111 | print("Restoring Variables from Checkpoint") 112 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 113 | else: 114 | print("Can't find the checkpoint.going to stop") 115 | return 116 | # 5.feed data, to get logits 117 | number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data) 118 | index=0 119 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 120 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 121 | logits=sess.run(textCNN.logits,feed_dict={textCNN.input_x:testX2[start:end],textCNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 122 | # 6. get lable using logtis 123 | predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) 124 | # 7. write question id and labels to file system. 125 | write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 126 | index=index+1 127 | predict_target_file_f.close() 128 | 129 | # get label using logits 130 | def get_label_using_logits(logits,vocabulary_index2word_label,top_number=5): 131 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 132 | index_list=index_list[::-1] 133 | label_list=[] 134 | for index in index_list: 135 | label=vocabulary_index2word_label[index] 136 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 137 | return label_list 138 | 139 | # get label using logits 140 | def get_label_using_logits_with_value(logits,vocabulary_index2word_label,top_number=5): 141 | index_list=np.argsort(logits)[-top_number:] #print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 142 | index_list=index_list[::-1] 143 | value_list=[] 144 | label_list=[] 145 | for index in index_list: 146 | label=vocabulary_index2word_label[index] 147 | label_list.append(label) #('get_label_using_logits.label_list:', [u'-3423450385060590478', u'2838091149470021485', u'-3174907002942471215', u'-1812694399780494968', u'6815248286057533876']) 148 | value_list.append(logits[index]) 149 | return label_list,value_list 150 | 151 | # write question id and labels to file system. 
152 | def write_question_id_with_labels(question_id,labels_list,f): 153 | labels_string=",".join(labels_list) 154 | f.write(question_id+","+labels_string+"\n") 155 | 156 | if __name__ == "__main__": 157 | #tf.app.run() 158 | labels,list_value=get_logits_by_input_exp(0, 1) 159 | print("labels:",labels) 160 | print("list_value:", list_value) -------------------------------------------------------------------------------- /venv/dwb/github_model/a03_TextRNN/__pycache__/p8_TextRNN_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/github_model/a03_TextRNN/__pycache__/p8_TextRNN_model.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/github_model/a03_TextRNN/p8_TextRNN_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #TextRNN: 1. embeddding layer, 2.Bi-LSTM layer, 3.concat output, 4.FC layer, 5.softmax 3 | import tensorflow as tf 4 | from tensorflow.contrib import rnn 5 | import numpy as np 6 | 7 | class TextRNN: 8 | def __init__(self,num_classes, learning_rate, batch_size, decay_steps, decay_rate,sequence_length, 9 | vocab_size,embed_size,is_training,initializer=tf.random_normal_initializer(stddev=0.1)): 10 | """init all hyperparameter here""" 11 | # set hyperparamter 12 | self.num_classes = num_classes 13 | self.batch_size = batch_size 14 | self.sequence_length=sequence_length 15 | self.vocab_size=vocab_size 16 | self.embed_size=embed_size 17 | self.hidden_size=embed_size 18 | self.is_training=is_training 19 | self.learning_rate=learning_rate 20 | self.initializer=initializer 21 | self.num_sampled=20 22 | 23 | # add placeholder (X,label) 24 | self.input_x = tf.placeholder(tf.int32, [None, self.sequence_length], name="input_x") # X 25 | self.input_y = tf.placeholder(tf.int32,[None], name="input_y") # y [None,num_classes] 26 | self.dropout_keep_prob=tf.placeholder(tf.float32,name="dropout_keep_prob") 27 | 28 | self.global_step = tf.Variable(0, trainable=False, name="Global_Step") 29 | self.epoch_step=tf.Variable(0,trainable=False,name="Epoch_Step") 30 | self.epoch_increment=tf.assign(self.epoch_step,tf.add(self.epoch_step,tf.constant(1))) 31 | self.decay_steps, self.decay_rate = decay_steps, decay_rate 32 | 33 | self.instantiate_weights() 34 | self.logits = self.inference() #[None, self.label_size]. main computation graph is here. 
35 | if not is_training: 36 | return 37 | self.loss_val = self.loss() #-->self.loss_nce() 38 | self.train_op = self.train() 39 | self.predictions = tf.argmax(self.logits, axis=1, name="predictions") # shape:[None,] 40 | correct_prediction = tf.equal(tf.cast(self.predictions,tf.int32), self.input_y) #tf.argmax(self.logits, 1)-->[batch_size] 41 | self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="Accuracy") # shape=() 42 | def instantiate_weights(self): 43 | """define all weights here""" 44 | with tf.name_scope("embedding"): # embedding matrix 45 | self.Embedding = tf.get_variable("Embedding",shape=[self.vocab_size, self.embed_size],initializer=self.initializer) #[vocab_size,embed_size] tf.random_uniform([self.vocab_size, self.embed_size],-1.0,1.0) 46 | self.W_projection = tf.get_variable("W_projection",shape=[self.hidden_size*2, self.num_classes],initializer=self.initializer) #[embed_size,label_size] 47 | self.b_projection = tf.get_variable("b_projection",shape=[self.num_classes]) #[label_size] 48 | 49 | def inference(self): 50 | """main computation graph here: 1. embeddding layer, 2.Bi-LSTM layer, 3.concat, 4.FC layer 5.softmax """ 51 | #1.get emebedding of words in the sentence 52 | self.embedded_words = tf.nn.embedding_lookup(self.Embedding,self.input_x) #shape:[None,sentence_length,embed_size] 53 | #2. Bi-lstm layer 54 | # define lstm cess:get lstm cell output 55 | lstm_fw_cell=rnn.BasicLSTMCell(self.hidden_size) #forward direction cell 56 | lstm_bw_cell=rnn.BasicLSTMCell(self.hidden_size) #backward direction cell 57 | if self.dropout_keep_prob is not None: 58 | lstm_fw_cell=rnn.DropoutWrapper(lstm_fw_cell,output_keep_prob=self.dropout_keep_prob) 59 | lstm_bw_cell=rnn.DropoutWrapper(lstm_bw_cell,output_keep_prob=self.dropout_keep_prob) 60 | # bidirectional_dynamic_rnn: input: [batch_size, max_time, input_size] 61 | # output: A tuple (outputs, output_states) 62 | # where:outputs: A tuple (output_fw, output_bw) containing the forward and the backward rnn output `Tensor`. 63 | outputs,_=tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,lstm_bw_cell,self.embedded_words,dtype=tf.float32) #[batch_size,sequence_length,hidden_size] #creates a dynamic bidirectional recurrent neural network 64 | print("outputs:===>",outputs) #outputs:(, )) 65 | #3. concat output 66 | output_rnn=tf.concat(outputs,axis=2) #[batch_size,sequence_length,hidden_size*2] 67 | #self.output_rnn_last=tf.reduce_mean(output_rnn,axis=1) #[batch_size,hidden_size*2] 68 | self.output_rnn_last=output_rnn[:,-1,:] ##[batch_size,hidden_size*2] #TODO 69 | print("output_rnn_last:", self.output_rnn_last) # 70 | #4. logits(use linear layer) 71 | with tf.name_scope("output"): #inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 72 | logits = tf.matmul(self.output_rnn_last, self.W_projection) + self.b_projection # [batch_size,num_classes] 73 | return logits 74 | 75 | def loss(self,l2_lambda=0.0001): 76 | with tf.name_scope("loss"): 77 | #input: `logits` and `labels` must have the same shape `[batch_size, num_classes]` 78 | #output: A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the softmax cross entropy loss. 
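The shape bookkeeping in inference() above, spelled out with NumPy stand-ins (a sketch of the tensor shapes only; toy sizes, not the actual LSTM computation):

    import numpy as np

    batch_size, sequence_length, hidden_size, num_classes = 8, 5, 100, 10
    output_fw = np.random.randn(batch_size, sequence_length, hidden_size)  # forward LSTM outputs
    output_bw = np.random.randn(batch_size, sequence_length, hidden_size)  # backward LSTM outputs
    output_rnn = np.concatenate([output_fw, output_bw], axis=2)            # [batch, seq_len, hidden*2]
    output_rnn_last = output_rnn[:, -1, :]                                 # last time step only
    W_projection = np.random.randn(hidden_size * 2, num_classes)
    logits = output_rnn_last @ W_projection                                # [batch, num_classes]
    print(output_rnn.shape, output_rnn_last.shape, logits.shape)           # (8, 5, 200) (8, 200) (8, 10)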
79 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.input_y, logits=self.logits);#sigmoid_cross_entropy_with_logits.#losses=tf.nn.softmax_cross_entropy_with_logits(labels=self.input_y,logits=self.logits) 80 | #print("1.sparse_softmax_cross_entropy_with_logits.losses:",losses) # shape=(?,) 81 | loss=tf.reduce_mean(losses)#print("2.loss.loss:", loss) #shape=() 82 | l2_losses = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name]) * l2_lambda 83 | loss=loss+l2_losses 84 | return loss 85 | 86 | def loss_nce(self,l2_lambda=0.0001): #0.0001-->0.001 87 | """calculate loss using (NCE)cross entropy here""" 88 | # Compute the average NCE loss for the batch. 89 | # tf.nce_loss automatically draws a new sample of the negative labels each 90 | # time we evaluate the loss. 91 | if self.is_training: #training 92 | #labels=tf.reshape(self.input_y,[-1]) #[batch_size,1]------>[batch_size,] 93 | labels=tf.expand_dims(self.input_y,1) #[batch_size,]----->[batch_size,1] 94 | loss = tf.reduce_mean( #inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 95 | tf.nn.nce_loss(weights=tf.transpose(self.W_projection),#[hidden_size*2, num_classes]--->[num_classes,hidden_size*2]. nce_weights:A `Tensor` of shape `[num_classes, dim].O.K. 96 | biases=self.b_projection, #[label_size]. nce_biases:A `Tensor` of shape `[num_classes]`. 97 | labels=labels, #[batch_size,1]. train_labels, # A `Tensor` of type `int64` and shape `[batch_size,num_true]`. The target classes. 98 | inputs=self.output_rnn_last,# [batch_size,hidden_size*2] #A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. 99 | num_sampled=self.num_sampled, #scalar. 100 100 | num_classes=self.num_classes,partition_strategy="div")) #scalar. 1999 101 | l2_losses = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name]) * l2_lambda 102 | loss = loss + l2_losses 103 | return loss 104 | 105 | def train(self): 106 | """based on the loss, use SGD to update parameter""" 107 | learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps,self.decay_rate, staircase=True) 108 | train_op = tf.contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step,learning_rate=learning_rate, optimizer="Adam") 109 | return train_op 110 | 111 | #test started 112 | def test(): 113 | #below is a function test; if you use this for text classifiction, you need to tranform sentence to indices of vocabulary first. then feed data to the graph. 
114 | num_classes=10 115 | learning_rate=0.01 116 | batch_size=8 117 | decay_steps=1000 118 | decay_rate=0.9 119 | sequence_length=5 120 | vocab_size=10000 121 | embed_size=100 122 | is_training=True 123 | dropout_keep_prob=1#0.5 124 | textRNN=TextRNN(num_classes, learning_rate, batch_size, decay_steps, decay_rate,sequence_length,vocab_size,embed_size,is_training) 125 | with tf.Session() as sess: 126 | sess.run(tf.global_variables_initializer()) 127 | for i in range(100): 128 | input_x=np.zeros((batch_size,sequence_length)) #[None, self.sequence_length] 129 | input_y=input_y=np.array([1,0,1,1,1,2,1,1]) #np.zeros((batch_size),dtype=np.int32) #[None, self.sequence_length] 130 | loss,acc,predict,_=sess.run([textRNN.loss_val,textRNN.accuracy,textRNN.predictions,textRNN.train_op],feed_dict={textRNN.input_x:input_x,textRNN.input_y:input_y,textRNN.dropout_keep_prob:dropout_keep_prob}) 131 | print("loss:",loss,"acc:",acc,"label:",input_y,"prediction:",predict) 132 | # test() 133 | -------------------------------------------------------------------------------- /venv/dwb/github_model/a03_TextRNN/p8_TextRNN_predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #prediction using model. 3 | #process--->1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 4.predict 4 | import sys 5 | sys.path.append("/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn") 6 | import data_helpers 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | sys.path.append("/Users/liyangyang/PycharmProjects/mypy/venv/dwb/github_model/a03_TextRNN") 12 | from p8_TextRNN_model import TextRNN 13 | import os 14 | import pickle 15 | from tensorflow.contrib import learn 16 | import codecs 17 | 18 | #configuration 19 | FLAGS=tf.app.flags.FLAGS 20 | tf.app.flags.DEFINE_integer("num_classes",19,"number of label") 21 | tf.app.flags.DEFINE_float("learning_rate",0.01,"learning rate") 22 | tf.app.flags.DEFINE_integer("batch_size", 128, "Batch size for training/evaluating.") #批处理的大小 32-->128 23 | tf.app.flags.DEFINE_integer("decay_steps", 12000, "how many steps before decay learning rate.") #批处理的大小 32-->128 24 | tf.app.flags.DEFINE_float("decay_rate", 0.9, "Rate of decay for learning rate.") #0.5一次衰减多少 25 | tf.app.flags.DEFINE_string("ckpt_dir", "/Users/liyangyang/PycharmProjects/mypy/venv/dwb/github_model/a03_TextRNN/text_rnn_checkpoint/", "checkpoint location for the model") 26 | tf.app.flags.DEFINE_integer("sequence_length",2000,"max sentence length") 27 | tf.app.flags.DEFINE_integer("embed_size",100,"embedding size") 28 | tf.app.flags.DEFINE_boolean("is_training",False,"is traning.true:tranining,false:testing/inference") 29 | tf.app.flags.DEFINE_string("data_file", "test_set.csv", "Data source for the positive data.") 30 | tf.app.flags.DEFINE_string("predict_target_file","/Users/liyangyang/PycharmProjects/mypy/venv/dwb/github_model/a03_TextRNN/result_rnn.csv","target file path for final prediction") 31 | #1.load data(X:list of lint,y:int). 2.create session. 3.feed data. 
4.training (5.validation) ,(6.prediction) 32 | def main(_): 33 | # 1.load data with vocabulary of words and labels 34 | vocab_processor_path = '/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn/vocab' 35 | # print("end padding & transform to one hot...") 36 | x_train, y = data_helpers.load_data_and_labels(FLAGS.data_file) 37 | print('y.shape',y.shape) 38 | 39 | # vocab_processor = learn.preprocessing.VocabularyProcessor(2000,min_frequency=2) 40 | # x = np.array(list(vocab_processor.fit_transform(x_train))) 41 | # vocab_processor.save(vocab_processor_path) 42 | 43 | vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_processor_path) 44 | testX2 = np.array(list(vocab_processor.transform(x_train))) 45 | vocab_size = len(vocab_processor.vocabulary_) 46 | print("end padding...") 47 | # 3.create session. 48 | config=tf.ConfigProto() 49 | config.gpu_options.allow_growth=True 50 | with tf.Session(config=config) as sess: 51 | # 4.Instantiate Model 52 | textRNN=TextRNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length, 53 | vocab_size, FLAGS.embed_size, FLAGS.is_training) 54 | saver=tf.train.Saver() 55 | if os.path.exists(FLAGS.ckpt_dir+"checkpoint"): 56 | print("Restoring Variables from Checkpoint for TextRNN") 57 | saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir)) 58 | else: 59 | print("Can't find the checkpoint.going to stop") 60 | return 61 | # 5.feed data, to get logits 62 | number_of_training_data=len(testX2) 63 | print("number_of_training_data:",number_of_training_data) 64 | index=0 65 | predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8') 66 | #for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 67 | for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)): 68 | logits=sess.run(textRNN.logits,feed_dict={textRNN.input_x:testX2[start:end],textRNN.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999) 69 | # 6. get lable using logtis 70 | #predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label) #logits[0] 71 | # 7. write question id and labels to file system. 72 | #write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f) 73 | ############################################################################################################# 74 | print("start:",start,";end:",end) 75 | question_id_sublist=y[start:end] 76 | get_label_using_logits_batch(question_id_sublist, logits, predict_target_file_f) 77 | ######################################################################################################## 78 | index=index+1 79 | predict_target_file_f.close() 80 | 81 | # get label using logits 82 | def get_label_using_logits_batch(question_id_sublist,logits_batch,f): 83 | print("get_label_using_logits.shape:", logits_batch.shape) # (10, 1999))=[batch_size,num_labels]===>需要(10,5) 84 | for i,logits in enumerate(logits_batch): 85 | lable = int(np.argmax(logits))+1 # print("sum_p", np.sum(1.0 / (1 + np.exp(-logits)))) 86 | # print(question_id_sublist[i],lable) 87 | write_question_id_with_labels(question_id_sublist[i], lable, f) 88 | f.flush() 89 | # write question id and labels to file system. 
90 | def write_question_id_with_labels(question_id,lable,f): 91 | f.write(str(question_id)+","+str(lable)+"\n") 92 | 93 | if __name__ == "__main__": 94 | tf.app.run() -------------------------------------------------------------------------------- /venv/dwb/jieba1/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/10 下午6:49 4 | # @Author :hwwu 5 | # @File :merge.py 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | path = '/Users/liyangyang/Downloads/jieba/data/' 11 | 12 | result = pd.read_csv(path + 'result.csv') 13 | kw = pd.read_csv(path + 'train_docs_keywords.txt', sep='\t', header=None) 14 | kw.columns = ['id', 'lw'] 15 | # print(kw) 16 | result['label1']=result['label1'].replace('','nan') 17 | result['label2']=result['label2'].replace('','nan') 18 | id = [] 19 | label1 = [] 20 | label2 = [] 21 | print(len(result)) 22 | for i in range(len(result)): 23 | m_id = str(result['id'][i]) 24 | if (m_id=='D101107'): 25 | print(m_id,str(result['label1'][i])) 26 | print(m_id,str(result['label2'][i])) 27 | id.append(m_id) 28 | l1 = '' 29 | l2 = '' 30 | for j in range(len(kw)): 31 | if (m_id == kw['id'][j]): 32 | words = str(kw['lw'][j]).split(',') 33 | if(len(words)>1): 34 | l1 = words[0] 35 | l2 = words[1] 36 | else: 37 | l1 = words[0] 38 | l2 = 'nan' 39 | print(m_id,l1,l2) 40 | if (l1 != ''): 41 | label1.append(l1.replace(',','')) 42 | label2.append(l2.replace(',','')) 43 | else: 44 | label1.append(str(result['label1'][i]).replace(',','')) 45 | label2.append(str(result['label2'][i]).replace(',','')) 46 | 47 | id_column = pd.Series(id, name='id') 48 | label1_column = pd.Series(label1, name='label1') 49 | label2_column = pd.Series(label2, name='label2') 50 | predictions = pd.concat([id_column, label1_column, label2_column], axis=1) 51 | predictions.to_csv(path + 'merge_result_data.csv', index=0, sep=',', columns=['id', 'label1', 'label2'],encoding='UTF-8') 52 | 53 | -------------------------------------------------------------------------------- /venv/dwb/jieba1/tjieba.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/10 下午1:42 4 | # @Author :hwwu 5 | # @File :jieba1.py 6 | 7 | path = '/Users/liyangyang/Downloads/jieba/data/' 8 | file = 'all_docs.txt' 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | data = pd.read_csv(path+file,sep='\001',header=None) 14 | data.columns = ['id','title','doc'] 15 | # # print(data['doc']) 16 | # new_data = data['title']+data['doc'] 17 | # print(new_data) 18 | # 19 | # regex = analyse.extract_tags(new_data,topK=4,withWeight=False,allowPOS=()) 20 | # 21 | # print(regex) 22 | 23 | print(len(data)) 24 | 25 | import codecs 26 | import os 27 | 28 | import jieba 29 | import jieba.analyse as analyse 30 | import numpy as np 31 | import pandas as pd 32 | from sklearn.feature_extraction.text import TfidfVectorizer 33 | 34 | base_path = path + file 35 | seg_path = path + 'all_docs_seg.txt' 36 | 37 | stopword_path='/Users/liyangyang/Downloads/stopwords/CNENstopwords.txt' 38 | 39 | def stopwordslist(): 40 | stopwords = [line.strip() for line in open(stopword_path, 'r', encoding='utf-8').readlines()] 41 | return stopwords 42 | 43 | 44 | def segment(): 45 | """word segment""" 46 | fw = codecs.open(seg_path, 'w', 'utf-8') 47 | doc_list=[] 48 | for i in range(len(data)): 49 | title = str(data['title'][i]) 50 | doc = str(data['doc'][i]) 51 | # row 
= line.split('\001') 52 | # seg_list = jieba.cut(line.strip()) 53 | sentence_seged = jieba.cut((title+ '。' + doc).strip()) 54 | stopwords = stopwordslist() 55 | outstr = '' 56 | for word in sentence_seged: 57 | if word not in stopwords: 58 | if word != '\t': 59 | outstr += word 60 | outstr +='\t' 61 | l = outstr 62 | doc_list.append(l) 63 | fw.write(l) 64 | fw.write('\n') 65 | fw.flush() 66 | fw.close() 67 | return doc_list 68 | 69 | def tfidf_top(trade_list, doc_list, max_df, topn): 70 | vectorizer = TfidfVectorizer(max_df=max_df,min_df=1,use_idf=1,smooth_idf=1, sublinear_tf=1) 71 | matrix = vectorizer.fit_transform(doc_list) 72 | feature_dict = {v: k for k, v in vectorizer.vocabulary_.items()} # index -> feature_name 73 | top_n_matrix = np.argsort(-matrix.todense())[:, :topn] # top tf-idf words for each row 74 | df = pd.DataFrame(np.vectorize(feature_dict.get)(top_n_matrix), index=trade_list) # convert matrix to df 75 | return df 76 | 77 | 78 | # dl = segment() 79 | # print('first') 80 | # tl = data['id'] 81 | # tdf = tfidf_top(tl, dl, max_df=0.5, topn=2) 82 | # print('second') 83 | # tdf.to_csv(path+'resilt.csv', header=False, encoding='utf-8') 84 | # print('done') 85 | 86 | 87 | 88 | # fw = codecs.open(path+'result.csv', 'w', 'utf-8') 89 | # 90 | # fw.write("id,label1,label2"+"\n") 91 | # 92 | # 93 | # def textrank(): 94 | # n = 0 95 | # fw = codecs.open(seg_path, 'w', 'utf-8') 96 | # # doc_list = [] 97 | # # for i in range(len(data)): 98 | # for i in range(100): 99 | # # n+=1 100 | # # title = str(data['title'][i]) 101 | # # doc = str(data['doc'][i]) 102 | # row = line.split('\001') 103 | # seg_list = jieba.cut(line.strip()) 104 | # # sentence_seged = jieba.cut((title + '。' + doc).strip()) 105 | # # stopwords = stopwordslist() 106 | # # outstr = '' 107 | # # for word in sentence_seged: 108 | # # if word not in stopwords: 109 | # if word != '\t': 110 | # outstr += word 111 | # outstr += '\t' 112 | # keywords = analyse.textrank(outstr, topK=2, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')) 113 | # if (len(keywords)==0): 114 | # fw.write(str(data['id'][i]) + "," + str('') + "," + str('') + "\n") 115 | # elif (len(keywords)==1): 116 | # fw.write(str(data['id'][i]) + "," + str(keywords[0]) + "," + str('') + "\n") 117 | # else: 118 | # fw.write(str(data['id'][i]) + "," + str(keywords[0]) + "," + str(keywords[1]) + "\n") 119 | # 120 | # if(n%1000==0): 121 | # print('flush',n/1000) 122 | # fw.flush() 123 | # textrank() 124 | -------------------------------------------------------------------------------- /venv/dwb/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/5 上午11:17 4 | # @Author :hwwu 5 | # @File :merge.py 6 | 7 | import pandas as pd, numpy as np 8 | 9 | path = '/Users/liyangyang/Downloads/dwb/new_data/' 10 | 11 | def write_result(id, predictions): 12 | r_id = [] 13 | r_predictions = [] 14 | for i in range(len(id)): 15 | r_id.append(int(id[i])) 16 | r_predictions.append(int(predictions[i])) 17 | 18 | english_column = pd.Series(r_id, name='id') 19 | number_column = pd.Series(r_predictions, name='class') 20 | predictions = pd.concat([english_column, number_column], axis=1) 21 | predictions.to_csv(path + 'merge_result_data.csv', index=0, sep=',', columns=['id', 'class']) 22 | 23 | 24 | r75 = pd.read_csv(path+'result_data.csv')['class'] 25 | rcnn = pd.read_csv(path+'p4_cnn_result_data.csv')['class'] 26 | rrnn = pd.read_csv(path+'result_rnn.csv')['class'] 27 | 28 | id = [] 
29 | predictions =[] 30 | for i in range(len(r75)): 31 | id.append(i) 32 | if (rcnn[i]==rrnn[i]): 33 | predictions.append(rcnn[i]) 34 | else: 35 | predictions.append(r75[i]) 36 | 37 | write_result(id,predictions) 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /venv/dwb/par.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/9/5 下午2:15 4 | # @Author :hwwu 5 | # @File :par.py 6 | 7 | import pandas as pd, numpy as np 8 | 9 | # path = '/Users/liyangyang/Downloads/dwb/new_data/' 10 | path = '/Users/liyangyang/Downloads/bdci/' 11 | 12 | # train = pd.read_csv(path + 'train_set.csv')['word_seg'] 13 | 14 | from gensim.models import word2vec 15 | 16 | # train = np.load(path + 'vocab/vocab_train.npy') 17 | # test = np.load(path + 'vocab/vocab_test.npy') 18 | # 19 | # test = np.array(test) 20 | # 21 | # total = np.append(train, test, axis=0) 22 | # 23 | # t =[] 24 | # for i in range(len(train)): 25 | # row = train[i] 26 | # r = [] 27 | # for j in range(len(row)): 28 | # r.append(str(row[j])) 29 | # r = np.reshape(r,[len(row),1]) 30 | # t.append(r) 31 | # t = np.array(t) 32 | # print(train.shape) 33 | # 34 | # sentences = word2vec.PathLineSentences(path+'train_no_lable.txt') 35 | # model = word2vec.Word2Vec(sentences,size=128, min_count=1, iter=10,workers=10) 36 | # 37 | # model.save(path+'word2vec/model') 38 | # 39 | # 40 | model = word2vec.Word2Vec.load(path+'word2vec/model') 41 | # 42 | # print(train[0]) 43 | print(model.wv['系统']) 44 | 45 | similarities=model.wv.most_similar('系统',topn=20) 46 | 47 | for word , score in similarities: 48 | print(word , score) 49 | 50 | # y1 = model.similarity('1', '2') 51 | # print(y1) 52 | # 53 | # y2 = model.similarity('1', '3') 54 | # print(y2) 55 | 56 | # word2vec.word2vec('/Users/liyangyang/Downloads/word2vec-0.10.2/README.md',path+'word2vec/model.bin',size=128,iter_=10,threads=10,min_count=1) 57 | -------------------------------------------------------------------------------- /venv/dwb/testcnn/__pycache__/data_helpers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/testcnn/__pycache__/data_helpers.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/testcnn/__pycache__/text_cnn.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hwwu/deep_learning/8c37912069a06a58f80034fe1be7ba5fbc0865d4/venv/dwb/testcnn/__pycache__/text_cnn.cpython-36.pyc -------------------------------------------------------------------------------- /venv/dwb/testcnn/data_helpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/20 下午4:16 4 | # @Author :hwwu 5 | # @File :data_helpers.py 6 | 7 | import numpy as np 8 | import pandas as pd 9 | path = '/Users/liyangyang/Downloads/dwb/new_data/' 10 | column = "word_seg" 11 | import random 12 | 13 | 14 | stopword_path = '/Users/liyangyang/Downloads/stopwords/stopwords1893.txt' 15 | import jieba 16 | 17 | 18 | def stopwordslist(): 19 | # stopwords = [line.strip() for line in open(stopword_path, 'r', encoding='utf-8').readlines()] 20 | stopwords = [',', '。', '、', '...', '“', '”', '《', '》', ':', 
';'] 21 | return stopwords 22 | 23 | def dense_to_one_hot(labels_dense, num_classes): 24 | """Convert class labels from scalars to one-hot vectors.""" 25 | num_labels = labels_dense.shape[0] 26 | index_offset = np.arange(num_labels) * num_classes 27 | labels_one_hot = np.zeros((num_labels, num_classes)) 28 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 29 | return labels_one_hot 30 | 31 | def load_data_and_labels(data_file): 32 | """ 33 | Loads the labelled training data, segments the text with jieba and generates subject labels. 34 | Returns segmented sentences and labels. 35 | """ 36 | # Load data from files 37 | # text = pd.read_csv(path + data_file) 38 | # x_text = np.array(text[column]) 39 | # 40 | # # y=[] 41 | # # for i in range(len(text['class'])): 42 | # # print(text['class']) 43 | # # y = dense_to_one_hot(text['class'],19) 44 | # y = (text["class"]-1).astype(int) 45 | # y = np.array(text["id"]) 46 | 47 | train = pd.read_csv(path + 'train.csv') 48 | train = train.sample(frac=1).reset_index(drop=True) # shuffle rows; random.shuffle() cannot shuffle a DataFrame 49 | train_doc_list = [] 50 | for i in range(len(train)): 51 | sentence_seged = jieba.cut(train['content'][i].strip()) 52 | # sentence_seged = split_word(train['content'][i].strip()) 53 | stopwords = stopwordslist() 54 | outstr = '' 55 | for word in sentence_seged: 56 | if word not in stopwords: 57 | if (word != '\t') & (word.strip() != ''): 58 | outstr += word 59 | outstr += '\t' 60 | if (outstr == ''): 61 | outstr = 'NaN' 62 | train_doc_list.append(outstr) 63 | x_train = np.array(train_doc_list) 64 | 65 | train.loc[train['subject'] == '动力', 'subject'] = 0 66 | train.loc[train['subject'] == '价格', 'subject'] = 1 67 | train.loc[train['subject'] == '内饰', 'subject'] = 2 68 | train.loc[train['subject'] == '配置', 'subject'] = 3 69 | train.loc[train['subject'] == '安全性', 'subject'] = 4 70 | train.loc[train['subject'] == '外观', 'subject'] = 5 71 | train.loc[train['subject'] == '操控', 'subject'] = 6 72 | train.loc[train['subject'] == '油耗', 'subject'] = 7 73 | train.loc[train['subject'] == '空间', 'subject'] = 8 74 | train.loc[train['subject'] == '舒适性', 'subject'] = 9 75 | y_train = train['subject'].astype(int) # note: the TextCNN train.py feeds input_y of shape [None, num_classes], so apply dense_to_one_hot(y_train.values, 10) before using it there 76 | return x_train, y_train 77 | 78 | def load_dev_data_and_labels(data_file): 79 | """ 80 | Loads the full train_set.csv for evaluation. 81 | Returns raw texts and class labels. 82 | """ 83 | # Load data from files 84 | text = pd.read_csv(path + 'train_set.csv') 85 | x_text = np.array(text[column]) 86 | # y=[] 87 | # for i in range(len(text['class'])): 88 | # print(text['class']) 89 | y = np.array(text['class']) 90 | return x_text, y 91 | 92 | 93 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 94 | """ 95 | Generates a batch iterator for a dataset.
96 | """ 97 | data = np.array(data) 98 | data_size = len(data) 99 | num_batches_per_epoch = int((len(data)-1)/batch_size) + 1 100 | for epoch in range(num_epochs): 101 | # Shuffle the data at each epoch 102 | if shuffle: 103 | shuffle_indices = np.random.permutation(np.arange(data_size)) 104 | shuffled_data = data[shuffle_indices] 105 | else: 106 | shuffled_data = data 107 | for batch_num in range(num_batches_per_epoch): 108 | start_index = batch_num * batch_size 109 | end_index = min((batch_num + 1) * batch_size, data_size) 110 | yield shuffled_data[start_index:end_index] 111 | 112 | from tensorflow.contrib import learn 113 | 114 | def test(): 115 | x_text, y = load_data_and_labels('train_set.csv') 116 | vocab_processor = learn.preprocessing.VocabularyProcessor(2000,min_frequency=3) 117 | x = np.array(list(vocab_processor.fit_transform(x_text))) 118 | 119 | vocab_size = len(vocab_processor.vocabulary_) 120 | print(vocab_size) 121 | 122 | # test() 123 | 124 | -------------------------------------------------------------------------------- /venv/dwb/testcnn/eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/20 下午4:17 4 | # @Author :hwwu 5 | # @File :eval.py 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import os 10 | import time 11 | import datetime 12 | import sys 13 | sys.path.append("/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn") 14 | import data_helpers 15 | from tensorflow.contrib import learn 16 | import csv 17 | 18 | # Parameters 19 | # ================================================== 20 | 21 | # Data Parameters 22 | tf.flags.DEFINE_string("data_file", "train_set.csv", "Data source for the positive data.") 23 | 24 | # Eval Parameters 25 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 26 | tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run") 27 | tf.flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data") 28 | 29 | # Misc Parameters 30 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 31 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 32 | 33 | 34 | FLAGS = tf.flags.FLAGS 35 | FLAGS._parse_flags() 36 | print("\nParameters:") 37 | for attr, value in sorted(FLAGS.__flags.items()): 38 | print("{}={}".format(attr.upper(), value)) 39 | print("") 40 | 41 | # CHANGE THIS: Load data. 
Load your own data here 42 | if FLAGS.eval_train: 43 | x_raw, y_test = data_helpers.load_dev_data_and_labels(FLAGS.data_file) 44 | x_raw= x_raw[10000:12000] 45 | y_test= y_test[10000:12000] 46 | # y_test = np.argmax(y_test, axis=1) 47 | else: 48 | x_raw = ["a masterpiece four years in the making", "everything is off."] 49 | y_test = [1, 0] 50 | 51 | # Map data into vocabulary 52 | vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab") 53 | vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path) 54 | x_test = np.array(list(vocab_processor.transform(x_raw))) 55 | 56 | print("\nEvaluating...\n") 57 | 58 | # Evaluation 59 | # ================================================== 60 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 61 | graph = tf.Graph() 62 | with graph.as_default(): 63 | session_conf = tf.ConfigProto( 64 | allow_soft_placement=FLAGS.allow_soft_placement, 65 | log_device_placement=FLAGS.log_device_placement) 66 | sess = tf.Session(config=session_conf) 67 | with sess.as_default(): 68 | # Load the saved meta graph and restore variables 69 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 70 | saver.restore(sess, checkpoint_file) 71 | 72 | # Get the placeholders from the graph by name 73 | input_x = graph.get_operation_by_name("input_x").outputs[0] 74 | # input_y = graph.get_operation_by_name("input_y").outputs[0] 75 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 76 | 77 | # Tensors we want to evaluate 78 | predictions = graph.get_operation_by_name("output/predictions").outputs[0] 79 | 80 | # Generate batches for one epoch 81 | batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False) 82 | 83 | # Collect the predictions here 84 | all_predictions = [] 85 | 86 | for x_test_batch in batches: 87 | batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0}) 88 | all_predictions = np.concatenate([all_predictions, batch_predictions]) 89 | 90 | # Print accuracy if y_test is defined 91 | if y_test is not None: 92 | correct_predictions = float(sum(all_predictions == y_test)) 93 | print("Total number of test examples: {}".format(len(y_test))) 94 | print("Accuracy: {:g}".format(correct_predictions/float(len(y_test)))) 95 | 96 | # Save the evaluation to a csv 97 | predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions)) 98 | out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv") 99 | print("Saving evaluation to {0}".format(out_path)) 100 | with open(out_path, 'w') as f: 101 | csv.writer(f).writerows(predictions_human_readable) -------------------------------------------------------------------------------- /venv/dwb/testcnn/text_cnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/20 下午4:09 4 | # @Author :hwwu 5 | # @File :text_cnn.py 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | 11 | class TextCNN(object): 12 | """ 13 | A CNN for text classification. 14 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 
15 | """ 16 | def __init__( 17 | self, sequence_length, num_classes, vocab_size, 18 | embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0): 19 | 20 | # Placeholders for input, output and dropout 21 | self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x") 22 | self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") 23 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 24 | 25 | # Keeping track of l2 regularization loss (optional) 26 | l2_loss = tf.constant(0.0) 27 | 28 | # Embedding layer 29 | with tf.device('/cpu:0'), tf.name_scope("embedding"): 30 | self.W = tf.Variable( 31 | tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), 32 | name="W") 33 | self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) 34 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) 35 | 36 | # Create a convolution + maxpool layer for each filter size 37 | pooled_outputs = [] 38 | for i, filter_size in enumerate(filter_sizes): 39 | with tf.name_scope("conv-maxpool-%s" % filter_size): 40 | # Convolution Layer 41 | filter_shape = [filter_size, embedding_size, 1, num_filters] 42 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 43 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b") 44 | conv = tf.nn.conv2d( 45 | self.embedded_chars_expanded, 46 | W, 47 | strides=[1, 1, 1, 1], 48 | padding="VALID", 49 | name="conv") 50 | # Apply nonlinearity 51 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 52 | # Maxpooling over the outputs 53 | pooled = tf.nn.max_pool( 54 | h, 55 | ksize=[1, sequence_length - filter_size + 1, 1, 1], 56 | strides=[1, 1, 1, 1], 57 | padding='VALID', 58 | name="pool") 59 | pooled_outputs.append(pooled) 60 | 61 | # Combine all the pooled features 62 | num_filters_total = num_filters * len(filter_sizes) 63 | self.h_pool = tf.concat(pooled_outputs, 3) 64 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total]) 65 | 66 | # Add dropout 67 | with tf.name_scope("dropout"): 68 | self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob) 69 | 70 | # Final (unnormalized) scores and predictions 71 | with tf.name_scope("output"): 72 | W = tf.get_variable( 73 | "W", 74 | shape=[num_filters_total, num_classes], 75 | initializer=tf.contrib.layers.xavier_initializer()) 76 | b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b") 77 | l2_loss += tf.nn.l2_loss(W) 78 | l2_loss += tf.nn.l2_loss(b) 79 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") 80 | self.predictions = tf.argmax(self.scores, 1, name="predictions") 81 | 82 | # Calculate mean cross-entropy loss 83 | with tf.name_scope("loss"): 84 | losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y) 85 | self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss 86 | 87 | # Accuracy 88 | with tf.name_scope("accuracy"): 89 | correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1)) 90 | self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy") -------------------------------------------------------------------------------- /venv/dwb/testcnn/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/8/20 下午4:12 4 | # @Author :hwwu 5 | # @File :train.py 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import os 10 | import time 11 | import datetime 12 | 
import sys 13 | sys.path.append("/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn") 14 | import data_helpers 15 | from text_cnn import TextCNN 16 | from tensorflow.contrib import learn 17 | 18 | # Parameters 19 | # ================================================== 20 | 21 | # Data loading params 22 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 23 | tf.flags.DEFINE_string("data_file", "train_set.csv", "Data source for the positive data.") 24 | 25 | # Model Hyperparameters 26 | tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)") 27 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')") 28 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 29 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 30 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)") 31 | 32 | # Training parameters 33 | tf.flags.DEFINE_integer("batch_size", 128, "Batch Size (default: 64)") 34 | tf.flags.DEFINE_integer("num_epochs", 2000, "Number of training epochs (default: 200)") 35 | tf.flags.DEFINE_integer("evaluate_every", 10, "Evaluate model on dev set after this many steps (default: 100)") 36 | tf.flags.DEFINE_integer("checkpoint_every", 10, "Save model after this many steps (default: 100)") 37 | tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)") 38 | # Misc Parameters 39 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 40 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 41 | 42 | FLAGS = tf.flags.FLAGS 43 | # FLAGS._parse_flags() 44 | # print("\nParameters:") 45 | # for attr, value in sorted(FLAGS.__flags.items()): 46 | # print("{}={}".format(attr.upper(), value)) 47 | # print("") 48 | 49 | def preprocess(): 50 | # Data Preparation 51 | # ================================================== 52 | print("Loading data...") 53 | x_text, y = data_helpers.load_data_and_labels(FLAGS.data_file) 54 | vocab_processor = learn.preprocessing.VocabularyProcessor(100) 55 | x = np.array(list(vocab_processor.fit_transform(x_text))) 56 | 57 | x_train = x[:8000] 58 | x_dev = x[8000:] 59 | y_train = y[:8000] 60 | y_dev = y[8000:] 61 | 62 | print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_))) 63 | print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev))) 64 | return x_train, y_train, vocab_processor, x_dev, y_dev 65 | 66 | def train(x_train, y_train, vocab_processor, x_dev, y_dev): 67 | # Training 68 | # ================================================== 69 | 70 | with tf.Graph().as_default(): 71 | session_conf = tf.ConfigProto( 72 | allow_soft_placement=FLAGS.allow_soft_placement, 73 | log_device_placement=FLAGS.log_device_placement) 74 | sess = tf.Session(config=session_conf) 75 | with sess.as_default(): 76 | cnn = TextCNN( 77 | sequence_length=x_train.shape[1], 78 | num_classes=y_train.shape[1], 79 | vocab_size=len(vocab_processor.vocabulary_), 80 | embedding_size=FLAGS.embedding_dim, 81 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 82 | num_filters=FLAGS.num_filters, 83 | l2_reg_lambda=FLAGS.l2_reg_lambda) 84 | 85 | # Define Training procedure 86 | global_step = tf.Variable(0, name="global_step", trainable=False) 87 | optimizer = tf.train.AdamOptimizer(1e-3) 88 | grads_and_vars = 
optimizer.compute_gradients(cnn.loss) 89 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 90 | 91 | # Keep track of gradient values and sparsity (optional) 92 | grad_summaries = [] 93 | for g, v in grads_and_vars: 94 | if g is not None: 95 | grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) 96 | sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g)) 97 | grad_summaries.append(grad_hist_summary) 98 | grad_summaries.append(sparsity_summary) 99 | grad_summaries_merged = tf.summary.merge(grad_summaries) 100 | 101 | # Output directory for models and summaries 102 | timestamp = str(int(time.time())) 103 | out_dir = os.path.abspath(os.path.join('/Users/liyangyang/PycharmProjects/mypy/venv/dwb/testcnn/model/', "runs", timestamp)) 104 | print("Writing to {}\n".format(out_dir)) 105 | 106 | # Summaries for loss and accuracy 107 | loss_summary = tf.summary.scalar("loss", cnn.loss) 108 | acc_summary = tf.summary.scalar("accuracy", cnn.accuracy) 109 | 110 | # Train Summaries 111 | train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged]) 112 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 113 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 114 | 115 | # Dev summaries 116 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary]) 117 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 118 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) 119 | 120 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it 121 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 122 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 123 | if not os.path.exists(checkpoint_dir): 124 | os.makedirs(checkpoint_dir) 125 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) 126 | 127 | # Write vocabulary 128 | vocab_processor.save(os.path.join(out_dir, "vocab")) 129 | 130 | # Initialize all variables 131 | sess.run(tf.global_variables_initializer()) 132 | 133 | def train_step(x_batch, y_batch): 134 | """ 135 | A single training step 136 | """ 137 | feed_dict = { 138 | cnn.input_x: x_batch, 139 | cnn.input_y: y_batch, 140 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob 141 | } 142 | _, step, summaries, loss, accuracy = sess.run( 143 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], 144 | feed_dict) 145 | time_str = datetime.datetime.now().isoformat() 146 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 147 | train_summary_writer.add_summary(summaries, step) 148 | 149 | def dev_step(x_batch, y_batch, writer=None): 150 | """ 151 | Evaluates model on a dev set 152 | """ 153 | feed_dict = { 154 | cnn.input_x: x_batch, 155 | cnn.input_y: y_batch, 156 | cnn.dropout_keep_prob: 1.0 157 | } 158 | step, summaries, loss, accuracy = sess.run( 159 | [global_step, dev_summary_op, cnn.loss, cnn.accuracy], 160 | feed_dict) 161 | time_str = datetime.datetime.now().isoformat() 162 | # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy)) 163 | print('loss',loss) 164 | print('accuracy',accuracy) 165 | if writer: 166 | writer.add_summary(summaries, step) 167 | 168 | # Generate batches 169 | batches = data_helpers.batch_iter( 170 | list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs) 171 | # Training loop. For each batch... 
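# Each `batch` yielded by batch_iter is a list of (x, y) pairs, so zip(*batch) below
# transposes it into one tuple of inputs and one tuple of labels, e.g.
#   zip(*[(x1, y1), (x2, y2)])  ->  (x1, x2), (y1, y2)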
172 | for batch in batches: 173 | x_batch, y_batch = zip(*batch) 174 | train_step(x_batch, y_batch) 175 | current_step = tf.train.global_step(sess, global_step) 176 | # if current_step % FLAGS.evaluate_every == 0: 177 | # print("\nEvaluation:") 178 | # dev_step(x_dev, y_dev, writer=dev_summary_writer) 179 | # print("..") 180 | if current_step % FLAGS.checkpoint_every == 0: 181 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 182 | print("Saved model checkpoint to {}\n".format(path)) 183 | 184 | def main(argv=None): 185 | x_train, y_train, vocab_processor, x_dev, y_dev = preprocess() 186 | train(x_train, y_train, vocab_processor, x_dev, y_dev) 187 | 188 | if __name__ == '__main__': 189 | tf.app.run() -------------------------------------------------------------------------------- /venv/pachong/iqiyi/fcxd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/11/30 下午1:43 4 | # @Author :hwwu 5 | # @File :fcxd.py 6 | 7 | import requests 8 | import re 9 | from bs4 import BeautifulSoup 10 | import os 11 | import shutil 12 | import json 13 | from urllib import parse 14 | 15 | # https://2wk.com/vip.php?url= 16 | # 这个网址能解析的视频都可以通过这个下载 17 | 18 | headers = { 19 | # 'Access-Control-Allow-Credentials': 'true', 20 | # 'Cache-Control': 'max-age=900', 21 | # 'Content-Encoding': 'gzip', 22 | # 'Content-Language': 'zh-CN', 23 | # 'Content-Type': 'text/html; charset=UTF-8', 24 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36', 25 | # 'Upgrade-Insecure-Requests': '1' 26 | } 27 | 28 | y_url2 = 'http://www.iqiyi.com/lib/m_209926914.html?src=search' 29 | y_target2 = requests.get(url=y_url2).text 30 | y_soup2 = BeautifulSoup(y_target2, 'html.parser') 31 | y_returnsoup2 = y_soup2.find_all('div', attrs={'class': 'site-piclist_pic'}) 32 | 33 | # 用正则表达式获取剧集链接 34 | y_result2 = re.findall('(?<=href=\").*?(?=\")', str(y_returnsoup2)) 35 | # 用正则表达式获取剧集名称 36 | title2 = re.findall('(?<=title=\").*?(?=\">)', str(y_returnsoup2)) 37 | j = len(title2) 38 | # 输出爬取结果 39 | for i in range(2, j - 2): 40 | str1 = '第' + str(i + 1) + '集' 41 | print(y_result2[i]) 42 | print(str1, title2[i]) 43 | xm_url = 'http://aikan-tv.com/?url=' + y_result2[i] 44 | req = requests.get(xm_url, headers=headers) 45 | 46 | soup1 = BeautifulSoup(req.text, 'html.parser') 47 | returnsoup1 = soup1.find_all('iframe') 48 | result1 = re.findall('(?<=src=\").*?(?=\")', str(returnsoup1)) 49 | 50 | req = requests.get(result1[0], headers=headers) 51 | req_json = re.findall('"api1.php", (.+),', req.text)[0] 52 | info = json.loads(req_json) 53 | 54 | data = {'time': info['time'], 'key': info['key'], 55 | 'url': info['url'], 'type': info['type'], 'referer': info['referer']} 56 | req = requests.post('https://yun.odflv.com/odflv2/api1.php', headers=headers, data=data) 57 | info = json.loads(req.text) 58 | url = info['url'] 59 | 60 | url1 = parse.unquote(url) 61 | 62 | req = requests.get(url1, headers=headers) 63 | result2 = re.findall('(?<=/).*?(?=.m3u8)', str(req.text)) 64 | 65 | req = requests.get('https://acfun.iqiyi-kuyun.com/' + result2[0] + '.m3u8', headers=headers) 66 | text = req.text 67 | tl = text.split('\n') 68 | new_index = [] 69 | for l in tl: 70 | if l.find('.ts') > 0: 71 | new_index.append(l) 72 | print(len(new_index), new_index) 73 | file_path = '/Users/liyangyang/Downloads/pachong/iqiyi/mid/fcxd/' + str1 74 | os.makedirs(file_path, exist_ok=True) 75 | 
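# The loop below fetches every HLS ".ts" segment listed in the m3u8 playlist into its own
# numbered file; the per-episode segments are then concatenated with a shell `cat` call into a
# single .ts (see the os.system call further down). A minimal sketch of the same idea that
# streams straight into one output file (hypothetical names `segment_urls` and `out_path`,
# not part of this script):
#
#   with open(out_path, 'wb') as out:
#       for seg_url in segment_urls:
#           out.write(requests.get(seg_url, headers=headers).content)
#
# Writing one file directly would avoid the temporary per-segment files and the `cat` step.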
for ii, ni in enumerate(new_index): 76 | url = 'https://acfun.iqiyi-kuyun.com' + ni 77 | r = requests.get(url, headers=headers) 78 | # print(url) 79 | content_length = int(r.headers['Content-Length']) 80 | path = file_path + '/' + str(ii) + '.ts' 81 | with open(path, 'ab') as file: 82 | file.write(r.content) 83 | file.flush() 84 | print(ni, 'receive data,file size : %d' % (content_length)) 85 | 86 | new_path = '/Users/liyangyang/Downloads/pachong/iqiyi/result/fcxd/' + str1 87 | os.makedirs(new_path, exist_ok=True) 88 | exec_str = "cat " + file_path + '/*.ts > ' + new_path + '/' + title2[i] + '.ts' 89 | print(exec_str) 90 | os.system(exec_str) 91 | shutil.rmtree(file_path) 92 | # sec_str = 'ffmpeg -y -i ' + new_path + '/new.ts -c:v libx264 -c:a copy -bsf:a aac_adtstoasc ' + new_path + '/new.mp4' 93 | # print(sec_str) 94 | # os.system(sec_str) 95 | -------------------------------------------------------------------------------- /venv/pachong/iqiyi/xiangmicc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/11/30 下午1:43 4 | # @Author :hwwu 5 | # @File :xiangmicc.py 6 | 7 | import requests 8 | import re 9 | from bs4 import BeautifulSoup 10 | import os 11 | import shutil 12 | headers = { 13 | 'Access-Control-Allow-Credentials': 'true', 14 | 'Cache-Control': 'max-age=900', 15 | 'Content-Encoding': 'gzip', 16 | 'Content-Language': 'zh-CN', 17 | 'Content-Type': 'text/html; charset=UTF-8', 18 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36', 19 | 'Upgrade-Insecure-Requests': '1' 20 | } 21 | 22 | y_url2 = 'http://www.iqiyi.com/a_19rrh9km3x.html#vfrm=2-4-0-1' 23 | y_target2 = requests.get(url=y_url2).text 24 | y_soup2 = BeautifulSoup(y_target2, 'html.parser') 25 | y_returnsoup2 = y_soup2.find_all('div', attrs={'class': 'site-piclist_pic'}) 26 | 27 | # use a regex to get the episode links 28 | y_result2 = re.findall('(?<=href=\").*?(?=\")', str(y_returnsoup2)) 29 | # use a regex to get the episode titles 30 | title2 = re.findall('(?<=title=\").*?(?=\">)', str(y_returnsoup2)) 31 | j = len(title2) 32 | # print the crawl results 33 | for i in range(7, j - 2): 34 | str1 = '第' + str(i) + '集' 35 | print(y_result2[i]) 36 | print(str1, title2[i]) 37 | xm_url = 'https://2wk.com/vip.php?url=' + y_result2[i] 38 | req = requests.get(xm_url, headers=headers) 39 | soup1 = BeautifulSoup(req.text, 'html.parser') 40 | returnsoup1 = soup1.find_all('iframe') 41 | result1 = re.findall('(?<=src=\").*?(?=\")', str(returnsoup1)) 42 | 43 | req = requests.get(result1[0], headers=headers) 44 | result2 = re.findall('(?<=src=\').*?(?=\')', str(req.text)) 45 | 46 | req = requests.get('https:' + result2[0], headers=headers) 47 | soup2 = BeautifulSoup(req.text, 'html.parser') 48 | returnsoup2 = soup2.find_all('title') 49 | title = re.findall('(?<=<title>).*?(?=</title>)', str(req.text)) 50 | body = soup2.find_all('body') 51 | m3u8Url = re.findall('(?<=m3u8Url = \").*?(?=\")', str(req.text)) 52 | 53 | www = re.findall('(?<=//).*?(?=/)', str(result2[0])) 54 | index = 'https://' + www[0] + m3u8Url[0] 55 | 56 | index = index.replace('index.m3u8', '1000kb/hls/index.m3u8') 57 | req = requests.get(index, headers=headers) 58 | text = req.text 59 | tl = text.split('\n') 60 | new_index = [] 61 | for l in tl: 62 | if l.find('.ts') > 0: 63 | new_index.append(l) 64 | 65 | file_path = '/Users/liyangyang/Downloads/pachong/iqiyi/mid/xmcc/' + str1 66 | os.makedirs(file_path, exist_ok=True) 67 | for ni in new_index: 68 | url = index.replace('index.m3u8', ni)
69 | r = requests.get(url, headers=headers) 70 | # print(url) 71 | # content_length = int(r.headers['Content-Length']) 72 | path = file_path + '/' + ni 73 | with open(path, 'ab') as file: 74 | file.write(r.content) 75 | file.flush() 76 | # print(ni, 'receive data,file size : %d total size:%d' % (os.path.getsize(path), content_length)) 77 | 78 | new_path = '/Users/liyangyang/Downloads/pachong/iqiyi/result/xmcc/' + str1 79 | os.makedirs(new_path, exist_ok=True) 80 | exec_str = "cat " + file_path + '/*.ts > ' + new_path + '/' + title2[i] + '.ts' 81 | print(exec_str) 82 | os.system(exec_str) 83 | shutil.rmtree(file_path)  # remove the temp segment dir; os.defpath is a string constant, not a function 84 | # sec_str = 'ffmpeg -y -i ' + new_path + '/new.ts -c:v libx264 -c:a copy -bsf:a aac_adtstoasc ' + new_path + '/new.mp4' 85 | # print(sec_str) 86 | # os.system(sec_str) 87 | -------------------------------------------------------------------------------- /venv/regress_baseline/cwd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*-coding:utf8 -*- 3 | # @TIME :2018/10/10 下午5:32 4 | # @Author :hwwu 5 | # @File :cwd.py 6 | import os 7 | 8 | print(os.getcwd()) --------------------------------------------------------------------------------