├── Base ├── books │ ├── JTZHBC │ │ ├── 2.recommendation │ │ │ └── recommendations.py │ │ ├── 3.discovery │ │ │ ├── 3test.py │ │ │ ├── clusters.py │ │ │ └── generatefeedvector.py │ │ ├── 4.searchengine │ │ │ ├── nn.py │ │ │ └── searchengine.py │ │ ├── 5.optimization │ │ │ ├── dorm.py │ │ │ ├── optimization.py │ │ │ ├── socialnetwork.py │ │ │ └── test.py │ │ └── readme.md │ └── ML_in_action │ │ ├── AdaBoost │ │ ├── adaboost.py │ │ └── test.py │ │ ├── Bayes │ │ ├── bayes.py │ │ └── test.py │ │ ├── DecisionTree │ │ ├── test.py │ │ ├── treePlotter.py │ │ └── trees.py │ │ ├── Logistic │ │ ├── logRegres.py │ │ ├── test.py │ │ └── testSet.txt │ │ ├── Regression │ │ ├── regression.py │ │ └── test.py │ │ ├── SVM │ │ ├── svm.py │ │ └── test.py │ │ ├── kMeans │ │ ├── kMeans.py │ │ └── test.py │ │ ├── kNN │ │ ├── kNN.py │ │ └── test.py │ │ └── readme.md ├── challenge │ ├── AIchallenge │ │ ├── AI_pic2text_pre.ipynb │ │ ├── AI_pic2text_pre2.ipynb │ │ └── AI_pic2text_pre3.ipynb │ ├── DataFountain │ │ └── writing_classify │ │ │ └── pre_1.ipynb │ ├── biendata │ │ └── mobike │ │ │ ├── example_learn_mobike.ipynb │ │ │ └── mobike_mine.ipynb │ ├── kaggle │ │ ├── HousePrices │ │ │ ├── House Prices.ipynb │ │ │ ├── predictions.csv │ │ │ ├── test.csv │ │ │ └── train.csv │ │ ├── Titanic │ │ │ ├── Titanic.ipynb │ │ │ ├── pre.csv │ │ │ ├── test.csv │ │ │ └── train.csv │ │ ├── readme.md │ │ └── searchrelevance.pdf │ └── tianchi │ │ └── shop_location │ │ ├── baseline1.ipynb │ │ ├── baseline2.ipynb │ │ └── shop_pre1.ipynb ├── courses │ ├── DL_AndrewNg │ │ ├── .DS_Store │ │ ├── README.md │ │ ├── course1 │ │ │ ├── BuildingDNNv3.ipynb │ │ │ ├── DNNApplicationv3.ipynb │ │ │ ├── LRwithNN.ipynb │ │ │ └── week3NN.ipynb │ │ ├── course2 │ │ │ ├── GradientChecking.ipynb │ │ │ ├── Initialization.ipynb │ │ │ ├── OptimizationMethods.ipynb │ │ │ ├── Regularization.ipynb │ │ │ └── TensorflowTutorial.ipynb │ │ ├── course4 │ │ │ ├── .DS_Store │ │ │ ├── ArtGenerationwithNeuralStyleTransfer2.ipynb │ │ │ ├── 
AutonomousdrivingapplicationCardetectionv3.ipynb │ │ │ ├── ConvolutionmodelApplicationv1.ipynb │ │ │ ├── ConvolutionmodelStepbyStepv2.ipynb │ │ │ ├── FaceRecognitionfortheHappyHousev3.ipynb │ │ │ ├── KerasTutorialHappyHousev2.ipynb │ │ │ └── ResidualNetworksv2.ipynb │ │ └── course5 │ │ │ ├── .DS_Store │ │ │ ├── BuildingaRecurrentNeuralNetworkStepbyStepv3.ipynb │ │ │ ├── DinosaurusIslandCharacterlevellanguagemodelfinalv3.ipynb │ │ │ ├── Emojifyv2.ipynb │ │ │ ├── ImproviseaJazzSolowithanLSTMNetworkv3.ipynb │ │ │ ├── Neuralmachinetranslationwithattentionv3.ipynb │ │ │ ├── Operationsonwordvectorsv2.ipynb │ │ │ ├── Triggerworddetectionv1.ipynb │ │ │ └── rnn_utils.py │ ├── coursera_ML │ │ ├── ex1_liner │ │ │ ├── computeCost.m │ │ │ ├── ex1.m │ │ │ └── gradientDescent.m │ │ ├── ex2_logistc │ │ │ ├── costFunction.m │ │ │ ├── costFunctionReg.m │ │ │ ├── ex2.m │ │ │ ├── ex2_reg.m │ │ │ ├── plotData.m │ │ │ ├── predict.m │ │ │ └── sigmoid.m │ │ ├── ex3_nn │ │ │ ├── lrCostFunction.m │ │ │ ├── oneVsAll.m │ │ │ ├── predict.m │ │ │ └── predictOneVsAll.m │ │ └── readme.md │ ├── cs231n │ │ ├── README.md │ │ ├── assignment1 │ │ │ ├── .gitignore │ │ │ ├── .ipynb_checkpoints │ │ │ │ ├── features-checkpoint.ipynb │ │ │ │ ├── knn-checkpoint.ipynb │ │ │ │ ├── softmax-checkpoint.ipynb │ │ │ │ ├── svm-checkpoint.ipynb │ │ │ │ └── two_layer_net-checkpoint.ipynb │ │ │ ├── README.md │ │ │ ├── collectSubmission.sh │ │ │ ├── cs231n │ │ │ │ ├── __init__.py │ │ │ │ ├── classifiers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── k_nearest_neighbor.py │ │ │ │ │ ├── linear_classifier.py │ │ │ │ │ ├── linear_svm.py │ │ │ │ │ ├── neural_net.py │ │ │ │ │ └── softmax.py │ │ │ │ ├── data_utils.py │ │ │ │ ├── datasets │ │ │ │ │ ├── .gitignore │ │ │ │ │ └── get_datasets.sh │ │ │ │ ├── features.py │ │ │ │ ├── gradient_check.py │ │ │ │ └── vis_utils.py │ │ │ ├── features.ipynb │ │ │ ├── frameworkpython │ │ │ ├── knn.ipynb │ │ │ ├── setup_googlecloud.sh │ │ │ ├── softmax.ipynb │ │ │ ├── start_ipython_osx.sh │ │ │ ├── 
svm.ipynb │ │ │ └── two_layer_net.ipynb │ │ └── note │ │ │ └── inverted_dropout.py │ └── qiyuezaixian │ │ ├── .DS_Store │ │ ├── README.md │ │ ├── course1 │ │ ├── PythonRegEx.html │ │ ├── jieba.html │ │ └── string_operation.ipynb │ │ ├── course2 │ │ ├── Ngram.html │ │ ├── bayes.html │ │ ├── bayesClassfierNews.html │ │ ├── bayesDetector.html │ │ └── stopwords_cn.txt │ │ ├── course3 │ │ └── HillaryEmail.ipynb │ │ └── course5 │ │ └── HMM_POS_TAG.html ├── frameworks │ ├── caffe │ │ ├── code │ │ │ ├── ssd_detection_output_layer.py │ │ │ └── test_ssd.py │ │ ├── docs │ │ │ └── ubuntu18_anaconda3_py27_cpu_COMPILE_CAFFE.md │ │ ├── project │ │ │ └── caffe_ssd_write_layer │ │ │ │ ├── 3.jpg │ │ │ │ ├── caffe_ssd_deploy.prototxt │ │ │ │ ├── caffe_ssd_deploy2.prototxt │ │ │ │ ├── dog_bike_car.jpg │ │ │ │ ├── img │ │ │ │ └── two_faces_300.jpg │ │ │ │ ├── nnie_ssd_deploy.prototxt │ │ │ │ ├── res.jpg │ │ │ │ ├── res222.jpg │ │ │ │ ├── test_caffe_ssd.py │ │ │ │ ├── test_img.jpg │ │ │ │ ├── test_img2.jpg │ │ │ │ ├── test_img3.jpg │ │ │ │ ├── test_ssd_concat.py │ │ │ │ ├── test_ssd_detection_output.py │ │ │ │ ├── test_ssd_priorbox.py │ │ │ │ ├── test_ssd_priorbox_originSSD.py │ │ │ │ ├── two_faces_300.jpg │ │ │ │ ├── yufacedetectnet-open-v1-concat.prototxt │ │ │ │ ├── yufacedetectnet-open-v1-detection_output.caffemodel │ │ │ │ ├── yufacedetectnet-open-v1-detection_output.prototxt │ │ │ │ ├── yufacedetectnet-open-v1-priorbox.prototxt │ │ │ │ ├── yufacedetectnet-open-v1.caffemodel │ │ │ │ ├── yufacedetectnet-open-v1.prototxt │ │ │ │ ├── yufacedetectnet-open-v1_my.caffemodel │ │ │ │ ├── yufacedetectnet-open-v1_my.prototxt │ │ │ │ ├── yufacedetectnet-open-v1_new.caffemodel │ │ │ │ └── yufacedetectnet-open-v1_new.prototxt │ │ └── readme.md │ ├── keras │ │ ├── .DS_Store │ │ ├── baseline │ │ │ ├── main.py │ │ │ ├── my_data.py │ │ │ └── my_model.py │ │ ├── data │ │ │ ├── 0_0.png │ │ │ └── 2_100.png │ │ ├── demo │ │ │ ├── .DS_Store │ │ │ ├── Keras_GAN.ipynb │ │ │ ├── RNN_classify.ipynb │ │ │ 
├── Word_Language_Modelling_LSTM.ipynb │ │ │ ├── cam_heatmap.py │ │ │ ├── classify_focal_loss.py │ │ │ ├── clearData.py │ │ │ ├── data_aug.py │ │ │ ├── data_generator.py │ │ │ ├── fmeasure_metric.py │ │ │ ├── h5_customer_to_tflite.py │ │ │ ├── h5_to_ckpt.py │ │ │ ├── h5_to_pb.py │ │ │ ├── h5_to_tflite.py │ │ │ ├── keras_cifar10.ipynb │ │ │ ├── keras_mnist.ipynb │ │ │ ├── keras_net.py │ │ │ ├── layer_trainable.py │ │ │ ├── lstm_word_embedding.ipynb │ │ │ ├── multi_output_class_weight.py │ │ │ ├── pretrain.py │ │ │ ├── show_keras_data.py │ │ │ └── tflite_pre.py │ │ ├── keras_example.ipynb │ │ ├── note │ │ │ ├── .DS_Store │ │ │ └── keras_multiGPU.md │ │ ├── project │ │ │ ├── .DS_Store │ │ │ ├── 3D_predict.py │ │ │ ├── Caipiao_nn.ipynb │ │ │ ├── history3D.txt │ │ │ └── plate_color.ipynb │ │ └── readme.md │ ├── mxnet │ │ └── load_pre_demo.py │ ├── pytorch │ │ ├── IOU_balanced.py │ │ ├── IoU_loss.py │ │ ├── demo │ │ │ ├── CEloss.py │ │ │ ├── onnx_pre.py │ │ │ └── show_pth_data.py │ │ ├── practice │ │ │ ├── 60分钟入门PyTorch-0.目录.ipynb │ │ │ ├── 60分钟入门PyTorch-1.PyTorch是什么?.ipynb │ │ │ ├── 60分钟入门PyTorch-2.AUTOGRAD.ipynb │ │ │ ├── 60分钟入门PyTorch-3.神经网络.ipynb │ │ │ ├── 60分钟入门PyTorch-4.训练一个分类器.ipynb │ │ │ ├── 60分钟入门PyTorch-5.数据并行.ipynb │ │ │ ├── gan_pytorch.py │ │ │ ├── mnist_demo.py │ │ │ ├── pytorch_example.ipynb │ │ │ └── pytorch_lstm.ipynb │ │ └── readme.md │ └── tensorflow │ │ ├── .DS_Store │ │ ├── basic │ │ ├── .DS_Store │ │ ├── Learn_tf.ipynb │ │ ├── TFLiteModelMaker │ │ │ ├── README.md │ │ │ └── train.py │ │ ├── TensorFlowExample.ipynb │ │ ├── ckpt2pb.py │ │ ├── ckpt_pre.py │ │ ├── onnx_pre.py │ │ ├── pb2tflite.py │ │ ├── pruned_demo.py │ │ ├── read_pb.py │ │ ├── tf_pb_pre.py │ │ ├── tf_save_load.ipynb │ │ ├── tflite_pre.py │ │ └── tflite_show_middle_output.py │ │ ├── demo │ │ ├── .DS_Store │ │ ├── TF_logsitic.ipynb │ │ ├── basic_mnist_demo.py │ │ ├── mnist_cnn_demo.py │ │ └── ten_people_face_reconize │ │ │ ├── .DS_Store │ │ │ ├── main.py │ │ │ ├── model │ │ │ └── 
.DS_Store │ │ │ ├── olivettifaces.gif │ │ │ └── result.png │ │ └── readme.md └── tools │ ├── lightgbm │ ├── readme.md │ ├── simpleexample.py │ └── sklearnexample.py │ ├── scikit-learn │ ├── .DS_Store │ ├── README.md │ ├── choose.png │ ├── demo │ │ ├── kmeans_color.py │ │ └── tSNE.py │ ├── ex2data1.txt │ ├── pearsonr.ipynb │ ├── sklearn_LR.py │ └── useful.py │ ├── spark │ ├── .DS_Store │ ├── README.md │ ├── learnsparkLDA │ │ ├── .DS_Store │ │ ├── learn_sparkRDD.ipynb │ │ ├── spark_MLlib.ipynb │ │ ├── spark_pairRDD.ipynb │ │ ├── spark_saveload.ipynb │ │ └── spark_uplevel.ipynb │ └── start.py │ └── xgboost │ ├── readme.md │ ├── xgboost.ipynb │ └── xgboost_multi.ipynb ├── CV ├── codes │ ├── IOU.py │ ├── flickr_to_voc.py │ ├── label_smoothing.py │ ├── makeVOCDirs.py │ ├── nms.py │ ├── pascalVOC2csv.py │ ├── show_voc_box.py │ ├── simple_mixup.py │ ├── to_coco_person17.py │ ├── txt2xml.py │ └── updateTXT.py ├── knowledge.md ├── nets │ ├── .DS_Store │ ├── alexnet │ │ ├── .DS_Store │ │ ├── README.md │ │ ├── alexnet.jpg │ │ └── keras_alexnet.py │ ├── lenet5 │ │ ├── .DS_Store │ │ ├── README.md │ │ ├── keras_lenet5.py │ │ └── lenet5.jpg │ └── vgg │ │ ├── .DS_Store │ │ ├── README.md │ │ ├── keras_vgg.py │ │ ├── vgg.jpg │ │ └── vgg16.jpg └── note │ ├── .DS_Store │ ├── DCNN_book_note.md │ ├── chineseocr-ctpn-densenet.md │ ├── cptn.jpg │ ├── ctc.jpg │ ├── denseblock.jpg │ ├── densenet.jpg │ ├── handwrite_ocr_note.md │ ├── nms.jpg │ ├── vgg1.jpg │ ├── vgg2.jpg │ ├── vgg3.jpg │ └── vgg4.jpg ├── DIY ├── .DS_Store ├── Adaboost.ipynb ├── CRF.ipynb ├── DecisionTree.ipynb ├── EM.ipynb ├── HMM.ipynb ├── IOU.py ├── LR.ipynb ├── NN.ipynb ├── NaiveBayes.ipynb ├── PCA.ipynb ├── ROC_AUC.ipynb ├── SVM.ipynb ├── Stacking.py ├── ex1_py_liner.ipynb ├── ex1data1.txt ├── kMeans.ipynb ├── kNN.ipynb ├── lenses.txt ├── perceptron.ipynb ├── tryStacking.ipynb └── yoloF1.py ├── DM ├── knowledge.md └── note │ ├── .DS_Store │ ├── FeatureEngneering.md │ └── img │ └── fe.jpg ├── NLP ├── .DS_Store ├── codes │ 
# Patterns are compiled once and shared by every getwords() call.
_TAG_RE = re.compile(r'<[^>]+>')
# The previous class, [^A-Z^a-z], also listed a literal '^', so a caret was
# treated as part of a word instead of as a separator.
_NON_ALPHA_RE = re.compile(r'[^A-Za-z]+')


def getwordcounts(url):
    """Return ``(feed_title, word_counts)`` for one RSS/Atom feed.

    The feed is fetched and parsed with ``feedparser.parse``.  For every
    entry, the title plus the summary (or the description when there is no
    summary) is tokenised with :func:`getwords`.

    Args:
        url: Location of the feed, passed straight to ``feedparser.parse``.

    Returns:
        A tuple of the feed title and a dict mapping each lower-cased word
        to the number of times it occurs across all entries.
    """
    d = feedparser.parse(url)
    wc = {}

    for e in d.entries:
        if 'summary' in e:
            summary = e.summary
        else:
            summary = e.description

        for word in getwords(e.title + ' ' + summary):
            wc[word] = wc.get(word, 0) + 1
    return d.feed.title, wc


def getwords(html):
    """Strip HTML tags from *html* and return its words in lower case.

    Tags are deleted (replaced by the empty string); the remaining text is
    split on every run of characters that are not ASCII letters.

    Args:
        html: Text that may contain HTML markup.

    Returns:
        List of non-empty, lower-cased words in document order.
    """
    txt = _TAG_RE.sub('', html)
    return [word.lower() for word in _NON_ALPHA_RE.split(txt) if word != '']


def main(feedlist_path='feedlist.txt', out_path='blogdata1.txt'):
    """Build the blog/word count table from a list of feed URLs.

    Reads one feed URL per line from *feedlist_path*, counts words per blog,
    keeps the words used more than once by more than 10% and fewer than 50%
    of the listed feeds, and writes a tab-separated table to *out_path*
    (header row ``Blog`` + words, then one row per blog).

    Args:
        feedlist_path: Text file with one feed URL per line.
        out_path: Destination of the tab-separated table.
    """
    apcount = {}      # word -> number of blogs using it more than once
    wordcounts = {}   # blog title -> {word: count}

    # Strip the line terminator (it used to be passed along with the URL)
    # and ignore blank lines so they do not inflate the feed total.
    with open(feedlist_path) as feeds:
        feedlist = [line.strip() for line in feeds if line.strip()]

    for feedurl in feedlist:
        try:
            title, wc = getwordcounts(feedurl)
        except Exception:
            # Best effort: one broken feed must not abort the whole run,
            # but Ctrl-C / SystemExit are no longer swallowed.
            print('Failed to parse feed %s' % feedurl)
            continue
        wordcounts[title] = wc
        for word, count in wc.items():
            apcount.setdefault(word, 0)
            if count > 1:
                apcount[word] += 1

    # Drop words that are too rare or too common to be informative.
    wordlist = []
    for w, bc in apcount.items():
        frac = float(bc) / len(feedlist)
        if 0.1 < frac < 0.5:
            wordlist.append(w)

    with open(out_path, 'w') as out:
        out.write('Blog')
        for word in wordlist:
            out.write('\t%s' % word)
        out.write('\n')
        for blog, wc in wordcounts.items():
            print(blog)
            out.write(blog)
            for word in wordlist:
                out.write('\t%d' % wc.get(word, 0))
            out.write('\n')


if __name__ == '__main__':
    main()
# Nodes of the social graph; position i owns v[2*i] (x) and v[2*i+1] (y).
people = ['Charlie', 'Augustus', 'Veruca', 'Violet',
          'Mike', 'Joe', 'Willy', 'Miranda']

# Edges of the social graph, as pairs of names from `people`.
links = [('Augustus', 'Willy'),
         ('Mike', 'Joe'),
         ('Miranda', 'Mike'),
         ('Violet', 'Augustus'),
         ('Miranda', 'Willy'),
         ('Charlie', 'Mike'),
         ('Veruca', 'Joe'),
         ('Miranda', 'Augustus'),
         ('Willy', 'Augustus'),
         ('Joe', 'Charlie'),
         ('Veruca', 'Augustus'),
         ('Miranda', 'Joe')]


def crosscount(v):
    """Cost of a layout: number of crossing links plus a crowding penalty.

    Args:
        v: Flat sequence ``[x0, y0, x1, y1, ...]`` holding one coordinate
            pair per entry of ``people``, in the same order.

    Returns:
        The number of link pairs whose segments cross strictly inside both
        segments, plus ``1 - dist/50`` for every pair of people placed
        closer than 50 units to each other.
    """
    # Map each person to an (x, y) tuple.
    loc = {people[i]: (v[i * 2], v[i * 2 + 1]) for i in range(len(people))}
    total = 0

    # Examine every unordered pair of links.
    for i in range(len(links)):
        for j in range(i + 1, len(links)):
            (x1, y1), (x2, y2) = loc[links[i][0]], loc[links[i][1]]
            (x3, y3), (x4, y4) = loc[links[j][0]], loc[links[j][1]]

            # float() matters: with integer coordinates `/` floor-divides
            # under Python 2, so ua/ub could never fall strictly between
            # 0 and 1 and no crossing was ever counted.
            den = float((y4 - y3) * (x2 - x1) - (x4 - x3) * (y2 - y1))

            # A zero denominator means the two lines are parallel.
            if den == 0:
                continue

            # ua and ub are the fractions along each segment at which the
            # two (infinite) lines meet.
            ua = ((x4 - x3) * (y1 - y3) - (y4 - y3) * (x1 - x3)) / den
            ub = ((x2 - x1) * (y1 - y3) - (y2 - y1) * (x1 - x3)) / den

            # Both fractions strictly inside (0, 1): the segments cross.
            if 0 < ua < 1 and 0 < ub < 1:
                total += 1

    # Penalise any two nodes that sit closer than 50 units.
    for i in range(len(people)):
        for j in range(i + 1, len(people)):
            (x1, y1), (x2, y2) = loc[people[i]], loc[people[j]]
            dist = math.sqrt(math.pow(x1 - x2, 2) + math.pow(y1 - y2, 2))
            if dist < 50:
                total += (1.0 - (dist / 50.0))

    return total


def drawnetwork(sol):
    """Render a layout as a 400x400 image and open it in a viewer.

    Args:
        sol: Flat coordinate sequence in the same format that
            :func:`crosscount` accepts.
    """
    # Imported here so the cost function above stays importable on
    # machines without the imaging library.
    from PIL import Image, ImageDraw

    img = Image.new('RGB', (400, 400), (255, 255, 255))
    draw = ImageDraw.Draw(img)

    pos = {people[i]: (sol[i * 2], sol[i * 2 + 1])
           for i in range(len(people))}

    # Links are drawn in red, names in black at each node position.
    for (a, b) in links:
        draw.line((pos[a], pos[b]), fill=(255, 0, 0))

    for n, p in pos.items():
        draw.text(p, n, (0, 0, 0))

    img.show()


# Search domain for the optimisers: every coordinate lies in [10, 370].
domain = [(10, 370)] * (len(people) * 2)
def loadSimpData():
    """Return a tiny 5-point, 2-feature data set and its +1/-1 labels."""
    dataMat = matrix([[1., 2.1],
                      [2., 1.1],
                      [1.3, 1.],
                      [1., 1.],
                      [2., 1.]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return dataMat, classLabels


def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every row by comparing one feature against a threshold.

    Args:
        dataMatrix: 2-D data, one sample per row.
        dimen: Index of the feature (column) to test.
        threshVal: Threshold the feature is compared with.
        threshIneq: ``'lt'`` labels rows with ``feature <= threshVal`` as
            -1; any other value labels rows with ``feature > threshVal``
            as -1.

    Returns:
        ``(m, 1)`` array of +1.0 / -1.0 predictions.
    """
    retArray = ones((shape(dataMatrix)[0], 1))  # start with everything +1
    column = asarray(dataMatrix)[:, dimen].reshape(-1, 1)
    if threshIneq == 'lt':
        retArray[column <= threshVal] = -1.0
    else:
        retArray[column > threshVal] = -1.0
    return retArray


def buildStump(dataArr, classLabels, D, verbose=True):
    """Find the decision stump with the lowest weighted error.

    Every feature is scanned over 12 evenly spaced thresholds (one step
    below its minimum up to its maximum) with both inequality directions.

    Args:
        dataArr: 2-D data, one sample per row.
        classLabels: Sequence of +1/-1 labels, one per row.
        D: ``(m, 1)`` column of sample weights.
        verbose: When true (the default) print every candidate split.

    Returns:
        ``(bestStump, minError, bestClasEst)``: a dict with keys ``'dim'``,
        ``'thresh'`` and ``'ineq'``, the weighted error as a 1x1 matrix,
        and the ``(m, 1)`` predictions of that stump.
    """
    dataMatrix = asmatrix(dataArr)
    labelCol = asarray(asmatrix(classLabels).T)
    weights = asmatrix(D)
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = asmatrix(zeros((m, 1)))
    minError = inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1.0 where the prediction is wrong, 0.0 where it is right.
                errArr = asmatrix((predictedVals != labelCol).astype(float))
                weightedError = weights.T * errArr
                if verbose:
                    print("split:dim %d,thresh %.2f,thresh ineqal: %s,the weighted error is %.3f"
                          % (i, threshVal, inequal, weightedError[0, 0]))
                # Strict '<' keeps the first of several equally good stumps.
                if weightedError[0, 0] < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst


def adaBoostTrainDS(dataArr, classLabels, numIt=40, verbose=True):
    """Train AdaBoost with decision stumps as weak learners.

    Args:
        dataArr: 2-D data, one sample per row.
        classLabels: Sequence of +1/-1 labels, one per row.
        numIt: Maximum number of boosting rounds.
        verbose: When true (the default) print the progress of each round.

    Returns:
        List of stump dicts (``'dim'``, ``'thresh'``, ``'ineq'``,
        ``'alpha'``), one per round actually run.  Training stops early as
        soon as the combined classifier has zero training error.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    labelMat = asmatrix(classLabels).T
    D = asmatrix(ones((m, 1)) / m)          # uniform initial weights
    aggClassEst = asmatrix(zeros((m, 1)))   # running weighted vote
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D, verbose)
        if verbose:
            print("D: %s" % (D.T,))
        err = float(error[0, 0])
        # Clamp the denominator so a perfect stump does not divide by zero.
        alpha = float(0.5 * log((1.0 - err) / (err if err > 1e-16 else 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        if verbose:
            print("classEst: %s" % (classEst.T,))
        # Re-weight: misclassified samples gain weight, correct ones lose it.
        expon = multiply(-1 * alpha * labelMat, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        if verbose:
            print("aggClassEst:  %s" % (aggClassEst,))
        aggErrors = multiply(sign(aggClassEst) != labelMat, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        if verbose:
            print("total error: %s \n" % errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr
trainMat.append(bayes.setOfWords2Vec(myVocabList,postinDoc)) # 把训练样本的每一项文档中的词在词汇表中出现的位置标识成1,然后把所有词向量构成一个矩阵 17 | p0V,p1V,pAb = bayes.trainNB0(trainMat,listClasses) #概率向量 18 | 19 | #print p0V,p1V 20 | ''' 21 | #全部封装到测试函数里面去了 22 | 23 | #print bayes.testingNB() 24 | 25 | #垃圾邮件测试 26 | print bayes.spamTest() 27 | 28 | -------------------------------------------------------------------------------- /Base/books/ML_in_action/DecisionTree/test.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import trees 3 | import treePlotter 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding('utf8') 7 | 8 | #myDat[0][-1]='maybe' 9 | #print trees.calcShannonEnt(myDat) 10 | #print trees.chooseBestFeatureToSplit(myDat) 11 | #print trees.splitDataSet(myDat,0,0) 12 | #print trees.splitDataSet(myDat,0,1) 13 | 14 | myDat,labels = trees.createDataSet() 15 | #print myDat 16 | #print trees.createTree(myDat,labels) 17 | 18 | #treePlotter.createPlot() 19 | 20 | #print treePlotter.retrieveTree(1) 21 | 22 | myTree = treePlotter.retrieveTree(0) 23 | #print treePlotter.getNumLeafs(myTree),treePlotter.getTreeDepth(myTree) 24 | 25 | #myTree['no surfacing'][3]='maybe' 26 | #print myTree 27 | #treePlotter.createPlot(myTree) 28 | 29 | ''' 30 | print labels 31 | print myTree 32 | print trees.classify(myTree,labels,[1,0]) 33 | print trees.classify(myTree,labels,[1,1]) 34 | ''' 35 | 36 | #trees.storeTree(myTree,'classifierStorage.txt') 37 | #print trees.grabTree('classifierStorage.txt') 38 | 39 | #预测隐形眼镜类型 40 | fr=open('lenses.txt') 41 | lenses = [inst.strip().split('\t') for inst in fr.readlines()] 42 | lensesLabels = ['age','prescript','astigmatic','tearRate'] 43 | lensesTree = trees.createTree(lenses,lensesLabels) 44 | print lensesTree 45 | treePlotter.createPlot(lensesTree) 46 | -------------------------------------------------------------------------------- /Base/books/ML_in_action/DecisionTree/treePlotter.py: 
# Box and arrow styles used when annotating tree nodes.
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one boxed node at *centerPt* with an arrow from *parentPt*.

    Draws on ``createPlot.axl``, so :func:`createPlot` must have set up the
    axes first.
    """
    createPlot.axl.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)


def plotMidText(cntrPt, parentPt, txtString):
    """Write *txtString* halfway between a child node and its parent."""
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.axl.text(xMid, yMid, txtString)


def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw *myTree* below *parentPt*.

    Args:
        myTree: Nested dict ``{feature: {value: subtree_or_leaf_label}}``.
        parentPt: ``(x, y)`` of the parent node in axes-fraction units.
        nodeTxt: Label written on the edge leading to this subtree.

    The drawing cursor lives in the function attributes ``plotTree.xOff``,
    ``plotTree.yOff``, ``plotTree.totalW`` and ``plotTree.totalD``, which
    :func:`createPlot` initialises.
    """
    numLeafs = getNumLeafs(myTree)
    firstStr = next(iter(myTree))
    # Centre this decision node above the leaves it spans.
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD  # go one level down
    for key in secondDict:
        child = secondDict[key]
        if isinstance(child, dict):
            plotTree(child, cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(child, (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD  # back up a level


def createPlot(inTree):
    """Open a matplotlib figure and draw the decision tree *inTree*."""
    # Imported here so the tree-measuring helpers below can be used
    # without a plotting backend installed.
    import matplotlib.pyplot as plt

    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.axl = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start half a leaf-width left of the axes, with the root at y = 1.0.
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()


def getNumLeafs(myTree):
    """Return the number of leaves of a nested-dict decision tree.

    Args:
        myTree: ``{feature: {value: subtree_or_leaf_label}}``; any child
            that is not a dict counts as one leaf.
    """
    numLeafs = 0
    # next(iter(...)) fetches the single root key on Python 2 and 3 alike;
    # dict.keys() cannot be indexed on Python 3.
    firstStr = next(iter(myTree))
    secondDict = myTree[firstStr]
    for key in secondDict:
        child = secondDict[key]
        if isinstance(child, dict):
            numLeafs += getNumLeafs(child)
        else:
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):
    """Return the number of decision levels in a nested-dict tree.

    A tree whose children are all leaves has depth 1.
    """
    maxDepth = 0
    firstStr = next(iter(myTree))
    secondDict = myTree[firstStr]
    for key in secondDict:
        child = secondDict[key]
        if isinstance(child, dict):
            thisDepth = 1 + getTreeDepth(child)
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth


def retrieveTree(i):
    """Return one of two hard-coded sample trees (index 0 or 1) for testing."""
    listOfTrees = [
        {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}},
    ]
    return listOfTrees[i]
-------------------------------------------------------------------------------- /Base/books/ML_in_action/Logistic/testSet.txt: -------------------------------------------------------------------------------- 1 | -0.017612 14.053064 0 2 | -1.395634 4.662541 1 3 | -0.752157 6.538620 0 4 | -1.322371 7.152853 0 5 | 0.423363 11.054677 0 6 | 0.406704 7.067335 1 7 | 0.667394 12.741452 0 8 | -2.460150 6.866805 1 9 | 0.569411 9.548755 0 10 | -0.026632 10.427743 0 11 | 0.850433 6.920334 1 12 | 1.347183 13.175500 0 13 | 1.176813 3.167020 1 14 | -1.781871 9.097953 0 15 | -0.566606 5.749003 1 16 | 0.931635 1.589505 1 17 | -0.024205 6.151823 1 18 | -0.036453 2.690988 1 19 | -0.196949 0.444165 1 20 | 1.014459 5.754399 1 21 | 1.985298 3.230619 1 22 | -1.693453 -0.557540 1 23 | -0.576525 11.778922 0 24 | -0.346811 -1.678730 1 25 | -2.124484 2.672471 1 26 | 1.217916 9.597015 0 27 | -0.733928 9.098687 0 28 | -3.642001 -1.618087 1 29 | 0.315985 3.523953 1 30 | 1.416614 9.619232 0 31 | -0.386323 3.989286 1 32 | 0.556921 8.294984 1 33 | 1.224863 11.587360 0 34 | -1.347803 -2.406051 1 35 | 1.196604 4.951851 1 36 | 0.275221 9.543647 0 37 | 0.470575 9.332488 0 38 | -1.889567 9.542662 0 39 | -1.527893 12.150579 0 40 | -1.185247 11.309318 0 41 | -0.445678 3.297303 1 42 | 1.042222 6.105155 1 43 | -0.618787 10.320986 0 44 | 1.152083 0.548467 1 45 | 0.828534 2.676045 1 46 | -1.237728 10.549033 0 47 | -0.683565 -2.166125 1 48 | 0.229456 5.921938 1 49 | -0.959885 11.555336 0 50 | 0.492911 10.993324 0 51 | 0.184992 8.721488 0 52 | -0.355715 10.325976 0 53 | -0.397822 8.058397 0 54 | 0.824839 13.730343 0 55 | 1.507278 5.027866 1 56 | 0.099671 6.835839 1 57 | -0.344008 10.717485 0 58 | 1.785928 7.718645 1 59 | -0.918801 11.560217 0 60 | -0.364009 4.747300 1 61 | -0.841722 4.119083 1 62 | 0.490426 1.960539 1 63 | -0.007194 9.075792 0 64 | 0.356107 12.447863 0 65 | 0.342578 12.281162 0 66 | -0.810823 -1.466018 1 67 | 2.530777 6.476801 1 68 | 1.296683 11.607559 0 69 | 0.475487 12.040035 0 70 | 
def loadDataSet(fileName):
    """Load a tab-separated numeric file into feature rows and targets.

    The number of feature columns is taken from the first non-blank line
    (all columns but the last); the last column of each line is the target.
    Blank lines are skipped.

    Args:
        fileName: Path of the tab-separated text file.

    Returns:
        ``(dataMat, labelMat)``: a list of feature lists and a list of
        float targets.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # A single `with` replaces two open() calls that were never closed.
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            if curLine == ['']:
                continue
            if numFeat is None:
                numFeat = len(curLine) - 1
            dataMat.append([float(curLine[i]) for i in range(numFeat)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


def standRegres(xArr, yArr):
    """Ordinary least squares: solve ``(X^T X) w = X^T y``.

    Args:
        xArr: 2-D sequence of feature rows.
        yArr: 1-D sequence of targets.

    Returns:
        Column matrix of regression weights, or ``None`` (after printing a
        message) when ``X^T X`` has a zero determinant.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:
        print('this matrix is singular,cannot do inverse')
        return None
    return xTx.I * (xMat.T * yMat)


def lwlr(testPoint, xArr, yArr, k=1.0):
    """Locally weighted linear regression prediction for one point.

    Each training row gets the weight ``exp(-|x - testPoint|^2 / (2 k^2))``,
    so smaller *k* makes the fit more local.

    Args:
        testPoint: Feature row to predict for.
        xArr: 2-D sequence of training feature rows.
        yArr: 1-D sequence of training targets.
        k: Kernel width.

    Returns:
        1x1 matrix holding the prediction, or ``None`` (after printing a
        message) when the weighted ``X^T W X`` has a zero determinant.
    """
    xMat = asmatrix(xArr)
    yMat = asmatrix(yArr).T
    m = shape(xMat)[0]
    weights = asmatrix(eye(m))
    for j in range(m):
        diffMat = testPoint - xMat[j, :]
        # [0, 0] extracts the scalar instead of relying on an implicit
        # 1x1-matrix-to-scalar conversion.
        weights[j, j] = exp((diffMat * diffMat.T)[0, 0] / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print('this matrix is singular, cannot do inverse')
        return None
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws


def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Run :func:`lwlr` for every row of *testArr*.

    Args:
        testArr: 2-D sequence of rows to predict for.
        xArr: 2-D sequence of training feature rows.
        yArr: 1-D sequence of training targets.
        k: Kernel width passed to :func:`lwlr`.

    Returns:
        1-D array of predictions; an entry is NaN when :func:`lwlr` could
        not solve the system for that row.
    """
    m = shape(testArr)[0]
    yHat = zeros(m)
    for i in range(m):
        prediction = lwlr(testArr[i], xArr, yArr, k)
        if prediction is None:
            # Singular local system: record "no value" instead of crashing.
            yHat[i] = float('nan')
        else:
            yHat[i] = asarray(prediction).ravel()[0]
    return yHat
#coding:utf-8
# Driver script: train the simplified SMO solver on testSet.txt and print
# the learned offset and the non-zero multipliers.

import svm

dataArr, labelArr = svm.loadDataSet('testSet.txt')

# print(labelArr)  # uncomment to inspect the loaded labels

# NOTE(review): svm.smoSimple is not visible from here; the positional
# arguments are presumably C=0.6, tolerance=0.001, maxIter=40 -- confirm
# against its signature.
b, alphas = svm.smoSimple(dataArr, labelArr, 0.6, 0.001, 40)

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(b)
# Boolean-mask indexing keeps only the entries of alphas greater than zero.
print(alphas[alphas > 0])
#将画布分割成1行1列,图像画在从左到右从上到下的第1块 16 | ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15.0*array(datingLabels),15.0*array(datingLabels)) #scatter生成散点图函数。使用datingDataMat矩阵的第二、三列数据 17 | #datingDataMat[:,1]意思是所有行的第2列(从0开始) 18 | #后面第一个数字参数对应左边两种颜色的点的半径大小,第二个数字试了下没什么变化 19 | plt.show() 20 | """ 21 | 22 | normMat,ranges,minVals = kNN.autoNorm(datingDataMat) 23 | 24 | #print kNN.datingClassTest() 25 | 26 | 27 | #print kNN.classifyPerson() 28 | 29 | #testVector = kNN.img2vector('testDigits/0_13.txt') 30 | #print testVector[0,32:63] 31 | 32 | print kNN.handwritingClassTest() 33 | -------------------------------------------------------------------------------- /Base/books/ML_in_action/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ## 机器学习实战 3 | 4 | 记录随书敲的代码,同时自己添加了详细的注释。 5 | 6 | ### 第一部分 分类 7 | 8 | * [k-近邻算法/kNN](https://github.com/fire717/Machine-Learning/tree/master/ML_in_action/kNN) 9 | * [决策树](https://github.com/fire717/Machine-Learning/tree/master/ML_in_action/DecisionTree) 10 | * [朴素贝叶斯](https://github.com/fire717/Machine-Learning/tree/master/ML_in_action/Bayes) 11 | * [Logistic回归](https://github.com/fire717/Machine-Learning/tree/master/ML_in_action/Logistic) 12 | * [支持向量机SVM](https://github.com/fire717/Machine-Learning/tree/master/ML_in_action/SVM) 13 | * [AdaBoost元算法](https://github.com/fire717/Machine-Learning/tree/master/ML_in_action/AdaBoost) 14 | 15 | ### 第二部分 回归 16 | 17 | 18 | 19 | ### 第三部分 无监督学习 20 | 21 | * [K-均值聚类算法/kMeans](https://github.com/fire717/Machine-Learning/tree/master/ML_in_action/kMeans) 22 | -------------------------------------------------------------------------------- /Base/challenge/DataFountain/writing_classify/pre_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n" 10 | ] 11 | }, 12 | { 13 | 
"cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# 读取数据" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [] 27 | } 28 | ], 29 | "metadata": { 30 | "kernelspec": { 31 | "display_name": "Python 3", 32 | "language": "python", 33 | "name": "python3" 34 | }, 35 | "language_info": { 36 | "codemirror_mode": { 37 | "name": "ipython", 38 | "version": 3 39 | }, 40 | "file_extension": ".py", 41 | "mimetype": "text/x-python", 42 | "name": "python", 43 | "nbconvert_exporter": "python", 44 | "pygments_lexer": "ipython3", 45 | "version": "3.5.4" 46 | } 47 | }, 48 | "nbformat": 4, 49 | "nbformat_minor": 2 50 | } 51 | -------------------------------------------------------------------------------- /Base/challenge/kaggle/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ### 目录 3 | * [房价](./HousePrices) 4 | * [泰坦尼克](./Titanic) 5 | * [五金网站关键词搜索](./searchrelevance.pdf) -------------------------------------------------------------------------------- /Base/challenge/kaggle/searchrelevance.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/challenge/kaggle/searchrelevance.pdf -------------------------------------------------------------------------------- /Base/courses/DL_AndrewNg/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/courses/DL_AndrewNg/.DS_Store -------------------------------------------------------------------------------- /Base/courses/DL_AndrewNg/README.md: -------------------------------------------------------------------------------- 1 | ## Deep Learning Specialization 2 | 3 | ### Course 
1:Neural Networks and Deep Learning 4 | * [Logistic Regression with a Neural Network mindset](./course1/LRwithNN.ipynb) 5 | * [Planar data classification with a hidden layer](./course1/week3NN.ipynb) 6 | * [Building your Deep Neural Network: Step by Step](./course1/BuildingDNNv3.ipynb) 7 | * [Deep Neural Network for Image Classification: Application](./course1/DNNApplicationv3.ipynb) 8 | 9 | ### Course 2:Improving Deep Neural Networks: Hyperparameter tuning, Regularization and Optimization 10 | * [Initialization](./course2/Initialization.ipynb) 11 | * [Regularization](./course2/Regularization.ipynb) 12 | * [Gradient Checking](./course2/GradientChecking.ipynb) 13 | * [Optimization](./course2/OptimizationMethods.ipynb) 14 | * [Tensorflow Tutorial](./course2/TensorflowTutorial.ipynb) 15 | 16 | ### Course 3:Structuring Machine Learning Projects 17 | * No homework 18 | 19 | ### Course 4:Convolutional Neural Networks 20 | * [Convolutional Model: step by step](./course4/ConvolutionmodelStepbyStepv2.ipynb) 21 | * [Convolutional Model: application (tensorflow)](./course4/ConvolutionmodelApplicationv1.ipynb) 22 | * [Keras Tutorial - The Happy House](./course4/KerasTutorialHappyHousev2.ipynb) 23 | * [Residual Networks](./course4/ResidualNetworksv2.ipynb) 24 | * [Car detection with YOLOv2](./course4/AutonomousdrivingapplicationCardetectionv3.ipynb) 25 | * [Art generation with Neural Style Transfer](./course4/ArtGenerationwithNeuralStyleTransfer2.ipynb) 26 | * [Face Recognition for the Happy House](./course4/FaceRecognitionfortheHappyHousev3.ipynb) 27 | 28 | ### Course 5:Sequence Models 29 | * [rnn_utils](./course5/rnn_utils.py) 30 | * [Building a recurrent neural network - step by step](./course5/BuildingaRecurrentNeuralNetworkStepbyStepv3.ipynb) 31 | * [Dinosaur Island - Character-Level Language Modeling](./course5/DinosaurusIslandCharacterlevellanguagemodelfinalv3.ipynb) 32 | * [Jazz improvisation with LSTM](./course5/ImproviseaJazzSolowithanLSTMNetworkv3.ipynb) 33 | * 
[Operations on word vectors - Debiasing](./course5/Operationsonwordvectorsv2.ipynb) 34 | * [Emojify](./course5/Emojifyv2.ipynb) 35 | * [Neural Machine Translation with Attention](./course5/Neuralmachinetranslationwithattentionv3.ipynb) 36 | * [Trigger word detection](./course5/Triggerworddetectionv1.ipynb) 37 | 38 | 39 | 其它笔记资源:https://github.com/fengdu78/deeplearning_ai_books 40 | -------------------------------------------------------------------------------- /Base/courses/DL_AndrewNg/course4/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/courses/DL_AndrewNg/course4/.DS_Store -------------------------------------------------------------------------------- /Base/courses/DL_AndrewNg/course5/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/courses/DL_AndrewNg/course5/.DS_Store -------------------------------------------------------------------------------- /Base/courses/coursera_ML/ex1_liner/computeCost.m: -------------------------------------------------------------------------------- 1 | function J = computeCost(X, y, theta) 2 | %COMPUTECOST Compute cost for linear regression 3 | % J = COMPUTECOST(X, y, theta) computes the cost of using theta as the 4 | % parameter for linear regression to fit the data points in X and y 5 | 6 | % Initialize some useful values 7 | m = length(y); % number of training examples 8 | 9 | % You need to return the following variables correctly 10 | J = 0; 11 | 12 | % ====================== YOUR CODE HERE ====================== 13 | % Instructions: Compute the cost of a particular choice of theta 14 | % You should set J to the cost. 
function [theta, J_history] = gradientDescent(X, y, theta, alpha, num_iters)
%GRADIENTDESCENT Performs gradient descent to learn theta
%   theta = GRADIENTDESCENT(X, y, theta, alpha, num_iters) updates theta by
%   taking num_iters gradient steps with learning rate alpha
%
%   [theta, J_history] = GRADIENTDESCENT(...) additionally returns a
%   num_iters-by-1 vector holding the cost (as reported by computeCost)
%   after every step.
%
%   The update is fully vectorized, so it works for any number of
%   parameters: theta must be a column vector with one entry per column
%   of X.

% Initialize some useful values
m = length(y); % number of training examples
J_history = zeros(num_iters, 1);

for iter = 1:num_iters

    % Simultaneous update of every parameter: the whole right-hand side is
    % evaluated with the old theta before the assignment takes place.
    % X' * (X*theta - y) stacks the partial derivatives for all columns of
    % X, replacing the previous per-parameter updates that were hard-coded
    % for exactly two parameters. Statements end in ';' so nothing is
    % echoed to the console on each iteration.
    theta = theta - (alpha / m) * (X' * (X * theta - y));

    % Save the cost J in every iteration
    J_history(iter) = computeCost(X, y, theta);

end

end
function [J, grad] = costFunctionReg(theta, X, y, lambda)
%COSTFUNCTIONREG Compute cost and gradient for logistic regression with regularization
%   J = COSTFUNCTIONREG(theta, X, y, lambda) computes the cost of using
%   theta as the parameter for regularized logistic regression and the
%   gradient of the cost w.r.t. to the parameters.
%
%   theta must be a column vector with one entry per column of X. The
%   first parameter, theta(1), is not regularized. grad has the same size
%   as theta.

% Initialize some useful values
m = length(y); % number of training examples

% Hypothesis for every example at once (one sigmoid evaluation instead of
% one per (parameter, example) pair).
h_thetax = sigmoid(X * theta);

% Copy of theta with the first entry zeroed, so that theta(1) drops out of
% both the penalty term and its gradient.
reg_theta = theta;
reg_theta(1) = 0;

% Cross-entropy cost plus the L2 penalty lambda/(2m) * sum(theta(2:end).^2).
J = (-(y' * log(h_thetax)) - (1 - y)' * log(1 - h_thetax)) / m ...
    + (lambda / (2 * m)) * (reg_theta' * reg_theta);

% Gradient of the cost; no local variable named "sum" is used any more, so
% the builtin sum() is not shadowed inside this function.
grad = (X' * (h_thetax - y)) / m + (lambda / m) * reg_theta;

end
function p = predict(theta, X)
%PREDICT Predict whether the label is 0 or 1 using learned logistic
%regression parameters theta
%   p = PREDICT(theta, X) computes the predictions for X using a
%   threshold at 0.5 (i.e., if sigmoid(theta'*x) >= 0.5, predict 1)
%
%   p is an m-by-1 vector of 0's and 1's, one entry per row of X.

% sigmoid(X*theta) is itself a column vector with one probability per
% example, so it is evaluated once and thresholded element-wise. The old
% loop re-evaluated the whole vector for every example and indexed the
% call result directly, sigmoid(...)(i), which Octave accepts but MATLAB
% rejects.
p = double(sigmoid(X * theta) >= 0.5);

end
function [J, grad] = lrCostFunction(theta, X, y, lambda)
%LRCOSTFUNCTION Compute cost and gradient for logistic regression with
%regularization
%   J = LRCOSTFUNCTION(theta, X, y, lambda) computes the cost of using
%   theta as the parameter for regularized logistic regression and the
%   gradient of the cost w.r.t. to the parameters.
%
%   theta must be a column vector with one entry per column of X. The
%   first parameter, theta(1), is not regularized. grad is returned as a
%   column vector.

% Initialize some useful values
m = length(y); % number of training examples

% Each row of h_thetax is the prediction for the corresponding example.
h_thetax = sigmoid(X * theta);

% Copy of theta with the first entry zeroed: nothing is added for the
% first parameter in either the penalty or its gradient.
temp = theta;
temp(1) = 0;

% Note on the cost: y' * log(h) is a matrix product that already sums over
% the examples; the element-wise form sum(y .* log(h)) is equivalent.
% The penalty uses temp' * temp rather than theta'*theta - theta(1)^2,
% which avoids subtracting two nearly equal numbers when theta(1) is large.
% Statements end in ';' so the optimizer does not echo J and grad on every
% evaluation.
J = -(y' * log(h_thetax) + (1 - y)' * log(1 - h_thetax)) / m ...
    + (lambda / (2 * m)) * (temp' * temp);

grad = (X' * (h_thetax - y)) / m + (lambda / m) * temp;

% grad(:) reshapes the gradient into a single column regardless of the
% shape it had.
grad = grad(:);

end
36 | % 37 | % Example Code for fmincg: 38 | % 39 | % % Set Initial theta 40 | % initial_theta = zeros(n + 1, 1); 41 | % 42 | % % Set options for fminunc 43 | % options = optimset('GradObj', 'on', 'MaxIter', 50); 44 | % 45 | % % Run fmincg to obtain the optimal theta 46 | % % This function will return theta and the cost 47 | % [theta] = ... 48 | % fmincg (@(t)(lrCostFunction(t, X, (y == c), lambda)), ... 49 | % initial_theta, options); 50 | % 51 | 52 | for c = 1:num_labels 53 | initial_theta = zeros(n + 1, 1); 54 | options = optimset('GradObj', 'on', 'MaxIter', 50); 55 | [theta] = fmincg (@(t)(lrCostFunction(t, X, (y == c), lambda)), ... 56 | initial_theta, options); 57 | all_theta(c,:) = theta(:); 58 | end 59 | 60 | 61 | 62 | 63 | 64 | 65 | % ========================================================================= 66 | 67 | 68 | end 69 | -------------------------------------------------------------------------------- /Base/courses/coursera_ML/ex3_nn/predict.m: -------------------------------------------------------------------------------- 1 | function p = predict(Theta1, Theta2, X) 2 | %PREDICT Predict the label of an input given a trained neural network 3 | % p = PREDICT(Theta1, Theta2, X) outputs the predicted label of X given the 4 | % trained weights of a neural network (Theta1, Theta2) 5 | 6 | % Useful values 7 | m = size(X, 1); 8 | num_labels = size(Theta2, 1); 9 | 10 | % You need to return the following variables correctly 11 | p = zeros(size(X, 1), 1); 12 | 13 | % ====================== YOUR CODE HERE ====================== 14 | % Instructions: Complete the following code to make predictions using 15 | % your learned neural network. You should set p to a 16 | % vector containing labels between 1 to num_labels. 17 | % 18 | % Hint: The max function might come in useful. In particular, the max 19 | % function can also return the index of the max element, for more 20 | % information see 'help max'. 
function p = predictOneVsAll(all_theta, X)
%PREDICT Predict the label for a trained one-vs-all classifier. The labels
%are in the range 1..K, where K = size(all_theta, 1).
%  p = PREDICTONEVSALL(all_theta, X) will return a vector of predictions
%  for each example in the matrix X. Note that X contains the examples in
%  rows. all_theta is a matrix where the i-th row is a trained logistic
%  regression theta vector for the i-th class. p is an m-by-1 column
%  vector of values from 1..K (e.g., p = [1; 3; 1; 2] predicts classes
%  1, 3, 1, 2 for 4 examples)

m = size(X, 1);

% Add ones to the X data matrix
X = [ones(m, 1) X];

% scores(i, k) is the linear score of example i under classifier k. The
% sigmoid is monotonic, so taking the arg-max of the raw scores picks the
% same class as taking the arg-max of the probabilities.
scores = X * all_theta';

% max(A, [], 2) works along each row and its second output is the index of
% the maximum, i.e. the predicted label. Working row-wise makes p an
% m-by-1 column; the previous max(all_theta * X') returned a 1-by-m row
% vector, which does not line up with a column vector of true labels.
[M, p] = max(scores, [], 2); % [max value, index]

end
-------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | .env/* 4 | -------------------------------------------------------------------------------- /Base/courses/cs231n/assignment1/README.md: -------------------------------------------------------------------------------- 1 | Details about this assignment can be found [on the course webpage](http://cs231n.github.io/), under Assignment #1 of Spring 2017. 2 | -------------------------------------------------------------------------------- /Base/courses/cs231n/assignment1/collectSubmission.sh: -------------------------------------------------------------------------------- 1 | rm -f assignment1.zip 2 | zip -r assignment1.zip . -x "*.git*" "*cs231n/datasets*" "*.ipynb_checkpoints*" "*README.md" "*collectSubmission.sh" "*requirements.txt" ".env/*" 3 | -------------------------------------------------------------------------------- /Base/courses/cs231n/assignment1/cs231n/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/courses/cs231n/assignment1/cs231n/__init__.py -------------------------------------------------------------------------------- /Base/courses/cs231n/assignment1/cs231n/classifiers/__init__.py: -------------------------------------------------------------------------------- 1 | from cs231n.classifiers.k_nearest_neighbor import * 2 | from cs231n.classifiers.linear_classifier import * 3 | -------------------------------------------------------------------------------- /Base/courses/cs231n/assignment1/cs231n/datasets/.gitignore: -------------------------------------------------------------------------------- 1 | cifar-10-batches-py/* 2 | tiny-imagenet-100-A* 3 | tiny-imagenet-100-B* 4 | tiny-100-A-pretrained/* 5 | -------------------------------------------------------------------------------- 
def visualize_grid(Xs, ubound=255.0, padding=1):
    """
    Reshape a 4D tensor of image data to a grid for easy visualization.

    Every image is min-max scaled on its own to the range [0, ubound] and
    written row by row into a square layout of ceil(sqrt(N)) cells per side.
    Cells are separated by `padding` zero-valued pixels, and cells without an
    image stay zero.

    Inputs:
    - Xs: Data of shape (N, H, W, C)
    - ubound: Output grid will have values scaled to the range [0, ubound]
    - padding: The number of blank pixels between elements of the grid

    Returns:
    - grid: float array of shape (grid_height, grid_width, C)
    """
    (N, H, W, C) = Xs.shape
    grid_size = int(ceil(sqrt(N)))
    # max(..., 0) keeps the dimensions non-negative when N == 0.
    gaps = max(grid_size - 1, 0)
    grid_height = H * grid_size + padding * gaps
    grid_width = W * grid_size + padding * gaps
    grid = np.zeros((grid_height, grid_width, C))
    for idx in range(N):
        row, col = divmod(idx, grid_size)
        y0 = row * (H + padding)
        x0 = col * (W + padding)
        img = Xs[idx]
        low, high = np.min(img), np.max(img)
        span = high - low
        # A constant image has no range to stretch; dividing by zero would
        # fill the cell with NaN, so leave that cell at its zero default.
        if span > 0:
            grid[y0:y0 + H, x0:x0 + W] = ubound * (img - low) / span
    return grid
def vis_nn(rows):
    """
    Visualize an array of arrays of images as one normalized grid.

    Inputs:
    - rows: sequence of N sequences, each holding D images of identical
      shape (H, W, C); the shape and dtype are taken from rows[0][0].

    Returns:
    - G: array of shape (N*H+N, D*W+D, C) rescaled to [0, 1]. Each cell is
      followed by one spacer row/column that keeps the initial fill value 1
      before normalization.
    """
    N = len(rows)
    D = len(rows[0])
    first = rows[0][0]
    H, W, C = first.shape
    G = np.ones((N * H + N, D * W + D, C), first.dtype)
    for y, row in enumerate(rows):
        top = y * H + y
        for x in range(D):
            left = x * W + x
            G[top:top + H, left:left + W, :] = row[x]
    # normalize to [0,1]
    maxg = G.max()
    ming = G.min()
    span = maxg - ming
    if span == 0:
        # Every pixel (including the spacers) has the same value; dividing
        # by the zero range would produce NaN, so return an all-zero grid.
        return np.zeros(G.shape)
    return (G - ming) / span
def train_step(X):
    """Forward pass of a 3-layer network with inverted dropout (illustrative).

    Relies on module-level names that this snippet does not define itself:
    ``np``, the weights ``W1``/``W2``/``W3``, the biases ``b1``/``b2``/``b3``
    and the keep probability ``p``. The result ``out`` is computed but not
    returned; the backward pass and parameter update are omitted.
    """
    # Forward pass of the 3-layer neural network
    H1 = np.maximum(0, np.dot(W1, X) + b1)
    U1 = (np.random.rand(*H1.shape) < p) / p # first dropout mask. Note the /p!
    H1 *= U1 # drop!
    H2 = np.maximum(0, np.dot(W2, H1) + b2)
    U2 = (np.random.rand(*H2.shape) < p) / p # second dropout mask. Note the /p!
    H2 *= U2 # drop!
    out = np.dot(W3, H2) + b3

    # Backward pass: compute gradients... (omitted)
    # Perform parameter update... (omitted)
21 | mkdir include/caffe/proto 22 | mv src/caffe/proto/caffe.pb.h include/caffe/proto 23 | ``` 24 | 25 | #### 4.libprotobuf.so.19: cannot open shared object file: No such file or directory 26 | ``` 27 | sudo find / -name libprotobuf.so.19 28 | 发现确实存在libprotobuf.so.19(备注libprotobuf.so.19是一个软链接文件) 29 | 解决办法: 30 | sudo cp xx/xx/libprotobuf.so.19.0.0 /usr/local/lib/ 31 | sudo ln -s /usr/local/lib/libprotobuf.so.19.0.0 /usr/local/lib/libprotobuf.so.19 32 | 33 | export LD_LIBRARY_PATH=/usr/local/lib 34 | ``` 35 | 36 | #### 5.ImportError: libopencv_core.so.3.4: cannot open shared object file: No such file or directory 37 | ``` 38 | sudo find / -name "libopencv_core.so.3.4*" 39 | Then got the result: /usr/local/lib/libopencv_core.so.3.2. 40 | Create a file called /etc/ld.so.conf.d/opencv.conf  41 | write to it the path to the folder where the binary is stored. 42 | For example, I wrote /usr/local/lib/ to my opencv.conf file. 43 | Run the command line as follows. 44 | sudo ldconfig -v 45 | ``` 46 | 47 | #### 6.ImportError: 'No module named skimage.io' 48 | ``` 49 | pip install scikit-image 50 | ``` 51 | 52 | #### 7.TypeError: __new__() got an unexpected keyword argument 'serialized_options' 53 | ``` 54 | pip install -U protobuf 55 | ``` 56 | 57 | 58 | 最后需要加入环境变量export PYTHONPATH=~/caffe-ssd/python:$PYTHONPATH 59 | -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/3.jpg -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/dog_bike_car.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/dog_bike_car.jpg -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/img/two_faces_300.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/img/two_faces_300.jpg -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/res.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/res.jpg -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/res222.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/res222.jpg -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/test_img.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/test_img.jpg -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/test_img2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/test_img2.jpg -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/test_img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/test_img3.jpg -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/two_faces_300.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/two_faces_300.jpg -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/yufacedetectnet-open-v1-detection_output.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/yufacedetectnet-open-v1-detection_output.caffemodel -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/yufacedetectnet-open-v1.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/yufacedetectnet-open-v1.caffemodel -------------------------------------------------------------------------------- 
/Base/frameworks/caffe/project/caffe_ssd_write_layer/yufacedetectnet-open-v1_my.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/yufacedetectnet-open-v1_my.caffemodel -------------------------------------------------------------------------------- /Base/frameworks/caffe/project/caffe_ssd_write_layer/yufacedetectnet-open-v1_new.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/caffe/project/caffe_ssd_write_layer/yufacedetectnet-open-v1_new.caffemodel -------------------------------------------------------------------------------- /Base/frameworks/caffe/readme.md: -------------------------------------------------------------------------------- 1 | # Caffe 2 | 3 | 4 | ### 文档资料 5 | * [ssd版caffe](https://github.com/weiliu89/caffe/tree/ssd) 6 | * [编译caffe源码](./docs/ubuntu18_anaconda3_py27_cpu_COMPILE_CAFFE.md) 7 | 8 | 9 | ### 代码片段 10 | * [SSD predict](./code/test_ssd.py) 11 | * [SSD detection_output层](./code/ssd_detection_output_layer.py) 12 | 13 | 14 | ### 项目 15 | * [手写SSD后面几层](./project/caffe_ssd_write_layer/) 16 | -------------------------------------------------------------------------------- /Base/frameworks/keras/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/keras/.DS_Store -------------------------------------------------------------------------------- /Base/frameworks/keras/baseline/main.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | # @fire 3 | import cv2 4 | import os,sys 5 | import numpy as np 
def getAllName(file_dir):
    """Collect the paths of all ``.jpg`` / ``.png`` files below *file_dir*.

    The directory tree is walked recursively with ``os.walk``. The extension
    comparison is case-sensitive, so e.g. ``.JPG`` files are not included.

    Args:
        file_dir: root directory to search.

    Returns:
        List of paths (each directory path joined with the file name), in
        the order ``os.walk`` yields them.
    """
    wanted = ('.jpg', '.png')
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
        if os.path.splitext(name)[1] in wanted
    ]
self.model.add(Activation('relu')) 16 | self.model.add(MaxPooling2D(pool_size=(2, 2))) 17 | 18 | self.model.add(Conv2D(32, (3, 3))) 19 | self.model.add(Activation('relu')) 20 | self.model.add(MaxPooling2D(pool_size=(2, 2))) 21 | 22 | self.model.add(Conv2D(64, (3, 3))) 23 | self.model.add(Activation('relu')) 24 | self.model.add(MaxPooling2D(pool_size=(2, 2))) 25 | 26 | self.model.add(Conv2D(64, (3, 3))) 27 | self.model.add(Activation('relu')) 28 | self.model.add(MaxPooling2D(pool_size=(2, 2))) 29 | 30 | self.model.add(Flatten()) 31 | self.model.add(Dense(64)) 32 | self.model.add(Activation('relu')) 33 | self.model.add(Dropout(0.85)) 34 | self.model.add(Dense(2)) 35 | self.model.add(Activation('sigmoid')) 36 | 37 | 38 | def train(self, dataset): 39 | batch_size = dataset.batch_size 40 | nb_epoch = dataset.nb_epoch 41 | self.model.compile(loss='binary_crossentropy', 42 | optimizer='adam', 43 | metrics=['accuracy']) 44 | self.model.fit_generator(dataset.train_data_generate(), 45 | steps_per_epoch=dataset.total_train // batch_size, 46 | epochs=nb_epoch, 47 | validation_data=dataset.val_data_generate(), 48 | validation_steps=dataset.total_val//batch_size) 49 | 50 | 51 | def save(self, file_path="model.h5"): 52 | print('Model Saved.') 53 | self.model.save_weights(file_path) 54 | 55 | def load(self, file_path="model.h5"): 56 | print('Model Loaded.') 57 | self.model.load_weights(file_path) 58 | 59 | def predict(self, image): 60 | # 预测样本分类 61 | img = image.resize((1, IMAGE_SIZE, IMAGE_SIZE, 3)) 62 | img = image.astype('float32') 63 | img /= 255 64 | 65 | #归一化 66 | result = self.model.predict(img) 67 | print(result) 68 | # 概率 69 | result = self.model.predict_classes(img) 70 | print(result) 71 | # 0/1 72 | 73 | return result[0] 74 | 75 | def evaluate(self, dataset): 76 | # 测试样本准确率 77 | score = self.model.evaluate_generator(dataset.valid,steps=2) 78 | print("样本准确率%s: %.2f%%" % (self.model.metrics_names[1], score[1] * 100)) 79 | 
class myModel(object):
    """Small convolutional classifier with two sigmoid outputs.

    Wraps a Keras ``Sequential`` model: four Conv2D/ReLU/MaxPooling stages
    (32, 32, 64, 64 filters), then Flatten, Dense(64), ReLU, Dropout(0.85),
    Dense(2) and a sigmoid activation.
    """

    def __init__(self, input_shape=(100, 100, 3)):
        """Build the network.

        Args:
            input_shape: shape of one input sample; defaults to the
                previously hard-coded ``(100, 100, 3)``.
        """
        self.model = Sequential()
        for index, filters in enumerate((32, 32, 64, 64)):
            if index == 0:
                # Only the first layer declares the input shape.
                self.model.add(Conv2D(filters, (3, 3), input_shape=input_shape))
            else:
                self.model.add(Conv2D(filters, (3, 3)))
            self.model.add(Activation('relu'))
            self.model.add(MaxPooling2D(pool_size=(2, 2)))

        self.model.add(Flatten())
        self.model.add(Dense(64))
        self.model.add(Activation('relu'))
        self.model.add(Dropout(0.85))
        self.model.add(Dense(2))
        self.model.add(Activation('sigmoid'))

    def train(self, dataset):
        """Compile the model and fit it from the dataset's generators.

        Args:
            dataset: object providing ``batch_size``, ``nb_epoch``,
                ``total_train``, ``total_val`` and the generator factories
                ``train_data_generate()`` / ``val_data_generate()``.
        """
        batch_size = dataset.batch_size
        nb_epoch = dataset.nb_epoch
        self.model.compile(loss='binary_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])
        self.model.fit_generator(dataset.train_data_generate(),
                                 steps_per_epoch=dataset.total_train // batch_size,
                                 epochs=nb_epoch,
                                 validation_data=dataset.val_data_generate(),
                                 validation_steps=dataset.total_val // batch_size)

    def save(self, file_path="model.h5"):
        """Write the model weights to *file_path*."""
        print('Model Saved.')
        self.model.save_weights(file_path)

    def load(self, file_path="model.h5"):
        """Load model weights from *file_path*."""
        print('Model Loaded.')
        self.model.load_weights(file_path)

    def predict(self, image):
        """Classify a single image and return its predicted class index.

        The previous version referenced an undefined ``IMAGE_SIZE`` and used
        the in-place ``ndarray.resize`` (which returns ``None`` and alters
        the caller's array). The batch shape is now derived from the model's
        own input shape and the input is copied before scaling.

        Args:
            image: array-like whose element count matches one model input
                (e.g. an ``(H, W, C)`` array with values in 0..255).

        Returns:
            Index of the highest-scoring output unit for the image.

        Raises:
            ValueError: if ``image`` cannot be reshaped to one model input.
        """
        # (1, H, W, C): a batch of one sample in the model's input layout.
        batch_shape = (1,) + tuple(self.model.input_shape[1:])
        # astype() copies, so the caller's array is never modified.
        img = np.asarray(image).astype('float32').reshape(batch_shape)
        # Scale pixel values to [0, 1].
        img /= 255

        # Per-class scores.
        probs = self.model.predict(img)
        print(probs)
        # Class index per sample, taken from the scores computed above
        # instead of running a second forward pass.
        result = np.argmax(probs, axis=-1)
        print(result)

        return result[0]

    def evaluate(self, dataset):
        """Print the accuracy metric measured over two generator steps.

        NOTE(review): this reads ``dataset.valid`` while ``train`` uses
        ``dataset.val_data_generate()`` -- confirm the dataset class really
        exposes a ``valid`` generator.
        """
        score = self.model.evaluate_generator(dataset.valid, steps=2)
        print("样本准确率%s: %.2f%%" % (self.model.metrics_names[1], score[1] * 100))
def focal_loss(target, output, gamma=2):
    """Focal loss for classification, without the alpha class-balance term.

    ``output`` is renormalised so it sums to 1 over the last axis and is
    clipped to ``[eps, 1 - eps]``. Each term ``target * log(p)`` is then
    weighted by ``(1 - p) ** gamma`` and the negated sum over the last axis
    is returned.

    :param target: ground-truth tensor, multiplied element-wise with the
        log-probabilities (presumably one-hot labels -- confirm with caller)
    :param output: predicted per-class scores
    :param gamma: exponent of the ``(1 - p)`` weighting factor
    :return: loss reduced over the last axis
    """
    eps = K.epsilon()
    probs = output / K.sum(output, axis=-1, keepdims=True)
    probs = K.clip(probs, eps, 1. - eps)
    weight = K.pow(1. - probs, gamma)
    return -K.sum(weight * target * K.log(probs), axis=-1)
def train(cfg):
    """Load a saved Keras ``.h5`` model and write its session as a TF checkpoint.

    Despite its name this function does no training. It loads
    ``save/v2/e_06_0.20_1.00.h5``, prints the op names of the model's input
    and output tensors (needed later as node names when freezing the graph)
    and saves the Keras backend session to ``save/ckpt/keras_model.ckpt``.

    Args:
        cfg: configuration dict; only ``cfg['save_dir']`` is read, and that
            directory is created if missing.

    The checkpoint can afterwards be frozen with, for example::

        python freeze_graph.py --input_meta_graph=./ckpt/keras_model.ckpt.meta --input_checkpoint=./ckpt/keras_model.ckpt --output_graph=./ckpt/keras_model.pb --output_node_names="dense/Softmax" --input_binary=true
    """
    from keras import backend as K
    import tensorflow as tf

    save_dir = cfg['save_dir']
    # makedirs (not mkdir) so that a nested save_dir works as well.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    model_path = "save/v2"
    loaded_model = load_model(os.path.join(model_path, 'e_06_0.20_1.00.h5'))
    print(loaded_model.input.op.name)
    print(loaded_model.output.op.name)

    ckpt_path = 'save/ckpt/keras_model.ckpt'
    # The checkpoint's parent directory was never created before; create it
    # so that saving does not fail on a fresh checkout.
    ckpt_dir = os.path.dirname(ckpt_path)
    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    saver = tf.train.Saver()
    saver.save(K.get_session(), ckpt_path)
def h5_to_pb(h5_model, output_dir, model_name, out_prefix="output_", log_tensorboard=True):
    """Freeze a loaded Keras model into a TensorFlow ``.pb`` graph file.

    Args:
        h5_model: A loaded Keras model object (its ``outputs`` attribute is
            read), not a path to an ``.h5`` file.
        output_dir: Directory the ``.pb`` file is written to; created
            (including parents) when missing.
        model_name: File name of the ``.pb`` file inside ``output_dir``.
        out_prefix: Prefix for the identity nodes added on top of each model
            output. The nodes are named ``<out_prefix>1``, ``<out_prefix>2``,
            ... and are the output node names of the frozen graph.
        log_tensorboard: Currently unused; the TensorBoard export that it
            used to control is disabled.

    Returns:
        None. The frozen graph is written to ``output_dir/model_name``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Give every model output a stable, predictable node name.
    out_nodes = []
    for position, output_tensor in enumerate(h5_model.outputs, start=1):
        node_name = out_prefix + str(position)
        out_nodes.append(node_name)
        tf.identity(output_tensor, node_name)

    sess = backend.get_session()

    from tensorflow.python.framework import graph_util, graph_io

    # Fold the variables into constants and write the binary .pb file.
    init_graph = sess.graph.as_graph_def()
    main_graph = graph_util.convert_variables_to_constants(sess, init_graph, out_nodes)
    graph_io.write_graph(main_graph, output_dir, name=model_name, as_text=False)
#coding:utf-8
# Convert a Keras .h5 model into a TFLite flatbuffer (model.tflite).
import cv2
import os,sys
import numpy as np
from PIL import Image
import random

from keras import backend as K
import tensorflow as tf

from keras.models import load_model,save_model

# Load without compiling, so no optimizer/loss/metric objects are needed.
my_model = load_model('model_all.h5', compile=False)
#my_model.summary()

# Re-save the uncompiled model; the converter below reads this copy.
# NOTE(review): presumably done so the converter does not need the custom
# training objects of the original file - confirm.
my_model.save('model_tmp.h5')

# Alternative route (disabled): save a checkpoint with tf.train.Saver and
# freeze it with freeze_graph.py, using "activation_6/Sigmoid" as output node.
converter = tf.lite.TocoConverter.from_keras_model_file("model_tmp.h5")
# Post-training quantization is switched off, so despite the variable name
# the converted model is not quantized.
#converter.post_training_quantize = True
tflite_quantized_model = converter.convert()

# Use a context manager so the file is flushed and closed deterministically.
with open("model.tflite", "wb") as tflite_file:
    tflite_file.write(tflite_quantized_model)
4 | 5 | 6 | opt = Adam(lr=float(0.001)) 7 | for layer in model.layers[:-8]: 8 | layer.trainable = False 9 | print(model.summary()) 10 | 11 | model.compile(loss='binary_crossentropy', 12 | optimizer=opt, 13 | metrics=[binary_accuracy])#fmeasure 14 | 15 | model.fit_generator(myGenerator(train_generator,cate_names_final,pre_to_label), 16 | validation_data=myGenerator(val_generator,cate_names_final,pre_to_label), 17 | steps_per_epoch=count_train // batch_size, 18 | validation_steps=count_val // batch_size, 19 | epochs=6, 20 | class_weight='auto', 21 | callbacks=[reduce_lr]) 22 | 23 | 24 | 25 | for layer in model.layers[:-8]: 26 | layer.trainable = True 27 | print(model.summary()) 28 | 29 | opt = Adam(lr=float(0.0001)) 30 | model.compile(loss='binary_crossentropy', 31 | optimizer=opt, 32 | metrics=[binary_accuracy])#fmeasure 33 | 34 | model.fit_generator(myGenerator(train_generator,cate_names_final,pre_to_label), 35 | validation_data=myGenerator(val_generator,cate_names_final,pre_to_label), 36 | steps_per_epoch=count_train // batch_size, 37 | validation_steps=count_val // batch_size, 38 | epochs=cfg['epochs'], 39 | class_weight='auto', 40 | callbacks=[earlystop,checkpoint,reduce_lr]) 41 | -------------------------------------------------------------------------------- /Base/frameworks/keras/demo/multi_output_class_weight.py: -------------------------------------------------------------------------------- 1 | from sklearn.utils import class_weight 2 | 3 | 4 | class_weights = class_weight.compute_class_weight('balanced', 5 | np.unique(label_list), 6 | label_list) 7 | 8 | 9 | 10 | model.fit_generator(..., 11 | class_weight={'outputs':class_weights}, 12 | ) 13 | #'outputs' is the output (which u want to balance) layer name 14 | -------------------------------------------------------------------------------- /Base/frameworks/keras/demo/show_keras_data.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | #import 
import os

from keras.preprocessing.image import ImageDataGenerator

# Augmentation settings of the generator whose output we want to inspect.
datagen = ImageDataGenerator(
    rescale=1. / 255,
    horizontal_flip=True,
    channel_shift_range=20)

ptrain = "/home/AlgorithmicGroup/yw/workshop/antiface/data/v3/val"
SAVE_PATH = "images/gen/"
NUM_IMAGES = 100  # number of augmented samples to dump

# Create the dump directory up front so saving cannot fail on a missing path.
os.makedirs(SAVE_PATH, exist_ok=True)

gen_data = datagen.flow_from_directory(
    ptrain,
    target_size=(224, 224),
    batch_size=1,
    class_mode='categorical',
    shuffle=True,
    save_to_dir=SAVE_PATH,
    save_prefix='gen')

# Every batch drawn (batch_size=1) writes one augmented image to SAVE_PATH.
for _ in range(NUM_IMAGES):
    next(gen_data)
# Load the TFLite model and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path="model.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

print(input_details)
print(output_details)

img_path = "D:/Data/clothes_style/data/TestSet/img_0.jpg"
img = cv2.imread(img_path)
if img is None:
    # cv2.imread signals an unreadable/missing file by returning None.
    raise FileNotFoundError("cannot read image: " + img_path)
print("img shape: ", img.shape)

# Resize once to the 224x224 input size. A second resize of the already
# 224x224 image (as in the earlier version) changed nothing and was dropped.
input_data = cv2.resize(img, (224, 224))
# Channel order is left as loaded by OpenCV (BGR); BGR->RGB is disabled:
#input_data = np.array([input_data[:, :, [ 2, 1, 0]]]) # BGR2RGB

# Add the batch axis and scale pixel values to [0, 1] as float32.
input_data = np.reshape(input_data, (1, 224, 224, 3))
input_data = input_data.astype('float32')
input_data = np.multiply(input_data, 1.0 / 255)
# Alternative normalization to [-1, 1]:
#input_data = np.multiply(input_data, 1.0 / 127.5) - 1

index = input_details[0]['index']
interpreter.set_tensor(index, input_data)
interpreter.invoke()
output_data = interpreter.get_tensor(output_details[0]['index'])
print('output_data :',output_data)
16 | 17 | from keras.utils import multi_gpu_model 18 | # 最多支持8块GPU 19 | 20 | model = Model(input=.., output=..) #这里同单卡,声明好模型 21 | 22 | parallel_model = multi_gpu_model(model, gpus=4) #这里假设有4块 23 | 24 | parallel_model.compile(...) #这里同单卡compile 25 | parallel_model.fit(x, y, epochs=40, batch_size=128) 26 | 27 | ``` 28 | 29 | 注意:用多卡跑的时候,batchsize要乘以对应的块数,因为会把总的batchsize分到几块上面。 30 | 31 | 但是还是会报错,说 32 | > could not satisfy explicit device specification '/device:GPU:3' because no supported kernel for GPU device is available 33 | 34 | 解决方法是,在加载模型之前添加: 35 | 36 | ``` python 37 | import tensorflow as tf 38 | from keras.backend.tensorflow_backend import set_session 39 | 40 | config = tf.ConfigProto(allow_soft_placement=True) 41 | set_session(tf.Session(config=config)) 42 | ``` 43 | 44 | 另外如果遇到如下报错: 45 | ```shell 46 | AttributeError: '_TfDeviceCaptureOp' object has no attribute '_set_device_from_string' 47 | ``` 48 | 是由于keras2.2+tensorflow1.14+的一个bug,升级keras到2.3或者降级tensorflow到1.13可以解决。 49 | 50 | P.S.网上还有很多人遇到Modelcheckpoint callback报错的问题,我没遇到过,贴一个供参考: 51 | 52 | [Keras 多GPU下模型和参数保存Modelcheckpoint callback报错](https://blog.csdn.net/Umi_you/article/details/81301002) 53 | 54 | 55 | ### 3.多块GPU训练模型用多块GPU预测 56 | ```python 57 | from keras.utils import multi_gpu_model 58 | 59 | basemodel = Model(inputs=input, outputs=y_pred) ##这里同单卡,声明好模型 60 | 61 | multi_model=multi_gpu_model(basemodel,gpus=4) 62 | multi_model.load_weights("multi_model.h5") #加载多卡训练的模型 63 | 64 | multi_model.predict(...)
# Load an MXNet checkpoint and run one forward pass on a single image.
import mxnet as mx
import numpy as np
import cv2
from collections import namedtuple

# Minimal batch container: Module.forward only needs a ``data`` attribute.
Batch = namedtuple('Batch', ['data'])


def load_model(prefix, epoch, ctx, height, width):
    """Load a saved checkpoint into an inference-only ``Module``.

    Args:
        prefix: Checkpoint file prefix passed to ``mx.model.load_checkpoint``.
        epoch: Epoch number of the checkpoint to load.
        ctx: Device context, e.g. ``mx.cpu()`` or ``mx.gpu(0)``.
        height: Input image height; converted with ``int``.
        width: Input image width; converted with ``int``.

    Returns:
        Tuple ``(sym, mod)``: the loaded symbol and the bound module with
        parameters set. The module is bound for a batch of one 3-channel
        image in channel-first layout ``(1, 3, height, width)``.
    """
    print(prefix, epoch)
    sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
    mod = mx.mod.Module(symbol=sym, context=ctx, label_names=None)
    mod.bind(for_training=False,
             data_shapes=[('data', (1, 3, int(height), int(width)))])
    mod.set_params(arg_params=arg_params, aux_params=aux_params, allow_missing=True)
    return sym, mod


height, width = 112, 112
load_epoch = 0
model_prefix = "mynet"
sym, mod = load_model(model_prefix, load_epoch, mx.cpu(), height, width)  # ctx = mx.cpu() mx.gpu(0)

img_path = './h.jpg'
img = cv2.imread(img_path)
if img is None:
    # cv2.imread signals an unreadable/missing file by returning None.
    raise FileNotFoundError("cannot read image: " + img_path)
img = cv2.resize(img, (width, height))
# OpenCV yields (height, width, channel); the module is bound to
# (channel, height, width). The axes must be permuted - a plain reshape
# would keep the memory order and scramble the pixels across channels.
img = np.transpose(img, (2, 0, 1))
img = np.array([img])  # add the batch axis -> (1, 3, height, width)

print(img.shape)
img = mx.nd.array(img)
mod.forward(Batch([img]))
print('height', height, 'width', width)
print('img', img[0, 2, 0])
prob = mod.get_outputs()[0].asnumpy()

print(prob.shape)
def myCELoss(self, pre, label):
    """Summed cross-entropy loss with label smoothing.

    The one-hot target of every sample is softened to
    ``one_hot * (1 - s) + s / num_classes`` with ``s = self.labelsmooth``,
    and the loss is the sum over all samples and classes of
    ``-target * log_softmax(pre)``. With ``s == 0`` this equals
    ``F.cross_entropy(pre, label, reduction='sum')``.

    Args:
        pre: Raw class scores; the class axis is dimension 1 (the debug
            note in the earlier version showed a shape of ``[2764, 3]``).
        label: Integer class indices, one per sample (shape ``[2764]`` in
            the same note).

    Returns:
        Scalar tensor holding the summed loss.
    """
    num_classes = pre.shape[1]
    smoothing = self.labelsmooth

    log_probs = F.log_softmax(pre, dim=-1)

    # Hard one-hot targets on the same device as the predictions ...
    targets = F.one_hot(label, num_classes).float().to(pre.device)
    # ... softened: true class keeps (1 - s) + s/C, the others get s/C.
    targets = targets * (1-smoothing)+smoothing/num_classes

    return torch.sum(-targets * log_probs)
def getAllName(file_dir, tail_list=('.png', '.jpg')):
    """Recursively collect files under ``file_dir`` with a matching extension.

    Args:
        file_dir: Root directory to walk. A missing directory yields an
            empty list.
        tail_list: Accepted extensions including the leading dot. Matching
            is case-sensitive (``'.JPG'`` does not match ``'.jpg'``). Any
            container supporting ``in`` works; the default is an immutable
            tuple so it cannot be modified between calls.

    Returns:
        List of paths (``root`` joined with the file name) in ``os.walk``
        order.
    """
    matches = []
    for root, _dirs, files in os.walk(file_dir):
        matches.extend(
            os.path.join(root, name)
            for name in files
            if os.path.splitext(name)[1] in tail_list
        )
    return matches
getAllName("../../mywork/data/datasets/raw/train_clean/train_pad") 46 | transform = transforms.Compose([ 47 | # transforms.Resize((224, 224)), 48 | # transforms.CenterCrop(size=(210, 180)), 49 | transforms.Resize((224, 224)), 50 | #transforms.RandomAffine(20, translate=(0.2,0.1), scale=(0.9,1.1),shear=(10,10), fillcolor=(0,0,0)), 51 | #transforms.RandomHorizontalFlip(), 52 | # transforms.RandomRotation(20), 53 | #transforms.ColorJitter(brightness=0.5, contrast=0.5, hue=0.2), 54 | #transforms.ToTensor(), 55 | #transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]) 56 | ]) 57 | 58 | 59 | for i,img_path in enumerate(img_path_list): 60 | img = Image.open(img_path).convert('RGB') 61 | img = transform(img) 62 | img.save("tmp/"+str(i)+".jpg", quality=100) 63 | 64 | if i>100: 65 | break 66 | 67 | -------------------------------------------------------------------------------- /Base/frameworks/pytorch/practice/60分钟入门PyTorch-0.目录.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 60分钟入门深度学习工具-PyTorch(目录)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**作者**:Soumith Chintala\n", 15 | "\n", 16 | "原文翻译自:https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html\n", 17 | " \n", 18 | "中文翻译、注释制作:黄海广\n", 19 | "\n", 20 | "github:https://github.com/fengdu78\n", 21 | "\n", 22 | "代码全部测试通过。\n", 23 | "\n", 24 | "配置环境:PyTorch 1.0,Python 3.6\n", 25 | "\n", 26 | "主机:显卡:一块1080ti;内存:32g(注:绝大部分代码不需要GPU)\n", 27 | "![公众号](images/gongzhong.jpg)\n", 28 | " " 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 本教程的目标:\n", 36 | "\n", 37 | "* 在高层次上理解PyTorch的张量(Tensor)库和神经网络\n", 38 | "* 训练一个小型神经网络对图像进行分类\n", 39 | "* 本教程假设您对numpy有基本的了解\n", 40 | "\n", 41 | "**注意**: 务必确认您已经安装了 torch 和 torchvision 两个包。" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | 
"metadata": {}, 47 | "source": [ 48 | "## 目录" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "* 1.[Pytorch是什么?](60分钟入门PyTorch-1.PyTorch是什么?.ipynb)\n", 56 | "* 2.[AUTOGRAD](60分钟入门PyTorch-2.AUTOGRAD.ipynb)\n", 57 | "* 3.[神经网络](60分钟入门PyTorch-3.神经网络.ipynb)\n", 58 | "* 4.[训练一个分类器](60分钟入门PyTorch-4.训练一个分类器.ipynb)\n", 59 | "* 5.[数据并行](60分钟入门PyTorch-5.数据并行.ipynb)" 60 | ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "pth", 66 | "language": "python", 67 | "name": "pth" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.6.10" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 2 84 | } 85 | -------------------------------------------------------------------------------- /Base/frameworks/tensorflow/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/tensorflow/.DS_Store -------------------------------------------------------------------------------- /Base/frameworks/tensorflow/basic/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/tensorflow/basic/.DS_Store -------------------------------------------------------------------------------- /Base/frameworks/tensorflow/basic/TFLiteModelMaker/README.md: -------------------------------------------------------------------------------- 1 | # awesome_train_tflite 2 | 3 | 利用TF的官方工具TFLite Model Maker,几行代码使用流行的预训练模型训练,直接生成tflite模型。 4 | 5 | ## 使用方法 6 | ### 1. 
环境配置 7 | 8 | * 需要tensorflow 2.0以上,我是安装的2.1.1 9 | * 按照examples/tensorflow_examples/lite/model_maker/requirements.txt安装其余库 10 | 11 | ### 2.工具代码 12 | 13 | * 从[官方github](https://github.com/tensorflow/examples/tree/master/tensorflow_examples/lite/model_maker)下载。 14 | 15 | 16 | ### 3.开始训练 17 | 18 | 参考代码:[examples/train.py](examples/train.py) 19 | 20 | 代码不到十行,有详细注释。 21 | 22 | 23 | ## 其他 24 | 25 | ### 相关链接 26 | * [TFLite Model Maker 官方Github](https://github.com/tensorflow/examples/tree/master/tensorflow_examples/lite/model_maker) 27 | * [TensorFlow Hub:官方预训练模型下载](https://tfhub.dev/) 28 | -------------------------------------------------------------------------------- /Base/frameworks/tensorflow/basic/TFLiteModelMaker/train.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | from tensorflow_examples.lite.model_maker.core.data_util.image_dataloader import ImageClassifierDataLoader 5 | from tensorflow_examples.lite.model_maker.core.task import image_classifier 6 | from tensorflow_examples.lite.model_maker.core.task import model_spec as ms 7 | 8 | 9 | data_path = r"/home/AlgorithmicGroup/yw/workshop/antiface/data/clean_data" 10 | # 这个path指图像数据文件夹路径,其下面按类别分为多个子文件夹 11 | data = ImageClassifierDataLoader.from_folder(data_path) 12 | train_data, test_data = data.split(0.92) 13 | 14 | print("done data load.") 15 | 16 | model = image_classifier.create(train_data, 17 | 18 | model_spec=ms.efficientnet_lite0_spec, 19 | shuffle=True, 20 | validation_data=test_data, 21 | batch_size=32, 22 | epochs=20, 23 | train_whole_model=False, 24 | dropout_rate=0.2, 25 | learning_rate=0.005, 26 | momentum=0.9) 27 | #指定模型为efficientnet_lite0,可以换成其他的 28 | """ 29 | def get_default_hparams(): 30 | return HParams( 31 | train_epochs=5, 32 | do_fine_tuning=False,(train_whole_model) 33 | batch_size=32, 34 | learning_rate=0.005, 35 | momentum=0.9, 36 | dropout_rate=0.2) 37 | """ 38 | 39 | 40 | #loss, accuracy = model.evaluate(test_data) 41 
| #训练过程也会打印相关信息,类似keras 42 | 43 | 44 | model.export('image_classifier.tflite', 'image_labels.txt') 45 | #导出tflite模型,image_labels即对应的类别 46 | -------------------------------------------------------------------------------- /Base/frameworks/tensorflow/basic/TensorFlowExample.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import tensorflow as tf" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "Writing and running programs in TensorFlow has the following steps:\n", 20 | "\n", 21 | "1. Create Tensors (variables) that are not yet executed/evaluated. \n", 22 | "2. Write operations between those Tensors.\n", 23 | "3. Initialize your Tensors. \n", 24 | "4. Create a Session. \n", 25 | "5. Run the Session. This will run the operations you'd written above. 
" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 12, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "0.0\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "coefficients = np.array([[1.],[-10],[25.]])\n", 43 | "\n", 44 | "w = tf.Variable(0,dtype=tf.float32)\n", 45 | "x = tf.placeholder(tf.float32,[3,1])\n", 46 | "\n", 47 | "cost = x[0][0]*w**2 + x[1][0]*w + x[2][0]\n", 48 | "train = tf.train.GradientDescentOptimizer(0.01).minimize(cost)\n", 49 | "\n", 50 | "init = tf.global_variables_initializer()\n", 51 | "\n", 52 | "session = tf.Session()\n", 53 | "session.run(init)\n", 54 | "\n", 55 | "print(session.run(w))" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 13, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "4.99999\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "for i in range(1000):\n", 73 | " session.run(train,feed_dict={x:coefficients})\n", 74 | "print(session.run(w))" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.5.4" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 2 108 | } 109 | -------------------------------------------------------------------------------- /Base/frameworks/tensorflow/basic/ckpt2pb.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 
def getTensorName(checkpoint_path):
    """Dump the names of all variables stored in a checkpoint to ``tensorname.txt``.

    :param checkpoint_path: checkpoint prefix handed to ``NewCheckpointReader``.
    :return: None. One ``tensor_name: <name>`` line per variable is written to
        ``tensorname.txt`` in the current working directory.
    """
    reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_path)
    var_to_shape_map = reader.get_variable_to_shape_map()

    with open("tensorname.txt", "w", encoding="utf-8") as f:
        f.writelines('tensor_name: ' + key + '\n' for key in var_to_shape_map)


def freeze_graph(input_checkpoint, output_graph,
                 output_node_names="logits/age/BiasAdd,logits/gender/BiasAdd"):
    """Convert a checkpoint (``.meta`` + variable files) into a frozen ``.pb`` graph.

    :param input_checkpoint: checkpoint prefix; ``input_checkpoint + '.meta'``
        is loaded as the graph structure and the same prefix is restored.
    :param output_graph: path the serialized frozen GraphDef is written to.
    :param output_node_names: names of the output nodes to keep, either as a
        comma-separated string or as an iterable of names. Every name must be
        a node that exists in the original model. The default keeps the two
        nodes this script was originally written for.
    :return: None
    :raises ValueError: if no output node name is given.
    """
    if isinstance(output_node_names, str):
        node_names = [name.strip() for name in output_node_names.split(",")]
    else:
        node_names = [str(name).strip() for name in output_node_names]
    node_names = [name for name in node_names if name]
    if not node_names:
        raise ValueError("output_node_names must contain at least one node name")

    # Import into a private graph so that calling this function more than once
    # does not keep adding nodes to the process-wide default graph.
    with tf.Graph().as_default() as graph:
        saver = tf.compat.v1.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices=True)

        with tf.compat.v1.Session(graph=graph) as sess:
            # Restore the variable values belonging to the imported structure.
            saver.restore(sess, input_checkpoint)
            # Replace every variable by a constant holding its restored value.
            output_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
                sess=sess,
                input_graph_def=graph.as_graph_def(),
                output_node_names=node_names)

    with tf.io.gfile.GFile(output_graph, "wb") as f:
        f.write(output_graph_def.SerializeToString())
    # Number of nodes left in the frozen graph.
    print("%d ops in the final graph." % len(output_graph_def.node))
# Run a single image through an ONNX model with onnxruntime and print the raw
# outputs.
model_path = 'keras_model.onnx'
image_path = './1593250301105_1f43f6a0e8.png'

sess = rt.InferenceSession(model_path)
input_name = sess.get_inputs()[0].name

img = cv2.imread(image_path)
if img is None:
    # cv2.imread signals "missing/unreadable file" by returning None instead
    # of raising; fail here with a message that names the file.
    raise FileNotFoundError("cv2.imread could not read %r" % image_path)
print("img shape: ", img.shape)
rows = img.shape[0]
cols = img.shape[1]

inp = cv2.resize(img, (224, 224))
inp = inp[:, :, [2, 1, 0]]  # reverse the channel order (BGR -> RGB)
inp = inp / 255.0           # scale pixel values to [0, 1]

# Add a leading batch axis, then move the channel axis in front of H and W.
data = inp.reshape(1, inp.shape[0], inp.shape[1], 3)
print(data.shape)
data = np.transpose(data, (0, 3, 1, 2))
print(data.shape)
data = data.astype(np.float32)

# Passing None as the output list asks the session for every model output.
res = sess.run(None, {input_name: data})

print("res: ", res)
# Load a frozen detection graph, run it on one image and draw every detection
# whose score exceeds 0.3.
model_dir = 'frozen_inference_graph.pb'
image_path = 'coco.png'

# Read the graph.
with tf.gfile.FastGFile(model_dir, 'rb') as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

# Read the image up front so a bad path fails before a session is created.
img = cv2.imread(image_path)
if img is None:
    # cv2.imread returns None rather than raising when the file is unreadable.
    raise FileNotFoundError("cv2.imread could not read %r" % image_path)
rows = img.shape[0]
cols = img.shape[1]
inp = cv2.resize(img, (300, 300))
inp = inp[:, :, [2, 1, 0]]  # BGR2RGB

with tf.Session() as sess:
    # The original called sess.graph.as_default() without `with`, which only
    # builds a context manager and discards it. Enter it explicitly so the
    # import is guaranteed to target the session's graph.
    with sess.graph.as_default():
        tf.import_graph_def(graph_def, name='')

    # Run the model.
    out = sess.run([sess.graph.get_tensor_by_name('num_detections:0'),
                    sess.graph.get_tensor_by_name('detection_scores:0'),
                    sess.graph.get_tensor_by_name('detection_boxes:0'),
                    sess.graph.get_tensor_by_name('detection_classes:0')],
                   feed_dict={'image_tensor:0': inp.reshape(1, inp.shape[0], inp.shape[1], 3)})

    # Visualize detected bounding boxes.
    num_detections = int(out[0][0])
    print(num_detections)
    for i in range(num_detections):
        score = float(out[1][0][i])
        if score <= 0.3:
            continue
        bbox = [float(v) for v in out[2][0][i]]
        # The box values are multiplied by the original image size, in the
        # order (rows, cols, rows, cols) -- i.e. they are used as normalized
        # (top, left, bottom, right) coordinates.
        x = bbox[1] * cols
        y = bbox[0] * rows
        right = bbox[3] * cols
        bottom = bbox[2] * rows
        cv2.rectangle(img, (int(x), int(y)), (int(right), int(bottom)), (125, 255, 51), thickness=2)

cv2.imshow('TensorFlow MobileNet-SSD', img)
cv2.waitKey()
def get_ga(data):
    """Decode the (gender, age) pair from a model output list.

    Only the first element of ``data`` is used. It must be a single-row 2-D
    array (or nested list) with at least 202 columns:

    * columns 0-1: the index of the larger value is returned as ``gender``;
    * columns 2-201: read as 100 consecutive pairs; ``age`` is the number of
      pairs whose second value is strictly larger than the first.

    :param data: sequence whose first item has shape ``(1, N)`` with ``N >= 202``.
    :return: ``(gender, age)`` -- ``gender`` is the NumPy integer produced by
        ``np.argmax``, ``age`` is a Python ``int``.
    :raises ValueError: if the first item does not have the expected shape.
    """
    ret = np.asarray(data[0])
    if ret.ndim != 2 or ret.shape[0] != 1 or ret.shape[1] < 202:
        raise ValueError(
            "expected an output of shape (1, N) with N >= 202, got %r" % (ret.shape,))

    print("ret length: ", len(ret[0]))

    gender = np.argmax(ret[0, 0:2])
    # argmax over each pair is 1 only when the second value wins, so summing
    # the per-pair results counts those pairs.
    pair_winners = np.argmax(ret[0, 2:202].reshape(100, 2), axis=1)
    age = int(pair_winners.sum())

    return gender, age
26 | # img = cv2.imread( '../t7.png') 27 | # print("img shape: ", img.shape) 28 | # rows = img.shape[ 0] 29 | # cols = img.shape[ 1] 30 | # inp = cv2.resize(img, ( 112, 112)) 31 | # inp = inp[:, :, [ 2, 1, 0]] # BGR2RGB 32 | # # Run the model 33 | # out = sess.run([sess.graph.get_tensor_by_name( 'output:0'),], 34 | # feed_dict={ 'data:0': inp.reshape( 1, inp.shape[ 0], inp.shape[ 1], 3)}) 35 | # # Visualize detected bounding boxes. 36 | # print("out: ", out) 37 | # # detections = int(out[ 0][ 0]) 38 | # # print(detections) 39 | 40 | # def get_ga(data): 41 | 42 | # ret = data[0] 43 | 44 | # print("ret length: ", len(ret[0])) 45 | 46 | # #ret = ret1 47 | 48 | # g = ret[:,0:2].flatten() 49 | # gender = np.argmax(g) 50 | # a = ret[:,2:202].reshape( (100,2) ) 51 | # a = np.argmax(a, axis=1) 52 | # age = int(sum(a)) 53 | 54 | # return gender, age 55 | 56 | # gender, age = get_ga(out) 57 | # print(gender, age) 58 | 59 | 60 | 61 | 62 | # Load TFLite model and allocate tensors. 63 | interpreter = tf.lite.Interpreter(model_path="quantized_model.tflite") 64 | interpreter.allocate_tensors() 65 | 66 | input_details = interpreter.get_input_details() 67 | output_details = interpreter.get_output_details() 68 | 69 | print(input_details) 70 | print(output_details) 71 | 72 | 73 | img = cv2.imread( '../head112.jpg') 74 | print("img shape: ", img.shape) 75 | rows = img.shape[ 0] 76 | cols = img.shape[ 1] 77 | input_data = cv2.resize(img, ( 112, 112)) 78 | input_data = np.array([input_data[:, :, [ 2, 1, 0]]]) # BGR2RGB 79 | print(input_data.shape) 80 | input_data = input_data.astype(np.float32) 81 | index = input_details[0]['index'] 82 | interpreter.set_tensor(index, input_data) 83 | interpreter.invoke() 84 | output_data = interpreter.get_tensor(output_details[0]['index']) 85 | print('output_data shape:',output_data.shape) 86 | 87 | def get_ga(data): 88 | print("ret length: ", len(data)) 89 | g = data[:,0:2].flatten() 90 | gender = np.argmax(g) 91 | a = data[:,2:202].reshape( (100,2) ) 
def OutputsOffset(subgraph, j):
    """Return the position in the model buffer of entry ``j`` of a subgraph's outputs vector.

    The vector is looked up through vtable slot 8 of ``subgraph``
    (presumably the ``outputs`` field of the TFLite SubGraph table -- confirm
    against the generated schema). Entries are addressed 4 bytes apart.

    :param subgraph: flatbuffers table object exposing ``_tab``.
    :param j: index of the entry inside the vector.
    :return: absolute byte position of the entry, or 0 when the field is absent.
    """
    o = flatbuffers.number_types.UOffsetTFlags.py_type(subgraph._tab.Offset(8))
    if o != 0:
        a = subgraph._tab.Vector(o)
        return a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)
    return 0


# Based on https://github.com/raymond-li/tflite_tensor_outputter/blob/master/tflite_tensor_outputter.py
def buffer_change_output_tensor_to(model_buffer, new_tensor_i):
    """Return a copy of a TFLite model whose first output points at another tensor.

    The 4 bytes holding the first output tensor index of subgraph 0 are
    replaced by ``new_tensor_i``; the rest of the buffer is copied unchanged.

    :param model_buffer: serialized ``.tflite`` model as ``bytes``.
    :param new_tensor_i: index of the tensor that should become the output.
    :return: new ``bytes`` object with the patched output index.
    :raises ValueError: if subgraph 0 has no outputs vector. Without this
        check the first 4 bytes of the model would be overwritten, silently
        corrupting it.
    """
    root = schema_fb.Model.GetRootAsModel(model_buffer, 0)
    output_tensor_index_offset = OutputsOffset(root.Subgraphs(0), 0)
    if output_tensor_index_offset == 0:
        raise ValueError("subgraph 0 has no outputs vector to rewrite")

    # Flatbuffer scalars are stored in little-endian. Masking keeps the
    # original behaviour of using only the low 32 bits of the index.
    new_tensor_i_bytes = (new_tensor_i & 0xFFFFFFFF).to_bytes(4, "little")

    # Replace the 4 bytes corresponding to the first output tensor index.
    return (model_buffer[:output_tensor_index_offset]
            + new_tensor_i_bytes
            + model_buffer[output_tensor_index_offset + 4:])
def inverse_color(image):
    """Return a colour-inverted copy of a single-channel image.

    Every pixel ``v`` becomes ``255 - v``. The input array is left untouched
    and the result has the same shape and dtype as the input.

    :param image: 2-D NumPy array (height x width).
    :return: new array with inverted values.
    :raises ValueError: if ``image`` is not 2-D.
    """
    if image.ndim != 2:
        raise ValueError(
            "inverse_color expects a 2-D (height, width) image, got shape %r"
            % (image.shape,))
    # One vectorised expression replaces the per-pixel Python double loop;
    # astype() keeps the input dtype, as the element-wise assignment did.
    return (255 - image).astype(image.dtype)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/tensorflow/demo/ten_people_face_reconize/model/.DS_Store -------------------------------------------------------------------------------- /Base/frameworks/tensorflow/demo/ten_people_face_reconize/olivettifaces.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/tensorflow/demo/ten_people_face_reconize/olivettifaces.gif -------------------------------------------------------------------------------- /Base/frameworks/tensorflow/demo/ten_people_face_reconize/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/frameworks/tensorflow/demo/ten_people_face_reconize/result.png -------------------------------------------------------------------------------- /Base/frameworks/tensorflow/readme.md: -------------------------------------------------------------------------------- 1 | ## 基于TF的一些东西 2 | 3 | #### 使用inspect_checkpoint来查看ckpt里的内容 打印节点信息 4 | ~~~ 5 | from tensorflow.python.tools import inspect_checkpoint as chkp 6 | from tensorflow.python.framework import meta_graph 7 | input_graph_def = meta_graph.read_meta_graph_file("model2.ckpt.meta").graph_def 8 | for node in input_graph_def.node: 9 | print(node.name) 10 | ~~~ 11 | 12 | #### Tensorboard 13 | > tensorboard --logdir=/Users/fire/A 14 | 15 | #### pb2onnx 16 | https://github.com/onnx/tensorflow-onnx 17 | 18 | 19 | #### 剪枝 20 | * [官方示例](https://tensorflow.google.cn/model_optimization/guide/pruning/pruning_with_keras) 21 | * [博客示例](https://www.cnblogs.com/purple5252/p/11812207.html) 22 | * [我的示例(基于mobilenetv2)](./basic/pruned_demo.py) 23 | 
24 | ### Basic 25 | 26 | * [TensorFlow Example](./basic/TensorFlowExample.ipynb) 27 | * [graph/ placeholder/ TensorBoard](./basic/Learn_tf.ipynb) 28 | * [模型保存读取](./basic/tf_save_load.ipynb) 29 | * [ckpt转pb](./basic/ckpt2pb.py) | [ckpt模型加载预测](./basic/ckpt_pre.py) 30 | * [pb转tflite](./basic/pb2tflite.py)| [pb模型测试](./basic/tf_pb_pre.py) | [tf加载pb模型](./basic/read_pb.py) 31 | * [tflite模型测试](./basic/tflite_pre.py) | [转onnx后模型测试](./basic/onnx_pre.py) 32 | * [TFLiteModelMaker轻松利用预训练模型训练tflite(支持efficientnetlite等)](./basic/TFLiteModelMaker) 33 | * [TFLite打印中间节点输出](./basic/tflite_show_middle_output.py) 34 | 35 | ### Demo 36 | 37 | * [逻辑回归](./demo/TF_logsitic.ipynb) 38 | * [mnist手写数字识别(NN)](./demo/basic_mnist_demo.py) 39 | * [mnist手写数字识别(CNN)](./demo/mnist_cnn_demo.py) 40 | * [10人版人脸识别](./demo/ten_people_face_reconize) 41 | 42 | 43 | ### Resource 44 | * [TensorRT安装及使用教程](https://blog.csdn.net/zong596568821xp/article/details/86077553) 45 | -------------------------------------------------------------------------------- /Base/tools/lightgbm/readme.md: -------------------------------------------------------------------------------- 1 | From [官方文档](https://github.com/Microsoft/LightGBM) 2 | 3 | * simple_example.py 4 | * Construct Dataset 5 | * Basic train and predict 6 | * Eval during training 7 | * Early stopping 8 | * Save model to file 9 | 10 | * sklearn_example.py 11 | * Basic train and predict with sklearn interface 12 | * Feature importances with sklearn interface -------------------------------------------------------------------------------- /Base/tools/lightgbm/simpleexample.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import json 4 | import lightgbm as lgb 5 | import pandas as pd 6 | from sklearn.metrics import mean_squared_error 7 | 8 | 9 | # load or create your dataset 10 | print('Load data...') 11 | df_train = pd.read_csv('../regression/regression.train', 
header=None, sep='\t') 12 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 13 | 14 | y_train = df_train[0].values 15 | y_test = df_test[0].values 16 | X_train = df_train.drop(0, axis=1).values 17 | X_test = df_test.drop(0, axis=1).values 18 | 19 | # create dataset for lightgbm 20 | lgb_train = lgb.Dataset(X_train, y_train) 21 | lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) 22 | 23 | # specify your configurations as a dict 24 | params = { 25 | 'task': 'train', 26 | 'boosting_type': 'gbdt', 27 | 'objective': 'regression', 28 | 'metric': {'l2', 'auc'}, 29 | 'num_leaves': 31, 30 | 'learning_rate': 0.05, 31 | 'feature_fraction': 0.9, 32 | 'bagging_fraction': 0.8, 33 | 'bagging_freq': 5, 34 | 'verbose': 0 35 | } 36 | 37 | print('Start training...') 38 | # train 39 | gbm = lgb.train(params, 40 | lgb_train, 41 | num_boost_round=20, 42 | valid_sets=lgb_eval, 43 | early_stopping_rounds=5) 44 | 45 | print('Save model...') 46 | # save model to file 47 | gbm.save_model('model.txt') 48 | 49 | print('Start predicting...') 50 | # predict 51 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) 52 | # eval 53 | print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) -------------------------------------------------------------------------------- /Base/tools/lightgbm/sklearnexample.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable = invalid-name, C0111 3 | import lightgbm as lgb 4 | import pandas as pd 5 | from sklearn.metrics import mean_squared_error 6 | from sklearn.model_selection import GridSearchCV 7 | 8 | # load or create your dataset 9 | print('Load data...') 10 | df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') 11 | df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') 12 | 13 | y_train = df_train[0].values 14 | y_test = df_test[0].values 15 | X_train = 
df_train.drop(0, axis=1).values 16 | X_test = df_test.drop(0, axis=1).values 17 | 18 | print('Start training...') 19 | # train 20 | gbm = lgb.LGBMRegressor(objective='regression', 21 | num_leaves=31, 22 | learning_rate=0.05, 23 | n_estimators=20) 24 | gbm.fit(X_train, y_train, 25 | eval_set=[(X_test, y_test)], 26 | eval_metric='l1', 27 | early_stopping_rounds=5) 28 | 29 | print('Start predicting...') 30 | # predict 31 | y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_) 32 | # eval 33 | print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) 34 | 35 | # feature importances 36 | print('Feature importances:', list(gbm.feature_importances_)) 37 | 38 | # other scikit-learn modules 39 | estimator = lgb.LGBMRegressor(num_leaves=31) 40 | 41 | param_grid = { 42 | 'learning_rate': [0.01, 0.1, 1], 43 | 'n_estimators': [20, 40] 44 | } 45 | 46 | gbm = GridSearchCV(estimator, param_grid) 47 | 48 | gbm.fit(X_train, y_train) 49 | 50 | print('Best parameters found by grid search are:', gbm.best_params_) -------------------------------------------------------------------------------- /Base/tools/scikit-learn/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/tools/scikit-learn/.DS_Store -------------------------------------------------------------------------------- /Base/tools/scikit-learn/README.md: -------------------------------------------------------------------------------- 1 | # scikit-learn 2 | 3 | ### 常用 4 | * 划分验证集 5 | 6 | ```python 7 | from sklearn.cross_validation import train_test_split 8 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) 9 | ``` 10 | * Grid Search 11 | ```python 12 | param_grid = {'n_estimators': [300, 500], 'max_features': [10, 12, 14]} 13 | model = grid_search.GridSearchCV(estimator=rfr, param_grid=param_grid, n_jobs=1, cv=10, 
verbose=20, scoring=RMSE) 14 | model.fit(X_train, y_train) 15 | ``` 16 | * LabelEncoder 17 | ```python 18 | from sklearn.preprocessing import LabelEncoder 19 | le = LabelEncoder() 20 | le.fit([1,5,67,100]) 21 | le.transform([1,1,100,67,5]) 22 | #输出: array([0,0,3,2,1]) 23 | #可通过le.inverse_transform(x)转换回去 24 | ``` 25 | * sklearn.utils.shuffle(多个数组按同样顺序打乱) 26 | ```python 27 | def fill_feed_dict(data_X, data_Y, batch_size): 28 | """Generator datasets to yield batches""" 29 | # Shuffle data first. 30 | shuffled_X, shuffled_Y = shuffle(data_X, data_Y) 31 | for idx in range(data_X.shape[0] // batch_size): 32 | x_batch = shuffled_X[batch_size * idx: batch_size * (idx + 1)] 33 | y_batch = shuffled_Y[batch_size * idx: batch_size * (idx + 1)] 34 | yield x_batch, y_batch 35 | ``` 36 | 37 | ### 算法 38 | * [GBDT(MART)迭代决策树入门教程 | 简介](http://blog.csdn.net/suranxu007/article/details/49910323) 39 | 40 | ### 实现 41 | * [常用算法调用(LR/ RF/ GBDT/ knn/ SVM)](./useful.py) 42 | * [logistic回归](./sklearn_LR.py) 43 | * [皮尔逊相关度](./pearsonr.ipynb) 44 | * [利用kmeans对图片颜色聚类并可视化](./demo/kmeans_color.py) 45 | * [高维数据可视化tSNE](./demo/tSNE.py) 46 | 47 | ### Choosing the right estimator 48 | 49 | ![Choosing the right estimator](./choose.png) 50 | 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /Base/tools/scikit-learn/choose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/tools/scikit-learn/choose.png -------------------------------------------------------------------------------- /Base/tools/scikit-learn/demo/kmeans_color.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import os,shutil 5 | import random 6 | 7 | import cv2 8 | from sklearn.cluster import KMeans 9 | from sklearn.externals import joblib 10 | 11 | import time 
def getKmeansColor(img, n_cluster):
    """Cluster the colours of the central region of an image with k-means.

    The image is resized to 300x300, the central window (from 20% to 80% of
    each side) is kept, and its pixels are clustered as 3-component vectors.
    The cluster centres and the per-cluster pixel ratios are printed and
    returned.

    :param img: 3-channel image array accepted by ``cv2.resize``.
    :param n_cluster: number of k-means clusters.
    :return: ``(cluster_centers, color_count_ratio)`` -- the fitted
        ``cluster_centers_`` array and a list of ``n_cluster`` floats, where
        entry ``i`` is the fraction of pixels assigned to cluster ``i``.
    """
    img = cv2.resize(img, (300, 300))

    # Three-way unpacking also rejects images without a channel axis.
    h, w, _ = img.shape
    img = img[int(h * 0.2):int(h * 0.8), int(w * 0.2):int(w * 0.8)]

    # One row per pixel, one column per channel.
    data = np.reshape(img, (-1, 3))

    clf = KMeans(n_clusters=n_cluster)
    clf.fit(data)

    # Cluster centres.
    print(clf.cluster_centers_)

    # clf.labels_ holds the cluster of every sample; bincount with minlength
    # yields a count for each cluster, including empty ones.
    counts = np.bincount(clf.labels_, minlength=n_cluster)
    color_count_ratio = (counts / len(data)).tolist()
    print(color_count_ratio)

    # clf.inertia_ can be compared across several n_cluster values to choose
    # the number of clusters (lower means tighter clusters).

    return clf.cluster_centers_, color_count_ratio
# Minimal t-SNE example on four 3-D points.
X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
# The default perplexity (30) is not valid for only four samples: recent
# scikit-learn releases require perplexity < n_samples and raise otherwise.
tsne = TSNE(n_components=2, perplexity=2)
tsne.fit_transform(X)
print(tsne.embedding_)


# t-SNE dimensionality reduction demonstrated on the S-curve dataset.
from time import time

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- the original marks this import as needed
from matplotlib.ticker import NullFormatter

from sklearn import manifold, datasets

n_points = 1000
# X has shape (n_points, 3); color has shape (n_points,).
# make_s_curve is exposed directly on sklearn.datasets; the old
# datasets.samples_generator module no longer exists in current releases.
X, color = datasets.make_s_curve(n_points, random_state=0)
n_neighbors = 10
n_components = 2

fig = plt.figure(figsize=(8, 8))
# Figure title, e.g. "Manifold Learning with 1000 points, 10 neighbors".
plt.suptitle("Manifold Learning with %i points, %i neighbors"
             % (n_points, n_neighbors), fontsize=14)


# 3-D plot of the S curve.
ax = fig.add_subplot(211, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral)
ax.view_init(4, -72)  # initial viewing angle

# t-SNE
t0 = time()
tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
Y = tsne.fit_transform(X)  # embedded output
t1 = time()
print("t-SNE: %.2g sec" % (t1 - t0))  # elapsed time
ax = fig.add_subplot(2, 1, 2)
plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
plt.title("t-SNE (%.2g sec)" % (t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())  # hide tick labels
ax.yaxis.set_major_formatter(NullFormatter())
# plt.axis('tight')

plt.show()
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from scipy.stats import pearsonr\n", 12 | "#doc:https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html\n", 13 | "\n", 14 | "#Calculates a Pearson correlation coefficient \n", 15 | "#and the p-value for testing non-correlation. \n", 16 | "# 原假设:无相关性\n", 17 | "#Calculates a Pearson correlation coefficient and the p-value for testing non-correlation.\n", 18 | "\n", 19 | "## Pearson’s correlation requires that each dataset be normally distributed\n", 20 | "\n", 21 | "#p值反应了相关系数的显著性。\n", 22 | "#The p-value roughly indicates the probability of an uncorrelated system \n", 23 | "#producing datasets that have a Pearson correlation at least as extreme as \n", 24 | "#the one computed from these datasets. The p-values are not entirely reliable \n", 25 | "#but are probably reasonable for datasets larger than 500 or so." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 8, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "(1.0, 0.0)\n", 38 | "(0.9450110410366913, 0.0549889589633087)\n", 39 | "(-0.4465937565388721, 0.5534062434611278)\n", 40 | "(0.9450110410366913, 0.0549889589633087)\n", 41 | "(-0.4465937565388721, 0.5534062434611278)\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "x1 = [1,2,2,4]\n", 47 | "x2 = [2,3,3,5]\n", 48 | "x3 = [4,9,16,25]\n", 49 | "x4 = [4,2,9,1]\n", 50 | "\n", 51 | "print(pearsonr(x1,x2))\n", 52 | "print(pearsonr(x1,x3))\n", 53 | "print(pearsonr(x1,x4))\n", 54 | "print(pearsonr(x2,x3))\n", 55 | "print(pearsonr(x2,x4))" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 3", 71 | "language": "python", 72 | "name": "python3" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.6.2" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 2 89 | } 90 | -------------------------------------------------------------------------------- /Base/tools/scikit-learn/sklearn_LR.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | 8 | #==================== Part 0: Basic Function ==================== 9 | with open('D:\ex2data1.txt', 'r') as f: 10 | data = f.readlines() #txt中所有字符串读入data 11 | 12 | datamat = [] 13 | for line in data: 14 | odom = line.strip().split(',') #将单个数据分隔开存好 15 | numbers_float = map(float, odom) #转化为浮点数 16 | datamat.append(numbers_float) 17 | 18 | #print 
def plotData(x, y):
    """Scatter-plot the two exam scores, colored by admission label.

    Args:
        x: 2-D array whose columns 0 and 1 hold the exam-1 and exam-2 scores.
        y: 2-D array whose column 0 holds the 0/1 label for each row of x.

    The figure is shown with plt.show(); nothing is returned.
    """
    fig = plt.figure()
    fig.add_subplot(111)  # draw both classes on one set of axes

    # y is stored as a column, so take column 0 before comparing.
    # Label 1 is the "Admitted" class, matching the legend below and the
    # decision-boundary plot later in this script; the masks used to be
    # swapped, which mislabelled both classes in the legend.
    admitted = y[:, 0] == 1
    rejected = y[:, 0] == 0
    ax1 = plt.scatter(x[admitted, 0], x[admitted, 1], marker='x', color='m')
    ax2 = plt.scatter(x[rejected, 0], x[rejected, 1], marker='o', color='r')
    plt.xlabel('exam1 score')
    plt.ylabel('exam2 score')

    plt.legend([ax1, ax2], ['Admitted', 'Not admitted'])
    plt.show()
62 | print lr.coef_,lr.intercept_ 63 | final_theta = np.zeros((3,1)) 64 | final_theta[0] = lr.intercept_ 65 | final_theta[1],final_theta[2]= lr.coef_[0] 66 | print final_theta 67 | 68 | 69 | # In[4]: 70 | 71 | 72 | #%% ============= Part 4: Visualizing J(theta_0, theta_1) ============= 73 | fig = plt.figure() 74 | ax= fig.add_subplot(111) #使画在一个图上 75 | 76 | pos = np.where(y[:,0]==1) #y为类似矩阵形式,所以要再取第一列 77 | neg = np.where(y[:,0]==0) 78 | 79 | ax1 = plt.scatter(x[pos,0], x[pos,1], marker = 'x', color = 'm') 80 | ax2 = plt.scatter(x[neg,0], x[neg,1], marker = 'o', color = 'r') 81 | 82 | plt.xlabel('exam1 score') 83 | plt.ylabel('exam2 score') 84 | 85 | plt.legend([ax1, ax2], ['Admitted', 'Not admitted']) 86 | 87 | 88 | #plotX = [30,100] #范围a[2] 89 | plotX = np.arange(30,100,1) 90 | plotY = (-final_theta[0]-final_theta[1]*plotX)/final_theta[2] #由0=w0x0+w1x1+w2x2推导,这里的y就是x2,x0=1 91 | #注意等于0!!!因为这是分类问题 92 | plt.plot(plotX,plotY)#调用plot函数绘制得到由点生成的线条 93 | 94 | 95 | plt.show() 96 | 97 | 98 | # In[ ]: 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /Base/tools/scikit-learn/useful.py: -------------------------------------------------------------------------------- 1 | ### 1. 
LR 2 | from sklearn.linear_model import LogisticRegression 3 | lr = LogisticRegression(C=1000.0, random_state=0) 4 | lr.fit(train_x, train_y) 5 | y_pre = lr.predict(val_x) 6 | 7 | 8 | ### 2.RF 9 | #Random Forest 一般在 max_features 设为 Feature 数量的平方根附近得到最佳结果。 10 | from sklearn.ensemble import RandomForestClassifier 11 | from sklearn.datasets import make_classification 12 | 13 | rf = RandomForestClassifier(max_depth=2, random_state=0) 14 | rf.fit(train_x, train_y) 15 | 16 | y_pre = rf.predict(val_x) 17 | y_pre[y_pre>0.5] = 1 18 | y_pre[y_pre<0.5] = 0 19 | 20 | 21 | ### 3.GBDT 22 | from sklearn.ensemble import GradientBoostingRegressor 23 | gbdt=GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, max_depth=3) 24 | gbdt.fit(train_x, train_y) 25 | 26 | y_pre=gbdt.predict(val_x) 27 | y_pre[y_pre>0.5] = 1 28 | y_pre[y_pre<0.5] = 0 29 | 30 | ### 4.knn 31 | from sklearn import neighbors 32 | 33 | knn = neighbors.KNeighborsClassifier(n_neighbors=8,leaf_size=30,p=3) 34 | knn.fit(x,y) 35 | 36 | 37 | ### 5.svm 38 | #http://blog.csdn.net/u013709270/article/details/53365744 (d多分类) 39 | from sklearn import svm 40 | X = [[0, 0], [1, 1]] 41 | y = [0, 1] 42 | clf = svm.SVC() 43 | clf.fit(X, y) 44 | clf.predict([[2., 2.]]) 45 | -------------------------------------------------------------------------------- /Base/tools/spark/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/tools/spark/.DS_Store -------------------------------------------------------------------------------- /Base/tools/spark/README.md: -------------------------------------------------------------------------------- 1 | # Spark 2 | 3 | ### 文章 4 | [在windows安装部署spark(python版)](https://blog.csdn.net/hjxinkkl/article/details/57083549?winzoom=1) 5 | 6 | ### 代码 7 | [最简单的示例](start.py) 8 | 9 | ### Book 10 | * 《Spark快速大数据分析》 11 | 1. 
[配套代码](https://github.com/databricks/learning-spark) 12 | 2. 分章笔记:[RDD编程](./learnsparkLDA/learn_sparkRDD.ipynb) | [pair RDD](./learnsparkLDA/spark_pairRDD.ipynb) | [读存数据](./learnsparkLDA/spark_saveload.ipynb) | [累加器&广播变量&分区操作&数值RDD](./learnsparkLDA/spark_uplevel.ipynb) | [MLlib](./learnsparkLDA/spark_MLlib.ipynb) 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Base/tools/spark/learnsparkLDA/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Base/tools/spark/learnsparkLDA/.DS_Store -------------------------------------------------------------------------------- /Base/tools/spark/start.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | conf = SparkConf().setMaster("local[*]").setAppName("First_App") 3 | sc = SparkContext(conf=conf) 4 | 5 | data = sc.parallelize(range(10)) 6 | ans = data.reduce(lambda x, y: x + y) 7 | print (ans) 8 | 9 | ''' 10 | output: 11 | 12 | Setting default log level to "WARN". 13 | To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 14 | 2018-05-16 17:08:22 WARN Utils:66 - Service 'SparkUI' could not bind on port 4040. Attempting port 4041. 
15 | 16 | [Stage 0:> (0 + 4) / 4] 17 | [Stage 0:==============> (1 + 3) / 4] 18 | [Stage 0:=============================> (2 + 2) / 4] 19 | [Stage 0:============================================> (3 + 1) / 4] 20 | 21 | 45 22 | [Decode error - output not utf-8] 23 | [Decode error - output not utf-8] 24 | [Decode error - output not utf-8] 25 | ''' 26 | -------------------------------------------------------------------------------- /Base/tools/xgboost/readme.md: -------------------------------------------------------------------------------- 1 | ### 资料 2 | * [官网](http://xgboost.readthedocs.io/en/latest/) 3 | * [Python API](http://xgboost.readthedocs.io/en/latest/python/python_api.html) 4 | * [安装教程](http://m.blog.csdn.net/huangdunxian/article/details/53432432) 5 | 6 | ### 应用 7 | * [demo](./xgboost.ipynb) 8 | * [多分类](./xgboost_multi.ipynb) 9 | 10 | 11 | ### 调参 12 | Xgboost 的调参。通常认为对它性能影响较大的参数有: 13 | * eta:每次迭代完成后更新权重时的步长。越小训练越慢。 14 | * num_round:总共迭代的次数。 15 | * subsample:训练每棵树时用来训练的数据占全部的比例。用于防止 Overfitting。 16 | * colsample_bytree:训练每棵树时用来训练的特征的比例,类似 RandomForestClassifier 的 max_features。 17 | * max_depth:每棵树的最大深度限制。与 Random Forest 不同,Gradient Boosting 如果不对深度加以限制,最终是会 Overfit 的。 18 | * early_stopping_rounds:用于控制在 Out Of Sample 的验证集上连续多少个迭代的分数都没有提高后就提前终止训练。用于防止 Overfitting。 19 | 20 | #### 一般的调参步骤是: 21 | 1. 将训练数据的一部分划出来作为验证集。 22 | 2. 先将 eta 设得比较高(比如 0.1),num_round 设为 300 ~ 500。 23 | 3. 用 Grid Search 对其他参数进行搜索 24 | 4. 
逐步将 eta 降低,找到最佳值。 25 | 5.以验证集为 watchlist,用找到的最佳参数组合重新在训练集上训练。注意观察算法的输出,看每次迭代后在验证集上分数的变化情况,从而得到最佳的 early_stopping_rounds。 26 | 27 | ``` 28 | X_dtrain, X_deval, y_dtrain, y_deval = cross_validation.train_test_split(X_train, y_train, random_state=1026, test_size=0.3) 29 | dtrain = xgb.DMatrix(X_dtrain, y_dtrain) 30 | deval = xgb.DMatrix(X_deval, y_deval) 31 | watchlist = [(deval, 'eval')] 32 | params = { 33 | 'booster': 'gbtree', 34 | 'objective': 'reg:linear', 35 | 'subsample': 0.8, 36 | 'colsample_bytree': 0.85, 37 | 'eta': 0.05, 38 | 'max_depth': 7, 39 | 'seed': 2016, 40 | 'silent': 0, 41 | 'eval_metric': 'rmse' 42 | } 43 | clf = xgb.train(params, dtrain, 500, watchlist, early_stopping_rounds=50) 44 | pred = clf.predict(xgb.DMatrix(df_test)) 45 | ``` 46 | 所有具有随机性的 Model 一般都会有一个 seed 或是 random_state 参数用于控制随机种子。得到一个好的 Model 后,在记录参数时务必也记录下这个值,从而能够在之后重现 Model。 47 | -------------------------------------------------------------------------------- /Base/tools/xgboost/xgboost_multi.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "[参数说明](http://xgboost.readthedocs.io/en/latest//parameter.html)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 4, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import xgboost as xgb\n", 17 | "import numpy as np" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 10, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "### make data\n", 27 | "x1 = [[0,0.1,0.2],\n", 28 | " [1.1,1.2,1.3],\n", 29 | " [2.1,2.2,2.3],\n", 30 | " [0.1,0.2,0.3],\n", 31 | " [1.4,1.2,1.3],\n", 32 | " [2.1,2.2,2.1],\n", 33 | " [0.1,0.2,0.2],\n", 34 | " [1.1,1.2,1.3],\n", 35 | " [2.1,2.2,2.3]]\n", 36 | "\n", 37 | "y1 = [0,1,2,0,1,2,0,1,2]\n", 38 | "\n", 39 | "x2 = [[0,0.1,0.2],\n", 40 | " [1.1,1.2,1.3],\n", 41 | " [2.0,2.2,2.3],\n", 42 | " [0.2,0.2,0.3]]" 43 | ] 44 | }, 
def IOU(box1, box2):
    """Return the intersection-over-union of two axis-aligned boxes.

    Args:
        box1: (xmin, ymin, xmax, ymax) of the first box.
        box2: (xmin, ymin, xmax, ymax) of the second box.

    Returns:
        inter_area / union_area as a float; 0.0 when the union has no area
        (e.g. both boxes are degenerate points or lines).
    """
    xmin1, ymin1, xmax1, ymax1 = box1
    xmin2, ymin2, xmax2, ymax2 = box2

    # Top-left corner of the intersection rectangle.
    xmin = max(xmin1, xmin2)
    ymin = max(ymin1, ymin2)
    # Bottom-right corner of the intersection rectangle.
    xmax = min(xmax1, xmax2)
    ymax = min(ymax1, ymax2)

    # Areas of the two input rectangles.
    s1 = (xmax1 - xmin1) * (ymax1 - ymin1)
    s2 = (xmax2 - xmin2) * (ymax2 - ymin2)

    # Clamp at 0 so disjoint boxes yield an empty intersection.
    inter_area = max(0, xmax - xmin) * max(0, ymax - ymin)

    union_area = s1 + s2 - inter_area
    if union_area <= 0:
        # Nothing to divide by: avoids ZeroDivisionError for zero-area boxes.
        return 0.0

    # float() keeps true division even for int coordinates on Python 2.
    return float(inter_area) / union_area
:param ann_path: ann_path 15 | :param classes_path: classes_path 16 | ''' 17 | self.xml = xml 18 | self.ann_path = ann_path 19 | self.classes_path=classes_path 20 | self.label=[] 21 | self.annotations=[] 22 | 23 | self.data_transfer() 24 | print(len(self.annotations)) 25 | self.write_file() 26 | 27 | 28 | def data_transfer(self): 29 | for num, xml_file in enumerate(self.xml): 30 | #print(xml_file) 31 | # 进度输出 32 | sys.stdout.write('\r>> Converting image %d/%d' % ( 33 | num + 1, len(self.xml))) 34 | sys.stdout.flush() 35 | 36 | with open(xml_file, 'r') as fp: 37 | #print(len(fp.readlines())) 38 | for p in fp: 39 | if '' in p: 40 | self.filen_ame = p.split('>')[1].split('<')[0] 41 | 42 | if '' in p: 43 | # 类别 44 | d = [next(fp).split('>')[1].split('<')[0] for _ in range(9)] 45 | d = d[:-1] 46 | self.supercategory = d[0] 47 | if self.supercategory not in self.label: 48 | self.label.append(self.supercategory) 49 | 50 | # 边界框 51 | x1 = int(d[-4]) 52 | y1 = int(d[-3]) 53 | x2 = int(d[-2]) 54 | y2 = int(d[-1]) 55 | 56 | self.annotations.append([os.path.join('JPEGImages',self.filen_ame),x1,y1,x2,y2,self.supercategory]) 57 | 58 | 59 | sys.stdout.write('\n') 60 | sys.stdout.flush() 61 | 62 | def write_file(self,): 63 | with open(self.ann_path, 'w', newline='') as fp: 64 | csv_writer = csv.writer(fp, dialect='excel') 65 | csv_writer.writerows(self.annotations) 66 | 67 | class_name=sorted(self.label) 68 | class_=[] 69 | for num,name in enumerate(class_name): 70 | class_.append([name,num]) 71 | with open(self.classes_path, 'w', newline='') as fp: 72 | csv_writer = csv.writer(fp, dialect='excel') 73 | csv_writer.writerows(class_) 74 | 75 | 76 | xml_file = glob.glob('./Annotations/*.xml') 77 | 78 | PascalVOC2CSV(xml_file) 79 | -------------------------------------------------------------------------------- /CV/codes/show_voc_box.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import numpy as np 4 | import 
xml.etree.ElementTree as xmlET 5 | from PIL import Image, ImageDraw 6 | 7 | classes = ('__background__', # always index 0 8 | 'Adidas', 'Nike', 'Puma') 9 | 10 | file_path_img = 'VOC2007/JPEGImages' 11 | file_path_xml = 'VOC2007/Annotations' 12 | save_file_path = 'VOC2007/Vis_boxes_VOC2007' 13 | 14 | if not os.path.exists(save_file_path): 15 | os.makedirs(save_file_path) 16 | 17 | pathDir = os.listdir(file_path_xml) 18 | for idx in range(len(pathDir)): 19 | filename = pathDir[idx] 20 | tree = xmlET.parse(os.path.join(file_path_xml, filename)) 21 | objs = tree.findall('object') 22 | num_objs = len(objs) 23 | boxes = np.zeros((num_objs, 5), dtype=np.uint16) 24 | 25 | for ix, obj in enumerate(objs): 26 | bbox = obj.find('bndbox') 27 | # Make pixel indexes 0-based 28 | x1 = float(bbox.find('xmin').text) - 1 29 | y1 = float(bbox.find('ymin').text) - 1 30 | x2 = float(bbox.find('xmax').text) - 1 31 | y2 = float(bbox.find('ymax').text) - 1 32 | 33 | cla = obj.find('name').text 34 | label = classes.index(cla) 35 | 36 | boxes[ix, 0:4] = [x1, y1, x2, y2] 37 | boxes[ix, 4] = label 38 | 39 | image_name = os.path.splitext(filename)[0] 40 | img = Image.open(os.path.join(file_path_img, image_name + '.jpg')) 41 | 42 | draw = ImageDraw.Draw(img) 43 | for ix in range(len(boxes)): 44 | xmin = int(boxes[ix, 0]) 45 | ymin = int(boxes[ix, 1]) 46 | xmax = int(boxes[ix, 2]) 47 | ymax = int(boxes[ix, 3]) 48 | draw.rectangle([xmin, ymin, xmax, ymax], outline=(255, 0, 0)) 49 | draw.text([xmin, ymin], classes[boxes[ix, 4]], (255, 0, 0)) 50 | 51 | img.save(os.path.join(save_file_path, image_name + '.jpg')) 52 | -------------------------------------------------------------------------------- /CV/codes/simple_mixup.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | """ 5 | 说明: 6 | 这里是前期看到公式后一个粗略的实现 7 | 参考了更多资料后其实有些地方还是有点问题 8 | 9 | 比如weight应该是一个beta分布而不是均匀分布,如 10 | weight = np.random.beta(alpha,alpha) 11 | 12 | 
def mixup_batch(x1, y1, x2, y2, alpha=0.4):
    """Blend two samples and their labels with a Beta-distributed weight.

    A single weight is drawn from Beta(alpha, alpha) and applied to both the
    inputs and the labels:
        new_x = weight * x1 + (1 - weight) * x2
        new_y = weight * y1 + (1 - weight) * y2

    :param x1: first training image (array-like)
    :param y1: one-hot label of x1 (array-like)
    :param x2: second training image, same shape as x1
    :param y2: one-hot label of x2, same shape as y1
    :param alpha: Beta distribution parameter, default 0.4
    :return: new_x, new_y
    """
    # Accept plain lists as well as ndarrays; ndarrays pass through unchanged.
    x1, x2 = np.asarray(x1), np.asarray(x2)
    y1, y2 = np.asarray(y1), np.asarray(y2)

    weight = np.random.beta(alpha, alpha)
    new_x = x1 * weight + x2 * (1 - weight)
    new_y = y1 * weight + y2 * (1 - weight)

    return new_x, new_y
8 | 9 | The VOC2007 Database 10 | PASCAL VOC2007 11 | flickr 12 | 13 | 14 | {} 15 | {} 16 | {} 17 | 18 | 0 19 | ''' 20 | xml_obj = ''' 21 | 22 | {} 23 | Unspecified 24 | 0 25 | 0 26 | 27 | {} 28 | {} 29 | {} 30 | {} 31 | 32 | 33 | ''' 34 | xml_end = ''' 35 | ''' 36 | 37 | #--data 38 | #----train 训练集图片 39 | #----train_txt 对应的txt标签 40 | #----train_xml 对应的xml标签 41 | 42 | root='./' 43 | 44 | 45 | labels = {0: 'person'} 46 | 47 | txt_Lists = glob.glob(root +'labels_abs'+ '/*.txt') 48 | print(len(txt_Lists)) 49 | # print(txt_Lists) 50 | cnt=0 51 | 52 | for txt_path in txt_Lists: 53 | filename=txt_path.split('\\') 54 | filename=filename[-1] 55 | filename=filename.split('.') 56 | filename=filename[0] 57 | 58 | txt = root+'labels_abs/'+filename+'.txt' 59 | # jpg=root+'train/'+filename+'.jpg' #jpg path 60 | xml=root+'labels_xml/'+filename+'.xml' 61 | 62 | print(txt) 63 | print(xml) 64 | 65 | obj = '' 66 | 67 | # img = cv2.imread(jpg) 68 | img_h, img_w = 1080, 1920 69 | 70 | print('h_factor:',img_h,' w_factor:',img_w) 71 | # cv2.imshow("img", img) #显示图片 72 | # cv2.waitKey(0) 73 | # cv2.destroyWindow("img") 74 | 75 | head = xml_head.format(str(filename), str(img_w), str(img_h), "3") 76 | 77 | with open(txt, 'r') as f: 78 | for line in f.readlines(): 79 | yolo_datas = line.strip().split(' ') 80 | label = int(float(yolo_datas[0].strip())) 81 | # center_x = round(float(str(yolo_datas[1]).strip()) * img_w) 82 | # center_y = round(float(str(yolo_datas[2]).strip()) * img_h) 83 | # bbox_width = round(float(str(yolo_datas[3]).strip()) * img_w) 84 | # bbox_height = round(float(str(yolo_datas[4]).strip()) * img_h) 85 | 86 | # xmin = str(int(center_x - bbox_width / 2)) 87 | # ymin = str(int(center_y - bbox_height / 2)) 88 | # xmax = str(int(center_x + bbox_width / 2)) 89 | # ymax = str(int(center_y + bbox_height / 2)) 90 | 91 | xmin = str(int(float(yolo_datas[2].strip()))) 92 | ymin = str(int(float(yolo_datas[3].strip()))) 93 | xmax = str(int(float(yolo_datas[4].strip()))) 94 | ymax = 
def getAllName(file_dir, tail_list=('.jpg',)):
    """Recursively collect the paths of files with the given extensions.

    Args:
        file_dir: directory to walk (sub-directories included).
        tail_list: collection of extensions to keep, each including the
            leading dot. Matching is exact and case-sensitive.

    Returns:
        List of paths (file_dir joined with the relative location) in
        os.walk order.
    """
    # The default is an immutable tuple so it can never be mutated and
    # shared between calls.
    return [
        os.path.join(root, filename)
        for root, _dirs, files in os.walk(file_dir)
        for filename in files
        if os.path.splitext(filename)[1] in tail_list
    ]
-------------------------------------------------------------------------------- /CV/nets/alexnet/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/nets/alexnet/.DS_Store -------------------------------------------------------------------------------- /CV/nets/alexnet/README.md: -------------------------------------------------------------------------------- 1 | # Alexnet 2 | > @Fire 2019.7.1 3 | 4 | * Intro: 2012年,Alex等人提出的AlexNet网络在ImageNet大赛上以远超第二名的成绩夺冠,卷积神经网络乃至深度学习重新引起了广泛的关注。 5 | 6 | * Year: 2012 7 | * Paper: [ImageNet Classification with Deep Convolutional Neural Networks](http://xueshu.baidu.com/usercenter/paper/show?paperid=bfdf67dfdf8cea0c47038f63e91b9df1&site=xueshu_se) 8 | * Code: [keras_alexnet](keras_alexnet.py) 9 | * Info: 224 * 224 * 3的输入,6的输出,参数量为7千万。 10 | 11 | ![net](./alexnet.jpg) 12 | 13 | * Note: 14 | 15 | 1. 数据增强:图像裁剪(crop),水平翻转;颜色、光照变换(使用PCA对每个像素点RGB分别加一个数)。 16 | 2. Dropout:以一定概率使神经元的输出为0,减少过拟合。 17 | 3. ReLU:方便计算,求导容易,使网络变得稀疏(类似L1正则),能够更快的学习。 18 | 4. Local Response Normalization:局部响应归一化,利用临近的数据做归一化。贡献了1.2%的Top-5正确率。 19 | 5. Overlapping Pooling:即Pooling的步长比Pooling Kernel小。贡献了0.3%的Top-5正确率。 20 | 6. 多GPU学习。 21 | 22 | -------------------------------------------------------------------------------- /CV/nets/alexnet/alexnet.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/nets/alexnet/alexnet.jpg -------------------------------------------------------------------------------- /CV/nets/alexnet/keras_alexnet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A Keras port of the original Caffe SSD300 network. 
def alexnet(image_size, n_classes):
    """Build an AlexNet-style Keras classification model.

    Args:
        image_size: (height, width, channels) of the input images.
        n_classes: number of units in the final softmax layer.

    Returns:
        A keras Model mapping the image input to the softmax output 'fc8'.
    """
    height, width, channels = image_size
    inputs = Input(shape=(height, width, channels))

    # Stage 1: large strided convolution, overlapping max-pool, batch norm.
    net = Conv2D(96, (11, 11), strides=(4, 4), activation='relu',
                 padding='valid', name='conv1')(inputs)
    net = MaxPooling2D(pool_size=(3, 3), strides=(2, 2),
                       padding='same', name='pool1')(net)
    net = BatchNormalization(axis=-1)(net)

    # Stage 2: 5x5 convolution, overlapping max-pool, batch norm.
    net = Conv2D(256, (5, 5), strides=(1, 1), activation='relu',
                 padding='same', name='conv2')(net)
    net = MaxPooling2D(pool_size=(3, 3), strides=(2, 2),
                       padding='same', name='pool2')(net)
    net = BatchNormalization(axis=-1)(net)

    # Stage 3: three stacked 3x3 convolutions followed by one max-pool.
    for filters, label in ((384, 'conv3'), (384, 'conv4'), (256, 'conv5')):
        net = Conv2D(filters, (3, 3), strides=(1, 1), activation='relu',
                     padding='same', name=label)(net)
    net = MaxPooling2D(pool_size=(3, 3), strides=(2, 2),
                       padding='same', name='pool3')(net)

    # Classifier head: two 4096-unit dense layers with dropout, then softmax.
    net = Flatten()(net)
    for label in ('fc6', 'fc7'):
        net = Dense(4096, activation='relu', trainable=True, name=label)(net)
        net = Dropout(0.5)(net)
    outputs = Dense(n_classes, activation='softmax', name='fc8')(net)

    return Model(inputs=inputs, outputs=outputs)
def lenet5(image_size, n_classes):
    """Build a LeNet-5-style Keras classification model.

    Args:
        image_size: (height, width, channels) of the input images.
        n_classes: number of units in the final softmax layer.

    Returns:
        A keras Model mapping the image input to the softmax output layer.
    """
    height, width, channels = image_size
    inputs = Input(shape=(height, width, channels))

    # Two conv + 2x2 max-pool stages with 6 and 16 filters respectively.
    net = inputs
    for filters, conv_name, pool_name in ((6, 'conv1', 'pool1'),
                                          (16, 'conv2', 'pool2')):
        net = Conv2D(filters, (5, 5), strides=(1, 1), activation='relu',
                     padding='valid', name=conv_name)(net)
        net = MaxPooling2D(pool_size=(2, 2), strides=(2, 2),
                           padding='same', name=pool_name)(net)

    # Dense head; the layer names ('fc6'..'fc8') are kept exactly as they
    # were so that anything looking layers up by name still finds them.
    net = Flatten()(net)
    for units, dense_name in ((120, 'fc6'), (84, 'fc7')):
        net = Dense(units, activation='relu', trainable=True,
                    name=dense_name)(net)
    outputs = Dense(n_classes, activation='softmax', name='fc8')(net)

    return Model(inputs=inputs, outputs=outputs)
/CV/nets/lenet5/lenet5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/nets/lenet5/lenet5.jpg -------------------------------------------------------------------------------- /CV/nets/vgg/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/nets/vgg/.DS_Store -------------------------------------------------------------------------------- /CV/nets/vgg/README.md: -------------------------------------------------------------------------------- 1 | # VGGNet 2 | > @Fire 2019.7.2 3 | 4 | * Intro: 2014年新的一届ILSVRC大赛中Googlenet与VGG的身影分外亮眼。Googlenet相对VGG而言在网络结构上有了更新的突破,不过其复杂度也大大增加了。VGG相对Googlenet虽然精度略逊些,但其整体网络框架还是延续了Alexnet及更早的Lenet等的一贯思路,此外还更深入的探讨了ConvNet深度对模型性能可能的影响。 5 | 6 | * Year: 2014 7 | * Paper: [Very Deep Convolutional Networks for Large-Scale Image Recognition](http://xueshu.baidu.com/usercenter/paper/show?paperid=2801f41808e377a1897a3887b6758c59&site=xueshu_se) 8 | * Code: [keras_vgg](keras_vgg.py) 9 | * Info: 224 * 224 * 3的输入,10的输出,参数量为1.3亿。 10 | 11 | ![net](./vgg.jpg) 12 | 13 | ![net](./vgg16.jpg) 14 | 15 | * Note: 16 | 17 | 1. 
相比于AlexNet最大的改进是用小size的Filter代替大size的Filter。两个3 * 3的卷积核代替5 * 5的卷积核,三个3 * 3代替7 * 7。多个小尺度的卷积核比大尺度的卷积核非线性更强,同时参数减少,不容易过拟合。 18 | 19 | -------------------------------------------------------------------------------- /CV/nets/vgg/vgg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/nets/vgg/vgg.jpg -------------------------------------------------------------------------------- /CV/nets/vgg/vgg16.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/nets/vgg/vgg16.jpg -------------------------------------------------------------------------------- /CV/note/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/note/.DS_Store -------------------------------------------------------------------------------- /CV/note/DCNN_book_note.md: -------------------------------------------------------------------------------- 1 | # 《深度卷积网络:原理与实践》笔记 2 | > Fire 2019.01.12 3 | 4 | ### 前言 5 | 1. 本书选择MXNet框架的原因:训练速度快、占用资源少、使用方便、架构清晰、易于二次开发; 6 | 7 | ### 第1章 走进深度学习的世界 8 | 2. 预测学习(Predictive Learning):输入1张图像,预测图像后续的发展; 9 | 10 | ### 第2章 深度卷机网络:第一课 11 | 3. 为何深度神经网络拥有如此强大的威力?这仍然是学术界的研究课题。目前大致的认知是,深度神经网络的逐层结构可以实现对于概念的不断抽象,这恰好与世界的运行规律吻合。 12 | 4. 经验法则:如果一项工作中所需要思考和决策的问题,人能在5秒内解决,它就很有可能被目前的深度神经网络实现。 13 | 5. 2011年,谷歌用传统浅层神经网络用了16000台机器,计算3天,才构建出一个足以识别猫的网络;在2012年,著名的深度神经网络AlexNet面世,1台机器就可以完成这个任务; 14 | 6. 图像分割(segmentation),能进一步将图像自动划分为各个物体,并标记处每个物体的具体区域。目前主流是Mask R-CNN网络,还可进一步实现包括人体姿态识别的图像分割; 15 | 7. Tensorflow游乐场:playground.tensorflow.org 16 | 8. MNIST识别经典模型是1998年的LeNet-5网络,是DCNN的雏形,在正常情况下可达到99.05%的识别率。可访问scs.ryerson.ca/~aharley/vis/conv/,看到该网络每一层输出的图像; 17 | 9. 
策略网络实例(围棋):withablink.coding.me/goPolicyNet/ 18 | 19 | 20 | ### 第3章 深度卷积网络:第二课 21 | 10. Excel实现神经网络:2.3.5-P297 22 | 11. 如果发现网络的训练性能很差,值得做的事情就是观察网络内部梯度的流动情况;改善的技巧:BN、残差网络(ResNet),梯度截断,梯度惩罚; 23 | 12. 从几何观点理解神经网络:colah.github.io/posts/2014-03-NN-Manifolds-Topology/; 24 | 13. 根据拓扑学定理,所有n维流形都可以在2n+2维空间中划分开。神经网络在隐层使用大量神经元,就是在做升维,以便划分样本,这称为disentangling,即将纠缠在一起的特征或概念分开; 25 | 14. 很多时候我们还是会根据测试集调参,因此,很多研究中已不使用验证集; 26 | 15. 半监督学习(semi-supervised learning),即数据中只有部分样本带有标签,然后希望给所有样本和未来的样本找到标签;一种有趣的方法是,先人工标记少量标签,然后从少量标签训练网络,然后让网络预测所有样本的标签,再人工筛选和修改其中的标签,重复这个过程。由于网络的预测会越来越准,因此可节省许多人工标注的时间; 27 | 16. 根据近年的研究,比如《The Loss Surfaces of Multilayer Networks》,对于大规模的神经网络,这实际影响不大。如果神经网络的规模够大够深,使用足够多的神经元,往往最后会得到相当靠近全局最优值的解; 28 | 17. 根据经验,如果数据集很复杂,那么普通的SGD虽然速度更慢,但有可能会得到更好的准确率。 29 | 18. L2和L1正则化的基本思想,最简单的网络没有连接,因此希望网络的连接越少越好,如果连接的权重为0就相当于没有连接,因此希望网络中连接的权重越小越好; 30 | 19. 两种类似dropout思想的正则化方法:随机深度和Shake-Shake正则化; 31 | 20. 多分类设置目标类别为100%不好,因为softmax的特性导致网络权重越来越大(需要输入无穷大才能输出100%),不利于网络稳定性,因此可尝试设置为95%; 32 | 21. 进一步的预处理包括白化,常用方法包括PCA和ZCA白化。对于图像还可以进行直方图均衡; 33 | 22. batch大小经验:对于常见问题,最优的往往在16-256之间,太小训练过慢,太大则性能不佳;facebook2017年论文《Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour》指出,设置合理的学习率,在批大小很大(比如8192)的时候也能取得较好的性能; 34 | 23. 可通过CPU-Z和GPU-Z软件,观察CPU和GPU是否在满负荷工作; 35 | 24. 2017年提出Fashion-MNIST,比MNIST难度更大更有代表性; 36 | 37 | 38 | ### 第4章 深度卷积网络:第三课 39 | 25. 卷积操作后,图像中的值往往会有正有负,正表示与特征匹配,负表示相反。如果再进行ReLU操作,就会只留下正值,因此ReLU很适合CNN; 40 | 26. 使用奇数卷积核的好处,可以通过设置合适的padding使得图像在卷积后大小不变; 41 | 27. 转置卷积4.3.5; 42 | 43 | ### 第5章 深度卷积网络:第四课 44 | 28. 5.1.1 AlexNet的特点总结; 45 | 29. 5.1.2 VGG的特点; 46 | 30. 5.1.3 DarkNet的特点; 47 | 31. 2017年9月发布的SmoothGrad技术,找到图像特征关键区域; 48 | 32. 1*1卷积核的应用场景 5.4.2; 49 | 33. batch normalization 5.4.3; 50 | 34. 残差网络:ResNet的思想 5.5.1 、 5.5.2 残差网络架构细节; 51 | 35. 5.6.1 残差网络进展:ResNet、Pyramid Net、 DenseNet; 52 | 36. 压缩网络:SqueezeNet、MobileNet、ShuffleNet(可在AlexNet的二十分之一的运算量下实现相近性能); 53 | 37. 5.6.3 卷积核的变形:扩张卷积(dilated convolution)、可变形卷积(deformable convolution); 54 | 38.
5.7.1 yolo v1;5.7.3 Faster R-CNN;5.7.4 Mask-RCNN 55 | 39. 5.8 图像风格迁移; 56 | 57 | ### 第6章 AlphaGo架构综述 58 | * 6.1.1 v13和v18; 59 | * 6.2 对弈过程; 60 | * 6.2.3 蒙特卡洛树搜索与估值问题; 61 | 62 | ### 第7章 训练策略网络与实战 63 | * zero.sjeng.org 64 | 65 | ### 第8章 生成式对抗网络:GAN 66 | * github.com/hindupuravinash/the-gan-zoo 列举了上百种不同的GAN设计; 67 | * 8.5.1 自编码器:从AE到VAE; 68 | * 8.5.2 逐点生成:PixelRNN和PixelCNN系列; 69 | 70 | ### 第9章 通向智能之秘 71 | * 9.3.3 目前研究人员认为,NLP中的问题,从易到难,顺序如下: 72 | 1. 文本搜索 73 | 2. 文本分类、情感分析 74 | 3. 翻译(到这里为止AI已经接近人类) 75 | 4. 文本摘要 76 | 5. 垂直领域问答 77 | 6. 泛领域问答 78 | * 9.4 深度学习理论发展、前沿研究 79 | * 9.4.2 超越神经网络:Capsule与gcForest 80 | * 9.4.3 深度学习为什么泛化能力好 81 | * 研究人员发现,网络越大,越难通过训练到达全局极小值点,但是同时发现,网络越大,局部极小值点和全局极小值点的差距会越小;因此,网络越大,训练的过程会越简单,越稳定,因为到达任意一个局部极小值点就足够了; -------------------------------------------------------------------------------- /CV/note/chineseocr-ctpn-densenet.md: -------------------------------------------------------------------------------- 1 | ## [chinese-ocr模型](https://github.com/YCG09/chinese_ocr)说明文档 2 | > Fire 2018.10.18 3 | 4 | ### 基本架构 5 | * 文本检测:CTPN (输出包含文字的图片框) 6 | * 文本识别:DenseNet + CTC (输出识别字符) 7 | 8 | ### CTPN 9 | ![cptn](http://5b0988e595225.cdn.sohucs.com/images/20171130/5466184cc9504f62adcf602a899aca83.jpeg) 10 | 11 | 12 | ![](./vgg1.jpg) ![](./vgg2.jpg) ![](./vgg3.jpg) ![](./vgg4.jpg) 13 | 14 | 1. 输入图片 (1 * 3 * 224 * 224) 15 | 2. vgg提取空间特征,conv5输出:N * C * H * W (1 * 512 * 14 * 14) 16 | 3. rpn_conv层:每个点取周围3 * 3区域做滑窗,输出:N * 9C * H * W (1 * 4608 * 14 * 14) 17 | 4. 双向LSTM层提取序列特征(正反各128):对3中的每一行当作一个数据输入,然后reshape成 N * 256 * H * W(1 * 256 * 14 * 14) 18 | 5. FC层:N * 512 * H * W(1 * 512 * 14 * 14) 19 | 6. RPN网络: 20 | * 回归:1 * 20 * 14 * 14 (20 = 2 * 10;x,y偏移) 21 | * 分类:1 * 20 * 14 * 14 (20 = 2 * 10;前景或者背景) 22 | 7. NMS非极大值抑制 23 | 24 | ![](./nms.jpg) 25 | 26 | ### DenseNet+CTC 27 | 28 | 1.
densenet: 29 | 30 | ![](./densenet.jpg) 31 | 32 | * 每个denseblock中都连接每层的残差 33 | * block基本结构: BN+Relu+Conv+dropout 34 | * flatten成一维 35 | * 输出层 dense(5000) 36 | 37 | ![](./denseblock.jpg) 38 | 39 | 2. CTC-loss:损失函数可以解释为:给定样本后输出正确label的概率的乘积,最优化负对数。 40 | 41 | ![](./ctc.jpg) 42 | 43 | e.g. [1 * 8] -> -AA--B-C -> ABC 44 | -------------------------------------------------------------------------------- /CV/note/cptn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/note/cptn.jpg -------------------------------------------------------------------------------- /CV/note/ctc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/note/ctc.jpg -------------------------------------------------------------------------------- /CV/note/denseblock.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/note/denseblock.jpg -------------------------------------------------------------------------------- /CV/note/densenet.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/note/densenet.jpg -------------------------------------------------------------------------------- /CV/note/nms.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/note/nms.jpg -------------------------------------------------------------------------------- /CV/note/vgg1.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/note/vgg1.jpg -------------------------------------------------------------------------------- /CV/note/vgg2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/note/vgg2.jpg -------------------------------------------------------------------------------- /CV/note/vgg3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/note/vgg3.jpg -------------------------------------------------------------------------------- /CV/note/vgg4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/CV/note/vgg4.jpg -------------------------------------------------------------------------------- /DIY/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/DIY/.DS_Store -------------------------------------------------------------------------------- /DIY/EM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "######### 1.DATA ##########\n", 12 | "#例子:统计学习方法三硬币 A,B,C 正面出现概率分别为 a,p,q。\n", 13 | "#先扔A,正面选B,反面C。再扔选的硬币,正面为1,反面为0\n", 14 | "#已知观测结果,求a,p,b。\n", 15 | "import numpy as np\n", 16 | "\n", 17 | "y = [1,1,0,1,0,0,1,0,1,1]\n", 18 | "y = np.array(y)" 19 | ] 20 | }, 21 | { 22 
| "cell_type": "code", 23 | "execution_count": 15, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "######### 2.EM ###########\n", 30 | "def stepE(valuesOld,y):\n", 31 | " a = valuesOld[0]\n", 32 | " p = valuesOld[1]\n", 33 | " q = valuesOld[2]\n", 34 | " miu = (a*p**y*(1-p)**(1-y)) / ((a*p**y*(1-p)**(1-y)) + (1-a)*q**y*(1-q)**(1-y))\n", 35 | " return miu\n", 36 | " \n", 37 | "def setpM(miu,y):\n", 38 | " a = np.mean(miu)\n", 39 | " p = np.sum(miu*y) / np.sum(miu)\n", 40 | " q = np.sum((1-miu)*y) / np.sum(1-miu)\n", 41 | " valuesNew = np.array([a,p,q])\n", 42 | " return valuesNew\n", 43 | " \n", 44 | "def EM(init_values,y,tol = 0.0001,iterations = 1000):\n", 45 | " valuesOld = np.array(init_values)\n", 46 | " for i in range(iterations):\n", 47 | " miu = stepE(valuesOld,y)\n", 48 | " valuesNew = setpM(miu,y)\n", 49 | " if np.sum(valuesNew-valuesOld) < tol:\n", 50 | " break\n", 51 | " else:\n", 52 | " valuesOld = valuesNew\n", 53 | " return valuesNew" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 17, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "[ 0.5 0.6 0.6]\n", 66 | "[ 0.40641711 0.53684211 0.64324324]\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "######### 3.test #########\n", 72 | "\n", 73 | "init_values1 = [0.5,0.5,0.5]\n", 74 | "output1 = EM(init_values1,y)\n", 75 | "print(output1)\n", 76 | "init_values2 = [0.4,0.6,0.7]\n", 77 | "output2 = EM(init_values2,y)\n", 78 | "print(output2)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": 
import numpy as np

# A predicted box and a label box count as a match when their IoU exceeds this.
_IOU_threshold = 0.6


def IOU(Reframe, GTframe):
    """Return the intersection-over-union of two axis-aligned rectangles.

    Each rectangle is given by its diagonal corners as ``(x1, y1, x2, y2)``;
    the width and height are taken as ``x2 - x1`` and ``y2 - y1``.

    Args:
        Reframe: First box, indexable as ``[x1, y1, x2, y2]``.
        GTframe: Second box, same layout.

    Returns:
        The overlap ratio in ``[0, 1]``; ``0.0`` when the boxes do not overlap
        (boxes that only touch along an edge or corner count as disjoint).
    """
    # Intersection extent: right-most left edge to left-most right edge
    # (and likewise vertically). Non-positive means no overlap.
    inter_w = min(Reframe[2], GTframe[2]) - max(Reframe[0], GTframe[0])
    inter_h = min(Reframe[3], GTframe[3]) - max(Reframe[1], GTframe[1])
    if inter_w <= 0 or inter_h <= 0:
        return 0.0

    inter_area = inter_w * inter_h
    area1 = (Reframe[2] - Reframe[0]) * (Reframe[3] - Reframe[1])
    area2 = (GTframe[2] - GTframe[0]) * (GTframe[3] - GTframe[1])
    return inter_area * 1.0 / (area1 + area2 - inter_area)


def _count_matched(boxes, candidates):
    """Count the boxes whose IoU with at least one candidate exceeds the threshold."""
    return sum(
        1
        for box in boxes
        if any(IOU(box, other) > _IOU_threshold for other in candidates)
    )


def computeLoss(pre_box_list, label_box_list, R_weight=1):
    """Compute detection precision, recall and a recall-weighted F score.

    A predicted box is a true positive when it overlaps some label box with
    IoU above ``_IOU_threshold``; a label box is recalled when some predicted
    box overlaps it the same way.

    Args:
        pre_box_list: Sequence of predicted boxes ``[x1, y1, x2, y2]``.
        label_box_list: Sequence of ground-truth boxes, same layout.
        R_weight: Multiplier applied to recall inside the F score.

    Returns:
        ``(P, R, F)`` where ``F = 2*P*R*R_weight / (P + R*R_weight)``.
        Any ratio whose denominator would be zero (no predictions, no labels,
        or no matches at all) is reported as ``0.0`` instead of raising.
    """
    pre_boxes = np.array(pre_box_list)
    label_boxes = np.array(label_box_list)
    total_pre = len(pre_boxes)
    total_label = len(label_boxes)

    # Precision: fraction of predictions that hit some label.
    P = float(_count_matched(pre_boxes, label_boxes)) / total_pre if total_pre else 0.0
    # Recall: fraction of labels that are hit by some prediction.
    R = float(_count_matched(label_boxes, pre_boxes)) / total_label if total_label else 0.0

    # F score; guard the 0/0 case that occurs when nothing matched.
    denominator = P + R * R_weight
    F = 2 * P * R * R_weight / denominator if denominator else 0.0

    return P, R, F


if __name__ == '__main__':

    # 1. IoU sanity checks: (pair of boxes, expected IoU).
    tests_iou = [
        [[[10, 40, 30, 80], [10, 40, 30, 80]], 1],
        [[[10, 40, 30, 80], [30, 80, 60, 120]], 0],
    ]
    for boxes, expected in tests_iou:
        print(IOU(boxes[0], boxes[1]), expected)

    # 2. Precision / recall / F on a small example.
    pre_box_list = [[10, 40, 30, 80], [30, 80, 60, 120], [40, 100, 80, 140], [42, 100, 80, 140], [44, 100, 80, 140]]
    label_box_list = [[10, 40, 30, 80], [30, 80, 60, 120]]
    print(computeLoss(pre_box_list, label_box_list))
y[test_idx] 28 | clf.fit(X_train, y_train) 29 | y_pred = clf.predict(X_holdout)[:] 30 | S_train[test_idx, i] = y_pred 31 | S_test_i[:, j] = clf.predict(T)[:] #整个T的预测 32 | S_test[:, i] = S_test_i.mean(1) #按行求平均值 即axis=1. 矩阵变成一列后加入S_test中 33 | 34 | self.stacker.fit(S_train, y) 35 | y_pred = self.stacker.predict(S_test)[:] 36 | return y_pred 37 | 38 | ''' 39 | 据说获奖选手往往会使用比这复杂得多的 Ensemble,会出现三层、四层甚至五层,不同的层数之间有各种交互, 40 | 还有将经过不同的 Preprocessing 和不同的 Feature Engineering 的数据用 Ensemble 组合起来的做法。 41 | ''' 42 | -------------------------------------------------------------------------------- /DIY/ex1data1.txt: -------------------------------------------------------------------------------- 1 | 6.1101,17.592 2 | 5.5277,9.1302 3 | 8.5186,13.662 4 | 7.0032,11.854 5 | 5.8598,6.8233 6 | 8.3829,11.886 7 | 7.4764,4.3483 8 | 8.5781,12 9 | 6.4862,6.5987 10 | 5.0546,3.8166 11 | 5.7107,3.2522 12 | 14.164,15.505 13 | 5.734,3.1551 14 | 8.4084,7.2258 15 | 5.6407,0.71618 16 | 5.3794,3.5129 17 | 6.3654,5.3048 18 | 5.1301,0.56077 19 | 6.4296,3.6518 20 | 7.0708,5.3893 21 | 6.1891,3.1386 22 | 20.27,21.767 23 | 5.4901,4.263 24 | 6.3261,5.1875 25 | 5.5649,3.0825 26 | 18.945,22.638 27 | 12.828,13.501 28 | 10.957,7.0467 29 | 13.176,14.692 30 | 22.203,24.147 31 | 5.2524,-1.22 32 | 6.5894,5.9966 33 | 9.2482,12.134 34 | 5.8918,1.8495 35 | 8.2111,6.5426 36 | 7.9334,4.5623 37 | 8.0959,4.1164 38 | 5.6063,3.3928 39 | 12.836,10.117 40 | 6.3534,5.4974 41 | 5.4069,0.55657 42 | 6.8825,3.9115 43 | 11.708,5.3854 44 | 5.7737,2.4406 45 | 7.8247,6.7318 46 | 7.0931,1.0463 47 | 5.0702,5.1337 48 | 5.8014,1.844 49 | 11.7,8.0043 50 | 5.5416,1.0179 51 | 7.5402,6.7504 52 | 5.3077,1.8396 53 | 7.4239,4.2885 54 | 7.6031,4.9981 55 | 6.3328,1.4233 56 | 6.3589,-1.4211 57 | 6.2742,2.4756 58 | 5.6397,4.6042 59 | 9.3102,3.9624 60 | 9.4536,5.4141 61 | 8.8254,5.1694 62 | 5.1793,-0.74279 63 | 21.279,17.929 64 | 14.908,12.054 65 | 18.959,17.054 66 | 7.2182,4.8852 67 | 8.2951,5.7442 68 | 10.236,7.7754 69 | 5.4994,1.0173 70 | 
20.341,20.992 71 | 10.136,6.6799 72 | 7.3345,4.0259 73 | 6.0062,1.2784 74 | 7.2259,3.3411 75 | 5.0269,-2.6807 76 | 6.5479,0.29678 77 | 7.5386,3.8845 78 | 5.0365,5.7014 79 | 10.274,6.7526 80 | 5.1077,2.0576 81 | 5.7292,0.47953 82 | 5.1884,0.20421 83 | 6.3557,0.67861 84 | 9.7687,7.5435 85 | 6.5159,5.3436 86 | 8.5172,4.2415 87 | 9.1802,6.7981 88 | 6.002,0.92695 89 | 5.5204,0.152 90 | 5.0594,2.8214 91 | 5.7077,1.8451 92 | 7.6366,4.2959 93 | 5.8707,7.2029 94 | 5.3054,1.9869 95 | 8.2934,0.14454 96 | 13.394,9.0551 97 | 5.4369,0.61705 98 | -------------------------------------------------------------------------------- /DIY/lenses.txt: -------------------------------------------------------------------------------- 1 | young myope no reduced no lenses 2 | young myope no normal soft 3 | young myope yes reduced no lenses 4 | young myope yes normal hard 5 | young hyper no reduced no lenses 6 | young hyper no normal soft 7 | young hyper yes reduced no lenses 8 | young hyper yes normal hard 9 | pre myope no reduced no lenses 10 | pre myope no normal soft 11 | pre myope yes reduced no lenses 12 | pre myope yes normal hard 13 | pre hyper no reduced no lenses 14 | pre hyper no normal soft 15 | pre hyper yes reduced no lenses 16 | pre hyper yes normal no lenses 17 | presbyopic myope no reduced no lenses 18 | presbyopic myope no normal no lenses 19 | presbyopic myope yes reduced no lenses 20 | presbyopic myope yes normal hard 21 | presbyopic hyper no reduced no lenses 22 | presbyopic hyper no normal soft 23 | presbyopic hyper yes reduced no lenses 24 | presbyopic hyper yes normal no lenses 25 | -------------------------------------------------------------------------------- /DM/note/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/DM/note/.DS_Store -------------------------------------------------------------------------------- 
/DM/note/img/fe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/DM/note/img/fe.jpg -------------------------------------------------------------------------------- /NLP/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/NLP/.DS_Store -------------------------------------------------------------------------------- /NLP/codes/re.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "emoticons_str = r\"\"\"\n", 11 | " (?:\n", 12 | " [:=;] # 眼睛\n", 13 | " [oO\\-]? # 鼻子\n", 14 | " [D\\)\\]\\(\\]/\\\\OpP] # 嘴\n", 15 | " )\"\"\"\n", 16 | "regex_str = [\n", 17 | " emoticons_str,\n", 18 | " r'<[^>]+>', # HTML tags\n", 19 | " r'(?:@[\\w_]+)', # @某人\n", 20 | " r\"(?:\\#+[\\w_]+[\\w\\'_\\-]*[\\w_]+)\", # 话题标签\n", 21 | " r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-f][0-9a-f]))+',\n", 22 | " # URLs\n", 23 | " r'(?:(?:\\d+,?)+(?:\\.?\\d+)?)', # 数字\n", 24 | " r\"(?:[a-z][a-z'\\-_]+[a-z])\", # 含有 - 和 ‘ 的单词\n", 25 | " r'(?:[\\w_]+)', # 其他\n", 26 | " r'(?:\\S)' # 其他\n", 27 | " ]" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 13, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "['RT', '@angelababy', ':', 'love', 'you', 'baby', '!', ':D', 'http://ah.love', '#168cm']\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)\n", 45 | "emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)\n", 46 | "\n", 47 | "def 
tokenize(s):\n", 48 | " return tokens_re.findall(s)\n", 49 | "def preprocess(s, lowercase=False):\n", 50 | " tokens = tokenize(s)\n", 51 | " if lowercase:\n", 52 | " tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]\n", 53 | " return tokens\n", 54 | "tweet = 'RT @angelababy: love you baby! :D http://ah.love #168cm'\n", 55 | "print(preprocess(tweet))\n", 56 | "# ['RT', '@angelababy', ':', 'love', 'you', 'baby',\n", 57 | "# ’!', ':D', 'http://ah.love', '#168cm']" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "Python 3", 73 | "language": "python", 74 | "name": "python3" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | "name": "ipython", 79 | "version": 3 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython3", 86 | "version": "3.5.2rc1" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 2 91 | } 92 | -------------------------------------------------------------------------------- /NLP/knowledge.md: -------------------------------------------------------------------------------- 1 | 2 | ## NLP知识点 3 | 4 | 5 | 6 | ### 1.word2vec 7 | * CBOW: 训练输入是某一个特征词的上下文相关的词对应的词向量,而输出就是这特定的一个词的词向量。由于CBOW使用的是词袋模型,因此这8个词都是平等的,也就是不考虑他们和我们关注的词之间的距离大小,只要在我们上下文之内即可。 8 | * Skip-Gram: 输入是特定的一个词的词向量,而输出是特定词对应的上下文词向量。 9 | 10 | ### 2.分词 11 | 中文分词的基本方法可以分为基于语法规则的方法、基于词典的方法和基于统计的方法。 12 | 基于语法规则的分词法基本思想是在分词的同时进行句法、语义分析, 利用句法信息和语义信息来进行词性标注, 以解决分词歧义现象。因为现有的语法知识、句法规则十分笼统、复杂, 基于语法和规则的分词法所能达到的精确度远远还不能令人满意, 目前这种分词系统应用较少。 13 | 在基于词典的方法中,可以进一步分为最大匹配法,最大概率法,最短路径法等。最大匹配法指的是按照一定顺序选取字符串中的若干个字当做一个词,去词典中查找。根据扫描方式可细分为:正向最大匹配,反向最大匹配,双向最大匹配,最小切分。最大概率法指的是一个待切分的汉字串可能包含多种分词结果,将其中概率最大的那个作为该字串的分词结果。最短路径法指的是在词图上选择一条词数最少的路径。 14 | 
import pynlpir

# Demo: segment a sentence with part-of-speech tags, then extract weighted
# keywords, using the NLPIR bindings.
pynlpir.open()
try:
    s = '今天天气真是好呀'
    segments = pynlpir.segment(s)
    # Part-of-speech labels that could be filtered out as stop categories;
    # the filter itself is currently disabled, so every token is printed.
    stop_cx = ['modal particle', 'punctuation mark', 'noun of locality', 'particle', 'numeral']
    for segment in segments:
        print(segment[0], '\t', segment[1])
    print('---')
    key_words = pynlpir.get_key_words(s, weighted=True)
    for key_word in key_words:
        print(key_word[0], '\t', key_word[1])
finally:
    # Always release the NLPIR session, even if segmentation fails.
    pynlpir.close()

# Sample output:
# 今天 	 time word
# 天气 	 noun
# 真是 	 adverb
# 好 	 adjective
# 呀 	 modal particle
# ---
# 今天 	 2.2
# 天气 	 2.0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert a Wikipedia XML dump into plain text, one article per line.

Usage:
    python %(program)s <wiki-dump.xml.bz2> <output.txt>

Example:
    python process_wiki_data.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text
"""
import logging
import os.path
import sys

from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # Check and process input arguments; the usage text is the module
    # docstring, which must exist for the %-formatting below to work.
    if len(sys.argv) < 3:
        print(__doc__ % {'program': program})
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    # NOTE(review): the `lemmatize` argument is believed to have been removed
    # in gensim 4.x - confirm against the gensim version this repo pins.
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})

    article_count = 0
    # The context manager guarantees the output is flushed and closed even
    # if parsing the dump fails part-way through.
    with open(outp, 'w', encoding='utf-8') as output:
        for text in wiki.get_texts():
            output.write(" ".join(text) + "\n")
            article_count += 1
            if article_count % 10000 == 0:
                logger.info("Saved " + str(article_count) + " articles")
    logger.info("Finished Saved " + str(article_count) + " articles")
[加载腾讯开源的中文word2vec词向量语料库](./load_w2v_ch.py) 8 | -------------------------------------------------------------------------------- /NLP/tools/gensim/test_word2vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "[('足球运动', 0.6127325296401978),\n", 12 | " ('排球', 0.5376268625259399),\n", 13 | " ('冰球', 0.5342495441436768),\n", 14 | " ('板球', 0.5301790833473206),\n", 15 | " ('手球', 0.5166541337966919),\n", 16 | " ('籃球', 0.5052165389060974),\n", 17 | " ('英超球', 0.499561607837677),\n", 18 | " ('女足', 0.4948025941848755),\n", 19 | " ('足球联赛', 0.491238534450531),\n", 20 | " ('美式足球', 0.49103665351867676)]" 21 | ] 22 | }, 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 28 | "source": [ 29 | "import gensim\n", 30 | "model = gensim.models.Word2Vec.load(r\"F:\\data\\wiki.zh.text.model\")\n", 31 | "model.most_similar(u\"足球\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 4, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "[('女人', 0.7703686952590942),\n", 43 | " ('傻瓜', 0.5394862294197083),\n", 44 | " ('家伙', 0.5176622271537781),\n", 45 | " ('女孩', 0.5025584101676941),\n", 46 | " ('撒嬌', 0.4929904341697693),\n", 47 | " ('小伙子', 0.4917035698890686),\n", 48 | " ('女明星', 0.4843180179595947),\n", 49 | " ('爸爸', 0.4842095673084259),\n", 50 | " ('女孩子', 0.48044753074645996),\n", 51 | " ('老公', 0.4802494943141937)]" 52 | ] 53 | }, 54 | "execution_count": 4, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "model.most_similar(u\"男人\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 5, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "[('鞋子', 0.7696906924247742),\n", 72 | " ('衣物', 
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Train a word2vec model from a pre-tokenised text corpus (one sentence per line).

Usage:
    python %(program)s <corpus.txt> <model-output> <vectors-output>

The model is saved in gensim's native format to <model-output> and the word
vectors are saved in the textual word2vec format to <vectors-output>.
"""
import logging
import os.path
import sys
import multiprocessing

from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # Check and process input arguments. print() is called as a function so
    # the script parses on Python 3 like the sibling scripts; the usage text
    # is the module docstring.
    if len(sys.argv) < 4:
        print(__doc__ % {'program': program})
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())

    # Optional: trim unneeded model memory (uses much less RAM).
    # model.init_sims(replace=True)
    model.save(outp1)

    # Newer gensim releases expose the word2vec-format export on the
    # KeyedVectors object (`model.wv`) and deprecate it on the model; fall
    # back to the model itself on releases that predate `wv`.
    getattr(model, 'wv', model).save_word2vec_format(outp2, binary=False)
def reform(sentence):
    """Wrap *sentence* in sentence-boundary markers.

    A single trailing full stop (u"。") is removed, every remaining clause
    delimiter (、 , : 。) is replaced by "EEEBBB" (an end marker followed
    by a begin marker), and the whole text is wrapped as "BBB" ... "EEE".

    Args:
        sentence: the text to mark up (a unicode string).

    Returns:
        The marked-up string. An empty input yields "BBBEEE".
    """
    # Guard the index so an empty string no longer raises IndexError.
    if sentence and sentence[-1] == u"。":
        sentence = sentence[:-1]

    delimiters = (u'、', u',', u':', u'。')
    # Rebuild the text character by character instead of splicing in place.
    # The previous loop iterated over the ORIGINAL length while the string
    # grew by five characters per replacement, so delimiters near the end of
    # the sentence were never reached and stayed unreplaced.
    pieces = ["EEEBBB" if ch in delimiters else ch for ch in sentence]
    return "BBB" + "".join(pieces) + "EEE"
#Compare two token lists with a bigram (2-gram) model
def compareList(ori_list,test_list):
    """Count how often each bigram of *test_list* occurs in *ori_list*.

    Args:
        ori_list: corpus tokens, in order.
        test_list: tokens of the sentence being scored, in order.

    Returns:
        A list with one entry per token of ``test_list``. Entry ``i`` is the
        number of positions ``j`` where ``ori_list[j] == test_list[i]`` and
        ``ori_list[j+1] == test_list[i+1]``. The last entry is always 0
        because no bigram starts at the final token.
    """
    # Every adjacent pair of the corpus. Only the LAST token cannot start a
    # bigram; the previous bound of len(ori_list)-2 also dropped the final
    # pair, so a match at the very end of the corpus was never counted.
    ori_bigrams = list(zip(ori_list, ori_list[1:]))

    count_list = [0] * len(test_list)
    for i, bigram in enumerate(zip(test_list, test_list[1:])):
        count_list[i] = ori_bigrams.count(bigram)
    return count_list
dictionary ...\n", 13 | "Loading model from cache C:\\Users\\Fire\\AppData\\Local\\Temp\\jieba.cache\n" 14 | ] 15 | }, 16 | { 17 | "name": "stdout", 18 | "output_type": "stream", 19 | "text": [ 20 | "start cut.\n" 21 | ] 22 | }, 23 | { 24 | "name": "stderr", 25 | "output_type": "stream", 26 | "text": [ 27 | "Loading model cost 0.684 seconds.\n", 28 | "Prefix dict has been built succesfully.\n" 29 | ] 30 | }, 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "End file.\n" 36 | ] 37 | }, 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "0" 42 | ] 43 | }, 44 | "execution_count": 1, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "import codecs\n", 51 | "import jieba \n", 52 | "import jieba.analyse \n", 53 | " \n", 54 | "#Read file and cut \n", 55 | "def read_file_cut(): \n", 56 | " fileName = r\"F:\\data\\wiki.zh.text\" \n", 57 | " source = open(fileName, 'r',encoding='utf-8') \n", 58 | " line = source.readline() \n", 59 | " line = line.rstrip('\\n') \n", 60 | " \n", 61 | " result = codecs.open('wiki.zh.text.seg', 'w', 'utf-8') \n", 62 | " \n", 63 | " print('start cut.')\n", 64 | " while line!=\"\": \n", 65 | " seglist = jieba.cut(line,cut_all=False) #精确模式 \n", 66 | " output = ' '.join(list(seglist)) #空格拼接 \n", 67 | " result.write(output + '\\r\\n') \n", 68 | " line = source.readline() \n", 69 | " print('End file.') \n", 70 | " source.close() \n", 71 | " result.close() \n", 72 | " return 0\n", 73 | " \n", 74 | "read_file_cut() " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": 
"text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.5.4" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 2 108 | } 109 | -------------------------------------------------------------------------------- /NLP/tools/jieba/readme.md: -------------------------------------------------------------------------------- 1 | ### 常用 2 | * 读取自定义词典 3 | 4 | ``` 5 | jieba.load_userdict(r'./data/user_dict.txt') # file_name为自定义词典的路径   6 | #(格式:每行:词 [词频] [词性]) 中括号代表可选 7 | ``` 8 | 9 | ### 实践 10 | * [二元语法模型](./jieba_cut_ngram.py) 11 | * [jieba分词不同模式](./jieba_cut.ipynb) 12 | * [读取文本分词并存储](./read_save.ipynb) 13 | * [词性标注](./cixing.py) 14 | * [TF-IDF](./if-idf.py) 15 | -------------------------------------------------------------------------------- /NLP/tools/nltk/readme.md: -------------------------------------------------------------------------------- 1 | ### 资料 2 | * [官方文档](http://www.nltk.org/api/nltk.html) 3 | 4 | ### 常用功能 5 | * [处理频率问题 .FreqDist](./func/nltk_FreqDist.ipynb) 6 | * [分词tokenize & Text & 处理HTML](./func/tokenize_text_html.ipynb) 7 | 8 | ### 实践 9 | * [词形归一化 & 词性标注](./practice/wordsNormalization.ipynb) 10 | * [情感分析](./practice/Sentiment_analysis.ipynb) 11 | * [文本相似性](./practice/Text_similarity.ipynb) 12 | * [TF-IDF](./practice/TF-IDF.ipynb) -------------------------------------------------------------------------------- /NLP/tools/word2vec/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/NLP/tools/word2vec/.DS_Store -------------------------------------------------------------------------------- /NLP/tools/word2vec/readme.md: -------------------------------------------------------------------------------- 1 | ### 常用 2 | * 使用自定义语料: 3 | 1. 分词,去除停用词 4 | 2. 
所有词以空格键或tab隔开写入一个文件中 5 | 6 | 7 | 8 | ### 实践 9 | * [基本示例](word2vec_start.ipynb) 10 | -------------------------------------------------------------------------------- /Others/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fire717/Machine-Learning/46d417e4c872052857899331ff7f526f79110896/Others/.DS_Store --------------------------------------------------------------------------------