├── .gitattributes ├── .gitignore ├── 0-Distance └── blog_ml_distance.py ├── 0-Spider ├── README.md ├── beidaNewsSpider │ ├── .idea │ │ ├── beidaSpider.iml │ │ ├── inspectionProfiles │ │ │ └── profiles_settings.xml │ │ ├── misc.xml │ │ ├── modules.xml │ │ └── workspace.xml │ ├── README.md │ ├── news.sql │ ├── news.txt │ └── spider.py └── tiebaSpider │ ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── tiebaSpider.iml │ └── workspace.xml │ ├── README.md │ ├── spider1 │ ├── README.md │ ├── main.py │ ├── spider.py │ ├── spider.pyc │ └── tiebaname │ │ └── name.txt │ └── spider2 │ └── tieba │ ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── tieba.iml │ └── workspace.xml │ ├── data │ └── 20170630_all_href.txt │ ├── name.txt │ ├── scrapy.cfg │ └── tieba │ ├── __init__.py │ ├── __init__.pyc │ ├── items.py │ ├── items.pyc │ ├── middlewares.py │ ├── pipelines.py │ ├── pipelines.pyc │ ├── settings.py │ ├── settings.pyc │ └── spiders │ ├── __init__.py │ ├── __init__.pyc │ ├── tieba1.py │ ├── tieba1.pyc │ ├── tieba2.py │ └── tieba2.pyc ├── AdaBoost └── AdaBoost.py ├── Apriori └── Apriori.py ├── Bayes └── bayes.py ├── Decision-Tree └── DecisionTree-ID3.py ├── FP-growth ├── FP_Tree.py ├── newsClickStream.py └── 所用到dat文件下载地址.txt ├── K-means └── kMeans.py ├── Logistic Regession ├── LogisticRegession.py ├── LogisticRegessionExample.py ├── ex1.txt ├── horseColicTest.txt └── horseColicTraining.txt ├── PCA ├── PCA.py ├── secom.data └── testSet.txt ├── README.md ├── Recommend ├── uid_score_bid.dat ├── 基于item的协同过滤推荐BasedItem.py ├── 基于图的推荐PersonalRank.py ├── 基于标签的推荐.py └── 基于用户的协同过滤推荐BasedUserCF.py ├── Regession ├── abalone.txt ├── ex0.txt ├── ex1.txt └── regession.py └── sklearn ├── README.md └── line_regression ├── Folds5x2_pp.csv └── sk_linreg.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /0-Distance/blog_ml_distance.py: -------------------------------------------------------------------------------- 1 | # coding: 
utf-8 2 | 3 | from numpy import * 4 | 5 | print '[+]------------欧式距离-----------' 6 | def twoPointDistance(a,b): 7 | d = sqrt( (a[0]-b[0])**2 + (a[1]-b[1])**2 ) 8 | return d 9 | 10 | print 'a,b 二维距离为:',twoPointDistance((1,1),(2,2)) 11 | 12 | def threePointDistance(a,b): 13 | d = sqrt( (a[0]-b[0])**2 + (a[1]-b[1])**2 + (a[2]-b[2])**2 ) 14 | return d 15 | 16 | print 'a,b 三维距离为:',threePointDistance((1,1,1),(2,2,2)) 17 | 18 | def distance(a,b): 19 | sum = 0 20 | for i in range(len(a)): 21 | sum += (a[i]-b[i])**2 22 | return sqrt(sum) 23 | 24 | print 'a,b 多维距离为:',distance((1,1,2,2),(2,2,4,4)) 25 | 26 | print '[+]------------标准欧式距离-----------' 27 | 28 | def moreBZOSdis(a,b): 29 | sumnum = 0 30 | for i in range(len(a)): 31 | # 计算si 分量标准差 32 | avg = (a[i]-b[i])/2 33 | si = sqrt( (a[i] - avg) ** 2 + (b[i] - avg) ** 2 ) 34 | sumnum += ((a[i]-b[i])/si ) ** 2 35 | 36 | return sqrt(sumnum) 37 | 38 | print 'a,b 标准欧式距离:',moreBZOSdis((1,2,1,2),(3,3,3,4)) 39 | 40 | print '[+]------------曼哈顿距离-----------' 41 | def twoMHDdis(a,b): 42 | return abs(a[0]-b[0])+abs(a[1]-b[1]) 43 | 44 | print 'a,b 二维曼哈顿距离为:', twoMHDdis((1,1),(2,2)) 45 | 46 | def threeMHDdis(a,b): 47 | return abs(a[0]-b[0])+abs(a[1]-b[1]) + abs(a[2]-b[2]) 48 | 49 | print 'a,b 三维曼哈顿距离为:', threeMHDdis((1,1,1),(2,2,2)) 50 | 51 | 52 | def moreMHDdis(a,b): 53 | sum = 0 54 | for i in range(len(a)): 55 | sum += abs(a[i]-b[i]) 56 | return sum 57 | 58 | print 'a,b 多维曼哈顿距离为:', moreMHDdis((1,1,1,1),(2,2,2,2)) 59 | 60 | print '[+]------------切比雪夫距离-----------' 61 | def twoQBXFdis(a,b): 62 | return max( abs(a[0]-b[0]), abs(a[1]-b[1])) 63 | 64 | print 'a,b二维切比雪夫距离:' , twoQBXFdis((1,2),(3,4)) 65 | 66 | def moreQBXFdis(a,b): 67 | maxnum = 0 68 | for i in range(len(a)): 69 | if abs(a[i]-b[i]) > maxnum: 70 | maxnum = abs(a[i]-b[i]) 71 | return maxnum 72 | 73 | print 'a,b多维切比雪夫距离:' , moreQBXFdis((1,1,1,1),(3,4,3,4)) 74 | 75 | 76 | print '[+]------------夹角余弦-----------' 77 | 78 | def twoCos(a,b): 79 | cos = (a[0]*b[0]+a[1]*b[1]) / (sqrt(a[0]**2 + b[0]**2) * sqrt(a[1]**2 + b[1]**2) ) 80 | 81 | return cos 82 | print 'a,b 二维夹角余弦距离:',twoCos((1,1),(2,2)) 83 | 84 | def moreCos(a,b): 85 | sum_fenzi = 0.0 86 | sum_fenmu_1,sum_fenmu_2 = 0,0 87 | for i in range(len(a)): 88 | sum_fenzi += a[i]*b[i] 89 | sum_fenmu_1 += a[i]**2 90 | sum_fenmu_2 += b[i]**2 91 | 92 | return sum_fenzi/( sqrt(sum_fenmu_1) * sqrt(sum_fenmu_2) ) 93 | print 'a,b 多维夹角余弦距离:',moreCos((1,1,1,1),(2,2,2,2)) 94 | 95 | print '[+]------------汉明距离-----------' 96 | 97 | def hanmingDis(a,b): 98 | sumnum = 0 99 | for i in range(len(a)): 100 | if a[i]!=b[i]: 101 | sumnum += 1 102 | return sumnum 103 | 104 | print 'a,b 汉明距离:',hanmingDis((1,1,2,3),(2,2,1,3)) 105 | 106 | print '[+]------------杰卡德距离-----------' 107 | 108 | def jiekadeDis(a,b): 109 | set_a = set(a) 110 | set_b = set(b) 111 | dis = float(len( (set_a | set_b) - (set_a & set_b) ) )/ len(set_a | set_b) 112 | return dis 113 | 114 | print 'a,b 杰卡德距离:', jiekadeDis((1,2,3),(2,3,4)) 115 | 116 | def jiekadeXSDis(a,b): 117 | set_a = set(a) 118 | set_b = set(b) 119 | dis = float(len(set_a & set_b) )/ len(set_a | set_b) 120 | return dis 121 | 122 | print 'a,b 杰卡德相似系数:', jiekadeXSDis((1,2,3),(2,3,4)) 123 | -------------------------------------------------------------------------------- /0-Spider/README.md: -------------------------------------------------------------------------------- 1 | > 此部分我会上传一些spider的代码吧,大部分会是以目标进行分类,部分对应的会有csdn的blog,路过的大神不要嘲笑我等小白 2 | 3 | 4 | 1: Scrapy 爬取百度贴吧指定帖子的发帖人和回帖人
5 | http://blog.csdn.net/gamer_gyt/article/details/75043398
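A minimal usage sketch for item 1, assuming the spiders are launched from the Scrapy project root (0-Spider/tiebaSpider/spider2/tieba, the directory containing scrapy.cfg): that project defines two spiders, `tieba` (tieba1.py, which collects the thread URLs for every tieba listed in name.txt into data/<YYYYMMDD>_all_href.txt) and `tieba2` (tieba2.py, which reads that file and writes the poster/replier usernames to data/<YYYYMMDD>_all_name.txt). A typical two-step run would be:

    cd 0-Spider/tiebaSpider/spider2/tieba
    scrapy crawl tieba      # step 1: collect thread URLs into data/<YYYYMMDD>_all_href.txt
    scrapy crawl tieba2     # step 2: visit each collected thread and extract usernames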
6 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/.idea/beidaSpider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 12 | 13 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 38 | 39 | 40 | 45 | 46 | 47 | 48 | 49 | true 50 | DEFINITION_ORDER 51 | 52 | 53 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 82 | 83 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 143 | 144 | 157 | 158 | 175 | 176 | 188 | 189 | project 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 224 | 225 | 244 | 245 | 266 | 267 | 289 | 290 | 314 | 315 | 316 | 318 | 319 | 320 | 321 | 1494037431357 322 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 355 | 356 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 404 | 405 | 406 | 407 | 408 | 409 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/README.md: -------------------------------------------------------------------------------- 1 | 爬取北大要闻的所有新闻 2 | 3 | url:http://pkunews.pku.edu.cn/xxfz/node_185.htm 4 | 5 | news.sql 为数据备份(Mysql) 6 | 7 | 数据库文件备份与恢复 8 | 9 | 备份:/usr/bin/mysqldump -uroot -proot beidaspider --default-character-set=utf8 --opt -Q -R >./news.sql 10 | 11 | 恢复:/usr/bin/mysql -uroot -proot beidaspider <./news.sql 12 | -------------------------------------------------------------------------------- /0-Spider/beidaNewsSpider/spider.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import pymysql 4 | from bs4 import BeautifulSoup 5 | import urllib.request 6 | import time 7 | 8 | ''' 9 | 创建数据库和数据表语句 10 | create database beidaspider default charset utf8; 11 | 12 | create table news( 13 | title varchar(100), 14 | pub_date date, 15 | from_ varchar(50), 16 | content varchar(20000) 17 | ); 18 | 19 | 数据库备份 20 | /usr/bin/mysqldump -uroot -proot beidaspider --default-character-set=utf8 --opt -Q -R >./news.sql 21 | 22 | 数据库恢复 23 
| /usr/bin/mysql -uroot -proot beidaspider <./news.sql 24 | ''' 25 | 26 | 27 | class BeiDaSpider: 28 | # 初始化 29 | def __init__(self): 30 | self.root_href = "http://pkunews.pku.edu.cn/xxfz/" 31 | 32 | # 连接数据库 33 | def connMysql(self): 34 | conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root',db='beidaspider',charset='utf8') 35 | cur = conn.cursor() 36 | return cur,conn 37 | 38 | # 写入数据库 39 | def write(self,title,date,from_,content): 40 | cur,conn = self.connMysql() 41 | sql = """INSERT INTO news (title, pub_date, from_, content) VALUES ("%s", "%s", "%s", "%s")""" % (title,date,from_,content) 42 | cur.execute(sql) 43 | conn.commit() 44 | conn.close() 45 | 46 | with open("news.txt","a") as fp: 47 | fp.write(title+"\t"+date+"\t"+from_+"\t"+content+"\n") 48 | fp.close() 49 | 50 | # 解析每页,获取该页所有的新闻链接 51 | def parse_onePage_href(self,url): 52 | res = urllib.request.urlopen(url) 53 | body = BeautifulSoup(res.read()) 54 | table = body.find('table',cellspacing="0",cellpadding="0",id="nav2_7Tabcontent_10") 55 | a_list = table.find_all('a') 56 | href_list = [] 57 | for a in a_list: 58 | href_list.append(self.root_href + a.get('href')) 59 | return href_list 60 | 61 | # 解析每个新闻,获取数据 62 | def parse_oneNew(self,url): 63 | res = urllib.request.urlopen(url) 64 | body = BeautifulSoup(res.read()) 65 | 66 | # 获取标题 67 | title = body.title.get_text().strip() 68 | print(title) 69 | 70 | # 获取时间和来源 71 | #dataAndfrom = 72 | dataAndfrom = body.find('table',width="560",border="0",cellspacing="0",cellpadding="0") 73 | datafrom_list = dataAndfrom.find_all('tr')[0].get_text().strip().split("  ") 74 | date = datafrom_list[0].split(":")[1].strip() 75 | from_ = datafrom_list[1].split(":")[1].strip() 76 | print(date) 77 | #print(from_) 78 | 79 | # 获取新闻内容 80 | content = body.find('table',width="710",border="0",cellspacing="0",cellpadding="0",style="margin-left:15px;").find_all('tr')[3].get_text().strip().replace("\n"," ") 81 | #print(content) 82 | 83 | self.write(title,date,from_,content) 84 | 85 | def start(self): 86 | for i in range(1,21): 87 | if i==1: 88 | href_list = self.parse_onePage_href(self.root_href + "node_185.htm") 89 | for href in href_list: 90 | try: 91 | self.parse_oneNew(href) 92 | except Exception as e: 93 | print(e) 94 | finally: 95 | pass 96 | # time.sleep(1) 97 | # break 98 | else: 99 | href_list = self.parse_onePage_href(self.root_href + "node_185_" + str(i) + ".htm") 100 | for href in href_list: 101 | try: 102 | self.parse_oneNew(href) 103 | except Exception as e: 104 | print(e) 105 | finally: 106 | pass 107 | # time.sleep(1) 108 | #time.sleep(2) 109 | # break 110 | 111 | 112 | if __name__=="__main__": 113 | spi = BeiDaSpider() 114 | spi.start() 115 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/.idea/tiebaSpider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/.idea/workspace.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 55 | 56 | 57 | 58 | print 59 | time.sleep(self.timesleep) 60 | 61 | 62 | 63 | 76 | 77 | 78 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 105 | 106 | 109 | 110 | 111 | 112 | 115 | 116 | 119 | 120 | 123 | 124 | 125 | 126 | 129 | 130 | 133 | 134 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 157 | 158 | 159 | 160 | 177 | 178 | 189 | 190 | 208 | 209 | 223 | 224 | 225 | 227 | 228 | 229 | 230 | 1498495498538 231 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/README.md: -------------------------------------------------------------------------------- 1 | 这两个文件夹下的爬虫都是为了实现爬取贴吧前三页帖子的发帖人和回帖人,spider1使用的是BeautifulSoup+urllib2,spider2使用的是scrapy 2 | 3 | 4 | [ Scrapy 爬取百度贴吧指定帖子的发帖人和回帖人](http://blog.csdn.net/Gamer_gyt/article/details/75043398) 5 | 6 | 7 | CSDN博客地址: 8 | http://blog.csdn.net/gamer_gyt/ 9 | 10 | 如有问题请联系: 11 | QQ:1923361654 12 | WeChat:gyt13342445911 13 | Email:thinkgamer_gyt@gmail.com 14 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider1/README.md: -------------------------------------------------------------------------------- 1 | # 项目说明 2 | 3 | 该项目为爬取指定贴吧的前三页帖子的发帖用户和回帖用户的用户名 4 | 5 | data 目录为存放数据的目录,其中以天为单位创建二级目录,以贴吧名为三级单位存储抓取结果 6 | 7 | 目录结构类似于: 8 | 9 | data 10 | 11 | --20170626 12 | 13 | -----戒赌吧.txt 14 | 15 | -----网易吧.txt 16 | 17 | tiebaname 目录为存放贴吧名字的目录,将要爬取的贴吧名字写入该目录下的name.txt文件中 18 | 19 | 目录结构类似于: 20 | 21 | tiebaname 22 | 23 | --name.txt 24 | 25 | 采用的是python 的beautifulSoup库,效果不太理想,但后续会逐步改善,可能换成别的框架 26 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider1/main.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from spider import Spider 4 | 5 | if __name__ == "__main__": 6 | import time 7 | print("Start At:",time.asctime( time.localtime(time.time()) )) 8 | spider = Spider() 9 | spider.start() 10 | print("Stop At:",time.asctime( time.localtime(time.time()) )) -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider1/spider.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from bs4 import BeautifulSoup 4 | import urllib2 5 | import urllib 6 | import time,os 7 | 8 | class Spider: 9 | 10 | def __init__(self): 11 | self.search_url = 
'https://tieba.baidu.com/f?kw=' 12 | self.tieba_list = [] # 存储要爬取的若干个贴吧的链接 13 | self.url_list = [] # 存放每个贴吧前三页的帖子链接 14 | self.timesleep = 2 # 每次访问tieba的url时间间隔 15 | self.pages = 3 # 设置要抓取多少页 16 | self.current_href = '' # 当前爬取的贴吧链接url 17 | 18 | # 在data目录下创建日期和贴吧名的txt文件 19 | if not os.path.exists('data/%s' % time.strftime('%Y%m%d')): 20 | os.mkdir('data/%s' % time.strftime('%Y%m%d')) 21 | 22 | def error(self,loc,url,e): 23 | fw = open("error/error.log","a") 24 | fw.write(time.asctime( time.localtime(time.time()) )+"\t"+loc+"\t"+url+"\t"+str(e)) 25 | fw.close() 26 | 27 | # 模拟浏览器进行登录 28 | def get_page(self,href): 29 | res = urllib2.urlopen(href) 30 | # 如果访问成功的话返回读取的内容,否则返回空的字符串 31 | if res.code == 200: 32 | return res.read() 33 | else: 34 | return "" 35 | 36 | # 从文件中加载贴吧名并组成url 37 | def read(self): 38 | try: 39 | with open("tiebaname/name.txt", "r") as fr: 40 | for line in fr.readlines(): 41 | # urllib.quote(line.strip()) 将关键字转变成url 格式 42 | self.tieba_list.append(self.search_url + urllib.quote(line.strip()) + "&ie=utf-8&pn=") 43 | fr.close() 44 | except Exception as e: 45 | self.error("read", "read error", e) 46 | pass 47 | finally: 48 | return self.tieba_list 49 | 50 | 51 | # 解析每个帖子共有几页 52 | def get_num(self,url): 53 | try: 54 | if self.get_page(url): 55 | body = BeautifulSoup(self.get_page(url), "html.parser") 56 | num_li = body.find_all("li", class_="l_reply_num", style="margin-left:8px")[0] 57 | num = num_li.findAll('span', class_='red')[1].get_text() 58 | # print(num) 59 | return int(num) 60 | else: 61 | pass 62 | except Exception as e: 63 | self.error("get_num",url,e) 64 | return 1 65 | 66 | # 解析每一个贴吧前三页的所有帖子连接 67 | def parse_href(self,one_tieba_url): 68 | self.url_list = [] # 存放一个贴吧前三页所有帖子的链接 69 | try: 70 | for i in range(0,self.pages): 71 | url = one_tieba_url + str(i * 50) 72 | try: 73 | # i* 50 控制翻页,每页显示50个 74 | if self.get_page(one_tieba_url+str(i*50)): 75 | body = BeautifulSoup(self.get_page(url), "html.parser") 76 | div_list = body.find_all("div", class_="threadlist_title pull_left j_th_tit ") # 解析到每一个帖子 77 | for div in div_list: 78 | # print(div.a.get('href'),div.a.get_text()) 79 | # print("https://tieba.baidu.com" + div.a.get('href')) 80 | self.url_list.append("https://tieba.baidu.com" + div.a.get('href')) 81 | else: 82 | pass 83 | except Exception as e: 84 | self.error("parse_href",url,e) 85 | pass 86 | # time.sleep(self.timesleep) 87 | except Exception as e: 88 | self.error("parse_href",one_tieba_url,e) 89 | pass 90 | 91 | # 解析每个贴吧前三页所有帖子的发帖人和回帖人的用户名 92 | def parse_username(self): 93 | try: 94 | # 解析每个帖子对应的发帖人和回帖人 95 | for url in self.url_list: 96 | filename = urllib.unquote(self.current_href.split("kw=")[1].split("&ie=")[0]) # 贴吧名字,也是文件名 97 | fw = open('data/%s/%s.txt' % (time.strftime('%Y%m%d'), filename), 'a') 98 | 99 | try: 100 | fw.write(url+"\t") 101 | num = self.get_num(url) 102 | for i in range(1,num+1): 103 | one_url = url+"?pn="+str(i) # https://tieba.baidu.com/p/5183701449?pn=1 104 | # print("total %s papges, now parse is %s page,url is:%s"%(num,i,one_url)) 105 | # 解析用户名 106 | if self.get_page(one_url): 107 | li_list = BeautifulSoup(self.get_page(one_url), "html.parser").find_all('li',class_='d_name') 108 | for li in li_list: 109 | # print(li.a.get_text()) 110 | fw.write(li.a.get_text().encode("utf-8")+"\t") 111 | # time.sleep(self.timesleep) 112 | else: 113 | pass 114 | fw.write("\n") 115 | fw.close() 116 | print(url) 117 | except Exception as e: 118 | self.error("parse_username",url,e) 119 | pass 120 | 121 | time.sleep(self.timesleep) 122 | except Exception as e: 
123 | self.error("parse_username",url,e) 124 | pass 125 | 126 | def start(self): 127 | self.read() # load tieba_prepare name 128 | for url in self.tieba_list: 129 | try: 130 | self.current_href =url 131 | print("Start:",self.current_href,time.strftime("%Y-%m-%d %H-%M-%S")) #self.current_href, 132 | self.parse_href(url) # 解析该贴吧对应的前三页的每个帖子的链接 133 | self.parse_username() # 解析每个帖子的发帖人和回帖人 134 | except Exception as e: 135 | self.error("start","parse error at start",e) 136 | pass 137 | 138 | time.sleep(self.timesleep) 139 | print("Over:",time.strftime("%Y-%m-%d %H-%M-%S")) -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider1/spider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider1/spider.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider1/tiebaname/name.txt: -------------------------------------------------------------------------------- 1 | 戒赌 2 | 足彩 3 | 福彩 4 | 汉中彩票 5 | 体彩 6 | 竞彩 7 | 双色球 8 | 深圳 9 | 上海 10 | 北京 11 | 武汉 12 | 福建 13 | 浙江 14 | 广州 15 | 哈尔滨 16 | 吉林 17 | 青岛 18 | 杭州 19 | 山东 20 | 重庆 21 | nba 22 | 曼联 23 | 科比 24 | 皇家马德里 25 | 巴塞罗那 26 | 切尔西 27 | ac米兰 28 | 北京国安 29 | 山东鲁能 30 | 国际米兰 31 | 拜仁慕尼黑 32 | 火箭 33 | 广州FC 34 | 詹姆斯 35 | 麦迪 36 | 利物浦 37 | 阿森纳 38 | 尤文图斯 39 | 洛杉矶湖人 40 | 上海申花 41 | 热火 42 | 梅西 43 | 德国队 44 | 江苏舜天 45 | 小小罗 46 | 天津泰达 47 | 死飞 48 | 欧洲杯 49 | 中超 50 | cba 51 | 河南建业 52 | 曼城 53 | 国足 54 | 意大利国家队 55 | 多特蒙德 56 | 英超 57 | 中国足球 58 | 库里 59 | 内马尔 60 | 罗伊斯 61 | 足球 62 | 篮球 63 | 网球 64 | 浙江绿城 65 | 苹果 66 | iphone 67 | 长春亚泰 68 | 英格兰 69 | 辽宁宏运 70 | 贵州人和 71 | 上海东亚 72 | 重庆力帆 73 | 西甲 74 | 马德里竞技 75 | 德甲 76 | 世界杯 77 | 艾弗森 78 | 韦德 79 | 马刺 80 | 易建联 81 | 北京金隅 82 | 广东宏远 83 | 李毅 84 | 扒皮 85 | 美女 86 | 小米 87 | 电影 88 | 内涵 89 | 动漫 90 | nba 91 | 头像 92 | 遮天 93 | exo 94 | 爆照 95 | 减肥 96 | 鹿晗 97 | 神回复 98 | dota 99 | 文字控 100 | 心理学 101 | 美食 102 | 校花 103 | 绿帽子小同学 104 | 旅行 105 | 小说 106 | 笑话 107 | 90后 108 | 高考 109 | 权志龙 110 | 吴亦凡 111 | 手绘 112 | 梦幻西游 113 | 旅游 114 | dota2 115 | les 116 | 胥渡 117 | 爱情 118 | 整形 119 | 隆鼻 120 | 腐女 121 | gay 122 | 搞笑 123 | 柯南 124 | 剑网 125 | 凡人修仙 126 | 周杰伦 127 | 刘诗诗 128 | 爱情公寓 129 | 陈奕迅 130 | 李敏浩 131 | 音乐 132 | bigbang 133 | 帅哥 134 | 淘宝 135 | 进击的巨人 136 | 张杰 137 | 网名 138 | 魅族 139 | 手机 140 | 短句 141 | 张艺兴 142 | 金秀贤 143 | 手工 144 | 路过的一只 145 | 娱乐圈 146 | 内涵图 147 | 章鱼卡 148 | 君似毒 149 | 黄子韬 150 | 秦时明月 151 | 杨幂 152 | 言情小说 153 | 化妆 154 | 天天酷跑 155 | 情感 156 | 2012 157 | 恐怖 158 | 维尼夫妇 159 | 整容 160 | vae 161 | 爱所以存在 162 | 吴世勋 163 | 吃货 -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/.idea/tieba.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/.idea/workspace.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 45 | 46 | 47 | 54 | 55 | 56 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 81 | 82 | 85 | 86 | 87 | 88 | 91 | 92 | 95 | 96 | 99 | 100 | 101 | 102 | 105 | 106 | 109 | 110 | 113 | 114 | 117 | 118 | 119 | 120 | 123 | 124 | 127 | 128 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 154 | 155 | 156 | 157 | 174 | 175 | 186 | 187 | 205 | 206 | 220 | 221 | 222 | 224 | 225 | 226 | 227 | 1498758628713 228 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/name.txt: -------------------------------------------------------------------------------- 1 | 戒赌 2 | 足彩 3 | 福彩 4 | 汉中彩票 5 | 体彩 6 | 竞彩 7 | 双色球 8 | 深圳 9 | 上海 10 | 北京 11 | 武汉 12 | 福建 13 | 浙江 14 | 广州 15 | 哈尔滨 16 | 吉林 17 | 青岛 18 | 杭州 19 | 山东 20 | 重庆 21 | nba 22 | 曼联 23 | 科比 24 | 皇家马德里 25 | 巴塞罗那 26 | 切尔西 27 | ac米兰 28 | 北京国安 29 | 山东鲁能 30 | 国际米兰 31 | 拜仁慕尼黑 32 | 火箭 33 | 广州FC 34 | 詹姆斯 35 | 麦迪 36 | 利物浦 37 | 阿森纳 38 | 尤文图斯 39 | 洛杉矶湖人 40 | 上海申花 41 | 热火 42 | 梅西 43 | 德国队 44 | 江苏舜天 45 | 小小罗 46 | 天津泰达 47 | 死飞 48 | 欧洲杯 49 | 中超 50 | cba 51 | 河南建业 52 | 曼城 53 | 国足 54 | 意大利国家队 55 | 多特蒙德 56 | 英超 57 | 中国足球 58 | 库里 59 | 内马尔 60 | 罗伊斯 61 | 足球 62 | 篮球 63 | 网球 64 | 浙江绿城 65 | 苹果 66 | iphone 67 | 长春亚泰 68 | 英格兰 69 | 辽宁宏运 70 | 贵州人和 71 | 上海东亚 72 | 重庆力帆 73 | 西甲 74 | 马德里竞技 75 | 德甲 76 | 世界杯 77 | 艾弗森 78 | 韦德 79 | 马刺 80 | 易建联 81 | 北京金隅 82 | 广东宏远 83 | 李毅 84 | 扒皮 85 | 美女 86 | 小米 87 | 电影 88 | 内涵 89 | 动漫 90 | nba 91 | 头像 92 | 遮天 93 | exo 94 | 爆照 95 | 减肥 96 | 鹿晗 97 | 神回复 98 | dota 99 | 文字控 100 | 心理学 101 | 美食 102 | 校花 103 | 绿帽子小同学 104 | 旅行 105 | 小说 106 | 笑话 107 | 90后 108 | 高考 109 | 权志龙 110 | 吴亦凡 111 | 手绘 112 | 梦幻西游 113 | 旅游 114 | dota2 115 | les 116 | 胥渡 117 | 爱情 118 | 整形 119 | 隆鼻 120 | 腐女 121 | gay 122 | 搞笑 123 | 柯南 124 | 剑网 125 | 凡人修仙 126 | 周杰伦 127 | 刘诗诗 128 | 爱情公寓 129 | 陈奕迅 130 | 李敏浩 131 | 音乐 132 | bigbang 133 | 帅哥 134 | 淘宝 135 | 进击的巨人 136 | 张杰 137 | 网名 138 | 魅族 139 | 手机 140 | 短句 141 | 张艺兴 142 | 金秀贤 143 | 手工 144 | 路过的一只 145 | 娱乐圈 146 | 内涵图 147 | 章鱼卡 148 | 君似毒 149 | 黄子韬 150 | 秦时明月 151 | 杨幂 152 | 言情小说 153 | 化妆 154 | 天天酷跑 155 | 情感 156 | 2012 157 | 恐怖 158 | 维尼夫妇 159 | 整容 160 | vae 161 | 爱所以存在 162 | 吴世勋 163 | 吃货 164 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tieba.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tieba 12 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.py -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/__init__.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TiebaItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/items.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TiebaSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import time 9 | 10 | 11 | class TiebaPipeline(object): 12 | 13 | def process_item(self, item, spider): 14 | return item 15 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/pipelines.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tieba project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tieba' 13 | 14 | SPIDER_MODULES = ['tieba.spiders'] 15 | NEWSPIDER_MODULE = 'tieba.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'tieba (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'tieba.middlewares.TiebaSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'tieba.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 
| #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'tieba.pipelines.TiebaPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/settings.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/__init__.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba1.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import scrapy 4 | import urllib 5 | import time 6 | 7 | 8 | class TiebaSpider(scrapy.Spider): 9 | 10 | name = 'tieba' 11 | 12 | def __init__(self): 13 | self.urls = [] 14 | 15 | # 加载贴吧名 16 | fr = open("name.txt", "r") 17 | 18 | for one in fr.readlines(): 19 | for i in range(0, 3): 20 | self.urls.append('https://tieba.baidu.com/f?kw=' + 21 | urllib.quote(one.strip()) + '&ie=utf-8&pn=' + str(i * 50)) 22 | fr.close() 23 | 24 | def start_requests(self): 25 | urls = self.urls 26 | 27 | for url in urls: 28 | yield scrapy.Request(url=url, callback=self.parse) 29 | 30 | def parse(self, response): 31 | sel = scrapy.Selector(response) 32 | ahref_list = sel.xpath( 33 | '//a[re:test(@class, "j_th_tit ")]//@href').extract() 34 | 35 | fw = open("data/%s_all_href.txt" % time.strftime('%Y%m%d'), "a") 36 | for ahref in ahref_list: 37 | href = "https://tieba.baidu.com" + ahref 38 | fw.write(href + "\n") 39 | fw.close() 40 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba1.pyc -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba2.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import scrapy 4 | import time 5 | from scrapy.http.request import Request 6 | from scrapy.http import HtmlResponse 7 | 8 | class TiebaSpider2(scrapy.Spider): 9 | 10 | name = 'tieba2' 11 | 12 | def __init__(self): 13 | self.urls = [] 14 | 15 | # 加载贴吧名 16 | fr = open("data/%s_all_href.txt" % time.strftime('%Y%m%d'), "r") 17 | 18 | for one in fr.readlines(): 19 | self.urls.append(one.strip()) 20 | fr.close() 21 | 22 | def start_requests(self): 23 | urls = self.urls 24 | 25 | for one in urls: 26 | yield scrapy.Request(url=one, callback=self.parse) 27 | 28 | def parse_uname(self, response): 29 | # response = HtmlResponse(url=page_url.url) 30 | sel = scrapy.Selector(response) 31 | name_list = sel.xpath('//li[re:test(@class, "d_name")]//a/text()').extract() 32 | # print respons 33 | fw = open("data/%s_all_name.txt" % time.strftime('%Y%m%d'), "a") 34 | for name in list(set(name_list)): 35 | fw.write(name.encode("utf-8")) 36 | fw.write("\n") 37 | fw.close() 38 | 39 | def parse(self, response): 40 | sel = scrapy.Selector(response) 41 | 42 | # 可能有些帖子被删除 43 | try: 44 | # 得到每个帖子有多少页 45 | num = int(sel.xpath('//span[re:test(@class,"red")]//text()').extract()[1]) 46 | # 遍历每页获得用户名 47 | for page_num in range(1, num + 1): 48 | one_url = response.url + "?pn=" + str(page_num) 49 | 50 | yield Request(url=one_url, 
callback=self.parse_uname) 51 | except Exception as e: 52 | pass 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/0-Spider/tiebaSpider/spider2/tieba/tieba/spiders/tieba2.pyc -------------------------------------------------------------------------------- /AdaBoost/AdaBoost.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | ''' 3 | Created on 2016年5月10日 4 | 5 | @author: Gamer Think 6 | ''' 7 | from test.inspect_fodder import StupidGit 8 | 9 | __author__="thinkgamer" 10 | 11 | from numpy import * 12 | 13 | #加载数据集 14 | def loadSimData(): 15 | datMat = matrix([[1.0 , 2.1], 16 | [2. , 1.1], 17 | [1.3 , 1. ], 18 | [1. , 1. ], 19 | [2. , 1. ]]) 20 | 21 | classLabels = [1.0, 1.0, -1.0, -1.0, 1.0] 22 | return datMat,classLabels 23 | 24 | #单层决策树生成函数 25 | def stumpClassify(dataMatrix, dimen,threshVal, threshInsq): 26 | retArray = ones((shape(dataMatrix)[0],1)) 27 | if threshInsq == 'lt': 28 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 29 | else: 30 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0 31 | return retArray 32 | 33 | def buildStump(dataArr,classLabels,D): 34 | dataMatrix = mat(dataArr) 35 | #matrix必须是二维的,numpy可以是多维的 36 | labelMat = mat(classLabels).T #.T表示转置矩阵 37 | m,n = shape(dataMatrix) #给定数据集的行列数 38 | numSteps = 10.0 #变用于在特征的所有可能值上进行遍历 39 | bestStump = {} #字典用于存储给定权重向量0时所得到的最佳单层决策树的相关信息 40 | bestClassEnt = mat(zeros((m,1))) 41 | minError = inf #首先将minError初始化为正无穷大 42 | for i in range(n): 43 | rangeMin = dataMatrix[:,i].min() 44 | rangeMax = dataMatrix[:,i].max() 45 | stepSize = (rangeMax-rangeMin)/numSteps 46 | for j in range(-1,int(numSteps)+1): 47 | #lt :小于,lte,le:小于等于 48 | #gt:大于,,gte,ge:大于等于 49 | #eq:等于 ne,neq:不等于 50 | for inequal in ['lt','gt']: 51 | threshVal = (rangeMin + float(j) * stepSize) 52 | predictedVals = stumpClassify(dataMatrix,i,threshVal, inequal) 53 | errArr = mat(ones((m,1))) 54 | errArr[predictedVals==labelMat]=0 55 | weightedError = D.T * errArr #计算加权错误概率 56 | # print "split: dim %d, thresh % .2f, thresh inequal: %s, the weighted error is %.3f" % (i, threshVal,inequal,weightedError) 57 | #更新bestStump中保存的最佳单层决策树的相关信息 58 | if weightedError < minError: 59 | minError = weightedError 60 | bestClassEnt = predictedVals.copy() 61 | bestStump['dim'] = i 62 | bestStump['thresh'] = threshVal 63 | bestStump['ineq'] = inequal 64 | 65 | return bestStump,minError,bestClassEnt 66 | 67 | #基于单层决策树的AdaBoost训练过程 68 | #numIt:迭代次数,默认为40 69 | def adaBoostTrainDS(dataArr,classLabels,numIt=40): 70 | weakClassArr = [] 71 | m= shape(dataArr)[0] 72 | D = mat(ones((m,1))/m) 73 | aggClassEst = mat(zeros((m,1))) 74 | #迭代 75 | for i in range(numIt): 76 | #调用单层决策树 77 | bestStump,error,classEst = buildStump(dataArr, classLabels, D) 78 | print "D:",D.T #打印D的转置矩阵 79 | alpha = float(0.5 * log((1.0 - error) / max(error,1e-16)))# max(error,1e-16)))用于确保没有错误时,不会发生溢出 80 | bestStump['alpha'] = alpha 81 | weakClassArr.append(bestStump) 82 | print "classEst:",classEst.T 83 | #为下一次迭代计算D 84 | expon = multiply(-1 * alpha * mat(classLabels).T,classEst) 85 | D = multiply(D,exp(expon)) 86 | D = D /D.sum() 87 | #错误率累加计算 88 | aggClassEst += alpha* classEst 89 | print "aggClassEst:",aggClassEst.T 90 | aggErrors = multiply(sign(aggClassEst) 
!= mat(classLabels).T, ones((m,1))) 91 | errorRate = aggErrors.sum()/m 92 | print "total error:",errorRate 93 | #如果不发生错误,返回 94 | if errorRate == 0.0: 95 | break 96 | return weakClassArr 97 | 98 | 99 | #AdaBoost分类函数 100 | #输入参数为待分类样例datToClass和多个弱分类器classifierArr 101 | def adaClassify(datToClass,classifierArr): 102 | dataMatrix = mat(datToClass) 103 | m = shape(dataMatrix)[0] 104 | aggClassEst = mat(zeros((m,1))) 105 | for i in range(len(classifierArr)): 106 | classEst = stumpClassify(dataMatrix,classifierArr[i]['dim'],\ 107 | classifierArr[i]['thresh'],\ 108 | classifierArr[i]['ineq']) 109 | aggClassEst+= classifierArr[i]['alpha'] * classEst 110 | print aggClassEst 111 | return sign(aggClassEst) 112 | 113 | 114 | #main函数 115 | if __name__=="__main__": 116 | #加载数据集 117 | datMat,classLabels = loadSimData() 118 | # print "datMat:",datMat 119 | # print "classLabels:",classLabels 120 | 121 | #单层决策树生成函数 122 | # D = mat(ones((5,1))/5) 123 | # print buildStump(datMat, classLabels, D) 124 | 125 | #基于单层决策树的Adaboost训练过程 126 | classifierArray = adaBoostTrainDS(datMat, classLabels, 30) 127 | # for classifier in classifierArray: 128 | # print classifier 129 | 130 | #测试AdaBoost分类函数 131 | print "[0,0]:\n",adaClassify([0,0], classifierArray) 132 | print "\n\n[[5,5],[0,0]]:\n",adaClassify([[5,5],[0,0]], classifierArray) 133 | -------------------------------------------------------------------------------- /Apriori/Apriori.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/Apriori/Apriori.py -------------------------------------------------------------------------------- /Bayes/bayes.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | ''' 3 | Created on 2016年5月12日 4 | 5 | @author: Gamer Think 6 | ''' 7 | 8 | from numpy import * 9 | 10 | #词表到向量的转换函数 11 | def loadDataSet(): 12 | postingList = [['my','dog','has','flea','problems','help','please'], 13 | ['maybe','not','take','him','to','dog','park','stupid'], 14 | ['my','dalmation','is','so','cute','I','love','him'], 15 | ['stop','posting','stupid','worthless','garbage'], 16 | ['mr','licks','ate','my','steak','how','to','stop','him'], 17 | ['quit','buying','worthless','dog','food','stupid']] 18 | classVec = [0,1,0,1,0,1] #1,侮辱 0,正常 19 | return postingList,classVec 20 | 21 | def createVocabList(dataSet): 22 | vocabSet = set([]) #调用set方法,创建一个空集 23 | for document in dataSet: 24 | vocabSet = vocabSet | set(document) #创建两个集合的并集 25 | return list(vocabSet) 26 | 27 | def setOfWords2Vec(vocabList,inputSet): 28 | returnVec = [0]*len(vocabList) #创建一个所含元素都为0的向量 29 | for word in inputSet: 30 | if word in vocabList: 31 | returnVec[vocabList.index(word)] = 1 32 | else: 33 | print "the word:%s is not in my Vocabulary" % word 34 | return returnVec 35 | 36 | 37 | def bagOfWords2VecMN(vocabList,inputSet): 38 | returnVec = [0]*len(vocabList) #创建一个所含元素都为0的向量 39 | for word in inputSet: 40 | if word in vocabList: 41 | returnVec[vocabList.index(word)] += 1 42 | return returnVec 43 | 44 | 45 | #朴素贝叶斯分类器训练集 46 | def trainNB0(trainMatrix,trainCategory): #传入参数为文档矩阵,每篇文档类别标签所构成的向量 47 | numTrainDocs = len(trainMatrix) #文档矩阵的长度 48 | numWords = len(trainMatrix[0]) #第一个文档的单词个数 49 | pAbusive = sum(trainCategory)/float(numTrainDocs) #任意文档属于侮辱性文档概率 50 | #p0Num = zeros(numWords);p1Num = zeros(numWords) #初始化两个矩阵,长度为numWords,内容值为0 51 | p0Num = ones(numWords);p1Num = ones(numWords) 
#初始化两个矩阵,长度为numWords,内容值为1 52 | #p0Denom = 0.0;p1Denom = 0.0 #初始化概率 53 | p0Denom = 2.0;p1Denom = 2.0 54 | for i in range(numTrainDocs): 55 | if trainCategory[i]==1: 56 | p1Num +=trainMatrix[i] 57 | p1Denom += sum(trainMatrix[i]) 58 | else: 59 | p0Num +=trainMatrix[i] 60 | p0Denom += sum(trainMatrix[i]) 61 | #p1Vect = p1Num/p1Denom #对每个元素做除法 62 | #p0Vect = p0Num/p0Denom 63 | p1Vect = log(p1Num/p1Denom) 64 | p0Vect = log(p0Num/p0Denom) 65 | return p0Vect,p1Vect,pAbusive 66 | 67 | #朴素贝叶斯分类函数 68 | def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1): 69 | p1 = sum(vec2Classify * p1Vec) + log(pClass1) #元素相乘 70 | p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1) 71 | if p1>p0: 72 | return 1 73 | else: 74 | return 0 75 | 76 | def testingNB(): 77 | listOPosts,listClasses = loadDataSet() #产生文档矩阵和对应的标签 78 | myVocabList = createVocabList(listOPosts) #创建并集 79 | trainMat = [] #创建一个空的列表 80 | for postinDoc in listOPosts: 81 | trainMat.append(setOfWords2Vec(myVocabList,postinDoc)) #使用词向量来填充trainMat列表 82 | p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses)) #训练函数 83 | testEntry = ['love','my','dalmation'] #测试文档列表 84 | thisDoc = array(setOfWords2Vec(myVocabList,testEntry)) #声明矩阵 85 | print testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb) 86 | testEntry = ['stupid','garbage'] 87 | thisDoc = array(setOfWords2Vec(myVocabList,testEntry)) #声明矩阵 88 | print testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb) 89 | 90 | if __name__=="__main__": 91 | testingNB() -------------------------------------------------------------------------------- /Decision-Tree/DecisionTree-ID3.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | ''' 3 | ''' 4 | from math import log 5 | import operator 6 | 7 | def createDataSet(): 8 | dataSet =[[1,1,'yes'], 9 | [1,1,'yes'], 10 | [1,0,'no'], 11 | [0,1,'no'], 12 | [0,1,'no']] 13 | labels = ['no surfacing','flippers'] #分类的属性 14 | return dataSet,labels 15 | 16 | #计算给定数据的香农熵 17 | def calcShannonEnt(dataSet): 18 | numEntries = len(dataSet) 19 | labelCounts = {} 20 | for featVec in dataSet: 21 | currentLabel = featVec[-1] #获得标签 22 | #构造存放标签的字典 23 | if currentLabel not in labelCounts.keys(): 24 | labelCounts[currentLabel]=0 25 | labelCounts[currentLabel]+=1 #对应的标签数目+1 26 | #计算香农熵 27 | shannonEnt = 0.0 28 | for key in labelCounts: 29 | prob = float(labelCounts[key])/numEntries 30 | shannonEnt -=prob*log(prob,2) 31 | return shannonEnt 32 | 33 | #划分数据集,三个参数为带划分的数据集,划分数据集的特征,特征的返回值 34 | def splitDataSet(dataSet,axis,value): 35 | retDataSet = [] 36 | for featVec in dataSet: 37 | if featVec[axis] ==value: 38 | #将相同数据集特征的抽取出来 39 | reducedFeatVec = featVec[:axis] 40 | reducedFeatVec.extend(featVec[axis+1:]) 41 | retDataSet.append(reducedFeatVec) 42 | return retDataSet #返回一个列表 43 | 44 | #选择最好的数据集划分方式 45 | def chooseBestFeatureToSplit(dataSet): 46 | numFeature = len(dataSet[0])-1 47 | baseEntropy = calcShannonEnt(dataSet) 48 | bestInfoGain = 0.0 49 | beatFeature = -1 50 | for i in range(numFeature): 51 | featureList = [example[i] for example in dataSet] #获取第i个特征所有的可能取值 52 | uniqueVals = set(featureList) #从列表中创建集合,得到不重复的所有可能取值ֵ 53 | newEntropy = 0.0 54 | for value in uniqueVals: 55 | subDataSet = splitDataSet(dataSet,i,value) #以i为数据集特征,value为返回值,划分数据集 56 | prob = len(subDataSet)/float(len(dataSet)) #数据集特征为i的所占的比例 57 | newEntropy +=prob * calcShannonEnt(subDataSet) #计算每种数据集的信息熵 58 | infoGain = baseEntropy- newEntropy 59 | #计算最好的信息增益,增益越大说明所占决策权越大 60 | if (infoGain > bestInfoGain): 61 | bestInfoGain = infoGain 62 | bestFeature = i 63 
| return bestFeature 64 | 65 | #递归构建决策树 66 | def majorityCnt(classList): 67 | classCount = {} 68 | for vote in classList: 69 | if vote not in classCount.keys(): 70 | classCount[vote]=0 71 | classCount[vote]+=1 72 | sortedClassCount = sorted(classCount.iteritems(),key =operator.itemgetter(1),reverse=True)#排序,True升序 73 | return sortedClassCount[0][0] #返回出现次数最多的 74 | 75 | #创建树的函数代码 76 | def createTree(dataSet,labels): 77 | classList = [example[-1] for example in dataSet] 78 | if classList.count(classList[0])==len(classList):#类别完全相同则停止划分 79 | return classList[0] 80 | if len(dataSet[0]) ==1: #遍历完所有特征值时返回出现次数最多的 81 | return majorityCnt(classList) 82 | bestFeat = chooseBestFeatureToSplit(dataSet) #选择最好的数据集划分方式 83 | bestFeatLabel = labels[bestFeat] #得到对应的标签值 84 | myTree = {bestFeatLabel:{}} 85 | del(labels[bestFeat]) #清空labels[bestFeat],在下一次使用时清零 86 | featValues = [example[bestFeat] for example in dataSet] 87 | uniqueVals = set(featValues) 88 | for value in uniqueVals: 89 | subLabels =labels[:] 90 | #递归调用创建决策树函数 91 | myTree[bestFeatLabel][value]=createTree(splitDataSet(dataSet,bestFeat,value),subLabels) 92 | return myTree 93 | 94 | if __name__=="__main__": 95 | dataSet,labels = createDataSet() 96 | print createTree(dataSet,labels) -------------------------------------------------------------------------------- /FP-growth/FP_Tree.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/FP-growth/FP_Tree.py -------------------------------------------------------------------------------- /FP-growth/newsClickStream.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/FP-growth/newsClickStream.py -------------------------------------------------------------------------------- /FP-growth/所用到dat文件下载地址.txt: -------------------------------------------------------------------------------- 1 | http://download.csdn.net/detail/gamer_gyt/9514873 -------------------------------------------------------------------------------- /K-means/kMeans.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/K-means/kMeans.py -------------------------------------------------------------------------------- /Logistic Regession/LogisticRegession.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | ''' 3 | Created on 2016/4/24 4 | 5 | @author: Gamer Think 6 | ''' 7 | 8 | from numpy import * 9 | 10 | #加载数据集 11 | def loadDataSet(): 12 | dataMat = [] 13 | labelMat = [] 14 | fp = open("ex1.txt") 15 | for line in fp.readlines(): 16 | lineArr = line.strip().split() #分割 17 | dataMat.append([1.0,float(lineArr[0]), float(lineArr[1])]) 18 | labelMat.append( int(lineArr[2])) 19 | 20 | return dataMat,labelMat 21 | 22 | #定义Sigmoid函数 23 | def sigmoid(inX): 24 | return 1.0/(1+exp(-inX)) 25 | 26 | #梯度上升算法求解最佳回归系数 27 | def gradAscent(dataMatIn,classLabels): 28 | dataMatrix = mat(dataMatIn) #将数组转为矩阵 29 | labelMat = mat(classLabels).transpose() 30 | m,n = shape(dataMatrix) #返回矩阵的行和列 31 | alpha = 0.001 #初始化 alpha的值 32 | maxCycles = 500 #最大迭代次数 33 | weights = ones((n,1)) #初始化最佳回归系数 34 | for i in range(0,maxCycles): 35 | #引用原书的代码,求梯度 36 | h = sigmoid(dataMatrix*weights) 37 
| error = labelMat - h 38 | weights = weights + alpha * dataMatrix.transpose() * error 39 | 40 | return weights 41 | 42 | #随机梯度上升算法求回归系数 43 | def stocGradAscent0(dataMatrix,labelMat): 44 | dataMatrix = array(dataMatrix) 45 | m,n = shape(dataMatrix) 46 | alpha = 0.01 47 | weights = ones(n) 48 | for i in range(0,m): 49 | h = sigmoid(sum(dataMatrix[i]*weights)) 50 | error = labelMat[i] - h 51 | weights = weights + alpha * error * dataMatrix[i] 52 | 53 | return weights 54 | 55 | 56 | #改进版的随机梯度上升算法 57 | def stocGradAscent1(dataMatrix,labelMat,numIter=150): 58 | m,n = shape(dataMatrix) 59 | weights = ones(n) 60 | for i in range(0,numIter): 61 | dataIndex = range(m) 62 | for j in range(0,m): 63 | alpha = 4/(1.0+j+i)+0.01 64 | randIndex = int(random.uniform(0,len(dataIndex))) 65 | h = sigmoid(sum(dataMatrix[randIndex] * weights)) 66 | error = labelMat[randIndex] - h 67 | weights = weights + alpha * error * dataMatrix[randIndex] 68 | del(dataIndex[randIndex]) 69 | 70 | return weights 71 | 72 | #分析数据,画出决策边界 73 | def plotBestFit(wei,dataMatrix,labelMat): 74 | import matplotlib.pyplot as plt 75 | weights = wei #将矩阵wei转化为list 76 | dataArr = array(dataMatrix) #将矩阵转化为数组 77 | n = shape(dataMatrix)[0] 78 | xcord1 = [];ycord1=[] 79 | xcord2 = [];ycord2=[] 80 | 81 | for i in range(n): 82 | if int(labelMat[i])==1: 83 | xcord1.append(dataArr[i,1]) 84 | ycord1.append(dataArr[i,2]) 85 | else: 86 | xcord2.append(dataArr[i,1]) 87 | ycord2.append(dataArr[i,2]) 88 | 89 | fig = plt.figure() 90 | ax = fig.add_subplot(111) 91 | ax.scatter(xcord1,ycord1,s=30,c='red', marker='s') 92 | ax.scatter(xcord2,ycord2,s=30,c="green") 93 | x = arange(-3.0,3.0,0.1) 94 | y = (-weights[0]-weights[1] * x)/weights[2] 95 | ax.plot(x,y) 96 | plt.xlabel("x1") #X轴的标签 97 | plt.ylabel("x2") #Y轴的标签 98 | plt.show() 99 | 100 | 101 | 102 | if __name__=="__main__": 103 | dataMatrix,labelMat = loadDataSet() 104 | #梯度上升算法 105 | # weight = gradAscent(dataMatrix, labelMat) 106 | # print weight 107 | # plotBestFit(weight.getA(),dataMatrix,labelMat) 108 | 109 | #随机梯度上升算法 110 | # weight = stocGradAscent0(dataMatrix, labelMat) 111 | # print weight 112 | # plotBestFit(weight,dataMatrix,labelMat) 113 | 114 | #改进版的随机梯度上升算法 115 | weight = stocGradAscent1(array(dataMatrix), labelMat) 116 | print weight 117 | plotBestFit(weight,dataMatrix,labelMat) 118 | 119 | -------------------------------------------------------------------------------- /Logistic Regession/LogisticRegessionExample.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | ''' 3 | Created on 2016/4/25 4 | 5 | @author: Gamer Think 6 | ''' 7 | import LogisticRegession as lr 8 | from numpy import * 9 | 10 | #二分类问题进行分类 11 | def classifyVector(inX,weights): 12 | prob = lr.sigmoid(sum(inX * weights)) 13 | if prob>0.5: 14 | return 1.0 15 | else: 16 | return 0.0 17 | 18 | #训练和测试 19 | def colicTest(): 20 | frTrain = open('horseColicTraining.txt'); frTest = open('horseColicTest.txt') 21 | trainingSet = []; trainingLabels = [] 22 | #训练回归模型 23 | for line in frTrain.readlines(): 24 | currLine = line.strip().split('\t') 25 | lineArr =[] 26 | for i in range(21): 27 | lineArr.append(float(currLine[i])) 28 | trainingSet.append(lineArr) 29 | trainingLabels.append(float(currLine[21])) 30 | trainWeights = lr.stocGradAscent1(array(trainingSet), trainingLabels, 1000) 31 | errorCount = 0; numTestVec = 0.0 32 | #测试回归模型 33 | for line in frTest.readlines(): 34 | numTestVec += 1.0 35 | currLine = line.strip().split('\t') 36 | lineArr =[] 37 | for i in range(21): 38 | 
lineArr.append(float(currLine[i])) 39 | if int(classifyVector(array(lineArr), trainWeights))!= int(currLine[21]): 40 | errorCount += 1 41 | errorRate = (float(errorCount)/numTestVec) 42 | print "the error rate of this test is: %f" % errorRate 43 | return errorRate 44 | 45 | def multiTest(): 46 | numTests = 10 47 | errorSum = 0.0 48 | for k in range(numTests): 49 | errorSum += colicTest() 50 | print "after %d iterations the average error rate is: %f" % (numTests,errorSum/float(numTests)) 51 | 52 | 53 | if __name__=="__main__": 54 | multiTest() 55 | -------------------------------------------------------------------------------- /Logistic Regession/ex1.txt: -------------------------------------------------------------------------------- 1 | -0.017612 14.053064 0 2 | -1.395634 4.662541 1 3 | -0.752157 6.538620 0 4 | -1.322371 7.152853 0 5 | 0.423363 11.054677 0 6 | 0.406704 7.067335 1 7 | 0.667394 12.741452 0 8 | -2.460150 6.866805 1 9 | 0.569411 9.548755 0 10 | -0.026632 10.427743 0 11 | 0.850433 6.920334 1 12 | 1.347183 13.175500 0 13 | 1.176813 3.167020 1 14 | -1.781871 9.097953 0 15 | -0.566606 5.749003 1 16 | 0.931635 1.589505 1 17 | -0.024205 6.151823 1 18 | -0.036453 2.690988 1 19 | -0.196949 0.444165 1 20 | 1.014459 5.754399 1 21 | 1.985298 3.230619 1 22 | -1.693453 -0.557540 1 23 | -0.576525 11.778922 0 24 | -0.346811 -1.678730 1 25 | -2.124484 2.672471 1 26 | 1.217916 9.597015 0 27 | -0.733928 9.098687 0 28 | -3.642001 -1.618087 1 29 | 0.315985 3.523953 1 30 | 1.416614 9.619232 0 31 | -0.386323 3.989286 1 32 | 0.556921 8.294984 1 33 | 1.224863 11.587360 0 34 | -1.347803 -2.406051 1 35 | 1.196604 4.951851 1 36 | 0.275221 9.543647 0 37 | 0.470575 9.332488 0 38 | -1.889567 9.542662 0 39 | -1.527893 12.150579 0 40 | -1.185247 11.309318 0 41 | -0.445678 3.297303 1 42 | 1.042222 6.105155 1 43 | -0.618787 10.320986 0 44 | 1.152083 0.548467 1 45 | 0.828534 2.676045 1 46 | -1.237728 10.549033 0 47 | -0.683565 -2.166125 1 48 | 0.229456 5.921938 1 49 | -0.959885 11.555336 0 50 | 0.492911 10.993324 0 51 | 0.184992 8.721488 0 52 | -0.355715 10.325976 0 53 | -0.397822 8.058397 0 54 | 0.824839 13.730343 0 55 | 1.507278 5.027866 1 56 | 0.099671 6.835839 1 57 | -0.344008 10.717485 0 58 | 1.785928 7.718645 1 59 | -0.918801 11.560217 0 60 | -0.364009 4.747300 1 61 | -0.841722 4.119083 1 62 | 0.490426 1.960539 1 63 | -0.007194 9.075792 0 64 | 0.356107 12.447863 0 65 | 0.342578 12.281162 0 66 | -0.810823 -1.466018 1 67 | 2.530777 6.476801 1 68 | 1.296683 11.607559 0 69 | 0.475487 12.040035 0 70 | -0.783277 11.009725 0 71 | 0.074798 11.023650 0 72 | -1.337472 0.468339 1 73 | -0.102781 13.763651 0 74 | -0.147324 2.874846 1 75 | 0.518389 9.887035 0 76 | 1.015399 7.571882 0 77 | -1.658086 -0.027255 1 78 | 1.319944 2.171228 1 79 | 2.056216 5.019981 1 80 | -0.851633 4.375691 1 81 | -1.510047 6.061992 0 82 | -1.076637 -3.181888 1 83 | 1.821096 10.283990 0 84 | 3.010150 8.401766 1 85 | -1.099458 1.688274 1 86 | -0.834872 -1.733869 1 87 | -0.846637 3.849075 1 88 | 1.400102 12.628781 0 89 | 1.752842 5.468166 1 90 | 0.078557 0.059736 1 91 | 0.089392 -0.715300 1 92 | 1.825662 12.693808 0 93 | 0.197445 9.744638 0 94 | 0.126117 0.922311 1 95 | -0.679797 1.220530 1 96 | 0.677983 2.556666 1 97 | 0.761349 10.693862 0 98 | -2.168791 0.143632 1 99 | 1.388610 9.341997 0 100 | 0.317029 14.739025 0 -------------------------------------------------------------------------------- /Logistic Regession/horseColicTest.txt: -------------------------------------------------------------------------------- 1 | 2 1 38.50 54 20 0 1 2 2 
3 4 1 2 2 5.90 0 2 42.00 6.30 0 0 1 2 | 2 1 37.60 48 36 0 0 1 1 0 3 0 0 0 0 0 0 44.00 6.30 1 5.00 1 3 | 1 1 37.7 44 28 0 4 3 2 5 4 4 1 1 0 3 5 45 70 3 2 1 4 | 1 1 37 56 24 3 1 4 2 4 4 3 1 1 0 0 0 35 61 3 2 0 5 | 2 1 38.00 42 12 3 0 3 1 1 0 1 0 0 0 0 2 37.00 5.80 0 0 1 6 | 1 1 0 60 40 3 0 1 1 0 4 0 3 2 0 0 5 42 72 0 0 1 7 | 2 1 38.40 80 60 3 2 2 1 3 2 1 2 2 0 1 1 54.00 6.90 0 0 1 8 | 2 1 37.80 48 12 2 1 2 1 3 0 1 2 0 0 2 0 48.00 7.30 1 0 1 9 | 2 1 37.90 45 36 3 3 3 2 2 3 1 2 1 0 3 0 33.00 5.70 3 0 1 10 | 2 1 39.00 84 12 3 1 5 1 2 4 2 1 2 7.00 0 4 62.00 5.90 2 2.20 0 11 | 2 1 38.20 60 24 3 1 3 2 3 3 2 3 3 0 4 4 53.00 7.50 2 1.40 1 12 | 1 1 0 140 0 0 0 4 2 5 4 4 1 1 0 0 5 30 69 0 0 0 13 | 1 1 37.90 120 60 3 3 3 1 5 4 4 2 2 7.50 4 5 52.00 6.60 3 1.80 0 14 | 2 1 38.00 72 36 1 1 3 1 3 0 2 2 1 0 3 5 38.00 6.80 2 2.00 1 15 | 2 9 38.00 92 28 1 1 2 1 1 3 2 3 0 7.20 0 0 37.00 6.10 1 1.10 1 16 | 1 1 38.30 66 30 2 3 1 1 2 4 3 3 2 8.50 4 5 37.00 6.00 0 0 1 17 | 2 1 37.50 48 24 3 1 1 1 2 1 0 1 1 0 3 2 43.00 6.00 1 2.80 1 18 | 1 1 37.50 88 20 2 3 3 1 4 3 3 0 0 0 0 0 35.00 6.40 1 0 0 19 | 2 9 0 150 60 4 4 4 2 5 4 4 0 0 0 0 0 0 0 0 0 0 20 | 1 1 39.7 100 30 0 0 6 2 4 4 3 1 0 0 4 5 65 75 0 0 0 21 | 1 1 38.30 80 0 3 3 4 2 5 4 3 2 1 0 4 4 45.00 7.50 2 4.60 1 22 | 2 1 37.50 40 32 3 1 3 1 3 2 3 2 1 0 0 5 32.00 6.40 1 1.10 1 23 | 1 1 38.40 84 30 3 1 5 2 4 3 3 2 3 6.50 4 4 47.00 7.50 3 0 0 24 | 1 1 38.10 84 44 4 0 4 2 5 3 1 1 3 5.00 0 4 60.00 6.80 0 5.70 0 25 | 2 1 38.70 52 0 1 1 1 1 1 3 1 0 0 0 1 3 4.00 74.00 0 0 1 26 | 2 1 38.10 44 40 2 1 3 1 3 3 1 0 0 0 1 3 35.00 6.80 0 0 1 27 | 2 1 38.4 52 20 2 1 3 1 1 3 2 2 1 0 3 5 41 63 1 1 1 28 | 1 1 38.20 60 0 1 0 3 1 2 1 1 1 1 0 4 4 43.00 6.20 2 3.90 1 29 | 2 1 37.70 40 18 1 1 1 0 3 2 1 1 1 0 3 3 36.00 3.50 0 0 1 30 | 1 1 39.1 60 10 0 1 1 0 2 3 0 0 0 0 4 4 0 0 0 0 1 31 | 2 1 37.80 48 16 1 1 1 1 0 1 1 2 1 0 4 3 43.00 7.50 0 0 1 32 | 1 1 39.00 120 0 4 3 5 2 2 4 3 2 3 8.00 0 0 65.00 8.20 3 4.60 1 33 | 1 1 38.20 76 0 2 3 2 1 5 3 3 1 2 6.00 1 5 35.00 6.50 2 0.90 1 34 | 2 1 38.30 88 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 35 | 1 1 38.00 80 30 3 3 3 1 0 0 0 0 0 6.00 0 0 48.00 8.30 0 4.30 1 36 | 1 1 0 0 0 3 1 1 1 2 3 3 1 3 6.00 4 4 0 0 2 0 0 37 | 1 1 37.60 40 0 1 1 1 1 1 1 1 0 0 0 1 1 0 0 2 2.10 1 38 | 2 1 37.50 44 0 1 1 1 1 3 3 2 0 0 0 0 0 45.00 5.80 2 1.40 1 39 | 2 1 38.2 42 16 1 1 3 1 1 3 1 0 0 0 1 0 35 60 1 1 1 40 | 2 1 38 56 44 3 3 3 0 0 1 1 2 1 0 4 0 47 70 2 1 1 41 | 2 1 38.30 45 20 3 3 2 2 2 4 1 2 0 0 4 0 0 0 0 0 1 42 | 1 1 0 48 96 1 1 3 1 0 4 1 2 1 0 1 4 42.00 8.00 1 0 1 43 | 1 1 37.70 55 28 2 1 2 1 2 3 3 0 3 5.00 4 5 0 0 0 0 1 44 | 2 1 36.00 100 20 4 3 6 2 2 4 3 1 1 0 4 5 74.00 5.70 2 2.50 0 45 | 1 1 37.10 60 20 2 0 4 1 3 0 3 0 2 5.00 3 4 64.00 8.50 2 0 1 46 | 2 1 37.10 114 40 3 0 3 2 2 2 1 0 0 0 0 3 32.00 0 3 6.50 1 47 | 1 1 38.1 72 30 3 3 3 1 4 4 3 2 1 0 3 5 37 56 3 1 1 48 | 1 1 37.00 44 12 3 1 1 2 1 1 1 0 0 0 4 2 40.00 6.70 3 8.00 1 49 | 1 1 38.6 48 20 3 1 1 1 4 3 1 0 0 0 3 0 37 75 0 0 1 50 | 1 1 0 82 72 3 1 4 1 2 3 3 0 3 0 4 4 53 65 3 2 0 51 | 1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0 52 | 2 1 37.8 60 16 1 1 3 1 2 3 2 1 2 0 3 0 41 73 0 0 0 53 | 1 1 38.7 34 30 2 0 3 1 2 3 0 0 0 0 0 0 33 69 0 2 0 54 | 1 1 0 36 12 1 1 1 1 1 2 1 1 1 0 1 5 44.00 0 0 0 1 55 | 2 1 38.30 44 60 0 0 1 1 0 0 0 0 0 0 0 0 6.40 36.00 0 0 1 56 | 2 1 37.40 54 18 3 0 1 1 3 4 3 2 2 0 4 5 30.00 7.10 2 0 1 57 | 1 1 0 0 0 4 3 0 2 2 4 1 0 0 0 0 0 54 76 3 2 1 58 | 1 1 36.6 48 16 3 1 3 1 4 1 1 1 1 0 0 0 27 56 0 0 0 59 | 1 1 38.5 90 0 1 1 3 1 3 3 3 2 3 2 4 5 47 79 0 0 1 60 | 1 1 0 75 12 1 1 4 1 
5 3 3 0 3 5.80 0 0 58.00 8.50 1 0 1 61 | 2 1 38.20 42 0 3 1 1 1 1 1 2 2 1 0 3 2 35.00 5.90 2 0 1 62 | 1 9 38.20 78 60 4 4 6 0 3 3 3 0 0 0 1 0 59.00 5.80 3 3.10 0 63 | 2 1 38.60 60 30 1 1 3 1 4 2 2 1 1 0 0 0 40.00 6.00 1 0 1 64 | 2 1 37.80 42 40 1 1 1 1 1 3 1 0 0 0 3 3 36.00 6.20 0 0 1 65 | 1 1 38 60 12 1 1 2 1 2 1 1 1 1 0 1 4 44 65 3 2 0 66 | 2 1 38.00 42 12 3 0 3 1 1 1 1 0 0 0 0 1 37.00 5.80 0 0 1 67 | 2 1 37.60 88 36 3 1 1 1 3 3 2 1 3 1.50 0 0 44.00 6.00 0 0 0 -------------------------------------------------------------------------------- /PCA/PCA.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf8-*- 2 | ''' 3 | Created on 2016-5-15 4 | 5 | @author: thinkgamer 6 | ''' 7 | from numpy import * 8 | 9 | def loadDataSet(filename,delim = "\t"): 10 | fr = open(filename) 11 | stringArr = [line.strip().split(delim) for line in fr.readlines()] 12 | datArr = [map(float, line) for line in stringArr] 13 | return mat(datArr) 14 | 15 | #dataMat对应数据集,N个特征 16 | def pca(dataMat, topNfeat=9999999): 17 | meanVals = mean(dataMat, axis = 0) #求平均值 18 | meanRemoved = dataMat - meanVals #去平均值 19 | covMat = cov(meanRemoved,rowvar=0) #计算协防差矩阵 20 | eigVals, eigVects = linalg.eig(mat(covMat)) 21 | eigValInd = argsort(eigVals) 22 | #从小到大对N个值排序 23 | eigValInd = eigValInd[: -(topNfeat + 1) : -1] 24 | redEigVects = eigVects[:, eigValInd] 25 | #将数据转换到新空间 26 | lowDDataMat = meanRemoved * redEigVects 27 | reconMat = (lowDDataMat * redEigVects.T) + meanVals 28 | return lowDDataMat, reconMat 29 | 30 | #测试 31 | dataMat = loadDataSet("testSet.txt") 32 | lowDMat, reconMat = pca(dataMat,1) 33 | print shape(lowDMat) 34 | 35 | ''' 36 | #show 37 | import matplotlib 38 | import matplotlib.pyplot as plt 39 | fig = plt.figure() 40 | ax = fig.add_subplot(111) 41 | ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0], marker='^', s = 90 ) 42 | ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0],marker='o', s = 50 , c ='red' ) 43 | plt.show() 44 | ''' 45 | 46 | #将NaN替换成平均值函数 47 | def replaceNanWithMean(): 48 | datMat = loadDataSet('secom.data', ' ') 49 | numFeat = shape(datMat)[1] 50 | for i in range(numFeat): 51 | meanVal = mean(datMat[nonzero(~isnan(datMat[:,i].A))[0],i]) #values that are not NaN (a number) 52 | datMat[nonzero(isnan(datMat[:,i].A))[0],i] = meanVal #set NaN values to mean 53 | return datMat 54 | 55 | #加载数据 56 | dataMat = replaceNanWithMean() 57 | #去除均值 58 | meanVals = mean(dataMat, axis=0) 59 | meanRemoved = dataMat - meanVals 60 | #计算协方差 61 | covMat = cov(meanRemoved, rowvar=0) 62 | 63 | #特征值分析 64 | eigVals, eigVects = linalg.eig(mat(covMat)) 65 | print eigVals -------------------------------------------------------------------------------- /PCA/testSet.txt: -------------------------------------------------------------------------------- 1 | 10.235186 11.321997 2 | 10.122339 11.810993 3 | 9.190236 8.904943 4 | 9.306371 9.847394 5 | 8.330131 8.340352 6 | 10.152785 10.123532 7 | 10.408540 10.821986 8 | 9.003615 10.039206 9 | 9.534872 10.096991 10 | 9.498181 10.825446 11 | 9.875271 9.233426 12 | 10.362276 9.376892 13 | 10.191204 11.250851 14 | 7.720499 6.476300 15 | 9.334379 8.471268 16 | 7.963186 6.731333 17 | 8.244973 9.013785 18 | 9.569196 10.568949 19 | 8.854793 9.076536 20 | 9.382171 7.240862 21 | 8.179055 8.944502 22 | 8.267896 8.797017 23 | 9.047165 8.725068 24 | 8.741043 7.901385 25 | 7.190216 7.804587 26 | 8.081227 9.314431 27 | 8.047460 5.720780 28 | 7.917584 7.543254 29 | 8.676942 10.102220 30 | 9.210251 9.424717 31 | 
7.732998 9.840202 32 | 7.681754 8.609897 33 | 7.925863 10.079159 34 | 8.261509 8.242080 35 | 8.514872 7.527561 36 | 10.324450 10.804481 37 | 7.856710 7.931543 38 | 7.858608 7.995340 39 | 9.196808 6.577598 40 | 9.644415 10.935081 41 | 9.579833 9.085021 42 | 7.888484 5.976428 43 | 9.072624 9.703344 44 | 8.914184 9.298515 45 | 7.822869 7.086663 46 | 10.538554 11.061464 47 | 8.280187 8.709012 48 | 8.884223 8.670105 49 | 9.359927 10.575055 50 | 9.078611 9.710833 51 | 7.935134 8.586173 52 | 8.805945 10.575145 53 | 9.584316 9.614076 54 | 11.269714 11.717254 55 | 9.120444 9.019774 56 | 7.977520 8.313923 57 | 8.104648 9.456128 58 | 8.617126 7.331723 59 | 9.033811 9.469706 60 | 8.327680 5.122092 61 | 8.532272 10.100909 62 | 9.295434 8.933824 63 | 9.905202 9.027559 64 | 10.585764 10.912733 65 | 10.427584 11.532578 66 | 9.072767 9.960144 67 | 9.164275 8.645121 68 | 9.746058 10.717080 69 | 9.286072 9.340024 70 | 8.188233 7.432415 71 | 7.948598 8.445419 72 | 7.563350 5.656178 73 | 8.972405 8.801869 74 | 9.980868 8.788996 75 | 7.753490 7.714248 76 | 7.431143 9.032819 77 | 8.943403 8.359354 78 | 10.481890 9.988969 79 | 9.150454 10.278760 80 | 8.123894 9.060351 81 | 8.626164 8.469342 82 | 7.354185 7.631252 83 | 11.323046 11.015032 84 | 8.190008 6.860792 85 | 8.412598 7.661358 86 | 9.258404 8.580382 87 | 11.007915 11.443881 88 | 8.279403 8.347003 89 | 8.931149 10.105221 90 | 10.239245 10.077473 91 | 8.129346 7.096877 92 | 8.485823 9.373561 93 | 10.703640 11.651618 94 | 9.500728 8.150228 95 | 9.712414 9.910445 96 | 9.333374 9.407557 97 | 8.787865 10.168021 98 | 9.238180 10.253478 99 | 9.577388 8.895150 100 | 10.447753 10.318227 101 | 9.303944 9.223136 102 | 9.883268 11.662945 103 | 9.471921 10.443792 104 | 10.007753 9.579912 105 | 8.110298 7.106263 106 | 6.964069 6.585040 107 | 10.413499 9.649309 108 | 8.032629 7.053254 109 | 8.015549 9.166753 110 | 10.462924 8.656612 111 | 9.530788 10.134130 112 | 9.202658 9.314222 113 | 10.103241 10.235159 114 | 7.849264 6.624856 115 | 9.059071 7.992555 116 | 10.172889 10.724789 117 | 9.528439 6.420990 118 | 7.190422 6.789792 119 | 9.085716 9.846328 120 | 9.452887 8.735386 121 | 7.417322 7.348594 122 | 8.468639 8.715086 123 | 8.303642 9.463231 124 | 9.939052 10.026771 125 | 8.701989 7.516978 126 | 9.737541 10.587281 127 | 8.280233 7.852444 128 | 10.648386 10.259203 129 | 9.173893 10.520372 130 | 9.135397 10.751406 131 | 7.594580 8.488833 132 | 8.587520 8.463406 133 | 8.581887 7.888644 134 | 9.448768 8.707422 135 | 7.882664 7.772030 136 | 10.050635 9.859720 137 | 9.012078 9.533899 138 | 8.770020 8.882996 139 | 9.428804 9.446306 140 | 8.504209 8.319693 141 | 9.800003 10.964667 142 | 8.069660 7.683099 143 | 10.012217 10.320644 144 | 8.704677 8.918146 145 | 8.198722 7.297786 146 | 9.868322 9.901657 147 | 9.426997 11.480353 148 | 9.228767 9.262976 149 | 8.952359 9.528471 150 | 8.186847 8.600587 151 | 9.026371 8.705143 152 | 9.483364 9.807079 153 | 7.826587 7.975401 154 | 11.197846 10.959298 155 | 7.632421 8.769745 156 | 8.761605 8.309365 157 | 9.353670 8.728758 158 | 6.466637 6.038996 159 | 8.370634 9.178830 160 | 10.337451 11.075600 161 | 8.917679 8.288367 162 | 9.076621 8.487626 163 | 7.278948 4.634097 164 | 10.153017 11.219183 165 | 7.132603 5.853118 166 | 9.338644 9.805940 167 | 9.878602 9.187000 168 | 10.009505 10.924505 169 | 9.384438 10.691860 170 | 7.535322 8.160481 171 | 6.808732 8.268469 172 | 8.302965 8.075009 173 | 8.345379 8.305356 174 | 9.517530 8.249839 175 | 9.267825 9.999109 176 | 10.291511 11.032664 177 | 8.605909 8.705207 178 | 8.331145 7.812295 179 | 
8.632412 10.574287 180 | 8.766397 8.712107 181 | 9.407070 9.732756 182 | 9.709495 9.729569 183 | 10.422201 11.070360 184 | 6.831495 6.466763 185 | 8.187122 8.405929 186 | 8.523093 9.041844 187 | 7.952394 6.801220 188 | 10.490780 10.001468 189 | 10.813791 9.802494 190 | 7.861113 7.541475 191 | 8.800399 8.738974 192 | 7.542152 6.612838 193 | 9.446981 9.378659 194 | 8.281684 7.358572 195 | 8.473801 8.208343 196 | 11.736767 11.022029 197 | 8.379578 8.714348 198 | 8.313718 8.832381 199 | 9.342589 10.416659 200 | 7.560710 6.889648 201 | 9.295344 9.739040 202 | 9.176612 9.718781 203 | 8.614385 10.150521 204 | 9.079373 8.839794 205 | 10.333289 10.921255 206 | 9.453502 7.335134 207 | 10.174590 10.292500 208 | 9.693713 9.793636 209 | 7.474925 7.751391 210 | 10.107905 10.156997 211 | 9.257241 7.854266 212 | 10.209794 11.410157 213 | 7.248050 6.433676 214 | 10.150091 9.288597 215 | 10.077713 10.321500 216 | 8.191122 8.931519 217 | 8.791469 10.287216 218 | 9.229434 9.095193 219 | 8.682571 8.546005 220 | 7.524099 7.709751 221 | 8.442410 8.326037 222 | 9.364851 9.095989 223 | 9.061222 7.557899 224 | 7.989999 8.555363 225 | 8.801275 8.868732 226 | 10.351932 9.497796 227 | 10.230710 10.496151 228 | 9.783163 9.891408 229 | 10.651481 9.431617 230 | 8.387393 6.400507 231 | 9.003921 7.050003 232 | 8.483723 8.314886 233 | 9.020501 7.545771 234 | 9.329105 11.095661 235 | 9.583687 9.271929 236 | 8.908705 8.407529 237 | 8.835406 8.083517 238 | 9.736362 8.296735 239 | 10.030302 9.737178 240 | 8.287142 6.993460 241 | 9.173211 9.306335 242 | 9.026355 9.696531 243 | 9.128391 9.921247 244 | 11.486346 12.910777 245 | 11.519458 11.472111 246 | 9.027707 10.263974 247 | 9.351935 8.542200 248 | 9.421701 11.403201 249 | 9.005687 8.100969 250 | 7.015279 6.614278 251 | 8.213607 8.340948 252 | 8.226646 8.718997 253 | 8.144753 8.366877 254 | 10.133642 12.790169 255 | 10.763481 10.847016 256 | 10.003622 10.337716 257 | 9.007955 9.792482 258 | 8.670506 10.782931 259 | 10.386414 9.956162 260 | 10.104761 10.123044 261 | 8.079502 8.304075 262 | 9.945424 11.855409 263 | 8.642497 9.998066 264 | 9.349722 8.690328 265 | 9.034991 8.826490 266 | 8.738746 7.518464 267 | 8.919532 9.740312 268 | 9.464136 10.444588 269 | 10.710057 12.666857 270 | 10.042007 10.532091 271 | 8.447996 7.426363 272 | 9.509351 9.030516 273 | 11.946359 10.553075 274 | 9.981617 9.912651 275 | 9.853876 9.632967 276 | 10.560648 11.881714 277 | 8.370952 9.989491 278 | 8.323209 10.102529 279 | 9.828359 11.702462 280 | 8.515623 8.426754 281 | 9.004363 9.628036 282 | 10.529847 10.458031 283 | 10.028765 10.624880 284 | 9.448114 9.313227 285 | 8.332617 7.382295 286 | 8.323006 8.276608 287 | 7.740771 8.799750 288 | 8.379615 8.146192 289 | 8.340764 9.184458 290 | 9.863614 8.254694 291 | 9.969563 9.405134 292 | 9.164394 9.182127 293 | 10.622098 9.722592 294 | 9.592072 10.029446 295 | 8.212027 7.477366 296 | 9.080225 8.244448 297 | 8.555774 7.842325 298 | 9.958046 9.696221 299 | 8.972573 9.797128 300 | 9.213223 7.128437 301 | 8.737239 9.385138 302 | 10.333907 10.994856 303 | 8.797511 8.643075 304 | 11.044848 9.623160 305 | 8.539260 9.097113 306 | 11.582163 11.884333 307 | 7.863848 7.176199 308 | 6.218103 5.283562 309 | 9.120602 7.250190 310 | 9.001166 9.635203 311 | 8.081476 8.844224 312 | 9.369802 8.230911 313 | 8.768925 8.666987 314 | 9.841098 8.543896 315 | 10.451522 9.549511 316 | 9.755402 9.117522 317 | 7.988961 6.869854 318 | 8.872507 9.787118 319 | 10.363980 10.716608 320 | 6.315671 5.765953 321 | 9.638879 9.202355 322 | 8.588126 8.037966 323 | 8.947408 9.144386 324 | 
9.051130 7.195132 325 | 9.321709 8.380668 326 | 10.146531 9.754745 327 | 9.843373 8.891437 328 | 9.213148 11.700632 329 | 7.630078 7.294753 330 | 8.093088 7.967590 331 | 7.488915 6.090652 332 | 8.126036 8.586472 333 | 8.760350 7.268987 334 | 10.201347 9.141013 335 | 7.838208 7.307700 336 | 6.155653 5.563997 337 | 7.767841 6.254528 338 | 8.425656 8.615832 339 | 10.362168 10.886815 340 | 10.180024 10.378934 341 | 9.794665 10.047812 342 | 9.970394 9.668279 343 | 7.030217 7.060471 344 | 9.275414 9.095738 345 | 10.314911 10.456539 346 | 9.259774 8.204851 347 | 10.023919 9.558307 348 | 8.887540 9.866704 349 | 9.851608 9.410989 350 | 8.710882 7.268012 351 | 9.017007 10.217673 352 | 7.976369 9.000979 353 | 8.738332 8.664734 354 | 8.344510 8.977600 355 | 8.959613 12.324240 356 | 9.169982 8.624635 357 | 7.487451 8.154859 358 | 8.706316 7.719455 359 | 9.564832 8.940403 360 | 8.327775 9.044509 361 | 9.734032 10.195255 362 | 8.021343 6.445092 363 | 9.081048 11.024397 364 | 7.626651 6.549263 365 | 10.725858 8.575374 366 | 8.731381 8.307788 367 | 10.394237 10.596874 368 | 7.029311 7.658832 369 | 9.517907 7.509904 370 | 10.394064 10.060898 371 | 10.752500 9.431601 372 | 9.692431 10.332130 373 | 9.651897 7.876862 374 | 8.592329 10.096837 375 | 10.212801 10.827496 376 | 9.045043 9.265524 377 | 8.901643 8.036115 378 | 10.794525 9.318830 379 | 11.040915 12.021746 380 | 8.390836 9.672469 381 | 9.840166 11.226568 382 | 10.806810 12.205633 383 | 8.924285 10.934056 384 | 8.411251 8.289672 385 | 7.808891 9.663290 386 | 9.733437 8.486958 387 | 8.300026 7.477374 388 | 8.221756 10.278308 389 | 9.096867 9.619677 390 | 9.410116 9.289188 391 | 10.097176 9.768470 392 | 9.387954 8.844855 393 | 9.376134 7.704630 394 | 8.231599 9.101203 395 | 9.910738 10.694855 396 | 8.645689 7.764589 397 | 8.090245 7.109596 398 | 9.253483 9.813672 399 | 9.331546 8.039386 400 | 9.843256 10.208792 401 | 9.713131 9.247665 402 | 9.259369 10.704622 403 | 10.243948 9.695883 404 | 6.396262 6.456390 405 | 8.936289 8.703871 406 | 8.750846 9.347273 407 | 6.497155 4.130251 408 | 9.516552 10.164848 409 | 9.125766 8.858775 410 | 8.374387 7.300114 411 | 8.132816 7.621107 412 | 10.099505 9.159134 413 | 9.356477 6.869999 414 | 8.112934 7.587547 415 | 7.265396 6.987031 416 | 11.950505 13.715109 417 | 10.745959 10.822171 418 | 8.893270 7.887332 419 | 6.003473 4.960219 420 | 7.498851 6.451334 421 | 10.162072 9.935954 422 | 8.732617 9.177679 423 | 9.300827 9.952360 424 | 11.908436 12.256801 425 | 9.371215 9.188645 426 | 9.943640 9.245037 427 | 7.386450 7.046819 428 | 8.410374 8.293218 429 | 7.830419 6.440253 430 | 8.263140 8.279446 431 | 11.448164 12.192363 432 | 8.216533 9.186628 433 | 9.316128 10.046697 434 | 8.156927 6.834792 435 | 9.951421 11.240598 436 | 9.059607 8.458446 437 | 10.476339 10.560461 438 | 7.548200 7.227127 439 | 9.432204 7.236705 440 | 9.402750 9.126413 441 | 11.188095 13.853426 442 | 9.520201 11.028131 443 | 8.884154 9.764071 444 | 8.961105 8.833117 445 | 8.549663 8.865765 446 | 10.111708 10.515462 447 | 9.024761 9.169368 448 | 7.904149 8.048756 449 | 9.240995 7.796142 450 | 8.126538 6.116125 451 | 7.442148 7.931335 452 | 9.486821 10.091359 453 | 9.834289 11.694720 454 | 9.009714 11.599170 455 | 9.761314 11.344083 456 | 6.993941 6.562988 457 | 8.659524 8.410107 458 | 7.685363 8.097297 459 | 7.793217 6.519109 460 | 8.883454 9.257347 461 | 8.781821 9.231980 462 | 7.946281 7.658978 463 | 8.523959 10.646480 464 | 9.031525 8.649648 465 | 8.317140 7.758978 466 | 9.192417 11.151218 467 | 8.408486 8.282182 468 | 10.327702 11.459048 469 | 8.389687 
8.548727 470 | 8.642250 7.056870 471 | 8.833447 9.267638 472 | 8.805261 8.320281 473 | 9.726211 9.095997 474 | 8.477631 9.507530 475 | 9.738838 9.652110 476 | 8.272108 7.582696 477 | 9.258089 8.495931 478 | 8.334144 8.810766 479 | 8.150904 6.486032 480 | 7.259669 7.270156 481 | 11.034180 11.519954 482 | 10.705432 10.642527 483 | 8.388814 7.159137 484 | 8.559369 7.846284 485 | 7.187988 6.519313 486 | 8.811453 7.765900 487 | 8.492762 7.992941 488 | 8.739752 8.502909 489 | 10.150752 10.420295 490 | 7.062378 5.365289 491 | 8.448195 7.480000 492 | 10.224333 11.592750 493 | 9.533795 9.212845 494 | 9.519492 7.690501 495 | 9.661847 10.376189 496 | 7.963877 8.597193 497 | 10.184486 9.136709 498 | 8.505234 9.159210 499 | 8.187646 8.518690 500 | 9.167590 9.405917 501 | 8.612162 8.518755 502 | 10.970868 10.392229 503 | 9.603649 9.141095 504 | 9.704263 8.830178 505 | 9.657506 8.132449 506 | 9.337882 11.045306 507 | 9.521722 9.537764 508 | 8.954197 8.728179 509 | 8.635658 10.352662 510 | 8.910816 9.020317 511 | 9.900933 9.392002 512 | 10.247105 8.289649 513 | 9.571690 8.171237 514 | 7.388627 7.668071 515 | 8.354008 10.074590 516 | 9.775598 8.835696 517 | 8.768913 7.983604 518 | 8.330199 8.474098 519 | 8.169356 9.361172 520 | 10.346522 10.086434 521 | 7.976144 9.266702 522 | 8.429648 7.865824 523 | 11.261674 11.788587 524 | 10.051066 10.112425 525 | 8.954626 9.789343 526 | 8.382220 8.121012 527 | 9.820642 9.426441 528 | 8.125950 9.695087 529 | 8.646465 7.291808 530 | 8.190202 8.003737 531 | 8.773887 7.306175 532 | 8.731000 10.300436 533 | 9.163098 7.816769 534 | 9.456346 9.223922 535 | 9.645180 9.324053 536 | 8.835060 8.966915 537 | 9.325950 10.943248 538 | 9.941912 9.548535 539 | 9.282799 10.119488 540 | 9.567591 9.462164 541 | 8.529019 9.768001 542 | 9.314824 10.153727 543 | 8.264439 8.273860 544 | 8.307262 8.214036 545 | 9.122041 8.657861 546 | 8.404258 8.389365 547 | 7.828355 8.419433 548 | 9.803180 10.108286 549 | 8.662439 8.581953 550 | 8.883265 8.978377 551 | 8.012330 8.262451 552 | 9.420258 8.974878 553 | 7.015415 6.365940 554 | 9.888832 11.163036 555 | 9.677549 10.346431 556 | 8.410158 7.912899 557 | 9.464147 10.762900 558 | 7.067227 7.035717 559 | 9.320923 10.583089 560 | 9.056917 8.771241 561 | 8.110004 8.387789 562 | 10.310021 10.970014 563 | 8.211185 8.809627 564 | 8.942883 8.840746 565 | 9.479958 8.328700 566 | 8.973982 8.702291 567 | 8.519257 8.764855 568 | 9.424556 8.956911 569 | 7.222919 8.177787 570 | 8.257007 9.700619 571 | 9.778795 9.296134 572 | 8.028806 8.575974 573 | 9.886464 9.965076 574 | 9.090552 6.978930 575 | 9.605548 10.256751 576 | 9.959004 9.610229 577 | 8.308701 9.509124 578 | 7.748293 9.685933 579 | 8.311108 9.428114 580 | 9.697068 10.217956 581 | 9.582991 9.478773 582 | 9.167265 10.198412 583 | 10.329753 10.406602 584 | 8.908819 7.428789 585 | 10.072908 10.393294 586 | 7.992905 9.226629 587 | 8.907696 7.269366 588 | 8.421948 9.342968 589 | 7.481399 7.225033 590 | 10.358408 10.166130 591 | 8.786556 10.279943 592 | 9.658701 11.379367 593 | 10.167807 9.417552 594 | 8.653449 8.656681 595 | 8.020304 8.671270 596 | 8.364348 10.004068 597 | 9.119183 9.788199 598 | 8.405504 9.740580 599 | 11.020930 11.904350 600 | 9.755232 9.515713 601 | 10.059542 9.589748 602 | 8.727131 9.777998 603 | 7.666182 6.028642 604 | 8.870733 8.367501 605 | 9.340446 7.707269 606 | 9.919283 10.796813 607 | 7.905837 8.326034 608 | 10.181187 10.089865 609 | 8.797328 8.981988 610 | 8.466272 7.765032 611 | 10.335914 12.620539 612 | 9.365003 8.609115 613 | 8.011017 7.249489 614 | 10.923993 13.901513 615 | 
7.074631 7.558720 616 | 9.824598 8.851297 617 | 8.861026 8.370857 618 | 10.127296 10.861535 619 | 10.548377 10.855695 620 | 8.880470 7.948761 621 | 8.901619 9.674705 622 | 7.813710 9.246912 623 | 10.128808 10.560668 624 | 11.096699 10.911644 625 | 8.551471 6.871514 626 | 8.907241 8.677815 627 | 10.571647 10.294838 628 | 8.815314 8.810725 629 | 8.453396 8.339296 630 | 9.594819 11.487580 631 | 10.714211 9.628908 632 | 7.428788 7.712869 633 | 10.892119 12.747752 634 | 9.024071 11.112692 635 | 7.803375 7.847038 636 | 8.521558 8.881848 637 | 9.742818 11.520203 638 | 9.832836 9.180396 639 | 8.703132 10.028498 640 | 9.905029 11.347606 641 | 10.037536 8.882688 642 | 8.629995 8.392863 643 | 9.583497 9.219663 644 | 8.781687 9.650598 645 | 9.344119 9.537024 646 | 10.407510 9.223929 647 | 7.244488 6.559021 648 | 10.643616 10.288383 649 | 8.757557 6.947901 650 | 10.784590 11.233350 651 | 10.028427 11.330033 652 | 7.968361 6.830308 653 | 8.925954 8.539113 654 | 7.738692 7.114987 655 | 8.192398 8.352016 656 | 10.412017 12.431122 657 | 8.208801 5.777678 658 | 7.820077 7.790720 659 | 9.542754 11.542541 660 | 6.817938 7.429229 661 | 7.365218 7.956797 662 | 9.274391 7.932700 663 | 9.546475 8.803412 664 | 7.471734 6.797870 665 | 8.016969 7.848070 666 | 8.852701 8.458114 667 | 8.215012 8.468330 668 | 6.975507 6.846980 669 | 9.435134 10.609700 670 | 9.228075 9.342622 671 | 8.388410 7.637856 672 | 7.111456 9.289163 673 | 9.403508 8.482654 674 | 9.133894 8.343575 675 | 10.670801 9.750821 676 | 9.983542 10.074537 677 | 10.012865 8.537017 678 | 8.929895 8.951909 679 | 7.666951 7.473615 680 | 9.493839 7.821783 681 | 8.894081 7.059413 682 | 9.593382 9.859732 683 | 9.126847 8.395700 684 | 9.532945 9.850696 685 | 9.459384 9.384213 686 | 8.982743 8.217062 687 | 10.107798 8.790772 688 | 10.563574 9.044890 689 | 8.278963 9.518790 690 | 8.734960 10.494129 691 | 9.597940 9.530895 692 | 10.025478 9.508270 693 | 10.335922 10.974063 694 | 8.404390 8.146748 695 | 7.108699 6.038469 696 | 8.873951 7.474227 697 | 8.731459 8.154455 698 | 8.795146 7.534687 699 | 6.407165 6.810352 700 | 9.979312 10.287430 701 | 8.786715 8.396736 702 | 10.753339 10.360567 703 | 10.508031 10.321976 704 | 10.636925 10.193797 705 | 10.614322 11.215420 706 | 8.916411 8.965286 707 | 8.112756 8.304769 708 | 10.833109 10.497542 709 | 8.319758 9.727691 710 | 9.945336 11.820097 711 | 10.150461 9.914715 712 | 10.185024 10.388722 713 | 9.793569 9.079955 714 | 10.590128 11.811596 715 | 8.505584 6.884282 716 | 10.461428 10.745439 717 | 8.755781 9.418427 718 | 7.488249 7.172072 719 | 10.238905 10.428659 720 | 9.887827 10.427821 721 | 8.529971 8.838217 722 | 8.375208 10.242837 723 | 8.901724 8.398304 724 | 8.607694 9.173198 725 | 8.691369 9.964261 726 | 9.584578 9.641546 727 | 10.265792 11.405078 728 | 7.592968 6.683355 729 | 8.692791 9.389031 730 | 7.589852 6.005793 731 | 10.550386 11.736584 732 | 8.578351 7.227055 733 | 7.526931 6.875134 734 | 8.577081 9.877115 735 | 9.272136 11.050928 736 | 10.300809 10.653059 737 | 8.642013 9.006681 738 | 9.720491 10.265202 739 | 9.029005 9.646928 740 | 8.736201 7.975603 741 | 8.672886 9.070759 742 | 8.370633 8.412170 743 | 9.483776 9.183341 744 | 6.790842 7.594992 745 | 9.842146 10.156810 746 | 9.563336 7.962532 747 | 8.724669 9.870732 748 | 9.012145 9.171326 749 | 9.116948 9.791167 750 | 6.219094 7.988420 751 | 9.468422 8.359975 752 | 8.825231 8.475208 753 | 9.572224 9.696428 754 | 9.609128 8.488175 755 | 9.428590 10.468998 756 | 8.293266 8.617701 757 | 9.423584 10.355688 758 | 9.240796 9.517228 759 | 10.915423 13.026252 760 | 
10.854684 11.130866 761 | 9.226816 9.391796 762 | 9.580264 10.359235 763 | 7.289907 6.898208 764 | 9.338857 10.374025 765 | 9.523176 11.332190 766 | 10.162233 10.357396 767 | 8.873930 9.207398 768 | 8.607259 7.794804 769 | 8.852325 8.215797 770 | 8.077272 6.501042 771 | 8.169273 8.269613 772 | 6.806421 7.544423 773 | 8.793151 9.691549 774 | 11.640981 11.365702 775 | 9.544082 11.576545 776 | 9.009266 9.605596 777 | 9.726552 9.426719 778 | 9.495888 10.626624 779 | 8.683982 9.337864 780 | 8.322105 8.631099 781 | 8.887895 8.644931 782 | 8.662659 11.373025 783 | 9.263321 7.536016 784 | 7.802624 7.171625 785 | 8.773183 8.561565 786 | 8.730443 10.197596 787 | 8.942915 7.758383 788 | 8.057618 8.774996 789 | 8.112081 8.202349 790 | 10.378884 12.103755 791 | 9.248876 8.637249 792 | 9.739599 9.708576 793 | 8.126345 8.278487 794 | 8.894788 7.966117 795 | 9.683165 9.019221 796 | 10.886957 12.053843 797 | 9.668852 10.902132 798 | 7.486692 6.471138 799 | 8.794850 9.173609 800 | 8.835915 8.296727 801 | 9.443984 11.375344 802 | 8.696621 6.434580 803 | 9.645560 9.233722 804 | 9.623857 7.915590 805 | 10.840632 12.620268 806 | 7.298135 7.356141 807 | 9.639644 8.902389 808 | 9.849802 7.682624 809 | 10.609964 10.259615 810 | 9.768229 11.382811 811 | 7.646351 7.571849 812 | 10.230300 9.470859 813 | 8.224402 8.496866 814 | 6.879671 8.393648 815 | 7.976247 8.667221 816 | 9.183268 8.694550 817 | 11.471853 12.786280 818 | 10.428349 10.615726 819 | 8.090828 5.902504 820 | 9.738627 8.485792 821 | 8.139709 8.396333 822 | 9.508055 8.990529 823 | 8.857260 8.497732 824 | 8.902558 7.014433 825 | 9.660607 11.040833 826 | 8.772221 10.512150 827 | 11.020038 9.354134 828 | 7.918527 7.742062 829 | 7.630835 7.756260 830 | 11.043272 11.041613 831 | 9.299376 8.674157 832 | 9.795087 8.431837 833 | 9.415683 8.312101 834 | 7.942037 6.942913 835 | 9.724790 11.766496 836 | 10.222032 11.550876 837 | 8.894163 8.306020 838 | 8.394309 8.070420 839 | 9.012776 6.880548 840 | 9.661093 10.138921 841 | 9.896472 9.762372 842 | 9.135628 8.759928 843 | 8.762656 10.306028 844 | 8.602473 8.861956 845 | 10.085297 10.464774 846 | 10.644983 10.945767 847 | 9.034571 8.391668 848 | 8.602920 8.501944 849 | 8.224766 7.402758 850 | 8.755050 9.431085 851 | 9.669937 8.641049 852 | 10.693530 10.287124 853 | 9.462806 7.611153 854 | 9.287707 10.082363 855 | 10.941260 10.783728 856 | 9.263080 7.913328 857 | 10.167111 10.225338 858 | 8.783830 9.465345 859 | 8.958624 8.662136 860 | 9.841649 9.926781 861 | 7.205691 6.790638 862 | 8.629089 9.135461 863 | 7.469440 8.450442 864 | 8.179133 7.790434 865 | 8.083984 7.875520 866 | 9.271300 8.135359 867 | 8.652349 8.254397 868 | 7.983920 6.609684 869 | 7.836860 9.785238 870 | 7.418535 7.011256 871 | 8.458288 10.095364 872 | 9.387605 9.726911 873 | 8.663951 8.206705 874 | 10.146507 11.698577 875 | 8.937103 10.990924 876 | 11.218687 11.141945 877 | 8.363142 9.106936 878 | 7.877643 7.122922 879 | 9.620978 9.905689 880 | 9.509649 10.773209 881 | 6.748743 6.705385 882 | 9.300919 8.085029 883 | 9.332257 9.818791 884 | 7.898610 8.366643 885 | 9.841914 9.480675 886 | 6.920484 8.959501 887 | 8.544713 9.563136 888 | 8.162266 6.715277 889 | 8.659552 9.282008 890 | 10.673398 13.174824 891 | 9.024000 10.379238 892 | 8.183292 6.647572 893 | 10.544919 10.649602 894 | 7.201266 6.529605 895 | 9.557407 11.096821 896 | 8.304605 6.940929 897 | 9.742855 9.920897 898 | 10.024587 9.645222 899 | 10.002296 9.998940 900 | 8.965876 8.665419 901 | 7.823136 6.949572 902 | 8.125088 7.654065 903 | 6.569589 6.046863 904 | 10.195497 8.689129 905 | 
11.730011 10.374221 906 | 8.739105 7.457571 907 | 9.820059 10.278526 908 | 9.547456 10.398198 909 | 8.375072 8.416302 910 | 8.889533 8.308929 911 | 8.861201 9.290408 912 | 12.677687 12.788463 913 | 9.100735 8.620537 914 | 7.728350 6.328219 915 | 7.955373 8.355028 916 | 8.733352 8.645414 917 | 10.257527 11.191813 918 | 9.246413 9.497014 919 | 9.745302 9.642035 920 | 7.785652 8.147621 921 | 7.431673 8.566399 922 | 8.654384 8.466701 923 | 8.475392 6.744677 924 | 9.968440 10.765192 925 | 10.163616 10.806963 926 | 10.238135 10.036636 927 | 9.902889 10.746730 928 | 9.523850 8.749708 929 | 9.214363 9.149178 930 | 9.266040 10.841502 931 | 8.494292 7.770942 932 | 10.821158 10.410192 933 | 8.645888 7.970308 934 | 9.885204 10.098080 935 | 9.084990 10.886349 936 | 9.277874 8.871449 937 | 8.135131 7.137064 938 | 7.917379 9.080522 939 | 9.685586 8.822850 940 | 8.558141 7.848112 941 | 9.502917 10.061255 942 | 6.409004 5.164774 943 | 10.149235 10.579951 944 | 7.847304 8.411351 945 | 8.846930 6.819939 946 | 8.675153 9.411147 947 | 9.476276 9.061508 948 | 11.099184 10.644263 949 | 8.792411 10.379405 950 | 8.400418 7.072706 951 | 8.555713 7.923805 952 | 8.024763 8.426993 953 | 8.642696 10.453412 954 | 7.906117 7.920408 955 | 8.793393 9.722878 956 | 8.280364 7.669854 957 | 9.387766 9.706245 958 | 9.626853 10.762499 959 | 10.163631 10.919007 960 | 9.375543 11.513524 961 | 9.309440 8.575699 962 | 10.055329 10.297255 963 | 8.706241 9.097172 964 | 10.032934 11.951897 965 | 10.812974 11.311435 966 | 10.352603 10.819865 967 | 8.276870 9.055403 968 | 8.397389 7.944434 969 | 9.371741 10.395790 970 | 10.825710 10.144099 971 | 9.158483 11.385382 972 | 10.658639 11.389856 973 | 8.091762 6.631039 974 | 10.734892 10.054598 975 | 11.535880 11.604912 976 | 9.799077 11.371677 977 | 8.478725 9.078455 978 | 9.399902 8.947744 979 | 7.305377 8.144973 980 | 7.613377 6.668798 981 | 10.681308 10.830845 982 | 9.973855 10.004133 983 | 9.369918 7.855433 984 | 8.838223 7.429033 985 | 9.521831 10.623930 986 | 9.724419 10.447452 987 | 8.890224 9.275923 988 | 9.932763 11.589953 989 | 10.839337 9.051250 990 | 8.497708 7.521701 991 | 8.440236 8.705670 992 | 9.063566 9.755744 993 | 8.449647 8.929485 994 | 8.554576 8.063231 995 | 10.348606 10.550718 996 | 5.985254 5.186844 997 | 9.931937 10.175582 998 | 9.854922 9.201393 999 | 9.114580 9.134215 1000 | 10.334899 8.543604 1001 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Machine-Learning-With-Python 2 | ======================== 3 | Fix bugs and add new features for personalized projects 4 | 5 | -------------------------------------------------------------------------------- /Recommend/基于item的协同过滤推荐BasedItem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*-coding:utf-8-*- 3 | ''' 4 | Created on 2016-5-30 5 | 6 | @author: thinkgamer 7 | ''' 8 | import math 9 | 10 | class ItemBasedCF: 11 | def __init__(self,train_file): 12 | self.train_file = train_file 13 | self.readData() 14 | def readData(self): 15 | #读取文件,并生成用户-物品的评分表和测试集 16 | self.train = dict() #用户-物品的评分表 17 | for line in open(self.train_file): 18 | # user,item,score = line.strip().split(",") 19 | user,score,item = line.strip().split(",") 20 | self.train.setdefault(user,{}) 21 | self.train[user][item] = int(float(score)) 22 | 23 | def ItemSimilarity(self): 24 | #建立物品-物品的共现矩阵 25 | C = dict() #物品-物品的共现矩阵 26 | N = dict() #物品被多少个不同用户购买 27 | for 
user,items in self.train.items(): 28 | for i in items.keys(): 29 | N.setdefault(i,0) 30 | N[i] += 1 31 | C.setdefault(i,{}) 32 | for j in items.keys(): 33 | if i == j : continue 34 | C[i].setdefault(j,0) 35 | C[i][j] += 1 36 | #计算相似度矩阵 37 | self.W = dict() 38 | for i,related_items in C.items(): 39 | self.W.setdefault(i,{}) 40 | for j,cij in related_items.items(): 41 | self.W[i][j] = cij / (math.sqrt(N[i] * N[j])) 42 | return self.W 43 | 44 | #给用户user推荐,前K个相关用户 45 | def Recommend(self,user,K=3,N=10): 46 | rank = dict() 47 | action_item = self.train[user] #用户user产生过行为的item和评分 48 | for item,score in action_item.items(): 49 | for j,wj in sorted(self.W[item].items(),key=lambda x:x[1],reverse=True)[0:K]: 50 | if j in action_item.keys(): 51 | continue 52 | rank.setdefault(j,0) 53 | rank[j] += score * wj 54 | return dict(sorted(rank.items(),key=lambda x:x[1],reverse=True)[0:N]) 55 | 56 | #声明一个ItemBased推荐的对象 57 | Item = ItemBasedCF("uid_score_bid") 58 | Item.ItemSimilarity() 59 | recommedDic = Item.Recommend("xiyuweilan") 60 | for k,v in recommedDic.iteritems(): 61 | print k,"\t",v -------------------------------------------------------------------------------- /Recommend/基于图的推荐PersonalRank.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/Recommend/基于图的推荐PersonalRank.py -------------------------------------------------------------------------------- /Recommend/基于标签的推荐.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*-coding:utf-8-*- 3 | import random 4 | import math 5 | #统计各类数量 6 | def addValueToMat(theMat,key,value,incr): 7 | if key not in theMat: #如果key没出先在theMat中 8 | theMat[key]=dict(); 9 | theMat[key][value]=incr; 10 | else: 11 | if value not in theMat[key]: 12 | theMat[key][value]=incr; 13 | else: 14 | theMat[key][value]+=incr;#若有值,则递增 15 | 16 | user_tags = dict(); 17 | tag_items = dict(); 18 | user_items = dict(); 19 | user_items_test = dict();#测试集数据字典 20 | item_tags = dict() #用于多样性测试 21 | 22 | #初始化,进行各种统计 23 | def InitStat(): 24 | data_file = open('delicious.dat') 25 | line = data_file.readline(); 26 | while line: 27 | if random.random()>0.1:#将90%的数据作为训练集,剩下10%的数据作为测试集 28 | terms = line.split("\t");#训练集的数据结构是[user, item, tag]形式 29 | user=terms[0]; 30 | item=terms[1]; 31 | tag=terms[2]; 32 | addValueToMat(user_tags,user,tag,1) 33 | addValueToMat(tag_items,tag,item,1) 34 | addValueToMat(user_items,user,item,1) 35 | addValueToMat(item_tags,item,tag,1) 36 | line = data_file.readline(); 37 | else: 38 | addValueToMat(user_items_test,user,item,1) 39 | data_file.close(); 40 | 41 | #推荐算法 42 | def Recommend(usr): 43 | recommend_list = dict(); 44 | tagged_item = user_items[usr];#得到该用户所有推荐过的物品 45 | for tag_,wut in user_tags[usr].items():#用户打过的标签及次数 46 | for item_,wit in tag_items[tag_].items():#物品被打过的标签及被打过的次数 47 | if item_ not in tagged_item:#已经推荐过的不再推荐 48 | if item_ not in recommend_list: 49 | recommend_list[item_]=wut*wit;#根据公式 50 | else: 51 | recommend_list[item_]+=wut*wit; 52 | return sorted(recommend_list.iteritems(), key=lambda a:a[1],reverse=True) 53 | 54 | #统计标签流行度 55 | def TagPopularity(): 56 | tagfreq = {} 57 | for user in user_tags.keys(): 58 | for tag in user_tags[user].keys(): 59 | if tag not in tagfreq: 60 | tagfreq[tag] = 1 61 | else: 62 | tagfreq[tag] +=1 63 | return sorted(tagfreq.iteritems(), key=lambda a:a[1],reverse=True) 64 | 65 | #计算余弦相似度 66 | def 
CosineSim(item_tags,i,j): 67 | ret = 0 68 | for b,wib in item_tags[i].items(): #求物品i,j的标签交集数目 69 | if b in item_tags[j]: 70 | ret += wib * item_tags[j][b] 71 | ni = 0 72 | nj = 0 73 | for b, w in item_tags[i].items(): #统计 i 的标签数目 74 | ni += w * w 75 | for b, w in item_tags[j].items(): #统计 j 的标签数目 76 | nj += w * w 77 | if ret == 0: 78 | return 0 79 | return ret/math.sqrt(ni * nj) #返回余弦值 80 | 81 | #计算推荐列表多样性 82 | def Diversity(item_tags,recommend_items): 83 | ret = 0 84 | n = 0 85 | for i in dict(recommend_items).keys(): 86 | for j in dict(recommend_items).keys(): 87 | if i == j: 88 | continue 89 | ret += CosineSim(item_tags,i,j) 90 | n += 1 91 | return ret/(n * 1.0) 92 | 93 | InitStat() 94 | recommend_list = Recommend("48411") 95 | # print recommend_list 96 | for recommend in recommend_list[:10]: #兴趣度最高的十个itemid 97 | print recommend 98 | 99 | #标签流行度统计 100 | tagFreq = TagPopularity() 101 | for tag in tagFreq[:20]: 102 | print tag 103 | 104 | #推荐列表多样性,计算时间较长 105 | diversityNum = Diversity(item_tags, recommend_list) 106 | print diversityNum -------------------------------------------------------------------------------- /Recommend/基于用户的协同过滤推荐BasedUserCF.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/komeilkma/Machine-Learning-With-Python/8780eb4646292367915096bc18121e7c691a46dc/Recommend/基于用户的协同过滤推荐BasedUserCF.py -------------------------------------------------------------------------------- /Regession/ex0.txt: -------------------------------------------------------------------------------- 1 | 1.000000 0.067732 3.176513 2 | 1.000000 0.427810 3.816464 3 | 1.000000 0.995731 4.550095 4 | 1.000000 0.738336 4.256571 5 | 1.000000 0.981083 4.560815 6 | 1.000000 0.526171 3.929515 7 | 1.000000 0.378887 3.526170 8 | 1.000000 0.033859 3.156393 9 | 1.000000 0.132791 3.110301 10 | 1.000000 0.138306 3.149813 11 | 1.000000 0.247809 3.476346 12 | 1.000000 0.648270 4.119688 13 | 1.000000 0.731209 4.282233 14 | 1.000000 0.236833 3.486582 15 | 1.000000 0.969788 4.655492 16 | 1.000000 0.607492 3.965162 17 | 1.000000 0.358622 3.514900 18 | 1.000000 0.147846 3.125947 19 | 1.000000 0.637820 4.094115 20 | 1.000000 0.230372 3.476039 21 | 1.000000 0.070237 3.210610 22 | 1.000000 0.067154 3.190612 23 | 1.000000 0.925577 4.631504 24 | 1.000000 0.717733 4.295890 25 | 1.000000 0.015371 3.085028 26 | 1.000000 0.335070 3.448080 27 | 1.000000 0.040486 3.167440 28 | 1.000000 0.212575 3.364266 29 | 1.000000 0.617218 3.993482 30 | 1.000000 0.541196 3.891471 31 | 1.000000 0.045353 3.143259 32 | 1.000000 0.126762 3.114204 33 | 1.000000 0.556486 3.851484 34 | 1.000000 0.901144 4.621899 35 | 1.000000 0.958476 4.580768 36 | 1.000000 0.274561 3.620992 37 | 1.000000 0.394396 3.580501 38 | 1.000000 0.872480 4.618706 39 | 1.000000 0.409932 3.676867 40 | 1.000000 0.908969 4.641845 41 | 1.000000 0.166819 3.175939 42 | 1.000000 0.665016 4.264980 43 | 1.000000 0.263727 3.558448 44 | 1.000000 0.231214 3.436632 45 | 1.000000 0.552928 3.831052 46 | 1.000000 0.047744 3.182853 47 | 1.000000 0.365746 3.498906 48 | 1.000000 0.495002 3.946833 49 | 1.000000 0.493466 3.900583 50 | 1.000000 0.792101 4.238522 51 | 1.000000 0.769660 4.233080 52 | 1.000000 0.251821 3.521557 53 | 1.000000 0.181951 3.203344 54 | 1.000000 0.808177 4.278105 55 | 1.000000 0.334116 3.555705 56 | 1.000000 0.338630 3.502661 57 | 1.000000 0.452584 3.859776 58 | 1.000000 0.694770 4.275956 59 | 1.000000 0.590902 3.916191 60 | 1.000000 0.307928 3.587961 61 | 1.000000 0.148364 3.183004 62 | 1.000000 0.702180 
4.225236 63 | 1.000000 0.721544 4.231083 64 | 1.000000 0.666886 4.240544 65 | 1.000000 0.124931 3.222372 66 | 1.000000 0.618286 4.021445 67 | 1.000000 0.381086 3.567479 68 | 1.000000 0.385643 3.562580 69 | 1.000000 0.777175 4.262059 70 | 1.000000 0.116089 3.208813 71 | 1.000000 0.115487 3.169825 72 | 1.000000 0.663510 4.193949 73 | 1.000000 0.254884 3.491678 74 | 1.000000 0.993888 4.533306 75 | 1.000000 0.295434 3.550108 76 | 1.000000 0.952523 4.636427 77 | 1.000000 0.307047 3.557078 78 | 1.000000 0.277261 3.552874 79 | 1.000000 0.279101 3.494159 80 | 1.000000 0.175724 3.206828 81 | 1.000000 0.156383 3.195266 82 | 1.000000 0.733165 4.221292 83 | 1.000000 0.848142 4.413372 84 | 1.000000 0.771184 4.184347 85 | 1.000000 0.429492 3.742878 86 | 1.000000 0.162176 3.201878 87 | 1.000000 0.917064 4.648964 88 | 1.000000 0.315044 3.510117 89 | 1.000000 0.201473 3.274434 90 | 1.000000 0.297038 3.579622 91 | 1.000000 0.336647 3.489244 92 | 1.000000 0.666109 4.237386 93 | 1.000000 0.583888 3.913749 94 | 1.000000 0.085031 3.228990 95 | 1.000000 0.687006 4.286286 96 | 1.000000 0.949655 4.628614 97 | 1.000000 0.189912 3.239536 98 | 1.000000 0.844027 4.457997 99 | 1.000000 0.333288 3.513384 100 | 1.000000 0.427035 3.729674 101 | 1.000000 0.466369 3.834274 102 | 1.000000 0.550659 3.811155 103 | 1.000000 0.278213 3.598316 104 | 1.000000 0.918769 4.692514 105 | 1.000000 0.886555 4.604859 106 | 1.000000 0.569488 3.864912 107 | 1.000000 0.066379 3.184236 108 | 1.000000 0.335751 3.500796 109 | 1.000000 0.426863 3.743365 110 | 1.000000 0.395746 3.622905 111 | 1.000000 0.694221 4.310796 112 | 1.000000 0.272760 3.583357 113 | 1.000000 0.503495 3.901852 114 | 1.000000 0.067119 3.233521 115 | 1.000000 0.038326 3.105266 116 | 1.000000 0.599122 3.865544 117 | 1.000000 0.947054 4.628625 118 | 1.000000 0.671279 4.231213 119 | 1.000000 0.434811 3.791149 120 | 1.000000 0.509381 3.968271 121 | 1.000000 0.749442 4.253910 122 | 1.000000 0.058014 3.194710 123 | 1.000000 0.482978 3.996503 124 | 1.000000 0.466776 3.904358 125 | 1.000000 0.357767 3.503976 126 | 1.000000 0.949123 4.557545 127 | 1.000000 0.417320 3.699876 128 | 1.000000 0.920461 4.613614 129 | 1.000000 0.156433 3.140401 130 | 1.000000 0.656662 4.206717 131 | 1.000000 0.616418 3.969524 132 | 1.000000 0.853428 4.476096 133 | 1.000000 0.133295 3.136528 134 | 1.000000 0.693007 4.279071 135 | 1.000000 0.178449 3.200603 136 | 1.000000 0.199526 3.299012 137 | 1.000000 0.073224 3.209873 138 | 1.000000 0.286515 3.632942 139 | 1.000000 0.182026 3.248361 140 | 1.000000 0.621523 3.995783 141 | 1.000000 0.344584 3.563262 142 | 1.000000 0.398556 3.649712 143 | 1.000000 0.480369 3.951845 144 | 1.000000 0.153350 3.145031 145 | 1.000000 0.171846 3.181577 146 | 1.000000 0.867082 4.637087 147 | 1.000000 0.223855 3.404964 148 | 1.000000 0.528301 3.873188 149 | 1.000000 0.890192 4.633648 150 | 1.000000 0.106352 3.154768 151 | 1.000000 0.917886 4.623637 152 | 1.000000 0.014855 3.078132 153 | 1.000000 0.567682 3.913596 154 | 1.000000 0.068854 3.221817 155 | 1.000000 0.603535 3.938071 156 | 1.000000 0.532050 3.880822 157 | 1.000000 0.651362 4.176436 158 | 1.000000 0.901225 4.648161 159 | 1.000000 0.204337 3.332312 160 | 1.000000 0.696081 4.240614 161 | 1.000000 0.963924 4.532224 162 | 1.000000 0.981390 4.557105 163 | 1.000000 0.987911 4.610072 164 | 1.000000 0.990947 4.636569 165 | 1.000000 0.736021 4.229813 166 | 1.000000 0.253574 3.500860 167 | 1.000000 0.674722 4.245514 168 | 1.000000 0.939368 4.605182 169 | 1.000000 0.235419 3.454340 170 | 1.000000 0.110521 3.180775 171 | 1.000000 
0.218023 3.380820 172 | 1.000000 0.869778 4.565020 173 | 1.000000 0.196830 3.279973 174 | 1.000000 0.958178 4.554241 175 | 1.000000 0.972673 4.633520 176 | 1.000000 0.745797 4.281037 177 | 1.000000 0.445674 3.844426 178 | 1.000000 0.470557 3.891601 179 | 1.000000 0.549236 3.849728 180 | 1.000000 0.335691 3.492215 181 | 1.000000 0.884739 4.592374 182 | 1.000000 0.918916 4.632025 183 | 1.000000 0.441815 3.756750 184 | 1.000000 0.116598 3.133555 185 | 1.000000 0.359274 3.567919 186 | 1.000000 0.814811 4.363382 187 | 1.000000 0.387125 3.560165 188 | 1.000000 0.982243 4.564305 189 | 1.000000 0.780880 4.215055 190 | 1.000000 0.652565 4.174999 191 | 1.000000 0.870030 4.586640 192 | 1.000000 0.604755 3.960008 193 | 1.000000 0.255212 3.529963 194 | 1.000000 0.730546 4.213412 195 | 1.000000 0.493829 3.908685 196 | 1.000000 0.257017 3.585821 197 | 1.000000 0.833735 4.374394 198 | 1.000000 0.070095 3.213817 199 | 1.000000 0.527070 3.952681 200 | 1.000000 0.116163 3.129283 201 | -------------------------------------------------------------------------------- /Regession/ex1.txt: -------------------------------------------------------------------------------- 1 | 1.000000 0.635975 4.093119 2 | 1.000000 0.552438 3.804358 3 | 1.000000 0.855922 4.456531 4 | 1.000000 0.083386 3.187049 5 | 1.000000 0.975802 4.506176 6 | 1.000000 0.181269 3.171914 7 | 1.000000 0.129156 3.053996 8 | 1.000000 0.605648 3.974659 9 | 1.000000 0.301625 3.542525 10 | 1.000000 0.698805 4.234199 11 | 1.000000 0.226419 3.405937 12 | 1.000000 0.519290 3.932469 13 | 1.000000 0.354424 3.514051 14 | 1.000000 0.118380 3.105317 15 | 1.000000 0.512811 3.843351 16 | 1.000000 0.236795 3.576074 17 | 1.000000 0.353509 3.544471 18 | 1.000000 0.481447 3.934625 19 | 1.000000 0.060509 3.228226 20 | 1.000000 0.174090 3.300232 21 | 1.000000 0.806818 4.331785 22 | 1.000000 0.531462 3.908166 23 | 1.000000 0.853167 4.386918 24 | 1.000000 0.304804 3.617260 25 | 1.000000 0.612021 4.082411 26 | 1.000000 0.620880 3.949470 27 | 1.000000 0.580245 3.984041 28 | 1.000000 0.742443 4.251907 29 | 1.000000 0.110770 3.115214 30 | 1.000000 0.742687 4.234319 31 | 1.000000 0.574390 3.947544 32 | 1.000000 0.986378 4.532519 33 | 1.000000 0.294867 3.510392 34 | 1.000000 0.472125 3.927832 35 | 1.000000 0.872321 4.631825 36 | 1.000000 0.843537 4.482263 37 | 1.000000 0.864577 4.487656 38 | 1.000000 0.341874 3.486371 39 | 1.000000 0.097980 3.137514 40 | 1.000000 0.757874 4.212660 41 | 1.000000 0.877656 4.506268 42 | 1.000000 0.457993 3.800973 43 | 1.000000 0.475341 3.975979 44 | 1.000000 0.848391 4.494447 45 | 1.000000 0.746059 4.244715 46 | 1.000000 0.153462 3.019251 47 | 1.000000 0.694256 4.277945 48 | 1.000000 0.498712 3.812414 49 | 1.000000 0.023580 3.116973 50 | 1.000000 0.976826 4.617363 51 | 1.000000 0.624004 4.005158 52 | 1.000000 0.472220 3.874188 53 | 1.000000 0.390551 3.630228 54 | 1.000000 0.021349 3.145849 55 | 1.000000 0.173488 3.192618 56 | 1.000000 0.971028 4.540226 57 | 1.000000 0.595302 3.835879 58 | 1.000000 0.097638 3.141948 59 | 1.000000 0.745972 4.323316 60 | 1.000000 0.676390 4.204829 61 | 1.000000 0.488949 3.946710 62 | 1.000000 0.982873 4.666332 63 | 1.000000 0.296060 3.482348 64 | 1.000000 0.228008 3.451286 65 | 1.000000 0.671059 4.186388 66 | 1.000000 0.379419 3.595223 67 | 1.000000 0.285170 3.534446 68 | 1.000000 0.236314 3.420891 69 | 1.000000 0.629803 4.115553 70 | 1.000000 0.770272 4.257463 71 | 1.000000 0.493052 3.934798 72 | 1.000000 0.631592 4.154963 73 | 1.000000 0.965676 4.587470 74 | 1.000000 0.598675 3.944766 75 | 1.000000 0.351997 3.480517 
76 | 1.000000 0.342001 3.481382 77 | 1.000000 0.661424 4.253286 78 | 1.000000 0.140912 3.131670 79 | 1.000000 0.373574 3.527099 80 | 1.000000 0.223166 3.378051 81 | 1.000000 0.908785 4.578960 82 | 1.000000 0.915102 4.551773 83 | 1.000000 0.410940 3.634259 84 | 1.000000 0.754921 4.167016 85 | 1.000000 0.764453 4.217570 86 | 1.000000 0.101534 3.237201 87 | 1.000000 0.780368 4.353163 88 | 1.000000 0.819868 4.342184 89 | 1.000000 0.173990 3.236950 90 | 1.000000 0.330472 3.509404 91 | 1.000000 0.162656 3.242535 92 | 1.000000 0.476283 3.907937 93 | 1.000000 0.636391 4.108455 94 | 1.000000 0.758737 4.181959 95 | 1.000000 0.778372 4.251103 96 | 1.000000 0.936287 4.538462 97 | 1.000000 0.510904 3.848193 98 | 1.000000 0.515737 3.974757 99 | 1.000000 0.437823 3.708323 100 | 1.000000 0.828607 4.385210 101 | 1.000000 0.556100 3.927788 102 | 1.000000 0.038209 3.187881 103 | 1.000000 0.321993 3.444542 104 | 1.000000 0.067288 3.199263 105 | 1.000000 0.774989 4.285745 106 | 1.000000 0.566077 3.878557 107 | 1.000000 0.796314 4.155745 108 | 1.000000 0.746600 4.197772 109 | 1.000000 0.360778 3.524928 110 | 1.000000 0.397321 3.525692 111 | 1.000000 0.062142 3.211318 112 | 1.000000 0.379250 3.570495 113 | 1.000000 0.248238 3.462431 114 | 1.000000 0.682561 4.206177 115 | 1.000000 0.355393 3.562322 116 | 1.000000 0.889051 4.595215 117 | 1.000000 0.733806 4.182694 118 | 1.000000 0.153949 3.320695 119 | 1.000000 0.036104 3.122670 120 | 1.000000 0.388577 3.541312 121 | 1.000000 0.274481 3.502135 122 | 1.000000 0.319401 3.537559 123 | 1.000000 0.431653 3.712609 124 | 1.000000 0.960398 4.504875 125 | 1.000000 0.083660 3.262164 126 | 1.000000 0.122098 3.105583 127 | 1.000000 0.415299 3.742634 128 | 1.000000 0.854192 4.566589 129 | 1.000000 0.925574 4.630884 130 | 1.000000 0.109306 3.190539 131 | 1.000000 0.805161 4.289105 132 | 1.000000 0.344474 3.406602 133 | 1.000000 0.769116 4.251899 134 | 1.000000 0.182003 3.183214 135 | 1.000000 0.225972 3.342508 136 | 1.000000 0.413088 3.747926 137 | 1.000000 0.964444 4.499998 138 | 1.000000 0.203334 3.350089 139 | 1.000000 0.285574 3.539554 140 | 1.000000 0.850209 4.443465 141 | 1.000000 0.061561 3.290370 142 | 1.000000 0.426935 3.733302 143 | 1.000000 0.389376 3.614803 144 | 1.000000 0.096918 3.175132 145 | 1.000000 0.148938 3.164284 146 | 1.000000 0.893738 4.619629 147 | 1.000000 0.195527 3.426648 148 | 1.000000 0.407248 3.670722 149 | 1.000000 0.224357 3.412571 150 | 1.000000 0.045963 3.110330 151 | 1.000000 0.944647 4.647928 152 | 1.000000 0.756552 4.164515 153 | 1.000000 0.432098 3.730603 154 | 1.000000 0.990511 4.609868 155 | 1.000000 0.649699 4.094111 156 | 1.000000 0.584879 3.907636 157 | 1.000000 0.785934 4.240814 158 | 1.000000 0.029945 3.106915 159 | 1.000000 0.075747 3.201181 160 | 1.000000 0.408408 3.872302 161 | 1.000000 0.583851 3.860890 162 | 1.000000 0.497759 3.884108 163 | 1.000000 0.421301 3.696816 164 | 1.000000 0.140320 3.114540 165 | 1.000000 0.546465 3.791233 166 | 1.000000 0.843181 4.443487 167 | 1.000000 0.295390 3.535337 168 | 1.000000 0.825059 4.417975 169 | 1.000000 0.946343 4.742471 170 | 1.000000 0.350404 3.470964 171 | 1.000000 0.042787 3.113381 172 | 1.000000 0.352487 3.594600 173 | 1.000000 0.590736 3.914875 174 | 1.000000 0.120748 3.108492 175 | 1.000000 0.143140 3.152725 176 | 1.000000 0.511926 3.994118 177 | 1.000000 0.496358 3.933417 178 | 1.000000 0.382802 3.510829 179 | 1.000000 0.252464 3.498402 180 | 1.000000 0.845894 4.460441 181 | 1.000000 0.132023 3.245277 182 | 1.000000 0.442301 3.771067 183 | 1.000000 0.266889 3.434771 184 | 1.000000 
0.008575 2.999612 185 | 1.000000 0.897632 4.454221 186 | 1.000000 0.533171 3.985348 187 | 1.000000 0.285243 3.557982 188 | 1.000000 0.377258 3.625972 189 | 1.000000 0.486995 3.922226 190 | 1.000000 0.305993 3.547421 191 | 1.000000 0.277528 3.580944 192 | 1.000000 0.750899 4.268081 193 | 1.000000 0.694756 4.278096 194 | 1.000000 0.870158 4.517640 195 | 1.000000 0.276457 3.555461 196 | 1.000000 0.017761 3.055026 197 | 1.000000 0.802046 4.354819 198 | 1.000000 0.559275 3.894387 199 | 1.000000 0.941305 4.597773 200 | 1.000000 0.856877 4.523616 201 | -------------------------------------------------------------------------------- /Regession/regession.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf8-*- 2 | ''' 3 | Created on 2016年5月14日 4 | 5 | @author: Gamer Think 6 | ''' 7 | 8 | from numpy import * 9 | #加载数据集 10 | def loadDataSet(filename): 11 | numFeat = len(open(filename).readline().split("\t")) -1 12 | dataMat = []; labelMat = [] 13 | fr = open(filename) 14 | for line in fr.readlines(): 15 | lineArr = [] 16 | curLine = line.strip().split("\t") 17 | for i in range(numFeat): 18 | lineArr.append(float(curLine[i])) 19 | 20 | dataMat.append(lineArr) 21 | labelMat.append(float(curLine[-1])) 22 | 23 | return dataMat,labelMat 24 | 25 | #====================用线性回归找到最佳拟合曲线=========== 26 | #计算最佳拟合曲线 27 | def standRegress(xArr,yArr): 28 | xMat = mat(xArr); yMat = mat(yArr).T #.T代表转置矩阵 29 | xTx = xMat.T * xMat 30 | if linalg.det(xTx) ==0.0: #linalg.det(xTx) 计算行列式的值 31 | print "This matrix is singular , cannot do inverse" 32 | return 33 | ws = xTx.I * (xMat.T * yMat) 34 | return ws 35 | 36 | #测试上边的函数 37 | xArr,yArr = loadDataSet("ex0.txt") 38 | ws = standRegress(xArr, yArr) 39 | print "ws(相关系数):",ws #ws 存放的就是回归系数 40 | 41 | #画图展示 42 | def show(): 43 | import matplotlib.pyplot as plt 44 | xMat = mat(xArr); yMat = mat(yArr) 45 | yHat = xMat*ws 46 | fig = plt.figure() #创建绘图对象 47 | ax = fig.add_subplot(111) #111表示将画布划分为1行2列选择使用从上到下第一块 48 | #scatter绘制散点图 49 | ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0]) 50 | #复制,排序 51 | xCopy =xMat.copy() 52 | xCopy.sort(0) 53 | yHat = xCopy * ws 54 | #plot画线 55 | ax.plot(xCopy[:,1],yHat) 56 | plt.show() 57 | 58 | # show() 59 | 60 | #利用numpy库提供的corrcoef来计算预测值和真实值得相关性 61 | yHat = mat(xArr) * ws #yHat = xMat * ws 62 | print "相关性:",corrcoef(yHat.T,mat(yArr)) 63 | #====================用线性回归找到最佳拟合曲线=========== 64 | 65 | ''' 66 | #==================局部加权线性回归================ 67 | 68 | def lwlr(testPoint,xArr,yArr,k=1.0): 69 | xMat = mat(xArr); yMat = mat(yArr).T 70 | m = shape(xMat)[0] 71 | weights = mat(eye((m))) #产生对角线矩阵 72 | for j in range(m): 73 | diffMat = testPoint - xMat[j,:] 74 | #更新权重值,以指数级递减 75 | weights[j,j] = exp(diffMat * diffMat.T /(-2.0*k**2)) 76 | xTx = xMat.T * (weights * xMat) 77 | if linalg.det(xTx) == 0.0: 78 | print "this matrix is singular,cannot do inverse" 79 | return 80 | ws = xTx.I * (xMat.T * (weights * yMat)) 81 | return testPoint * ws 82 | 83 | def lwlrTest(testArr,xArr,yArr,k=1.0): 84 | m = shape(testArr)[0] 85 | yHat = zeros(m) 86 | for i in range(m): 87 | yHat[i] =lwlr(testArr[i],xArr,yArr,k) 88 | return yHat 89 | 90 | 91 | xArr,yArr = loadDataSet('ex0.txt') 92 | print "k=1.0:",lwlr(xArr[0],xArr,yArr,1.0) 93 | print "k=0.001:",lwlr(xArr[0],xArr,yArr,0.001) 94 | print "k=0.003:",lwlr(xArr[0],xArr,yArr,0.003) 95 | 96 | #画图 97 | def showlwlr(): 98 | yHat = lwlrTest(xArr, xArr, yArr, 0.01) 99 | xMat = mat(xArr) 100 | srtInd = xMat[:,1].argsort(0) 101 | xSort = xMat[srtInd][:,0,:] 102 | 
103 | import matplotlib.pyplot as plt 104 | fig = plt.figure() #创建绘图对象 105 | ax = fig.add_subplot(111) #111表示将画布划分为1行2列选择使用从上到下第一块 106 | ax.plot(xSort[:,1],yHat[srtInd]) 107 | #scatter绘制散点图 108 | ax.scatter(xMat[:,1].flatten().A[0],mat(yArr).T[:,0].flatten().A[0],s=2,c='red') 109 | plt.show() 110 | 111 | showlwlr() 112 | ''' 113 | ''' 114 | #=========================岭回归================== 115 | #用于计算回归系数 116 | def ridgeRegres(xMat,yMat,lam=0.2): 117 | xTx = xMat.T * xMat 118 | denom = xTx + eye(shape(xMat)[1]) * lam 119 | if linalg.det(denom)==0.0: 120 | print "This matrix is singular, cannot do inverse" 121 | return 122 | ws = denom.I * (xMat.T * yMat) 123 | return ws 124 | 125 | #用于在一组lambda上做测试 126 | def ridgeTest(xArr,yArr): 127 | xMat = mat(xArr); yMat = mat(yArr).T 128 | yMean = mean(yMat,0) 129 | #数据标准化 130 | yMat = yMat - yMean 131 | xMeans = mean(xMat,0) 132 | xVar = var(xMat,0) 133 | xMat = (xMat - xMeans)/xVar 134 | 135 | numTestPts = 30 136 | wMat = zeros((numTestPts, shape(xMat)[1])) 137 | for i in range(numTestPts): 138 | ws = ridgeRegres(xMat, yMat, exp(i-10)) 139 | wMat[i,:]=ws.T 140 | return wMat 141 | 142 | abX,abY = loadDataSet('abalone.txt') 143 | ridgeWeights = ridgeTest(abX,abY) 144 | # print ridgeWeights 145 | 146 | def showRidge(): 147 | import matplotlib.pyplot as plt 148 | fig = plt.figure() 149 | ax = fig.add_subplot(111) 150 | ax.plot(ridgeWeights) 151 | plt.show() 152 | 153 | showRidge() 154 | #===================岭回归============= 155 | ''' 156 | #===================向前逐步回归============ 157 | 158 | #计算平方误差 159 | def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays 160 | return ((yArr-yHatArr)**2).sum() 161 | 162 | #数据标准化处理 163 | def regularize(xMat):#regularize by columns 164 | inMat = xMat.copy() 165 | inMeans = mean(inMat,0) #calc mean then subtract it off 166 | inVar = var(inMat,0) #calc variance of Xi then divide by it 167 | inMat = (inMat - inMeans)/inVar 168 | return inMat 169 | 170 | 171 | def stageWise(xArr,yArr,eps=0.01,numIt=100): 172 | xMat = mat(xArr); yMat=mat(yArr).T 173 | yMean = mean(yMat,0) 174 | yMat = yMat - yMean #can also regularize ys but will get smaller coef 175 | xMat = regularize(xMat) 176 | m,n=shape(xMat) 177 | returnMat = zeros((numIt,n)) #testing code remove 178 | ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy() 179 | for i in range(numIt):#could change this to while loop 180 | #print ws.T 181 | lowestError = inf; 182 | for j in range(n): 183 | for sign in [-1,1]: 184 | wsTest = ws.copy() 185 | wsTest[j] += eps*sign 186 | yTest = xMat*wsTest 187 | rssE = rssError(yMat.A,yTest.A) 188 | if rssE < lowestError: 189 | lowestError = rssE 190 | wsMax = wsTest 191 | ws = wsMax.copy() 192 | returnMat[i,:]=ws.T 193 | return returnMat 194 | 195 | xArr,yArr = loadDataSet('abalone.txt') 196 | print stageWise(xArr, yArr, 0.01, 200),"\n\n" 197 | 198 | # print stageWise(xArr, yArr, 0.001, 200) 199 | 200 | xMat = mat(xArr) 201 | yMat = mat(yArr).T 202 | xMat = regularize(xMat) 203 | yM = mean(yMat,0) 204 | yMat = yMat - yM 205 | weights = standRegress(xMat, yMat.T) 206 | print weights.T -------------------------------------------------------------------------------- /sklearn/README.md: -------------------------------------------------------------------------------- 1 | 0: line_regression——回归分析之Sklearn实现电力预测
2 | http://blog.csdn.net/Gamer_gyt/article/details/78467021
3 | -------------------------------------------------------------------------------- /sklearn/line_regression/sk_linreg.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2017 Register 6 | # 7 | # Distributed under terms of the GPLv3 license. 8 | 9 | """ 10 | """ 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.linear_model import LinearRegression 13 | import pandas as pd 14 | import numpy as np 15 | 16 | # read the data with pandas 17 | data = pd.read_csv("Folds5x2_pp.csv") 18 | print data.shape 19 | 20 | # prepare the sample features and the sample target 21 | X = data[["AT","V","AP","RH"]] 22 | print X.shape 23 | y = data[["PE"]] 24 | print y.shape 25 | X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=1) # hold out a test set 26 | linreg = LinearRegression() 27 | linreg.fit(X_train,y_train) 28 | 29 | # training done, inspect the fitted intercept and coefficients 30 | print linreg.intercept_ 31 | print linreg.coef_ 32 | 33 | y_pred = linreg.predict(X_test) 34 | from sklearn import metrics 35 | 36 | # use sklearn to compute MSE and RMSE on the held-out test set 37 | print "MSE:",metrics.mean_squared_error(y_test, y_pred) 38 | print "RMSE:",np.sqrt(metrics.mean_squared_error(y_test, y_pred)) 39 | 40 | # 10-fold cross validation 41 | from sklearn.model_selection import cross_val_predict 42 | predicted = cross_val_predict(linreg,X,y,cv=10) 43 | print "MSE:",metrics.mean_squared_error(y, predicted) 44 | print "RMSE:",np.sqrt(metrics.mean_squared_error(y, predicted)) 45 | 46 | # plot predicted against measured values 47 | import matplotlib.pyplot as plt 48 | fig, ax = plt.subplots() 49 | ax.scatter(y, predicted) 50 | ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4) 51 | ax.set_xlabel('Measured') 52 | ax.set_ylabel('Predicted') 53 | plt.show() 54 | --------------------------------------------------------------------------------
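
A quick way to check the workflow in sk_linreg.py when Folds5x2_pp.csv is not on disk is to run the same split / fit / evaluate steps on synthetic data. The sketch below is only an illustration: the random feature matrix, the made-up coefficients, the noise level and the file name linreg_sanity_check.py are assumptions for the demo, not part of the repository.

# linreg_sanity_check.py -- hypothetical standalone sketch, not part of the repo
# Mirrors the split / fit / evaluate flow of sk_linreg.py on synthetic data.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

rng = np.random.RandomState(0)
X = rng.rand(500, 4)                                # stand-in for the AT, V, AP, RH columns
true_coef = np.array([-1.9, -0.3, 0.06, -0.16])     # made-up coefficients for the demo
y = 450.0 + X.dot(true_coef) + rng.normal(scale=0.5, size=500)   # noisy linear target

# hold out a quarter of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

linreg = LinearRegression()
linreg.fit(X_train, y_train)
print("intercept: %s" % linreg.intercept_)
print("coef: %s" % linreg.coef_)

y_pred = linreg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE: %.4f" % mse)
print("RMSE: %.4f" % np.sqrt(mse))

Swapping the synthetic X and y for the pandas columns read from Folds5x2_pp.csv should leave the rest of the steps unchanged.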