├── .gitignore
├── CollectiveIntelligence
├── chapter10
│   ├── Thumbs.db
│   ├── articles.txt
│   ├── clusters.py
│   ├── docclass.py
│   ├── features.txt
│   ├── newsfeatures.py
│   ├── nnmf.py
│   ├── stockfeatures.txt
│   └── stockvolume.py
├── chapter11
│   └── gp.py
├── chapter2
│   ├── deliciousrec.py
│   ├── pydelicious.py
│   └── recommendations.py
├── chapter3
│   ├── Thumbs.db
│   ├── blogdata.txt
│   ├── clusters.py
│   ├── downloadzebodata.py
│   ├── feedlist.txt
│   ├── generatefeedvector.py
│   └── zebo.txt
├── chapter4
│   ├── nn.py
│   └── searchengine.py
├── chapter5
│   ├── dorm.py
│   ├── kayak.py
│   ├── optimization.py
│   ├── schedule.txt
│   └── socialnetwork.py
├── chapter6
│   ├── docclass.py
│   ├── feedfilter.py
│   ├── python_search.xml
│   ├── test.db
│   └── test1.db
├── chapter7
│   ├── Thumbs.db
│   ├── addresslist.txt
│   ├── hotornot.py
│   ├── treepredict.py
│   └── zillow.py
├── chapter8
│   ├── ebaypredict.py
│   ├── numpredict.py
│   └── optimization.py
└── chapter9
│   ├── advancedclassify.py
│   ├── agesonly.csv
│   ├── facebook.py
│   ├── matchmaker.csv
│   └── svm.py
├── KNN
├── CF Recommendation System.py
├── __init__.py
├── knn-Euclidean Distance.ipynb
├── knn.ipynb
└── knn.py
├── LICENSE
├── README.md
├── Untitled Diagram.png
├── Untitled Diagram.xml
├── com.xml
├── data
└── iris.data.csv
├── kaggle
├── Chapter_1.1.ipynb
├── Chapter_1.4.ipynb
└── Datasets
│   └── Breast-Cancer
│   └── breast-cancer-test.csv
├── scripts
├── consumer.py
└── producer.py
├── work_one
├── Asyncio_hello.py
├── FLASK_app.py
├── IO.py
├── WSGI_hello.py
├── WSGI_server.py
├── code.jpg
├── distributed
│   ├── task_master.py
│   └── task_worker.py
├── leet.py
├── mydict.py
├── mydict2.py
├── mydict_test.py
├── myfile.py
├── requestUrlTest.py
├── script1.py
├── sina.html
├── templates
│   ├── form.html
│   ├── home.html
│   └── signin-ok.html
├── test.db
├── test1.jpg
├── test22.jpg
├── test33.png
├── test44.bmp
├── work_GUI.py
├── work_HTMLParser.py
├── work_PILImageDraw.py
├── work_TCP_client.py
├── work_TCP_server.py
├── work_UDP_client.py
├── work_UDP_server.py
├── work_data_MYSQL.py
├── work_data_SQLAlchemy.py
├── work_data_SQLite.py
├── work_mail_POP3.py
├── work_mail_SMTP.py
└── work_register.py
└── work_two_Crawler
├── 86.jpg
├── 93.jpg
├── 94.jpg
├── Download.py
├── __init__.py
├── catch_blog.py
├── catch_blog3.py
├── catch_blog5.py
├── catch_img.py
├── catch_mongo_mzui.py
├── catch_mongodb_mzi.py
├── catch_mongodb_queue.py
├── catch_mzui.py
├── catch_tianmao_rating.py
├── save_cookie.py
├── test2.py
├── 《幸福之路》(一)——开篇.txt
└── 南极人天猫评价.csv
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea 91 | node_modules 92 | saveit.txt 93 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter10/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/CollectiveIntelligence/chapter10/Thumbs.db -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter10/docclass.py: -------------------------------------------------------------------------------- 1 | import re 2 | import math 3 | import cPickle 4 | from pysqlite2 import dbapi2 as sqlite 5 | 6 | def getwords(doc): 7 | splitter=re.compile('\\W*') 8 | words=[s.lower() for s in splitter.split(doc) 9 | if len(s)>2 and len(s)<20] 10 | 11 | # Return the unique set of words only 12 | return dict([(w,1) for w in words]) 13 | 14 | #def entryfeatures(entry): 15 | 16 | def sampletrain(cl): 17 | cl.train('Nobody owns the water.','good') 18 | cl.train('the quick rabbit jumps fences','good') 19 | cl.train('buy pharmaceuticals now','bad') 20 | cl.train('make quick money at the online casino','bad') 21 | cl.train('the quick brown fox jumps','good') 22 | 23 | class classifier: 24 | def __init__(self,getfeatures): 25 | self.fc={} 26 | self.cc={} 27 | self.getfeatures=getfeatures 28 | 29 | def setdb(self,dbfile): 30 | self.con=sqlite.connect(dbfile) 31 | self.con.execute('create table if not exists fc(feature,category,count)') 32 | self.con.execute('create table if not exists cc(category,count)') 33 | 34 | def incf(self,f,cat): 35 | count=self.fcount(f,cat) 36 | if count==0: 37 | self.con.execute("insert into fc values ('%s','%s',1)" 38 | % (f,cat)) 39 | else: 40 | self.con.execute( 41 | "update fc set count=%d where feature='%s' and category='%s'" 42 | % (count+1,f,cat)) 43 | 44 | def fcount(self,f,cat): 45 | res=self.con.execute( 46 | 'select count from fc where feature="%s" and category="%s"' 47 | %(f,cat)).fetchone() 48 | if res==None: return 0 49 | else: return float(res[0]) 50 | 51 | def incc(self,cat): 52 | count=self.catcount(cat) 53 | if count==0: 54 | self.con.execute("insert into cc values ('%s',1)" % (cat)) 55 | else: 56 | self.con.execute("update cc set count=%d where category='%s'" 57 | % (count+1,cat)) 58 | 59 | def catcount(self,cat): 60 | res=self.con.execute('select count from cc where category="%s"' 61 | %(cat)).fetchone() 62 | if res==None: return 0.0 63 | else: return float(res[0]) 64 | 65 | def categories(self): 66 | cur=self.con.execute('select category from cc'); 67 | return [d[0] 
for d in cur] 68 | 69 | def totalcount(self): 70 | res=self.con.execute('select sum(count) from cc').fetchone(); 71 | if res==None: return 0 72 | return res[0] 73 | 74 | 75 | """ 76 | def incf(self,f,cat): 77 | self.fc.setdefault(f,{}) 78 | self.fc[f].setdefault(cat,0) 79 | self.fc[f][cat]+=1 80 | 81 | def incc(self,cat): 82 | self.cc.setdefault(cat,0) 83 | self.cc[cat]+=1 84 | 85 | def fcount(self,f,cat): 86 | if f in self.fc and cat in self.fc[f]: 87 | return float(self.fc[f][cat]) 88 | return 0.0 89 | 90 | def catcount(self,cat): 91 | if cat in self.cc: 92 | return float(self.cc[cat]) 93 | return 0 94 | 95 | def totalcount(self): 96 | return sum(self.cc.values()) 97 | 98 | def categories(self): 99 | return self.cc.keys() 100 | """ 101 | 102 | 103 | def train(self,item,cat): 104 | features=self.getfeatures(item) 105 | for f in features: 106 | self.incf(f,cat) 107 | self.incc(cat) 108 | self.con.commit() 109 | 110 | def fprob(self,f,cat): 111 | if self.catcount(cat)==0: return 0 112 | return self.fcount(f,cat)/self.catcount(cat) 113 | 114 | def setfilename(self,filename): 115 | self.filename=filename 116 | self.restoredata() 117 | 118 | def restoredata(self): 119 | try: f=file(self.filename,'rb') 120 | except: return 121 | self.fc=cPickle.load(f) 122 | self.cc=cPickle.load(f) 123 | f.close() 124 | 125 | def savedata(self): 126 | f=file(self.filename,'wb') 127 | cPickle.dump(self.fc,f,True) 128 | cPickle.dump(self.cc,f,True) 129 | f.close() 130 | def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5): 131 | basicprob=prf(f,cat) 132 | totals=sum([self.fcount(f,c) for c in self.categories()]) 133 | bp=((weight*ap)+(totals*basicprob))/(weight+totals) 134 | return bp 135 | 136 | 137 | 138 | class naivebayes(classifier): 139 | def __init__(self,getfeatures): 140 | classifier.__init__(self,getfeatures) 141 | self.thresholds={} 142 | 143 | def setthreshold(self,cat,t): 144 | self.thresholds[cat]=t 145 | 146 | def getthreshold(self,cat): 147 | if cat not in self.thresholds: return 1.0 148 | return self.thresholds[cat] 149 | 150 | def classify(self,item,default=None): 151 | probs={} 152 | max=0.0 153 | for cat in self.categories(): 154 | probs[cat]=self.prob(item,cat) 155 | if probs[cat]>max: 156 | max=probs[cat] 157 | best=cat 158 | for cat in probs: 159 | if cat==best: continue 160 | if probs[cat]*self.getthreshold(best)>probs[best]: return default 161 | return best 162 | 163 | def docprob(self,item,cat): 164 | features=self.getfeatures(item) 165 | p=1 166 | for f in features: p*=self.weightedprob(f,cat,self.fprob) 167 | return p 168 | 169 | 170 | def prob(self,item,cat): 171 | catprob=self.catcount(cat)/self.totalcount() 172 | docprob=self.docprob(item,cat) 173 | return docprob*catprob 174 | 175 | class fisherclassifier(classifier): 176 | def __init__(self,getfeatures): 177 | classifier.__init__(self,getfeatures) 178 | self.minimums={} 179 | 180 | def setminimum(self,cat,min): 181 | self.minimums[cat]=min 182 | 183 | def getminimum(self,cat): 184 | if cat not in self.minimums: return 0 185 | return self.minimums[cat] 186 | 187 | def classify(self,item,default=None): 188 | best=default 189 | max=0.0 190 | for c in self.categories(): 191 | p=self.fisherprob(item,c) 192 | if p>self.getminimum(c) and p>max: 193 | best=c 194 | max=p 195 | return best 196 | 197 | 198 | def cprob(self,f,cat): 199 | # The frequency of this feature in this category 200 | clf=self.fprob(f,cat) 201 | 202 | if clf==0: return 0.0 203 | 204 | # The frequency of this feature in all the categories 205 | freqsum=sum([self.fprob(f,c) 
for c in self.categories()]) 206 | 207 | # The probability is the frequency in this category divided by 208 | # the overall frequency 209 | p=clf/(freqsum) 210 | 211 | return p 212 | 213 | 214 | def fisherprob(self,item,cat): 215 | p=1 216 | features=self.getfeatures(item) 217 | for f in features: 218 | p*=(self.weightedprob(f,cat,self.cprob)) 219 | fscore=-2*math.log(p) 220 | return self.chi2P(fscore,len(features)*2) 221 | 222 | def chi2P(self,chi,df): 223 | m = chi / 2.0 224 | sum = term = math.exp(-m) 225 | for i in range(1, df//2): 226 | term *= m / i 227 | sum += term 228 | return min(sum, 1.0) 229 | 230 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter10/features.txt: -------------------------------------------------------------------------------- 1 | ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 2 | (6.58003120192, u'The Abs Diet by David Zinczenko') 3 | (5.9231935598, u"I did'nt diet to get in shape for Trinidad's Carnival.....") 4 | (5.04673654071, u'Sensible Diet & Exercise') 5 | 6 | ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 7 | (6.52183126318, u'food/exercise Monday 10/1') 8 | (5.94642162786, u'diet/exercise 10/4') 9 | (5.3332773133, u'food/exercise Friday 10/5') 10 | 11 | ['food', 'calories', 'than', 'easy', 'high', 'come'] 12 | (9.98464450123, u'High or low fat food? Easy trick for figuring it out') 13 | (3.41252863148, u'Oatmeal, cereal of choice.') 14 | (3.19119866786, u'Food and Workout Log 10.8.07') 15 | 16 | ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 17 | (7.46811621754, u'saturday') 18 | (5.62839188358, u'diet-exercise thursday') 19 | (5.29370213306, u'sleepy food/fitness thursday') 20 | 21 | ['food', 'home', 'then', 'exercise', 'morning', 'went'] 22 | (5.22083940456, u'Food & Exercise -- 10/5/2007') 23 | (5.16310413391, u'Food & Exercise -- 10/4/2007') 24 | (4.75585045074, u'Food & Exercise -- 9/28/2007 (yesterday)') 25 | 26 | ['fats', 'quot', 'this', 'good', 'about', 'like'] 27 | (14.9233786406, u'Good fats bad fats') 28 | (1.3775418859, u'Should we ban marathons?') 29 | (1.37194239805, u'Food & Exercise -- 10/3/2007') 30 | 31 | ['quot', 'they', 'money', 'want', 'very', 'best'] 32 | (6.1620884463, u'More about the Chicago marathon') 33 | (5.58276496802, u'LOUIE + LINESMAKER = $$$$') 34 | (4.04959173123, u'High or low fat food? 
Easy trick for figuring it out') 35 | 36 | ['that', 'much', 'does', 'exercise', 'this', 'morning'] 37 | (7.73926153154, u'Food & Exercise -- 10/7/2007') 38 | (5.96451663382, u'< 1g, etc.') 39 | (3.81276353396, u"why I'm succeeding, finally, with my fitness") 40 | 41 | ['with', 'your', 'weight', 'have', 'control', 'about'] 42 | (6.78756986407, u'Control ur Weight') 43 | (5.54567450388, u'Flu-Busting Chicken Soup') 44 | (5.21079777525, u'Weight Loss Tips') 45 | 46 | ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 47 | (5.58477112035, u'Food and Workout Log 9.27.08') 48 | (5.48488799917, u'Food and Workout Log 10.3.07') 49 | (5.10395750879, u'Food and Workout Log 10.10.07') 50 | 51 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter10/newsfeatures.py: -------------------------------------------------------------------------------- 1 | import feedparser 2 | import re 3 | 4 | 5 | feedlist=['http://today.reuters.com/rss/topNews', 6 | 'http://today.reuters.com/rss/domesticNews', 7 | 'http://today.reuters.com/rss/worldNews', 8 | 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml', 9 | 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml', 10 | 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml', 11 | 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml', 12 | 'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml', 13 | 'http://www.nytimes.com/services/xml/rss/nyt/International.xml', 14 | 'http://news.google.com/?output=rss', 15 | 'http://feeds.salon.com/salon/news', 16 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss', 17 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss', 18 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss', 19 | 'http://rss.cnn.com/rss/edition.rss', 20 | 'http://rss.cnn.com/rss/edition_world.rss', 21 | 'http://rss.cnn.com/rss/edition_us.rss'] 22 | 23 | def stripHTML(h): 24 | p='' 25 | s=0 26 | for c in h: 27 | if c=='<': s=1 28 | elif c=='>': 29 | s=0 30 | p+=' ' 31 | elif s==0: p+=c 32 | return p 33 | 34 | 35 | def separatewords(text): 36 | splitter=re.compile('\\W*') 37 | return [s.lower() for s in splitter.split(text) if len(s)>3] 38 | 39 | def getarticlewords(): 40 | allwords={} 41 | articlewords=[] 42 | articletitles=[] 43 | ec=0 44 | # Loop over every feed 45 | for feed in feedlist: 46 | f=feedparser.parse(feed) 47 | 48 | # Loop over every article 49 | for e in f.entries: 50 | # Ignore identical articles 51 | if e.title in articletitles: continue 52 | 53 | # Extract the words 54 | txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8')) 55 | words=separatewords(txt) 56 | articlewords.append({}) 57 | articletitles.append(e.title) 58 | 59 | # Increase the counts for this word in allwords and in articlewords 60 | for word in words: 61 | allwords.setdefault(word,0) 62 | allwords[word]+=1 63 | articlewords[ec].setdefault(word,0) 64 | articlewords[ec][word]+=1 65 | ec+=1 66 | return allwords,articlewords,articletitles 67 | 68 | def makematrix(allw,articlew): 69 | wordvec=[] 70 | 71 | # Only take words that are common but not too common 72 | for w,c in allw.items(): 73 | if c>3 and c10: 35 | out.write(item) 36 | for user in range(0,currentuser): 37 | if user in owners: out.write('\t1') 38 | else: out.write('\t0') 39 | out.write('\n') 40 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter3/feedlist.txt: -------------------------------------------------------------------------------- 1 | 
http://feeds.feedburner.com/37signals/beMH 2 | http://feeds.feedburner.com/blogspot/bRuz 3 | http://battellemedia.com/index.xml 4 | http://blog.guykawasaki.com/index.rdf 5 | http://blog.outer-court.com/rss.xml 6 | http://feeds.searchenginewatch.com/sewblog 7 | http://blog.topix.net/index.rdf 8 | http://blogs.abcnews.com/theblotter/index.rdf 9 | http://feeds.feedburner.com/ConsumingExperienceFull 10 | http://flagrantdisregard.com/index.php/feed/ 11 | http://featured.gigaom.com/feed/ 12 | http://gizmodo.com/index.xml 13 | http://gofugyourself.typepad.com/go_fug_yourself/index.rdf 14 | http://googleblog.blogspot.com/rss.xml 15 | http://feeds.feedburner.com/GoogleOperatingSystem 16 | http://headrush.typepad.com/creating_passionate_users/index.rdf 17 | http://feeds.feedburner.com/instapundit/main 18 | http://jeremy.zawodny.com/blog/rss2.xml 19 | http://joi.ito.com/index.rdf 20 | http://feeds.feedburner.com/Mashable 21 | http://michellemalkin.com/index.rdf 22 | http://moblogsmoproblems.blogspot.com/rss.xml 23 | http://newsbusters.org/node/feed 24 | http://beta.blogger.com/feeds/27154654/posts/full?alt=rss 25 | http://feeds.feedburner.com/paulstamatiou 26 | http://powerlineblog.com/index.rdf 27 | http://feeds.feedburner.com/Publishing20 28 | http://radar.oreilly.com/index.rdf 29 | http://scienceblogs.com/pharyngula/index.xml 30 | http://scobleizer.wordpress.com/feed/ 31 | http://sethgodin.typepad.com/seths_blog/index.rdf 32 | http://rss.slashdot.org/Slashdot/slashdot 33 | http://thinkprogress.org/feed/ 34 | http://feeds.feedburner.com/andrewsullivan/rApM 35 | http://wilwheaton.typepad.com/wwdnbackup/index.rdf 36 | http://www.43folders.com/feed/ 37 | http://www.456bereastreet.com/feed.xml 38 | http://www.autoblog.com/rss.xml 39 | http://www.bloggersblog.com/rss.xml 40 | http://www.bloglines.com/rss/about/news 41 | http://www.blogmaverick.com/rss.xml 42 | http://www.boingboing.net/index.rdf 43 | http://www.buzzmachine.com/index.xml 44 | http://www.captainsquartersblog.com/mt/index.rdf 45 | http://www.coolhunting.com/index.rdf 46 | http://feeds.copyblogger.com/Copyblogger 47 | http://feeds.feedburner.com/crooksandliars/YaCP 48 | http://feeds.dailykos.com/dailykos/index.xml 49 | http://www.deadspin.com/index.xml 50 | http://www.downloadsquad.com/rss.xml 51 | http://www.engadget.com/rss.xml 52 | http://www.gapingvoid.com/index.rdf 53 | http://www.gawker.com/index.xml 54 | http://www.gothamist.com/index.rdf 55 | http://www.huffingtonpost.com/raw_feed_index.rdf 56 | http://www.hyperorg.com/blogger/index.rdf 57 | http://www.joelonsoftware.com/rss.xml 58 | http://www.joystiq.com/rss.xml 59 | http://www.kotaku.com/index.xml 60 | http://feeds.kottke.org/main 61 | http://www.lifehack.org/feed/ 62 | http://www.lifehacker.com/index.xml 63 | http://littlegreenfootballs.com/weblog/lgf-rss.php 64 | http://www.makezine.com/blog/index.xml 65 | http://www.mattcutts.com/blog/feed/ 66 | http://xml.metafilter.com/rss.xml 67 | http://www.mezzoblue.com/rss/index.xml 68 | http://www.micropersuasion.com/index.rdf 69 | http://www.neilgaiman.com/journal/feed/rss.xml 70 | http://www.oilman.ca/feed/ 71 | http://www.perezhilton.com/index.xml 72 | http://www.plasticbag.org/index.rdf 73 | http://www.powazek.com/rss.xml 74 | http://www.problogger.net/feed/ 75 | http://feeds.feedburner.com/QuickOnlineTips 76 | http://www.readwriteweb.com/rss.xml 77 | http://www.schneier.com/blog/index.rdf 78 | http://scienceblogs.com/sample/combined.xml 79 | http://www.seroundtable.com/index.rdf 80 | http://www.shoemoney.com/feed/ 81 | 
http://www.sifry.com/alerts/index.rdf 82 | http://www.simplebits.com/xml/rss.xml 83 | http://feeds.feedburner.com/Spikedhumor 84 | http://www.stevepavlina.com/blog/feed 85 | http://www.talkingpointsmemo.com/index.xml 86 | http://www.tbray.org/ongoing/ongoing.rss 87 | http://feeds.feedburner.com/TechCrunch 88 | http://www.techdirt.com/techdirt_rss.xml 89 | http://www.techeblog.com/index.php/feed/ 90 | http://www.thesuperficial.com/index.xml 91 | http://www.tmz.com/rss.xml 92 | http://www.treehugger.com/index.rdf 93 | http://www.tuaw.com/rss.xml 94 | http://www.valleywag.com/index.xml 95 | http://www.we-make-money-not-art.com/index.rdf 96 | http://www.wired.com/rss/index.xml 97 | http://www.wonkette.com/index.xml 98 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter3/generatefeedvector.py: -------------------------------------------------------------------------------- 1 | import feedparser 2 | import re 3 | 4 | # Returns title and dictionary of word counts for an RSS feed 5 | def getwordcounts(url): 6 | # Parse the feed 7 | d=feedparser.parse(url) 8 | wc={} 9 | 10 | # Loop over all the entries 11 | for e in d.entries: 12 | if 'summary' in e: summary=e.summary 13 | else: summary=e.description 14 | 15 | # Extract a list of words 16 | words=getwords(e.title+' '+summary) 17 | for word in words: 18 | wc.setdefault(word,0) 19 | wc[word]+=1 20 | return d.feed.title,wc 21 | 22 | def getwords(html): 23 | # Remove all the HTML tags 24 | txt=re.compile(r'<[^>]+>').sub('',html) 25 | 26 | # Split words by all non-alpha characters 27 | words=re.compile(r'[^A-Z^a-z]+').split(txt) 28 | 29 | # Convert to lowercase 30 | return [word.lower() for word in words if word!=''] 31 | 32 | 33 | apcount={} 34 | wordcounts={} 35 | feedlist=[line for line in file('feedlist.txt')] 36 | for feedurl in feedlist: 37 | try: 38 | title,wc=getwordcounts(feedurl) 39 | wordcounts[title]=wc 40 | for word,count in wc.items(): 41 | apcount.setdefault(word,0) 42 | if count>1: 43 | apcount[word]+=1 44 | except: 45 | print 'Failed to parse feed %s' % feedurl 46 | 47 | wordlist=[] 48 | for w,bc in apcount.items(): 49 | frac=float(bc)/len(feedlist) 50 | if frac>0.1 and frac<0.5: 51 | wordlist.append(w) 52 | 53 | out=file('blogdata1.txt','w') 54 | out.write('Blog') 55 | for word in wordlist: out.write('\t%s' % word) 56 | out.write('\n') 57 | for blog,wc in wordcounts.items(): 58 | print blog 59 | out.write(blog) 60 | for word in wordlist: 61 | if word in wc: out.write('\t%d' % wc[word]) 62 | else: out.write('\t0') 63 | out.write('\n') 64 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter4/nn.py: -------------------------------------------------------------------------------- 1 | from math import tanh 2 | from pysqlite2 import dbapi2 as sqlite 3 | 4 | def dtanh(y): 5 | return 1.0-y*y 6 | 7 | class searchnet: 8 | def __init__(self,dbname): 9 | self.con=sqlite.connect(dbname) 10 | 11 | def __del__(self): 12 | self.con.close() 13 | 14 | def maketables(self): 15 | self.con.execute('create table hiddennode(create_key)') 16 | self.con.execute('create table wordhidden(fromid,toid,strength)') 17 | self.con.execute('create table hiddenurl(fromid,toid,strength)') 18 | self.con.commit() 19 | 20 | def getstrength(self,fromid,toid,layer): 21 | if layer==0: table='wordhidden' 22 | else: table='hiddenurl' 23 | res=self.con.execute('select strength from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchone() 24 | if 
res==None: 25 | if layer==0: return -0.2 26 | if layer==1: return 0 27 | return res[0] 28 | 29 | def setstrength(self,fromid,toid,layer,strength): 30 | if layer==0: table='wordhidden' 31 | else: table='hiddenurl' 32 | res=self.con.execute('select rowid from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchone() 33 | if res==None: 34 | self.con.execute('insert into %s (fromid,toid,strength) values (%d,%d,%f)' % (table,fromid,toid,strength)) 35 | else: 36 | rowid=res[0] 37 | self.con.execute('update %s set strength=%f where rowid=%d' % (table,strength,rowid)) 38 | 39 | def generatehiddennode(self,wordids,urls): 40 | if len(wordids)>3: return None 41 | # Check if we already created a node for this set of words 42 | sorted_words=[str(id) for id in wordids] 43 | sorted_words.sort() 44 | createkey='_'.join(sorted_words) 45 | res=self.con.execute( 46 | "select rowid from hiddennode where create_key='%s'" % createkey).fetchone() 47 | 48 | # If not, create it 49 | if res==None: 50 | cur=self.con.execute( 51 | "insert into hiddennode (create_key) values ('%s')" % createkey) 52 | hiddenid=cur.lastrowid 53 | # Put in some default weights 54 | for wordid in wordids: 55 | self.setstrength(wordid,hiddenid,0,1.0/len(wordids)) 56 | for urlid in urls: 57 | self.setstrength(hiddenid,urlid,1,0.1) 58 | self.con.commit() 59 | 60 | def getallhiddenids(self,wordids,urlids): 61 | l1={} 62 | for wordid in wordids: 63 | cur=self.con.execute( 64 | 'select toid from wordhidden where fromid=%d' % wordid) 65 | for row in cur: l1[row[0]]=1 66 | for urlid in urlids: 67 | cur=self.con.execute( 68 | 'select fromid from hiddenurl where toid=%d' % urlid) 69 | for row in cur: l1[row[0]]=1 70 | return l1.keys() 71 | 72 | def setupnetwork(self,wordids,urlids): 73 | # value lists 74 | self.wordids=wordids 75 | self.hiddenids=self.getallhiddenids(wordids,urlids) 76 | self.urlids=urlids 77 | 78 | # node outputs 79 | self.ai = [1.0]*len(self.wordids) 80 | self.ah = [1.0]*len(self.hiddenids) 81 | self.ao = [1.0]*len(self.urlids) 82 | 83 | # create weights matrix 84 | self.wi = [[self.getstrength(wordid,hiddenid,0) 85 | for hiddenid in self.hiddenids] 86 | for wordid in self.wordids] 87 | self.wo = [[self.getstrength(hiddenid,urlid,1) 88 | for urlid in self.urlids] 89 | for hiddenid in self.hiddenids] 90 | 91 | def feedforward(self): 92 | # the only inputs are the query words 93 | for i in range(len(self.wordids)): 94 | self.ai[i] = 1.0 95 | 96 | # hidden activations 97 | for j in range(len(self.hiddenids)): 98 | sum = 0.0 99 | for i in range(len(self.wordids)): 100 | sum = sum + self.ai[i] * self.wi[i][j] 101 | self.ah[j] = tanh(sum) 102 | 103 | # output activations 104 | for k in range(len(self.urlids)): 105 | sum = 0.0 106 | for j in range(len(self.hiddenids)): 107 | sum = sum + self.ah[j] * self.wo[j][k] 108 | self.ao[k] = tanh(sum) 109 | 110 | return self.ao[:] 111 | 112 | def getresult(self,wordids,urlids): 113 | self.setupnetwork(wordids,urlids) 114 | return self.feedforward() 115 | 116 | def backPropagate(self, targets, N=0.5): 117 | # calculate errors for output 118 | output_deltas = [0.0] * len(self.urlids) 119 | for k in range(len(self.urlids)): 120 | error = targets[k]-self.ao[k] 121 | output_deltas[k] = dtanh(self.ao[k]) * error 122 | 123 | # calculate errors for hidden layer 124 | hidden_deltas = [0.0] * len(self.hiddenids) 125 | for j in range(len(self.hiddenids)): 126 | error = 0.0 127 | for k in range(len(self.urlids)): 128 | error = error + output_deltas[k]*self.wo[j][k] 129 | hidden_deltas[j] = 
dtanh(self.ah[j]) * error 130 | 131 | # update output weights 132 | for j in range(len(self.hiddenids)): 133 | for k in range(len(self.urlids)): 134 | change = output_deltas[k]*self.ah[j] 135 | self.wo[j][k] = self.wo[j][k] + N*change 136 | 137 | # update input weights 138 | for i in range(len(self.wordids)): 139 | for j in range(len(self.hiddenids)): 140 | change = hidden_deltas[j]*self.ai[i] 141 | self.wi[i][j] = self.wi[i][j] + N*change 142 | 143 | def trainquery(self,wordids,urlids,selectedurl): 144 | # generate a hidden node if necessary 145 | self.generatehiddennode(wordids,urlids) 146 | 147 | self.setupnetwork(wordids,urlids) 148 | self.feedforward() 149 | targets=[0.0]*len(urlids) 150 | targets[urlids.index(selectedurl)]=1.0 151 | error = self.backPropagate(targets) 152 | self.updatedatabase() 153 | 154 | def updatedatabase(self): 155 | # set them to database values 156 | for i in range(len(self.wordids)): 157 | for j in range(len(self.hiddenids)): 158 | self.setstrength(self.wordids[i],self. hiddenids[j],0,self.wi[i][j]) 159 | for j in range(len(self.hiddenids)): 160 | for k in range(len(self.urlids)): 161 | self.setstrength(self.hiddenids[j],self.urlids[k],1,self.wo[j][k]) 162 | self.con.commit() 163 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter5/dorm.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | # The dorms, each of which has two available spaces 5 | dorms=['Zeus','Athena','Hercules','Bacchus','Pluto'] 6 | 7 | # People, along with their first and second choices 8 | prefs=[('Toby', ('Bacchus', 'Hercules')), 9 | ('Steve', ('Zeus', 'Pluto')), 10 | ('Karen', ('Athena', 'Zeus')), 11 | ('Sarah', ('Zeus', 'Pluto')), 12 | ('Dave', ('Athena', 'Bacchus')), 13 | ('Jeff', ('Hercules', 'Pluto')), 14 | ('Fred', ('Pluto', 'Athena')), 15 | ('Suzie', ('Bacchus', 'Hercules')), 16 | ('Laura', ('Bacchus', 'Hercules')), 17 | ('James', ('Hercules', 'Athena'))] 18 | 19 | # [(0,9),(0,8),(0,7),(0,6),...,(0,0)] 20 | domain=[(0,(len(dorms)*2)-i-1) for i in range(0,len(dorms)*2)] 21 | 22 | def printsolution(vec): 23 | slots=[] 24 | # Create two slots for each dorm 25 | for i in range(len(dorms)): slots+=[i,i] 26 | 27 | # Loop over each students assignment 28 | for i in range(len(vec)): 29 | x=int(vec[i]) 30 | 31 | # Choose the slot from the remaining ones 32 | dorm=dorms[slots[x]] 33 | # Show the student and assigned dorm 34 | print prefs[i][0],dorm 35 | # Remove this slot 36 | del slots[x] 37 | 38 | def dormcost(vec): 39 | cost=0 40 | # Create list a of slots 41 | slots=[0,0,1,1,2,2,3,3,4,4] 42 | 43 | # Loop over each student 44 | for i in range(len(vec)): 45 | x=int(vec[i]) 46 | dorm=dorms[slots[x]] 47 | pref=prefs[i][1] 48 | # First choice costs 0, second choice costs 1 49 | if pref[0]==dorm: cost+=0 50 | elif pref[1]==dorm: cost+=1 51 | else: cost+=3 52 | # Not on the list costs 3 53 | 54 | # Remove selected slot 55 | del slots[x] 56 | 57 | return cost 58 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter5/kayak.py: -------------------------------------------------------------------------------- 1 | import time 2 | import urllib2 3 | import xml.dom.minidom 4 | 5 | kayakkey='YOUR KEY HERE' 6 | 7 | def getkayaksession(): 8 | # Construct the URL to start a session 9 | url='http://www.kayak.com/k/ident/apisession?token=%s&version=1' % kayakkey 10 | 11 | # Parse the resulting XML 12 | 
doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 13 | 14 | # Find xxxxxxxx 15 | sid=doc.getElementsByTagName('sid')[0].firstChild.data 16 | return sid 17 | 18 | def flightsearch(sid,origin,destination,depart_date): 19 | 20 | # Construct search URL 21 | url='http://www.kayak.com/s/apisearch?basicmode=true&oneway=y&origin=%s' % origin 22 | url+='&destination=%s&depart_date=%s' % (destination,depart_date) 23 | url+='&return_date=none&depart_time=a&return_time=a' 24 | url+='&travelers=1&cabin=e&action=doFlights&apimode=1' 25 | url+='&_sid_=%s&version=1' % (sid) 26 | 27 | # Get the XML 28 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 29 | 30 | # Extract the search ID 31 | searchid=doc.getElementsByTagName('searchid')[0].firstChild.data 32 | 33 | return searchid 34 | 35 | def flightsearchresults(sid,searchid): 36 | def parseprice(p): 37 | return float(p[1:].replace(',','')) 38 | 39 | # Polling loop 40 | while 1: 41 | time.sleep(2) 42 | 43 | # Construct URL for polling 44 | url='http://www.kayak.com/s/basic/flight?' 45 | url+='searchid=%s&c=5&apimode=1&_sid_=%s&version=1' % (searchid,sid) 46 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 47 | 48 | # Look for morepending tag, and wait until it is no longer true 49 | morepending=doc.getElementsByTagName('morepending')[0].firstChild 50 | if morepending==None or morepending.data=='false': break 51 | 52 | # Now download the complete list 53 | url='http://www.kayak.com/s/basic/flight?' 54 | url+='searchid=%s&c=999&apimode=1&_sid_=%s&version=1' % (searchid,sid) 55 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 56 | 57 | # Get the various elements as lists 58 | prices=doc.getElementsByTagName('price') 59 | departures=doc.getElementsByTagName('depart') 60 | arrivals=doc.getElementsByTagName('arrive') 61 | 62 | # Zip them together 63 | return zip([p.firstChild.data.split(' ')[1] for p in departures], 64 | [p.firstChild.data.split(' ')[1] for p in arrivals], 65 | [parseprice(p.firstChild.data) for p in prices]) 66 | 67 | 68 | def createschedule(people,dest,dep,ret): 69 | # Get a session id for these searches 70 | sid=getkayaksession() 71 | flights={} 72 | 73 | for p in people: 74 | name,origin=p 75 | # Outbound flight 76 | searchid=flightsearch(sid,origin,dest,dep) 77 | flights[(origin,dest)]=flightsearchresults(sid,searchid) 78 | 79 | # Return flight 80 | searchid=flightsearch(sid,dest,origin,ret) 81 | flights[(dest,origin)]=flightsearchresults(sid,searchid) 82 | 83 | return flights 84 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter5/optimization.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import math 4 | 5 | people = [('Seymour','BOS'), 6 | ('Franny','DAL'), 7 | ('Zooey','CAK'), 8 | ('Walt','MIA'), 9 | ('Buddy','ORD'), 10 | ('Les','OMA')] 11 | # Laguardia 12 | destination='LGA' 13 | 14 | flights={} 15 | # 16 | for line in file('schedule.txt'): 17 | origin,dest,depart,arrive,price=line.strip().split(',') 18 | flights.setdefault((origin,dest),[]) 19 | 20 | # Add details to the list of possible flights 21 | flights[(origin,dest)].append((depart,arrive,int(price))) 22 | 23 | def getminutes(t): 24 | x=time.strptime(t,'%H:%M') 25 | return x[3]*60+x[4] 26 | 27 | def printschedule(r): 28 | for d in range(len(r)/2): 29 | name=people[d][0] 30 | origin=people[d][1] 31 | out=flights[(origin,destination)][int(r[d])] 32 | 
ret=flights[(destination,origin)][int(r[d+1])] 33 | print '%10s%10s %5s-%5s $%3s %5s-%5s $%3s' % (name,origin, 34 | out[0],out[1],out[2], 35 | ret[0],ret[1],ret[2]) 36 | 37 | def schedulecost(sol): 38 | totalprice=0 39 | latestarrival=0 40 | earliestdep=24*60 41 | 42 | for d in range(len(sol)/2): 43 | # Get the inbound and outbound flights 44 | origin=people[d][1] 45 | outbound=flights[(origin,destination)][int(sol[d])] 46 | returnf=flights[(destination,origin)][int(sol[d+1])] 47 | 48 | # Total price is the price of all outbound and return flights 49 | totalprice+=outbound[2] 50 | totalprice+=returnf[2] 51 | 52 | # Track the latest arrival and earliest departure 53 | if latestarrivalgetminutes(returnf[0]): earliestdep=getminutes(returnf[0]) 55 | 56 | # Every person must wait at the airport until the latest person arrives. 57 | # They also must arrive at the same time and wait for their flights. 58 | totalwait=0 59 | for d in range(len(sol)/2): 60 | origin=people[d][1] 61 | outbound=flights[(origin,destination)][int(sol[d])] 62 | returnf=flights[(destination,origin)][int(sol[d+1])] 63 | totalwait+=latestarrival-getminutes(outbound[1]) 64 | totalwait+=getminutes(returnf[0])-earliestdep 65 | 66 | # Does this solution require an extra day of car rental? That'll be $50! 67 | if latestarrival>earliestdep: totalprice+=50 68 | 69 | return totalprice+totalwait 70 | 71 | def randomoptimize(domain,costf): 72 | best=999999999 73 | bestr=None 74 | for i in range(0,1000): 75 | # Create a random solution 76 | r=[float(random.randint(domain[i][0],domain[i][1])) 77 | for i in range(len(domain))] 78 | 79 | # Get the cost 80 | cost=costf(r) 81 | 82 | # Compare it to the best one so far 83 | if costdomain[j][0]: 100 | neighbors.append(sol[0:j]+[sol[j]+1]+sol[j+1:]) 101 | if sol[j]0.1: 124 | # Choose one of the indices 125 | i=random.randint(0,len(domain)-1) 126 | 127 | # Choose a direction to change it 128 | dir=random.randint(-step,step) 129 | 130 | # Create a new list with one of the values changed 131 | vecb=vec[:] 132 | vecb[i]+=dir 133 | if vecb[i]domain[i][1]: vecb[i]=domain[i][1] 135 | 136 | # Calculate the current cost and the new cost 137 | ea=costf(vec) 138 | eb=costf(vecb) 139 | p=pow(math.e,(-eb-ea)/T) 140 | 141 | # Is it better, or does it make the probability 142 | # cutoff? 
143 | if (ebdomain[i][0]: 156 | return vec[0:i]+[vec[i]-step]+vec[i+1:] 157 | elif vec[i]0 and ua<1 and ub>0 and ub<1: 45 | total+=1 46 | for i in range(len(people)): 47 | for j in range(i+1,len(people)): 48 | # Get the locations of the two nodes 49 | (x1,y1),(x2,y2)=loc[people[i]],loc[people[j]] 50 | 51 | # Find the distance between them 52 | dist=math.sqrt(math.pow(x1-x2,2)+math.pow(y1-y2,2)) 53 | # Penalize any nodes closer than 50 pixels 54 | if dist<50: 55 | total+=(1.0-(dist/50.0)) 56 | 57 | return total 58 | from PIL import Image,ImageDraw 59 | 60 | def drawnetwork(sol): 61 | # Create the image 62 | img=Image.new('RGB',(400,400),(255,255,255)) 63 | draw=ImageDraw.Draw(img) 64 | 65 | # Create the position dict 66 | pos=dict([(people[i],(sol[i*2],sol[i*2+1])) for i in range(0,len(people))]) 67 | 68 | for (a,b) in links: 69 | draw.line((pos[a],pos[b]),fill=(255,0,0)) 70 | 71 | for n,p in pos.items(): 72 | draw.text(p,n,(0,0,0)) 73 | 74 | img.show() 75 | 76 | 77 | domain=[(10,370)]*(len(people)*2) -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter6/docclass.py: -------------------------------------------------------------------------------- 1 | from pysqlite2 import dbapi2 as sqlite 2 | import re 3 | import math 4 | 5 | def getwords(doc): 6 | splitter=re.compile('\\W*') 7 | print doc 8 | # Split the words by non-alpha characters 9 | words=[s.lower() for s in splitter.split(doc) 10 | if len(s)>2 and len(s)<20] 11 | 12 | # Return the unique set of words only 13 | return dict([(w,1) for w in words]) 14 | 15 | class classifier: 16 | def __init__(self,getfeatures,filename=None): 17 | # Counts of feature/category combinations 18 | self.fc={} 19 | # Counts of documents in each category 20 | self.cc={} 21 | self.getfeatures=getfeatures 22 | 23 | def setdb(self,dbfile): 24 | self.con=sqlite.connect(dbfile) 25 | self.con.execute('create table if not exists fc(feature,category,count)') 26 | self.con.execute('create table if not exists cc(category,count)') 27 | 28 | 29 | def incf(self,f,cat): 30 | count=self.fcount(f,cat) 31 | if count==0: 32 | self.con.execute("insert into fc values ('%s','%s',1)" 33 | % (f,cat)) 34 | else: 35 | self.con.execute( 36 | "update fc set count=%d where feature='%s' and category='%s'" 37 | % (count+1,f,cat)) 38 | 39 | def fcount(self,f,cat): 40 | res=self.con.execute( 41 | 'select count from fc where feature="%s" and category="%s"' 42 | %(f,cat)).fetchone() 43 | if res==None: return 0 44 | else: return float(res[0]) 45 | 46 | def incc(self,cat): 47 | count=self.catcount(cat) 48 | if count==0: 49 | self.con.execute("insert into cc values ('%s',1)" % (cat)) 50 | else: 51 | self.con.execute("update cc set count=%d where category='%s'" 52 | % (count+1,cat)) 53 | 54 | def catcount(self,cat): 55 | res=self.con.execute('select count from cc where category="%s"' 56 | %(cat)).fetchone() 57 | if res==None: return 0 58 | else: return float(res[0]) 59 | 60 | def categories(self): 61 | cur=self.con.execute('select category from cc'); 62 | return [d[0] for d in cur] 63 | 64 | def totalcount(self): 65 | res=self.con.execute('select sum(count) from cc').fetchone(); 66 | if res==None: return 0 67 | return res[0] 68 | 69 | 70 | def train(self,item,cat): 71 | features=self.getfeatures(item) 72 | # Increment the count for every feature with this category 73 | for f in features: 74 | self.incf(f,cat) 75 | 76 | # Increment the count for this category 77 | self.incc(cat) 78 | self.con.commit() 79 | 80 | def fprob(self,f,cat): 81 | 
if self.catcount(cat)==0: return 0 82 | 83 | # The total number of times this feature appeared in this 84 | # category divided by the total number of items in this category 85 | return self.fcount(f,cat)/self.catcount(cat) 86 | 87 | def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5): 88 | # Calculate current probability 89 | basicprob=prf(f,cat) 90 | 91 | # Count the number of times this feature has appeared in 92 | # all categories 93 | totals=sum([self.fcount(f,c) for c in self.categories()]) 94 | 95 | # Calculate the weighted average 96 | bp=((weight*ap)+(totals*basicprob))/(weight+totals) 97 | return bp 98 | 99 | 100 | 101 | 102 | class naivebayes(classifier): 103 | 104 | def __init__(self,getfeatures): 105 | classifier.__init__(self,getfeatures) 106 | self.thresholds={} 107 | 108 | def docprob(self,item,cat): 109 | features=self.getfeatures(item) 110 | 111 | # Multiply the probabilities of all the features together 112 | p=1 113 | for f in features: p*=self.weightedprob(f,cat,self.fprob) 114 | return p 115 | 116 | def prob(self,item,cat): 117 | catprob=self.catcount(cat)/self.totalcount() 118 | docprob=self.docprob(item,cat) 119 | return docprob*catprob 120 | 121 | def setthreshold(self,cat,t): 122 | self.thresholds[cat]=t 123 | 124 | def getthreshold(self,cat): 125 | if cat not in self.thresholds: return 1.0 126 | return self.thresholds[cat] 127 | 128 | def classify(self,item,default=None): 129 | probs={} 130 | # Find the category with the highest probability 131 | max=0.0 132 | for cat in self.categories(): 133 | probs[cat]=self.prob(item,cat) 134 | if probs[cat]>max: 135 | max=probs[cat] 136 | best=cat 137 | 138 | # Make sure the probability exceeds threshold*next best 139 | for cat in probs: 140 | if cat==best: continue 141 | if probs[cat]*self.getthreshold(best)>probs[best]: return default 142 | return best 143 | 144 | class fisherclassifier(classifier): 145 | def cprob(self,f,cat): 146 | # The frequency of this feature in this category 147 | clf=self.fprob(f,cat) 148 | if clf==0: return 0 149 | 150 | # The frequency of this feature in all the categories 151 | freqsum=sum([self.fprob(f,c) for c in self.categories()]) 152 | 153 | # The probability is the frequency in this category divided by 154 | # the overall frequency 155 | p=clf/(freqsum) 156 | 157 | return p 158 | def fisherprob(self,item,cat): 159 | # Multiply all the probabilities together 160 | p=1 161 | features=self.getfeatures(item) 162 | for f in features: 163 | p*=(self.weightedprob(f,cat,self.cprob)) 164 | 165 | # Take the natural log and multiply by -2 166 | fscore=-2*math.log(p) 167 | 168 | # Use the inverse chi2 function to get a probability 169 | return self.invchi2(fscore,len(features)*2) 170 | def invchi2(self,chi, df): 171 | m = chi / 2.0 172 | sum = term = math.exp(-m) 173 | for i in range(1, df//2): 174 | term *= m / i 175 | sum += term 176 | return min(sum, 1.0) 177 | def __init__(self,getfeatures): 178 | classifier.__init__(self,getfeatures) 179 | self.minimums={} 180 | 181 | def setminimum(self,cat,min): 182 | self.minimums[cat]=min 183 | 184 | def getminimum(self,cat): 185 | if cat not in self.minimums: return 0 186 | return self.minimums[cat] 187 | def classify(self,item,default=None): 188 | # Loop through looking for the best result 189 | best=default 190 | max=0.0 191 | for c in self.categories(): 192 | p=self.fisherprob(item,c) 193 | # Make sure it exceeds its minimum 194 | if p>self.getminimum(c) and p>max: 195 | best=c 196 | max=p 197 | return best 198 | 199 | 200 | def sampletrain(cl): 201 | 
cl.train('Nobody owns the water.','good') 202 | cl.train('the quick rabbit jumps fences','good') 203 | cl.train('buy pharmaceuticals now','bad') 204 | cl.train('make quick money at the online casino','bad') 205 | cl.train('the quick brown fox jumps','good') 206 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter6/feedfilter.py: -------------------------------------------------------------------------------- 1 | import feedparser 2 | import re 3 | 4 | # Takes a filename of URL of a blog feed and classifies the entries 5 | def read(feed,classifier): 6 | # Get feed entries and loop over them 7 | f=feedparser.parse(feed) 8 | for entry in f['entries']: 9 | print 10 | print '-----' 11 | # Print the contents of the entry 12 | print 'Title: '+entry['title'].encode('utf-8') 13 | print 'Publisher: '+entry['publisher'].encode('utf-8') 14 | print 15 | print entry['summary'].encode('utf-8') 16 | 17 | 18 | # Combine all the text to create one item for the classifier 19 | fulltext='%s\n%s\n%s' % (entry['title'],entry['publisher'],entry['summary']) 20 | 21 | # Print the best guess at the current category 22 | print 'Guess: '+str(classifier.classify(entry)) 23 | 24 | # Ask the user to specify the correct category and train on that 25 | cl=raw_input('Enter category: ') 26 | classifier.train(entry,cl) 27 | 28 | 29 | def entryfeatures(entry): 30 | splitter=re.compile('\\W*') 31 | f={} 32 | 33 | # Extract the title words and annotate 34 | titlewords=[s.lower() for s in splitter.split(entry['title']) 35 | if len(s)>2 and len(s)<20] 36 | for w in titlewords: f['Title:'+w]=1 37 | 38 | # Extract the summary words 39 | summarywords=[s.lower() for s in splitter.split(entry['summary']) 40 | if len(s)>2 and len(s)<20] 41 | 42 | # Count uppercase words 43 | uc=0 44 | for i in range(len(summarywords)): 45 | w=summarywords[i] 46 | f[w]=1 47 | if w.isupper(): uc+=1 48 | 49 | # Get word pairs in summary as features 50 | if i0.3: f['UPPERCASE']=1 59 | 60 | return f 61 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter6/test.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/CollectiveIntelligence/chapter6/test.db -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter6/test1.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/CollectiveIntelligence/chapter6/test1.db -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter7/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/CollectiveIntelligence/chapter7/Thumbs.db -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter7/addresslist.txt: -------------------------------------------------------------------------------- 1 | 6 Washington 2 | 21 Manassas 3 | 280 Pearl 4 | 55 Ellery 5 | 50 Follen 6 | 51 Granite 7 | 992 Memorial 8 | 83 Trowbridge 9 | 1 Dana 10 | 45 Regent 11 | 90 Alpine 12 | 21 Francis 13 | 112 Avon Hill 14 | 9 Bellevue 15 | 4 Blanchard Rd 16 | 34 Shea 17 | 5 Fountain 18 
| 14 Marcella 19 | 39 Saint Saveur 20 | 35 Pemberton 21 | 46 Shepard 22 | 31 Market 23 | 99 Howard 24 | 88 Pearl 25 | 208 Western 26 | 285 Windsor 27 | 26 Cambridgepark 28 | 211 Erie 29 | 129 Franklin 30 | 27 Gurney 31 | 149 Prospect 32 | 27 Linnaean 33 | 20 Dudley 34 | 60 Otis St 35 | 130 Mount Auburn St 36 | 2 Michael Way 37 | 263 Columbia St 38 | 6 Hurlbut St 39 | 199 Harvard St 40 | 168 River St 41 | 400 Washington St 42 | 12 Traill St 43 | 74 Field St 44 | 21 Walden Square Rd 45 | 7 Wendell St 46 | 15 Normandy Ave 47 | 6 Gibson Ter 48 | 94 Pine St 49 | 23 Magee St 50 | 175 Richdale Ave 51 | 168 River St 52 | 246 Brattle St -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter7/hotornot.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import xml.dom.minidom 3 | 4 | api_key='YOUR KEY HERE' 5 | 6 | def getrandomratings(c): 7 | # Construct URL for getRandomProfile 8 | url="http://services.hotornot.com/rest/?app_key=%s" % api_key 9 | url+="&method=Rate.getRandomProfile&retrieve_num=%d" % c 10 | url+="&get_rate_info=true&meet_users_only=true" 11 | 12 | f1=urllib2.urlopen(url).read() 13 | 14 | doc=xml.dom.minidom.parseString(f1) 15 | 16 | emids=doc.getElementsByTagName('emid') 17 | ratings=doc.getElementsByTagName('rating') 18 | 19 | # Combine the emids and ratings together into a list 20 | result=[] 21 | for e,r in zip(emids,ratings): 22 | if r.firstChild!=None: 23 | result.append((e.firstChild.data,r.firstChild.data)) 24 | return result 25 | 26 | stateregions={'New England':['ct','mn','ma','nh','ri','vt'], 27 | 'Mid Atlantic':['de','md','nj','ny','pa'], 28 | 'South':['al','ak','fl','ga','ky','la','ms','mo', 29 | 'nc','sc','tn','va','wv'], 30 | 'Midwest':['il','in','ia','ks','mi','ne','nd','oh','sd','wi'], 31 | 'West':['ak','ca','co','hi','id','mt','nv','or','ut','wa','wy']} 32 | 33 | def getpeopledata(ratings): 34 | result=[] 35 | for emid,rating in ratings: 36 | # URL for the MeetMe.getProfile method 37 | url="http://services.hotornot.com/rest/?app_key=%s" % api_key 38 | url+="&method=MeetMe.getProfile&emid=%s&get_keywords=true" % emid 39 | 40 | # Get all the info about this person 41 | try: 42 | rating=int(float(rating)+0.5) 43 | doc2=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 44 | gender=doc2.getElementsByTagName('gender')[0].firstChild.data 45 | age=doc2.getElementsByTagName('age')[0].firstChild.data 46 | loc=doc2.getElementsByTagName('location')[0].firstChild.data[0:2] 47 | 48 | # Convert state to region 49 | for r,s in stateregions.items(): 50 | if loc in s: region=r 51 | 52 | if region!=None: 53 | result.append((gender,int(age),region,rating)) 54 | except: 55 | pass 56 | return result 57 | 58 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter7/zillow.py: -------------------------------------------------------------------------------- 1 | import xml.dom.minidom 2 | import urllib2 3 | 4 | zwskey="YOUR API KEY" 5 | 6 | def getaddressdata(address,city): 7 | escad=address.replace(' ','+') 8 | url='http://www.zillow.com/webservice/GetDeepSearchResults.htm?' 
9 | url+='zws-id=%s&address=%s&citystatezip=%s' % (zwskey,escad,city) 10 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 11 | code=doc.getElementsByTagName('code')[0].firstChild.data 12 | if code!='0': return None 13 | if 1: 14 | zipcode=doc.getElementsByTagName('zipcode')[0].firstChild.data 15 | use=doc.getElementsByTagName('useCode')[0].firstChild.data 16 | year=doc.getElementsByTagName('yearBuilt')[0].firstChild.data 17 | sqft=doc.getElementsByTagName('finishedSqFt')[0].firstChild.data 18 | bath=doc.getElementsByTagName('bathrooms')[0].firstChild.data 19 | bed=doc.getElementsByTagName('bedrooms')[0].firstChild.data 20 | rooms=1 #doc.getElementsByTagName('totalRooms')[0].firstChild.data 21 | price=doc.getElementsByTagName('amount')[0].firstChild.data 22 | else: 23 | return None 24 | 25 | return (zipcode,use,int(year),float(bath),int(bed),int(rooms),price) 26 | 27 | def getpricelist(): 28 | l1=[] 29 | for line in file('addresslist.txt'): 30 | data=getaddressdata(line.strip(),'Cambridge,MA') 31 | l1.append(data) 32 | return l1 33 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter8/ebaypredict.py: -------------------------------------------------------------------------------- 1 | import httplib 2 | from xml.dom.minidom import parse, parseString, Node 3 | 4 | devKey = 'YOUR DEV KEY' 5 | appKey = 'YOUR APP KEY' 6 | certKey = 'YOUR CERT KEY' 7 | serverUrl = 'api.ebay.com' 8 | userToken = 'YOUR TOKEN' 9 | 10 | def getHeaders(apicall,siteID="0",compatabilityLevel = "433"): 11 | headers = {"X-EBAY-API-COMPATIBILITY-LEVEL": compatabilityLevel, 12 | "X-EBAY-API-DEV-NAME": devKey, 13 | "X-EBAY-API-APP-NAME": appKey, 14 | "X-EBAY-API-CERT-NAME": certKey, 15 | "X-EBAY-API-CALL-NAME": apicall, 16 | "X-EBAY-API-SITEID": siteID, 17 | "Content-Type": "text/xml"} 18 | return headers 19 | 20 | def sendRequest(apicall,xmlparameters): 21 | connection = httplib.HTTPSConnection(serverUrl) 22 | connection.request("POST", '/ws/api.dll', xmlparameters, getHeaders(apicall)) 23 | response = connection.getresponse() 24 | if response.status != 200: 25 | print "Error sending request:" + response.reason 26 | else: 27 | data = response.read() 28 | connection.close() 29 | return data 30 | 31 | def getSingleValue(node,tag): 32 | nl=node.getElementsByTagName(tag) 33 | if len(nl)>0: 34 | tagNode=nl[0] 35 | if tagNode.hasChildNodes(): 36 | return tagNode.firstChild.nodeValue 37 | return '-1' 38 | 39 | 40 | def doSearch(query,categoryID=None,page=1): 41 | xml = ""+\ 42 | ""+\ 43 | "" +\ 44 | userToken +\ 45 | "" + \ 46 | ""+\ 47 | "200"+\ 48 | ""+str(page)+""+\ 49 | ""+\ 50 | "" + query + "" 51 | if categoryID!=None: 52 | xml+=""+str(categoryID)+"" 53 | xml+="" 54 | 55 | data=sendRequest('GetSearchResults',xml) 56 | response = parseString(data) 57 | itemNodes = response.getElementsByTagName('Item'); 58 | results = [] 59 | for item in itemNodes: 60 | itemId=getSingleValue(item,'ItemID') 61 | itemTitle=getSingleValue(item,'Title') 62 | itemPrice=getSingleValue(item,'CurrentPrice') 63 | itemEnds=getSingleValue(item,'EndTime') 64 | results.append((itemId,itemTitle,itemPrice,itemEnds)) 65 | return results 66 | 67 | 68 | def getCategory(query='',parentID=None,siteID='0'): 69 | lquery=query.lower() 70 | xml = ""+\ 71 | ""+\ 72 | "" +\ 73 | userToken +\ 74 | ""+\ 75 | "ReturnAll"+\ 76 | "true"+\ 77 | ""+siteID+"" 78 | if parentID==None: 79 | xml+="1" 80 | else: 81 | xml+=""+str(parentID)+"" 82 | xml += "" 83 | data=sendRequest('GetCategories',xml) 84 | 
categoryList=parseString(data) 85 | catNodes=categoryList.getElementsByTagName('Category') 86 | for node in catNodes: 87 | catid=getSingleValue(node,'CategoryID') 88 | name=getSingleValue(node,'CategoryName') 89 | if name.lower().find(lquery)!=-1: 90 | print catid,name 91 | 92 | def getItem(itemID): 93 | xml = ""+\ 94 | ""+\ 95 | "" +\ 96 | userToken +\ 97 | "" + \ 98 | "" + str(itemID) + ""+\ 99 | "ItemReturnAttributes"+\ 100 | "" 101 | data=sendRequest('GetItem',xml) 102 | result={} 103 | response=parseString(data) 104 | result['title']=getSingleValue(response,'Title') 105 | sellingStatusNode = response.getElementsByTagName('SellingStatus')[0]; 106 | result['price']=getSingleValue(sellingStatusNode,'CurrentPrice') 107 | result['bids']=getSingleValue(sellingStatusNode,'BidCount') 108 | seller = response.getElementsByTagName('Seller') 109 | result['feedback'] = getSingleValue(seller[0],'FeedbackScore') 110 | 111 | attributeSet=response.getElementsByTagName('Attribute'); 112 | attributes={} 113 | for att in attributeSet: 114 | attID=att.attributes.getNamedItem('attributeID').nodeValue 115 | attValue=getSingleValue(att,'ValueLiteral') 116 | attributes[attID]=attValue 117 | result['attributes']=attributes 118 | return result 119 | 120 | 121 | def makeLaptopDataset(): 122 | searchResults=doSearch('laptop',categoryID=51148) 123 | result=[] 124 | for r in searchResults: 125 | item=getItem(r[0]) 126 | att=item['attributes'] 127 | try: 128 | data=(float(att['12']),float(att['26444']), 129 | float(att['26446']),float(att['25710']), 130 | float(item['feedback']) 131 | ) 132 | entry={'input':data,'result':float(item['price'])} 133 | result.append(entry) 134 | except: 135 | print item['title']+' failed' 136 | return result 137 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter8/numpredict.py: -------------------------------------------------------------------------------- 1 | from random import random,randint 2 | import math 3 | 4 | def wineprice(rating,age): 5 | peak_age=rating-50 6 | 7 | # Calculate price based on rating 8 | price=rating/2 9 | if age>peak_age: 10 | # Past its peak, goes bad in 10 years 11 | price=price*(5-(age-peak_age)/2) 12 | else: 13 | # Increases to 5x original value as it 14 | # approaches its peak 15 | price=price*(5*((age+1)/peak_age)) 16 | if price<0: price=0 17 | return price 18 | 19 | 20 | def wineset1(): 21 | rows=[] 22 | for i in range(300): 23 | # Create a random age and rating 24 | rating=random()*50+50 25 | age=random()*50 26 | 27 | # Get reference price 28 | price=wineprice(rating,age) 29 | 30 | # Add some noise 31 | price*=(random()*0.2+0.9) 32 | 33 | # Add to the dataset 34 | rows.append({'input':(rating,age), 35 | 'result':price}) 36 | return rows 37 | 38 | def euclidean(v1,v2): 39 | d=0.0 40 | for i in range(len(v1)): 41 | d+=(v1[i]-v2[i])**2 42 | return math.sqrt(d) 43 | 44 | 45 | def getdistances(data,vec1): 46 | distancelist=[] 47 | 48 | # Loop over every item in the dataset 49 | for i in range(len(data)): 50 | vec2=data[i]['input'] 51 | 52 | # Add the distance and the index 53 | distancelist.append((euclidean(vec1,vec2),i)) 54 | 55 | # Sort by distance 56 | distancelist.sort() 57 | return distancelist 58 | 59 | def knnestimate(data,vec1,k=5): 60 | # Get sorted distances 61 | dlist=getdistances(data,vec1) 62 | avg=0.0 63 | 64 | # Take the average of the top k results 65 | for i in range(k): 66 | idx=dlist[i][1] 67 | avg+=data[idx]['result'] 68 | avg=avg/k 69 | return avg 70 | 71 | def 
inverseweight(dist,num=1.0,const=0.1): 72 | return num/(dist+const) 73 | 74 | def subtractweight(dist,const=1.0): 75 | if dist>const: 76 | return 0 77 | else: 78 | return const-dist 79 | 80 | def gaussian(dist,sigma=5.0): 81 | return math.e**(-dist**2/(2*sigma**2)) 82 | 83 | def weightedknn(data,vec1,k=5,weightf=gaussian): 84 | # Get distances 85 | dlist=getdistances(data,vec1) 86 | avg=0.0 87 | totalweight=0.0 88 | 89 | # Get weighted average 90 | for i in range(k): 91 | dist=dlist[i][0] 92 | idx=dlist[i][1] 93 | weight=weightf(dist) 94 | avg+=weight*data[idx]['result'] 95 | totalweight+=weight 96 | if totalweight==0: return 0 97 | avg=avg/totalweight 98 | return avg 99 | 100 | def dividedata(data,test=0.05): 101 | trainset=[] 102 | testset=[] 103 | for row in data: 104 | if random()=low and v<=high: 176 | nweight+=weight 177 | tweight+=weight 178 | if tweight==0: return 0 179 | 180 | # The probability is the weights in the range 181 | # divided by all the weights 182 | return nweight/tweight 183 | 184 | from pylab import * 185 | 186 | def cumulativegraph(data,vec1,high,k=5,weightf=gaussian): 187 | t1=arange(0.0,high,0.1) 188 | cprob=array([probguess(data,vec1,0,v,k,weightf) for v in t1]) 189 | plot(t1,cprob) 190 | show() 191 | 192 | 193 | def probabilitygraph(data,vec1,high,k=5,weightf=gaussian,ss=5.0): 194 | # Make a range for the prices 195 | t1=arange(0.0,high,0.1) 196 | 197 | # Get the probabilities for the entire range 198 | probs=[probguess(data,vec1,v,v+0.1,k,weightf) for v in t1] 199 | 200 | # Smooth them by adding the gaussian of the nearby probabilites 201 | smoothed=[] 202 | for i in range(len(probs)): 203 | sv=0.0 204 | for j in range(0,len(probs)): 205 | dist=abs(i-j)*0.1 206 | weight=gaussian(dist,sigma=ss) 207 | sv+=weight*probs[j] 208 | smoothed.append(sv) 209 | smoothed=array(smoothed) 210 | 211 | plot(t1,smoothed) 212 | show() 213 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter8/optimization.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import math 4 | 5 | people = [('Seymour','BOS'), 6 | ('Franny','DAL'), 7 | ('Zooey','CAK'), 8 | ('Walt','MIA'), 9 | ('Buddy','ORD'), 10 | ('Les','OMA')] 11 | # Laguardia 12 | destination='LGA' 13 | 14 | flights={} 15 | # 16 | """ 17 | for line in file('schedule.txt'): 18 | origin,dest,depart,arrive,price=line.strip().split(',') 19 | flights.setdefault((origin,dest),[]) 20 | 21 | # Add details to the list of possible flights 22 | flights[(origin,dest)].append((depart,arrive,int(price))) 23 | """ 24 | def getminutes(t): 25 | x=time.strptime(t,'%H:%M') 26 | return x[3]*60+x[4] 27 | 28 | def printschedule(r): 29 | for d in range(len(r)/2): 30 | name=people[d][0] 31 | origin=people[d][1] 32 | out=flights[(origin,destination)][int(r[d])] 33 | ret=flights[(destination,origin)][int(r[d+1])] 34 | print '%10s%10s %5s-%5s $%3s %5s-%5s $%3s' % (name,origin, 35 | out[0],out[1],out[2], 36 | ret[0],ret[1],ret[2]) 37 | 38 | def schedulecost(sol): 39 | totalprice=0 40 | latestarrival=0 41 | earliestdep=24*60 42 | 43 | for d in range(len(sol)/2): 44 | # Get the inbound and outbound flights 45 | origin=people[d][1] 46 | outbound=flights[(origin,destination)][int(sol[d])] 47 | returnf=flights[(destination,origin)][int(sol[d+1])] 48 | 49 | # Total price is the price of all outbound and return flights 50 | totalprice+=outbound[2] 51 | totalprice+=returnf[2] 52 | 53 | # Track the latest arrival and earliest departure 
54 | if latestarrivalgetminutes(returnf[0]): earliestdep=getminutes(returnf[0]) 56 | 57 | # Every person must wait at the airport until the latest person arrives. 58 | # They also must arrive at the same time and wait for their flights. 59 | totalwait=0 60 | for d in range(len(sol)/2): 61 | origin=people[d][1] 62 | outbound=flights[(origin,destination)][int(sol[d])] 63 | returnf=flights[(destination,origin)][int(sol[d+1])] 64 | totalwait+=latestarrival-getminutes(outbound[1]) 65 | totalwait+=getminutes(returnf[0])-earliestdep 66 | 67 | # Does this solution require an extra day of car rental? That'll be $50! 68 | if latestarrival>earliestdep: totalprice+=50 69 | 70 | return totalprice+totalwait 71 | 72 | def randomoptimize(domain,costf): 73 | best=999999999 74 | bestr=None 75 | for i in range(0,1000): 76 | # Create a random solution 77 | r=[float(random.randint(domain[i][0],domain[i][1])) 78 | for i in range(len(domain))] 79 | 80 | # Get the cost 81 | cost=costf(r) 82 | 83 | # Compare it to the best one so far 84 | if cost0.1: 96 | # Choose one of the indices 97 | i=random.randint(0,len(domain)-1) 98 | 99 | # Choose a direction to change it 100 | dir=random.randint(-step,step) 101 | 102 | # Create a new list with one of the values changed 103 | vecb=vec[:] 104 | vecb[i]+=dir 105 | if vecb[i]domain[i][1]: vecb[i]=domain[i][1] 107 | 108 | # Calculate the current cost and the new cost 109 | ea=costf(vec) 110 | eb=costf(vecb) 111 | p=pow(math.e,(-eb-ea)/T) 112 | 113 | print vec,ea 114 | 115 | 116 | # Is it better, or does it make the probability 117 | # cutoff? 118 | if (ebmaxv: v[i][d]=maxv 160 | elif v[i][d]<-maxv: v[i][d]=-maxv 161 | 162 | # constrain bounds of solutions 163 | x[i][d]+=v[i][d] 164 | if x[i][d]domain[d][1]: x[i][d]=domain[d][1] 166 | 167 | print p[g],costf(p[g]) 168 | return p[g] 169 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter9/advancedclassify.py: -------------------------------------------------------------------------------- 1 | class matchrow: 2 | def __init__(self,row,allnum=False): 3 | if allnum: 4 | self.data=[float(row[i]) for i in range(len(row)-1)] 5 | else: 6 | self.data=row[0:len(row)-1] 7 | self.match=int(row[len(row)-1]) 8 | 9 | def loadmatch(f,allnum=False): 10 | rows=[] 11 | for line in file(f): 12 | rows.append(matchrow(line.split(','),allnum)) 13 | return rows 14 | 15 | from pylab import * 16 | def plotagematches(rows): 17 | xdm,ydm=[r.data[0] for r in rows if r.match==1],\ 18 | [r.data[1] for r in rows if r.match==1] 19 | xdn,ydn=[r.data[0] for r in rows if r.match==0],\ 20 | [r.data[1] for r in rows if r.match==0] 21 | 22 | plot(xdm,ydm,'bo') 23 | plot(xdn,ydn,'b+') 24 | 25 | show() 26 | 27 | def lineartrain(rows): 28 | averages={} 29 | counts={} 30 | 31 | for row in rows: 32 | # Get the class of this point 33 | cl=row.match 34 | 35 | averages.setdefault(cl,[0.0]*(len(row.data))) 36 | counts.setdefault(cl,0) 37 | 38 | # Add this point to the averages 39 | for i in range(len(row.data)): 40 | averages[cl][i]+=float(row.data[i]) 41 | 42 | # Keep track of how many points in each class 43 | counts[cl]+=1 44 | 45 | # Divide sums by counts to get the averages 46 | for cl,avg in averages.items(): 47 | for i in range(len(avg)): 48 | avg[i]/=counts[cl] 49 | 50 | return averages 51 | 52 | def dotproduct(v1,v2): 53 | return sum([v1[i]*v2[i] for i in range(len(v1))]) 54 | 55 | def veclength(v): 56 | return sum([p**2 for p in v]) 57 | 58 | def dpclassify(point,avgs): 59 | 
b=(dotproduct(avgs[1],avgs[1])-dotproduct(avgs[0],avgs[0]))/2 60 | y=dotproduct(point,avgs[0])-dotproduct(point,avgs[1])+b 61 | if y>0: return 0 62 | else: return 1 63 | 64 | def yesno(v): 65 | if v=='yes': return 1 66 | elif v=='no': return -1 67 | else: return 0 68 | 69 | def matchcount(interest1,interest2): 70 | l1=interest1.split(':') 71 | l2=interest2.split(':') 72 | x=0 73 | for v in l1: 74 | if v in l2: x+=1 75 | return x 76 | 77 | yahookey="YOUR API KEY" 78 | from xml.dom.minidom import parseString 79 | from urllib import urlopen,quote_plus 80 | 81 | loc_cache={} 82 | def getlocation(address): 83 | if address in loc_cache: return loc_cache[address] 84 | data=urlopen('http://api.local.yahoo.com/MapsService/V1/'+\ 85 | 'geocode?appid=%s&location=%s' % 86 | (yahookey,quote_plus(address))).read() 87 | doc=parseString(data) 88 | lat=doc.getElementsByTagName('Latitude')[0].firstChild.nodeValue 89 | long=doc.getElementsByTagName('Longitude')[0].firstChild.nodeValue 90 | loc_cache[address]=(float(lat),float(long)) 91 | return loc_cache[address] 92 | 93 | def milesdistance(a1,a2): 94 | lat1,long1=getlocation(a1) 95 | lat2,long2=getlocation(a2) 96 | latdif=69.1*(lat2-lat1) 97 | longdif=53.0*(long2-long1) 98 | return (latdif**2+longdif**2)**.5 99 | 100 | def loadnumerical(): 101 | oldrows=loadmatch('matchmaker.csv') 102 | newrows=[] 103 | for row in oldrows: 104 | d=row.data 105 | data=[float(d[0]),yesno(d[1]),yesno(d[2]), 106 | float(d[5]),yesno(d[6]),yesno(d[7]), 107 | matchcount(d[3],d[8]), 108 | milesdistance(d[4],d[9]), 109 | row.match] 110 | newrows.append(matchrow(data)) 111 | return newrows 112 | 113 | def scaledata(rows): 114 | low=[999999999.0]*len(rows[0].data) 115 | high=[-999999999.0]*len(rows[0].data) 116 | # Find the lowest and highest values 117 | for row in rows: 118 | d=row.data 119 | for i in range(len(d)): 120 | if d[i]high[i]: high[i]=d[i] 122 | 123 | # Create a function that scales data 124 | def scaleinput(d): 125 | return [(d[i]-low[i])/(high[i]-low[i]) 126 | for i in range(len(low))] 127 | 128 | # Scale all the data 129 | newrows=[matchrow(scaleinput(row.data)+[row.match]) 130 | for row in rows] 131 | 132 | # Return the new data and the function 133 | return newrows,scaleinput 134 | 135 | 136 | def rbf(v1,v2,gamma=10): 137 | dv=[v1[i]-v2[i] for i in range(len(v1))] 138 | l=veclength(dv) 139 | return math.e**(-gamma*l) 140 | 141 | def nlclassify(point,rows,offset,gamma=10): 142 | sum0=0.0 143 | sum1=0.0 144 | count0=0 145 | count1=0 146 | 147 | for row in rows: 148 | if row.match==0: 149 | sum0+=rbf(point,row.data,gamma) 150 | count0+=1 151 | else: 152 | sum1+=rbf(point,row.data,gamma) 153 | count1+=1 154 | y=(1.0/count0)*sum0-(1.0/count1)*sum1+offset 155 | 156 | if y>0: return 0 157 | else: return 1 158 | 159 | def getoffset(rows,gamma=10): 160 | l0=[] 161 | l1=[] 162 | for row in rows: 163 | if row.match==0: l0.append(row.data) 164 | else: l1.append(row.data) 165 | sum0=sum(sum([rbf(v1,v2,gamma) for v1 in l0]) for v2 in l0) 166 | sum1=sum(sum([rbf(v1,v2,gamma) for v1 in l1]) for v2 in l1) 167 | 168 | return (1.0/(len(l1)**2))*sum1-(1.0/(len(l0)**2))*sum0 169 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter9/agesonly.csv: -------------------------------------------------------------------------------- 1 | 24,30,1 2 | 30,40,1 3 | 22,49,0 4 | 43,39,1 5 | 23,30,1 6 | 23,49,0 7 | 48,46,1 8 | 23,23,1 9 | 29,49,0 10 | 38,38,1 11 | 30,34,1 12 | 40,50,1 13 | 35,32,1 14 | 49,44,1 15 | 38,22,1 16 | 30,27,1 17 | 
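agesonly.csv above holds one pair per row: two ages followed by a 0/1 flag for whether the pair was a match. A minimal sketch of driving the chapter-9 helpers defined earlier (loadmatch, plotagematches, lineartrain, dpclassify) against it, assuming the book's Python 2 setup with pylab installed:

```
# Sketch: exercising advancedclassify.py on agesonly.csv (Python 2, pylab available).
agesonly = loadmatch('agesonly.csv', allnum=True)   # each row becomes data=[age1, age2], match=0/1

plotagematches(agesonly)            # scatter plot: matches as 'o', non-matches as '+'

avgs = lineartrain(agesonly)        # average point of each class
print dpclassify([30.0, 30.0], avgs)   # similar ages -- expected to fall on the match (1) side
print dpclassify([25.0, 40.0], avgs)   # large age gap -- expected to fall on the non-match (0) side
```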
26,24,1 18 | 39,23,1 19 | 36,43,1 20 | 25,31,1 21 | 27,27,1 22 | 32,22,1 23 | 40,30,1 24 | 26,28,1 25 | 46,32,1 26 | 41,37,1 27 | 39,41,1 28 | 18,28,0 29 | 18,47,0 30 | 39,44,1 31 | 38,21,1 32 | 24,36,0 33 | 32,22,1 34 | 21,20,1 35 | 42,36,1 36 | 46,41,1 37 | 39,38,1 38 | 18,31,0 39 | 31,45,1 40 | 44,24,0 41 | 49,22,0 42 | 26,27,1 43 | 25,34,1 44 | 47,23,0 45 | 27,48,0 46 | 32,49,1 47 | 46,41,1 48 | 24,32,1 49 | 29,26,1 50 | 25,36,1 51 | 27,35,1 52 | 38,19,1 53 | 18,40,0 54 | 34,49,1 55 | 32,35,1 56 | 47,49,1 57 | 47,18,0 58 | 33,24,1 59 | 35,28,1 60 | 35,41,1 61 | 39,43,1 62 | 29,18,1 63 | 18,44,0 64 | 26,26,1 65 | 31,43,1 66 | 20,29,0 67 | 28,18,1 68 | 31,38,1 69 | 34,34,1 70 | 32,33,1 71 | 34,27,1 72 | 19,38,0 73 | 32,21,1 74 | 33,37,1 75 | 33,18,1 76 | 18,46,0 77 | 31,37,1 78 | 36,30,1 79 | 40,40,1 80 | 38,30,1 81 | 49,28,1 82 | 31,47,1 83 | 28,50,0 84 | 49,43,1 85 | 24,31,1 86 | 33,43,1 87 | 28,24,1 88 | 45,29,1 89 | 49,35,1 90 | 36,29,1 91 | 42,32,1 92 | 29,18,1 93 | 49,20,0 94 | 22,27,1 95 | 41,38,1 96 | 47,21,0 97 | 40,32,1 98 | 35,18,1 99 | 35,33,1 100 | 34,28,1 101 | 22,31,0 102 | 46,20,0 103 | 18,49,0 104 | 48,23,0 105 | 39,21,1 106 | 20,34,0 107 | 24,20,1 108 | 38,18,1 109 | 37,47,1 110 | 39,37,1 111 | 38,39,1 112 | 27,42,1 113 | 47,49,1 114 | 27,42,1 115 | 40,28,1 116 | 41,46,1 117 | 39,25,1 118 | 43,36,1 119 | 49,30,1 120 | 24,38,0 121 | 49,42,1 122 | 19,22,0 123 | 43,27,1 124 | 30,37,1 125 | 24,31,1 126 | 24,48,0 127 | 24,29,1 128 | 18,19,1 129 | 29,25,1 130 | 38,33,1 131 | 39,20,1 132 | 24,30,1 133 | 22,39,0 134 | 47,21,0 135 | 30,44,1 136 | 41,38,1 137 | 29,33,1 138 | 42,42,1 139 | 47,27,1 140 | 23,20,1 141 | 39,18,1 142 | 30,26,1 143 | 36,27,1 144 | 40,18,1 145 | 31,18,1 146 | 46,27,1 147 | 41,44,1 148 | 26,34,1 149 | 33,18,1 150 | 48,19,0 151 | 46,27,1 152 | 25,40,0 153 | 50,36,1 154 | 20,21,1 155 | 33,47,1 156 | 40,35,1 157 | 24,27,1 158 | 34,19,1 159 | 26,45,0 160 | 34,36,1 161 | 21,27,0 162 | 48,28,1 163 | 23,25,1 164 | 48,46,1 165 | 30,20,1 166 | 23,40,0 167 | 36,40,1 168 | 21,45,0 169 | 30,40,1 170 | 39,24,1 171 | 42,47,1 172 | 28,37,1 173 | 24,30,1 174 | 37,25,1 175 | 44,34,1 176 | 43,32,1 177 | 46,29,1 178 | 49,22,0 179 | 41,28,1 180 | 23,50,0 181 | 30,43,1 182 | 25,32,1 183 | 27,46,0 184 | 23,21,1 185 | 39,41,1 186 | 33,27,1 187 | 49,21,0 188 | 33,33,1 189 | 18,25,0 190 | 42,35,1 191 | 36,25,1 192 | 26,50,0 193 | 18,37,0 194 | 35,37,1 195 | 39,38,1 196 | 22,30,0 197 | 18,44,0 198 | 46,44,1 199 | 24,27,1 200 | 41,34,1 201 | 40,39,1 202 | 34,49,1 203 | 35,41,1 204 | 46,48,1 205 | 50,23,0 206 | 49,20,0 207 | 22,47,0 208 | 27,26,1 209 | 30,30,1 210 | 37,39,1 211 | 42,44,1 212 | 41,27,1 213 | 24,21,1 214 | 34,28,1 215 | 23,43,0 216 | 43,35,1 217 | 42,40,1 218 | 25,24,1 219 | 36,24,1 220 | 25,23,1 221 | 44,30,1 222 | 39,33,1 223 | 38,33,1 224 | 49,30,1 225 | 40,19,1 226 | 19,46,0 227 | 31,21,1 228 | 48,33,1 229 | 26,24,1 230 | 20,37,0 231 | 29,31,1 232 | 35,28,1 233 | 37,25,1 234 | 42,42,1 235 | 42,48,1 236 | 41,47,1 237 | 44,45,1 238 | 45,46,1 239 | 25,38,1 240 | 19,45,0 241 | 36,26,1 242 | 33,36,1 243 | 27,19,1 244 | 48,24,0 245 | 37,48,1 246 | 23,31,0 247 | 20,29,0 248 | 27,44,0 249 | 47,24,0 250 | 36,18,1 251 | 37,48,1 252 | 32,29,1 253 | 46,48,1 254 | 31,47,1 255 | 23,45,0 256 | 28,30,1 257 | 36,32,1 258 | 25,43,0 259 | 24,44,0 260 | 34,47,1 261 | 46,42,1 262 | 18,31,0 263 | 23,25,1 264 | 44,39,1 265 | 18,29,0 266 | 49,40,1 267 | 24,33,0 268 | 21,44,0 269 | 40,24,1 270 | 46,41,1 271 | 42,33,1 272 | 25,41,0 273 | 29,42,1 274 | 40,18,1 275 | 37,40,1 276 | 46,28,1 
277 | 33,20,1 278 | 18,42,0 279 | 22,36,0 280 | 27,46,0 281 | 33,48,1 282 | 21,37,0 283 | 26,50,0 284 | 29,23,1 285 | 23,33,0 286 | 21,38,0 287 | 18,30,0 288 | 29,28,1 289 | 31,22,1 290 | 30,48,1 291 | 41,37,1 292 | 35,31,1 293 | 48,32,1 294 | 29,37,1 295 | 32,33,1 296 | 43,26,1 297 | 21,33,0 298 | 44,28,1 299 | 35,18,1 300 | 35,35,1 301 | 25,20,1 302 | 39,46,1 303 | 26,39,1 304 | 36,29,1 305 | 29,44,1 306 | 28,42,1 307 | 38,21,1 308 | 28,49,0 309 | 33,26,1 310 | 31,28,1 311 | 25,47,0 312 | 23,25,1 313 | 45,49,1 314 | 28,26,1 315 | 36,48,1 316 | 42,48,1 317 | 42,21,1 318 | 29,32,1 319 | 26,28,1 320 | 24,46,0 321 | 39,30,1 322 | 29,46,1 323 | 43,43,1 324 | 20,42,0 325 | 35,41,1 326 | 45,19,0 327 | 38,45,1 328 | 25,38,1 329 | 31,20,1 330 | 38,43,1 331 | 37,30,1 332 | 43,27,1 333 | 43,44,1 334 | 21,30,0 335 | 22,45,0 336 | 44,26,1 337 | 43,42,1 338 | 26,41,0 339 | 47,35,1 340 | 48,30,1 341 | 41,24,1 342 | 19,48,0 343 | 45,24,0 344 | 38,41,1 345 | 42,46,1 346 | 49,45,1 347 | 28,44,1 348 | 22,44,0 349 | 31,48,1 350 | 48,21,0 351 | 31,20,1 352 | 30,39,1 353 | 23,23,1 354 | 21,32,0 355 | 19,19,1 356 | 21,27,0 357 | 24,46,0 358 | 25,28,1 359 | 48,50,1 360 | 25,32,1 361 | 26,29,1 362 | 33,48,1 363 | 35,32,1 364 | 48,25,1 365 | 30,27,1 366 | 34,49,1 367 | 40,45,1 368 | 28,32,1 369 | 47,33,1 370 | 29,33,1 371 | 21,22,1 372 | 21,39,0 373 | 41,45,1 374 | 46,39,1 375 | 22,24,1 376 | 32,22,1 377 | 27,46,0 378 | 26,35,1 379 | 27,29,1 380 | 48,19,0 381 | 35,26,1 382 | 42,29,1 383 | 30,22,1 384 | 20,26,0 385 | 33,25,1 386 | 37,30,1 387 | 37,32,1 388 | 20,22,1 389 | 42,48,1 390 | 29,20,1 391 | 32,46,1 392 | 37,34,1 393 | 29,45,1 394 | 19,44,0 395 | 49,18,0 396 | 28,25,1 397 | 48,31,1 398 | 35,46,1 399 | 34,26,1 400 | 38,26,1 401 | 36,31,1 402 | 31,30,1 403 | 27,19,1 404 | 44,38,1 405 | 19,37,0 406 | 43,49,1 407 | 19,42,0 408 | 32,24,1 409 | 46,43,1 410 | 43,46,1 411 | 33,32,1 412 | 23,35,0 413 | 26,34,1 414 | 48,20,0 415 | 45,38,1 416 | 30,30,1 417 | 28,23,1 418 | 43,36,1 419 | 19,37,0 420 | 39,45,1 421 | 20,30,0 422 | 28,30,1 423 | 19,42,0 424 | 41,21,1 425 | 42,31,1 426 | 47,45,1 427 | 42,48,1 428 | 40,22,1 429 | 28,20,1 430 | 22,31,0 431 | 28,24,1 432 | 18,33,0 433 | 42,47,1 434 | 35,18,1 435 | 32,28,1 436 | 45,39,1 437 | 46,45,1 438 | 41,43,1 439 | 24,37,0 440 | 34,30,1 441 | 40,22,1 442 | 38,20,1 443 | 43,28,1 444 | 21,26,0 445 | 35,27,1 446 | 33,37,1 447 | 48,39,1 448 | 47,40,1 449 | 31,32,1 450 | 18,32,0 451 | 31,20,1 452 | 30,49,1 453 | 22,46,0 454 | 36,39,1 455 | 30,35,1 456 | 49,50,1 457 | 46,39,1 458 | 45,44,1 459 | 34,40,1 460 | 27,28,1 461 | 27,35,1 462 | 46,46,1 463 | 26,42,0 464 | 27,18,1 465 | 23,38,0 466 | 30,30,1 467 | 34,32,1 468 | 48,27,1 469 | 31,23,1 470 | 29,47,0 471 | 47,31,1 472 | 35,19,1 473 | 30,28,1 474 | 33,44,1 475 | 36,37,1 476 | 34,44,1 477 | 42,43,1 478 | 36,29,1 479 | 35,46,1 480 | 22,36,0 481 | 39,47,1 482 | 23,23,1 483 | 47,20,0 484 | 38,22,1 485 | 21,33,0 486 | 37,41,1 487 | 18,18,1 488 | 35,34,1 489 | 49,49,1 490 | 33,32,1 491 | 31,19,1 492 | 31,26,1 493 | 45,31,1 494 | 41,44,1 495 | 27,47,0 496 | 28,26,1 497 | 18,47,0 498 | 37,18,1 499 | 20,42,0 500 | 36,45,1 501 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter9/facebook.py: -------------------------------------------------------------------------------- 1 | import urllib,md5,webbrowser,time 2 | from xml.dom.minidom import parseString 3 | 4 | apikey="47e953c8ea9ed30db904af453125c759" 5 | secret="ea703e4721e8c7bf88b92110a46a9b06" 6 | FacebookURL = 
"https://api.facebook.com/restserver.php" 7 | 8 | def getsinglevalue(node,tag): 9 | nl=node.getElementsByTagName(tag) 10 | if len(nl)>0: 11 | tagNode=nl[0] 12 | if tagNode.hasChildNodes(): 13 | return tagNode.firstChild.nodeValue 14 | return '' 15 | 16 | def callid(): 17 | return str(int(time.time()*10)) 18 | 19 | class fbsession: 20 | def __init__(self): 21 | self.session_secret=None 22 | self.session_key=None 23 | self.createtoken() 24 | webbrowser.open(self.getlogin()) 25 | print "Press enter after logging in:", 26 | raw_input() 27 | self.getsession() 28 | def sendrequest(self, args): 29 | args['api_key'] = apikey 30 | args['sig'] = self.makehash(args) 31 | post_data = urllib.urlencode(args) 32 | url = FacebookURL + "?" + post_data 33 | data=urllib.urlopen(url).read() 34 | print data 35 | return parseString(data) 36 | def makehash(self,args): 37 | hasher = md5.new(''.join([x + '=' + args[x] for x in sorted(args.keys())])) 38 | if self.session_secret: hasher.update(self.session_secret) 39 | else: hasher.update(secret) 40 | return hasher.hexdigest() 41 | def createtoken(self): 42 | res = self.sendrequest({'method':"facebook.auth.createToken"}) 43 | self.token = getsinglevalue(res,'token') 44 | def getlogin(self): 45 | return "http://api.facebook.com/login.php?api_key="+apikey+\ 46 | "&auth_token=" + self.token 47 | def getsession(self): 48 | doc=self.sendrequest({'method':'facebook.auth.getSession', 49 | 'auth_token':self.token}) 50 | self.session_key=getsinglevalue(doc,'session_key') 51 | self.session_secret=getsinglevalue(doc,'secret') 52 | def getfriends(self): 53 | doc=self.sendrequest({'method':'facebook.friends.get', 54 | 'session_key':self.session_key,'call_id':callid()}) 55 | results=[] 56 | for n in doc.getElementsByTagName('result_elt'): 57 | results.append(n.firstChild.nodeValue) 58 | return results 59 | 60 | def getinfo(self,users): 61 | ulist=','.join(users) 62 | 63 | fields='gender,current_location,relationship_status,'+\ 64 | 'affiliations,hometown_location' 65 | 66 | doc=self.sendrequest({'method':'facebook.users.getInfo', 67 | 'session_key':self.session_key,'call_id':callid(), 68 | 'users':ulist,'fields':fields}) 69 | 70 | results={} 71 | for n,id in zip(doc.getElementsByTagName('result_elt'),users): 72 | # Get the location 73 | locnode=n.getElementsByTagName('hometown_location')[0] 74 | loc=getsinglevalue(locnode,'city')+', '+getsinglevalue(locnode,'state') 75 | 76 | # Get school 77 | college='' 78 | gradyear='0' 79 | affiliations=n.getElementsByTagName('affiliations_elt') 80 | for aff in affiliations: 81 | # Type 1 is college 82 | if getsinglevalue(aff,'type')=='1': 83 | college=getsinglevalue(aff,'name') 84 | gradyear=getsinglevalue(aff,'year') 85 | 86 | results[id]={'gender':getsinglevalue(n,'gender'), 87 | 'status':getsinglevalue(n,'relationship_status'), 88 | 'location':loc,'college':college,'year':gradyear} 89 | return results 90 | 91 | def arefriends(self,idlist1,idlist2): 92 | id1=','.join(idlist1) 93 | id2=','.join(idlist2) 94 | doc=self.sendrequest({'method':'facebook.friends.areFriends', 95 | 'session_key':self.session_key,'call_id':callid(), 96 | 'id1':id1,'id2':id2}) 97 | results=[] 98 | for n in doc.getElementsByTagName('result_elt'): 99 | results.append(int(n.firstChild.nodeValue)) 100 | return results 101 | 102 | 103 | 104 | def makedataset(self): 105 | from advancedclassify import milesdistance 106 | # Get all the info for all my friends 107 | friends=self.getfriends() 108 | info=self.getinfo(friends) 109 | ids1,ids2=[],[] 110 | rows=[] 111 | 112 | # 
Nested loop to look at every pair of friends 113 | for i in range(len(friends)): 114 | f1=friends[i] 115 | data1=info[f1] 116 | 117 | # Start at i+1 so we don't double up 118 | for j in range(i+1,len(friends)): 119 | f2=friends[j] 120 | data2=info[f2] 121 | ids1.append(f1) 122 | ids2.append(f2) 123 | 124 | # Generate some numbers from the data 125 | if data1['college']==data2['college']: sameschool=1 126 | else: sameschool=0 127 | male1=(data1['gender']=='Male') and 1 or 0 128 | male2=(data2['gender']=='Male') and 1 or 0 129 | 130 | row=[male1,int(data1['year']),male2,int(data2['year']),sameschool] 131 | rows.append(row) 132 | # Call arefriends in blocks for every pair of people 133 | arefriends=[] 134 | for i in range(0,len(ids1),30): 135 | j=min(i+30,len(ids1)) 136 | pa=self.arefriends(ids1[i:j],ids2[i:j]) 137 | arefriends+=pa 138 | return arefriends,rows 139 | 140 | -------------------------------------------------------------------------------- /KNN/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' -------------------------------------------------------------------------------- /KNN/knn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "ename": "Error", 12 | "evalue": "iterator should return strings, not bytes (did you open the file in text mode?)", 13 | "traceback": [ 14 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 15 | "\u001b[0;31mError\u001b[0m Traceback (most recent call last)", 16 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/Users/yuhongjun/Python/python-training/data/iris.data.csv\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcsvfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mlines\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcsv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcsvfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mlines\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 17 | "\u001b[0;31mError\u001b[0m: iterator should return strings, not bytes (did you open the file in text mode?)" 18 | ], 19 | "output_type": "error" 20 | } 21 | ], 22 | "source": [ 23 | "import numpy as np" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 2", 37 | "language": "python", 38 | "name": "python2" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 2 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython2", 50 | 
"version": "2.7.6" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 0 55 | } 56 | -------------------------------------------------------------------------------- /KNN/knn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | import sklearn.metrics as metrics 9 | import numpy as np 10 | from sklearn.neighbors import NearestNeighbors 11 | from scipy.spatial.distance import correlation, cosine 12 | import ipywidgets as widgets 13 | from IPython.display import display, clear_output 14 | from sklearn.metrics import pairwise_distances 15 | from sklearn.metrics import mean_squared_error 16 | from math import sqrt 17 | import sys, os, time 18 | from contextlib import contextmanager 19 | import numpy as np 20 | import math 21 | def createDataset(): 22 | # 构建训练集数据 23 | dataset = [[0.26547727, 0.27892898,0], 24 | [0.1337869 , 0.08356665,0], 25 | [0.02771102, 0.36429227,0], 26 | [0.81783834, 0.86542639,1], 27 | [0.99240191, 0.87950623,1], 28 | [0.99240191, 0.77950623,1]] 29 | return np.array(dataset) 30 | 31 | 32 | def getDistance(instance1,instance2): 33 | # 计算两点间的距离 34 | distance=0 35 | length = len(instance1) 36 | for i in range(length): 37 | distance += math.pow(instance1[i]-instance2[i],2) 38 | return math.sqrt(distance) 39 | 40 | 41 | def getNeighbors(trainingSet,testInstance,k): 42 | # 计算未知实例与所有已知实例的距离。返回最近的K个已知实例 43 | features = createDataset()[:,:2] 44 | labels = createDataset()[:,-1] 45 | distance_list = [] 46 | for i in range(len(features)): 47 | distance = getDistance(testInstance,features[i]) 48 | distance_list.append((distance,labels[i])) 49 | sorted_distance_list = sorted(distance_list) 50 | neighbors = sorted_distance_list[:k] 51 | return neighbors 52 | 53 | 54 | def countClass(neighbors): 55 | # 对返回最近的K个已知实例,进行统计分类,根据少数服从多数,让未知实例归类为K个最邻近样本中最多数的类别。 56 | class_num_dict = {} 57 | for n in neighbors: 58 | if n[1] in class_num_dict: 59 | class_num_dict[n[1]] += 1 60 | else: 61 | class_num_dict[n[1]] = 1 62 | return class_num_dict 63 | 64 | def main(): 65 | trainingSet = createDataset() 66 | testSet = [[0,0],[1,1],[1.1,1.2]] 67 | result = [] 68 | for test in testSet: 69 | # 计算未知实例与所有已知实例的距离。返回最近的K个已知实例 70 | neighbors = getNeighbors(trainingSet,test,4) 71 | # 对返回最近的K个已知实例,进行统计分类。 72 | class_num_dict = countClass(neighbors) 73 | # 根据少数服从多数,让未知实例归类为K个最邻近样本中最多数的类别。 74 | result.append(sorted(class_num_dict.items(),key = lambda x:x[1],reverse=True)[0][0]) 75 | print(testSet) 76 | print(result) 77 | 78 | if __name__ == '__main__': 79 | main() -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Demi_YuHongJun 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python-training 2 | python-training 3 | 4 | ### create my.cnf for MYSQL 5 | 6 | 在mac上:MySQL的配置文件默认存放在/etc/my.cnf或者/etc/mysql/my.cnf: 7 | 8 | 但是mac中在/etc/ 可能不存在my.cnf 或/mysql/my.cnf: 可以自己创建; 9 | 10 | 我在/etc/ 中创建/mysql/my.cnf, 并输入内容如下: 11 | ``` 12 | [client] 13 | default-character-set = utf8 14 | 15 | [mysqld] 16 | default-storage-engine = INNODB 17 | character-set-server = utf8 18 | collation-server = utf8_general_ci 19 | ``` 20 | ### 命令行: 21 | ``` 22 | $ alias mysql=/usr/local/mysql/bin/mysql 23 | 24 | $ alias mysqladmin=/usr/local/mysql/bin/mysqladmin 25 | 26 | $ mysqladmin -u root -p password new_password 27 | 28 | mysql -u root -p 29 | 30 | mysql> show variables like '%char%'; 31 | 32 | 33 | +--------------------------+-----------------------------------------------------------+ 34 | | Variable_name | Value | 35 | +--------------------------+-----------------------------------------------------------+ 36 | | character_set_client | utf8 | 37 | | character_set_connection | utf8 | 38 | | character_set_database | utf8 | 39 | | character_set_filesystem | binary | 40 | | character_set_results | utf8 | 41 | | character_set_server | utf8 | 42 | | character_set_system | utf8 | 43 | | character_sets_dir | /usr/local/mysql-5.7.18-macos10.12-x86_64/share/charsets/ | 44 | +--------------------------+-----------------------------------------------------------+ 45 | 8 rows in set (0.00 sec) 46 | ``` 47 | 看到utf8字样就表示编码设置正确 48 | 49 | ### MySql 5.7中添加用户,新建数据库,用户授权,删除用户,修改密码 50 | 51 | #### 新建用户 52 | 创建test用户,密码是password。 53 | 54 | mysql -u root -p 55 | 56 | mysql> CREATE USER "www-data"@"localhost" IDENTIFIED BY "www-data"; #本地登录 57 | mysql> CREATE USER "www-data"@"%" IDENTIFIED BY "www-data"; #远程登录 58 | mysql> quit 59 | 60 | mysql -u test -p #测试是否创建成功 61 | 62 | #### 为用户授权 63 | 64 | 1. 登录MYSQL,这里以ROOT身份登录: 65 | ``` 66 | mysql -u root -p 67 | ``` 68 | 2. 为用户创建一个数据库(testDB): 69 | ``` 70 | create database testDB; 71 | create database testDB default charset utf8 collate utf8_general_ci; 72 | ``` 73 | 3. 授权test用户拥有testDB数据库的所有权限: 74 | 75 | 授权格式:grant 权限 on 数据库.* to 用户名@登录主机 identified by “密码”;密码可为空 76 | ``` 77 | grant all privileges on testDB.* to "test"@"localhost" identified by "password"; 78 | flush privileges; #刷新系统权限表 79 | ``` 80 | 4. 指定部分权限给用户: 81 | ``` 82 | grant all privileges on testDB.* to "test"@"localhost" identified by "password"; 83 | 84 | flush privileges; #刷新系统权限表 85 | ``` 86 | 5. 删除用户 87 | ``` 88 | drop user 用户名@'%'; 89 | drop user 'www-data'@'localhost'; 90 | ``` 91 | 6. 
修改指定用户密码 92 | ``` 93 | mysql -u root -p 94 | update mysql.user set authentication_string=password(“新密码”) where User="test" and Host="localhost"; 95 | flush privileges; 96 | ``` -------------------------------------------------------------------------------- /Untitled Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/Untitled Diagram.png -------------------------------------------------------------------------------- /Untitled Diagram.xml: -------------------------------------------------------------------------------- 1 | 3Z1Nc9s4EoZ/ja8uAuDnccZ2snvI1lYlVTlOMRZtayOLXkmexPvrV7RFW+hug6RFoJv0YSojy19PEy/ebjSAM3Nx//vzpny4+1IvqtWZjha/z8zlmdZFFO//27zw9PJCXOiXF243y8XLS+rtha/L/1WHF6PDq4/LRbW13rir69Vu+WC/eF2v19X17uW1w9eWm039y37bTb2yf+pDeVtZX9S88PW6XLW/x3ny9vr35WJ39/J6rtO31/9RLW/vDj9bqbR4+cyP8vrn7aZ+XB9+4pk2N88fL5++L9vvdfjJ27tyUf86eslcnZmLTV3vXv51//uiWjVwW3AvX/fpnc++/oGbar3r8wWpefmKv8vVY2V9j+3uqeWx3W3qn9VFvao3zy+Yy6ury0/7b/nnzXK1Onr9j8sLc1ng3+HwLf+uNrvq99FLh9/pc1XfV7vN0/4th88mBzyHByiPzqPjj8P3+/UWmvjwBXdHUXl9JA6Pw+3rj3kDsv/Hgck7fGLE56K+f1hVu6r5Gzf1f5oHUEfV7+r6cbes1wheuVrervf/XFU3+9/rz/Asi6EsjfYFM0EwTdLoRPm0FfDQxcpCo7LUApcV58XxR4bAAbQ694QxxWM2PY/ic6UlUCwsCjpKhFLMMMXzKJUBMcnsR9F+EiGhmI1hjhj2hnc6pDSxnzQnoxQxolQuGYVKgaic8ExdXFwlo+BKAJHEfqgKe2DmRbA5NosQr6v17XJdVZvl+na0CXU0kAX9aPXmaBR4iyesCmEdYbYdjSKYbbUTIpvCZdrDXDseQzDXGqEQiSTj1Kl2PFkEU21uMVSRjUhhuxwKIs5ETMDJFs4emY0JSFqGp1tvswdOKv59lJaV98t1eUpiNv58MRgdnDAyiiTI6uIRyBJ5xsv8EfC5c04TiJ1mG51ENjF0nvA9HwiihfMGBlodyg9oFdjIhaKF84k4qPLbXGIwI+IM3pfU5zhR+FLu9llC2bxr+1BdL2+W1+xqPxSXt3pbjjMAPTQBGF3BVebmwzXKcsLpZ+wKDqpncmgRlj5nV3A3LLb8J8fWPQn6TNkTmbIppXx6jq37kZ7Xm8VJBaDTweWngjOaHK6jcxRo1FWauGmxDUbCqRfcOq8KI5QW4dTD0+rQ+VSKzmOjnjLqvA0JMIhNMJkviPr+fx+XD/fNn6qjVflUP+5ECf1QcoF0vsB+3vD7eSj0AJdhW+Ftu46O059InNLLwYUtvY6lST1g0T57DLSwp89CYgIctM1Jg2WLcJ6+wJ7+6+NDtfl7uX0uy0TlerH/731VNZKPdYurKp+agQBhUT72xBN7++ZBYxb9tgD5Dq22L4RhWGJ3rxNuzQcrPoJoYXdv1HmUsEq+1k5aBZu9L7C9zxklHwxBYEHDLcCqCPv7P5fPb6lvml+h3vwUpPNDsQXSeRVhc89exIEyD2AlbENRtQvBJ7SUeJd5QbROb8DxLvOAVrscw0ELW/uiN6dg/ZsGPG32Yke4HRIqwo7/eyP5d/XDR1U/WO/mUIhwMmhb8V2tOGYMxkQWEA3NAsL1cioXVUYHpyIiPRha4wjXzhnHYjnixEEPTRy4WjrhIG8nIg6MOKNo90CyNHUCUCnXlKJwRvFOT2dbTnpbNr6u7/dMlltJHZ8Dufbq9xwDs8T+INjyAmi1Q4ZhtLbaxlsgRz2eRiouYj0hfCmuYy4AQ1GzrScohbOOdnwKaPAHGChQ/mYDqsO/akS+WTOOFss9lOWPw+7r6EddbhZy6k3DOQZTfyKnkKf+kBff+CRSBwEd/kYqLiJDCN9O26H+gBaj+BOJQNjHityVf+BkIsApXO+Q0jgV+FI/rl/6haLqrY+IS+9h79BgdN7a/5XG/l5Av1AHILYxqLG9N+ztQloqLMLccy8cg14hCIvYWBmMFuHtQ27ehfKe20cWgPZFheuK/uSdWDxYyur6Lway8qjnRJ//4FUA74IOABnGYYctuwkuUm5BFwSL3pMrS9IBLr72T6UJxx5yVy6Q9AwcEWViMvEJIekGO/Zv1VZUg3+mB8KCDf7GFzuJFXkVO2kx+iqDLXuzZ5dX4eG5gZJ40bt2mTVeuXC9FgQ5cBG2vf/GXbS+/en5Y3ztB63rJuWTfqJSX17/LG/HPMdtNIz5iRjRpOCrNdScvr/X48OoaAv7DkZO9SMq+Cc3rYw4qJ3b6iRx9NH8MxrHjtkllTO7EBlE/+3CPrvQbFmEmUXeSdDb/BLj1OLTslotRptbRoMI5xY9lCGcXHzNLTGxRqCGphzBOkvR7NINMtRwjolcZOhJCcFaS9HsIogjzlEG70kL11qqnBjxYl8wikTqwrc/2aYUR12U/E0gOEE5TCDRw3NL0XNb6fas7StdLG9PyV1O51icxBG2EvmiivMTAVuUM+OkpfnGpsQTiMTCkthHBE5bgLQYJ08ip+Dbo6zgxRWxbWpf850Q0p/g3OGi/LFqrvFZlU+SRB6sEfaAFkjnE5wrxPJ0HuJiG4wJzgj4G4rEwiJsP/fKBJR5AItv+11CuPv+W5THlvkoi4HMgyWcJKTMY4f/z/V2V65W7Y4xWWofZfFQdlDtNbkJeQyWxLoDfwMp3BaLiPENS2zsY+7VaLD3UBAswtizt5CCGxMgLbaDmlSCjf3rFaIchw8BwU/5BD99Z4PAmfSdYaAE24MhFH5fXUgp9vlD15s9qL6bFtvIbJd2jkcmf5dpJpUWsW+A3+cbJy0+1SduH27/WBbVB4oFVjuIC+b8qf77+wYmJvvdEIPJPrb7Aso7sRNXxrfERlwgLMDuZ2JxEX6/kKb8NgxDHPsbDBdh+IOCAn0X9nOVRJ2gvEk/cV0w+/4CUMEHXSo9aAGNz1Nf/T7ErcCaXeTbbQ/v8GIbhMTtv00Fn1
fiTSwU1giNO6MLvHGyYoRFOHu+63yVATuCE93xUPlTd2zs5ch6MQxSOFHHzl2OmHdRCjbksF+XI+JiIGGXLka8ASPNB4nw5mFv5LV7UEGXTRKTj1oI9SZu6BWj3qBXt5sSlO/c1zorcU8vn3y/VqDe4cSnTcQNvYwCrqVSwjacT8G1W5n49rARV/IGvZMXNkqCqS7lU3DB/hs0RnZTCtQXSdy/y6nfTkiMwiTKf0eZVEySHDjU71SMfhMWPORdux2N7kneBcqbgBO37YoR8IGMAsk3ca2uHPkGkPg6N4jbdAXJtxxMku03oFTwUcL2OyClFFCyIHUyoo53TMah0sN7e6TiOqiQkwqru86UUCqsbjozQqmcYJ7RoQGX8ZUah1bSm1ZGHBDuERd20RJw9ZfnoLg0cdFtyDHXX58DY2H1zzk4ISaVw4XVMeemNxf1erlJGC6sFjlPxHJhNcV5JpbLCbbY3yyVg9pGLofXCYbZH69C9ecVtyuiYXixWunCiOXCWokuErFcTnDNI3BxXmkFCzi4qOqPC3Fra0gu/fU4aE6qiVtWA2JRUX/dDczlBHvsb5pSUX89DszrBNvsk5dYnSZuVA068PoLtcpjvGLhEQxrnVlFA5xzYDCspeZ9miAWDKtFVqp3uTk0F1aLrFTvwvKbmw/DRWRheZ+GC+WlT7DOPnn1LjmH5sXrqXXvRcHQXE7w1GNw6a3TOmn3VobhwlpyVrq/TgfmwmuRdX89DsyF1yHDvjlHLT40GJG1ZWX6LwLqgug09AjsBOfsE1j/1cHQwHgttem/PBgaDGvZWZn+64OBwRAXjgYF01+siW0FHrHwemR4c48YLLwWOe4vvGGxyKwuxwO6NcKWfojLQUUAGyDTYXNT4nLQoANvgKcODIa36pwM8M6BwfBWnROpUs3rkJP+kmzSFG/K9QiG1yHDu+HkgCHuzZQwVyUDnHNgYCd4Z4/A0v4LhIGBIVzfyu3P/Sv/Ku8rRA7waeQzTal9qIdLgdvPfD/8JbphXK931Dub178efpQagXm7nfb1qi8E1VBU4xGg9nDkUyKZdJKk9kSPAZK4LWcvNdEfD5v9f4nTCadENXW239jnrURKYU1QvnaiY7v/Zcak027SlE6MARqnD9+mDdrZOMUIGqcj36cN2tmIxQgaJzjTfqIzp29jBI0Tpk/TBu1M2QHoKO9XLxxlqy3OwL5Om7SzCsBJGqduEyftLCtwksZZ3+Xj5nDN4oSBw5WrzCYOPHVE5Nk5fAsOAPwuaoR44ITx667c7KYdDKdHkRsLnHNefftj0pGA5c3CFYlWhfkDgdPQprAyj+Q/GzI4gib/xE6KaWf/btSMFp7YmjHtZAke6CGHNE5Lp53/wyNC5JDGeenEn2mptUNi48m0KwDuY0wYsyViy8q081J4AIoc0jgvnTZpeHSKHNLElV/RXMy1+2CWHpmnN3NNLGhO21y7UcMHvGdjwyikcfY4bSPiPlWHkzROGKdtrt3n9HCSnlvC6D75h5M0Thinba7RYUJuJ9JzF9AoqHHGOG3Ph84hEoOa2G41ddSD1jJDosY54168oy/l0/Ttdcf5SNBfE9S9+Wtii9i0/XUHa84nHGeQ0zYjHadbcaLGWeO0HXbHeVmcqOeWNsITuAShxnnjxD22GuRG8Dq7N9I4b5y471OD1goCksZ549RJD1orCEgap43N4ttMHLZ2zo+cDpvYkDhxh+1mDZ/wcDUoYifjxK2IHmRFApLGWePE/TW85VQM6dkljXpQfh6QNM4ZJ+6u3UfrIScSEDXOGSdu+tyH8nGixjnj1FEPWiwIiRonjc3fMhODDQ8LhAYbYfZnqHHKOHFD7WbLlzLGc+tNRSc7SiGd4CRx4oYaHhUphvTskkS4h08MaZwkTtxQuw+0TIrz4vij3/Fxo5AmdkNOnLRzYYCRNE4Sp07auTDASJrYzJjPxk0P2uob9iijue1m7GDNuCEpmVt3KjpfVQ7que1nRCe2ykE9u6QRngIrBnU6tx2NHefKMm7/SnHaOHHj5z6RlhM1zhsnjhqeZSsGNbWQi1Bv78qH5p/3v2835cPd+c2q/nV9V2525w+b+rrabs/QMcBp2vwhVDhe/kQyHPVDeb3cNVDaatioZ1gmFnQV2dSxaBfAhRNnuCg7O8rNGBHpczkFiEi52dS/tuc/9rD+2jS/3l/Pr+C4APrtZ9b1uiJDslhuquvnQ67M5bZ+3L8+QlxcF6EpbSNN8WHz9liKTNv1cRwYH3HpcwfG8JFycTEsIn4HiQHRAA84cf2a/Y48bwPmfZj0uWFjeuFwLk8ooEht/mgdK229Q4eKRp9rPSYXDfdRpgrM28S0zTY4+lwmMrlwwCN8bBelI/KzR9HIjI06CRSMPleYTC4YMFMDM8c+HFY0iAIm19jA5Yl5+SuoWmAKsZHGRCM0j78iVsBnFhdbvqLMDkwGssBCTGBw5WNegYFSBpSMzt0FxMVP6s7tucAkX4BRAtYjcTcDTOwVEQ0f8wqxBj+vYZK7LrxXOY1UwDDxk7efWOHaP6UjhASWuKyQwKGCk0WmChexsj8D4crbaw/aeSMF8wZOD6klfC/A/WTnEoq8iDoskeDEI0l7YB+D+jyTcJBA2PMzOPhcFXgayJmm51lm4YV2RcOAkohGwXhv7dp7ND6QhE8gGoXLuxq7JBJRN3jbc3be5oi+w0H0EcwgHHCnoe1YDSyVo2jYwchC1QuJToM5BAPscTFgcEAlQtGwUafKhArHPNNsOxvT9tiII2rgHAuVTTrLA80b6QeSbPnBgEcC25XBGJTScQs2mDUCFdLTWS6Iw2M/jd02EsedWZ2xw5GG0qlZZtVwG1l7sFsbDpBkU+e6gGgUgcIxyxVxuNesrUG34cjJsXMcDZt1nIVyuLNMxpsmcCsASWfGF6rmlPrJtyXUnDJQczL2U68zoujRo+aUZaNwn/vyNrheSNsTAkz5qPUhlmWIbO7L27AUZTdLwaNoiMIgV2DmvrxdFECulNhQfCDVnlQo1KsMtLGwMj+T4tyOKxRzX81u6lF2KMC8ImdUfCDPnlQoOi6fApUqvIzNFZcPJNyTigsqhYC7wUE/SCGmMyr7QO49rcjAqgiIDCi1i4nLB5LwacUlBqkhOFzDZERFhCkWH8jPpxUL9wZ5eCgEVTkMEZn9/27qenf0uc8N6y/1omre8X8= -------------------------------------------------------------------------------- /com.xml: -------------------------------------------------------------------------------- 1 | 
7V3fc6M2EP5r/NgMkkDAY+Ik14fr9aZ5aN8yCshGDUaukGOnf32FQTZYikvujKMMdmYysPyS9tOHVrsrMUHTxeaLIMvsN57SfAK9dDNBtxMIY89X/yvBay3wY1gL5oKltQjsBQ/sX9oIvUa6YiktOydKznPJll1hwouCJrIjI0Lwdfe0Gc+7T12SOTUEDwnJtfQq2Mv/ZKnMGjnA8f7Ar5TNs+bhEcT1gSeSPM8FXxXNEwte0PrIgujbNLUsM5LydUuE7iZoKjiX9dZiM6V5pVetM32dfNUFnaCbTC5ytQPU5vbw/RsXgz4Xq3oJWsj24966HyZhGs0QpB5KYxR6v+DmCS8kXzVP+H0lS4WkEn6jcs3Fc2kUYp0xSR+WJKn216o1dUs1Y3k+5TkX27PRLEpoklRyXsim3QBf7ZdS8Geqz9wqHd28UCGZQvU6Z/NCiSVfWiuqC65Op5uWqKn4F8oXVIpXdYo+6kVNC28aOIwb+Nf7xuLHqJZlrXaCcAMiaZrofHfzva7VRqPuvqr3DNXfFZKKpWBlpf0k56v0J1WfBjRK/Q9XvapvR/Mh9g3NYw9aNB/6A2g+iA3NT5kUbKNkf9HierlUGw9UqAqqjXsiFhOIc1Wkm5S9qM15tblmquTqNcUEXZM8L/UpqjSts34Sv7vb6u/D8QviuINfHBnwwcgz4QMeGII5QWTg9wet3lmFZKS68rtgL0Se8BWWEhrNHHiFhV0eRRgZQCBgAQKBaAgcwuNvsBMp//7+1g89JZ8LkjKlvQN1fzQm/8sN5EUmJDgaghoIG8qmqTKTmt1GGVtTh1b38LqaLyURO11Wx2iRtnf/plK+NgKyklyJuJAZn/OC5F95pdMawQNI1G2uKxNvXwQluWdVvbZFqCHTRhs8hkzJVyKhb1Rf251EzKl865ymS6n0chRgQXMi2UvX6DwpWqbxNc2rBm5gqFvvV/JE8++8ZJLxqhU/cSn5QqmLlMvapp6xTQVsG1R707eyRFm3y+qRi828GiNcJdtu8SonS3XRIzS5ifENvL+3cFMd8zyMp1NLazgB61DgXflx69flIAwNEqphgEnCCOtBw2lZGIyahbAHC2NnSAgvJPxhEsIjJMR+/LEk9EdNQtSnK/ScYSEyWJg4ycKUls/1Va6QEAdXbQ7GXSeHHyHr4RYpo9jkZDzEeAGhUTPS78NI4Awj/U/SL342RiLoXXntn+mFPB8j4agZGfRhJHSGkcGFkT/KSGilXMNIEGB3GBmOmpG4DyPdceDgT8LIZUocYmOMroIjbPRwYPAPWZzaeJBxSDxq/kU9+AdPbqM2l37nrKKObifhQdQw8nH3FnUpm6sOMN8V48eagRlhco3SG1qQ5fKxrMOUzpAbI9w1frsQAjNgFdpcQvEA1PbNVIwxUTvuQ213HEJmjP7CwJ4MDDvWLOgw0OaVPR8DzXymMTFQu1uPW7fuREaAmaF04eAJekE/Mp2w5+PguF0+2sF6nIOROxw0cwQuHDwBB5E2dT6Eg+MOhGiX6nEOhu5wEF44OIQtCgLT0Xo2DgbjHg2CXvkB2BkOBiMfOvSCK3AHrpFbmX3ggg6xa+Rpw73gcoddwMyWcs0gUW1FumSJxCHq2B5BZKYIB4Fpe6CBkhOjcROuTy6UQ4NwMxfKNb6pAUCdMPlYvhZJJnihYLXwb7r9vcW/6tc6dsuEKnJdD4VnBfr5R+px2G+UEA+CvJlz8yd9muynB/ZsBTWg72kCu3bznlaQkCSjpx0F5gc1SWhRv9hJU9Cd4BRvaR3611N0Q2hiDyzYh4O8pYGZ33H77cE16icZKUtWutPXAgAPwr6BOSkOoB1mnSmj4TBQ+uOeDaC9acfHj8idHtecWuoa7fTMb3d4FwZd2ln9axbO6TUPTku4cc+BA30yqVwinPsZT+4TzhpUOhvhRu7D6ZPf5BLh3E9wcp9w1kyKsxFu3Kn6sFc2kzuEg+5nM7lPOGv64NkIN26n6W5Vvc9COPdTl9wnnDVj/lyEg+7HmTJVzUc+m7GKNq6ACL3uzJU4MGc02dZLC4JBfF/Q/fBF9s8j3S8d5g6QsAsk1HkNLSD1MqmdjKUhpoZCMxbxQJOVqJZae2c44lzTDgXfvDqXlAZ87+Ad69lyQW3e6XiQ5QzHnQ0K+8z51QEFB8yaYNyRhH5wuRO7D8bth+4Hlzu51tAMubrWq5VrJpPMof4MhN2oeeCZ4VableKDYcxNM3b3zlURzpQtITNWPDbLjH3CZAngH0yQ9y1rz9qyJcJBiGsGkC6oD4E6Ogw6hRYf+NlQN6MYF9SHQB163fxVhD6Q69oJPFKbCn2yyAcyIx91EmM1Mq8cZ44ZWGv69KiL5oyVFWsLSvv0gOk0wJaPhwzy7RDkvmtdPc8h9AyfTxybLlmg3bYH37AYAEAMzX5Tf4Kk+WyC2roWSfWlhERunXwH+Kqay7cwaWu9EbW/r7DXL/T1fnNjG/gLlqbVwUF6Nv9wYrZJqwhbOjb//bxSu/sPNdWrRO2/hIXu/gM= -------------------------------------------------------------------------------- /data/iris.data.csv: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 
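The rows above are data/iris.data.csv, the file that KNN/knn.ipynb opens; that notebook's traceback ("iterator should return strings, not bytes") is the usual Python 3 symptom of handing csv.reader a binary-mode handle. A sketch of the standard fix, using the repository-relative path rather than the absolute one in the notebook:

```
# Sketch: reading data/iris.data.csv under Python 3. csv.reader needs a text-mode
# handle, which is what the notebook's "iterator should return strings, not bytes"
# error is complaining about.
import csv

with open("data/iris.data.csv", "r", newline='') as csvfile:   # "r", not "rb"
    lines = csv.reader(csvfile)
    dataset = [row for row in lines if row]   # drop the trailing blank line
print(len(dataset), dataset[0])   # 150 rows; first row like ['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
```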
5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 
6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica 151 | 152 | -------------------------------------------------------------------------------- /kaggle/Chapter_1.1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "df_train = pd.read_csv('../Datasets/Breast-Cancer/breast-cancer-train.csv')\n", 23 | "\n", 24 | "df_test = pd.read_csv('../Datasets/Breast-Cancer/breast-cancer-test.csv')\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "df_test_negative = df_test.loc[df_test['Type'] == 0][['Clump Thickness', 'Cell Size']]\n", 36 | "df_test_positive = df_test.loc[df_test['Type'] == 1][['Clump Thickness', 'Cell Size']]" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "import matplotlib.pyplot as plt\n", 48 | "\n", 49 | "plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'], marker = 'o', s=200, c='red')\n", 50 | "plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=150, c='black')\n", 51 | "\n", 52 | "plt.xlabel('Clump Thickness')\n", 53 | "plt.ylabel('Cell Size')\n", 54 | "\n", 55 | "plt.show()\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 5, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "import numpy as np\n", 67 | "\n", 68 | "intercept = np.random.random([1])\n", 69 | "coef = np.random.random([2])\n", 70 | "\n", 71 | "lx = np.arange(0, 12)\n", 72 | "ly = (-intercept - lx * coef[0]) / coef[1]\n", 73 | "\n", 74 | "plt.plot(lx, ly, c='yellow')\n", 75 | "\n", 76 | "\n", 77 | "plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'], marker = 'o', s=200, c='red')\n", 78 | "plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=150, c='black')\n", 79 | "plt.xlabel('Clump Thickness')\n", 80 | "plt.ylabel('Cell Size')\n", 81 | "plt.show()\n", 82 | "\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "metadata": { 89 | 
"collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "Testing accuracy (10 training samples): 0.868571428571\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "from sklearn.linear_model import LogisticRegression\n", 102 | "lr = LogisticRegression()\n", 103 | "\n", 104 | "lr.fit(df_train[['Clump Thickness', 'Cell Size']][:10], df_train['Type'][:10])\n", 105 | "print 'Testing accuracy (10 training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type'])\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "intercept = lr.intercept_\n", 117 | "coef = lr.coef_[0, :]\n", 118 | "\n", 119 | "ly = (-intercept - lx * coef[0]) / coef[1]\n", 120 | "\n", 121 | "plt.plot(lx, ly, c='green')\n", 122 | "plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'], marker = 'o', s=200, c='red')\n", 123 | "plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=150, c='black')\n", 124 | "plt.xlabel('Clump Thickness')\n", 125 | "plt.ylabel('Cell Size')\n", 126 | "plt.show()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "Testing accuracy (all training samples): 0.937142857143\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "lr = LogisticRegression()\n", 146 | "\n", 147 | "lr.fit(df_train[['Clump Thickness', 'Cell Size']], df_train['Type'])\n", 148 | "print 'Testing accuracy (all training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type'])\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "intercept = lr.intercept_\n", 160 | "coef = lr.coef_[0, :]\n", 161 | "ly = (-intercept - lx * coef[0]) / coef[1]\n", 162 | "\n", 163 | "plt.plot(lx, ly, c='blue')\n", 164 | "plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'], marker = 'o', s=200, c='red')\n", 165 | "plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=150, c='black')\n", 166 | "plt.xlabel('Clump Thickness')\n", 167 | "plt.ylabel('Cell Size')\n", 168 | "plt.show()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "Python 2", 184 | "language": "python", 185 | "name": "python2" 186 | }, 187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 2 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython2", 197 | "version": "2.7.11" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 0 202 | } 203 | -------------------------------------------------------------------------------- /kaggle/Datasets/Breast-Cancer/breast-cancer-test.csv: -------------------------------------------------------------------------------- 1 | ,Clump Thickness,Cell Size,Type 2 | 158,1,2,0 3 | 499,1,1,0 4 | 396,1,1,0 5 | 155,5,5,1 6 | 
321,1,1,0 7 | 212,1,1,0 8 | 234,3,2,0 9 | 289,6,6,1 10 | 300,4,10,1 11 | 356,3,3,1 12 | 672,1,1,0 13 | 328,10,3,1 14 | 199,1,1,0 15 | 78,1,1,0 16 | 598,1,1,0 17 | 569,10,8,1 18 | 446,1,1,0 19 | 506,10,10,1 20 | 626,6,6,1 21 | 603,4,6,1 22 | 360,10,10,1 23 | 338,1,1,0 24 | 668,7,4,1 25 | 290,1,1,0 26 | 284,4,5,1 27 | 331,1,1,0 28 | 477,1,1,0 29 | 54,5,5,1 30 | 248,1,1,0 31 | 223,5,6,1 32 | 133,1,1,0 33 | 640,1,1,0 34 | 136,1,1,0 35 | 109,5,4,1 36 | 181,1,1,0 37 | 432,1,1,0 38 | 554,1,1,0 39 | 482,10,10,1 40 | 516,1,1,0 41 | 132,10,8,1 42 | 176,1,1,0 43 | 72,3,3,0 44 | 254,10,10,1 45 | 577,1,1,0 46 | 649,1,1,0 47 | 595,1,1,0 48 | 666,2,2,0 49 | 352,4,5,0 50 | 76,1,4,0 51 | 148,1,1,0 52 | 346,2,2,0 53 | 90,1,1,0 54 | 681,10,10,1 55 | 10,1,1,0 56 | 63,3,4,1 57 | 635,1,4,0 58 | 656,1,1,0 59 | 174,6,5,1 60 | 256,1,1,0 61 | 667,1,1,0 62 | 31,1,1,0 63 | 369,1,3,0 64 | 570,10,4,1 65 | 77,3,1,0 66 | 532,1,1,0 67 | 548,1,1,0 68 | 211,10,8,1 69 | 55,6,6,1 70 | 135,1,1,0 71 | 671,2,1,0 72 | 340,3,3,1 73 | 2,1,1,0 74 | 227,9,9,1 75 | 81,1,1,0 76 | 473,1,1,0 77 | 694,1,1,0 78 | 665,1,1,0 79 | 604,3,2,1 80 | 120,1,2,0 81 | 311,1,1,0 82 | 204,1,1,0 83 | 244,1,1,0 84 | 686,1,1,0 85 | 271,1,1,0 86 | 131,1,1,0 87 | 680,10,10,1 88 | 60,3,5,1 89 | 310,1,1,0 90 | 30,1,1,0 91 | 69,1,1,0 92 | 651,2,1,0 93 | 390,1,1,0 94 | 44,10,10,1 95 | 625,1,3,0 96 | 70,1,3,0 97 | 515,10,4,1 98 | 654,1,1,0 99 | 249,1,1,0 100 | 209,1,1,0 101 | 165,1,1,0 102 | 470,1,1,0 103 | 164,1,1,0 104 | 507,1,1,0 105 | 323,4,6,1 106 | 65,4,2,1 107 | 409,1,2,0 108 | 49,8,7,1 109 | 118,1,1,0 110 | 192,1,1,0 111 | 39,5,3,1 112 | 259,7,7,0 113 | 422,3,3,0 114 | 6,1,1,0 115 | 101,3,4,1 116 | 542,3,1,0 117 | 299,1,2,1 118 | 395,1,1,0 119 | 501,1,1,0 120 | 318,1,1,0 121 | 145,1,3,0 122 | 486,1,1,0 123 | 353,7,10,1 124 | 208,1,1,0 125 | 695,1,1,0 126 | 361,10,3,1 127 | 86,3,6,1 128 | 664,1,1,0 129 | 481,3,2,0 130 | 633,7,4,1 131 | 41,4,3,1 132 | 108,1,1,0 133 | 690,1,1,0 134 | 56,10,10,1 135 | 424,1,1,0 136 | 514,6,7,1 137 | 24,1,1,0 138 | 218,10,7,1 139 | 431,1,1,0 140 | 281,1,1,0 141 | 110,3,1,0 142 | 82,2,1,0 143 | 51,3,3,1 144 | 220,1,1,0 145 | 559,1,1,0 146 | 544,1,3,0 147 | 302,10,10,1 148 | 552,2,2,0 149 | 215,7,8,1 150 | 235,1,4,0 151 | 18,7,7,1 152 | 250,2,2,0 153 | 260,5,8,1 154 | 430,3,1,0 155 | 264,9,4,1 156 | 61,1,1,0 157 | 213,10,10,1 158 | 377,1,1,0 159 | 29,1,3,0 160 | 182,1,1,0 161 | 306,1,1,0 162 | 388,1,1,0 163 | 329,4,6,1 164 | 437,1,1,0 165 | 296,3,4,0 166 | 584,1,1,0 167 | 342,1,1,0 168 | 436,10,10,1 169 | 579,1,1,0 170 | 326,1,1,1 171 | 362,2,2,0 172 | 617,1,1,0 173 | 578,1,1,0 174 | 231,8,7,1 175 | 336,5,5,1 176 | 655,1,1,0 177 | -------------------------------------------------------------------------------- /scripts/consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' 5 | 6 | from kafka import KafkaConsumer 7 | 8 | consumer = KafkaConsumer('result') 9 | for msg in consumer: 10 | print((msg.value).decode('utf8')) -------------------------------------------------------------------------------- /scripts/producer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' 5 | 6 | import csv 7 | import time 8 | from kafka import KafkaProducer 9 | 10 | # 实例化一个KafkaProducer示例,用于向Kafka投递消息 11 | producer = KafkaProducer(bootstrap_servers='127.0.0.1:9092') 12 | # 打开数据文件 13 | csvfile = 
open("../data/user_log.csv", "r") 14 | # 生成一个可用于读取csv文件的reader 15 | reader = csv.reader(csvfile) 16 | 17 | for line in reader: 18 | gender = line[9] # 性别在每行日志代码的第9个元素 19 | if gender == 'gender': 20 | continue # 去除第一行表头 21 | time.sleep(0.1) # 每隔0.1秒发送一行数据 22 | # 发送数据,topic为'sex' 23 | producer.send('sex', line[9].encode('utf8')) -------------------------------------------------------------------------------- /work_one/Asyncio_hello.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | # 从Python 3.5开始 可以让coroutine的代码更简洁易读 3 | # 把@asyncio.coroutine替换为async; 4 | # 把yield from替换为await。 5 | 6 | # @asyncio.coroutine 7 | async def wget(host): 8 | print('wget %s...' % host) 9 | connect = asyncio.open_connection(host, 80) 10 | # reader, writer = yield from connect 11 | reader, writer = await connect 12 | header = 'GET / HTTP/1.0\r\nHost: %s\r\n\r\n' % host 13 | writer.write(header.encode('utf-8')) 14 | # yield from writer.drain() 15 | await writer.drain() 16 | while True: 17 | # line = yield from reader.readline() 18 | line = await reader.readline() 19 | if line == b'\r\n': 20 | break 21 | print('%s header > %s' % (host, line.decode('utf-8').rstrip())) 22 | # Ignore the body, close the socket 23 | writer.close() 24 | 25 | 26 | loop = asyncio.get_event_loop() 27 | tasks = [wget(host) for host in ['www.sina.com.cn', 'www.sohu.com', 'www.163.com']] 28 | loop.run_until_complete(asyncio.wait(tasks)) 29 | loop.close() 30 | -------------------------------------------------------------------------------- /work_one/FLASK_app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, render_template 2 | 3 | app = Flask(__name__) 4 | 5 | @app.route('/', methods=['GET', 'POST']) 6 | def home(): 7 | return render_template('home.html') 8 | 9 | @app.route('/signin', methods=['GET']) 10 | def signin_form(): 11 | return render_template('form.html') 12 | 13 | @app.route('/signin', methods=['POST']) 14 | def signin(): 15 | username = request.form['username'] 16 | password = request.form['password'] 17 | if username=='admin' and password=='password': 18 | return render_template('signin-ok.html', username=username) 19 | return render_template('form.html', message='Bad username or password', username=username) 20 | 21 | if __name__ == '__main__': 22 | app.run() -------------------------------------------------------------------------------- /work_one/IO.py: -------------------------------------------------------------------------------- 1 | # from io import StringIO 2 | # 3 | # f = StringIO('Hello!\nHi!\nGoodbye!') 4 | # while True: 5 | # s = f.readline() 6 | # if s == '': 7 | # break 8 | # print(s) 9 | 10 | # 最后看看如何利用Python的特性来过滤文件。比如我们要列出当前目录下的所有目录,只需要一行代码: 11 | # import os 12 | # [x for x in os.listdir('.') if os.path.isdir(x)] 13 | 14 | # 要列出所有的.py文件,也只需一行代码: 15 | # [x for x in os.listdir('.') if os.path.isfile(x) and os.path.splitext(x)[1]=='.py'] 16 | 17 | #编写一个程序,能在当前目录以及当前目录的所有子目录下查找文件名包含指定字符串的文件,并打印出相对路径 18 | # import os 19 | # def search(dir, text): 20 | # for x in os.listdir(dir): 21 | # if os.path.isfile(os.path.join(dir,x)): 22 | # if text in os.path.splitext(x)[0]: 23 | # print('%s, %s'% (dir, x)) 24 | # if os.path.isdir(os.path.join(dir,x)): 25 | # search(os.path.join(dir, x),text) 26 | # 27 | # print(os.path.abspath('.')) 28 | # search('/Users/yuhongjun/reactNativeWorkSpace/YuHongJun.github.io' , 'feedtest2') 29 | # 30 | # import os 31 | # 32 | # print('Process (%s) start...' 
% os.getpid()) 33 | # # Only works on Unix/Linux/Mac: 34 | # pid = os.fork() 35 | # if pid == 0: 36 | # print('I am child process (%s) and my.cnf parent is %s.' % (os.getpid(), os.getppid())) 37 | # else: 38 | # print('I (%s) just created a child process (%s).' % (os.getpid(), pid)) 39 | 40 | # import subprocess 41 | # 42 | # print('$ nslookup www.python.org') 43 | # r = subprocess.call(['nslookup', 'www.python.org']) 44 | # print('Exit code:', r) 45 | 46 | # from multiprocessing import Process, Queue 47 | # import os, time, random 48 | # 49 | # # 写数据进程执行的代码: 50 | # def write(q): 51 | # print('Process to write: %s' % os.getpid()) 52 | # for value in ['A', 'B', 'C']: 53 | # print('Put %s to queue...' % value) 54 | # q.put(value) 55 | # time.sleep(random.random()) 56 | # 57 | # # 读数据进程执行的代码: 58 | # def read(q): 59 | # print('Process to read: %s' % os.getpid()) 60 | # while True: 61 | # value = q.get(True) 62 | # print('Get %s from queue.' % value) 63 | # 64 | # if __name__=='__main__': 65 | # # 父进程创建Queue,并传给各个子进程: 66 | # q = Queue() 67 | # pw = Process(target=write, args=(q,)) 68 | # pr = Process(target=read, args=(q,)) 69 | # # 启动子进程pw,写入: 70 | # pw.start() 71 | # # 启动子进程pr,读取: 72 | # pr.start() 73 | # # 等待pw结束: 74 | # pw.join() 75 | # # pr进程里是死循环,无法等待其结束,只能强行终止: 76 | # pr.terminate() 77 | 78 | # import time, threading 79 | # 80 | # # 新线程执行的代码: 81 | # def loop(): 82 | # print('thread %s is running...' % threading.current_thread().name) 83 | # n = 0 84 | # while n < 5: 85 | # n = n + 1 86 | # print('thread %s >>> %s' % (threading.current_thread().name, n)) 87 | # time.sleep(1) 88 | # print('thread %s ended.' % threading.current_thread().name) 89 | # 90 | # print('thread %s is running...' % threading.current_thread().name) 91 | # t = threading.Thread(target=loop, name='LoopThread') 92 | # t.start() 93 | # t.join() 94 | # print('thread %s ended.' 
% threading.current_thread().name) 95 | # 96 | # import time, threading 97 | # balance = 0 98 | # lock = threading.Lock() 99 | # 100 | # def change_it(n): 101 | # # 先存后取,结果应该为0: 102 | # global balance 103 | # balance = balance + n 104 | # balance = balance - n 105 | # 106 | # def run_thread(n): 107 | # for i in range(100000): 108 | # # 先要获取锁: 109 | # lock.acquire() 110 | # try: 111 | # # 放心地改吧: 112 | # change_it(n) 113 | # finally: 114 | # # 改完了一定要释放锁: 115 | # lock.release() 116 | 117 | # import re 118 | # re_mail=re.compile(r'^(.+)@([a-zA-Z0-9]+)\.([a-zA-Z0-9]{2,3}|[0-9]{1,3})$') 119 | # a=re_mail.match('someone@gmail.com').groups() 120 | # print(a) 121 | 122 | # from datetime import datetime 123 | # now=datetime.now() 124 | # print(now) 125 | # print(type(now)) 126 | # 127 | # dt=datetime(2017,8,17,12,12) 128 | # tt=dt.timestamp() 129 | # 130 | # print(datetime.fromtimestamp(tt)) # 本地时间 131 | # print(datetime.utcfromtimestamp(tt)) # UTC时间 132 | 133 | # from datetime import datetime 134 | # cday = datetime.strptime('2017-8-17 12:12:12','%Y-%m-%d %H:%M:%S') 135 | # print(cday) 136 | # 137 | # now=datetime.now() 138 | # print(now.strftime('%a,%b %d %H:%M')) 139 | 140 | import re 141 | from datetime import datetime, timezone, timedelta 142 | 143 | def to_timestamp(dt_str, tz_str): 144 | dt = datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S') 145 | print(dt) 146 | tz_info = re.split(r'[UTC\:]+',tz_str) 147 | 148 | print(tz_info) 149 | tz_hours = int(tz_info[1]) 150 | print(tz_hours) 151 | 152 | tz_minutes = int(tz_info[2]) 153 | print(tz_minutes) 154 | 155 | dt = dt.replace(tzinfo = timezone(timedelta(hours=tz_hours, minutes=tz_minutes))) 156 | return dt.timestamp() 157 | 158 | # 测试: 159 | 160 | t1 = to_timestamp('2015-6-1 08:10:30', 'UTC+7:00') 161 | assert t1 == 1433121030.0, t1 162 | 163 | t2 = to_timestamp('2015-5-31 16:10:30', 'UTC-09:00') 164 | assert t2 == 1433121030.0, t2 165 | 166 | print('Pass') -------------------------------------------------------------------------------- /work_one/WSGI_hello.py: -------------------------------------------------------------------------------- 1 | # hello.py 2 | def application(environ, start_response): 3 | start_response('200 OK',[('Content-Type','text/html')]) 4 | body='

<h1>Hello, %s!</h1>
' % (environ['PATH_INFO'][1:] or 'web') 5 | return [body.encode('utf-8')] -------------------------------------------------------------------------------- /work_one/WSGI_server.py: -------------------------------------------------------------------------------- 1 | from wsgiref.simple_server import make_server 2 | from WSGI_hello import application 3 | # 创建一个服务器,IP地址为空,端口是8000,处理函数是application: 4 | httpd= make_server('',8000,application) 5 | print('Serving HTTP on port 8000...') 6 | # 开始监听HTTP请求: 7 | httpd.serve_forever() -------------------------------------------------------------------------------- /work_one/code.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/code.jpg -------------------------------------------------------------------------------- /work_one/distributed/task_master.py: -------------------------------------------------------------------------------- 1 | import random, time, queue 2 | from multiprocessing.managers import BaseManager 3 | 4 | # 发送任务的队列: 5 | task_queue = queue.Queue() 6 | # 接收结果的队列: 7 | result_queue = queue.Queue() 8 | 9 | # 从BaseManager继承的QueueManager: 10 | class QueueManager(BaseManager): 11 | pass 12 | 13 | # 把两个Queue都注册到网络上, callable参数关联了Queue对象: 14 | QueueManager.register('get_task_queue', callable=lambda: task_queue) 15 | QueueManager.register('get_result_queue', callable=lambda: result_queue) 16 | # 绑定端口5000, 设置验证码'abc': 17 | manager = QueueManager(address=('', 5000), authkey=b'abc') 18 | # 启动Queue: 19 | manager.start() 20 | # 获得通过网络访问的Queue对象: 21 | task = manager.get_task_queue() 22 | result = manager.get_result_queue() 23 | # 放几个任务进去: 24 | for i in range(10): 25 | n = random.randint(0, 10000) 26 | print('Put task %d...' % n) 27 | task.put(n) 28 | # 从result队列读取结果: 29 | print('Try get results...') 30 | for i in range(10): 31 | r = result.get(timeout=10) 32 | print('Result: %s' % r) 33 | # 关闭: 34 | manager.shutdown() 35 | print('master exit.') -------------------------------------------------------------------------------- /work_one/distributed/task_worker.py: -------------------------------------------------------------------------------- 1 | import time, sys, queue 2 | from multiprocessing.managers import BaseManager 3 | 4 | # 创建类似的QueueManager: 5 | class QueueManager(BaseManager): 6 | pass 7 | 8 | # 由于这个QueueManager只从网络上获取Queue,所以注册时只提供名字: 9 | QueueManager.register('get_task_queue') 10 | QueueManager.register('get_result_queue') 11 | 12 | # 连接到服务器,也就是运行task_master.py的机器: 13 | server_addr = '127.0.0.1' 14 | print('Connect to server %s...' % server_addr) 15 | # 端口和验证码注意保持与task_master.py设置的完全一致: 16 | m = QueueManager(address=(server_addr, 5000), authkey=b'abc') 17 | # 从网络连接: 18 | m.connect() 19 | # 获取Queue的对象: 20 | task = m.get_task_queue() 21 | result = m.get_result_queue() 22 | # 从task队列取任务,并把结果写入result队列: 23 | for i in range(10): 24 | try: 25 | n = task.get(timeout=1) 26 | print('run task %d * %d...' 
% (n, n)) 27 | r = '%d * %d = %d' % (n, n, n*n) 28 | time.sleep(1) 29 | result.put(r) 30 | except queue.Empty: 31 | print('task queue is empty.') 32 | # 处理结束: 33 | print('worker exit.') -------------------------------------------------------------------------------- /work_one/leet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' 5 | 6 | 7 | # class Solution(object): 8 | # def longestCommonPrefix(self, strs): 9 | # """ 10 | # :type strs: List[str] 11 | # :rtype: str 12 | # """ 13 | # if not strs: 14 | # print("kong") 15 | # return "" 16 | # for i, letter_group in enumerate(zip(*strs)): 17 | # print(list(zip(*strs))) 18 | # print(len(set(letter_group))) 19 | # if len(set(letter_group)) > 1: 20 | # print(i) 21 | # print('jinlaile') 22 | # print(strs[0][:i]) 23 | # return strs[0][:i] 24 | # else: 25 | # print('min') 26 | # print(min(strs)) 27 | # return min(strs) 28 | # # me=Solution() 29 | # Solution().longestCommonPrefix(["abc","de"]) 30 | 31 | class Solution(object): 32 | def isValid(self, s): 33 | """ 34 | :type s: str 35 | :rtype: bool 36 | """ 37 | stack = [] 38 | dict = {"]":"[", "}":"{", ")":"("} 39 | for char in s: 40 | if char in dict.values(): 41 | 42 | stack.append(char) 43 | print(stack) 44 | elif char in dict.keys(): 45 | if stack == [] or dict[char] != stack.pop(): 46 | return False 47 | else: 48 | return False 49 | return stack == [] 50 | result=Solution().isValid("{}[") 51 | print(result) -------------------------------------------------------------------------------- /work_one/mydict.py: -------------------------------------------------------------------------------- 1 | class Dict(dict): 2 | def __int__(self, **kw): 3 | super().__init__(**kw) 4 | 5 | def __getattr__(self, key): 6 | try: 7 | return self[key] 8 | except KeyError: 9 | raise AttributeError(r"'Dict' object has no attribute '%s'" % key) 10 | 11 | def __setattr__(self, key, value): 12 | self[key] = value 13 | 14 | 15 | with open('mydict2.py', 'r') as f: 16 | print(f.read()) 17 | 18 | # with open('test1.jpg', 'rb') as f: 19 | # print(f.read()) 20 | # 21 | # with open('mydict2.py', 'w') as f: 22 | # f.write('hhheeelllooo') 23 | -------------------------------------------------------------------------------- /work_one/mydict2.py: -------------------------------------------------------------------------------- 1 | class Dict(dict): 2 | ''' 3 | Simple dict but also support access as x.y style. 4 | 5 | >>> d1=Dict() 6 | >>> d1['x']=100 7 | >>> d1.x 8 | 100 9 | >>> d1.y=200 10 | >>> d1['y'] 11 | 200 12 | >>> d2 = Dict(a=1, b=2, c='3') 13 | >>> d2.c 14 | '3' 15 | >>> d2['empty'] 16 | Traceback (most recent call last): 17 | ... 18 | KeyError: 'empty' 19 | 20 | >>> d2.empty 21 | Traceback (most recent call last): 22 | ... 
23 | AttributeError: 'Dict' object has no attribute 'empty' 24 | 25 | ''' 26 | 27 | def __int__(self, **kw): 28 | super(Dict, self).__init__(**kw) 29 | 30 | def __getattr__(self, key): 31 | try: 32 | return self[key] 33 | except KeyError: 34 | raise AttributeError(r"'Dict' object has no attribute '%s'" % key) 35 | 36 | def __setattr__(self, key, value): 37 | self[key] = value 38 | 39 | 40 | if __name__ == '__main__': 41 | import doctest 42 | 43 | doctest.testmod() 44 | -------------------------------------------------------------------------------- /work_one/mydict_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from mydict import Dict 4 | 5 | 6 | class TestDict(unittest.TestCase): 7 | def setUp(self): 8 | print('setUp...') 9 | 10 | def tearDown(self): 11 | print('tearDown...') 12 | 13 | def test_init(self): 14 | d = Dict(a=1, b='test') 15 | self.assertEqual(d.a, 1) 16 | self.assertEqual(d.b, 'test') 17 | self.assertTrue(isinstance(d, dict)) 18 | 19 | def test_key(self): 20 | d = Dict() 21 | d['key'] = 'value' 22 | self.assertEqual(d.key, 'value') 23 | 24 | def test_attr(self): 25 | d = Dict() 26 | d.key = 'value' 27 | self.assertTrue('key' in d) 28 | self.assertEqual(d['key'], 'value') 29 | 30 | def test_keyerror(self): 31 | d = Dict() 32 | with self.assertRaises(KeyError): 33 | value = d['empty'] 34 | 35 | def test_attrerror(self): 36 | d = Dict() 37 | with self.assertRaises(AttributeError): 38 | value = d.empty 39 | 40 | 41 | if __name__ == '__main__': 42 | unittest.main() 43 | -------------------------------------------------------------------------------- /work_one/myfile.py: -------------------------------------------------------------------------------- 1 | title = "The Meaning of Life" -------------------------------------------------------------------------------- /work_one/requestUrlTest.py: -------------------------------------------------------------------------------- 1 | #1.单进程: 2 | # import requests,time 3 | # start_time=time.time() 4 | # [requests.get('http://www.liaoxuefeng.com/') for x in range(100)] 5 | # print("用时:{}秒".format(time.time()-start_time)) 6 | 7 | #2.多线程 8 | 9 | # import threadpool,requests 10 | # def run(url): 11 | # r=requests.get(url=url) 12 | # pool=threadpool.ThreadPool(10) 13 | # reqs=threadpool.makeRequests(run,['http://www.liaoxuefeng.com' for x in range(100)]) 14 | # [pool.putRequest(x) for x in reqs] 15 | # pool.wait() 16 | # print("用时:{}秒".format(time.time()-start_time)) 17 | 18 | #3.多进程 19 | 20 | #!/usr/bin/env python3 21 | # -*- coding: utf-8 -*- 22 | # import multiprocessing,time,requests 23 | # start_time=time.time() 24 | # def run(url): 25 | # r=requests.get(url=url) 26 | # #print(1) 27 | # if __name__=='__main__': 28 | # pool=multiprocessing.Pool(10) 29 | # [pool.apply_async(run,args=('http://www.liaoxuefeng.com',)) for x in range(100)] 30 | # pool.close() 31 | # pool.join() 32 | # print("用时:{}秒".format(time.time()-start_time)) 33 | 34 | #4.协程(异步IO) 35 | 36 | import asyncio, aiohttp, time 37 | start_time=time.time() 38 | async def run(url): 39 | async with aiohttp.ClientSession() as session: 40 | async with session.get(url=url) as resp: 41 | pass 42 | loop=asyncio.get_event_loop() 43 | tasks=[asyncio.ensure_future(run('http://www.liaoxuefeng.com')) for x in range(100)] 44 | loop.run_until_complete(asyncio.wait(tasks)) 45 | print("用时:{}秒".format(time.time()-start_time)) -------------------------------------------------------------------------------- /work_one/script1.py: 
-------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # A first Python script 4 | 5 | # import sys # Load a library module 6 | # 7 | # print(sys.platform) 8 | # print(2 ** 100) # Raise 2 to a power 9 | # x = 'Spam!' 10 | # print(x * 8) # String repetition 11 | 12 | # 小明的成绩从去年的72分提升到了今年的85分,请计算小明成绩提升的百分点,并用字符串格式化显示出'xx.x%',只保留小数点后1位: 13 | # s1 = 72 14 | # s2 = 85 15 | # r = (85 - 72) / 72 * 100 16 | # print('小明成绩提升百分点%.1f%%' % r) 17 | # 18 | # #小明身高1.75,体重80.5kg。请根据BMI公式(体重除以身高的平方)帮小明计算他的BMI指数,并根据BMI指数: 19 | # h = input('please input an height: ') 20 | # height = float(h) 21 | # w = input('please input an weight: ') 22 | # weight = float(w) 23 | # bmi = weight / (height ** 2) 24 | # print('你的身高是%.2f m,体重是%.1f kg,BMI指数是%f ' %(height,weight,bmi)) 25 | # 26 | # if bmi < 18.5: 27 | # print('过轻') 28 | # elif 18.5 <= bmi < 25: 29 | # print('正常') 30 | # elif 25 <= bmi < 28: 31 | # print('过重') 32 | # elif 28 <= bmi < 32: 33 | # print('肥胖') 34 | # else: 35 | # print('严重肥胖') 36 | 37 | # range(101)就可以生成0-100的整数序列,计算如下 38 | # sum = 0 39 | # for x in range(101): 40 | # sum = sum + x 41 | # print(sum) 42 | 43 | # 第二种循环是while循环,只要条件满足,就不断循环,条件不满足时退出循环。比如我们要计算100以内所有奇数之和,可以用while循环实现: 44 | # sum = 0 45 | # n = 99 46 | # while n > 0: 47 | # sum = sum + n 48 | # n = n - 2 49 | # print(sum) 50 | 51 | # 请定义一个函数quadratic(a, b, c),接收3个参数,返回一元二次方程: 52 | # 53 | # ax2 + bx + c = 0 54 | # 55 | # 的两个解。 56 | # 57 | # 提示:计算平方根可以调用math.sqrt()函数 58 | 59 | # import math 60 | # def quadratic(a, b, c): 61 | # if b**2-4*a*c ==0: 62 | # return -b/(2*a) 63 | # elif (b**2-4*a*c) <0: 64 | # return '无解' 65 | # else: 66 | # n1=(-b+math.sqrt(b**2-4*a*c))/(2*a) 67 | # n2=(-b-math.sqrt(b**2-4*a*c))/(2*a) 68 | # return n1,n2 69 | # print(quadratic(2, 3, 1)) 70 | # print(quadratic(1, 4, 4)) 71 | 72 | # 参数组合 73 | # 74 | # 在Python中定义函数,可以用必选参数、默认参数、可变参数、关键字参数和命名关键字参数,这5种参数都可以组合使用。但是请注意,参数定义的顺序必须是:必选参数、默认参数、可变参数、命名关键字参数和关键字参数。 75 | # 76 | # 比如定义一个函数,包含上述若干种参数: 77 | 78 | # def f1(a, b, c=0, *args, **kw): 79 | # print('a =', a, 'b =', b, 'c =', c, 'args =', args, 'kw =', kw) 80 | # 81 | # def f2(a, b, c=0, *, d, **kw): 82 | # print('a =', a, 'b =', b, 'c =', c, 'd =', d, 'kw =', kw) 83 | # 84 | # args = (1, 2, 3) 85 | # kw = {'d': 99, 'x': '#'} 86 | # f1(args, kw) 87 | # f1(*args, kw) 88 | # f1(*args, *kw) 89 | # f1(*args, **kw) 90 | 91 | # f2(*args, **kw) 92 | 93 | # def fact(n): 94 | # if n==1: 95 | # return 1 96 | # return n * fact(n - 1) 97 | # 98 | # print(fact(5)) 99 | 100 | # s=0 101 | # def hanoi(n,a,b,c): 102 | # global s 103 | # if n==1: 104 | # s=s+1 105 | # print('第 %s 步:' % s) 106 | # print(a,'->',c) 107 | # else: 108 | # hanoi(n-1,a,c,b) #将前n-1个盘子从a移动到b上 109 | # hanoi(1, a, b, c) #将最底下的最后一个盘子从a移动到c上 110 | # hanoi(n - 1, b, a, c) #将b上的n-1个盘子移动到c上 111 | # 112 | # 113 | # hanoi(3,'A','B','C') 114 | 115 | # 汉诺塔 http://baike.baidu.com/item/%E6%B1%89%E8%AF%BA%E5%A1%94/3468295 116 | 117 | # B=[] 118 | # def move(n,a,b,c): 119 | # if n==1: 120 | # buzhou=a+str(n)+'-->'+c+str(n)+'first' 121 | # B.append(buzhou) 122 | # return 123 | # else: 124 | # move(n-1,a,c,b) 125 | # buzhou = a + str(n) + '-->' + c + str(n)+'seco' 126 | # B.append(buzhou) 127 | # move(n-1,b,a,c) 128 | # move(3,'A','B','C') 129 | # print('共需操作'+str(len(B))+'次','操作过程为',B) 130 | # 共需操作7次 操作过程为 131 | # ['A1-->C1first', 'A2-->B2seco', 'C1-->B1first', 'A3-->C3seco', 'B1-->A1first', 'B2-->C2seco', 'A1-->C1first'] 132 | 133 | # 134 | # L1 = ['Hello', 'World', 18, 'Apple', None] 135 | # 
L2 = [s.lower() for s in L1 if isinstance(s,str)==True] 136 | # L3 = [s.lower() if isinstance(s,str) else s for s in L1] 137 | # L4 = [s.upper() if isinstance(s,str) is True else s for s in L1] 138 | # L5 = [s[:1].upper()+s[1:].lower() if isinstance(s,str) else s for s in L1] 139 | # print('L1:',L1) 140 | # print('L2:',L2) 141 | # print('L3:',L3) 142 | # print('L4:',L4) 143 | # print('L5:',L5) 144 | 145 | # class Solution(object): 146 | # def removeDuplicates(self, nums): 147 | # """ 148 | # :type nums: List[int] 149 | # :rtype: int 150 | # """ 151 | # if not nums: 152 | # return 0 153 | # 154 | # newTail = 0 155 | # 156 | # for i in range(1, len(nums)): 157 | # if nums[i] != nums[newTail]: 158 | # newTail += 1 159 | # nums[newTail] = nums[i] 160 | # 161 | # return newTail + 1 162 | 163 | # 35. Search Insert Position 164 | # def searchInsert( nums, target): 165 | # if (len(nums) == 0): 166 | # return 0 167 | # 168 | # start = 0 169 | # end = len(nums) - 1 170 | # while (start + 1 < end): 171 | # mid = start + (end - start) // 2 172 | # if (nums[mid] == target): 173 | # return mid 174 | # elif (nums[mid] < target): 175 | # start = mid 176 | # else: 177 | # end = mid 178 | # 179 | # if target <= nums[start]: 180 | # return start 181 | # elif target <= nums[end]: 182 | # return end 183 | # else: 184 | # return end + 1 185 | # 186 | # print(searchInsert([1,3,5,6],4)) 187 | 188 | # from PIL import Image 189 | # im = Image.open('test1.jpg') 190 | # print(im.format, im.size, im.mode) 191 | # im.thumbnail((540,405)) 192 | # im.save('test22.jpg','JPEG') 193 | # import sys 194 | # sys.path 195 | 196 | # class Student(object): 197 | # 198 | # @property 199 | # def birth(self): 200 | # return self._birth 201 | # 202 | # @birth.setter 203 | # def birth(self, value): 204 | # self._birth = value 205 | # 206 | # @property 207 | # def age(self): 208 | # return 2015 - self._birth 209 | # 210 | # 211 | # 212 | # s=Student() 213 | # s.birth=2000 214 | # print(s.birth) 215 | # print(s.age) 216 | 217 | # class Solution(object): 218 | # def findDisappearedNumbers(self, nums): 219 | # """ 220 | # :type nums: List[int] 221 | # :rtype: List[int] 222 | # """ 223 | # # For each number i in nums, 224 | # # we mark the number that i points as negative. 
225 | # # Then we filter the list, get all the indexes 226 | # # who points to a positive number 227 | # for i in xrange(len(nums)): 228 | # index = abs(nums[i]) - 1 229 | # nums[index] = - abs(nums[index]) 230 | # 231 | # return [i + 1 for i in range(len(nums)) if nums[i] > 0] 232 | # 233 | # __repr__=findDisappearedNumbers 234 | # 235 | 236 | class Chain(object): 237 | def __init__(self, path=''): 238 | self._path = path 239 | 240 | # def __getattr__(self, path): 241 | # return Chain('%s/%s' % (self._path, path)) 242 | 243 | def __getattr__(self, path): 244 | if path in ['users', 'group']: 245 | return Chain('%s' % self._path) 246 | else: 247 | return Chain('%s/%s' % (self._path, path)) 248 | 249 | def __call__(self, path): 250 | return Chain('%s/%s' % (self._path, path)) 251 | 252 | def __str__(self): 253 | return self._path 254 | 255 | __repr__ = __str__ 256 | 257 | 258 | # print(Chain().status.user.timeline.list) 259 | print(Chain().users('michael').group('student').repos) 260 | # /status/user/timeline/list 261 | # /users/michael/group/student/repos 262 | # /michael/student/repos 263 | 264 | # 调用时,需要把:user替换为实际用户名。如果我们能写出这样的链式调用: 265 | # In: 266 | # Chain().users('Michael').group('student').repos 267 | # 268 | # Out: 269 | # GET/Michael/student/repos 270 | -------------------------------------------------------------------------------- /work_one/templates/form.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Please Sign In 6 | 7 | 8 | {% if message %} 9 |

<p style="color:red">{{message}}</p>
10 | {% endif %} 11 | <form action="/signin" method="post">
12 | <legend>Please sign in:</legend> 13 | <p><input name="username" placeholder="Username" value="{{ username }}"></p> 14 | <p><input name="password" placeholder="Password" type="password"></p> 15 | <p><button type="submit">Sign In</button></p> 16 | </form>
17 | 18 | 19 | -------------------------------------------------------------------------------- /work_one/templates/home.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Home 6 | 7 | 8 |

<h1>Home</h1>
9 | 10 | -------------------------------------------------------------------------------- /work_one/templates/signin-ok.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Wlecome,{{username}} 6 | 7 | 8 |

<p>Welcome,{{username}}!</p>
9 | 10 | -------------------------------------------------------------------------------- /work_one/test.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/test.db -------------------------------------------------------------------------------- /work_one/test1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/test1.jpg -------------------------------------------------------------------------------- /work_one/test22.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/test22.jpg -------------------------------------------------------------------------------- /work_one/test33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/test33.png -------------------------------------------------------------------------------- /work_one/test44.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/test44.bmp -------------------------------------------------------------------------------- /work_one/work_GUI.py: -------------------------------------------------------------------------------- 1 | from tkinter import * 2 | import tkinter.messagebox as messagebox 3 | 4 | class Application(Frame): 5 | def __init__(self,master=None): 6 | Frame.__init__(self,master) 7 | self.pack() 8 | self.createWidgets() 9 | 10 | def createWidgets(self): 11 | self.nameInput=Entry(self) 12 | self.nameInput.pack() 13 | self.quitButton=Button(self,text='Hello',command=self.hello) 14 | self.quitButton.pack() 15 | 16 | def hello(self): 17 | name=self.nameInput.get() or 'world' 18 | messagebox.showinfo('Message','Hello,%s' % name) 19 | 20 | 21 | app=Application() 22 | app.master.title('Hello') 23 | app.mainloop() -------------------------------------------------------------------------------- /work_one/work_HTMLParser.py: -------------------------------------------------------------------------------- 1 | # from html.parser import HTMLParser 2 | # import urllib.request 3 | # 4 | # response = urllib.request.urlopen('https://www.python.org/events/python-events/') 5 | # class PythonEvent(HTMLParser): 6 | # def __init__(self): 7 | # super(PythonEvent, self).__init__() 8 | # self.key = 0 9 | # self.location_key = 0 10 | # self.event_list = [] 11 | # self.event_tmp = [] 12 | # def handle_starttag(self, tag, attrs): 13 | # if attrs: 14 | # if attrs[0][1] == 'event-title' or tag == 'time': 15 | # self.key = 1 # self.key=1表示data需要保存 16 | # if attrs[0][1] == 'event-location': 17 | # self.key = 1 18 | # self.location_key =1 # self.location_key=1表示单个data信息结尾 19 | # 20 | # def handle_data(self, data): 21 | # if self.key: 22 | # self.event_tmp.append(data) 23 | # if self.location_key: 24 | # self.event_list.append(self.event_tmp) # event_tmp保存进list并重置 25 | # self.event_tmp = [] 26 | # 27 | # def handle_endtag(self, tag): 28 | # self.key = 0 29 | # self.location_key = 0 30 | # 31 | # event = PythonEvent() 32 | # 
event.feed(response.read().decode('utf-8')) 33 | # for i in event.event_list: 34 | # print(i) 35 | 36 | #from urllib import request 37 | 38 | # with request.urlopen('https://api.douban.com/v2/book/2129650') as f: 39 | # data = f.read() 40 | # print('Status:',f.status,f.reason) 41 | # for k, v in f.getheaders(): 42 | # print('%s:%s' % (k,v)) 43 | # print('Data:', data.decode('utf-8')) 44 | 45 | # from urllib import request 46 | # 47 | # req=request.Request('http://www.douban.com/') 48 | # req.add_header('User-Agent','Mozilla/6.0 (iphone; CPU iphone os 8_0 like Mac OS X) AppleWebkit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25') 49 | # with request.urlopen(req) as f: 50 | # print('Status:', f.status, f.reason) 51 | # for k,v in f.getheaders(): 52 | # print('%s:%s' %(k,v)) 53 | # print('Data:', f.read().decode('utf-8')) 54 | 55 | from urllib import request, parse 56 | 57 | print('Login to weibo.cn...') 58 | email = input('Email: ') 59 | passwd = input('Password: ') 60 | login_data = parse.urlencode([ 61 | ('username', email), 62 | ('password', passwd), 63 | ('entry', 'mweibo'), 64 | ('client_id', ''), 65 | ('savestate', '1'), 66 | ('ec', ''), 67 | ('pagerefer', 'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F') 68 | ]) 69 | 70 | req = request.Request('https://passport.weibo.cn/sso/login') 71 | req.add_header('Origin', 'https://passport.weibo.cn') 72 | req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25') 73 | req.add_header('Referer', 'https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F') 74 | 75 | with request.urlopen(req, data=login_data.encode('utf-8')) as f: 76 | print('Status:', f.status, f.reason) 77 | for k, v in f.getheaders(): 78 | print('%s: %s' % (k, v)) 79 | print('Data:', f.read().decode('utf-8')) -------------------------------------------------------------------------------- /work_one/work_PILImageDraw.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw, ImageFont, ImageFilter 2 | 3 | import random 4 | 5 | # random letters,num,chinese 6 | def rndChar(): 7 | # return chr(random.randint(65,90)) 8 | # random Upper and lower case letters 9 | letterChr=chr(random.choice([x for x in range(65,123) if x not in [z for z in range(91,97)]])) 10 | #random num 11 | numChr=str(random.randint(0,9)) 12 | # random chinese 13 | cnChr=random.choice('我是中文汉字') 14 | return random.choice((letterChr,numChr,cnChr)) 15 | # random color1 16 | def rndColor(): 17 | return (random.randint(64,255), random.randint(64,255),random.randint(64,255)) 18 | 19 | # random color2 20 | def rndColor2(): 21 | return (random.randint(32,127), random.randint(32,127), random.randint(32,127)) 22 | 23 | # 240 * 60 24 | width = 60 * 4 25 | height = 60 26 | image = Image.new('RGB',(width,height),(255,255,255)) 27 | # create font 28 | font = ImageFont.truetype('Arial.ttf',36) 29 | # create draw 30 | draw= ImageDraw.Draw(image) 31 | # fill each pixel 32 | for x in range(width): 33 | for y in range(height): 34 | draw.point((x,y), fill=rndColor()) 35 | 36 | # output text 37 | for t in range(4): 38 | draw.text((60*t+10,10), rndChar(),font=font,fill=rndColor2()) 39 | 40 | # fuzzy 41 | image = image.filter(ImageFilter.BLUR) 42 | image.save('code.jpg','jpeg') 43 | 44 | -------------------------------------------------------------------------------- 
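A quick design note on work_PILImageDraw.py above: it draws four random characters but never keeps them, so the saved code.jpg cannot later be checked against what a user types in. A hedged sketch of a variant where the caller chooses the text and can therefore validate it afterwards; the name make_captcha and its signature are assumptions, not repository code:

from PIL import Image, ImageDraw, ImageFilter, ImageFont

def make_captcha(chars, width=240, height=60):
    # same drawing steps as the script above, but the text is supplied by the caller,
    # so it can be stored and compared with user input later
    image = Image.new('RGB', (width, height), (255, 255, 255))
    font = ImageFont.truetype('Arial.ttf', 36)
    draw = ImageDraw.Draw(image)
    for t, ch in enumerate(chars):
        draw.text((60 * t + 10, 10), ch, font=font, fill=(0, 0, 0))
    return image.filter(ImageFilter.BLUR)

# e.g. image = make_captcha('A3x汉'); image.save('code.jpg', 'jpeg')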
/work_one/work_TCP_client.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 4 | # 建立连接: 5 | s.connect(('127.0.0.1', 9999)) 6 | # 接收欢迎消息: 7 | print(s.recv(1024).decode('utf-8')) 8 | for data in [b'Michael', b'Tracy', b'Sarah']: 9 | # 发送数据: 10 | s.send(data) 11 | print(s.recv(1024).decode('utf-8')) 12 | s.send(b'exit') 13 | s.close() -------------------------------------------------------------------------------- /work_one/work_TCP_server.py: -------------------------------------------------------------------------------- 1 | # import socket 2 | # 3 | # s=socket.socket(socket.AF_INET,socket.SOCK_STREAM) 4 | # s.connect(('www.sina.com.cn', 80)) 5 | # 6 | # s.send(b'GET / HTTP/1.1\r\nHost: www.sina.com.cn\r\nConnection: close\r\n\r\n') 7 | # 8 | # buffer = [] 9 | # while True: 10 | # d=s.recv(1024) 11 | # if d: 12 | # buffer.append(d) 13 | # else: 14 | # break 15 | # 16 | # data=b''.join(buffer) 17 | # 18 | # s.close() 19 | # 20 | # header, html = data.split(b'\r\n\r\n',1) 21 | # print(header.decode('utf-8')) 22 | # 23 | # with open('sina.html','wb') as f: 24 | # f.write(html) 25 | 26 | import socket,threading,time 27 | 28 | def tcplink(sock, addr): 29 | print('Accept new connection from %s:%s...' % addr) 30 | sock.send(b'Welcome!') 31 | while True: 32 | data = sock.recv(1024) 33 | time.sleep(1) 34 | if not data or data.decode('utf-8') == 'exit': 35 | break 36 | sock.send(('Hello, %s!' % data.decode('utf-8')).encode('utf-8')) 37 | sock.close() 38 | print('Connection from %s:%s closed.' % addr) 39 | 40 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 41 | s.bind(('127.0.0.1', 9999)) 42 | s.listen(5) 43 | print('Waiting for connection...') 44 | 45 | while True: 46 | # 接受一个新连接: 47 | sock, addr = s.accept() 48 | # 创建新线程来处理TCP连接: 49 | t = threading.Thread(target=tcplink, args=(sock, addr)) 50 | t.start() -------------------------------------------------------------------------------- /work_one/work_UDP_client.py: -------------------------------------------------------------------------------- 1 | import socket 2 | s=socket.socket(socket.AF_INET,socket.SOCK_DGRAM) 3 | for data in [b'Michael',b'Tra',b'Sarah']: 4 | s.sendto(data,('127.0.0.1',9999)) 5 | print(s.recv(1024).decode('utf-8')) 6 | s.close() -------------------------------------------------------------------------------- /work_one/work_UDP_server.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | s= socket.socket(socket.AF_INET,socket.SOCK_DGRAM) 4 | s.bind(('127.0.0.1',9999)) 5 | print('Bind UDP on 9999') 6 | while True: 7 | data,addr=s.recvfrom(1024) 8 | print('Received from %s:%s' %addr) 9 | s.sendto(b'Hello, %s!' % data, addr) 10 | 11 | -------------------------------------------------------------------------------- /work_one/work_data_MYSQL.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import pymysql 4 | 5 | # 打开数据库连接 6 | db = pymysql.connect("localhost","test","password","TESTDB" ) 7 | 8 | # 使用 cursor() 方法创建一个游标对象 cursor 9 | cursor = db.cursor() 10 | 11 | # 使用 execute() 方法执行 SQL 查询 12 | # cursor.execute("SELECT VERSION()") 13 | 14 | # 使用 fetchone() 方法获取单条数据. 
15 | # data = cursor.fetchone() 16 | 17 | # print ("Database version : %s " % data) 18 | 19 | # 使用 execute() 方法执行 SQL,如果表存在则删除 20 | # cursor.execute("DROP TABLE IF EXISTS EMPLOYEE") 21 | 22 | # 使用预处理语句创建表 23 | # sql = """CREATE TABLE EMPLOYEE ( 24 | # FIRST_NAME CHAR(20) NOT NULL, 25 | # LAST_NAME CHAR(20), 26 | # AGE INT, 27 | # SEX CHAR(1), 28 | # INCOME FLOAT )""" 29 | # 30 | # 31 | # cursor.execute(sql) 32 | 33 | 34 | 35 | # SQL 插入语句 36 | # sql = """INSERT INTO EMPLOYEE(FIRST_NAME, 37 | # LAST_NAME, AGE, SEX, INCOME) 38 | # VALUES ('Mac', 'Mohan', 20, 'M', 2000)""" 39 | 40 | # sql = "INSERT INTO EMPLOYEE(FIRST_NAME, \ 41 | # LAST_NAME, AGE, SEX, INCOME) \ 42 | # VALUES ('%s', '%s', '%d', '%c', '%d' )" % \ 43 | # ('Mac2', 'Mohan2', 202, 'M', 2000) 44 | # try: 45 | # # 执行sql语句 46 | # cursor.execute(sql) 47 | # # 提交到数据库执行 48 | # db.commit() 49 | # except: 50 | # # 如果发生错误则回滚 51 | # db.rollback() 52 | 53 | 54 | # SQL 查询语句 55 | # sql = "SELECT * FROM EMPLOYEE \ 56 | # WHERE INCOME > '%d'" % (1000) 57 | # 58 | # try: 59 | # # 执行SQL语句 60 | # cursor.execute(sql) 61 | # # 获取所有记录列表 62 | # results = cursor.fetchall() 63 | # for row in results: 64 | # fname = row[0] 65 | # lname = row[1] 66 | # age = row[2] 67 | # sex = row[3] 68 | # income = row[4] 69 | # # 打印结果 70 | # print ("fname=%s,lname=%s,age=%d,sex=%s,income=%d" % \ 71 | # (fname, lname, age, sex, income )) 72 | # except: 73 | # print ("Error: unable to fetch data") 74 | 75 | # SQL 更新语句 76 | # sql = "UPDATE EMPLOYEE SET AGE = AGE + 1\ 77 | # WHERE SEX = '%c'" % ('M') 78 | 79 | # SQL 删除语句 80 | sql = "DELETE FROM EMPLOYEE WHERE AGE > '%d'" % (200) 81 | 82 | try: 83 | # 执行SQL语句 84 | cursor.execute(sql) 85 | # 提交到数据库执行 86 | db.commit() 87 | except: 88 | # 发生错误时回滚 89 | db.rollback() 90 | 91 | # 关闭数据库连接 92 | db.close() -------------------------------------------------------------------------------- /work_one/work_data_SQLAlchemy.py: -------------------------------------------------------------------------------- 1 | # 导入: 2 | from sqlalchemy import Column, String, Integer,create_engine 3 | from sqlalchemy.orm import sessionmaker 4 | from sqlalchemy.ext.declarative import declarative_base 5 | 6 | # 创建对象的基类: 7 | Base = declarative_base() 8 | 9 | # 定义User对象: 10 | class User(Base): 11 | # 表的名字: 12 | __tablename__ = 'test' 13 | 14 | # 表的结构: 15 | id = Column(Integer, primary_key=True) 16 | name = Column(String(50)) 17 | age=Column(Integer) 18 | 19 | # 初始化数据库连接: 20 | engine = create_engine('mysql+pymysql://root:password@localhost:3306/sqltest') 21 | # 创建DBSession类型: 22 | DBSession = sessionmaker(bind=engine) 23 | 24 | # 创建session对象: 25 | session = DBSession() 26 | # 创建新User对象: 27 | new_user = User(id=2, name='Bob',age=22) 28 | # 添加到session: 29 | session.add(new_user) 30 | # 提交即保存到数据库: 31 | session.commit() 32 | 33 | 34 | # 创建Query查询,filter是where条件,最后调用one()返回唯一行,如果调用all()则返回所有行: 35 | user = session.query(User).filter(User.id==5).one() 36 | # 打印类型和对象的name属性: 37 | print('type:', type(user)) 38 | print('name:', user.name) 39 | 40 | 41 | # 关闭session: 42 | session.close() 43 | 44 | 45 | -------------------------------------------------------------------------------- /work_one/work_data_SQLite.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, sqlite3 4 | 5 | db_file = os.path.join(os.path.dirname(__file__), 'test.db') 6 | if os.path.isfile(db_file): 7 | os.remove(db_file) 8 | 9 | # 初始数据: 10 | conn = sqlite3.connect(db_file) 11 | cursor = conn.cursor() 12 | cursor.execute('create 
table user(id varchar(20) primary key, name varchar(20), score int)') 13 | cursor.execute(r"insert into user values ('A-001', 'Adam', 95)") 14 | cursor.execute(r"insert into user values ('A-002', 'Bart', 62)") 15 | cursor.execute(r"insert into user values ('A-003', 'Lisa', 78)") 16 | cursor.close() 17 | conn.commit() 18 | conn.close() -------------------------------------------------------------------------------- /work_one/work_mail_POP3.py: -------------------------------------------------------------------------------- 1 | # receive mail 2 | from email.parser import Parser 3 | from email.header import decode_header 4 | from email.utils import parseaddr 5 | 6 | import poplib 7 | 8 | # 输入邮件地址, 口令和POP3服务器地址: 9 | # email = input('Email: ') 10 | # password = input('Password: ') 11 | # pop3_server = input('POP3 server: ') 12 | 13 | # 163 14 | # email = 'xindy138@163.com' 15 | # password = '****' 16 | # pop3_server = 'pop3.163.com' 17 | 18 | # qq 特殊端口995 和验证码 19 | email = '373128869@qq.com' 20 | password = 'qhhtlbcexpnzcbac' 21 | pop3_server = 'pop.qq.com' 22 | pop3_port = 995 23 | 24 | 25 | # indent用于缩进显示: 26 | def print_info(msg, indent=0): 27 | if indent == 0: 28 | for header in ['From', 'To', 'Subject']: 29 | value = msg.get(header, '') 30 | if value: 31 | if header == 'Subject': 32 | value = decode_str(value) 33 | else: 34 | hdr, addr = parseaddr(value) 35 | name = decode_str(hdr) 36 | value = u'%s <%s>' % (name, addr) 37 | print('%s%s: %s' % (' ' * indent, header, value)) 38 | if (msg.is_multipart()): 39 | parts = msg.get_payload() 40 | for n, part in enumerate(parts): 41 | print('%spart %s' % (' ' * indent, n)) 42 | print('%s--------------------' % (' ' * indent)) 43 | print_info(part, indent + 1) 44 | else: 45 | content_type = msg.get_content_type() 46 | if content_type == 'text/plain' or content_type == 'text/html': 47 | content = msg.get_payload(decode=True) 48 | charset = guess_charset(msg) 49 | if charset: 50 | content = content.decode(charset) 51 | print('%sText: %s' % (' ' * indent, content + '...')) 52 | else: 53 | print('%sAttachment: %s' % (' ' * indent, content_type)) 54 | 55 | 56 | def decode_str(s): 57 | value, charset = decode_header(s)[0] 58 | if charset: 59 | value = value.decode(charset) 60 | return value 61 | 62 | 63 | def guess_charset(msg): 64 | charset = msg.get_charset() 65 | if charset is None: 66 | content_type = msg.get('Content-Type', '').lower() 67 | pos = content_type.find('charset=') 68 | if pos >= 0: 69 | charset = content_type[pos + 8:].strip() 70 | return charset 71 | 72 | 73 | # 连接到POP3服务器: 74 | server = poplib.POP3_SSL(pop3_server, pop3_port) 75 | # 可以打开或关闭调试信息: 76 | server.set_debuglevel(1) 77 | # 可选:打印POP3服务器的欢迎文字: 78 | print(server.getwelcome().decode('utf-8')) 79 | 80 | # 身份认证: 81 | server.user(email) 82 | server.pass_(password) 83 | 84 | # stat()返回邮件数量和占用空间: 85 | print('Messages: %s. Size: %s' % server.stat()) 86 | # list()返回所有邮件的编号: 87 | resp, mails, octets = server.list() 88 | # 可以查看返回的列表类似[b'1 82923', b'2 2184', ...] 
89 | print(mails) 90 | 91 | # 获取最新一封邮件, 注意索引号从1开始: 92 | index = len(mails) 93 | resp, lines, octets = server.retr(index) 94 | 95 | # lines存储了邮件的原始文本的每一行, 96 | # 可以获得整个邮件的原始文本: 97 | msg_content = b'\r\n'.join(lines).decode('utf-8') 98 | # 稍后解析出邮件: 99 | msg = Parser().parsestr(msg_content) 100 | 101 | print_info(msg) 102 | # 可以根据邮件索引号直接从服务器删除邮件: 103 | # server.dele(index) 104 | # 关闭连接: 105 | server.quit() 106 | -------------------------------------------------------------------------------- /work_one/work_mail_SMTP.py: -------------------------------------------------------------------------------- 1 | # send mail 2 | from email import encoders 3 | from email.header import Header 4 | from email.mime.base import MIMEBase 5 | from email.mime.multipart import MIMEMultipart 6 | from email.mime.text import MIMEText 7 | from email.utils import parseaddr, formataddr 8 | import smtplib 9 | 10 | 11 | def _format_addr(s): 12 | name, addr = parseaddr(s) 13 | return formataddr((Header(name, 'utf-8').encode(), addr)) 14 | 15 | 16 | # from_addr = input('From:') 17 | # password = input('Password:') 18 | # to_addr = input('To:') 19 | # smtp_server = input('SMTP server:') 20 | 21 | # 163 22 | # from_addr = 'xindy138@163.com' 23 | # password = '*******' 24 | # to_addr = '373128869@qq.com' 25 | # smtp_server = 'smtp.163.com' 26 | # smtp_port = 25 27 | 28 | # qq 特殊端口465 和验证码 29 | from_addr = '373128869@qq.com' 30 | password = 'qhhtlbcexpnzcbac' 31 | to_addr = 'demiyuhongjun@gmail.com' 32 | smtp_server = 'smtp.qq.com' 33 | smtp_port = 465 34 | 35 | # msg = MIMEText('

<html><body><h1>Hello</h1>
' + 36 | # '

<p>send by Python...</p>
' + 37 | # '</body></html>', 'html', 'utf-8') 38 | msg = MIMEMultipart('alternative') # 同时支持HTML和Plain格式 如果收件人无法查看HTML格式的邮件,就可以自动降级查看纯文本邮件 39 | msg.attach(MIMEText('send with file...', 'plain', 'utf-8')) 40 | msg.attach(MIMEText('

<html><body><h1>Hello</h1>
' + 41 | '<p><img src="cid:0"></p>
' + 42 | '', 'html', 'utf-8')) 43 | msg['From'] = _format_addr('Python爱好者 <%s>' % from_addr) 44 | msg['To'] = _format_addr('管理员 <%s>' % to_addr) 45 | msg['Subject'] = Header('来自SMTP的问候……', 'utf-8').encode() 46 | 47 | with open('test33.png', 'rb') as f: 48 | # 设置附件的MIME和文件名,这里是png类型: 49 | mime = MIMEBase('image', 'png', filename='test33.png') 50 | # 加上必要的头信息: 51 | mime.add_header('Content-Disposition', 'attachment', filename='test33.png') 52 | mime.add_header('Content-ID', '<0>') 53 | mime.add_header('X-Attachment-Id', '0') 54 | # 把附件的内容读进来: 55 | mime.set_payload(f.read()) 56 | # 用Base64编码: 57 | encoders.encode_base64(mime) 58 | # 添加到MIMEMultipart: 59 | msg.attach(mime) 60 | 61 | server = smtplib.SMTP_SSL(smtp_server, smtp_port) 62 | server.set_debuglevel(1) 63 | server.login(from_addr, password) 64 | server.sendmail(from_addr, [to_addr], msg.as_string()) 65 | server.quit() 66 | -------------------------------------------------------------------------------- /work_one/work_register.py: -------------------------------------------------------------------------------- 1 | # from PIL import Image 2 | # im = Image.open('test1.jpg') 3 | # print(im.format, im.size, im.mode) 4 | # im.thumbnail((540,405)) 5 | # im.save('test44.bmp','BMP') 6 | 7 | # import struct 8 | # 9 | # def judge(url): 10 | # with open(url,'rb') as f: 11 | # s=(f.read(30)) 12 | # t=(struct.unpack(' 106 | # 107 | # 108 | # Yahoo! Weather - Beijing, CN 109 | # Wed, 27 May 2015 11:00 am CST 110 | # 111 | # 112 | # 113 | # 114 | # 115 | # 116 | # 39.91 117 | # 116.39 118 | # Wed, 27 May 2015 11:00 am CST 119 | # 120 | # 121 | # 122 | # 123 | # 124 | # 125 | # 126 | # 127 | # 128 | # ''' 129 | 130 | 131 | # weather = parse_weather(data) 132 | 133 | # assert weather['city'] == 'Beijing', weather['city'] 134 | # assert weather['country'] == 'China', weather['country'] 135 | # assert weather['today']['text'] == 'Partly Cloudy', weather['today']['text'] 136 | # assert weather['today']['low'] == 20, weather['today']['low'] 137 | # assert weather['today']['high'] == 33, weather['today']['high'] 138 | # assert weather['tomorrow']['text'] == 'Sunny', weather['tomorrow']['text'] 139 | # assert weather['tomorrow']['low'] == 21, weather['tomorrow']['low'] 140 | # assert weather['tomorrow']['high'] == 34, weather['tomorrow']['high'] 141 | # print('Weather:', str(weather)) 142 | 143 | def get_weather(city): # 输入城市名(拼音)字符串,输出天气dict 144 | baseurl = "https://query.yahooapis.com/v1/public/yql?" 
145 | yql_query = 'select * from weather.forecast where woeid in (select woeid from geo.places(1) where text="%s")' % city 146 | yql_url = baseurl + urllib.parse.urlencode({'q':yql_query}) 147 | print(yql_url) 148 | with urllib.request.urlopen(yql_url) as f: 149 | city_xml = f.read().decode('utf-8') 150 | city_weather = parse_weather(city_xml) 151 | return city_weather 152 | 153 | def main(): 154 | city = input('Weather Forecast in City: ') 155 | print(get_weather(city)) 156 | 157 | main() -------------------------------------------------------------------------------- /work_two_Crawler/86.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_two_Crawler/86.jpg -------------------------------------------------------------------------------- /work_two_Crawler/93.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_two_Crawler/93.jpg -------------------------------------------------------------------------------- /work_two_Crawler/94.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_two_Crawler/94.jpg -------------------------------------------------------------------------------- /work_two_Crawler/Download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | #http://cuiqingcai.com/3256.html 4 | #http://www.xicidaili.com/nt/ 代理IP地址 5 | __author__ = 'Demi Yu' 6 | 7 | #限制IP访问频率,超过频率就断开连接。(这种方法解决办法就是,降低爬虫的速度在每个请求前面加上time.sleep;或者不停的更换代理IP,这样就绕过反爬虫机制啦!) 8 | #后台对访问进行统计,如果单个userAgent访问超过阈值,予以封锁。(效果出奇的棒!不过误伤也超级大,一般站点不会使用,不过我们也考虑进去 9 | #上面讲过有的网站会限制相同的User-Agent的访问频率,那我们就给他随机来一个User-Agent,不停的更换代理IP好了!去百度一下User-Agent,我找到了下面这些: 10 | 11 | import requests 12 | import re 13 | import random 14 | import time 15 | from bs4 import BeautifulSoup 16 | 17 | class download(): 18 | 19 | def __init__(self): 20 | 21 | headers = { 22 | 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"} 23 | 24 | self.iplist = [] ##初始化一个list用来存放我们获取到的IP 25 | html = requests.get("http://www.xicidaili.com/nt/",headers=headers) ##不解释咯 26 | # iplistn = re.findall(r'r/>(.*?) 
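The opening comments of Download.py describe its strategy: rotate the User-Agent on every request, sleep and retry on failure, and only then fall back to proxy IPs scraped from xicidaili.com. A hedged sketch of the proxy-pool setup and the plain (no-proxy) branch of get() that this implies — the td regex, the user_agent_list contents and the name download_sketch are assumptions, not the repository's verbatim code:

import random, re, time
import requests

class download_sketch():
    def __init__(self):
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        self.iplist = []   # proxy pool scraped once from the listing page
        html = requests.get("http://www.xicidaili.com/nt/", headers=headers)
        for ip, port in re.findall(r'<td>(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)</td>', html.text):
            self.iplist.append(ip + ':' + port)
        # a pool of desktop User-Agent strings to rotate through (contents are placeholders)
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36",
        ]

    def get(self, url, timeout, proxy=None, num_retries=6):
        headers = {'User-Agent': random.choice(self.user_agent_list)}   # fresh User-Agent per request
        if proxy is None:
            try:
                return requests.get(url, headers=headers, timeout=timeout)
            except Exception:
                if num_retries > 0:
                    time.sleep(10)                                      # back off, then retry without a proxy
                    return self.get(url, timeout, num_retries=num_retries - 1)
                proxy = {'http': random.choice(self.iplist)}            # plain requests kept failing: pick a proxy
        # the proxy branch, which the file continues with below, retries through the scraped proxies
        return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)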
0: ##num_retries是我们限定的重试次数 75 | time.sleep(10) ##延迟十秒 76 | print(u'获取网页出错,10S后将获取倒数第:', num_retries, u'次') 77 | return self.get(url, timeout, num_retries-1) ##调用自身 并将次数减1 78 | else: 79 | print(u'开始使用代理') 80 | time.sleep(10) 81 | IP = ''.join(str(random.choice(self.iplist[1:])).strip()) ##下面有解释哦 82 | proxy = {'http': IP} 83 | return self.get(url, timeout, proxy,) ##代理不为空的时候 84 | 85 | else: ##当代理不为空 86 | try: 87 | IP = ''.join(str(random.choice(self.iplist[1:])).strip()) ##将从self.iplist中获取的字符串处理成我们需要的格式(处理了些什么自己看哦,这是基础呢) 88 | proxy = {'http': IP} ##构造成一个代理 89 | return requests.get(url, headers=headers, proxies=proxy, timeout=timeout) ##使用代理获取response 90 | except: 91 | 92 | if num_retries > 0: 93 | time.sleep(10) 94 | IP = ''.join(str(random.choice(self.iplist[1:])).strip()) 95 | proxy = {'http': IP} 96 | print(u'正在更换代理,10S后将重新获取倒数第', num_retries, u'次') 97 | print(u'当前代理是:', proxy) 98 | return self.get(url, timeout, proxy, num_retries - 1) 99 | else: 100 | print(u'代理也不好使了!取消代理') 101 | return self.get(url, 3) 102 | 103 | request = download() ## 104 | 105 | 106 | # if __name__ == '__main__': 107 | # request=download() 108 | # request.get('http://www.mzitu.com/all',3) 109 | -------------------------------------------------------------------------------- /work_two_Crawler/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' -------------------------------------------------------------------------------- /work_two_Crawler/catch_blog.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | #爬取简书上某个文章地址的主体内容 4 | __author__ = 'Demi Yu' 5 | 6 | from bs4 import BeautifulSoup 7 | import requests 8 | import codecs 9 | 10 | 11 | def get_page(url): 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36' 14 | } 15 | data = requests.get(url, headers=headers).content 16 | return data 17 | 18 | 19 | def get_text(html): 20 | parser = BeautifulSoup(html, 'html.parser') 21 | article = parser.find('div', attrs={'class': 'article'}) # 定位文章 22 | title = article.find('h1', attrs={'class': 'title'}).get_text() # 获取标题 23 | text = [] # 创建空列表存放文章 24 | for paragraph in article.find_all('p'): 25 | paragraph_content = paragraph.get_text() 26 | text.append(paragraph_content) # 将文章一段一段的添加到列表中 27 | return title, text 28 | 29 | 30 | def save_text(title, text): 31 | file_name = title + '.txt' 32 | with codecs.open(file_name, 'wb', encoding='utf-8') as open_file: 33 | try: 34 | for p in text: 35 | open_file.write('\t%s\r\n' % p) # \t是tab制表符,\r\n是Carrige Return换行 36 | except Exception: 37 | print('发生了错误!') 38 | print('文章抓取完成!') 39 | return 40 | 41 | 42 | if __name__ == '__main__': 43 | url = 'http://www.jianshu.com/p/293c3b71416e' 44 | html = get_page(url) 45 | title, text = get_text(html) 46 | save_text(title, text) 47 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_blog3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | #爬取简书上某个文章地址的主体内容 4 | __author__ = 'Demi Yu' 5 | 6 | from bs4 import BeautifulSoup 7 | import requests 8 | import codecs 9 | 10 | 11 | def get_page(url): 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/50.0.2661.102 Safari/537.36' 14 | } 15 | data = requests.get(url, headers=headers).content 16 | return data 17 | 18 | 19 | def get_text(html): 20 | parser = BeautifulSoup(html, 'html.parser') 21 | article = parser.find('div', attrs={'class': 'article'}) # 定位文章 22 | title = article.find('h1', attrs={'class': 'title'}).get_text() # 获取标题 23 | text = [] # 创建空列表存放文章 24 | for paragraph in article.find_all('p'): 25 | paragraph_content = paragraph.get_text() 26 | text.append(paragraph_content) # 将文章一段一段的添加到列表中 27 | return title, text 28 | 29 | 30 | def save_text(title, text): 31 | file_name = title + '.txt' 32 | with codecs.open(file_name, 'wb', encoding='utf-8') as open_file: 33 | try: 34 | for p in text: 35 | open_file.write('\t%s\r\n' % p) # \t是tab制表符,\r\n是Carrige Return换行 36 | except Exception: 37 | print('发生了错误!') 38 | print('文章抓取完成!') 39 | return 40 | 41 | 42 | if __name__ == '__main__': 43 | url = 'http://www.jianshu.com/p/293c3b71416e' 44 | html = get_page(url) 45 | title, text = get_text(html) 46 | save_text(title, text) 47 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_blog5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | #爬取简书上某个文章地址的主体内容 4 | __author__ = 'Demi Yu' 5 | 6 | from bs4 import BeautifulSoup 7 | import requests 8 | import codecs 9 | 10 | 11 | def get_page(url): 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36' 14 | } 15 | data = requests.get(url, headers=headers).content 16 | return data 17 | 18 | 19 | def get_text(html): 20 | parser = BeautifulSoup(html, 'html.parser') 21 | article = parser.find('div', attrs={'class': 'article'}) # 定位文章 22 | title = article.find('h1', attrs={'class': 'title'}).get_text() # 获取标题 23 | text = [] # 创建空列表存放文章 24 | for paragraph in article.find_all('p'): 25 | paragraph_content = paragraph.get_text() 26 | text.append(paragraph_content) # 将文章一段一段的添加到列表中 27 | return title, text 28 | 29 | 30 | def save_text(title, text): 31 | file_name = title + '.txt' 32 | with codecs.open(file_name, 'wb', encoding='utf-8') as open_file: 33 | try: 34 | for p in text: 35 | open_file.write('\t%s\r\n' % p) # \t是tab制表符,\r\n是Carrige Return换行 36 | except Exception: 37 | print('发生了错误!') 38 | print('文章抓取完成!') 39 | return 40 | 41 | 42 | if __name__ == '__main__': 43 | url = 'http://www.jianshu.com/p/293c3b71416e' 44 | html = get_page(url) 45 | title, text = get_text(html) 46 | save_text(title, text) 47 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_img.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # 抓取天极图片网某个网址的全部图片 4 | __author__ = 'Demi Yu' 5 | 6 | import re 7 | import urllib.request 8 | import urllib 9 | 10 | 11 | def download_page(url): 12 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) ' 13 | 14 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'} 15 | 16 | request = urllib.request.Request(url, headers=headers) 17 | response = urllib.request.urlopen(request) 18 | data = response.read() 19 | return data 20 | 21 | 22 | def get_image(html): 23 | regx = r'http://[\S]*\.jpg' # 定义正则表达式,意思是所有以.jpg格式结尾的网址 24 | pattern = re.compile(regx) 25 | get_img = re.findall(pattern, repr(html)) # 
用repr方式将初始网址转换为字符串,然后开始按照预定的模式进行查找,将所有符合条件的网址都放入内存中 26 | num = 1 27 | for img in get_img: 28 | image = download_page(img) # 将每个img连接重新解析 29 | with open('%s.jpg' % num, 'wb') as fp: 30 | fp.write(image) 31 | num += 1 32 | print('正在下载第%s 张图片' % num) 33 | return 34 | 35 | 36 | url = 'http://pic.yesky.com/c/6_61112.shtml' 37 | html = download_page(url) 38 | get_image(html) 39 | 40 | # 但是往往裸奔版对一些网站是爬不了的,这时就需要对爬虫进行一些伪装了。伪装浏览器或者加入延时。 41 | 42 | # 伪装的话直接把request请求改成 43 | 44 | # headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) ' 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'} 45 | 46 | # request = urllib.request.Request(url,headers=headers) 47 | 48 | # 这样就成功完成伪装了 49 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_mongo_mzui.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # http://cuiqingcai.com/3179.html 4 | __author__ = 'Demi Yu' 5 | 6 | from bs4 import BeautifulSoup 7 | import os 8 | 9 | from Download import request 10 | from pymongo import MongoClient 11 | import datetime 12 | 13 | class mzitu(): 14 | def __init__(self): 15 | client = MongoClient() ##与MongDB建立连接(这是默认连接本地MongDB数据库) 16 | db = client['meinvxiezhenji'] ## 选择一个数据库 17 | self.meizitu_collection = db['meizitu'] ##在meizixiezhenji这个数据库中,选择一个集合 18 | self.title = '' ##用来保存页面主题 19 | self.url = '' ##用来保存页面地址 20 | self.img_urls = [] ##初始化一个 列表 用来保存图片地址 21 | 22 | 23 | 24 | 25 | def all_url(self, url): 26 | html = request.get(url,3) ##调用request函数把套图地址传进去会返回给我们一个response 27 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a') 28 | retval = os.getcwd() # 查看当前工作目录 '/Users/yuhongjun/Python/python-training/work_two_Crawler' 29 | for a in all_a: 30 | title = a.get_text() 31 | self.title=title #将主题保存到self.title中 32 | print(u'开始保存:', title) ##加点提示不然太枯燥了 33 | path = str(title).replace("?", '_') ##我注意到有个标题带有 ? 这个符号Windows系统是不能创建文件夹的所以要替换掉 34 | self.mkdir(path) ##调用mkdir函数创建文件夹!这儿path代表的是标题title哦!!!!!不要糊涂了哦! 35 | href = a['href'] 36 | self.url=href #将页面地址保存到self.url中 37 | 38 | if self.meizitu_collection.find_one({'主题页面': href}): ##判断这个主题是否已经在数据库中、不在就运行else下的内容,在则忽略。 39 | print(u'这个页面已经爬取过了') 40 | else: 41 | self.html(href) ##调用html函数把href参数传递过去!href是啥还记的吧? 就是套图的地址哦!!不要迷糊了哦! 42 | 43 | os.chdir(retval) ##切换到目录 44 | 45 | def html(self, href): ##这个函数是处理套图地址获得图片的页面地址 46 | html = request.get(href,3) 47 | max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text() 48 | page_num = 0 # 这个当作计数器用 (用来判断图片是否下载完毕) 49 | # for page in range(1, 2): 50 | for page in range(1, int(max_span) + 1): 51 | page_num = page_num + 1 ##每for循环一次就+1 (当page_num等于max_span的时候,就证明我们的在下载最后一张图片了) 52 | page_url = href + '/' + str(page) 53 | self.img(page_url, max_span, page_num) ##调用img函数 把上面我们我们需要的两个变量,传递给下一个函数。 54 | 55 | def img(self, page_url, max_span, page_num): ##这个函数处理图片页面地址获得图片的实际地址 56 | img_html = request.get(page_url,3) 57 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src'] 58 | self.img_urls.append(img_url) ##每一次 for page in range(1, int(max_span) + 1)获取到的图片地址都会添加到 img_urls这个初始化的列表 59 | if int(max_span) == page_num: ##我们传递下来的两个参数用上了 当max_span和Page_num相等时,就是最后一张图片了,最后一次下载图片并保存到数据库中。 60 | self.save(img_url) 61 | post = { ##这是构造一个字典,里面有啥都是中文,很好理解吧! 
62 | '标题': self.title, 63 | '主题页面': self.url, 64 | '图片地址': self.img_urls, 65 | '获取时间': datetime.datetime.now() 66 | } 67 | self.meizitu_collection.save(post) ##将post中的内容写入数据库。 68 | print(u'插入数据库成功') 69 | else: ##max_span 不等于 page_num执行这下面 70 | self.save(img_url) 71 | 72 | def save(self, img_url): ##这个函数保存图片 73 | name = img_url[-9:-4] 74 | print(u'开始保存:', img_url) 75 | img = request.get(img_url,3) 76 | f = open(name + '.jpg', 'ab') 77 | f.write(img.content) 78 | f.close() 79 | 80 | def mkdir(self, path): ##这个函数创建文件夹 81 | path = path.strip() 82 | macPath="Pic/" 83 | isExists = os.path.exists(os.path.join(macPath, path)) 84 | if not isExists: 85 | print(u'建了一个名字叫做', path, u'的文件夹!') 86 | os.makedirs(os.path.join(macPath, path)) 87 | os.chdir(os.path.join(macPath, path)) ##切换到目录 88 | return True 89 | else: 90 | print(u'名字叫做', path, u'的文件夹已经存在了!') 91 | return False 92 | 93 | # def request(self, url): ##这个函数获取网页的response 然后返回 94 | # headers = { 95 | # 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"} 96 | # content = requests.get(url, headers=headers) 97 | # return content 98 | 99 | 100 | Mzitu = mzitu() ##实例化 101 | Mzitu.all_url('http://www.mzitu.com/all') ##给函数all_url传入参数 你可以当作启动爬虫(就是入口) 102 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_mongodb_mzi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | #http://cuiqingcai.com/3363.html 4 | 5 | __author__ = 'Demi Yu' 6 | 7 | import os 8 | import time 9 | import threading 10 | import multiprocessing 11 | from catch_mongodb_queue import MogoQueue 12 | from Download import request 13 | from bs4 import BeautifulSoup 14 | 15 | SLEEP_TIME = 1 16 | 17 | def mzitu_crawler(max_threads=10): 18 | crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue') ##这个是我们获取URL的队列 19 | ##img_queue = MogoQueue('meinvxiezhenji', 'img_queue') 20 | def pageurl_crawler(): 21 | while True: 22 | try: 23 | url = crawl_queue.pop() 24 | print(url) 25 | except KeyError: 26 | print('队列没有数据') 27 | break 28 | else: 29 | img_urls = [] 30 | req = request.get(url, 3).text 31 | title = crawl_queue.pop_title(url) 32 | mkdir(title) 33 | os.chdir('D:\mzitu\\' + title) 34 | max_span = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text() 35 | for page in range(1, int(max_span) + 1): 36 | page_url = url + '/' + str(page) 37 | img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', class_='main-image').find('img')['src'] 38 | img_urls.append(img_url) 39 | save(img_url) 40 | crawl_queue.complete(url) ##设置为完成状态 41 | ##img_queue.push_imgurl(title, img_urls) 42 | ##print('插入数据库成功') 43 | 44 | def save(img_url): 45 | name = img_url[-9:-4] 46 | print(u'开始保存:', img_url) 47 | img = request.get(img_url, 3) 48 | f = open(name + '.jpg', 'ab') 49 | f.write(img.content) 50 | f.close() 51 | 52 | def mkdir(path): 53 | path = path.strip() 54 | isExists = os.path.exists(os.path.join("D:\mzitu", path)) 55 | if not isExists: 56 | print(u'建了一个名字叫做', path, u'的文件夹!') 57 | os.makedirs(os.path.join("D:\mzitu", path)) 58 | return True 59 | else: 60 | print(u'名字叫做', path, u'的文件夹已经存在了!') 61 | return False 62 | 63 | threads = [] 64 | while threads or crawl_queue: 65 | """ 66 | 这儿crawl_queue用上了,就是我们__bool__函数的作用,为真则代表我们MongoDB队列里面还有数据 67 | threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行 68 | """ 69 | for thread in threads: 70 | if not thread.is_alive(): 
-------------------------------------------------------------------------------- /work_two_Crawler/catch_mongodb_mzi.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | #http://cuiqingcai.com/3363.html
4 | 
5 | __author__ = 'Demi Yu'
6 | 
7 | import os
8 | import time
9 | import threading
10 | import multiprocessing
11 | from catch_mongodb_queue import MogoQueue
12 | from Download import request
13 | from bs4 import BeautifulSoup
14 | 
15 | SLEEP_TIME = 1
16 | 
17 | def mzitu_crawler(max_threads=10):
18 |     crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')  ## MongoDB-backed queue of gallery URLs to crawl
19 |     ## img_queue = MogoQueue('meinvxiezhenji', 'img_queue')
20 |     def pageurl_crawler():
21 |         while True:
22 |             try:
23 |                 url = crawl_queue.pop()
24 |                 print(url)
25 |             except KeyError:
26 |                 print('队列没有数据')
27 |                 break
28 |             else:
29 |                 img_urls = []
30 |                 req = request.get(url, 3).text
31 |                 title = crawl_queue.pop_title(url)
32 |                 mkdir(title)
33 |                 os.chdir('D:\mzitu\\' + title)
34 |                 max_span = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
35 |                 for page in range(1, int(max_span) + 1):
36 |                     page_url = url + '/' + str(page)
37 |                     img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', class_='main-image').find('img')['src']
38 |                     img_urls.append(img_url)
39 |                     save(img_url)
40 |                 crawl_queue.complete(url)  ## mark this URL as COMPLETE in the queue
41 |                 ## img_queue.push_imgurl(title, img_urls)
42 |                 ## print('插入数据库成功')
43 | 
44 |     def save(img_url):
45 |         name = img_url[-9:-4]
46 |         print(u'开始保存:', img_url)
47 |         img = request.get(img_url, 3)
48 |         f = open(name + '.jpg', 'ab')
49 |         f.write(img.content)
50 |         f.close()
51 | 
52 |     def mkdir(path):
53 |         path = path.strip()
54 |         isExists = os.path.exists(os.path.join("D:\mzitu", path))
55 |         if not isExists:
56 |             print(u'建了一个名字叫做', path, u'的文件夹!')
57 |             os.makedirs(os.path.join("D:\mzitu", path))
58 |             return True
59 |         else:
60 |             print(u'名字叫做', path, u'的文件夹已经存在了!')
61 |             return False
62 | 
63 |     threads = []
64 |     while threads or crawl_queue:
65 |         """
66 |         crawl_queue appears directly in the condition: thanks to MogoQueue.__bool__ the queue
67 |         object is truthy while MongoDB still holds unfinished URLs, so the loop keeps running
68 |         as long as there are live threads or outstanding work.
69 |         """
70 |         for thread in threads[:]:  ## iterate over a copy so remove() does not skip the next thread
71 |             if not thread.is_alive():  ## is_alive() tells whether the thread is still running; finished threads are dropped from the pool
72 |                 threads.remove(thread)
73 |         while len(threads) < max_threads and crawl_queue.peek():  ## spawn only while the pool is below max_threads and the queue still has OUTSTANDING URLs (the original `or` could create threads without bound)
74 |             thread = threading.Thread(target=pageurl_crawler)  ## create a worker thread
75 |             thread.daemon = True  ## daemon thread (modern spelling of setDaemon(True))
76 |             thread.start()  ## start it
77 |             threads.append(thread)  ## and add it to the pool
78 |         time.sleep(SLEEP_TIME)
79 | 
80 | def process_crawler():
81 |     process = []
82 |     num_cpus = multiprocessing.cpu_count()
83 |     print('将会启动进程数为:', num_cpus)
84 |     for i in range(num_cpus):
85 |         p = multiprocessing.Process(target=mzitu_crawler)  ## create one worker process per CPU
86 |         p.start()  ## start the process
87 |         process.append(p)  ## keep track of it
88 |     for p in process:
89 |         p.join()  ## wait for every worker process to finish
90 | 
91 | if __name__ == "__main__":
92 |     process_crawler()
-------------------------------------------------------------------------------- /work_two_Crawler/catch_mongodb_queue.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | #http://cuiqingcai.com/3363.html
4 | 
5 | __author__ = 'Demi Yu'
6 | 
7 | from datetime import datetime, timedelta
8 | from pymongo import MongoClient, errors
9 | 
10 | 
11 | class MogoQueue():
12 |     OUTSTANDING = 1  ## initial state: waiting to be crawled
13 |     PROCESSING = 2   ## currently being downloaded
14 |     COMPLETE = 3     ## download finished
15 | 
16 |     def __init__(self, db, collection, timeout=300):  ## set up the MongoDB connection
17 |         self.client = MongoClient()
18 |         self.Client = self.client[db]
19 |         self.db = self.Client[collection]
20 |         self.timeout = timeout
21 | 
22 |     def __bool__(self):
23 |         """
24 |         Makes the queue object itself truthy while MongoDB still holds any document
25 |         whose status is not COMPLETE ($ne means "not equal"); the crawler's outer
26 |         while-loop relies on this to decide whether there is still work to do.
27 |         """
28 |         record = self.db.find_one(
29 |             {'status': {'$ne': self.COMPLETE}}
30 |         )
31 |         return True if record else False
32 | 
33 |     def push(self, url, title):  ## add a new URL to the queue
34 |         try:
35 |             self.db.insert({'_id': url, 'status': self.OUTSTANDING, '主题': title})
36 |             print(url, '插入队列成功')
37 |         except errors.DuplicateKeyError as e:  ## a duplicate-key error means the URL is already queued
38 |             print(url, '已经存在于队列中了')
39 |             pass
40 | 
41 |     def push_imgurl(self, title, url):
42 |         try:
43 |             self.db.insert({'_id': title, 'status': self.OUTSTANDING, 'url': url})  ## was 'statue', a typo for 'status'
44 |             print('图片地址插入成功')
45 |         except errors.DuplicateKeyError as e:
46 |             print('地址已经存在了')
47 |             pass
48 | 
49 |     def pop(self):
50 |         """
51 |         Atomically pick one document whose status is OUTSTANDING, flip it to
52 |         PROCESSING with a timestamp (query selects, update modifies, $set works
53 |         like SQL's SET) and return its _id, i.e. the URL.
54 |         If nothing is OUTSTANDING, call repair() to reset any timed-out entries
55 |         and raise KeyError so the caller knows the queue is empty for now.
56 |         """
57 |         record = self.db.find_and_modify(
58 |             query={'status': self.OUTSTANDING},
59 |             update={'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}}
60 |         )
61 |         if record:
62 |             return record['_id']
63 |         else:
64 |             self.repair()
65 |             raise KeyError
66 | 
67 |     def pop_title(self, url):
68 |         record = self.db.find_one({'_id': url})
69 |         return record['主题']
70 | 
71 |     def peek(self):
72 |         """Return the _id (URL) of one document that is still OUTSTANDING, if any."""
73 |         record = self.db.find_one({'status': self.OUTSTANDING})
74 |         if record:
75 |             return record['_id']
76 | 
77 |     def complete(self, url):
78 |         """Mark the given URL as COMPLETE."""
79 |         self.db.update({'_id': url}, {'$set': {'status': self.COMPLETE}})
80 | 
81 |     def repair(self):
82 |         """Reset entries stuck in PROCESSING for longer than the timeout back to OUTSTANDING ($lt means "less than")."""
83 |         record = self.db.find_and_modify(
84 |             query={
85 |                 'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
86 |                 'status': {'$ne': self.COMPLETE}
87 |             },
88 |             update={'$set': {'status': self.OUTSTANDING}}
89 |         )
90 |         if record:
91 |             print('重置URL状态', record['_id'])
92 | 
93 |     def clear(self):
94 |         """Drop the whole collection.  Only ever call this once, right at the start - it wipes the queue."""
95 |         self.db.drop()
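# --- Added usage sketch (not part of the original tutorial code) --------------
# Typical lifecycle of the queue defined above: push a URL, claim it with pop(),
# then mark it complete.  The URL and title are placeholders, and a MongoDB
# instance on localhost is assumed.  Note that insert(), update() and
# find_and_modify() are pymongo 2.x/3.x calls; on pymongo 4.x the equivalents
# would be insert_one(), update_one() and find_one_and_update().
if __name__ == '__main__':
    queue = MogoQueue('meinvxiezhenji', 'crawl_queue')
    queue.push('http://www.mzitu.com/demo', 'demo gallery')  # enqueue -> OUTSTANDING
    url = queue.pop()                                        # claim it -> PROCESSING
    queue.complete(url)                                      # done     -> COMPLETE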
"""这个函数只有第一次才调用、后续不要调用、因为这是删库啊!""" 95 | self.db.drop() -------------------------------------------------------------------------------- /work_two_Crawler/catch_mzui.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # http://cuiqingcai.com/3179.html 4 | __author__ = 'Demi Yu' 5 | 6 | from bs4 import BeautifulSoup 7 | import os 8 | 9 | from Download import request 10 | 11 | class mzitu(): 12 | def all_url(self, url): 13 | html = request.get(url,3) ##调用request函数把套图地址传进去会返回给我们一个response 14 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a') 15 | retval = os.getcwd() # 查看当前工作目录 '/Users/yuhongjun/Python/python-training/work_two_Crawler' 16 | for a in all_a: 17 | title = a.get_text() 18 | print(u'开始保存:', title) ##加点提示不然太枯燥了 19 | path = str(title).replace("?", '_') ##我注意到有个标题带有 ? 这个符号Windows系统是不能创建文件夹的所以要替换掉 20 | self.mkdir(path) ##调用mkdir函数创建文件夹!这儿path代表的是标题title哦!!!!!不要糊涂了哦! 21 | href = a['href'] 22 | self.html(href) ##调用html函数把href参数传递过去!href是啥还记的吧? 就是套图的地址哦!!不要迷糊了哦! 23 | os.chdir(retval) ##切换到目录 24 | 25 | def html(self, href): ##这个函数是处理套图地址获得图片的页面地址 26 | html = request.get(href,3) 27 | max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text() 28 | # for page in range(1, int(max_span) + 1): 29 | for page in range(1, 2): 30 | page_url = href + '/' + str(page) 31 | self.img(page_url) ##调用img函数 32 | 33 | def img(self, page_url): ##这个函数处理图片页面地址获得图片的实际地址 34 | img_html = request.get(page_url,3) 35 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src'] 36 | self.save(img_url) 37 | 38 | def save(self, img_url): ##这个函数保存图片 39 | name = img_url[-9:-4] 40 | img = request.get(img_url,3) 41 | f = open(name + '.jpg', 'ab') 42 | f.write(img.content) 43 | f.close() 44 | 45 | def mkdir(self, path): ##这个函数创建文件夹 46 | path = path.strip() 47 | macPath="Pic/" 48 | isExists = os.path.exists(os.path.join(macPath, path)) 49 | if not isExists: 50 | print(u'建了一个名字叫做', path, u'的文件夹!') 51 | os.makedirs(os.path.join(macPath, path)) 52 | os.chdir(os.path.join(macPath, path)) ##切换到目录 53 | return True 54 | else: 55 | print(u'名字叫做', path, u'的文件夹已经存在了!') 56 | return False 57 | 58 | # def request(self, url): ##这个函数获取网页的response 然后返回 59 | # headers = { 60 | # 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"} 61 | # content = requests.get(url, headers=headers) 62 | # return content 63 | 64 | 65 | Mzitu = mzitu() ##实例化 66 | Mzitu.all_url('http://www.mzitu.com/all') ##给函数all_url传入参数 你可以当作启动爬虫(就是入口) 67 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_tianmao_rating.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' 5 | 6 | # 导入所需的开发模块 7 | import requests 8 | import re 9 | # 创建循环链接 10 | urls = [] 11 | #处理前100页的评价 12 | for i in list(range(1,10)): 13 | urls.append('https://rate.tmall.com/list_detail_rate.htm?itemId=521136254098&spuId=345965243&sellerId=2106525799&order=1¤tPage=%s' %i) 14 | 15 | # 构建字段容器 16 | nickname = [] #昵称 17 | ratedate = [] #评价时间 18 | color = [] #款式 19 | size = [] #尺码 20 | ratecontent = [] #评价内容 21 | # 循环抓取数据 22 | for url in urls: 23 | content = requests.get(url).text 24 | 25 | # 借助正则表达式使用findall进行匹配查询,可以使用bs 26 | 
-------------------------------------------------------------------------------- /work_two_Crawler/save_cookie.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | 
4 | __author__ = 'Demi Yu'
5 | 
6 | 
7 | # Written for Python 3.6 + PyCharm
8 | # Original article: http://cuiqingcai.com/968.html
9 | #
10 | #
11 | # 1. Get cookies from the server and keep them in a variable.
12 | # Overall flow:
13 | # declare a CookieJar object to hold the cookies -> create a cookie handler to process them -> build an opener that uses the handler -> make a request, which generates the cookies -> the cookies end up in memory
14 | 
15 | # import http.cookiejar
16 | # import urllib.request
17 | #
18 | # cookie = http.cookiejar.CookieJar()
19 | # # declare a CookieJar instance to hold the cookies
20 | # handler = urllib.request.HTTPCookieProcessor(cookie)
21 | # # create the cookie handler; handler is an HTTPCookieProcessor instance
22 | # opener = urllib.request.build_opener(handler)
23 | # # build an opener that reads what the handler produced
24 | # response = opener.open('http://www.baidu.com')
25 | # # make the request that generates the cookies; open() works like urlopen() and also accepts a Request object
26 | # for item in cookie:
27 | #     print('Name = ' + item.name)
28 | #     print('Value = ' + item.value)
29 | #
30 | 
31 | 
32 | 
33 | # 2. Save the cookies to a file.
34 | # Overall flow:
35 | # create the cookie-saving helper (giving it a file name) -> create a cookie handler -> build an opener that uses the handler -> make a request, which generates the cookies -> save the cookies to the file
36 | 
37 | # import http.cookiejar
38 | # import urllib.request
39 | #
40 | # cookie = http.cookiejar.MozillaCookieJar('cookie.txt')
41 | # # create the cookie-saving helper (with the target file name)
42 | # handler = urllib.request.HTTPCookieProcessor(cookie)
43 | # # create the cookie handler; handler is an HTTPCookieProcessor instance
44 | # opener = urllib.request.build_opener(handler)
45 | # # build an opener that uses the handler
46 | # response = opener.open("http://www.baidu.com")
47 | # # make the request that generates the cookies
48 | # cookie.save(ignore_discard=True, ignore_expires=True)
49 | # # write the cookies to the file
50 | #
51 | 
52 | 
53 | # 3. Load the cookies from the file and use them.
54 | # Overall flow:
55 | # create an empty cookie jar -> load the cookies from the file -> build the Request -> create a cookie handler -> build an opener that carries the saved cookies -> send the request and get the server's response -> print the response
56 | 
57 | 
58 | import http.cookiejar
59 | import urllib.request
60 | 
61 | cookie = http.cookiejar.MozillaCookieJar()
62 | # create an empty cookie jar
63 | cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
64 | # load the cookies from the file into the jar
65 | req = urllib.request.Request("http://www.baidu.com")
66 | # build the Request
67 | handler = urllib.request.HTTPCookieProcessor(cookie)
68 | # create the cookie handler
69 | opener = urllib.request.build_opener(handler)
70 | # build an opener that carries the saved cookies
71 | response = opener.open(req)
72 | # send the request and get the server's response
73 | print(response.read())
74 | # print the response
-------------------------------------------------------------------------------- /work_two_Crawler/test2.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | #http://cuiqingcai.com/3335.html
4 | import multiprocessing
5 | import time
6 | 
7 | def process(num):
8 |     time.sleep(num)
9 |     print('Process:', num)
10 | 
11 | 
if __name__ == '__main__': 12 | for i in range(5): 13 | p = multiprocessing.Process(target=process, args=(i,)) 14 | p.start() 15 | 16 | print('CPU number:' + str(multiprocessing.cpu_count())) 17 | for p in multiprocessing.active_children(): 18 | print('Child process name: ' + p.name + ' id: ' + str(p.pid)) 19 | 20 | print('Process Ended') 21 | 22 | -------------------------------------------------------------------------------- /work_two_Crawler/《幸福之路》(一)——开篇.txt: -------------------------------------------------------------------------------- 1 | 《元认知》的最后一部分推荐了一些书目,刚好这本《幸福之路》在手边。借着脑袋还有“元认知”的余热,拿来巩固一下。 2 | 伯特兰·罗素,十九世纪的哲学家、数学家和历史学家。最开始知道他是因为《数学原理》这本书,所以脑海中一直认为他是一个数学家。没想到,他在历史和哲学上的成就并不比在数学中的差。不仅如此,他还获得过“诺贝尔文学奖”。这本《幸福之路》是写于1930年的,距今已经有87年了,文化、科技、医学等等跟“幸福”相关的因素都与现在有着巨大的差距。存在如此巨大差距的情况下,我们可以看看当时的一位智者是怎样思考“幸福”这个一直到现在还在困扰着我们的话题。 3 | 我们人类并不会因为有了足够的食物,就会变得很快乐。事实证明,太多的人似乎都经常处于一种不快乐的状态中。造成这种状况的原因,一部分是由于社会制度的缘故,一部分由于个人心理的缘故。既然充足的食物并不能使人们快乐,那么类推,财富似乎也不是解决人们不快乐的万能钥匙。那么如果想靠单纯地追逐财富来解决不快乐的问题,就显得有点滑稽。这种在大多数国度中大多数人都存在的一种“日常的烦恼”实在让人讨厌,有没有一种方法能够让人摆脱这种可恶的状态呢? 4 | 作者从童年感到生活的漫长苦闷到写这本书时感到明显对生活的热爱,产生了怎么样的变化呢?主要有这么三点:第一,发现了自己最渴望的的东西是什么,并且逐渐得到了不少;第二,成功抛弃了某些根本就不可能实现的欲望,比如获得某种无可置疑的知识的欲望;第三,不那么经常地想着自己,也就是对自己宽容许多。这是作者自己给自己的总结,虽然并不一定适用于所有人,但至少可以拿来借鉴一下。另外,对于那种自我专注过分的人,作者给出的方法是利用外部的戒律,强行将自己的思绪从自身分离出来。 5 | 这里面作者通过举例,解释了一下所谓的“自我专注”。普遍的三类——“犯罪狂”、“自恋狂”、“自大狂”。 6 | 犯罪狂:指沉溺于犯罪意识中的人,并且为此导致自己讨厌自己。这种人通常会让自己时刻处在儿时外界设立的一些禁律之中,这样现实中的自己永远与想象中的自己产生强烈的冲突。身处这样一种境况,如何能感受到幸福。很明显,打破那些童年中架设在这些人身上的荒谬枷锁,是他们重获新生、迈向快乐的第一步。 7 | 自恋狂:跟上面的“犯罪狂”刚好相反,惯于自我欣赏和希望受人欣赏,而且往往过度。这种人失去了爱的能力,渴望得到的是一种强烈的、被所有人关注的欲望。他们对自己以外的人和事都不会再真正感兴趣了。 8 | 自大狂:更渴望权利而不是魅力,与令人爱戴相比,更倾向于选择令人畏惧。过分追求权力会使人不快乐、愚蠢或者既不快乐又愚蠢。那种疯癫式的“我为王”的病态快乐,实际上是一种人格不健全的、屈辱的产物。对于权力的过分追求,会让人变得可悲。 9 | 不快乐的原因多种多样,一言以蔽之:凡有任何明显的精神分析意义上的抑制出现,就没有真正的快乐可言,这让“不快乐”都有了某种共同的元素。一个人在年轻的时候一些正常的心理需求却没有得倒满足,ta就会把能达到这种满足看得比其它任何事情都重要(可以解释受父母忽略的孩子长大之后总是过分渴望得到朋友甚至恋人的关注,这给健康的亲密关系带来了极大的麻烦)。而过分强调在这一方面得到满足,会让ta在相当长的时间里放弃寻找其它快乐的方向,至此,让自己的生活变得脆弱而又单一,真正的幸福快乐也就无从谈起了。 10 | 简单介绍导致人们不快乐的一些原因,后面会对给我们生活造成困扰的几大常见问题展开讨论。 11 | 12 | 13 | 日更挑战第八天 14 | --------------------------------------------------------------------------------