├── .gitignore
├── CollectiveIntelligence
├── chapter10
│   ├── Thumbs.db
│   ├── articles.txt
│   ├── clusters.py
│   ├── docclass.py
│   ├── features.txt
│   ├── newsfeatures.py
│   ├── nnmf.py
│   ├── stockfeatures.txt
│   └── stockvolume.py
├── chapter11
│   └── gp.py
├── chapter2
│   ├── deliciousrec.py
│   ├── pydelicious.py
│   └── recommendations.py
├── chapter3
│   ├── Thumbs.db
│   ├── blogdata.txt
│   ├── clusters.py
│   ├── downloadzebodata.py
│   ├── feedlist.txt
│   ├── generatefeedvector.py
│   └── zebo.txt
├── chapter4
│   ├── nn.py
│   └── searchengine.py
├── chapter5
│   ├── dorm.py
│   ├── kayak.py
│   ├── optimization.py
│   ├── schedule.txt
│   └── socialnetwork.py
├── chapter6
│   ├── docclass.py
│   ├── feedfilter.py
│   ├── python_search.xml
│   ├── test.db
│   └── test1.db
├── chapter7
│   ├── Thumbs.db
│   ├── addresslist.txt
│   ├── hotornot.py
│   ├── treepredict.py
│   └── zillow.py
├── chapter8
│   ├── ebaypredict.py
│   ├── numpredict.py
│   └── optimization.py
└── chapter9
│   ├── advancedclassify.py
│   ├── agesonly.csv
│   ├── facebook.py
│   ├── matchmaker.csv
│   └── svm.py
├── KNN
├── CF Recommendation System.py
├── __init__.py
├── knn-Euclidean Distance.ipynb
├── knn.ipynb
└── knn.py
├── LICENSE
├── README.md
├── Untitled Diagram.png
├── Untitled Diagram.xml
├── com.xml
├── data
└── iris.data.csv
├── kaggle
├── Chapter_1.1.ipynb
├── Chapter_1.4.ipynb
└── Datasets
│   └── Breast-Cancer
│   └── breast-cancer-test.csv
├── scripts
├── consumer.py
└── producer.py
├── work_one
├── Asyncio_hello.py
├── FLASK_app.py
├── IO.py
├── WSGI_hello.py
├── WSGI_server.py
├── code.jpg
├── distributed
│   ├── task_master.py
│   └── task_worker.py
├── leet.py
├── mydict.py
├── mydict2.py
├── mydict_test.py
├── myfile.py
├── requestUrlTest.py
├── script1.py
├── sina.html
├── templates
│   ├── form.html
│   ├── home.html
│   └── signin-ok.html
├── test.db
├── test1.jpg
├── test22.jpg
├── test33.png
├── test44.bmp
├── work_GUI.py
├── work_HTMLParser.py
├── work_PILImageDraw.py
├── work_TCP_client.py
├── work_TCP_server.py
├── work_UDP_client.py
├── work_UDP_server.py
├── work_data_MYSQL.py
├── work_data_SQLAlchemy.py
├── work_data_SQLite.py
├── work_mail_POP3.py
├── work_mail_SMTP.py
└── work_register.py
└── work_two_Crawler
├── 86.jpg
├── 93.jpg
├── 94.jpg
├── Download.py
├── __init__.py
├── catch_blog.py
├── catch_blog3.py
├── catch_blog5.py
├── catch_img.py
├── catch_mongo_mzui.py
├── catch_mongodb_mzi.py
├── catch_mongodb_queue.py
├── catch_mzui.py
├── catch_tianmao_rating.py
├── save_cookie.py
├── test2.py
├── 《幸福之路》(一)——开篇.txt
└── 南极人天猫评价.csv
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea 91 | node_modules 92 | saveit.txt 93 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter10/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/CollectiveIntelligence/chapter10/Thumbs.db -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter10/docclass.py: -------------------------------------------------------------------------------- 1 | import re 2 | import math 3 | import cPickle 4 | from pysqlite2 import dbapi2 as sqlite 5 | 6 | def getwords(doc): 7 | splitter=re.compile('\\W*') 8 | words=[s.lower() for s in splitter.split(doc) 9 | if len(s)>2 and len(s)<20] 10 | 11 | # Return the unique set of words only 12 | return dict([(w,1) for w in words]) 13 | 14 | #def entryfeatures(entry): 15 | 16 | def sampletrain(cl): 17 | cl.train('Nobody owns the water.','good') 18 | cl.train('the quick rabbit jumps fences','good') 19 | cl.train('buy pharmaceuticals now','bad') 20 | cl.train('make quick money at the online casino','bad') 21 | cl.train('the quick brown fox jumps','good') 22 | 23 | class classifier: 24 | def __init__(self,getfeatures): 25 | self.fc={} 26 | self.cc={} 27 | self.getfeatures=getfeatures 28 | 29 | def setdb(self,dbfile): 30 | self.con=sqlite.connect(dbfile) 31 | self.con.execute('create table if not exists fc(feature,category,count)') 32 | self.con.execute('create table if not exists cc(category,count)') 33 | 34 | def incf(self,f,cat): 35 | count=self.fcount(f,cat) 36 | if count==0: 37 | self.con.execute("insert into fc values ('%s','%s',1)" 38 | % (f,cat)) 39 | else: 40 | self.con.execute( 41 | "update fc set count=%d where feature='%s' and category='%s'" 42 | % (count+1,f,cat)) 43 | 44 | def fcount(self,f,cat): 45 | res=self.con.execute( 46 | 'select count from fc where feature="%s" and category="%s"' 47 | %(f,cat)).fetchone() 48 | if res==None: return 0 49 | else: return float(res[0]) 50 | 51 | def incc(self,cat): 52 | count=self.catcount(cat) 53 | if count==0: 54 | self.con.execute("insert into cc values ('%s',1)" % (cat)) 55 | else: 56 | self.con.execute("update cc set count=%d where category='%s'" 57 | % (count+1,cat)) 58 | 59 | def catcount(self,cat): 60 | res=self.con.execute('select count from cc where category="%s"' 61 | %(cat)).fetchone() 62 | if res==None: return 0.0 63 | else: return float(res[0]) 64 | 65 | def categories(self): 66 | cur=self.con.execute('select category from cc'); 67 | return [d[0] 
for d in cur] 68 | 69 | def totalcount(self): 70 | res=self.con.execute('select sum(count) from cc').fetchone(); 71 | if res==None: return 0 72 | return res[0] 73 | 74 | 75 | """ 76 | def incf(self,f,cat): 77 | self.fc.setdefault(f,{}) 78 | self.fc[f].setdefault(cat,0) 79 | self.fc[f][cat]+=1 80 | 81 | def incc(self,cat): 82 | self.cc.setdefault(cat,0) 83 | self.cc[cat]+=1 84 | 85 | def fcount(self,f,cat): 86 | if f in self.fc and cat in self.fc[f]: 87 | return float(self.fc[f][cat]) 88 | return 0.0 89 | 90 | def catcount(self,cat): 91 | if cat in self.cc: 92 | return float(self.cc[cat]) 93 | return 0 94 | 95 | def totalcount(self): 96 | return sum(self.cc.values()) 97 | 98 | def categories(self): 99 | return self.cc.keys() 100 | """ 101 | 102 | 103 | def train(self,item,cat): 104 | features=self.getfeatures(item) 105 | for f in features: 106 | self.incf(f,cat) 107 | self.incc(cat) 108 | self.con.commit() 109 | 110 | def fprob(self,f,cat): 111 | if self.catcount(cat)==0: return 0 112 | return self.fcount(f,cat)/self.catcount(cat) 113 | 114 | def setfilename(self,filename): 115 | self.filename=filename 116 | self.restoredata() 117 | 118 | def restoredata(self): 119 | try: f=file(self.filename,'rb') 120 | except: return 121 | self.fc=cPickle.load(f) 122 | self.cc=cPickle.load(f) 123 | f.close() 124 | 125 | def savedata(self): 126 | f=file(self.filename,'wb') 127 | cPickle.dump(self.fc,f,True) 128 | cPickle.dump(self.cc,f,True) 129 | f.close() 130 | def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5): 131 | basicprob=prf(f,cat) 132 | totals=sum([self.fcount(f,c) for c in self.categories()]) 133 | bp=((weight*ap)+(totals*basicprob))/(weight+totals) 134 | return bp 135 | 136 | 137 | 138 | class naivebayes(classifier): 139 | def __init__(self,getfeatures): 140 | classifier.__init__(self,getfeatures) 141 | self.thresholds={} 142 | 143 | def setthreshold(self,cat,t): 144 | self.thresholds[cat]=t 145 | 146 | def getthreshold(self,cat): 147 | if cat not in self.thresholds: return 1.0 148 | return self.thresholds[cat] 149 | 150 | def classify(self,item,default=None): 151 | probs={} 152 | max=0.0 153 | for cat in self.categories(): 154 | probs[cat]=self.prob(item,cat) 155 | if probs[cat]>max: 156 | max=probs[cat] 157 | best=cat 158 | for cat in probs: 159 | if cat==best: continue 160 | if probs[cat]*self.getthreshold(best)>probs[best]: return default 161 | return best 162 | 163 | def docprob(self,item,cat): 164 | features=self.getfeatures(item) 165 | p=1 166 | for f in features: p*=self.weightedprob(f,cat,self.fprob) 167 | return p 168 | 169 | 170 | def prob(self,item,cat): 171 | catprob=self.catcount(cat)/self.totalcount() 172 | docprob=self.docprob(item,cat) 173 | return docprob*catprob 174 | 175 | class fisherclassifier(classifier): 176 | def __init__(self,getfeatures): 177 | classifier.__init__(self,getfeatures) 178 | self.minimums={} 179 | 180 | def setminimum(self,cat,min): 181 | self.minimums[cat]=min 182 | 183 | def getminimum(self,cat): 184 | if cat not in self.minimums: return 0 185 | return self.minimums[cat] 186 | 187 | def classify(self,item,default=None): 188 | best=default 189 | max=0.0 190 | for c in self.categories(): 191 | p=self.fisherprob(item,c) 192 | if p>self.getminimum(c) and p>max: 193 | best=c 194 | max=p 195 | return best 196 | 197 | 198 | def cprob(self,f,cat): 199 | # The frequency of this feature in this category 200 | clf=self.fprob(f,cat) 201 | 202 | if clf==0: return 0.0 203 | 204 | # The frequency of this feature in all the categories 205 | freqsum=sum([self.fprob(f,c) 
for c in self.categories()]) 206 | 207 | # The probability is the frequency in this category divided by 208 | # the overall frequency 209 | p=clf/(freqsum) 210 | 211 | return p 212 | 213 | 214 | def fisherprob(self,item,cat): 215 | p=1 216 | features=self.getfeatures(item) 217 | for f in features: 218 | p*=(self.weightedprob(f,cat,self.cprob)) 219 | fscore=-2*math.log(p) 220 | return self.chi2P(fscore,len(features)*2) 221 | 222 | def chi2P(self,chi,df): 223 | m = chi / 2.0 224 | sum = term = math.exp(-m) 225 | for i in range(1, df//2): 226 | term *= m / i 227 | sum += term 228 | return min(sum, 1.0) 229 | 230 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter10/features.txt: -------------------------------------------------------------------------------- 1 | ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 2 | (6.58003120192, u'The Abs Diet by David Zinczenko') 3 | (5.9231935598, u"I did'nt diet to get in shape for Trinidad's Carnival.....") 4 | (5.04673654071, u'Sensible Diet & Exercise') 5 | 6 | ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 7 | (6.52183126318, u'food/exercise Monday 10/1') 8 | (5.94642162786, u'diet/exercise 10/4') 9 | (5.3332773133, u'food/exercise Friday 10/5') 10 | 11 | ['food', 'calories', 'than', 'easy', 'high', 'come'] 12 | (9.98464450123, u'High or low fat food? Easy trick for figuring it out') 13 | (3.41252863148, u'Oatmeal, cereal of choice.') 14 | (3.19119866786, u'Food and Workout Log 10.8.07') 15 | 16 | ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 17 | (7.46811621754, u'saturday') 18 | (5.62839188358, u'diet-exercise thursday') 19 | (5.29370213306, u'sleepy food/fitness thursday') 20 | 21 | ['food', 'home', 'then', 'exercise', 'morning', 'went'] 22 | (5.22083940456, u'Food & Exercise -- 10/5/2007') 23 | (5.16310413391, u'Food & Exercise -- 10/4/2007') 24 | (4.75585045074, u'Food & Exercise -- 9/28/2007 (yesterday)') 25 | 26 | ['fats', 'quot', 'this', 'good', 'about', 'like'] 27 | (14.9233786406, u'Good fats bad fats') 28 | (1.3775418859, u'Should we ban marathons?') 29 | (1.37194239805, u'Food & Exercise -- 10/3/2007') 30 | 31 | ['quot', 'they', 'money', 'want', 'very', 'best'] 32 | (6.1620884463, u'More about the Chicago marathon') 33 | (5.58276496802, u'LOUIE + LINESMAKER = $$$$') 34 | (4.04959173123, u'High or low fat food? 
Easy trick for figuring it out') 35 | 36 | ['that', 'much', 'does', 'exercise', 'this', 'morning'] 37 | (7.73926153154, u'Food & Exercise -- 10/7/2007') 38 | (5.96451663382, u'< 1g, etc.') 39 | (3.81276353396, u"why I'm succeeding, finally, with my fitness") 40 | 41 | ['with', 'your', 'weight', 'have', 'control', 'about'] 42 | (6.78756986407, u'Control ur Weight') 43 | (5.54567450388, u'Flu-Busting Chicken Soup') 44 | (5.21079777525, u'Weight Loss Tips') 45 | 46 | ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 47 | (5.58477112035, u'Food and Workout Log 9.27.08') 48 | (5.48488799917, u'Food and Workout Log 10.3.07') 49 | (5.10395750879, u'Food and Workout Log 10.10.07') 50 | 51 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter10/newsfeatures.py: -------------------------------------------------------------------------------- 1 | import feedparser 2 | import re 3 | 4 | 5 | feedlist=['http://today.reuters.com/rss/topNews', 6 | 'http://today.reuters.com/rss/domesticNews', 7 | 'http://today.reuters.com/rss/worldNews', 8 | 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml', 9 | 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml', 10 | 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml', 11 | 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml', 12 | 'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml', 13 | 'http://www.nytimes.com/services/xml/rss/nyt/International.xml', 14 | 'http://news.google.com/?output=rss', 15 | 'http://feeds.salon.com/salon/news', 16 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss', 17 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss', 18 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss', 19 | 'http://rss.cnn.com/rss/edition.rss', 20 | 'http://rss.cnn.com/rss/edition_world.rss', 21 | 'http://rss.cnn.com/rss/edition_us.rss'] 22 | 23 | def stripHTML(h): 24 | p='' 25 | s=0 26 | for c in h: 27 | if c=='<': s=1 28 | elif c=='>': 29 | s=0 30 | p+=' ' 31 | elif s==0: p+=c 32 | return p 33 | 34 | 35 | def separatewords(text): 36 | splitter=re.compile('\\W*') 37 | return [s.lower() for s in splitter.split(text) if len(s)>3] 38 | 39 | def getarticlewords(): 40 | allwords={} 41 | articlewords=[] 42 | articletitles=[] 43 | ec=0 44 | # Loop over every feed 45 | for feed in feedlist: 46 | f=feedparser.parse(feed) 47 | 48 | # Loop over every article 49 | for e in f.entries: 50 | # Ignore identical articles 51 | if e.title in articletitles: continue 52 | 53 | # Extract the words 54 | txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8')) 55 | words=separatewords(txt) 56 | articlewords.append({}) 57 | articletitles.append(e.title) 58 | 59 | # Increase the counts for this word in allwords and in articlewords 60 | for word in words: 61 | allwords.setdefault(word,0) 62 | allwords[word]+=1 63 | articlewords[ec].setdefault(word,0) 64 | articlewords[ec][word]+=1 65 | ec+=1 66 | return allwords,articlewords,articletitles 67 | 68 | def makematrix(allw,articlew): 69 | wordvec=[] 70 | 71 | # Only take words that are common but not too common 72 | for w,c in allw.items(): 73 | if c>3 and c10: 35 | out.write(item) 36 | for user in range(0,currentuser): 37 | if user in owners: out.write('\t1') 38 | else: out.write('\t0') 39 | out.write('\n') 40 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter3/feedlist.txt: -------------------------------------------------------------------------------- 1 | 
http://feeds.feedburner.com/37signals/beMH 2 | http://feeds.feedburner.com/blogspot/bRuz 3 | http://battellemedia.com/index.xml 4 | http://blog.guykawasaki.com/index.rdf 5 | http://blog.outer-court.com/rss.xml 6 | http://feeds.searchenginewatch.com/sewblog 7 | http://blog.topix.net/index.rdf 8 | http://blogs.abcnews.com/theblotter/index.rdf 9 | http://feeds.feedburner.com/ConsumingExperienceFull 10 | http://flagrantdisregard.com/index.php/feed/ 11 | http://featured.gigaom.com/feed/ 12 | http://gizmodo.com/index.xml 13 | http://gofugyourself.typepad.com/go_fug_yourself/index.rdf 14 | http://googleblog.blogspot.com/rss.xml 15 | http://feeds.feedburner.com/GoogleOperatingSystem 16 | http://headrush.typepad.com/creating_passionate_users/index.rdf 17 | http://feeds.feedburner.com/instapundit/main 18 | http://jeremy.zawodny.com/blog/rss2.xml 19 | http://joi.ito.com/index.rdf 20 | http://feeds.feedburner.com/Mashable 21 | http://michellemalkin.com/index.rdf 22 | http://moblogsmoproblems.blogspot.com/rss.xml 23 | http://newsbusters.org/node/feed 24 | http://beta.blogger.com/feeds/27154654/posts/full?alt=rss 25 | http://feeds.feedburner.com/paulstamatiou 26 | http://powerlineblog.com/index.rdf 27 | http://feeds.feedburner.com/Publishing20 28 | http://radar.oreilly.com/index.rdf 29 | http://scienceblogs.com/pharyngula/index.xml 30 | http://scobleizer.wordpress.com/feed/ 31 | http://sethgodin.typepad.com/seths_blog/index.rdf 32 | http://rss.slashdot.org/Slashdot/slashdot 33 | http://thinkprogress.org/feed/ 34 | http://feeds.feedburner.com/andrewsullivan/rApM 35 | http://wilwheaton.typepad.com/wwdnbackup/index.rdf 36 | http://www.43folders.com/feed/ 37 | http://www.456bereastreet.com/feed.xml 38 | http://www.autoblog.com/rss.xml 39 | http://www.bloggersblog.com/rss.xml 40 | http://www.bloglines.com/rss/about/news 41 | http://www.blogmaverick.com/rss.xml 42 | http://www.boingboing.net/index.rdf 43 | http://www.buzzmachine.com/index.xml 44 | http://www.captainsquartersblog.com/mt/index.rdf 45 | http://www.coolhunting.com/index.rdf 46 | http://feeds.copyblogger.com/Copyblogger 47 | http://feeds.feedburner.com/crooksandliars/YaCP 48 | http://feeds.dailykos.com/dailykos/index.xml 49 | http://www.deadspin.com/index.xml 50 | http://www.downloadsquad.com/rss.xml 51 | http://www.engadget.com/rss.xml 52 | http://www.gapingvoid.com/index.rdf 53 | http://www.gawker.com/index.xml 54 | http://www.gothamist.com/index.rdf 55 | http://www.huffingtonpost.com/raw_feed_index.rdf 56 | http://www.hyperorg.com/blogger/index.rdf 57 | http://www.joelonsoftware.com/rss.xml 58 | http://www.joystiq.com/rss.xml 59 | http://www.kotaku.com/index.xml 60 | http://feeds.kottke.org/main 61 | http://www.lifehack.org/feed/ 62 | http://www.lifehacker.com/index.xml 63 | http://littlegreenfootballs.com/weblog/lgf-rss.php 64 | http://www.makezine.com/blog/index.xml 65 | http://www.mattcutts.com/blog/feed/ 66 | http://xml.metafilter.com/rss.xml 67 | http://www.mezzoblue.com/rss/index.xml 68 | http://www.micropersuasion.com/index.rdf 69 | http://www.neilgaiman.com/journal/feed/rss.xml 70 | http://www.oilman.ca/feed/ 71 | http://www.perezhilton.com/index.xml 72 | http://www.plasticbag.org/index.rdf 73 | http://www.powazek.com/rss.xml 74 | http://www.problogger.net/feed/ 75 | http://feeds.feedburner.com/QuickOnlineTips 76 | http://www.readwriteweb.com/rss.xml 77 | http://www.schneier.com/blog/index.rdf 78 | http://scienceblogs.com/sample/combined.xml 79 | http://www.seroundtable.com/index.rdf 80 | http://www.shoemoney.com/feed/ 81 | 
http://www.sifry.com/alerts/index.rdf 82 | http://www.simplebits.com/xml/rss.xml 83 | http://feeds.feedburner.com/Spikedhumor 84 | http://www.stevepavlina.com/blog/feed 85 | http://www.talkingpointsmemo.com/index.xml 86 | http://www.tbray.org/ongoing/ongoing.rss 87 | http://feeds.feedburner.com/TechCrunch 88 | http://www.techdirt.com/techdirt_rss.xml 89 | http://www.techeblog.com/index.php/feed/ 90 | http://www.thesuperficial.com/index.xml 91 | http://www.tmz.com/rss.xml 92 | http://www.treehugger.com/index.rdf 93 | http://www.tuaw.com/rss.xml 94 | http://www.valleywag.com/index.xml 95 | http://www.we-make-money-not-art.com/index.rdf 96 | http://www.wired.com/rss/index.xml 97 | http://www.wonkette.com/index.xml 98 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter3/generatefeedvector.py: -------------------------------------------------------------------------------- 1 | import feedparser 2 | import re 3 | 4 | # Returns title and dictionary of word counts for an RSS feed 5 | def getwordcounts(url): 6 | # Parse the feed 7 | d=feedparser.parse(url) 8 | wc={} 9 | 10 | # Loop over all the entries 11 | for e in d.entries: 12 | if 'summary' in e: summary=e.summary 13 | else: summary=e.description 14 | 15 | # Extract a list of words 16 | words=getwords(e.title+' '+summary) 17 | for word in words: 18 | wc.setdefault(word,0) 19 | wc[word]+=1 20 | return d.feed.title,wc 21 | 22 | def getwords(html): 23 | # Remove all the HTML tags 24 | txt=re.compile(r'<[^>]+>').sub('',html) 25 | 26 | # Split words by all non-alpha characters 27 | words=re.compile(r'[^A-Z^a-z]+').split(txt) 28 | 29 | # Convert to lowercase 30 | return [word.lower() for word in words if word!=''] 31 | 32 | 33 | apcount={} 34 | wordcounts={} 35 | feedlist=[line for line in file('feedlist.txt')] 36 | for feedurl in feedlist: 37 | try: 38 | title,wc=getwordcounts(feedurl) 39 | wordcounts[title]=wc 40 | for word,count in wc.items(): 41 | apcount.setdefault(word,0) 42 | if count>1: 43 | apcount[word]+=1 44 | except: 45 | print 'Failed to parse feed %s' % feedurl 46 | 47 | wordlist=[] 48 | for w,bc in apcount.items(): 49 | frac=float(bc)/len(feedlist) 50 | if frac>0.1 and frac<0.5: 51 | wordlist.append(w) 52 | 53 | out=file('blogdata1.txt','w') 54 | out.write('Blog') 55 | for word in wordlist: out.write('\t%s' % word) 56 | out.write('\n') 57 | for blog,wc in wordcounts.items(): 58 | print blog 59 | out.write(blog) 60 | for word in wordlist: 61 | if word in wc: out.write('\t%d' % wc[word]) 62 | else: out.write('\t0') 63 | out.write('\n') 64 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter4/nn.py: -------------------------------------------------------------------------------- 1 | from math import tanh 2 | from pysqlite2 import dbapi2 as sqlite 3 | 4 | def dtanh(y): 5 | return 1.0-y*y 6 | 7 | class searchnet: 8 | def __init__(self,dbname): 9 | self.con=sqlite.connect(dbname) 10 | 11 | def __del__(self): 12 | self.con.close() 13 | 14 | def maketables(self): 15 | self.con.execute('create table hiddennode(create_key)') 16 | self.con.execute('create table wordhidden(fromid,toid,strength)') 17 | self.con.execute('create table hiddenurl(fromid,toid,strength)') 18 | self.con.commit() 19 | 20 | def getstrength(self,fromid,toid,layer): 21 | if layer==0: table='wordhidden' 22 | else: table='hiddenurl' 23 | res=self.con.execute('select strength from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchone() 24 | if 
res==None: 25 | if layer==0: return -0.2 26 | if layer==1: return 0 27 | return res[0] 28 | 29 | def setstrength(self,fromid,toid,layer,strength): 30 | if layer==0: table='wordhidden' 31 | else: table='hiddenurl' 32 | res=self.con.execute('select rowid from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchone() 33 | if res==None: 34 | self.con.execute('insert into %s (fromid,toid,strength) values (%d,%d,%f)' % (table,fromid,toid,strength)) 35 | else: 36 | rowid=res[0] 37 | self.con.execute('update %s set strength=%f where rowid=%d' % (table,strength,rowid)) 38 | 39 | def generatehiddennode(self,wordids,urls): 40 | if len(wordids)>3: return None 41 | # Check if we already created a node for this set of words 42 | sorted_words=[str(id) for id in wordids] 43 | sorted_words.sort() 44 | createkey='_'.join(sorted_words) 45 | res=self.con.execute( 46 | "select rowid from hiddennode where create_key='%s'" % createkey).fetchone() 47 | 48 | # If not, create it 49 | if res==None: 50 | cur=self.con.execute( 51 | "insert into hiddennode (create_key) values ('%s')" % createkey) 52 | hiddenid=cur.lastrowid 53 | # Put in some default weights 54 | for wordid in wordids: 55 | self.setstrength(wordid,hiddenid,0,1.0/len(wordids)) 56 | for urlid in urls: 57 | self.setstrength(hiddenid,urlid,1,0.1) 58 | self.con.commit() 59 | 60 | def getallhiddenids(self,wordids,urlids): 61 | l1={} 62 | for wordid in wordids: 63 | cur=self.con.execute( 64 | 'select toid from wordhidden where fromid=%d' % wordid) 65 | for row in cur: l1[row[0]]=1 66 | for urlid in urlids: 67 | cur=self.con.execute( 68 | 'select fromid from hiddenurl where toid=%d' % urlid) 69 | for row in cur: l1[row[0]]=1 70 | return l1.keys() 71 | 72 | def setupnetwork(self,wordids,urlids): 73 | # value lists 74 | self.wordids=wordids 75 | self.hiddenids=self.getallhiddenids(wordids,urlids) 76 | self.urlids=urlids 77 | 78 | # node outputs 79 | self.ai = [1.0]*len(self.wordids) 80 | self.ah = [1.0]*len(self.hiddenids) 81 | self.ao = [1.0]*len(self.urlids) 82 | 83 | # create weights matrix 84 | self.wi = [[self.getstrength(wordid,hiddenid,0) 85 | for hiddenid in self.hiddenids] 86 | for wordid in self.wordids] 87 | self.wo = [[self.getstrength(hiddenid,urlid,1) 88 | for urlid in self.urlids] 89 | for hiddenid in self.hiddenids] 90 | 91 | def feedforward(self): 92 | # the only inputs are the query words 93 | for i in range(len(self.wordids)): 94 | self.ai[i] = 1.0 95 | 96 | # hidden activations 97 | for j in range(len(self.hiddenids)): 98 | sum = 0.0 99 | for i in range(len(self.wordids)): 100 | sum = sum + self.ai[i] * self.wi[i][j] 101 | self.ah[j] = tanh(sum) 102 | 103 | # output activations 104 | for k in range(len(self.urlids)): 105 | sum = 0.0 106 | for j in range(len(self.hiddenids)): 107 | sum = sum + self.ah[j] * self.wo[j][k] 108 | self.ao[k] = tanh(sum) 109 | 110 | return self.ao[:] 111 | 112 | def getresult(self,wordids,urlids): 113 | self.setupnetwork(wordids,urlids) 114 | return self.feedforward() 115 | 116 | def backPropagate(self, targets, N=0.5): 117 | # calculate errors for output 118 | output_deltas = [0.0] * len(self.urlids) 119 | for k in range(len(self.urlids)): 120 | error = targets[k]-self.ao[k] 121 | output_deltas[k] = dtanh(self.ao[k]) * error 122 | 123 | # calculate errors for hidden layer 124 | hidden_deltas = [0.0] * len(self.hiddenids) 125 | for j in range(len(self.hiddenids)): 126 | error = 0.0 127 | for k in range(len(self.urlids)): 128 | error = error + output_deltas[k]*self.wo[j][k] 129 | hidden_deltas[j] = 
dtanh(self.ah[j]) * error 130 | 131 | # update output weights 132 | for j in range(len(self.hiddenids)): 133 | for k in range(len(self.urlids)): 134 | change = output_deltas[k]*self.ah[j] 135 | self.wo[j][k] = self.wo[j][k] + N*change 136 | 137 | # update input weights 138 | for i in range(len(self.wordids)): 139 | for j in range(len(self.hiddenids)): 140 | change = hidden_deltas[j]*self.ai[i] 141 | self.wi[i][j] = self.wi[i][j] + N*change 142 | 143 | def trainquery(self,wordids,urlids,selectedurl): 144 | # generate a hidden node if necessary 145 | self.generatehiddennode(wordids,urlids) 146 | 147 | self.setupnetwork(wordids,urlids) 148 | self.feedforward() 149 | targets=[0.0]*len(urlids) 150 | targets[urlids.index(selectedurl)]=1.0 151 | error = self.backPropagate(targets) 152 | self.updatedatabase() 153 | 154 | def updatedatabase(self): 155 | # set them to database values 156 | for i in range(len(self.wordids)): 157 | for j in range(len(self.hiddenids)): 158 | self.setstrength(self.wordids[i],self. hiddenids[j],0,self.wi[i][j]) 159 | for j in range(len(self.hiddenids)): 160 | for k in range(len(self.urlids)): 161 | self.setstrength(self.hiddenids[j],self.urlids[k],1,self.wo[j][k]) 162 | self.con.commit() 163 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter5/dorm.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | # The dorms, each of which has two available spaces 5 | dorms=['Zeus','Athena','Hercules','Bacchus','Pluto'] 6 | 7 | # People, along with their first and second choices 8 | prefs=[('Toby', ('Bacchus', 'Hercules')), 9 | ('Steve', ('Zeus', 'Pluto')), 10 | ('Karen', ('Athena', 'Zeus')), 11 | ('Sarah', ('Zeus', 'Pluto')), 12 | ('Dave', ('Athena', 'Bacchus')), 13 | ('Jeff', ('Hercules', 'Pluto')), 14 | ('Fred', ('Pluto', 'Athena')), 15 | ('Suzie', ('Bacchus', 'Hercules')), 16 | ('Laura', ('Bacchus', 'Hercules')), 17 | ('James', ('Hercules', 'Athena'))] 18 | 19 | # [(0,9),(0,8),(0,7),(0,6),...,(0,0)] 20 | domain=[(0,(len(dorms)*2)-i-1) for i in range(0,len(dorms)*2)] 21 | 22 | def printsolution(vec): 23 | slots=[] 24 | # Create two slots for each dorm 25 | for i in range(len(dorms)): slots+=[i,i] 26 | 27 | # Loop over each students assignment 28 | for i in range(len(vec)): 29 | x=int(vec[i]) 30 | 31 | # Choose the slot from the remaining ones 32 | dorm=dorms[slots[x]] 33 | # Show the student and assigned dorm 34 | print prefs[i][0],dorm 35 | # Remove this slot 36 | del slots[x] 37 | 38 | def dormcost(vec): 39 | cost=0 40 | # Create list a of slots 41 | slots=[0,0,1,1,2,2,3,3,4,4] 42 | 43 | # Loop over each student 44 | for i in range(len(vec)): 45 | x=int(vec[i]) 46 | dorm=dorms[slots[x]] 47 | pref=prefs[i][1] 48 | # First choice costs 0, second choice costs 1 49 | if pref[0]==dorm: cost+=0 50 | elif pref[1]==dorm: cost+=1 51 | else: cost+=3 52 | # Not on the list costs 3 53 | 54 | # Remove selected slot 55 | del slots[x] 56 | 57 | return cost 58 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter5/kayak.py: -------------------------------------------------------------------------------- 1 | import time 2 | import urllib2 3 | import xml.dom.minidom 4 | 5 | kayakkey='YOUR KEY HERE' 6 | 7 | def getkayaksession(): 8 | # Construct the URL to start a session 9 | url='http://www.kayak.com/k/ident/apisession?token=%s&version=1' % kayakkey 10 | 11 | # Parse the resulting XML 12 | 
doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 13 | 14 | # Find xxxxxxxx 15 | sid=doc.getElementsByTagName('sid')[0].firstChild.data 16 | return sid 17 | 18 | def flightsearch(sid,origin,destination,depart_date): 19 | 20 | # Construct search URL 21 | url='http://www.kayak.com/s/apisearch?basicmode=true&oneway=y&origin=%s' % origin 22 | url+='&destination=%s&depart_date=%s' % (destination,depart_date) 23 | url+='&return_date=none&depart_time=a&return_time=a' 24 | url+='&travelers=1&cabin=e&action=doFlights&apimode=1' 25 | url+='&_sid_=%s&version=1' % (sid) 26 | 27 | # Get the XML 28 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 29 | 30 | # Extract the search ID 31 | searchid=doc.getElementsByTagName('searchid')[0].firstChild.data 32 | 33 | return searchid 34 | 35 | def flightsearchresults(sid,searchid): 36 | def parseprice(p): 37 | return float(p[1:].replace(',','')) 38 | 39 | # Polling loop 40 | while 1: 41 | time.sleep(2) 42 | 43 | # Construct URL for polling 44 | url='http://www.kayak.com/s/basic/flight?' 45 | url+='searchid=%s&c=5&apimode=1&_sid_=%s&version=1' % (searchid,sid) 46 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 47 | 48 | # Look for morepending tag, and wait until it is no longer true 49 | morepending=doc.getElementsByTagName('morepending')[0].firstChild 50 | if morepending==None or morepending.data=='false': break 51 | 52 | # Now download the complete list 53 | url='http://www.kayak.com/s/basic/flight?' 54 | url+='searchid=%s&c=999&apimode=1&_sid_=%s&version=1' % (searchid,sid) 55 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 56 | 57 | # Get the various elements as lists 58 | prices=doc.getElementsByTagName('price') 59 | departures=doc.getElementsByTagName('depart') 60 | arrivals=doc.getElementsByTagName('arrive') 61 | 62 | # Zip them together 63 | return zip([p.firstChild.data.split(' ')[1] for p in departures], 64 | [p.firstChild.data.split(' ')[1] for p in arrivals], 65 | [parseprice(p.firstChild.data) for p in prices]) 66 | 67 | 68 | def createschedule(people,dest,dep,ret): 69 | # Get a session id for these searches 70 | sid=getkayaksession() 71 | flights={} 72 | 73 | for p in people: 74 | name,origin=p 75 | # Outbound flight 76 | searchid=flightsearch(sid,origin,dest,dep) 77 | flights[(origin,dest)]=flightsearchresults(sid,searchid) 78 | 79 | # Return flight 80 | searchid=flightsearch(sid,dest,origin,ret) 81 | flights[(dest,origin)]=flightsearchresults(sid,searchid) 82 | 83 | return flights 84 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter5/optimization.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import math 4 | 5 | people = [('Seymour','BOS'), 6 | ('Franny','DAL'), 7 | ('Zooey','CAK'), 8 | ('Walt','MIA'), 9 | ('Buddy','ORD'), 10 | ('Les','OMA')] 11 | # Laguardia 12 | destination='LGA' 13 | 14 | flights={} 15 | # 16 | for line in file('schedule.txt'): 17 | origin,dest,depart,arrive,price=line.strip().split(',') 18 | flights.setdefault((origin,dest),[]) 19 | 20 | # Add details to the list of possible flights 21 | flights[(origin,dest)].append((depart,arrive,int(price))) 22 | 23 | def getminutes(t): 24 | x=time.strptime(t,'%H:%M') 25 | return x[3]*60+x[4] 26 | 27 | def printschedule(r): 28 | for d in range(len(r)/2): 29 | name=people[d][0] 30 | origin=people[d][1] 31 | out=flights[(origin,destination)][int(r[d])] 32 | 
ret=flights[(destination,origin)][int(r[d+1])] 33 | print '%10s%10s %5s-%5s $%3s %5s-%5s $%3s' % (name,origin, 34 | out[0],out[1],out[2], 35 | ret[0],ret[1],ret[2]) 36 | 37 | def schedulecost(sol): 38 | totalprice=0 39 | latestarrival=0 40 | earliestdep=24*60 41 | 42 | for d in range(len(sol)/2): 43 | # Get the inbound and outbound flights 44 | origin=people[d][1] 45 | outbound=flights[(origin,destination)][int(sol[d])] 46 | returnf=flights[(destination,origin)][int(sol[d+1])] 47 | 48 | # Total price is the price of all outbound and return flights 49 | totalprice+=outbound[2] 50 | totalprice+=returnf[2] 51 | 52 | # Track the latest arrival and earliest departure 53 | if latestarrivalgetminutes(returnf[0]): earliestdep=getminutes(returnf[0]) 55 | 56 | # Every person must wait at the airport until the latest person arrives. 57 | # They also must arrive at the same time and wait for their flights. 58 | totalwait=0 59 | for d in range(len(sol)/2): 60 | origin=people[d][1] 61 | outbound=flights[(origin,destination)][int(sol[d])] 62 | returnf=flights[(destination,origin)][int(sol[d+1])] 63 | totalwait+=latestarrival-getminutes(outbound[1]) 64 | totalwait+=getminutes(returnf[0])-earliestdep 65 | 66 | # Does this solution require an extra day of car rental? That'll be $50! 67 | if latestarrival>earliestdep: totalprice+=50 68 | 69 | return totalprice+totalwait 70 | 71 | def randomoptimize(domain,costf): 72 | best=999999999 73 | bestr=None 74 | for i in range(0,1000): 75 | # Create a random solution 76 | r=[float(random.randint(domain[i][0],domain[i][1])) 77 | for i in range(len(domain))] 78 | 79 | # Get the cost 80 | cost=costf(r) 81 | 82 | # Compare it to the best one so far 83 | if costdomain[j][0]: 100 | neighbors.append(sol[0:j]+[sol[j]+1]+sol[j+1:]) 101 | if sol[j]0.1: 124 | # Choose one of the indices 125 | i=random.randint(0,len(domain)-1) 126 | 127 | # Choose a direction to change it 128 | dir=random.randint(-step,step) 129 | 130 | # Create a new list with one of the values changed 131 | vecb=vec[:] 132 | vecb[i]+=dir 133 | if vecb[i]domain[i][1]: vecb[i]=domain[i][1] 135 | 136 | # Calculate the current cost and the new cost 137 | ea=costf(vec) 138 | eb=costf(vecb) 139 | p=pow(math.e,(-eb-ea)/T) 140 | 141 | # Is it better, or does it make the probability 142 | # cutoff? 
143 | if (ebdomain[i][0]: 156 | return vec[0:i]+[vec[i]-step]+vec[i+1:] 157 | elif vec[i]0 and ua<1 and ub>0 and ub<1: 45 | total+=1 46 | for i in range(len(people)): 47 | for j in range(i+1,len(people)): 48 | # Get the locations of the two nodes 49 | (x1,y1),(x2,y2)=loc[people[i]],loc[people[j]] 50 | 51 | # Find the distance between them 52 | dist=math.sqrt(math.pow(x1-x2,2)+math.pow(y1-y2,2)) 53 | # Penalize any nodes closer than 50 pixels 54 | if dist<50: 55 | total+=(1.0-(dist/50.0)) 56 | 57 | return total 58 | from PIL import Image,ImageDraw 59 | 60 | def drawnetwork(sol): 61 | # Create the image 62 | img=Image.new('RGB',(400,400),(255,255,255)) 63 | draw=ImageDraw.Draw(img) 64 | 65 | # Create the position dict 66 | pos=dict([(people[i],(sol[i*2],sol[i*2+1])) for i in range(0,len(people))]) 67 | 68 | for (a,b) in links: 69 | draw.line((pos[a],pos[b]),fill=(255,0,0)) 70 | 71 | for n,p in pos.items(): 72 | draw.text(p,n,(0,0,0)) 73 | 74 | img.show() 75 | 76 | 77 | domain=[(10,370)]*(len(people)*2) -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter6/docclass.py: -------------------------------------------------------------------------------- 1 | from pysqlite2 import dbapi2 as sqlite 2 | import re 3 | import math 4 | 5 | def getwords(doc): 6 | splitter=re.compile('\\W*') 7 | print doc 8 | # Split the words by non-alpha characters 9 | words=[s.lower() for s in splitter.split(doc) 10 | if len(s)>2 and len(s)<20] 11 | 12 | # Return the unique set of words only 13 | return dict([(w,1) for w in words]) 14 | 15 | class classifier: 16 | def __init__(self,getfeatures,filename=None): 17 | # Counts of feature/category combinations 18 | self.fc={} 19 | # Counts of documents in each category 20 | self.cc={} 21 | self.getfeatures=getfeatures 22 | 23 | def setdb(self,dbfile): 24 | self.con=sqlite.connect(dbfile) 25 | self.con.execute('create table if not exists fc(feature,category,count)') 26 | self.con.execute('create table if not exists cc(category,count)') 27 | 28 | 29 | def incf(self,f,cat): 30 | count=self.fcount(f,cat) 31 | if count==0: 32 | self.con.execute("insert into fc values ('%s','%s',1)" 33 | % (f,cat)) 34 | else: 35 | self.con.execute( 36 | "update fc set count=%d where feature='%s' and category='%s'" 37 | % (count+1,f,cat)) 38 | 39 | def fcount(self,f,cat): 40 | res=self.con.execute( 41 | 'select count from fc where feature="%s" and category="%s"' 42 | %(f,cat)).fetchone() 43 | if res==None: return 0 44 | else: return float(res[0]) 45 | 46 | def incc(self,cat): 47 | count=self.catcount(cat) 48 | if count==0: 49 | self.con.execute("insert into cc values ('%s',1)" % (cat)) 50 | else: 51 | self.con.execute("update cc set count=%d where category='%s'" 52 | % (count+1,cat)) 53 | 54 | def catcount(self,cat): 55 | res=self.con.execute('select count from cc where category="%s"' 56 | %(cat)).fetchone() 57 | if res==None: return 0 58 | else: return float(res[0]) 59 | 60 | def categories(self): 61 | cur=self.con.execute('select category from cc'); 62 | return [d[0] for d in cur] 63 | 64 | def totalcount(self): 65 | res=self.con.execute('select sum(count) from cc').fetchone(); 66 | if res==None: return 0 67 | return res[0] 68 | 69 | 70 | def train(self,item,cat): 71 | features=self.getfeatures(item) 72 | # Increment the count for every feature with this category 73 | for f in features: 74 | self.incf(f,cat) 75 | 76 | # Increment the count for this category 77 | self.incc(cat) 78 | self.con.commit() 79 | 80 | def fprob(self,f,cat): 81 | 
if self.catcount(cat)==0: return 0 82 | 83 | # The total number of times this feature appeared in this 84 | # category divided by the total number of items in this category 85 | return self.fcount(f,cat)/self.catcount(cat) 86 | 87 | def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5): 88 | # Calculate current probability 89 | basicprob=prf(f,cat) 90 | 91 | # Count the number of times this feature has appeared in 92 | # all categories 93 | totals=sum([self.fcount(f,c) for c in self.categories()]) 94 | 95 | # Calculate the weighted average 96 | bp=((weight*ap)+(totals*basicprob))/(weight+totals) 97 | return bp 98 | 99 | 100 | 101 | 102 | class naivebayes(classifier): 103 | 104 | def __init__(self,getfeatures): 105 | classifier.__init__(self,getfeatures) 106 | self.thresholds={} 107 | 108 | def docprob(self,item,cat): 109 | features=self.getfeatures(item) 110 | 111 | # Multiply the probabilities of all the features together 112 | p=1 113 | for f in features: p*=self.weightedprob(f,cat,self.fprob) 114 | return p 115 | 116 | def prob(self,item,cat): 117 | catprob=self.catcount(cat)/self.totalcount() 118 | docprob=self.docprob(item,cat) 119 | return docprob*catprob 120 | 121 | def setthreshold(self,cat,t): 122 | self.thresholds[cat]=t 123 | 124 | def getthreshold(self,cat): 125 | if cat not in self.thresholds: return 1.0 126 | return self.thresholds[cat] 127 | 128 | def classify(self,item,default=None): 129 | probs={} 130 | # Find the category with the highest probability 131 | max=0.0 132 | for cat in self.categories(): 133 | probs[cat]=self.prob(item,cat) 134 | if probs[cat]>max: 135 | max=probs[cat] 136 | best=cat 137 | 138 | # Make sure the probability exceeds threshold*next best 139 | for cat in probs: 140 | if cat==best: continue 141 | if probs[cat]*self.getthreshold(best)>probs[best]: return default 142 | return best 143 | 144 | class fisherclassifier(classifier): 145 | def cprob(self,f,cat): 146 | # The frequency of this feature in this category 147 | clf=self.fprob(f,cat) 148 | if clf==0: return 0 149 | 150 | # The frequency of this feature in all the categories 151 | freqsum=sum([self.fprob(f,c) for c in self.categories()]) 152 | 153 | # The probability is the frequency in this category divided by 154 | # the overall frequency 155 | p=clf/(freqsum) 156 | 157 | return p 158 | def fisherprob(self,item,cat): 159 | # Multiply all the probabilities together 160 | p=1 161 | features=self.getfeatures(item) 162 | for f in features: 163 | p*=(self.weightedprob(f,cat,self.cprob)) 164 | 165 | # Take the natural log and multiply by -2 166 | fscore=-2*math.log(p) 167 | 168 | # Use the inverse chi2 function to get a probability 169 | return self.invchi2(fscore,len(features)*2) 170 | def invchi2(self,chi, df): 171 | m = chi / 2.0 172 | sum = term = math.exp(-m) 173 | for i in range(1, df//2): 174 | term *= m / i 175 | sum += term 176 | return min(sum, 1.0) 177 | def __init__(self,getfeatures): 178 | classifier.__init__(self,getfeatures) 179 | self.minimums={} 180 | 181 | def setminimum(self,cat,min): 182 | self.minimums[cat]=min 183 | 184 | def getminimum(self,cat): 185 | if cat not in self.minimums: return 0 186 | return self.minimums[cat] 187 | def classify(self,item,default=None): 188 | # Loop through looking for the best result 189 | best=default 190 | max=0.0 191 | for c in self.categories(): 192 | p=self.fisherprob(item,c) 193 | # Make sure it exceeds its minimum 194 | if p>self.getminimum(c) and p>max: 195 | best=c 196 | max=p 197 | return best 198 | 199 | 200 | def sampletrain(cl): 201 | 
cl.train('Nobody owns the water.','good') 202 | cl.train('the quick rabbit jumps fences','good') 203 | cl.train('buy pharmaceuticals now','bad') 204 | cl.train('make quick money at the online casino','bad') 205 | cl.train('the quick brown fox jumps','good') 206 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter6/feedfilter.py: -------------------------------------------------------------------------------- 1 | import feedparser 2 | import re 3 | 4 | # Takes a filename of URL of a blog feed and classifies the entries 5 | def read(feed,classifier): 6 | # Get feed entries and loop over them 7 | f=feedparser.parse(feed) 8 | for entry in f['entries']: 9 | print 10 | print '-----' 11 | # Print the contents of the entry 12 | print 'Title: '+entry['title'].encode('utf-8') 13 | print 'Publisher: '+entry['publisher'].encode('utf-8') 14 | print 15 | print entry['summary'].encode('utf-8') 16 | 17 | 18 | # Combine all the text to create one item for the classifier 19 | fulltext='%s\n%s\n%s' % (entry['title'],entry['publisher'],entry['summary']) 20 | 21 | # Print the best guess at the current category 22 | print 'Guess: '+str(classifier.classify(entry)) 23 | 24 | # Ask the user to specify the correct category and train on that 25 | cl=raw_input('Enter category: ') 26 | classifier.train(entry,cl) 27 | 28 | 29 | def entryfeatures(entry): 30 | splitter=re.compile('\\W*') 31 | f={} 32 | 33 | # Extract the title words and annotate 34 | titlewords=[s.lower() for s in splitter.split(entry['title']) 35 | if len(s)>2 and len(s)<20] 36 | for w in titlewords: f['Title:'+w]=1 37 | 38 | # Extract the summary words 39 | summarywords=[s.lower() for s in splitter.split(entry['summary']) 40 | if len(s)>2 and len(s)<20] 41 | 42 | # Count uppercase words 43 | uc=0 44 | for i in range(len(summarywords)): 45 | w=summarywords[i] 46 | f[w]=1 47 | if w.isupper(): uc+=1 48 | 49 | # Get word pairs in summary as features 50 | if i0.3: f['UPPERCASE']=1 59 | 60 | return f 61 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter6/test.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/CollectiveIntelligence/chapter6/test.db -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter6/test1.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/CollectiveIntelligence/chapter6/test1.db -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter7/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/CollectiveIntelligence/chapter7/Thumbs.db -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter7/addresslist.txt: -------------------------------------------------------------------------------- 1 | 6 Washington 2 | 21 Manassas 3 | 280 Pearl 4 | 55 Ellery 5 | 50 Follen 6 | 51 Granite 7 | 992 Memorial 8 | 83 Trowbridge 9 | 1 Dana 10 | 45 Regent 11 | 90 Alpine 12 | 21 Francis 13 | 112 Avon Hill 14 | 9 Bellevue 15 | 4 Blanchard Rd 16 | 34 Shea 17 | 5 Fountain 18 
| 14 Marcella 19 | 39 Saint Saveur 20 | 35 Pemberton 21 | 46 Shepard 22 | 31 Market 23 | 99 Howard 24 | 88 Pearl 25 | 208 Western 26 | 285 Windsor 27 | 26 Cambridgepark 28 | 211 Erie 29 | 129 Franklin 30 | 27 Gurney 31 | 149 Prospect 32 | 27 Linnaean 33 | 20 Dudley 34 | 60 Otis St 35 | 130 Mount Auburn St 36 | 2 Michael Way 37 | 263 Columbia St 38 | 6 Hurlbut St 39 | 199 Harvard St 40 | 168 River St 41 | 400 Washington St 42 | 12 Traill St 43 | 74 Field St 44 | 21 Walden Square Rd 45 | 7 Wendell St 46 | 15 Normandy Ave 47 | 6 Gibson Ter 48 | 94 Pine St 49 | 23 Magee St 50 | 175 Richdale Ave 51 | 168 River St 52 | 246 Brattle St -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter7/hotornot.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import xml.dom.minidom 3 | 4 | api_key='YOUR KEY HERE' 5 | 6 | def getrandomratings(c): 7 | # Construct URL for getRandomProfile 8 | url="http://services.hotornot.com/rest/?app_key=%s" % api_key 9 | url+="&method=Rate.getRandomProfile&retrieve_num=%d" % c 10 | url+="&get_rate_info=true&meet_users_only=true" 11 | 12 | f1=urllib2.urlopen(url).read() 13 | 14 | doc=xml.dom.minidom.parseString(f1) 15 | 16 | emids=doc.getElementsByTagName('emid') 17 | ratings=doc.getElementsByTagName('rating') 18 | 19 | # Combine the emids and ratings together into a list 20 | result=[] 21 | for e,r in zip(emids,ratings): 22 | if r.firstChild!=None: 23 | result.append((e.firstChild.data,r.firstChild.data)) 24 | return result 25 | 26 | stateregions={'New England':['ct','mn','ma','nh','ri','vt'], 27 | 'Mid Atlantic':['de','md','nj','ny','pa'], 28 | 'South':['al','ak','fl','ga','ky','la','ms','mo', 29 | 'nc','sc','tn','va','wv'], 30 | 'Midwest':['il','in','ia','ks','mi','ne','nd','oh','sd','wi'], 31 | 'West':['ak','ca','co','hi','id','mt','nv','or','ut','wa','wy']} 32 | 33 | def getpeopledata(ratings): 34 | result=[] 35 | for emid,rating in ratings: 36 | # URL for the MeetMe.getProfile method 37 | url="http://services.hotornot.com/rest/?app_key=%s" % api_key 38 | url+="&method=MeetMe.getProfile&emid=%s&get_keywords=true" % emid 39 | 40 | # Get all the info about this person 41 | try: 42 | rating=int(float(rating)+0.5) 43 | doc2=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 44 | gender=doc2.getElementsByTagName('gender')[0].firstChild.data 45 | age=doc2.getElementsByTagName('age')[0].firstChild.data 46 | loc=doc2.getElementsByTagName('location')[0].firstChild.data[0:2] 47 | 48 | # Convert state to region 49 | for r,s in stateregions.items(): 50 | if loc in s: region=r 51 | 52 | if region!=None: 53 | result.append((gender,int(age),region,rating)) 54 | except: 55 | pass 56 | return result 57 | 58 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter7/zillow.py: -------------------------------------------------------------------------------- 1 | import xml.dom.minidom 2 | import urllib2 3 | 4 | zwskey="YOUR API KEY" 5 | 6 | def getaddressdata(address,city): 7 | escad=address.replace(' ','+') 8 | url='http://www.zillow.com/webservice/GetDeepSearchResults.htm?' 
9 | url+='zws-id=%s&address=%s&citystatezip=%s' % (zwskey,escad,city) 10 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 11 | code=doc.getElementsByTagName('code')[0].firstChild.data 12 | if code!='0': return None 13 | if 1: 14 | zipcode=doc.getElementsByTagName('zipcode')[0].firstChild.data 15 | use=doc.getElementsByTagName('useCode')[0].firstChild.data 16 | year=doc.getElementsByTagName('yearBuilt')[0].firstChild.data 17 | sqft=doc.getElementsByTagName('finishedSqFt')[0].firstChild.data 18 | bath=doc.getElementsByTagName('bathrooms')[0].firstChild.data 19 | bed=doc.getElementsByTagName('bedrooms')[0].firstChild.data 20 | rooms=1 #doc.getElementsByTagName('totalRooms')[0].firstChild.data 21 | price=doc.getElementsByTagName('amount')[0].firstChild.data 22 | else: 23 | return None 24 | 25 | return (zipcode,use,int(year),float(bath),int(bed),int(rooms),price) 26 | 27 | def getpricelist(): 28 | l1=[] 29 | for line in file('addresslist.txt'): 30 | data=getaddressdata(line.strip(),'Cambridge,MA') 31 | l1.append(data) 32 | return l1 33 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter8/ebaypredict.py: -------------------------------------------------------------------------------- 1 | import httplib 2 | from xml.dom.minidom import parse, parseString, Node 3 | 4 | devKey = 'YOUR DEV KEY' 5 | appKey = 'YOUR APP KEY' 6 | certKey = 'YOUR CERT KEY' 7 | serverUrl = 'api.ebay.com' 8 | userToken = 'YOUR TOKEN' 9 | 10 | def getHeaders(apicall,siteID="0",compatabilityLevel = "433"): 11 | headers = {"X-EBAY-API-COMPATIBILITY-LEVEL": compatabilityLevel, 12 | "X-EBAY-API-DEV-NAME": devKey, 13 | "X-EBAY-API-APP-NAME": appKey, 14 | "X-EBAY-API-CERT-NAME": certKey, 15 | "X-EBAY-API-CALL-NAME": apicall, 16 | "X-EBAY-API-SITEID": siteID, 17 | "Content-Type": "text/xml"} 18 | return headers 19 | 20 | def sendRequest(apicall,xmlparameters): 21 | connection = httplib.HTTPSConnection(serverUrl) 22 | connection.request("POST", '/ws/api.dll', xmlparameters, getHeaders(apicall)) 23 | response = connection.getresponse() 24 | if response.status != 200: 25 | print "Error sending request:" + response.reason 26 | else: 27 | data = response.read() 28 | connection.close() 29 | return data 30 | 31 | def getSingleValue(node,tag): 32 | nl=node.getElementsByTagName(tag) 33 | if len(nl)>0: 34 | tagNode=nl[0] 35 | if tagNode.hasChildNodes(): 36 | return tagNode.firstChild.nodeValue 37 | return '-1' 38 | 39 | 40 | def doSearch(query,categoryID=None,page=1): 41 | xml = ""+\ 42 | ""+\ 43 | "" +\ 44 | userToken +\ 45 | "" + \ 46 | ""+\ 47 | "200"+\ 48 | ""+str(page)+""+\ 49 | ""+\ 50 | "" + query + "" 51 | if categoryID!=None: 52 | xml+=""+str(categoryID)+"" 53 | xml+="" 54 | 55 | data=sendRequest('GetSearchResults',xml) 56 | response = parseString(data) 57 | itemNodes = response.getElementsByTagName('Item'); 58 | results = [] 59 | for item in itemNodes: 60 | itemId=getSingleValue(item,'ItemID') 61 | itemTitle=getSingleValue(item,'Title') 62 | itemPrice=getSingleValue(item,'CurrentPrice') 63 | itemEnds=getSingleValue(item,'EndTime') 64 | results.append((itemId,itemTitle,itemPrice,itemEnds)) 65 | return results 66 | 67 | 68 | def getCategory(query='',parentID=None,siteID='0'): 69 | lquery=query.lower() 70 | xml = ""+\ 71 | ""+\ 72 | "" +\ 73 | userToken +\ 74 | ""+\ 75 | "ReturnAll"+\ 76 | "true"+\ 77 | ""+siteID+"" 78 | if parentID==None: 79 | xml+="1" 80 | else: 81 | xml+=""+str(parentID)+"" 82 | xml += "" 83 | data=sendRequest('GetCategories',xml) 84 | 
categoryList=parseString(data) 85 | catNodes=categoryList.getElementsByTagName('Category') 86 | for node in catNodes: 87 | catid=getSingleValue(node,'CategoryID') 88 | name=getSingleValue(node,'CategoryName') 89 | if name.lower().find(lquery)!=-1: 90 | print catid,name 91 | 92 | def getItem(itemID): 93 | xml = ""+\ 94 | ""+\ 95 | "" +\ 96 | userToken +\ 97 | "" + \ 98 | "" + str(itemID) + ""+\ 99 | "ItemReturnAttributes"+\ 100 | "" 101 | data=sendRequest('GetItem',xml) 102 | result={} 103 | response=parseString(data) 104 | result['title']=getSingleValue(response,'Title') 105 | sellingStatusNode = response.getElementsByTagName('SellingStatus')[0]; 106 | result['price']=getSingleValue(sellingStatusNode,'CurrentPrice') 107 | result['bids']=getSingleValue(sellingStatusNode,'BidCount') 108 | seller = response.getElementsByTagName('Seller') 109 | result['feedback'] = getSingleValue(seller[0],'FeedbackScore') 110 | 111 | attributeSet=response.getElementsByTagName('Attribute'); 112 | attributes={} 113 | for att in attributeSet: 114 | attID=att.attributes.getNamedItem('attributeID').nodeValue 115 | attValue=getSingleValue(att,'ValueLiteral') 116 | attributes[attID]=attValue 117 | result['attributes']=attributes 118 | return result 119 | 120 | 121 | def makeLaptopDataset(): 122 | searchResults=doSearch('laptop',categoryID=51148) 123 | result=[] 124 | for r in searchResults: 125 | item=getItem(r[0]) 126 | att=item['attributes'] 127 | try: 128 | data=(float(att['12']),float(att['26444']), 129 | float(att['26446']),float(att['25710']), 130 | float(item['feedback']) 131 | ) 132 | entry={'input':data,'result':float(item['price'])} 133 | result.append(entry) 134 | except: 135 | print item['title']+' failed' 136 | return result 137 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter8/numpredict.py: -------------------------------------------------------------------------------- 1 | from random import random,randint 2 | import math 3 | 4 | def wineprice(rating,age): 5 | peak_age=rating-50 6 | 7 | # Calculate price based on rating 8 | price=rating/2 9 | if age>peak_age: 10 | # Past its peak, goes bad in 10 years 11 | price=price*(5-(age-peak_age)/2) 12 | else: 13 | # Increases to 5x original value as it 14 | # approaches its peak 15 | price=price*(5*((age+1)/peak_age)) 16 | if price<0: price=0 17 | return price 18 | 19 | 20 | def wineset1(): 21 | rows=[] 22 | for i in range(300): 23 | # Create a random age and rating 24 | rating=random()*50+50 25 | age=random()*50 26 | 27 | # Get reference price 28 | price=wineprice(rating,age) 29 | 30 | # Add some noise 31 | price*=(random()*0.2+0.9) 32 | 33 | # Add to the dataset 34 | rows.append({'input':(rating,age), 35 | 'result':price}) 36 | return rows 37 | 38 | def euclidean(v1,v2): 39 | d=0.0 40 | for i in range(len(v1)): 41 | d+=(v1[i]-v2[i])**2 42 | return math.sqrt(d) 43 | 44 | 45 | def getdistances(data,vec1): 46 | distancelist=[] 47 | 48 | # Loop over every item in the dataset 49 | for i in range(len(data)): 50 | vec2=data[i]['input'] 51 | 52 | # Add the distance and the index 53 | distancelist.append((euclidean(vec1,vec2),i)) 54 | 55 | # Sort by distance 56 | distancelist.sort() 57 | return distancelist 58 | 59 | def knnestimate(data,vec1,k=5): 60 | # Get sorted distances 61 | dlist=getdistances(data,vec1) 62 | avg=0.0 63 | 64 | # Take the average of the top k results 65 | for i in range(k): 66 | idx=dlist[i][1] 67 | avg+=data[idx]['result'] 68 | avg=avg/k 69 | return avg 70 | 71 | def 
inverseweight(dist,num=1.0,const=0.1): 72 | return num/(dist+const) 73 | 74 | def subtractweight(dist,const=1.0): 75 | if dist>const: 76 | return 0 77 | else: 78 | return const-dist 79 | 80 | def gaussian(dist,sigma=5.0): 81 | return math.e**(-dist**2/(2*sigma**2)) 82 | 83 | def weightedknn(data,vec1,k=5,weightf=gaussian): 84 | # Get distances 85 | dlist=getdistances(data,vec1) 86 | avg=0.0 87 | totalweight=0.0 88 | 89 | # Get weighted average 90 | for i in range(k): 91 | dist=dlist[i][0] 92 | idx=dlist[i][1] 93 | weight=weightf(dist) 94 | avg+=weight*data[idx]['result'] 95 | totalweight+=weight 96 | if totalweight==0: return 0 97 | avg=avg/totalweight 98 | return avg 99 | 100 | def dividedata(data,test=0.05): 101 | trainset=[] 102 | testset=[] 103 | for row in data: 104 | if random()=low and v<=high: 176 | nweight+=weight 177 | tweight+=weight 178 | if tweight==0: return 0 179 | 180 | # The probability is the weights in the range 181 | # divided by all the weights 182 | return nweight/tweight 183 | 184 | from pylab import * 185 | 186 | def cumulativegraph(data,vec1,high,k=5,weightf=gaussian): 187 | t1=arange(0.0,high,0.1) 188 | cprob=array([probguess(data,vec1,0,v,k,weightf) for v in t1]) 189 | plot(t1,cprob) 190 | show() 191 | 192 | 193 | def probabilitygraph(data,vec1,high,k=5,weightf=gaussian,ss=5.0): 194 | # Make a range for the prices 195 | t1=arange(0.0,high,0.1) 196 | 197 | # Get the probabilities for the entire range 198 | probs=[probguess(data,vec1,v,v+0.1,k,weightf) for v in t1] 199 | 200 | # Smooth them by adding the gaussian of the nearby probabilites 201 | smoothed=[] 202 | for i in range(len(probs)): 203 | sv=0.0 204 | for j in range(0,len(probs)): 205 | dist=abs(i-j)*0.1 206 | weight=gaussian(dist,sigma=ss) 207 | sv+=weight*probs[j] 208 | smoothed.append(sv) 209 | smoothed=array(smoothed) 210 | 211 | plot(t1,smoothed) 212 | show() 213 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter8/optimization.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import math 4 | 5 | people = [('Seymour','BOS'), 6 | ('Franny','DAL'), 7 | ('Zooey','CAK'), 8 | ('Walt','MIA'), 9 | ('Buddy','ORD'), 10 | ('Les','OMA')] 11 | # Laguardia 12 | destination='LGA' 13 | 14 | flights={} 15 | # 16 | """ 17 | for line in file('schedule.txt'): 18 | origin,dest,depart,arrive,price=line.strip().split(',') 19 | flights.setdefault((origin,dest),[]) 20 | 21 | # Add details to the list of possible flights 22 | flights[(origin,dest)].append((depart,arrive,int(price))) 23 | """ 24 | def getminutes(t): 25 | x=time.strptime(t,'%H:%M') 26 | return x[3]*60+x[4] 27 | 28 | def printschedule(r): 29 | for d in range(len(r)/2): 30 | name=people[d][0] 31 | origin=people[d][1] 32 | out=flights[(origin,destination)][int(r[d])] 33 | ret=flights[(destination,origin)][int(r[d+1])] 34 | print '%10s%10s %5s-%5s $%3s %5s-%5s $%3s' % (name,origin, 35 | out[0],out[1],out[2], 36 | ret[0],ret[1],ret[2]) 37 | 38 | def schedulecost(sol): 39 | totalprice=0 40 | latestarrival=0 41 | earliestdep=24*60 42 | 43 | for d in range(len(sol)/2): 44 | # Get the inbound and outbound flights 45 | origin=people[d][1] 46 | outbound=flights[(origin,destination)][int(sol[d])] 47 | returnf=flights[(destination,origin)][int(sol[d+1])] 48 | 49 | # Total price is the price of all outbound and return flights 50 | totalprice+=outbound[2] 51 | totalprice+=returnf[2] 52 | 53 | # Track the latest arrival and earliest departure 
54 | if latestarrivalgetminutes(returnf[0]): earliestdep=getminutes(returnf[0]) 56 | 57 | # Every person must wait at the airport until the latest person arrives. 58 | # They also must arrive at the same time and wait for their flights. 59 | totalwait=0 60 | for d in range(len(sol)/2): 61 | origin=people[d][1] 62 | outbound=flights[(origin,destination)][int(sol[d])] 63 | returnf=flights[(destination,origin)][int(sol[d+1])] 64 | totalwait+=latestarrival-getminutes(outbound[1]) 65 | totalwait+=getminutes(returnf[0])-earliestdep 66 | 67 | # Does this solution require an extra day of car rental? That'll be $50! 68 | if latestarrival>earliestdep: totalprice+=50 69 | 70 | return totalprice+totalwait 71 | 72 | def randomoptimize(domain,costf): 73 | best=999999999 74 | bestr=None 75 | for i in range(0,1000): 76 | # Create a random solution 77 | r=[float(random.randint(domain[i][0],domain[i][1])) 78 | for i in range(len(domain))] 79 | 80 | # Get the cost 81 | cost=costf(r) 82 | 83 | # Compare it to the best one so far 84 | if cost0.1: 96 | # Choose one of the indices 97 | i=random.randint(0,len(domain)-1) 98 | 99 | # Choose a direction to change it 100 | dir=random.randint(-step,step) 101 | 102 | # Create a new list with one of the values changed 103 | vecb=vec[:] 104 | vecb[i]+=dir 105 | if vecb[i]domain[i][1]: vecb[i]=domain[i][1] 107 | 108 | # Calculate the current cost and the new cost 109 | ea=costf(vec) 110 | eb=costf(vecb) 111 | p=pow(math.e,(-eb-ea)/T) 112 | 113 | print vec,ea 114 | 115 | 116 | # Is it better, or does it make the probability 117 | # cutoff? 118 | if (ebmaxv: v[i][d]=maxv 160 | elif v[i][d]<-maxv: v[i][d]=-maxv 161 | 162 | # constrain bounds of solutions 163 | x[i][d]+=v[i][d] 164 | if x[i][d]domain[d][1]: x[i][d]=domain[d][1] 166 | 167 | print p[g],costf(p[g]) 168 | return p[g] 169 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter9/advancedclassify.py: -------------------------------------------------------------------------------- 1 | class matchrow: 2 | def __init__(self,row,allnum=False): 3 | if allnum: 4 | self.data=[float(row[i]) for i in range(len(row)-1)] 5 | else: 6 | self.data=row[0:len(row)-1] 7 | self.match=int(row[len(row)-1]) 8 | 9 | def loadmatch(f,allnum=False): 10 | rows=[] 11 | for line in file(f): 12 | rows.append(matchrow(line.split(','),allnum)) 13 | return rows 14 | 15 | from pylab import * 16 | def plotagematches(rows): 17 | xdm,ydm=[r.data[0] for r in rows if r.match==1],\ 18 | [r.data[1] for r in rows if r.match==1] 19 | xdn,ydn=[r.data[0] for r in rows if r.match==0],\ 20 | [r.data[1] for r in rows if r.match==0] 21 | 22 | plot(xdm,ydm,'bo') 23 | plot(xdn,ydn,'b+') 24 | 25 | show() 26 | 27 | def lineartrain(rows): 28 | averages={} 29 | counts={} 30 | 31 | for row in rows: 32 | # Get the class of this point 33 | cl=row.match 34 | 35 | averages.setdefault(cl,[0.0]*(len(row.data))) 36 | counts.setdefault(cl,0) 37 | 38 | # Add this point to the averages 39 | for i in range(len(row.data)): 40 | averages[cl][i]+=float(row.data[i]) 41 | 42 | # Keep track of how many points in each class 43 | counts[cl]+=1 44 | 45 | # Divide sums by counts to get the averages 46 | for cl,avg in averages.items(): 47 | for i in range(len(avg)): 48 | avg[i]/=counts[cl] 49 | 50 | return averages 51 | 52 | def dotproduct(v1,v2): 53 | return sum([v1[i]*v2[i] for i in range(len(v1))]) 54 | 55 | def veclength(v): 56 | return sum([p**2 for p in v]) 57 | 58 | def dpclassify(point,avgs): 59 | 
b=(dotproduct(avgs[1],avgs[1])-dotproduct(avgs[0],avgs[0]))/2 60 | y=dotproduct(point,avgs[0])-dotproduct(point,avgs[1])+b 61 | if y>0: return 0 62 | else: return 1 63 | 64 | def yesno(v): 65 | if v=='yes': return 1 66 | elif v=='no': return -1 67 | else: return 0 68 | 69 | def matchcount(interest1,interest2): 70 | l1=interest1.split(':') 71 | l2=interest2.split(':') 72 | x=0 73 | for v in l1: 74 | if v in l2: x+=1 75 | return x 76 | 77 | yahookey="YOUR API KEY" 78 | from xml.dom.minidom import parseString 79 | from urllib import urlopen,quote_plus 80 | 81 | loc_cache={} 82 | def getlocation(address): 83 | if address in loc_cache: return loc_cache[address] 84 | data=urlopen('http://api.local.yahoo.com/MapsService/V1/'+\ 85 | 'geocode?appid=%s&location=%s' % 86 | (yahookey,quote_plus(address))).read() 87 | doc=parseString(data) 88 | lat=doc.getElementsByTagName('Latitude')[0].firstChild.nodeValue 89 | long=doc.getElementsByTagName('Longitude')[0].firstChild.nodeValue 90 | loc_cache[address]=(float(lat),float(long)) 91 | return loc_cache[address] 92 | 93 | def milesdistance(a1,a2): 94 | lat1,long1=getlocation(a1) 95 | lat2,long2=getlocation(a2) 96 | latdif=69.1*(lat2-lat1) 97 | longdif=53.0*(long2-long1) 98 | return (latdif**2+longdif**2)**.5 99 | 100 | def loadnumerical(): 101 | oldrows=loadmatch('matchmaker.csv') 102 | newrows=[] 103 | for row in oldrows: 104 | d=row.data 105 | data=[float(d[0]),yesno(d[1]),yesno(d[2]), 106 | float(d[5]),yesno(d[6]),yesno(d[7]), 107 | matchcount(d[3],d[8]), 108 | milesdistance(d[4],d[9]), 109 | row.match] 110 | newrows.append(matchrow(data)) 111 | return newrows 112 | 113 | def scaledata(rows): 114 | low=[999999999.0]*len(rows[0].data) 115 | high=[-999999999.0]*len(rows[0].data) 116 | # Find the lowest and highest values 117 | for row in rows: 118 | d=row.data 119 | for i in range(len(d)): 120 | if d[i]high[i]: high[i]=d[i] 122 | 123 | # Create a function that scales data 124 | def scaleinput(d): 125 | return [(d[i]-low[i])/(high[i]-low[i]) 126 | for i in range(len(low))] 127 | 128 | # Scale all the data 129 | newrows=[matchrow(scaleinput(row.data)+[row.match]) 130 | for row in rows] 131 | 132 | # Return the new data and the function 133 | return newrows,scaleinput 134 | 135 | 136 | def rbf(v1,v2,gamma=10): 137 | dv=[v1[i]-v2[i] for i in range(len(v1))] 138 | l=veclength(dv) 139 | return math.e**(-gamma*l) 140 | 141 | def nlclassify(point,rows,offset,gamma=10): 142 | sum0=0.0 143 | sum1=0.0 144 | count0=0 145 | count1=0 146 | 147 | for row in rows: 148 | if row.match==0: 149 | sum0+=rbf(point,row.data,gamma) 150 | count0+=1 151 | else: 152 | sum1+=rbf(point,row.data,gamma) 153 | count1+=1 154 | y=(1.0/count0)*sum0-(1.0/count1)*sum1+offset 155 | 156 | if y>0: return 0 157 | else: return 1 158 | 159 | def getoffset(rows,gamma=10): 160 | l0=[] 161 | l1=[] 162 | for row in rows: 163 | if row.match==0: l0.append(row.data) 164 | else: l1.append(row.data) 165 | sum0=sum(sum([rbf(v1,v2,gamma) for v1 in l0]) for v2 in l0) 166 | sum1=sum(sum([rbf(v1,v2,gamma) for v1 in l1]) for v2 in l1) 167 | 168 | return (1.0/(len(l1)**2))*sum1-(1.0/(len(l0)**2))*sum0 169 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter9/agesonly.csv: -------------------------------------------------------------------------------- 1 | 24,30,1 2 | 30,40,1 3 | 22,49,0 4 | 43,39,1 5 | 23,30,1 6 | 23,49,0 7 | 48,46,1 8 | 23,23,1 9 | 29,49,0 10 | 38,38,1 11 | 30,34,1 12 | 40,50,1 13 | 35,32,1 14 | 49,44,1 15 | 38,22,1 16 | 30,27,1 17 | 
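agesonly.csv above holds one pair per row: two ages followed by a 0/1 flag for whether the pair was a match. A minimal sketch of driving the chapter-9 helpers defined earlier (loadmatch, plotagematches, lineartrain, dpclassify) against it, assuming the book's Python 2 setup with pylab installed:

```
# Sketch: exercising advancedclassify.py on agesonly.csv (Python 2, pylab available).
agesonly = loadmatch('agesonly.csv', allnum=True)   # each row becomes data=[age1, age2], match=0/1

plotagematches(agesonly)            # scatter plot: matches as 'o', non-matches as '+'

avgs = lineartrain(agesonly)        # average point of each class
print dpclassify([30.0, 30.0], avgs)   # similar ages -- expected to fall on the match (1) side
print dpclassify([25.0, 40.0], avgs)   # large age gap -- expected to fall on the non-match (0) side
```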
26,24,1 18 | 39,23,1 19 | 36,43,1 20 | 25,31,1 21 | 27,27,1 22 | 32,22,1 23 | 40,30,1 24 | 26,28,1 25 | 46,32,1 26 | 41,37,1 27 | 39,41,1 28 | 18,28,0 29 | 18,47,0 30 | 39,44,1 31 | 38,21,1 32 | 24,36,0 33 | 32,22,1 34 | 21,20,1 35 | 42,36,1 36 | 46,41,1 37 | 39,38,1 38 | 18,31,0 39 | 31,45,1 40 | 44,24,0 41 | 49,22,0 42 | 26,27,1 43 | 25,34,1 44 | 47,23,0 45 | 27,48,0 46 | 32,49,1 47 | 46,41,1 48 | 24,32,1 49 | 29,26,1 50 | 25,36,1 51 | 27,35,1 52 | 38,19,1 53 | 18,40,0 54 | 34,49,1 55 | 32,35,1 56 | 47,49,1 57 | 47,18,0 58 | 33,24,1 59 | 35,28,1 60 | 35,41,1 61 | 39,43,1 62 | 29,18,1 63 | 18,44,0 64 | 26,26,1 65 | 31,43,1 66 | 20,29,0 67 | 28,18,1 68 | 31,38,1 69 | 34,34,1 70 | 32,33,1 71 | 34,27,1 72 | 19,38,0 73 | 32,21,1 74 | 33,37,1 75 | 33,18,1 76 | 18,46,0 77 | 31,37,1 78 | 36,30,1 79 | 40,40,1 80 | 38,30,1 81 | 49,28,1 82 | 31,47,1 83 | 28,50,0 84 | 49,43,1 85 | 24,31,1 86 | 33,43,1 87 | 28,24,1 88 | 45,29,1 89 | 49,35,1 90 | 36,29,1 91 | 42,32,1 92 | 29,18,1 93 | 49,20,0 94 | 22,27,1 95 | 41,38,1 96 | 47,21,0 97 | 40,32,1 98 | 35,18,1 99 | 35,33,1 100 | 34,28,1 101 | 22,31,0 102 | 46,20,0 103 | 18,49,0 104 | 48,23,0 105 | 39,21,1 106 | 20,34,0 107 | 24,20,1 108 | 38,18,1 109 | 37,47,1 110 | 39,37,1 111 | 38,39,1 112 | 27,42,1 113 | 47,49,1 114 | 27,42,1 115 | 40,28,1 116 | 41,46,1 117 | 39,25,1 118 | 43,36,1 119 | 49,30,1 120 | 24,38,0 121 | 49,42,1 122 | 19,22,0 123 | 43,27,1 124 | 30,37,1 125 | 24,31,1 126 | 24,48,0 127 | 24,29,1 128 | 18,19,1 129 | 29,25,1 130 | 38,33,1 131 | 39,20,1 132 | 24,30,1 133 | 22,39,0 134 | 47,21,0 135 | 30,44,1 136 | 41,38,1 137 | 29,33,1 138 | 42,42,1 139 | 47,27,1 140 | 23,20,1 141 | 39,18,1 142 | 30,26,1 143 | 36,27,1 144 | 40,18,1 145 | 31,18,1 146 | 46,27,1 147 | 41,44,1 148 | 26,34,1 149 | 33,18,1 150 | 48,19,0 151 | 46,27,1 152 | 25,40,0 153 | 50,36,1 154 | 20,21,1 155 | 33,47,1 156 | 40,35,1 157 | 24,27,1 158 | 34,19,1 159 | 26,45,0 160 | 34,36,1 161 | 21,27,0 162 | 48,28,1 163 | 23,25,1 164 | 48,46,1 165 | 30,20,1 166 | 23,40,0 167 | 36,40,1 168 | 21,45,0 169 | 30,40,1 170 | 39,24,1 171 | 42,47,1 172 | 28,37,1 173 | 24,30,1 174 | 37,25,1 175 | 44,34,1 176 | 43,32,1 177 | 46,29,1 178 | 49,22,0 179 | 41,28,1 180 | 23,50,0 181 | 30,43,1 182 | 25,32,1 183 | 27,46,0 184 | 23,21,1 185 | 39,41,1 186 | 33,27,1 187 | 49,21,0 188 | 33,33,1 189 | 18,25,0 190 | 42,35,1 191 | 36,25,1 192 | 26,50,0 193 | 18,37,0 194 | 35,37,1 195 | 39,38,1 196 | 22,30,0 197 | 18,44,0 198 | 46,44,1 199 | 24,27,1 200 | 41,34,1 201 | 40,39,1 202 | 34,49,1 203 | 35,41,1 204 | 46,48,1 205 | 50,23,0 206 | 49,20,0 207 | 22,47,0 208 | 27,26,1 209 | 30,30,1 210 | 37,39,1 211 | 42,44,1 212 | 41,27,1 213 | 24,21,1 214 | 34,28,1 215 | 23,43,0 216 | 43,35,1 217 | 42,40,1 218 | 25,24,1 219 | 36,24,1 220 | 25,23,1 221 | 44,30,1 222 | 39,33,1 223 | 38,33,1 224 | 49,30,1 225 | 40,19,1 226 | 19,46,0 227 | 31,21,1 228 | 48,33,1 229 | 26,24,1 230 | 20,37,0 231 | 29,31,1 232 | 35,28,1 233 | 37,25,1 234 | 42,42,1 235 | 42,48,1 236 | 41,47,1 237 | 44,45,1 238 | 45,46,1 239 | 25,38,1 240 | 19,45,0 241 | 36,26,1 242 | 33,36,1 243 | 27,19,1 244 | 48,24,0 245 | 37,48,1 246 | 23,31,0 247 | 20,29,0 248 | 27,44,0 249 | 47,24,0 250 | 36,18,1 251 | 37,48,1 252 | 32,29,1 253 | 46,48,1 254 | 31,47,1 255 | 23,45,0 256 | 28,30,1 257 | 36,32,1 258 | 25,43,0 259 | 24,44,0 260 | 34,47,1 261 | 46,42,1 262 | 18,31,0 263 | 23,25,1 264 | 44,39,1 265 | 18,29,0 266 | 49,40,1 267 | 24,33,0 268 | 21,44,0 269 | 40,24,1 270 | 46,41,1 271 | 42,33,1 272 | 25,41,0 273 | 29,42,1 274 | 40,18,1 275 | 37,40,1 276 | 46,28,1 
277 | 33,20,1 278 | 18,42,0 279 | 22,36,0 280 | 27,46,0 281 | 33,48,1 282 | 21,37,0 283 | 26,50,0 284 | 29,23,1 285 | 23,33,0 286 | 21,38,0 287 | 18,30,0 288 | 29,28,1 289 | 31,22,1 290 | 30,48,1 291 | 41,37,1 292 | 35,31,1 293 | 48,32,1 294 | 29,37,1 295 | 32,33,1 296 | 43,26,1 297 | 21,33,0 298 | 44,28,1 299 | 35,18,1 300 | 35,35,1 301 | 25,20,1 302 | 39,46,1 303 | 26,39,1 304 | 36,29,1 305 | 29,44,1 306 | 28,42,1 307 | 38,21,1 308 | 28,49,0 309 | 33,26,1 310 | 31,28,1 311 | 25,47,0 312 | 23,25,1 313 | 45,49,1 314 | 28,26,1 315 | 36,48,1 316 | 42,48,1 317 | 42,21,1 318 | 29,32,1 319 | 26,28,1 320 | 24,46,0 321 | 39,30,1 322 | 29,46,1 323 | 43,43,1 324 | 20,42,0 325 | 35,41,1 326 | 45,19,0 327 | 38,45,1 328 | 25,38,1 329 | 31,20,1 330 | 38,43,1 331 | 37,30,1 332 | 43,27,1 333 | 43,44,1 334 | 21,30,0 335 | 22,45,0 336 | 44,26,1 337 | 43,42,1 338 | 26,41,0 339 | 47,35,1 340 | 48,30,1 341 | 41,24,1 342 | 19,48,0 343 | 45,24,0 344 | 38,41,1 345 | 42,46,1 346 | 49,45,1 347 | 28,44,1 348 | 22,44,0 349 | 31,48,1 350 | 48,21,0 351 | 31,20,1 352 | 30,39,1 353 | 23,23,1 354 | 21,32,0 355 | 19,19,1 356 | 21,27,0 357 | 24,46,0 358 | 25,28,1 359 | 48,50,1 360 | 25,32,1 361 | 26,29,1 362 | 33,48,1 363 | 35,32,1 364 | 48,25,1 365 | 30,27,1 366 | 34,49,1 367 | 40,45,1 368 | 28,32,1 369 | 47,33,1 370 | 29,33,1 371 | 21,22,1 372 | 21,39,0 373 | 41,45,1 374 | 46,39,1 375 | 22,24,1 376 | 32,22,1 377 | 27,46,0 378 | 26,35,1 379 | 27,29,1 380 | 48,19,0 381 | 35,26,1 382 | 42,29,1 383 | 30,22,1 384 | 20,26,0 385 | 33,25,1 386 | 37,30,1 387 | 37,32,1 388 | 20,22,1 389 | 42,48,1 390 | 29,20,1 391 | 32,46,1 392 | 37,34,1 393 | 29,45,1 394 | 19,44,0 395 | 49,18,0 396 | 28,25,1 397 | 48,31,1 398 | 35,46,1 399 | 34,26,1 400 | 38,26,1 401 | 36,31,1 402 | 31,30,1 403 | 27,19,1 404 | 44,38,1 405 | 19,37,0 406 | 43,49,1 407 | 19,42,0 408 | 32,24,1 409 | 46,43,1 410 | 43,46,1 411 | 33,32,1 412 | 23,35,0 413 | 26,34,1 414 | 48,20,0 415 | 45,38,1 416 | 30,30,1 417 | 28,23,1 418 | 43,36,1 419 | 19,37,0 420 | 39,45,1 421 | 20,30,0 422 | 28,30,1 423 | 19,42,0 424 | 41,21,1 425 | 42,31,1 426 | 47,45,1 427 | 42,48,1 428 | 40,22,1 429 | 28,20,1 430 | 22,31,0 431 | 28,24,1 432 | 18,33,0 433 | 42,47,1 434 | 35,18,1 435 | 32,28,1 436 | 45,39,1 437 | 46,45,1 438 | 41,43,1 439 | 24,37,0 440 | 34,30,1 441 | 40,22,1 442 | 38,20,1 443 | 43,28,1 444 | 21,26,0 445 | 35,27,1 446 | 33,37,1 447 | 48,39,1 448 | 47,40,1 449 | 31,32,1 450 | 18,32,0 451 | 31,20,1 452 | 30,49,1 453 | 22,46,0 454 | 36,39,1 455 | 30,35,1 456 | 49,50,1 457 | 46,39,1 458 | 45,44,1 459 | 34,40,1 460 | 27,28,1 461 | 27,35,1 462 | 46,46,1 463 | 26,42,0 464 | 27,18,1 465 | 23,38,0 466 | 30,30,1 467 | 34,32,1 468 | 48,27,1 469 | 31,23,1 470 | 29,47,0 471 | 47,31,1 472 | 35,19,1 473 | 30,28,1 474 | 33,44,1 475 | 36,37,1 476 | 34,44,1 477 | 42,43,1 478 | 36,29,1 479 | 35,46,1 480 | 22,36,0 481 | 39,47,1 482 | 23,23,1 483 | 47,20,0 484 | 38,22,1 485 | 21,33,0 486 | 37,41,1 487 | 18,18,1 488 | 35,34,1 489 | 49,49,1 490 | 33,32,1 491 | 31,19,1 492 | 31,26,1 493 | 45,31,1 494 | 41,44,1 495 | 27,47,0 496 | 28,26,1 497 | 18,47,0 498 | 37,18,1 499 | 20,42,0 500 | 36,45,1 501 | -------------------------------------------------------------------------------- /CollectiveIntelligence/chapter9/facebook.py: -------------------------------------------------------------------------------- 1 | import urllib,md5,webbrowser,time 2 | from xml.dom.minidom import parseString 3 | 4 | apikey="47e953c8ea9ed30db904af453125c759" 5 | secret="ea703e4721e8c7bf88b92110a46a9b06" 6 | FacebookURL = 
"https://api.facebook.com/restserver.php" 7 | 8 | def getsinglevalue(node,tag): 9 | nl=node.getElementsByTagName(tag) 10 | if len(nl)>0: 11 | tagNode=nl[0] 12 | if tagNode.hasChildNodes(): 13 | return tagNode.firstChild.nodeValue 14 | return '' 15 | 16 | def callid(): 17 | return str(int(time.time()*10)) 18 | 19 | class fbsession: 20 | def __init__(self): 21 | self.session_secret=None 22 | self.session_key=None 23 | self.createtoken() 24 | webbrowser.open(self.getlogin()) 25 | print "Press enter after logging in:", 26 | raw_input() 27 | self.getsession() 28 | def sendrequest(self, args): 29 | args['api_key'] = apikey 30 | args['sig'] = self.makehash(args) 31 | post_data = urllib.urlencode(args) 32 | url = FacebookURL + "?" + post_data 33 | data=urllib.urlopen(url).read() 34 | print data 35 | return parseString(data) 36 | def makehash(self,args): 37 | hasher = md5.new(''.join([x + '=' + args[x] for x in sorted(args.keys())])) 38 | if self.session_secret: hasher.update(self.session_secret) 39 | else: hasher.update(secret) 40 | return hasher.hexdigest() 41 | def createtoken(self): 42 | res = self.sendrequest({'method':"facebook.auth.createToken"}) 43 | self.token = getsinglevalue(res,'token') 44 | def getlogin(self): 45 | return "http://api.facebook.com/login.php?api_key="+apikey+\ 46 | "&auth_token=" + self.token 47 | def getsession(self): 48 | doc=self.sendrequest({'method':'facebook.auth.getSession', 49 | 'auth_token':self.token}) 50 | self.session_key=getsinglevalue(doc,'session_key') 51 | self.session_secret=getsinglevalue(doc,'secret') 52 | def getfriends(self): 53 | doc=self.sendrequest({'method':'facebook.friends.get', 54 | 'session_key':self.session_key,'call_id':callid()}) 55 | results=[] 56 | for n in doc.getElementsByTagName('result_elt'): 57 | results.append(n.firstChild.nodeValue) 58 | return results 59 | 60 | def getinfo(self,users): 61 | ulist=','.join(users) 62 | 63 | fields='gender,current_location,relationship_status,'+\ 64 | 'affiliations,hometown_location' 65 | 66 | doc=self.sendrequest({'method':'facebook.users.getInfo', 67 | 'session_key':self.session_key,'call_id':callid(), 68 | 'users':ulist,'fields':fields}) 69 | 70 | results={} 71 | for n,id in zip(doc.getElementsByTagName('result_elt'),users): 72 | # Get the location 73 | locnode=n.getElementsByTagName('hometown_location')[0] 74 | loc=getsinglevalue(locnode,'city')+', '+getsinglevalue(locnode,'state') 75 | 76 | # Get school 77 | college='' 78 | gradyear='0' 79 | affiliations=n.getElementsByTagName('affiliations_elt') 80 | for aff in affiliations: 81 | # Type 1 is college 82 | if getsinglevalue(aff,'type')=='1': 83 | college=getsinglevalue(aff,'name') 84 | gradyear=getsinglevalue(aff,'year') 85 | 86 | results[id]={'gender':getsinglevalue(n,'gender'), 87 | 'status':getsinglevalue(n,'relationship_status'), 88 | 'location':loc,'college':college,'year':gradyear} 89 | return results 90 | 91 | def arefriends(self,idlist1,idlist2): 92 | id1=','.join(idlist1) 93 | id2=','.join(idlist2) 94 | doc=self.sendrequest({'method':'facebook.friends.areFriends', 95 | 'session_key':self.session_key,'call_id':callid(), 96 | 'id1':id1,'id2':id2}) 97 | results=[] 98 | for n in doc.getElementsByTagName('result_elt'): 99 | results.append(int(n.firstChild.nodeValue)) 100 | return results 101 | 102 | 103 | 104 | def makedataset(self): 105 | from advancedclassify import milesdistance 106 | # Get all the info for all my friends 107 | friends=self.getfriends() 108 | info=self.getinfo(friends) 109 | ids1,ids2=[],[] 110 | rows=[] 111 | 112 | # 
Nested loop to look at every pair of friends 113 | for i in range(len(friends)): 114 | f1=friends[i] 115 | data1=info[f1] 116 | 117 | # Start at i+1 so we don't double up 118 | for j in range(i+1,len(friends)): 119 | f2=friends[j] 120 | data2=info[f2] 121 | ids1.append(f1) 122 | ids2.append(f2) 123 | 124 | # Generate some numbers from the data 125 | if data1['college']==data2['college']: sameschool=1 126 | else: sameschool=0 127 | male1=(data1['gender']=='Male') and 1 or 0 128 | male2=(data2['gender']=='Male') and 1 or 0 129 | 130 | row=[male1,int(data1['year']),male2,int(data2['year']),sameschool] 131 | rows.append(row) 132 | # Call arefriends in blocks for every pair of people 133 | arefriends=[] 134 | for i in range(0,len(ids1),30): 135 | j=min(i+30,len(ids1)) 136 | pa=self.arefriends(ids1[i:j],ids2[i:j]) 137 | arefriends+=pa 138 | return arefriends,rows 139 | 140 | -------------------------------------------------------------------------------- /KNN/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' -------------------------------------------------------------------------------- /KNN/knn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "ename": "Error", 12 | "evalue": "iterator should return strings, not bytes (did you open the file in text mode?)", 13 | "traceback": [ 14 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 15 | "\u001b[0;31mError\u001b[0m Traceback (most recent call last)", 16 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/Users/yuhongjun/Python/python-training/data/iris.data.csv\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcsvfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mlines\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcsv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcsvfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mlines\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'a'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 17 | "\u001b[0;31mError\u001b[0m: iterator should return strings, not bytes (did you open the file in text mode?)" 18 | ], 19 | "output_type": "error" 20 | } 21 | ], 22 | "source": [ 23 | "import numpy as np" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 2", 37 | "language": "python", 38 | "name": "python2" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 2 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython2", 50 | 
"version": "2.7.6" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 0 55 | } 56 | -------------------------------------------------------------------------------- /KNN/knn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | import sklearn.metrics as metrics 9 | import numpy as np 10 | from sklearn.neighbors import NearestNeighbors 11 | from scipy.spatial.distance import correlation, cosine 12 | import ipywidgets as widgets 13 | from IPython.display import display, clear_output 14 | from sklearn.metrics import pairwise_distances 15 | from sklearn.metrics import mean_squared_error 16 | from math import sqrt 17 | import sys, os, time 18 | from contextlib import contextmanager 19 | import numpy as np 20 | import math 21 | def createDataset(): 22 | # 构建训练集数据 23 | dataset = [[0.26547727, 0.27892898,0], 24 | [0.1337869 , 0.08356665,0], 25 | [0.02771102, 0.36429227,0], 26 | [0.81783834, 0.86542639,1], 27 | [0.99240191, 0.87950623,1], 28 | [0.99240191, 0.77950623,1]] 29 | return np.array(dataset) 30 | 31 | 32 | def getDistance(instance1,instance2): 33 | # 计算两点间的距离 34 | distance=0 35 | length = len(instance1) 36 | for i in range(length): 37 | distance += math.pow(instance1[i]-instance2[i],2) 38 | return math.sqrt(distance) 39 | 40 | 41 | def getNeighbors(trainingSet,testInstance,k): 42 | # 计算未知实例与所有已知实例的距离。返回最近的K个已知实例 43 | features = createDataset()[:,:2] 44 | labels = createDataset()[:,-1] 45 | distance_list = [] 46 | for i in range(len(features)): 47 | distance = getDistance(testInstance,features[i]) 48 | distance_list.append((distance,labels[i])) 49 | sorted_distance_list = sorted(distance_list) 50 | neighbors = sorted_distance_list[:k] 51 | return neighbors 52 | 53 | 54 | def countClass(neighbors): 55 | # 对返回最近的K个已知实例,进行统计分类,根据少数服从多数,让未知实例归类为K个最邻近样本中最多数的类别。 56 | class_num_dict = {} 57 | for n in neighbors: 58 | if n[1] in class_num_dict: 59 | class_num_dict[n[1]] += 1 60 | else: 61 | class_num_dict[n[1]] = 1 62 | return class_num_dict 63 | 64 | def main(): 65 | trainingSet = createDataset() 66 | testSet = [[0,0],[1,1],[1.1,1.2]] 67 | result = [] 68 | for test in testSet: 69 | # 计算未知实例与所有已知实例的距离。返回最近的K个已知实例 70 | neighbors = getNeighbors(trainingSet,test,4) 71 | # 对返回最近的K个已知实例,进行统计分类。 72 | class_num_dict = countClass(neighbors) 73 | # 根据少数服从多数,让未知实例归类为K个最邻近样本中最多数的类别。 74 | result.append(sorted(class_num_dict.items(),key = lambda x:x[1],reverse=True)[0][0]) 75 | print(testSet) 76 | print(result) 77 | 78 | if __name__ == '__main__': 79 | main() -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Demi_YuHongJun 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python-training 2 | python-training 3 | 4 | ### create my.cnf for MYSQL 5 | 6 | 在mac上:MySQL的配置文件默认存放在/etc/my.cnf或者/etc/mysql/my.cnf: 7 | 8 | 但是mac中在/etc/ 可能不存在my.cnf 或/mysql/my.cnf: 可以自己创建; 9 | 10 | 我在/etc/ 中创建/mysql/my.cnf, 并输入内容如下: 11 | ``` 12 | [client] 13 | default-character-set = utf8 14 | 15 | [mysqld] 16 | default-storage-engine = INNODB 17 | character-set-server = utf8 18 | collation-server = utf8_general_ci 19 | ``` 20 | ### 命令行: 21 | ``` 22 | $ alias mysql=/usr/local/mysql/bin/mysql 23 | 24 | $ alias mysqladmin=/usr/local/mysql/bin/mysqladmin 25 | 26 | $ mysqladmin -u root -p password new_password 27 | 28 | mysql -u root -p 29 | 30 | mysql> show variables like '%char%'; 31 | 32 | 33 | +--------------------------+-----------------------------------------------------------+ 34 | | Variable_name | Value | 35 | +--------------------------+-----------------------------------------------------------+ 36 | | character_set_client | utf8 | 37 | | character_set_connection | utf8 | 38 | | character_set_database | utf8 | 39 | | character_set_filesystem | binary | 40 | | character_set_results | utf8 | 41 | | character_set_server | utf8 | 42 | | character_set_system | utf8 | 43 | | character_sets_dir | /usr/local/mysql-5.7.18-macos10.12-x86_64/share/charsets/ | 44 | +--------------------------+-----------------------------------------------------------+ 45 | 8 rows in set (0.00 sec) 46 | ``` 47 | 看到utf8字样就表示编码设置正确 48 | 49 | ### MySql 5.7中添加用户,新建数据库,用户授权,删除用户,修改密码 50 | 51 | #### 新建用户 52 | 创建test用户,密码是password。 53 | 54 | mysql -u root -p 55 | 56 | mysql> CREATE USER "www-data"@"localhost" IDENTIFIED BY "www-data"; #本地登录 57 | mysql> CREATE USER "www-data"@"%" IDENTIFIED BY "www-data"; #远程登录 58 | mysql> quit 59 | 60 | mysql -u test -p #测试是否创建成功 61 | 62 | #### 为用户授权 63 | 64 | 1. 登录MYSQL,这里以ROOT身份登录: 65 | ``` 66 | mysql -u root -p 67 | ``` 68 | 2. 为用户创建一个数据库(testDB): 69 | ``` 70 | create database testDB; 71 | create database testDB default charset utf8 collate utf8_general_ci; 72 | ``` 73 | 3. 授权test用户拥有testDB数据库的所有权限: 74 | 75 | 授权格式:grant 权限 on 数据库.* to 用户名@登录主机 identified by “密码”;密码可为空 76 | ``` 77 | grant all privileges on testDB.* to "test"@"localhost" identified by "password"; 78 | flush privileges; #刷新系统权限表 79 | ``` 80 | 4. 指定部分权限给用户: 81 | ``` 82 | grant all privileges on testDB.* to "test"@"localhost" identified by "password"; 83 | 84 | flush privileges; #刷新系统权限表 85 | ``` 86 | 5. 删除用户 87 | ``` 88 | drop user 用户名@'%'; 89 | drop user 'www-data'@'localhost'; 90 | ``` 91 | 6. 
修改指定用户密码 92 | ``` 93 | mysql -u root -p 94 | update mysql.user set authentication_string=password(“新密码”) where User="test" and Host="localhost"; 95 | flush privileges; 96 | ``` -------------------------------------------------------------------------------- /Untitled Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/Untitled Diagram.png -------------------------------------------------------------------------------- /Untitled Diagram.xml: -------------------------------------------------------------------------------- 1 | 3Z1Nc9s4EoZ/ja8uAuDnccZ2snvI1lYlVTlOMRZtayOLXkmexPvrV7RFW+hug6RFoJv0YSojy19PEy/ebjSAM3Nx//vzpny4+1IvqtWZjha/z8zlmdZFFO//27zw9PJCXOiXF243y8XLS+rtha/L/1WHF6PDq4/LRbW13rir69Vu+WC/eF2v19X17uW1w9eWm039y37bTb2yf+pDeVtZX9S88PW6XLW/x3ny9vr35WJ39/J6rtO31/9RLW/vDj9bqbR4+cyP8vrn7aZ+XB9+4pk2N88fL5++L9vvdfjJ27tyUf86eslcnZmLTV3vXv51//uiWjVwW3AvX/fpnc++/oGbar3r8wWpefmKv8vVY2V9j+3uqeWx3W3qn9VFvao3zy+Yy6ury0/7b/nnzXK1Onr9j8sLc1ng3+HwLf+uNrvq99FLh9/pc1XfV7vN0/4th88mBzyHByiPzqPjj8P3+/UWmvjwBXdHUXl9JA6Pw+3rj3kDsv/Hgck7fGLE56K+f1hVu6r5Gzf1f5oHUEfV7+r6cbes1wheuVrervf/XFU3+9/rz/Asi6EsjfYFM0EwTdLoRPm0FfDQxcpCo7LUApcV58XxR4bAAbQ694QxxWM2PY/ic6UlUCwsCjpKhFLMMMXzKJUBMcnsR9F+EiGhmI1hjhj2hnc6pDSxnzQnoxQxolQuGYVKgaic8ExdXFwlo+BKAJHEfqgKe2DmRbA5NosQr6v17XJdVZvl+na0CXU0kAX9aPXmaBR4iyesCmEdYbYdjSKYbbUTIpvCZdrDXDseQzDXGqEQiSTj1Kl2PFkEU21uMVSRjUhhuxwKIs5ETMDJFs4emY0JSFqGp1tvswdOKv59lJaV98t1eUpiNv58MRgdnDAyiiTI6uIRyBJ5xsv8EfC5c04TiJ1mG51ENjF0nvA9HwiihfMGBlodyg9oFdjIhaKF84k4qPLbXGIwI+IM3pfU5zhR+FLu9llC2bxr+1BdL2+W1+xqPxSXt3pbjjMAPTQBGF3BVebmwzXKcsLpZ+wKDqpncmgRlj5nV3A3LLb8J8fWPQn6TNkTmbIppXx6jq37kZ7Xm8VJBaDTweWngjOaHK6jcxRo1FWauGmxDUbCqRfcOq8KI5QW4dTD0+rQ+VSKzmOjnjLqvA0JMIhNMJkviPr+fx+XD/fNn6qjVflUP+5ECf1QcoF0vsB+3vD7eSj0AJdhW+Ftu46O059InNLLwYUtvY6lST1g0T57DLSwp89CYgIctM1Jg2WLcJ6+wJ7+6+NDtfl7uX0uy0TlerH/731VNZKPdYurKp+agQBhUT72xBN7++ZBYxb9tgD5Dq22L4RhWGJ3rxNuzQcrPoJoYXdv1HmUsEq+1k5aBZu9L7C9zxklHwxBYEHDLcCqCPv7P5fPb6lvml+h3vwUpPNDsQXSeRVhc89exIEyD2AlbENRtQvBJ7SUeJd5QbROb8DxLvOAVrscw0ELW/uiN6dg/ZsGPG32Yke4HRIqwo7/eyP5d/XDR1U/WO/mUIhwMmhb8V2tOGYMxkQWEA3NAsL1cioXVUYHpyIiPRha4wjXzhnHYjnixEEPTRy4WjrhIG8nIg6MOKNo90CyNHUCUCnXlKJwRvFOT2dbTnpbNr6u7/dMlltJHZ8Dufbq9xwDs8T+INjyAmi1Q4ZhtLbaxlsgRz2eRiouYj0hfCmuYy4AQ1GzrScohbOOdnwKaPAHGChQ/mYDqsO/akS+WTOOFss9lOWPw+7r6EddbhZy6k3DOQZTfyKnkKf+kBff+CRSBwEd/kYqLiJDCN9O26H+gBaj+BOJQNjHityVf+BkIsApXO+Q0jgV+FI/rl/6haLqrY+IS+9h79BgdN7a/5XG/l5Av1AHILYxqLG9N+ztQloqLMLccy8cg14hCIvYWBmMFuHtQ27ehfKe20cWgPZFheuK/uSdWDxYyur6Lway8qjnRJ//4FUA74IOABnGYYctuwkuUm5BFwSL3pMrS9IBLr72T6UJxx5yVy6Q9AwcEWViMvEJIekGO/Zv1VZUg3+mB8KCDf7GFzuJFXkVO2kx+iqDLXuzZ5dX4eG5gZJ40bt2mTVeuXC9FgQ5cBG2vf/GXbS+/en5Y3ztB63rJuWTfqJSX17/LG/HPMdtNIz5iRjRpOCrNdScvr/X48OoaAv7DkZO9SMq+Cc3rYw4qJ3b6iRx9NH8MxrHjtkllTO7EBlE/+3CPrvQbFmEmUXeSdDb/BLj1OLTslotRptbRoMI5xY9lCGcXHzNLTGxRqCGphzBOkvR7NINMtRwjolcZOhJCcFaS9HsIogjzlEG70kL11qqnBjxYl8wikTqwrc/2aYUR12U/E0gOEE5TCDRw3NL0XNb6fas7StdLG9PyV1O51icxBG2EvmiivMTAVuUM+OkpfnGpsQTiMTCkthHBE5bgLQYJ08ip+Dbo6zgxRWxbWpf850Q0p/g3OGi/LFqrvFZlU+SRB6sEfaAFkjnE5wrxPJ0HuJiG4wJzgj4G4rEwiJsP/fKBJR5AItv+11CuPv+W5THlvkoi4HMgyWcJKTMY4f/z/V2V65W7Y4xWWofZfFQdlDtNbkJeQyWxLoDfwMp3BaLiPENS2zsY+7VaLD3UBAswtizt5CCGxMgLbaDmlSCjf3rFaIchw8BwU/5BD99Z4PAmfSdYaAE24MhFH5fXUgp9vlD15s9qL6bFtvIbJd2jkcmf5dpJpUWsW+A3+cbJy0+1SduH27/WBbVB4oFVjuIC+b8qf77+wYmJvvdEIPJPrb7Aso7sRNXxrfERlwgLMDuZ2JxEX6/kKb8NgxDHPsbDBdh+IOCAn0X9nOVRJ2gvEk/cV0w+/4CUMEHXSo9aAGNz1Nf/T7ErcCaXeTbbQ/v8GIbhMTtv00Fn1
fiTSwU1giNO6MLvHGyYoRFOHu+63yVATuCE93xUPlTd2zs5ch6MQxSOFHHzl2OmHdRCjbksF+XI+JiIGGXLka8ASPNB4nw5mFv5LV7UEGXTRKTj1oI9SZu6BWj3qBXt5sSlO/c1zorcU8vn3y/VqDe4cSnTcQNvYwCrqVSwjacT8G1W5n49rARV/IGvZMXNkqCqS7lU3DB/hs0RnZTCtQXSdy/y6nfTkiMwiTKf0eZVEySHDjU71SMfhMWPORdux2N7kneBcqbgBO37YoR8IGMAsk3ca2uHPkGkPg6N4jbdAXJtxxMku03oFTwUcL2OyClFFCyIHUyoo53TMah0sN7e6TiOqiQkwqru86UUCqsbjozQqmcYJ7RoQGX8ZUah1bSm1ZGHBDuERd20RJw9ZfnoLg0cdFtyDHXX58DY2H1zzk4ISaVw4XVMeemNxf1erlJGC6sFjlPxHJhNcV5JpbLCbbY3yyVg9pGLofXCYbZH69C9ecVtyuiYXixWunCiOXCWokuErFcTnDNI3BxXmkFCzi4qOqPC3Fra0gu/fU4aE6qiVtWA2JRUX/dDczlBHvsb5pSUX89DszrBNvsk5dYnSZuVA068PoLtcpjvGLhEQxrnVlFA5xzYDCspeZ9miAWDKtFVqp3uTk0F1aLrFTvwvKbmw/DRWRheZ+GC+WlT7DOPnn1LjmH5sXrqXXvRcHQXE7w1GNw6a3TOmn3VobhwlpyVrq/TgfmwmuRdX89DsyF1yHDvjlHLT40GJG1ZWX6LwLqgug09AjsBOfsE1j/1cHQwHgttem/PBgaDGvZWZn+64OBwRAXjgYF01+siW0FHrHwemR4c48YLLwWOe4vvGGxyKwuxwO6NcKWfojLQUUAGyDTYXNT4nLQoANvgKcODIa36pwM8M6BwfBWnROpUs3rkJP+kmzSFG/K9QiG1yHDu+HkgCHuzZQwVyUDnHNgYCd4Z4/A0v4LhIGBIVzfyu3P/Sv/Ku8rRA7waeQzTal9qIdLgdvPfD/8JbphXK931Dub178efpQagXm7nfb1qi8E1VBU4xGg9nDkUyKZdJKk9kSPAZK4LWcvNdEfD5v9f4nTCadENXW239jnrURKYU1QvnaiY7v/Zcak027SlE6MARqnD9+mDdrZOMUIGqcj36cN2tmIxQgaJzjTfqIzp29jBI0Tpk/TBu1M2QHoKO9XLxxlqy3OwL5Om7SzCsBJGqduEyftLCtwksZZ3+Xj5nDN4oSBw5WrzCYOPHVE5Nk5fAsOAPwuaoR44ITx667c7KYdDKdHkRsLnHNefftj0pGA5c3CFYlWhfkDgdPQprAyj+Q/GzI4gib/xE6KaWf/btSMFp7YmjHtZAke6CGHNE5Lp53/wyNC5JDGeenEn2mptUNi48m0KwDuY0wYsyViy8q081J4AIoc0jgvnTZpeHSKHNLElV/RXMy1+2CWHpmnN3NNLGhO21y7UcMHvGdjwyikcfY4bSPiPlWHkzROGKdtrt3n9HCSnlvC6D75h5M0Thinba7RYUJuJ9JzF9AoqHHGOG3Ph84hEoOa2G41ddSD1jJDosY54168oy/l0/Ttdcf5SNBfE9S9+Wtii9i0/XUHa84nHGeQ0zYjHadbcaLGWeO0HXbHeVmcqOeWNsITuAShxnnjxD22GuRG8Dq7N9I4b5y471OD1goCksZ549RJD1orCEgap43N4ttMHLZ2zo+cDpvYkDhxh+1mDZ/wcDUoYifjxK2IHmRFApLGWePE/TW85VQM6dkljXpQfh6QNM4ZJ+6u3UfrIScSEDXOGSdu+tyH8nGixjnj1FEPWiwIiRonjc3fMhODDQ8LhAYbYfZnqHHKOHFD7WbLlzLGc+tNRSc7SiGd4CRx4oYaHhUphvTskkS4h08MaZwkTtxQuw+0TIrz4vij3/Fxo5AmdkNOnLRzYYCRNE4Sp07auTDASJrYzJjPxk0P2uob9iijue1m7GDNuCEpmVt3KjpfVQ7que1nRCe2ykE9u6QRngIrBnU6tx2NHefKMm7/SnHaOHHj5z6RlhM1zhsnjhqeZSsGNbWQi1Bv78qH5p/3v2835cPd+c2q/nV9V2525w+b+rrabs/QMcBp2vwhVDhe/kQyHPVDeb3cNVDaatioZ1gmFnQV2dSxaBfAhRNnuCg7O8rNGBHpczkFiEi52dS/tuc/9rD+2jS/3l/Pr+C4APrtZ9b1uiJDslhuquvnQ67M5bZ+3L8+QlxcF6EpbSNN8WHz9liKTNv1cRwYH3HpcwfG8JFycTEsIn4HiQHRAA84cf2a/Y48bwPmfZj0uWFjeuFwLk8ooEht/mgdK229Q4eKRp9rPSYXDfdRpgrM28S0zTY4+lwmMrlwwCN8bBelI/KzR9HIjI06CRSMPleYTC4YMFMDM8c+HFY0iAIm19jA5Yl5+SuoWmAKsZHGRCM0j78iVsBnFhdbvqLMDkwGssBCTGBw5WNegYFSBpSMzt0FxMVP6s7tucAkX4BRAtYjcTcDTOwVEQ0f8wqxBj+vYZK7LrxXOY1UwDDxk7efWOHaP6UjhASWuKyQwKGCk0WmChexsj8D4crbaw/aeSMF8wZOD6klfC/A/WTnEoq8iDoskeDEI0l7YB+D+jyTcJBA2PMzOPhcFXgayJmm51lm4YV2RcOAkohGwXhv7dp7ND6QhE8gGoXLuxq7JBJRN3jbc3be5oi+w0H0EcwgHHCnoe1YDSyVo2jYwchC1QuJToM5BAPscTFgcEAlQtGwUafKhArHPNNsOxvT9tiII2rgHAuVTTrLA80b6QeSbPnBgEcC25XBGJTScQs2mDUCFdLTWS6Iw2M/jd02EsedWZ2xw5GG0qlZZtVwG1l7sFsbDpBkU+e6gGgUgcIxyxVxuNesrUG34cjJsXMcDZt1nIVyuLNMxpsmcCsASWfGF6rmlPrJtyXUnDJQczL2U68zoujRo+aUZaNwn/vyNrheSNsTAkz5qPUhlmWIbO7L27AUZTdLwaNoiMIgV2DmvrxdFECulNhQfCDVnlQo1KsMtLGwMj+T4tyOKxRzX81u6lF2KMC8ImdUfCDPnlQoOi6fApUqvIzNFZcPJNyTigsqhYC7wUE/SCGmMyr7QO49rcjAqgiIDCi1i4nLB5LwacUlBqkhOFzDZERFhCkWH8jPpxUL9wZ5eCgEVTkMEZn9/27qenf0uc8N6y/1omre8X8= -------------------------------------------------------------------------------- /com.xml: -------------------------------------------------------------------------------- 1 | 
7V3fc6M2EP5r/NgMkkDAY+Ik14fr9aZ5aN8yCshGDUaukGOnf32FQTZYikvujKMMdmYysPyS9tOHVrsrMUHTxeaLIMvsN57SfAK9dDNBtxMIY89X/yvBay3wY1gL5oKltQjsBQ/sX9oIvUa6YiktOydKznPJll1hwouCJrIjI0Lwdfe0Gc+7T12SOTUEDwnJtfQq2Mv/ZKnMGjnA8f7Ar5TNs+bhEcT1gSeSPM8FXxXNEwte0PrIgujbNLUsM5LydUuE7iZoKjiX9dZiM6V5pVetM32dfNUFnaCbTC5ytQPU5vbw/RsXgz4Xq3oJWsj24966HyZhGs0QpB5KYxR6v+DmCS8kXzVP+H0lS4WkEn6jcs3Fc2kUYp0xSR+WJKn216o1dUs1Y3k+5TkX27PRLEpoklRyXsim3QBf7ZdS8Geqz9wqHd28UCGZQvU6Z/NCiSVfWiuqC65Op5uWqKn4F8oXVIpXdYo+6kVNC28aOIwb+Nf7xuLHqJZlrXaCcAMiaZrofHfzva7VRqPuvqr3DNXfFZKKpWBlpf0k56v0J1WfBjRK/Q9XvapvR/Mh9g3NYw9aNB/6A2g+iA3NT5kUbKNkf9HierlUGw9UqAqqjXsiFhOIc1Wkm5S9qM15tblmquTqNcUEXZM8L/UpqjSts34Sv7vb6u/D8QviuINfHBnwwcgz4QMeGII5QWTg9wet3lmFZKS68rtgL0Se8BWWEhrNHHiFhV0eRRgZQCBgAQKBaAgcwuNvsBMp//7+1g89JZ8LkjKlvQN1fzQm/8sN5EUmJDgaghoIG8qmqTKTmt1GGVtTh1b38LqaLyURO11Wx2iRtnf/plK+NgKyklyJuJAZn/OC5F95pdMawQNI1G2uKxNvXwQluWdVvbZFqCHTRhs8hkzJVyKhb1Rf251EzKl865ymS6n0chRgQXMi2UvX6DwpWqbxNc2rBm5gqFvvV/JE8++8ZJLxqhU/cSn5QqmLlMvapp6xTQVsG1R707eyRFm3y+qRi828GiNcJdtu8SonS3XRIzS5ifENvL+3cFMd8zyMp1NLazgB61DgXflx69flIAwNEqphgEnCCOtBw2lZGIyahbAHC2NnSAgvJPxhEsIjJMR+/LEk9EdNQtSnK/ScYSEyWJg4ycKUls/1Va6QEAdXbQ7GXSeHHyHr4RYpo9jkZDzEeAGhUTPS78NI4Awj/U/SL342RiLoXXntn+mFPB8j4agZGfRhJHSGkcGFkT/KSGilXMNIEGB3GBmOmpG4DyPdceDgT8LIZUocYmOMroIjbPRwYPAPWZzaeJBxSDxq/kU9+AdPbqM2l37nrKKObifhQdQw8nH3FnUpm6sOMN8V48eagRlhco3SG1qQ5fKxrMOUzpAbI9w1frsQAjNgFdpcQvEA1PbNVIwxUTvuQ213HEJmjP7CwJ4MDDvWLOgw0OaVPR8DzXymMTFQu1uPW7fuREaAmaF04eAJekE/Mp2w5+PguF0+2sF6nIOROxw0cwQuHDwBB5E2dT6Eg+MOhGiX6nEOhu5wEF44OIQtCgLT0Xo2DgbjHg2CXvkB2BkOBiMfOvSCK3AHrpFbmX3ggg6xa+Rpw73gcoddwMyWcs0gUW1FumSJxCHq2B5BZKYIB4Fpe6CBkhOjcROuTy6UQ4NwMxfKNb6pAUCdMPlYvhZJJnihYLXwb7r9vcW/6tc6dsuEKnJdD4VnBfr5R+px2G+UEA+CvJlz8yd9muynB/ZsBTWg72kCu3bznlaQkCSjpx0F5gc1SWhRv9hJU9Cd4BRvaR3611N0Q2hiDyzYh4O8pYGZ33H77cE16icZKUtWutPXAgAPwr6BOSkOoB1mnSmj4TBQ+uOeDaC9acfHj8idHtecWuoa7fTMb3d4FwZd2ln9axbO6TUPTku4cc+BA30yqVwinPsZT+4TzhpUOhvhRu7D6ZPf5BLh3E9wcp9w1kyKsxFu3Kn6sFc2kzuEg+5nM7lPOGv64NkIN26n6W5Vvc9COPdTl9wnnDVj/lyEg+7HmTJVzUc+m7GKNq6ACL3uzJU4MGc02dZLC4JBfF/Q/fBF9s8j3S8d5g6QsAsk1HkNLSD1MqmdjKUhpoZCMxbxQJOVqJZae2c44lzTDgXfvDqXlAZ87+Ad69lyQW3e6XiQ5QzHnQ0K+8z51QEFB8yaYNyRhH5wuRO7D8bth+4Hlzu51tAMubrWq5VrJpPMof4MhN2oeeCZ4VableKDYcxNM3b3zlURzpQtITNWPDbLjH3CZAngH0yQ9y1rz9qyJcJBiGsGkC6oD4E6Ogw6hRYf+NlQN6MYF9SHQB163fxVhD6Q69oJPFKbCn2yyAcyIx91EmM1Mq8cZ44ZWGv69KiL5oyVFWsLSvv0gOk0wJaPhwzy7RDkvmtdPc8h9AyfTxybLlmg3bYH37AYAEAMzX5Tf4Kk+WyC2roWSfWlhERunXwH+Kqay7cwaWu9EbW/r7DXL/T1fnNjG/gLlqbVwUF6Nv9wYrZJqwhbOjb//bxSu/sPNdWrRO2/hIXu/gM= -------------------------------------------------------------------------------- /data/iris.data.csv: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 
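The rows above are data/iris.data.csv, the file that KNN/knn.ipynb opens; that notebook's traceback ("iterator should return strings, not bytes") is the usual Python 3 symptom of handing csv.reader a binary-mode handle. A sketch of the standard fix, using the repository-relative path rather than the absolute one in the notebook:

```
# Sketch: reading data/iris.data.csv under Python 3. csv.reader needs a text-mode
# handle, which is what the notebook's "iterator should return strings, not bytes"
# error is complaining about.
import csv

with open("data/iris.data.csv", "r", newline='') as csvfile:   # "r", not "rb"
    lines = csv.reader(csvfile)
    dataset = [row for row in lines if row]   # drop the trailing blank line
print(len(dataset), dataset[0])   # 150 rows; first row like ['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
```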
5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 
6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica 151 | 152 | -------------------------------------------------------------------------------- /kaggle/Chapter_1.1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "df_train = pd.read_csv('../Datasets/Breast-Cancer/breast-cancer-train.csv')\n", 23 | "\n", 24 | "df_test = pd.read_csv('../Datasets/Breast-Cancer/breast-cancer-test.csv')\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "df_test_negative = df_test.loc[df_test['Type'] == 0][['Clump Thickness', 'Cell Size']]\n", 36 | "df_test_positive = df_test.loc[df_test['Type'] == 1][['Clump Thickness', 'Cell Size']]" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "import matplotlib.pyplot as plt\n", 48 | "\n", 49 | "plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'], marker = 'o', s=200, c='red')\n", 50 | "plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=150, c='black')\n", 51 | "\n", 52 | "plt.xlabel('Clump Thickness')\n", 53 | "plt.ylabel('Cell Size')\n", 54 | "\n", 55 | "plt.show()\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 5, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "import numpy as np\n", 67 | "\n", 68 | "intercept = np.random.random([1])\n", 69 | "coef = np.random.random([2])\n", 70 | "\n", 71 | "lx = np.arange(0, 12)\n", 72 | "ly = (-intercept - lx * coef[0]) / coef[1]\n", 73 | "\n", 74 | "plt.plot(lx, ly, c='yellow')\n", 75 | "\n", 76 | "\n", 77 | "plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'], marker = 'o', s=200, c='red')\n", 78 | "plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=150, c='black')\n", 79 | "plt.xlabel('Clump Thickness')\n", 80 | "plt.ylabel('Cell Size')\n", 81 | "plt.show()\n", 82 | "\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "metadata": { 89 | 
"collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "Testing accuracy (10 training samples): 0.868571428571\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "from sklearn.linear_model import LogisticRegression\n", 102 | "lr = LogisticRegression()\n", 103 | "\n", 104 | "lr.fit(df_train[['Clump Thickness', 'Cell Size']][:10], df_train['Type'][:10])\n", 105 | "print 'Testing accuracy (10 training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type'])\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "intercept = lr.intercept_\n", 117 | "coef = lr.coef_[0, :]\n", 118 | "\n", 119 | "ly = (-intercept - lx * coef[0]) / coef[1]\n", 120 | "\n", 121 | "plt.plot(lx, ly, c='green')\n", 122 | "plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'], marker = 'o', s=200, c='red')\n", 123 | "plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=150, c='black')\n", 124 | "plt.xlabel('Clump Thickness')\n", 125 | "plt.ylabel('Cell Size')\n", 126 | "plt.show()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [ 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "Testing accuracy (all training samples): 0.937142857143\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "lr = LogisticRegression()\n", 146 | "\n", 147 | "lr.fit(df_train[['Clump Thickness', 'Cell Size']], df_train['Type'])\n", 148 | "print 'Testing accuracy (all training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type'])\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "intercept = lr.intercept_\n", 160 | "coef = lr.coef_[0, :]\n", 161 | "ly = (-intercept - lx * coef[0]) / coef[1]\n", 162 | "\n", 163 | "plt.plot(lx, ly, c='blue')\n", 164 | "plt.scatter(df_test_negative['Clump Thickness'],df_test_negative['Cell Size'], marker = 'o', s=200, c='red')\n", 165 | "plt.scatter(df_test_positive['Clump Thickness'],df_test_positive['Cell Size'], marker = 'x', s=150, c='black')\n", 166 | "plt.xlabel('Clump Thickness')\n", 167 | "plt.ylabel('Cell Size')\n", 168 | "plt.show()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "Python 2", 184 | "language": "python", 185 | "name": "python2" 186 | }, 187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 2 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython2", 197 | "version": "2.7.11" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 0 202 | } 203 | -------------------------------------------------------------------------------- /kaggle/Datasets/Breast-Cancer/breast-cancer-test.csv: -------------------------------------------------------------------------------- 1 | ,Clump Thickness,Cell Size,Type 2 | 158,1,2,0 3 | 499,1,1,0 4 | 396,1,1,0 5 | 155,5,5,1 6 | 
321,1,1,0 7 | 212,1,1,0 8 | 234,3,2,0 9 | 289,6,6,1 10 | 300,4,10,1 11 | 356,3,3,1 12 | 672,1,1,0 13 | 328,10,3,1 14 | 199,1,1,0 15 | 78,1,1,0 16 | 598,1,1,0 17 | 569,10,8,1 18 | 446,1,1,0 19 | 506,10,10,1 20 | 626,6,6,1 21 | 603,4,6,1 22 | 360,10,10,1 23 | 338,1,1,0 24 | 668,7,4,1 25 | 290,1,1,0 26 | 284,4,5,1 27 | 331,1,1,0 28 | 477,1,1,0 29 | 54,5,5,1 30 | 248,1,1,0 31 | 223,5,6,1 32 | 133,1,1,0 33 | 640,1,1,0 34 | 136,1,1,0 35 | 109,5,4,1 36 | 181,1,1,0 37 | 432,1,1,0 38 | 554,1,1,0 39 | 482,10,10,1 40 | 516,1,1,0 41 | 132,10,8,1 42 | 176,1,1,0 43 | 72,3,3,0 44 | 254,10,10,1 45 | 577,1,1,0 46 | 649,1,1,0 47 | 595,1,1,0 48 | 666,2,2,0 49 | 352,4,5,0 50 | 76,1,4,0 51 | 148,1,1,0 52 | 346,2,2,0 53 | 90,1,1,0 54 | 681,10,10,1 55 | 10,1,1,0 56 | 63,3,4,1 57 | 635,1,4,0 58 | 656,1,1,0 59 | 174,6,5,1 60 | 256,1,1,0 61 | 667,1,1,0 62 | 31,1,1,0 63 | 369,1,3,0 64 | 570,10,4,1 65 | 77,3,1,0 66 | 532,1,1,0 67 | 548,1,1,0 68 | 211,10,8,1 69 | 55,6,6,1 70 | 135,1,1,0 71 | 671,2,1,0 72 | 340,3,3,1 73 | 2,1,1,0 74 | 227,9,9,1 75 | 81,1,1,0 76 | 473,1,1,0 77 | 694,1,1,0 78 | 665,1,1,0 79 | 604,3,2,1 80 | 120,1,2,0 81 | 311,1,1,0 82 | 204,1,1,0 83 | 244,1,1,0 84 | 686,1,1,0 85 | 271,1,1,0 86 | 131,1,1,0 87 | 680,10,10,1 88 | 60,3,5,1 89 | 310,1,1,0 90 | 30,1,1,0 91 | 69,1,1,0 92 | 651,2,1,0 93 | 390,1,1,0 94 | 44,10,10,1 95 | 625,1,3,0 96 | 70,1,3,0 97 | 515,10,4,1 98 | 654,1,1,0 99 | 249,1,1,0 100 | 209,1,1,0 101 | 165,1,1,0 102 | 470,1,1,0 103 | 164,1,1,0 104 | 507,1,1,0 105 | 323,4,6,1 106 | 65,4,2,1 107 | 409,1,2,0 108 | 49,8,7,1 109 | 118,1,1,0 110 | 192,1,1,0 111 | 39,5,3,1 112 | 259,7,7,0 113 | 422,3,3,0 114 | 6,1,1,0 115 | 101,3,4,1 116 | 542,3,1,0 117 | 299,1,2,1 118 | 395,1,1,0 119 | 501,1,1,0 120 | 318,1,1,0 121 | 145,1,3,0 122 | 486,1,1,0 123 | 353,7,10,1 124 | 208,1,1,0 125 | 695,1,1,0 126 | 361,10,3,1 127 | 86,3,6,1 128 | 664,1,1,0 129 | 481,3,2,0 130 | 633,7,4,1 131 | 41,4,3,1 132 | 108,1,1,0 133 | 690,1,1,0 134 | 56,10,10,1 135 | 424,1,1,0 136 | 514,6,7,1 137 | 24,1,1,0 138 | 218,10,7,1 139 | 431,1,1,0 140 | 281,1,1,0 141 | 110,3,1,0 142 | 82,2,1,0 143 | 51,3,3,1 144 | 220,1,1,0 145 | 559,1,1,0 146 | 544,1,3,0 147 | 302,10,10,1 148 | 552,2,2,0 149 | 215,7,8,1 150 | 235,1,4,0 151 | 18,7,7,1 152 | 250,2,2,0 153 | 260,5,8,1 154 | 430,3,1,0 155 | 264,9,4,1 156 | 61,1,1,0 157 | 213,10,10,1 158 | 377,1,1,0 159 | 29,1,3,0 160 | 182,1,1,0 161 | 306,1,1,0 162 | 388,1,1,0 163 | 329,4,6,1 164 | 437,1,1,0 165 | 296,3,4,0 166 | 584,1,1,0 167 | 342,1,1,0 168 | 436,10,10,1 169 | 579,1,1,0 170 | 326,1,1,1 171 | 362,2,2,0 172 | 617,1,1,0 173 | 578,1,1,0 174 | 231,8,7,1 175 | 336,5,5,1 176 | 655,1,1,0 177 | -------------------------------------------------------------------------------- /scripts/consumer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' 5 | 6 | from kafka import KafkaConsumer 7 | 8 | consumer = KafkaConsumer('result') 9 | for msg in consumer: 10 | print((msg.value).decode('utf8')) -------------------------------------------------------------------------------- /scripts/producer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' 5 | 6 | import csv 7 | import time 8 | from kafka import KafkaProducer 9 | 10 | # 实例化一个KafkaProducer示例,用于向Kafka投递消息 11 | producer = KafkaProducer(bootstrap_servers='127.0.0.1:9092') 12 | # 打开数据文件 13 | csvfile = 
open("../data/user_log.csv", "r") 14 | # 生成一个可用于读取csv文件的reader 15 | reader = csv.reader(csvfile) 16 | 17 | for line in reader: 18 | gender = line[9] # 性别在每行日志代码的第9个元素 19 | if gender == 'gender': 20 | continue # 去除第一行表头 21 | time.sleep(0.1) # 每隔0.1秒发送一行数据 22 | # 发送数据,topic为'sex' 23 | producer.send('sex', line[9].encode('utf8')) -------------------------------------------------------------------------------- /work_one/Asyncio_hello.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | # 从Python 3.5开始 可以让coroutine的代码更简洁易读 3 | # 把@asyncio.coroutine替换为async; 4 | # 把yield from替换为await。 5 | 6 | # @asyncio.coroutine 7 | async def wget(host): 8 | print('wget %s...' % host) 9 | connect = asyncio.open_connection(host, 80) 10 | # reader, writer = yield from connect 11 | reader, writer = await connect 12 | header = 'GET / HTTP/1.0\r\nHost: %s\r\n\r\n' % host 13 | writer.write(header.encode('utf-8')) 14 | # yield from writer.drain() 15 | await writer.drain() 16 | while True: 17 | # line = yield from reader.readline() 18 | line = await reader.readline() 19 | if line == b'\r\n': 20 | break 21 | print('%s header > %s' % (host, line.decode('utf-8').rstrip())) 22 | # Ignore the body, close the socket 23 | writer.close() 24 | 25 | 26 | loop = asyncio.get_event_loop() 27 | tasks = [wget(host) for host in ['www.sina.com.cn', 'www.sohu.com', 'www.163.com']] 28 | loop.run_until_complete(asyncio.wait(tasks)) 29 | loop.close() 30 | -------------------------------------------------------------------------------- /work_one/FLASK_app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, render_template 2 | 3 | app = Flask(__name__) 4 | 5 | @app.route('/', methods=['GET', 'POST']) 6 | def home(): 7 | return render_template('home.html') 8 | 9 | @app.route('/signin', methods=['GET']) 10 | def signin_form(): 11 | return render_template('form.html') 12 | 13 | @app.route('/signin', methods=['POST']) 14 | def signin(): 15 | username = request.form['username'] 16 | password = request.form['password'] 17 | if username=='admin' and password=='password': 18 | return render_template('signin-ok.html', username=username) 19 | return render_template('form.html', message='Bad username or password', username=username) 20 | 21 | if __name__ == '__main__': 22 | app.run() -------------------------------------------------------------------------------- /work_one/IO.py: -------------------------------------------------------------------------------- 1 | # from io import StringIO 2 | # 3 | # f = StringIO('Hello!\nHi!\nGoodbye!') 4 | # while True: 5 | # s = f.readline() 6 | # if s == '': 7 | # break 8 | # print(s) 9 | 10 | # 最后看看如何利用Python的特性来过滤文件。比如我们要列出当前目录下的所有目录,只需要一行代码: 11 | # import os 12 | # [x for x in os.listdir('.') if os.path.isdir(x)] 13 | 14 | # 要列出所有的.py文件,也只需一行代码: 15 | # [x for x in os.listdir('.') if os.path.isfile(x) and os.path.splitext(x)[1]=='.py'] 16 | 17 | #编写一个程序,能在当前目录以及当前目录的所有子目录下查找文件名包含指定字符串的文件,并打印出相对路径 18 | # import os 19 | # def search(dir, text): 20 | # for x in os.listdir(dir): 21 | # if os.path.isfile(os.path.join(dir,x)): 22 | # if text in os.path.splitext(x)[0]: 23 | # print('%s, %s'% (dir, x)) 24 | # if os.path.isdir(os.path.join(dir,x)): 25 | # search(os.path.join(dir, x),text) 26 | # 27 | # print(os.path.abspath('.')) 28 | # search('/Users/yuhongjun/reactNativeWorkSpace/YuHongJun.github.io' , 'feedtest2') 29 | # 30 | # import os 31 | # 32 | # print('Process (%s) start...' 
% os.getpid()) 33 | # # Only works on Unix/Linux/Mac: 34 | # pid = os.fork() 35 | # if pid == 0: 36 | # print('I am child process (%s) and my.cnf parent is %s.' % (os.getpid(), os.getppid())) 37 | # else: 38 | # print('I (%s) just created a child process (%s).' % (os.getpid(), pid)) 39 | 40 | # import subprocess 41 | # 42 | # print('$ nslookup www.python.org') 43 | # r = subprocess.call(['nslookup', 'www.python.org']) 44 | # print('Exit code:', r) 45 | 46 | # from multiprocessing import Process, Queue 47 | # import os, time, random 48 | # 49 | # # 写数据进程执行的代码: 50 | # def write(q): 51 | # print('Process to write: %s' % os.getpid()) 52 | # for value in ['A', 'B', 'C']: 53 | # print('Put %s to queue...' % value) 54 | # q.put(value) 55 | # time.sleep(random.random()) 56 | # 57 | # # 读数据进程执行的代码: 58 | # def read(q): 59 | # print('Process to read: %s' % os.getpid()) 60 | # while True: 61 | # value = q.get(True) 62 | # print('Get %s from queue.' % value) 63 | # 64 | # if __name__=='__main__': 65 | # # 父进程创建Queue,并传给各个子进程: 66 | # q = Queue() 67 | # pw = Process(target=write, args=(q,)) 68 | # pr = Process(target=read, args=(q,)) 69 | # # 启动子进程pw,写入: 70 | # pw.start() 71 | # # 启动子进程pr,读取: 72 | # pr.start() 73 | # # 等待pw结束: 74 | # pw.join() 75 | # # pr进程里是死循环,无法等待其结束,只能强行终止: 76 | # pr.terminate() 77 | 78 | # import time, threading 79 | # 80 | # # 新线程执行的代码: 81 | # def loop(): 82 | # print('thread %s is running...' % threading.current_thread().name) 83 | # n = 0 84 | # while n < 5: 85 | # n = n + 1 86 | # print('thread %s >>> %s' % (threading.current_thread().name, n)) 87 | # time.sleep(1) 88 | # print('thread %s ended.' % threading.current_thread().name) 89 | # 90 | # print('thread %s is running...' % threading.current_thread().name) 91 | # t = threading.Thread(target=loop, name='LoopThread') 92 | # t.start() 93 | # t.join() 94 | # print('thread %s ended.' 
% threading.current_thread().name) 95 | # 96 | # import time, threading 97 | # balance = 0 98 | # lock = threading.Lock() 99 | # 100 | # def change_it(n): 101 | # # 先存后取,结果应该为0: 102 | # global balance 103 | # balance = balance + n 104 | # balance = balance - n 105 | # 106 | # def run_thread(n): 107 | # for i in range(100000): 108 | # # 先要获取锁: 109 | # lock.acquire() 110 | # try: 111 | # # 放心地改吧: 112 | # change_it(n) 113 | # finally: 114 | # # 改完了一定要释放锁: 115 | # lock.release() 116 | 117 | # import re 118 | # re_mail=re.compile(r'^(.+)@([a-zA-Z0-9]+)\.([a-zA-Z0-9]{2,3}|[0-9]{1,3})$') 119 | # a=re_mail.match('someone@gmail.com').groups() 120 | # print(a) 121 | 122 | # from datetime import datetime 123 | # now=datetime.now() 124 | # print(now) 125 | # print(type(now)) 126 | # 127 | # dt=datetime(2017,8,17,12,12) 128 | # tt=dt.timestamp() 129 | # 130 | # print(datetime.fromtimestamp(tt)) # 本地时间 131 | # print(datetime.utcfromtimestamp(tt)) # UTC时间 132 | 133 | # from datetime import datetime 134 | # cday = datetime.strptime('2017-8-17 12:12:12','%Y-%m-%d %H:%M:%S') 135 | # print(cday) 136 | # 137 | # now=datetime.now() 138 | # print(now.strftime('%a,%b %d %H:%M')) 139 | 140 | import re 141 | from datetime import datetime, timezone, timedelta 142 | 143 | def to_timestamp(dt_str, tz_str): 144 | dt = datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S') 145 | print(dt) 146 | tz_info = re.split(r'[UTC\:]+',tz_str) 147 | 148 | print(tz_info) 149 | tz_hours = int(tz_info[1]) 150 | print(tz_hours) 151 | 152 | tz_minutes = int(tz_info[2]) 153 | print(tz_minutes) 154 | 155 | dt = dt.replace(tzinfo = timezone(timedelta(hours=tz_hours, minutes=tz_minutes))) 156 | return dt.timestamp() 157 | 158 | # 测试: 159 | 160 | t1 = to_timestamp('2015-6-1 08:10:30', 'UTC+7:00') 161 | assert t1 == 1433121030.0, t1 162 | 163 | t2 = to_timestamp('2015-5-31 16:10:30', 'UTC-09:00') 164 | assert t2 == 1433121030.0, t2 165 | 166 | print('Pass') -------------------------------------------------------------------------------- /work_one/WSGI_hello.py: -------------------------------------------------------------------------------- 1 | # hello.py 2 | def application(environ, start_response): 3 | start_response('200 OK',[('Content-Type','text/html')]) 4 | body='

<h1>Hello, %s!</h1>
' % (environ['PATH_INFO'][1:] or 'web') 5 | return [body.encode('utf-8')] -------------------------------------------------------------------------------- /work_one/WSGI_server.py: -------------------------------------------------------------------------------- 1 | from wsgiref.simple_server import make_server 2 | from WSGI_hello import application 3 | # 创建一个服务器,IP地址为空,端口是8000,处理函数是application: 4 | httpd= make_server('',8000,application) 5 | print('Serving HTTP on port 8000...') 6 | # 开始监听HTTP请求: 7 | httpd.serve_forever() -------------------------------------------------------------------------------- /work_one/code.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/code.jpg -------------------------------------------------------------------------------- /work_one/distributed/task_master.py: -------------------------------------------------------------------------------- 1 | import random, time, queue 2 | from multiprocessing.managers import BaseManager 3 | 4 | # 发送任务的队列: 5 | task_queue = queue.Queue() 6 | # 接收结果的队列: 7 | result_queue = queue.Queue() 8 | 9 | # 从BaseManager继承的QueueManager: 10 | class QueueManager(BaseManager): 11 | pass 12 | 13 | # 把两个Queue都注册到网络上, callable参数关联了Queue对象: 14 | QueueManager.register('get_task_queue', callable=lambda: task_queue) 15 | QueueManager.register('get_result_queue', callable=lambda: result_queue) 16 | # 绑定端口5000, 设置验证码'abc': 17 | manager = QueueManager(address=('', 5000), authkey=b'abc') 18 | # 启动Queue: 19 | manager.start() 20 | # 获得通过网络访问的Queue对象: 21 | task = manager.get_task_queue() 22 | result = manager.get_result_queue() 23 | # 放几个任务进去: 24 | for i in range(10): 25 | n = random.randint(0, 10000) 26 | print('Put task %d...' % n) 27 | task.put(n) 28 | # 从result队列读取结果: 29 | print('Try get results...') 30 | for i in range(10): 31 | r = result.get(timeout=10) 32 | print('Result: %s' % r) 33 | # 关闭: 34 | manager.shutdown() 35 | print('master exit.') -------------------------------------------------------------------------------- /work_one/distributed/task_worker.py: -------------------------------------------------------------------------------- 1 | import time, sys, queue 2 | from multiprocessing.managers import BaseManager 3 | 4 | # 创建类似的QueueManager: 5 | class QueueManager(BaseManager): 6 | pass 7 | 8 | # 由于这个QueueManager只从网络上获取Queue,所以注册时只提供名字: 9 | QueueManager.register('get_task_queue') 10 | QueueManager.register('get_result_queue') 11 | 12 | # 连接到服务器,也就是运行task_master.py的机器: 13 | server_addr = '127.0.0.1' 14 | print('Connect to server %s...' % server_addr) 15 | # 端口和验证码注意保持与task_master.py设置的完全一致: 16 | m = QueueManager(address=(server_addr, 5000), authkey=b'abc') 17 | # 从网络连接: 18 | m.connect() 19 | # 获取Queue的对象: 20 | task = m.get_task_queue() 21 | result = m.get_result_queue() 22 | # 从task队列取任务,并把结果写入result队列: 23 | for i in range(10): 24 | try: 25 | n = task.get(timeout=1) 26 | print('run task %d * %d...' 
% (n, n)) 27 | r = '%d * %d = %d' % (n, n, n*n) 28 | time.sleep(1) 29 | result.put(r) 30 | except queue.Empty: 31 | print('task queue is empty.') 32 | # 处理结束: 33 | print('worker exit.') -------------------------------------------------------------------------------- /work_one/leet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' 5 | 6 | 7 | # class Solution(object): 8 | # def longestCommonPrefix(self, strs): 9 | # """ 10 | # :type strs: List[str] 11 | # :rtype: str 12 | # """ 13 | # if not strs: 14 | # print("kong") 15 | # return "" 16 | # for i, letter_group in enumerate(zip(*strs)): 17 | # print(list(zip(*strs))) 18 | # print(len(set(letter_group))) 19 | # if len(set(letter_group)) > 1: 20 | # print(i) 21 | # print('jinlaile') 22 | # print(strs[0][:i]) 23 | # return strs[0][:i] 24 | # else: 25 | # print('min') 26 | # print(min(strs)) 27 | # return min(strs) 28 | # # me=Solution() 29 | # Solution().longestCommonPrefix(["abc","de"]) 30 | 31 | class Solution(object): 32 | def isValid(self, s): 33 | """ 34 | :type s: str 35 | :rtype: bool 36 | """ 37 | stack = [] 38 | dict = {"]":"[", "}":"{", ")":"("} 39 | for char in s: 40 | if char in dict.values(): 41 | 42 | stack.append(char) 43 | print(stack) 44 | elif char in dict.keys(): 45 | if stack == [] or dict[char] != stack.pop(): 46 | return False 47 | else: 48 | return False 49 | return stack == [] 50 | result=Solution().isValid("{}[") 51 | print(result) -------------------------------------------------------------------------------- /work_one/mydict.py: -------------------------------------------------------------------------------- 1 | class Dict(dict): 2 | def __int__(self, **kw): 3 | super().__init__(**kw) 4 | 5 | def __getattr__(self, key): 6 | try: 7 | return self[key] 8 | except KeyError: 9 | raise AttributeError(r"'Dict' object has no attribute '%s'" % key) 10 | 11 | def __setattr__(self, key, value): 12 | self[key] = value 13 | 14 | 15 | with open('mydict2.py', 'r') as f: 16 | print(f.read()) 17 | 18 | # with open('test1.jpg', 'rb') as f: 19 | # print(f.read()) 20 | # 21 | # with open('mydict2.py', 'w') as f: 22 | # f.write('hhheeelllooo') 23 | -------------------------------------------------------------------------------- /work_one/mydict2.py: -------------------------------------------------------------------------------- 1 | class Dict(dict): 2 | ''' 3 | Simple dict but also support access as x.y style. 4 | 5 | >>> d1=Dict() 6 | >>> d1['x']=100 7 | >>> d1.x 8 | 100 9 | >>> d1.y=200 10 | >>> d1['y'] 11 | 200 12 | >>> d2 = Dict(a=1, b=2, c='3') 13 | >>> d2.c 14 | '3' 15 | >>> d2['empty'] 16 | Traceback (most recent call last): 17 | ... 18 | KeyError: 'empty' 19 | 20 | >>> d2.empty 21 | Traceback (most recent call last): 22 | ... 
23 | AttributeError: 'Dict' object has no attribute 'empty' 24 | 25 | ''' 26 | 27 | def __int__(self, **kw): 28 | super(Dict, self).__init__(**kw) 29 | 30 | def __getattr__(self, key): 31 | try: 32 | return self[key] 33 | except KeyError: 34 | raise AttributeError(r"'Dict' object has no attribute '%s'" % key) 35 | 36 | def __setattr__(self, key, value): 37 | self[key] = value 38 | 39 | 40 | if __name__ == '__main__': 41 | import doctest 42 | 43 | doctest.testmod() 44 | -------------------------------------------------------------------------------- /work_one/mydict_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from mydict import Dict 4 | 5 | 6 | class TestDict(unittest.TestCase): 7 | def setUp(self): 8 | print('setUp...') 9 | 10 | def tearDown(self): 11 | print('tearDown...') 12 | 13 | def test_init(self): 14 | d = Dict(a=1, b='test') 15 | self.assertEqual(d.a, 1) 16 | self.assertEqual(d.b, 'test') 17 | self.assertTrue(isinstance(d, dict)) 18 | 19 | def test_key(self): 20 | d = Dict() 21 | d['key'] = 'value' 22 | self.assertEqual(d.key, 'value') 23 | 24 | def test_attr(self): 25 | d = Dict() 26 | d.key = 'value' 27 | self.assertTrue('key' in d) 28 | self.assertEqual(d['key'], 'value') 29 | 30 | def test_keyerror(self): 31 | d = Dict() 32 | with self.assertRaises(KeyError): 33 | value = d['empty'] 34 | 35 | def test_attrerror(self): 36 | d = Dict() 37 | with self.assertRaises(AttributeError): 38 | value = d.empty 39 | 40 | 41 | if __name__ == '__main__': 42 | unittest.main() 43 | -------------------------------------------------------------------------------- /work_one/myfile.py: -------------------------------------------------------------------------------- 1 | title = "The Meaning of Life" -------------------------------------------------------------------------------- /work_one/requestUrlTest.py: -------------------------------------------------------------------------------- 1 | #1.单进程: 2 | # import requests,time 3 | # start_time=time.time() 4 | # [requests.get('http://www.liaoxuefeng.com/') for x in range(100)] 5 | # print("用时:{}秒".format(time.time()-start_time)) 6 | 7 | #2.多线程 8 | 9 | # import threadpool,requests 10 | # def run(url): 11 | # r=requests.get(url=url) 12 | # pool=threadpool.ThreadPool(10) 13 | # reqs=threadpool.makeRequests(run,['http://www.liaoxuefeng.com' for x in range(100)]) 14 | # [pool.putRequest(x) for x in reqs] 15 | # pool.wait() 16 | # print("用时:{}秒".format(time.time()-start_time)) 17 | 18 | #3.多进程 19 | 20 | #!/usr/bin/env python3 21 | # -*- coding: utf-8 -*- 22 | # import multiprocessing,time,requests 23 | # start_time=time.time() 24 | # def run(url): 25 | # r=requests.get(url=url) 26 | # #print(1) 27 | # if __name__=='__main__': 28 | # pool=multiprocessing.Pool(10) 29 | # [pool.apply_async(run,args=('http://www.liaoxuefeng.com',)) for x in range(100)] 30 | # pool.close() 31 | # pool.join() 32 | # print("用时:{}秒".format(time.time()-start_time)) 33 | 34 | #4.协程(异步IO) 35 | 36 | import asyncio, aiohttp, time 37 | start_time=time.time() 38 | async def run(url): 39 | async with aiohttp.ClientSession() as session: 40 | async with session.get(url=url) as resp: 41 | pass 42 | loop=asyncio.get_event_loop() 43 | tasks=[asyncio.ensure_future(run('http://www.liaoxuefeng.com')) for x in range(100)] 44 | loop.run_until_complete(asyncio.wait(tasks)) 45 | print("用时:{}秒".format(time.time()-start_time)) -------------------------------------------------------------------------------- /work_one/script1.py: 
-------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # A first Python script 4 | 5 | # import sys # Load a library module 6 | # 7 | # print(sys.platform) 8 | # print(2 ** 100) # Raise 2 to a power 9 | # x = 'Spam!' 10 | # print(x * 8) # String repetition 11 | 12 | # 小明的成绩从去年的72分提升到了今年的85分,请计算小明成绩提升的百分点,并用字符串格式化显示出'xx.x%',只保留小数点后1位: 13 | # s1 = 72 14 | # s2 = 85 15 | # r = (85 - 72) / 72 * 100 16 | # print('小明成绩提升百分点%.1f%%' % r) 17 | # 18 | # #小明身高1.75,体重80.5kg。请根据BMI公式(体重除以身高的平方)帮小明计算他的BMI指数,并根据BMI指数: 19 | # h = input('please input an height: ') 20 | # height = float(h) 21 | # w = input('please input an weight: ') 22 | # weight = float(w) 23 | # bmi = weight / (height ** 2) 24 | # print('你的身高是%.2f m,体重是%.1f kg,BMI指数是%f ' %(height,weight,bmi)) 25 | # 26 | # if bmi < 18.5: 27 | # print('过轻') 28 | # elif 18.5 <= bmi < 25: 29 | # print('正常') 30 | # elif 25 <= bmi < 28: 31 | # print('过重') 32 | # elif 28 <= bmi < 32: 33 | # print('肥胖') 34 | # else: 35 | # print('严重肥胖') 36 | 37 | # range(101)就可以生成0-100的整数序列,计算如下 38 | # sum = 0 39 | # for x in range(101): 40 | # sum = sum + x 41 | # print(sum) 42 | 43 | # 第二种循环是while循环,只要条件满足,就不断循环,条件不满足时退出循环。比如我们要计算100以内所有奇数之和,可以用while循环实现: 44 | # sum = 0 45 | # n = 99 46 | # while n > 0: 47 | # sum = sum + n 48 | # n = n - 2 49 | # print(sum) 50 | 51 | # 请定义一个函数quadratic(a, b, c),接收3个参数,返回一元二次方程: 52 | # 53 | # ax2 + bx + c = 0 54 | # 55 | # 的两个解。 56 | # 57 | # 提示:计算平方根可以调用math.sqrt()函数 58 | 59 | # import math 60 | # def quadratic(a, b, c): 61 | # if b**2-4*a*c ==0: 62 | # return -b/(2*a) 63 | # elif (b**2-4*a*c) <0: 64 | # return '无解' 65 | # else: 66 | # n1=(-b+math.sqrt(b**2-4*a*c))/(2*a) 67 | # n2=(-b-math.sqrt(b**2-4*a*c))/(2*a) 68 | # return n1,n2 69 | # print(quadratic(2, 3, 1)) 70 | # print(quadratic(1, 4, 4)) 71 | 72 | # 参数组合 73 | # 74 | # 在Python中定义函数,可以用必选参数、默认参数、可变参数、关键字参数和命名关键字参数,这5种参数都可以组合使用。但是请注意,参数定义的顺序必须是:必选参数、默认参数、可变参数、命名关键字参数和关键字参数。 75 | # 76 | # 比如定义一个函数,包含上述若干种参数: 77 | 78 | # def f1(a, b, c=0, *args, **kw): 79 | # print('a =', a, 'b =', b, 'c =', c, 'args =', args, 'kw =', kw) 80 | # 81 | # def f2(a, b, c=0, *, d, **kw): 82 | # print('a =', a, 'b =', b, 'c =', c, 'd =', d, 'kw =', kw) 83 | # 84 | # args = (1, 2, 3) 85 | # kw = {'d': 99, 'x': '#'} 86 | # f1(args, kw) 87 | # f1(*args, kw) 88 | # f1(*args, *kw) 89 | # f1(*args, **kw) 90 | 91 | # f2(*args, **kw) 92 | 93 | # def fact(n): 94 | # if n==1: 95 | # return 1 96 | # return n * fact(n - 1) 97 | # 98 | # print(fact(5)) 99 | 100 | # s=0 101 | # def hanoi(n,a,b,c): 102 | # global s 103 | # if n==1: 104 | # s=s+1 105 | # print('第 %s 步:' % s) 106 | # print(a,'->',c) 107 | # else: 108 | # hanoi(n-1,a,c,b) #将前n-1个盘子从a移动到b上 109 | # hanoi(1, a, b, c) #将最底下的最后一个盘子从a移动到c上 110 | # hanoi(n - 1, b, a, c) #将b上的n-1个盘子移动到c上 111 | # 112 | # 113 | # hanoi(3,'A','B','C') 114 | 115 | # 汉诺塔 http://baike.baidu.com/item/%E6%B1%89%E8%AF%BA%E5%A1%94/3468295 116 | 117 | # B=[] 118 | # def move(n,a,b,c): 119 | # if n==1: 120 | # buzhou=a+str(n)+'-->'+c+str(n)+'first' 121 | # B.append(buzhou) 122 | # return 123 | # else: 124 | # move(n-1,a,c,b) 125 | # buzhou = a + str(n) + '-->' + c + str(n)+'seco' 126 | # B.append(buzhou) 127 | # move(n-1,b,a,c) 128 | # move(3,'A','B','C') 129 | # print('共需操作'+str(len(B))+'次','操作过程为',B) 130 | # 共需操作7次 操作过程为 131 | # ['A1-->C1first', 'A2-->B2seco', 'C1-->B1first', 'A3-->C3seco', 'B1-->A1first', 'B2-->C2seco', 'A1-->C1first'] 132 | 133 | # 134 | # L1 = ['Hello', 'World', 18, 'Apple', None] 135 | # 
L2 = [s.lower() for s in L1 if isinstance(s,str)==True] 136 | # L3 = [s.lower() if isinstance(s,str) else s for s in L1] 137 | # L4 = [s.upper() if isinstance(s,str) is True else s for s in L1] 138 | # L5 = [s[:1].upper()+s[1:].lower() if isinstance(s,str) else s for s in L1] 139 | # print('L1:',L1) 140 | # print('L2:',L2) 141 | # print('L3:',L3) 142 | # print('L4:',L4) 143 | # print('L5:',L5) 144 | 145 | # class Solution(object): 146 | # def removeDuplicates(self, nums): 147 | # """ 148 | # :type nums: List[int] 149 | # :rtype: int 150 | # """ 151 | # if not nums: 152 | # return 0 153 | # 154 | # newTail = 0 155 | # 156 | # for i in range(1, len(nums)): 157 | # if nums[i] != nums[newTail]: 158 | # newTail += 1 159 | # nums[newTail] = nums[i] 160 | # 161 | # return newTail + 1 162 | 163 | # 35. Search Insert Position 164 | # def searchInsert( nums, target): 165 | # if (len(nums) == 0): 166 | # return 0 167 | # 168 | # start = 0 169 | # end = len(nums) - 1 170 | # while (start + 1 < end): 171 | # mid = start + (end - start) // 2 172 | # if (nums[mid] == target): 173 | # return mid 174 | # elif (nums[mid] < target): 175 | # start = mid 176 | # else: 177 | # end = mid 178 | # 179 | # if target <= nums[start]: 180 | # return start 181 | # elif target <= nums[end]: 182 | # return end 183 | # else: 184 | # return end + 1 185 | # 186 | # print(searchInsert([1,3,5,6],4)) 187 | 188 | # from PIL import Image 189 | # im = Image.open('test1.jpg') 190 | # print(im.format, im.size, im.mode) 191 | # im.thumbnail((540,405)) 192 | # im.save('test22.jpg','JPEG') 193 | # import sys 194 | # sys.path 195 | 196 | # class Student(object): 197 | # 198 | # @property 199 | # def birth(self): 200 | # return self._birth 201 | # 202 | # @birth.setter 203 | # def birth(self, value): 204 | # self._birth = value 205 | # 206 | # @property 207 | # def age(self): 208 | # return 2015 - self._birth 209 | # 210 | # 211 | # 212 | # s=Student() 213 | # s.birth=2000 214 | # print(s.birth) 215 | # print(s.age) 216 | 217 | # class Solution(object): 218 | # def findDisappearedNumbers(self, nums): 219 | # """ 220 | # :type nums: List[int] 221 | # :rtype: List[int] 222 | # """ 223 | # # For each number i in nums, 224 | # # we mark the number that i points as negative. 
225 | # # Then we filter the list, get all the indexes 226 | # # who points to a positive number 227 | # for i in xrange(len(nums)): 228 | # index = abs(nums[i]) - 1 229 | # nums[index] = - abs(nums[index]) 230 | # 231 | # return [i + 1 for i in range(len(nums)) if nums[i] > 0] 232 | # 233 | # __repr__=findDisappearedNumbers 234 | # 235 | 236 | class Chain(object): 237 | def __init__(self, path=''): 238 | self._path = path 239 | 240 | # def __getattr__(self, path): 241 | # return Chain('%s/%s' % (self._path, path)) 242 | 243 | def __getattr__(self, path): 244 | if path in ['users', 'group']: 245 | return Chain('%s' % self._path) 246 | else: 247 | return Chain('%s/%s' % (self._path, path)) 248 | 249 | def __call__(self, path): 250 | return Chain('%s/%s' % (self._path, path)) 251 | 252 | def __str__(self): 253 | return self._path 254 | 255 | __repr__ = __str__ 256 | 257 | 258 | # print(Chain().status.user.timeline.list) 259 | print(Chain().users('michael').group('student').repos) 260 | # /status/user/timeline/list 261 | # /users/michael/group/student/repos 262 | # /michael/student/repos 263 | 264 | # 调用时,需要把:user替换为实际用户名。如果我们能写出这样的链式调用: 265 | # In: 266 | # Chain().users('Michael').group('student').repos 267 | # 268 | # Out: 269 | # GET/Michael/student/repos 270 | -------------------------------------------------------------------------------- /work_one/templates/form.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Please Sign In 6 | 7 | 8 | {% if message %} 9 |

<p style="color:red">{{message}}</p>
10 | {% endif %} 11 | <form action="/signin" method="post">
12 | <legend>Please sign in:</legend> 13 | <p><input name="username" placeholder="Username" value="{{ username }}"></p> 14 | <p><input name="password" placeholder="Password" type="password"></p> 15 | <p><button type="submit">Sign In</button></p> 16 | </form>
17 | 18 | 19 | -------------------------------------------------------------------------------- /work_one/templates/home.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Home 6 | 7 | 8 |

<h1>Home</h1>
9 | 10 | -------------------------------------------------------------------------------- /work_one/templates/signin-ok.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Wlecome,{{username}} 6 | 7 | 8 |

<p>Welcome,{{username}}!</p>
9 | 10 | -------------------------------------------------------------------------------- /work_one/test.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/test.db -------------------------------------------------------------------------------- /work_one/test1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/test1.jpg -------------------------------------------------------------------------------- /work_one/test22.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/test22.jpg -------------------------------------------------------------------------------- /work_one/test33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/test33.png -------------------------------------------------------------------------------- /work_one/test44.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_one/test44.bmp -------------------------------------------------------------------------------- /work_one/work_GUI.py: -------------------------------------------------------------------------------- 1 | from tkinter import * 2 | import tkinter.messagebox as messagebox 3 | 4 | class Application(Frame): 5 | def __init__(self,master=None): 6 | Frame.__init__(self,master) 7 | self.pack() 8 | self.createWidgets() 9 | 10 | def createWidgets(self): 11 | self.nameInput=Entry(self) 12 | self.nameInput.pack() 13 | self.quitButton=Button(self,text='Hello',command=self.hello) 14 | self.quitButton.pack() 15 | 16 | def hello(self): 17 | name=self.nameInput.get() or 'world' 18 | messagebox.showinfo('Message','Hello,%s' % name) 19 | 20 | 21 | app=Application() 22 | app.master.title('Hello') 23 | app.mainloop() -------------------------------------------------------------------------------- /work_one/work_HTMLParser.py: -------------------------------------------------------------------------------- 1 | # from html.parser import HTMLParser 2 | # import urllib.request 3 | # 4 | # response = urllib.request.urlopen('https://www.python.org/events/python-events/') 5 | # class PythonEvent(HTMLParser): 6 | # def __init__(self): 7 | # super(PythonEvent, self).__init__() 8 | # self.key = 0 9 | # self.location_key = 0 10 | # self.event_list = [] 11 | # self.event_tmp = [] 12 | # def handle_starttag(self, tag, attrs): 13 | # if attrs: 14 | # if attrs[0][1] == 'event-title' or tag == 'time': 15 | # self.key = 1 # self.key=1表示data需要保存 16 | # if attrs[0][1] == 'event-location': 17 | # self.key = 1 18 | # self.location_key =1 # self.location_key=1表示单个data信息结尾 19 | # 20 | # def handle_data(self, data): 21 | # if self.key: 22 | # self.event_tmp.append(data) 23 | # if self.location_key: 24 | # self.event_list.append(self.event_tmp) # event_tmp保存进list并重置 25 | # self.event_tmp = [] 26 | # 27 | # def handle_endtag(self, tag): 28 | # self.key = 0 29 | # self.location_key = 0 30 | # 31 | # event = PythonEvent() 32 | # 
event.feed(response.read().decode('utf-8')) 33 | # for i in event.event_list: 34 | # print(i) 35 | 36 | #from urllib import request 37 | 38 | # with request.urlopen('https://api.douban.com/v2/book/2129650') as f: 39 | # data = f.read() 40 | # print('Status:',f.status,f.reason) 41 | # for k, v in f.getheaders(): 42 | # print('%s:%s' % (k,v)) 43 | # print('Data:', data.decode('utf-8')) 44 | 45 | # from urllib import request 46 | # 47 | # req=request.Request('http://www.douban.com/') 48 | # req.add_header('User-Agent','Mozilla/6.0 (iphone; CPU iphone os 8_0 like Mac OS X) AppleWebkit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25') 49 | # with request.urlopen(req) as f: 50 | # print('Status:', f.status, f.reason) 51 | # for k,v in f.getheaders(): 52 | # print('%s:%s' %(k,v)) 53 | # print('Data:', f.read().decode('utf-8')) 54 | 55 | from urllib import request, parse 56 | 57 | print('Login to weibo.cn...') 58 | email = input('Email: ') 59 | passwd = input('Password: ') 60 | login_data = parse.urlencode([ 61 | ('username', email), 62 | ('password', passwd), 63 | ('entry', 'mweibo'), 64 | ('client_id', ''), 65 | ('savestate', '1'), 66 | ('ec', ''), 67 | ('pagerefer', 'https://passport.weibo.cn/signin/welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F') 68 | ]) 69 | 70 | req = request.Request('https://passport.weibo.cn/sso/login') 71 | req.add_header('Origin', 'https://passport.weibo.cn') 72 | req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25') 73 | req.add_header('Referer', 'https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F') 74 | 75 | with request.urlopen(req, data=login_data.encode('utf-8')) as f: 76 | print('Status:', f.status, f.reason) 77 | for k, v in f.getheaders(): 78 | print('%s: %s' % (k, v)) 79 | print('Data:', f.read().decode('utf-8')) -------------------------------------------------------------------------------- /work_one/work_PILImageDraw.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw, ImageFont, ImageFilter 2 | 3 | import random 4 | 5 | # random letters,num,chinese 6 | def rndChar(): 7 | # return chr(random.randint(65,90)) 8 | # random Upper and lower case letters 9 | letterChr=chr(random.choice([x for x in range(65,123) if x not in [z for z in range(91,97)]])) 10 | #random num 11 | numChr=str(random.randint(0,9)) 12 | # random chinese 13 | cnChr=random.choice('我是中文汉字') 14 | return random.choice((letterChr,numChr,cnChr)) 15 | # random color1 16 | def rndColor(): 17 | return (random.randint(64,255), random.randint(64,255),random.randint(64,255)) 18 | 19 | # random color2 20 | def rndColor2(): 21 | return (random.randint(32,127), random.randint(32,127), random.randint(32,127)) 22 | 23 | # 240 * 60 24 | width = 60 * 4 25 | height = 60 26 | image = Image.new('RGB',(width,height),(255,255,255)) 27 | # create font 28 | font = ImageFont.truetype('Arial.ttf',36) 29 | # create draw 30 | draw= ImageDraw.Draw(image) 31 | # fill each pixel 32 | for x in range(width): 33 | for y in range(height): 34 | draw.point((x,y), fill=rndColor()) 35 | 36 | # output text 37 | for t in range(4): 38 | draw.text((60*t+10,10), rndChar(),font=font,fill=rndColor2()) 39 | 40 | # fuzzy 41 | image = image.filter(ImageFilter.BLUR) 42 | image.save('code.jpg','jpeg') 43 | 44 | -------------------------------------------------------------------------------- 
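A quick design note on work_PILImageDraw.py above: it draws four random characters but never keeps them, so the saved code.jpg cannot later be checked against what a user types in. A hedged sketch of a variant where the caller chooses the text and can therefore validate it afterwards; the name make_captcha and its signature are assumptions, not repository code:

from PIL import Image, ImageDraw, ImageFilter, ImageFont

def make_captcha(chars, width=240, height=60):
    # same drawing steps as the script above, but the text is supplied by the caller,
    # so it can be stored and compared with user input later
    image = Image.new('RGB', (width, height), (255, 255, 255))
    font = ImageFont.truetype('Arial.ttf', 36)
    draw = ImageDraw.Draw(image)
    for t, ch in enumerate(chars):
        draw.text((60 * t + 10, 10), ch, font=font, fill=(0, 0, 0))
    return image.filter(ImageFilter.BLUR)

# e.g. image = make_captcha('A3x汉'); image.save('code.jpg', 'jpeg')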
/work_one/work_TCP_client.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 4 | # 建立连接: 5 | s.connect(('127.0.0.1', 9999)) 6 | # 接收欢迎消息: 7 | print(s.recv(1024).decode('utf-8')) 8 | for data in [b'Michael', b'Tracy', b'Sarah']: 9 | # 发送数据: 10 | s.send(data) 11 | print(s.recv(1024).decode('utf-8')) 12 | s.send(b'exit') 13 | s.close() -------------------------------------------------------------------------------- /work_one/work_TCP_server.py: -------------------------------------------------------------------------------- 1 | # import socket 2 | # 3 | # s=socket.socket(socket.AF_INET,socket.SOCK_STREAM) 4 | # s.connect(('www.sina.com.cn', 80)) 5 | # 6 | # s.send(b'GET / HTTP/1.1\r\nHost: www.sina.com.cn\r\nConnection: close\r\n\r\n') 7 | # 8 | # buffer = [] 9 | # while True: 10 | # d=s.recv(1024) 11 | # if d: 12 | # buffer.append(d) 13 | # else: 14 | # break 15 | # 16 | # data=b''.join(buffer) 17 | # 18 | # s.close() 19 | # 20 | # header, html = data.split(b'\r\n\r\n',1) 21 | # print(header.decode('utf-8')) 22 | # 23 | # with open('sina.html','wb') as f: 24 | # f.write(html) 25 | 26 | import socket,threading,time 27 | 28 | def tcplink(sock, addr): 29 | print('Accept new connection from %s:%s...' % addr) 30 | sock.send(b'Welcome!') 31 | while True: 32 | data = sock.recv(1024) 33 | time.sleep(1) 34 | if not data or data.decode('utf-8') == 'exit': 35 | break 36 | sock.send(('Hello, %s!' % data.decode('utf-8')).encode('utf-8')) 37 | sock.close() 38 | print('Connection from %s:%s closed.' % addr) 39 | 40 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 41 | s.bind(('127.0.0.1', 9999)) 42 | s.listen(5) 43 | print('Waiting for connection...') 44 | 45 | while True: 46 | # 接受一个新连接: 47 | sock, addr = s.accept() 48 | # 创建新线程来处理TCP连接: 49 | t = threading.Thread(target=tcplink, args=(sock, addr)) 50 | t.start() -------------------------------------------------------------------------------- /work_one/work_UDP_client.py: -------------------------------------------------------------------------------- 1 | import socket 2 | s=socket.socket(socket.AF_INET,socket.SOCK_DGRAM) 3 | for data in [b'Michael',b'Tra',b'Sarah']: 4 | s.sendto(data,('127.0.0.1',9999)) 5 | print(s.recv(1024).decode('utf-8')) 6 | s.close() -------------------------------------------------------------------------------- /work_one/work_UDP_server.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | s= socket.socket(socket.AF_INET,socket.SOCK_DGRAM) 4 | s.bind(('127.0.0.1',9999)) 5 | print('Bind UDP on 9999') 6 | while True: 7 | data,addr=s.recvfrom(1024) 8 | print('Received from %s:%s' %addr) 9 | s.sendto(b'Hello, %s!' % data, addr) 10 | 11 | -------------------------------------------------------------------------------- /work_one/work_data_MYSQL.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import pymysql 4 | 5 | # 打开数据库连接 6 | db = pymysql.connect("localhost","test","password","TESTDB" ) 7 | 8 | # 使用 cursor() 方法创建一个游标对象 cursor 9 | cursor = db.cursor() 10 | 11 | # 使用 execute() 方法执行 SQL 查询 12 | # cursor.execute("SELECT VERSION()") 13 | 14 | # 使用 fetchone() 方法获取单条数据. 
15 | # data = cursor.fetchone() 16 | 17 | # print ("Database version : %s " % data) 18 | 19 | # 使用 execute() 方法执行 SQL,如果表存在则删除 20 | # cursor.execute("DROP TABLE IF EXISTS EMPLOYEE") 21 | 22 | # 使用预处理语句创建表 23 | # sql = """CREATE TABLE EMPLOYEE ( 24 | # FIRST_NAME CHAR(20) NOT NULL, 25 | # LAST_NAME CHAR(20), 26 | # AGE INT, 27 | # SEX CHAR(1), 28 | # INCOME FLOAT )""" 29 | # 30 | # 31 | # cursor.execute(sql) 32 | 33 | 34 | 35 | # SQL 插入语句 36 | # sql = """INSERT INTO EMPLOYEE(FIRST_NAME, 37 | # LAST_NAME, AGE, SEX, INCOME) 38 | # VALUES ('Mac', 'Mohan', 20, 'M', 2000)""" 39 | 40 | # sql = "INSERT INTO EMPLOYEE(FIRST_NAME, \ 41 | # LAST_NAME, AGE, SEX, INCOME) \ 42 | # VALUES ('%s', '%s', '%d', '%c', '%d' )" % \ 43 | # ('Mac2', 'Mohan2', 202, 'M', 2000) 44 | # try: 45 | # # 执行sql语句 46 | # cursor.execute(sql) 47 | # # 提交到数据库执行 48 | # db.commit() 49 | # except: 50 | # # 如果发生错误则回滚 51 | # db.rollback() 52 | 53 | 54 | # SQL 查询语句 55 | # sql = "SELECT * FROM EMPLOYEE \ 56 | # WHERE INCOME > '%d'" % (1000) 57 | # 58 | # try: 59 | # # 执行SQL语句 60 | # cursor.execute(sql) 61 | # # 获取所有记录列表 62 | # results = cursor.fetchall() 63 | # for row in results: 64 | # fname = row[0] 65 | # lname = row[1] 66 | # age = row[2] 67 | # sex = row[3] 68 | # income = row[4] 69 | # # 打印结果 70 | # print ("fname=%s,lname=%s,age=%d,sex=%s,income=%d" % \ 71 | # (fname, lname, age, sex, income )) 72 | # except: 73 | # print ("Error: unable to fetch data") 74 | 75 | # SQL 更新语句 76 | # sql = "UPDATE EMPLOYEE SET AGE = AGE + 1\ 77 | # WHERE SEX = '%c'" % ('M') 78 | 79 | # SQL 删除语句 80 | sql = "DELETE FROM EMPLOYEE WHERE AGE > '%d'" % (200) 81 | 82 | try: 83 | # 执行SQL语句 84 | cursor.execute(sql) 85 | # 提交到数据库执行 86 | db.commit() 87 | except: 88 | # 发生错误时回滚 89 | db.rollback() 90 | 91 | # 关闭数据库连接 92 | db.close() -------------------------------------------------------------------------------- /work_one/work_data_SQLAlchemy.py: -------------------------------------------------------------------------------- 1 | # 导入: 2 | from sqlalchemy import Column, String, Integer,create_engine 3 | from sqlalchemy.orm import sessionmaker 4 | from sqlalchemy.ext.declarative import declarative_base 5 | 6 | # 创建对象的基类: 7 | Base = declarative_base() 8 | 9 | # 定义User对象: 10 | class User(Base): 11 | # 表的名字: 12 | __tablename__ = 'test' 13 | 14 | # 表的结构: 15 | id = Column(Integer, primary_key=True) 16 | name = Column(String(50)) 17 | age=Column(Integer) 18 | 19 | # 初始化数据库连接: 20 | engine = create_engine('mysql+pymysql://root:password@localhost:3306/sqltest') 21 | # 创建DBSession类型: 22 | DBSession = sessionmaker(bind=engine) 23 | 24 | # 创建session对象: 25 | session = DBSession() 26 | # 创建新User对象: 27 | new_user = User(id=2, name='Bob',age=22) 28 | # 添加到session: 29 | session.add(new_user) 30 | # 提交即保存到数据库: 31 | session.commit() 32 | 33 | 34 | # 创建Query查询,filter是where条件,最后调用one()返回唯一行,如果调用all()则返回所有行: 35 | user = session.query(User).filter(User.id==5).one() 36 | # 打印类型和对象的name属性: 37 | print('type:', type(user)) 38 | print('name:', user.name) 39 | 40 | 41 | # 关闭session: 42 | session.close() 43 | 44 | 45 | -------------------------------------------------------------------------------- /work_one/work_data_SQLite.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, sqlite3 4 | 5 | db_file = os.path.join(os.path.dirname(__file__), 'test.db') 6 | if os.path.isfile(db_file): 7 | os.remove(db_file) 8 | 9 | # 初始数据: 10 | conn = sqlite3.connect(db_file) 11 | cursor = conn.cursor() 12 | cursor.execute('create 
table user(id varchar(20) primary key, name varchar(20), score int)') 13 | cursor.execute(r"insert into user values ('A-001', 'Adam', 95)") 14 | cursor.execute(r"insert into user values ('A-002', 'Bart', 62)") 15 | cursor.execute(r"insert into user values ('A-003', 'Lisa', 78)") 16 | cursor.close() 17 | conn.commit() 18 | conn.close() -------------------------------------------------------------------------------- /work_one/work_mail_POP3.py: -------------------------------------------------------------------------------- 1 | # receive mail 2 | from email.parser import Parser 3 | from email.header import decode_header 4 | from email.utils import parseaddr 5 | 6 | import poplib 7 | 8 | # 输入邮件地址, 口令和POP3服务器地址: 9 | # email = input('Email: ') 10 | # password = input('Password: ') 11 | # pop3_server = input('POP3 server: ') 12 | 13 | # 163 14 | # email = 'xindy138@163.com' 15 | # password = '****' 16 | # pop3_server = 'pop3.163.com' 17 | 18 | # qq 特殊端口995 和验证码 19 | email = '373128869@qq.com' 20 | password = 'qhhtlbcexpnzcbac' 21 | pop3_server = 'pop.qq.com' 22 | pop3_port = 995 23 | 24 | 25 | # indent用于缩进显示: 26 | def print_info(msg, indent=0): 27 | if indent == 0: 28 | for header in ['From', 'To', 'Subject']: 29 | value = msg.get(header, '') 30 | if value: 31 | if header == 'Subject': 32 | value = decode_str(value) 33 | else: 34 | hdr, addr = parseaddr(value) 35 | name = decode_str(hdr) 36 | value = u'%s <%s>' % (name, addr) 37 | print('%s%s: %s' % (' ' * indent, header, value)) 38 | if (msg.is_multipart()): 39 | parts = msg.get_payload() 40 | for n, part in enumerate(parts): 41 | print('%spart %s' % (' ' * indent, n)) 42 | print('%s--------------------' % (' ' * indent)) 43 | print_info(part, indent + 1) 44 | else: 45 | content_type = msg.get_content_type() 46 | if content_type == 'text/plain' or content_type == 'text/html': 47 | content = msg.get_payload(decode=True) 48 | charset = guess_charset(msg) 49 | if charset: 50 | content = content.decode(charset) 51 | print('%sText: %s' % (' ' * indent, content + '...')) 52 | else: 53 | print('%sAttachment: %s' % (' ' * indent, content_type)) 54 | 55 | 56 | def decode_str(s): 57 | value, charset = decode_header(s)[0] 58 | if charset: 59 | value = value.decode(charset) 60 | return value 61 | 62 | 63 | def guess_charset(msg): 64 | charset = msg.get_charset() 65 | if charset is None: 66 | content_type = msg.get('Content-Type', '').lower() 67 | pos = content_type.find('charset=') 68 | if pos >= 0: 69 | charset = content_type[pos + 8:].strip() 70 | return charset 71 | 72 | 73 | # 连接到POP3服务器: 74 | server = poplib.POP3_SSL(pop3_server, pop3_port) 75 | # 可以打开或关闭调试信息: 76 | server.set_debuglevel(1) 77 | # 可选:打印POP3服务器的欢迎文字: 78 | print(server.getwelcome().decode('utf-8')) 79 | 80 | # 身份认证: 81 | server.user(email) 82 | server.pass_(password) 83 | 84 | # stat()返回邮件数量和占用空间: 85 | print('Messages: %s. Size: %s' % server.stat()) 86 | # list()返回所有邮件的编号: 87 | resp, mails, octets = server.list() 88 | # 可以查看返回的列表类似[b'1 82923', b'2 2184', ...] 
89 | print(mails) 90 | 91 | # 获取最新一封邮件, 注意索引号从1开始: 92 | index = len(mails) 93 | resp, lines, octets = server.retr(index) 94 | 95 | # lines存储了邮件的原始文本的每一行, 96 | # 可以获得整个邮件的原始文本: 97 | msg_content = b'\r\n'.join(lines).decode('utf-8') 98 | # 稍后解析出邮件: 99 | msg = Parser().parsestr(msg_content) 100 | 101 | print_info(msg) 102 | # 可以根据邮件索引号直接从服务器删除邮件: 103 | # server.dele(index) 104 | # 关闭连接: 105 | server.quit() 106 | -------------------------------------------------------------------------------- /work_one/work_mail_SMTP.py: -------------------------------------------------------------------------------- 1 | # send mail 2 | from email import encoders 3 | from email.header import Header 4 | from email.mime.base import MIMEBase 5 | from email.mime.multipart import MIMEMultipart 6 | from email.mime.text import MIMEText 7 | from email.utils import parseaddr, formataddr 8 | import smtplib 9 | 10 | 11 | def _format_addr(s): 12 | name, addr = parseaddr(s) 13 | return formataddr((Header(name, 'utf-8').encode(), addr)) 14 | 15 | 16 | # from_addr = input('From:') 17 | # password = input('Password:') 18 | # to_addr = input('To:') 19 | # smtp_server = input('SMTP server:') 20 | 21 | # 163 22 | # from_addr = 'xindy138@163.com' 23 | # password = '*******' 24 | # to_addr = '373128869@qq.com' 25 | # smtp_server = 'smtp.163.com' 26 | # smtp_port = 25 27 | 28 | # qq 特殊端口465 和验证码 29 | from_addr = '373128869@qq.com' 30 | password = 'qhhtlbcexpnzcbac' 31 | to_addr = 'demiyuhongjun@gmail.com' 32 | smtp_server = 'smtp.qq.com' 33 | smtp_port = 465 34 | 35 | # msg = MIMEText('

<html><body><h1>Hello</h1>
' + 36 | # '

<p>send by Python...</p>
' + 37 | # '</body></html>', 'html', 'utf-8') 38 | msg = MIMEMultipart('alternative') # 同时支持HTML和Plain格式 如果收件人无法查看HTML格式的邮件,就可以自动降级查看纯文本邮件 39 | msg.attach(MIMEText('send with file...', 'plain', 'utf-8')) 40 | msg.attach(MIMEText('

<html><body><h1>Hello</h1>
' + 41 | '<p><img src="cid:0"></p>
' + 42 | '', 'html', 'utf-8')) 43 | msg['From'] = _format_addr('Python爱好者 <%s>' % from_addr) 44 | msg['To'] = _format_addr('管理员 <%s>' % to_addr) 45 | msg['Subject'] = Header('来自SMTP的问候……', 'utf-8').encode() 46 | 47 | with open('test33.png', 'rb') as f: 48 | # 设置附件的MIME和文件名,这里是png类型: 49 | mime = MIMEBase('image', 'png', filename='test33.png') 50 | # 加上必要的头信息: 51 | mime.add_header('Content-Disposition', 'attachment', filename='test33.png') 52 | mime.add_header('Content-ID', '<0>') 53 | mime.add_header('X-Attachment-Id', '0') 54 | # 把附件的内容读进来: 55 | mime.set_payload(f.read()) 56 | # 用Base64编码: 57 | encoders.encode_base64(mime) 58 | # 添加到MIMEMultipart: 59 | msg.attach(mime) 60 | 61 | server = smtplib.SMTP_SSL(smtp_server, smtp_port) 62 | server.set_debuglevel(1) 63 | server.login(from_addr, password) 64 | server.sendmail(from_addr, [to_addr], msg.as_string()) 65 | server.quit() 66 | -------------------------------------------------------------------------------- /work_one/work_register.py: -------------------------------------------------------------------------------- 1 | # from PIL import Image 2 | # im = Image.open('test1.jpg') 3 | # print(im.format, im.size, im.mode) 4 | # im.thumbnail((540,405)) 5 | # im.save('test44.bmp','BMP') 6 | 7 | # import struct 8 | # 9 | # def judge(url): 10 | # with open(url,'rb') as f: 11 | # s=(f.read(30)) 12 | # t=(struct.unpack(' 106 | # 107 | # 108 | # Yahoo! Weather - Beijing, CN 109 | # Wed, 27 May 2015 11:00 am CST 110 | # 111 | # 112 | # 113 | # 114 | # 115 | # 116 | # 39.91 117 | # 116.39 118 | # Wed, 27 May 2015 11:00 am CST 119 | # 120 | # 121 | # 122 | # 123 | # 124 | # 125 | # 126 | # 127 | # 128 | # ''' 129 | 130 | 131 | # weather = parse_weather(data) 132 | 133 | # assert weather['city'] == 'Beijing', weather['city'] 134 | # assert weather['country'] == 'China', weather['country'] 135 | # assert weather['today']['text'] == 'Partly Cloudy', weather['today']['text'] 136 | # assert weather['today']['low'] == 20, weather['today']['low'] 137 | # assert weather['today']['high'] == 33, weather['today']['high'] 138 | # assert weather['tomorrow']['text'] == 'Sunny', weather['tomorrow']['text'] 139 | # assert weather['tomorrow']['low'] == 21, weather['tomorrow']['low'] 140 | # assert weather['tomorrow']['high'] == 34, weather['tomorrow']['high'] 141 | # print('Weather:', str(weather)) 142 | 143 | def get_weather(city): # 输入城市名(拼音)字符串,输出天气dict 144 | baseurl = "https://query.yahooapis.com/v1/public/yql?" 
145 | yql_query = 'select * from weather.forecast where woeid in (select woeid from geo.places(1) where text="%s")' % city 146 | yql_url = baseurl + urllib.parse.urlencode({'q':yql_query}) 147 | print(yql_url) 148 | with urllib.request.urlopen(yql_url) as f: 149 | city_xml = f.read().decode('utf-8') 150 | city_weather = parse_weather(city_xml) 151 | return city_weather 152 | 153 | def main(): 154 | city = input('Weather Forecast in City: ') 155 | print(get_weather(city)) 156 | 157 | main() -------------------------------------------------------------------------------- /work_two_Crawler/86.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_two_Crawler/86.jpg -------------------------------------------------------------------------------- /work_two_Crawler/93.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_two_Crawler/93.jpg -------------------------------------------------------------------------------- /work_two_Crawler/94.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuHongJun/python-training/6ffafe6c781f29315cf1bb08ea3cec6020f078cd/work_two_Crawler/94.jpg -------------------------------------------------------------------------------- /work_two_Crawler/Download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | #http://cuiqingcai.com/3256.html 4 | #http://www.xicidaili.com/nt/ 代理IP地址 5 | __author__ = 'Demi Yu' 6 | 7 | #限制IP访问频率,超过频率就断开连接。(这种方法解决办法就是,降低爬虫的速度在每个请求前面加上time.sleep;或者不停的更换代理IP,这样就绕过反爬虫机制啦!) 8 | #后台对访问进行统计,如果单个userAgent访问超过阈值,予以封锁。(效果出奇的棒!不过误伤也超级大,一般站点不会使用,不过我们也考虑进去 9 | #上面讲过有的网站会限制相同的User-Agent的访问频率,那我们就给他随机来一个User-Agent,不停的更换代理IP好了!去百度一下User-Agent,我找到了下面这些: 10 | 11 | import requests 12 | import re 13 | import random 14 | import time 15 | from bs4 import BeautifulSoup 16 | 17 | class download(): 18 | 19 | def __init__(self): 20 | 21 | headers = { 22 | 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"} 23 | 24 | self.iplist = [] ##初始化一个list用来存放我们获取到的IP 25 | html = requests.get("http://www.xicidaili.com/nt/",headers=headers) ##不解释咯 26 | # iplistn = re.findall(r'r/>(.*?) 
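The opening comments of Download.py describe its strategy: rotate the User-Agent on every request, sleep and retry on failure, and only then fall back to proxy IPs scraped from xicidaili.com. A hedged sketch of the proxy-pool setup and the plain (no-proxy) branch of get() that this implies — the td regex, the user_agent_list contents and the name download_sketch are assumptions, not the repository's verbatim code:

import random, re, time
import requests

class download_sketch():
    def __init__(self):
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        self.iplist = []   # proxy pool scraped once from the listing page
        html = requests.get("http://www.xicidaili.com/nt/", headers=headers)
        for ip, port in re.findall(r'<td>(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)</td>', html.text):
            self.iplist.append(ip + ':' + port)
        # a pool of desktop User-Agent strings to rotate through (contents are placeholders)
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36",
        ]

    def get(self, url, timeout, proxy=None, num_retries=6):
        headers = {'User-Agent': random.choice(self.user_agent_list)}   # fresh User-Agent per request
        if proxy is None:
            try:
                return requests.get(url, headers=headers, timeout=timeout)
            except Exception:
                if num_retries > 0:
                    time.sleep(10)                                      # back off, then retry without a proxy
                    return self.get(url, timeout, num_retries=num_retries - 1)
                proxy = {'http': random.choice(self.iplist)}            # plain requests kept failing: pick a proxy
        # the proxy branch, which the file continues with below, retries through the scraped proxies
        return requests.get(url, headers=headers, proxies=proxy, timeout=timeout)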
0: ##num_retries是我们限定的重试次数 75 | time.sleep(10) ##延迟十秒 76 | print(u'获取网页出错,10S后将获取倒数第:', num_retries, u'次') 77 | return self.get(url, timeout, num_retries-1) ##调用自身 并将次数减1 78 | else: 79 | print(u'开始使用代理') 80 | time.sleep(10) 81 | IP = ''.join(str(random.choice(self.iplist[1:])).strip()) ##下面有解释哦 82 | proxy = {'http': IP} 83 | return self.get(url, timeout, proxy,) ##代理不为空的时候 84 | 85 | else: ##当代理不为空 86 | try: 87 | IP = ''.join(str(random.choice(self.iplist[1:])).strip()) ##将从self.iplist中获取的字符串处理成我们需要的格式(处理了些什么自己看哦,这是基础呢) 88 | proxy = {'http': IP} ##构造成一个代理 89 | return requests.get(url, headers=headers, proxies=proxy, timeout=timeout) ##使用代理获取response 90 | except: 91 | 92 | if num_retries > 0: 93 | time.sleep(10) 94 | IP = ''.join(str(random.choice(self.iplist[1:])).strip()) 95 | proxy = {'http': IP} 96 | print(u'正在更换代理,10S后将重新获取倒数第', num_retries, u'次') 97 | print(u'当前代理是:', proxy) 98 | return self.get(url, timeout, proxy, num_retries - 1) 99 | else: 100 | print(u'代理也不好使了!取消代理') 101 | return self.get(url, 3) 102 | 103 | request = download() ## 104 | 105 | 106 | # if __name__ == '__main__': 107 | # request=download() 108 | # request.get('http://www.mzitu.com/all',3) 109 | -------------------------------------------------------------------------------- /work_two_Crawler/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' -------------------------------------------------------------------------------- /work_two_Crawler/catch_blog.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | #爬取简书上某个文章地址的主体内容 4 | __author__ = 'Demi Yu' 5 | 6 | from bs4 import BeautifulSoup 7 | import requests 8 | import codecs 9 | 10 | 11 | def get_page(url): 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36' 14 | } 15 | data = requests.get(url, headers=headers).content 16 | return data 17 | 18 | 19 | def get_text(html): 20 | parser = BeautifulSoup(html, 'html.parser') 21 | article = parser.find('div', attrs={'class': 'article'}) # 定位文章 22 | title = article.find('h1', attrs={'class': 'title'}).get_text() # 获取标题 23 | text = [] # 创建空列表存放文章 24 | for paragraph in article.find_all('p'): 25 | paragraph_content = paragraph.get_text() 26 | text.append(paragraph_content) # 将文章一段一段的添加到列表中 27 | return title, text 28 | 29 | 30 | def save_text(title, text): 31 | file_name = title + '.txt' 32 | with codecs.open(file_name, 'wb', encoding='utf-8') as open_file: 33 | try: 34 | for p in text: 35 | open_file.write('\t%s\r\n' % p) # \t是tab制表符,\r\n是Carrige Return换行 36 | except Exception: 37 | print('发生了错误!') 38 | print('文章抓取完成!') 39 | return 40 | 41 | 42 | if __name__ == '__main__': 43 | url = 'http://www.jianshu.com/p/293c3b71416e' 44 | html = get_page(url) 45 | title, text = get_text(html) 46 | save_text(title, text) 47 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_blog3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | #爬取简书上某个文章地址的主体内容 4 | __author__ = 'Demi Yu' 5 | 6 | from bs4 import BeautifulSoup 7 | import requests 8 | import codecs 9 | 10 | 11 | def get_page(url): 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/50.0.2661.102 Safari/537.36' 14 | } 15 | data = requests.get(url, headers=headers).content 16 | return data 17 | 18 | 19 | def get_text(html): 20 | parser = BeautifulSoup(html, 'html.parser') 21 | article = parser.find('div', attrs={'class': 'article'}) # 定位文章 22 | title = article.find('h1', attrs={'class': 'title'}).get_text() # 获取标题 23 | text = [] # 创建空列表存放文章 24 | for paragraph in article.find_all('p'): 25 | paragraph_content = paragraph.get_text() 26 | text.append(paragraph_content) # 将文章一段一段的添加到列表中 27 | return title, text 28 | 29 | 30 | def save_text(title, text): 31 | file_name = title + '.txt' 32 | with codecs.open(file_name, 'wb', encoding='utf-8') as open_file: 33 | try: 34 | for p in text: 35 | open_file.write('\t%s\r\n' % p) # \t是tab制表符,\r\n是Carrige Return换行 36 | except Exception: 37 | print('发生了错误!') 38 | print('文章抓取完成!') 39 | return 40 | 41 | 42 | if __name__ == '__main__': 43 | url = 'http://www.jianshu.com/p/293c3b71416e' 44 | html = get_page(url) 45 | title, text = get_text(html) 46 | save_text(title, text) 47 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_blog5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | #爬取简书上某个文章地址的主体内容 4 | __author__ = 'Demi Yu' 5 | 6 | from bs4 import BeautifulSoup 7 | import requests 8 | import codecs 9 | 10 | 11 | def get_page(url): 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36' 14 | } 15 | data = requests.get(url, headers=headers).content 16 | return data 17 | 18 | 19 | def get_text(html): 20 | parser = BeautifulSoup(html, 'html.parser') 21 | article = parser.find('div', attrs={'class': 'article'}) # 定位文章 22 | title = article.find('h1', attrs={'class': 'title'}).get_text() # 获取标题 23 | text = [] # 创建空列表存放文章 24 | for paragraph in article.find_all('p'): 25 | paragraph_content = paragraph.get_text() 26 | text.append(paragraph_content) # 将文章一段一段的添加到列表中 27 | return title, text 28 | 29 | 30 | def save_text(title, text): 31 | file_name = title + '.txt' 32 | with codecs.open(file_name, 'wb', encoding='utf-8') as open_file: 33 | try: 34 | for p in text: 35 | open_file.write('\t%s\r\n' % p) # \t是tab制表符,\r\n是Carrige Return换行 36 | except Exception: 37 | print('发生了错误!') 38 | print('文章抓取完成!') 39 | return 40 | 41 | 42 | if __name__ == '__main__': 43 | url = 'http://www.jianshu.com/p/293c3b71416e' 44 | html = get_page(url) 45 | title, text = get_text(html) 46 | save_text(title, text) 47 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_img.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # 抓取天极图片网某个网址的全部图片 4 | __author__ = 'Demi Yu' 5 | 6 | import re 7 | import urllib.request 8 | import urllib 9 | 10 | 11 | def download_page(url): 12 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) ' 13 | 14 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'} 15 | 16 | request = urllib.request.Request(url, headers=headers) 17 | response = urllib.request.urlopen(request) 18 | data = response.read() 19 | return data 20 | 21 | 22 | def get_image(html): 23 | regx = r'http://[\S]*\.jpg' # 定义正则表达式,意思是所有以.jpg格式结尾的网址 24 | pattern = re.compile(regx) 25 | get_img = re.findall(pattern, repr(html)) # 
用repr方式将初始网址转换为字符串,然后开始按照预定的模式进行查找,将所有符合条件的网址都放入内存中 26 | num = 1 27 | for img in get_img: 28 | image = download_page(img) # 将每个img连接重新解析 29 | with open('%s.jpg' % num, 'wb') as fp: 30 | fp.write(image) 31 | num += 1 32 | print('正在下载第%s 张图片' % num) 33 | return 34 | 35 | 36 | url = 'http://pic.yesky.com/c/6_61112.shtml' 37 | html = download_page(url) 38 | get_image(html) 39 | 40 | # 但是往往裸奔版对一些网站是爬不了的,这时就需要对爬虫进行一些伪装了。伪装浏览器或者加入延时。 41 | 42 | # 伪装的话直接把request请求改成 43 | 44 | # headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) ' 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'} 45 | 46 | # request = urllib.request.Request(url,headers=headers) 47 | 48 | # 这样就成功完成伪装了 49 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_mongo_mzui.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # http://cuiqingcai.com/3179.html 4 | __author__ = 'Demi Yu' 5 | 6 | from bs4 import BeautifulSoup 7 | import os 8 | 9 | from Download import request 10 | from pymongo import MongoClient 11 | import datetime 12 | 13 | class mzitu(): 14 | def __init__(self): 15 | client = MongoClient() ##与MongDB建立连接(这是默认连接本地MongDB数据库) 16 | db = client['meinvxiezhenji'] ## 选择一个数据库 17 | self.meizitu_collection = db['meizitu'] ##在meizixiezhenji这个数据库中,选择一个集合 18 | self.title = '' ##用来保存页面主题 19 | self.url = '' ##用来保存页面地址 20 | self.img_urls = [] ##初始化一个 列表 用来保存图片地址 21 | 22 | 23 | 24 | 25 | def all_url(self, url): 26 | html = request.get(url,3) ##调用request函数把套图地址传进去会返回给我们一个response 27 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a') 28 | retval = os.getcwd() # 查看当前工作目录 '/Users/yuhongjun/Python/python-training/work_two_Crawler' 29 | for a in all_a: 30 | title = a.get_text() 31 | self.title=title #将主题保存到self.title中 32 | print(u'开始保存:', title) ##加点提示不然太枯燥了 33 | path = str(title).replace("?", '_') ##我注意到有个标题带有 ? 这个符号Windows系统是不能创建文件夹的所以要替换掉 34 | self.mkdir(path) ##调用mkdir函数创建文件夹!这儿path代表的是标题title哦!!!!!不要糊涂了哦! 35 | href = a['href'] 36 | self.url=href #将页面地址保存到self.url中 37 | 38 | if self.meizitu_collection.find_one({'主题页面': href}): ##判断这个主题是否已经在数据库中、不在就运行else下的内容,在则忽略。 39 | print(u'这个页面已经爬取过了') 40 | else: 41 | self.html(href) ##调用html函数把href参数传递过去!href是啥还记的吧? 就是套图的地址哦!!不要迷糊了哦! 42 | 43 | os.chdir(retval) ##切换到目录 44 | 45 | def html(self, href): ##这个函数是处理套图地址获得图片的页面地址 46 | html = request.get(href,3) 47 | max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text() 48 | page_num = 0 # 这个当作计数器用 (用来判断图片是否下载完毕) 49 | # for page in range(1, 2): 50 | for page in range(1, int(max_span) + 1): 51 | page_num = page_num + 1 ##每for循环一次就+1 (当page_num等于max_span的时候,就证明我们的在下载最后一张图片了) 52 | page_url = href + '/' + str(page) 53 | self.img(page_url, max_span, page_num) ##调用img函数 把上面我们我们需要的两个变量,传递给下一个函数。 54 | 55 | def img(self, page_url, max_span, page_num): ##这个函数处理图片页面地址获得图片的实际地址 56 | img_html = request.get(page_url,3) 57 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src'] 58 | self.img_urls.append(img_url) ##每一次 for page in range(1, int(max_span) + 1)获取到的图片地址都会添加到 img_urls这个初始化的列表 59 | if int(max_span) == page_num: ##我们传递下来的两个参数用上了 当max_span和Page_num相等时,就是最后一张图片了,最后一次下载图片并保存到数据库中。 60 | self.save(img_url) 61 | post = { ##这是构造一个字典,里面有啥都是中文,很好理解吧! 
62 | '标题': self.title, 63 | '主题页面': self.url, 64 | '图片地址': self.img_urls, 65 | '获取时间': datetime.datetime.now() 66 | } 67 | self.meizitu_collection.save(post) ##将post中的内容写入数据库。 68 | print(u'插入数据库成功') 69 | else: ##max_span 不等于 page_num执行这下面 70 | self.save(img_url) 71 | 72 | def save(self, img_url): ##这个函数保存图片 73 | name = img_url[-9:-4] 74 | print(u'开始保存:', img_url) 75 | img = request.get(img_url,3) 76 | f = open(name + '.jpg', 'ab') 77 | f.write(img.content) 78 | f.close() 79 | 80 | def mkdir(self, path): ##这个函数创建文件夹 81 | path = path.strip() 82 | macPath="Pic/" 83 | isExists = os.path.exists(os.path.join(macPath, path)) 84 | if not isExists: 85 | print(u'建了一个名字叫做', path, u'的文件夹!') 86 | os.makedirs(os.path.join(macPath, path)) 87 | os.chdir(os.path.join(macPath, path)) ##切换到目录 88 | return True 89 | else: 90 | print(u'名字叫做', path, u'的文件夹已经存在了!') 91 | return False 92 | 93 | # def request(self, url): ##这个函数获取网页的response 然后返回 94 | # headers = { 95 | # 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"} 96 | # content = requests.get(url, headers=headers) 97 | # return content 98 | 99 | 100 | Mzitu = mzitu() ##实例化 101 | Mzitu.all_url('http://www.mzitu.com/all') ##给函数all_url传入参数 你可以当作启动爬虫(就是入口) 102 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_mongodb_mzi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | #http://cuiqingcai.com/3363.html 4 | 5 | __author__ = 'Demi Yu' 6 | 7 | import os 8 | import time 9 | import threading 10 | import multiprocessing 11 | from catch_mongodb_queue import MogoQueue 12 | from Download import request 13 | from bs4 import BeautifulSoup 14 | 15 | SLEEP_TIME = 1 16 | 17 | def mzitu_crawler(max_threads=10): 18 | crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue') ##这个是我们获取URL的队列 19 | ##img_queue = MogoQueue('meinvxiezhenji', 'img_queue') 20 | def pageurl_crawler(): 21 | while True: 22 | try: 23 | url = crawl_queue.pop() 24 | print(url) 25 | except KeyError: 26 | print('队列没有数据') 27 | break 28 | else: 29 | img_urls = [] 30 | req = request.get(url, 3).text 31 | title = crawl_queue.pop_title(url) 32 | mkdir(title) 33 | os.chdir('D:\mzitu\\' + title) 34 | max_span = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text() 35 | for page in range(1, int(max_span) + 1): 36 | page_url = url + '/' + str(page) 37 | img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', class_='main-image').find('img')['src'] 38 | img_urls.append(img_url) 39 | save(img_url) 40 | crawl_queue.complete(url) ##设置为完成状态 41 | ##img_queue.push_imgurl(title, img_urls) 42 | ##print('插入数据库成功') 43 | 44 | def save(img_url): 45 | name = img_url[-9:-4] 46 | print(u'开始保存:', img_url) 47 | img = request.get(img_url, 3) 48 | f = open(name + '.jpg', 'ab') 49 | f.write(img.content) 50 | f.close() 51 | 52 | def mkdir(path): 53 | path = path.strip() 54 | isExists = os.path.exists(os.path.join("D:\mzitu", path)) 55 | if not isExists: 56 | print(u'建了一个名字叫做', path, u'的文件夹!') 57 | os.makedirs(os.path.join("D:\mzitu", path)) 58 | return True 59 | else: 60 | print(u'名字叫做', path, u'的文件夹已经存在了!') 61 | return False 62 | 63 | threads = [] 64 | while threads or crawl_queue: 65 | """ 66 | 这儿crawl_queue用上了,就是我们__bool__函数的作用,为真则代表我们MongoDB队列里面还有数据 67 | threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行 68 | """ 69 | for thread in threads: 70 | if not thread.is_alive(): 
-------------------------------------------------------------------------------- /work_two_Crawler/catch_mongodb_mzi.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | #http://cuiqingcai.com/3363.html
4 | 
5 | __author__ = 'Demi Yu'
6 | 
7 | import os
8 | import time
9 | import threading
10 | import multiprocessing
11 | from catch_mongodb_queue import MogoQueue
12 | from Download import request
13 | from bs4 import BeautifulSoup
14 | 
15 | SLEEP_TIME = 1
16 | 
17 | def mzitu_crawler(max_threads=10):
18 |     crawl_queue = MogoQueue('meinvxiezhenji', 'crawl_queue')  ## MongoDB-backed queue of gallery URLs to crawl
19 |     ## img_queue = MogoQueue('meinvxiezhenji', 'img_queue')
20 |     def pageurl_crawler():
21 |         while True:
22 |             try:
23 |                 url = crawl_queue.pop()
24 |                 print(url)
25 |             except KeyError:
26 |                 print('队列没有数据')
27 |                 break
28 |             else:
29 |                 img_urls = []
30 |                 req = request.get(url, 3).text
31 |                 title = crawl_queue.pop_title(url)
32 |                 mkdir(title)
33 |                 os.chdir('D:\mzitu\\' + title)
34 |                 max_span = BeautifulSoup(req, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
35 |                 for page in range(1, int(max_span) + 1):
36 |                     page_url = url + '/' + str(page)
37 |                     img_url = BeautifulSoup(request.get(page_url, 3).text, 'lxml').find('div', class_='main-image').find('img')['src']
38 |                     img_urls.append(img_url)
39 |                     save(img_url)
40 |                 crawl_queue.complete(url)  ## mark this URL as COMPLETE in the queue
41 |                 ## img_queue.push_imgurl(title, img_urls)
42 |                 ## print('插入数据库成功')
43 | 
44 |     def save(img_url):
45 |         name = img_url[-9:-4]
46 |         print(u'开始保存:', img_url)
47 |         img = request.get(img_url, 3)
48 |         f = open(name + '.jpg', 'ab')
49 |         f.write(img.content)
50 |         f.close()
51 | 
52 |     def mkdir(path):
53 |         path = path.strip()
54 |         isExists = os.path.exists(os.path.join("D:\mzitu", path))
55 |         if not isExists:
56 |             print(u'建了一个名字叫做', path, u'的文件夹!')
57 |             os.makedirs(os.path.join("D:\mzitu", path))
58 |             return True
59 |         else:
60 |             print(u'名字叫做', path, u'的文件夹已经存在了!')
61 |             return False
62 | 
63 |     threads = []
64 |     while threads or crawl_queue:
65 |         """
66 |         crawl_queue appears directly in the condition: thanks to MogoQueue.__bool__ the queue
67 |         object is truthy while MongoDB still holds unfinished URLs, so the loop keeps running
68 |         as long as there are live threads or outstanding work.
69 |         """
70 |         for thread in threads[:]:  ## iterate over a copy so remove() does not skip the next thread
71 |             if not thread.is_alive():  ## is_alive() tells whether the thread is still running; finished threads are dropped from the pool
72 |                 threads.remove(thread)
73 |         while len(threads) < max_threads and crawl_queue.peek():  ## spawn only while the pool is below max_threads and the queue still has OUTSTANDING URLs (the original `or` could create threads without bound)
74 |             thread = threading.Thread(target=pageurl_crawler)  ## create a worker thread
75 |             thread.daemon = True  ## daemon thread (modern spelling of setDaemon(True))
76 |             thread.start()  ## start it
77 |             threads.append(thread)  ## and add it to the pool
78 |         time.sleep(SLEEP_TIME)
79 | 
80 | def process_crawler():
81 |     process = []
82 |     num_cpus = multiprocessing.cpu_count()
83 |     print('将会启动进程数为:', num_cpus)
84 |     for i in range(num_cpus):
85 |         p = multiprocessing.Process(target=mzitu_crawler)  ## create one worker process per CPU
86 |         p.start()  ## start the process
87 |         process.append(p)  ## keep track of it
88 |     for p in process:
89 |         p.join()  ## wait for every worker process to finish
90 | 
91 | if __name__ == "__main__":
92 |     process_crawler()
-------------------------------------------------------------------------------- /work_two_Crawler/catch_mongodb_queue.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | #http://cuiqingcai.com/3363.html
4 | 
5 | __author__ = 'Demi Yu'
6 | 
7 | from datetime import datetime, timedelta
8 | from pymongo import MongoClient, errors
9 | 
10 | 
11 | class MogoQueue():
12 |     OUTSTANDING = 1  ## initial state: waiting to be crawled
13 |     PROCESSING = 2   ## currently being downloaded
14 |     COMPLETE = 3     ## download finished
15 | 
16 |     def __init__(self, db, collection, timeout=300):  ## set up the MongoDB connection
17 |         self.client = MongoClient()
18 |         self.Client = self.client[db]
19 |         self.db = self.Client[collection]
20 |         self.timeout = timeout
21 | 
22 |     def __bool__(self):
23 |         """
24 |         Makes the queue object itself truthy while MongoDB still holds any document
25 |         whose status is not COMPLETE ($ne means "not equal"); the crawler's outer
26 |         while-loop relies on this to decide whether there is still work to do.
27 |         """
28 |         record = self.db.find_one(
29 |             {'status': {'$ne': self.COMPLETE}}
30 |         )
31 |         return True if record else False
32 | 
33 |     def push(self, url, title):  ## add a new URL to the queue
34 |         try:
35 |             self.db.insert({'_id': url, 'status': self.OUTSTANDING, '主题': title})
36 |             print(url, '插入队列成功')
37 |         except errors.DuplicateKeyError as e:  ## a duplicate-key error means the URL is already queued
38 |             print(url, '已经存在于队列中了')
39 |             pass
40 | 
41 |     def push_imgurl(self, title, url):
42 |         try:
43 |             self.db.insert({'_id': title, 'status': self.OUTSTANDING, 'url': url})  ## was 'statue', a typo for 'status'
44 |             print('图片地址插入成功')
45 |         except errors.DuplicateKeyError as e:
46 |             print('地址已经存在了')
47 |             pass
48 | 
49 |     def pop(self):
50 |         """
51 |         Atomically pick one document whose status is OUTSTANDING, flip it to
52 |         PROCESSING with a timestamp (query selects, update modifies, $set works
53 |         like SQL's SET) and return its _id, i.e. the URL.
54 |         If nothing is OUTSTANDING, call repair() to reset any timed-out entries
55 |         and raise KeyError so the caller knows the queue is empty for now.
56 |         """
57 |         record = self.db.find_and_modify(
58 |             query={'status': self.OUTSTANDING},
59 |             update={'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}}
60 |         )
61 |         if record:
62 |             return record['_id']
63 |         else:
64 |             self.repair()
65 |             raise KeyError
66 | 
67 |     def pop_title(self, url):
68 |         record = self.db.find_one({'_id': url})
69 |         return record['主题']
70 | 
71 |     def peek(self):
72 |         """Return the _id (URL) of one document that is still OUTSTANDING, if any."""
73 |         record = self.db.find_one({'status': self.OUTSTANDING})
74 |         if record:
75 |             return record['_id']
76 | 
77 |     def complete(self, url):
78 |         """Mark the given URL as COMPLETE."""
79 |         self.db.update({'_id': url}, {'$set': {'status': self.COMPLETE}})
80 | 
81 |     def repair(self):
82 |         """Reset entries stuck in PROCESSING for longer than the timeout back to OUTSTANDING ($lt means "less than")."""
83 |         record = self.db.find_and_modify(
84 |             query={
85 |                 'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
86 |                 'status': {'$ne': self.COMPLETE}
87 |             },
88 |             update={'$set': {'status': self.OUTSTANDING}}
89 |         )
90 |         if record:
91 |             print('重置URL状态', record['_id'])
92 | 
93 |     def clear(self):
94 |         """Drop the whole collection.  Only ever call this once, right at the start - it wipes the queue."""
95 |         self.db.drop()
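# --- Added usage sketch (not part of the original tutorial code) --------------
# Typical lifecycle of the queue defined above: push a URL, claim it with pop(),
# then mark it complete.  The URL and title are placeholders, and a MongoDB
# instance on localhost is assumed.  Note that insert(), update() and
# find_and_modify() are pymongo 2.x/3.x calls; on pymongo 4.x the equivalents
# would be insert_one(), update_one() and find_one_and_update().
if __name__ == '__main__':
    queue = MogoQueue('meinvxiezhenji', 'crawl_queue')
    queue.push('http://www.mzitu.com/demo', 'demo gallery')  # enqueue -> OUTSTANDING
    url = queue.pop()                                        # claim it -> PROCESSING
    queue.complete(url)                                      # done     -> COMPLETE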
"""这个函数只有第一次才调用、后续不要调用、因为这是删库啊!""" 95 | self.db.drop() -------------------------------------------------------------------------------- /work_two_Crawler/catch_mzui.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | # http://cuiqingcai.com/3179.html 4 | __author__ = 'Demi Yu' 5 | 6 | from bs4 import BeautifulSoup 7 | import os 8 | 9 | from Download import request 10 | 11 | class mzitu(): 12 | def all_url(self, url): 13 | html = request.get(url,3) ##调用request函数把套图地址传进去会返回给我们一个response 14 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a') 15 | retval = os.getcwd() # 查看当前工作目录 '/Users/yuhongjun/Python/python-training/work_two_Crawler' 16 | for a in all_a: 17 | title = a.get_text() 18 | print(u'开始保存:', title) ##加点提示不然太枯燥了 19 | path = str(title).replace("?", '_') ##我注意到有个标题带有 ? 这个符号Windows系统是不能创建文件夹的所以要替换掉 20 | self.mkdir(path) ##调用mkdir函数创建文件夹!这儿path代表的是标题title哦!!!!!不要糊涂了哦! 21 | href = a['href'] 22 | self.html(href) ##调用html函数把href参数传递过去!href是啥还记的吧? 就是套图的地址哦!!不要迷糊了哦! 23 | os.chdir(retval) ##切换到目录 24 | 25 | def html(self, href): ##这个函数是处理套图地址获得图片的页面地址 26 | html = request.get(href,3) 27 | max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text() 28 | # for page in range(1, int(max_span) + 1): 29 | for page in range(1, 2): 30 | page_url = href + '/' + str(page) 31 | self.img(page_url) ##调用img函数 32 | 33 | def img(self, page_url): ##这个函数处理图片页面地址获得图片的实际地址 34 | img_html = request.get(page_url,3) 35 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src'] 36 | self.save(img_url) 37 | 38 | def save(self, img_url): ##这个函数保存图片 39 | name = img_url[-9:-4] 40 | img = request.get(img_url,3) 41 | f = open(name + '.jpg', 'ab') 42 | f.write(img.content) 43 | f.close() 44 | 45 | def mkdir(self, path): ##这个函数创建文件夹 46 | path = path.strip() 47 | macPath="Pic/" 48 | isExists = os.path.exists(os.path.join(macPath, path)) 49 | if not isExists: 50 | print(u'建了一个名字叫做', path, u'的文件夹!') 51 | os.makedirs(os.path.join(macPath, path)) 52 | os.chdir(os.path.join(macPath, path)) ##切换到目录 53 | return True 54 | else: 55 | print(u'名字叫做', path, u'的文件夹已经存在了!') 56 | return False 57 | 58 | # def request(self, url): ##这个函数获取网页的response 然后返回 59 | # headers = { 60 | # 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"} 61 | # content = requests.get(url, headers=headers) 62 | # return content 63 | 64 | 65 | Mzitu = mzitu() ##实例化 66 | Mzitu.all_url('http://www.mzitu.com/all') ##给函数all_url传入参数 你可以当作启动爬虫(就是入口) 67 | -------------------------------------------------------------------------------- /work_two_Crawler/catch_tianmao_rating.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | __author__ = 'Demi Yu' 5 | 6 | # 导入所需的开发模块 7 | import requests 8 | import re 9 | # 创建循环链接 10 | urls = [] 11 | #处理前100页的评价 12 | for i in list(range(1,10)): 13 | urls.append('https://rate.tmall.com/list_detail_rate.htm?itemId=521136254098&spuId=345965243&sellerId=2106525799&order=1¤tPage=%s' %i) 14 | 15 | # 构建字段容器 16 | nickname = [] #昵称 17 | ratedate = [] #评价时间 18 | color = [] #款式 19 | size = [] #尺码 20 | ratecontent = [] #评价内容 21 | # 循环抓取数据 22 | for url in urls: 23 | content = requests.get(url).text 24 | 25 | # 借助正则表达式使用findall进行匹配查询,可以使用bs 26 | 
-------------------------------------------------------------------------------- /work_two_Crawler/save_cookie.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | 
4 | __author__ = 'Demi Yu'
5 | 
6 | 
7 | # Written for Python 3.6 + PyCharm
8 | # Original article: http://cuiqingcai.com/968.html
9 | #
10 | #
11 | # 1. Get cookies from the server and keep them in a variable.
12 | # Overall flow:
13 | # declare a CookieJar object to hold the cookies -> create a cookie handler to process them -> build an opener that uses the handler -> make a request, which generates the cookies -> the cookies end up in memory
14 | 
15 | # import http.cookiejar
16 | # import urllib.request
17 | #
18 | # cookie = http.cookiejar.CookieJar()
19 | # # declare a CookieJar instance to hold the cookies
20 | # handler = urllib.request.HTTPCookieProcessor(cookie)
21 | # # create the cookie handler; handler is an HTTPCookieProcessor instance
22 | # opener = urllib.request.build_opener(handler)
23 | # # build an opener that reads what the handler produced
24 | # response = opener.open('http://www.baidu.com')
25 | # # make the request that generates the cookies; open() works like urlopen() and also accepts a Request object
26 | # for item in cookie:
27 | #     print('Name = ' + item.name)
28 | #     print('Value = ' + item.value)
29 | #
30 | 
31 | 
32 | 
33 | # 2. Save the cookies to a file.
34 | # Overall flow:
35 | # create the cookie-saving helper (giving it a file name) -> create a cookie handler -> build an opener that uses the handler -> make a request, which generates the cookies -> save the cookies to the file
36 | 
37 | # import http.cookiejar
38 | # import urllib.request
39 | #
40 | # cookie = http.cookiejar.MozillaCookieJar('cookie.txt')
41 | # # create the cookie-saving helper (with the target file name)
42 | # handler = urllib.request.HTTPCookieProcessor(cookie)
43 | # # create the cookie handler; handler is an HTTPCookieProcessor instance
44 | # opener = urllib.request.build_opener(handler)
45 | # # build an opener that uses the handler
46 | # response = opener.open("http://www.baidu.com")
47 | # # make the request that generates the cookies
48 | # cookie.save(ignore_discard=True, ignore_expires=True)
49 | # # write the cookies to the file
50 | #
51 | 
52 | 
53 | # 3. Load the cookies from the file and use them.
54 | # Overall flow:
55 | # create an empty cookie jar -> load the cookies from the file -> build the Request -> create a cookie handler -> build an opener that carries the saved cookies -> send the request and get the server's response -> print the response
56 | 
57 | 
58 | import http.cookiejar
59 | import urllib.request
60 | 
61 | cookie = http.cookiejar.MozillaCookieJar()
62 | # create an empty cookie jar
63 | cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
64 | # load the cookies from the file into the jar
65 | req = urllib.request.Request("http://www.baidu.com")
66 | # build the Request
67 | handler = urllib.request.HTTPCookieProcessor(cookie)
68 | # create the cookie handler
69 | opener = urllib.request.build_opener(handler)
70 | # build an opener that carries the saved cookies
71 | response = opener.open(req)
72 | # send the request and get the server's response
73 | print(response.read())
74 | # print the response
-------------------------------------------------------------------------------- /work_two_Crawler/test2.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding:utf-8 -*-
3 | #http://cuiqingcai.com/3335.html
4 | import multiprocessing
5 | import time
6 | 
7 | def process(num):
8 |     time.sleep(num)
9 |     print('Process:', num)
10 | 
11 | 
if __name__ == '__main__': 12 | for i in range(5): 13 | p = multiprocessing.Process(target=process, args=(i,)) 14 | p.start() 15 | 16 | print('CPU number:' + str(multiprocessing.cpu_count())) 17 | for p in multiprocessing.active_children(): 18 | print('Child process name: ' + p.name + ' id: ' + str(p.pid)) 19 | 20 | print('Process Ended') 21 | 22 | -------------------------------------------------------------------------------- /work_two_Crawler/《幸福之路》(一)——开篇.txt: -------------------------------------------------------------------------------- 1 | 《元认知》的最后一部分推荐了一些书目,刚好这本《幸福之路》在手边。借着脑袋还有“元认知”的余热,拿来巩固一下。 2 | 伯特兰·罗素,十九世纪的哲学家、数学家和历史学家。最开始知道他是因为《数学原理》这本书,所以脑海中一直认为他是一个数学家。没想到,他在历史和哲学上的成就并不比在数学中的差。不仅如此,他还获得过“诺贝尔文学奖”。这本《幸福之路》是写于1930年的,距今已经有87年了,文化、科技、医学等等跟“幸福”相关的因素都与现在有着巨大的差距。存在如此巨大差距的情况下,我们可以看看当时的一位智者是怎样思考“幸福”这个一直到现在还在困扰着我们的话题。 3 | 我们人类并不会因为有了足够的食物,就会变得很快乐。事实证明,太多的人似乎都经常处于一种不快乐的状态中。造成这种状况的原因,一部分是由于社会制度的缘故,一部分由于个人心理的缘故。既然充足的食物并不能使人们快乐,那么类推,财富似乎也不是解决人们不快乐的万能钥匙。那么如果想靠单纯地追逐财富来解决不快乐的问题,就显得有点滑稽。这种在大多数国度中大多数人都存在的一种“日常的烦恼”实在让人讨厌,有没有一种方法能够让人摆脱这种可恶的状态呢? 4 | 作者从童年感到生活的漫长苦闷到写这本书时感到明显对生活的热爱,产生了怎么样的变化呢?主要有这么三点:第一,发现了自己最渴望的的东西是什么,并且逐渐得到了不少;第二,成功抛弃了某些根本就不可能实现的欲望,比如获得某种无可置疑的知识的欲望;第三,不那么经常地想着自己,也就是对自己宽容许多。这是作者自己给自己的总结,虽然并不一定适用于所有人,但至少可以拿来借鉴一下。另外,对于那种自我专注过分的人,作者给出的方法是利用外部的戒律,强行将自己的思绪从自身分离出来。 5 | 这里面作者通过举例,解释了一下所谓的“自我专注”。普遍的三类——“犯罪狂”、“自恋狂”、“自大狂”。 6 | 犯罪狂:指沉溺于犯罪意识中的人,并且为此导致自己讨厌自己。这种人通常会让自己时刻处在儿时外界设立的一些禁律之中,这样现实中的自己永远与想象中的自己产生强烈的冲突。身处这样一种境况,如何能感受到幸福。很明显,打破那些童年中架设在这些人身上的荒谬枷锁,是他们重获新生、迈向快乐的第一步。 7 | 自恋狂:跟上面的“犯罪狂”刚好相反,惯于自我欣赏和希望受人欣赏,而且往往过度。这种人失去了爱的能力,渴望得到的是一种强烈的、被所有人关注的欲望。他们对自己以外的人和事都不会再真正感兴趣了。 8 | 自大狂:更渴望权利而不是魅力,与令人爱戴相比,更倾向于选择令人畏惧。过分追求权力会使人不快乐、愚蠢或者既不快乐又愚蠢。那种疯癫式的“我为王”的病态快乐,实际上是一种人格不健全的、屈辱的产物。对于权力的过分追求,会让人变得可悲。 9 | 不快乐的原因多种多样,一言以蔽之:凡有任何明显的精神分析意义上的抑制出现,就没有真正的快乐可言,这让“不快乐”都有了某种共同的元素。一个人在年轻的时候一些正常的心理需求却没有得倒满足,ta就会把能达到这种满足看得比其它任何事情都重要(可以解释受父母忽略的孩子长大之后总是过分渴望得到朋友甚至恋人的关注,这给健康的亲密关系带来了极大的麻烦)。而过分强调在这一方面得到满足,会让ta在相当长的时间里放弃寻找其它快乐的方向,至此,让自己的生活变得脆弱而又单一,真正的幸福快乐也就无从谈起了。 10 | 简单介绍导致人们不快乐的一些原因,后面会对给我们生活造成困扰的几大常见问题展开讨论。 11 | 12 | 13 | 日更挑战第八天 14 | --------------------------------------------------------------------------------