├── README.md
├── 第02章 提供推荐
    ├── deliciousrec.py
    ├── pydelicious.py
    └── recommendations.py
├── 第03章 发现群组
    ├── Thumbs.db
    ├── blogdata.txt
    ├── clusters.py
    ├── downloadzebodata.py
    ├── feedlist.txt
    ├── generatefeedvector.py
    └── zebo.txt
├── 第04章 搜索与排名
    ├── nn.py
    └── searchengine.py
├── 第05章 优化
    ├── dorm.py
    ├── kayak.py
    ├── optimization.py
    ├── schedule.txt
    └── socialnetwork.py
├── 第06章 文档过滤
    ├── docclass.py
    ├── feedfilter.py
    ├── python_search.xml
    ├── test.db
    └── test1.db
├── 第07章 决策树建模
    ├── Thumbs.db
    ├── addresslist.txt
    ├── hotornot.py
    ├── treepredict.py
    └── zillow.py
├── 第08章 构建价格模型
    ├── ebaypredict.py
    ├── numpredict.py
    └── optimization.py
├── 第09章 高阶分类 核方法与SVM
    ├── advancedclassify.py
    ├── agesonly.csv
    ├── facebook.py
    ├── matchmaker.csv
    ├── svm.py
    ├── svm.pyc
    └── svmc.pyd
├── 第10章 寻找独立特征
    ├── Thumbs.db
    ├── articles.txt
    ├── clusters.py
    ├── docclass.py
    ├── features.txt
    ├── newsfeatures.py
    ├── nnmf.py
    ├── stockfeatures.txt
    └── stockvolume.py
└── 第11章 智能进化
    ├── gp.py
    └── gp.pyc


/README.md:
--------------------------------------------------------------------------------
1 | # Programming-Collective-Intelligence-Source-Code
2 | 集体智慧编程源代码
3 | 
4 | ## 中文版PDF电子书免费下载地址
5 | 
6 | http://pan.baidu.com/s/1ntKHRPB
7 | 


--------------------------------------------------------------------------------
/第02章 提供推荐/deliciousrec.py:
--------------------------------------------------------------------------------
 1 | from pydelicious import get_popular,get_userposts,get_urlposts
 2 | import time
 3 | 
 4 | def initializeUserDict(tag,count=5):
 5 |   user_dict={}
 6 |   # get the top count' popular posts
 7 |   for p1 in get_popular(tag=tag)[0:count]:
 8 |     # find all users who posted this
 9 |     for p2 in get_urlposts(p1['href']):
10 |       user=p2['user']
11 |       user_dict[user]={}
12 |   return user_dict
13 | 
14 | def fillItems(user_dict):
15 |   all_items={}
16 |   # Find links posted by all users
17 |   for user in user_dict:
18 |     for i in range(3):
19 |       try:
20 |         posts=get_userposts(user)
21 |         break
22 |       except:
23 |         print "Failed user "+user+", retrying"
24 |         time.sleep(4)
25 |     for post in posts:
26 |       url=post['href']
27 |       user_dict[user][url]=1.0
28 |       all_items[url]=1
29 |   
30 |   # Fill in missing items with 0
31 |   for ratings in user_dict.values():
32 |     for item in all_items:
33 |       if item not in ratings:
34 |         ratings[item]=0.0
35 | 


--------------------------------------------------------------------------------
/第02章 提供推荐/pydelicious.py:
--------------------------------------------------------------------------------
  1 | """Library to access del.icio.us data via Python.
  2 | 
  3 | :examples:
  4 | 
  5 |   Using the API class directly:
  6 | 
  7 |   >>> a = pydelicious.apiNew('user', 'passwd')
  8 |   >>> # or:
  9 |   >>> a = DeliciousAPI('user', 'passwd')
 10 |   >>> a.tags_get() # Same as:
 11 |   >>> a.request('tags/get', )
 12 | 
 13 |   Or by calling the 'convenience' methods on the module.
 14 | 
 15 |   - def add(user, passwd, url, description, tags = "", extended = "", dt = "", replace="no"):
 16 |   - def get(user, passwd, tag="", dt="",  count = 0):
 17 |   - def get_all(user, passwd, tag = ""):
 18 |   - def delete(user, passwd, url):
 19 |   - def rename_tag(user, passwd, oldtag, newtag):
 20 |   - def get_tags(user, passwd):
 21 | 
 22 |   >>> a = apiNew(user, passwd)
 23 |   >>> a.posts_add(url="http://my.com/", desciption="my.com", extended="the url is my.moc", tags="my com")
 24 |   True
 25 |   >>> len(a.posts_all())
 26 |   1
 27 |   >>> get_all(user, passwd)
 28 |   1
 29 | 
 30 |   This are short functions for getrss calls.
 31 | 
 32 |   >>> rss_
 33 | 
 34 | def get_userposts(user):
 35 | def get_tagposts(tag):
 36 | def get_urlposts(url):
 37 | def get_popular(tag = ""):
 38 | 
 39 |   >>> json_posts()
 40 |   >>> json_tags()
 41 |   >>> json_network()
 42 |   >>> json_fans()
 43 | 
 44 | :License: pydelicious is released under the BSD license. See 'license.txt'
 45 |  for more informations.
 46 | 
 47 | :berend:
 48 |  - Rewriting comments to english. More documentation, examples.
 49 |  - Added JSON-like return values for XML data (del.icio.us also serves some JSON...)
 50 |  - better error/exception classes and handling, work in progress.
 51 |  - Encoding seems to be working (using UTF-8 here).
 52 | 
 53 | :@todo:
 54 |  - Source code SHOULD BE ASCII!
 55 |  - More tests.
 56 |  - Parse datetimes in XML.
 57 |  - Salvage and test RSS functionality?
 58 |  - Setup not used, Still works? Should setup.py be tested?
 59 |  - API functions need required argument checks.
 60 | 
 61 |  * lizense einbinden und auch via setup.py verteilen
 62 |  * readme auch schreiben und via setup.py verteilen
 63 |  * auch auf anderen systemen testen (linux -> uni)
 64 |  * automatisch releases bauen lassen, richtig benennen und in das
 65 |    richtige verzeichnis verschieben.
 66 |  * was k[o]nnen die anderen librarys denn noch so? (ruby, java, perl, etc)
 67 |  * was wollen die, die es benutzen?
 68 |  * wof[u]r k[o]nnte ich es benutzen?
 69 |  * entschlacken?
 70 | 
 71 | :done:
 72 |  * Refactored the API class, much cleaner now and functions dlcs_api_request, dlcs_parse_xml are available for who wants them.
 73 |  * stimmt das so? muss eher noch t[a]g str2utf8 konvertieren
 74 |    >>> pydelicious.getrss(tag="t[a]g")
 75 |    url: http://del.icio.us/rss/tag/t[a]g
 76 |  * requester muss eine sekunde warten
 77 |  * __init__.py gibt die funktionen weiter
 78 |  * html parser funktioniert noch nicht, gar nicht
 79 |  * alte funktionen fehlen, get_posts_by_url, etc.
 80 |  * post funktion erstellen, die auch die fehlenden attribs addiert.
 81 |  * die api muss ich noch weiter machen
 82 |  * requester muss die 503er abfangen
 83 |  * rss parser muss auf viele m[o]glichkeiten angepasst werden
 84 | """
 85 | import sys
 86 | import os
 87 | import time
 88 | import datetime
 89 | import md5, httplib
 90 | import urllib, urllib2, time
 91 | from StringIO import StringIO
 92 | 
 93 | try:
 94 |     from elementtree.ElementTree import parse as parse_xml
 95 | except ImportError:
 96 |     from  xml.etree.ElementTree import parse as parse_xml
 97 | 
 98 | import feedparser
 99 | 
100 | 
101 | ### Static config
102 | 
103 | __version__ = '0.5.0'
104 | __author__ = 'Frank Timmermann <regenkind_at_gmx_dot_de>' # GP: does not respond to emails
105 | __contributors__ = [
106 |     'Greg Pinero',
107 |     'Berend van Berkum <berend+pydelicious@dotmpe.com>']
108 | __url__ = 'http://code.google.com/p/pydelicious/'
109 | __author_email__ = ""
110 | # Old URL: 'http://deliciouspython.python-hosting.com/'
111 | 
112 | __description__ = '''pydelicious.py allows you to access the web service of del.icio.us via it's API through python.'''
113 | __long_description__ = '''the goal is to design an easy to use and fully functional python interface to del.icio.us. '''
114 | 
115 | DLCS_OK_MESSAGES = ('done', 'ok') # Known text values of positive del.icio.us <result> answers
116 | DLCS_WAIT_TIME = 4
117 | DLCS_REQUEST_TIMEOUT = 444 # Seconds before socket triggers timeout
118 | #DLCS_API_REALM = 'del.icio.us API'
119 | DLCS_API_HOST = 'https://api.del.icio.us'
120 | DLCS_API_PATH = 'v1'
121 | DLCS_API = "%s/%s" % (DLCS_API_HOST, DLCS_API_PATH)
122 | DLCS_RSS = 'http://del.icio.us/rss/'
123 | 
124 | ISO_8601_DATETIME = '%Y-%m-%dT%H:%M:%SZ'
125 | 
126 | USER_AGENT = 'pydelicious.py/%s %s' % (__version__, __url__)
127 | 
128 | DEBUG = 0
129 | if 'DLCS_DEBUG' in os.environ:
130 |     DEBUG = int(os.environ['DLCS_DEBUG'])
131 | 
132 | 
133 | # Taken from FeedParser.py
134 | # timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
135 | # Python 2.3 now has this functionality available in the standard socket library, so under
136 | # 2.3 you don't need to install anything.  But you probably should anyway, because the socket
137 | # module is buggy and timeoutsocket is better.
138 | try:
139 |     import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
140 |     timeoutsocket.setDefaultSocketTimeout(DLCS_REQUEST_TIMEOUT)
141 | except ImportError:
142 |     import socket
143 |     if hasattr(socket, 'setdefaulttimeout'): socket.setdefaulttimeout(DLCS_REQUEST_TIMEOUT)
144 | if DEBUG: print >>sys.stderr, "Set socket timeout to %s seconds" % DLCS_REQUEST_TIMEOUT
145 | 
146 | 
147 | ### Utility classes
148 | 
149 | class _Waiter:
150 |     """Waiter makes sure a certain amount of time passes between
151 |     successive calls of `Waiter()`.
152 | 
153 |     Some attributes:
154 |     :last: time of last call
155 |     :wait: the minimum time needed between calls
156 |     :waited: the number of calls throttled
157 | 
158 |     pydelicious.Waiter is an instance created when the module is loaded.
159 |     """
160 |     def __init__(self, wait):
161 |         self.wait = wait
162 |         self.waited = 0
163 |         self.lastcall = 0;
164 | 
165 |     def __call__(self):
166 |         tt = time.time()
167 | 
168 |         timeago = tt - self.lastcall
169 | 
170 |         if self.lastcall and DEBUG>2:
171 |             print >>sys.stderr, "Lastcall: %s seconds ago." % lastcall
172 | 
173 |         if timeago <= self.wait:
174 |             if DEBUG>0: print >>sys.stderr, "Waiting %s seconds." % self.wait
175 |             time.sleep(self.wait)
176 |             self.waited += 1
177 |             self.lastcall = tt + self.wait
178 |         else:
179 |             self.lastcall = tt
180 | 
181 | Waiter = _Waiter(DLCS_WAIT_TIME)
182 | 
183 | class PyDeliciousException(Exception):
184 |     '''Std. pydelicious error'''
185 |     pass
186 | 
187 | class DeliciousError(Exception):
188 | 	"""Raised when the server responds with a negative answer"""
189 | 
190 | 
191 | class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
192 |     '''@xxx:bvb: Where is this used? should it be registered somewhere with urllib2?
193 | 
194 |     Handles HTTP Error, currently only 503.
195 |     '''
196 |     def http_error_503(self, req, fp, code, msg, headers):
197 |         raise urllib2.HTTPError(req, code, throttled_message, headers, fp)
198 | 
199 | 
200 | class post(dict):
201 |     """Post object, contains href, description, hash, dt, tags,
202 |     extended, user, count(, shared).
203 | 
204 |     @xxx:bvb: Is this needed? Right now this is superfluous,
205 |     """
206 |     def __init__(self, href = "", description = "", hash = "", time = "", tag = "", extended = "", user = "", count = "",
207 |                  tags = "", url = "", dt = ""): # tags or tag?
208 |         self["href"] = href
209 |         if url != "": self["href"] = url
210 |         self["description"] = description
211 |         self["hash"] = hash
212 |         self["dt"] = dt
213 |         if time != "": self["dt"] = time
214 |         self["tags"] = tags
215 |         if tag != "":  self["tags"] = tag     # tag or tags? # !! tags
216 |         self["extended"] = extended
217 |         self["user"] = user
218 |         self["count"] = count
219 | 
220 |     def __getattr__(self, name):
221 |         try: return self[name]
222 |         except: object.__getattribute__(self, name)
223 | 
224 | 
225 | class posts(list):
226 |     """@xxx:bvb: idem as class post, python structures (dict/list) might
227 |     suffice or a more generic solution is needed.
228 |     """
229 |     def __init__(self, *args):
230 |         for i in args: self.append(i)
231 | 
232 |     def __getattr__(self, attr):
233 |         try: return [p[attr] for p in self]
234 |         except: object.__getattribute__(self, attr)
235 | 
236 | ### Utility functions
237 | 
238 | def str2uni(s):
239 |     # type(in) str or unicode
240 |     # type(out) unicode
241 |     return ("".join([unichr(ord(i)) for i in s]))
242 | 
243 | def str2utf8(s):
244 |     # type(in) str or unicode
245 |     # type(out) str
246 |     return ("".join([unichr(ord(i)).encode("utf-8") for i in s]))
247 | 
248 | def str2quote(s):
249 |     return urllib.quote_plus("".join([unichr(ord(i)).encode("utf-8") for i in s]))
250 | 
251 | def dict0(d):
252 |     # Trims empty dict entries
253 |     # {'a':'a', 'b':'', 'c': 'c'} => {'a': 'a', 'c': 'c'}
254 |     dd = dict()
255 |     for i in d:
256 |             if d[i] != "": dd[i] = d[i]
257 |     return dd
258 | 
259 | def delicious_datetime(str):
260 |     """Parse a ISO 8601 formatted string to a Python datetime ...
261 |     """
262 |     return datetime.datetime(*time.strptime(str, ISO_8601_DATETIME)[0:6])
263 | 
264 | def http_request(url, user_agent=USER_AGENT, retry=4):
265 |     """Retrieve the contents referenced by the URL using urllib2.
266 | 
267 |     Retries up to four times (default) on exceptions.
268 |     """
269 |     request = urllib2.Request(url, headers={'User-Agent':user_agent})
270 | 
271 |     # Remember last error
272 |     e = None
273 | 
274 |     # Repeat request on time-out errors
275 |     tries = retry;
276 |     while tries:
277 |         try:
278 |             return urllib2.urlopen(request)
279 | 
280 |         except urllib2.HTTPError, e: # protocol errors,
281 |             raise PyDeliciousException, "%s" % e
282 | 
283 |         except urllib2.URLError, e:
284 |             # @xxx: Ugly check for time-out errors
285 | 			#if len(e)>0 and 'timed out' in arg[0]:
286 | 			print >> sys.stderr, "%s, %s tries left." % (e, tries)
287 | 			Waiter()
288 | 			tries = tries - 1
289 | 			#else:
290 | 			#	tries = None
291 | 
292 |     # Give up
293 |     raise PyDeliciousException, \
294 |             "Unable to retrieve data at '%s', %s" % (url, e)
295 | 
296 | def http_auth_request(url, host, user, passwd, user_agent=USER_AGENT):
297 |     """Call an HTTP server with authorization credentials using urllib2.
298 |     """
299 |     if DEBUG: httplib.HTTPConnection.debuglevel = 1
300 | 
301 |     # Hook up handler/opener to urllib2
302 |     password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
303 |     password_manager.add_password(None, host, user, passwd)
304 |     auth_handler = urllib2.HTTPBasicAuthHandler(password_manager)
305 |     opener = urllib2.build_opener(auth_handler)
306 |     urllib2.install_opener(opener)
307 | 
308 |     return http_request(url, user_agent)
309 | 
310 | def dlcs_api_request(path, params='', user='', passwd='', throttle=True):
311 |     """Retrieve/query a path within the del.icio.us API.
312 | 
313 |     This implements a minimum interval between calls to avoid
314 |     throttling. [#]_ Use param 'throttle' to turn this behaviour off.
315 | 
316 |     @todo: back off on 503's (HTTPError, URLError? @todo: testing).
317 | 
318 |     Returned XML does not always correspond with given del.icio.us examples
319 |     @todo: (cf. help/api/... and post's attributes)
320 | 
321 |     .. [#] http://del.icio.us/help/api/
322 |     """
323 |     if throttle:
324 |         Waiter()
325 | 
326 |     if params:
327 |         # params come as a dict, strip empty entries and urlencode
328 |         url = "%s/%s?%s" % (DLCS_API, path, urllib.urlencode(dict0(params)))
329 |     else:
330 |         url = "%s/%s" % (DLCS_API, path)
331 | 
332 |     if DEBUG: print >>sys.stderr, "dlcs_api_request: %s" % url
333 | 
334 |     try:
335 |         return http_auth_request(url, DLCS_API_HOST, user, passwd, USER_AGENT)
336 | 
337 |     # @bvb: Is this ever raised? When?
338 |     except DefaultErrorHandler, e:
339 |         print >>sys.stderr, "%s" % e
340 | 
341 | def dlcs_parse_xml(data, split_tags=False):
342 |     """Parse any del.icio.us XML document and return Python data structure.
343 | 
344 |     Recognizes all XML document formats as returned by the version 1 API and
345 |     translates to a JSON-like data structure (dicts 'n lists).
346 | 
347 |     Returned instance is always a dictionary. Examples::
348 | 
349 |      {'posts': [{'url':'...','hash':'...',},],}
350 |      {'tags':['tag1', 'tag2',]}
351 |      {'dates': [{'count':'...','date':'...'},], 'tag':'', 'user':'...'}
352 | 	 {'result':(True, "done")}
353 |      # etcetera.
354 |     """
355 | 
356 |     if DEBUG>3: print >>sys.stderr, "dlcs_parse_xml: parsing from ", data
357 | 
358 |     if not hasattr(data, 'read'):
359 |         data = StringIO(data)
360 | 
361 |     doc = parse_xml(data)
362 |     root = doc.getroot()
363 |     fmt = root.tag
364 | 
365 | 	# Split up into three cases: Data, Result or Update
366 |     if fmt in ('tags', 'posts', 'dates', 'bundles'):
367 | 
368 |         # Data: expect a list of data elements, 'resources'.
369 |         # Use `fmt` (without last 's') to find data elements, elements
370 |         # don't have contents, attributes contain all the data we need:
371 |         # append to list
372 |         elist = [el.attrib for el in doc.findall(fmt[:-1])]
373 | 
374 |         # Return list in dict, use tagname of rootnode as keyname.
375 |         data = {fmt: elist}
376 | 
377 |         # Root element might have attributes too, append dict.
378 |         data.update(root.attrib)
379 | 
380 |         return data
381 | 
382 |     elif fmt == 'result':
383 | 
384 |         # Result: answer to operations
385 |         if root.attrib.has_key('code'):
386 |             msg = root.attrib['code']
387 |         else:
388 |             msg = root.text
389 | 
390 | 		# Return {'result':(True, msg)} for /known/ O.K. messages,
391 |         # use (False, msg) otherwise
392 |         v = msg in DLCS_OK_MESSAGES
393 |         return {fmt: (v, msg)}
394 | 
395 |     elif fmt == 'update':
396 | 
397 |         # Update: "time"
398 |         #return {fmt: root.attrib}
399 | 		return {fmt: {'time':time.strptime(root.attrib['time'], ISO_8601_DATETIME)}}
400 | 
401 |     else:
402 |         raise PyDeliciousException, "Unknown XML document format '%s'" % fmt
403 | 
404 | def dlcs_rss_request(tag = "", popular = 0, user = "", url = ''):
405 |     """Handle a request for RSS
406 | 
407 |     @todo: translate from German
408 | 
409 |     rss sollte nun wieder funktionieren, aber diese try, except scheisse ist so nicht schoen
410 | 
411 |     rss wird unterschiedlich zusammengesetzt. ich kann noch keinen einheitlichen zusammenhang
412 |     zwischen daten (url, desc, ext, usw) und dem feed erkennen. warum k[o]nnen die das nicht einheitlich machen?
413 |     """
414 |     tag = str2quote(tag)
415 |     user = str2quote(user)
416 |     if url != '':
417 |         # http://del.icio.us/rss/url/efbfb246d886393d48065551434dab54
418 |         url = DLCS_RSS + '''url/%s'''%md5.new(url).hexdigest()
419 |     elif user != '' and tag != '':
420 |         url = DLCS_RSS + '''%(user)s/%(tag)s'''%dict(user=user, tag=tag)
421 |     elif user != '' and tag == '':
422 |         # http://del.icio.us/rss/delpy
423 |         url = DLCS_RSS + '''%s'''%user
424 |     elif popular == 0 and tag == '':
425 |         url = DLCS_RSS
426 |     elif popular == 0 and tag != '':
427 |         # http://del.icio.us/rss/tag/apple
428 |         # http://del.icio.us/rss/tag/web2.0
429 |         url = DLCS_RSS + "tag/%s"%tag
430 |     elif popular == 1 and tag == '':
431 |         url = DLCS_RSS + '''popular/'''
432 |     elif popular == 1 and tag != '':
433 |         url = DLCS_RSS + '''popular/%s'''%tag
434 |     rss = http_request(url).read()
435 |     rss = feedparser.parse(rss)
436 |     # print rss
437 | #     for e in rss.entries: print e;print
438 |     l = posts()
439 |     for e in rss.entries:
440 |         if e.has_key("links") and e["links"]!=[] and e["links"][0].has_key("href"):
441 |             url = e["links"][0]["href"]
442 |         elif e.has_key("link"):
443 |             url = e["link"]
444 |         elif e.has_key("id"):
445 |             url = e["id"]
446 |         else:
447 |             url = ""
448 |         if e.has_key("title"):
449 |             description = e['title']
450 |         elif e.has_key("title_detail") and e["title_detail"].has_key("title"):
451 |             description = e["title_detail"]['value']
452 |         else:
453 |             description = ''
454 |         try: tags = e['categories'][0][1]
455 |         except:
456 |             try: tags = e["category"]
457 |             except: tags = ""
458 |         if e.has_key("modified"):
459 |             dt = e['modified']
460 |         else:
461 |             dt = ""
462 |         if e.has_key("summary"):
463 |             extended = e['summary']
464 |         elif e.has_key("summary_detail"):
465 |             e['summary_detail']["value"]
466 |         else:
467 |             extended = ""
468 |         if e.has_key("author"):
469 |             user = e['author']
470 |         else:
471 |             user = ""
472 | #  time = dt ist weist auf ein problem hin
473 | # die benennung der variablen ist nicht einheitlich
474 | #  api senden und
475 | #  xml bekommen sind zwei verschiedene schuhe :(
476 |         l.append(post(url = url, description = description, tags = tags, dt = dt, extended = extended, user = user))
477 |     return l
478 | 
479 | 
480 | ### Main module class
481 | 
482 | class DeliciousAPI:
483 |     """Class providing main interace to del.icio.us API.
484 | 
485 |     Methods ``request`` and ``request_raw`` represent the core. For all API
486 |     paths there are furthermore methods (e.g. posts_add for 'posts/all') with
487 |     an explicit declaration of the parameters and documentation. These all call
488 |     ``request`` and pass on extra keywords like ``_raw``.
489 |     """
490 | 
491 |     def __init__(self, user, passwd, codec='iso-8859-1', api_request=dlcs_api_request, xml_parser=dlcs_parse_xml):
492 |         """Initialize access to the API with ``user`` and ``passwd``.
493 | 
494 |         ``codec`` sets the encoding of the arguments.
495 | 
496 |         The ``api_request`` and ``xml_parser`` parameters by default point to
497 |         functions within this package with standard implementations to
498 |         request and parse a resource. See ``dlcs_api_request()`` and
499 |         ``dlcs_parse_xml()``. Note that ``api_request`` should return a
500 |         file-like instance with an HTTPMessage instance under ``info()``,
501 |         see ``urllib2.openurl`` for more info.
502 |         """
503 |         assert user != ""
504 |         self.user = user
505 |         self.passwd = passwd
506 |         self.codec = codec
507 | 
508 |         # Implement communication to server and parsing of respons messages:
509 |         assert callable(api_request)
510 |         self._api_request = api_request
511 |         assert callable(xml_parser)
512 |         self._parse_response = xml_parser
513 | 
514 |     def _call_server(self, path, **params):
515 |         params = dict0(params)
516 |         for key in params:
517 |             params[key] = params[key].encode(self.codec)
518 | 
519 |         # see __init__ for _api_request()
520 |         return self._api_request(path, params, self.user, self.passwd)
521 | 
522 | 
523 |     ### Core functionality
524 | 
525 |     def request(self, path, _raw=False, **params):
526 |         """Calls a path in the API, parses the answer to a JSON-like structure by
527 |         default. Use with ``_raw=True`` or ``call request_raw()`` directly to
528 |         get the filehandler and process the response message manually.
529 | 
530 |         Calls to some paths will return a `result` message, i.e.::
531 | 
532 |             <result code="..." />
533 | 
534 |         or::
535 | 
536 |             <result>...</result>
537 | 
538 |         These are all parsed to ``{'result':(Boolean, MessageString)}`` and this
539 |         method will raise ``DeliciousError`` on negative `result` answers. Using
540 |         ``_raw=True`` bypasses all parsing and will never raise ``DeliciousError``.
541 | 
542 |         See ``dlcs_parse_xml()`` and ``self.request_raw()``."""
543 | 
544 |         # method _parse_response is bound in `__init__()`, `_call_server`
545 |         # uses `_api_request` also set in `__init__()`
546 |         if _raw:
547 |             # return answer
548 |             return self.request_raw(path, **params)
549 | 
550 |         else:
551 |             # get answer and parse
552 |             fl = self._call_server(path, **params)
553 |             rs = self._parse_response(fl)
554 | 
555 | 			# Raise an error for negative 'result' answers
556 |             if type(rs) == dict and rs == 'result' and not rs['result'][0]:
557 |                 errmsg = ""
558 |                 if len(rs['result'])>0:
559 |                     errmsg = rs['result'][1:]
560 |                 raise DeliciousError, errmsg
561 | 
562 |             return rs
563 | 
564 |     def request_raw(self, path, **params):
565 |         """Calls the path in the API, returns the filehandle. Returned
566 |         file-like instances have an ``HTTPMessage`` instance with HTTP header
567 |         information available. Use ``filehandle.info()`` or refer to the
568 |         ``urllib2.openurl`` documentation.
569 |         """
570 |         # see `request()` on how the response can be handled
571 |         return self._call_server(path, **params)
572 | 
573 |     ### Explicit declarations of API paths, their parameters and docs
574 | 
575 |     # Tags
576 |     def tags_get(self, **kwds):
577 |         """Returns a list of tags and the number of times it is used by the user.
578 |         ::
579 | 
580 |             <tags>
581 |                 <tag tag="TagName" count="888">
582 |         """
583 |         return self.request("tags/get", **kwds)
584 | 
585 |     def tags_rename(self, old, new, **kwds):
586 |         """Rename an existing tag with a new tag name. Returns a `result`
587 |         message or raises an ``DeliciousError``. See ``self.request()``.
588 | 
589 |         &old (required)
590 |             Tag to rename.
591 |         &new (required)
592 |             New name.
593 |         """
594 |         return self.request("tags/rename", old=old, new=new, **kwds)
595 | 
596 |     # Posts
597 |     def posts_update(self, **kwds):
598 |         """Returns the last update time for the user. Use this before calling
599 |         `posts_all` to see if the data has changed since the last fetch.
600 |         ::
601 | 
602 |             <update time="CCYY-MM-DDThh:mm:ssZ">
603 | 		"""
604 |         return self.request("posts/update", **kwds)
605 | 
606 |     def posts_dates(self, tag="", **kwds):
607 |         """Returns a list of dates with the number of posts at each date.
608 |         ::
609 | 
610 |             <dates>
611 |                 <date date="CCYY-MM-DD" count="888">
612 | 
613 |         &tag (optional).
614 |             Filter by this tag.
615 |         """
616 |         return self.request("posts/dates", tag=tag, **kwds)
617 | 
618 |     def posts_get(self, tag="", dt="", url="", **kwds):
619 |         """Returns posts matching the arguments. If no date or url is given,
620 |         most recent date will be used.
621 |         ::
622 | 
623 |             <posts dt="CCYY-MM-DD" tag="..." user="...">
624 |                 <post ...>
625 | 
626 |         &tag (optional).
627 |             Filter by this tag.
628 |         &dt (optional).
629 |             Filter by this date (CCYY-MM-DDThh:mm:ssZ).
630 |         &url (optional).
631 |             Filter by this url.
632 |         """
633 |         return self.request("posts/get", tag=tag, dt=dt, url=url, **kwds)
634 | 
635 |     def posts_recent(self, tag="", count="", **kwds):
636 |         """Returns a list of the most recent posts, filtered by argument.
637 |         ::
638 | 
639 |             <posts tag="..." user="...">
640 |                 <post ...>
641 | 
642 |         &tag (optional).
643 |             Filter by this tag.
644 |         &count (optional).
645 |             Number of items to retrieve (Default:15, Maximum:100).
646 |         """
647 |         return self.request("posts/recent", tag=tag, count=count, **kwds)
648 | 
649 |     def posts_all(self, tag="", **kwds):
650 |         """Returns all posts. Please use sparingly. Call the `posts_update`
651 |         method to see if you need to fetch this at all.
652 |         ::
653 | 
654 |             <posts tag="..." user="..." update="CCYY-MM-DDThh:mm:ssZ">
655 |                 <post ...>
656 | 
657 |         &tag (optional).
658 |             Filter by this tag.
659 |         """
660 |         return self.request("posts/all", tag=tag, **kwds)
661 | 
662 |     def posts_add(self, url, description, extended="", tags="", dt="",
663 |             replace="no", shared="yes", **kwds):
664 |         """Add a post to del.icio.us. Returns a `result` message or raises an
665 |         ``DeliciousError``. See ``self.request()``.
666 | 
667 |         &url (required)
668 |             the url of the item.
669 |         &description (required)
670 |             the description of the item.
671 |         &extended (optional)
672 |             notes for the item.
673 |         &tags (optional)
674 |             tags for the item (space delimited).
675 |         &dt (optional)
676 |             datestamp of the item (format "CCYY-MM-DDThh:mm:ssZ").
677 | 
678 |         Requires a LITERAL "T" and "Z" like in ISO8601 at http://www.cl.cam.ac.uk/~mgk25/iso-time.html for example: "1984-09-01T14:21:31Z"
679 |         &replace=no (optional) - don't replace post if given url has already been posted.
680 |         &shared=no (optional) - make the item private
681 |         """
682 |         return self.request("posts/add", url=url, description=description,
683 |                 extended=extended, tags=tags, dt=dt,
684 |                 replace=replace, shared=shared, **kwds)
685 | 
686 |     def posts_delete(self, url, **kwds):
687 |         """Delete a post from del.icio.us. Returns a `result` message or
688 |         raises an ``DeliciousError``. See ``self.request()``.
689 | 
690 |         &url (required)
691 |             the url of the item.
692 |         """
693 |         return self.request("posts/delete", url=url, **kwds)
694 | 
695 |     # Bundles
696 |     def bundles_all(self, **kwds):
697 |         """Retrieve user bundles from del.icio.us.
698 |         ::
699 | 
700 |             <bundles>
701 |                 <bundel name="..." tags=...">
702 |         """
703 |         return self.request("tags/bundles/all", **kwds)
704 | 
705 |     def bundles_set(self, bundle, tags, **kwds):
706 |         """Assign a set of tags to a single bundle, wipes away previous
707 |         settings for bundle. Returns a `result` messages or raises an
708 |         ``DeliciousError``. See ``self.request()``.
709 | 
710 |         &bundle (required)
711 |             the bundle name.
712 |         &tags (required)
713 |             list of tags (space seperated).
714 |         """
715 |         if type(tags)==list:
716 |             tags = " ".join(tags)
717 |         return self.request("tags/bundles/set", bundle=bundle, tags=tags,
718 |                 **kwds)
719 | 
720 |     def bundles_delete(self, bundle, **kwds):
721 |         """Delete a bundle from del.icio.us. Returns a `result` message or
722 |         raises an ``DeliciousError``. See ``self.request()``.
723 | 
724 |         &bundle (required)
725 |             the bundle name.
726 |         """
727 |         return self.request("tags/bundles/delete", bundle=bundle, **kwds)
728 | 
729 |     ### Utils
730 | 
731 |     # Lookup table for del.icio.us url-path to DeliciousAPI method.
732 |     paths = {
733 |         'tags/get': tags_get,
734 |         'tags/rename': tags_rename,
735 |         'posts/update': posts_update,
736 |         'posts/dates': posts_dates,
737 |         'posts/get': posts_get,
738 |         'posts/recent': posts_recent,
739 |         'posts/all': posts_all,
740 |         'posts/add': posts_add,
741 |         'posts/delete': posts_delete,
742 |         'tags/bundles/all': bundles_all,
743 |         'tags/bundles/set': bundles_set,
744 |         'tags/bundles/delete': bundles_delete,
745 |     }
746 | 
747 |     def get_url(self, url):
748 |         """Return the del.icio.us url at which the HTML page with posts for
749 |         ``url`` can be found.
750 |         """
751 |         return "http://del.icio.us/url/?url=%s" % (url,)
752 | 
753 | 
754 | ### Convenience functions on this package
755 | 
756 | def apiNew(user, passwd):
757 |     """creates a new DeliciousAPI object.
758 |     requires user(name) and passwd
759 | 	"""
760 |     return DeliciousAPI(user=user, passwd=passwd)
761 | 
762 | def add(user, passwd, url, description, tags="", extended="", dt="", replace="no"):
763 |     return apiNew(user, passwd).posts_add(url=url, description=description, extended=extended, tags=tags, dt=dt, replace=replace)
764 | 
765 | def get(user, passwd, tag="", dt="",  count = 0):
766 |     posts = apiNew(user, passwd).posts_get(tag=tag,dt=dt)
767 |     if count != 0: posts = posts[0:count]
768 |     return posts
769 | 
770 | def get_all(user, passwd, tag=""):
771 |     return apiNew(user, passwd).posts_all(tag=tag)
772 | 
773 | def delete(user, passwd, url):
774 |     return apiNew(user, passwd).posts_delete(url=url)
775 | 
776 | def rename_tag(user, passwd, oldtag, newtag):
777 |     return apiNew(user=user, passwd=passwd).tags_rename(old=oldtag, new=newtag)
778 | 
779 | def get_tags(user, passwd):
780 |     return apiNew(user=user, passwd=passwd).tags_get()
781 | 
782 | 
783 | ### RSS functions @bvb: still working...?
784 | def getrss(tag="", popular=0, url='', user=""):
785 |     """get posts from del.icio.us via parsing RSS @bvb[or HTML]
786 | 
787 | 	@bvb[not tested]
788 | 
789 |     tag (opt) sort by tag
790 |     popular (opt) look for the popular stuff
791 |     user (opt) get the posts by a user, this striks popular
792 |     url (opt) get the posts by url
793 | 	"""
794 |     return dlcs_rss_request(tag=tag, popular=popular, user=user, url=url)
795 | 
796 | def get_userposts(user):
797 |     return getrss(user = user)
798 | 
799 | def get_tagposts(tag):
800 |     return getrss(tag = tag)
801 | 
802 | def get_urlposts(url):
803 |     return getrss(url = url)
804 | 
805 | def get_popular(tag = ""):
806 |     return getrss(tag = tag, popular = 1)
807 | 
808 | 
809 | ### @TODO: implement JSON fetching
810 | def json_posts(user, count=15):
811 |     """http://del.icio.us/feeds/json/mpe
812 |     http://del.icio.us/feeds/json/mpe/art+history
813 |     count=###   the number of posts you want to get (default is 15, maximum is 100)
814 |     raw         a raw JSON object is returned, instead of an object named Delicious.posts
815 |     """
816 | 
817 | def json_tags(user, atleast, count, sort='alpha'):
818 |     """http://del.icio.us/feeds/json/tags/mpe
819 |     atleast=###         include only tags for which there are at least ### number of posts
820 |     count=###           include ### tags, counting down from the top
821 |     sort={alpha|count}  construct the object with tags in alphabetic order (alpha), or by count of posts (count)
822 |     callback=NAME       wrap the object definition in a function call NAME(...), thus invoking that function when the feed is executed
823 |     raw                 a pure JSON object is returned, instead of code that will construct an object named Delicious.tags
824 |     """
825 | 
826 | def json_network(user):
827 |     """http://del.icio.us/feeds/json/network/mpe
828 |     callback=NAME       wrap the object definition in a function call NAME(...)
829 |     ?raw         a raw JSON object is returned, instead of an object named Delicious.posts
830 |     """
831 | 
832 | def json_fans(user):
833 |     """http://del.icio.us/feeds/json/fans/mpe
834 |     callback=NAME       wrap the object definition in a function call NAME(...)
835 |     ?raw         a pure JSON object is returned, instead of an object named Delicious.
836 |     """
837 | 
838 | 


--------------------------------------------------------------------------------
/第02章 提供推荐/recommendations.py:
--------------------------------------------------------------------------------
  1 | # A dictionary of movie critics and their ratings of a small
  2 | # set of movies
  3 | critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
  4 |  'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 
  5 |  'The Night Listener': 3.0},
  6 | 'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 
  7 |  'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0, 
  8 |  'You, Me and Dupree': 3.5}, 
  9 | 'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
 10 |  'Superman Returns': 3.5, 'The Night Listener': 4.0},
 11 | 'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
 12 |  'The Night Listener': 4.5, 'Superman Returns': 4.0, 
 13 |  'You, Me and Dupree': 2.5},
 14 | 'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 
 15 |  'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
 16 |  'You, Me and Dupree': 2.0}, 
 17 | 'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
 18 |  'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
 19 | 'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}
 20 | 
 21 | 
 22 | from math import sqrt
 23 | 
 24 | # Returns a distance-based similarity score for person1 and person2
 25 | def sim_distance(prefs,person1,person2):
 26 |   # Get the list of shared_items
 27 |   si={}
 28 |   for item in prefs[person1]: 
 29 |     if item in prefs[person2]: si[item]=1
 30 | 
 31 |   # if they have no ratings in common, return 0
 32 |   if len(si)==0: return 0
 33 | 
 34 |   # Add up the squares of all the differences
 35 |   sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2) 
 36 |                       for item in prefs[person1] if item in prefs[person2]])
 37 | 
 38 |   return 1/(1+sum_of_squares)
 39 | 
 40 | # Returns the Pearson correlation coefficient for p1 and p2
 41 | def sim_pearson(prefs,p1,p2):
 42 |   # Get the list of mutually rated items
 43 |   si={}
 44 |   for item in prefs[p1]: 
 45 |     if item in prefs[p2]: si[item]=1
 46 | 
 47 |   # if they are no ratings in common, return 0
 48 |   if len(si)==0: return 0
 49 | 
 50 |   # Sum calculations
 51 |   n=len(si)
 52 |   
 53 |   # Sums of all the preferences
 54 |   sum1=sum([prefs[p1][it] for it in si])
 55 |   sum2=sum([prefs[p2][it] for it in si])
 56 |   
 57 |   # Sums of the squares
 58 |   sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
 59 |   sum2Sq=sum([pow(prefs[p2][it],2) for it in si])	
 60 |   
 61 |   # Sum of the products
 62 |   pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
 63 |   
 64 |   # Calculate r (Pearson score)
 65 |   num=pSum-(sum1*sum2/n)
 66 |   den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
 67 |   if den==0: return 0
 68 | 
 69 |   r=num/den
 70 | 
 71 |   return r
 72 | 
 73 | # Returns the best matches for person from the prefs dictionary. 
 74 | # Number of results and similarity function are optional params.
 75 | def topMatches(prefs,person,n=5,similarity=sim_pearson):
 76 |   scores=[(similarity(prefs,person,other),other) 
 77 |                   for other in prefs if other!=person]
 78 |   scores.sort()
 79 |   scores.reverse()
 80 |   return scores[0:n]
 81 | 
 82 | # Gets recommendations for a person by using a weighted average
 83 | # of every other user's rankings
 84 | def getRecommendations(prefs,person,similarity=sim_pearson):
 85 |   totals={}
 86 |   simSums={}
 87 |   for other in prefs:
 88 |     # don't compare me to myself
 89 |     if other==person: continue
 90 |     sim=similarity(prefs,person,other)
 91 | 
 92 |     # ignore scores of zero or lower
 93 |     if sim<=0: continue
 94 |     for item in prefs[other]:
 95 | 	    
 96 |       # only score movies I haven't seen yet
 97 |       if item not in prefs[person] or prefs[person][item]==0:
 98 |         # Similarity * Score
 99 |         totals.setdefault(item,0)
100 |         totals[item]+=prefs[other][item]*sim
101 |         # Sum of similarities
102 |         simSums.setdefault(item,0)
103 |         simSums[item]+=sim
104 | 
105 |   # Create the normalized list
106 |   rankings=[(total/simSums[item],item) for item,total in totals.items()]
107 | 
108 |   # Return the sorted list
109 |   rankings.sort()
110 |   rankings.reverse()
111 |   return rankings
112 | 
113 | def transformPrefs(prefs):
114 |   result={}
115 |   for person in prefs:
116 |     for item in prefs[person]:
117 |       result.setdefault(item,{})
118 |       
119 |       # Flip item and person
120 |       result[item][person]=prefs[person][item]
121 |   return result
122 | 
123 | 
124 | def calculateSimilarItems(prefs,n=10):
125 |   # Create a dictionary of items showing which other items they
126 |   # are most similar to.
127 |   result={}
128 |   # Invert the preference matrix to be item-centric
129 |   itemPrefs=transformPrefs(prefs)
130 |   c=0
131 |   for item in itemPrefs:
132 |     # Status updates for large datasets
133 |     c+=1
134 |     if c%100==0: print "%d / %d" % (c,len(itemPrefs))
135 |     # Find the most similar items to this one
136 |     scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
137 |     result[item]=scores
138 |   return result
139 | 
140 | def getRecommendedItems(prefs,itemMatch,user):
141 |   userRatings=prefs[user]
142 |   scores={}
143 |   totalSim={}
144 |   # Loop over items rated by this user
145 |   for (item,rating) in userRatings.items( ):
146 | 
147 |     # Loop over items similar to this one
148 |     for (similarity,item2) in itemMatch[item]:
149 | 
150 |       # Ignore if this user has already rated this item
151 |       if item2 in userRatings: continue
152 |       # Weighted sum of rating times similarity
153 |       scores.setdefault(item2,0)
154 |       scores[item2]+=similarity*rating
155 |       # Sum of all the similarities
156 |       totalSim.setdefault(item2,0)
157 |       totalSim[item2]+=similarity
158 | 
159 |   # Divide each total score by total weighting to get an average
160 |   rankings=[(score/totalSim[item],item) for item,score in scores.items( )]
161 | 
162 |   # Return the rankings from highest to lowest
163 |   rankings.sort( )
164 |   rankings.reverse( )
165 |   return rankings
166 | 
167 | def loadMovieLens(path='/data/movielens'):
168 |   # Get movie titles
169 |   movies={}
170 |   for line in open(path+'/u.item'):
171 |     (id,title)=line.split('|')[0:2]
172 |     movies[id]=title
173 |   
174 |   # Load data
175 |   prefs={}
176 |   for line in open(path+'/u.data'):
177 |     (user,movieid,rating,ts)=line.split('\t')
178 |     prefs.setdefault(user,{})
179 |     prefs[user][movies[movieid]]=float(rating)
180 |   return prefs
181 | 


--------------------------------------------------------------------------------
/第03章 发现群组/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第03章 发现群组/Thumbs.db


--------------------------------------------------------------------------------
/第03章 发现群组/clusters.py:
--------------------------------------------------------------------------------
  1 | from PIL import Image,ImageDraw
  2 | 
  3 | def readfile(filename):
  4 |   lines=[line for line in file(filename)]
  5 |   
  6 |   # First line is the column titles
  7 |   colnames=lines[0].strip().split('\t')[1:]
  8 |   rownames=[]
  9 |   data=[]
 10 |   for line in lines[1:]:
 11 |     p=line.strip().split('\t')
 12 |     # First column in each row is the rowname
 13 |     rownames.append(p[0])
 14 |     # The data for this row is the remainder of the row
 15 |     data.append([float(x) for x in p[1:]])
 16 |   return rownames,colnames,data
 17 | 
 18 | 
 19 | from math import sqrt
 20 | 
 21 | def pearson(v1,v2):
 22 |   # Simple sums
 23 |   sum1=sum(v1)
 24 |   sum2=sum(v2)
 25 |   
 26 |   # Sums of the squares
 27 |   sum1Sq=sum([pow(v,2) for v in v1])
 28 |   sum2Sq=sum([pow(v,2) for v in v2])	
 29 |   
 30 |   # Sum of the products
 31 |   pSum=sum([v1[i]*v2[i] for i in range(len(v1))])
 32 |   
 33 |   # Calculate r (Pearson score)
 34 |   num=pSum-(sum1*sum2/len(v1))
 35 |   den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
 36 |   if den==0: return 0
 37 | 
 38 |   return 1.0-num/den
 39 | 
 40 | class bicluster:
 41 |   def __init__(self,vec,left=None,right=None,distance=0.0,id=None):
 42 |     self.left=left
 43 |     self.right=right
 44 |     self.vec=vec
 45 |     self.id=id
 46 |     self.distance=distance
 47 | 
 48 | def hcluster(rows,distance=pearson):
 49 |   distances={}
 50 |   currentclustid=-1
 51 | 
 52 |   # Clusters are initially just the rows
 53 |   clust=[bicluster(rows[i],id=i) for i in range(len(rows))]
 54 | 
 55 |   while len(clust)>1:
 56 |     lowestpair=(0,1)
 57 |     closest=distance(clust[0].vec,clust[1].vec)
 58 | 
 59 |     # loop through every pair looking for the smallest distance
 60 |     for i in range(len(clust)):
 61 |       for j in range(i+1,len(clust)):
 62 |         # distances is the cache of distance calculations
 63 |         if (clust[i].id,clust[j].id) not in distances: 
 64 |           distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec)
 65 | 
 66 |         d=distances[(clust[i].id,clust[j].id)]
 67 | 
 68 |         if d<closest:
 69 |           closest=d
 70 |           lowestpair=(i,j)
 71 | 
 72 |     # calculate the average of the two clusters
 73 |     mergevec=[
 74 |     (clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 
 75 |     for i in range(len(clust[0].vec))]
 76 | 
 77 |     # create the new cluster
 78 |     newcluster=bicluster(mergevec,left=clust[lowestpair[0]],
 79 |                          right=clust[lowestpair[1]],
 80 |                          distance=closest,id=currentclustid)
 81 | 
 82 |     # cluster ids that weren't in the original set are negative
 83 |     currentclustid-=1
 84 |     del clust[lowestpair[1]]
 85 |     del clust[lowestpair[0]]
 86 |     clust.append(newcluster)
 87 | 
 88 |   return clust[0]
 89 | 
 90 | def printclust(clust,labels=None,n=0):
 91 |   # indent to make a hierarchy layout
 92 |   for i in range(n): print ' ',
 93 |   if clust.id<0:
 94 |     # negative id means that this is branch
 95 |     print '-'
 96 |   else:
 97 |     # positive id means that this is an endpoint
 98 |     if labels==None: print clust.id
 99 |     else: print labels[clust.id]
100 | 
101 |   # now print the right and left branches
102 |   if clust.left!=None: printclust(clust.left,labels=labels,n=n+1)
103 |   if clust.right!=None: printclust(clust.right,labels=labels,n=n+1)
104 | 
105 | def getheight(clust):
106 |   # Is this an endpoint? Then the height is just 1
107 |   if clust.left==None and clust.right==None: return 1
108 | 
109 |   # Otherwise the height is the same of the heights of
110 |   # each branch
111 |   return getheight(clust.left)+getheight(clust.right)
112 | 
113 | def getdepth(clust):
114 |   # The distance of an endpoint is 0.0
115 |   if clust.left==None and clust.right==None: return 0
116 | 
117 |   # The distance of a branch is the greater of its two sides
118 |   # plus its own distance
119 |   return max(getdepth(clust.left),getdepth(clust.right))+clust.distance
120 | 
121 | 
122 | def drawdendrogram(clust,labels,jpeg='clusters.jpg'):
123 |   # height and width
124 |   h=getheight(clust)*20
125 |   w=1200
126 |   depth=getdepth(clust)
127 | 
128 |   # width is fixed, so scale distances accordingly
129 |   scaling=float(w-150)/depth
130 | 
131 |   # Create a new image with a white background
132 |   img=Image.new('RGB',(w,h),(255,255,255))
133 |   draw=ImageDraw.Draw(img)
134 | 
135 |   draw.line((0,h/2,10,h/2),fill=(255,0,0))    
136 | 
137 |   # Draw the first node
138 |   drawnode(draw,clust,10,(h/2),scaling,labels)
139 |   img.save(jpeg,'JPEG')
140 | 
141 | def drawnode(draw,clust,x,y,scaling,labels):
142 |   if clust.id<0:
143 |     h1=getheight(clust.left)*20
144 |     h2=getheight(clust.right)*20
145 |     top=y-(h1+h2)/2
146 |     bottom=y+(h1+h2)/2
147 |     # Line length
148 |     ll=clust.distance*scaling
149 |     # Vertical line from this cluster to children    
150 |     draw.line((x,top+h1/2,x,bottom-h2/2),fill=(255,0,0))    
151 |     
152 |     # Horizontal line to left item
153 |     draw.line((x,top+h1/2,x+ll,top+h1/2),fill=(255,0,0))    
154 | 
155 |     # Horizontal line to right item
156 |     draw.line((x,bottom-h2/2,x+ll,bottom-h2/2),fill=(255,0,0))        
157 | 
158 |     # Call the function to draw the left and right nodes    
159 |     drawnode(draw,clust.left,x+ll,top+h1/2,scaling,labels)
160 |     drawnode(draw,clust.right,x+ll,bottom-h2/2,scaling,labels)
161 |   else:   
162 |     # If this is an endpoint, draw the item label
163 |     draw.text((x+5,y-7),labels[clust.id],(0,0,0))
164 | 
165 | def rotatematrix(data):
166 |   newdata=[]
167 |   for i in range(len(data[0])):
168 |     newrow=[data[j][i] for j in range(len(data))]
169 |     newdata.append(newrow)
170 |   return newdata
171 | 
172 | import random
173 | 
174 | def kcluster(rows,distance=pearson,k=4):
175 |   # Determine the minimum and maximum values for each point
176 |   ranges=[(min([row[i] for row in rows]),max([row[i] for row in rows])) 
177 |   for i in range(len(rows[0]))]
178 | 
179 |   # Create k randomly placed centroids
180 |   clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] 
181 |   for i in range(len(rows[0]))] for j in range(k)]
182 |   
183 |   lastmatches=None
184 |   for t in range(100):
185 |     print 'Iteration %d' % t
186 |     bestmatches=[[] for i in range(k)]
187 |     
188 |     # Find which centroid is the closest for each row
189 |     for j in range(len(rows)):
190 |       row=rows[j]
191 |       bestmatch=0
192 |       for i in range(k):
193 |         d=distance(clusters[i],row)
194 |         if d<distance(clusters[bestmatch],row): bestmatch=i
195 |       bestmatches[bestmatch].append(j)
196 | 
197 |     # If the results are the same as last time, this is complete
198 |     if bestmatches==lastmatches: break
199 |     lastmatches=bestmatches
200 |     
201 |     # Move the centroids to the average of their members
202 |     for i in range(k):
203 |       avgs=[0.0]*len(rows[0])
204 |       if len(bestmatches[i])>0:
205 |         for rowid in bestmatches[i]:
206 |           for m in range(len(rows[rowid])):
207 |             avgs[m]+=rows[rowid][m]
208 |         for j in range(len(avgs)):
209 |           avgs[j]/=len(bestmatches[i])
210 |         clusters[i]=avgs
211 |       
212 |   return bestmatches
213 | 
214 | def tanamoto(v1,v2):
215 |   c1,c2,shr=0,0,0
216 |   
217 |   for i in range(len(v1)):
218 |     if v1[i]!=0: c1+=1 # in v1
219 |     if v2[i]!=0: c2+=1 # in v2
220 |     if v1[i]!=0 and v2[i]!=0: shr+=1 # in both
221 |   
222 |   return 1.0-(float(shr)/(c1+c2-shr))
223 | 
224 | def scaledown(data,distance=pearson,rate=0.01):
225 |   n=len(data)
226 | 
227 |   # The real distances between every pair of items
228 |   realdist=[[distance(data[i],data[j]) for j in range(n)] 
229 |              for i in range(0,n)]
230 | 
231 |   # Randomly initialize the starting points of the locations in 2D
232 |   loc=[[random.random(),random.random()] for i in range(n)]
233 |   fakedist=[[0.0 for j in range(n)] for i in range(n)]
234 |   
235 |   lasterror=None
236 |   for m in range(0,1000):
237 |     # Find projected distances
238 |     for i in range(n):
239 |       for j in range(n):
240 |         fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2) 
241 |                                  for x in range(len(loc[i]))]))
242 |   
243 |     # Move points
244 |     grad=[[0.0,0.0] for i in range(n)]
245 |     
246 |     totalerror=0
247 |     for k in range(n):
248 |       for j in range(n):
249 |         if j==k: continue
250 |         # The error is percent difference between the distances
251 |         errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k]
252 |         
253 |         # Each point needs to be moved away from or towards the other
254 |         # point in proportion to how much error it has
255 |         grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm
256 |         grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm
257 | 
258 |         # Keep track of the total error
259 |         totalerror+=abs(errorterm)
260 |     print totalerror
261 | 
262 |     # If the answer got worse by moving the points, we are done
263 |     if lasterror and lasterror<totalerror: break
264 |     lasterror=totalerror
265 |     
266 |     # Move each of the points by the learning rate times the gradient
267 |     for k in range(n):
268 |       loc[k][0]-=rate*grad[k][0]
269 |       loc[k][1]-=rate*grad[k][1]
270 | 
271 |   return loc
272 | 
273 | def draw2d(data,labels,jpeg='mds2d.jpg'):
274 |   img=Image.new('RGB',(2000,2000),(255,255,255))
275 |   draw=ImageDraw.Draw(img)
276 |   for i in range(len(data)):
277 |     x=(data[i][0]+0.5)*1000
278 |     y=(data[i][1]+0.5)*1000
279 |     draw.text((x,y),labels[i],(0,0,0))
280 |   img.save(jpeg,'JPEG')  
281 | 


--------------------------------------------------------------------------------
/第03章 发现群组/downloadzebodata.py:
--------------------------------------------------------------------------------
 1 | from BeautifulSoup import BeautifulSoup
 2 | import urllib2
 3 | import re
 4 | chare=re.compile(r'[!-\.&]')
 5 | itemowners={}
 6 | 
 7 | # Words to remove
 8 | dropwords=['a','new','some','more','my','own','the','many','other','another']
 9 | 
10 | currentuser=0
11 | for i in range(1,51):
12 |   # URL for the want search page
13 |   c=urllib2.urlopen(
14 |   'http://member.zebo.com/Main?event_key=USERSEARCH&wiowiw=wiw&keyword=car&page=%d'
15 |   % (i))
16 |   soup=BeautifulSoup(c.read())
17 |   for td in soup('td'):
18 |     # Find table cells of bgverdanasmall class
19 |     if ('class' in dict(td.attrs) and td['class']=='bgverdanasmall'):
20 |       items=[re.sub(chare,'',str(a.contents[0]).lower()).strip() for a in td('a')]
21 |       for item in items:
22 |         # Remove extra words
23 |         txt=' '.join([t for t in item.split(' ') if t not in dropwords])
24 |         if len(txt)<2: continue
25 |         itemowners.setdefault(txt,{})
26 |         itemowners[txt][currentuser]=1
27 |       currentuser+=1
28 |       
29 | out=file('zebo.txt','w')
30 | out.write('Item')
31 | for user in range(0,currentuser): out.write('\tU%d' % user)
32 | out.write('\n')
33 | for item,owners in itemowners.items():
34 |   if len(owners)>10:
35 |     out.write(item)
36 |     for user in range(0,currentuser):
37 |       if user in owners: out.write('\t1')
38 |       else: out.write('\t0')
39 |     out.write('\n')
40 | 


--------------------------------------------------------------------------------
/第03章 发现群组/feedlist.txt:
--------------------------------------------------------------------------------
 1 | http://feeds.feedburner.com/37signals/beMH
 2 | http://feeds.feedburner.com/blogspot/bRuz
 3 | http://battellemedia.com/index.xml
 4 | http://blog.guykawasaki.com/index.rdf
 5 | http://blog.outer-court.com/rss.xml
 6 | http://feeds.searchenginewatch.com/sewblog
 7 | http://blog.topix.net/index.rdf
 8 | http://blogs.abcnews.com/theblotter/index.rdf
 9 | http://feeds.feedburner.com/ConsumingExperienceFull
10 | http://flagrantdisregard.com/index.php/feed/
11 | http://featured.gigaom.com/feed/
12 | http://gizmodo.com/index.xml
13 | http://gofugyourself.typepad.com/go_fug_yourself/index.rdf
14 | http://googleblog.blogspot.com/rss.xml
15 | http://feeds.feedburner.com/GoogleOperatingSystem
16 | http://headrush.typepad.com/creating_passionate_users/index.rdf
17 | http://feeds.feedburner.com/instapundit/main
18 | http://jeremy.zawodny.com/blog/rss2.xml
19 | http://joi.ito.com/index.rdf
20 | http://feeds.feedburner.com/Mashable
21 | http://michellemalkin.com/index.rdf
22 | http://moblogsmoproblems.blogspot.com/rss.xml
23 | http://newsbusters.org/node/feed
24 | http://beta.blogger.com/feeds/27154654/posts/full?alt=rss
25 | http://feeds.feedburner.com/paulstamatiou
26 | http://powerlineblog.com/index.rdf
27 | http://feeds.feedburner.com/Publishing20
28 | http://radar.oreilly.com/index.rdf
29 | http://scienceblogs.com/pharyngula/index.xml
30 | http://scobleizer.wordpress.com/feed/
31 | http://sethgodin.typepad.com/seths_blog/index.rdf
32 | http://rss.slashdot.org/Slashdot/slashdot
33 | http://thinkprogress.org/feed/
34 | http://feeds.feedburner.com/andrewsullivan/rApM
35 | http://wilwheaton.typepad.com/wwdnbackup/index.rdf
36 | http://www.43folders.com/feed/
37 | http://www.456bereastreet.com/feed.xml
38 | http://www.autoblog.com/rss.xml
39 | http://www.bloggersblog.com/rss.xml
40 | http://www.bloglines.com/rss/about/news
41 | http://www.blogmaverick.com/rss.xml
42 | http://www.boingboing.net/index.rdf
43 | http://www.buzzmachine.com/index.xml
44 | http://www.captainsquartersblog.com/mt/index.rdf
45 | http://www.coolhunting.com/index.rdf
46 | http://feeds.copyblogger.com/Copyblogger
47 | http://feeds.feedburner.com/crooksandliars/YaCP
48 | http://feeds.dailykos.com/dailykos/index.xml
49 | http://www.deadspin.com/index.xml
50 | http://www.downloadsquad.com/rss.xml
51 | http://www.engadget.com/rss.xml
52 | http://www.gapingvoid.com/index.rdf
53 | http://www.gawker.com/index.xml
54 | http://www.gothamist.com/index.rdf
55 | http://www.huffingtonpost.com/raw_feed_index.rdf
56 | http://www.hyperorg.com/blogger/index.rdf
57 | http://www.joelonsoftware.com/rss.xml
58 | http://www.joystiq.com/rss.xml
59 | http://www.kotaku.com/index.xml
60 | http://feeds.kottke.org/main
61 | http://www.lifehack.org/feed/
62 | http://www.lifehacker.com/index.xml
63 | http://littlegreenfootballs.com/weblog/lgf-rss.php
64 | http://www.makezine.com/blog/index.xml
65 | http://www.mattcutts.com/blog/feed/
66 | http://xml.metafilter.com/rss.xml
67 | http://www.mezzoblue.com/rss/index.xml
68 | http://www.micropersuasion.com/index.rdf
69 | http://www.neilgaiman.com/journal/feed/rss.xml
70 | http://www.oilman.ca/feed/
71 | http://www.perezhilton.com/index.xml
72 | http://www.plasticbag.org/index.rdf
73 | http://www.powazek.com/rss.xml
74 | http://www.problogger.net/feed/
75 | http://feeds.feedburner.com/QuickOnlineTips
76 | http://www.readwriteweb.com/rss.xml
77 | http://www.schneier.com/blog/index.rdf
78 | http://scienceblogs.com/sample/combined.xml
79 | http://www.seroundtable.com/index.rdf
80 | http://www.shoemoney.com/feed/
81 | http://www.sifry.com/alerts/index.rdf
82 | http://www.simplebits.com/xml/rss.xml
83 | http://feeds.feedburner.com/Spikedhumor
84 | http://www.stevepavlina.com/blog/feed
85 | http://www.talkingpointsmemo.com/index.xml
86 | http://www.tbray.org/ongoing/ongoing.rss
87 | http://feeds.feedburner.com/TechCrunch
88 | http://www.techdirt.com/techdirt_rss.xml
89 | http://www.techeblog.com/index.php/feed/
90 | http://www.thesuperficial.com/index.xml
91 | http://www.tmz.com/rss.xml
92 | http://www.treehugger.com/index.rdf
93 | http://www.tuaw.com/rss.xml
94 | http://www.valleywag.com/index.xml
95 | http://www.we-make-money-not-art.com/index.rdf
96 | http://www.wired.com/rss/index.xml
97 | http://www.wonkette.com/index.xml
98 | 


--------------------------------------------------------------------------------
/第03章 发现群组/generatefeedvector.py:
--------------------------------------------------------------------------------
 1 | import feedparser
 2 | import re
 3 | 
 4 | # Returns title and dictionary of word counts for an RSS feed
 5 | def getwordcounts(url):
 6 |   # Parse the feed
 7 |   d=feedparser.parse(url)
 8 |   wc={}
 9 | 
10 |   # Loop over all the entries
11 |   for e in d.entries:
12 |     if 'summary' in e: summary=e.summary
13 |     else: summary=e.description
14 | 
15 |     # Extract a list of words
16 |     words=getwords(e.title+' '+summary)
17 |     for word in words:
18 |       wc.setdefault(word,0)
19 |       wc[word]+=1
20 |   return d.feed.title,wc
21 | 
22 | def getwords(html):
23 |   # Remove all the HTML tags
24 |   txt=re.compile(r'<[^>]+>').sub('',html)
25 | 
26 |   # Split words by all non-alpha characters
27 |   words=re.compile(r'[^A-Z^a-z]+').split(txt)
28 | 
29 |   # Convert to lowercase
30 |   return [word.lower() for word in words if word!='']
31 | 
32 | 
33 | apcount={}
34 | wordcounts={}
35 | feedlist=[line for line in file('feedlist.txt')]
36 | for feedurl in feedlist:
37 |   try:
38 |     title,wc=getwordcounts(feedurl)
39 |     wordcounts[title]=wc
40 |     for word,count in wc.items():
41 |       apcount.setdefault(word,0)
42 |       if count>1:
43 |         apcount[word]+=1
44 |   except:
45 |     print 'Failed to parse feed %s' % feedurl
46 | 
47 | wordlist=[]
48 | for w,bc in apcount.items():
49 |   frac=float(bc)/len(feedlist)
50 |   if frac>0.1 and frac<0.5:
51 |     wordlist.append(w)
52 | 
53 | out=file('blogdata1.txt','w')
54 | out.write('Blog')
55 | for word in wordlist: out.write('\t%s' % word)
56 | out.write('\n')
57 | for blog,wc in wordcounts.items():
58 |   print blog
59 |   out.write(blog)
60 |   for word in wordlist:
61 |     if word in wc: out.write('\t%d' % wc[word])
62 |     else: out.write('\t0')
63 |   out.write('\n')
64 | 


--------------------------------------------------------------------------------
/第04章 搜索与排名/nn.py:
--------------------------------------------------------------------------------
  1 | from math import tanh
  2 | from pysqlite2 import dbapi2 as sqlite
  3 | 
  4 | def dtanh(y):
  5 |     return 1.0-y*y
  6 | 
  7 | class searchnet:
  8 |     def __init__(self,dbname):
  9 |       self.con=sqlite.connect(dbname)
 10 |   
 11 |     def __del__(self):
 12 |       self.con.close()
 13 | 
 14 |     def maketables(self):
 15 |       self.con.execute('create table hiddennode(create_key)')
 16 |       self.con.execute('create table wordhidden(fromid,toid,strength)')
 17 |       self.con.execute('create table hiddenurl(fromid,toid,strength)')
 18 |       self.con.commit()
 19 | 
 20 |     def getstrength(self,fromid,toid,layer):
 21 |       if layer==0: table='wordhidden'
 22 |       else: table='hiddenurl'
 23 |       res=self.con.execute('select strength from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchone()
 24 |       if res==None: 
 25 |           if layer==0: return -0.2
 26 |           if layer==1: return 0
 27 |       return res[0]
 28 | 
 29 |     def setstrength(self,fromid,toid,layer,strength):
 30 |       if layer==0: table='wordhidden'
 31 |       else: table='hiddenurl'
 32 |       res=self.con.execute('select rowid from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchone()
 33 |       if res==None: 
 34 |         self.con.execute('insert into %s (fromid,toid,strength) values (%d,%d,%f)' % (table,fromid,toid,strength))
 35 |       else:
 36 |         rowid=res[0]
 37 |         self.con.execute('update %s set strength=%f where rowid=%d' % (table,strength,rowid))
 38 | 
 39 |     def generatehiddennode(self,wordids,urls):
 40 |       if len(wordids)>3: return None
 41 |       # Check if we already created a node for this set of words
 42 |       sorted_words=[str(id) for id in wordids]
 43 |       sorted_words.sort()
 44 |       createkey='_'.join(sorted_words)
 45 |       res=self.con.execute(
 46 |       "select rowid from hiddennode where create_key='%s'" % createkey).fetchone()
 47 | 
 48 |       # If not, create it
 49 |       if res==None:
 50 |         cur=self.con.execute(
 51 |         "insert into hiddennode (create_key) values ('%s')" % createkey)
 52 |         hiddenid=cur.lastrowid
 53 |         # Put in some default weights
 54 |         for wordid in wordids:
 55 |           self.setstrength(wordid,hiddenid,0,1.0/len(wordids))
 56 |         for urlid in urls:
 57 |           self.setstrength(hiddenid,urlid,1,0.1)
 58 |         self.con.commit()
 59 | 
 60 |     def getallhiddenids(self,wordids,urlids):
 61 |       l1={}
 62 |       for wordid in wordids:
 63 |         cur=self.con.execute(
 64 |         'select toid from wordhidden where fromid=%d' % wordid)
 65 |         for row in cur: l1[row[0]]=1
 66 |       for urlid in urlids:
 67 |         cur=self.con.execute(
 68 |         'select fromid from hiddenurl where toid=%d' % urlid)
 69 |         for row in cur: l1[row[0]]=1
 70 |       return l1.keys()
 71 | 
 72 |     def setupnetwork(self,wordids,urlids):
 73 |         # value lists
 74 |         self.wordids=wordids
 75 |         self.hiddenids=self.getallhiddenids(wordids,urlids)
 76 |         self.urlids=urlids
 77 |  
 78 |         # node outputs
 79 |         self.ai = [1.0]*len(self.wordids)
 80 |         self.ah = [1.0]*len(self.hiddenids)
 81 |         self.ao = [1.0]*len(self.urlids)
 82 |         
 83 |         # create weights matrix
 84 |         self.wi = [[self.getstrength(wordid,hiddenid,0) 
 85 |                     for hiddenid in self.hiddenids] 
 86 |                    for wordid in self.wordids]
 87 |         self.wo = [[self.getstrength(hiddenid,urlid,1) 
 88 |                     for urlid in self.urlids] 
 89 |                    for hiddenid in self.hiddenids]
 90 | 
 91 |     def feedforward(self):
 92 |         # the only inputs are the query words
 93 |         for i in range(len(self.wordids)):
 94 |             self.ai[i] = 1.0
 95 | 
 96 |         # hidden activations
 97 |         for j in range(len(self.hiddenids)):
 98 |             sum = 0.0
 99 |             for i in range(len(self.wordids)):
100 |                 sum = sum + self.ai[i] * self.wi[i][j]
101 |             self.ah[j] = tanh(sum)
102 | 
103 |         # output activations
104 |         for k in range(len(self.urlids)):
105 |             sum = 0.0
106 |             for j in range(len(self.hiddenids)):
107 |                 sum = sum + self.ah[j] * self.wo[j][k]
108 |             self.ao[k] = tanh(sum)
109 | 
110 |         return self.ao[:]
111 | 
112 |     def getresult(self,wordids,urlids):
113 |       self.setupnetwork(wordids,urlids)
114 |       return self.feedforward()
115 | 
116 |     def backPropagate(self, targets, N=0.5):
117 |         # calculate errors for output
118 |         output_deltas = [0.0] * len(self.urlids)
119 |         for k in range(len(self.urlids)):
120 |             error = targets[k]-self.ao[k]
121 |             output_deltas[k] = dtanh(self.ao[k]) * error
122 | 
123 |         # calculate errors for hidden layer
124 |         hidden_deltas = [0.0] * len(self.hiddenids)
125 |         for j in range(len(self.hiddenids)):
126 |             error = 0.0
127 |             for k in range(len(self.urlids)):
128 |                 error = error + output_deltas[k]*self.wo[j][k]
129 |             hidden_deltas[j] = dtanh(self.ah[j]) * error
130 | 
131 |         # update output weights
132 |         for j in range(len(self.hiddenids)):
133 |             for k in range(len(self.urlids)):
134 |                 change = output_deltas[k]*self.ah[j]
135 |                 self.wo[j][k] = self.wo[j][k] + N*change
136 | 
137 |         # update input weights
138 |         for i in range(len(self.wordids)):
139 |             for j in range(len(self.hiddenids)):
140 |                 change = hidden_deltas[j]*self.ai[i]
141 |                 self.wi[i][j] = self.wi[i][j] + N*change
142 | 
143 |     def trainquery(self,wordids,urlids,selectedurl): 
144 |       # generate a hidden node if necessary
145 |       self.generatehiddennode(wordids,urlids)
146 | 
147 |       self.setupnetwork(wordids,urlids)      
148 |       self.feedforward()
149 |       targets=[0.0]*len(urlids)
150 |       targets[urlids.index(selectedurl)]=1.0
151 |       error = self.backPropagate(targets)
152 |       self.updatedatabase()
153 | 
154 |     def updatedatabase(self):
155 |       # set them to database values
156 |       for i in range(len(self.wordids)):
157 |           for j in range(len(self.hiddenids)):
158 |               self.setstrength(self.wordids[i],self. hiddenids[j],0,self.wi[i][j])
159 |       for j in range(len(self.hiddenids)):
160 |           for k in range(len(self.urlids)):
161 |               self.setstrength(self.hiddenids[j],self.urlids[k],1,self.wo[j][k])
162 |       self.con.commit()
163 | 


--------------------------------------------------------------------------------
/第04章 搜索与排名/searchengine.py:
--------------------------------------------------------------------------------
  1 | import urllib2
  2 | from BeautifulSoup import *
  3 | from urlparse import urljoin
  4 | from pysqlite2 import dbapi2 as sqlite
  5 | import nn
  6 | mynet=nn.searchnet('nn.db')
  7 | 
  8 | # Create a list of words to ignore
  9 | ignorewords={'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1}
 10 | 
 11 | 
 12 | class crawler:
 13 |   # Initialize the crawler with the name of database
 14 |   def __init__(self,dbname):
 15 |     self.con=sqlite.connect(dbname)
 16 |   
 17 |   def __del__(self):
 18 |     self.con.close()
 19 | 
 20 |   def dbcommit(self):
 21 |     self.con.commit()
 22 | 
 23 |   # Auxilliary function for getting an entry id and adding 
 24 |   # it if it's not present
 25 |   def getentryid(self,table,field,value,createnew=True):
 26 |     cur=self.con.execute(
 27 |     "select rowid from %s where %s='%s'" % (table,field,value))
 28 |     res=cur.fetchone()
 29 |     if res==None:
 30 |       cur=self.con.execute(
 31 |       "insert into %s (%s) values ('%s')" % (table,field,value))
 32 |       return cur.lastrowid
 33 |     else:
 34 |       return res[0] 
 35 | 
 36 | 
 37 |   # Index an individual page
 38 |   def addtoindex(self,url,soup):
 39 |     if self.isindexed(url): return
 40 |     print 'Indexing '+url
 41 |   
 42 |     # Get the individual words
 43 |     text=self.gettextonly(soup)
 44 |     words=self.separatewords(text)
 45 |     
 46 |     # Get the URL id
 47 |     urlid=self.getentryid('urllist','url',url)
 48 |     
 49 |     # Link each word to this url
 50 |     for i in range(len(words)):
 51 |       word=words[i]
 52 |       if word in ignorewords: continue
 53 |       wordid=self.getentryid('wordlist','word',word)
 54 |       self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))
 55 |   
 56 | 
 57 |   
 58 |   # Extract the text from an HTML page (no tags)
 59 |   def gettextonly(self,soup):
 60 |     v=soup.string
 61 |     if v==Null:   
 62 |       c=soup.contents
 63 |       resulttext=''
 64 |       for t in c:
 65 |         subtext=self.gettextonly(t)
 66 |         resulttext+=subtext+'\n'
 67 |       return resulttext
 68 |     else:
 69 |       return v.strip()
 70 | 
 71 |   # Seperate the words by any non-whitespace character
 72 |   def separatewords(self,text):
 73 |     splitter=re.compile('\\W*')
 74 |     return [s.lower() for s in splitter.split(text) if s!='']
 75 | 
 76 |     
 77 |   # Return true if this url is already indexed
 78 |   def isindexed(self,url):
 79 |     return False
 80 |   
 81 |   # Add a link between two pages
 82 |   def addlinkref(self,urlFrom,urlTo,linkText):
 83 |     words=self.separateWords(linkText)
 84 |     fromid=self.getentryid('urllist','url',urlFrom)
 85 |     toid=self.getentryid('urllist','url',urlTo)
 86 |     if fromid==toid: return
 87 |     cur=self.con.execute("insert into link(fromid,toid) values (%d,%d)" % (fromid,toid))
 88 |     linkid=cur.lastrowid
 89 |     for word in words:
 90 |       if word in ignorewords: continue
 91 |       wordid=self.getentryid('wordlist','word',word)
 92 |       self.con.execute("insert into linkwords(linkid,wordid) values (%d,%d)" % (linkid,wordid))
 93 | 
 94 |   # Starting with a list of pages, do a breadth
 95 |   # first search to the given depth, indexing pages
 96 |   # as we go
 97 |   def crawl(self,pages,depth=2):
 98 |     for i in range(depth):
 99 |       newpages={}
100 |       for page in pages:
101 |         try:
102 |           c=urllib2.urlopen(page)
103 |         except:
104 |           print "Could not open %s" % page
105 |           continue
106 |         try:
107 |           soup=BeautifulSoup(c.read())
108 |           self.addtoindex(page,soup)
109 |   
110 |           links=soup('a')
111 |           for link in links:
112 |             if ('href' in dict(link.attrs)):
113 |               url=urljoin(page,link['href'])
114 |               if url.find("'")!=-1: continue
115 |               url=url.split('#')[0]  # remove location portion
116 |               if url[0:4]=='http' and not self.isindexed(url):
117 |                 newpages[url]=1
118 |               linkText=self.gettextonly(link)
119 |               self.addlinkref(page,url,linkText)
120 |   
121 |           self.dbcommit()
122 |         except:
123 |           print "Could not parse page %s" % page
124 | 
125 |       pages=newpages
126 | 
127 |   
128 |   # Create the database tables
129 |   def createindextables(self): 
130 |     self.con.execute('create table urllist(url)')
131 |     self.con.execute('create table wordlist(word)')
132 |     self.con.execute('create table wordlocation(urlid,wordid,location)')
133 |     self.con.execute('create table link(fromid integer,toid integer)')
134 |     self.con.execute('create table linkwords(wordid,linkid)')
135 |     self.con.execute('create index wordidx on wordlist(word)')
136 |     self.con.execute('create index urlidx on urllist(url)')
137 |     self.con.execute('create index wordurlidx on wordlocation(wordid)')
138 |     self.con.execute('create index urltoidx on link(toid)')
139 |     self.con.execute('create index urlfromidx on link(fromid)')
140 |     self.dbcommit()
141 | 
142 |   def calculatepagerank(self,iterations=20):
143 |     # clear out the current page rank tables
144 |     self.con.execute('drop table if exists pagerank')
145 |     self.con.execute('create table pagerank(urlid primary key,score)')
146 |     
147 |     # initialize every url with a page rank of 1
148 |     for (urlid,) in self.con.execute('select rowid from urllist'):
149 |       self.con.execute('insert into pagerank(urlid,score) values (%d,1.0)' % urlid)
150 |     self.dbcommit()
151 |     
152 |     for i in range(iterations):
153 |       print "Iteration %d" % (i)
154 |       for (urlid,) in self.con.execute('select rowid from urllist'):
155 |         pr=0.15
156 |         
157 |         # Loop through all the pages that link to this one
158 |         for (linker,) in self.con.execute(
159 |         'select distinct fromid from link where toid=%d' % urlid):
160 |           # Get the page rank of the linker
161 |           linkingpr=self.con.execute(
162 |           'select score from pagerank where urlid=%d' % linker).fetchone()[0]
163 | 
164 |           # Get the total number of links from the linker
165 |           linkingcount=self.con.execute(
166 |           'select count(*) from link where fromid=%d' % linker).fetchone()[0]
167 |           pr+=0.85*(linkingpr/linkingcount)
168 |         self.con.execute(
169 |         'update pagerank set score=%f where urlid=%d' % (pr,urlid))
170 |       self.dbcommit()
171 | 
172 | class searcher:
173 |   def __init__(self,dbname):
174 |     self.con=sqlite.connect(dbname)
175 | 
176 |   def __del__(self):
177 |     self.con.close()
178 | 
179 |   def getmatchrows(self,q):
180 |     # Strings to build the query
181 |     fieldlist='w0.urlid'
182 |     tablelist=''  
183 |     clauselist=''
184 |     wordids=[]
185 | 
186 |     # Split the words by spaces
187 |     words=q.split(' ')  
188 |     tablenumber=0
189 | 
190 |     for word in words:
191 |       # Get the word ID
192 |       wordrow=self.con.execute(
193 |       "select rowid from wordlist where word='%s'" % word).fetchone()
194 |       if wordrow!=None:
195 |         wordid=wordrow[0]
196 |         wordids.append(wordid)
197 |         if tablenumber>0:
198 |           tablelist+=','
199 |           clauselist+=' and '
200 |           clauselist+='w%d.urlid=w%d.urlid and ' % (tablenumber-1,tablenumber)
201 |         fieldlist+=',w%d.location' % tablenumber
202 |         tablelist+='wordlocation w%d' % tablenumber      
203 |         clauselist+='w%d.wordid=%d' % (tablenumber,wordid)
204 |         tablenumber+=1
205 | 
206 |     # Create the query from the separate parts
207 |     fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist)
208 |     print fullquery
209 |     cur=self.con.execute(fullquery)
210 |     rows=[row for row in cur]
211 | 
212 |     return rows,wordids
213 | 
214 |   def getscoredlist(self,rows,wordids):
215 |     totalscores=dict([(row[0],0) for row in rows])
216 | 
217 |     # This is where we'll put our scoring functions
218 |     weights=[(1.0,self.locationscore(rows)), 
219 |              (1.0,self.frequencyscore(rows)),
220 |              (1.0,self.pagerankscore(rows)),
221 |              (1.0,self.linktextscore(rows,wordids)),
222 |              (5.0,self.nnscore(rows,wordids))]
223 |     for (weight,scores) in weights:
224 |       for url in totalscores:
225 |         totalscores[url]+=weight*scores[url]
226 | 
227 |     return totalscores
228 | 
229 |   def geturlname(self,id):
230 |     return self.con.execute(
231 |     "select url from urllist where rowid=%d" % id).fetchone()[0]
232 | 
233 |   def query(self,q):
234 |     rows,wordids=self.getmatchrows(q)
235 |     scores=self.getscoredlist(rows,wordids)
236 |     rankedscores=[(score,url) for (url,score) in scores.items()]
237 |     rankedscores.sort()
238 |     rankedscores.reverse()
239 |     for (score,urlid) in rankedscores[0:10]:
240 |       print '%f\t%s' % (score,self.geturlname(urlid))
241 |     return wordids,[r[1] for r in rankedscores[0:10]]
242 | 
243 |   def normalizescores(self,scores,smallIsBetter=0):
244 |     vsmall=0.00001 # Avoid division by zero errors
245 |     if smallIsBetter:
246 |       minscore=min(scores.values())
247 |       return dict([(u,float(minscore)/max(vsmall,l)) for (u,l) in scores.items()])
248 |     else:
249 |       maxscore=max(scores.values())
250 |       if maxscore==0: maxscore=vsmall
251 |       return dict([(u,float(c)/maxscore) for (u,c) in scores.items()])
252 | 
253 |   def frequencyscore(self,rows):
254 |     counts=dict([(row[0],0) for row in rows])
255 |     for row in rows: counts[row[0]]+=1
256 |     return self.normalizescores(counts)
257 | 
258 |   def locationscore(self,rows):
259 |     locations=dict([(row[0],1000000) for row in rows])
260 |     for row in rows:
261 |       loc=sum(row[1:])
262 |       if loc<locations[row[0]]: locations[row[0]]=loc
263 |     
264 |     return self.normalizescores(locations,smallIsBetter=1)
265 | 
266 |   def distancescore(self,rows):
267 |     # If there's only one word, everyone wins!
268 |     if len(rows[0])<=2: return dict([(row[0],1.0) for row in rows])
269 | 
270 |     # Initialize the dictionary with large values
271 |     mindistance=dict([(row[0],1000000) for row in rows])
272 | 
273 |     for row in rows:
274 |       dist=sum([abs(row[i]-row[i-1]) for i in range(2,len(row))])
275 |       if dist<mindistance[row[0]]: mindistance[row[0]]=dist
276 |     return self.normalizescores(mindistance,smallIsBetter=1)
277 | 
278 |   def inboundlinkscore(self,rows):
279 |     uniqueurls=dict([(row[0],1) for row in rows])
280 |     inboundcount=dict([(u,self.con.execute('select count(*) from link where toid=%d' % u).fetchone()[0]) for u in uniqueurls])   
281 |     return self.normalizescores(inboundcount)
282 | 
283 |   def linktextscore(self,rows,wordids):
284 |     linkscores=dict([(row[0],0) for row in rows])
285 |     for wordid in wordids:
286 |       cur=self.con.execute('select link.fromid,link.toid from linkwords,link where wordid=%d and linkwords.linkid=link.rowid' % wordid)
287 |       for (fromid,toid) in cur:
288 |         if toid in linkscores:
289 |           pr=self.con.execute('select score from pagerank where urlid=%d' % fromid).fetchone()[0]
290 |           linkscores[toid]+=pr
291 |     maxscore=max(linkscores.values())
292 |     normalizedscores=dict([(u,float(l)/maxscore) for (u,l) in linkscores.items()])
293 |     return normalizedscores
294 | 
295 |   def pagerankscore(self,rows):
296 |     pageranks=dict([(row[0],self.con.execute('select score from pagerank where urlid=%d' % row[0]).fetchone()[0]) for row in rows])
297 |     maxrank=max(pageranks.values())
298 |     normalizedscores=dict([(u,float(l)/maxrank) for (u,l) in pageranks.items()])
299 |     return normalizedscores
300 | 
301 |   def nnscore(self,rows,wordids):
302 |     # Get unique URL IDs as an ordered list
303 |     urlids=[urlid for urlid in dict([(row[0],1) for row in rows])]
304 |     nnres=mynet.getresult(wordids,urlids)
305 |     scores=dict([(urlids[i],nnres[i]) for i in range(len(urlids))])
306 |     return self.normalizescores(scores)
307 | 


--------------------------------------------------------------------------------
/第05章 优化/dorm.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import math
 3 | 
 4 | # The dorms, each of which has two available spaces
 5 | dorms=['Zeus','Athena','Hercules','Bacchus','Pluto']
 6 | 
 7 | # People, along with their first and second choices
 8 | prefs=[('Toby', ('Bacchus', 'Hercules')),
 9 |        ('Steve', ('Zeus', 'Pluto')),
10 |        ('Karen', ('Athena', 'Zeus')),
11 |        ('Sarah', ('Zeus', 'Pluto')),
12 |        ('Dave', ('Athena', 'Bacchus')), 
13 |        ('Jeff', ('Hercules', 'Pluto')), 
14 |        ('Fred', ('Pluto', 'Athena')), 
15 |        ('Suzie', ('Bacchus', 'Hercules')), 
16 |        ('Laura', ('Bacchus', 'Hercules')), 
17 |        ('James', ('Hercules', 'Athena'))]
18 | 
19 | # [(0,9),(0,8),(0,7),(0,6),...,(0,0)]
20 | domain=[(0,(len(dorms)*2)-i-1) for i in range(0,len(dorms)*2)]
21 | 
22 | def printsolution(vec):
23 |   slots=[]
24 |   # Create two slots for each dorm
25 |   for i in range(len(dorms)): slots+=[i,i]
26 | 
27 |   # Loop over each students assignment
28 |   for i in range(len(vec)):
29 |     x=int(vec[i])
30 | 
31 |     # Choose the slot from the remaining ones
32 |     dorm=dorms[slots[x]]
33 |     # Show the student and assigned dorm
34 |     print prefs[i][0],dorm
35 |     # Remove this slot
36 |     del slots[x]
37 | 
38 | def dormcost(vec):
39 |   cost=0
40 |   # Create list a of slots
41 |   slots=[0,0,1,1,2,2,3,3,4,4]
42 | 
43 |   # Loop over each student
44 |   for i in range(len(vec)):
45 |     x=int(vec[i])
46 |     dorm=dorms[slots[x]]
47 |     pref=prefs[i][1]
48 |     # First choice costs 0, second choice costs 1
49 |     if pref[0]==dorm: cost+=0
50 |     elif pref[1]==dorm: cost+=1
51 |     else: cost+=3
52 |     # Not on the list costs 3
53 | 
54 |     # Remove selected slot
55 |     del slots[x]
56 |     
57 |   return cost
58 | 


--------------------------------------------------------------------------------
/第05章 优化/kayak.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import urllib2
 3 | import xml.dom.minidom
 4 | 
 5 | kayakkey='YOUR KEY HERE'
 6 | 
 7 | def getkayaksession():
 8 |   # Construct the URL to start a session
 9 |   url='http://www.kayak.com/k/ident/apisession?token=%s&version=1' % kayakkey
10 |   
11 |   # Parse the resulting XML
12 |   doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read())
13 |   
14 |   # Find <sid>xxxxxxxx</sid>
15 |   sid=doc.getElementsByTagName('sid')[0].firstChild.data
16 |   return sid
17 | 
18 | def flightsearch(sid,origin,destination,depart_date):
19 |   
20 |   # Construct search URL
21 |   url='http://www.kayak.com/s/apisearch?basicmode=true&oneway=y&origin=%s' % origin
22 |   url+='&destination=%s&depart_date=%s' % (destination,depart_date)
23 |   url+='&return_date=none&depart_time=a&return_time=a'
24 |   url+='&travelers=1&cabin=e&action=doFlights&apimode=1'
25 |   url+='&_sid_=%s&version=1' % (sid)
26 | 
27 |   # Get the XML
28 |   doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read())
29 | 
30 |   # Extract the search ID
31 |   searchid=doc.getElementsByTagName('searchid')[0].firstChild.data
32 | 
33 |   return searchid
34 | 
35 | def flightsearchresults(sid,searchid):
36 |   def parseprice(p): 
37 |     return float(p[1:].replace(',',''))
38 | 
39 |   # Polling loop
40 |   while 1:
41 |     time.sleep(2)
42 | 
43 |     # Construct URL for polling
44 |     url='http://www.kayak.com/s/basic/flight?'
45 |     url+='searchid=%s&c=5&apimode=1&_sid_=%s&version=1' % (searchid,sid)
46 |     doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read())
47 | 
48 |     # Look for morepending tag, and wait until it is no longer true
49 |     morepending=doc.getElementsByTagName('morepending')[0].firstChild
50 |     if morepending==None or morepending.data=='false': break
51 | 
52 |   # Now download the complete list
53 |   url='http://www.kayak.com/s/basic/flight?'
54 |   url+='searchid=%s&c=999&apimode=1&_sid_=%s&version=1' % (searchid,sid)
55 |   doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read())
56 | 
57 |   # Get the various elements as lists
58 |   prices=doc.getElementsByTagName('price')
59 |   departures=doc.getElementsByTagName('depart')
60 |   arrivals=doc.getElementsByTagName('arrive')  
61 | 
62 |   # Zip them together
63 |   return zip([p.firstChild.data.split(' ')[1] for p in departures],
64 |              [p.firstChild.data.split(' ')[1] for p in arrivals],
65 |              [parseprice(p.firstChild.data) for p in prices])
66 | 
67 | 
68 | def createschedule(people,dest,dep,ret):
69 |   # Get a session id for these searches
70 |   sid=getkayaksession()
71 |   flights={}
72 |   
73 |   for p in people:
74 |     name,origin=p
75 |     # Outbound flight
76 |     searchid=flightsearch(sid,origin,dest,dep)
77 |     flights[(origin,dest)]=flightsearchresults(sid,searchid)
78 |     
79 |     # Return flight
80 |     searchid=flightsearch(sid,dest,origin,ret)
81 |     flights[(dest,origin)]=flightsearchresults(sid,searchid)
82 |     
83 |   return flights
84 | 


--------------------------------------------------------------------------------
/第05章 优化/optimization.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import random
  3 | import math
  4 | 
  5 | people = [('Seymour','BOS'),
  6 |           ('Franny','DAL'),
  7 |           ('Zooey','CAK'),
  8 |           ('Walt','MIA'),
  9 |           ('Buddy','ORD'),
 10 |           ('Les','OMA')]
 11 | # Laguardia
 12 | destination='LGA'
 13 | 
 14 | flights={}
 15 | # 
 16 | for line in file('schedule.txt'):
 17 |   origin,dest,depart,arrive,price=line.strip().split(',')
 18 |   flights.setdefault((origin,dest),[])
 19 | 
 20 |   # Add details to the list of possible flights
 21 |   flights[(origin,dest)].append((depart,arrive,int(price)))
 22 | 
 23 | def getminutes(t):
 24 |   x=time.strptime(t,'%H:%M')
 25 |   return x[3]*60+x[4]
 26 | 
 27 | def printschedule(r):
 28 |   for d in range(len(r)/2):
 29 |     name=people[d][0]
 30 |     origin=people[d][1]
 31 |     out=flights[(origin,destination)][int(r[d])]
 32 |     ret=flights[(destination,origin)][int(r[d+1])]
 33 |     print '%10s%10s %5s-%5s $%3s %5s-%5s $%3s' % (name,origin,
 34 |                                                   out[0],out[1],out[2],
 35 |                                                   ret[0],ret[1],ret[2])
 36 | 
 37 | def schedulecost(sol):
 38 |   totalprice=0
 39 |   latestarrival=0
 40 |   earliestdep=24*60
 41 | 
 42 |   for d in range(len(sol)/2):
 43 |     # Get the inbound and outbound flights
 44 |     origin=people[d][1]
 45 |     outbound=flights[(origin,destination)][int(sol[d])]
 46 |     returnf=flights[(destination,origin)][int(sol[d+1])]
 47 |     
 48 |     # Total price is the price of all outbound and return flights
 49 |     totalprice+=outbound[2]
 50 |     totalprice+=returnf[2]
 51 |     
 52 |     # Track the latest arrival and earliest departure
 53 |     if latestarrival<getminutes(outbound[1]): latestarrival=getminutes(outbound[1])
 54 |     if earliestdep>getminutes(returnf[0]): earliestdep=getminutes(returnf[0])
 55 |   
 56 |   # Every person must wait at the airport until the latest person arrives.
 57 |   # They also must arrive at the same time and wait for their flights.
 58 |   totalwait=0  
 59 |   for d in range(len(sol)/2):
 60 |     origin=people[d][1]
 61 |     outbound=flights[(origin,destination)][int(sol[d])]
 62 |     returnf=flights[(destination,origin)][int(sol[d+1])]
 63 |     totalwait+=latestarrival-getminutes(outbound[1])
 64 |     totalwait+=getminutes(returnf[0])-earliestdep  
 65 | 
 66 |   # Does this solution require an extra day of car rental? That'll be $50!
 67 |   if latestarrival>earliestdep: totalprice+=50
 68 |   
 69 |   return totalprice+totalwait
 70 | 
 71 | def randomoptimize(domain,costf):
 72 |   best=999999999
 73 |   bestr=None
 74 |   for i in range(0,1000):
 75 |     # Create a random solution
 76 |     r=[float(random.randint(domain[i][0],domain[i][1])) 
 77 |        for i in range(len(domain))]
 78 |     
 79 |     # Get the cost
 80 |     cost=costf(r)
 81 |     
 82 |     # Compare it to the best one so far
 83 |     if cost<best:
 84 |       best=cost
 85 |       bestr=r 
 86 |   return r
 87 | 
 88 | def hillclimb(domain,costf):
 89 |   # Create a random solution
 90 |   sol=[random.randint(domain[i][0],domain[i][1])
 91 |       for i in range(len(domain))]
 92 |   # Main loop
 93 |   while 1:
 94 |     # Create list of neighboring solutions
 95 |     neighbors=[]
 96 |     
 97 |     for j in range(len(domain)):
 98 |       # One away in each direction
 99 |       if sol[j]>domain[j][0]:
100 |         neighbors.append(sol[0:j]+[sol[j]+1]+sol[j+1:])
101 |       if sol[j]<domain[j][1]:
102 |         neighbors.append(sol[0:j]+[sol[j]-1]+sol[j+1:])
103 | 
104 |     # See what the best solution amongst the neighbors is
105 |     current=costf(sol)
106 |     best=current
107 |     for j in range(len(neighbors)):
108 |       cost=costf(neighbors[j])
109 |       if cost<best:
110 |         best=cost
111 |         sol=neighbors[j]
112 | 
113 |     # If there's no improvement, then we've reached the top
114 |     if best==current:
115 |       break
116 |   return sol
117 | 
118 | def annealingoptimize(domain,costf,T=10000.0,cool=0.95,step=1):
119 |   # Initialize the values randomly
120 |   vec=[float(random.randint(domain[i][0],domain[i][1])) 
121 |        for i in range(len(domain))]
122 |   
123 |   while T>0.1:
124 |     # Choose one of the indices
125 |     i=random.randint(0,len(domain)-1)
126 | 
127 |     # Choose a direction to change it
128 |     dir=random.randint(-step,step)
129 | 
130 |     # Create a new list with one of the values changed
131 |     vecb=vec[:]
132 |     vecb[i]+=dir
133 |     if vecb[i]<domain[i][0]: vecb[i]=domain[i][0]
134 |     elif vecb[i]>domain[i][1]: vecb[i]=domain[i][1]
135 | 
136 |     # Calculate the current cost and the new cost
137 |     ea=costf(vec)
138 |     eb=costf(vecb)
139 |     p=pow(math.e,(-eb-ea)/T)
140 | 
141 |     # Is it better, or does it make the probability
142 |     # cutoff?
143 |     if (eb<ea or random.random()<p):
144 |       vec=vecb      
145 | 
146 |     # Decrease the temperature
147 |     T=T*cool
148 |   return vec
149 | 
150 | def geneticoptimize(domain,costf,popsize=50,step=1,
151 |                     mutprob=0.2,elite=0.2,maxiter=100):
152 |   # Mutation Operation
153 |   def mutate(vec):
154 |     i=random.randint(0,len(domain)-1)
155 |     if random.random()<0.5 and vec[i]>domain[i][0]:
156 |       return vec[0:i]+[vec[i]-step]+vec[i+1:] 
157 |     elif vec[i]<domain[i][1]:
158 |       return vec[0:i]+[vec[i]+step]+vec[i+1:]
159 |   
160 |   # Crossover Operation
161 |   def crossover(r1,r2):
162 |     i=random.randint(1,len(domain)-2)
163 |     return r1[0:i]+r2[i:]
164 | 
165 |   # Build the initial population
166 |   pop=[]
167 |   for i in range(popsize):
168 |     vec=[random.randint(domain[i][0],domain[i][1]) 
169 |          for i in range(len(domain))]
170 |     pop.append(vec)
171 |   
172 |   # How many winners from each generation?
173 |   topelite=int(elite*popsize)
174 |   
175 |   # Main loop 
176 |   for i in range(maxiter):
177 |     scores=[(costf(v),v) for v in pop]
178 |     scores.sort()
179 |     ranked=[v for (s,v) in scores]
180 |     
181 |     # Start with the pure winners
182 |     pop=ranked[0:topelite]
183 |     
184 |     # Add mutated and bred forms of the winners
185 |     while len(pop)<popsize:
186 |       if random.random()<mutprob:
187 | 
188 |         # Mutation
189 |         c=random.randint(0,topelite)
190 |         pop.append(mutate(ranked[c]))
191 |       else:
192 |       
193 |         # Crossover
194 |         c1=random.randint(0,topelite)
195 |         c2=random.randint(0,topelite)
196 |         pop.append(crossover(ranked[c1],ranked[c2]))
197 |     
198 |     # Print current best score
199 |     print scores[0][0]
200 |     
201 |   return scores[0][1]
202 | 


--------------------------------------------------------------------------------
/第05章 优化/schedule.txt:
--------------------------------------------------------------------------------
  1 | LGA,OMA,6:19,8:13,239
  2 | OMA,LGA,6:11,8:31,249
  3 | LGA,OMA,8:04,10:59,136
  4 | OMA,LGA,7:39,10:24,219
  5 | LGA,OMA,9:31,11:43,210
  6 | OMA,LGA,9:15,12:03,99
  7 | LGA,OMA,11:07,13:24,171
  8 | OMA,LGA,11:08,13:07,175
  9 | LGA,OMA,12:31,14:02,234
 10 | OMA,LGA,12:18,14:56,172
 11 | LGA,OMA,14:05,15:47,226
 12 | OMA,LGA,13:37,15:08,250
 13 | LGA,OMA,15:07,17:21,129
 14 | OMA,LGA,15:03,16:42,135
 15 | LGA,OMA,16:35,18:56,144
 16 | OMA,LGA,16:51,19:09,147
 17 | LGA,OMA,18:25,20:34,205
 18 | OMA,LGA,18:12,20:17,242
 19 | LGA,OMA,20:05,21:44,172
 20 | OMA,LGA,20:05,22:06,261
 21 | LGA,ORD,6:03,8:43,219
 22 | ORD,LGA,6:05,8:32,174
 23 | LGA,ORD,7:50,10:08,164
 24 | ORD,LGA,8:25,10:34,157
 25 | LGA,ORD,9:11,10:42,172
 26 | ORD,LGA,9:42,11:32,169
 27 | LGA,ORD,10:33,13:11,132
 28 | ORD,LGA,11:01,12:39,260
 29 | LGA,ORD,12:08,14:47,231
 30 | ORD,LGA,12:44,14:17,134
 31 | LGA,ORD,14:19,17:09,190
 32 | ORD,LGA,14:22,16:32,126
 33 | LGA,ORD,15:04,17:23,189
 34 | ORD,LGA,15:58,18:40,173
 35 | LGA,ORD,17:06,20:00,95
 36 | ORD,LGA,16:43,19:00,246
 37 | LGA,ORD,18:33,20:22,143
 38 | ORD,LGA,18:48,21:45,246
 39 | LGA,ORD,19:32,21:25,160
 40 | ORD,LGA,19:50,22:24,269
 41 | LGA,MIA,6:33,9:14,172
 42 | MIA,LGA,6:25,9:30,335
 43 | LGA,MIA,8:23,11:07,143
 44 | MIA,LGA,7:34,9:40,324
 45 | LGA,MIA,9:25,12:46,295
 46 | MIA,LGA,9:15,12:29,225
 47 | LGA,MIA,11:08,14:38,262
 48 | MIA,LGA,11:28,14:40,248
 49 | LGA,MIA,12:37,15:05,170
 50 | MIA,LGA,12:05,15:30,330
 51 | LGA,MIA,14:08,16:09,232
 52 | MIA,LGA,14:01,17:24,338
 53 | LGA,MIA,15:23,18:49,150
 54 | MIA,LGA,15:34,18:11,326
 55 | LGA,MIA,16:50,19:26,304
 56 | MIA,LGA,17:07,20:04,291
 57 | LGA,MIA,18:07,21:30,355
 58 | MIA,LGA,18:23,21:35,134
 59 | LGA,MIA,20:27,23:42,169
 60 | MIA,LGA,19:53,22:21,173
 61 | LGA,BOS,6:39,8:09,86
 62 | BOS,LGA,6:17,8:26,89
 63 | LGA,BOS,8:23,10:28,149
 64 | BOS,LGA,8:04,10:11,95
 65 | LGA,BOS,9:58,11:18,130
 66 | BOS,LGA,9:45,11:50,172
 67 | LGA,BOS,10:33,12:03,74
 68 | BOS,LGA,11:16,13:29,83
 69 | LGA,BOS,12:08,14:05,142
 70 | BOS,LGA,12:34,15:02,109
 71 | LGA,BOS,13:39,15:30,74
 72 | BOS,LGA,13:40,15:37,138
 73 | LGA,BOS,15:25,16:58,62
 74 | BOS,LGA,15:27,17:18,151
 75 | LGA,BOS,17:03,18:03,103
 76 | BOS,LGA,17:11,18:30,108
 77 | LGA,BOS,18:24,20:49,124
 78 | BOS,LGA,18:34,19:36,136
 79 | LGA,BOS,19:58,21:23,142
 80 | BOS,LGA,20:17,22:22,102
 81 | LGA,DAL,6:09,9:49,414
 82 | DAL,LGA,6:12,10:22,230
 83 | LGA,DAL,7:57,11:15,347
 84 | DAL,LGA,7:53,11:37,433
 85 | LGA,DAL,9:49,13:51,229
 86 | DAL,LGA,9:08,12:12,364
 87 | LGA,DAL,10:51,14:16,256
 88 | DAL,LGA,10:30,14:57,290
 89 | LGA,DAL,12:20,16:34,500
 90 | DAL,LGA,12:19,15:25,342
 91 | LGA,DAL,14:20,17:32,332
 92 | DAL,LGA,13:54,18:02,294
 93 | LGA,DAL,15:49,20:10,497
 94 | DAL,LGA,15:44,18:55,382
 95 | LGA,DAL,17:14,20:59,277
 96 | DAL,LGA,16:52,20:48,448
 97 | LGA,DAL,18:44,22:42,351
 98 | DAL,LGA,18:26,21:29,464
 99 | LGA,DAL,19:57,23:15,512
100 | DAL,LGA,20:07,23:27,473
101 | LGA,CAK,6:58,9:01,238
102 | CAK,LGA,6:08,8:06,224
103 | LGA,CAK,8:19,11:16,122
104 | CAK,LGA,8:27,10:45,139
105 | LGA,CAK,9:58,12:56,249
106 | CAK,LGA,9:15,12:14,247
107 | LGA,CAK,10:32,13:16,139
108 | CAK,LGA,10:53,13:36,189
109 | LGA,CAK,12:01,13:41,267
110 | CAK,LGA,12:08,14:59,149
111 | LGA,CAK,13:37,15:33,142
112 | CAK,LGA,13:40,15:38,137
113 | LGA,CAK,15:50,18:45,243
114 | CAK,LGA,15:23,17:25,232
115 | LGA,CAK,16:33,18:15,253
116 | CAK,LGA,17:08,19:08,262
117 | LGA,CAK,18:17,21:04,259
118 | CAK,LGA,18:35,20:28,204
119 | LGA,CAK,19:46,21:45,214
120 | CAK,LGA,20:30,23:11,114
121 | 


--------------------------------------------------------------------------------
/第05章 优化/socialnetwork.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | people=['Charlie','Augustus','Veruca','Violet','Mike','Joe','Willy','Miranda']
 4 | 
 5 | links=[('Augustus', 'Willy'), 
 6 |        ('Mike', 'Joe'), 
 7 |        ('Miranda', 'Mike'), 
 8 |        ('Violet', 'Augustus'), 
 9 |        ('Miranda', 'Willy'), 
10 |        ('Charlie', 'Mike'), 
11 |        ('Veruca', 'Joe'), 
12 |        ('Miranda', 'Augustus'), 
13 |        ('Willy', 'Augustus'), 
14 |        ('Joe', 'Charlie'), 
15 |        ('Veruca', 'Augustus'), 
16 |        ('Miranda', 'Joe')]
17 | 
18 | 
19 | def crosscount(v):
20 |   # Convert the number list into a dictionary of person:(x,y)
21 |   loc=dict([(people[i],(v[i*2],v[i*2+1])) for i in range(0,len(people))])
22 |   total=0
23 |   
24 |   # Loop through every pair of links
25 |   for i in range(len(links)):
26 |     for j in range(i+1,len(links)):
27 | 
28 |       # Get the locations 
29 |       (x1,y1),(x2,y2)=loc[links[i][0]],loc[links[i][1]]
30 |       (x3,y3),(x4,y4)=loc[links[j][0]],loc[links[j][1]]
31 |       
32 |       den=(y4-y3)*(x2-x1)-(x4-x3)*(y2-y1)
33 | 
34 |       # den==0 if the lines are parallel
35 |       if den==0: continue
36 | 
37 |       # Otherwise ua and ub are the fraction of the
38 |       # line where they cross
39 |       ua=((x4-x3)*(y1-y3)-(y4-y3)*(x1-x3))/den
40 |       ub=((x2-x1)*(y1-y3)-(y2-y1)*(x1-x3))/den
41 |       
42 |       # If the fraction is between 0 and 1 for both lines
43 |       # then they cross each other
44 |       if ua>0 and ua<1 and ub>0 and ub<1:
45 |         total+=1
46 |     for i in range(len(people)):
47 |       for j in range(i+1,len(people)):
48 |         # Get the locations of the two nodes
49 |         (x1,y1),(x2,y2)=loc[people[i]],loc[people[j]]
50 | 
51 |         # Find the distance between them
52 |         dist=math.sqrt(math.pow(x1-x2,2)+math.pow(y1-y2,2))
53 |         # Penalize any nodes closer than 50 pixels
54 |         if dist<50:
55 |           total+=(1.0-(dist/50.0))
56 |         
57 |   return total
58 | from PIL import Image,ImageDraw
59 | 
60 | def drawnetwork(sol):
61 |   # Create the image
62 |   img=Image.new('RGB',(400,400),(255,255,255))
63 |   draw=ImageDraw.Draw(img)
64 | 
65 |   # Create the position dict
66 |   pos=dict([(people[i],(sol[i*2],sol[i*2+1])) for i in range(0,len(people))])
67 | 
68 |   for (a,b) in links:
69 |     draw.line((pos[a],pos[b]),fill=(255,0,0))
70 | 
71 |   for n,p in pos.items():
72 |     draw.text(p,n,(0,0,0))
73 | 
74 |   img.show()
75 | 
76 | 
77 | domain=[(10,370)]*(len(people)*2)


--------------------------------------------------------------------------------
/第06章 文档过滤/docclass.py:
--------------------------------------------------------------------------------
  1 | from pysqlite2 import dbapi2 as sqlite
  2 | import re
  3 | import math
  4 | 
  5 | def getwords(doc):
  6 |   splitter=re.compile('\\W*')
  7 |   print doc
  8 |   # Split the words by non-alpha characters
  9 |   words=[s.lower() for s in splitter.split(doc) 
 10 |           if len(s)>2 and len(s)<20]
 11 |   
 12 |   # Return the unique set of words only
 13 |   return dict([(w,1) for w in words])
 14 | 
 15 | class classifier:
 16 |   def __init__(self,getfeatures,filename=None):
 17 |     # Counts of feature/category combinations
 18 |     self.fc={}
 19 |     # Counts of documents in each category
 20 |     self.cc={}
 21 |     self.getfeatures=getfeatures
 22 |     
 23 |   def setdb(self,dbfile):
 24 |     self.con=sqlite.connect(dbfile)    
 25 |     self.con.execute('create table if not exists fc(feature,category,count)')
 26 |     self.con.execute('create table if not exists cc(category,count)')
 27 | 
 28 | 
 29 |   def incf(self,f,cat):
 30 |     count=self.fcount(f,cat)
 31 |     if count==0:
 32 |       self.con.execute("insert into fc values ('%s','%s',1)" 
 33 |                        % (f,cat))
 34 |     else:
 35 |       self.con.execute(
 36 |         "update fc set count=%d where feature='%s' and category='%s'" 
 37 |         % (count+1,f,cat)) 
 38 |   
 39 |   def fcount(self,f,cat):
 40 |     res=self.con.execute(
 41 |       'select count from fc where feature="%s" and category="%s"'
 42 |       %(f,cat)).fetchone()
 43 |     if res==None: return 0
 44 |     else: return float(res[0])
 45 | 
 46 |   def incc(self,cat):
 47 |     count=self.catcount(cat)
 48 |     if count==0:
 49 |       self.con.execute("insert into cc values ('%s',1)" % (cat))
 50 |     else:
 51 |       self.con.execute("update cc set count=%d where category='%s'" 
 52 |                        % (count+1,cat))    
 53 | 
 54 |   def catcount(self,cat):
 55 |     res=self.con.execute('select count from cc where category="%s"'
 56 |                          %(cat)).fetchone()
 57 |     if res==None: return 0
 58 |     else: return float(res[0])
 59 | 
 60 |   def categories(self):
 61 |     cur=self.con.execute('select category from cc');
 62 |     return [d[0] for d in cur]
 63 | 
 64 |   def totalcount(self):
 65 |     res=self.con.execute('select sum(count) from cc').fetchone();
 66 |     if res==None: return 0
 67 |     return res[0]
 68 | 
 69 | 
 70 |   def train(self,item,cat):
 71 |     features=self.getfeatures(item)
 72 |     # Increment the count for every feature with this category
 73 |     for f in features:
 74 |       self.incf(f,cat)
 75 | 
 76 |     # Increment the count for this category
 77 |     self.incc(cat)
 78 |     self.con.commit()
 79 | 
 80 |   def fprob(self,f,cat):
 81 |     if self.catcount(cat)==0: return 0
 82 | 
 83 |     # The total number of times this feature appeared in this 
 84 |     # category divided by the total number of items in this category
 85 |     return self.fcount(f,cat)/self.catcount(cat)
 86 | 
 87 |   def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
 88 |     # Calculate current probability
 89 |     basicprob=prf(f,cat)
 90 | 
 91 |     # Count the number of times this feature has appeared in
 92 |     # all categories
 93 |     totals=sum([self.fcount(f,c) for c in self.categories()])
 94 | 
 95 |     # Calculate the weighted average
 96 |     bp=((weight*ap)+(totals*basicprob))/(weight+totals)
 97 |     return bp
 98 | 
 99 | 
100 | 
101 | 
102 | class naivebayes(classifier):
103 |   
104 |   def __init__(self,getfeatures):
105 |     classifier.__init__(self,getfeatures)
106 |     self.thresholds={}
107 |   
108 |   def docprob(self,item,cat):
109 |     features=self.getfeatures(item)   
110 | 
111 |     # Multiply the probabilities of all the features together
112 |     p=1
113 |     for f in features: p*=self.weightedprob(f,cat,self.fprob)
114 |     return p
115 | 
116 |   def prob(self,item,cat):
117 |     catprob=self.catcount(cat)/self.totalcount()
118 |     docprob=self.docprob(item,cat)
119 |     return docprob*catprob
120 |   
121 |   def setthreshold(self,cat,t):
122 |     self.thresholds[cat]=t
123 |     
124 |   def getthreshold(self,cat):
125 |     if cat not in self.thresholds: return 1.0
126 |     return self.thresholds[cat]
127 |   
128 |   def classify(self,item,default=None):
129 |     probs={}
130 |     # Find the category with the highest probability
131 |     max=0.0
132 |     for cat in self.categories():
133 |       probs[cat]=self.prob(item,cat)
134 |       if probs[cat]>max: 
135 |         max=probs[cat]
136 |         best=cat
137 | 
138 |     # Make sure the probability exceeds threshold*next best
139 |     for cat in probs:
140 |       if cat==best: continue
141 |       if probs[cat]*self.getthreshold(best)>probs[best]: return default
142 |     return best
143 | 
144 | class fisherclassifier(classifier):
145 |   def cprob(self,f,cat):
146 |     # The frequency of this feature in this category    
147 |     clf=self.fprob(f,cat)
148 |     if clf==0: return 0
149 | 
150 |     # The frequency of this feature in all the categories
151 |     freqsum=sum([self.fprob(f,c) for c in self.categories()])
152 | 
153 |     # The probability is the frequency in this category divided by
154 |     # the overall frequency
155 |     p=clf/(freqsum)
156 |     
157 |     return p
158 |   def fisherprob(self,item,cat):
159 |     # Multiply all the probabilities together
160 |     p=1
161 |     features=self.getfeatures(item)
162 |     for f in features:
163 |       p*=(self.weightedprob(f,cat,self.cprob))
164 | 
165 |     # Take the natural log and multiply by -2
166 |     fscore=-2*math.log(p)
167 | 
168 |     # Use the inverse chi2 function to get a probability
169 |     return self.invchi2(fscore,len(features)*2)
170 |   def invchi2(self,chi, df):
171 |     m = chi / 2.0
172 |     sum = term = math.exp(-m)
173 |     for i in range(1, df//2):
174 |         term *= m / i
175 |         sum += term
176 |     return min(sum, 1.0)
177 |   def __init__(self,getfeatures):
178 |     classifier.__init__(self,getfeatures)
179 |     self.minimums={}
180 | 
181 |   def setminimum(self,cat,min):
182 |     self.minimums[cat]=min
183 |   
184 |   def getminimum(self,cat):
185 |     if cat not in self.minimums: return 0
186 |     return self.minimums[cat]
187 |   def classify(self,item,default=None):
188 |     # Loop through looking for the best result
189 |     best=default
190 |     max=0.0
191 |     for c in self.categories():
192 |       p=self.fisherprob(item,c)
193 |       # Make sure it exceeds its minimum
194 |       if p>self.getminimum(c) and p>max:
195 |         best=c
196 |         max=p
197 |     return best
198 | 
199 | 
200 | def sampletrain(cl):
201 |   cl.train('Nobody owns the water.','good')
202 |   cl.train('the quick rabbit jumps fences','good')
203 |   cl.train('buy pharmaceuticals now','bad')
204 |   cl.train('make quick money at the online casino','bad')
205 |   cl.train('the quick brown fox jumps','good')
206 | 


--------------------------------------------------------------------------------
/第06章 文档过滤/feedfilter.py:
--------------------------------------------------------------------------------
 1 | import feedparser
 2 | import re
 3 | 
 4 | # Takes a filename of URL of a blog feed and classifies the entries
 5 | def read(feed,classifier):
 6 |   # Get feed entries and loop over them
 7 |   f=feedparser.parse(feed)
 8 |   for entry in f['entries']:
 9 |     print
10 |     print '-----'
11 |     # Print the contents of the entry
12 |     print 'Title:     '+entry['title'].encode('utf-8')
13 |     print 'Publisher: '+entry['publisher'].encode('utf-8')
14 |     print
15 |     print entry['summary'].encode('utf-8')
16 |     
17 | 
18 |     # Combine all the text to create one item for the classifier
19 |     fulltext='%s\n%s\n%s' % (entry['title'],entry['publisher'],entry['summary'])
20 | 
21 |     # Print the best guess at the current category
22 |     print 'Guess: '+str(classifier.classify(entry))
23 | 
24 |     # Ask the user to specify the correct category and train on that
25 |     cl=raw_input('Enter category: ')
26 |     classifier.train(entry,cl)
27 | 
28 | 
29 | def entryfeatures(entry):
30 |   splitter=re.compile('\\W*')
31 |   f={}
32 |   
33 |   # Extract the title words and annotate
34 |   titlewords=[s.lower() for s in splitter.split(entry['title']) 
35 |           if len(s)>2 and len(s)<20]
36 |   for w in titlewords: f['Title:'+w]=1
37 |   
38 |   # Extract the summary words
39 |   summarywords=[s.lower() for s in splitter.split(entry['summary']) 
40 |           if len(s)>2 and len(s)<20]
41 | 
42 |   # Count uppercase words
43 |   uc=0
44 |   for i in range(len(summarywords)):
45 |     w=summarywords[i]
46 |     f[w]=1
47 |     if w.isupper(): uc+=1
48 |     
49 |     # Get word pairs in summary as features
50 |     if i<len(summarywords)-1:
51 |       twowords=' '.join(summarywords[i:i+1])
52 |       f[twowords]=1
53 |     
54 |   # Keep creator and publisher whole
55 |   f['Publisher:'+entry['publisher']]=1
56 | 
57 |   # UPPERCASE is a virtual word flagging too much shouting  
58 |   if float(uc)/len(summarywords)>0.3: f['UPPERCASE']=1
59 |   
60 |   return f
61 | 


--------------------------------------------------------------------------------
/第06章 文档过滤/test.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第06章 文档过滤/test.db


--------------------------------------------------------------------------------
/第06章 文档过滤/test1.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第06章 文档过滤/test1.db


--------------------------------------------------------------------------------
/第07章 决策树建模/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第07章 决策树建模/Thumbs.db


--------------------------------------------------------------------------------
/第07章 决策树建模/addresslist.txt:
--------------------------------------------------------------------------------
 1 | 6 Washington
 2 | 21 Manassas
 3 | 280 Pearl
 4 | 55 Ellery
 5 | 50 Follen
 6 | 51 Granite
 7 | 992 Memorial
 8 | 83 Trowbridge
 9 | 1 Dana
10 | 45 Regent
11 | 90 Alpine
12 | 21 Francis
13 | 112 Avon Hill
14 | 9 Bellevue
15 | 4 Blanchard Rd
16 | 34 Shea
17 | 5 Fountain
18 | 14 Marcella
19 | 39 Saint Saveur
20 | 35 Pemberton
21 | 46 Shepard
22 | 31 Market
23 | 99 Howard
24 | 88 Pearl
25 | 208 Western
26 | 285 Windsor
27 | 26 Cambridgepark
28 | 211 Erie
29 | 129 Franklin
30 | 27 Gurney
31 | 149 Prospect
32 | 27 Linnaean
33 | 20 Dudley
34 | 60 Otis St
35 | 130 Mount Auburn St
36 | 2 Michael Way
37 | 263 Columbia St
38 | 6 Hurlbut St
39 | 199 Harvard St
40 | 168 River St
41 | 400 Washington St
42 | 12 Traill St
43 | 74 Field St
44 | 21 Walden Square Rd
45 | 7 Wendell St
46 | 15 Normandy Ave
47 | 6 Gibson Ter
48 | 94 Pine St
49 | 23 Magee St
50 | 175 Richdale Ave
51 | 168 River St
52 | 246 Brattle St


--------------------------------------------------------------------------------
/第07章 决策树建模/hotornot.py:
--------------------------------------------------------------------------------
 1 | import urllib2
 2 | import xml.dom.minidom
 3 | 
 4 | api_key='YOUR KEY HERE'
 5 | 
 6 | def getrandomratings(c):
 7 |   # Construct URL for getRandomProfile
 8 |   url="http://services.hotornot.com/rest/?app_key=%s" % api_key
 9 |   url+="&method=Rate.getRandomProfile&retrieve_num=%d" % c
10 |   url+="&get_rate_info=true&meet_users_only=true"
11 |   
12 |   f1=urllib2.urlopen(url).read()
13 | 
14 |   doc=xml.dom.minidom.parseString(f1)
15 |   
16 |   emids=doc.getElementsByTagName('emid')
17 |   ratings=doc.getElementsByTagName('rating')
18 | 
19 |   # Combine the emids and ratings together into a list
20 |   result=[]
21 |   for e,r in zip(emids,ratings):
22 |     if r.firstChild!=None:
23 |       result.append((e.firstChild.data,r.firstChild.data))
24 |   return result
25 | 
26 | stateregions={'New England':['ct','mn','ma','nh','ri','vt'],
27 |               'Mid Atlantic':['de','md','nj','ny','pa'],
28 |               'South':['al','ak','fl','ga','ky','la','ms','mo',
29 |                        'nc','sc','tn','va','wv'],
30 |               'Midwest':['il','in','ia','ks','mi','ne','nd','oh','sd','wi'],
31 |               'West':['ak','ca','co','hi','id','mt','nv','or','ut','wa','wy']}
32 | 
33 | def getpeopledata(ratings):
34 |   result=[]
35 |   for emid,rating in ratings:
36 |     # URL for the MeetMe.getProfile method
37 |     url="http://services.hotornot.com/rest/?app_key=%s" % api_key
38 |     url+="&method=MeetMe.getProfile&emid=%s&get_keywords=true" % emid
39 | 
40 |     # Get all the info about this person
41 |     try:
42 |       rating=int(float(rating)+0.5)
43 |       doc2=xml.dom.minidom.parseString(urllib2.urlopen(url).read())
44 |       gender=doc2.getElementsByTagName('gender')[0].firstChild.data
45 |       age=doc2.getElementsByTagName('age')[0].firstChild.data
46 |       loc=doc2.getElementsByTagName('location')[0].firstChild.data[0:2]
47 | 
48 |       # Convert state to region
49 |       for r,s in stateregions.items():
50 |         if loc in s: region=r
51 | 
52 |       if region!=None:
53 |         result.append((gender,int(age),region,rating))
54 |     except:
55 |       pass
56 |   return result
57 | 
58 | 


--------------------------------------------------------------------------------
/第07章 决策树建模/treepredict.py:
--------------------------------------------------------------------------------
  1 | my_data=[['slashdot','USA','yes',18,'None'],
  2 |         ['google','France','yes',23,'Premium'],
  3 |         ['digg','USA','yes',24,'Basic'],
  4 |         ['kiwitobes','France','yes',23,'Basic'],
  5 |         ['google','UK','no',21,'Premium'],
  6 |         ['(direct)','New Zealand','no',12,'None'],
  7 |         ['(direct)','UK','no',21,'Basic'],
  8 |         ['google','USA','no',24,'Premium'],
  9 |         ['slashdot','France','yes',19,'None'],
 10 |         ['digg','USA','no',18,'None'],
 11 |         ['google','UK','no',18,'None'],
 12 |         ['kiwitobes','UK','no',19,'None'],
 13 |         ['digg','New Zealand','yes',12,'Basic'],
 14 |         ['slashdot','UK','no',21,'None'],
 15 |         ['google','UK','yes',18,'Basic'],
 16 |         ['kiwitobes','France','yes',19,'Basic']]
 17 | 
 18 | class decisionnode:
 19 |   def __init__(self,col=-1,value=None,results=None,tb=None,fb=None):
 20 |     self.col=col
 21 |     self.value=value
 22 |     self.results=results
 23 |     self.tb=tb
 24 |     self.fb=fb
 25 | 
 26 | # Divides a set on a specific column. Can handle numeric
 27 | # or nominal values
 28 | def divideset(rows,column,value):
 29 |    # Make a function that tells us if a row is in 
 30 |    # the first group (true) or the second group (false)
 31 |    split_function=None
 32 |    if isinstance(value,int) or isinstance(value,float):
 33 |       split_function=lambda row:row[column]>=value
 34 |    else:
 35 |       split_function=lambda row:row[column]==value
 36 |    
 37 |    # Divide the rows into two sets and return them
 38 |    set1=[row for row in rows if split_function(row)]
 39 |    set2=[row for row in rows if not split_function(row)]
 40 |    return (set1,set2)
 41 | 
 42 | 
 43 | # Create counts of possible results (the last column of 
 44 | # each row is the result)
 45 | def uniquecounts(rows):
 46 |    results={}
 47 |    for row in rows:
 48 |       # The result is the last column
 49 |       r=row[len(row)-1]
 50 |       if r not in results: results[r]=0
 51 |       results[r]+=1
 52 |    return results
 53 | 
 54 | # Probability that a randomly placed item will
 55 | # be in the wrong category
 56 | def giniimpurity(rows):
 57 |   total=len(rows)
 58 |   counts=uniquecounts(rows)
 59 |   imp=0
 60 |   for k1 in counts:
 61 |     p1=float(counts[k1])/total
 62 |     for k2 in counts:
 63 |       if k1==k2: continue
 64 |       p2=float(counts[k2])/total
 65 |       imp+=p1*p2
 66 |   return imp
 67 | 
 68 | # Entropy is the sum of p(x)log(p(x)) across all 
 69 | # the different possible results
 70 | def entropy(rows):
 71 |    from math import log
 72 |    log2=lambda x:log(x)/log(2)  
 73 |    results=uniquecounts(rows)
 74 |    # Now calculate the entropy
 75 |    ent=0.0
 76 |    for r in results.keys():
 77 |       p=float(results[r])/len(rows)
 78 |       ent=ent-p*log2(p)
 79 |    return ent
 80 | 
 81 | 
 82 | 
 83 | 
 84 | def printtree(tree,indent=''):
 85 |    # Is this a leaf node?
 86 |    if tree.results!=None:
 87 |       print str(tree.results)
 88 |    else:
 89 |       # Print the criteria
 90 |       print str(tree.col)+':'+str(tree.value)+'? '
 91 | 
 92 |       # Print the branches
 93 |       print indent+'T->',
 94 |       printtree(tree.tb,indent+'  ')
 95 |       print indent+'F->',
 96 |       printtree(tree.fb,indent+'  ')
 97 | 
 98 | 
 99 | def getwidth(tree):
100 |   if tree.tb==None and tree.fb==None: return 1
101 |   return getwidth(tree.tb)+getwidth(tree.fb)
102 | 
103 | def getdepth(tree):
104 |   if tree.tb==None and tree.fb==None: return 0
105 |   return max(getdepth(tree.tb),getdepth(tree.fb))+1
106 | 
107 | 
108 | from PIL import Image,ImageDraw
109 | 
110 | def drawtree(tree,jpeg='tree.jpg'):
111 |   w=getwidth(tree)*100
112 |   h=getdepth(tree)*100+120
113 | 
114 |   img=Image.new('RGB',(w,h),(255,255,255))
115 |   draw=ImageDraw.Draw(img)
116 | 
117 |   drawnode(draw,tree,w/2,20)
118 |   img.save(jpeg,'JPEG')
119 |   
120 | def drawnode(draw,tree,x,y):
121 |   if tree.results==None:
122 |     # Get the width of each branch
123 |     w1=getwidth(tree.fb)*100
124 |     w2=getwidth(tree.tb)*100
125 | 
126 |     # Determine the total space required by this node
127 |     left=x-(w1+w2)/2
128 |     right=x+(w1+w2)/2
129 | 
130 |     # Draw the condition string
131 |     draw.text((x-20,y-10),str(tree.col)+':'+str(tree.value),(0,0,0))
132 | 
133 |     # Draw links to the branches
134 |     draw.line((x,y,left+w1/2,y+100),fill=(255,0,0))
135 |     draw.line((x,y,right-w2/2,y+100),fill=(255,0,0))
136 |     
137 |     # Draw the branch nodes
138 |     drawnode(draw,tree.fb,left+w1/2,y+100)
139 |     drawnode(draw,tree.tb,right-w2/2,y+100)
140 |   else:
141 |     txt=' \n'.join(['%s:%d'%v for v in tree.results.items()])
142 |     draw.text((x-20,y),txt,(0,0,0))
143 | 
144 | 
145 | def classify(observation,tree):
146 |   if tree.results!=None:
147 |     return tree.results
148 |   else:
149 |     v=observation[tree.col]
150 |     branch=None
151 |     if isinstance(v,int) or isinstance(v,float):
152 |       if v>=tree.value: branch=tree.tb
153 |       else: branch=tree.fb
154 |     else:
155 |       if v==tree.value: branch=tree.tb
156 |       else: branch=tree.fb
157 |     return classify(observation,branch)
158 | 
159 | def prune(tree,mingain):
160 |   # If the branches aren't leaves, then prune them
161 |   if tree.tb.results==None:
162 |     prune(tree.tb,mingain)
163 |   if tree.fb.results==None:
164 |     prune(tree.fb,mingain)
165 |     
166 |   # If both the subbranches are now leaves, see if they
167 |   # should merged
168 |   if tree.tb.results!=None and tree.fb.results!=None:
169 |     # Build a combined dataset
170 |     tb,fb=[],[]
171 |     for v,c in tree.tb.results.items():
172 |       tb+=[[v]]*c
173 |     for v,c in tree.fb.results.items():
174 |       fb+=[[v]]*c
175 |     
176 |     # Test the reduction in entropy
177 |     delta=entropy(tb+fb)-(entropy(tb)+entropy(fb)/2)
178 | 
179 |     if delta<mingain:
180 |       # Merge the branches
181 |       tree.tb,tree.fb=None,None
182 |       tree.results=uniquecounts(tb+fb)
183 | 
184 | def mdclassify(observation,tree):
185 |   if tree.results!=None:
186 |     return tree.results
187 |   else:
188 |     v=observation[tree.col]
189 |     if v==None:
190 |       tr,fr=mdclassify(observation,tree.tb),mdclassify(observation,tree.fb)
191 |       tcount=sum(tr.values())
192 |       fcount=sum(fr.values())
193 |       tw=float(tcount)/(tcount+fcount)
194 |       fw=float(fcount)/(tcount+fcount)
195 |       result={}
196 |       for k,v in tr.items(): result[k]=v*tw
197 |       for k,v in fr.items(): result[k]=v*fw      
198 |       return result
199 |     else:
200 |       if isinstance(v,int) or isinstance(v,float):
201 |         if v>=tree.value: branch=tree.tb
202 |         else: branch=tree.fb
203 |       else:
204 |         if v==tree.value: branch=tree.tb
205 |         else: branch=tree.fb
206 |       return mdclassify(observation,branch)
207 | 
208 | def variance(rows):
209 |   if len(rows)==0: return 0
210 |   data=[float(row[len(row)-1]) for row in rows]
211 |   mean=sum(data)/len(data)
212 |   variance=sum([(d-mean)**2 for d in data])/len(data)
213 |   return variance
214 | 
215 | def buildtree(rows,scoref=entropy):
216 |   if len(rows)==0: return decisionnode()
217 |   current_score=scoref(rows)
218 | 
219 |   # Set up some variables to track the best criteria
220 |   best_gain=0.0
221 |   best_criteria=None
222 |   best_sets=None
223 |   
224 |   column_count=len(rows[0])-1
225 |   for col in range(0,column_count):
226 |     # Generate the list of different values in
227 |     # this column
228 |     column_values={}
229 |     for row in rows:
230 |        column_values[row[col]]=1
231 |     # Now try dividing the rows up for each value
232 |     # in this column
233 |     for value in column_values.keys():
234 |       (set1,set2)=divideset(rows,col,value)
235 |       
236 |       # Information gain
237 |       p=float(len(set1))/len(rows)
238 |       gain=current_score-p*scoref(set1)-(1-p)*scoref(set2)
239 |       if gain>best_gain and len(set1)>0 and len(set2)>0:
240 |         best_gain=gain
241 |         best_criteria=(col,value)
242 |         best_sets=(set1,set2)
243 |   # Create the sub branches   
244 |   if best_gain>0:
245 |     trueBranch=buildtree(best_sets[0])
246 |     falseBranch=buildtree(best_sets[1])
247 |     return decisionnode(col=best_criteria[0],value=best_criteria[1],
248 |                         tb=trueBranch,fb=falseBranch)
249 |   else:
250 |     return decisionnode(results=uniquecounts(rows))
251 | 


--------------------------------------------------------------------------------
/第07章 决策树建模/zillow.py:
--------------------------------------------------------------------------------
 1 | import xml.dom.minidom
 2 | import urllib2
 3 | 
 4 | zwskey="YOUR API KEY"
 5 | 
 6 | def getaddressdata(address,city):
 7 |   escad=address.replace(' ','+')
 8 |   url='http://www.zillow.com/webservice/GetDeepSearchResults.htm?'
 9 |   url+='zws-id=%s&address=%s&citystatezip=%s' % (zwskey,escad,city)
10 |   doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read())
11 |   code=doc.getElementsByTagName('code')[0].firstChild.data
12 |   if code!='0': return None
13 |   if 1:
14 |     zipcode=doc.getElementsByTagName('zipcode')[0].firstChild.data
15 |     use=doc.getElementsByTagName('useCode')[0].firstChild.data
16 |     year=doc.getElementsByTagName('yearBuilt')[0].firstChild.data
17 |     sqft=doc.getElementsByTagName('finishedSqFt')[0].firstChild.data
18 |     bath=doc.getElementsByTagName('bathrooms')[0].firstChild.data
19 |     bed=doc.getElementsByTagName('bedrooms')[0].firstChild.data
20 |     rooms=1 #doc.getElementsByTagName('totalRooms')[0].firstChild.data
21 |     price=doc.getElementsByTagName('amount')[0].firstChild.data
22 |   else:
23 |     return None
24 |        
25 |   return (zipcode,use,int(year),float(bath),int(bed),int(rooms),price)
26 | 
27 | def getpricelist():
28 |   l1=[]
29 |   for line in file('addresslist.txt'):
30 |     data=getaddressdata(line.strip(),'Cambridge,MA')
31 |     l1.append(data)
32 |   return l1
33 | 


--------------------------------------------------------------------------------
/第08章 构建价格模型/ebaypredict.py:
--------------------------------------------------------------------------------
  1 | import httplib
  2 | from xml.dom.minidom import parse, parseString, Node
  3 | 
  4 | devKey = 'YOUR DEV KEY'
  5 | appKey = 'YOUR APP KEY'
  6 | certKey = 'YOUR CERT KEY'
  7 | serverUrl = 'api.ebay.com'
  8 | userToken = 'YOUR TOKEN'
  9 | 
 10 | def getHeaders(apicall,siteID="0",compatabilityLevel = "433"):
 11 |   headers = {"X-EBAY-API-COMPATIBILITY-LEVEL": compatabilityLevel,	
 12 |              "X-EBAY-API-DEV-NAME": devKey,
 13 |              "X-EBAY-API-APP-NAME": appKey,
 14 |              "X-EBAY-API-CERT-NAME": certKey,
 15 |              "X-EBAY-API-CALL-NAME": apicall,
 16 |              "X-EBAY-API-SITEID": siteID,
 17 |              "Content-Type": "text/xml"}
 18 |   return headers
 19 | 
 20 | def sendRequest(apicall,xmlparameters):
 21 |   connection = httplib.HTTPSConnection(serverUrl)
 22 |   connection.request("POST", '/ws/api.dll', xmlparameters, getHeaders(apicall))
 23 |   response = connection.getresponse()
 24 |   if response.status != 200:
 25 |     print "Error sending request:" + response.reason
 26 |   else: 
 27 |     data = response.read()
 28 |     connection.close()
 29 |   return data
 30 | 
 31 | def getSingleValue(node,tag):
 32 |   nl=node.getElementsByTagName(tag)
 33 |   if len(nl)>0:
 34 |     tagNode=nl[0]
 35 |     if tagNode.hasChildNodes():
 36 |       return tagNode.firstChild.nodeValue
 37 |   return '-1'
 38 | 
 39 | 
 40 | def doSearch(query,categoryID=None,page=1):
 41 |   xml = "<?xml version='1.0' encoding='utf-8'?>"+\
 42 |         "<GetSearchResultsRequest xmlns=\"urn:ebay:apis:eBLBaseComponents\">"+\
 43 |         "<RequesterCredentials><eBayAuthToken>" +\
 44 |         userToken +\
 45 |         "</eBayAuthToken></RequesterCredentials>" + \
 46 |         "<Pagination>"+\
 47 |           "<EntriesPerPage>200</EntriesPerPage>"+\
 48 |           "<PageNumber>"+str(page)+"</PageNumber>"+\
 49 |         "</Pagination>"+\
 50 |         "<Query>" + query + "</Query>"
 51 |   if categoryID!=None:
 52 |     xml+="<CategoryID>"+str(categoryID)+"</CategoryID>"
 53 |   xml+="</GetSearchResultsRequest>"
 54 |   
 55 |   data=sendRequest('GetSearchResults',xml)
 56 |   response = parseString(data)
 57 |   itemNodes = response.getElementsByTagName('Item');
 58 |   results = []
 59 |   for item in itemNodes:
 60 |     itemId=getSingleValue(item,'ItemID')
 61 |     itemTitle=getSingleValue(item,'Title')
 62 |     itemPrice=getSingleValue(item,'CurrentPrice')
 63 |     itemEnds=getSingleValue(item,'EndTime')
 64 |     results.append((itemId,itemTitle,itemPrice,itemEnds))
 65 |   return results
 66 | 
 67 | 
 68 | def getCategory(query='',parentID=None,siteID='0'):
 69 |   lquery=query.lower()
 70 |   xml = "<?xml version='1.0' encoding='utf-8'?>"+\
 71 |         "<GetCategoriesRequest xmlns=\"urn:ebay:apis:eBLBaseComponents\">"+\
 72 |         "<RequesterCredentials><eBayAuthToken>" +\
 73 |         userToken +\
 74 |         "</eBayAuthToken></RequesterCredentials>"+\
 75 |         "<DetailLevel>ReturnAll</DetailLevel>"+\
 76 |         "<ViewAllNodes>true</ViewAllNodes>"+\
 77 |         "<CategorySiteID>"+siteID+"</CategorySiteID>"
 78 |   if parentID==None:
 79 |     xml+="<LevelLimit>1</LevelLimit>"
 80 |   else:
 81 |     xml+="<CategoryParent>"+str(parentID)+"</CategoryParent>"
 82 |   xml += "</GetCategoriesRequest>"
 83 |   data=sendRequest('GetCategories',xml)
 84 |   categoryList=parseString(data)
 85 |   catNodes=categoryList.getElementsByTagName('Category')
 86 |   for node in catNodes:
 87 |     catid=getSingleValue(node,'CategoryID')
 88 |     name=getSingleValue(node,'CategoryName')
 89 |     if name.lower().find(lquery)!=-1:
 90 |       print catid,name
 91 | 
 92 | def getItem(itemID):
 93 |   xml = "<?xml version='1.0' encoding='utf-8'?>"+\
 94 |         "<GetItemRequest xmlns=\"urn:ebay:apis:eBLBaseComponents\">"+\
 95 |         "<RequesterCredentials><eBayAuthToken>" +\
 96 |         userToken +\
 97 |         "</eBayAuthToken></RequesterCredentials>" + \
 98 |         "<ItemID>" + str(itemID) + "</ItemID>"+\
 99 |         "<DetailLevel>ItemReturnAttributes</DetailLevel>"+\
100 |         "</GetItemRequest>"
101 |   data=sendRequest('GetItem',xml)
102 |   result={}
103 |   response=parseString(data)
104 |   result['title']=getSingleValue(response,'Title')
105 |   sellingStatusNode = response.getElementsByTagName('SellingStatus')[0];
106 |   result['price']=getSingleValue(sellingStatusNode,'CurrentPrice')
107 |   result['bids']=getSingleValue(sellingStatusNode,'BidCount')
108 |   seller = response.getElementsByTagName('Seller')
109 |   result['feedback'] = getSingleValue(seller[0],'FeedbackScore')
110 | 
111 |   attributeSet=response.getElementsByTagName('Attribute');
112 |   attributes={}
113 |   for att in attributeSet:
114 |     attID=att.attributes.getNamedItem('attributeID').nodeValue
115 |     attValue=getSingleValue(att,'ValueLiteral')
116 |     attributes[attID]=attValue
117 |   result['attributes']=attributes
118 |   return result
119 | 
120 | 
121 | def makeLaptopDataset():
122 |   searchResults=doSearch('laptop',categoryID=51148)
123 |   result=[]
124 |   for r in searchResults:
125 |     item=getItem(r[0])
126 |     att=item['attributes']
127 |     try:
128 |       data=(float(att['12']),float(att['26444']),
129 |             float(att['26446']),float(att['25710']),
130 |             float(item['feedback'])
131 |            )
132 |       entry={'input':data,'result':float(item['price'])}
133 |       result.append(entry)
134 |     except:
135 |       print item['title']+' failed'
136 |   return result
137 | 


--------------------------------------------------------------------------------
/第08章 构建价格模型/numpredict.py:
--------------------------------------------------------------------------------
  1 | from random import random,randint
  2 | import math
  3 | 
  4 | def wineprice(rating,age):
  5 |   peak_age=rating-50
  6 |   
  7 |   # Calculate price based on rating
  8 |   price=rating/2
  9 |   if age>peak_age:
 10 |     # Past its peak, goes bad in 10 years
 11 |     price=price*(5-(age-peak_age)/2)
 12 |   else:
 13 |     # Increases to 5x original value as it
 14 |     # approaches its peak
 15 |     price=price*(5*((age+1)/peak_age))
 16 |   if price<0: price=0
 17 |   return price
 18 | 
 19 | 
 20 | def wineset1():
 21 |   rows=[]
 22 |   for i in range(300):
 23 |     # Create a random age and rating
 24 |     rating=random()*50+50
 25 |     age=random()*50
 26 | 
 27 |     # Get reference price
 28 |     price=wineprice(rating,age)
 29 |     
 30 |     # Add some noise
 31 |     price*=(random()*0.2+0.9)
 32 | 
 33 |     # Add to the dataset
 34 |     rows.append({'input':(rating,age),
 35 |                  'result':price})
 36 |   return rows
 37 | 
 38 | def euclidean(v1,v2):
 39 |   d=0.0
 40 |   for i in range(len(v1)):
 41 |     d+=(v1[i]-v2[i])**2
 42 |   return math.sqrt(d)
 43 | 
 44 | 
 45 | def getdistances(data,vec1):
 46 |   distancelist=[]
 47 |   
 48 |   # Loop over every item in the dataset
 49 |   for i in range(len(data)):
 50 |     vec2=data[i]['input']
 51 |     
 52 |     # Add the distance and the index
 53 |     distancelist.append((euclidean(vec1,vec2),i))
 54 |   
 55 |   # Sort by distance
 56 |   distancelist.sort()
 57 |   return distancelist
 58 | 
 59 | def knnestimate(data,vec1,k=5):
 60 |   # Get sorted distances
 61 |   dlist=getdistances(data,vec1)
 62 |   avg=0.0
 63 |   
 64 |   # Take the average of the top k results
 65 |   for i in range(k):
 66 |     idx=dlist[i][1]
 67 |     avg+=data[idx]['result']
 68 |   avg=avg/k
 69 |   return avg
 70 | 
 71 | def inverseweight(dist,num=1.0,const=0.1):
 72 |   return num/(dist+const)
 73 | 
 74 | def subtractweight(dist,const=1.0):
 75 |   if dist>const: 
 76 |     return 0
 77 |   else: 
 78 |     return const-dist
 79 | 
 80 | def gaussian(dist,sigma=5.0):
 81 |   return math.e**(-dist**2/(2*sigma**2))
 82 | 
 83 | def weightedknn(data,vec1,k=5,weightf=gaussian):
 84 |   # Get distances
 85 |   dlist=getdistances(data,vec1)
 86 |   avg=0.0
 87 |   totalweight=0.0
 88 |   
 89 |   # Get weighted average
 90 |   for i in range(k):
 91 |     dist=dlist[i][0]
 92 |     idx=dlist[i][1]
 93 |     weight=weightf(dist)
 94 |     avg+=weight*data[idx]['result']
 95 |     totalweight+=weight
 96 |   if totalweight==0: return 0
 97 |   avg=avg/totalweight
 98 |   return avg
 99 | 
100 | def dividedata(data,test=0.05):
101 |   trainset=[]
102 |   testset=[]
103 |   for row in data:
104 |     if random()<test:
105 |       testset.append(row)
106 |     else:
107 |       trainset.append(row)
108 |   return trainset,testset
109 | 
110 | def testalgorithm(algf,trainset,testset):
111 |   error=0.0
112 |   for row in testset:
113 |     guess=algf(trainset,row['input'])
114 |     error+=(row['result']-guess)**2
115 |     #print row['result'],guess
116 |   #print error/len(testset)
117 |   return error/len(testset)
118 | 
119 | def crossvalidate(algf,data,trials=100,test=0.1):
120 |   error=0.0
121 |   for i in range(trials):
122 |     trainset,testset=dividedata(data,test)
123 |     error+=testalgorithm(algf,trainset,testset)
124 |   return error/trials
125 | 
126 | def wineset2():
127 |   rows=[]
128 |   for i in range(300):
129 |     rating=random()*50+50
130 |     age=random()*50
131 |     aisle=float(randint(1,20))
132 |     bottlesize=[375.0,750.0,1500.0][randint(0,2)]
133 |     price=wineprice(rating,age)
134 |     price*=(bottlesize/750)
135 |     price*=(random()*0.2+0.9)
136 |     rows.append({'input':(rating,age,aisle,bottlesize),
137 |                  'result':price})
138 |   return rows
139 | 
140 | def rescale(data,scale):
141 |   scaleddata=[]
142 |   for row in data:
143 |     scaled=[scale[i]*row['input'][i] for i in range(len(scale))]
144 |     scaleddata.append({'input':scaled,'result':row['result']})
145 |   return scaleddata
146 | 
147 | def createcostfunction(algf,data):
148 |   def costf(scale):
149 |     sdata=rescale(data,scale)
150 |     return crossvalidate(algf,sdata,trials=20)
151 |   return costf
152 | 
153 | weightdomain=[(0,10)]*4
154 | 
155 | def wineset3():
156 |   rows=wineset1()
157 |   for row in rows:
158 |     if random()<0.5:
159 |       # Wine was bought at a discount store
160 |       row['result']*=0.6
161 |   return rows
162 | 
163 | def probguess(data,vec1,low,high,k=5,weightf=gaussian):
164 |   dlist=getdistances(data,vec1)
165 |   nweight=0.0
166 |   tweight=0.0
167 |   
168 |   for i in range(k):
169 |     dist=dlist[i][0]
170 |     idx=dlist[i][1]
171 |     weight=weightf(dist)
172 |     v=data[idx]['result']
173 |     
174 |     # Is this point in the range?
175 |     if v>=low and v<=high:
176 |       nweight+=weight
177 |     tweight+=weight
178 |   if tweight==0: return 0
179 |   
180 |   # The probability is the weights in the range
181 |   # divided by all the weights
182 |   return nweight/tweight
183 | 
184 | from pylab import *
185 | 
186 | def cumulativegraph(data,vec1,high,k=5,weightf=gaussian):
187 |   t1=arange(0.0,high,0.1)
188 |   cprob=array([probguess(data,vec1,0,v,k,weightf) for v in t1])
189 |   plot(t1,cprob)
190 |   show()
191 | 
192 | 
193 | def probabilitygraph(data,vec1,high,k=5,weightf=gaussian,ss=5.0):
194 |   # Make a range for the prices
195 |   t1=arange(0.0,high,0.1)
196 |   
197 |   # Get the probabilities for the entire range
198 |   probs=[probguess(data,vec1,v,v+0.1,k,weightf) for v in t1]
199 |   
200 |   # Smooth them by adding the gaussian of the nearby probabilites
201 |   smoothed=[]
202 |   for i in range(len(probs)):
203 |     sv=0.0
204 |     for j in range(0,len(probs)):
205 |       dist=abs(i-j)*0.1
206 |       weight=gaussian(dist,sigma=ss)
207 |       sv+=weight*probs[j]
208 |     smoothed.append(sv)
209 |   smoothed=array(smoothed)
210 |     
211 |   plot(t1,smoothed)
212 |   show()
213 | 


--------------------------------------------------------------------------------
/第08章 构建价格模型/optimization.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import random
  3 | import math
  4 | 
  5 | people = [('Seymour','BOS'),
  6 |           ('Franny','DAL'),
  7 |           ('Zooey','CAK'),
  8 |           ('Walt','MIA'),
  9 |           ('Buddy','ORD'),
 10 |           ('Les','OMA')]
 11 | # Laguardia
 12 | destination='LGA'
 13 | 
 14 | flights={}
 15 | # 
 16 | """
 17 | for line in file('schedule.txt'):
 18 |   origin,dest,depart,arrive,price=line.strip().split(',')
 19 |   flights.setdefault((origin,dest),[])
 20 | 
 21 |   # Add details to the list of possible flights
 22 |   flights[(origin,dest)].append((depart,arrive,int(price)))
 23 | """
 24 | def getminutes(t):
 25 |   x=time.strptime(t,'%H:%M')
 26 |   return x[3]*60+x[4]
 27 | 
 28 | def printschedule(r):
 29 |   for d in range(len(r)/2):
 30 |     name=people[d][0]
 31 |     origin=people[d][1]
 32 |     out=flights[(origin,destination)][int(r[d])]
 33 |     ret=flights[(destination,origin)][int(r[d+1])]
 34 |     print '%10s%10s %5s-%5s $%3s %5s-%5s $%3s' % (name,origin,
 35 |                                                   out[0],out[1],out[2],
 36 |                                                   ret[0],ret[1],ret[2])
 37 | 
 38 | def schedulecost(sol):
 39 |   totalprice=0
 40 |   latestarrival=0
 41 |   earliestdep=24*60
 42 | 
 43 |   for d in range(len(sol)/2):
 44 |     # Get the inbound and outbound flights
 45 |     origin=people[d][1]
 46 |     outbound=flights[(origin,destination)][int(sol[d])]
 47 |     returnf=flights[(destination,origin)][int(sol[d+1])]
 48 |     
 49 |     # Total price is the price of all outbound and return flights
 50 |     totalprice+=outbound[2]
 51 |     totalprice+=returnf[2]
 52 |     
 53 |     # Track the latest arrival and earliest departure
 54 |     if latestarrival<getminutes(outbound[1]): latestarrival=getminutes(outbound[1])
 55 |     if earliestdep>getminutes(returnf[0]): earliestdep=getminutes(returnf[0])
 56 |   
 57 |   # Every person must wait at the airport until the latest person arrives.
 58 |   # They also must arrive at the same time and wait for their flights.
 59 |   totalwait=0  
 60 |   for d in range(len(sol)/2):
 61 |     origin=people[d][1]
 62 |     outbound=flights[(origin,destination)][int(sol[d])]
 63 |     returnf=flights[(destination,origin)][int(sol[d+1])]
 64 |     totalwait+=latestarrival-getminutes(outbound[1])
 65 |     totalwait+=getminutes(returnf[0])-earliestdep  
 66 | 
 67 |   # Does this solution require an extra day of car rental? That'll be $50!
 68 |   if latestarrival>earliestdep: totalprice+=50
 69 |   
 70 |   return totalprice+totalwait
 71 | 
 72 | def randomoptimize(domain,costf):
 73 |   best=999999999
 74 |   bestr=None
 75 |   for i in range(0,1000):
 76 |     # Create a random solution
 77 |     r=[float(random.randint(domain[i][0],domain[i][1])) 
 78 |        for i in range(len(domain))]
 79 |     
 80 |     # Get the cost
 81 |     cost=costf(r)
 82 |     
 83 |     # Compare it to the best one so far
 84 |     if cost<best:
 85 |       best=cost
 86 |       bestr=r 
 87 |   return r
 88 | 
 89 | 
 90 | def annealingoptimize(domain,costf,T=10000.0,cool=0.95,step=1):
 91 |   # Initialize the values randomly
 92 |   vec=[float(random.randint(domain[i][0],domain[i][1])) 
 93 |        for i in range(len(domain))]
 94 |   
 95 |   while T>0.1:
 96 |     # Choose one of the indices
 97 |     i=random.randint(0,len(domain)-1)
 98 | 
 99 |     # Choose a direction to change it
100 |     dir=random.randint(-step,step)
101 | 
102 |     # Create a new list with one of the values changed
103 |     vecb=vec[:]
104 |     vecb[i]+=dir
105 |     if vecb[i]<domain[i][0]: vecb[i]=domain[i][0]
106 |     elif vecb[i]>domain[i][1]: vecb[i]=domain[i][1]
107 | 
108 |     # Calculate the current cost and the new cost
109 |     ea=costf(vec)
110 |     eb=costf(vecb)
111 |     p=pow(math.e,(-eb-ea)/T)
112 | 
113 |     print vec,ea
114 | 
115 | 
116 |     # Is it better, or does it make the probability
117 |     # cutoff?
118 |     if (eb<ea or random.random()<p):
119 |       vec=vecb      
120 | 
121 |     # Decrease the temperature
122 |     T=T*cool
123 |   return vec
124 | 
125 | def swarmoptimize(domain,costf,popsize=20,lrate=0.1,maxv=2.0,iters=50):
126 |   # Initialize individuals
127 |   # current solutions
128 |   x=[]
129 | 
130 |   # best solutions
131 |   p=[]
132 | 
133 |   # velocities
134 |   v=[]
135 |   
136 |   for i in range(0,popsize):
137 |     vec=[float(random.randint(domain[i][0],domain[i][1])) 
138 |          for i in range(len(domain))]
139 |     x.append(vec)
140 |     p.append(vec[:])
141 |     v.append([0.0 for i in vec])
142 |   
143 |   
144 |   for ml in range(0,iters):
145 |     for i in range(0,popsize):
146 |       # Best solution for this particle
147 |       if costf(x[i])<costf(p[i]):
148 |         p[i]=x[i][:]
149 |       g=i
150 | 
151 |       # Best solution for any particle
152 |       for j in range(0,popsize):
153 |         if costf(p[j])<costf(p[g]): g=j
154 |       for d in range(len(x[i])):
155 |         # Update the velocity of this particle
156 |         v[i][d]+=lrate*(p[i][d]-x[i][d])+lrate*(p[g][d]-x[i][d])
157 | 
158 |         # constrain velocity to a maximum
159 |         if v[i][d]>maxv: v[i][d]=maxv
160 |         elif v[i][d]<-maxv: v[i][d]=-maxv
161 | 
162 |         # constrain bounds of solutions
163 |         x[i][d]+=v[i][d]
164 |         if x[i][d]<domain[d][0]: x[i][d]=domain[d][0]
165 |         elif x[i][d]>domain[d][1]: x[i][d]=domain[d][1]
166 | 
167 |     print p[g],costf(p[g])
168 |   return p[g]
169 | 


--------------------------------------------------------------------------------
/第09章 高阶分类 核方法与SVM/advancedclassify.py:
--------------------------------------------------------------------------------
  1 | class matchrow:
  2 |   def __init__(self,row,allnum=False):
  3 |     if allnum:
  4 |       self.data=[float(row[i]) for i in range(len(row)-1)]
  5 |     else:
  6 |       self.data=row[0:len(row)-1]
  7 |     self.match=int(row[len(row)-1])
  8 | 
  9 | def loadmatch(f,allnum=False):
 10 |   rows=[]
 11 |   for line in file(f):
 12 |     rows.append(matchrow(line.split(','),allnum))
 13 |   return rows
 14 |  
 15 | from pylab import *
 16 | def plotagematches(rows):
 17 |   xdm,ydm=[r.data[0] for r in rows if r.match==1],\
 18 |           [r.data[1] for r in rows if r.match==1]
 19 |   xdn,ydn=[r.data[0] for r in rows if r.match==0],\
 20 |           [r.data[1] for r in rows if r.match==0] 
 21 |   
 22 |   plot(xdm,ydm,'bo')
 23 |   plot(xdn,ydn,'b+')
 24 |   
 25 |   show()
 26 | 
 27 | def lineartrain(rows):
 28 |   averages={}
 29 |   counts={}
 30 |   
 31 |   for row in rows:
 32 |     # Get the class of this point
 33 |     cl=row.match
 34 |     
 35 |     averages.setdefault(cl,[0.0]*(len(row.data)))
 36 |     counts.setdefault(cl,0)
 37 |     
 38 |     # Add this point to the averages
 39 |     for i in range(len(row.data)):
 40 |       averages[cl][i]+=float(row.data[i])
 41 |       
 42 |     # Keep track of how many points in each class
 43 |     counts[cl]+=1
 44 |     
 45 |   # Divide sums by counts to get the averages
 46 |   for cl,avg in averages.items():
 47 |     for i in range(len(avg)):
 48 |       avg[i]/=counts[cl]
 49 |   
 50 |   return averages
 51 | 
 52 | def dotproduct(v1,v2):
 53 |   return sum([v1[i]*v2[i] for i in range(len(v1))])
 54 | 
 55 | def veclength(v):
 56 |   return sum([p**2 for p in v])
 57 | 
 58 | def dpclassify(point,avgs):
 59 |   b=(dotproduct(avgs[1],avgs[1])-dotproduct(avgs[0],avgs[0]))/2
 60 |   y=dotproduct(point,avgs[0])-dotproduct(point,avgs[1])+b
 61 |   if y>0: return 0
 62 |   else: return 1
 63 | 
 64 | def yesno(v):
 65 |   if v=='yes': return 1
 66 |   elif v=='no': return -1
 67 |   else: return 0
 68 |   
 69 | def matchcount(interest1,interest2):
 70 |   l1=interest1.split(':')
 71 |   l2=interest2.split(':')
 72 |   x=0
 73 |   for v in l1:
 74 |     if v in l2: x+=1
 75 |   return x
 76 | 
 77 | yahookey="YOUR API KEY"
 78 | from xml.dom.minidom import parseString
 79 | from urllib import urlopen,quote_plus
 80 | 
 81 | loc_cache={}
 82 | def getlocation(address):
 83 |   if address in loc_cache: return loc_cache[address]
 84 |   data=urlopen('http://api.local.yahoo.com/MapsService/V1/'+\
 85 |                'geocode?appid=%s&location=%s' %
 86 |                (yahookey,quote_plus(address))).read()
 87 |   doc=parseString(data)
 88 |   lat=doc.getElementsByTagName('Latitude')[0].firstChild.nodeValue
 89 |   long=doc.getElementsByTagName('Longitude')[0].firstChild.nodeValue  
 90 |   loc_cache[address]=(float(lat),float(long))
 91 |   return loc_cache[address]
 92 | 
 93 | def milesdistance(a1,a2):
 94 |   lat1,long1=getlocation(a1)
 95 |   lat2,long2=getlocation(a2)
 96 |   latdif=69.1*(lat2-lat1)
 97 |   longdif=53.0*(long2-long1)
 98 |   return (latdif**2+longdif**2)**.5
 99 | 
100 | def loadnumerical():
101 |   oldrows=loadmatch('matchmaker.csv')
102 |   newrows=[]
103 |   for row in oldrows:
104 |     d=row.data
105 |     data=[float(d[0]),yesno(d[1]),yesno(d[2]),
106 |           float(d[5]),yesno(d[6]),yesno(d[7]),
107 |           matchcount(d[3],d[8]),
108 |           milesdistance(d[4],d[9]),
109 |           row.match]
110 |     newrows.append(matchrow(data))
111 |   return newrows
112 | 
113 | def scaledata(rows):
114 |   low=[999999999.0]*len(rows[0].data)
115 |   high=[-999999999.0]*len(rows[0].data)
116 |   # Find the lowest and highest values
117 |   for row in rows:
118 |     d=row.data
119 |     for i in range(len(d)):
120 |       if d[i]<low[i]: low[i]=d[i]
121 |       if d[i]>high[i]: high[i]=d[i]
122 |   
123 |   # Create a function that scales data
124 |   def scaleinput(d):
125 |      return [(d[i]-low[i])/(high[i]-low[i])
126 |             for i in range(len(low))]
127 |   
128 |   # Scale all the data
129 |   newrows=[matchrow(scaleinput(row.data)+[row.match])
130 |            for row in rows]
131 |   
132 |   # Return the new data and the function
133 |   return newrows,scaleinput
134 | 
135 | 
136 | def rbf(v1,v2,gamma=10):
137 |   dv=[v1[i]-v2[i] for i in range(len(v1))]
138 |   l=veclength(dv)
139 |   return math.e**(-gamma*l)
140 | 
141 | def nlclassify(point,rows,offset,gamma=10):
142 |   sum0=0.0
143 |   sum1=0.0
144 |   count0=0
145 |   count1=0
146 |   
147 |   for row in rows:
148 |     if row.match==0:
149 |       sum0+=rbf(point,row.data,gamma)
150 |       count0+=1
151 |     else:
152 |       sum1+=rbf(point,row.data,gamma)
153 |       count1+=1
154 |   y=(1.0/count0)*sum0-(1.0/count1)*sum1+offset
155 | 
156 |   if y>0: return 0
157 |   else: return 1
158 | 
159 | def getoffset(rows,gamma=10):
160 |   l0=[]
161 |   l1=[]
162 |   for row in rows:
163 |     if row.match==0: l0.append(row.data)
164 |     else: l1.append(row.data)
165 |   sum0=sum(sum([rbf(v1,v2,gamma) for v1 in l0]) for v2 in l0)
166 |   sum1=sum(sum([rbf(v1,v2,gamma) for v1 in l1]) for v2 in l1)
167 |   
168 |   return (1.0/(len(l1)**2))*sum1-(1.0/(len(l0)**2))*sum0
169 | 


--------------------------------------------------------------------------------
/第09章 高阶分类 核方法与SVM/agesonly.csv:
--------------------------------------------------------------------------------
  1 | 24,30,1
  2 | 30,40,1
  3 | 22,49,0
  4 | 43,39,1
  5 | 23,30,1
  6 | 23,49,0
  7 | 48,46,1
  8 | 23,23,1
  9 | 29,49,0
 10 | 38,38,1
 11 | 30,34,1
 12 | 40,50,1
 13 | 35,32,1
 14 | 49,44,1
 15 | 38,22,1
 16 | 30,27,1
 17 | 26,24,1
 18 | 39,23,1
 19 | 36,43,1
 20 | 25,31,1
 21 | 27,27,1
 22 | 32,22,1
 23 | 40,30,1
 24 | 26,28,1
 25 | 46,32,1
 26 | 41,37,1
 27 | 39,41,1
 28 | 18,28,0
 29 | 18,47,0
 30 | 39,44,1
 31 | 38,21,1
 32 | 24,36,0
 33 | 32,22,1
 34 | 21,20,1
 35 | 42,36,1
 36 | 46,41,1
 37 | 39,38,1
 38 | 18,31,0
 39 | 31,45,1
 40 | 44,24,0
 41 | 49,22,0
 42 | 26,27,1
 43 | 25,34,1
 44 | 47,23,0
 45 | 27,48,0
 46 | 32,49,1
 47 | 46,41,1
 48 | 24,32,1
 49 | 29,26,1
 50 | 25,36,1
 51 | 27,35,1
 52 | 38,19,1
 53 | 18,40,0
 54 | 34,49,1
 55 | 32,35,1
 56 | 47,49,1
 57 | 47,18,0
 58 | 33,24,1
 59 | 35,28,1
 60 | 35,41,1
 61 | 39,43,1
 62 | 29,18,1
 63 | 18,44,0
 64 | 26,26,1
 65 | 31,43,1
 66 | 20,29,0
 67 | 28,18,1
 68 | 31,38,1
 69 | 34,34,1
 70 | 32,33,1
 71 | 34,27,1
 72 | 19,38,0
 73 | 32,21,1
 74 | 33,37,1
 75 | 33,18,1
 76 | 18,46,0
 77 | 31,37,1
 78 | 36,30,1
 79 | 40,40,1
 80 | 38,30,1
 81 | 49,28,1
 82 | 31,47,1
 83 | 28,50,0
 84 | 49,43,1
 85 | 24,31,1
 86 | 33,43,1
 87 | 28,24,1
 88 | 45,29,1
 89 | 49,35,1
 90 | 36,29,1
 91 | 42,32,1
 92 | 29,18,1
 93 | 49,20,0
 94 | 22,27,1
 95 | 41,38,1
 96 | 47,21,0
 97 | 40,32,1
 98 | 35,18,1
 99 | 35,33,1
100 | 34,28,1
101 | 22,31,0
102 | 46,20,0
103 | 18,49,0
104 | 48,23,0
105 | 39,21,1
106 | 20,34,0
107 | 24,20,1
108 | 38,18,1
109 | 37,47,1
110 | 39,37,1
111 | 38,39,1
112 | 27,42,1
113 | 47,49,1
114 | 27,42,1
115 | 40,28,1
116 | 41,46,1
117 | 39,25,1
118 | 43,36,1
119 | 49,30,1
120 | 24,38,0
121 | 49,42,1
122 | 19,22,0
123 | 43,27,1
124 | 30,37,1
125 | 24,31,1
126 | 24,48,0
127 | 24,29,1
128 | 18,19,1
129 | 29,25,1
130 | 38,33,1
131 | 39,20,1
132 | 24,30,1
133 | 22,39,0
134 | 47,21,0
135 | 30,44,1
136 | 41,38,1
137 | 29,33,1
138 | 42,42,1
139 | 47,27,1
140 | 23,20,1
141 | 39,18,1
142 | 30,26,1
143 | 36,27,1
144 | 40,18,1
145 | 31,18,1
146 | 46,27,1
147 | 41,44,1
148 | 26,34,1
149 | 33,18,1
150 | 48,19,0
151 | 46,27,1
152 | 25,40,0
153 | 50,36,1
154 | 20,21,1
155 | 33,47,1
156 | 40,35,1
157 | 24,27,1
158 | 34,19,1
159 | 26,45,0
160 | 34,36,1
161 | 21,27,0
162 | 48,28,1
163 | 23,25,1
164 | 48,46,1
165 | 30,20,1
166 | 23,40,0
167 | 36,40,1
168 | 21,45,0
169 | 30,40,1
170 | 39,24,1
171 | 42,47,1
172 | 28,37,1
173 | 24,30,1
174 | 37,25,1
175 | 44,34,1
176 | 43,32,1
177 | 46,29,1
178 | 49,22,0
179 | 41,28,1
180 | 23,50,0
181 | 30,43,1
182 | 25,32,1
183 | 27,46,0
184 | 23,21,1
185 | 39,41,1
186 | 33,27,1
187 | 49,21,0
188 | 33,33,1
189 | 18,25,0
190 | 42,35,1
191 | 36,25,1
192 | 26,50,0
193 | 18,37,0
194 | 35,37,1
195 | 39,38,1
196 | 22,30,0
197 | 18,44,0
198 | 46,44,1
199 | 24,27,1
200 | 41,34,1
201 | 40,39,1
202 | 34,49,1
203 | 35,41,1
204 | 46,48,1
205 | 50,23,0
206 | 49,20,0
207 | 22,47,0
208 | 27,26,1
209 | 30,30,1
210 | 37,39,1
211 | 42,44,1
212 | 41,27,1
213 | 24,21,1
214 | 34,28,1
215 | 23,43,0
216 | 43,35,1
217 | 42,40,1
218 | 25,24,1
219 | 36,24,1
220 | 25,23,1
221 | 44,30,1
222 | 39,33,1
223 | 38,33,1
224 | 49,30,1
225 | 40,19,1
226 | 19,46,0
227 | 31,21,1
228 | 48,33,1
229 | 26,24,1
230 | 20,37,0
231 | 29,31,1
232 | 35,28,1
233 | 37,25,1
234 | 42,42,1
235 | 42,48,1
236 | 41,47,1
237 | 44,45,1
238 | 45,46,1
239 | 25,38,1
240 | 19,45,0
241 | 36,26,1
242 | 33,36,1
243 | 27,19,1
244 | 48,24,0
245 | 37,48,1
246 | 23,31,0
247 | 20,29,0
248 | 27,44,0
249 | 47,24,0
250 | 36,18,1
251 | 37,48,1
252 | 32,29,1
253 | 46,48,1
254 | 31,47,1
255 | 23,45,0
256 | 28,30,1
257 | 36,32,1
258 | 25,43,0
259 | 24,44,0
260 | 34,47,1
261 | 46,42,1
262 | 18,31,0
263 | 23,25,1
264 | 44,39,1
265 | 18,29,0
266 | 49,40,1
267 | 24,33,0
268 | 21,44,0
269 | 40,24,1
270 | 46,41,1
271 | 42,33,1
272 | 25,41,0
273 | 29,42,1
274 | 40,18,1
275 | 37,40,1
276 | 46,28,1
277 | 33,20,1
278 | 18,42,0
279 | 22,36,0
280 | 27,46,0
281 | 33,48,1
282 | 21,37,0
283 | 26,50,0
284 | 29,23,1
285 | 23,33,0
286 | 21,38,0
287 | 18,30,0
288 | 29,28,1
289 | 31,22,1
290 | 30,48,1
291 | 41,37,1
292 | 35,31,1
293 | 48,32,1
294 | 29,37,1
295 | 32,33,1
296 | 43,26,1
297 | 21,33,0
298 | 44,28,1
299 | 35,18,1
300 | 35,35,1
301 | 25,20,1
302 | 39,46,1
303 | 26,39,1
304 | 36,29,1
305 | 29,44,1
306 | 28,42,1
307 | 38,21,1
308 | 28,49,0
309 | 33,26,1
310 | 31,28,1
311 | 25,47,0
312 | 23,25,1
313 | 45,49,1
314 | 28,26,1
315 | 36,48,1
316 | 42,48,1
317 | 42,21,1
318 | 29,32,1
319 | 26,28,1
320 | 24,46,0
321 | 39,30,1
322 | 29,46,1
323 | 43,43,1
324 | 20,42,0
325 | 35,41,1
326 | 45,19,0
327 | 38,45,1
328 | 25,38,1
329 | 31,20,1
330 | 38,43,1
331 | 37,30,1
332 | 43,27,1
333 | 43,44,1
334 | 21,30,0
335 | 22,45,0
336 | 44,26,1
337 | 43,42,1
338 | 26,41,0
339 | 47,35,1
340 | 48,30,1
341 | 41,24,1
342 | 19,48,0
343 | 45,24,0
344 | 38,41,1
345 | 42,46,1
346 | 49,45,1
347 | 28,44,1
348 | 22,44,0
349 | 31,48,1
350 | 48,21,0
351 | 31,20,1
352 | 30,39,1
353 | 23,23,1
354 | 21,32,0
355 | 19,19,1
356 | 21,27,0
357 | 24,46,0
358 | 25,28,1
359 | 48,50,1
360 | 25,32,1
361 | 26,29,1
362 | 33,48,1
363 | 35,32,1
364 | 48,25,1
365 | 30,27,1
366 | 34,49,1
367 | 40,45,1
368 | 28,32,1
369 | 47,33,1
370 | 29,33,1
371 | 21,22,1
372 | 21,39,0
373 | 41,45,1
374 | 46,39,1
375 | 22,24,1
376 | 32,22,1
377 | 27,46,0
378 | 26,35,1
379 | 27,29,1
380 | 48,19,0
381 | 35,26,1
382 | 42,29,1
383 | 30,22,1
384 | 20,26,0
385 | 33,25,1
386 | 37,30,1
387 | 37,32,1
388 | 20,22,1
389 | 42,48,1
390 | 29,20,1
391 | 32,46,1
392 | 37,34,1
393 | 29,45,1
394 | 19,44,0
395 | 49,18,0
396 | 28,25,1
397 | 48,31,1
398 | 35,46,1
399 | 34,26,1
400 | 38,26,1
401 | 36,31,1
402 | 31,30,1
403 | 27,19,1
404 | 44,38,1
405 | 19,37,0
406 | 43,49,1
407 | 19,42,0
408 | 32,24,1
409 | 46,43,1
410 | 43,46,1
411 | 33,32,1
412 | 23,35,0
413 | 26,34,1
414 | 48,20,0
415 | 45,38,1
416 | 30,30,1
417 | 28,23,1
418 | 43,36,1
419 | 19,37,0
420 | 39,45,1
421 | 20,30,0
422 | 28,30,1
423 | 19,42,0
424 | 41,21,1
425 | 42,31,1
426 | 47,45,1
427 | 42,48,1
428 | 40,22,1
429 | 28,20,1
430 | 22,31,0
431 | 28,24,1
432 | 18,33,0
433 | 42,47,1
434 | 35,18,1
435 | 32,28,1
436 | 45,39,1
437 | 46,45,1
438 | 41,43,1
439 | 24,37,0
440 | 34,30,1
441 | 40,22,1
442 | 38,20,1
443 | 43,28,1
444 | 21,26,0
445 | 35,27,1
446 | 33,37,1
447 | 48,39,1
448 | 47,40,1
449 | 31,32,1
450 | 18,32,0
451 | 31,20,1
452 | 30,49,1
453 | 22,46,0
454 | 36,39,1
455 | 30,35,1
456 | 49,50,1
457 | 46,39,1
458 | 45,44,1
459 | 34,40,1
460 | 27,28,1
461 | 27,35,1
462 | 46,46,1
463 | 26,42,0
464 | 27,18,1
465 | 23,38,0
466 | 30,30,1
467 | 34,32,1
468 | 48,27,1
469 | 31,23,1
470 | 29,47,0
471 | 47,31,1
472 | 35,19,1
473 | 30,28,1
474 | 33,44,1
475 | 36,37,1
476 | 34,44,1
477 | 42,43,1
478 | 36,29,1
479 | 35,46,1
480 | 22,36,0
481 | 39,47,1
482 | 23,23,1
483 | 47,20,0
484 | 38,22,1
485 | 21,33,0
486 | 37,41,1
487 | 18,18,1
488 | 35,34,1
489 | 49,49,1
490 | 33,32,1
491 | 31,19,1
492 | 31,26,1
493 | 45,31,1
494 | 41,44,1
495 | 27,47,0
496 | 28,26,1
497 | 18,47,0
498 | 37,18,1
499 | 20,42,0
500 | 36,45,1
501 | 


--------------------------------------------------------------------------------
/第09章 高阶分类 核方法与SVM/facebook.py:
--------------------------------------------------------------------------------
  1 | import urllib,md5,webbrowser,time
  2 | from xml.dom.minidom import parseString
  3 | 
  4 | apikey="47e953c8ea9ed30db904af453125c759"
  5 | secret="ea703e4721e8c7bf88b92110a46a9b06"
  6 | FacebookURL = "https://api.facebook.com/restserver.php"
  7 | 
  8 | def getsinglevalue(node,tag):
  9 |   nl=node.getElementsByTagName(tag)
 10 |   if len(nl)>0:
 11 |     tagNode=nl[0]
 12 |     if tagNode.hasChildNodes():
 13 |       return tagNode.firstChild.nodeValue
 14 |   return ''
 15 | 
 16 | def callid(): 
 17 |   return str(int(time.time()*10))
 18 | 
 19 | class fbsession:
 20 |   def __init__(self):
 21 |     self.session_secret=None
 22 |     self.session_key=None
 23 |     self.createtoken()
 24 |     webbrowser.open(self.getlogin())
 25 |     print "Press enter after logging in:",
 26 |     raw_input()
 27 |     self.getsession()
 28 |   def sendrequest(self, args):
 29 |     args['api_key'] = apikey
 30 |     args['sig'] = self.makehash(args)
 31 |     post_data = urllib.urlencode(args)
 32 |     url = FacebookURL + "?" + post_data
 33 |     data=urllib.urlopen(url).read()
 34 |     print data
 35 |     return parseString(data)
 36 |   def makehash(self,args):
 37 |     hasher = md5.new(''.join([x + '=' + args[x] for x in sorted(args.keys())]))
 38 |     if self.session_secret: hasher.update(self.session_secret)
 39 |     else: hasher.update(secret)
 40 |     return hasher.hexdigest()
 41 |   def createtoken(self):
 42 |     res = self.sendrequest({'method':"facebook.auth.createToken"})
 43 |     self.token = getsinglevalue(res,'token')
 44 |   def getlogin(self):
 45 |     return "http://api.facebook.com/login.php?api_key="+apikey+\
 46 |            "&auth_token=" + self.token
 47 |   def getsession(self):
 48 |     doc=self.sendrequest({'method':'facebook.auth.getSession',
 49 |                                'auth_token':self.token})
 50 |     self.session_key=getsinglevalue(doc,'session_key')
 51 |     self.session_secret=getsinglevalue(doc,'secret')
 52 |   def getfriends(self):
 53 |     doc=self.sendrequest({'method':'facebook.friends.get',
 54 |                           'session_key':self.session_key,'call_id':callid()})
 55 |     results=[]
 56 |     for n in doc.getElementsByTagName('result_elt'):
 57 |       results.append(n.firstChild.nodeValue)
 58 |     return results
 59 | 
 60 |   def getinfo(self,users):
 61 |     ulist=','.join(users)
 62 |     
 63 |     fields='gender,current_location,relationship_status,'+\
 64 |            'affiliations,hometown_location'
 65 |     
 66 |     doc=self.sendrequest({'method':'facebook.users.getInfo',
 67 |     'session_key':self.session_key,'call_id':callid(),
 68 |     'users':ulist,'fields':fields})
 69 | 
 70 |     results={}
 71 |     for n,id in zip(doc.getElementsByTagName('result_elt'),users):
 72 |       # Get the location
 73 |       locnode=n.getElementsByTagName('hometown_location')[0]
 74 |       loc=getsinglevalue(locnode,'city')+', '+getsinglevalue(locnode,'state')
 75 |       
 76 |       # Get school
 77 |       college=''
 78 |       gradyear='0'
 79 |       affiliations=n.getElementsByTagName('affiliations_elt')
 80 |       for aff in affiliations:
 81 |         # Type 1 is college
 82 |         if getsinglevalue(aff,'type')=='1': 
 83 |           college=getsinglevalue(aff,'name')
 84 |           gradyear=getsinglevalue(aff,'year')
 85 |       
 86 |       results[id]={'gender':getsinglevalue(n,'gender'),
 87 |                    'status':getsinglevalue(n,'relationship_status'),
 88 |                    'location':loc,'college':college,'year':gradyear}
 89 |     return results
 90 | 
 91 |   def arefriends(self,idlist1,idlist2):
 92 |     id1=','.join(idlist1)
 93 |     id2=','.join(idlist2)
 94 |     doc=self.sendrequest({'method':'facebook.friends.areFriends',
 95 |                           'session_key':self.session_key,'call_id':callid(),
 96 |                           'id1':id1,'id2':id2})
 97 |     results=[]
 98 |     for n in doc.getElementsByTagName('result_elt'):
 99 |       results.append(int(n.firstChild.nodeValue))
100 |     return results
101 |   
102 |   
103 | 
104 |   def makedataset(self):
105 |     from advancedclassify import milesdistance
106 |     # Get all the info for all my friends
107 |     friends=self.getfriends()
108 |     info=self.getinfo(friends)
109 |     ids1,ids2=[],[]
110 |     rows=[]
111 | 
112 |     # Nested loop to look at every pair of friends
113 |     for i in range(len(friends)):
114 |       f1=friends[i]
115 |       data1=info[f1]
116 |       
117 |       # Start at i+1 so we don't double up
118 |       for j in range(i+1,len(friends)):
119 |         f2=friends[j]
120 |         data2=info[f2]
121 |         ids1.append(f1)
122 |         ids2.append(f2)
123 | 
124 |         # Generate some numbers from the data
125 |         if data1['college']==data2['college']: sameschool=1
126 |         else: sameschool=0
127 |         male1=(data1['gender']=='Male') and 1 or 0
128 |         male2=(data2['gender']=='Male') and 1 or 0        
129 |         
130 |         row=[male1,int(data1['year']),male2,int(data2['year']),sameschool]
131 |         rows.append(row)
132 |     # Call arefriends in blocks for every pair of people
133 |     arefriends=[]
134 |     for i in range(0,len(ids1),30):
135 |       j=min(i+30,len(ids1))
136 |       pa=self.arefriends(ids1[i:j],ids2[i:j])
137 |       arefriends+=pa
138 |     return arefriends,rows
139 |   
140 | 


--------------------------------------------------------------------------------
/第09章 高阶分类 核方法与SVM/svm.py:
--------------------------------------------------------------------------------
  1 | import svmc
  2 | from svmc import C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR
  3 | from svmc import LINEAR, POLY, RBF, SIGMOID
  4 | from math import exp, fabs
  5 | 
  6 | def _int_array(seq):
  7 | 	size = len(seq)
  8 | 	array = svmc.new_int(size)
  9 | 	i = 0
 10 | 	for item in seq:
 11 | 		svmc.int_setitem(array,i,item)
 12 | 		i = i + 1
 13 | 	return array
 14 | 
 15 | def _double_array(seq):
 16 | 	size = len(seq)
 17 | 	array = svmc.new_double(size)
 18 | 	i = 0
 19 | 	for item in seq:
 20 | 		svmc.double_setitem(array,i,item)
 21 | 		i = i + 1
 22 | 	return array
 23 | 
 24 | def _free_int_array(x):
 25 | 	if x != 'NULL' and x != None:
 26 | 		svmc.delete_int(x)
 27 | 
 28 | def _free_double_array(x):
 29 | 	if x != 'NULL' and x != None:
 30 | 		svmc.delete_double(x)
 31 | 
 32 | def _int_array_to_list(x,n):
 33 | 	return map(svmc.int_getitem,[x]*n,range(n))
 34 | 
 35 | def _double_array_to_list(x,n):
 36 | 	return map(svmc.double_getitem,[x]*n,range(n))
 37 | 
 38 | class svm_parameter:
 39 | 	
 40 | 	# default values
 41 | 	default_parameters = {
 42 | 	'svm_type' : C_SVC,
 43 | 	'kernel_type' : RBF,
 44 | 	'degree' : 3,
 45 | 	'gamma' : 0,		# 1/k
 46 | 	'coef0' : 0,
 47 | 	'nu' : 0.5,
 48 | 	'cache_size' : 40,
 49 | 	'C' : 1,
 50 | 	'eps' : 1e-3,
 51 | 	'p' : 0.1,
 52 | 	'shrinking' : 1,
 53 | 	'nr_weight' : 0,
 54 | 	'weight_label' : [],
 55 | 	'weight' : [],
 56 | 	'probability' : 0
 57 | 	}
 58 | 
 59 | 	def __init__(self,**kw):
 60 | 		self.__dict__['param'] = svmc.new_svm_parameter()
 61 | 		for attr,val in self.default_parameters.items():
 62 | 			setattr(self,attr,val)
 63 | 		for attr,val in kw.items():
 64 | 			setattr(self,attr,val)
 65 | 
 66 | 	def __getattr__(self,attr):
 67 | 		get_func = getattr(svmc,'svm_parameter_%s_get' % (attr))
 68 | 		return get_func(self.param)
 69 | 
 70 | 	def __setattr__(self,attr,val):
 71 | 
 72 | 		if attr == 'weight_label':
 73 | 			self.__dict__['weight_label_len'] = len(val)
 74 | 			val = _int_array(val)
 75 | 			_free_int_array(self.weight_label)
 76 | 		elif attr == 'weight':
 77 | 			self.__dict__['weight_len'] = len(val)
 78 | 			val = _double_array(val)
 79 | 			_free_double_array(self.weight)
 80 | 
 81 | 		set_func = getattr(svmc,'svm_parameter_%s_set' % (attr))
 82 | 		set_func(self.param,val)
 83 | 
 84 | 	def __repr__(self):
 85 | 		ret = '<svm_parameter:'
 86 | 		for name in dir(svmc):
 87 | 			if name[:len('svm_parameter_')] == 'svm_parameter_' and name[-len('_set'):] == '_set':
 88 | 				attr = name[len('svm_parameter_'):-len('_set')]
 89 | 				if attr == 'weight_label':
 90 | 					ret = ret+' weight_label = %s,' % _int_array_to_list(self.weight_label,self.weight_label_len)
 91 | 				elif attr == 'weight':
 92 | 					ret = ret+' weight = %s,' % _double_array_to_list(self.weight,self.weight_len)
 93 | 				else:
 94 | 					ret = ret+' %s = %s,' % (attr,getattr(self,attr))
 95 | 		return ret+'>'
 96 | 
 97 | 	def __del__(self):
 98 | 		_free_int_array(self.weight_label)
 99 | 		_free_double_array(self.weight)
100 | 		svmc.delete_svm_parameter(self.param)
101 | 
102 | def _convert_to_svm_node_array(x):
103 | 	""" convert a sequence or mapping to an svm_node array """
104 | 	import operator
105 | 
106 | 	# Find non zero elements
107 | 	iter_range = []
108 | 	if type(x) == dict:
109 | 		for k, v in x.iteritems():
110 | # all zeros kept due to the precomputed kernel; no good solution yet
111 | #			if v != 0:
112 | 				iter_range.append( k )
113 | 	elif operator.isSequenceType(x):
114 | 		for j in range(len(x)):
115 | #			if x[j] != 0:
116 | 				iter_range.append( j )
117 | 	else:
118 | 		raise TypeError,"data must be a mapping or a sequence"
119 | 
120 | 	iter_range.sort()
121 | 	data = svmc.svm_node_array(len(iter_range)+1)
122 | 	svmc.svm_node_array_set(data,len(iter_range),-1,0)
123 | 
124 | 	j = 0
125 | 	for k in iter_range:
126 | 		svmc.svm_node_array_set(data,j,k,x[k])
127 | 		j = j + 1
128 | 	return data
129 | 
130 | class svm_problem:
131 | 	def __init__(self,y,x):
132 | 		assert len(y) == len(x)
133 | 		self.prob = prob = svmc.new_svm_problem()
134 | 		self.size = size = len(y)
135 | 
136 | 		self.y_array = y_array = svmc.new_double(size)
137 | 		for i in range(size):
138 | 			svmc.double_setitem(y_array,i,y[i])
139 | 
140 | 		self.x_matrix = x_matrix = svmc.svm_node_matrix(size)
141 | 		self.data = []
142 | 		self.maxlen = 0;
143 | 		for i in range(size):
144 | 			data = _convert_to_svm_node_array(x[i])
145 | 			self.data.append(data);
146 | 			svmc.svm_node_matrix_set(x_matrix,i,data)
147 | 			if type(x[i]) == dict:
148 | 				if (len(x[i]) > 0):
149 | 					self.maxlen = max(self.maxlen,max(x[i].keys()))
150 | 			else:
151 | 				self.maxlen = max(self.maxlen,len(x[i]))
152 | 
153 | 		svmc.svm_problem_l_set(prob,size)
154 | 		svmc.svm_problem_y_set(prob,y_array)
155 | 		svmc.svm_problem_x_set(prob,x_matrix)
156 | 
157 | 	def __repr__(self):
158 | 		return "<svm_problem: size = %s>" % (self.size)
159 | 
160 | 	def __del__(self):
161 | 		svmc.delete_svm_problem(self.prob)
162 | 		svmc.delete_double(self.y_array)
163 | 		for i in range(self.size):
164 | 			svmc.svm_node_array_destroy(self.data[i])
165 | 		svmc.svm_node_matrix_destroy(self.x_matrix)
166 | 
167 | class svm_model:
168 | 	def __init__(self,arg1,arg2=None):
169 | 		if arg2 == None:
170 | 			# create model from file
171 | 			filename = arg1
172 | 			self.model = svmc.svm_load_model(filename)
173 | 		else:
174 | 			# create model from problem and parameter
175 | 			prob,param = arg1,arg2
176 | 			self.prob = prob
177 | 			if param.gamma == 0:
178 | 				param.gamma = 1.0/prob.maxlen
179 | 			msg = svmc.svm_check_parameter(prob.prob,param.param)
180 | 			if msg: raise ValueError, msg
181 | 			self.model = svmc.svm_train(prob.prob,param.param)
182 | 
183 | 		#setup some classwide variables
184 | 		self.nr_class = svmc.svm_get_nr_class(self.model)
185 | 		self.svm_type = svmc.svm_get_svm_type(self.model)
186 | 		#create labels(classes)
187 | 		intarr = svmc.new_int(self.nr_class)
188 | 		svmc.svm_get_labels(self.model,intarr)
189 | 		self.labels = _int_array_to_list(intarr, self.nr_class)
190 | 		svmc.delete_int(intarr)
191 | 		#check if valid probability model
192 | 		self.probability = svmc.svm_check_probability_model(self.model)
193 | 
194 | 	def predict(self,x):
195 | 		data = _convert_to_svm_node_array(x)
196 | 		ret = svmc.svm_predict(self.model,data)
197 | 		svmc.svm_node_array_destroy(data)
198 | 		return ret
199 | 
200 | 
201 | 	def get_nr_class(self):
202 | 		return self.nr_class
203 | 
204 | 	def get_labels(self):
205 | 		if self.svm_type == NU_SVR or self.svm_type == EPSILON_SVR or self.svm_type == ONE_CLASS:
206 | 			raise TypeError, "Unable to get label from a SVR/ONE_CLASS model"
207 | 		return self.labels
208 | 		
209 | 	def predict_values_raw(self,x):
210 | 		#convert x into svm_node, allocate a double array for return
211 | 		n = self.nr_class*(self.nr_class-1)//2
212 | 		data = _convert_to_svm_node_array(x)
213 | 		dblarr = svmc.new_double(n)
214 | 		svmc.svm_predict_values(self.model, data, dblarr)
215 | 		ret = _double_array_to_list(dblarr, n)
216 | 		svmc.delete_double(dblarr)
217 | 		svmc.svm_node_array_destroy(data)
218 | 		return ret
219 | 
220 | 	def predict_values(self,x):
221 | 		v=self.predict_values_raw(x)
222 | 		if self.svm_type == NU_SVR or self.svm_type == EPSILON_SVR or self.svm_type == ONE_CLASS:
223 | 			return v[0]
224 | 		else: #self.svm_type == C_SVC or self.svm_type == NU_SVC
225 | 			count = 0
226 | 			d = {}
227 | 			for i in range(len(self.labels)):
228 | 				for j in range(i+1, len(self.labels)):
229 | 					d[self.labels[i],self.labels[j]] = v[count]
230 | 					d[self.labels[j],self.labels[i]] = -v[count]
231 | 					count += 1
232 | 			return  d
233 | 
234 | 	def predict_probability(self,x):
235 | 		#c code will do nothing on wrong type, so we have to check ourself
236 | 		if self.svm_type == NU_SVR or self.svm_type == EPSILON_SVR:
237 | 			raise TypeError, "call get_svr_probability or get_svr_pdf for probability output of regression"
238 | 		elif self.svm_type == ONE_CLASS:
239 | 			raise TypeError, "probability not supported yet for one-class problem"
240 | 		#only C_SVC,NU_SVC goes in
241 | 		if not self.probability:
242 | 			raise TypeError, "model does not support probabiliy estimates"
243 | 
244 | 		#convert x into svm_node, alloc a double array to receive probabilities
245 | 		data = _convert_to_svm_node_array(x)
246 | 		dblarr = svmc.new_double(self.nr_class)
247 | 		pred = svmc.svm_predict_probability(self.model, data, dblarr)
248 | 		pv = _double_array_to_list(dblarr, self.nr_class)
249 | 		svmc.delete_double(dblarr)
250 | 		svmc.svm_node_array_destroy(data)
251 | 		p = {}
252 | 		for i in range(len(self.labels)):
253 | 			p[self.labels[i]] = pv[i]
254 | 		return pred, p
255 | 	
256 | 	def get_svr_probability(self):
257 | 		#leave the Error checking to svm.cpp code
258 | 		ret = svmc.svm_get_svr_probability(self.model)
259 | 		if ret == 0:
260 | 			raise TypeError, "not a regression model or probability information not available"
261 | 		return ret
262 | 
263 | 	def get_svr_pdf(self):
264 | 		#get_svr_probability will handle error checking
265 | 		sigma = self.get_svr_probability()
266 | 		return lambda z: exp(-fabs(z)/sigma)/(2*sigma)
267 | 
268 | 
269 | 	def save(self,filename):
270 | 		svmc.svm_save_model(filename,self.model)
271 | 
272 | 	def __del__(self):
273 | 		svmc.svm_destroy_model(self.model)
274 | 
275 | 
276 | def cross_validation(prob, param, fold):
277 | 	if param.gamma == 0:
278 | 		param.gamma = 1.0/prob.maxlen
279 | 	dblarr = svmc.new_double(prob.size)
280 | 	svmc.svm_cross_validation(prob.prob, param.param, fold, dblarr)
281 | 	ret = _double_array_to_list(dblarr, prob.size)
282 | 	svmc.delete_double(dblarr)
283 | 	return ret
284 | 


--------------------------------------------------------------------------------
/第09章 高阶分类 核方法与SVM/svm.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第09章 高阶分类 核方法与SVM/svm.pyc


--------------------------------------------------------------------------------
/第09章 高阶分类 核方法与SVM/svmc.pyd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第09章 高阶分类 核方法与SVM/svmc.pyd


--------------------------------------------------------------------------------
/第10章 寻找独立特征/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第10章 寻找独立特征/Thumbs.db


--------------------------------------------------------------------------------
/第10章 寻找独立特征/articles.txt:
--------------------------------------------------------------------------------
  1 | Obesity not a problem
  2 | 0.689921777771 ['food', 'calories', 'than', 'easy', 'high', 'come']
  3 | 0.616521773806 ['with', 'your', 'weight', 'have', 'control', 'about']
  4 | 0.594775751071 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
  5 | 
  6 | Fitness equipment
  7 | 0.336438029037 ['with', 'your', 'weight', 'have', 'control', 'about']
  8 | 0.0336830699618 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
  9 | 0.0323861213375 ['that', 'much', 'does', 'exercise', 'this', 'morning']
 10 | 
 11 | 1000 Atkins Recipes E-Book
 12 | 2.056067447 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
 13 | 1.81222264198 ['with', 'your', 'weight', 'have', 'control', 'about']
 14 | 0.31319239108 ['quot', 'they', 'money', 'want', 'very', 'best']
 15 | 
 16 | saturday
 17 | 7.46811621754 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
 18 | 0.174282882652 ['food', 'calories', 'than', 'easy', 'high', 'come']
 19 | 0.00317828003493 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
 20 | 
 21 | Food & Exercise -- 10/13/2007
 22 | 4.73555293191 ['food', 'home', 'then', 'exercise', 'morning', 'went']
 23 | 0.937525542474 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
 24 | 0.10240571114 ['with', 'your', 'weight', 'have', 'control', 'about']
 25 | 
 26 | Food & exercise -- 10/12/2007 (yesterday)
 27 | 3.97594760235 ['food', 'home', 'then', 'exercise', 'morning', 'went']
 28 | 1.01018312908 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
 29 | 0.0198736807467 ['food', 'calories', 'than', 'easy', 'high', 'come']
 30 | 
 31 | Have you been enslaved and confused by the omniscience myth?
 32 | 1.14697423243 ['with', 'your', 'weight', 'have', 'control', 'about']
 33 | 0.548717665826 ['food', 'calories', 'than', 'easy', 'high', 'come']
 34 | 0.159098480615 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
 35 | 
 36 | High or low fat food? Easy trick for figuring it out
 37 | 9.98464450123 ['food', 'calories', 'than', 'easy', 'high', 'come']
 38 | 4.04959173123 ['quot', 'they', 'money', 'want', 'very', 'best']
 39 | 0.123588233146 ['fats', 'quot', 'this', 'good', 'about', 'like']
 40 | 
 41 | Absolutely Free People Search
 42 | 1.39249472006 ['with', 'your', 'weight', 'have', 'control', 'about']
 43 | 0.459779859548 ['fats', 'quot', 'this', 'good', 'about', 'like']
 44 | 0.4371224863 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
 45 | 
 46 | Friday
 47 | 4.35052263015 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
 48 | 0.609863992308 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
 49 | 0.249760019695 ['food', 'calories', 'than', 'easy', 'high', 'come']
 50 | 
 51 | Food and Workout Log 10.11.07
 52 | 4.76278425737 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
 53 | 1.18573164731 ['food', 'calories', 'than', 'easy', 'high', 'come']
 54 | 0.559740845941 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
 55 | 
 56 | LIL / Biggie's October Bulletin - UK
 57 | 1.6407957576 ['that', 'much', 'does', 'exercise', 'this', 'morning']
 58 | 1.09371385364 ['with', 'your', 'weight', 'have', 'control', 'about']
 59 | 0.0997065733116 ['food', 'home', 'then', 'exercise', 'morning', 'went']
 60 | 
 61 | How accurate are the calorie meters on gym equipment?
 62 | 1.68385026718 ['with', 'your', 'weight', 'have', 'control', 'about']
 63 | 1.24336224612 ['food', 'calories', 'than', 'easy', 'high', 'come']
 64 | 0.472039508303 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
 65 | 
 66 | diet-exercise thursday
 67 | 5.62839188358 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
 68 | 1.42876311885 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
 69 | 0.451891791988 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
 70 | 
 71 | Fast Food Meat
 72 | 3.96657604228 ['quot', 'they', 'money', 'want', 'very', 'best']
 73 | 1.56912835469 ['with', 'your', 'weight', 'have', 'control', 'about']
 74 | 0.945562729964 ['food', 'calories', 'than', 'easy', 'high', 'come']
 75 | 
 76 | Food & Exercise -- 10/11/2007
 77 | 2.06565313343 ['food', 'home', 'then', 'exercise', 'morning', 'went']
 78 | 0.915925841734 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
 79 | 0.852089104271 ['food', 'calories', 'than', 'easy', 'high', 'come']
 80 | 
 81 | sleepy food/fitness thursday
 82 | 5.29370213306 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
 83 | 0.821758436298 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
 84 | 0.373361576129 ['fats', 'quot', 'this', 'good', 'about', 'like']
 85 | 
 86 | 6 Dollars! You Can't Lose!! AS SEEN ON OPRAH & 20/20
 87 | 3.50452080121 ['quot', 'they', 'money', 'want', 'very', 'best']
 88 | 1.48000806252 ['with', 'your', 'weight', 'have', 'control', 'about']
 89 | 0.353120143386 ['that', 'much', 'does', 'exercise', 'this', 'morning']
 90 | 
 91 | Looking for mediterranean buffet restaurants in Toronto, Canada
 92 | 0.766709189825 ['food', 'home', 'then', 'exercise', 'morning', 'went']
 93 | 0.488536397538 ['with', 'your', 'weight', 'have', 'control', 'about']
 94 | 0.305836578699 ['quot', 'they', 'money', 'want', 'very', 'best']
 95 | 
 96 | Food and Workout Log 10.10.07
 97 | 5.10395750879 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
 98 | 0.931990921746 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
 99 | 0.0751093197335 ['with', 'your', 'weight', 'have', 'control', 'about']
100 | 
101 | Food and Workout Log 10.9.07
102 | 3.66128126402 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
103 | 0.924777033606 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
104 | 0.46368820747 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
105 | 
106 | Food & Exercise -- 10/10/2007
107 | 2.09636617791 ['food', 'home', 'then', 'exercise', 'morning', 'went']
108 | 0.777930860455 ['that', 'much', 'does', 'exercise', 'this', 'morning']
109 | 0.234412590473 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
110 | 
111 | rainy diet/exercise
112 | 2.42408643655 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
113 | 1.79759287175 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
114 | 1.44383382428 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
115 | 
116 | Whatever happened to Kaleb?
117 | 1.62771768736 ['that', 'much', 'does', 'exercise', 'this', 'morning']
118 | 0.424792812054 ['with', 'your', 'weight', 'have', 'control', 'about']
119 | 3.00050522053e-008 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
120 | 
121 | Food & Exercise -- 10/9/2007
122 | 2.67051008267 ['that', 'much', 'does', 'exercise', 'this', 'morning']
123 | 2.25685573791 ['food', 'home', 'then', 'exercise', 'morning', 'went']
124 | 0.962829471038 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
125 | 
126 | Way of getting my veggies
127 | 2.51102412355 ['quot', 'they', 'money', 'want', 'very', 'best']
128 | 1.82193456941 ['with', 'your', 'weight', 'have', 'control', 'about']
129 | 1.20974377068 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
130 | 
131 | Food & Exercise -- 10/8/2007 (yesterday)
132 | 3.57035878288 ['food', 'home', 'then', 'exercise', 'morning', 'went']
133 | 0.581070745119 ['with', 'your', 'weight', 'have', 'control', 'about']
134 | 0.151621405217 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
135 | 
136 | Oatmeal, cereal of choice.
137 | 3.41252863148 ['food', 'calories', 'than', 'easy', 'high', 'come']
138 | 0.482857491594 ['with', 'your', 'weight', 'have', 'control', 'about']
139 | 0.21056938621 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
140 | 
141 | Whatever happened to Dally?
142 | 0.38943120274 ['with', 'your', 'weight', 'have', 'control', 'about']
143 | 0.38027115946 ['quot', 'they', 'money', 'want', 'very', 'best']
144 | 0.176713051522 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
145 | 
146 | More about the Chicago marathon
147 | 6.1620884463 ['quot', 'they', 'money', 'want', 'very', 'best']
148 | 0.268050785403 ['with', 'your', 'weight', 'have', 'control', 'about']
149 | 0.0210462038578 ['fats', 'quot', 'this', 'good', 'about', 'like']
150 | 
151 | Food and Workout Log 10.8.07
152 | 3.19119866786 ['food', 'calories', 'than', 'easy', 'high', 'come']
153 | 2.68113794132 ['that', 'much', 'does', 'exercise', 'this', 'morning']
154 | 1.31607800222 ['fats', 'quot', 'this', 'good', 'about', 'like']
155 | 
156 | diet/exercise 10/8
157 | 4.35583316205 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
158 | 1.56546955704 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
159 | 1.25839277593 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
160 | 
161 | I did'nt diet to get in shape for Trinidad's Carnival.....
162 | 5.9231935598 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
163 | 0.567204076047 ['that', 'much', 'does', 'exercise', 'this', 'morning']
164 | 0.0169687217709 ['with', 'your', 'weight', 'have', 'control', 'about']
165 | 
166 | I got in shape and took part in Trinidad Carnival!
167 | 1.02074036539 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
168 | 0.930265487859 ['food', 'home', 'then', 'exercise', 'morning', 'went']
169 | 0.266432773175 ['fats', 'quot', 'this', 'good', 'about', 'like']
170 | 
171 | THE ULTIMATE FAT-BURNING DAY
172 | 1.72598890408 ['that', 'much', 'does', 'exercise', 'this', 'morning']
173 | 0.528141703291 ['with', 'your', 'weight', 'have', 'control', 'about']
174 | 0.390073319858 ['fats', 'quot', 'this', 'good', 'about', 'like']
175 | 
176 | Control ur Weight
177 | 6.78756986407 ['with', 'your', 'weight', 'have', 'control', 'about']
178 | 0.000529137198612 ['food', 'home', 'then', 'exercise', 'morning', 'went']
179 | 0.00038074933869 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
180 | 
181 | BrainStimPro Binaural Brainwave Generator
182 | 0.533630276909 ['with', 'your', 'weight', 'have', 'control', 'about']
183 | 0.37841909077 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
184 | 0.116016288049 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
185 | 
186 | Food & Exercise -- 10/7/2007
187 | 7.73926153154 ['that', 'much', 'does', 'exercise', 'this', 'morning']
188 | 0.470298707782 ['food', 'home', 'then', 'exercise', 'morning', 'went']
189 | 0.233105196286 ['food', 'calories', 'than', 'easy', 'high', 'come']
190 | 
191 | food/exercise Friday 10/7
192 | 4.69100441998 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
193 | 1.64398092185 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
194 | 0.689120996726 ['food', 'calories', 'than', 'easy', 'high', 'come']
195 | 
196 | Should we ban marathons?
197 | 2.44173145283 ['quot', 'they', 'money', 'want', 'very', 'best']
198 | 1.81373140989 ['that', 'much', 'does', 'exercise', 'this', 'morning']
199 | 1.3775418859 ['fats', 'quot', 'this', 'good', 'about', 'like']
200 | 
201 | Abstinence 3 (8 October to 24 October)
202 | 0.969974503706 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
203 | 0.912311154908 ['food', 'calories', 'than', 'easy', 'high', 'come']
204 | 0.371633089984 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
205 | 
206 | Food & Exercise -- 10/6/2007 (yesterday)
207 | 2.63514100937 ['food', 'home', 'then', 'exercise', 'morning', 'went']
208 | 1.80605150884 ['that', 'much', 'does', 'exercise', 'this', 'morning']
209 | 0.426403502815 ['food', 'calories', 'than', 'easy', 'high', 'come']
210 | 
211 | Food and Workout Log 10.5.07
212 | 2.03340244602 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
213 | 0.321040122788 ['food', 'home', 'then', 'exercise', 'morning', 'went']
214 | 0.286990704435 ['food', 'calories', 'than', 'easy', 'high', 'come']
215 | 
216 | Food and Workout Log 10.4.07
217 | 2.32606586074 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
218 | 2.23872352546 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
219 | 0.991619356436 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
220 | 
221 | Exercise
222 | 1.82773959677 ['that', 'much', 'does', 'exercise', 'this', 'morning']
223 | 1.05124941331 ['food', 'home', 'then', 'exercise', 'morning', 'went']
224 | 0.420038570854 ['with', 'your', 'weight', 'have', 'control', 'about']
225 | 
226 | food/exercise Friday 10/5
227 | 5.3332773133 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
228 | 1.381638768 ['food', 'calories', 'than', 'easy', 'high', 'come']
229 | 0.590183487282 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
230 | 
231 | Food & Exercise -- 10/5/2007
232 | 5.22083940456 ['food', 'home', 'then', 'exercise', 'morning', 'went']
233 | 0.29336324721 ['fats', 'quot', 'this', 'good', 'about', 'like']
234 | 0.279839860069 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
235 | 
236 | Fitness and wellness is here to help you, about fittnes, nutrition, health, everything is here...
237 | 1.47212290909 ['with', 'your', 'weight', 'have', 'control', 'about']
238 | 0.581092551305 ['fats', 'quot', 'this', 'good', 'about', 'like']
239 | 0.22366446507 ['that', 'much', 'does', 'exercise', 'this', 'morning']
240 | 
241 | Live healthy and disease free
242 | 1.33925118974 ['fats', 'quot', 'this', 'good', 'about', 'like']
243 | 0.735241239185 ['with', 'your', 'weight', 'have', 'control', 'about']
244 | 4.48445780778e-005 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
245 | 
246 | maintain ur diet dailyu
247 | 1.37867872087 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
248 | 7.75955547582e-008 ['with', 'your', 'weight', 'have', 'control', 'about']
249 | 8.66186281206e-016 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
250 | 
251 | Food & Exercise -- 10/4/2007
252 | 5.16310413391 ['food', 'home', 'then', 'exercise', 'morning', 'went']
253 | 0.108950865658 ['food', 'calories', 'than', 'easy', 'high', 'come']
254 | 0.103411657525 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
255 | 
256 | diet/exercise 10/4
257 | 5.94642162786 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
258 | 1.15981737715 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
259 | 0.0648977104196 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
260 | 
261 | sad to say
262 | 2.0658103969 ['that', 'much', 'does', 'exercise', 'this', 'morning']
263 | 1.0211752756 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
264 | 0.606678422181 ['food', 'home', 'then', 'exercise', 'morning', 'went']
265 | 
266 | Food and Workout Log 10.3.07
267 | 5.48488799917 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
268 | 1.04086527858 ['that', 'much', 'does', 'exercise', 'this', 'morning']
269 | 0.86250634261 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
270 | 
271 | Walking As Exercise
272 | 2.49352373509 ['quot', 'they', 'money', 'want', 'very', 'best']
273 | 2.18075570265 ['that', 'much', 'does', 'exercise', 'this', 'morning']
274 | 0.613516861795 ['with', 'your', 'weight', 'have', 'control', 'about']
275 | 
276 | food/exercise wednesday 10/3
277 | 5.07554844226 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
278 | 1.23323613477 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
279 | 0.351030687614 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
280 | 
281 | How much proteins leak into water when cooking vegetables ?
282 | 2.93064858147 ['with', 'your', 'weight', 'have', 'control', 'about']
283 | 1.18774119665 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
284 | 0.00439761415131 ['that', 'much', 'does', 'exercise', 'this', 'morning']
285 | 
286 | Food & Exercise -- 10/3/2007
287 | 3.50183444986 ['food', 'home', 'then', 'exercise', 'morning', 'went']
288 | 2.21222380937 ['quot', 'they', 'money', 'want', 'very', 'best']
289 | 1.37194239805 ['fats', 'quot', 'this', 'good', 'about', 'like']
290 | 
291 | The truth about exercising and your body as a whole
292 | 3.21386971879 ['that', 'much', 'does', 'exercise', 'this', 'morning']
293 | 3.15819744924 ['with', 'your', 'weight', 'have', 'control', 'about']
294 | 0.270507407515 ['fats', 'quot', 'this', 'good', 'about', 'like']
295 | 
296 | Got the new Elliptical
297 | 1.14114086054 ['that', 'much', 'does', 'exercise', 'this', 'morning']
298 | 1.10131650413 ['food', 'calories', 'than', 'easy', 'high', 'come']
299 | 0.477565648015 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
300 | 
301 | Fingerstick cholesterol tests accurate?
302 | 2.14611673458 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
303 | 1.26970417226 ['fats', 'quot', 'this', 'good', 'about', 'like']
304 | 0.750416051713 ['that', 'much', 'does', 'exercise', 'this', 'morning']
305 | 
306 | Weight Loss Tips
307 | 5.21079777525 ['with', 'your', 'weight', 'have', 'control', 'about']
308 | 1.59092846403 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
309 | 0.00134310805496 ['food', 'home', 'then', 'exercise', 'morning', 'went']
310 | 
311 | Obesity Driving Rising U.S. Health Costs
312 | 1.25615188819 ['with', 'your', 'weight', 'have', 'control', 'about']
313 | 0.712235310825 ['food', 'calories', 'than', 'easy', 'high', 'come']
314 | 0.238899338741 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
315 | 
316 | Flu-Busting Chicken Soup
317 | 5.54567450388 ['with', 'your', 'weight', 'have', 'control', 'about']
318 | 1.21893998075 ['food', 'calories', 'than', 'easy', 'high', 'come']
319 | 0.451175568656 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
320 | 
321 | Re: My Healing Story
322 | 1.96323375291 ['with', 'your', 'weight', 'have', 'control', 'about']
323 | 0.491756558034 ['that', 'much', 'does', 'exercise', 'this', 'morning']
324 | 0.477646079039 ['fats', 'quot', 'this', 'good', 'about', 'like']
325 | 
326 | Food and Workout Log 10.2.07
327 | 2.60282810129 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
328 | 1.20884355476 ['that', 'much', 'does', 'exercise', 'this', 'morning']
329 | 0.950088631141 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
330 | 
331 | food/exercise Tuesday 10/2
332 | 2.64289395866 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
333 | 0.651445733893 ['food', 'home', 'then', 'exercise', 'morning', 'went']
334 | 0.371530809297 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
335 | 
336 | Food & Exercise -- 10/2/2007
337 | 4.03395454429 ['food', 'home', 'then', 'exercise', 'morning', 'went']
338 | 0.733387053358 ['food', 'calories', 'than', 'easy', 'high', 'come']
339 | 0.335622901915 ['that', 'much', 'does', 'exercise', 'this', 'morning']
340 | 
341 | Diet Recommendations following stoppage of activity
342 | 2.76689639493 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
343 | 2.66796965036 ['quot', 'they', 'money', 'want', 'very', 'best']
344 | 0.00214949067482 ['fats', 'quot', 'this', 'good', 'about', 'like']
345 | 
346 | why I'm succeeding, finally, with my fitness
347 | 3.81276353396 ['that', 'much', 'does', 'exercise', 'this', 'morning']
348 | 2.28727363664 ['with', 'your', 'weight', 'have', 'control', 'about']
349 | 0.0084896973916 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
350 | 
351 | food/exercise Monday 10/1
352 | 6.52183126318 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
353 | 1.04845803053 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
354 | 0.220817568443 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
355 | 
356 | Food & Exercise -- 10/1/2007
357 | 3.41693152333 ['food', 'home', 'then', 'exercise', 'morning', 'went']
358 | 1.43659575232 ['with', 'your', 'weight', 'have', 'control', 'about']
359 | 0.0107024339333 ['that', 'much', 'does', 'exercise', 'this', 'morning']
360 | 
361 | Good fats bad fats
362 | 14.9233786406 ['fats', 'quot', 'this', 'good', 'about', 'like']
363 | 0.12157320235 ['quot', 'they', 'money', 'want', 'very', 'best']
364 | 0.000388079511473 ['food', 'calories', 'than', 'easy', 'high', 'come']
365 | 
366 | milk products
367 | 2.91179410526 ['quot', 'they', 'money', 'want', 'very', 'best']
368 | 1.35981372517 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
369 | 0.894968443359 ['with', 'your', 'weight', 'have', 'control', 'about']
370 | 
371 | Food and Workout Log 10.1.07
372 | 4.86163252456 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
373 | 3.04379043965 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
374 | 0.288092400057 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
375 | 
376 | < 1g, etc.
377 | 5.96451663382 ['that', 'much', 'does', 'exercise', 'this', 'morning']
378 | 0.757014711498 ['food', 'calories', 'than', 'easy', 'high', 'come']
379 | 0.0106873617525 ['with', 'your', 'weight', 'have', 'control', 'about']
380 | 
381 | peanut butter
382 | 2.79640093073 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
383 | 0.974232666377 ['fats', 'quot', 'this', 'good', 'about', 'like']
384 | 0.20210540615 ['food', 'home', 'then', 'exercise', 'morning', 'went']
385 | 
386 | food/exercise Sunday 9/30
387 | 3.13525705865 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
388 | 1.77396028396 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
389 | 0.84994166551 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
390 | 
391 | Food and Workout Log 9.30.07
392 | 4.16473442796 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
393 | 1.1093666032 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
394 | 0.0787803510605 ['food', 'calories', 'than', 'easy', 'high', 'come']
395 | 
396 | Food & Exercise -- 9/30/2007
397 | 4.50185209238 ['food', 'home', 'then', 'exercise', 'morning', 'went']
398 | 1.04931983732 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
399 | 0.691962870134 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
400 | 
401 | Food and Workout Log 9.29.07
402 | 4.84939396321 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
403 | 2.39617281343 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
404 | 0.816992749371 ['with', 'your', 'weight', 'have', 'control', 'about']
405 | 
406 | Food and Workout Log 9.28.07
407 | 4.67171965065 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
408 | 0.282691470562 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
409 | 0.200338717705 ['food', 'home', 'then', 'exercise', 'morning', 'went']
410 | 
411 | LOUIE + LINESMAKER = $$$$
412 | 5.58276496802 ['quot', 'they', 'money', 'want', 'very', 'best']
413 | 0.342556596947 ['with', 'your', 'weight', 'have', 'control', 'about']
414 | 0.179902643439 ['food', 'home', 'then', 'exercise', 'morning', 'went']
415 | 
416 | Food & Exercise -- 9/29/2007 (yesterday)
417 | 2.58541666839 ['quot', 'they', 'money', 'want', 'very', 'best']
418 | 2.09030954926 ['food', 'home', 'then', 'exercise', 'morning', 'went']
419 | 0.852396204369 ['that', 'much', 'does', 'exercise', 'this', 'morning']
420 | 
421 | ASDLC has changed
422 | 0.112818164838 ['that', 'much', 'does', 'exercise', 'this', 'morning']
423 | 0.102149673333 ['with', 'your', 'weight', 'have', 'control', 'about']
424 | 0.0271390834142 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
425 | 
426 | diet/exercise Saturday 9/29
427 | 2.11947826799 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
428 | 1.44172267631 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
429 | 0.589173976223 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
430 | 
431 | Sensible Diet & Exercise
432 | 5.04673654071 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
433 | 3.61357653903 ['that', 'much', 'does', 'exercise', 'this', 'morning']
434 | 1.40785970283 ['with', 'your', 'weight', 'have', 'control', 'about']
435 | 
436 | Abstinence 2.- (20 September to 7 October )
437 | 0.968219095052 ['food', 'calories', 'than', 'easy', 'high', 'come']
438 | 0.949878615175 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
439 | 0.46422787912 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
440 | 
441 | Evolution and Weight
442 | 2.20784229869 ['quot', 'they', 'money', 'want', 'very', 'best']
443 | 1.18857451599 ['fats', 'quot', 'this', 'good', 'about', 'like']
444 | 1.00060668406 ['with', 'your', 'weight', 'have', 'control', 'about']
445 | 
446 | Food & Exercise -- 9/28/2007 (yesterday)
447 | 4.75585045074 ['food', 'home', 'then', 'exercise', 'morning', 'went']
448 | 0.840897380766 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
449 | 0.0455304461165 ['quot', 'they', 'money', 'want', 'very', 'best']
450 | 
451 | The Abs Diet by David Zinczenko
452 | 6.58003120192 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
453 | 0.00218473474643 ['fats', 'quot', 'this', 'good', 'about', 'like']
454 | 0.00140268366152 ['food', 'calories', 'than', 'easy', 'high', 'come']
455 | 
456 | Re: ABC News Nightline: Carbohydrates Make You Fat, and Perhaps Sick
457 | 2.50629631142 ['that', 'much', 'does', 'exercise', 'this', 'morning']
458 | 1.32614477527 ['with', 'your', 'weight', 'have', 'control', 'about']
459 | 0.194215052468 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
460 | 
461 | Sensible Eating
462 | 2.67280083529 ['quot', 'they', 'money', 'want', 'very', 'best']
463 | 1.95758534619 ['with', 'your', 'weight', 'have', 'control', 'about']
464 | 0.272657483613 ['fats', 'quot', 'this', 'good', 'about', 'like']
465 | 
466 | 3 slices of ff cheese on a poached egg
467 | 0.925803022044 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
468 | 0.158529510462 ['food', 'home', 'then', 'exercise', 'morning', 'went']
469 | 1.69625785908e-009 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
470 | 
471 | food/exercise Friday 9/28
472 | 4.2243066995 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
473 | 1.69260968796 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
474 | 0.355277035138 ['food', 'home', 'then', 'exercise', 'morning', 'went']
475 | 
476 | money making opportunity
477 | 1.52941598839 ['that', 'much', 'does', 'exercise', 'this', 'morning']
478 | 1.23535469791 ['quot', 'they', 'money', 'want', 'very', 'best']
479 | 0.292057461928 ['with', 'your', 'weight', 'have', 'control', 'about']
480 | 
481 | The Benefits of Biotechnology For Mankind
482 | 0.543967592519 ['with', 'your', 'weight', 'have', 'control', 'about']
483 | 0.389852100811 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
484 | 0.208865237821 ['fats', 'quot', 'this', 'good', 'about', 'like']
485 | 
486 | Food and Workout Log 9.27.08
487 | 5.58477112035 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
488 | 0.093698650419 ['food', 'home', 'then', 'exercise', 'morning', 'went']
489 | 1.16281432958e-005 ['quot', 'they', 'money', 'want', 'very', 'best']
490 | 
491 | Re: My First Century Ride
492 | 2.7939634836 ['quot', 'they', 'money', 'want', 'very', 'best']
493 | 1.98080235676 ['with', 'your', 'weight', 'have', 'control', 'about']
494 | 0.0869645267992 ['fats', 'quot', 'this', 'good', 'about', 'like']
495 | 
496 | A Special Update from Matty V.
497 | 1.64986127649 ['that', 'much', 'does', 'exercise', 'this', 'morning']
498 | 0.635897197115 ['with', 'your', 'weight', 'have', 'control', 'about']
499 | 0.337529069245 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
500 | 
501 | 


--------------------------------------------------------------------------------
/第10章 寻找独立特征/clusters.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import math
  3 | from math import sqrt
  4 | from PIL import Image,ImageDraw,ImageFont
  5 | 
  6 | # Returns the Pearson correlation coefficient for p1 and p2
  7 | def pearson(v1,v2):
  8 |   # Simple sums
  9 |   sum1=sum(v1)
 10 |   sum2=sum(v2)
 11 |   
 12 |   # Sums of the squares
 13 |   sum1Sq=sum([pow(v,2) for v in v1])
 14 |   sum2Sq=sum([pow(v,2) for v in v2])	
 15 |   
 16 |   # Sum of the products
 17 |   pSum=sum([v1[i]*v2[i] for i in range(len(v1))])
 18 |   
 19 |   # Calculate r (Pearson score)
 20 |   num=pSum-(sum1*sum2/len(v1))
 21 |   den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1)))
 22 |   if den==0: return 0
 23 | 
 24 |   return 1.0-(num/den)
 25 | 
 26 | 
 27 | class bicluster:
 28 |   def __init__(self,vec,left=None,right=None,distance=0.0,id=None):
 29 |     self.left=left
 30 |     self.right=right
 31 |     self.vec=vec
 32 |     self.id=id
 33 |     self.distance=distance
 34 | 
 35 | def euclidean(v1,v2):
 36 |   sqsum=sum([math.pow(v1[i]-v2[i],2) for i in range(len(v1))])
 37 |   return math.sqrt(sqsum)
 38 | 
 39 | def printclust(clust,labels=None,n=0):
 40 |   for i in range(n): print ' ',
 41 |   if clust.id<0:
 42 |     print '-'
 43 |   else:
 44 |     if labels==None: print clust.id
 45 |     else: print labels[clust.id]
 46 |   if clust.left!=None: printclust(clust.left,labels=labels,n=n+1)
 47 |   if clust.right!=None: printclust(clust.right,labels=labels,n=n+1)
 48 | 
 49 | def hcluster(vecs,distance=pearson):
 50 |   distances={}
 51 |   currentclustid=-1
 52 |   clust=[bicluster(vecs[i],id=i) for i in range(len(vecs))]
 53 | 
 54 |   while len(clust)>1:
 55 |     lowestpair=(0,1)
 56 |     closest=distance(clust[0].vec,clust[1].vec)
 57 |     for i in range(len(clust)):
 58 |       for j in range(i+1,len(clust)):
 59 |         if (clust[i].id,clust[j].id) not in distances: 
 60 |           distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec)
 61 |         d=distances[(clust[i].id,clust[j].id)]
 62 | 
 63 |         if d<closest:
 64 |           closest=d
 65 |           lowestpair=(i,j)
 66 | 
 67 |     mergevec=[(clust[lowestpair[0]].vec[i]+clust[lowestpair[1]].vec[i])/2.0 for i in range(len(clust[0].vec))]
 68 |     error=closest
 69 |     newcluster=bicluster(mergevec,left=clust[lowestpair[0]],right=clust[lowestpair[1]],distance=error,id=currentclustid)
 70 |     
 71 |     currentclustid-=1
 72 |     del clust[lowestpair[1]]
 73 |     del clust[lowestpair[0]]
 74 |     clust.append(newcluster)
 75 | 
 76 |   return clust[0]
 77 |   
 78 |   
 79 | def kcluster(vecs,distance=pearson,k=4):
 80 |   ranges=[(min([vec[i] for vec in vecs]),max([vec[i] for vec in vecs])) for i in range(len(vecs[0]))]
 81 |   clusters=[[random.random()*(ranges[i][1]-ranges[i][0])+ranges[i][0] for i in range(len(vecs[0]))] for j in range(k)]
 82 |   
 83 |   lastmatches=None
 84 |   for t in range(100):
 85 |     print 'Iteration %d' % t
 86 |     bestmatches=[[] for i in range(k)]
 87 |     
 88 |     for j in range(len(vecs)):
 89 |       vec=vecs[j]
 90 |       bestmatch=0
 91 |       for i in range(k):
 92 |         d=distance(clusters[i],vec)
 93 |         if d<distance(clusters[bestmatch],vec): bestmatch=i
 94 |       bestmatches[bestmatch].append(j)
 95 | 
 96 |     if bestmatches==lastmatches: break
 97 |     lastmatches=bestmatches
 98 |     
 99 |     for i in range(k):
100 |       avgs=[0.0]*len(vecs[0])
101 |       if len(bestmatches[i])>0:
102 |         for vecid in bestmatches[i]:
103 |           for m in range(len(vecs[vecid])):
104 |             avgs[m]+=vecs[vecid][m]
105 |         for j in range(len(avgs)):
106 |           avgs[j]/=len(bestmatches[i])
107 |         clusters[i]=avgs
108 |       
109 |   return bestmatches
110 | 
111 | def readfile(filename):
112 |   lines=[line for line in file(filename)]
113 |   colnames=lines[0].strip().split('\t')[1:]
114 |   rownames=[]
115 |   data=[]
116 |   for line in lines[1:]:
117 |     p=line.strip().split('\t')
118 |     rownames.append(p[0])
119 |     data.append([float(x) for x in p[1:]])
120 |   return rownames,colnames,data
121 | 
122 | def test2():
123 |   rownames,colnames,data=readfile('datafile.txt')
124 |   return hcluster(data)
125 |   #for i in range(len(rownames)):
126 |   #  print i,rownames[i]
127 | 
128 | def distance(v1,v2):
129 |   c1,c2,shr=0,0,0
130 |   
131 |   for i in range(len(v1)):
132 |     if v1[i]!=0: c1+=1
133 |     if v2[i]!=0: c2+=1
134 |     if v1[i]!=0 and v2[i]!=0: shr+=1
135 |   
136 |   return float(shr)/(c1+c2-shr)
137 | 
138 | 
139 | #test2()
140 | 
141 | def getheight(clust):
142 |   if clust.left==None and clust.right==None: return 1
143 |   return getheight(clust.left)+getheight(clust.right)
144 | 
145 | def getdepth(clust):
146 |   if clust.left==None and clust.right==None: return 0
147 |   return max(getdepth(clust.left),getdepth(clust.right))+clust.distance
148 | 
149 | def drawdendrogram(clust,labels,jpeg='clusters.jpg'):
150 |   h=getheight(clust)*20
151 |   depth=getdepth(clust)
152 |   w=1200
153 |   scaling=float(w-150)/depth
154 |   img=Image.new('RGB',(w,h),(255,255,255))
155 |   draw=ImageDraw.Draw(img)
156 | 
157 |   draw.line((0,h/2,10,h/2),fill=(255,0,0))    
158 | 
159 |   drawnode(draw,clust,10,(h/2),scaling,labels)
160 |   img.save(jpeg,'JPEG')
161 | 
162 | def drawnode(draw,clust,x,y,scaling,labels):
163 |   if clust.id<0:
164 |     h1=getheight(clust.left)*20
165 |     h2=getheight(clust.right)*20
166 |     top=y-(h1+h2)/2
167 |     bottom=y+(h1+h2)/2
168 |     
169 |     ll=clust.distance*scaling
170 |     
171 |     draw.line((x,top+h1/2,x,bottom-h2/2),fill=(255,0,0))    
172 | 
173 |     draw.line((x,top+h1/2,x+ll,top+h1/2),fill=(255,0,0))    
174 |     draw.line((x,bottom-h2/2,x+ll,bottom-h2/2),fill=(255,0,0))        
175 |     
176 |     drawnode(draw,clust.left,x+ll,top+h1/2,scaling,labels)
177 |     drawnode(draw,clust.right,x+ll,bottom-h2/2,scaling,labels)
178 |   else:   
179 |     draw.text((x+5,y-7),labels[clust.id].encode('utf8'),(0,0,0))
180 | 
181 | def rotatematrix(data):
182 |   newdata=[]
183 |   for i in range(len(data[0])):
184 |     newrow=[data[j][i] for j in range(len(data))]
185 |     newdata.append(newrow)
186 |   return newdata
187 | 
188 | def scaledown(data,distance=pearson,rate=0.01):
189 |   n=len(data)
190 |   realdist=[[distance(data[i],data[j]) for j in range(n)] for i in range(0,n)]
191 | 
192 |   outersum=0.0
193 |   
194 |   loc=[[random.random(),random.random()] for i in range(n)] 
195 |   fakedist=[[0.0 for j in range(n)] for i in range(n)]
196 |   
197 |   lasterror=None
198 |   for m in range(0,1000):
199 |     # Find projected distances
200 |     for i in range(n):
201 |       for j in range(n):
202 |         fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2) 
203 |                                  for x in range(len(loc[i]))]))
204 |   
205 |     # Move points
206 |     grad=[[0.0,0.0] for i in range(n)]
207 |     
208 |     totalerror=0
209 |     for k in range(n):
210 |       for j in range(n):
211 |         if j==k: continue
212 |         errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k]
213 |         grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm
214 |         grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm    
215 |         totalerror+=abs(errorterm)
216 |     print totalerror
217 |     if lasterror and lasterror<totalerror: break
218 |     lasterror=totalerror
219 |     
220 |     for k in range(n):
221 |       loc[k][0]-=rate*grad[k][0]
222 |       loc[k][1]-=rate*grad[k][1]
223 | 
224 |   return loc
225 | 
226 | def draw2d(data,labels,jpg='mds2d.jpg'):
227 |   img=Image.new('RGB',(2000,2000),(255,255,255))
228 |   draw=ImageDraw.Draw(img)
229 |   for i in range(len(data)):
230 |     x=(data[i][0]+0.5)*1000
231 |     y=(data[i][1]+0.5)*1000
232 |     draw.text((x,y),labels[i],(0,0,0))
233 |   img.save(jpg,'JPEG')  
234 |   img.show()
235 | 


--------------------------------------------------------------------------------
/第10章 寻找独立特征/docclass.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import math
  3 | import cPickle
  4 | from pysqlite2 import dbapi2 as sqlite
  5 | 
  6 | def getwords(doc):
  7 |   splitter=re.compile('\\W*')
  8 |   words=[s.lower() for s in splitter.split(doc) 
  9 |           if len(s)>2 and len(s)<20]
 10 |   
 11 |   # Return the unique set of words only
 12 |   return dict([(w,1) for w in words])
 13 | 
 14 | #def entryfeatures(entry):
 15 | 
 16 | def sampletrain(cl):
 17 |   cl.train('Nobody owns the water.','good')
 18 |   cl.train('the quick rabbit jumps fences','good')
 19 |   cl.train('buy pharmaceuticals now','bad')
 20 |   cl.train('make quick money at the online casino','bad')
 21 |   cl.train('the quick brown fox jumps','good')
 22 | 
 23 | class classifier:
 24 |   def __init__(self,getfeatures):
 25 |     self.fc={}
 26 |     self.cc={}
 27 |     self.getfeatures=getfeatures
 28 |   
 29 |   def setdb(self,dbfile):
 30 |     self.con=sqlite.connect(dbfile)    
 31 |     self.con.execute('create table if not exists fc(feature,category,count)')
 32 |     self.con.execute('create table if not exists cc(category,count)')
 33 |   
 34 |   def incf(self,f,cat):
 35 |     count=self.fcount(f,cat)
 36 |     if count==0:
 37 |       self.con.execute("insert into fc values ('%s','%s',1)" 
 38 |                        % (f,cat))
 39 |     else:
 40 |       self.con.execute(
 41 |         "update fc set count=%d where feature='%s' and category='%s'" 
 42 |         % (count+1,f,cat)) 
 43 |   
 44 |   def fcount(self,f,cat):
 45 |     res=self.con.execute(
 46 |       'select count from fc where feature="%s" and category="%s"'
 47 |       %(f,cat)).fetchone()
 48 |     if res==None: return 0
 49 |     else: return float(res[0])
 50 | 
 51 |   def incc(self,cat):
 52 |     count=self.catcount(cat)
 53 |     if count==0:
 54 |       self.con.execute("insert into cc values ('%s',1)" % (cat))
 55 |     else:
 56 |       self.con.execute("update cc set count=%d where category='%s'" 
 57 |                        % (count+1,cat))    
 58 |       
 59 |   def catcount(self,cat):
 60 |     res=self.con.execute('select count from cc where category="%s"'
 61 |                          %(cat)).fetchone()
 62 |     if res==None: return 0.0
 63 |     else: return float(res[0])
 64 |     
 65 |   def categories(self):
 66 |     cur=self.con.execute('select category from cc');
 67 |     return [d[0] for d in cur]
 68 | 
 69 |   def totalcount(self):
 70 |     res=self.con.execute('select sum(count) from cc').fetchone();
 71 |     if res==None: return 0
 72 |     return res[0]
 73 |     
 74 | 
 75 |   """  
 76 |   def incf(self,f,cat):
 77 |     self.fc.setdefault(f,{})
 78 |     self.fc[f].setdefault(cat,0)
 79 |     self.fc[f][cat]+=1
 80 |   
 81 |   def incc(self,cat):
 82 |     self.cc.setdefault(cat,0)
 83 |     self.cc[cat]+=1
 84 |  
 85 |   def fcount(self,f,cat):
 86 |     if f in self.fc and cat in self.fc[f]: 
 87 |       return float(self.fc[f][cat])
 88 |     return 0.0
 89 |   
 90 |   def catcount(self,cat):
 91 |     if cat in self.cc:
 92 |       return float(self.cc[cat])
 93 |     return 0
 94 | 
 95 |   def totalcount(self):
 96 |     return sum(self.cc.values())
 97 | 
 98 |   def categories(self):
 99 |     return self.cc.keys()
100 |   """  
101 |   
102 |   
103 |   def train(self,item,cat):
104 |     features=self.getfeatures(item)   
105 |     for f in features:
106 |       self.incf(f,cat)
107 |     self.incc(cat)
108 |     self.con.commit()
109 |    
110 |   def fprob(self,f,cat):
111 |     if self.catcount(cat)==0: return 0
112 |     return self.fcount(f,cat)/self.catcount(cat)
113 | 
114 |   def setfilename(self,filename):
115 |     self.filename=filename
116 |     self.restoredata()
117 | 
118 |   def restoredata(self):
119 |     try: f=file(self.filename,'rb')
120 |     except: return
121 |     self.fc=cPickle.load(f)
122 |     self.cc=cPickle.load(f)
123 |     f.close()
124 |     
125 |   def savedata(self):
126 |     f=file(self.filename,'wb')
127 |     cPickle.dump(self.fc,f,True)
128 |     cPickle.dump(self.cc,f,True)
129 |     f.close()
130 |   def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
131 |     basicprob=prf(f,cat)
132 |     totals=sum([self.fcount(f,c) for c in self.categories()])
133 |     bp=((weight*ap)+(totals*basicprob))/(weight+totals)
134 |     return bp
135 |     
136 |       
137 |   
138 | class naivebayes(classifier):
139 |   def __init__(self,getfeatures):
140 |     classifier.__init__(self,getfeatures)
141 |     self.thresholds={}
142 | 
143 |   def setthreshold(self,cat,t):
144 |     self.thresholds[cat]=t
145 |     
146 |   def getthreshold(self,cat):
147 |     if cat not in self.thresholds: return 1.0
148 |     return self.thresholds[cat]
149 | 
150 |   def classify(self,item,default=None):
151 |     probs={}
152 |     max=0.0
153 |     for cat in self.categories():
154 |       probs[cat]=self.prob(item,cat)
155 |       if probs[cat]>max: 
156 |         max=probs[cat]
157 |         best=cat
158 |     for cat in probs:
159 |       if cat==best: continue
160 |       if probs[cat]*self.getthreshold(best)>probs[best]: return default
161 |     return best
162 |     
163 |   def docprob(self,item,cat):
164 |     features=self.getfeatures(item)   
165 |     p=1
166 |     for f in features: p*=self.weightedprob(f,cat,self.fprob)
167 |     return p
168 | 
169 | 
170 |   def prob(self,item,cat):
171 |     catprob=self.catcount(cat)/self.totalcount()
172 |     docprob=self.docprob(item,cat)
173 |     return docprob*catprob
174 | 
175 | class fisherclassifier(classifier):
176 |   def __init__(self,getfeatures):
177 |     classifier.__init__(self,getfeatures)
178 |     self.minimums={}
179 | 
180 |   def setminimum(self,cat,min):
181 |     self.minimums[cat]=min
182 |   
183 |   def getminimum(self,cat):
184 |     if cat not in self.minimums: return 0
185 |     return self.minimums[cat]
186 |   
187 |   def classify(self,item,default=None):
188 |     best=default
189 |     max=0.0
190 |     for c in self.categories():
191 |       p=self.fisherprob(item,c)
192 |       if p>self.getminimum(c) and p>max:
193 |         best=c
194 |         max=p
195 |     return best
196 |         
197 | 
198 |   def cprob(self,f,cat):
199 |     # The frequency of this feature in this category    
200 |     clf=self.fprob(f,cat)
201 | 
202 |     if clf==0: return 0.0
203 | 
204 |     # The frequency of this feature in all the categories
205 |     freqsum=sum([self.fprob(f,c) for c in self.categories()])
206 | 
207 |     # The probability is the frequency in this category divided by
208 |     # the overall frequency
209 |     p=clf/(freqsum)
210 |     
211 |     return p
212 |   
213 |   
214 |   def fisherprob(self,item,cat):
215 |     p=1
216 |     features=self.getfeatures(item)
217 |     for f in features:
218 |       p*=(self.weightedprob(f,cat,self.cprob))
219 |     fscore=-2*math.log(p)
220 |     return self.chi2P(fscore,len(features)*2)
221 |   
222 |   def chi2P(self,chi,df):
223 |     m = chi / 2.0
224 |     sum = term = math.exp(-m)
225 |     for i in range(1, df//2):
226 |         term *= m / i
227 |         sum += term
228 |     return min(sum, 1.0)
229 | 
230 | 


--------------------------------------------------------------------------------
/第10章 寻找独立特征/features.txt:
--------------------------------------------------------------------------------
 1 | ['diet', 'with', 'great', 'what', 'trinidad', 'carnival']
 2 | (6.58003120192, u'The Abs Diet by David Zinczenko')
 3 | (5.9231935598, u"I did'nt diet to get in shape for Trinidad's Carnival.....")
 4 | (5.04673654071, u'Sensible Diet & Exercise')
 5 | 
 6 | ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small']
 7 | (6.52183126318, u'food/exercise Monday 10/1')
 8 | (5.94642162786, u'diet/exercise 10/4')
 9 | (5.3332773133, u'food/exercise Friday 10/5')
10 | 
11 | ['food', 'calories', 'than', 'easy', 'high', 'come']
12 | (9.98464450123, u'High or low fat food? Easy trick for figuring it out')
13 | (3.41252863148, u'Oatmeal, cereal of choice.')
14 | (3.19119866786, u'Food and Workout Log 10.8.07')
15 | 
16 | ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato']
17 | (7.46811621754, u'saturday')
18 | (5.62839188358, u'diet-exercise thursday')
19 | (5.29370213306, u'sleepy food/fitness thursday')
20 | 
21 | ['food', 'home', 'then', 'exercise', 'morning', 'went']
22 | (5.22083940456, u'Food & Exercise -- 10/5/2007')
23 | (5.16310413391, u'Food & Exercise -- 10/4/2007')
24 | (4.75585045074, u'Food & Exercise -- 9/28/2007 (yesterday)')
25 | 
26 | ['fats', 'quot', 'this', 'good', 'about', 'like']
27 | (14.9233786406, u'Good fats bad fats')
28 | (1.3775418859, u'Should we ban marathons?')
29 | (1.37194239805, u'Food & Exercise -- 10/3/2007')
30 | 
31 | ['quot', 'they', 'money', 'want', 'very', 'best']
32 | (6.1620884463, u'More about the Chicago marathon')
33 | (5.58276496802, u'LOUIE + LINESMAKER = $$$$')
34 | (4.04959173123, u'High or low fat food? Easy trick for figuring it out')
35 | 
36 | ['that', 'much', 'does', 'exercise', 'this', 'morning']
37 | (7.73926153154, u'Food & Exercise -- 10/7/2007')
38 | (5.96451663382, u'< 1g, etc.')
39 | (3.81276353396, u"why I'm succeeding, finally, with my fitness")
40 | 
41 | ['with', 'your', 'weight', 'have', 'control', 'about']
42 | (6.78756986407, u'Control ur Weight')
43 | (5.54567450388, u'Flu-Busting Chicken Soup')
44 | (5.21079777525, u'Weight Loss Tips')
45 | 
46 | ['with', 'lunch', 'workout', 'food', 'butter', 'peanut']
47 | (5.58477112035, u'Food and Workout Log 9.27.08')
48 | (5.48488799917, u'Food and Workout Log 10.3.07')
49 | (5.10395750879, u'Food and Workout Log 10.10.07')
50 | 
51 | 


--------------------------------------------------------------------------------
/第10章 寻找独立特征/newsfeatures.py:
--------------------------------------------------------------------------------
  1 | import feedparser
  2 | import re
  3 | 
  4 | 
  5 | feedlist=['http://today.reuters.com/rss/topNews',
  6 |           'http://today.reuters.com/rss/domesticNews',
  7 |           'http://today.reuters.com/rss/worldNews',
  8 |           'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
  9 |           'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
 10 |           'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
 11 |           'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
 12 |           'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
 13 |           'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
 14 |           'http://news.google.com/?output=rss',
 15 |           'http://feeds.salon.com/salon/news',
 16 |           'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
 17 |           'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
 18 |           'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
 19 |           'http://rss.cnn.com/rss/edition.rss',
 20 |           'http://rss.cnn.com/rss/edition_world.rss',
 21 |           'http://rss.cnn.com/rss/edition_us.rss']
 22 | 
 23 | def stripHTML(h):
 24 |   p=''
 25 |   s=0
 26 |   for c in h:
 27 |     if c=='<': s=1
 28 |     elif c=='>':
 29 |       s=0
 30 |       p+=' '
 31 |     elif s==0: p+=c
 32 |   return p
 33 | 
 34 | 
 35 | def separatewords(text):
 36 |   splitter=re.compile('\\W*')
 37 |   return [s.lower() for s in splitter.split(text) if len(s)>3]
 38 | 
 39 | def getarticlewords():
 40 |   allwords={}
 41 |   articlewords=[]
 42 |   articletitles=[]
 43 |   ec=0
 44 |   # Loop over every feed
 45 |   for feed in feedlist:
 46 |     f=feedparser.parse(feed)
 47 |     
 48 |     # Loop over every article
 49 |     for e in f.entries:
 50 |       # Ignore identical articles
 51 |       if e.title in articletitles: continue
 52 |       
 53 |       # Extract the words
 54 |       txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8'))
 55 |       words=separatewords(txt)
 56 |       articlewords.append({})
 57 |       articletitles.append(e.title)
 58 |       
 59 |       # Increase the counts for this word in allwords and in articlewords
 60 |       for word in words:
 61 |         allwords.setdefault(word,0)
 62 |         allwords[word]+=1
 63 |         articlewords[ec].setdefault(word,0)
 64 |         articlewords[ec][word]+=1
 65 |       ec+=1
 66 |   return allwords,articlewords,articletitles
 67 | 
 68 | def makematrix(allw,articlew):
 69 |   wordvec=[]
 70 |   
 71 |   # Only take words that are common but not too common
 72 |   for w,c in allw.items():
 73 |     if c>3 and c<len(articlew)*0.6:
 74 |       wordvec.append(w) 
 75 |   
 76 |   # Create the word matrix
 77 |   l1=[[(word in f and f[word] or 0) for word in wordvec] for f in articlew]
 78 |   return l1,wordvec
 79 | 
 80 | from numpy import *
 81 | 
 82 | def showfeatures(w,h,titles,wordvec,out='features.txt'): 
 83 |   outfile=file(out,'w')  
 84 |   pc,wc=shape(h)
 85 |   toppatterns=[[] for i in range(len(titles))]
 86 |   patternnames=[]
 87 |   
 88 |   # Loop over all the features
 89 |   for i in range(pc):
 90 |     slist=[]
 91 |     # Create a list of words and their weights
 92 |     for j in range(wc):
 93 |       slist.append((h[i,j],wordvec[j]))
 94 |     # Reverse sort the word list
 95 |     slist.sort()
 96 |     slist.reverse()
 97 |     
 98 |     # Print the first six elements
 99 |     n=[s[1] for s in slist[0:6]]
100 |     outfile.write(str(n)+'\n')
101 |     patternnames.append(n)
102 |     
103 |     # Create a list of articles for this feature
104 |     flist=[]
105 |     for j in range(len(titles)):
106 |       # Add the article with its weight
107 |       flist.append((w[j,i],titles[j]))
108 |       toppatterns[j].append((w[j,i],i,titles[j]))
109 |     
110 |     # Reverse sort the list
111 |     flist.sort()
112 |     flist.reverse()
113 |     
114 |     # Show the top 3 articles
115 |     for f in flist[0:3]:
116 |       outfile.write(str(f)+'\n')
117 |     outfile.write('\n')
118 | 
119 |   outfile.close()
120 |   # Return the pattern names for later use
121 |   return toppatterns,patternnames
122 | 
123 | def showarticles(titles,toppatterns,patternnames,out='articles.txt'):
124 |   outfile=file(out,'w')  
125 |   
126 |   # Loop over all the articles
127 |   for j in range(len(titles)):
128 |     outfile.write(titles[j].encode('utf8')+'\n')
129 |     
130 |     # Get the top features for this article and
131 |     # reverse sort them
132 |     toppatterns[j].sort()
133 |     toppatterns[j].reverse()
134 |     
135 |     # Print the top three patterns
136 |     for i in range(3):
137 |       outfile.write(str(toppatterns[j][i][0])+' '+
138 |                     str(patternnames[toppatterns[j][i][1]])+'\n')
139 |     outfile.write('\n')
140 |     
141 |   outfile.close()
142 | 


--------------------------------------------------------------------------------
/第10章 寻找独立特征/nnmf.py:
--------------------------------------------------------------------------------
 1 | from numpy import *
 2 | 
 3 | def difcost(a,b):
 4 |   dif=0
 5 |   for i in range(shape(a)[0]):
 6 |     for j in range(shape(a)[1]):
 7 |       # Euclidean Distance
 8 |       dif+=pow(a[i,j]-b[i,j],2)
 9 |   return dif
10 | 
11 | def factorize(v,pc=10,iter=50):
12 |   ic=shape(v)[0]
13 |   fc=shape(v)[1]
14 | 
15 |   # Initialize the weight and feature matrices with random values
16 |   w=matrix([[random.random() for j in range(pc)] for i in range(ic)])
17 |   h=matrix([[random.random() for i in range(fc)] for i in range(pc)])
18 | 
19 |   # Perform operation a maximum of iter times
20 |   for i in range(iter):
21 |     wh=w*h
22 |     
23 |     # Calculate the current difference
24 |     cost=difcost(v,wh)
25 |     
26 |     if i%10==0: print cost
27 |     
28 |     # Terminate if the matrix has been fully factorized
29 |     if cost==0: break
30 |     
31 |     # Update feature matrix
32 |     hn=(transpose(w)*v)
33 |     hd=(transpose(w)*w*h)
34 |   
35 |     h=matrix(array(h)*array(hn)/array(hd))
36 | 
37 |     # Update weights matrix
38 |     wn=(v*transpose(h))
39 |     wd=(w*h*transpose(h))
40 | 
41 |     w=matrix(array(w)*array(wn)/array(wd))  
42 |     
43 |   return w,h
44 | 


--------------------------------------------------------------------------------
/第10章 寻找独立特征/stockfeatures.txt:
--------------------------------------------------------------------------------
  1 | 5.26743580154e+017
  2 | 3.93402025291e+016
  3 | 2.21688612312e+016
  4 | 1.71500393528e+016
  5 | 1.49411594165e+016
  6 | [[  2.33322541e+06   2.07819608e+06   2.51935438e+06   2.96234043e+06
  7 |     1.75536111e+06   7.86146406e+06   2.63057169e+06   2.15047807e+06
  8 |     5.08400536e+06   7.00030282e+06   1.85413701e+07   3.38175040e+06]
  9 |  [  4.39522609e+06   3.06456173e+05   1.01774069e+06   5.95775828e+05
 10 |     4.58278700e+05   2.44897111e+06   6.88990546e+05   9.20287049e+05
 11 |     4.92159041e+06   2.73739991e+06   3.16536914e+06   1.59875019e+07]
 12 |  [  1.94852289e+07   2.76219783e+05   2.65520981e+03   3.05103534e+05
 13 |     1.98473327e+05   3.64804329e+05   1.19037805e+05   1.98460099e+02
 14 |     3.76011874e+05   1.43281935e+05   1.39846581e+06   3.84252682e+05]
 15 |  [  1.17533915e+07   3.03635741e+05   5.79421694e+05   4.36884572e+05
 16 |     3.06811879e+05   9.98011680e+05   5.08825718e+05   2.75383182e+05
 17 |     5.01943100e+06   1.15884764e+06   1.40079467e+06   1.47720209e+04]
 18 |  [  1.05481574e+07   3.70822814e+05   6.30403606e+05   7.01379744e+05
 19 |     1.69117963e+05   1.67921090e+06   6.68489498e+05   3.92653670e+05
 20 |     1.29366132e+03   1.25383449e+06   4.57858763e+06   4.65246631e+05]]
 21 | [[ 0.43597457  0.05871326  0.06516699  0.03360034  0.52074744]
 22 |  [ 0.44059965  0.1652264   0.29512033  0.11415813  0.55129002]
 23 |  [ 0.71651796  0.14618471  0.76636368  0.58318813  0.27252264]
 24 |  ..., 
 25 |  [ 1.28562362  0.84010606  0.65675734  0.2187646   0.68153007]
 26 |  [ 0.78639688  0.40560653  1.21738032  1.17089036  2.06706388]
 27 |  [ 2.45069957  0.00640682  0.86072825  0.10106403  1.12640551]]
 28 | Feature 0
 29 | (18541370.141110275, 'XOM')
 30 | (7861464.0553792343, 'CVX')
 31 | (7000302.8181583285, 'PG')
 32 | (5084005.3613334689, 'GOOG')
 33 | (3381750.4044293971, 'AMGN')
 34 | (2962340.4315599473, 'BP')
 35 | (2630571.6923459047, 'DNA')
 36 | (2519354.3804378472, 'BIIB')
 37 | (2333225.4065250917, 'YHOO')
 38 | (2150478.0737609738, 'EXPE')
 39 | (2078196.0848287165, 'AVP')
 40 | (1755361.1131727577, 'CL')
 41 | 
 42 | [(2.4506995728828622, '18-Oct-05'), (1.7327784403764923, '11-Sep-06'), (1.5111300572258395, '8-Jun-06')]
 43 | 
 44 | Feature 1
 45 | (15987501.883808712, 'AMGN')
 46 | (4921590.4116128432, 'GOOG')
 47 | (4395226.0932264365, 'YHOO')
 48 | (3165369.1418494503, 'XOM')
 49 | (2737399.9096869556, 'PG')
 50 | (2448971.1065134653, 'CVX')
 51 | (1017740.6942413859, 'BIIB')
 52 | (920287.04939950886, 'EXPE')
 53 | (688990.54637332377, 'DNA')
 54 | (595775.82846660342, 'BP')
 55 | (458278.69976566656, 'CL')
 56 | (306456.1727793481, 'AVP')
 57 | 
 58 | [(5.5183934865182875, '15-Feb-06'), (2.138473391072961, '1-Feb-06'), (1.9475044925471119, '26-Jan-06')]
 59 | 
 60 | Feature 2
 61 | (19485228.873686153, 'YHOO')
 62 | (1398465.8074515802, 'XOM')
 63 | (384252.68231490435, 'AMGN')
 64 | (376011.87440058013, 'GOOG')
 65 | (364804.32850560133, 'CVX')
 66 | (305103.53400016041, 'BP')
 67 | (276219.78349040612, 'AVP')
 68 | (198473.32671485722, 'CL')
 69 | (143281.93458262246, 'PG')
 70 | (119037.80463716132, 'DNA')
 71 | (2655.2098122150296, 'BIIB')
 72 | (198.46009910268154, 'EXPE')
 73 | 
 74 | [(8.3018051767438337, '19-Jul-06'), (4.5697390847378792, '19-Sep-06'), (2.506039176128628, '19-Apr-06')]
 75 | 
 76 | Feature 3
 77 | (11753391.461576829, 'YHOO')
 78 | (5019430.9962252304, 'GOOG')
 79 | (1400794.6664170395, 'XOM')
 80 | (1158847.6445206082, 'PG')
 81 | (998011.67965212127, 'CVX')
 82 | (579421.69354580715, 'BIIB')
 83 | (508825.71818347432, 'DNA')
 84 | (436884.5719282077, 'BP')
 85 | (306811.8787867761, 'CL')
 86 | (303635.74069823755, 'AVP')
 87 | (275383.18216351332, 'EXPE')
 88 | (14772.020946359988, 'AMGN')
 89 | 
 90 | [(6.9635400795449733, '18-Jan-06'), (4.4080426022720891, '18-Oct-06'), (3.6766225277997848, '20-Jan-06')]
 91 | 
 92 | Feature 4
 93 | (10548157.403712066, 'YHOO')
 94 | (4578587.6349422066, 'XOM')
 95 | (1679210.8955857321, 'CVX')
 96 | (1253834.4926672454, 'PG')
 97 | (701379.74418841151, 'BP')
 98 | (668489.49759360566, 'DNA')
 99 | (630403.60590710363, 'BIIB')
100 | (465246.63059152756, 'AMGN')
101 | (392653.67018991744, 'EXPE')
102 | (370822.8136748391, 'AVP')
103 | (169117.96293892173, 'CL')
104 | (1293.6613221068894, 'GOOG')
105 | 
106 | [(3.2242716186256213, '19-Jul-06'), (2.4565899212822875, '18-Oct-06'), (2.2169891870590743, '17-Oct-06')]
107 | 
108 | 


--------------------------------------------------------------------------------
/第10章 寻找独立特征/stockvolume.py:
--------------------------------------------------------------------------------
 1 | import nnmf
 2 | import urllib2
 3 | from numpy import *
 4 | 
 5 | tickers=['YHOO','AVP','BIIB','BP','CL','CVX',
 6 |          'DNA','EXPE','GOOG','PG','XOM','AMGN']
 7 | 
 8 | shortest=300
 9 | prices={}
10 | dates=None
11 | 
12 | for t in tickers:
13 |   # Open the URL
14 |   rows=urllib2.urlopen('http://ichart.finance.yahoo.com/table.csv?'+\
15 |                        's=%s&d=11&e=26&f=2006&g=d&a=3&b=12&c=1996'%t +\
16 |                        '&ignore=.csv').readlines()
17 | 
18 |   
19 |   # Extract the volume field from every line
20 |   prices[t]=[float(r.split(',')[5]) for r in rows[1:] if r.strip()!='']
21 |   if len(prices[t])<shortest: shortest=len(prices[t])
22 |   
23 |   if not dates:
24 |     dates=[r.split(',')[0] for r in rows[1:] if r.strip()!='']
25 | 
26 | l1=[[prices[tickers[i]][j] 
27 |      for i in range(len(tickers))] 
28 |     for j in range(shortest)]
29 | 
30 | w,h=nnmf.factorize(matrix(l1),pc=5)
31 | 
32 | print h
33 | print w
34 | 
35 | # Loop over all the features
36 | for i in range(shape(h)[0]):
37 |   print "Feature %d" %i
38 |   
39 |   # Get the top stocks for this feature
40 |   ol=[(h[i,j],tickers[j]) for j in range(shape(h)[1])]
41 |   ol.sort()
42 |   ol.reverse()
43 |   for j in range(12):
44 |     print ol[j]
45 |   print
46 |   
47 |   # Show the top dates for this feature
48 |   porder=[(w[d,i],d) for d in range(300)]
49 |   porder.sort()
50 |   porder.reverse()
51 |   print [(p[0],dates[p[1]]) for p in porder[0:3]]
52 |   print
53 | 


--------------------------------------------------------------------------------
/第11章 智能进化/gp.py:
--------------------------------------------------------------------------------
  1 | from random import random,randint,choice
  2 | from copy import deepcopy
  3 | from math import log
  4 | 
  5 | class fwrapper:
  6 |   def __init__(self,function,childcount,name):
  7 |     self.function=function
  8 |     self.childcount=childcount
  9 |     self.name=name
 10 | 
 11 | class node:
 12 |   def __init__(self,fw,children):
 13 |     self.function=fw.function
 14 |     self.name=fw.name
 15 |     self.children=children
 16 | 
 17 |   def evaluate(self,inp):    
 18 |     results=[n.evaluate(inp) for n in self.children]
 19 |     return self.function(results)
 20 |   def display(self,indent=0):
 21 |     print (' '*indent)+self.name
 22 |     for c in self.children:
 23 |       c.display(indent+1)
 24 |     
 25 | 
 26 | class paramnode:
 27 |   def __init__(self,idx):
 28 |     self.idx=idx
 29 | 
 30 |   def evaluate(self,inp):
 31 |     return inp[self.idx]
 32 |   def display(self,indent=0):
 33 |     print '%sp%d' % (' '*indent,self.idx)
 34 |     
 35 |     
 36 | class constnode:
 37 |   def __init__(self,v):
 38 |     self.v=v
 39 |   def evaluate(self,inp):
 40 |     return self.v
 41 |   def display(self,indent=0):
 42 |     print '%s%d' % (' '*indent,self.v)
 43 |     
 44 | 
 45 | addw=fwrapper(lambda l:l[0]+l[1],2,'add')
 46 | subw=fwrapper(lambda l:l[0]-l[1],2,'subtract') 
 47 | mulw=fwrapper(lambda l:l[0]*l[1],2,'multiply')
 48 | 
 49 | def iffunc(l):
 50 |   if l[0]>0: return l[1]
 51 |   else: return l[2]
 52 | ifw=fwrapper(iffunc,3,'if')
 53 | 
 54 | def isgreater(l):
 55 |   if l[0]>l[1]: return 1
 56 |   else: return 0
 57 | gtw=fwrapper(isgreater,2,'isgreater')
 58 | 
 59 | flist=[addw,mulw,ifw,gtw,subw]
 60 | 
 61 | def exampletree():
 62 |   return node(ifw,[
 63 |                   node(gtw,[paramnode(0),constnode(3)]),
 64 |                   node(addw,[paramnode(1),constnode(5)]),
 65 |                   node(subw,[paramnode(1),constnode(2)]),
 66 |                   ]
 67 |               )
 68 | 
 69 | def makerandomtree(pc,maxdepth=4,fpr=0.5,ppr=0.6):
 70 |   if random()<fpr and maxdepth>0:
 71 |     f=choice(flist)
 72 |     children=[makerandomtree(pc,maxdepth-1,fpr,ppr) 
 73 |               for i in range(f.childcount)]
 74 |     return node(f,children)
 75 |   elif random()<ppr:
 76 |     return paramnode(randint(0,pc-1))
 77 |   else:
 78 |     return constnode(randint(0,10))
 79 |               
 80 | 
 81 | def hiddenfunction(x,y):
 82 |     return x**2+2*y+3*x+5
 83 | 
 84 | def buildhiddenset():
 85 |   rows=[]
 86 |   for i in range(200):
 87 |     x=randint(0,40)
 88 |     y=randint(0,40)
 89 |     rows.append([x,y,hiddenfunction(x,y)])
 90 |   return rows
 91 | 
 92 | def scorefunction(tree,s):
 93 |   dif=0
 94 |   for data in s:
 95 |     v=tree.evaluate([data[0],data[1]])
 96 |     dif+=abs(v-data[2])
 97 |   return dif
 98 | 
 99 | 
100 | def mutate(t,pc,probchange=0.1):
101 |   if random()<probchange:
102 |     return makerandomtree(pc)
103 |   else:
104 |     result=deepcopy(t)
105 |     if hasattr(t,"children"):
106 |       result.children=[mutate(c,pc,probchange) for c in t.children]
107 |     return result
108 | 
109 | def crossover(t1,t2,probswap=0.7,top=1):
110 |   if random()<probswap and not top:
111 |     return deepcopy(t2) 
112 |   else:
113 |     result=deepcopy(t1)
114 |     if hasattr(t1,'children') and hasattr(t2,'children'):
115 |       result.children=[crossover(c,choice(t2.children),probswap,0) 
116 |                        for c in t1.children]
117 |     return result
118 | 
119 | def getrankfunction(dataset):
120 |   def rankfunction(population):
121 |     scores=[(scorefunction(t,dataset),t) for t in population]
122 |     scores.sort()
123 |     return scores
124 |   return rankfunction
125 |   
126 |     
127 | 
128 | def evolve(pc,popsize,rankfunction,maxgen=500,
129 |            mutationrate=0.1,breedingrate=0.4,pexp=0.7,pnew=0.05):
130 |   # Returns a random number, tending towards lower numbers. The lower pexp
131 |   # is, more lower numbers you will get
132 |   def selectindex():
133 |     return int(log(random())/log(pexp))
134 | 
135 |   # Create a random initial population
136 |   population=[makerandomtree(pc) for i in range(popsize)]
137 |   for i in range(maxgen):
138 |     scores=rankfunction(population)
139 |     print scores[0][0]
140 |     if scores[0][0]==0: break
141 |     
142 |     # The two best always make it
143 |     newpop=[scores[0][1],scores[1][1]]
144 |     
145 |     # Build the next generation
146 |     while len(newpop)<popsize:
147 |       if random()>pnew:
148 |         newpop.append(mutate(
149 |                       crossover(scores[selectindex()][1],
150 |                                  scores[selectindex()][1],
151 |                                 probswap=breedingrate),
152 |                         pc,probchange=mutationrate))
153 |       else:
154 |       # Add a random node to mix things up
155 |         newpop.append(makerandomtree(pc))
156 |         
157 |     population=newpop
158 |   scores[0][1].display()    
159 |   return scores[0][1]
160 | 
161 | 
162 | def gridgame(p):
163 |   # Board size
164 |   max=(3,3)
165 |   
166 |   # Remember the last move for each player
167 |   lastmove=[-1,-1]
168 |   
169 |   # Remember the player's locations
170 |   location=[[randint(0,max[0]),randint(0,max[1])]]
171 |   
172 |   # Put the second player a sufficient distance from the first
173 |   location.append([(location[0][0]+2)%4,(location[0][1]+2)%4])
174 |   # Maximum of 50 moves before a tie
175 |   for o in range(50):
176 |   
177 |     # For each player
178 |     for i in range(2):
179 |       locs=location[i][:]+location[1-i][:]
180 |       locs.append(lastmove[i])
181 |       move=p[i].evaluate(locs)%4
182 |       
183 |       # You lose if you move the same direction twice in a row
184 |       if lastmove[i]==move: return 1-i
185 |       lastmove[i]=move
186 |       if move==0: 
187 |         location[i][0]-=1
188 |         # Board wraps
189 |         if location[i][0]<0: location[i][0]=0
190 |       if move==1: 
191 |         location[i][0]+=1
192 |         if location[i][0]>max[0]: location[i][0]=max[0]
193 |       if move==2: 
194 |         location[i][1]-=1
195 |         if location[i][1]<0: location[i][1]=0
196 |       if move==3: 
197 |         location[i][1]+=1
198 |         if location[i][1]>max[1]: location[i][1]=max[1]
199 |       
200 |       # If you have captured the other player, you win
201 |       if location[i]==location[1-i]: return i
202 |   return -1
203 | 
204 | 
205 | def tournament(pl):
206 |   # Count losses
207 |   losses=[0 for p in pl]
208 |   
209 |   # Every player plays every other player
210 |   for i in range(len(pl)):
211 |     for j in range(len(pl)):
212 |       if i==j: continue
213 |       
214 |       # Who is the winner?
215 |       winner=gridgame([pl[i],pl[j]])
216 |       
217 |       # Two points for a loss, one point for a tie
218 |       if winner==0:
219 |         losses[j]+=2
220 |       elif winner==1:
221 |         losses[i]+=2
222 |       elif winner==-1:
223 |         losses[i]+=1
224 |         losses[i]+=1
225 |         pass
226 | 
227 |   # Sort and return the results
228 |   z=zip(losses,pl)
229 |   z.sort()
230 |   return z      
231 | 
232 | class humanplayer:
233 |   def evaluate(self,board):
234 | 
235 |     # Get my location and the location of other players
236 |     me=tuple(board[0:2])
237 |     others=[tuple(board[x:x+2]) for x in range(2,len(board)-1,2)]
238 |     
239 |     # Display the board
240 |     for i in range(4):
241 |       for j in range(4):
242 |         if (i,j)==me:
243 |           print 'O',
244 |         elif (i,j) in others:
245 |           print 'X',
246 |         else:
247 |           print '.',
248 |       print
249 |       
250 |     # Show moves, for reference
251 |     print 'Your last move was %d' % board[len(board)-1]
252 |     print ' 0'
253 |     print '2 3'
254 |     print ' 1'
255 |     print 'Enter move: ',
256 |     
257 |     # Return whatever the user enters
258 |     move=int(raw_input())
259 |     return move
260 | 
261 | 
262 | class fwrapper:
263 |   def __init__(self,function,params,name):
264 |     self.function=function
265 |     self.childcount=param
266 |     self.name=name
267 |     
268 | #flist={'str':[substringw,concatw],'int':[indexw]}
269 | flist=[addw,mulw,ifw,gtw,subw]
270 | 


--------------------------------------------------------------------------------
/第11章 智能进化/gp.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第11章 智能进化/gp.pyc


--------------------------------------------------------------------------------