├── README.md ├── 第02章 提供推荐 ├── deliciousrec.py ├── pydelicious.py └── recommendations.py ├── 第03章 发现群组 ├── Thumbs.db ├── blogdata.txt ├── clusters.py ├── downloadzebodata.py ├── feedlist.txt ├── generatefeedvector.py └── zebo.txt ├── 第04章 搜索与排名 ├── nn.py └── searchengine.py ├── 第05章 优化 ├── dorm.py ├── kayak.py ├── optimization.py ├── schedule.txt └── socialnetwork.py ├── 第06章 文档过滤 ├── docclass.py ├── feedfilter.py ├── python_search.xml ├── test.db └── test1.db ├── 第07章 决策树建模 ├── Thumbs.db ├── addresslist.txt ├── hotornot.py ├── treepredict.py └── zillow.py ├── 第08章 构建价格模型 ├── ebaypredict.py ├── numpredict.py └── optimization.py ├── 第09章 高阶分类 核方法与SVM ├── advancedclassify.py ├── agesonly.csv ├── facebook.py ├── matchmaker.csv ├── svm.py ├── svm.pyc └── svmc.pyd ├── 第10章 寻找独立特征 ├── Thumbs.db ├── articles.txt ├── clusters.py ├── docclass.py ├── features.txt ├── newsfeatures.py ├── nnmf.py ├── stockfeatures.txt └── stockvolume.py └── 第11章 智能进化 ├── gp.py └── gp.pyc /README.md: -------------------------------------------------------------------------------- 1 | # Programming-Collective-Intelligence-Source-Code 2 | 集体智慧编程源代码 3 | 4 | ## 中文版PDF电子书免费下载地址 5 | 6 | http://pan.baidu.com/s/1ntKHRPB 7 | -------------------------------------------------------------------------------- /第02章 提供推荐/deliciousrec.py: -------------------------------------------------------------------------------- 1 | from pydelicious import get_popular,get_userposts,get_urlposts 2 | import time 3 | 4 | def initializeUserDict(tag,count=5): 5 | user_dict={} 6 | # get the top count' popular posts 7 | for p1 in get_popular(tag=tag)[0:count]: 8 | # find all users who posted this 9 | for p2 in get_urlposts(p1['href']): 10 | user=p2['user'] 11 | user_dict[user]={} 12 | return user_dict 13 | 14 | def fillItems(user_dict): 15 | all_items={} 16 | # Find links posted by all users 17 | for user in user_dict: 18 | for i in range(3): 19 | try: 20 | posts=get_userposts(user) 21 | break 22 | except: 23 | print "Failed user "+user+", retrying" 24 | time.sleep(4) 25 | for post in posts: 26 | url=post['href'] 27 | user_dict[user][url]=1.0 28 | all_items[url]=1 29 | 30 | # Fill in missing items with 0 31 | for ratings in user_dict.values(): 32 | for item in all_items: 33 | if item not in ratings: 34 | ratings[item]=0.0 35 | -------------------------------------------------------------------------------- /第02章 提供推荐/pydelicious.py: -------------------------------------------------------------------------------- 1 | """Library to access del.icio.us data via Python. 2 | 3 | :examples: 4 | 5 | Using the API class directly: 6 | 7 | >>> a = pydelicious.apiNew('user', 'passwd') 8 | >>> # or: 9 | >>> a = DeliciousAPI('user', 'passwd') 10 | >>> a.tags_get() # Same as: 11 | >>> a.request('tags/get', ) 12 | 13 | Or by calling the 'convenience' methods on the module. 14 | 15 | - def add(user, passwd, url, description, tags = "", extended = "", dt = "", replace="no"): 16 | - def get(user, passwd, tag="", dt="", count = 0): 17 | - def get_all(user, passwd, tag = ""): 18 | - def delete(user, passwd, url): 19 | - def rename_tag(user, passwd, oldtag, newtag): 20 | - def get_tags(user, passwd): 21 | 22 | >>> a = apiNew(user, passwd) 23 | >>> a.posts_add(url="http://my.com/", desciption="my.com", extended="the url is my.moc", tags="my com") 24 | True 25 | >>> len(a.posts_all()) 26 | 1 27 | >>> get_all(user, passwd) 28 | 1 29 | 30 | This are short functions for getrss calls. 
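# ---------------------------------------------------------------------------
# Usage sketch for deliciousrec.py above (added for illustration, not part of
# the original repository). This is how chapter 2 drives the two helpers from
# a Python 2 interpreter; del.icio.us has since been shut down, so the network
# calls will no longer succeed, but the data shape is what matters.
#
#   >>> import deliciousrec
#   >>> delusers = deliciousrec.initializeUserDict('programming')
#   >>> delusers['tsegaran'] = {}        # add a known user by hand, as the book does
#   >>> deliciousrec.fillItems(delusers)
#
# The resulting dictionary has the same {user: {url: rating}} shape as the
# `critics` dictionary in recommendations.py, so it can be passed straight to
# topMatches or getRecommendations from that module.
# ---------------------------------------------------------------------------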
31 | 32 | >>> rss_ 33 | 34 | def get_userposts(user): 35 | def get_tagposts(tag): 36 | def get_urlposts(url): 37 | def get_popular(tag = ""): 38 | 39 | >>> json_posts() 40 | >>> json_tags() 41 | >>> json_network() 42 | >>> json_fans() 43 | 44 | :License: pydelicious is released under the BSD license. See 'license.txt' 45 | for more informations. 46 | 47 | :berend: 48 | - Rewriting comments to english. More documentation, examples. 49 | - Added JSON-like return values for XML data (del.icio.us also serves some JSON...) 50 | - better error/exception classes and handling, work in progress. 51 | - Encoding seems to be working (using UTF-8 here). 52 | 53 | :@todo: 54 | - Source code SHOULD BE ASCII! 55 | - More tests. 56 | - Parse datetimes in XML. 57 | - Salvage and test RSS functionality? 58 | - Setup not used, Still works? Should setup.py be tested? 59 | - API functions need required argument checks. 60 | 61 | * lizense einbinden und auch via setup.py verteilen 62 | * readme auch schreiben und via setup.py verteilen 63 | * auch auf anderen systemen testen (linux -> uni) 64 | * automatisch releases bauen lassen, richtig benennen und in das 65 | richtige verzeichnis verschieben. 66 | * was k[o]nnen die anderen librarys denn noch so? (ruby, java, perl, etc) 67 | * was wollen die, die es benutzen? 68 | * wof[u]r k[o]nnte ich es benutzen? 69 | * entschlacken? 70 | 71 | :done: 72 | * Refactored the API class, much cleaner now and functions dlcs_api_request, dlcs_parse_xml are available for who wants them. 73 | * stimmt das so? muss eher noch t[a]g str2utf8 konvertieren 74 | >>> pydelicious.getrss(tag="t[a]g") 75 | url: http://del.icio.us/rss/tag/t[a]g 76 | * requester muss eine sekunde warten 77 | * __init__.py gibt die funktionen weiter 78 | * html parser funktioniert noch nicht, gar nicht 79 | * alte funktionen fehlen, get_posts_by_url, etc. 80 | * post funktion erstellen, die auch die fehlenden attribs addiert. 81 | * die api muss ich noch weiter machen 82 | * requester muss die 503er abfangen 83 | * rss parser muss auf viele m[o]glichkeiten angepasst werden 84 | """ 85 | import sys 86 | import os 87 | import time 88 | import datetime 89 | import md5, httplib 90 | import urllib, urllib2, time 91 | from StringIO import StringIO 92 | 93 | try: 94 | from elementtree.ElementTree import parse as parse_xml 95 | except ImportError: 96 | from xml.etree.ElementTree import parse as parse_xml 97 | 98 | import feedparser 99 | 100 | 101 | ### Static config 102 | 103 | __version__ = '0.5.0' 104 | __author__ = 'Frank Timmermann ' # GP: does not respond to emails 105 | __contributors__ = [ 106 | 'Greg Pinero', 107 | 'Berend van Berkum '] 108 | __url__ = 'http://code.google.com/p/pydelicious/' 109 | __author_email__ = "" 110 | # Old URL: 'http://deliciouspython.python-hosting.com/' 111 | 112 | __description__ = '''pydelicious.py allows you to access the web service of del.icio.us via it's API through python.''' 113 | __long_description__ = '''the goal is to design an easy to use and fully functional python interface to del.icio.us. 
''' 114 | 115 | DLCS_OK_MESSAGES = ('done', 'ok') # Known text values of positive del.icio.us answers 116 | DLCS_WAIT_TIME = 4 117 | DLCS_REQUEST_TIMEOUT = 444 # Seconds before socket triggers timeout 118 | #DLCS_API_REALM = 'del.icio.us API' 119 | DLCS_API_HOST = 'https://api.del.icio.us' 120 | DLCS_API_PATH = 'v1' 121 | DLCS_API = "%s/%s" % (DLCS_API_HOST, DLCS_API_PATH) 122 | DLCS_RSS = 'http://del.icio.us/rss/' 123 | 124 | ISO_8601_DATETIME = '%Y-%m-%dT%H:%M:%SZ' 125 | 126 | USER_AGENT = 'pydelicious.py/%s %s' % (__version__, __url__) 127 | 128 | DEBUG = 0 129 | if 'DLCS_DEBUG' in os.environ: 130 | DEBUG = int(os.environ['DLCS_DEBUG']) 131 | 132 | 133 | # Taken from FeedParser.py 134 | # timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers. 135 | # Python 2.3 now has this functionality available in the standard socket library, so under 136 | # 2.3 you don't need to install anything. But you probably should anyway, because the socket 137 | # module is buggy and timeoutsocket is better. 138 | try: 139 | import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py 140 | timeoutsocket.setDefaultSocketTimeout(DLCS_REQUEST_TIMEOUT) 141 | except ImportError: 142 | import socket 143 | if hasattr(socket, 'setdefaulttimeout'): socket.setdefaulttimeout(DLCS_REQUEST_TIMEOUT) 144 | if DEBUG: print >>sys.stderr, "Set socket timeout to %s seconds" % DLCS_REQUEST_TIMEOUT 145 | 146 | 147 | ### Utility classes 148 | 149 | class _Waiter: 150 | """Waiter makes sure a certain amount of time passes between 151 | successive calls of `Waiter()`. 152 | 153 | Some attributes: 154 | :last: time of last call 155 | :wait: the minimum time needed between calls 156 | :waited: the number of calls throttled 157 | 158 | pydelicious.Waiter is an instance created when the module is loaded. 159 | """ 160 | def __init__(self, wait): 161 | self.wait = wait 162 | self.waited = 0 163 | self.lastcall = 0; 164 | 165 | def __call__(self): 166 | tt = time.time() 167 | 168 | timeago = tt - self.lastcall 169 | 170 | if self.lastcall and DEBUG>2: 171 | print >>sys.stderr, "Lastcall: %s seconds ago." % lastcall 172 | 173 | if timeago <= self.wait: 174 | if DEBUG>0: print >>sys.stderr, "Waiting %s seconds." % self.wait 175 | time.sleep(self.wait) 176 | self.waited += 1 177 | self.lastcall = tt + self.wait 178 | else: 179 | self.lastcall = tt 180 | 181 | Waiter = _Waiter(DLCS_WAIT_TIME) 182 | 183 | class PyDeliciousException(Exception): 184 | '''Std. pydelicious error''' 185 | pass 186 | 187 | class DeliciousError(Exception): 188 | """Raised when the server responds with a negative answer""" 189 | 190 | 191 | class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): 192 | '''@xxx:bvb: Where is this used? should it be registered somewhere with urllib2? 193 | 194 | Handles HTTP Error, currently only 503. 195 | ''' 196 | def http_error_503(self, req, fp, code, msg, headers): 197 | raise urllib2.HTTPError(req, code, throttled_message, headers, fp) 198 | 199 | 200 | class post(dict): 201 | """Post object, contains href, description, hash, dt, tags, 202 | extended, user, count(, shared). 203 | 204 | @xxx:bvb: Is this needed? Right now this is superfluous, 205 | """ 206 | def __init__(self, href = "", description = "", hash = "", time = "", tag = "", extended = "", user = "", count = "", 207 | tags = "", url = "", dt = ""): # tags or tag? 
208 | self["href"] = href 209 | if url != "": self["href"] = url 210 | self["description"] = description 211 | self["hash"] = hash 212 | self["dt"] = dt 213 | if time != "": self["dt"] = time 214 | self["tags"] = tags 215 | if tag != "": self["tags"] = tag # tag or tags? # !! tags 216 | self["extended"] = extended 217 | self["user"] = user 218 | self["count"] = count 219 | 220 | def __getattr__(self, name): 221 | try: return self[name] 222 | except: object.__getattribute__(self, name) 223 | 224 | 225 | class posts(list): 226 | """@xxx:bvb: idem as class post, python structures (dict/list) might 227 | suffice or a more generic solution is needed. 228 | """ 229 | def __init__(self, *args): 230 | for i in args: self.append(i) 231 | 232 | def __getattr__(self, attr): 233 | try: return [p[attr] for p in self] 234 | except: object.__getattribute__(self, attr) 235 | 236 | ### Utility functions 237 | 238 | def str2uni(s): 239 | # type(in) str or unicode 240 | # type(out) unicode 241 | return ("".join([unichr(ord(i)) for i in s])) 242 | 243 | def str2utf8(s): 244 | # type(in) str or unicode 245 | # type(out) str 246 | return ("".join([unichr(ord(i)).encode("utf-8") for i in s])) 247 | 248 | def str2quote(s): 249 | return urllib.quote_plus("".join([unichr(ord(i)).encode("utf-8") for i in s])) 250 | 251 | def dict0(d): 252 | # Trims empty dict entries 253 | # {'a':'a', 'b':'', 'c': 'c'} => {'a': 'a', 'c': 'c'} 254 | dd = dict() 255 | for i in d: 256 | if d[i] != "": dd[i] = d[i] 257 | return dd 258 | 259 | def delicious_datetime(str): 260 | """Parse a ISO 8601 formatted string to a Python datetime ... 261 | """ 262 | return datetime.datetime(*time.strptime(str, ISO_8601_DATETIME)[0:6]) 263 | 264 | def http_request(url, user_agent=USER_AGENT, retry=4): 265 | """Retrieve the contents referenced by the URL using urllib2. 266 | 267 | Retries up to four times (default) on exceptions. 268 | """ 269 | request = urllib2.Request(url, headers={'User-Agent':user_agent}) 270 | 271 | # Remember last error 272 | e = None 273 | 274 | # Repeat request on time-out errors 275 | tries = retry; 276 | while tries: 277 | try: 278 | return urllib2.urlopen(request) 279 | 280 | except urllib2.HTTPError, e: # protocol errors, 281 | raise PyDeliciousException, "%s" % e 282 | 283 | except urllib2.URLError, e: 284 | # @xxx: Ugly check for time-out errors 285 | #if len(e)>0 and 'timed out' in arg[0]: 286 | print >> sys.stderr, "%s, %s tries left." % (e, tries) 287 | Waiter() 288 | tries = tries - 1 289 | #else: 290 | # tries = None 291 | 292 | # Give up 293 | raise PyDeliciousException, \ 294 | "Unable to retrieve data at '%s', %s" % (url, e) 295 | 296 | def http_auth_request(url, host, user, passwd, user_agent=USER_AGENT): 297 | """Call an HTTP server with authorization credentials using urllib2. 298 | """ 299 | if DEBUG: httplib.HTTPConnection.debuglevel = 1 300 | 301 | # Hook up handler/opener to urllib2 302 | password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() 303 | password_manager.add_password(None, host, user, passwd) 304 | auth_handler = urllib2.HTTPBasicAuthHandler(password_manager) 305 | opener = urllib2.build_opener(auth_handler) 306 | urllib2.install_opener(opener) 307 | 308 | return http_request(url, user_agent) 309 | 310 | def dlcs_api_request(path, params='', user='', passwd='', throttle=True): 311 | """Retrieve/query a path within the del.icio.us API. 312 | 313 | This implements a minimum interval between calls to avoid 314 | throttling. [#]_ Use param 'throttle' to turn this behaviour off. 
315 | 316 | @todo: back off on 503's (HTTPError, URLError? @todo: testing). 317 | 318 | Returned XML does not always correspond with given del.icio.us examples 319 | @todo: (cf. help/api/... and post's attributes) 320 | 321 | .. [#] http://del.icio.us/help/api/ 322 | """ 323 | if throttle: 324 | Waiter() 325 | 326 | if params: 327 | # params come as a dict, strip empty entries and urlencode 328 | url = "%s/%s?%s" % (DLCS_API, path, urllib.urlencode(dict0(params))) 329 | else: 330 | url = "%s/%s" % (DLCS_API, path) 331 | 332 | if DEBUG: print >>sys.stderr, "dlcs_api_request: %s" % url 333 | 334 | try: 335 | return http_auth_request(url, DLCS_API_HOST, user, passwd, USER_AGENT) 336 | 337 | # @bvb: Is this ever raised? When? 338 | except DefaultErrorHandler, e: 339 | print >>sys.stderr, "%s" % e 340 | 341 | def dlcs_parse_xml(data, split_tags=False): 342 | """Parse any del.icio.us XML document and return Python data structure. 343 | 344 | Recognizes all XML document formats as returned by the version 1 API and 345 | translates to a JSON-like data structure (dicts 'n lists). 346 | 347 | Returned instance is always a dictionary. Examples:: 348 | 349 | {'posts': [{'url':'...','hash':'...',},],} 350 | {'tags':['tag1', 'tag2',]} 351 | {'dates': [{'count':'...','date':'...'},], 'tag':'', 'user':'...'} 352 | {'result':(True, "done")} 353 | # etcetera. 354 | """ 355 | 356 | if DEBUG>3: print >>sys.stderr, "dlcs_parse_xml: parsing from ", data 357 | 358 | if not hasattr(data, 'read'): 359 | data = StringIO(data) 360 | 361 | doc = parse_xml(data) 362 | root = doc.getroot() 363 | fmt = root.tag 364 | 365 | # Split up into three cases: Data, Result or Update 366 | if fmt in ('tags', 'posts', 'dates', 'bundles'): 367 | 368 | # Data: expect a list of data elements, 'resources'. 369 | # Use `fmt` (without last 's') to find data elements, elements 370 | # don't have contents, attributes contain all the data we need: 371 | # append to list 372 | elist = [el.attrib for el in doc.findall(fmt[:-1])] 373 | 374 | # Return list in dict, use tagname of rootnode as keyname. 375 | data = {fmt: elist} 376 | 377 | # Root element might have attributes too, append dict. 378 | data.update(root.attrib) 379 | 380 | return data 381 | 382 | elif fmt == 'result': 383 | 384 | # Result: answer to operations 385 | if root.attrib.has_key('code'): 386 | msg = root.attrib['code'] 387 | else: 388 | msg = root.text 389 | 390 | # Return {'result':(True, msg)} for /known/ O.K. messages, 391 | # use (False, msg) otherwise 392 | v = msg in DLCS_OK_MESSAGES 393 | return {fmt: (v, msg)} 394 | 395 | elif fmt == 'update': 396 | 397 | # Update: "time" 398 | #return {fmt: root.attrib} 399 | return {fmt: {'time':time.strptime(root.attrib['time'], ISO_8601_DATETIME)}} 400 | 401 | else: 402 | raise PyDeliciousException, "Unknown XML document format '%s'" % fmt 403 | 404 | def dlcs_rss_request(tag = "", popular = 0, user = "", url = ''): 405 | """Handle a request for RSS 406 | 407 | @todo: translate from German 408 | 409 | rss sollte nun wieder funktionieren, aber diese try, except scheisse ist so nicht schoen 410 | 411 | rss wird unterschiedlich zusammengesetzt. ich kann noch keinen einheitlichen zusammenhang 412 | zwischen daten (url, desc, ext, usw) und dem feed erkennen. warum k[o]nnen die das nicht einheitlich machen? 
413 | """ 414 | tag = str2quote(tag) 415 | user = str2quote(user) 416 | if url != '': 417 | # http://del.icio.us/rss/url/efbfb246d886393d48065551434dab54 418 | url = DLCS_RSS + '''url/%s'''%md5.new(url).hexdigest() 419 | elif user != '' and tag != '': 420 | url = DLCS_RSS + '''%(user)s/%(tag)s'''%dict(user=user, tag=tag) 421 | elif user != '' and tag == '': 422 | # http://del.icio.us/rss/delpy 423 | url = DLCS_RSS + '''%s'''%user 424 | elif popular == 0 and tag == '': 425 | url = DLCS_RSS 426 | elif popular == 0 and tag != '': 427 | # http://del.icio.us/rss/tag/apple 428 | # http://del.icio.us/rss/tag/web2.0 429 | url = DLCS_RSS + "tag/%s"%tag 430 | elif popular == 1 and tag == '': 431 | url = DLCS_RSS + '''popular/''' 432 | elif popular == 1 and tag != '': 433 | url = DLCS_RSS + '''popular/%s'''%tag 434 | rss = http_request(url).read() 435 | rss = feedparser.parse(rss) 436 | # print rss 437 | # for e in rss.entries: print e;print 438 | l = posts() 439 | for e in rss.entries: 440 | if e.has_key("links") and e["links"]!=[] and e["links"][0].has_key("href"): 441 | url = e["links"][0]["href"] 442 | elif e.has_key("link"): 443 | url = e["link"] 444 | elif e.has_key("id"): 445 | url = e["id"] 446 | else: 447 | url = "" 448 | if e.has_key("title"): 449 | description = e['title'] 450 | elif e.has_key("title_detail") and e["title_detail"].has_key("title"): 451 | description = e["title_detail"]['value'] 452 | else: 453 | description = '' 454 | try: tags = e['categories'][0][1] 455 | except: 456 | try: tags = e["category"] 457 | except: tags = "" 458 | if e.has_key("modified"): 459 | dt = e['modified'] 460 | else: 461 | dt = "" 462 | if e.has_key("summary"): 463 | extended = e['summary'] 464 | elif e.has_key("summary_detail"): 465 | e['summary_detail']["value"] 466 | else: 467 | extended = "" 468 | if e.has_key("author"): 469 | user = e['author'] 470 | else: 471 | user = "" 472 | # time = dt ist weist auf ein problem hin 473 | # die benennung der variablen ist nicht einheitlich 474 | # api senden und 475 | # xml bekommen sind zwei verschiedene schuhe :( 476 | l.append(post(url = url, description = description, tags = tags, dt = dt, extended = extended, user = user)) 477 | return l 478 | 479 | 480 | ### Main module class 481 | 482 | class DeliciousAPI: 483 | """Class providing main interace to del.icio.us API. 484 | 485 | Methods ``request`` and ``request_raw`` represent the core. For all API 486 | paths there are furthermore methods (e.g. posts_add for 'posts/all') with 487 | an explicit declaration of the parameters and documentation. These all call 488 | ``request`` and pass on extra keywords like ``_raw``. 489 | """ 490 | 491 | def __init__(self, user, passwd, codec='iso-8859-1', api_request=dlcs_api_request, xml_parser=dlcs_parse_xml): 492 | """Initialize access to the API with ``user`` and ``passwd``. 493 | 494 | ``codec`` sets the encoding of the arguments. 495 | 496 | The ``api_request`` and ``xml_parser`` parameters by default point to 497 | functions within this package with standard implementations to 498 | request and parse a resource. See ``dlcs_api_request()`` and 499 | ``dlcs_parse_xml()``. Note that ``api_request`` should return a 500 | file-like instance with an HTTPMessage instance under ``info()``, 501 | see ``urllib2.openurl`` for more info. 
502 | """ 503 | assert user != "" 504 | self.user = user 505 | self.passwd = passwd 506 | self.codec = codec 507 | 508 | # Implement communication to server and parsing of respons messages: 509 | assert callable(api_request) 510 | self._api_request = api_request 511 | assert callable(xml_parser) 512 | self._parse_response = xml_parser 513 | 514 | def _call_server(self, path, **params): 515 | params = dict0(params) 516 | for key in params: 517 | params[key] = params[key].encode(self.codec) 518 | 519 | # see __init__ for _api_request() 520 | return self._api_request(path, params, self.user, self.passwd) 521 | 522 | 523 | ### Core functionality 524 | 525 | def request(self, path, _raw=False, **params): 526 | """Calls a path in the API, parses the answer to a JSON-like structure by 527 | default. Use with ``_raw=True`` or ``call request_raw()`` directly to 528 | get the filehandler and process the response message manually. 529 | 530 | Calls to some paths will return a `result` message, i.e.:: 531 | 532 | 533 | 534 | or:: 535 | 536 | ... 537 | 538 | These are all parsed to ``{'result':(Boolean, MessageString)}`` and this 539 | method will raise ``DeliciousError`` on negative `result` answers. Using 540 | ``_raw=True`` bypasses all parsing and will never raise ``DeliciousError``. 541 | 542 | See ``dlcs_parse_xml()`` and ``self.request_raw()``.""" 543 | 544 | # method _parse_response is bound in `__init__()`, `_call_server` 545 | # uses `_api_request` also set in `__init__()` 546 | if _raw: 547 | # return answer 548 | return self.request_raw(path, **params) 549 | 550 | else: 551 | # get answer and parse 552 | fl = self._call_server(path, **params) 553 | rs = self._parse_response(fl) 554 | 555 | # Raise an error for negative 'result' answers 556 | if type(rs) == dict and rs == 'result' and not rs['result'][0]: 557 | errmsg = "" 558 | if len(rs['result'])>0: 559 | errmsg = rs['result'][1:] 560 | raise DeliciousError, errmsg 561 | 562 | return rs 563 | 564 | def request_raw(self, path, **params): 565 | """Calls the path in the API, returns the filehandle. Returned 566 | file-like instances have an ``HTTPMessage`` instance with HTTP header 567 | information available. Use ``filehandle.info()`` or refer to the 568 | ``urllib2.openurl`` documentation. 569 | """ 570 | # see `request()` on how the response can be handled 571 | return self._call_server(path, **params) 572 | 573 | ### Explicit declarations of API paths, their parameters and docs 574 | 575 | # Tags 576 | def tags_get(self, **kwds): 577 | """Returns a list of tags and the number of times it is used by the user. 578 | :: 579 | 580 | 581 | 582 | """ 583 | return self.request("tags/get", **kwds) 584 | 585 | def tags_rename(self, old, new, **kwds): 586 | """Rename an existing tag with a new tag name. Returns a `result` 587 | message or raises an ``DeliciousError``. See ``self.request()``. 588 | 589 | &old (required) 590 | Tag to rename. 591 | &new (required) 592 | New name. 593 | """ 594 | return self.request("tags/rename", old=old, new=new, **kwds) 595 | 596 | # Posts 597 | def posts_update(self, **kwds): 598 | """Returns the last update time for the user. Use this before calling 599 | `posts_all` to see if the data has changed since the last fetch. 600 | :: 601 | 602 | 603 | """ 604 | return self.request("posts/update", **kwds) 605 | 606 | def posts_dates(self, tag="", **kwds): 607 | """Returns a list of dates with the number of posts at each date. 608 | :: 609 | 610 | 611 | 612 | 613 | &tag (optional). 614 | Filter by this tag. 
615 | """ 616 | return self.request("posts/dates", tag=tag, **kwds) 617 | 618 | def posts_get(self, tag="", dt="", url="", **kwds): 619 | """Returns posts matching the arguments. If no date or url is given, 620 | most recent date will be used. 621 | :: 622 | 623 | 624 | 625 | 626 | &tag (optional). 627 | Filter by this tag. 628 | &dt (optional). 629 | Filter by this date (CCYY-MM-DDThh:mm:ssZ). 630 | &url (optional). 631 | Filter by this url. 632 | """ 633 | return self.request("posts/get", tag=tag, dt=dt, url=url, **kwds) 634 | 635 | def posts_recent(self, tag="", count="", **kwds): 636 | """Returns a list of the most recent posts, filtered by argument. 637 | :: 638 | 639 | 640 | 641 | 642 | &tag (optional). 643 | Filter by this tag. 644 | &count (optional). 645 | Number of items to retrieve (Default:15, Maximum:100). 646 | """ 647 | return self.request("posts/recent", tag=tag, count=count, **kwds) 648 | 649 | def posts_all(self, tag="", **kwds): 650 | """Returns all posts. Please use sparingly. Call the `posts_update` 651 | method to see if you need to fetch this at all. 652 | :: 653 | 654 | 655 | 656 | 657 | &tag (optional). 658 | Filter by this tag. 659 | """ 660 | return self.request("posts/all", tag=tag, **kwds) 661 | 662 | def posts_add(self, url, description, extended="", tags="", dt="", 663 | replace="no", shared="yes", **kwds): 664 | """Add a post to del.icio.us. Returns a `result` message or raises an 665 | ``DeliciousError``. See ``self.request()``. 666 | 667 | &url (required) 668 | the url of the item. 669 | &description (required) 670 | the description of the item. 671 | &extended (optional) 672 | notes for the item. 673 | &tags (optional) 674 | tags for the item (space delimited). 675 | &dt (optional) 676 | datestamp of the item (format "CCYY-MM-DDThh:mm:ssZ"). 677 | 678 | Requires a LITERAL "T" and "Z" like in ISO8601 at http://www.cl.cam.ac.uk/~mgk25/iso-time.html for example: "1984-09-01T14:21:31Z" 679 | &replace=no (optional) - don't replace post if given url has already been posted. 680 | &shared=no (optional) - make the item private 681 | """ 682 | return self.request("posts/add", url=url, description=description, 683 | extended=extended, tags=tags, dt=dt, 684 | replace=replace, shared=shared, **kwds) 685 | 686 | def posts_delete(self, url, **kwds): 687 | """Delete a post from del.icio.us. Returns a `result` message or 688 | raises an ``DeliciousError``. See ``self.request()``. 689 | 690 | &url (required) 691 | the url of the item. 692 | """ 693 | return self.request("posts/delete", url=url, **kwds) 694 | 695 | # Bundles 696 | def bundles_all(self, **kwds): 697 | """Retrieve user bundles from del.icio.us. 698 | :: 699 | 700 | 701 | 702 | """ 703 | return self.request("tags/bundles/all", **kwds) 704 | 705 | def bundles_set(self, bundle, tags, **kwds): 706 | """Assign a set of tags to a single bundle, wipes away previous 707 | settings for bundle. Returns a `result` messages or raises an 708 | ``DeliciousError``. See ``self.request()``. 709 | 710 | &bundle (required) 711 | the bundle name. 712 | &tags (required) 713 | list of tags (space seperated). 714 | """ 715 | if type(tags)==list: 716 | tags = " ".join(tags) 717 | return self.request("tags/bundles/set", bundle=bundle, tags=tags, 718 | **kwds) 719 | 720 | def bundles_delete(self, bundle, **kwds): 721 | """Delete a bundle from del.icio.us. Returns a `result` message or 722 | raises an ``DeliciousError``. See ``self.request()``. 723 | 724 | &bundle (required) 725 | the bundle name. 
726 | """ 727 | return self.request("tags/bundles/delete", bundle=bundle, **kwds) 728 | 729 | ### Utils 730 | 731 | # Lookup table for del.icio.us url-path to DeliciousAPI method. 732 | paths = { 733 | 'tags/get': tags_get, 734 | 'tags/rename': tags_rename, 735 | 'posts/update': posts_update, 736 | 'posts/dates': posts_dates, 737 | 'posts/get': posts_get, 738 | 'posts/recent': posts_recent, 739 | 'posts/all': posts_all, 740 | 'posts/add': posts_add, 741 | 'posts/delete': posts_delete, 742 | 'tags/bundles/all': bundles_all, 743 | 'tags/bundles/set': bundles_set, 744 | 'tags/bundles/delete': bundles_delete, 745 | } 746 | 747 | def get_url(self, url): 748 | """Return the del.icio.us url at which the HTML page with posts for 749 | ``url`` can be found. 750 | """ 751 | return "http://del.icio.us/url/?url=%s" % (url,) 752 | 753 | 754 | ### Convenience functions on this package 755 | 756 | def apiNew(user, passwd): 757 | """creates a new DeliciousAPI object. 758 | requires user(name) and passwd 759 | """ 760 | return DeliciousAPI(user=user, passwd=passwd) 761 | 762 | def add(user, passwd, url, description, tags="", extended="", dt="", replace="no"): 763 | return apiNew(user, passwd).posts_add(url=url, description=description, extended=extended, tags=tags, dt=dt, replace=replace) 764 | 765 | def get(user, passwd, tag="", dt="", count = 0): 766 | posts = apiNew(user, passwd).posts_get(tag=tag,dt=dt) 767 | if count != 0: posts = posts[0:count] 768 | return posts 769 | 770 | def get_all(user, passwd, tag=""): 771 | return apiNew(user, passwd).posts_all(tag=tag) 772 | 773 | def delete(user, passwd, url): 774 | return apiNew(user, passwd).posts_delete(url=url) 775 | 776 | def rename_tag(user, passwd, oldtag, newtag): 777 | return apiNew(user=user, passwd=passwd).tags_rename(old=oldtag, new=newtag) 778 | 779 | def get_tags(user, passwd): 780 | return apiNew(user=user, passwd=passwd).tags_get() 781 | 782 | 783 | ### RSS functions @bvb: still working...? 
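# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): how chapter 2 uses the
# RSS helpers defined below. Each call returns a `posts` list of `post` dicts
# with keys such as 'href', 'description', 'user' and 'tags'. The del.icio.us
# service no longer exists, so these requests will fail today; only the shape
# of the returned data is relevant for deliciousrec.py.
#
#   >>> from pydelicious import get_popular, get_urlposts, get_userposts
#   >>> pop = get_popular(tag='programming')      # most popular posts for a tag
#   >>> pop[0]['href']                            # URL of the top post
#   >>> get_urlposts(pop[0]['href'])[0]['user']   # a user who bookmarked it
# ---------------------------------------------------------------------------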
784 | def getrss(tag="", popular=0, url='', user=""): 785 | """get posts from del.icio.us via parsing RSS @bvb[or HTML] 786 | 787 | @bvb[not tested] 788 | 789 | tag (opt) sort by tag 790 | popular (opt) look for the popular stuff 791 | user (opt) get the posts by a user, this striks popular 792 | url (opt) get the posts by url 793 | """ 794 | return dlcs_rss_request(tag=tag, popular=popular, user=user, url=url) 795 | 796 | def get_userposts(user): 797 | return getrss(user = user) 798 | 799 | def get_tagposts(tag): 800 | return getrss(tag = tag) 801 | 802 | def get_urlposts(url): 803 | return getrss(url = url) 804 | 805 | def get_popular(tag = ""): 806 | return getrss(tag = tag, popular = 1) 807 | 808 | 809 | ### @TODO: implement JSON fetching 810 | def json_posts(user, count=15): 811 | """http://del.icio.us/feeds/json/mpe 812 | http://del.icio.us/feeds/json/mpe/art+history 813 | count=### the number of posts you want to get (default is 15, maximum is 100) 814 | raw a raw JSON object is returned, instead of an object named Delicious.posts 815 | """ 816 | 817 | def json_tags(user, atleast, count, sort='alpha'): 818 | """http://del.icio.us/feeds/json/tags/mpe 819 | atleast=### include only tags for which there are at least ### number of posts 820 | count=### include ### tags, counting down from the top 821 | sort={alpha|count} construct the object with tags in alphabetic order (alpha), or by count of posts (count) 822 | callback=NAME wrap the object definition in a function call NAME(...), thus invoking that function when the feed is executed 823 | raw a pure JSON object is returned, instead of code that will construct an object named Delicious.tags 824 | """ 825 | 826 | def json_network(user): 827 | """http://del.icio.us/feeds/json/network/mpe 828 | callback=NAME wrap the object definition in a function call NAME(...) 829 | ?raw a raw JSON object is returned, instead of an object named Delicious.posts 830 | """ 831 | 832 | def json_fans(user): 833 | """http://del.icio.us/feeds/json/fans/mpe 834 | callback=NAME wrap the object definition in a function call NAME(...) 835 | ?raw a pure JSON object is returned, instead of an object named Delicious. 
836 | """ 837 | 838 | -------------------------------------------------------------------------------- /第02章 提供推荐/recommendations.py: -------------------------------------------------------------------------------- 1 | # A dictionary of movie critics and their ratings of a small 2 | # set of movies 3 | critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5, 4 | 'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 5 | 'The Night Listener': 3.0}, 6 | 'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 7 | 'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0, 8 | 'You, Me and Dupree': 3.5}, 9 | 'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0, 10 | 'Superman Returns': 3.5, 'The Night Listener': 4.0}, 11 | 'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0, 12 | 'The Night Listener': 4.5, 'Superman Returns': 4.0, 13 | 'You, Me and Dupree': 2.5}, 14 | 'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 15 | 'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0, 16 | 'You, Me and Dupree': 2.0}, 17 | 'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 18 | 'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5}, 19 | 'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}} 20 | 21 | 22 | from math import sqrt 23 | 24 | # Returns a distance-based similarity score for person1 and person2 25 | def sim_distance(prefs,person1,person2): 26 | # Get the list of shared_items 27 | si={} 28 | for item in prefs[person1]: 29 | if item in prefs[person2]: si[item]=1 30 | 31 | # if they have no ratings in common, return 0 32 | if len(si)==0: return 0 33 | 34 | # Add up the squares of all the differences 35 | sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2) 36 | for item in prefs[person1] if item in prefs[person2]]) 37 | 38 | return 1/(1+sum_of_squares) 39 | 40 | # Returns the Pearson correlation coefficient for p1 and p2 41 | def sim_pearson(prefs,p1,p2): 42 | # Get the list of mutually rated items 43 | si={} 44 | for item in prefs[p1]: 45 | if item in prefs[p2]: si[item]=1 46 | 47 | # if they are no ratings in common, return 0 48 | if len(si)==0: return 0 49 | 50 | # Sum calculations 51 | n=len(si) 52 | 53 | # Sums of all the preferences 54 | sum1=sum([prefs[p1][it] for it in si]) 55 | sum2=sum([prefs[p2][it] for it in si]) 56 | 57 | # Sums of the squares 58 | sum1Sq=sum([pow(prefs[p1][it],2) for it in si]) 59 | sum2Sq=sum([pow(prefs[p2][it],2) for it in si]) 60 | 61 | # Sum of the products 62 | pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si]) 63 | 64 | # Calculate r (Pearson score) 65 | num=pSum-(sum1*sum2/n) 66 | den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n)) 67 | if den==0: return 0 68 | 69 | r=num/den 70 | 71 | return r 72 | 73 | # Returns the best matches for person from the prefs dictionary. 74 | # Number of results and similarity function are optional params. 
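# ---------------------------------------------------------------------------
# Worked example (added for illustration, not in the original file): the two
# similarity functions above applied to the `critics` dictionary, from a
# Python 2 session started in this chapter's directory.
#
#   >>> import recommendations
#   >>> recommendations.sim_distance(recommendations.critics,
#   ...                              'Lisa Rose', 'Gene Seymour')
#   0.148148...
#   >>> recommendations.sim_pearson(recommendations.critics,
#   ...                             'Lisa Rose', 'Gene Seymour')
#   0.396059...
#
# sim_distance maps the sum of squared rating differences into (0, 1], while
# sim_pearson corrects for critics who consistently rate higher or lower.
# ---------------------------------------------------------------------------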
75 | def topMatches(prefs,person,n=5,similarity=sim_pearson): 76 | scores=[(similarity(prefs,person,other),other) 77 | for other in prefs if other!=person] 78 | scores.sort() 79 | scores.reverse() 80 | return scores[0:n] 81 | 82 | # Gets recommendations for a person by using a weighted average 83 | # of every other user's rankings 84 | def getRecommendations(prefs,person,similarity=sim_pearson): 85 | totals={} 86 | simSums={} 87 | for other in prefs: 88 | # don't compare me to myself 89 | if other==person: continue 90 | sim=similarity(prefs,person,other) 91 | 92 | # ignore scores of zero or lower 93 | if sim<=0: continue 94 | for item in prefs[other]: 95 | 96 | # only score movies I haven't seen yet 97 | if item not in prefs[person] or prefs[person][item]==0: 98 | # Similarity * Score 99 | totals.setdefault(item,0) 100 | totals[item]+=prefs[other][item]*sim 101 | # Sum of similarities 102 | simSums.setdefault(item,0) 103 | simSums[item]+=sim 104 | 105 | # Create the normalized list 106 | rankings=[(total/simSums[item],item) for item,total in totals.items()] 107 | 108 | # Return the sorted list 109 | rankings.sort() 110 | rankings.reverse() 111 | return rankings 112 | 113 | def transformPrefs(prefs): 114 | result={} 115 | for person in prefs: 116 | for item in prefs[person]: 117 | result.setdefault(item,{}) 118 | 119 | # Flip item and person 120 | result[item][person]=prefs[person][item] 121 | return result 122 | 123 | 124 | def calculateSimilarItems(prefs,n=10): 125 | # Create a dictionary of items showing which other items they 126 | # are most similar to. 127 | result={} 128 | # Invert the preference matrix to be item-centric 129 | itemPrefs=transformPrefs(prefs) 130 | c=0 131 | for item in itemPrefs: 132 | # Status updates for large datasets 133 | c+=1 134 | if c%100==0: print "%d / %d" % (c,len(itemPrefs)) 135 | # Find the most similar items to this one 136 | scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance) 137 | result[item]=scores 138 | return result 139 | 140 | def getRecommendedItems(prefs,itemMatch,user): 141 | userRatings=prefs[user] 142 | scores={} 143 | totalSim={} 144 | # Loop over items rated by this user 145 | for (item,rating) in userRatings.items( ): 146 | 147 | # Loop over items similar to this one 148 | for (similarity,item2) in itemMatch[item]: 149 | 150 | # Ignore if this user has already rated this item 151 | if item2 in userRatings: continue 152 | # Weighted sum of rating times similarity 153 | scores.setdefault(item2,0) 154 | scores[item2]+=similarity*rating 155 | # Sum of all the similarities 156 | totalSim.setdefault(item2,0) 157 | totalSim[item2]+=similarity 158 | 159 | # Divide each total score by total weighting to get an average 160 | rankings=[(score/totalSim[item],item) for item,score in scores.items( )] 161 | 162 | # Return the rankings from highest to lowest 163 | rankings.sort( ) 164 | rankings.reverse( ) 165 | return rankings 166 | 167 | def loadMovieLens(path='/data/movielens'): 168 | # Get movie titles 169 | movies={} 170 | for line in open(path+'/u.item'): 171 | (id,title)=line.split('|')[0:2] 172 | movies[id]=title 173 | 174 | # Load data 175 | prefs={} 176 | for line in open(path+'/u.data'): 177 | (user,movieid,rating,ts)=line.split('\t') 178 | prefs.setdefault(user,{}) 179 | prefs[user][movies[movieid]]=float(rating) 180 | return prefs 181 | -------------------------------------------------------------------------------- /第03章 发现群组/Thumbs.db: -------------------------------------------------------------------------------- 
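# ---------------------------------------------------------------------------
# End-to-end sketch for recommendations.py (illustration only, not part of
# the repository): user-based and item-based recommendation from a Python 2
# prompt in the chapter 2 directory.
#
#   >>> import recommendations
#   >>> recommendations.topMatches(recommendations.critics, 'Toby', n=3)
#   >>> recommendations.getRecommendations(recommendations.critics, 'Toby')
#   # 'The Night Listener' should come out on top for Toby.
#
#   >>> itemsim = recommendations.calculateSimilarItems(recommendations.critics)
#   >>> recommendations.getRecommendedItems(recommendations.critics, itemsim, 'Toby')
#
# loadMovieLens() expects the MovieLens 100k files (u.item, u.data) under the
# given path; replace '/data/movielens' with wherever you unpacked them.
# ---------------------------------------------------------------------------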
https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第03章 发现群组/Thumbs.db -------------------------------------------------------------------------------- /第03章 发现群组/clusters.py: -------------------------------------------------------------------------------- 1 | from PIL import Image,ImageDraw 2 | 3 | def readfile(filename): 4 | lines=[line for line in file(filename)] 5 | 6 | # First line is the column titles 7 | colnames=lines[0].strip().split('\t')[1:] 8 | rownames=[] 9 | data=[] 10 | for line in lines[1:]: 11 | p=line.strip().split('\t') 12 | # First column in each row is the rowname 13 | rownames.append(p[0]) 14 | # The data for this row is the remainder of the row 15 | data.append([float(x) for x in p[1:]]) 16 | return rownames,colnames,data 17 | 18 | 19 | from math import sqrt 20 | 21 | def pearson(v1,v2): 22 | # Simple sums 23 | sum1=sum(v1) 24 | sum2=sum(v2) 25 | 26 | # Sums of the squares 27 | sum1Sq=sum([pow(v,2) for v in v1]) 28 | sum2Sq=sum([pow(v,2) for v in v2]) 29 | 30 | # Sum of the products 31 | pSum=sum([v1[i]*v2[i] for i in range(len(v1))]) 32 | 33 | # Calculate r (Pearson score) 34 | num=pSum-(sum1*sum2/len(v1)) 35 | den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1))) 36 | if den==0: return 0 37 | 38 | return 1.0-num/den 39 | 40 | class bicluster: 41 | def __init__(self,vec,left=None,right=None,distance=0.0,id=None): 42 | self.left=left 43 | self.right=right 44 | self.vec=vec 45 | self.id=id 46 | self.distance=distance 47 | 48 | def hcluster(rows,distance=pearson): 49 | distances={} 50 | currentclustid=-1 51 | 52 | # Clusters are initially just the rows 53 | clust=[bicluster(rows[i],id=i) for i in range(len(rows))] 54 | 55 | while len(clust)>1: 56 | lowestpair=(0,1) 57 | closest=distance(clust[0].vec,clust[1].vec) 58 | 59 | # loop through every pair looking for the smallest distance 60 | for i in range(len(clust)): 61 | for j in range(i+1,len(clust)): 62 | # distances is the cache of distance calculations 63 | if (clust[i].id,clust[j].id) not in distances: 64 | distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec) 65 | 66 | d=distances[(clust[i].id,clust[j].id)] 67 | 68 | if d0: 205 | for rowid in bestmatches[i]: 206 | for m in range(len(rows[rowid])): 207 | avgs[m]+=rows[rowid][m] 208 | for j in range(len(avgs)): 209 | avgs[j]/=len(bestmatches[i]) 210 | clusters[i]=avgs 211 | 212 | return bestmatches 213 | 214 | def tanamoto(v1,v2): 215 | c1,c2,shr=0,0,0 216 | 217 | for i in range(len(v1)): 218 | if v1[i]!=0: c1+=1 # in v1 219 | if v2[i]!=0: c2+=1 # in v2 220 | if v1[i]!=0 and v2[i]!=0: shr+=1 # in both 221 | 222 | return 1.0-(float(shr)/(c1+c2-shr)) 223 | 224 | def scaledown(data,distance=pearson,rate=0.01): 225 | n=len(data) 226 | 227 | # The real distances between every pair of items 228 | realdist=[[distance(data[i],data[j]) for j in range(n)] 229 | for i in range(0,n)] 230 | 231 | # Randomly initialize the starting points of the locations in 2D 232 | loc=[[random.random(),random.random()] for i in range(n)] 233 | fakedist=[[0.0 for j in range(n)] for i in range(n)] 234 | 235 | lasterror=None 236 | for m in range(0,1000): 237 | # Find projected distances 238 | for i in range(n): 239 | for j in range(n): 240 | fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2) 241 | for x in range(len(loc[i]))])) 242 | 243 | # Move points 244 | grad=[[0.0,0.0] for i in range(n)] 245 | 246 | totalerror=0 247 | for k in range(n): 248 | for j in range(n): 249 | if j==k: 
continue 250 | # The error is percent difference between the distances 251 | errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k] 252 | 253 | # Each point needs to be moved away from or towards the other 254 | # point in proportion to how much error it has 255 | grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm 256 | grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm 257 | 258 | # Keep track of the total error 259 | totalerror+=abs(errorterm) 260 | print totalerror 261 | 262 | # If the answer got worse by moving the points, we are done 263 | if lasterror and lasterror10: 35 | out.write(item) 36 | for user in range(0,currentuser): 37 | if user in owners: out.write('\t1') 38 | else: out.write('\t0') 39 | out.write('\n') 40 | -------------------------------------------------------------------------------- /第03章 发现群组/feedlist.txt: -------------------------------------------------------------------------------- 1 | http://feeds.feedburner.com/37signals/beMH 2 | http://feeds.feedburner.com/blogspot/bRuz 3 | http://battellemedia.com/index.xml 4 | http://blog.guykawasaki.com/index.rdf 5 | http://blog.outer-court.com/rss.xml 6 | http://feeds.searchenginewatch.com/sewblog 7 | http://blog.topix.net/index.rdf 8 | http://blogs.abcnews.com/theblotter/index.rdf 9 | http://feeds.feedburner.com/ConsumingExperienceFull 10 | http://flagrantdisregard.com/index.php/feed/ 11 | http://featured.gigaom.com/feed/ 12 | http://gizmodo.com/index.xml 13 | http://gofugyourself.typepad.com/go_fug_yourself/index.rdf 14 | http://googleblog.blogspot.com/rss.xml 15 | http://feeds.feedburner.com/GoogleOperatingSystem 16 | http://headrush.typepad.com/creating_passionate_users/index.rdf 17 | http://feeds.feedburner.com/instapundit/main 18 | http://jeremy.zawodny.com/blog/rss2.xml 19 | http://joi.ito.com/index.rdf 20 | http://feeds.feedburner.com/Mashable 21 | http://michellemalkin.com/index.rdf 22 | http://moblogsmoproblems.blogspot.com/rss.xml 23 | http://newsbusters.org/node/feed 24 | http://beta.blogger.com/feeds/27154654/posts/full?alt=rss 25 | http://feeds.feedburner.com/paulstamatiou 26 | http://powerlineblog.com/index.rdf 27 | http://feeds.feedburner.com/Publishing20 28 | http://radar.oreilly.com/index.rdf 29 | http://scienceblogs.com/pharyngula/index.xml 30 | http://scobleizer.wordpress.com/feed/ 31 | http://sethgodin.typepad.com/seths_blog/index.rdf 32 | http://rss.slashdot.org/Slashdot/slashdot 33 | http://thinkprogress.org/feed/ 34 | http://feeds.feedburner.com/andrewsullivan/rApM 35 | http://wilwheaton.typepad.com/wwdnbackup/index.rdf 36 | http://www.43folders.com/feed/ 37 | http://www.456bereastreet.com/feed.xml 38 | http://www.autoblog.com/rss.xml 39 | http://www.bloggersblog.com/rss.xml 40 | http://www.bloglines.com/rss/about/news 41 | http://www.blogmaverick.com/rss.xml 42 | http://www.boingboing.net/index.rdf 43 | http://www.buzzmachine.com/index.xml 44 | http://www.captainsquartersblog.com/mt/index.rdf 45 | http://www.coolhunting.com/index.rdf 46 | http://feeds.copyblogger.com/Copyblogger 47 | http://feeds.feedburner.com/crooksandliars/YaCP 48 | http://feeds.dailykos.com/dailykos/index.xml 49 | http://www.deadspin.com/index.xml 50 | http://www.downloadsquad.com/rss.xml 51 | http://www.engadget.com/rss.xml 52 | http://www.gapingvoid.com/index.rdf 53 | http://www.gawker.com/index.xml 54 | http://www.gothamist.com/index.rdf 55 | http://www.huffingtonpost.com/raw_feed_index.rdf 56 | http://www.hyperorg.com/blogger/index.rdf 57 | http://www.joelonsoftware.com/rss.xml 58 | 
http://www.joystiq.com/rss.xml 59 | http://www.kotaku.com/index.xml 60 | http://feeds.kottke.org/main 61 | http://www.lifehack.org/feed/ 62 | http://www.lifehacker.com/index.xml 63 | http://littlegreenfootballs.com/weblog/lgf-rss.php 64 | http://www.makezine.com/blog/index.xml 65 | http://www.mattcutts.com/blog/feed/ 66 | http://xml.metafilter.com/rss.xml 67 | http://www.mezzoblue.com/rss/index.xml 68 | http://www.micropersuasion.com/index.rdf 69 | http://www.neilgaiman.com/journal/feed/rss.xml 70 | http://www.oilman.ca/feed/ 71 | http://www.perezhilton.com/index.xml 72 | http://www.plasticbag.org/index.rdf 73 | http://www.powazek.com/rss.xml 74 | http://www.problogger.net/feed/ 75 | http://feeds.feedburner.com/QuickOnlineTips 76 | http://www.readwriteweb.com/rss.xml 77 | http://www.schneier.com/blog/index.rdf 78 | http://scienceblogs.com/sample/combined.xml 79 | http://www.seroundtable.com/index.rdf 80 | http://www.shoemoney.com/feed/ 81 | http://www.sifry.com/alerts/index.rdf 82 | http://www.simplebits.com/xml/rss.xml 83 | http://feeds.feedburner.com/Spikedhumor 84 | http://www.stevepavlina.com/blog/feed 85 | http://www.talkingpointsmemo.com/index.xml 86 | http://www.tbray.org/ongoing/ongoing.rss 87 | http://feeds.feedburner.com/TechCrunch 88 | http://www.techdirt.com/techdirt_rss.xml 89 | http://www.techeblog.com/index.php/feed/ 90 | http://www.thesuperficial.com/index.xml 91 | http://www.tmz.com/rss.xml 92 | http://www.treehugger.com/index.rdf 93 | http://www.tuaw.com/rss.xml 94 | http://www.valleywag.com/index.xml 95 | http://www.we-make-money-not-art.com/index.rdf 96 | http://www.wired.com/rss/index.xml 97 | http://www.wonkette.com/index.xml 98 | -------------------------------------------------------------------------------- /第03章 发现群组/generatefeedvector.py: -------------------------------------------------------------------------------- 1 | import feedparser 2 | import re 3 | 4 | # Returns title and dictionary of word counts for an RSS feed 5 | def getwordcounts(url): 6 | # Parse the feed 7 | d=feedparser.parse(url) 8 | wc={} 9 | 10 | # Loop over all the entries 11 | for e in d.entries: 12 | if 'summary' in e: summary=e.summary 13 | else: summary=e.description 14 | 15 | # Extract a list of words 16 | words=getwords(e.title+' '+summary) 17 | for word in words: 18 | wc.setdefault(word,0) 19 | wc[word]+=1 20 | return d.feed.title,wc 21 | 22 | def getwords(html): 23 | # Remove all the HTML tags 24 | txt=re.compile(r'<[^>]+>').sub('',html) 25 | 26 | # Split words by all non-alpha characters 27 | words=re.compile(r'[^A-Z^a-z]+').split(txt) 28 | 29 | # Convert to lowercase 30 | return [word.lower() for word in words if word!=''] 31 | 32 | 33 | apcount={} 34 | wordcounts={} 35 | feedlist=[line for line in file('feedlist.txt')] 36 | for feedurl in feedlist: 37 | try: 38 | title,wc=getwordcounts(feedurl) 39 | wordcounts[title]=wc 40 | for word,count in wc.items(): 41 | apcount.setdefault(word,0) 42 | if count>1: 43 | apcount[word]+=1 44 | except: 45 | print 'Failed to parse feed %s' % feedurl 46 | 47 | wordlist=[] 48 | for w,bc in apcount.items(): 49 | frac=float(bc)/len(feedlist) 50 | if frac>0.1 and frac<0.5: 51 | wordlist.append(w) 52 | 53 | out=file('blogdata1.txt','w') 54 | out.write('Blog') 55 | for word in wordlist: out.write('\t%s' % word) 56 | out.write('\n') 57 | for blog,wc in wordcounts.items(): 58 | print blog 59 | out.write(blog) 60 | for word in wordlist: 61 | if word in wc: out.write('\t%d' % wc[word]) 62 | else: out.write('\t0') 63 | out.write('\n') 64 | 
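# ---------------------------------------------------------------------------
# Usage sketch for the chapter 3 scripts above (added for illustration).
# Running generatefeedvector.py downloads every feed in feedlist.txt and
# writes a word-count matrix to blogdata1.txt; the repository already ships a
# pre-built blogdata.txt, which clusters.py reads directly. From a Python 2
# prompt in this directory:
#
#   >>> import clusters
#   >>> blognames, words, data = clusters.readfile('blogdata.txt')
#   >>> clust = clusters.hcluster(data)              # hierarchical clustering
#   >>> kclust = clusters.kcluster(data, k=10)       # k-means clustering
#   >>> [blognames[r] for r in kclust[0]]            # members of the first cluster
#
# The book also renders the hierarchical result with
# clusters.drawdendrogram(clust, blognames, jpeg='blogclust.jpg'), which
# requires PIL (the Python Imaging Library) to be installed.
# ---------------------------------------------------------------------------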
-------------------------------------------------------------------------------- /第04章 搜索与排名/nn.py: -------------------------------------------------------------------------------- 1 | from math import tanh 2 | from pysqlite2 import dbapi2 as sqlite 3 | 4 | def dtanh(y): 5 | return 1.0-y*y 6 | 7 | class searchnet: 8 | def __init__(self,dbname): 9 | self.con=sqlite.connect(dbname) 10 | 11 | def __del__(self): 12 | self.con.close() 13 | 14 | def maketables(self): 15 | self.con.execute('create table hiddennode(create_key)') 16 | self.con.execute('create table wordhidden(fromid,toid,strength)') 17 | self.con.execute('create table hiddenurl(fromid,toid,strength)') 18 | self.con.commit() 19 | 20 | def getstrength(self,fromid,toid,layer): 21 | if layer==0: table='wordhidden' 22 | else: table='hiddenurl' 23 | res=self.con.execute('select strength from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchone() 24 | if res==None: 25 | if layer==0: return -0.2 26 | if layer==1: return 0 27 | return res[0] 28 | 29 | def setstrength(self,fromid,toid,layer,strength): 30 | if layer==0: table='wordhidden' 31 | else: table='hiddenurl' 32 | res=self.con.execute('select rowid from %s where fromid=%d and toid=%d' % (table,fromid,toid)).fetchone() 33 | if res==None: 34 | self.con.execute('insert into %s (fromid,toid,strength) values (%d,%d,%f)' % (table,fromid,toid,strength)) 35 | else: 36 | rowid=res[0] 37 | self.con.execute('update %s set strength=%f where rowid=%d' % (table,strength,rowid)) 38 | 39 | def generatehiddennode(self,wordids,urls): 40 | if len(wordids)>3: return None 41 | # Check if we already created a node for this set of words 42 | sorted_words=[str(id) for id in wordids] 43 | sorted_words.sort() 44 | createkey='_'.join(sorted_words) 45 | res=self.con.execute( 46 | "select rowid from hiddennode where create_key='%s'" % createkey).fetchone() 47 | 48 | # If not, create it 49 | if res==None: 50 | cur=self.con.execute( 51 | "insert into hiddennode (create_key) values ('%s')" % createkey) 52 | hiddenid=cur.lastrowid 53 | # Put in some default weights 54 | for wordid in wordids: 55 | self.setstrength(wordid,hiddenid,0,1.0/len(wordids)) 56 | for urlid in urls: 57 | self.setstrength(hiddenid,urlid,1,0.1) 58 | self.con.commit() 59 | 60 | def getallhiddenids(self,wordids,urlids): 61 | l1={} 62 | for wordid in wordids: 63 | cur=self.con.execute( 64 | 'select toid from wordhidden where fromid=%d' % wordid) 65 | for row in cur: l1[row[0]]=1 66 | for urlid in urlids: 67 | cur=self.con.execute( 68 | 'select fromid from hiddenurl where toid=%d' % urlid) 69 | for row in cur: l1[row[0]]=1 70 | return l1.keys() 71 | 72 | def setupnetwork(self,wordids,urlids): 73 | # value lists 74 | self.wordids=wordids 75 | self.hiddenids=self.getallhiddenids(wordids,urlids) 76 | self.urlids=urlids 77 | 78 | # node outputs 79 | self.ai = [1.0]*len(self.wordids) 80 | self.ah = [1.0]*len(self.hiddenids) 81 | self.ao = [1.0]*len(self.urlids) 82 | 83 | # create weights matrix 84 | self.wi = [[self.getstrength(wordid,hiddenid,0) 85 | for hiddenid in self.hiddenids] 86 | for wordid in self.wordids] 87 | self.wo = [[self.getstrength(hiddenid,urlid,1) 88 | for urlid in self.urlids] 89 | for hiddenid in self.hiddenids] 90 | 91 | def feedforward(self): 92 | # the only inputs are the query words 93 | for i in range(len(self.wordids)): 94 | self.ai[i] = 1.0 95 | 96 | # hidden activations 97 | for j in range(len(self.hiddenids)): 98 | sum = 0.0 99 | for i in range(len(self.wordids)): 100 | sum = sum + self.ai[i] * self.wi[i][j] 101 | 
self.ah[j] = tanh(sum) 102 | 103 | # output activations 104 | for k in range(len(self.urlids)): 105 | sum = 0.0 106 | for j in range(len(self.hiddenids)): 107 | sum = sum + self.ah[j] * self.wo[j][k] 108 | self.ao[k] = tanh(sum) 109 | 110 | return self.ao[:] 111 | 112 | def getresult(self,wordids,urlids): 113 | self.setupnetwork(wordids,urlids) 114 | return self.feedforward() 115 | 116 | def backPropagate(self, targets, N=0.5): 117 | # calculate errors for output 118 | output_deltas = [0.0] * len(self.urlids) 119 | for k in range(len(self.urlids)): 120 | error = targets[k]-self.ao[k] 121 | output_deltas[k] = dtanh(self.ao[k]) * error 122 | 123 | # calculate errors for hidden layer 124 | hidden_deltas = [0.0] * len(self.hiddenids) 125 | for j in range(len(self.hiddenids)): 126 | error = 0.0 127 | for k in range(len(self.urlids)): 128 | error = error + output_deltas[k]*self.wo[j][k] 129 | hidden_deltas[j] = dtanh(self.ah[j]) * error 130 | 131 | # update output weights 132 | for j in range(len(self.hiddenids)): 133 | for k in range(len(self.urlids)): 134 | change = output_deltas[k]*self.ah[j] 135 | self.wo[j][k] = self.wo[j][k] + N*change 136 | 137 | # update input weights 138 | for i in range(len(self.wordids)): 139 | for j in range(len(self.hiddenids)): 140 | change = hidden_deltas[j]*self.ai[i] 141 | self.wi[i][j] = self.wi[i][j] + N*change 142 | 143 | def trainquery(self,wordids,urlids,selectedurl): 144 | # generate a hidden node if necessary 145 | self.generatehiddennode(wordids,urlids) 146 | 147 | self.setupnetwork(wordids,urlids) 148 | self.feedforward() 149 | targets=[0.0]*len(urlids) 150 | targets[urlids.index(selectedurl)]=1.0 151 | error = self.backPropagate(targets) 152 | self.updatedatabase() 153 | 154 | def updatedatabase(self): 155 | # set them to database values 156 | for i in range(len(self.wordids)): 157 | for j in range(len(self.hiddenids)): 158 | self.setstrength(self.wordids[i],self. 
hiddenids[j],0,self.wi[i][j]) 159 | for j in range(len(self.hiddenids)): 160 | for k in range(len(self.urlids)): 161 | self.setstrength(self.hiddenids[j],self.urlids[k],1,self.wo[j][k]) 162 | self.con.commit() 163 | -------------------------------------------------------------------------------- /第04章 搜索与排名/searchengine.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | from BeautifulSoup import * 3 | from urlparse import urljoin 4 | from pysqlite2 import dbapi2 as sqlite 5 | import nn 6 | mynet=nn.searchnet('nn.db') 7 | 8 | # Create a list of words to ignore 9 | ignorewords={'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1} 10 | 11 | 12 | class crawler: 13 | # Initialize the crawler with the name of database 14 | def __init__(self,dbname): 15 | self.con=sqlite.connect(dbname) 16 | 17 | def __del__(self): 18 | self.con.close() 19 | 20 | def dbcommit(self): 21 | self.con.commit() 22 | 23 | # Auxilliary function for getting an entry id and adding 24 | # it if it's not present 25 | def getentryid(self,table,field,value,createnew=True): 26 | cur=self.con.execute( 27 | "select rowid from %s where %s='%s'" % (table,field,value)) 28 | res=cur.fetchone() 29 | if res==None: 30 | cur=self.con.execute( 31 | "insert into %s (%s) values ('%s')" % (table,field,value)) 32 | return cur.lastrowid 33 | else: 34 | return res[0] 35 | 36 | 37 | # Index an individual page 38 | def addtoindex(self,url,soup): 39 | if self.isindexed(url): return 40 | print 'Indexing '+url 41 | 42 | # Get the individual words 43 | text=self.gettextonly(soup) 44 | words=self.separatewords(text) 45 | 46 | # Get the URL id 47 | urlid=self.getentryid('urllist','url',url) 48 | 49 | # Link each word to this url 50 | for i in range(len(words)): 51 | word=words[i] 52 | if word in ignorewords: continue 53 | wordid=self.getentryid('wordlist','word',word) 54 | self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i)) 55 | 56 | 57 | 58 | # Extract the text from an HTML page (no tags) 59 | def gettextonly(self,soup): 60 | v=soup.string 61 | if v==Null: 62 | c=soup.contents 63 | resulttext='' 64 | for t in c: 65 | subtext=self.gettextonly(t) 66 | resulttext+=subtext+'\n' 67 | return resulttext 68 | else: 69 | return v.strip() 70 | 71 | # Seperate the words by any non-whitespace character 72 | def separatewords(self,text): 73 | splitter=re.compile('\\W*') 74 | return [s.lower() for s in splitter.split(text) if s!=''] 75 | 76 | 77 | # Return true if this url is already indexed 78 | def isindexed(self,url): 79 | return False 80 | 81 | # Add a link between two pages 82 | def addlinkref(self,urlFrom,urlTo,linkText): 83 | words=self.separateWords(linkText) 84 | fromid=self.getentryid('urllist','url',urlFrom) 85 | toid=self.getentryid('urllist','url',urlTo) 86 | if fromid==toid: return 87 | cur=self.con.execute("insert into link(fromid,toid) values (%d,%d)" % (fromid,toid)) 88 | linkid=cur.lastrowid 89 | for word in words: 90 | if word in ignorewords: continue 91 | wordid=self.getentryid('wordlist','word',word) 92 | self.con.execute("insert into linkwords(linkid,wordid) values (%d,%d)" % (linkid,wordid)) 93 | 94 | # Starting with a list of pages, do a breadth 95 | # first search to the given depth, indexing pages 96 | # as we go 97 | def crawl(self,pages,depth=2): 98 | for i in range(depth): 99 | newpages={} 100 | for page in pages: 101 | try: 102 | c=urllib2.urlopen(page) 103 | except: 104 | print "Could not open %s" % page 105 | continue 
106 | try: 107 | soup=BeautifulSoup(c.read()) 108 | self.addtoindex(page,soup) 109 | 110 | links=soup('a') 111 | for link in links: 112 | if ('href' in dict(link.attrs)): 113 | url=urljoin(page,link['href']) 114 | if url.find("'")!=-1: continue 115 | url=url.split('#')[0] # remove location portion 116 | if url[0:4]=='http' and not self.isindexed(url): 117 | newpages[url]=1 118 | linkText=self.gettextonly(link) 119 | self.addlinkref(page,url,linkText) 120 | 121 | self.dbcommit() 122 | except: 123 | print "Could not parse page %s" % page 124 | 125 | pages=newpages 126 | 127 | 128 | # Create the database tables 129 | def createindextables(self): 130 | self.con.execute('create table urllist(url)') 131 | self.con.execute('create table wordlist(word)') 132 | self.con.execute('create table wordlocation(urlid,wordid,location)') 133 | self.con.execute('create table link(fromid integer,toid integer)') 134 | self.con.execute('create table linkwords(wordid,linkid)') 135 | self.con.execute('create index wordidx on wordlist(word)') 136 | self.con.execute('create index urlidx on urllist(url)') 137 | self.con.execute('create index wordurlidx on wordlocation(wordid)') 138 | self.con.execute('create index urltoidx on link(toid)') 139 | self.con.execute('create index urlfromidx on link(fromid)') 140 | self.dbcommit() 141 | 142 | def calculatepagerank(self,iterations=20): 143 | # clear out the current page rank tables 144 | self.con.execute('drop table if exists pagerank') 145 | self.con.execute('create table pagerank(urlid primary key,score)') 146 | 147 | # initialize every url with a page rank of 1 148 | for (urlid,) in self.con.execute('select rowid from urllist'): 149 | self.con.execute('insert into pagerank(urlid,score) values (%d,1.0)' % urlid) 150 | self.dbcommit() 151 | 152 | for i in range(iterations): 153 | print "Iteration %d" % (i) 154 | for (urlid,) in self.con.execute('select rowid from urllist'): 155 | pr=0.15 156 | 157 | # Loop through all the pages that link to this one 158 | for (linker,) in self.con.execute( 159 | 'select distinct fromid from link where toid=%d' % urlid): 160 | # Get the page rank of the linker 161 | linkingpr=self.con.execute( 162 | 'select score from pagerank where urlid=%d' % linker).fetchone()[0] 163 | 164 | # Get the total number of links from the linker 165 | linkingcount=self.con.execute( 166 | 'select count(*) from link where fromid=%d' % linker).fetchone()[0] 167 | pr+=0.85*(linkingpr/linkingcount) 168 | self.con.execute( 169 | 'update pagerank set score=%f where urlid=%d' % (pr,urlid)) 170 | self.dbcommit() 171 | 172 | class searcher: 173 | def __init__(self,dbname): 174 | self.con=sqlite.connect(dbname) 175 | 176 | def __del__(self): 177 | self.con.close() 178 | 179 | def getmatchrows(self,q): 180 | # Strings to build the query 181 | fieldlist='w0.urlid' 182 | tablelist='' 183 | clauselist='' 184 | wordids=[] 185 | 186 | # Split the words by spaces 187 | words=q.split(' ') 188 | tablenumber=0 189 | 190 | for word in words: 191 | # Get the word ID 192 | wordrow=self.con.execute( 193 | "select rowid from wordlist where word='%s'" % word).fetchone() 194 | if wordrow!=None: 195 | wordid=wordrow[0] 196 | wordids.append(wordid) 197 | if tablenumber>0: 198 | tablelist+=',' 199 | clauselist+=' and ' 200 | clauselist+='w%d.urlid=w%d.urlid and ' % (tablenumber-1,tablenumber) 201 | fieldlist+=',w%d.location' % tablenumber 202 | tablelist+='wordlocation w%d' % tablenumber 203 | clauselist+='w%d.wordid=%d' % (tablenumber,wordid) 204 | tablenumber+=1 205 | 206 | # Create the 
query from the separate parts 207 | fullquery='select %s from %s where %s' % (fieldlist,tablelist,clauselist) 208 | print fullquery 209 | cur=self.con.execute(fullquery) 210 | rows=[row for row in cur] 211 | 212 | return rows,wordids 213 | 214 | def getscoredlist(self,rows,wordids): 215 | totalscores=dict([(row[0],0) for row in rows]) 216 | 217 | # This is where we'll put our scoring functions 218 | weights=[(1.0,self.locationscore(rows)), 219 | (1.0,self.frequencyscore(rows)), 220 | (1.0,self.pagerankscore(rows)), 221 | (1.0,self.linktextscore(rows,wordids)), 222 | (5.0,self.nnscore(rows,wordids))] 223 | for (weight,scores) in weights: 224 | for url in totalscores: 225 | totalscores[url]+=weight*scores[url] 226 | 227 | return totalscores 228 | 229 | def geturlname(self,id): 230 | return self.con.execute( 231 | "select url from urllist where rowid=%d" % id).fetchone()[0] 232 | 233 | def query(self,q): 234 | rows,wordids=self.getmatchrows(q) 235 | scores=self.getscoredlist(rows,wordids) 236 | rankedscores=[(score,url) for (url,score) in scores.items()] 237 | rankedscores.sort() 238 | rankedscores.reverse() 239 | for (score,urlid) in rankedscores[0:10]: 240 | print '%f\t%s' % (score,self.geturlname(urlid)) 241 | return wordids,[r[1] for r in rankedscores[0:10]] 242 | 243 | def normalizescores(self,scores,smallIsBetter=0): 244 | vsmall=0.00001 # Avoid division by zero errors 245 | if smallIsBetter: 246 | minscore=min(scores.values()) 247 | return dict([(u,float(minscore)/max(vsmall,l)) for (u,l) in scores.items()]) 248 | else: 249 | maxscore=max(scores.values()) 250 | if maxscore==0: maxscore=vsmall 251 | return dict([(u,float(c)/maxscore) for (u,c) in scores.items()]) 252 | 253 | def frequencyscore(self,rows): 254 | counts=dict([(row[0],0) for row in rows]) 255 | for row in rows: counts[row[0]]+=1 256 | return self.normalizescores(counts) 257 | 258 | def locationscore(self,rows): 259 | locations=dict([(row[0],1000000) for row in rows]) 260 | for row in rows: 261 | loc=sum(row[1:]) 262 | if locxxxxxxxx 15 | sid=doc.getElementsByTagName('sid')[0].firstChild.data 16 | return sid 17 | 18 | def flightsearch(sid,origin,destination,depart_date): 19 | 20 | # Construct search URL 21 | url='http://www.kayak.com/s/apisearch?basicmode=true&oneway=y&origin=%s' % origin 22 | url+='&destination=%s&depart_date=%s' % (destination,depart_date) 23 | url+='&return_date=none&depart_time=a&return_time=a' 24 | url+='&travelers=1&cabin=e&action=doFlights&apimode=1' 25 | url+='&_sid_=%s&version=1' % (sid) 26 | 27 | # Get the XML 28 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 29 | 30 | # Extract the search ID 31 | searchid=doc.getElementsByTagName('searchid')[0].firstChild.data 32 | 33 | return searchid 34 | 35 | def flightsearchresults(sid,searchid): 36 | def parseprice(p): 37 | return float(p[1:].replace(',','')) 38 | 39 | # Polling loop 40 | while 1: 41 | time.sleep(2) 42 | 43 | # Construct URL for polling 44 | url='http://www.kayak.com/s/basic/flight?' 45 | url+='searchid=%s&c=5&apimode=1&_sid_=%s&version=1' % (searchid,sid) 46 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 47 | 48 | # Look for morepending tag, and wait until it is no longer true 49 | morepending=doc.getElementsByTagName('morepending')[0].firstChild 50 | if morepending==None or morepending.data=='false': break 51 | 52 | # Now download the complete list 53 | url='http://www.kayak.com/s/basic/flight?' 
54 | url+='searchid=%s&c=999&apimode=1&_sid_=%s&version=1' % (searchid,sid) 55 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 56 | 57 | # Get the various elements as lists 58 | prices=doc.getElementsByTagName('price') 59 | departures=doc.getElementsByTagName('depart') 60 | arrivals=doc.getElementsByTagName('arrive') 61 | 62 | # Zip them together 63 | return zip([p.firstChild.data.split(' ')[1] for p in departures], 64 | [p.firstChild.data.split(' ')[1] for p in arrivals], 65 | [parseprice(p.firstChild.data) for p in prices]) 66 | 67 | 68 | def createschedule(people,dest,dep,ret): 69 | # Get a session id for these searches 70 | sid=getkayaksession() 71 | flights={} 72 | 73 | for p in people: 74 | name,origin=p 75 | # Outbound flight 76 | searchid=flightsearch(sid,origin,dest,dep) 77 | flights[(origin,dest)]=flightsearchresults(sid,searchid) 78 | 79 | # Return flight 80 | searchid=flightsearch(sid,dest,origin,ret) 81 | flights[(dest,origin)]=flightsearchresults(sid,searchid) 82 | 83 | return flights 84 | -------------------------------------------------------------------------------- /第05章 优化/optimization.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import math 4 | 5 | people = [('Seymour','BOS'), 6 | ('Franny','DAL'), 7 | ('Zooey','CAK'), 8 | ('Walt','MIA'), 9 | ('Buddy','ORD'), 10 | ('Les','OMA')] 11 | # Laguardia 12 | destination='LGA' 13 | 14 | flights={} 15 | # 16 | for line in file('schedule.txt'): 17 | origin,dest,depart,arrive,price=line.strip().split(',') 18 | flights.setdefault((origin,dest),[]) 19 | 20 | # Add details to the list of possible flights 21 | flights[(origin,dest)].append((depart,arrive,int(price))) 22 | 23 | def getminutes(t): 24 | x=time.strptime(t,'%H:%M') 25 | return x[3]*60+x[4] 26 | 27 | def printschedule(r): 28 | for d in range(len(r)/2): 29 | name=people[d][0] 30 | origin=people[d][1] 31 | out=flights[(origin,destination)][int(r[d])] 32 | ret=flights[(destination,origin)][int(r[d+1])] 33 | print '%10s%10s %5s-%5s $%3s %5s-%5s $%3s' % (name,origin, 34 | out[0],out[1],out[2], 35 | ret[0],ret[1],ret[2]) 36 | 37 | def schedulecost(sol): 38 | totalprice=0 39 | latestarrival=0 40 | earliestdep=24*60 41 | 42 | for d in range(len(sol)/2): 43 | # Get the inbound and outbound flights 44 | origin=people[d][1] 45 | outbound=flights[(origin,destination)][int(sol[d])] 46 | returnf=flights[(destination,origin)][int(sol[d+1])] 47 | 48 | # Total price is the price of all outbound and return flights 49 | totalprice+=outbound[2] 50 | totalprice+=returnf[2] 51 | 52 | # Track the latest arrival and earliest departure 53 | if latestarrivalgetminutes(returnf[0]): earliestdep=getminutes(returnf[0]) 55 | 56 | # Every person must wait at the airport until the latest person arrives. 57 | # They also must arrive at the same time and wait for their flights. 58 | totalwait=0 59 | for d in range(len(sol)/2): 60 | origin=people[d][1] 61 | outbound=flights[(origin,destination)][int(sol[d])] 62 | returnf=flights[(destination,origin)][int(sol[d+1])] 63 | totalwait+=latestarrival-getminutes(outbound[1]) 64 | totalwait+=getminutes(returnf[0])-earliestdep 65 | 66 | # Does this solution require an extra day of car rental? That'll be $50! 
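`printschedule` and `schedulecost` (the latter finishes just below) read a candidate solution as a flat list of flight indices into the `flights` dict built from schedule.txt. Note that this copy has lost text between a `<` and the next `>` in several places, so the two comparisons under "# Track the latest arrival and earliest departure" collapsed into one garbled line; a hedged reconstruction and a usage sketch with made-up indices:

```python
# Hedged reconstruction of the damaged comparisons above (book version, not a verbatim quote):
#   if latestarrival < getminutes(outbound[1]): latestarrival = getminutes(outbound[1])
#   if earliestdep > getminutes(returnf[0]): earliestdep = getminutes(returnf[0])

# A candidate solution: one outbound and one return index per traveller (hypothetical values).
s = [1, 4, 3, 2, 7, 3, 6, 3, 2, 4, 5, 3]
printschedule(s)        # name, origin, outbound times/price, return times/price per person
print schedulecost(s)   # ticket prices + waiting time (+ the $50 car-rental penalty below)
```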
67 | if latestarrival>earliestdep: totalprice+=50 68 | 69 | return totalprice+totalwait 70 | 71 | def randomoptimize(domain,costf): 72 | best=999999999 73 | bestr=None 74 | for i in range(0,1000): 75 | # Create a random solution 76 | r=[float(random.randint(domain[i][0],domain[i][1])) 77 | for i in range(len(domain))] 78 | 79 | # Get the cost 80 | cost=costf(r) 81 | 82 | # Compare it to the best one so far 83 | if costdomain[j][0]: 100 | neighbors.append(sol[0:j]+[sol[j]+1]+sol[j+1:]) 101 | if sol[j]0.1: 124 | # Choose one of the indices 125 | i=random.randint(0,len(domain)-1) 126 | 127 | # Choose a direction to change it 128 | dir=random.randint(-step,step) 129 | 130 | # Create a new list with one of the values changed 131 | vecb=vec[:] 132 | vecb[i]+=dir 133 | if vecb[i]domain[i][1]: vecb[i]=domain[i][1] 135 | 136 | # Calculate the current cost and the new cost 137 | ea=costf(vec) 138 | eb=costf(vecb) 139 | p=pow(math.e,(-eb-ea)/T) 140 | 141 | # Is it better, or does it make the probability 142 | # cutoff? 143 | if (ebdomain[i][0]: 156 | return vec[0:i]+[vec[i]-step]+vec[i+1:] 157 | elif vec[i]0 and ua<1 and ub>0 and ub<1: 45 | total+=1 46 | for i in range(len(people)): 47 | for j in range(i+1,len(people)): 48 | # Get the locations of the two nodes 49 | (x1,y1),(x2,y2)=loc[people[i]],loc[people[j]] 50 | 51 | # Find the distance between them 52 | dist=math.sqrt(math.pow(x1-x2,2)+math.pow(y1-y2,2)) 53 | # Penalize any nodes closer than 50 pixels 54 | if dist<50: 55 | total+=(1.0-(dist/50.0)) 56 | 57 | return total 58 | from PIL import Image,ImageDraw 59 | 60 | def drawnetwork(sol): 61 | # Create the image 62 | img=Image.new('RGB',(400,400),(255,255,255)) 63 | draw=ImageDraw.Draw(img) 64 | 65 | # Create the position dict 66 | pos=dict([(people[i],(sol[i*2],sol[i*2+1])) for i in range(0,len(people))]) 67 | 68 | for (a,b) in links: 69 | draw.line((pos[a],pos[b]),fill=(255,0,0)) 70 | 71 | for n,p in pos.items(): 72 | draw.text(p,n,(0,0,0)) 73 | 74 | img.show() 75 | 76 | 77 | domain=[(10,370)]*(len(people)*2) -------------------------------------------------------------------------------- /第06章 文档过滤/docclass.py: -------------------------------------------------------------------------------- 1 | from pysqlite2 import dbapi2 as sqlite 2 | import re 3 | import math 4 | 5 | def getwords(doc): 6 | splitter=re.compile('\\W*') 7 | print doc 8 | # Split the words by non-alpha characters 9 | words=[s.lower() for s in splitter.split(doc) 10 | if len(s)>2 and len(s)<20] 11 | 12 | # Return the unique set of words only 13 | return dict([(w,1) for w in words]) 14 | 15 | class classifier: 16 | def __init__(self,getfeatures,filename=None): 17 | # Counts of feature/category combinations 18 | self.fc={} 19 | # Counts of documents in each category 20 | self.cc={} 21 | self.getfeatures=getfeatures 22 | 23 | def setdb(self,dbfile): 24 | self.con=sqlite.connect(dbfile) 25 | self.con.execute('create table if not exists fc(feature,category,count)') 26 | self.con.execute('create table if not exists cc(category,count)') 27 | 28 | 29 | def incf(self,f,cat): 30 | count=self.fcount(f,cat) 31 | if count==0: 32 | self.con.execute("insert into fc values ('%s','%s',1)" 33 | % (f,cat)) 34 | else: 35 | self.con.execute( 36 | "update fc set count=%d where feature='%s' and category='%s'" 37 | % (count+1,f,cat)) 38 | 39 | def fcount(self,f,cat): 40 | res=self.con.execute( 41 | 'select count from fc where feature="%s" and category="%s"' 42 | %(f,cat)).fetchone() 43 | if res==None: return 0 44 | else: return float(res[0]) 45 | 46 | 
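`incf`/`fcount` above are the write/read path for the per-feature, per-category counts that `setdb` keeps in the `fc` table; `incc`/`catcount` just below do the same for the `cc` category totals. A minimal sketch of that path (commits are left to `train`, further down in this file):

```python
cl = classifier(getwords)
cl.setdb('test.db')               # creates the fc and cc tables on first use

cl.incf('quick', 'good')          # first call: INSERT with count 1
cl.incf('quick', 'good')          # second call: UPDATE to count 2
print cl.fcount('quick', 'good')  # -> 2.0
print cl.fcount('quick', 'bad')   # -> 0 (no such row yet)
```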
def incc(self,cat): 47 | count=self.catcount(cat) 48 | if count==0: 49 | self.con.execute("insert into cc values ('%s',1)" % (cat)) 50 | else: 51 | self.con.execute("update cc set count=%d where category='%s'" 52 | % (count+1,cat)) 53 | 54 | def catcount(self,cat): 55 | res=self.con.execute('select count from cc where category="%s"' 56 | %(cat)).fetchone() 57 | if res==None: return 0 58 | else: return float(res[0]) 59 | 60 | def categories(self): 61 | cur=self.con.execute('select category from cc'); 62 | return [d[0] for d in cur] 63 | 64 | def totalcount(self): 65 | res=self.con.execute('select sum(count) from cc').fetchone(); 66 | if res==None: return 0 67 | return res[0] 68 | 69 | 70 | def train(self,item,cat): 71 | features=self.getfeatures(item) 72 | # Increment the count for every feature with this category 73 | for f in features: 74 | self.incf(f,cat) 75 | 76 | # Increment the count for this category 77 | self.incc(cat) 78 | self.con.commit() 79 | 80 | def fprob(self,f,cat): 81 | if self.catcount(cat)==0: return 0 82 | 83 | # The total number of times this feature appeared in this 84 | # category divided by the total number of items in this category 85 | return self.fcount(f,cat)/self.catcount(cat) 86 | 87 | def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5): 88 | # Calculate current probability 89 | basicprob=prf(f,cat) 90 | 91 | # Count the number of times this feature has appeared in 92 | # all categories 93 | totals=sum([self.fcount(f,c) for c in self.categories()]) 94 | 95 | # Calculate the weighted average 96 | bp=((weight*ap)+(totals*basicprob))/(weight+totals) 97 | return bp 98 | 99 | 100 | 101 | 102 | class naivebayes(classifier): 103 | 104 | def __init__(self,getfeatures): 105 | classifier.__init__(self,getfeatures) 106 | self.thresholds={} 107 | 108 | def docprob(self,item,cat): 109 | features=self.getfeatures(item) 110 | 111 | # Multiply the probabilities of all the features together 112 | p=1 113 | for f in features: p*=self.weightedprob(f,cat,self.fprob) 114 | return p 115 | 116 | def prob(self,item,cat): 117 | catprob=self.catcount(cat)/self.totalcount() 118 | docprob=self.docprob(item,cat) 119 | return docprob*catprob 120 | 121 | def setthreshold(self,cat,t): 122 | self.thresholds[cat]=t 123 | 124 | def getthreshold(self,cat): 125 | if cat not in self.thresholds: return 1.0 126 | return self.thresholds[cat] 127 | 128 | def classify(self,item,default=None): 129 | probs={} 130 | # Find the category with the highest probability 131 | max=0.0 132 | for cat in self.categories(): 133 | probs[cat]=self.prob(item,cat) 134 | if probs[cat]>max: 135 | max=probs[cat] 136 | best=cat 137 | 138 | # Make sure the probability exceeds threshold*next best 139 | for cat in probs: 140 | if cat==best: continue 141 | if probs[cat]*self.getthreshold(best)>probs[best]: return default 142 | return best 143 | 144 | class fisherclassifier(classifier): 145 | def cprob(self,f,cat): 146 | # The frequency of this feature in this category 147 | clf=self.fprob(f,cat) 148 | if clf==0: return 0 149 | 150 | # The frequency of this feature in all the categories 151 | freqsum=sum([self.fprob(f,c) for c in self.categories()]) 152 | 153 | # The probability is the frequency in this category divided by 154 | # the overall frequency 155 | p=clf/(freqsum) 156 | 157 | return p 158 | def fisherprob(self,item,cat): 159 | # Multiply all the probabilities together 160 | p=1 161 | features=self.getfeatures(item) 162 | for f in features: 163 | p*=(self.weightedprob(f,cat,self.cprob)) 164 | 165 | # Take the natural log 
and multiply by -2 166 | fscore=-2*math.log(p) 167 | 168 | # Use the inverse chi2 function to get a probability 169 | return self.invchi2(fscore,len(features)*2) 170 | def invchi2(self,chi, df): 171 | m = chi / 2.0 172 | sum = term = math.exp(-m) 173 | for i in range(1, df//2): 174 | term *= m / i 175 | sum += term 176 | return min(sum, 1.0) 177 | def __init__(self,getfeatures): 178 | classifier.__init__(self,getfeatures) 179 | self.minimums={} 180 | 181 | def setminimum(self,cat,min): 182 | self.minimums[cat]=min 183 | 184 | def getminimum(self,cat): 185 | if cat not in self.minimums: return 0 186 | return self.minimums[cat] 187 | def classify(self,item,default=None): 188 | # Loop through looking for the best result 189 | best=default 190 | max=0.0 191 | for c in self.categories(): 192 | p=self.fisherprob(item,c) 193 | # Make sure it exceeds its minimum 194 | if p>self.getminimum(c) and p>max: 195 | best=c 196 | max=p 197 | return best 198 | 199 | 200 | def sampletrain(cl): 201 | cl.train('Nobody owns the water.','good') 202 | cl.train('the quick rabbit jumps fences','good') 203 | cl.train('buy pharmaceuticals now','bad') 204 | cl.train('make quick money at the online casino','bad') 205 | cl.train('the quick brown fox jumps','good') 206 | -------------------------------------------------------------------------------- /第06章 文档过滤/feedfilter.py: -------------------------------------------------------------------------------- 1 | import feedparser 2 | import re 3 | 4 | # Takes a filename of URL of a blog feed and classifies the entries 5 | def read(feed,classifier): 6 | # Get feed entries and loop over them 7 | f=feedparser.parse(feed) 8 | for entry in f['entries']: 9 | print 10 | print '-----' 11 | # Print the contents of the entry 12 | print 'Title: '+entry['title'].encode('utf-8') 13 | print 'Publisher: '+entry['publisher'].encode('utf-8') 14 | print 15 | print entry['summary'].encode('utf-8') 16 | 17 | 18 | # Combine all the text to create one item for the classifier 19 | fulltext='%s\n%s\n%s' % (entry['title'],entry['publisher'],entry['summary']) 20 | 21 | # Print the best guess at the current category 22 | print 'Guess: '+str(classifier.classify(entry)) 23 | 24 | # Ask the user to specify the correct category and train on that 25 | cl=raw_input('Enter category: ') 26 | classifier.train(entry,cl) 27 | 28 | 29 | def entryfeatures(entry): 30 | splitter=re.compile('\\W*') 31 | f={} 32 | 33 | # Extract the title words and annotate 34 | titlewords=[s.lower() for s in splitter.split(entry['title']) 35 | if len(s)>2 and len(s)<20] 36 | for w in titlewords: f['Title:'+w]=1 37 | 38 | # Extract the summary words 39 | summarywords=[s.lower() for s in splitter.split(entry['summary']) 40 | if len(s)>2 and len(s)<20] 41 | 42 | # Count uppercase words 43 | uc=0 44 | for i in range(len(summarywords)): 45 | w=summarywords[i] 46 | f[w]=1 47 | if w.isupper(): uc+=1 48 | 49 | # Get word pairs in summary as features 50 | if i0.3: f['UPPERCASE']=1 59 | 60 | return f 61 | -------------------------------------------------------------------------------- /第06章 文档过滤/test.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第06章 文档过滤/test.db -------------------------------------------------------------------------------- /第06章 文档过滤/test1.db: -------------------------------------------------------------------------------- 
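`sampletrain` above feeds the classifiers three 'good' and two 'bad' toy documents; the test.db / test1.db files checked in here are simply SQLite databases of such persisted counts. A typical Fisher-classifier session, as a sketch (printed values depend on whatever training the database already contains):

```python
cl = fisherclassifier(getwords)
cl.setdb('test1.db')                         # persist counts like the checked-in db files
sampletrain(cl)                              # the five toy documents defined above

print cl.cprob('quick', 'good')              # per-feature probability used by fisherprob
print cl.fisherprob('quick rabbit', 'good')  # chi-squared combination of the feature probabilities
print cl.classify('quick rabbit', default='unknown')
```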
https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第06章 文档过滤/test1.db -------------------------------------------------------------------------------- /第07章 决策树建模/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第07章 决策树建模/Thumbs.db -------------------------------------------------------------------------------- /第07章 决策树建模/addresslist.txt: -------------------------------------------------------------------------------- 1 | 6 Washington 2 | 21 Manassas 3 | 280 Pearl 4 | 55 Ellery 5 | 50 Follen 6 | 51 Granite 7 | 992 Memorial 8 | 83 Trowbridge 9 | 1 Dana 10 | 45 Regent 11 | 90 Alpine 12 | 21 Francis 13 | 112 Avon Hill 14 | 9 Bellevue 15 | 4 Blanchard Rd 16 | 34 Shea 17 | 5 Fountain 18 | 14 Marcella 19 | 39 Saint Saveur 20 | 35 Pemberton 21 | 46 Shepard 22 | 31 Market 23 | 99 Howard 24 | 88 Pearl 25 | 208 Western 26 | 285 Windsor 27 | 26 Cambridgepark 28 | 211 Erie 29 | 129 Franklin 30 | 27 Gurney 31 | 149 Prospect 32 | 27 Linnaean 33 | 20 Dudley 34 | 60 Otis St 35 | 130 Mount Auburn St 36 | 2 Michael Way 37 | 263 Columbia St 38 | 6 Hurlbut St 39 | 199 Harvard St 40 | 168 River St 41 | 400 Washington St 42 | 12 Traill St 43 | 74 Field St 44 | 21 Walden Square Rd 45 | 7 Wendell St 46 | 15 Normandy Ave 47 | 6 Gibson Ter 48 | 94 Pine St 49 | 23 Magee St 50 | 175 Richdale Ave 51 | 168 River St 52 | 246 Brattle St -------------------------------------------------------------------------------- /第07章 决策树建模/hotornot.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import xml.dom.minidom 3 | 4 | api_key='YOUR KEY HERE' 5 | 6 | def getrandomratings(c): 7 | # Construct URL for getRandomProfile 8 | url="http://services.hotornot.com/rest/?app_key=%s" % api_key 9 | url+="&method=Rate.getRandomProfile&retrieve_num=%d" % c 10 | url+="&get_rate_info=true&meet_users_only=true" 11 | 12 | f1=urllib2.urlopen(url).read() 13 | 14 | doc=xml.dom.minidom.parseString(f1) 15 | 16 | emids=doc.getElementsByTagName('emid') 17 | ratings=doc.getElementsByTagName('rating') 18 | 19 | # Combine the emids and ratings together into a list 20 | result=[] 21 | for e,r in zip(emids,ratings): 22 | if r.firstChild!=None: 23 | result.append((e.firstChild.data,r.firstChild.data)) 24 | return result 25 | 26 | stateregions={'New England':['ct','mn','ma','nh','ri','vt'], 27 | 'Mid Atlantic':['de','md','nj','ny','pa'], 28 | 'South':['al','ak','fl','ga','ky','la','ms','mo', 29 | 'nc','sc','tn','va','wv'], 30 | 'Midwest':['il','in','ia','ks','mi','ne','nd','oh','sd','wi'], 31 | 'West':['ak','ca','co','hi','id','mt','nv','or','ut','wa','wy']} 32 | 33 | def getpeopledata(ratings): 34 | result=[] 35 | for emid,rating in ratings: 36 | # URL for the MeetMe.getProfile method 37 | url="http://services.hotornot.com/rest/?app_key=%s" % api_key 38 | url+="&method=MeetMe.getProfile&emid=%s&get_keywords=true" % emid 39 | 40 | # Get all the info about this person 41 | try: 42 | rating=int(float(rating)+0.5) 43 | doc2=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 44 | gender=doc2.getElementsByTagName('gender')[0].firstChild.data 45 | age=doc2.getElementsByTagName('age')[0].firstChild.data 46 | loc=doc2.getElementsByTagName('location')[0].firstChild.data[0:2] 47 | 48 | # Convert state to region 49 | for r,s in 
stateregions.items(): 50 | if loc in s: region=r 51 | 52 | if region!=None: 53 | result.append((gender,int(age),region,rating)) 54 | except: 55 | pass 56 | return result 57 | 58 | -------------------------------------------------------------------------------- /第07章 决策树建模/treepredict.py: -------------------------------------------------------------------------------- 1 | my_data=[['slashdot','USA','yes',18,'None'], 2 | ['google','France','yes',23,'Premium'], 3 | ['digg','USA','yes',24,'Basic'], 4 | ['kiwitobes','France','yes',23,'Basic'], 5 | ['google','UK','no',21,'Premium'], 6 | ['(direct)','New Zealand','no',12,'None'], 7 | ['(direct)','UK','no',21,'Basic'], 8 | ['google','USA','no',24,'Premium'], 9 | ['slashdot','France','yes',19,'None'], 10 | ['digg','USA','no',18,'None'], 11 | ['google','UK','no',18,'None'], 12 | ['kiwitobes','UK','no',19,'None'], 13 | ['digg','New Zealand','yes',12,'Basic'], 14 | ['slashdot','UK','no',21,'None'], 15 | ['google','UK','yes',18,'Basic'], 16 | ['kiwitobes','France','yes',19,'Basic']] 17 | 18 | class decisionnode: 19 | def __init__(self,col=-1,value=None,results=None,tb=None,fb=None): 20 | self.col=col 21 | self.value=value 22 | self.results=results 23 | self.tb=tb 24 | self.fb=fb 25 | 26 | # Divides a set on a specific column. Can handle numeric 27 | # or nominal values 28 | def divideset(rows,column,value): 29 | # Make a function that tells us if a row is in 30 | # the first group (true) or the second group (false) 31 | split_function=None 32 | if isinstance(value,int) or isinstance(value,float): 33 | split_function=lambda row:row[column]>=value 34 | else: 35 | split_function=lambda row:row[column]==value 36 | 37 | # Divide the rows into two sets and return them 38 | set1=[row for row in rows if split_function(row)] 39 | set2=[row for row in rows if not split_function(row)] 40 | return (set1,set2) 41 | 42 | 43 | # Create counts of possible results (the last column of 44 | # each row is the result) 45 | def uniquecounts(rows): 46 | results={} 47 | for row in rows: 48 | # The result is the last column 49 | r=row[len(row)-1] 50 | if r not in results: results[r]=0 51 | results[r]+=1 52 | return results 53 | 54 | # Probability that a randomly placed item will 55 | # be in the wrong category 56 | def giniimpurity(rows): 57 | total=len(rows) 58 | counts=uniquecounts(rows) 59 | imp=0 60 | for k1 in counts: 61 | p1=float(counts[k1])/total 62 | for k2 in counts: 63 | if k1==k2: continue 64 | p2=float(counts[k2])/total 65 | imp+=p1*p2 66 | return imp 67 | 68 | # Entropy is the sum of p(x)log(p(x)) across all 69 | # the different possible results 70 | def entropy(rows): 71 | from math import log 72 | log2=lambda x:log(x)/log(2) 73 | results=uniquecounts(rows) 74 | # Now calculate the entropy 75 | ent=0.0 76 | for r in results.keys(): 77 | p=float(results[r])/len(rows) 78 | ent=ent-p*log2(p) 79 | return ent 80 | 81 | 82 | 83 | 84 | def printtree(tree,indent=''): 85 | # Is this a leaf node? 86 | if tree.results!=None: 87 | print str(tree.results) 88 | else: 89 | # Print the criteria 90 | print str(tree.col)+':'+str(tree.value)+'? 
' 91 | 92 | # Print the branches 93 | print indent+'T->', 94 | printtree(tree.tb,indent+' ') 95 | print indent+'F->', 96 | printtree(tree.fb,indent+' ') 97 | 98 | 99 | def getwidth(tree): 100 | if tree.tb==None and tree.fb==None: return 1 101 | return getwidth(tree.tb)+getwidth(tree.fb) 102 | 103 | def getdepth(tree): 104 | if tree.tb==None and tree.fb==None: return 0 105 | return max(getdepth(tree.tb),getdepth(tree.fb))+1 106 | 107 | 108 | from PIL import Image,ImageDraw 109 | 110 | def drawtree(tree,jpeg='tree.jpg'): 111 | w=getwidth(tree)*100 112 | h=getdepth(tree)*100+120 113 | 114 | img=Image.new('RGB',(w,h),(255,255,255)) 115 | draw=ImageDraw.Draw(img) 116 | 117 | drawnode(draw,tree,w/2,20) 118 | img.save(jpeg,'JPEG') 119 | 120 | def drawnode(draw,tree,x,y): 121 | if tree.results==None: 122 | # Get the width of each branch 123 | w1=getwidth(tree.fb)*100 124 | w2=getwidth(tree.tb)*100 125 | 126 | # Determine the total space required by this node 127 | left=x-(w1+w2)/2 128 | right=x+(w1+w2)/2 129 | 130 | # Draw the condition string 131 | draw.text((x-20,y-10),str(tree.col)+':'+str(tree.value),(0,0,0)) 132 | 133 | # Draw links to the branches 134 | draw.line((x,y,left+w1/2,y+100),fill=(255,0,0)) 135 | draw.line((x,y,right-w2/2,y+100),fill=(255,0,0)) 136 | 137 | # Draw the branch nodes 138 | drawnode(draw,tree.fb,left+w1/2,y+100) 139 | drawnode(draw,tree.tb,right-w2/2,y+100) 140 | else: 141 | txt=' \n'.join(['%s:%d'%v for v in tree.results.items()]) 142 | draw.text((x-20,y),txt,(0,0,0)) 143 | 144 | 145 | def classify(observation,tree): 146 | if tree.results!=None: 147 | return tree.results 148 | else: 149 | v=observation[tree.col] 150 | branch=None 151 | if isinstance(v,int) or isinstance(v,float): 152 | if v>=tree.value: branch=tree.tb 153 | else: branch=tree.fb 154 | else: 155 | if v==tree.value: branch=tree.tb 156 | else: branch=tree.fb 157 | return classify(observation,branch) 158 | 159 | def prune(tree,mingain): 160 | # If the branches aren't leaves, then prune them 161 | if tree.tb.results==None: 162 | prune(tree.tb,mingain) 163 | if tree.fb.results==None: 164 | prune(tree.fb,mingain) 165 | 166 | # If both the subbranches are now leaves, see if they 167 | # should merged 168 | if tree.tb.results!=None and tree.fb.results!=None: 169 | # Build a combined dataset 170 | tb,fb=[],[] 171 | for v,c in tree.tb.results.items(): 172 | tb+=[[v]]*c 173 | for v,c in tree.fb.results.items(): 174 | fb+=[[v]]*c 175 | 176 | # Test the reduction in entropy 177 | delta=entropy(tb+fb)-(entropy(tb)+entropy(fb)/2) 178 | 179 | if delta=tree.value: branch=tree.tb 202 | else: branch=tree.fb 203 | else: 204 | if v==tree.value: branch=tree.tb 205 | else: branch=tree.fb 206 | return mdclassify(observation,branch) 207 | 208 | def variance(rows): 209 | if len(rows)==0: return 0 210 | data=[float(row[len(row)-1]) for row in rows] 211 | mean=sum(data)/len(data) 212 | variance=sum([(d-mean)**2 for d in data])/len(data) 213 | return variance 214 | 215 | def buildtree(rows,scoref=entropy): 216 | if len(rows)==0: return decisionnode() 217 | current_score=scoref(rows) 218 | 219 | # Set up some variables to track the best criteria 220 | best_gain=0.0 221 | best_criteria=None 222 | best_sets=None 223 | 224 | column_count=len(rows[0])-1 225 | for col in range(0,column_count): 226 | # Generate the list of different values in 227 | # this column 228 | column_values={} 229 | for row in rows: 230 | column_values[row[col]]=1 231 | # Now try dividing the rows up for each value 232 | # in this column 233 | for value in 
column_values.keys(): 234 | (set1,set2)=divideset(rows,col,value) 235 | 236 | # Information gain 237 | p=float(len(set1))/len(rows) 238 | gain=current_score-p*scoref(set1)-(1-p)*scoref(set2) 239 | if gain>best_gain and len(set1)>0 and len(set2)>0: 240 | best_gain=gain 241 | best_criteria=(col,value) 242 | best_sets=(set1,set2) 243 | # Create the sub branches 244 | if best_gain>0: 245 | trueBranch=buildtree(best_sets[0]) 246 | falseBranch=buildtree(best_sets[1]) 247 | return decisionnode(col=best_criteria[0],value=best_criteria[1], 248 | tb=trueBranch,fb=falseBranch) 249 | else: 250 | return decisionnode(results=uniquecounts(rows)) 251 | -------------------------------------------------------------------------------- /第07章 决策树建模/zillow.py: -------------------------------------------------------------------------------- 1 | import xml.dom.minidom 2 | import urllib2 3 | 4 | zwskey="YOUR API KEY" 5 | 6 | def getaddressdata(address,city): 7 | escad=address.replace(' ','+') 8 | url='http://www.zillow.com/webservice/GetDeepSearchResults.htm?' 9 | url+='zws-id=%s&address=%s&citystatezip=%s' % (zwskey,escad,city) 10 | doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read()) 11 | code=doc.getElementsByTagName('code')[0].firstChild.data 12 | if code!='0': return None 13 | if 1: 14 | zipcode=doc.getElementsByTagName('zipcode')[0].firstChild.data 15 | use=doc.getElementsByTagName('useCode')[0].firstChild.data 16 | year=doc.getElementsByTagName('yearBuilt')[0].firstChild.data 17 | sqft=doc.getElementsByTagName('finishedSqFt')[0].firstChild.data 18 | bath=doc.getElementsByTagName('bathrooms')[0].firstChild.data 19 | bed=doc.getElementsByTagName('bedrooms')[0].firstChild.data 20 | rooms=1 #doc.getElementsByTagName('totalRooms')[0].firstChild.data 21 | price=doc.getElementsByTagName('amount')[0].firstChild.data 22 | else: 23 | return None 24 | 25 | return (zipcode,use,int(year),float(bath),int(bed),int(rooms),price) 26 | 27 | def getpricelist(): 28 | l1=[] 29 | for line in file('addresslist.txt'): 30 | data=getaddressdata(line.strip(),'Cambridge,MA') 31 | l1.append(data) 32 | return l1 33 | -------------------------------------------------------------------------------- /第08章 构建价格模型/ebaypredict.py: -------------------------------------------------------------------------------- 1 | import httplib 2 | from xml.dom.minidom import parse, parseString, Node 3 | 4 | devKey = 'YOUR DEV KEY' 5 | appKey = 'YOUR APP KEY' 6 | certKey = 'YOUR CERT KEY' 7 | serverUrl = 'api.ebay.com' 8 | userToken = 'YOUR TOKEN' 9 | 10 | def getHeaders(apicall,siteID="0",compatabilityLevel = "433"): 11 | headers = {"X-EBAY-API-COMPATIBILITY-LEVEL": compatabilityLevel, 12 | "X-EBAY-API-DEV-NAME": devKey, 13 | "X-EBAY-API-APP-NAME": appKey, 14 | "X-EBAY-API-CERT-NAME": certKey, 15 | "X-EBAY-API-CALL-NAME": apicall, 16 | "X-EBAY-API-SITEID": siteID, 17 | "Content-Type": "text/xml"} 18 | return headers 19 | 20 | def sendRequest(apicall,xmlparameters): 21 | connection = httplib.HTTPSConnection(serverUrl) 22 | connection.request("POST", '/ws/api.dll', xmlparameters, getHeaders(apicall)) 23 | response = connection.getresponse() 24 | if response.status != 200: 25 | print "Error sending request:" + response.reason 26 | else: 27 | data = response.read() 28 | connection.close() 29 | return data 30 | 31 | def getSingleValue(node,tag): 32 | nl=node.getElementsByTagName(tag) 33 | if len(nl)>0: 34 | tagNode=nl[0] 35 | if tagNode.hasChildNodes(): 36 | return tagNode.firstChild.nodeValue 37 | return '-1' 38 | 39 | 40 | def 
doSearch(query,categoryID=None,page=1): 41 | xml = ""+\ 42 | ""+\ 43 | "" +\ 44 | userToken +\ 45 | "" + \ 46 | ""+\ 47 | "200"+\ 48 | ""+str(page)+""+\ 49 | ""+\ 50 | "" + query + "" 51 | if categoryID!=None: 52 | xml+=""+str(categoryID)+"" 53 | xml+="" 54 | 55 | data=sendRequest('GetSearchResults',xml) 56 | response = parseString(data) 57 | itemNodes = response.getElementsByTagName('Item'); 58 | results = [] 59 | for item in itemNodes: 60 | itemId=getSingleValue(item,'ItemID') 61 | itemTitle=getSingleValue(item,'Title') 62 | itemPrice=getSingleValue(item,'CurrentPrice') 63 | itemEnds=getSingleValue(item,'EndTime') 64 | results.append((itemId,itemTitle,itemPrice,itemEnds)) 65 | return results 66 | 67 | 68 | def getCategory(query='',parentID=None,siteID='0'): 69 | lquery=query.lower() 70 | xml = ""+\ 71 | ""+\ 72 | "" +\ 73 | userToken +\ 74 | ""+\ 75 | "ReturnAll"+\ 76 | "true"+\ 77 | ""+siteID+"" 78 | if parentID==None: 79 | xml+="1" 80 | else: 81 | xml+=""+str(parentID)+"" 82 | xml += "" 83 | data=sendRequest('GetCategories',xml) 84 | categoryList=parseString(data) 85 | catNodes=categoryList.getElementsByTagName('Category') 86 | for node in catNodes: 87 | catid=getSingleValue(node,'CategoryID') 88 | name=getSingleValue(node,'CategoryName') 89 | if name.lower().find(lquery)!=-1: 90 | print catid,name 91 | 92 | def getItem(itemID): 93 | xml = ""+\ 94 | ""+\ 95 | "" +\ 96 | userToken +\ 97 | "" + \ 98 | "" + str(itemID) + ""+\ 99 | "ItemReturnAttributes"+\ 100 | "" 101 | data=sendRequest('GetItem',xml) 102 | result={} 103 | response=parseString(data) 104 | result['title']=getSingleValue(response,'Title') 105 | sellingStatusNode = response.getElementsByTagName('SellingStatus')[0]; 106 | result['price']=getSingleValue(sellingStatusNode,'CurrentPrice') 107 | result['bids']=getSingleValue(sellingStatusNode,'BidCount') 108 | seller = response.getElementsByTagName('Seller') 109 | result['feedback'] = getSingleValue(seller[0],'FeedbackScore') 110 | 111 | attributeSet=response.getElementsByTagName('Attribute'); 112 | attributes={} 113 | for att in attributeSet: 114 | attID=att.attributes.getNamedItem('attributeID').nodeValue 115 | attValue=getSingleValue(att,'ValueLiteral') 116 | attributes[attID]=attValue 117 | result['attributes']=attributes 118 | return result 119 | 120 | 121 | def makeLaptopDataset(): 122 | searchResults=doSearch('laptop',categoryID=51148) 123 | result=[] 124 | for r in searchResults: 125 | item=getItem(r[0]) 126 | att=item['attributes'] 127 | try: 128 | data=(float(att['12']),float(att['26444']), 129 | float(att['26446']),float(att['25710']), 130 | float(item['feedback']) 131 | ) 132 | entry={'input':data,'result':float(item['price'])} 133 | result.append(entry) 134 | except: 135 | print item['title']+' failed' 136 | return result 137 | -------------------------------------------------------------------------------- /第08章 构建价格模型/numpredict.py: -------------------------------------------------------------------------------- 1 | from random import random,randint 2 | import math 3 | 4 | def wineprice(rating,age): 5 | peak_age=rating-50 6 | 7 | # Calculate price based on rating 8 | price=rating/2 9 | if age>peak_age: 10 | # Past its peak, goes bad in 10 years 11 | price=price*(5-(age-peak_age)/2) 12 | else: 13 | # Increases to 5x original value as it 14 | # approaches its peak 15 | price=price*(5*((age+1)/peak_age)) 16 | if price<0: price=0 17 | return price 18 | 19 | 20 | def wineset1(): 21 | rows=[] 22 | for i in range(300): 23 | # Create a random age and rating 24 | 
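`wineprice` above is this chapter's synthetic ground truth: price climbs toward a peak at age `rating - 50`, then decays to nothing over the following ten years. Evaluating the formula directly (values computed from the code above):

```python
print wineprice(95.0, 3.0)    # young wine: 47.5 * 5 * (4.0/45)      -> about 21.1
print wineprice(95.0, 45.0)   # exactly at its peak age              -> about 242.8
print wineprice(95.0, 50.0)   # five years past the peak: 47.5 * 2.5 -> 118.75
```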
rating=random()*50+50 25 | age=random()*50 26 | 27 | # Get reference price 28 | price=wineprice(rating,age) 29 | 30 | # Add some noise 31 | price*=(random()*0.2+0.9) 32 | 33 | # Add to the dataset 34 | rows.append({'input':(rating,age), 35 | 'result':price}) 36 | return rows 37 | 38 | def euclidean(v1,v2): 39 | d=0.0 40 | for i in range(len(v1)): 41 | d+=(v1[i]-v2[i])**2 42 | return math.sqrt(d) 43 | 44 | 45 | def getdistances(data,vec1): 46 | distancelist=[] 47 | 48 | # Loop over every item in the dataset 49 | for i in range(len(data)): 50 | vec2=data[i]['input'] 51 | 52 | # Add the distance and the index 53 | distancelist.append((euclidean(vec1,vec2),i)) 54 | 55 | # Sort by distance 56 | distancelist.sort() 57 | return distancelist 58 | 59 | def knnestimate(data,vec1,k=5): 60 | # Get sorted distances 61 | dlist=getdistances(data,vec1) 62 | avg=0.0 63 | 64 | # Take the average of the top k results 65 | for i in range(k): 66 | idx=dlist[i][1] 67 | avg+=data[idx]['result'] 68 | avg=avg/k 69 | return avg 70 | 71 | def inverseweight(dist,num=1.0,const=0.1): 72 | return num/(dist+const) 73 | 74 | def subtractweight(dist,const=1.0): 75 | if dist>const: 76 | return 0 77 | else: 78 | return const-dist 79 | 80 | def gaussian(dist,sigma=5.0): 81 | return math.e**(-dist**2/(2*sigma**2)) 82 | 83 | def weightedknn(data,vec1,k=5,weightf=gaussian): 84 | # Get distances 85 | dlist=getdistances(data,vec1) 86 | avg=0.0 87 | totalweight=0.0 88 | 89 | # Get weighted average 90 | for i in range(k): 91 | dist=dlist[i][0] 92 | idx=dlist[i][1] 93 | weight=weightf(dist) 94 | avg+=weight*data[idx]['result'] 95 | totalweight+=weight 96 | if totalweight==0: return 0 97 | avg=avg/totalweight 98 | return avg 99 | 100 | def dividedata(data,test=0.05): 101 | trainset=[] 102 | testset=[] 103 | for row in data: 104 | if random()=low and v<=high: 176 | nweight+=weight 177 | tweight+=weight 178 | if tweight==0: return 0 179 | 180 | # The probability is the weights in the range 181 | # divided by all the weights 182 | return nweight/tweight 183 | 184 | from pylab import * 185 | 186 | def cumulativegraph(data,vec1,high,k=5,weightf=gaussian): 187 | t1=arange(0.0,high,0.1) 188 | cprob=array([probguess(data,vec1,0,v,k,weightf) for v in t1]) 189 | plot(t1,cprob) 190 | show() 191 | 192 | 193 | def probabilitygraph(data,vec1,high,k=5,weightf=gaussian,ss=5.0): 194 | # Make a range for the prices 195 | t1=arange(0.0,high,0.1) 196 | 197 | # Get the probabilities for the entire range 198 | probs=[probguess(data,vec1,v,v+0.1,k,weightf) for v in t1] 199 | 200 | # Smooth them by adding the gaussian of the nearby probabilites 201 | smoothed=[] 202 | for i in range(len(probs)): 203 | sv=0.0 204 | for j in range(0,len(probs)): 205 | dist=abs(i-j)*0.1 206 | weight=gaussian(dist,sigma=ss) 207 | sv+=weight*probs[j] 208 | smoothed.append(sv) 209 | smoothed=array(smoothed) 210 | 211 | plot(t1,smoothed) 212 | show() 213 | -------------------------------------------------------------------------------- /第08章 构建价格模型/optimization.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import math 4 | 5 | people = [('Seymour','BOS'), 6 | ('Franny','DAL'), 7 | ('Zooey','CAK'), 8 | ('Walt','MIA'), 9 | ('Buddy','ORD'), 10 | ('Les','OMA')] 11 | # Laguardia 12 | destination='LGA' 13 | 14 | flights={} 15 | # 16 | """ 17 | for line in file('schedule.txt'): 18 | origin,dest,depart,arrive,price=line.strip().split(',') 19 | flights.setdefault((origin,dest),[]) 20 | 21 | # Add details to the list 
of possible flights 22 | flights[(origin,dest)].append((depart,arrive,int(price))) 23 | """ 24 | def getminutes(t): 25 | x=time.strptime(t,'%H:%M') 26 | return x[3]*60+x[4] 27 | 28 | def printschedule(r): 29 | for d in range(len(r)/2): 30 | name=people[d][0] 31 | origin=people[d][1] 32 | out=flights[(origin,destination)][int(r[d])] 33 | ret=flights[(destination,origin)][int(r[d+1])] 34 | print '%10s%10s %5s-%5s $%3s %5s-%5s $%3s' % (name,origin, 35 | out[0],out[1],out[2], 36 | ret[0],ret[1],ret[2]) 37 | 38 | def schedulecost(sol): 39 | totalprice=0 40 | latestarrival=0 41 | earliestdep=24*60 42 | 43 | for d in range(len(sol)/2): 44 | # Get the inbound and outbound flights 45 | origin=people[d][1] 46 | outbound=flights[(origin,destination)][int(sol[d])] 47 | returnf=flights[(destination,origin)][int(sol[d+1])] 48 | 49 | # Total price is the price of all outbound and return flights 50 | totalprice+=outbound[2] 51 | totalprice+=returnf[2] 52 | 53 | # Track the latest arrival and earliest departure 54 | if latestarrivalgetminutes(returnf[0]): earliestdep=getminutes(returnf[0]) 56 | 57 | # Every person must wait at the airport until the latest person arrives. 58 | # They also must arrive at the same time and wait for their flights. 59 | totalwait=0 60 | for d in range(len(sol)/2): 61 | origin=people[d][1] 62 | outbound=flights[(origin,destination)][int(sol[d])] 63 | returnf=flights[(destination,origin)][int(sol[d+1])] 64 | totalwait+=latestarrival-getminutes(outbound[1]) 65 | totalwait+=getminutes(returnf[0])-earliestdep 66 | 67 | # Does this solution require an extra day of car rental? That'll be $50! 68 | if latestarrival>earliestdep: totalprice+=50 69 | 70 | return totalprice+totalwait 71 | 72 | def randomoptimize(domain,costf): 73 | best=999999999 74 | bestr=None 75 | for i in range(0,1000): 76 | # Create a random solution 77 | r=[float(random.randint(domain[i][0],domain[i][1])) 78 | for i in range(len(domain))] 79 | 80 | # Get the cost 81 | cost=costf(r) 82 | 83 | # Compare it to the best one so far 84 | if cost0.1: 96 | # Choose one of the indices 97 | i=random.randint(0,len(domain)-1) 98 | 99 | # Choose a direction to change it 100 | dir=random.randint(-step,step) 101 | 102 | # Create a new list with one of the values changed 103 | vecb=vec[:] 104 | vecb[i]+=dir 105 | if vecb[i]domain[i][1]: vecb[i]=domain[i][1] 107 | 108 | # Calculate the current cost and the new cost 109 | ea=costf(vec) 110 | eb=costf(vecb) 111 | p=pow(math.e,(-eb-ea)/T) 112 | 113 | print vec,ea 114 | 115 | 116 | # Is it better, or does it make the probability 117 | # cutoff? 
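The loop above (temperature `T`, random neighbour `vecb`, `p=pow(math.e,(-eb-ea)/T)`) is the book's simulated-annealing optimizer; like the other copies in this repository, the comparison that follows lost the text between `<` and `>` characters. In the book's version the acceptance step reads roughly as below: always keep a lower-cost neighbour, and keep a worse one with probability `p`, which shrinks as the temperature cools.

```python
# Hedged reconstruction of the damaged acceptance test (not a verbatim quote):
if eb < ea or random.random() < p:
    vec = vecb
# ...after which the temperature is lowered (T = T * cool in the book) and the loop repeats.
```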
118 | if (ebmaxv: v[i][d]=maxv 160 | elif v[i][d]<-maxv: v[i][d]=-maxv 161 | 162 | # constrain bounds of solutions 163 | x[i][d]+=v[i][d] 164 | if x[i][d]domain[d][1]: x[i][d]=domain[d][1] 166 | 167 | print p[g],costf(p[g]) 168 | return p[g] 169 | -------------------------------------------------------------------------------- /第09章 高阶分类 核方法与SVM/advancedclassify.py: -------------------------------------------------------------------------------- 1 | class matchrow: 2 | def __init__(self,row,allnum=False): 3 | if allnum: 4 | self.data=[float(row[i]) for i in range(len(row)-1)] 5 | else: 6 | self.data=row[0:len(row)-1] 7 | self.match=int(row[len(row)-1]) 8 | 9 | def loadmatch(f,allnum=False): 10 | rows=[] 11 | for line in file(f): 12 | rows.append(matchrow(line.split(','),allnum)) 13 | return rows 14 | 15 | from pylab import * 16 | def plotagematches(rows): 17 | xdm,ydm=[r.data[0] for r in rows if r.match==1],\ 18 | [r.data[1] for r in rows if r.match==1] 19 | xdn,ydn=[r.data[0] for r in rows if r.match==0],\ 20 | [r.data[1] for r in rows if r.match==0] 21 | 22 | plot(xdm,ydm,'bo') 23 | plot(xdn,ydn,'b+') 24 | 25 | show() 26 | 27 | def lineartrain(rows): 28 | averages={} 29 | counts={} 30 | 31 | for row in rows: 32 | # Get the class of this point 33 | cl=row.match 34 | 35 | averages.setdefault(cl,[0.0]*(len(row.data))) 36 | counts.setdefault(cl,0) 37 | 38 | # Add this point to the averages 39 | for i in range(len(row.data)): 40 | averages[cl][i]+=float(row.data[i]) 41 | 42 | # Keep track of how many points in each class 43 | counts[cl]+=1 44 | 45 | # Divide sums by counts to get the averages 46 | for cl,avg in averages.items(): 47 | for i in range(len(avg)): 48 | avg[i]/=counts[cl] 49 | 50 | return averages 51 | 52 | def dotproduct(v1,v2): 53 | return sum([v1[i]*v2[i] for i in range(len(v1))]) 54 | 55 | def veclength(v): 56 | return sum([p**2 for p in v]) 57 | 58 | def dpclassify(point,avgs): 59 | b=(dotproduct(avgs[1],avgs[1])-dotproduct(avgs[0],avgs[0]))/2 60 | y=dotproduct(point,avgs[0])-dotproduct(point,avgs[1])+b 61 | if y>0: return 0 62 | else: return 1 63 | 64 | def yesno(v): 65 | if v=='yes': return 1 66 | elif v=='no': return -1 67 | else: return 0 68 | 69 | def matchcount(interest1,interest2): 70 | l1=interest1.split(':') 71 | l2=interest2.split(':') 72 | x=0 73 | for v in l1: 74 | if v in l2: x+=1 75 | return x 76 | 77 | yahookey="YOUR API KEY" 78 | from xml.dom.minidom import parseString 79 | from urllib import urlopen,quote_plus 80 | 81 | loc_cache={} 82 | def getlocation(address): 83 | if address in loc_cache: return loc_cache[address] 84 | data=urlopen('http://api.local.yahoo.com/MapsService/V1/'+\ 85 | 'geocode?appid=%s&location=%s' % 86 | (yahookey,quote_plus(address))).read() 87 | doc=parseString(data) 88 | lat=doc.getElementsByTagName('Latitude')[0].firstChild.nodeValue 89 | long=doc.getElementsByTagName('Longitude')[0].firstChild.nodeValue 90 | loc_cache[address]=(float(lat),float(long)) 91 | return loc_cache[address] 92 | 93 | def milesdistance(a1,a2): 94 | lat1,long1=getlocation(a1) 95 | lat2,long2=getlocation(a2) 96 | latdif=69.1*(lat2-lat1) 97 | longdif=53.0*(long2-long1) 98 | return (latdif**2+longdif**2)**.5 99 | 100 | def loadnumerical(): 101 | oldrows=loadmatch('matchmaker.csv') 102 | newrows=[] 103 | for row in oldrows: 104 | d=row.data 105 | data=[float(d[0]),yesno(d[1]),yesno(d[2]), 106 | float(d[5]),yesno(d[6]),yesno(d[7]), 107 | matchcount(d[3],d[8]), 108 | milesdistance(d[4],d[9]), 109 | row.match] 110 | newrows.append(matchrow(data)) 111 | return newrows 
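`lineartrain` above collapses each class to its average point, and `dpclassify` assigns a new point to whichever average is closer, using the dot-product form of that comparison. A short sketch on the ages-only data included below (results depend on the CSV contents; the 0/1 classes follow the last column of agesonly.csv):

```python
agesonly = loadmatch('agesonly.csv', allnum=True)
avgs = lineartrain(agesonly)               # {0: class-0 average point, 1: class-1 average point}

print dpclassify([30.0, 30.0], avgs)       # predicted class for a 30/30 pair
print dpclassify([48.0, 20.0], avgs)       # predicted class for a 48/20 pair
```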
112 | 113 | def scaledata(rows): 114 | low=[999999999.0]*len(rows[0].data) 115 | high=[-999999999.0]*len(rows[0].data) 116 | # Find the lowest and highest values 117 | for row in rows: 118 | d=row.data 119 | for i in range(len(d)): 120 | if d[i]high[i]: high[i]=d[i] 122 | 123 | # Create a function that scales data 124 | def scaleinput(d): 125 | return [(d[i]-low[i])/(high[i]-low[i]) 126 | for i in range(len(low))] 127 | 128 | # Scale all the data 129 | newrows=[matchrow(scaleinput(row.data)+[row.match]) 130 | for row in rows] 131 | 132 | # Return the new data and the function 133 | return newrows,scaleinput 134 | 135 | 136 | def rbf(v1,v2,gamma=10): 137 | dv=[v1[i]-v2[i] for i in range(len(v1))] 138 | l=veclength(dv) 139 | return math.e**(-gamma*l) 140 | 141 | def nlclassify(point,rows,offset,gamma=10): 142 | sum0=0.0 143 | sum1=0.0 144 | count0=0 145 | count1=0 146 | 147 | for row in rows: 148 | if row.match==0: 149 | sum0+=rbf(point,row.data,gamma) 150 | count0+=1 151 | else: 152 | sum1+=rbf(point,row.data,gamma) 153 | count1+=1 154 | y=(1.0/count0)*sum0-(1.0/count1)*sum1+offset 155 | 156 | if y>0: return 0 157 | else: return 1 158 | 159 | def getoffset(rows,gamma=10): 160 | l0=[] 161 | l1=[] 162 | for row in rows: 163 | if row.match==0: l0.append(row.data) 164 | else: l1.append(row.data) 165 | sum0=sum(sum([rbf(v1,v2,gamma) for v1 in l0]) for v2 in l0) 166 | sum1=sum(sum([rbf(v1,v2,gamma) for v1 in l1]) for v2 in l1) 167 | 168 | return (1.0/(len(l1)**2))*sum1-(1.0/(len(l0)**2))*sum0 169 | -------------------------------------------------------------------------------- /第09章 高阶分类 核方法与SVM/agesonly.csv: -------------------------------------------------------------------------------- 1 | 24,30,1 2 | 30,40,1 3 | 22,49,0 4 | 43,39,1 5 | 23,30,1 6 | 23,49,0 7 | 48,46,1 8 | 23,23,1 9 | 29,49,0 10 | 38,38,1 11 | 30,34,1 12 | 40,50,1 13 | 35,32,1 14 | 49,44,1 15 | 38,22,1 16 | 30,27,1 17 | 26,24,1 18 | 39,23,1 19 | 36,43,1 20 | 25,31,1 21 | 27,27,1 22 | 32,22,1 23 | 40,30,1 24 | 26,28,1 25 | 46,32,1 26 | 41,37,1 27 | 39,41,1 28 | 18,28,0 29 | 18,47,0 30 | 39,44,1 31 | 38,21,1 32 | 24,36,0 33 | 32,22,1 34 | 21,20,1 35 | 42,36,1 36 | 46,41,1 37 | 39,38,1 38 | 18,31,0 39 | 31,45,1 40 | 44,24,0 41 | 49,22,0 42 | 26,27,1 43 | 25,34,1 44 | 47,23,0 45 | 27,48,0 46 | 32,49,1 47 | 46,41,1 48 | 24,32,1 49 | 29,26,1 50 | 25,36,1 51 | 27,35,1 52 | 38,19,1 53 | 18,40,0 54 | 34,49,1 55 | 32,35,1 56 | 47,49,1 57 | 47,18,0 58 | 33,24,1 59 | 35,28,1 60 | 35,41,1 61 | 39,43,1 62 | 29,18,1 63 | 18,44,0 64 | 26,26,1 65 | 31,43,1 66 | 20,29,0 67 | 28,18,1 68 | 31,38,1 69 | 34,34,1 70 | 32,33,1 71 | 34,27,1 72 | 19,38,0 73 | 32,21,1 74 | 33,37,1 75 | 33,18,1 76 | 18,46,0 77 | 31,37,1 78 | 36,30,1 79 | 40,40,1 80 | 38,30,1 81 | 49,28,1 82 | 31,47,1 83 | 28,50,0 84 | 49,43,1 85 | 24,31,1 86 | 33,43,1 87 | 28,24,1 88 | 45,29,1 89 | 49,35,1 90 | 36,29,1 91 | 42,32,1 92 | 29,18,1 93 | 49,20,0 94 | 22,27,1 95 | 41,38,1 96 | 47,21,0 97 | 40,32,1 98 | 35,18,1 99 | 35,33,1 100 | 34,28,1 101 | 22,31,0 102 | 46,20,0 103 | 18,49,0 104 | 48,23,0 105 | 39,21,1 106 | 20,34,0 107 | 24,20,1 108 | 38,18,1 109 | 37,47,1 110 | 39,37,1 111 | 38,39,1 112 | 27,42,1 113 | 47,49,1 114 | 27,42,1 115 | 40,28,1 116 | 41,46,1 117 | 39,25,1 118 | 43,36,1 119 | 49,30,1 120 | 24,38,0 121 | 49,42,1 122 | 19,22,0 123 | 43,27,1 124 | 30,37,1 125 | 24,31,1 126 | 24,48,0 127 | 24,29,1 128 | 18,19,1 129 | 29,25,1 130 | 38,33,1 131 | 39,20,1 132 | 24,30,1 133 | 22,39,0 134 | 47,21,0 135 | 30,44,1 136 | 41,38,1 137 | 29,33,1 138 | 42,42,1 139 | 47,27,1 140 
| 23,20,1 141 | 39,18,1 142 | 30,26,1 143 | 36,27,1 144 | 40,18,1 145 | 31,18,1 146 | 46,27,1 147 | 41,44,1 148 | 26,34,1 149 | 33,18,1 150 | 48,19,0 151 | 46,27,1 152 | 25,40,0 153 | 50,36,1 154 | 20,21,1 155 | 33,47,1 156 | 40,35,1 157 | 24,27,1 158 | 34,19,1 159 | 26,45,0 160 | 34,36,1 161 | 21,27,0 162 | 48,28,1 163 | 23,25,1 164 | 48,46,1 165 | 30,20,1 166 | 23,40,0 167 | 36,40,1 168 | 21,45,0 169 | 30,40,1 170 | 39,24,1 171 | 42,47,1 172 | 28,37,1 173 | 24,30,1 174 | 37,25,1 175 | 44,34,1 176 | 43,32,1 177 | 46,29,1 178 | 49,22,0 179 | 41,28,1 180 | 23,50,0 181 | 30,43,1 182 | 25,32,1 183 | 27,46,0 184 | 23,21,1 185 | 39,41,1 186 | 33,27,1 187 | 49,21,0 188 | 33,33,1 189 | 18,25,0 190 | 42,35,1 191 | 36,25,1 192 | 26,50,0 193 | 18,37,0 194 | 35,37,1 195 | 39,38,1 196 | 22,30,0 197 | 18,44,0 198 | 46,44,1 199 | 24,27,1 200 | 41,34,1 201 | 40,39,1 202 | 34,49,1 203 | 35,41,1 204 | 46,48,1 205 | 50,23,0 206 | 49,20,0 207 | 22,47,0 208 | 27,26,1 209 | 30,30,1 210 | 37,39,1 211 | 42,44,1 212 | 41,27,1 213 | 24,21,1 214 | 34,28,1 215 | 23,43,0 216 | 43,35,1 217 | 42,40,1 218 | 25,24,1 219 | 36,24,1 220 | 25,23,1 221 | 44,30,1 222 | 39,33,1 223 | 38,33,1 224 | 49,30,1 225 | 40,19,1 226 | 19,46,0 227 | 31,21,1 228 | 48,33,1 229 | 26,24,1 230 | 20,37,0 231 | 29,31,1 232 | 35,28,1 233 | 37,25,1 234 | 42,42,1 235 | 42,48,1 236 | 41,47,1 237 | 44,45,1 238 | 45,46,1 239 | 25,38,1 240 | 19,45,0 241 | 36,26,1 242 | 33,36,1 243 | 27,19,1 244 | 48,24,0 245 | 37,48,1 246 | 23,31,0 247 | 20,29,0 248 | 27,44,0 249 | 47,24,0 250 | 36,18,1 251 | 37,48,1 252 | 32,29,1 253 | 46,48,1 254 | 31,47,1 255 | 23,45,0 256 | 28,30,1 257 | 36,32,1 258 | 25,43,0 259 | 24,44,0 260 | 34,47,1 261 | 46,42,1 262 | 18,31,0 263 | 23,25,1 264 | 44,39,1 265 | 18,29,0 266 | 49,40,1 267 | 24,33,0 268 | 21,44,0 269 | 40,24,1 270 | 46,41,1 271 | 42,33,1 272 | 25,41,0 273 | 29,42,1 274 | 40,18,1 275 | 37,40,1 276 | 46,28,1 277 | 33,20,1 278 | 18,42,0 279 | 22,36,0 280 | 27,46,0 281 | 33,48,1 282 | 21,37,0 283 | 26,50,0 284 | 29,23,1 285 | 23,33,0 286 | 21,38,0 287 | 18,30,0 288 | 29,28,1 289 | 31,22,1 290 | 30,48,1 291 | 41,37,1 292 | 35,31,1 293 | 48,32,1 294 | 29,37,1 295 | 32,33,1 296 | 43,26,1 297 | 21,33,0 298 | 44,28,1 299 | 35,18,1 300 | 35,35,1 301 | 25,20,1 302 | 39,46,1 303 | 26,39,1 304 | 36,29,1 305 | 29,44,1 306 | 28,42,1 307 | 38,21,1 308 | 28,49,0 309 | 33,26,1 310 | 31,28,1 311 | 25,47,0 312 | 23,25,1 313 | 45,49,1 314 | 28,26,1 315 | 36,48,1 316 | 42,48,1 317 | 42,21,1 318 | 29,32,1 319 | 26,28,1 320 | 24,46,0 321 | 39,30,1 322 | 29,46,1 323 | 43,43,1 324 | 20,42,0 325 | 35,41,1 326 | 45,19,0 327 | 38,45,1 328 | 25,38,1 329 | 31,20,1 330 | 38,43,1 331 | 37,30,1 332 | 43,27,1 333 | 43,44,1 334 | 21,30,0 335 | 22,45,0 336 | 44,26,1 337 | 43,42,1 338 | 26,41,0 339 | 47,35,1 340 | 48,30,1 341 | 41,24,1 342 | 19,48,0 343 | 45,24,0 344 | 38,41,1 345 | 42,46,1 346 | 49,45,1 347 | 28,44,1 348 | 22,44,0 349 | 31,48,1 350 | 48,21,0 351 | 31,20,1 352 | 30,39,1 353 | 23,23,1 354 | 21,32,0 355 | 19,19,1 356 | 21,27,0 357 | 24,46,0 358 | 25,28,1 359 | 48,50,1 360 | 25,32,1 361 | 26,29,1 362 | 33,48,1 363 | 35,32,1 364 | 48,25,1 365 | 30,27,1 366 | 34,49,1 367 | 40,45,1 368 | 28,32,1 369 | 47,33,1 370 | 29,33,1 371 | 21,22,1 372 | 21,39,0 373 | 41,45,1 374 | 46,39,1 375 | 22,24,1 376 | 32,22,1 377 | 27,46,0 378 | 26,35,1 379 | 27,29,1 380 | 48,19,0 381 | 35,26,1 382 | 42,29,1 383 | 30,22,1 384 | 20,26,0 385 | 33,25,1 386 | 37,30,1 387 | 37,32,1 388 | 20,22,1 389 | 42,48,1 390 | 29,20,1 391 | 32,46,1 392 | 37,34,1 393 | 29,45,1 
394 | 19,44,0 395 | 49,18,0 396 | 28,25,1 397 | 48,31,1 398 | 35,46,1 399 | 34,26,1 400 | 38,26,1 401 | 36,31,1 402 | 31,30,1 403 | 27,19,1 404 | 44,38,1 405 | 19,37,0 406 | 43,49,1 407 | 19,42,0 408 | 32,24,1 409 | 46,43,1 410 | 43,46,1 411 | 33,32,1 412 | 23,35,0 413 | 26,34,1 414 | 48,20,0 415 | 45,38,1 416 | 30,30,1 417 | 28,23,1 418 | 43,36,1 419 | 19,37,0 420 | 39,45,1 421 | 20,30,0 422 | 28,30,1 423 | 19,42,0 424 | 41,21,1 425 | 42,31,1 426 | 47,45,1 427 | 42,48,1 428 | 40,22,1 429 | 28,20,1 430 | 22,31,0 431 | 28,24,1 432 | 18,33,0 433 | 42,47,1 434 | 35,18,1 435 | 32,28,1 436 | 45,39,1 437 | 46,45,1 438 | 41,43,1 439 | 24,37,0 440 | 34,30,1 441 | 40,22,1 442 | 38,20,1 443 | 43,28,1 444 | 21,26,0 445 | 35,27,1 446 | 33,37,1 447 | 48,39,1 448 | 47,40,1 449 | 31,32,1 450 | 18,32,0 451 | 31,20,1 452 | 30,49,1 453 | 22,46,0 454 | 36,39,1 455 | 30,35,1 456 | 49,50,1 457 | 46,39,1 458 | 45,44,1 459 | 34,40,1 460 | 27,28,1 461 | 27,35,1 462 | 46,46,1 463 | 26,42,0 464 | 27,18,1 465 | 23,38,0 466 | 30,30,1 467 | 34,32,1 468 | 48,27,1 469 | 31,23,1 470 | 29,47,0 471 | 47,31,1 472 | 35,19,1 473 | 30,28,1 474 | 33,44,1 475 | 36,37,1 476 | 34,44,1 477 | 42,43,1 478 | 36,29,1 479 | 35,46,1 480 | 22,36,0 481 | 39,47,1 482 | 23,23,1 483 | 47,20,0 484 | 38,22,1 485 | 21,33,0 486 | 37,41,1 487 | 18,18,1 488 | 35,34,1 489 | 49,49,1 490 | 33,32,1 491 | 31,19,1 492 | 31,26,1 493 | 45,31,1 494 | 41,44,1 495 | 27,47,0 496 | 28,26,1 497 | 18,47,0 498 | 37,18,1 499 | 20,42,0 500 | 36,45,1 501 | -------------------------------------------------------------------------------- /第09章 高阶分类 核方法与SVM/facebook.py: -------------------------------------------------------------------------------- 1 | import urllib,md5,webbrowser,time 2 | from xml.dom.minidom import parseString 3 | 4 | apikey="47e953c8ea9ed30db904af453125c759" 5 | secret="ea703e4721e8c7bf88b92110a46a9b06" 6 | FacebookURL = "https://api.facebook.com/restserver.php" 7 | 8 | def getsinglevalue(node,tag): 9 | nl=node.getElementsByTagName(tag) 10 | if len(nl)>0: 11 | tagNode=nl[0] 12 | if tagNode.hasChildNodes(): 13 | return tagNode.firstChild.nodeValue 14 | return '' 15 | 16 | def callid(): 17 | return str(int(time.time()*10)) 18 | 19 | class fbsession: 20 | def __init__(self): 21 | self.session_secret=None 22 | self.session_key=None 23 | self.createtoken() 24 | webbrowser.open(self.getlogin()) 25 | print "Press enter after logging in:", 26 | raw_input() 27 | self.getsession() 28 | def sendrequest(self, args): 29 | args['api_key'] = apikey 30 | args['sig'] = self.makehash(args) 31 | post_data = urllib.urlencode(args) 32 | url = FacebookURL + "?" 
+ post_data 33 | data=urllib.urlopen(url).read() 34 | print data 35 | return parseString(data) 36 | def makehash(self,args): 37 | hasher = md5.new(''.join([x + '=' + args[x] for x in sorted(args.keys())])) 38 | if self.session_secret: hasher.update(self.session_secret) 39 | else: hasher.update(secret) 40 | return hasher.hexdigest() 41 | def createtoken(self): 42 | res = self.sendrequest({'method':"facebook.auth.createToken"}) 43 | self.token = getsinglevalue(res,'token') 44 | def getlogin(self): 45 | return "http://api.facebook.com/login.php?api_key="+apikey+\ 46 | "&auth_token=" + self.token 47 | def getsession(self): 48 | doc=self.sendrequest({'method':'facebook.auth.getSession', 49 | 'auth_token':self.token}) 50 | self.session_key=getsinglevalue(doc,'session_key') 51 | self.session_secret=getsinglevalue(doc,'secret') 52 | def getfriends(self): 53 | doc=self.sendrequest({'method':'facebook.friends.get', 54 | 'session_key':self.session_key,'call_id':callid()}) 55 | results=[] 56 | for n in doc.getElementsByTagName('result_elt'): 57 | results.append(n.firstChild.nodeValue) 58 | return results 59 | 60 | def getinfo(self,users): 61 | ulist=','.join(users) 62 | 63 | fields='gender,current_location,relationship_status,'+\ 64 | 'affiliations,hometown_location' 65 | 66 | doc=self.sendrequest({'method':'facebook.users.getInfo', 67 | 'session_key':self.session_key,'call_id':callid(), 68 | 'users':ulist,'fields':fields}) 69 | 70 | results={} 71 | for n,id in zip(doc.getElementsByTagName('result_elt'),users): 72 | # Get the location 73 | locnode=n.getElementsByTagName('hometown_location')[0] 74 | loc=getsinglevalue(locnode,'city')+', '+getsinglevalue(locnode,'state') 75 | 76 | # Get school 77 | college='' 78 | gradyear='0' 79 | affiliations=n.getElementsByTagName('affiliations_elt') 80 | for aff in affiliations: 81 | # Type 1 is college 82 | if getsinglevalue(aff,'type')=='1': 83 | college=getsinglevalue(aff,'name') 84 | gradyear=getsinglevalue(aff,'year') 85 | 86 | results[id]={'gender':getsinglevalue(n,'gender'), 87 | 'status':getsinglevalue(n,'relationship_status'), 88 | 'location':loc,'college':college,'year':gradyear} 89 | return results 90 | 91 | def arefriends(self,idlist1,idlist2): 92 | id1=','.join(idlist1) 93 | id2=','.join(idlist2) 94 | doc=self.sendrequest({'method':'facebook.friends.areFriends', 95 | 'session_key':self.session_key,'call_id':callid(), 96 | 'id1':id1,'id2':id2}) 97 | results=[] 98 | for n in doc.getElementsByTagName('result_elt'): 99 | results.append(int(n.firstChild.nodeValue)) 100 | return results 101 | 102 | 103 | 104 | def makedataset(self): 105 | from advancedclassify import milesdistance 106 | # Get all the info for all my friends 107 | friends=self.getfriends() 108 | info=self.getinfo(friends) 109 | ids1,ids2=[],[] 110 | rows=[] 111 | 112 | # Nested loop to look at every pair of friends 113 | for i in range(len(friends)): 114 | f1=friends[i] 115 | data1=info[f1] 116 | 117 | # Start at i+1 so we don't double up 118 | for j in range(i+1,len(friends)): 119 | f2=friends[j] 120 | data2=info[f2] 121 | ids1.append(f1) 122 | ids2.append(f2) 123 | 124 | # Generate some numbers from the data 125 | if data1['college']==data2['college']: sameschool=1 126 | else: sameschool=0 127 | male1=(data1['gender']=='Male') and 1 or 0 128 | male2=(data2['gender']=='Male') and 1 or 0 129 | 130 | row=[male1,int(data1['year']),male2,int(data2['year']),sameschool] 131 | rows.append(row) 132 | # Call arefriends in blocks for every pair of people 133 | arefriends=[] 134 | for i in 
range(0,len(ids1),30): 135 | j=min(i+30,len(ids1)) 136 | pa=self.arefriends(ids1[i:j],ids2[i:j]) 137 | arefriends+=pa 138 | return arefriends,rows 139 | 140 | -------------------------------------------------------------------------------- /第09章 高阶分类 核方法与SVM/svm.py: -------------------------------------------------------------------------------- 1 | import svmc 2 | from svmc import C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR 3 | from svmc import LINEAR, POLY, RBF, SIGMOID 4 | from math import exp, fabs 5 | 6 | def _int_array(seq): 7 | size = len(seq) 8 | array = svmc.new_int(size) 9 | i = 0 10 | for item in seq: 11 | svmc.int_setitem(array,i,item) 12 | i = i + 1 13 | return array 14 | 15 | def _double_array(seq): 16 | size = len(seq) 17 | array = svmc.new_double(size) 18 | i = 0 19 | for item in seq: 20 | svmc.double_setitem(array,i,item) 21 | i = i + 1 22 | return array 23 | 24 | def _free_int_array(x): 25 | if x != 'NULL' and x != None: 26 | svmc.delete_int(x) 27 | 28 | def _free_double_array(x): 29 | if x != 'NULL' and x != None: 30 | svmc.delete_double(x) 31 | 32 | def _int_array_to_list(x,n): 33 | return map(svmc.int_getitem,[x]*n,range(n)) 34 | 35 | def _double_array_to_list(x,n): 36 | return map(svmc.double_getitem,[x]*n,range(n)) 37 | 38 | class svm_parameter: 39 | 40 | # default values 41 | default_parameters = { 42 | 'svm_type' : C_SVC, 43 | 'kernel_type' : RBF, 44 | 'degree' : 3, 45 | 'gamma' : 0, # 1/k 46 | 'coef0' : 0, 47 | 'nu' : 0.5, 48 | 'cache_size' : 40, 49 | 'C' : 1, 50 | 'eps' : 1e-3, 51 | 'p' : 0.1, 52 | 'shrinking' : 1, 53 | 'nr_weight' : 0, 54 | 'weight_label' : [], 55 | 'weight' : [], 56 | 'probability' : 0 57 | } 58 | 59 | def __init__(self,**kw): 60 | self.__dict__['param'] = svmc.new_svm_parameter() 61 | for attr,val in self.default_parameters.items(): 62 | setattr(self,attr,val) 63 | for attr,val in kw.items(): 64 | setattr(self,attr,val) 65 | 66 | def __getattr__(self,attr): 67 | get_func = getattr(svmc,'svm_parameter_%s_get' % (attr)) 68 | return get_func(self.param) 69 | 70 | def __setattr__(self,attr,val): 71 | 72 | if attr == 'weight_label': 73 | self.__dict__['weight_label_len'] = len(val) 74 | val = _int_array(val) 75 | _free_int_array(self.weight_label) 76 | elif attr == 'weight': 77 | self.__dict__['weight_len'] = len(val) 78 | val = _double_array(val) 79 | _free_double_array(self.weight) 80 | 81 | set_func = getattr(svmc,'svm_parameter_%s_set' % (attr)) 82 | set_func(self.param,val) 83 | 84 | def __repr__(self): 85 | ret = '' 96 | 97 | def __del__(self): 98 | _free_int_array(self.weight_label) 99 | _free_double_array(self.weight) 100 | svmc.delete_svm_parameter(self.param) 101 | 102 | def _convert_to_svm_node_array(x): 103 | """ convert a sequence or mapping to an svm_node array """ 104 | import operator 105 | 106 | # Find non zero elements 107 | iter_range = [] 108 | if type(x) == dict: 109 | for k, v in x.iteritems(): 110 | # all zeros kept due to the precomputed kernel; no good solution yet 111 | # if v != 0: 112 | iter_range.append( k ) 113 | elif operator.isSequenceType(x): 114 | for j in range(len(x)): 115 | # if x[j] != 0: 116 | iter_range.append( j ) 117 | else: 118 | raise TypeError,"data must be a mapping or a sequence" 119 | 120 | iter_range.sort() 121 | data = svmc.svm_node_array(len(iter_range)+1) 122 | svmc.svm_node_array_set(data,len(iter_range),-1,0) 123 | 124 | j = 0 125 | for k in iter_range: 126 | svmc.svm_node_array_set(data,j,k,x[k]) 127 | j = j + 1 128 | return data 129 | 130 | class svm_problem: 131 | def __init__(self,y,x): 
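# y is a sequence of labels/target values; x is a parallel sequence of samples,
# each given as a dict or a plain sequence of feature values (converted to
# svm_node arrays by _convert_to_svm_node_array above).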
132 | assert len(y) == len(x) 133 | self.prob = prob = svmc.new_svm_problem() 134 | self.size = size = len(y) 135 | 136 | self.y_array = y_array = svmc.new_double(size) 137 | for i in range(size): 138 | svmc.double_setitem(y_array,i,y[i]) 139 | 140 | self.x_matrix = x_matrix = svmc.svm_node_matrix(size) 141 | self.data = [] 142 | self.maxlen = 0; 143 | for i in range(size): 144 | data = _convert_to_svm_node_array(x[i]) 145 | self.data.append(data); 146 | svmc.svm_node_matrix_set(x_matrix,i,data) 147 | if type(x[i]) == dict: 148 | if (len(x[i]) > 0): 149 | self.maxlen = max(self.maxlen,max(x[i].keys())) 150 | else: 151 | self.maxlen = max(self.maxlen,len(x[i])) 152 | 153 | svmc.svm_problem_l_set(prob,size) 154 | svmc.svm_problem_y_set(prob,y_array) 155 | svmc.svm_problem_x_set(prob,x_matrix) 156 | 157 | def __repr__(self): 158 | return "" % (self.size) 159 | 160 | def __del__(self): 161 | svmc.delete_svm_problem(self.prob) 162 | svmc.delete_double(self.y_array) 163 | for i in range(self.size): 164 | svmc.svm_node_array_destroy(self.data[i]) 165 | svmc.svm_node_matrix_destroy(self.x_matrix) 166 | 167 | class svm_model: 168 | def __init__(self,arg1,arg2=None): 169 | if arg2 == None: 170 | # create model from file 171 | filename = arg1 172 | self.model = svmc.svm_load_model(filename) 173 | else: 174 | # create model from problem and parameter 175 | prob,param = arg1,arg2 176 | self.prob = prob 177 | if param.gamma == 0: 178 | param.gamma = 1.0/prob.maxlen 179 | msg = svmc.svm_check_parameter(prob.prob,param.param) 180 | if msg: raise ValueError, msg 181 | self.model = svmc.svm_train(prob.prob,param.param) 182 | 183 | #setup some classwide variables 184 | self.nr_class = svmc.svm_get_nr_class(self.model) 185 | self.svm_type = svmc.svm_get_svm_type(self.model) 186 | #create labels(classes) 187 | intarr = svmc.new_int(self.nr_class) 188 | svmc.svm_get_labels(self.model,intarr) 189 | self.labels = _int_array_to_list(intarr, self.nr_class) 190 | svmc.delete_int(intarr) 191 | #check if valid probability model 192 | self.probability = svmc.svm_check_probability_model(self.model) 193 | 194 | def predict(self,x): 195 | data = _convert_to_svm_node_array(x) 196 | ret = svmc.svm_predict(self.model,data) 197 | svmc.svm_node_array_destroy(data) 198 | return ret 199 | 200 | 201 | def get_nr_class(self): 202 | return self.nr_class 203 | 204 | def get_labels(self): 205 | if self.svm_type == NU_SVR or self.svm_type == EPSILON_SVR or self.svm_type == ONE_CLASS: 206 | raise TypeError, "Unable to get label from a SVR/ONE_CLASS model" 207 | return self.labels 208 | 209 | def predict_values_raw(self,x): 210 | #convert x into svm_node, allocate a double array for return 211 | n = self.nr_class*(self.nr_class-1)//2 212 | data = _convert_to_svm_node_array(x) 213 | dblarr = svmc.new_double(n) 214 | svmc.svm_predict_values(self.model, data, dblarr) 215 | ret = _double_array_to_list(dblarr, n) 216 | svmc.delete_double(dblarr) 217 | svmc.svm_node_array_destroy(data) 218 | return ret 219 | 220 | def predict_values(self,x): 221 | v=self.predict_values_raw(x) 222 | if self.svm_type == NU_SVR or self.svm_type == EPSILON_SVR or self.svm_type == ONE_CLASS: 223 | return v[0] 224 | else: #self.svm_type == C_SVC or self.svm_type == NU_SVC 225 | count = 0 226 | d = {} 227 | for i in range(len(self.labels)): 228 | for j in range(i+1, len(self.labels)): 229 | d[self.labels[i],self.labels[j]] = v[count] 230 | d[self.labels[j],self.labels[i]] = -v[count] 231 | count += 1 232 | return d 233 | 234 | def predict_probability(self,x): 235 | 
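# Returns a (predicted_label, {label: probability}) pair; only valid for a
# classification model trained with probability estimates enabled (probability=1).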
#c code will do nothing on wrong type, so we have to check ourself 236 | if self.svm_type == NU_SVR or self.svm_type == EPSILON_SVR: 237 | raise TypeError, "call get_svr_probability or get_svr_pdf for probability output of regression" 238 | elif self.svm_type == ONE_CLASS: 239 | raise TypeError, "probability not supported yet for one-class problem" 240 | #only C_SVC,NU_SVC goes in 241 | if not self.probability: 242 | raise TypeError, "model does not support probabiliy estimates" 243 | 244 | #convert x into svm_node, alloc a double array to receive probabilities 245 | data = _convert_to_svm_node_array(x) 246 | dblarr = svmc.new_double(self.nr_class) 247 | pred = svmc.svm_predict_probability(self.model, data, dblarr) 248 | pv = _double_array_to_list(dblarr, self.nr_class) 249 | svmc.delete_double(dblarr) 250 | svmc.svm_node_array_destroy(data) 251 | p = {} 252 | for i in range(len(self.labels)): 253 | p[self.labels[i]] = pv[i] 254 | return pred, p 255 | 256 | def get_svr_probability(self): 257 | #leave the Error checking to svm.cpp code 258 | ret = svmc.svm_get_svr_probability(self.model) 259 | if ret == 0: 260 | raise TypeError, "not a regression model or probability information not available" 261 | return ret 262 | 263 | def get_svr_pdf(self): 264 | #get_svr_probability will handle error checking 265 | sigma = self.get_svr_probability() 266 | return lambda z: exp(-fabs(z)/sigma)/(2*sigma) 267 | 268 | 269 | def save(self,filename): 270 | svmc.svm_save_model(filename,self.model) 271 | 272 | def __del__(self): 273 | svmc.svm_destroy_model(self.model) 274 | 275 | 276 | def cross_validation(prob, param, fold): 277 | if param.gamma == 0: 278 | param.gamma = 1.0/prob.maxlen 279 | dblarr = svmc.new_double(prob.size) 280 | svmc.svm_cross_validation(prob.prob, param.param, fold, dblarr) 281 | ret = _double_array_to_list(dblarr, prob.size) 282 | svmc.delete_double(dblarr) 283 | return ret 284 | -------------------------------------------------------------------------------- /第09章 高阶分类 核方法与SVM/svm.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第09章 高阶分类 核方法与SVM/svm.pyc -------------------------------------------------------------------------------- /第09章 高阶分类 核方法与SVM/svmc.pyd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第09章 高阶分类 核方法与SVM/svmc.pyd -------------------------------------------------------------------------------- /第10章 寻找独立特征/Thumbs.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第10章 寻找独立特征/Thumbs.db -------------------------------------------------------------------------------- /第10章 寻找独立特征/articles.txt: -------------------------------------------------------------------------------- 1 | Obesity not a problem 2 | 0.689921777771 ['food', 'calories', 'than', 'easy', 'high', 'come'] 3 | 0.616521773806 ['with', 'your', 'weight', 'have', 'control', 'about'] 4 | 0.594775751071 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 5 | 6 | Fitness equipment 7 | 0.336438029037 ['with', 'your', 'weight', 'have', 'control', 'about'] 8 | 0.0336830699618 ['cheese', 'black', 'salad', 
'coffee', 'broccoli', 'tomato'] 9 | 0.0323861213375 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 10 | 11 | 1000 Atkins Recipes E-Book 12 | 2.056067447 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 13 | 1.81222264198 ['with', 'your', 'weight', 'have', 'control', 'about'] 14 | 0.31319239108 ['quot', 'they', 'money', 'want', 'very', 'best'] 15 | 16 | saturday 17 | 7.46811621754 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 18 | 0.174282882652 ['food', 'calories', 'than', 'easy', 'high', 'come'] 19 | 0.00317828003493 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 20 | 21 | Food & Exercise -- 10/13/2007 22 | 4.73555293191 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 23 | 0.937525542474 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 24 | 0.10240571114 ['with', 'your', 'weight', 'have', 'control', 'about'] 25 | 26 | Food & exercise -- 10/12/2007 (yesterday) 27 | 3.97594760235 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 28 | 1.01018312908 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 29 | 0.0198736807467 ['food', 'calories', 'than', 'easy', 'high', 'come'] 30 | 31 | Have you been enslaved and confused by the omniscience myth? 32 | 1.14697423243 ['with', 'your', 'weight', 'have', 'control', 'about'] 33 | 0.548717665826 ['food', 'calories', 'than', 'easy', 'high', 'come'] 34 | 0.159098480615 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 35 | 36 | High or low fat food? Easy trick for figuring it out 37 | 9.98464450123 ['food', 'calories', 'than', 'easy', 'high', 'come'] 38 | 4.04959173123 ['quot', 'they', 'money', 'want', 'very', 'best'] 39 | 0.123588233146 ['fats', 'quot', 'this', 'good', 'about', 'like'] 40 | 41 | Absolutely Free People Search 42 | 1.39249472006 ['with', 'your', 'weight', 'have', 'control', 'about'] 43 | 0.459779859548 ['fats', 'quot', 'this', 'good', 'about', 'like'] 44 | 0.4371224863 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 45 | 46 | Friday 47 | 4.35052263015 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 48 | 0.609863992308 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 49 | 0.249760019695 ['food', 'calories', 'than', 'easy', 'high', 'come'] 50 | 51 | Food and Workout Log 10.11.07 52 | 4.76278425737 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 53 | 1.18573164731 ['food', 'calories', 'than', 'easy', 'high', 'come'] 54 | 0.559740845941 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 55 | 56 | LIL / Biggie's October Bulletin - UK 57 | 1.6407957576 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 58 | 1.09371385364 ['with', 'your', 'weight', 'have', 'control', 'about'] 59 | 0.0997065733116 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 60 | 61 | How accurate are the calorie meters on gym equipment? 
62 | 1.68385026718 ['with', 'your', 'weight', 'have', 'control', 'about'] 63 | 1.24336224612 ['food', 'calories', 'than', 'easy', 'high', 'come'] 64 | 0.472039508303 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 65 | 66 | diet-exercise thursday 67 | 5.62839188358 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 68 | 1.42876311885 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 69 | 0.451891791988 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 70 | 71 | Fast Food Meat 72 | 3.96657604228 ['quot', 'they', 'money', 'want', 'very', 'best'] 73 | 1.56912835469 ['with', 'your', 'weight', 'have', 'control', 'about'] 74 | 0.945562729964 ['food', 'calories', 'than', 'easy', 'high', 'come'] 75 | 76 | Food & Exercise -- 10/11/2007 77 | 2.06565313343 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 78 | 0.915925841734 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 79 | 0.852089104271 ['food', 'calories', 'than', 'easy', 'high', 'come'] 80 | 81 | sleepy food/fitness thursday 82 | 5.29370213306 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 83 | 0.821758436298 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 84 | 0.373361576129 ['fats', 'quot', 'this', 'good', 'about', 'like'] 85 | 86 | 6 Dollars! You Can't Lose!! AS SEEN ON OPRAH & 20/20 87 | 3.50452080121 ['quot', 'they', 'money', 'want', 'very', 'best'] 88 | 1.48000806252 ['with', 'your', 'weight', 'have', 'control', 'about'] 89 | 0.353120143386 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 90 | 91 | Looking for mediterranean buffet restaurants in Toronto, Canada 92 | 0.766709189825 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 93 | 0.488536397538 ['with', 'your', 'weight', 'have', 'control', 'about'] 94 | 0.305836578699 ['quot', 'they', 'money', 'want', 'very', 'best'] 95 | 96 | Food and Workout Log 10.10.07 97 | 5.10395750879 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 98 | 0.931990921746 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 99 | 0.0751093197335 ['with', 'your', 'weight', 'have', 'control', 'about'] 100 | 101 | Food and Workout Log 10.9.07 102 | 3.66128126402 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 103 | 0.924777033606 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 104 | 0.46368820747 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 105 | 106 | Food & Exercise -- 10/10/2007 107 | 2.09636617791 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 108 | 0.777930860455 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 109 | 0.234412590473 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 110 | 111 | rainy diet/exercise 112 | 2.42408643655 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 113 | 1.79759287175 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 114 | 1.44383382428 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 115 | 116 | Whatever happened to Kaleb? 
117 | 1.62771768736 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 118 | 0.424792812054 ['with', 'your', 'weight', 'have', 'control', 'about'] 119 | 3.00050522053e-008 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 120 | 121 | Food & Exercise -- 10/9/2007 122 | 2.67051008267 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 123 | 2.25685573791 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 124 | 0.962829471038 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 125 | 126 | Way of getting my veggies 127 | 2.51102412355 ['quot', 'they', 'money', 'want', 'very', 'best'] 128 | 1.82193456941 ['with', 'your', 'weight', 'have', 'control', 'about'] 129 | 1.20974377068 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 130 | 131 | Food & Exercise -- 10/8/2007 (yesterday) 132 | 3.57035878288 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 133 | 0.581070745119 ['with', 'your', 'weight', 'have', 'control', 'about'] 134 | 0.151621405217 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 135 | 136 | Oatmeal, cereal of choice. 137 | 3.41252863148 ['food', 'calories', 'than', 'easy', 'high', 'come'] 138 | 0.482857491594 ['with', 'your', 'weight', 'have', 'control', 'about'] 139 | 0.21056938621 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 140 | 141 | Whatever happened to Dally? 142 | 0.38943120274 ['with', 'your', 'weight', 'have', 'control', 'about'] 143 | 0.38027115946 ['quot', 'they', 'money', 'want', 'very', 'best'] 144 | 0.176713051522 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 145 | 146 | More about the Chicago marathon 147 | 6.1620884463 ['quot', 'they', 'money', 'want', 'very', 'best'] 148 | 0.268050785403 ['with', 'your', 'weight', 'have', 'control', 'about'] 149 | 0.0210462038578 ['fats', 'quot', 'this', 'good', 'about', 'like'] 150 | 151 | Food and Workout Log 10.8.07 152 | 3.19119866786 ['food', 'calories', 'than', 'easy', 'high', 'come'] 153 | 2.68113794132 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 154 | 1.31607800222 ['fats', 'quot', 'this', 'good', 'about', 'like'] 155 | 156 | diet/exercise 10/8 157 | 4.35583316205 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 158 | 1.56546955704 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 159 | 1.25839277593 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 160 | 161 | I did'nt diet to get in shape for Trinidad's Carnival..... 162 | 5.9231935598 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 163 | 0.567204076047 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 164 | 0.0169687217709 ['with', 'your', 'weight', 'have', 'control', 'about'] 165 | 166 | I got in shape and took part in Trinidad Carnival! 
167 | 1.02074036539 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 168 | 0.930265487859 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 169 | 0.266432773175 ['fats', 'quot', 'this', 'good', 'about', 'like'] 170 | 171 | THE ULTIMATE FAT-BURNING DAY 172 | 1.72598890408 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 173 | 0.528141703291 ['with', 'your', 'weight', 'have', 'control', 'about'] 174 | 0.390073319858 ['fats', 'quot', 'this', 'good', 'about', 'like'] 175 | 176 | Control ur Weight 177 | 6.78756986407 ['with', 'your', 'weight', 'have', 'control', 'about'] 178 | 0.000529137198612 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 179 | 0.00038074933869 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 180 | 181 | BrainStimPro Binaural Brainwave Generator 182 | 0.533630276909 ['with', 'your', 'weight', 'have', 'control', 'about'] 183 | 0.37841909077 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 184 | 0.116016288049 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 185 | 186 | Food & Exercise -- 10/7/2007 187 | 7.73926153154 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 188 | 0.470298707782 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 189 | 0.233105196286 ['food', 'calories', 'than', 'easy', 'high', 'come'] 190 | 191 | food/exercise Friday 10/7 192 | 4.69100441998 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 193 | 1.64398092185 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 194 | 0.689120996726 ['food', 'calories', 'than', 'easy', 'high', 'come'] 195 | 196 | Should we ban marathons? 197 | 2.44173145283 ['quot', 'they', 'money', 'want', 'very', 'best'] 198 | 1.81373140989 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 199 | 1.3775418859 ['fats', 'quot', 'this', 'good', 'about', 'like'] 200 | 201 | Abstinence 3 (8 October to 24 October) 202 | 0.969974503706 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 203 | 0.912311154908 ['food', 'calories', 'than', 'easy', 'high', 'come'] 204 | 0.371633089984 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 205 | 206 | Food & Exercise -- 10/6/2007 (yesterday) 207 | 2.63514100937 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 208 | 1.80605150884 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 209 | 0.426403502815 ['food', 'calories', 'than', 'easy', 'high', 'come'] 210 | 211 | Food and Workout Log 10.5.07 212 | 2.03340244602 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 213 | 0.321040122788 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 214 | 0.286990704435 ['food', 'calories', 'than', 'easy', 'high', 'come'] 215 | 216 | Food and Workout Log 10.4.07 217 | 2.32606586074 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 218 | 2.23872352546 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 219 | 0.991619356436 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 220 | 221 | Exercise 222 | 1.82773959677 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 223 | 1.05124941331 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 224 | 0.420038570854 ['with', 'your', 'weight', 'have', 'control', 'about'] 225 | 226 | food/exercise Friday 10/5 227 | 5.3332773133 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 228 | 1.381638768 ['food', 'calories', 'than', 'easy', 'high', 'come'] 229 | 0.590183487282 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 230 | 231 | Food & Exercise -- 10/5/2007 232 | 5.22083940456 ['food', 'home', 'then', 
'exercise', 'morning', 'went'] 233 | 0.29336324721 ['fats', 'quot', 'this', 'good', 'about', 'like'] 234 | 0.279839860069 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 235 | 236 | Fitness and wellness is here to help you, about fittnes, nutrition, health, everything is here... 237 | 1.47212290909 ['with', 'your', 'weight', 'have', 'control', 'about'] 238 | 0.581092551305 ['fats', 'quot', 'this', 'good', 'about', 'like'] 239 | 0.22366446507 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 240 | 241 | Live healthy and disease free 242 | 1.33925118974 ['fats', 'quot', 'this', 'good', 'about', 'like'] 243 | 0.735241239185 ['with', 'your', 'weight', 'have', 'control', 'about'] 244 | 4.48445780778e-005 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 245 | 246 | maintain ur diet dailyu 247 | 1.37867872087 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 248 | 7.75955547582e-008 ['with', 'your', 'weight', 'have', 'control', 'about'] 249 | 8.66186281206e-016 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 250 | 251 | Food & Exercise -- 10/4/2007 252 | 5.16310413391 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 253 | 0.108950865658 ['food', 'calories', 'than', 'easy', 'high', 'come'] 254 | 0.103411657525 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 255 | 256 | diet/exercise 10/4 257 | 5.94642162786 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 258 | 1.15981737715 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 259 | 0.0648977104196 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 260 | 261 | sad to say 262 | 2.0658103969 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 263 | 1.0211752756 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 264 | 0.606678422181 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 265 | 266 | Food and Workout Log 10.3.07 267 | 5.48488799917 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 268 | 1.04086527858 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 269 | 0.86250634261 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 270 | 271 | Walking As Exercise 272 | 2.49352373509 ['quot', 'they', 'money', 'want', 'very', 'best'] 273 | 2.18075570265 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 274 | 0.613516861795 ['with', 'your', 'weight', 'have', 'control', 'about'] 275 | 276 | food/exercise wednesday 10/3 277 | 5.07554844226 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 278 | 1.23323613477 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 279 | 0.351030687614 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 280 | 281 | How much proteins leak into water when cooking vegetables ? 
282 | 2.93064858147 ['with', 'your', 'weight', 'have', 'control', 'about'] 283 | 1.18774119665 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 284 | 0.00439761415131 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 285 | 286 | Food & Exercise -- 10/3/2007 287 | 3.50183444986 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 288 | 2.21222380937 ['quot', 'they', 'money', 'want', 'very', 'best'] 289 | 1.37194239805 ['fats', 'quot', 'this', 'good', 'about', 'like'] 290 | 291 | The truth about exercising and your body as a whole 292 | 3.21386971879 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 293 | 3.15819744924 ['with', 'your', 'weight', 'have', 'control', 'about'] 294 | 0.270507407515 ['fats', 'quot', 'this', 'good', 'about', 'like'] 295 | 296 | Got the new Elliptical 297 | 1.14114086054 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 298 | 1.10131650413 ['food', 'calories', 'than', 'easy', 'high', 'come'] 299 | 0.477565648015 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 300 | 301 | Fingerstick cholesterol tests accurate? 302 | 2.14611673458 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 303 | 1.26970417226 ['fats', 'quot', 'this', 'good', 'about', 'like'] 304 | 0.750416051713 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 305 | 306 | Weight Loss Tips 307 | 5.21079777525 ['with', 'your', 'weight', 'have', 'control', 'about'] 308 | 1.59092846403 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 309 | 0.00134310805496 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 310 | 311 | Obesity Driving Rising U.S. Health Costs 312 | 1.25615188819 ['with', 'your', 'weight', 'have', 'control', 'about'] 313 | 0.712235310825 ['food', 'calories', 'than', 'easy', 'high', 'come'] 314 | 0.238899338741 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 315 | 316 | Flu-Busting Chicken Soup 317 | 5.54567450388 ['with', 'your', 'weight', 'have', 'control', 'about'] 318 | 1.21893998075 ['food', 'calories', 'than', 'easy', 'high', 'come'] 319 | 0.451175568656 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 320 | 321 | Re: My Healing Story 322 | 1.96323375291 ['with', 'your', 'weight', 'have', 'control', 'about'] 323 | 0.491756558034 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 324 | 0.477646079039 ['fats', 'quot', 'this', 'good', 'about', 'like'] 325 | 326 | Food and Workout Log 10.2.07 327 | 2.60282810129 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 328 | 1.20884355476 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 329 | 0.950088631141 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 330 | 331 | food/exercise Tuesday 10/2 332 | 2.64289395866 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 333 | 0.651445733893 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 334 | 0.371530809297 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 335 | 336 | Food & Exercise -- 10/2/2007 337 | 4.03395454429 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 338 | 0.733387053358 ['food', 'calories', 'than', 'easy', 'high', 'come'] 339 | 0.335622901915 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 340 | 341 | Diet Recommendations following stoppage of activity 342 | 2.76689639493 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 343 | 2.66796965036 ['quot', 'they', 'money', 'want', 'very', 'best'] 344 | 0.00214949067482 ['fats', 'quot', 'this', 'good', 'about', 'like'] 345 | 346 | why I'm succeeding, finally, with my fitness 347 | 
3.81276353396 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 348 | 2.28727363664 ['with', 'your', 'weight', 'have', 'control', 'about'] 349 | 0.0084896973916 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 350 | 351 | food/exercise Monday 10/1 352 | 6.52183126318 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 353 | 1.04845803053 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 354 | 0.220817568443 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 355 | 356 | Food & Exercise -- 10/1/2007 357 | 3.41693152333 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 358 | 1.43659575232 ['with', 'your', 'weight', 'have', 'control', 'about'] 359 | 0.0107024339333 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 360 | 361 | Good fats bad fats 362 | 14.9233786406 ['fats', 'quot', 'this', 'good', 'about', 'like'] 363 | 0.12157320235 ['quot', 'they', 'money', 'want', 'very', 'best'] 364 | 0.000388079511473 ['food', 'calories', 'than', 'easy', 'high', 'come'] 365 | 366 | milk products 367 | 2.91179410526 ['quot', 'they', 'money', 'want', 'very', 'best'] 368 | 1.35981372517 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 369 | 0.894968443359 ['with', 'your', 'weight', 'have', 'control', 'about'] 370 | 371 | Food and Workout Log 10.1.07 372 | 4.86163252456 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 373 | 3.04379043965 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 374 | 0.288092400057 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 375 | 376 | < 1g, etc. 377 | 5.96451663382 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 378 | 0.757014711498 ['food', 'calories', 'than', 'easy', 'high', 'come'] 379 | 0.0106873617525 ['with', 'your', 'weight', 'have', 'control', 'about'] 380 | 381 | peanut butter 382 | 2.79640093073 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 383 | 0.974232666377 ['fats', 'quot', 'this', 'good', 'about', 'like'] 384 | 0.20210540615 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 385 | 386 | food/exercise Sunday 9/30 387 | 3.13525705865 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 388 | 1.77396028396 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 389 | 0.84994166551 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 390 | 391 | Food and Workout Log 9.30.07 392 | 4.16473442796 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 393 | 1.1093666032 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 394 | 0.0787803510605 ['food', 'calories', 'than', 'easy', 'high', 'come'] 395 | 396 | Food & Exercise -- 9/30/2007 397 | 4.50185209238 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 398 | 1.04931983732 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 399 | 0.691962870134 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 400 | 401 | Food and Workout Log 9.29.07 402 | 4.84939396321 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 403 | 2.39617281343 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 404 | 0.816992749371 ['with', 'your', 'weight', 'have', 'control', 'about'] 405 | 406 | Food and Workout Log 9.28.07 407 | 4.67171965065 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 408 | 0.282691470562 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 409 | 0.200338717705 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 410 | 411 | LOUIE + LINESMAKER = $$$$ 412 | 5.58276496802 ['quot', 'they', 'money', 'want', 'very', 'best'] 413 | 
0.342556596947 ['with', 'your', 'weight', 'have', 'control', 'about'] 414 | 0.179902643439 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 415 | 416 | Food & Exercise -- 9/29/2007 (yesterday) 417 | 2.58541666839 ['quot', 'they', 'money', 'want', 'very', 'best'] 418 | 2.09030954926 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 419 | 0.852396204369 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 420 | 421 | ASDLC has changed 422 | 0.112818164838 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 423 | 0.102149673333 ['with', 'your', 'weight', 'have', 'control', 'about'] 424 | 0.0271390834142 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 425 | 426 | diet/exercise Saturday 9/29 427 | 2.11947826799 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 428 | 1.44172267631 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 429 | 0.589173976223 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 430 | 431 | Sensible Diet & Exercise 432 | 5.04673654071 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 433 | 3.61357653903 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 434 | 1.40785970283 ['with', 'your', 'weight', 'have', 'control', 'about'] 435 | 436 | Abstinence 2.- (20 September to 7 October ) 437 | 0.968219095052 ['food', 'calories', 'than', 'easy', 'high', 'come'] 438 | 0.949878615175 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 439 | 0.46422787912 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 440 | 441 | Evolution and Weight 442 | 2.20784229869 ['quot', 'they', 'money', 'want', 'very', 'best'] 443 | 1.18857451599 ['fats', 'quot', 'this', 'good', 'about', 'like'] 444 | 1.00060668406 ['with', 'your', 'weight', 'have', 'control', 'about'] 445 | 446 | Food & Exercise -- 9/28/2007 (yesterday) 447 | 4.75585045074 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 448 | 0.840897380766 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 449 | 0.0455304461165 ['quot', 'they', 'money', 'want', 'very', 'best'] 450 | 451 | The Abs Diet by David Zinczenko 452 | 6.58003120192 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 453 | 0.00218473474643 ['fats', 'quot', 'this', 'good', 'about', 'like'] 454 | 0.00140268366152 ['food', 'calories', 'than', 'easy', 'high', 'come'] 455 | 456 | Re: ABC News Nightline: Carbohydrates Make You Fat, and Perhaps Sick 457 | 2.50629631142 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 458 | 1.32614477527 ['with', 'your', 'weight', 'have', 'control', 'about'] 459 | 0.194215052468 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 460 | 461 | Sensible Eating 462 | 2.67280083529 ['quot', 'they', 'money', 'want', 'very', 'best'] 463 | 1.95758534619 ['with', 'your', 'weight', 'have', 'control', 'about'] 464 | 0.272657483613 ['fats', 'quot', 'this', 'good', 'about', 'like'] 465 | 466 | 3 slices of ff cheese on a poached egg 467 | 0.925803022044 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 468 | 0.158529510462 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 469 | 1.69625785908e-009 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 470 | 471 | food/exercise Friday 9/28 472 | 4.2243066995 ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 473 | 1.69260968796 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 474 | 0.355277035138 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 475 | 476 | money making opportunity 477 | 1.52941598839 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 
478 | 1.23535469791 ['quot', 'they', 'money', 'want', 'very', 'best'] 479 | 0.292057461928 ['with', 'your', 'weight', 'have', 'control', 'about'] 480 | 481 | The Benefits of Biotechnology For Mankind 482 | 0.543967592519 ['with', 'your', 'weight', 'have', 'control', 'about'] 483 | 0.389852100811 ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 484 | 0.208865237821 ['fats', 'quot', 'this', 'good', 'about', 'like'] 485 | 486 | Food and Workout Log 9.27.08 487 | 5.58477112035 ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 488 | 0.093698650419 ['food', 'home', 'then', 'exercise', 'morning', 'went'] 489 | 1.16281432958e-005 ['quot', 'they', 'money', 'want', 'very', 'best'] 490 | 491 | Re: My First Century Ride 492 | 2.7939634836 ['quot', 'they', 'money', 'want', 'very', 'best'] 493 | 1.98080235676 ['with', 'your', 'weight', 'have', 'control', 'about'] 494 | 0.0869645267992 ['fats', 'quot', 'this', 'good', 'about', 'like'] 495 | 496 | A Special Update from Matty V. 497 | 1.64986127649 ['that', 'much', 'does', 'exercise', 'this', 'morning'] 498 | 0.635897197115 ['with', 'your', 'weight', 'have', 'control', 'about'] 499 | 0.337529069245 ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 500 | 501 | -------------------------------------------------------------------------------- /第10章 寻找独立特征/clusters.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | from math import sqrt 4 | from PIL import Image,ImageDraw,ImageFont 5 | 6 | # Returns the Pearson correlation coefficient for p1 and p2 7 | def pearson(v1,v2): 8 | # Simple sums 9 | sum1=sum(v1) 10 | sum2=sum(v2) 11 | 12 | # Sums of the squares 13 | sum1Sq=sum([pow(v,2) for v in v1]) 14 | sum2Sq=sum([pow(v,2) for v in v2]) 15 | 16 | # Sum of the products 17 | pSum=sum([v1[i]*v2[i] for i in range(len(v1))]) 18 | 19 | # Calculate r (Pearson score) 20 | num=pSum-(sum1*sum2/len(v1)) 21 | den=sqrt((sum1Sq-pow(sum1,2)/len(v1))*(sum2Sq-pow(sum2,2)/len(v1))) 22 | if den==0: return 0 23 | 24 | return 1.0-(num/den) 25 | 26 | 27 | class bicluster: 28 | def __init__(self,vec,left=None,right=None,distance=0.0,id=None): 29 | self.left=left 30 | self.right=right 31 | self.vec=vec 32 | self.id=id 33 | self.distance=distance 34 | 35 | def euclidean(v1,v2): 36 | sqsum=sum([math.pow(v1[i]-v2[i],2) for i in range(len(v1))]) 37 | return math.sqrt(sqsum) 38 | 39 | def printclust(clust,labels=None,n=0): 40 | for i in range(n): print ' ', 41 | if clust.id<0: 42 | print '-' 43 | else: 44 | if labels==None: print clust.id 45 | else: print labels[clust.id] 46 | if clust.left!=None: printclust(clust.left,labels=labels,n=n+1) 47 | if clust.right!=None: printclust(clust.right,labels=labels,n=n+1) 48 | 49 | def hcluster(vecs,distance=pearson): 50 | distances={} 51 | currentclustid=-1 52 | clust=[bicluster(vecs[i],id=i) for i in range(len(vecs))] 53 | 54 | while len(clust)>1: 55 | lowestpair=(0,1) 56 | closest=distance(clust[0].vec,clust[1].vec) 57 | for i in range(len(clust)): 58 | for j in range(i+1,len(clust)): 59 | if (clust[i].id,clust[j].id) not in distances: 60 | distances[(clust[i].id,clust[j].id)]=distance(clust[i].vec,clust[j].vec) 61 | d=distances[(clust[i].id,clust[j].id)] 62 | 63 | if d0: 102 | for vecid in bestmatches[i]: 103 | for m in range(len(vecs[vecid])): 104 | avgs[m]+=vecs[vecid][m] 105 | for j in range(len(avgs)): 106 | avgs[j]/=len(bestmatches[i]) 107 | clusters[i]=avgs 108 | 109 | return bestmatches 110 | 111 | def readfile(filename): 112 | lines=[line for 
line in file(filename)] 113 | colnames=lines[0].strip().split('\t')[1:] 114 | rownames=[] 115 | data=[] 116 | for line in lines[1:]: 117 | p=line.strip().split('\t') 118 | rownames.append(p[0]) 119 | data.append([float(x) for x in p[1:]]) 120 | return rownames,colnames,data 121 | 122 | def test2(): 123 | rownames,colnames,data=readfile('datafile.txt') 124 | return hcluster(data) 125 | #for i in range(len(rownames)): 126 | # print i,rownames[i] 127 | 128 | def distance(v1,v2): 129 | c1,c2,shr=0,0,0 130 | 131 | for i in range(len(v1)): 132 | if v1[i]!=0: c1+=1 133 | if v2[i]!=0: c2+=1 134 | if v1[i]!=0 and v2[i]!=0: shr+=1 135 | 136 | return float(shr)/(c1+c2-shr) 137 | 138 | 139 | #test2() 140 | 141 | def getheight(clust): 142 | if clust.left==None and clust.right==None: return 1 143 | return getheight(clust.left)+getheight(clust.right) 144 | 145 | def getdepth(clust): 146 | if clust.left==None and clust.right==None: return 0 147 | return max(getdepth(clust.left),getdepth(clust.right))+clust.distance 148 | 149 | def drawdendrogram(clust,labels,jpeg='clusters.jpg'): 150 | h=getheight(clust)*20 151 | depth=getdepth(clust) 152 | w=1200 153 | scaling=float(w-150)/depth 154 | img=Image.new('RGB',(w,h),(255,255,255)) 155 | draw=ImageDraw.Draw(img) 156 | 157 | draw.line((0,h/2,10,h/2),fill=(255,0,0)) 158 | 159 | drawnode(draw,clust,10,(h/2),scaling,labels) 160 | img.save(jpeg,'JPEG') 161 | 162 | def drawnode(draw,clust,x,y,scaling,labels): 163 | if clust.id<0: 164 | h1=getheight(clust.left)*20 165 | h2=getheight(clust.right)*20 166 | top=y-(h1+h2)/2 167 | bottom=y+(h1+h2)/2 168 | 169 | ll=clust.distance*scaling 170 | 171 | draw.line((x,top+h1/2,x,bottom-h2/2),fill=(255,0,0)) 172 | 173 | draw.line((x,top+h1/2,x+ll,top+h1/2),fill=(255,0,0)) 174 | draw.line((x,bottom-h2/2,x+ll,bottom-h2/2),fill=(255,0,0)) 175 | 176 | drawnode(draw,clust.left,x+ll,top+h1/2,scaling,labels) 177 | drawnode(draw,clust.right,x+ll,bottom-h2/2,scaling,labels) 178 | else: 179 | draw.text((x+5,y-7),labels[clust.id].encode('utf8'),(0,0,0)) 180 | 181 | def rotatematrix(data): 182 | newdata=[] 183 | for i in range(len(data[0])): 184 | newrow=[data[j][i] for j in range(len(data))] 185 | newdata.append(newrow) 186 | return newdata 187 | 188 | def scaledown(data,distance=pearson,rate=0.01): 189 | n=len(data) 190 | realdist=[[distance(data[i],data[j]) for j in range(n)] for i in range(0,n)] 191 | 192 | outersum=0.0 193 | 194 | loc=[[random.random(),random.random()] for i in range(n)] 195 | fakedist=[[0.0 for j in range(n)] for i in range(n)] 196 | 197 | lasterror=None 198 | for m in range(0,1000): 199 | # Find projected distances 200 | for i in range(n): 201 | for j in range(n): 202 | fakedist[i][j]=sqrt(sum([pow(loc[i][x]-loc[j][x],2) 203 | for x in range(len(loc[i]))])) 204 | 205 | # Move points 206 | grad=[[0.0,0.0] for i in range(n)] 207 | 208 | totalerror=0 209 | for k in range(n): 210 | for j in range(n): 211 | if j==k: continue 212 | errorterm=(fakedist[j][k]-realdist[j][k])/realdist[j][k] 213 | grad[k][0]+=((loc[k][0]-loc[j][0])/fakedist[j][k])*errorterm 214 | grad[k][1]+=((loc[k][1]-loc[j][1])/fakedist[j][k])*errorterm 215 | totalerror+=abs(errorterm) 216 | print totalerror 217 | if lasterror and lasterror2 and len(s)<20] 10 | 11 | # Return the unique set of words only 12 | return dict([(w,1) for w in words]) 13 | 14 | #def entryfeatures(entry): 15 | 16 | def sampletrain(cl): 17 | cl.train('Nobody owns the water.','good') 18 | cl.train('the quick rabbit jumps fences','good') 19 | cl.train('buy pharmaceuticals now','bad') 20 
| cl.train('make quick money at the online casino','bad') 21 | cl.train('the quick brown fox jumps','good') 22 | 23 | class classifier: 24 | def __init__(self,getfeatures): 25 | self.fc={} 26 | self.cc={} 27 | self.getfeatures=getfeatures 28 | 29 | def setdb(self,dbfile): 30 | self.con=sqlite.connect(dbfile) 31 | self.con.execute('create table if not exists fc(feature,category,count)') 32 | self.con.execute('create table if not exists cc(category,count)') 33 | 34 | def incf(self,f,cat): 35 | count=self.fcount(f,cat) 36 | if count==0: 37 | self.con.execute("insert into fc values ('%s','%s',1)" 38 | % (f,cat)) 39 | else: 40 | self.con.execute( 41 | "update fc set count=%d where feature='%s' and category='%s'" 42 | % (count+1,f,cat)) 43 | 44 | def fcount(self,f,cat): 45 | res=self.con.execute( 46 | 'select count from fc where feature="%s" and category="%s"' 47 | %(f,cat)).fetchone() 48 | if res==None: return 0 49 | else: return float(res[0]) 50 | 51 | def incc(self,cat): 52 | count=self.catcount(cat) 53 | if count==0: 54 | self.con.execute("insert into cc values ('%s',1)" % (cat)) 55 | else: 56 | self.con.execute("update cc set count=%d where category='%s'" 57 | % (count+1,cat)) 58 | 59 | def catcount(self,cat): 60 | res=self.con.execute('select count from cc where category="%s"' 61 | %(cat)).fetchone() 62 | if res==None: return 0.0 63 | else: return float(res[0]) 64 | 65 | def categories(self): 66 | cur=self.con.execute('select category from cc'); 67 | return [d[0] for d in cur] 68 | 69 | def totalcount(self): 70 | res=self.con.execute('select sum(count) from cc').fetchone(); 71 | if res==None: return 0 72 | return res[0] 73 | 74 | 75 | """ 76 | def incf(self,f,cat): 77 | self.fc.setdefault(f,{}) 78 | self.fc[f].setdefault(cat,0) 79 | self.fc[f][cat]+=1 80 | 81 | def incc(self,cat): 82 | self.cc.setdefault(cat,0) 83 | self.cc[cat]+=1 84 | 85 | def fcount(self,f,cat): 86 | if f in self.fc and cat in self.fc[f]: 87 | return float(self.fc[f][cat]) 88 | return 0.0 89 | 90 | def catcount(self,cat): 91 | if cat in self.cc: 92 | return float(self.cc[cat]) 93 | return 0 94 | 95 | def totalcount(self): 96 | return sum(self.cc.values()) 97 | 98 | def categories(self): 99 | return self.cc.keys() 100 | """ 101 | 102 | 103 | def train(self,item,cat): 104 | features=self.getfeatures(item) 105 | for f in features: 106 | self.incf(f,cat) 107 | self.incc(cat) 108 | self.con.commit() 109 | 110 | def fprob(self,f,cat): 111 | if self.catcount(cat)==0: return 0 112 | return self.fcount(f,cat)/self.catcount(cat) 113 | 114 | def setfilename(self,filename): 115 | self.filename=filename 116 | self.restoredata() 117 | 118 | def restoredata(self): 119 | try: f=file(self.filename,'rb') 120 | except: return 121 | self.fc=cPickle.load(f) 122 | self.cc=cPickle.load(f) 123 | f.close() 124 | 125 | def savedata(self): 126 | f=file(self.filename,'wb') 127 | cPickle.dump(self.fc,f,True) 128 | cPickle.dump(self.cc,f,True) 129 | f.close() 130 | def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5): 131 | basicprob=prf(f,cat) 132 | totals=sum([self.fcount(f,c) for c in self.categories()]) 133 | bp=((weight*ap)+(totals*basicprob))/(weight+totals) 134 | return bp 135 | 136 | 137 | 138 | class naivebayes(classifier): 139 | def __init__(self,getfeatures): 140 | classifier.__init__(self,getfeatures) 141 | self.thresholds={} 142 | 143 | def setthreshold(self,cat,t): 144 | self.thresholds[cat]=t 145 | 146 | def getthreshold(self,cat): 147 | if cat not in self.thresholds: return 1.0 148 | return self.thresholds[cat] 149 | 150 | def 
classify(self,item,default=None): 151 | probs={} 152 | max=0.0 153 | for cat in self.categories(): 154 | probs[cat]=self.prob(item,cat) 155 | if probs[cat]>max: 156 | max=probs[cat] 157 | best=cat 158 | for cat in probs: 159 | if cat==best: continue 160 | if probs[cat]*self.getthreshold(best)>probs[best]: return default 161 | return best 162 | 163 | def docprob(self,item,cat): 164 | features=self.getfeatures(item) 165 | p=1 166 | for f in features: p*=self.weightedprob(f,cat,self.fprob) 167 | return p 168 | 169 | 170 | def prob(self,item,cat): 171 | catprob=self.catcount(cat)/self.totalcount() 172 | docprob=self.docprob(item,cat) 173 | return docprob*catprob 174 | 175 | class fisherclassifier(classifier): 176 | def __init__(self,getfeatures): 177 | classifier.__init__(self,getfeatures) 178 | self.minimums={} 179 | 180 | def setminimum(self,cat,min): 181 | self.minimums[cat]=min 182 | 183 | def getminimum(self,cat): 184 | if cat not in self.minimums: return 0 185 | return self.minimums[cat] 186 | 187 | def classify(self,item,default=None): 188 | best=default 189 | max=0.0 190 | for c in self.categories(): 191 | p=self.fisherprob(item,c) 192 | if p>self.getminimum(c) and p>max: 193 | best=c 194 | max=p 195 | return best 196 | 197 | 198 | def cprob(self,f,cat): 199 | # The frequency of this feature in this category 200 | clf=self.fprob(f,cat) 201 | 202 | if clf==0: return 0.0 203 | 204 | # The frequency of this feature in all the categories 205 | freqsum=sum([self.fprob(f,c) for c in self.categories()]) 206 | 207 | # The probability is the frequency in this category divided by 208 | # the overall frequency 209 | p=clf/(freqsum) 210 | 211 | return p 212 | 213 | 214 | def fisherprob(self,item,cat): 215 | p=1 216 | features=self.getfeatures(item) 217 | for f in features: 218 | p*=(self.weightedprob(f,cat,self.cprob)) 219 | fscore=-2*math.log(p) 220 | return self.chi2P(fscore,len(features)*2) 221 | 222 | def chi2P(self,chi,df): 223 | m = chi / 2.0 224 | sum = term = math.exp(-m) 225 | for i in range(1, df//2): 226 | term *= m / i 227 | sum += term 228 | return min(sum, 1.0) 229 | 230 | -------------------------------------------------------------------------------- /第10章 寻找独立特征/features.txt: -------------------------------------------------------------------------------- 1 | ['diet', 'with', 'great', 'what', 'trinidad', 'carnival'] 2 | (6.58003120192, u'The Abs Diet by David Zinczenko') 3 | (5.9231935598, u"I did'nt diet to get in shape for Trinidad's Carnival.....") 4 | (5.04673654071, u'Sensible Diet & Exercise') 5 | 6 | ['coffee', 'black', 'exercise', 'minutes', 'olive', 'small'] 7 | (6.52183126318, u'food/exercise Monday 10/1') 8 | (5.94642162786, u'diet/exercise 10/4') 9 | (5.3332773133, u'food/exercise Friday 10/5') 10 | 11 | ['food', 'calories', 'than', 'easy', 'high', 'come'] 12 | (9.98464450123, u'High or low fat food? 
Easy trick for figuring it out') 13 | (3.41252863148, u'Oatmeal, cereal of choice.') 14 | (3.19119866786, u'Food and Workout Log 10.8.07') 15 | 16 | ['cheese', 'black', 'salad', 'coffee', 'broccoli', 'tomato'] 17 | (7.46811621754, u'saturday') 18 | (5.62839188358, u'diet-exercise thursday') 19 | (5.29370213306, u'sleepy food/fitness thursday') 20 | 21 | ['food', 'home', 'then', 'exercise', 'morning', 'went'] 22 | (5.22083940456, u'Food & Exercise -- 10/5/2007') 23 | (5.16310413391, u'Food & Exercise -- 10/4/2007') 24 | (4.75585045074, u'Food & Exercise -- 9/28/2007 (yesterday)') 25 | 26 | ['fats', 'quot', 'this', 'good', 'about', 'like'] 27 | (14.9233786406, u'Good fats bad fats') 28 | (1.3775418859, u'Should we ban marathons?') 29 | (1.37194239805, u'Food & Exercise -- 10/3/2007') 30 | 31 | ['quot', 'they', 'money', 'want', 'very', 'best'] 32 | (6.1620884463, u'More about the Chicago marathon') 33 | (5.58276496802, u'LOUIE + LINESMAKER = $$$$') 34 | (4.04959173123, u'High or low fat food? Easy trick for figuring it out') 35 | 36 | ['that', 'much', 'does', 'exercise', 'this', 'morning'] 37 | (7.73926153154, u'Food & Exercise -- 10/7/2007') 38 | (5.96451663382, u'< 1g, etc.') 39 | (3.81276353396, u"why I'm succeeding, finally, with my fitness") 40 | 41 | ['with', 'your', 'weight', 'have', 'control', 'about'] 42 | (6.78756986407, u'Control ur Weight') 43 | (5.54567450388, u'Flu-Busting Chicken Soup') 44 | (5.21079777525, u'Weight Loss Tips') 45 | 46 | ['with', 'lunch', 'workout', 'food', 'butter', 'peanut'] 47 | (5.58477112035, u'Food and Workout Log 9.27.08') 48 | (5.48488799917, u'Food and Workout Log 10.3.07') 49 | (5.10395750879, u'Food and Workout Log 10.10.07') 50 | 51 | -------------------------------------------------------------------------------- /第10章 寻找独立特征/newsfeatures.py: -------------------------------------------------------------------------------- 1 | import feedparser 2 | import re 3 | 4 | 5 | feedlist=['http://today.reuters.com/rss/topNews', 6 | 'http://today.reuters.com/rss/domesticNews', 7 | 'http://today.reuters.com/rss/worldNews', 8 | 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml', 9 | 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml', 10 | 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml', 11 | 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml', 12 | 'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml', 13 | 'http://www.nytimes.com/services/xml/rss/nyt/International.xml', 14 | 'http://news.google.com/?output=rss', 15 | 'http://feeds.salon.com/salon/news', 16 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss', 17 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss', 18 | 'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss', 19 | 'http://rss.cnn.com/rss/edition.rss', 20 | 'http://rss.cnn.com/rss/edition_world.rss', 21 | 'http://rss.cnn.com/rss/edition_us.rss'] 22 | 23 | def stripHTML(h): 24 | p='' 25 | s=0 26 | for c in h: 27 | if c=='<': s=1 28 | elif c=='>': 29 | s=0 30 | p+=' ' 31 | elif s==0: p+=c 32 | return p 33 | 34 | 35 | def separatewords(text): 36 | splitter=re.compile('\\W*') 37 | return [s.lower() for s in splitter.split(text) if len(s)>3] 38 | 39 | def getarticlewords(): 40 | allwords={} 41 | articlewords=[] 42 | articletitles=[] 43 | ec=0 44 | # Loop over every feed 45 | for feed in feedlist: 46 | f=feedparser.parse(feed) 47 | 48 | # Loop over every article 49 | for e in f.entries: 50 | # Ignore identical articles 51 | if e.title in articletitles: continue 52 | 53 | # Extract the words 54 | 
txt=e.title.encode('utf8')+stripHTML(e.description.encode('utf8')) 55 | words=separatewords(txt) 56 | articlewords.append({}) 57 | articletitles.append(e.title) 58 | 59 | # Increase the counts for this word in allwords and in articlewords 60 | for word in words: 61 | allwords.setdefault(word,0) 62 | allwords[word]+=1 63 | articlewords[ec].setdefault(word,0) 64 | articlewords[ec][word]+=1 65 | ec+=1 66 | return allwords,articlewords,articletitles 67 | 68 | def makematrix(allw,articlew): 69 | wordvec=[] 70 | 71 | # Only take words that are common but not too common 72 | for w,c in allw.items(): 73 | if c>3 and c0: return l[1] 51 | else: return l[2] 52 | ifw=fwrapper(iffunc,3,'if') 53 | 54 | def isgreater(l): 55 | if l[0]>l[1]: return 1 56 | else: return 0 57 | gtw=fwrapper(isgreater,2,'isgreater') 58 | 59 | flist=[addw,mulw,ifw,gtw,subw] 60 | 61 | def exampletree(): 62 | return node(ifw,[ 63 | node(gtw,[paramnode(0),constnode(3)]), 64 | node(addw,[paramnode(1),constnode(5)]), 65 | node(subw,[paramnode(1),constnode(2)]), 66 | ] 67 | ) 68 | 69 | def makerandomtree(pc,maxdepth=4,fpr=0.5,ppr=0.6): 70 | if random()0: 71 | f=choice(flist) 72 | children=[makerandomtree(pc,maxdepth-1,fpr,ppr) 73 | for i in range(f.childcount)] 74 | return node(f,children) 75 | elif random()pnew: 148 | newpop.append(mutate( 149 | crossover(scores[selectindex()][1], 150 | scores[selectindex()][1], 151 | probswap=breedingrate), 152 | pc,probchange=mutationrate)) 153 | else: 154 | # Add a random node to mix things up 155 | newpop.append(makerandomtree(pc)) 156 | 157 | population=newpop 158 | scores[0][1].display() 159 | return scores[0][1] 160 | 161 | 162 | def gridgame(p): 163 | # Board size 164 | max=(3,3) 165 | 166 | # Remember the last move for each player 167 | lastmove=[-1,-1] 168 | 169 | # Remember the player's locations 170 | location=[[randint(0,max[0]),randint(0,max[1])]] 171 | 172 | # Put the second player a sufficient distance from the first 173 | location.append([(location[0][0]+2)%4,(location[0][1]+2)%4]) 174 | # Maximum of 50 moves before a tie 175 | for o in range(50): 176 | 177 | # For each player 178 | for i in range(2): 179 | locs=location[i][:]+location[1-i][:] 180 | locs.append(lastmove[i]) 181 | move=p[i].evaluate(locs)%4 182 | 183 | # You lose if you move the same direction twice in a row 184 | if lastmove[i]==move: return 1-i 185 | lastmove[i]=move 186 | if move==0: 187 | location[i][0]-=1 188 | # Board wraps 189 | if location[i][0]<0: location[i][0]=0 190 | if move==1: 191 | location[i][0]+=1 192 | if location[i][0]>max[0]: location[i][0]=max[0] 193 | if move==2: 194 | location[i][1]-=1 195 | if location[i][1]<0: location[i][1]=0 196 | if move==3: 197 | location[i][1]+=1 198 | if location[i][1]>max[1]: location[i][1]=max[1] 199 | 200 | # If you have captured the other player, you win 201 | if location[i]==location[1-i]: return i 202 | return -1 203 | 204 | 205 | def tournament(pl): 206 | # Count losses 207 | losses=[0 for p in pl] 208 | 209 | # Every player plays every other player 210 | for i in range(len(pl)): 211 | for j in range(len(pl)): 212 | if i==j: continue 213 | 214 | # Who is the winner? 
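# (gridgame returns the index of the winning player, 0 or 1, or -1 for a tie)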
215 | winner=gridgame([pl[i],pl[j]]) 216 | 217 | # Two points for a loss, one point for a tie 218 | if winner==0: 219 | losses[j]+=2 220 | elif winner==1: 221 | losses[i]+=2 222 | elif winner==-1: 223 | losses[i]+=1 224 | losses[j]+=1 225 | pass 226 | 227 | # Sort and return the results 228 | z=zip(losses,pl) 229 | z.sort() 230 | return z 231 | 232 | class humanplayer: 233 | def evaluate(self,board): 234 | 235 | # Get my location and the location of other players 236 | me=tuple(board[0:2]) 237 | others=[tuple(board[x:x+2]) for x in range(2,len(board)-1,2)] 238 | 239 | # Display the board 240 | for i in range(4): 241 | for j in range(4): 242 | if (i,j)==me: 243 | print 'O', 244 | elif (i,j) in others: 245 | print 'X', 246 | else: 247 | print '.', 248 | print 249 | 250 | # Show moves, for reference 251 | print 'Your last move was %d' % board[len(board)-1] 252 | print ' 0' 253 | print '2 3' 254 | print ' 1' 255 | print 'Enter move: ', 256 | 257 | # Return whatever the user enters 258 | move=int(raw_input()) 259 | return move 260 | 261 | 262 | class fwrapper: 263 | def __init__(self,function,params,name): 264 | self.function=function 265 | self.childcount=params 266 | self.name=name 267 | 268 | #flist={'str':[substringw,concatw],'int':[indexw]} 269 | flist=[addw,mulw,ifw,gtw,subw] 270 | -------------------------------------------------------------------------------- /第11章 智能进化/gp.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouhongzhao/Programming-Collective-Intelligence-Source-Code/0ff3d57651cd8dfd259695b9b75d085c081d4783/第11章 智能进化/gp.pyc --------------------------------------------------------------------------------
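A minimal usage sketch for gp.py above (not taken from the repository itself; it assumes Python 2, like the rest of this code, and that the interpreter is started inside the 第11章 智能进化 directory so that "import gp" resolves). It builds a small random population, ranks it with tournament(), and lets a human play the top-ranked program through gridgame():

import gp

# gridgame feeds each program five inputs per turn: its own (x,y), the opponent's
# (x,y), and its previous move, so every random tree takes 5 parameters.
# A population size of 20 is an arbitrary small choice for illustration.
population=[gp.makerandomtree(5) for i in range(20)]

# tournament() plays every pair of programs and returns (losses,program) tuples,
# sorted so the program with the fewest losses comes first.
ranked=gp.tournament(population)
best=ranked[0][1]

# Play against the strongest program; enter a move (0-3) when prompted.
gp.gridgame([best,gp.humanplayer()])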