├── README.md ├── Week2 ├── LICENSE ├── README.txt ├── bs4 │ ├── __init__.py │ ├── __init__.py.bak │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── dammit.cpython-35.pyc │ │ └── element.cpython-35.pyc │ ├── builder │ │ ├── __init__.py │ │ ├── __init__.py.bak │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── _html5lib.cpython-35.pyc │ │ │ ├── _htmlparser.cpython-35.pyc │ │ │ └── _lxml.cpython-35.pyc │ │ ├── _html5lib.py │ │ ├── _html5lib.py.bak │ │ ├── _htmlparser.py │ │ ├── _htmlparser.py.bak │ │ ├── _lxml.py │ │ └── _lxml.py.bak │ ├── dammit.py │ ├── dammit.py.bak │ ├── diagnose.py │ ├── diagnose.py.bak │ ├── element.py │ ├── element.py.bak │ ├── testing.py │ ├── testing.py.bak │ └── tests │ │ ├── __init__.py │ │ ├── test_builder_registry.py │ │ ├── test_docs.py │ │ ├── test_html5lib.py │ │ ├── test_html5lib.py.bak │ │ ├── test_htmlparser.py │ │ ├── test_lxml.py │ │ ├── test_lxml.py.bak │ │ ├── test_soup.py │ │ ├── test_soup.py.bak │ │ ├── test_tree.py │ │ └── test_tree.py.bak ├── d3.v2.js ├── force.css ├── force.html ├── force.js ├── spdump.py ├── spider.js ├── spider.py ├── spider.sqlite ├── spjson.py ├── sprank.py └── spreset.py └── Week4-6 ├── README.txt ├── content.sqlite ├── d3.layout.cloud.js ├── d3.v2.js ├── gbasic.py ├── gline.htm ├── gline.js ├── gline.py ├── gmane.py ├── gmodel.py ├── gword.htm ├── gword.js ├── gword.py ├── gyear.py ├── index.sqlite └── mapping.sqlite /README.md: -------------------------------------------------------------------------------- 1 | # Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python 2 | Coursera's course: Capstone: Retrieving, Processing, and Visualizing Data with Python, by University of Michigan 3 | 4 | In the capstone, students will build a series of applications to retrieve, process and visualize data using Python. The projects will involve all the elements of the specialization. In the first part of the capstone, students will do some visualizations to become familiar with the technologies in use and then will pursue their own project to visualize some other data that they have or can find. Chapters 15 and 16 from the book “Python for Everybody” will serve as the backbone for the capstone. This course covers Python 3. 5 | -------------------------------------------------------------------------------- /Week2/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Michael Bostock 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * The name Michael Bostock may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. 
IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 26 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /Week2/README.txt: -------------------------------------------------------------------------------- 1 | Simple Python Search Spider, Page Ranker, and Visualizer 2 | 3 | This is a set of programs that emulate some of the functions of a 4 | search engine. They store their data in a SQLITE3 database named 5 | 'spider.sqlite'. This file can be removed at any time to restart the 6 | process. 7 | 8 | You should install the SQLite browser to view and modify 9 | the databases from: 10 | 11 | http://sqlitebrowser.org/ 12 | 13 | This program crawls a web site and pulls a series of pages into the 14 | database, recording the links between pages. 15 | 16 | Note: Windows has difficulty in displaying UTF-8 characters 17 | in the console so for each console window you open, you may need 18 | to type the following command before running this code: 19 | 20 | chcp 65001 21 | 22 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 23 | 24 | Mac: rm spider.sqlite 25 | Mac: python3 spider.py 26 | 27 | Win: del spider.sqlite 28 | Win: spider.py 29 | 30 | Enter web url or enter: http://www.dr-chuck.com/ 31 | ['http://www.dr-chuck.com'] 32 | How many pages:2 33 | 1 http://www.dr-chuck.com/ 12 34 | 2 http://www.dr-chuck.com/csev-blog/ 57 35 | How many pages: 36 | 37 | In this sample run, we told it to crawl a website and retrieve two 38 | pages. If you restart the program again and tell it to crawl more 39 | pages, it will not re-crawl any pages already in the database. Upon 40 | restart it goes to a random non-crawled page and starts there. So 41 | each successive run of spider.py is additive. 42 | 43 | Mac: python3 spider.py 44 | Win: spider.py 45 | 46 | Enter web url or enter: http://www.dr-chuck.com/ 47 | ['http://www.dr-chuck.com'] 48 | How many pages:3 49 | 3 http://www.dr-chuck.com/csev-blog 57 50 | 4 http://www.dr-chuck.com/dr-chuck/resume/speaking.htm 1 51 | 5 http://www.dr-chuck.com/dr-chuck/resume/index.htm 13 52 | How many pages: 53 | 54 | You can have multiple starting points in the same database - 55 | within the program these are called "webs". The spider 56 | chooses randomly amongst all non-visited links across all 57 | the webs. 58 | 59 | If you want to dump the contents of the spider.sqlite file, you can 60 | run spdump.py as follows: 61 | 62 | Mac: python3 spdump.py 63 | Win: spdump.py 64 | 65 | (5, None, 1.0, 3, u'http://www.dr-chuck.com/csev-blog') 66 | (3, None, 1.0, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 67 | (1, None, 1.0, 2, u'http://www.dr-chuck.com/csev-blog/') 68 | (1, None, 1.0, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 69 | 4 rows. 70 | 71 | This shows the number of incoming links, the old page rank, the new page 72 | rank, the id of the page, and the url of the page. The spdump.py program 73 | only shows pages that have at least one incoming link to them. 
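As a rough illustration of where those numbers come from, the sketch below reproduces an spdump.py-style listing straight from spider.sqlite. It is only a sketch: the Pages and Links table names and the old_rank/new_rank/html column names are assumptions inferred from the sample output above, so check spdump.py itself for the exact schema the spider uses.

    import sqlite3

    # Sketch of an spdump.py-style dump of spider.sqlite. The table and
    # column names are assumptions based on the sample output above.
    conn = sqlite3.connect('spider.sqlite')
    cur = conn.cursor()

    cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url
                   FROM Pages JOIN Links ON Pages.id = Links.to_id
                   WHERE html IS NOT NULL
                   GROUP BY id ORDER BY inbound DESC''')

    count = 0
    for row in cur:
        print(row)   # (inbound links, old rank, new rank, page id, url)
        count += 1
    print(count, 'rows.')
    cur.close()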
74 | 75 | Once you have a few pages in the database, you can run Page Rank on the 76 | pages using the sprank.py program. You simply tell it how many Page 77 | Rank iterations to run. 78 | 79 | Mac: python3 sprank.py 80 | Win: sprank.py 81 | 82 | How many iterations:2 83 | 1 0.546848992536 84 | 2 0.226714939664 85 | [(1, 0.559), (2, 0.659), (3, 0.985), (4, 2.135), (5, 0.659)] 86 | 87 | You can dump the database again to see that page rank has been updated: 88 | 89 | Mac: python3 spdump.py 90 | Win: spdump.py 91 | 92 | (5, 1.0, 0.985, 3, u'http://www.dr-chuck.com/csev-blog') 93 | (3, 1.0, 2.135, 4, u'http://www.dr-chuck.com/dr-chuck/resume/speaking.htm') 94 | (1, 1.0, 0.659, 2, u'http://www.dr-chuck.com/csev-blog/') 95 | (1, 1.0, 0.659, 5, u'http://www.dr-chuck.com/dr-chuck/resume/index.htm') 96 | 4 rows. 97 | 98 | You can run sprank.py as many times as you like and it will simply refine 99 | the page rank the more times you run it. You can even run sprank.py a few times 100 | and then go spider a few more pages with spider.py and then run sprank.py 101 | to converge the page ranks. 102 | 103 | If you want to restart the Page Rank calculations without re-spidering the 104 | web pages, you can use spreset.py 105 | 106 | Mac: python3 spreset.py 107 | Win: spreset.py 108 | 109 | All pages set to a rank of 1.0 110 | 111 | Mac: python3 sprank.py 112 | Win: sprank.py 113 | 114 | How many iterations:50 115 | 1 0.546848992536 116 | 2 0.226714939664 117 | 3 0.0659516187242 118 | 4 0.0244199333 119 | 5 0.0102096489546 120 | 6 0.00610244329379 121 | ... 122 | 42 0.000109076928206 123 | 43 9.91987599002e-05 124 | 44 9.02151706798e-05 125 | 45 8.20451504471e-05 126 | 46 7.46150183837e-05 127 | 47 6.7857770908e-05 128 | 48 6.17124694224e-05 129 | 49 5.61236959327e-05 130 | 50 5.10410499467e-05 131 | [(512, 0.02963718031139026), (1, 12.790786721866658), (2, 28.939418898678284), (3, 6.808468390725946), (4, 13.469889092397006)] 132 | 133 | For each iteration of the page rank algorithm it prints the average 134 | change per page of the page rank. The network initially is quite 135 | unbalanced and so the individual page ranks are changing wildly. 136 | But in a few short iterations, the page rank converges. You 137 | should run sprank.py long enough that the page ranks converge. 138 | 139 | If you want to visualize the current top pages in terms of page rank, 140 | run spjson.py to write the pages out in JSON format to be viewed in a 141 | web browser. 142 | 143 | Mac: python3 spjson.py 144 | Win: spjson.py 145 | 146 | Creating JSON output on spider.js... 147 | How many nodes? 30 148 | Open force.html in a browser to view the visualization 149 | 150 | You can view this data by opening the file force.html in your web browser. 151 | This shows an automatic layout of the nodes and links. You can click and 152 | drag any node and you can also double click on a node to find the URL 153 | that is represented by the node. 154 | 155 | This visualization is provided using the force layout from: 156 | 157 | http://mbostock.github.com/d3/ 158 | 159 | If you rerun the other utilities and then re-run spjson.py - you merely 160 | have to press refresh in the browser to get the new data from spider.js.
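To make the convergence numbers above concrete, here is a minimal sketch of one simplified page rank pass over a small in-memory link graph. It is illustrative only: sprank.py reads and updates the ranks stored in spider.sqlite, and the sample graph, the iteration count, and the absence of a damping factor below are assumptions made for brevity rather than the program's actual code.

    # Simplified PageRank sketch over an assumed in-memory graph;
    # sprank.py works against the tables in spider.sqlite instead.
    links = {1: [2, 3], 2: [3], 3: [1], 4: [3]}   # page id -> ids it links to
    ranks = {page: 1.0 for page in links}         # every page starts at rank 1.0

    for iteration in range(50):
        next_ranks = {page: 0.0 for page in ranks}
        for page, outgoing in links.items():
            share = ranks[page] / len(outgoing)   # split this page's rank across its out-links
            for target in outgoing:
                next_ranks[target] += share
        # average change per page, the figure sprank.py prints each iteration
        avg_change = sum(abs(next_ranks[p] - ranks[p]) for p in ranks) / len(ranks)
        print(iteration + 1, avg_change)
        ranks = next_ranks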
161 | 162 | -------------------------------------------------------------------------------- /Week2/bs4/__init__.py: -------------------------------------------------------------------------------- 1 | """Beautiful Soup 2 | Elixir and Tonic 3 | "The Screen-Scraper's Friend" 4 | http://www.crummy.com/software/BeautifulSoup/ 5 | 6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a 7 | (possibly invalid) document into a tree representation. Beautiful Soup 8 | provides provides methods and Pythonic idioms that make it easy to 9 | navigate, search, and modify the parse tree. 10 | 11 | Beautiful Soup works with Python 2.6 and up. It works better if lxml 12 | and/or html5lib is installed. 13 | 14 | For more than you ever wanted to know about Beautiful Soup, see the 15 | documentation: 16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 17 | """ 18 | 19 | __author__ = "Leonard Richardson (leonardr@segfault.org)" 20 | __version__ = "4.4.0" 21 | __copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" 22 | __license__ = "MIT" 23 | 24 | __all__ = ['BeautifulSoup'] 25 | 26 | import os 27 | import re 28 | import warnings 29 | 30 | from .builder import builder_registry, ParserRejectedMarkup 31 | from .dammit import UnicodeDammit 32 | from .element import ( 33 | CData, 34 | Comment, 35 | DEFAULT_OUTPUT_ENCODING, 36 | Declaration, 37 | Doctype, 38 | NavigableString, 39 | PageElement, 40 | ProcessingInstruction, 41 | ResultSet, 42 | SoupStrainer, 43 | Tag, 44 | ) 45 | 46 | # The very first thing we do is give a useful error if someone is 47 | # running this code under Python 3 without converting it. 48 | 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' 49 | 50 | class BeautifulSoup(Tag): 51 | """ 52 | This class defines the basic interface called by the tree builders. 53 | 54 | These methods will be called by the parser: 55 | reset() 56 | feed(markup) 57 | 58 | The tree builder may call these methods from its feed() implementation: 59 | handle_starttag(name, attrs) # See note about return value 60 | handle_endtag(name) 61 | handle_data(data) # Appends to the current data node 62 | endData(containerClass=NavigableString) # Ends the current data node 63 | 64 | No matter how complicated the underlying parser is, you should be 65 | able to build a tree using 'start tag' events, 'end tag' events, 66 | 'data' events, and "done with data" events. 67 | 68 | If you encounter an empty-element tag (aka a self-closing tag, 69 | like HTML's
<br>
tag), call handle_starttag and then 70 | handle_endtag. 71 | """ 72 | ROOT_TAG_NAME = '[document]' 73 | 74 | # If the end-user gives no indication which tree builder they 75 | # want, look for one with these features. 76 | DEFAULT_BUILDER_FEATURES = ['html', 'fast'] 77 | 78 | ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' 79 | 80 | NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" 81 | 82 | def __init__(self, markup="", features=None, builder=None, 83 | parse_only=None, from_encoding=None, exclude_encodings=None, 84 | **kwargs): 85 | """The Soup object is initialized as the 'root tag', and the 86 | provided markup (which can be a string or a file-like object) 87 | is fed into the underlying parser.""" 88 | 89 | if 'convertEntities' in kwargs: 90 | warnings.warn( 91 | "BS4 does not respect the convertEntities argument to the " 92 | "BeautifulSoup constructor. Entities are always converted " 93 | "to Unicode characters.") 94 | 95 | if 'markupMassage' in kwargs: 96 | del kwargs['markupMassage'] 97 | warnings.warn( 98 | "BS4 does not respect the markupMassage argument to the " 99 | "BeautifulSoup constructor. The tree builder is responsible " 100 | "for any necessary markup massage.") 101 | 102 | if 'smartQuotesTo' in kwargs: 103 | del kwargs['smartQuotesTo'] 104 | warnings.warn( 105 | "BS4 does not respect the smartQuotesTo argument to the " 106 | "BeautifulSoup constructor. Smart quotes are always converted " 107 | "to Unicode characters.") 108 | 109 | if 'selfClosingTags' in kwargs: 110 | del kwargs['selfClosingTags'] 111 | warnings.warn( 112 | "BS4 does not respect the selfClosingTags argument to the " 113 | "BeautifulSoup constructor. The tree builder is responsible " 114 | "for understanding self-closing tags.") 115 | 116 | if 'isHTML' in kwargs: 117 | del kwargs['isHTML'] 118 | warnings.warn( 119 | "BS4 does not respect the isHTML argument to the " 120 | "BeautifulSoup constructor. Suggest you use " 121 | "features='lxml' for HTML and features='lxml-xml' for " 122 | "XML.") 123 | 124 | def deprecated_argument(old_name, new_name): 125 | if old_name in kwargs: 126 | warnings.warn( 127 | 'The "%s" argument to the BeautifulSoup constructor ' 128 | 'has been renamed to "%s."' % (old_name, new_name)) 129 | value = kwargs[old_name] 130 | del kwargs[old_name] 131 | return value 132 | return None 133 | 134 | parse_only = parse_only or deprecated_argument( 135 | "parseOnlyThese", "parse_only") 136 | 137 | from_encoding = from_encoding or deprecated_argument( 138 | "fromEncoding", "from_encoding") 139 | 140 | if len(kwargs) > 0: 141 | arg = list(kwargs.keys()).pop() 142 | raise TypeError( 143 | "__init__() got an unexpected keyword argument '%s'" % arg) 144 | 145 | if builder is None: 146 | original_features = features 147 | if isinstance(features, str): 148 | features = [features] 149 | if features is None or len(features) == 0: 150 | features = self.DEFAULT_BUILDER_FEATURES 151 | builder_class = builder_registry.lookup(*features) 152 | if builder_class is None: 153 | raise FeatureNotFound( 154 | "Couldn't find a tree builder with the features you " 155 | "requested: %s. 
Do you need to install a parser library?" 156 | % ",".join(features)) 157 | builder = builder_class() 158 | if not (original_features == builder.NAME or 159 | original_features in builder.ALTERNATE_NAMES): 160 | if builder.is_xml: 161 | markup_type = "XML" 162 | else: 163 | markup_type = "HTML" 164 | warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( 165 | parser=builder.NAME, 166 | markup_type=markup_type)) 167 | 168 | self.builder = builder 169 | self.is_xml = builder.is_xml 170 | self.builder.soup = self 171 | 172 | self.parse_only = parse_only 173 | 174 | if hasattr(markup, 'read'): # It's a file-type object. 175 | markup = markup.read() 176 | elif len(markup) <= 256: 177 | # Print out warnings for a couple beginner problems 178 | # involving passing non-markup to Beautiful Soup. 179 | # Beautiful Soup will still parse the input as markup, 180 | # just in case that's what the user really wants. 181 | if (isinstance(markup, str) 182 | and not os.path.supports_unicode_filenames): 183 | possible_filename = markup.encode("utf8") 184 | else: 185 | possible_filename = markup 186 | is_file = False 187 | try: 188 | is_file = os.path.exists(possible_filename) 189 | except Exception as e: 190 | # This is almost certainly a problem involving 191 | # characters not valid in filenames on this 192 | # system. Just let it go. 193 | pass 194 | if is_file: 195 | if isinstance(markup, str): 196 | markup = markup.encode("utf8") 197 | warnings.warn( 198 | '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) 199 | if markup[:5] == "http:" or markup[:6] == "https:": 200 | # TODO: This is ugly but I couldn't get it to work in 201 | # Python 3 otherwise. 202 | if ((isinstance(markup, bytes) and not b' ' in markup) 203 | or (isinstance(markup, str) and not ' ' in markup)): 204 | if isinstance(markup, str): 205 | markup = markup.encode("utf8") 206 | warnings.warn( 207 | '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) 208 | 209 | for (self.markup, self.original_encoding, self.declared_html_encoding, 210 | self.contains_replacement_characters) in ( 211 | self.builder.prepare_markup( 212 | markup, from_encoding, exclude_encodings=exclude_encodings)): 213 | self.reset() 214 | try: 215 | self._feed() 216 | break 217 | except ParserRejectedMarkup: 218 | pass 219 | 220 | # Clear out the markup and remove the builder's circular 221 | # reference to this object. 222 | self.markup = None 223 | self.builder.soup = None 224 | 225 | def __copy__(self): 226 | return type(self)(self.encode(), builder=self.builder) 227 | 228 | def __getstate__(self): 229 | # Frequently a tree builder can't be pickled. 230 | d = dict(self.__dict__) 231 | if 'builder' in d and not self.builder.picklable: 232 | del d['builder'] 233 | return d 234 | 235 | def _feed(self): 236 | # Convert the document to Unicode. 237 | self.builder.reset() 238 | 239 | self.builder.feed(self.markup) 240 | # Close out any unfinished strings and close all the open tags. 
241 | self.endData() 242 | while self.currentTag.name != self.ROOT_TAG_NAME: 243 | self.popTag() 244 | 245 | def reset(self): 246 | Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) 247 | self.hidden = 1 248 | self.builder.reset() 249 | self.current_data = [] 250 | self.currentTag = None 251 | self.tagStack = [] 252 | self.preserve_whitespace_tag_stack = [] 253 | self.pushTag(self) 254 | 255 | def new_tag(self, name, namespace=None, nsprefix=None, **attrs): 256 | """Create a new tag associated with this soup.""" 257 | return Tag(None, self.builder, name, namespace, nsprefix, attrs) 258 | 259 | def new_string(self, s, subclass=NavigableString): 260 | """Create a new NavigableString associated with this soup.""" 261 | return subclass(s) 262 | 263 | def insert_before(self, successor): 264 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().") 265 | 266 | def insert_after(self, successor): 267 | raise NotImplementedError("BeautifulSoup objects don't support insert_after().") 268 | 269 | def popTag(self): 270 | tag = self.tagStack.pop() 271 | if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: 272 | self.preserve_whitespace_tag_stack.pop() 273 | #print "Pop", tag.name 274 | if self.tagStack: 275 | self.currentTag = self.tagStack[-1] 276 | return self.currentTag 277 | 278 | def pushTag(self, tag): 279 | #print "Push", tag.name 280 | if self.currentTag: 281 | self.currentTag.contents.append(tag) 282 | self.tagStack.append(tag) 283 | self.currentTag = self.tagStack[-1] 284 | if tag.name in self.builder.preserve_whitespace_tags: 285 | self.preserve_whitespace_tag_stack.append(tag) 286 | 287 | def endData(self, containerClass=NavigableString): 288 | if self.current_data: 289 | current_data = ''.join(self.current_data) 290 | # If whitespace is not preserved, and this string contains 291 | # nothing but ASCII spaces, replace it with a single space 292 | # or newline. 293 | if not self.preserve_whitespace_tag_stack: 294 | strippable = True 295 | for i in current_data: 296 | if i not in self.ASCII_SPACES: 297 | strippable = False 298 | break 299 | if strippable: 300 | if '\n' in current_data: 301 | current_data = '\n' 302 | else: 303 | current_data = ' ' 304 | 305 | # Reset the data collector. 306 | self.current_data = [] 307 | 308 | # Should we add this string to the tree at all? 309 | if self.parse_only and len(self.tagStack) <= 1 and \ 310 | (not self.parse_only.text or \ 311 | not self.parse_only.search(current_data)): 312 | return 313 | 314 | o = containerClass(current_data) 315 | self.object_was_parsed(o) 316 | 317 | def object_was_parsed(self, o, parent=None, most_recent_element=None): 318 | """Add an object to the parse tree.""" 319 | parent = parent or self.currentTag 320 | previous_element = most_recent_element or self._most_recent_element 321 | 322 | next_element = previous_sibling = next_sibling = None 323 | if isinstance(o, Tag): 324 | next_element = o.next_element 325 | next_sibling = o.next_sibling 326 | previous_sibling = o.previous_sibling 327 | if not previous_element: 328 | previous_element = o.previous_element 329 | 330 | o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) 331 | 332 | self._most_recent_element = o 333 | parent.contents.append(o) 334 | 335 | if parent.next_sibling: 336 | # This node is being inserted into an element that has 337 | # already been parsed. Deal with any dangling references. 
338 | index = parent.contents.index(o) 339 | if index == 0: 340 | previous_element = parent 341 | previous_sibling = None 342 | else: 343 | previous_element = previous_sibling = parent.contents[index-1] 344 | if index == len(parent.contents)-1: 345 | next_element = parent.next_sibling 346 | next_sibling = None 347 | else: 348 | next_element = next_sibling = parent.contents[index+1] 349 | 350 | o.previous_element = previous_element 351 | if previous_element: 352 | previous_element.next_element = o 353 | o.next_element = next_element 354 | if next_element: 355 | next_element.previous_element = o 356 | o.next_sibling = next_sibling 357 | if next_sibling: 358 | next_sibling.previous_sibling = o 359 | o.previous_sibling = previous_sibling 360 | if previous_sibling: 361 | previous_sibling.next_sibling = o 362 | 363 | def _popToTag(self, name, nsprefix=None, inclusivePop=True): 364 | """Pops the tag stack up to and including the most recent 365 | instance of the given tag. If inclusivePop is false, pops the tag 366 | stack up to but *not* including the most recent instqance of 367 | the given tag.""" 368 | #print "Popping to %s" % name 369 | if name == self.ROOT_TAG_NAME: 370 | # The BeautifulSoup object itself can never be popped. 371 | return 372 | 373 | most_recently_popped = None 374 | 375 | stack_size = len(self.tagStack) 376 | for i in range(stack_size - 1, 0, -1): 377 | t = self.tagStack[i] 378 | if (name == t.name and nsprefix == t.prefix): 379 | if inclusivePop: 380 | most_recently_popped = self.popTag() 381 | break 382 | most_recently_popped = self.popTag() 383 | 384 | return most_recently_popped 385 | 386 | def handle_starttag(self, name, namespace, nsprefix, attrs): 387 | """Push a start tag on to the stack. 388 | 389 | If this method returns None, the tag was rejected by the 390 | SoupStrainer. You should proceed as if the tag had not occured 391 | in the document. For instance, if this was a self-closing tag, 392 | don't call handle_endtag. 393 | """ 394 | 395 | # print "Start tag %s: %s" % (name, attrs) 396 | self.endData() 397 | 398 | if (self.parse_only and len(self.tagStack) <= 1 399 | and (self.parse_only.text 400 | or not self.parse_only.search_tag(name, attrs))): 401 | return None 402 | 403 | tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, 404 | self.currentTag, self._most_recent_element) 405 | if tag is None: 406 | return tag 407 | if self._most_recent_element: 408 | self._most_recent_element.next_element = tag 409 | self._most_recent_element = tag 410 | self.pushTag(tag) 411 | return tag 412 | 413 | def handle_endtag(self, name, nsprefix=None): 414 | #print "End tag: " + name 415 | self.endData() 416 | self._popToTag(name, nsprefix) 417 | 418 | def handle_data(self, data): 419 | self.current_data.append(data) 420 | 421 | def decode(self, pretty_print=False, 422 | eventual_encoding=DEFAULT_OUTPUT_ENCODING, 423 | formatter="minimal"): 424 | """Returns a string or Unicode representation of this document. 
425 | To get Unicode, pass None for encoding.""" 426 | 427 | if self.is_xml: 428 | # Print the XML declaration 429 | encoding_part = '' 430 | if eventual_encoding != None: 431 | encoding_part = ' encoding="%s"' % eventual_encoding 432 | prefix = '\n' % encoding_part 433 | else: 434 | prefix = '' 435 | if not pretty_print: 436 | indent_level = None 437 | else: 438 | indent_level = 0 439 | return prefix + super(BeautifulSoup, self).decode( 440 | indent_level, eventual_encoding, formatter) 441 | 442 | # Alias to make it easier to type import: 'from bs4 import _soup' 443 | _s = BeautifulSoup 444 | _soup = BeautifulSoup 445 | 446 | class BeautifulStoneSoup(BeautifulSoup): 447 | """Deprecated interface to an XML parser.""" 448 | 449 | def __init__(self, *args, **kwargs): 450 | kwargs['features'] = 'xml' 451 | warnings.warn( 452 | 'The BeautifulStoneSoup class is deprecated. Instead of using ' 453 | 'it, pass features="xml" into the BeautifulSoup constructor.') 454 | super(BeautifulStoneSoup, self).__init__(*args, **kwargs) 455 | 456 | 457 | class StopParsing(Exception): 458 | pass 459 | 460 | class FeatureNotFound(ValueError): 461 | pass 462 | 463 | 464 | #By default, act as an HTML pretty-printer. 465 | if __name__ == '__main__': 466 | import sys 467 | soup = BeautifulSoup(sys.stdin) 468 | print(soup.prettify()) 469 | -------------------------------------------------------------------------------- /Week2/bs4/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synsantss/Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python/1f9d632cf114f734c6e31fa9d3ef4c62ef69c1e1/Week2/bs4/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /Week2/bs4/__pycache__/dammit.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synsantss/Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python/1f9d632cf114f734c6e31fa9d3ef4c62ef69c1e1/Week2/bs4/__pycache__/dammit.cpython-35.pyc -------------------------------------------------------------------------------- /Week2/bs4/__pycache__/element.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synsantss/Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python/1f9d632cf114f734c6e31fa9d3ef4c62ef69c1e1/Week2/bs4/__pycache__/element.cpython-35.pyc -------------------------------------------------------------------------------- /Week2/bs4/builder/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import itertools 3 | import sys 4 | from bs4.element import ( 5 | CharsetMetaAttributeValue, 6 | ContentMetaAttributeValue, 7 | whitespace_re 8 | ) 9 | 10 | __all__ = [ 11 | 'HTMLTreeBuilder', 12 | 'SAXTreeBuilder', 13 | 'TreeBuilder', 14 | 'TreeBuilderRegistry', 15 | ] 16 | 17 | # Some useful features for a TreeBuilder to have. 
18 | FAST = 'fast' 19 | PERMISSIVE = 'permissive' 20 | STRICT = 'strict' 21 | XML = 'xml' 22 | HTML = 'html' 23 | HTML_5 = 'html5' 24 | 25 | 26 | class TreeBuilderRegistry(object): 27 | 28 | def __init__(self): 29 | self.builders_for_feature = defaultdict(list) 30 | self.builders = [] 31 | 32 | def register(self, treebuilder_class): 33 | """Register a treebuilder based on its advertised features.""" 34 | for feature in treebuilder_class.features: 35 | self.builders_for_feature[feature].insert(0, treebuilder_class) 36 | self.builders.insert(0, treebuilder_class) 37 | 38 | def lookup(self, *features): 39 | if len(self.builders) == 0: 40 | # There are no builders at all. 41 | return None 42 | 43 | if len(features) == 0: 44 | # They didn't ask for any features. Give them the most 45 | # recently registered builder. 46 | return self.builders[0] 47 | 48 | # Go down the list of features in order, and eliminate any builders 49 | # that don't match every feature. 50 | features = list(features) 51 | features.reverse() 52 | candidates = None 53 | candidate_set = None 54 | while len(features) > 0: 55 | feature = features.pop() 56 | we_have_the_feature = self.builders_for_feature.get(feature, []) 57 | if len(we_have_the_feature) > 0: 58 | if candidates is None: 59 | candidates = we_have_the_feature 60 | candidate_set = set(candidates) 61 | else: 62 | # Eliminate any candidates that don't have this feature. 63 | candidate_set = candidate_set.intersection( 64 | set(we_have_the_feature)) 65 | 66 | # The only valid candidates are the ones in candidate_set. 67 | # Go through the original list of candidates and pick the first one 68 | # that's in candidate_set. 69 | if candidate_set is None: 70 | return None 71 | for candidate in candidates: 72 | if candidate in candidate_set: 73 | return candidate 74 | return None 75 | 76 | # The BeautifulSoup class will take feature lists from developers and use them 77 | # to look up builders in this registry. 78 | builder_registry = TreeBuilderRegistry() 79 | 80 | class TreeBuilder(object): 81 | """Turn a document into a Beautiful Soup object tree.""" 82 | 83 | NAME = "[Unknown tree builder]" 84 | ALTERNATE_NAMES = [] 85 | features = [] 86 | 87 | is_xml = False 88 | picklable = False 89 | preserve_whitespace_tags = set() 90 | empty_element_tags = None # A tag will be considered an empty-element 91 | # tag when and only when it has no contents. 92 | 93 | # A value for these tag/attribute combinations is a space- or 94 | # comma-separated list of CDATA, rather than a single CDATA. 95 | cdata_list_attributes = {} 96 | 97 | 98 | def __init__(self): 99 | self.soup = None 100 | 101 | def reset(self): 102 | pass 103 | 104 | def can_be_empty_element(self, tag_name): 105 | """Might a tag with this name be an empty-element tag? 106 | 107 | The final markup may or may not actually present this tag as 108 | self-closing. 109 | 110 | For instance: an HTMLBuilder does not consider a

<p> tag to be 111 | an empty-element tag (it's not in 112 | HTMLBuilder.empty_element_tags). This means an empty <p> tag 113 | will be presented as "<p></p>", not "<p/>
". 114 | 115 | The default implementation has no opinion about which tags are 116 | empty-element tags, so a tag will be presented as an 117 | empty-element tag if and only if it has no contents. 118 | "" will become "", and "bar" will 119 | be left alone. 120 | """ 121 | if self.empty_element_tags is None: 122 | return True 123 | return tag_name in self.empty_element_tags 124 | 125 | def feed(self, markup): 126 | raise NotImplementedError() 127 | 128 | def prepare_markup(self, markup, user_specified_encoding=None, 129 | document_declared_encoding=None): 130 | return markup, None, None, False 131 | 132 | def test_fragment_to_document(self, fragment): 133 | """Wrap an HTML fragment to make it look like a document. 134 | 135 | Different parsers do this differently. For instance, lxml 136 | introduces an empty tag, and html5lib 137 | doesn't. Abstracting this away lets us write simple tests 138 | which run HTML fragments through the parser and compare the 139 | results against other HTML fragments. 140 | 141 | This method should not be used outside of tests. 142 | """ 143 | return fragment 144 | 145 | def set_up_substitutions(self, tag): 146 | return False 147 | 148 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 149 | """Replaces class="foo bar" with class=["foo", "bar"] 150 | 151 | Modifies its input in place. 152 | """ 153 | if not attrs: 154 | return attrs 155 | if self.cdata_list_attributes: 156 | universal = self.cdata_list_attributes.get('*', []) 157 | tag_specific = self.cdata_list_attributes.get( 158 | tag_name.lower(), None) 159 | for attr in list(attrs.keys()): 160 | if attr in universal or (tag_specific and attr in tag_specific): 161 | # We have a "class"-type attribute whose string 162 | # value is a whitespace-separated list of 163 | # values. Split it into a list. 164 | value = attrs[attr] 165 | if isinstance(value, str): 166 | values = whitespace_re.split(value) 167 | else: 168 | # html5lib sometimes calls setAttributes twice 169 | # for the same tag when rearranging the parse 170 | # tree. On the second call the attribute value 171 | # here is already a list. If this happens, 172 | # leave the value alone rather than trying to 173 | # split it again. 174 | values = value 175 | attrs[attr] = values 176 | return attrs 177 | 178 | class SAXTreeBuilder(TreeBuilder): 179 | """A Beautiful Soup treebuilder that listens for SAX events.""" 180 | 181 | def feed(self, markup): 182 | raise NotImplementedError() 183 | 184 | def close(self): 185 | pass 186 | 187 | def startElement(self, name, attrs): 188 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 189 | #print "Start %s, %r" % (name, attrs) 190 | self.soup.handle_starttag(name, attrs) 191 | 192 | def endElement(self, name): 193 | #print "End %s" % name 194 | self.soup.handle_endtag(name) 195 | 196 | def startElementNS(self, nsTuple, nodeName, attrs): 197 | # Throw away (ns, nodeName) for now. 198 | self.startElement(nodeName, attrs) 199 | 200 | def endElementNS(self, nsTuple, nodeName): 201 | # Throw away (ns, nodeName) for now. 202 | self.endElement(nodeName) 203 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 204 | 205 | def startPrefixMapping(self, prefix, nodeValue): 206 | # Ignore the prefix for now. 207 | pass 208 | 209 | def endPrefixMapping(self, prefix): 210 | # Ignore the prefix for now. 
211 | # handler.endPrefixMapping(prefix) 212 | pass 213 | 214 | def characters(self, content): 215 | self.soup.handle_data(content) 216 | 217 | def startDocument(self): 218 | pass 219 | 220 | def endDocument(self): 221 | pass 222 | 223 | 224 | class HTMLTreeBuilder(TreeBuilder): 225 | """This TreeBuilder knows facts about HTML. 226 | 227 | Such as which tags are empty-element tags. 228 | """ 229 | 230 | preserve_whitespace_tags = set(['pre', 'textarea']) 231 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 232 | 'spacer', 'link', 'frame', 'base']) 233 | 234 | # The HTML standard defines these attributes as containing a 235 | # space-separated list of values, not a single value. That is, 236 | # class="foo bar" means that the 'class' attribute has two values, 237 | # 'foo' and 'bar', not the single value 'foo bar'. When we 238 | # encounter one of these attributes, we will parse its value into 239 | # a list of values if possible. Upon output, the list will be 240 | # converted back into a string. 241 | cdata_list_attributes = { 242 | "*" : ['class', 'accesskey', 'dropzone'], 243 | "a" : ['rel', 'rev'], 244 | "link" : ['rel', 'rev'], 245 | "td" : ["headers"], 246 | "th" : ["headers"], 247 | "td" : ["headers"], 248 | "form" : ["accept-charset"], 249 | "object" : ["archive"], 250 | 251 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 252 | "area" : ["rel"], 253 | "icon" : ["sizes"], 254 | "iframe" : ["sandbox"], 255 | "output" : ["for"], 256 | } 257 | 258 | def set_up_substitutions(self, tag): 259 | # We are only interested in tags 260 | if tag.name != 'meta': 261 | return False 262 | 263 | http_equiv = tag.get('http-equiv') 264 | content = tag.get('content') 265 | charset = tag.get('charset') 266 | 267 | # We are interested in tags that say what encoding the 268 | # document was originally in. This means HTML 5-style 269 | # tags that provide the "charset" attribute. It also means 270 | # HTML 4-style tags that provide the "content" 271 | # attribute and have "http-equiv" set to "content-type". 272 | # 273 | # In both cases we will replace the value of the appropriate 274 | # attribute with a standin object that can take on any 275 | # encoding. 276 | meta_encoding = None 277 | if charset is not None: 278 | # HTML 5 style: 279 | # 280 | meta_encoding = charset 281 | tag['charset'] = CharsetMetaAttributeValue(charset) 282 | 283 | elif (content is not None and http_equiv is not None 284 | and http_equiv.lower() == 'content-type'): 285 | # HTML 4 style: 286 | # 287 | tag['content'] = ContentMetaAttributeValue(content) 288 | 289 | return (meta_encoding is not None) 290 | 291 | def register_treebuilders_from(module): 292 | """Copy TreeBuilders from the given module into this module.""" 293 | # I'm fairly sure this is not the best way to do this. 294 | this_module = sys.modules['bs4.builder'] 295 | for name in module.__all__: 296 | obj = getattr(module, name) 297 | 298 | if issubclass(obj, TreeBuilder): 299 | setattr(this_module, name, obj) 300 | this_module.__all__.append(name) 301 | # Register the builder while we're at it. 302 | this_module.builder_registry.register(obj) 303 | 304 | class ParserRejectedMarkup(Exception): 305 | pass 306 | 307 | # Builders are registered in reverse order of priority, so that custom 308 | # builder registrations will take precedence. In general, we want lxml 309 | # to take precedence over html5lib, because it's faster. And we only 310 | # want to use HTMLParser as a last result. 311 | from . 
import _htmlparser 312 | register_treebuilders_from(_htmlparser) 313 | try: 314 | from . import _html5lib 315 | register_treebuilders_from(_html5lib) 316 | except ImportError: 317 | # They don't have html5lib installed. 318 | pass 319 | try: 320 | from . import _lxml 321 | register_treebuilders_from(_lxml) 322 | except ImportError: 323 | # They don't have lxml installed. 324 | pass 325 | -------------------------------------------------------------------------------- /Week2/bs4/builder/__init__.py.bak: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import itertools 3 | import sys 4 | from bs4.element import ( 5 | CharsetMetaAttributeValue, 6 | ContentMetaAttributeValue, 7 | whitespace_re 8 | ) 9 | 10 | __all__ = [ 11 | 'HTMLTreeBuilder', 12 | 'SAXTreeBuilder', 13 | 'TreeBuilder', 14 | 'TreeBuilderRegistry', 15 | ] 16 | 17 | # Some useful features for a TreeBuilder to have. 18 | FAST = 'fast' 19 | PERMISSIVE = 'permissive' 20 | STRICT = 'strict' 21 | XML = 'xml' 22 | HTML = 'html' 23 | HTML_5 = 'html5' 24 | 25 | 26 | class TreeBuilderRegistry(object): 27 | 28 | def __init__(self): 29 | self.builders_for_feature = defaultdict(list) 30 | self.builders = [] 31 | 32 | def register(self, treebuilder_class): 33 | """Register a treebuilder based on its advertised features.""" 34 | for feature in treebuilder_class.features: 35 | self.builders_for_feature[feature].insert(0, treebuilder_class) 36 | self.builders.insert(0, treebuilder_class) 37 | 38 | def lookup(self, *features): 39 | if len(self.builders) == 0: 40 | # There are no builders at all. 41 | return None 42 | 43 | if len(features) == 0: 44 | # They didn't ask for any features. Give them the most 45 | # recently registered builder. 46 | return self.builders[0] 47 | 48 | # Go down the list of features in order, and eliminate any builders 49 | # that don't match every feature. 50 | features = list(features) 51 | features.reverse() 52 | candidates = None 53 | candidate_set = None 54 | while len(features) > 0: 55 | feature = features.pop() 56 | we_have_the_feature = self.builders_for_feature.get(feature, []) 57 | if len(we_have_the_feature) > 0: 58 | if candidates is None: 59 | candidates = we_have_the_feature 60 | candidate_set = set(candidates) 61 | else: 62 | # Eliminate any candidates that don't have this feature. 63 | candidate_set = candidate_set.intersection( 64 | set(we_have_the_feature)) 65 | 66 | # The only valid candidates are the ones in candidate_set. 67 | # Go through the original list of candidates and pick the first one 68 | # that's in candidate_set. 69 | if candidate_set is None: 70 | return None 71 | for candidate in candidates: 72 | if candidate in candidate_set: 73 | return candidate 74 | return None 75 | 76 | # The BeautifulSoup class will take feature lists from developers and use them 77 | # to look up builders in this registry. 78 | builder_registry = TreeBuilderRegistry() 79 | 80 | class TreeBuilder(object): 81 | """Turn a document into a Beautiful Soup object tree.""" 82 | 83 | NAME = "[Unknown tree builder]" 84 | ALTERNATE_NAMES = [] 85 | features = [] 86 | 87 | is_xml = False 88 | picklable = False 89 | preserve_whitespace_tags = set() 90 | empty_element_tags = None # A tag will be considered an empty-element 91 | # tag when and only when it has no contents. 92 | 93 | # A value for these tag/attribute combinations is a space- or 94 | # comma-separated list of CDATA, rather than a single CDATA. 
95 | cdata_list_attributes = {} 96 | 97 | 98 | def __init__(self): 99 | self.soup = None 100 | 101 | def reset(self): 102 | pass 103 | 104 | def can_be_empty_element(self, tag_name): 105 | """Might a tag with this name be an empty-element tag? 106 | 107 | The final markup may or may not actually present this tag as 108 | self-closing. 109 | 110 | For instance: an HTMLBuilder does not consider a

<p> tag to be 111 | an empty-element tag (it's not in 112 | HTMLBuilder.empty_element_tags). This means an empty <p> tag 113 | will be presented as "<p></p>", not "<p/>
". 114 | 115 | The default implementation has no opinion about which tags are 116 | empty-element tags, so a tag will be presented as an 117 | empty-element tag if and only if it has no contents. 118 | "" will become "", and "bar" will 119 | be left alone. 120 | """ 121 | if self.empty_element_tags is None: 122 | return True 123 | return tag_name in self.empty_element_tags 124 | 125 | def feed(self, markup): 126 | raise NotImplementedError() 127 | 128 | def prepare_markup(self, markup, user_specified_encoding=None, 129 | document_declared_encoding=None): 130 | return markup, None, None, False 131 | 132 | def test_fragment_to_document(self, fragment): 133 | """Wrap an HTML fragment to make it look like a document. 134 | 135 | Different parsers do this differently. For instance, lxml 136 | introduces an empty tag, and html5lib 137 | doesn't. Abstracting this away lets us write simple tests 138 | which run HTML fragments through the parser and compare the 139 | results against other HTML fragments. 140 | 141 | This method should not be used outside of tests. 142 | """ 143 | return fragment 144 | 145 | def set_up_substitutions(self, tag): 146 | return False 147 | 148 | def _replace_cdata_list_attribute_values(self, tag_name, attrs): 149 | """Replaces class="foo bar" with class=["foo", "bar"] 150 | 151 | Modifies its input in place. 152 | """ 153 | if not attrs: 154 | return attrs 155 | if self.cdata_list_attributes: 156 | universal = self.cdata_list_attributes.get('*', []) 157 | tag_specific = self.cdata_list_attributes.get( 158 | tag_name.lower(), None) 159 | for attr in attrs.keys(): 160 | if attr in universal or (tag_specific and attr in tag_specific): 161 | # We have a "class"-type attribute whose string 162 | # value is a whitespace-separated list of 163 | # values. Split it into a list. 164 | value = attrs[attr] 165 | if isinstance(value, basestring): 166 | values = whitespace_re.split(value) 167 | else: 168 | # html5lib sometimes calls setAttributes twice 169 | # for the same tag when rearranging the parse 170 | # tree. On the second call the attribute value 171 | # here is already a list. If this happens, 172 | # leave the value alone rather than trying to 173 | # split it again. 174 | values = value 175 | attrs[attr] = values 176 | return attrs 177 | 178 | class SAXTreeBuilder(TreeBuilder): 179 | """A Beautiful Soup treebuilder that listens for SAX events.""" 180 | 181 | def feed(self, markup): 182 | raise NotImplementedError() 183 | 184 | def close(self): 185 | pass 186 | 187 | def startElement(self, name, attrs): 188 | attrs = dict((key[1], value) for key, value in list(attrs.items())) 189 | #print "Start %s, %r" % (name, attrs) 190 | self.soup.handle_starttag(name, attrs) 191 | 192 | def endElement(self, name): 193 | #print "End %s" % name 194 | self.soup.handle_endtag(name) 195 | 196 | def startElementNS(self, nsTuple, nodeName, attrs): 197 | # Throw away (ns, nodeName) for now. 198 | self.startElement(nodeName, attrs) 199 | 200 | def endElementNS(self, nsTuple, nodeName): 201 | # Throw away (ns, nodeName) for now. 202 | self.endElement(nodeName) 203 | #handler.endElementNS((ns, node.nodeName), node.nodeName) 204 | 205 | def startPrefixMapping(self, prefix, nodeValue): 206 | # Ignore the prefix for now. 207 | pass 208 | 209 | def endPrefixMapping(self, prefix): 210 | # Ignore the prefix for now. 
211 | # handler.endPrefixMapping(prefix) 212 | pass 213 | 214 | def characters(self, content): 215 | self.soup.handle_data(content) 216 | 217 | def startDocument(self): 218 | pass 219 | 220 | def endDocument(self): 221 | pass 222 | 223 | 224 | class HTMLTreeBuilder(TreeBuilder): 225 | """This TreeBuilder knows facts about HTML. 226 | 227 | Such as which tags are empty-element tags. 228 | """ 229 | 230 | preserve_whitespace_tags = set(['pre', 'textarea']) 231 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 232 | 'spacer', 'link', 'frame', 'base']) 233 | 234 | # The HTML standard defines these attributes as containing a 235 | # space-separated list of values, not a single value. That is, 236 | # class="foo bar" means that the 'class' attribute has two values, 237 | # 'foo' and 'bar', not the single value 'foo bar'. When we 238 | # encounter one of these attributes, we will parse its value into 239 | # a list of values if possible. Upon output, the list will be 240 | # converted back into a string. 241 | cdata_list_attributes = { 242 | "*" : ['class', 'accesskey', 'dropzone'], 243 | "a" : ['rel', 'rev'], 244 | "link" : ['rel', 'rev'], 245 | "td" : ["headers"], 246 | "th" : ["headers"], 247 | "td" : ["headers"], 248 | "form" : ["accept-charset"], 249 | "object" : ["archive"], 250 | 251 | # These are HTML5 specific, as are *.accesskey and *.dropzone above. 252 | "area" : ["rel"], 253 | "icon" : ["sizes"], 254 | "iframe" : ["sandbox"], 255 | "output" : ["for"], 256 | } 257 | 258 | def set_up_substitutions(self, tag): 259 | # We are only interested in tags 260 | if tag.name != 'meta': 261 | return False 262 | 263 | http_equiv = tag.get('http-equiv') 264 | content = tag.get('content') 265 | charset = tag.get('charset') 266 | 267 | # We are interested in tags that say what encoding the 268 | # document was originally in. This means HTML 5-style 269 | # tags that provide the "charset" attribute. It also means 270 | # HTML 4-style tags that provide the "content" 271 | # attribute and have "http-equiv" set to "content-type". 272 | # 273 | # In both cases we will replace the value of the appropriate 274 | # attribute with a standin object that can take on any 275 | # encoding. 276 | meta_encoding = None 277 | if charset is not None: 278 | # HTML 5 style: 279 | # 280 | meta_encoding = charset 281 | tag['charset'] = CharsetMetaAttributeValue(charset) 282 | 283 | elif (content is not None and http_equiv is not None 284 | and http_equiv.lower() == 'content-type'): 285 | # HTML 4 style: 286 | # 287 | tag['content'] = ContentMetaAttributeValue(content) 288 | 289 | return (meta_encoding is not None) 290 | 291 | def register_treebuilders_from(module): 292 | """Copy TreeBuilders from the given module into this module.""" 293 | # I'm fairly sure this is not the best way to do this. 294 | this_module = sys.modules['bs4.builder'] 295 | for name in module.__all__: 296 | obj = getattr(module, name) 297 | 298 | if issubclass(obj, TreeBuilder): 299 | setattr(this_module, name, obj) 300 | this_module.__all__.append(name) 301 | # Register the builder while we're at it. 302 | this_module.builder_registry.register(obj) 303 | 304 | class ParserRejectedMarkup(Exception): 305 | pass 306 | 307 | # Builders are registered in reverse order of priority, so that custom 308 | # builder registrations will take precedence. In general, we want lxml 309 | # to take precedence over html5lib, because it's faster. And we only 310 | # want to use HTMLParser as a last result. 311 | from . 
import _htmlparser 312 | register_treebuilders_from(_htmlparser) 313 | try: 314 | from . import _html5lib 315 | register_treebuilders_from(_html5lib) 316 | except ImportError: 317 | # They don't have html5lib installed. 318 | pass 319 | try: 320 | from . import _lxml 321 | register_treebuilders_from(_lxml) 322 | except ImportError: 323 | # They don't have lxml installed. 324 | pass 325 | -------------------------------------------------------------------------------- /Week2/bs4/builder/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synsantss/Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python/1f9d632cf114f734c6e31fa9d3ef4c62ef69c1e1/Week2/bs4/builder/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /Week2/bs4/builder/__pycache__/_html5lib.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synsantss/Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python/1f9d632cf114f734c6e31fa9d3ef4c62ef69c1e1/Week2/bs4/builder/__pycache__/_html5lib.cpython-35.pyc -------------------------------------------------------------------------------- /Week2/bs4/builder/__pycache__/_htmlparser.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synsantss/Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python/1f9d632cf114f734c6e31fa9d3ef4c62ef69c1e1/Week2/bs4/builder/__pycache__/_htmlparser.cpython-35.pyc -------------------------------------------------------------------------------- /Week2/bs4/builder/__pycache__/_lxml.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synsantss/Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python/1f9d632cf114f734c6e31fa9d3ef4c62ef69c1e1/Week2/bs4/builder/__pycache__/_lxml.cpython-35.pyc -------------------------------------------------------------------------------- /Week2/bs4/builder/_html5lib.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'HTML5TreeBuilder', 3 | ] 4 | 5 | from pdb import set_trace 6 | import warnings 7 | from bs4.builder import ( 8 | PERMISSIVE, 9 | HTML, 10 | HTML_5, 11 | HTMLTreeBuilder, 12 | ) 13 | from bs4.element import ( 14 | NamespacedAttribute, 15 | whitespace_re, 16 | ) 17 | import html5lib 18 | from html5lib.constants import namespaces 19 | from bs4.element import ( 20 | Comment, 21 | Doctype, 22 | NavigableString, 23 | Tag, 24 | ) 25 | 26 | class HTML5TreeBuilder(HTMLTreeBuilder): 27 | """Use html5lib to build a tree.""" 28 | 29 | NAME = "html5lib" 30 | 31 | features = [NAME, PERMISSIVE, HTML_5, HTML] 32 | 33 | def prepare_markup(self, markup, user_specified_encoding, 34 | document_declared_encoding=None, exclude_encodings=None): 35 | # Store the user-specified encoding for use later on. 36 | self.user_specified_encoding = user_specified_encoding 37 | 38 | # document_declared_encoding and exclude_encodings aren't used 39 | # ATM because the html5lib TreeBuilder doesn't use 40 | # UnicodeDammit. 41 | if exclude_encodings: 42 | warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") 43 | yield (markup, None, None, False) 44 | 45 | # These methods are defined by Beautiful Soup. 
46 | def feed(self, markup): 47 | if self.soup.parse_only is not None: 48 | warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") 49 | parser = html5lib.HTMLParser(tree=self.create_treebuilder) 50 | doc = parser.parse(markup, encoding=self.user_specified_encoding) 51 | 52 | # Set the character encoding detected by the tokenizer. 53 | if isinstance(markup, str): 54 | # We need to special-case this because html5lib sets 55 | # charEncoding to UTF-8 if it gets Unicode input. 56 | doc.original_encoding = None 57 | else: 58 | doc.original_encoding = parser.tokenizer.stream.charEncoding[0] 59 | 60 | def create_treebuilder(self, namespaceHTMLElements): 61 | self.underlying_builder = TreeBuilderForHtml5lib( 62 | self.soup, namespaceHTMLElements) 63 | return self.underlying_builder 64 | 65 | def test_fragment_to_document(self, fragment): 66 | """See `TreeBuilder`.""" 67 | return '%s' % fragment 68 | 69 | 70 | class TreeBuilderForHtml5lib(html5lib.treebuilders.base.TreeBuilder): 71 | 72 | def __init__(self, soup, namespaceHTMLElements): 73 | self.soup = soup 74 | super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 75 | 76 | def documentClass(self): 77 | self.soup.reset() 78 | return Element(self.soup, self.soup, None) 79 | 80 | def insertDoctype(self, token): 81 | name = token["name"] 82 | publicId = token["publicId"] 83 | systemId = token["systemId"] 84 | 85 | doctype = Doctype.for_name_and_ids(name, publicId, systemId) 86 | self.soup.object_was_parsed(doctype) 87 | 88 | def elementClass(self, name, namespace): 89 | tag = self.soup.new_tag(name, namespace) 90 | return Element(tag, self.soup, namespace) 91 | 92 | def commentClass(self, data): 93 | return TextNode(Comment(data), self.soup) 94 | 95 | def fragmentClass(self, BeautifulSoup): 96 | self.soup = BeautifulSoup("") 97 | self.soup.name = "[document_fragment]" 98 | return Element(self.soup, self.soup, None) 99 | 100 | def appendChild(self, node): 101 | # XXX This code is not covered by the BS4 tests. 102 | self.soup.append(node.element) 103 | 104 | def getDocument(self): 105 | return self.soup 106 | 107 | def getFragment(self): 108 | return html5lib.treebuilders.base.TreeBuilder.getFragment(self).element 109 | 110 | class AttrList(object): 111 | def __init__(self, element): 112 | self.element = element 113 | self.attrs = dict(self.element.attrs) 114 | def __iter__(self): 115 | return list(self.attrs.items()).__iter__() 116 | def __setitem__(self, name, value): 117 | # If this attribute is a multi-valued attribute for this element, 118 | # turn its value into a list. 
119 | list_attr = HTML5TreeBuilder.cdata_list_attributes 120 | if (name in list_attr['*'] 121 | or (self.element.name in list_attr 122 | and name in list_attr[self.element.name])): 123 | value = whitespace_re.split(value) 124 | self.element[name] = value 125 | def items(self): 126 | return list(self.attrs.items()) 127 | def keys(self): 128 | return list(self.attrs.keys()) 129 | def __len__(self): 130 | return len(self.attrs) 131 | def __getitem__(self, name): 132 | return self.attrs[name] 133 | def __contains__(self, name): 134 | return name in list(self.attrs.keys()) 135 | 136 | 137 | class Element(html5lib.treebuilders.base.Node): 138 | def __init__(self, element, soup, namespace): 139 | html5lib.treebuilders.base.Node.__init__(self, element.name) 140 | self.element = element 141 | self.soup = soup 142 | self.namespace = namespace 143 | 144 | def appendChild(self, node): 145 | string_child = child = None 146 | if isinstance(node, str): 147 | # Some other piece of code decided to pass in a string 148 | # instead of creating a TextElement object to contain the 149 | # string. 150 | string_child = child = node 151 | elif isinstance(node, Tag): 152 | # Some other piece of code decided to pass in a Tag 153 | # instead of creating an Element object to contain the 154 | # Tag. 155 | child = node 156 | elif node.element.__class__ == NavigableString: 157 | string_child = child = node.element 158 | else: 159 | child = node.element 160 | 161 | if not isinstance(child, str) and child.parent is not None: 162 | node.element.extract() 163 | 164 | if (string_child and self.element.contents 165 | and self.element.contents[-1].__class__ == NavigableString): 166 | # We are appending a string onto another string. 167 | # TODO This has O(n^2) performance, for input like 168 | # "aaa..." 169 | old_element = self.element.contents[-1] 170 | new_element = self.soup.new_string(old_element + string_child) 171 | old_element.replace_with(new_element) 172 | self.soup._most_recent_element = new_element 173 | else: 174 | if isinstance(node, str): 175 | # Create a brand new NavigableString from this string. 176 | child = self.soup.new_string(node) 177 | 178 | # Tell Beautiful Soup to act as if it parsed this element 179 | # immediately after the parent's last descendant. (Or 180 | # immediately after the parent, if it has no children.) 181 | if self.element.contents: 182 | most_recent_element = self.element._last_descendant(False) 183 | elif self.element.next_element is not None: 184 | # Something from further ahead in the parse tree is 185 | # being inserted into this earlier element. This is 186 | # very annoying because it means an expensive search 187 | # for the last element in the tree. 
188 | most_recent_element = self.soup._last_descendant() 189 | else: 190 | most_recent_element = self.element 191 | 192 | self.soup.object_was_parsed( 193 | child, parent=self.element, 194 | most_recent_element=most_recent_element) 195 | 196 | def getAttributes(self): 197 | return AttrList(self.element) 198 | 199 | def setAttributes(self, attributes): 200 | 201 | if attributes is not None and len(attributes) > 0: 202 | 203 | converted_attributes = [] 204 | for name, value in list(attributes.items()): 205 | if isinstance(name, tuple): 206 | new_name = NamespacedAttribute(*name) 207 | del attributes[name] 208 | attributes[new_name] = value 209 | 210 | self.soup.builder._replace_cdata_list_attribute_values( 211 | self.name, attributes) 212 | for name, value in list(attributes.items()): 213 | self.element[name] = value 214 | 215 | # The attributes may contain variables that need substitution. 216 | # Call set_up_substitutions manually. 217 | # 218 | # The Tag constructor called this method when the Tag was created, 219 | # but we just set/changed the attributes, so call it again. 220 | self.soup.builder.set_up_substitutions(self.element) 221 | attributes = property(getAttributes, setAttributes) 222 | 223 | def insertText(self, data, insertBefore=None): 224 | if insertBefore: 225 | text = TextNode(self.soup.new_string(data), self.soup) 226 | self.insertBefore(data, insertBefore) 227 | else: 228 | self.appendChild(data) 229 | 230 | def insertBefore(self, node, refNode): 231 | index = self.element.index(refNode.element) 232 | if (node.element.__class__ == NavigableString and self.element.contents 233 | and self.element.contents[index-1].__class__ == NavigableString): 234 | # (See comments in appendChild) 235 | old_node = self.element.contents[index-1] 236 | new_str = self.soup.new_string(old_node + node.element) 237 | old_node.replace_with(new_str) 238 | else: 239 | self.element.insert(index, node.element) 240 | node.parent = self 241 | 242 | def removeChild(self, node): 243 | node.element.extract() 244 | 245 | def reparentChildren(self, new_parent): 246 | """Move all of this tag's children into another tag.""" 247 | # print "MOVE", self.element.contents 248 | # print "FROM", self.element 249 | # print "TO", new_parent.element 250 | element = self.element 251 | new_parent_element = new_parent.element 252 | # Determine what this tag's next_element will be once all the children 253 | # are removed. 254 | final_next_element = element.next_sibling 255 | 256 | new_parents_last_descendant = new_parent_element._last_descendant(False, False) 257 | if len(new_parent_element.contents) > 0: 258 | # The new parent already contains children. We will be 259 | # appending this tag's children to the end. 260 | new_parents_last_child = new_parent_element.contents[-1] 261 | new_parents_last_descendant_next_element = new_parents_last_descendant.next_element 262 | else: 263 | # The new parent contains no children. 
264 | new_parents_last_child = None 265 | new_parents_last_descendant_next_element = new_parent_element.next_element 266 | 267 | to_append = element.contents 268 | append_after = new_parent_element.contents 269 | if len(to_append) > 0: 270 | # Set the first child's previous_element and previous_sibling 271 | # to elements within the new parent 272 | first_child = to_append[0] 273 | if new_parents_last_descendant: 274 | first_child.previous_element = new_parents_last_descendant 275 | else: 276 | first_child.previous_element = new_parent_element 277 | first_child.previous_sibling = new_parents_last_child 278 | if new_parents_last_descendant: 279 | new_parents_last_descendant.next_element = first_child 280 | else: 281 | new_parent_element.next_element = first_child 282 | if new_parents_last_child: 283 | new_parents_last_child.next_sibling = first_child 284 | 285 | # Fix the last child's next_element and next_sibling 286 | last_child = to_append[-1] 287 | last_child.next_element = new_parents_last_descendant_next_element 288 | if new_parents_last_descendant_next_element: 289 | new_parents_last_descendant_next_element.previous_element = last_child 290 | last_child.next_sibling = None 291 | 292 | for child in to_append: 293 | child.parent = new_parent_element 294 | new_parent_element.contents.append(child) 295 | 296 | # Now that this element has no children, change its .next_element. 297 | element.contents = [] 298 | element.next_element = final_next_element 299 | 300 | # print "DONE WITH MOVE" 301 | # print "FROM", self.element 302 | # print "TO", new_parent_element 303 | 304 | def cloneNode(self): 305 | tag = self.soup.new_tag(self.element.name, self.namespace) 306 | node = Element(tag, self.soup, self.namespace) 307 | for key,value in self.attributes: 308 | node.attributes[key] = value 309 | return node 310 | 311 | def hasContent(self): 312 | return self.element.contents 313 | 314 | def getNameTuple(self): 315 | if self.namespace == None: 316 | return namespaces["html"], self.name 317 | else: 318 | return self.namespace, self.name 319 | 320 | nameTuple = property(getNameTuple) 321 | 322 | class TextNode(Element): 323 | def __init__(self, element, soup): 324 | html5lib.treebuilders.base.Node.__init__(self, None) 325 | self.element = element 326 | self.soup = soup 327 | 328 | def cloneNode(self): 329 | raise NotImplementedError 330 | -------------------------------------------------------------------------------- /Week2/bs4/builder/_html5lib.py.bak: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'HTML5TreeBuilder', 3 | ] 4 | 5 | from pdb import set_trace 6 | import warnings 7 | from bs4.builder import ( 8 | PERMISSIVE, 9 | HTML, 10 | HTML_5, 11 | HTMLTreeBuilder, 12 | ) 13 | from bs4.element import ( 14 | NamespacedAttribute, 15 | whitespace_re, 16 | ) 17 | import html5lib 18 | from html5lib.constants import namespaces 19 | from bs4.element import ( 20 | Comment, 21 | Doctype, 22 | NavigableString, 23 | Tag, 24 | ) 25 | 26 | class HTML5TreeBuilder(HTMLTreeBuilder): 27 | """Use html5lib to build a tree.""" 28 | 29 | NAME = "html5lib" 30 | 31 | features = [NAME, PERMISSIVE, HTML_5, HTML] 32 | 33 | def prepare_markup(self, markup, user_specified_encoding, 34 | document_declared_encoding=None, exclude_encodings=None): 35 | # Store the user-specified encoding for use later on. 
36 | self.user_specified_encoding = user_specified_encoding 37 | 38 | # document_declared_encoding and exclude_encodings aren't used 39 | # ATM because the html5lib TreeBuilder doesn't use 40 | # UnicodeDammit. 41 | if exclude_encodings: 42 | warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") 43 | yield (markup, None, None, False) 44 | 45 | # These methods are defined by Beautiful Soup. 46 | def feed(self, markup): 47 | if self.soup.parse_only is not None: 48 | warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") 49 | parser = html5lib.HTMLParser(tree=self.create_treebuilder) 50 | doc = parser.parse(markup, encoding=self.user_specified_encoding) 51 | 52 | # Set the character encoding detected by the tokenizer. 53 | if isinstance(markup, unicode): 54 | # We need to special-case this because html5lib sets 55 | # charEncoding to UTF-8 if it gets Unicode input. 56 | doc.original_encoding = None 57 | else: 58 | doc.original_encoding = parser.tokenizer.stream.charEncoding[0] 59 | 60 | def create_treebuilder(self, namespaceHTMLElements): 61 | self.underlying_builder = TreeBuilderForHtml5lib( 62 | self.soup, namespaceHTMLElements) 63 | return self.underlying_builder 64 | 65 | def test_fragment_to_document(self, fragment): 66 | """See `TreeBuilder`.""" 67 | return u'%s' % fragment 68 | 69 | 70 | class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): 71 | 72 | def __init__(self, soup, namespaceHTMLElements): 73 | self.soup = soup 74 | super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) 75 | 76 | def documentClass(self): 77 | self.soup.reset() 78 | return Element(self.soup, self.soup, None) 79 | 80 | def insertDoctype(self, token): 81 | name = token["name"] 82 | publicId = token["publicId"] 83 | systemId = token["systemId"] 84 | 85 | doctype = Doctype.for_name_and_ids(name, publicId, systemId) 86 | self.soup.object_was_parsed(doctype) 87 | 88 | def elementClass(self, name, namespace): 89 | tag = self.soup.new_tag(name, namespace) 90 | return Element(tag, self.soup, namespace) 91 | 92 | def commentClass(self, data): 93 | return TextNode(Comment(data), self.soup) 94 | 95 | def fragmentClass(self): 96 | self.soup = BeautifulSoup("") 97 | self.soup.name = "[document_fragment]" 98 | return Element(self.soup, self.soup, None) 99 | 100 | def appendChild(self, node): 101 | # XXX This code is not covered by the BS4 tests. 102 | self.soup.append(node.element) 103 | 104 | def getDocument(self): 105 | return self.soup 106 | 107 | def getFragment(self): 108 | return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element 109 | 110 | class AttrList(object): 111 | def __init__(self, element): 112 | self.element = element 113 | self.attrs = dict(self.element.attrs) 114 | def __iter__(self): 115 | return list(self.attrs.items()).__iter__() 116 | def __setitem__(self, name, value): 117 | # If this attribute is a multi-valued attribute for this element, 118 | # turn its value into a list. 
119 | list_attr = HTML5TreeBuilder.cdata_list_attributes 120 | if (name in list_attr['*'] 121 | or (self.element.name in list_attr 122 | and name in list_attr[self.element.name])): 123 | value = whitespace_re.split(value) 124 | self.element[name] = value 125 | def items(self): 126 | return list(self.attrs.items()) 127 | def keys(self): 128 | return list(self.attrs.keys()) 129 | def __len__(self): 130 | return len(self.attrs) 131 | def __getitem__(self, name): 132 | return self.attrs[name] 133 | def __contains__(self, name): 134 | return name in list(self.attrs.keys()) 135 | 136 | 137 | class Element(html5lib.treebuilders._base.Node): 138 | def __init__(self, element, soup, namespace): 139 | html5lib.treebuilders._base.Node.__init__(self, element.name) 140 | self.element = element 141 | self.soup = soup 142 | self.namespace = namespace 143 | 144 | def appendChild(self, node): 145 | string_child = child = None 146 | if isinstance(node, basestring): 147 | # Some other piece of code decided to pass in a string 148 | # instead of creating a TextElement object to contain the 149 | # string. 150 | string_child = child = node 151 | elif isinstance(node, Tag): 152 | # Some other piece of code decided to pass in a Tag 153 | # instead of creating an Element object to contain the 154 | # Tag. 155 | child = node 156 | elif node.element.__class__ == NavigableString: 157 | string_child = child = node.element 158 | else: 159 | child = node.element 160 | 161 | if not isinstance(child, basestring) and child.parent is not None: 162 | node.element.extract() 163 | 164 | if (string_child and self.element.contents 165 | and self.element.contents[-1].__class__ == NavigableString): 166 | # We are appending a string onto another string. 167 | # TODO This has O(n^2) performance, for input like 168 | # "aaa..." 169 | old_element = self.element.contents[-1] 170 | new_element = self.soup.new_string(old_element + string_child) 171 | old_element.replace_with(new_element) 172 | self.soup._most_recent_element = new_element 173 | else: 174 | if isinstance(node, basestring): 175 | # Create a brand new NavigableString from this string. 176 | child = self.soup.new_string(node) 177 | 178 | # Tell Beautiful Soup to act as if it parsed this element 179 | # immediately after the parent's last descendant. (Or 180 | # immediately after the parent, if it has no children.) 181 | if self.element.contents: 182 | most_recent_element = self.element._last_descendant(False) 183 | elif self.element.next_element is not None: 184 | # Something from further ahead in the parse tree is 185 | # being inserted into this earlier element. This is 186 | # very annoying because it means an expensive search 187 | # for the last element in the tree. 
188 | most_recent_element = self.soup._last_descendant() 189 | else: 190 | most_recent_element = self.element 191 | 192 | self.soup.object_was_parsed( 193 | child, parent=self.element, 194 | most_recent_element=most_recent_element) 195 | 196 | def getAttributes(self): 197 | return AttrList(self.element) 198 | 199 | def setAttributes(self, attributes): 200 | 201 | if attributes is not None and len(attributes) > 0: 202 | 203 | converted_attributes = [] 204 | for name, value in list(attributes.items()): 205 | if isinstance(name, tuple): 206 | new_name = NamespacedAttribute(*name) 207 | del attributes[name] 208 | attributes[new_name] = value 209 | 210 | self.soup.builder._replace_cdata_list_attribute_values( 211 | self.name, attributes) 212 | for name, value in attributes.items(): 213 | self.element[name] = value 214 | 215 | # The attributes may contain variables that need substitution. 216 | # Call set_up_substitutions manually. 217 | # 218 | # The Tag constructor called this method when the Tag was created, 219 | # but we just set/changed the attributes, so call it again. 220 | self.soup.builder.set_up_substitutions(self.element) 221 | attributes = property(getAttributes, setAttributes) 222 | 223 | def insertText(self, data, insertBefore=None): 224 | if insertBefore: 225 | text = TextNode(self.soup.new_string(data), self.soup) 226 | self.insertBefore(data, insertBefore) 227 | else: 228 | self.appendChild(data) 229 | 230 | def insertBefore(self, node, refNode): 231 | index = self.element.index(refNode.element) 232 | if (node.element.__class__ == NavigableString and self.element.contents 233 | and self.element.contents[index-1].__class__ == NavigableString): 234 | # (See comments in appendChild) 235 | old_node = self.element.contents[index-1] 236 | new_str = self.soup.new_string(old_node + node.element) 237 | old_node.replace_with(new_str) 238 | else: 239 | self.element.insert(index, node.element) 240 | node.parent = self 241 | 242 | def removeChild(self, node): 243 | node.element.extract() 244 | 245 | def reparentChildren(self, new_parent): 246 | """Move all of this tag's children into another tag.""" 247 | # print "MOVE", self.element.contents 248 | # print "FROM", self.element 249 | # print "TO", new_parent.element 250 | element = self.element 251 | new_parent_element = new_parent.element 252 | # Determine what this tag's next_element will be once all the children 253 | # are removed. 254 | final_next_element = element.next_sibling 255 | 256 | new_parents_last_descendant = new_parent_element._last_descendant(False, False) 257 | if len(new_parent_element.contents) > 0: 258 | # The new parent already contains children. We will be 259 | # appending this tag's children to the end. 260 | new_parents_last_child = new_parent_element.contents[-1] 261 | new_parents_last_descendant_next_element = new_parents_last_descendant.next_element 262 | else: 263 | # The new parent contains no children. 
264 | new_parents_last_child = None 265 | new_parents_last_descendant_next_element = new_parent_element.next_element 266 | 267 | to_append = element.contents 268 | append_after = new_parent_element.contents 269 | if len(to_append) > 0: 270 | # Set the first child's previous_element and previous_sibling 271 | # to elements within the new parent 272 | first_child = to_append[0] 273 | if new_parents_last_descendant: 274 | first_child.previous_element = new_parents_last_descendant 275 | else: 276 | first_child.previous_element = new_parent_element 277 | first_child.previous_sibling = new_parents_last_child 278 | if new_parents_last_descendant: 279 | new_parents_last_descendant.next_element = first_child 280 | else: 281 | new_parent_element.next_element = first_child 282 | if new_parents_last_child: 283 | new_parents_last_child.next_sibling = first_child 284 | 285 | # Fix the last child's next_element and next_sibling 286 | last_child = to_append[-1] 287 | last_child.next_element = new_parents_last_descendant_next_element 288 | if new_parents_last_descendant_next_element: 289 | new_parents_last_descendant_next_element.previous_element = last_child 290 | last_child.next_sibling = None 291 | 292 | for child in to_append: 293 | child.parent = new_parent_element 294 | new_parent_element.contents.append(child) 295 | 296 | # Now that this element has no children, change its .next_element. 297 | element.contents = [] 298 | element.next_element = final_next_element 299 | 300 | # print "DONE WITH MOVE" 301 | # print "FROM", self.element 302 | # print "TO", new_parent_element 303 | 304 | def cloneNode(self): 305 | tag = self.soup.new_tag(self.element.name, self.namespace) 306 | node = Element(tag, self.soup, self.namespace) 307 | for key,value in self.attributes: 308 | node.attributes[key] = value 309 | return node 310 | 311 | def hasContent(self): 312 | return self.element.contents 313 | 314 | def getNameTuple(self): 315 | if self.namespace == None: 316 | return namespaces["html"], self.name 317 | else: 318 | return self.namespace, self.name 319 | 320 | nameTuple = property(getNameTuple) 321 | 322 | class TextNode(Element): 323 | def __init__(self, element, soup): 324 | html5lib.treebuilders._base.Node.__init__(self, None) 325 | self.element = element 326 | self.soup = soup 327 | 328 | def cloneNode(self): 329 | raise NotImplementedError 330 | -------------------------------------------------------------------------------- /Week2/bs4/builder/_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | 3 | __all__ = [ 4 | 'HTMLParserTreeBuilder', 5 | ] 6 | 7 | from html.parser import HTMLParser 8 | 9 | try: 10 | from html.parser import HTMLParseError 11 | except ImportError as e: 12 | # HTMLParseError is removed in Python 3.5. Since it can never be 13 | # thrown in 3.5, we can just define our own class as a placeholder. 14 | class HTMLParseError(Exception): 15 | pass 16 | 17 | import sys 18 | import warnings 19 | 20 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 21 | # argument, which we'd like to set to False. Unfortunately, 22 | # http://bugs.python.org/issue13273 makes strict=True a better bet 23 | # before Python 3.2.3. 24 | # 25 | # At the end of this file, we monkeypatch HTMLParser so that 26 | # strict=True works well on Python 3.2.2. 
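# For instance (an illustration, not part of the original module), on a
# modern interpreter such as CPython 3.5 the flags computed below come out as:
#     CONSTRUCTOR_TAKES_STRICT           -> False  (True only on 3.2.3+ within the 3.2 series)
#     CONSTRUCTOR_STRICT_IS_DEPRECATED   -> False  (True only on 3.3)
#     CONSTRUCTOR_TAKES_CONVERT_CHARREFS -> True   (3.4 and later)
# so HTMLParserTreeBuilder.__init__ only adds convert_charrefs=False to the
# arguments passed on to HTMLParser.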
27 | major, minor, release = sys.version_info[:3] 28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 31 | 32 | 33 | from bs4.element import ( 34 | CData, 35 | Comment, 36 | Declaration, 37 | Doctype, 38 | ProcessingInstruction, 39 | ) 40 | from bs4.dammit import EntitySubstitution, UnicodeDammit 41 | 42 | from bs4.builder import ( 43 | HTML, 44 | HTMLTreeBuilder, 45 | STRICT, 46 | ) 47 | 48 | 49 | HTMLPARSER = 'html.parser' 50 | 51 | class BeautifulSoupHTMLParser(HTMLParser): 52 | def handle_starttag(self, name, attrs): 53 | # XXX namespace 54 | attr_dict = {} 55 | for key, value in attrs: 56 | # Change None attribute values to the empty string 57 | # for consistency with the other tree builders. 58 | if value is None: 59 | value = '' 60 | attr_dict[key] = value 61 | attrvalue = '""' 62 | self.soup.handle_starttag(name, None, None, attr_dict) 63 | 64 | def handle_endtag(self, name): 65 | self.soup.handle_endtag(name) 66 | 67 | def handle_data(self, data): 68 | self.soup.handle_data(data) 69 | 70 | def handle_charref(self, name): 71 | # XXX workaround for a bug in HTMLParser. Remove this once 72 | # it's fixed in all supported versions. 73 | # http://bugs.python.org/issue13633 74 | if name.startswith('x'): 75 | real_name = int(name.lstrip('x'), 16) 76 | elif name.startswith('X'): 77 | real_name = int(name.lstrip('X'), 16) 78 | else: 79 | real_name = int(name) 80 | 81 | try: 82 | data = chr(real_name) 83 | except (ValueError, OverflowError) as e: 84 | data = "\N{REPLACEMENT CHARACTER}" 85 | 86 | self.handle_data(data) 87 | 88 | def handle_entityref(self, name): 89 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 90 | if character is not None: 91 | data = character 92 | else: 93 | data = "&%s;" % name 94 | self.handle_data(data) 95 | 96 | def handle_comment(self, data): 97 | self.soup.endData() 98 | self.soup.handle_data(data) 99 | self.soup.endData(Comment) 100 | 101 | def handle_decl(self, data): 102 | self.soup.endData() 103 | if data.startswith("DOCTYPE "): 104 | data = data[len("DOCTYPE "):] 105 | elif data == 'DOCTYPE': 106 | # i.e. "" 107 | data = '' 108 | self.soup.handle_data(data) 109 | self.soup.endData(Doctype) 110 | 111 | def unknown_decl(self, data): 112 | if data.upper().startswith('CDATA['): 113 | cls = CData 114 | data = data[len('CDATA['):] 115 | else: 116 | cls = Declaration 117 | self.soup.endData() 118 | self.soup.handle_data(data) 119 | self.soup.endData(cls) 120 | 121 | def handle_pi(self, data): 122 | self.soup.endData() 123 | self.soup.handle_data(data) 124 | self.soup.endData(ProcessingInstruction) 125 | 126 | 127 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 128 | 129 | is_xml = False 130 | picklable = True 131 | NAME = HTMLPARSER 132 | features = [NAME, HTML, STRICT] 133 | 134 | def __init__(self, *args, **kwargs): 135 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: 136 | kwargs['strict'] = False 137 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: 138 | kwargs['convert_charrefs'] = False 139 | self.parser_args = (args, kwargs) 140 | 141 | def prepare_markup(self, markup, user_specified_encoding=None, 142 | document_declared_encoding=None, exclude_encodings=None): 143 | """ 144 | :return: A 4-tuple (markup, original encoding, encoding 145 | declared within markup, whether any characters had to be 146 | replaced with REPLACEMENT CHARACTER). 
147 | """ 148 | if isinstance(markup, str): 149 | yield (markup, None, None, False) 150 | return 151 | 152 | try_encodings = [user_specified_encoding, document_declared_encoding] 153 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, 154 | exclude_encodings=exclude_encodings) 155 | yield (dammit.markup, dammit.original_encoding, 156 | dammit.declared_html_encoding, 157 | dammit.contains_replacement_characters) 158 | 159 | def feed(self, markup): 160 | args, kwargs = self.parser_args 161 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 162 | parser.soup = self.soup 163 | try: 164 | parser.feed(markup) 165 | except HTMLParseError as e: 166 | warnings.warn(RuntimeWarning( 167 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 168 | raise e 169 | 170 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 171 | # 3.2.3 code. This ensures they don't treat markup like
<a href="http://foo.com/">
as a 172 | # string. 173 | # 174 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 175 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 176 | import re 177 | attrfind_tolerant = re.compile( 178 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 179 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 180 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 181 | 182 | locatestarttagend = re.compile(r""" 183 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 184 | (?:\s+ # whitespace before attribute name 185 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 186 | (?:\s*=\s* # value indicator 187 | (?:'[^']*' # LITA-enclosed value 188 | |\"[^\"]*\" # LIT-enclosed value 189 | |[^'\">\s]+ # bare value 190 | ) 191 | )? 192 | ) 193 | )* 194 | \s* # trailing whitespace 195 | """, re.VERBOSE) 196 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 197 | 198 | from html.parser import tagfind, attrfind 199 | 200 | def parse_starttag(self, i): 201 | self.__starttag_text = None 202 | endpos = self.check_for_whole_start_tag(i) 203 | if endpos < 0: 204 | return endpos 205 | rawdata = self.rawdata 206 | self.__starttag_text = rawdata[i:endpos] 207 | 208 | # Now parse the data between i+1 and j into a tag and attrs 209 | attrs = [] 210 | match = tagfind.match(rawdata, i+1) 211 | assert match, 'unexpected call to parse_starttag()' 212 | k = match.end() 213 | self.lasttag = tag = rawdata[i+1:k].lower() 214 | while k < endpos: 215 | if self.strict: 216 | m = attrfind.match(rawdata, k) 217 | else: 218 | m = attrfind_tolerant.match(rawdata, k) 219 | if not m: 220 | break 221 | attrname, rest, attrvalue = m.group(1, 2, 3) 222 | if not rest: 223 | attrvalue = None 224 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 225 | attrvalue[:1] == '"' == attrvalue[-1:]: 226 | attrvalue = attrvalue[1:-1] 227 | if attrvalue: 228 | attrvalue = self.unescape(attrvalue) 229 | attrs.append((attrname.lower(), attrvalue)) 230 | k = m.end() 231 | 232 | end = rawdata[k:endpos].strip() 233 | if end not in (">", "/>"): 234 | lineno, offset = self.getpos() 235 | if "\n" in self.__starttag_text: 236 | lineno = lineno + self.__starttag_text.count("\n") 237 | offset = len(self.__starttag_text) \ 238 | - self.__starttag_text.rfind("\n") 239 | else: 240 | offset = offset + len(self.__starttag_text) 241 | if self.strict: 242 | self.error("junk characters in start tag: %r" 243 | % (rawdata[k:endpos][:20],)) 244 | self.handle_data(rawdata[i:endpos]) 245 | return endpos 246 | if end.endswith('/>'): 247 | # XHTML-style empty tag: 248 | self.handle_startendtag(tag, attrs) 249 | else: 250 | self.handle_starttag(tag, attrs) 251 | if tag in self.CDATA_CONTENT_ELEMENTS: 252 | self.set_cdata_mode(tag) 253 | return endpos 254 | 255 | def set_cdata_mode(self, elem): 256 | self.cdata_elem = elem.lower() 257 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 258 | 259 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 260 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 261 | 262 | CONSTRUCTOR_TAKES_STRICT = True 263 | -------------------------------------------------------------------------------- /Week2/bs4/builder/_htmlparser.py.bak: -------------------------------------------------------------------------------- 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" 2 | 3 | __all__ = [ 4 | 'HTMLParserTreeBuilder', 5 | ] 6 | 7 | from HTMLParser import HTMLParser 8 | 9 | try: 10 | from HTMLParser import HTMLParseError 11 | except ImportError, e: 12 | # 
HTMLParseError is removed in Python 3.5. Since it can never be 13 | # thrown in 3.5, we can just define our own class as a placeholder. 14 | class HTMLParseError(Exception): 15 | pass 16 | 17 | import sys 18 | import warnings 19 | 20 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' 21 | # argument, which we'd like to set to False. Unfortunately, 22 | # http://bugs.python.org/issue13273 makes strict=True a better bet 23 | # before Python 3.2.3. 24 | # 25 | # At the end of this file, we monkeypatch HTMLParser so that 26 | # strict=True works well on Python 3.2.2. 27 | major, minor, release = sys.version_info[:3] 28 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 29 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 30 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 31 | 32 | 33 | from bs4.element import ( 34 | CData, 35 | Comment, 36 | Declaration, 37 | Doctype, 38 | ProcessingInstruction, 39 | ) 40 | from bs4.dammit import EntitySubstitution, UnicodeDammit 41 | 42 | from bs4.builder import ( 43 | HTML, 44 | HTMLTreeBuilder, 45 | STRICT, 46 | ) 47 | 48 | 49 | HTMLPARSER = 'html.parser' 50 | 51 | class BeautifulSoupHTMLParser(HTMLParser): 52 | def handle_starttag(self, name, attrs): 53 | # XXX namespace 54 | attr_dict = {} 55 | for key, value in attrs: 56 | # Change None attribute values to the empty string 57 | # for consistency with the other tree builders. 58 | if value is None: 59 | value = '' 60 | attr_dict[key] = value 61 | attrvalue = '""' 62 | self.soup.handle_starttag(name, None, None, attr_dict) 63 | 64 | def handle_endtag(self, name): 65 | self.soup.handle_endtag(name) 66 | 67 | def handle_data(self, data): 68 | self.soup.handle_data(data) 69 | 70 | def handle_charref(self, name): 71 | # XXX workaround for a bug in HTMLParser. Remove this once 72 | # it's fixed in all supported versions. 73 | # http://bugs.python.org/issue13633 74 | if name.startswith('x'): 75 | real_name = int(name.lstrip('x'), 16) 76 | elif name.startswith('X'): 77 | real_name = int(name.lstrip('X'), 16) 78 | else: 79 | real_name = int(name) 80 | 81 | try: 82 | data = unichr(real_name) 83 | except (ValueError, OverflowError), e: 84 | data = u"\N{REPLACEMENT CHARACTER}" 85 | 86 | self.handle_data(data) 87 | 88 | def handle_entityref(self, name): 89 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) 90 | if character is not None: 91 | data = character 92 | else: 93 | data = "&%s;" % name 94 | self.handle_data(data) 95 | 96 | def handle_comment(self, data): 97 | self.soup.endData() 98 | self.soup.handle_data(data) 99 | self.soup.endData(Comment) 100 | 101 | def handle_decl(self, data): 102 | self.soup.endData() 103 | if data.startswith("DOCTYPE "): 104 | data = data[len("DOCTYPE "):] 105 | elif data == 'DOCTYPE': 106 | # i.e. 
"" 107 | data = '' 108 | self.soup.handle_data(data) 109 | self.soup.endData(Doctype) 110 | 111 | def unknown_decl(self, data): 112 | if data.upper().startswith('CDATA['): 113 | cls = CData 114 | data = data[len('CDATA['):] 115 | else: 116 | cls = Declaration 117 | self.soup.endData() 118 | self.soup.handle_data(data) 119 | self.soup.endData(cls) 120 | 121 | def handle_pi(self, data): 122 | self.soup.endData() 123 | self.soup.handle_data(data) 124 | self.soup.endData(ProcessingInstruction) 125 | 126 | 127 | class HTMLParserTreeBuilder(HTMLTreeBuilder): 128 | 129 | is_xml = False 130 | picklable = True 131 | NAME = HTMLPARSER 132 | features = [NAME, HTML, STRICT] 133 | 134 | def __init__(self, *args, **kwargs): 135 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: 136 | kwargs['strict'] = False 137 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: 138 | kwargs['convert_charrefs'] = False 139 | self.parser_args = (args, kwargs) 140 | 141 | def prepare_markup(self, markup, user_specified_encoding=None, 142 | document_declared_encoding=None, exclude_encodings=None): 143 | """ 144 | :return: A 4-tuple (markup, original encoding, encoding 145 | declared within markup, whether any characters had to be 146 | replaced with REPLACEMENT CHARACTER). 147 | """ 148 | if isinstance(markup, unicode): 149 | yield (markup, None, None, False) 150 | return 151 | 152 | try_encodings = [user_specified_encoding, document_declared_encoding] 153 | dammit = UnicodeDammit(markup, try_encodings, is_html=True, 154 | exclude_encodings=exclude_encodings) 155 | yield (dammit.markup, dammit.original_encoding, 156 | dammit.declared_html_encoding, 157 | dammit.contains_replacement_characters) 158 | 159 | def feed(self, markup): 160 | args, kwargs = self.parser_args 161 | parser = BeautifulSoupHTMLParser(*args, **kwargs) 162 | parser.soup = self.soup 163 | try: 164 | parser.feed(markup) 165 | except HTMLParseError, e: 166 | warnings.warn(RuntimeWarning( 167 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) 168 | raise e 169 | 170 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some 171 | # 3.2.3 code. This ensures they don't treat markup like
<a href="http://foo.com/">
as a 172 | # string. 173 | # 174 | # XXX This code can be removed once most Python 3 users are on 3.2.3. 175 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: 176 | import re 177 | attrfind_tolerant = re.compile( 178 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' 179 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') 180 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant 181 | 182 | locatestarttagend = re.compile(r""" 183 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 184 | (?:\s+ # whitespace before attribute name 185 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name 186 | (?:\s*=\s* # value indicator 187 | (?:'[^']*' # LITA-enclosed value 188 | |\"[^\"]*\" # LIT-enclosed value 189 | |[^'\">\s]+ # bare value 190 | ) 191 | )? 192 | ) 193 | )* 194 | \s* # trailing whitespace 195 | """, re.VERBOSE) 196 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend 197 | 198 | from html.parser import tagfind, attrfind 199 | 200 | def parse_starttag(self, i): 201 | self.__starttag_text = None 202 | endpos = self.check_for_whole_start_tag(i) 203 | if endpos < 0: 204 | return endpos 205 | rawdata = self.rawdata 206 | self.__starttag_text = rawdata[i:endpos] 207 | 208 | # Now parse the data between i+1 and j into a tag and attrs 209 | attrs = [] 210 | match = tagfind.match(rawdata, i+1) 211 | assert match, 'unexpected call to parse_starttag()' 212 | k = match.end() 213 | self.lasttag = tag = rawdata[i+1:k].lower() 214 | while k < endpos: 215 | if self.strict: 216 | m = attrfind.match(rawdata, k) 217 | else: 218 | m = attrfind_tolerant.match(rawdata, k) 219 | if not m: 220 | break 221 | attrname, rest, attrvalue = m.group(1, 2, 3) 222 | if not rest: 223 | attrvalue = None 224 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ 225 | attrvalue[:1] == '"' == attrvalue[-1:]: 226 | attrvalue = attrvalue[1:-1] 227 | if attrvalue: 228 | attrvalue = self.unescape(attrvalue) 229 | attrs.append((attrname.lower(), attrvalue)) 230 | k = m.end() 231 | 232 | end = rawdata[k:endpos].strip() 233 | if end not in (">", "/>"): 234 | lineno, offset = self.getpos() 235 | if "\n" in self.__starttag_text: 236 | lineno = lineno + self.__starttag_text.count("\n") 237 | offset = len(self.__starttag_text) \ 238 | - self.__starttag_text.rfind("\n") 239 | else: 240 | offset = offset + len(self.__starttag_text) 241 | if self.strict: 242 | self.error("junk characters in start tag: %r" 243 | % (rawdata[k:endpos][:20],)) 244 | self.handle_data(rawdata[i:endpos]) 245 | return endpos 246 | if end.endswith('/>'): 247 | # XHTML-style empty tag: 248 | self.handle_startendtag(tag, attrs) 249 | else: 250 | self.handle_starttag(tag, attrs) 251 | if tag in self.CDATA_CONTENT_ELEMENTS: 252 | self.set_cdata_mode(tag) 253 | return endpos 254 | 255 | def set_cdata_mode(self, elem): 256 | self.cdata_elem = elem.lower() 257 | self.interesting = re.compile(r'' % self.cdata_elem, re.I) 258 | 259 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag 260 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode 261 | 262 | CONSTRUCTOR_TAKES_STRICT = True 263 | -------------------------------------------------------------------------------- /Week2/bs4/builder/_lxml.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'LXMLTreeBuilderForXML', 3 | 'LXMLTreeBuilder', 4 | ] 5 | 6 | from io import BytesIO 7 | from io import StringIO 8 | import collections 9 | from lxml import etree 10 | from bs4.element import ( 11 | Comment, 12 | Doctype, 13 | NamespacedAttribute, 14 | 
ProcessingInstruction, 15 | ) 16 | from bs4.builder import ( 17 | FAST, 18 | HTML, 19 | HTMLTreeBuilder, 20 | PERMISSIVE, 21 | ParserRejectedMarkup, 22 | TreeBuilder, 23 | XML) 24 | from bs4.dammit import EncodingDetector 25 | 26 | LXML = 'lxml' 27 | 28 | class LXMLTreeBuilderForXML(TreeBuilder): 29 | DEFAULT_PARSER_CLASS = etree.XMLParser 30 | 31 | is_xml = True 32 | 33 | NAME = "lxml-xml" 34 | ALTERNATE_NAMES = ["xml"] 35 | 36 | # Well, it's permissive by XML parser standards. 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] 38 | 39 | CHUNK_SIZE = 512 40 | 41 | # This namespace mapping is specified in the XML Namespace 42 | # standard. 43 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 44 | 45 | def default_parser(self, encoding): 46 | # This can either return a parser object or a class, which 47 | # will be instantiated with default arguments. 48 | if self._default_parser is not None: 49 | return self._default_parser 50 | return etree.XMLParser( 51 | target=self, strip_cdata=False, recover=True, encoding=encoding) 52 | 53 | def parser_for(self, encoding): 54 | # Use the default parser. 55 | parser = self.default_parser(encoding) 56 | 57 | if isinstance(parser, collections.Callable): 58 | # Instantiate the parser with default arguments 59 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 60 | return parser 61 | 62 | def __init__(self, parser=None, empty_element_tags=None): 63 | # TODO: Issue a warning if parser is present but not a 64 | # callable, since that means there's no way to create new 65 | # parsers for different encodings. 66 | self._default_parser = parser 67 | if empty_element_tags is not None: 68 | self.empty_element_tags = set(empty_element_tags) 69 | self.soup = None 70 | self.nsmaps = [self.DEFAULT_NSMAPS] 71 | 72 | def _getNsTag(self, tag): 73 | # Split the namespace URL out of a fully-qualified lxml tag 74 | # name. Copied from lxml's src/lxml/sax.py. 75 | if tag[0] == '{': 76 | return tuple(tag[1:].split('}', 1)) 77 | else: 78 | return (None, tag) 79 | 80 | def prepare_markup(self, markup, user_specified_encoding=None, 81 | exclude_encodings=None, 82 | document_declared_encoding=None): 83 | """ 84 | :yield: A series of 4-tuples. 85 | (markup, encoding, declared encoding, 86 | has undergone character replacement) 87 | 88 | Each 4-tuple represents a strategy for parsing the document. 89 | """ 90 | if isinstance(markup, str): 91 | # We were given Unicode. Maybe lxml can parse Unicode on 92 | # this system? 93 | yield markup, None, document_declared_encoding, False 94 | 95 | if isinstance(markup, str): 96 | # No, apparently not. Convert the Unicode to UTF-8 and 97 | # tell lxml to parse it as UTF-8. 98 | yield (markup.encode("utf8"), "utf8", 99 | document_declared_encoding, False) 100 | 101 | # Instead of using UnicodeDammit to convert the bytestring to 102 | # Unicode using different encodings, use EncodingDetector to 103 | # iterate over the encodings, and tell lxml to try to parse 104 | # the document as each one in turn. 
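# Sketch of the strategies this generator ends up yielding (illustrative,
# not part of the original module): for a byte string it typically produces
# one 4-tuple per candidate encoding, e.g.
#     (markup, 'iso-8859-1', document_declared_encoding, False)
#     (markup, 'utf-8', document_declared_encoding, False)
# and the caller keeps feeding these attempts to the parser until one
# succeeds without raising ParserRejectedMarkup.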
105 | is_html = not self.is_xml 106 | try_encodings = [user_specified_encoding, document_declared_encoding] 107 | detector = EncodingDetector( 108 | markup, try_encodings, is_html, exclude_encodings) 109 | for encoding in detector.encodings: 110 | yield (detector.markup, encoding, document_declared_encoding, False) 111 | 112 | def feed(self, markup): 113 | if isinstance(markup, bytes): 114 | markup = BytesIO(markup) 115 | elif isinstance(markup, str): 116 | markup = StringIO(markup) 117 | 118 | # Call feed() at least once, even if the markup is empty, 119 | # or the parser won't be initialized. 120 | data = markup.read(self.CHUNK_SIZE) 121 | try: 122 | self.parser = self.parser_for(self.soup.original_encoding) 123 | self.parser.feed(data) 124 | while len(data) != 0: 125 | # Now call feed() on the rest of the data, chunk by chunk. 126 | data = markup.read(self.CHUNK_SIZE) 127 | if len(data) != 0: 128 | self.parser.feed(data) 129 | self.parser.close() 130 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 131 | raise ParserRejectedMarkup(str(e)) 132 | 133 | def close(self): 134 | self.nsmaps = [self.DEFAULT_NSMAPS] 135 | 136 | def start(self, name, attrs, nsmap={}): 137 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 138 | attrs = dict(attrs) 139 | nsprefix = None 140 | # Invert each namespace map as it comes in. 141 | if len(self.nsmaps) > 1: 142 | # There are no new namespaces for this tag, but 143 | # non-default namespaces are in play, so we need a 144 | # separate tag stack to know when they end. 145 | self.nsmaps.append(None) 146 | elif len(nsmap) > 0: 147 | # A new namespace mapping has come into play. 148 | inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) 149 | self.nsmaps.append(inverted_nsmap) 150 | # Also treat the namespace mapping as a set of attributes on the 151 | # tag, so we can recreate it later. 152 | attrs = attrs.copy() 153 | for prefix, namespace in list(nsmap.items()): 154 | attribute = NamespacedAttribute( 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 156 | attrs[attribute] = namespace 157 | 158 | # Namespaces are in play. Find any attributes that came in 159 | # from lxml with namespaces attached to their names, and 160 | # turn then into NamespacedAttribute objects. 
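# Illustration of the conversion performed below (not part of the original
# module): an lxml attribute key such as
#     '{http://www.w3.org/1999/xlink}href'
# is split by _getNsTag() into ('http://www.w3.org/1999/xlink', 'href') and
# stored as NamespacedAttribute(prefix, 'href', namespace), where prefix is
# whatever prefix the document currently maps to that namespace
# (e.g. 'xlink'), so it can be reproduced on output.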
161 | new_attrs = {} 162 | for attr, value in list(attrs.items()): 163 | namespace, attr = self._getNsTag(attr) 164 | if namespace is None: 165 | new_attrs[attr] = value 166 | else: 167 | nsprefix = self._prefix_for_namespace(namespace) 168 | attr = NamespacedAttribute(nsprefix, attr, namespace) 169 | new_attrs[attr] = value 170 | attrs = new_attrs 171 | 172 | namespace, name = self._getNsTag(name) 173 | nsprefix = self._prefix_for_namespace(namespace) 174 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 175 | 176 | def _prefix_for_namespace(self, namespace): 177 | """Find the currently active prefix for the given namespace.""" 178 | if namespace is None: 179 | return None 180 | for inverted_nsmap in reversed(self.nsmaps): 181 | if inverted_nsmap is not None and namespace in inverted_nsmap: 182 | return inverted_nsmap[namespace] 183 | return None 184 | 185 | def end(self, name): 186 | self.soup.endData() 187 | completed_tag = self.soup.tagStack[-1] 188 | namespace, name = self._getNsTag(name) 189 | nsprefix = None 190 | if namespace is not None: 191 | for inverted_nsmap in reversed(self.nsmaps): 192 | if inverted_nsmap is not None and namespace in inverted_nsmap: 193 | nsprefix = inverted_nsmap[namespace] 194 | break 195 | self.soup.handle_endtag(name, nsprefix) 196 | if len(self.nsmaps) > 1: 197 | # This tag, or one of its parents, introduced a namespace 198 | # mapping, so pop it off the stack. 199 | self.nsmaps.pop() 200 | 201 | def pi(self, target, data): 202 | self.soup.endData() 203 | self.soup.handle_data(target + ' ' + data) 204 | self.soup.endData(ProcessingInstruction) 205 | 206 | def data(self, content): 207 | self.soup.handle_data(content) 208 | 209 | def doctype(self, name, pubid, system): 210 | self.soup.endData() 211 | doctype = Doctype.for_name_and_ids(name, pubid, system) 212 | self.soup.object_was_parsed(doctype) 213 | 214 | def comment(self, content): 215 | "Handle comments as Comment objects." 
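# Big-picture note (illustrative, not part of the original module): this
# class is handed to lxml as a parser "target" (etree.XMLParser(target=self, ...)),
# so lxml drives the build by invoking these callbacks as it reads the
# document. Parsing '<a>hi<!--c--></a>' triggers, roughly:
#     start('a', {}); data('hi'); comment('c'); end('a'); close()
# and each callback forwards the event to the BeautifulSoup object
# (handle_starttag, handle_data, endData, ...).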
216 | self.soup.endData() 217 | self.soup.handle_data(content) 218 | self.soup.endData(Comment) 219 | 220 | def test_fragment_to_document(self, fragment): 221 | """See `TreeBuilder`.""" 222 | return '\n%s' % fragment 223 | 224 | 225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 226 | 227 | NAME = LXML 228 | ALTERNATE_NAMES = ["lxml-html"] 229 | 230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 231 | is_xml = False 232 | 233 | def default_parser(self, encoding): 234 | return etree.HTMLParser 235 | 236 | def feed(self, markup): 237 | encoding = self.soup.original_encoding 238 | try: 239 | self.parser = self.parser_for(encoding) 240 | self.parser.feed(markup) 241 | self.parser.close() 242 | except (UnicodeDecodeError, LookupError, etree.ParserError) as e: 243 | raise ParserRejectedMarkup(str(e)) 244 | 245 | 246 | def test_fragment_to_document(self, fragment): 247 | """See `TreeBuilder`.""" 248 | return '%s' % fragment 249 | -------------------------------------------------------------------------------- /Week2/bs4/builder/_lxml.py.bak: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'LXMLTreeBuilderForXML', 3 | 'LXMLTreeBuilder', 4 | ] 5 | 6 | from io import BytesIO 7 | from StringIO import StringIO 8 | import collections 9 | from lxml import etree 10 | from bs4.element import ( 11 | Comment, 12 | Doctype, 13 | NamespacedAttribute, 14 | ProcessingInstruction, 15 | ) 16 | from bs4.builder import ( 17 | FAST, 18 | HTML, 19 | HTMLTreeBuilder, 20 | PERMISSIVE, 21 | ParserRejectedMarkup, 22 | TreeBuilder, 23 | XML) 24 | from bs4.dammit import EncodingDetector 25 | 26 | LXML = 'lxml' 27 | 28 | class LXMLTreeBuilderForXML(TreeBuilder): 29 | DEFAULT_PARSER_CLASS = etree.XMLParser 30 | 31 | is_xml = True 32 | 33 | NAME = "lxml-xml" 34 | ALTERNATE_NAMES = ["xml"] 35 | 36 | # Well, it's permissive by XML parser standards. 37 | features = [NAME, LXML, XML, FAST, PERMISSIVE] 38 | 39 | CHUNK_SIZE = 512 40 | 41 | # This namespace mapping is specified in the XML Namespace 42 | # standard. 43 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} 44 | 45 | def default_parser(self, encoding): 46 | # This can either return a parser object or a class, which 47 | # will be instantiated with default arguments. 48 | if self._default_parser is not None: 49 | return self._default_parser 50 | return etree.XMLParser( 51 | target=self, strip_cdata=False, recover=True, encoding=encoding) 52 | 53 | def parser_for(self, encoding): 54 | # Use the default parser. 55 | parser = self.default_parser(encoding) 56 | 57 | if isinstance(parser, collections.Callable): 58 | # Instantiate the parser with default arguments 59 | parser = parser(target=self, strip_cdata=False, encoding=encoding) 60 | return parser 61 | 62 | def __init__(self, parser=None, empty_element_tags=None): 63 | # TODO: Issue a warning if parser is present but not a 64 | # callable, since that means there's no way to create new 65 | # parsers for different encodings. 66 | self._default_parser = parser 67 | if empty_element_tags is not None: 68 | self.empty_element_tags = set(empty_element_tags) 69 | self.soup = None 70 | self.nsmaps = [self.DEFAULT_NSMAPS] 71 | 72 | def _getNsTag(self, tag): 73 | # Split the namespace URL out of a fully-qualified lxml tag 74 | # name. Copied from lxml's src/lxml/sax.py. 
75 | if tag[0] == '{': 76 | return tuple(tag[1:].split('}', 1)) 77 | else: 78 | return (None, tag) 79 | 80 | def prepare_markup(self, markup, user_specified_encoding=None, 81 | exclude_encodings=None, 82 | document_declared_encoding=None): 83 | """ 84 | :yield: A series of 4-tuples. 85 | (markup, encoding, declared encoding, 86 | has undergone character replacement) 87 | 88 | Each 4-tuple represents a strategy for parsing the document. 89 | """ 90 | if isinstance(markup, unicode): 91 | # We were given Unicode. Maybe lxml can parse Unicode on 92 | # this system? 93 | yield markup, None, document_declared_encoding, False 94 | 95 | if isinstance(markup, unicode): 96 | # No, apparently not. Convert the Unicode to UTF-8 and 97 | # tell lxml to parse it as UTF-8. 98 | yield (markup.encode("utf8"), "utf8", 99 | document_declared_encoding, False) 100 | 101 | # Instead of using UnicodeDammit to convert the bytestring to 102 | # Unicode using different encodings, use EncodingDetector to 103 | # iterate over the encodings, and tell lxml to try to parse 104 | # the document as each one in turn. 105 | is_html = not self.is_xml 106 | try_encodings = [user_specified_encoding, document_declared_encoding] 107 | detector = EncodingDetector( 108 | markup, try_encodings, is_html, exclude_encodings) 109 | for encoding in detector.encodings: 110 | yield (detector.markup, encoding, document_declared_encoding, False) 111 | 112 | def feed(self, markup): 113 | if isinstance(markup, bytes): 114 | markup = BytesIO(markup) 115 | elif isinstance(markup, unicode): 116 | markup = StringIO(markup) 117 | 118 | # Call feed() at least once, even if the markup is empty, 119 | # or the parser won't be initialized. 120 | data = markup.read(self.CHUNK_SIZE) 121 | try: 122 | self.parser = self.parser_for(self.soup.original_encoding) 123 | self.parser.feed(data) 124 | while len(data) != 0: 125 | # Now call feed() on the rest of the data, chunk by chunk. 126 | data = markup.read(self.CHUNK_SIZE) 127 | if len(data) != 0: 128 | self.parser.feed(data) 129 | self.parser.close() 130 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: 131 | raise ParserRejectedMarkup(str(e)) 132 | 133 | def close(self): 134 | self.nsmaps = [self.DEFAULT_NSMAPS] 135 | 136 | def start(self, name, attrs, nsmap={}): 137 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 138 | attrs = dict(attrs) 139 | nsprefix = None 140 | # Invert each namespace map as it comes in. 141 | if len(self.nsmaps) > 1: 142 | # There are no new namespaces for this tag, but 143 | # non-default namespaces are in play, so we need a 144 | # separate tag stack to know when they end. 145 | self.nsmaps.append(None) 146 | elif len(nsmap) > 0: 147 | # A new namespace mapping has come into play. 148 | inverted_nsmap = dict((value, key) for key, value in nsmap.items()) 149 | self.nsmaps.append(inverted_nsmap) 150 | # Also treat the namespace mapping as a set of attributes on the 151 | # tag, so we can recreate it later. 152 | attrs = attrs.copy() 153 | for prefix, namespace in nsmap.items(): 154 | attribute = NamespacedAttribute( 155 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/") 156 | attrs[attribute] = namespace 157 | 158 | # Namespaces are in play. Find any attributes that came in 159 | # from lxml with namespaces attached to their names, and 160 | # turn then into NamespacedAttribute objects. 
161 | new_attrs = {} 162 | for attr, value in attrs.items(): 163 | namespace, attr = self._getNsTag(attr) 164 | if namespace is None: 165 | new_attrs[attr] = value 166 | else: 167 | nsprefix = self._prefix_for_namespace(namespace) 168 | attr = NamespacedAttribute(nsprefix, attr, namespace) 169 | new_attrs[attr] = value 170 | attrs = new_attrs 171 | 172 | namespace, name = self._getNsTag(name) 173 | nsprefix = self._prefix_for_namespace(namespace) 174 | self.soup.handle_starttag(name, namespace, nsprefix, attrs) 175 | 176 | def _prefix_for_namespace(self, namespace): 177 | """Find the currently active prefix for the given namespace.""" 178 | if namespace is None: 179 | return None 180 | for inverted_nsmap in reversed(self.nsmaps): 181 | if inverted_nsmap is not None and namespace in inverted_nsmap: 182 | return inverted_nsmap[namespace] 183 | return None 184 | 185 | def end(self, name): 186 | self.soup.endData() 187 | completed_tag = self.soup.tagStack[-1] 188 | namespace, name = self._getNsTag(name) 189 | nsprefix = None 190 | if namespace is not None: 191 | for inverted_nsmap in reversed(self.nsmaps): 192 | if inverted_nsmap is not None and namespace in inverted_nsmap: 193 | nsprefix = inverted_nsmap[namespace] 194 | break 195 | self.soup.handle_endtag(name, nsprefix) 196 | if len(self.nsmaps) > 1: 197 | # This tag, or one of its parents, introduced a namespace 198 | # mapping, so pop it off the stack. 199 | self.nsmaps.pop() 200 | 201 | def pi(self, target, data): 202 | self.soup.endData() 203 | self.soup.handle_data(target + ' ' + data) 204 | self.soup.endData(ProcessingInstruction) 205 | 206 | def data(self, content): 207 | self.soup.handle_data(content) 208 | 209 | def doctype(self, name, pubid, system): 210 | self.soup.endData() 211 | doctype = Doctype.for_name_and_ids(name, pubid, system) 212 | self.soup.object_was_parsed(doctype) 213 | 214 | def comment(self, content): 215 | "Handle comments as Comment objects." 
216 | self.soup.endData() 217 | self.soup.handle_data(content) 218 | self.soup.endData(Comment) 219 | 220 | def test_fragment_to_document(self, fragment): 221 | """See `TreeBuilder`.""" 222 | return u'\n%s' % fragment 223 | 224 | 225 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): 226 | 227 | NAME = LXML 228 | ALTERNATE_NAMES = ["lxml-html"] 229 | 230 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] 231 | is_xml = False 232 | 233 | def default_parser(self, encoding): 234 | return etree.HTMLParser 235 | 236 | def feed(self, markup): 237 | encoding = self.soup.original_encoding 238 | try: 239 | self.parser = self.parser_for(encoding) 240 | self.parser.feed(markup) 241 | self.parser.close() 242 | except (UnicodeDecodeError, LookupError, etree.ParserError), e: 243 | raise ParserRejectedMarkup(str(e)) 244 | 245 | 246 | def test_fragment_to_document(self, fragment): 247 | """See `TreeBuilder`.""" 248 | return u'%s' % fragment 249 | -------------------------------------------------------------------------------- /Week2/bs4/diagnose.py: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | import cProfile 3 | from io import StringIO 4 | from html.parser import HTMLParser 5 | import bs4 6 | from bs4 import BeautifulSoup, __version__ 7 | from bs4.builder import builder_registry 8 | 9 | import os 10 | import pstats 11 | import random 12 | import tempfile 13 | import time 14 | import traceback 15 | import sys 16 | import cProfile 17 | 18 | def diagnose(data): 19 | """Diagnostic suite for isolating common problems.""" 20 | print("Diagnostic running on Beautiful Soup %s" % __version__) 21 | print("Python version %s" % sys.version) 22 | 23 | basic_parsers = ["html.parser", "html5lib", "lxml"] 24 | for name in basic_parsers: 25 | for builder in builder_registry.builders: 26 | if name in builder.features: 27 | break 28 | else: 29 | basic_parsers.remove(name) 30 | print(( 31 | "I noticed that %s is not installed. Installing it may help." % 32 | name)) 33 | 34 | if 'lxml' in basic_parsers: 35 | basic_parsers.append(["lxml", "xml"]) 36 | try: 37 | from lxml import etree 38 | print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) 39 | except ImportError as e: 40 | print ( 41 | "lxml is not installed or couldn't be imported.") 42 | 43 | 44 | if 'html5lib' in basic_parsers: 45 | try: 46 | import html5lib 47 | print("Found html5lib version %s" % html5lib.__version__) 48 | except ImportError as e: 49 | print ( 50 | "html5lib is not installed or couldn't be imported.") 51 | 52 | if hasattr(data, 'read'): 53 | data = data.read() 54 | elif os.path.exists(data): 55 | print('"%s" looks like a filename. Reading data from the file.' % data) 56 | data = open(data).read() 57 | elif data.startswith("http:") or data.startswith("https:"): 58 | print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) 59 | print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") 60 | return 61 | print() 62 | 63 | for parser in basic_parsers: 64 | print("Trying to parse your markup with %s" % parser) 65 | success = False 66 | try: 67 | soup = BeautifulSoup(data, parser) 68 | success = True 69 | except Exception as e: 70 | print("%s could not parse the markup." 
% parser) 71 | traceback.print_exc() 72 | if success: 73 | print("Here's what %s did with the markup:" % parser) 74 | print(soup.prettify()) 75 | 76 | print("-" * 80) 77 | 78 | def lxml_trace(data, html=True, **kwargs): 79 | """Print out the lxml events that occur during parsing. 80 | 81 | This lets you see how lxml parses a document when no Beautiful 82 | Soup code is running. 83 | """ 84 | from lxml import etree 85 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 86 | print(("%s, %4s, %s" % (event, element.tag, element.text))) 87 | 88 | class AnnouncingParser(HTMLParser): 89 | """Announces HTMLParser parse events, without doing anything else.""" 90 | 91 | def _p(self, s): 92 | print(s) 93 | 94 | def handle_starttag(self, name, attrs): 95 | self._p("%s START" % name) 96 | 97 | def handle_endtag(self, name): 98 | self._p("%s END" % name) 99 | 100 | def handle_data(self, data): 101 | self._p("%s DATA" % data) 102 | 103 | def handle_charref(self, name): 104 | self._p("%s CHARREF" % name) 105 | 106 | def handle_entityref(self, name): 107 | self._p("%s ENTITYREF" % name) 108 | 109 | def handle_comment(self, data): 110 | self._p("%s COMMENT" % data) 111 | 112 | def handle_decl(self, data): 113 | self._p("%s DECL" % data) 114 | 115 | def unknown_decl(self, data): 116 | self._p("%s UNKNOWN-DECL" % data) 117 | 118 | def handle_pi(self, data): 119 | self._p("%s PI" % data) 120 | 121 | def htmlparser_trace(data): 122 | """Print out the HTMLParser events that occur during parsing. 123 | 124 | This lets you see how HTMLParser parses a document when no 125 | Beautiful Soup code is running. 126 | """ 127 | parser = AnnouncingParser() 128 | parser.feed(data) 129 | 130 | _vowels = "aeiou" 131 | _consonants = "bcdfghjklmnpqrstvwxyz" 132 | 133 | def rword(length=5): 134 | "Generate a random word-like string." 135 | s = '' 136 | for i in range(length): 137 | if i % 2 == 0: 138 | t = _consonants 139 | else: 140 | t = _vowels 141 | s += random.choice(t) 142 | return s 143 | 144 | def rsentence(length=4): 145 | "Generate a random sentence-like string." 146 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 147 | 148 | def rdoc(num_elements=1000): 149 | """Randomly generate an invalid HTML document.""" 150 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 151 | elements = [] 152 | for i in range(num_elements): 153 | choice = random.randint(0,3) 154 | if choice == 0: 155 | # New tag. 156 | tag_name = random.choice(tag_names) 157 | elements.append("<%s>" % tag_name) 158 | elif choice == 1: 159 | elements.append(rsentence(random.randint(1,4))) 160 | elif choice == 2: 161 | # Close a tag. 162 | tag_name = random.choice(tag_names) 163 | elements.append("" % tag_name) 164 | return "" + "\n".join(elements) + "" 165 | 166 | def benchmark_parsers(num_elements=100000): 167 | """Very basic head-to-head performance benchmark.""" 168 | print("Comparative parser benchmark on Beautiful Soup %s" % __version__) 169 | data = rdoc(num_elements) 170 | print("Generated a large invalid HTML document (%d bytes)." % len(data)) 171 | 172 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 173 | success = False 174 | try: 175 | a = time.time() 176 | soup = BeautifulSoup(data, parser) 177 | b = time.time() 178 | success = True 179 | except Exception as e: 180 | print("%s could not parse the markup." % parser) 181 | traceback.print_exc() 182 | if success: 183 | print("BS4+%s parsed the markup in %.2fs." 
% (parser, b-a)) 184 | 185 | from lxml import etree 186 | a = time.time() 187 | etree.HTML(data) 188 | b = time.time() 189 | print("Raw lxml parsed the markup in %.2fs." % (b-a)) 190 | 191 | import html5lib 192 | parser = html5lib.HTMLParser() 193 | a = time.time() 194 | parser.parse(data) 195 | b = time.time() 196 | print("Raw html5lib parsed the markup in %.2fs." % (b-a)) 197 | 198 | def profile(num_elements=100000, parser="lxml"): 199 | 200 | filehandle = tempfile.NamedTemporaryFile() 201 | filename = filehandle.name 202 | 203 | data = rdoc(num_elements) 204 | vars = dict(bs4=bs4, data=data, parser=parser) 205 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 206 | 207 | stats = pstats.Stats(filename) 208 | # stats.strip_dirs() 209 | stats.sort_stats("cumulative") 210 | stats.print_stats('_html5lib|bs4', 50) 211 | 212 | if __name__ == '__main__': 213 | diagnose(sys.stdin.read()) 214 | -------------------------------------------------------------------------------- /Week2/bs4/diagnose.py.bak: -------------------------------------------------------------------------------- 1 | """Diagnostic functions, mainly for use when doing tech support.""" 2 | import cProfile 3 | from StringIO import StringIO 4 | from HTMLParser import HTMLParser 5 | import bs4 6 | from bs4 import BeautifulSoup, __version__ 7 | from bs4.builder import builder_registry 8 | 9 | import os 10 | import pstats 11 | import random 12 | import tempfile 13 | import time 14 | import traceback 15 | import sys 16 | import cProfile 17 | 18 | def diagnose(data): 19 | """Diagnostic suite for isolating common problems.""" 20 | print "Diagnostic running on Beautiful Soup %s" % __version__ 21 | print "Python version %s" % sys.version 22 | 23 | basic_parsers = ["html.parser", "html5lib", "lxml"] 24 | for name in basic_parsers: 25 | for builder in builder_registry.builders: 26 | if name in builder.features: 27 | break 28 | else: 29 | basic_parsers.remove(name) 30 | print ( 31 | "I noticed that %s is not installed. Installing it may help." % 32 | name) 33 | 34 | if 'lxml' in basic_parsers: 35 | basic_parsers.append(["lxml", "xml"]) 36 | try: 37 | from lxml import etree 38 | print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) 39 | except ImportError, e: 40 | print ( 41 | "lxml is not installed or couldn't be imported.") 42 | 43 | 44 | if 'html5lib' in basic_parsers: 45 | try: 46 | import html5lib 47 | print "Found html5lib version %s" % html5lib.__version__ 48 | except ImportError, e: 49 | print ( 50 | "html5lib is not installed or couldn't be imported.") 51 | 52 | if hasattr(data, 'read'): 53 | data = data.read() 54 | elif os.path.exists(data): 55 | print '"%s" looks like a filename. Reading data from the file.' % data 56 | data = open(data).read() 57 | elif data.startswith("http:") or data.startswith("https:"): 58 | print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data 59 | print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." 60 | return 61 | print 62 | 63 | for parser in basic_parsers: 64 | print "Trying to parse your markup with %s" % parser 65 | success = False 66 | try: 67 | soup = BeautifulSoup(data, parser) 68 | success = True 69 | except Exception, e: 70 | print "%s could not parse the markup." 
% parser 71 | traceback.print_exc() 72 | if success: 73 | print "Here's what %s did with the markup:" % parser 74 | print soup.prettify() 75 | 76 | print "-" * 80 77 | 78 | def lxml_trace(data, html=True, **kwargs): 79 | """Print out the lxml events that occur during parsing. 80 | 81 | This lets you see how lxml parses a document when no Beautiful 82 | Soup code is running. 83 | """ 84 | from lxml import etree 85 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): 86 | print("%s, %4s, %s" % (event, element.tag, element.text)) 87 | 88 | class AnnouncingParser(HTMLParser): 89 | """Announces HTMLParser parse events, without doing anything else.""" 90 | 91 | def _p(self, s): 92 | print(s) 93 | 94 | def handle_starttag(self, name, attrs): 95 | self._p("%s START" % name) 96 | 97 | def handle_endtag(self, name): 98 | self._p("%s END" % name) 99 | 100 | def handle_data(self, data): 101 | self._p("%s DATA" % data) 102 | 103 | def handle_charref(self, name): 104 | self._p("%s CHARREF" % name) 105 | 106 | def handle_entityref(self, name): 107 | self._p("%s ENTITYREF" % name) 108 | 109 | def handle_comment(self, data): 110 | self._p("%s COMMENT" % data) 111 | 112 | def handle_decl(self, data): 113 | self._p("%s DECL" % data) 114 | 115 | def unknown_decl(self, data): 116 | self._p("%s UNKNOWN-DECL" % data) 117 | 118 | def handle_pi(self, data): 119 | self._p("%s PI" % data) 120 | 121 | def htmlparser_trace(data): 122 | """Print out the HTMLParser events that occur during parsing. 123 | 124 | This lets you see how HTMLParser parses a document when no 125 | Beautiful Soup code is running. 126 | """ 127 | parser = AnnouncingParser() 128 | parser.feed(data) 129 | 130 | _vowels = "aeiou" 131 | _consonants = "bcdfghjklmnpqrstvwxyz" 132 | 133 | def rword(length=5): 134 | "Generate a random word-like string." 135 | s = '' 136 | for i in range(length): 137 | if i % 2 == 0: 138 | t = _consonants 139 | else: 140 | t = _vowels 141 | s += random.choice(t) 142 | return s 143 | 144 | def rsentence(length=4): 145 | "Generate a random sentence-like string." 146 | return " ".join(rword(random.randint(4,9)) for i in range(length)) 147 | 148 | def rdoc(num_elements=1000): 149 | """Randomly generate an invalid HTML document.""" 150 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] 151 | elements = [] 152 | for i in range(num_elements): 153 | choice = random.randint(0,3) 154 | if choice == 0: 155 | # New tag. 156 | tag_name = random.choice(tag_names) 157 | elements.append("<%s>" % tag_name) 158 | elif choice == 1: 159 | elements.append(rsentence(random.randint(1,4))) 160 | elif choice == 2: 161 | # Close a tag. 162 | tag_name = random.choice(tag_names) 163 | elements.append("" % tag_name) 164 | return "" + "\n".join(elements) + "" 165 | 166 | def benchmark_parsers(num_elements=100000): 167 | """Very basic head-to-head performance benchmark.""" 168 | print "Comparative parser benchmark on Beautiful Soup %s" % __version__ 169 | data = rdoc(num_elements) 170 | print "Generated a large invalid HTML document (%d bytes)." % len(data) 171 | 172 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: 173 | success = False 174 | try: 175 | a = time.time() 176 | soup = BeautifulSoup(data, parser) 177 | b = time.time() 178 | success = True 179 | except Exception, e: 180 | print "%s could not parse the markup." % parser 181 | traceback.print_exc() 182 | if success: 183 | print "BS4+%s parsed the markup in %.2fs." 
% (parser, b-a) 184 | 185 | from lxml import etree 186 | a = time.time() 187 | etree.HTML(data) 188 | b = time.time() 189 | print "Raw lxml parsed the markup in %.2fs." % (b-a) 190 | 191 | import html5lib 192 | parser = html5lib.HTMLParser() 193 | a = time.time() 194 | parser.parse(data) 195 | b = time.time() 196 | print "Raw html5lib parsed the markup in %.2fs." % (b-a) 197 | 198 | def profile(num_elements=100000, parser="lxml"): 199 | 200 | filehandle = tempfile.NamedTemporaryFile() 201 | filename = filehandle.name 202 | 203 | data = rdoc(num_elements) 204 | vars = dict(bs4=bs4, data=data, parser=parser) 205 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) 206 | 207 | stats = pstats.Stats(filename) 208 | # stats.strip_dirs() 209 | stats.sort_stats("cumulative") 210 | stats.print_stats('_html5lib|bs4', 50) 211 | 212 | if __name__ == '__main__': 213 | diagnose(sys.stdin.read()) 214 | -------------------------------------------------------------------------------- /Week2/bs4/tests/__init__.py: -------------------------------------------------------------------------------- 1 | "The beautifulsoup tests." 2 | -------------------------------------------------------------------------------- /Week2/bs4/tests/test_builder_registry.py: -------------------------------------------------------------------------------- 1 | """Tests of the builder registry.""" 2 | 3 | import unittest 4 | import warnings 5 | 6 | from bs4 import BeautifulSoup 7 | from bs4.builder import ( 8 | builder_registry as registry, 9 | HTMLParserTreeBuilder, 10 | TreeBuilderRegistry, 11 | ) 12 | 13 | try: 14 | from bs4.builder import HTML5TreeBuilder 15 | HTML5LIB_PRESENT = True 16 | except ImportError: 17 | HTML5LIB_PRESENT = False 18 | 19 | try: 20 | from bs4.builder import ( 21 | LXMLTreeBuilderForXML, 22 | LXMLTreeBuilder, 23 | ) 24 | LXML_PRESENT = True 25 | except ImportError: 26 | LXML_PRESENT = False 27 | 28 | 29 | class BuiltInRegistryTest(unittest.TestCase): 30 | """Test the built-in registry with the default builders registered.""" 31 | 32 | def test_combination(self): 33 | if LXML_PRESENT: 34 | self.assertEqual(registry.lookup('fast', 'html'), 35 | LXMLTreeBuilder) 36 | 37 | if LXML_PRESENT: 38 | self.assertEqual(registry.lookup('permissive', 'xml'), 39 | LXMLTreeBuilderForXML) 40 | self.assertEqual(registry.lookup('strict', 'html'), 41 | HTMLParserTreeBuilder) 42 | if HTML5LIB_PRESENT: 43 | self.assertEqual(registry.lookup('html5lib', 'html'), 44 | HTML5TreeBuilder) 45 | 46 | def test_lookup_by_markup_type(self): 47 | if LXML_PRESENT: 48 | self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) 49 | self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) 50 | else: 51 | self.assertEqual(registry.lookup('xml'), None) 52 | if HTML5LIB_PRESENT: 53 | self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) 54 | else: 55 | self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) 56 | 57 | def test_named_library(self): 58 | if LXML_PRESENT: 59 | self.assertEqual(registry.lookup('lxml', 'xml'), 60 | LXMLTreeBuilderForXML) 61 | self.assertEqual(registry.lookup('lxml', 'html'), 62 | LXMLTreeBuilder) 63 | if HTML5LIB_PRESENT: 64 | self.assertEqual(registry.lookup('html5lib'), 65 | HTML5TreeBuilder) 66 | 67 | self.assertEqual(registry.lookup('html.parser'), 68 | HTMLParserTreeBuilder) 69 | 70 | def test_beautifulsoup_constructor_does_lookup(self): 71 | 72 | with warnings.catch_warnings(record=True) as w: 73 | # This will create a warning about not explicitly 74 | # specifying a 
parser, but we'll ignore it. 75 | 76 | # You can pass in a string. 77 | BeautifulSoup("", features="html") 78 | # Or a list of strings. 79 | BeautifulSoup("", features=["html", "fast"]) 80 | 81 | # You'll get an exception if BS can't find an appropriate 82 | # builder. 83 | self.assertRaises(ValueError, BeautifulSoup, 84 | "", features="no-such-feature") 85 | 86 | class RegistryTest(unittest.TestCase): 87 | """Test the TreeBuilderRegistry class in general.""" 88 | 89 | def setUp(self): 90 | self.registry = TreeBuilderRegistry() 91 | 92 | def builder_for_features(self, *feature_list): 93 | cls = type('Builder_' + '_'.join(feature_list), 94 | (object,), {'features' : feature_list}) 95 | 96 | self.registry.register(cls) 97 | return cls 98 | 99 | def test_register_with_no_features(self): 100 | builder = self.builder_for_features() 101 | 102 | # Since the builder advertises no features, you can't find it 103 | # by looking up features. 104 | self.assertEqual(self.registry.lookup('foo'), None) 105 | 106 | # But you can find it by doing a lookup with no features, if 107 | # this happens to be the only registered builder. 108 | self.assertEqual(self.registry.lookup(), builder) 109 | 110 | def test_register_with_features_makes_lookup_succeed(self): 111 | builder = self.builder_for_features('foo', 'bar') 112 | self.assertEqual(self.registry.lookup('foo'), builder) 113 | self.assertEqual(self.registry.lookup('bar'), builder) 114 | 115 | def test_lookup_fails_when_no_builder_implements_feature(self): 116 | builder = self.builder_for_features('foo', 'bar') 117 | self.assertEqual(self.registry.lookup('baz'), None) 118 | 119 | def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): 120 | builder1 = self.builder_for_features('foo') 121 | builder2 = self.builder_for_features('bar') 122 | self.assertEqual(self.registry.lookup(), builder2) 123 | 124 | def test_lookup_fails_when_no_tree_builders_registered(self): 125 | self.assertEqual(self.registry.lookup(), None) 126 | 127 | def test_lookup_gets_most_recent_builder_supporting_all_features(self): 128 | has_one = self.builder_for_features('foo') 129 | has_the_other = self.builder_for_features('bar') 130 | has_both_early = self.builder_for_features('foo', 'bar', 'baz') 131 | has_both_late = self.builder_for_features('foo', 'bar', 'quux') 132 | lacks_one = self.builder_for_features('bar') 133 | has_the_other = self.builder_for_features('foo') 134 | 135 | # There are two builders featuring 'foo' and 'bar', but 136 | # the one that also features 'quux' was registered later. 137 | self.assertEqual(self.registry.lookup('foo', 'bar'), 138 | has_both_late) 139 | 140 | # There is only one builder featuring 'foo', 'bar', and 'baz'. 141 | self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), 142 | has_both_early) 143 | 144 | def test_lookup_fails_when_cannot_reconcile_requested_features(self): 145 | builder1 = self.builder_for_features('foo', 'bar') 146 | builder2 = self.builder_for_features('foo', 'baz') 147 | self.assertEqual(self.registry.lookup('bar', 'baz'), None) 148 | -------------------------------------------------------------------------------- /Week2/bs4/tests/test_docs.py: -------------------------------------------------------------------------------- 1 | "Test harness for doctests." 
2 | 3 | # pylint: disable-msg=E0611,W0142 4 | 5 | __metaclass__ = type 6 | __all__ = [ 7 | 'additional_tests', 8 | ] 9 | 10 | import atexit 11 | import doctest 12 | import os 13 | #from pkg_resources import ( 14 | # resource_filename, resource_exists, resource_listdir, cleanup_resources) 15 | import unittest 16 | 17 | DOCTEST_FLAGS = ( 18 | doctest.ELLIPSIS | 19 | doctest.NORMALIZE_WHITESPACE | 20 | doctest.REPORT_NDIFF) 21 | 22 | 23 | # def additional_tests(): 24 | # "Run the doc tests (README.txt and docs/*, if any exist)" 25 | # doctest_files = [ 26 | # os.path.abspath(resource_filename('bs4', 'README.txt'))] 27 | # if resource_exists('bs4', 'docs'): 28 | # for name in resource_listdir('bs4', 'docs'): 29 | # if name.endswith('.txt'): 30 | # doctest_files.append( 31 | # os.path.abspath( 32 | # resource_filename('bs4', 'docs/%s' % name))) 33 | # kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) 34 | # atexit.register(cleanup_resources) 35 | # return unittest.TestSuite(( 36 | # doctest.DocFileSuite(*doctest_files, **kwargs))) 37 | -------------------------------------------------------------------------------- /Week2/bs4/tests/test_html5lib.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError as e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "

<p>A <b>bold</b> statement.</p>

" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('' 43 | '' 44 | "') 48 | 49 | self.assertSoupEquals( 50 | markup, 51 | '
Here's another table:" 45 | '' 46 | '' 47 | '
foo
Here\'s another table:' 52 | '
foo
' 53 | '
') 54 | 55 | self.assertSoupEquals( 56 | "" 57 | "" 58 | "
Foo
Bar
Baz
") 59 | 60 | def test_xml_declaration_followed_by_doctype(self): 61 | markup = ''' 62 | 63 | 64 | 65 | 66 | 67 |

foo

68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach the

<p> tag; this means the tree is connected. 72 | self.assertEqual(b"

<p>foo</p>

", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '

foo

\n

bar

' 76 | soup = self.soup(markup) 77 | self.assertEqual("

foo

\n

bar

", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '

foo

\n

bar

\n' 83 | soup = self.soup(markup) 84 | self.assertEqual("

foo

\n

bar

\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | 87 | def test_processing_instruction(self): 88 | """Processing instructions become comments.""" 89 | markup = b"""""" 90 | soup = self.soup(markup) 91 | assert str(soup).startswith("") 92 | -------------------------------------------------------------------------------- /Week2/bs4/tests/test_html5lib.py.bak: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError, e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "

<p>A <b>bold</b> statement.</p>

" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('' 43 | '' 44 | "') 48 | 49 | self.assertSoupEquals( 50 | markup, 51 | '
Here's another table:" 45 | '' 46 | '' 47 | '
foo
Here\'s another table:' 52 | '
foo
' 53 | '
') 54 | 55 | self.assertSoupEquals( 56 | "" 57 | "" 58 | "
Foo
Bar
Baz
") 59 | 60 | def test_xml_declaration_followed_by_doctype(self): 61 | markup = ''' 62 | 63 | 64 | 65 | 66 | 67 |

foo

68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach the

<p> tag; this means the tree is connected. 72 | self.assertEqual(b"

<p>foo</p>

", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '

foo

\n

bar

' 76 | soup = self.soup(markup) 77 | self.assertEqual(u"

foo

\n

bar

", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '

foo

\n

bar

\n' 83 | soup = self.soup(markup) 84 | self.assertEqual(u"

foo

\n

bar

\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | 87 | def test_processing_instruction(self): 88 | """Processing instructions become comments.""" 89 | markup = b"""""" 90 | soup = self.soup(markup) 91 | assert str(soup).startswith("") 92 | -------------------------------------------------------------------------------- /Week2/bs4/tests/test_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html.parser tree builder generates good 2 | trees.""" 3 | 4 | from pdb import set_trace 5 | import pickle 6 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest 7 | from bs4.builder import HTMLParserTreeBuilder 8 | 9 | class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 10 | 11 | @property 12 | def default_builder(self): 13 | return HTMLParserTreeBuilder() 14 | 15 | def test_namespaced_system_doctype(self): 16 | # html.parser can't handle namespaced doctypes, so skip this one. 17 | pass 18 | 19 | def test_namespaced_public_doctype(self): 20 | # html.parser can't handle namespaced doctypes, so skip this one. 21 | pass 22 | 23 | def test_builder_is_pickled(self): 24 | """Unlike most tree builders, HTMLParserTreeBuilder and will 25 | be restored after pickling. 26 | """ 27 | tree = self.soup("foo") 28 | dumped = pickle.dumps(tree, 2) 29 | loaded = pickle.loads(dumped) 30 | self.assertTrue(isinstance(loaded.builder, type(tree.builder))) 31 | 32 | 33 | -------------------------------------------------------------------------------- /Week2/bs4/tests/test_lxml.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError as e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "

foo�bar

", "

foobar

") 44 | self.assertSoupEquals( 45 | "

foo�bar

", "

foobar

") 46 | self.assertSoupEquals( 47 | "

foo�bar

", "

foobar

") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual("", str(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | @skipIf( 69 | not LXML_PRESENT, 70 | "lxml seems not to be present, not testing its XML tree builder.") 71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 72 | """See ``HTMLTreeBuilderSmokeTest``.""" 73 | 74 | @property 75 | def default_builder(self): 76 | return LXMLTreeBuilderForXML() 77 | -------------------------------------------------------------------------------- /Week2/bs4/tests/test_lxml.py.bak: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError, e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "

foo�bar

", "

foobar

") 44 | self.assertSoupEquals( 45 | "

foo�bar

", "

foobar

") 46 | self.assertSoupEquals( 47 | "

foo�bar

", "

foobar

") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual(u"", unicode(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | @skipIf( 69 | not LXML_PRESENT, 70 | "lxml seems not to be present, not testing its XML tree builder.") 71 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 72 | """See ``HTMLTreeBuilderSmokeTest``.""" 73 | 74 | @property 75 | def default_builder(self): 76 | return LXMLTreeBuilderForXML() 77 | -------------------------------------------------------------------------------- /Week2/force.css: -------------------------------------------------------------------------------- 1 | circle.node { 2 | stroke: #fff; 3 | stroke-width: 1.5px; 4 | } 5 | 6 | line.link { 7 | stroke: #999; 8 | stroke-opacity: .6; 9 | } 10 | -------------------------------------------------------------------------------- /Week2/force.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Force-Directed Layout 6 | 7 | 8 | 9 | 10 | 11 | 12 | 15 |
16 | 17 |

<p>If you don't see a chart above, check the JavaScript console. You may need to use a different browser.</p>

18 | 19 | 20 | -------------------------------------------------------------------------------- /Week2/force.js: -------------------------------------------------------------------------------- 1 | var width = 600, 2 | height = 600; 3 | 4 | var color = d3.scale.category20(); 5 | 6 | var dist = (width + height) / 4; 7 | 8 | var force = d3.layout.force() 9 | .charge(-120) 10 | .linkDistance(dist) 11 | .size([width, height]); 12 | 13 | function getrank(rval) { 14 | return (rval/2.0) + 3; 15 | } 16 | 17 | function getcolor(rval) { 18 | return color(rval); 19 | } 20 | 21 | var svg = d3.select("#chart").append("svg") 22 | .attr("width", width) 23 | .attr("height", height); 24 | 25 | function loadData(json) { 26 | force 27 | .nodes(json.nodes) 28 | .links(json.links); 29 | 30 | var k = Math.sqrt(json.nodes.length / (width * height)); 31 | 32 | force 33 | .charge(-10 / k) 34 | .gravity(100 * k) 35 | .start(); 36 | 37 | var link = svg.selectAll("line.link") 38 | .data(json.links) 39 | .enter().append("line") 40 | .attr("class", "link") 41 | .style("stroke-width", function(d) { return Math.sqrt(d.value); }); 42 | 43 | var node = svg.selectAll("circle.node") 44 | .data(json.nodes) 45 | .enter().append("circle") 46 | .attr("class", "node") 47 | .attr("r", function(d) { return getrank(d.rank); } ) 48 | .style("fill", function(d) { return getcolor(d.rank); }) 49 | .on("dblclick",function(d) { 50 | if ( confirm('Do you want to open '+d.url) ) 51 | window.open(d.url,'_new',''); 52 | d3.event.stopPropagation(); 53 | }) 54 | .call(force.drag); 55 | 56 | node.append("title") 57 | .text(function(d) { return d.url; }); 58 | 59 | force.on("tick", function() { 60 | link.attr("x1", function(d) { return d.source.x; }) 61 | .attr("y1", function(d) { return d.source.y; }) 62 | .attr("x2", function(d) { return d.target.x; }) 63 | .attr("y2", function(d) { return d.target.y; }); 64 | 65 | node.attr("cx", function(d) { return d.x; }) 66 | .attr("cy", function(d) { return d.y; }); 67 | }); 68 | 69 | } 70 | loadData(spiderJson); 71 | -------------------------------------------------------------------------------- /Week2/spdump.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 7 | FROM Pages JOIN Links ON Pages.id = Links.to_id 8 | WHERE html IS NOT NULL 9 | GROUP BY id ORDER BY inbound DESC''') 10 | 11 | count = 0 12 | for row in cur : 13 | if count < 50 : print(row) 14 | count = count + 1 15 | print(count, 'rows.') 16 | cur.close() 17 | -------------------------------------------------------------------------------- /Week2/spider.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import urllib.error 3 | import ssl 4 | from urllib.parse import urljoin 5 | from urllib.parse import urlparse 6 | from urllib.request import urlopen 7 | from bs4 import BeautifulSoup 8 | 9 | # Ignore SSL certificate errors 10 | ctx = ssl.create_default_context() 11 | ctx.check_hostname = False 12 | ctx.verify_mode = ssl.CERT_NONE 13 | 14 | conn = sqlite3.connect('spider.sqlite') 15 | cur = conn.cursor() 16 | 17 | cur.execute('''CREATE TABLE IF NOT EXISTS Pages 18 | (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT, 19 | error INTEGER, old_rank REAL, new_rank REAL)''') 20 | 21 | cur.execute('''CREATE TABLE IF NOT EXISTS Links 22 | (from_id INTEGER, to_id INTEGER)''') 23 | 24 | 
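# Schema notes: Pages keeps one row per URL with the fetched HTML (left NULL
# until the page is retrieved), an error code for failed fetches, and the
# old/new PageRank values that sprank.py updates later. Links holds the
# directed from_id -> to_id edges between crawled pages, and Webs (created
# next) records the starting-URL prefixes that keep the crawl restricted to
# the chosen sites.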
cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''') 25 | 26 | # Check to see if we are already in progress... 27 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 28 | row = cur.fetchone() 29 | if row is not None: 30 | print("Restarting existing crawl. Remove spider.sqlite to start a fresh crawl.") 31 | else : 32 | starturl = input('Enter web url or enter: ') 33 | if ( len(starturl) < 1 ) : starturl = 'http://www.dr-chuck.com/' 34 | if ( starturl.endswith('/') ) : starturl = starturl[:-1] 35 | web = starturl 36 | if ( starturl.endswith('.htm') or starturl.endswith('.html') ) : 37 | pos = starturl.rfind('/') 38 | web = starturl[:pos] 39 | 40 | if ( len(web) > 1 ) : 41 | cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( web, ) ) 42 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) ) 43 | conn.commit() 44 | 45 | # Get the current webs 46 | cur.execute('''SELECT url FROM Webs''') 47 | webs = list() 48 | for row in cur: 49 | webs.append(str(row[0])) 50 | 51 | print(webs) 52 | 53 | many = 0 54 | while True: 55 | if ( many < 1 ) : 56 | sval = input('How many pages:') 57 | if ( len(sval) < 1 ) : break 58 | many = int(sval) 59 | many = many - 1 60 | 61 | cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1') 62 | try: 63 | row = cur.fetchone() 64 | # print row 65 | fromid = row[0] 66 | url = row[1] 67 | except: 68 | print('No unretrieved HTML pages found') 69 | many = 0 70 | break 71 | 72 | print(fromid, url, end=' ') 73 | 74 | # If we are retrieving this page, there should be no links from it 75 | cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) ) 76 | try: 77 | document = urlopen(url, context=ctx) 78 | 79 | html = document.read() 80 | if document.getcode() != 200 : 81 | print("Error on page: ",document.getcode()) 82 | cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) ) 83 | 84 | if 'text/html' != document.info().get_content_type() : 85 | print("Ignore non text/html page") 86 | cur.execute('DELETE FROM Pages WHERE url=?', ( url, ) ) 87 | cur.execute('UPDATE Pages SET error=0 WHERE url=?', (url, ) ) 88 | conn.commit() 89 | continue 90 | 91 | print('('+str(len(html))+')', end=' ') 92 | 93 | soup = BeautifulSoup(html, "html.parser") 94 | except KeyboardInterrupt: 95 | print('') 96 | print('Program interrupted by user...') 97 | break 98 | except: 99 | print("Unable to retrieve or parse page") 100 | cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) ) 101 | conn.commit() 102 | continue 103 | 104 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) ) 105 | cur.execute('UPDATE Pages SET html=? 
WHERE url=?', (memoryview(html), url ) ) 106 | conn.commit() 107 | 108 | # Retrieve all of the anchor tags 109 | tags = soup('a') 110 | count = 0 111 | for tag in tags: 112 | href = tag.get('href', None) 113 | if ( href is None ) : continue 114 | # Resolve relative references like href="/contact" 115 | up = urlparse(href) 116 | if ( len(up.scheme) < 1 ) : 117 | href = urljoin(url, href) 118 | ipos = href.find('#') 119 | if ( ipos > 1 ) : href = href[:ipos] 120 | if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') ) : continue 121 | if ( href.endswith('/') ) : href = href[:-1] 122 | # print href 123 | if ( len(href) < 1 ) : continue 124 | 125 | # Check if the URL is in any of the webs 126 | found = False 127 | for web in webs: 128 | if ( href.startswith(web) ) : 129 | found = True 130 | break 131 | if not found : continue 132 | 133 | cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) ) 134 | count = count + 1 135 | conn.commit() 136 | 137 | cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, )) 138 | try: 139 | row = cur.fetchone() 140 | toid = row[0] 141 | except: 142 | print('Could not retrieve id') 143 | continue 144 | # print fromid, toid 145 | cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', ( fromid, toid ) ) 146 | 147 | 148 | print(count) 149 | 150 | cur.close() 151 | -------------------------------------------------------------------------------- /Week2/spider.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synsantss/Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python/1f9d632cf114f734c6e31fa9d3ef4c62ef69c1e1/Week2/spider.sqlite -------------------------------------------------------------------------------- /Week2/spjson.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | print("Creating JSON output on spider.js...") 7 | howmany = int(input("How many nodes? 
")) 8 | 9 | cur.execute('''SELECT COUNT(from_id) AS inbound, old_rank, new_rank, id, url 10 | FROM Pages JOIN Links ON Pages.id = Links.to_id 11 | WHERE html IS NOT NULL AND ERROR IS NULL 12 | GROUP BY id ORDER BY id,inbound''') 13 | 14 | fhand = open('spider.js','w') 15 | nodes = list() 16 | maxrank = None 17 | minrank = None 18 | for row in cur : 19 | nodes.append(row) 20 | rank = row[2] 21 | if maxrank is None or maxrank < rank: maxrank = rank 22 | if minrank is None or minrank > rank : minrank = rank 23 | if len(nodes) > howmany : break 24 | 25 | if maxrank == minrank or maxrank is None or minrank is None: 26 | print("Error - please run sprank.py to compute page rank") 27 | quit() 28 | 29 | fhand.write('spiderJson = {"nodes":[\n') 30 | count = 0 31 | map = dict() 32 | ranks = dict() 33 | for row in nodes : 34 | if count > 0 : fhand.write(',\n') 35 | # print row 36 | rank = row[2] 37 | rank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 38 | fhand.write('{'+'"weight":'+str(row[0])+',"rank":'+str(rank)+',') 39 | fhand.write(' "id":'+str(row[3])+', "url":"'+row[4]+'"}') 40 | map[row[3]] = count 41 | ranks[row[3]] = rank 42 | count = count + 1 43 | fhand.write('],\n') 44 | 45 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 46 | fhand.write('"links":[\n') 47 | 48 | count = 0 49 | for row in cur : 50 | # print row 51 | if row[0] not in map or row[1] not in map : continue 52 | if count > 0 : fhand.write(',\n') 53 | rank = ranks[row[0]] 54 | srank = 19 * ( (rank - minrank) / (maxrank - minrank) ) 55 | fhand.write('{"source":'+str(map[row[0]])+',"target":'+str(map[row[1]])+',"value":3}') 56 | count = count + 1 57 | fhand.write(']};') 58 | fhand.close() 59 | cur.close() 60 | 61 | print("Open force.html in a browser to view the visualization") 62 | -------------------------------------------------------------------------------- /Week2/sprank.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | # Find the ids that send out page rank - we only are interested 7 | # in pages in the SCC that have in and out links 8 | cur.execute('''SELECT DISTINCT from_id FROM Links''') 9 | from_ids = list() 10 | for row in cur: 11 | from_ids.append(row[0]) 12 | 13 | # Find the ids that receive page rank 14 | to_ids = list() 15 | links = list() 16 | cur.execute('''SELECT DISTINCT from_id, to_id FROM Links''') 17 | for row in cur: 18 | from_id = row[0] 19 | to_id = row[1] 20 | if from_id == to_id : continue 21 | if from_id not in from_ids : continue 22 | if to_id not in from_ids : continue 23 | links.append(row) 24 | if to_id not in to_ids : to_ids.append(to_id) 25 | 26 | # Get latest page ranks for strongly connected component 27 | prev_ranks = dict() 28 | for node in from_ids: 29 | cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node, )) 30 | row = cur.fetchone() 31 | prev_ranks[node] = row[0] 32 | 33 | sval = input('How many iterations:') 34 | many = 1 35 | if ( len(sval) > 0 ) : many = int(sval) 36 | 37 | # Sanity check 38 | if len(prev_ranks) < 1 : 39 | print("Nothing to page rank. 
Check data.") 40 | quit() 41 | 42 | # Lets do Page Rank in memory so it is really fast 43 | for i in range(many): 44 | # print prev_ranks.items()[:5] 45 | next_ranks = dict(); 46 | total = 0.0 47 | for (node, old_rank) in list(prev_ranks.items()): 48 | total = total + old_rank 49 | next_ranks[node] = 0.0 50 | # print total 51 | 52 | # Find the number of outbound links and sent the page rank down each 53 | for (node, old_rank) in list(prev_ranks.items()): 54 | # print node, old_rank 55 | give_ids = list() 56 | for (from_id, to_id) in links: 57 | if from_id != node : continue 58 | # print ' ',from_id,to_id 59 | 60 | if to_id not in to_ids: continue 61 | give_ids.append(to_id) 62 | if ( len(give_ids) < 1 ) : continue 63 | amount = old_rank / len(give_ids) 64 | # print node, old_rank,amount, give_ids 65 | 66 | for id in give_ids: 67 | next_ranks[id] = next_ranks[id] + amount 68 | 69 | newtot = 0 70 | for (node, next_rank) in list(next_ranks.items()): 71 | newtot = newtot + next_rank 72 | evap = (total - newtot) / len(next_ranks) 73 | 74 | # print newtot, evap 75 | for node in next_ranks: 76 | next_ranks[node] = next_ranks[node] + evap 77 | 78 | newtot = 0 79 | for (node, next_rank) in list(next_ranks.items()): 80 | newtot = newtot + next_rank 81 | 82 | # Compute the per-page average change from old rank to new rank 83 | # As indication of convergence of the algorithm 84 | totdiff = 0 85 | for (node, old_rank) in list(prev_ranks.items()): 86 | new_rank = next_ranks[node] 87 | diff = abs(old_rank-new_rank) 88 | totdiff = totdiff + diff 89 | 90 | avediff = totdiff / len(prev_ranks) 91 | print(i+1, avediff) 92 | 93 | # rotate 94 | prev_ranks = next_ranks 95 | 96 | # Put the final ranks back into the database 97 | print(list(next_ranks.items())[:5]) 98 | cur.execute('''UPDATE Pages SET old_rank=new_rank''') 99 | for (id, new_rank) in list(next_ranks.items()) : 100 | cur.execute('''UPDATE Pages SET new_rank=? WHERE id=?''', (new_rank, id)) 101 | conn.commit() 102 | cur.close() 103 | 104 | -------------------------------------------------------------------------------- /Week2/spreset.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | conn = sqlite3.connect('spider.sqlite') 4 | cur = conn.cursor() 5 | 6 | cur.execute('''UPDATE Pages SET new_rank=1.0, old_rank=0.0''') 7 | conn.commit() 8 | 9 | cur.close() 10 | 11 | print("All pages set to a rank of 1.0") 12 | -------------------------------------------------------------------------------- /Week4-6/README.txt: -------------------------------------------------------------------------------- 1 | Analyzing an EMAIL Archive from gmane and vizualizing the data 2 | using the D3 JavaScript library 3 | 4 | This is a set of tools that allow you to pull down an archive 5 | of a gmane repository using the instructions at: 6 | 7 | http://gmane.org/export.php 8 | 9 | In order not to overwhelm the gmane.org server, I have put up 10 | my own copy of the messages at: 11 | 12 | http://mbox.dr-chuck.net/ 13 | 14 | This server will be faster and take a lot of load off the 15 | gmane.org server. 16 | 17 | You should install the SQLite browser to view and modify the databases from: 18 | 19 | http://sqlitebrowser.org/ 20 | 21 | The first step is to spider the gmane repository. The base URL 22 | is hard-coded in the gmane.py and is hard-coded to the Sakai 23 | developer list. You can spider another repository by changing that 24 | base url. Make sure to delete the content.sqlite file if you 25 | switch the base url. 
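As a rough sketch, the line to edit near the top of gmane.py looks something
like the following (the variable name shown here is an assumption - check the
actual source before changing it):

   # hypothetical example: point the spider at a different archive
   baseurl = "http://mbox.dr-chuck.net/sakai.devel/"
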
The gmane.py file operates as a spider in 26 | that it runs slowly and retrieves one mail message per second so 27 | as to avoid getting throttled by gmane.org. It stores all of 28 | its data in a database and can be interrupted and re-started 29 | as often as needed. It may take many hours to pull all the data 30 | down. So you may need to restart several times. 31 | 32 | To give you a head-start, I have put up 600MB of pre-spidered Sakai 33 | email here: 34 | 35 | https://online.dr-chuck.com/files/sakai/email/content.sqlite 36 | 37 | If you download this, you can "catch up with the latest" by 38 | running gmane.py. 39 | 40 | Navigate to the folder where you extracted the gmane.zip 41 | 42 | Note: Windows has difficulty in displaying UTF-8 characters 43 | in the console so for each console window you open, you may need 44 | to type the following command before running this code: 45 | 46 | chcp 65001 47 | 48 | http://stackoverflow.com/questions/388490/unicode-characters-in-windows-command-line-how 49 | 50 | Here is a run of gmane.py getting the last five messages of the 51 | sakai developer list: 52 | 53 | Mac: python3 gmane.py 54 | Win: gmane.py 55 | 56 | How many messages:10 57 | http://mbox.dr-chuck.net/sakai.devel/1/2 2662 58 | ggolden@umich.edu 2005-12-08T23:34:30-06:00 call for participation: developers documentation 59 | http://mbox.dr-chuck.net/sakai.devel/2/3 2434 60 | csev@umich.edu 2005-12-09T00:58:01-05:00 report from the austin conference: sakai developers break into song 61 | http://mbox.dr-chuck.net/sakai.devel/3/4 3055 62 | kevin.carpenter@rsmart.com 2005-12-09T09:01:49-07:00 cas and sakai 1.5 63 | http://mbox.dr-chuck.net/sakai.devel/4/5 11721 64 | michael.feldstein@suny.edu 2005-12-09T09:43:12-05:00 re: lms/vle rants/comments 65 | http://mbox.dr-chuck.net/sakai.devel/5/6 9443 66 | john@caret.cam.ac.uk 2005-12-09T13:32:29+00:00 re: lms/vle rants/comments 67 | Does not start with From 68 | 69 | The program scans content.sqlite from 1 up to the first message number not 70 | already spidered and starts spidering at that message. It continues spidering 71 | until it has spidered the desired number of messages or it reaches a page 72 | that does not appear to be a properly formatted message. 73 | 74 | Sometimes gmane.org is missing a message. Perhaps administrators can delete messages 75 | or perhaps they get lost - I don't know. If your spider stops, and it seems it has hit 76 | a missing message, go into the SQLite Manager and add a row with the missing id - leave 77 | all the other fields blank - and then restart gmane.py. This will unstick the 78 | spidering process and allow it to continue. These empty messages will be ignored in the next 79 | phase of the process. 80 | 81 | One nice thing is that once you have spidered all of the messages and have them in 82 | content.sqlite, you can run gmane.py again to get new messages as they get sent to the 83 | list. gmane.py will quickly scan to the end of the already-spidered pages and check 84 | if there are new messages and then quickly retrieve those messages and add them 85 | to content.sqlite. 86 | 87 | The content.sqlite data is pretty raw, with an innefficient data model, and not compressed. 88 | This is intentional as it allows you to look at content.sqlite to debug the process. 89 | It would be a bad idea to run any queries against this database as they would be 90 | slow. 91 | 92 | The second process is running the program gmodel.py. 
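Before you run it, it can help to confirm how many messages the spider has
actually stored; a single COUNT is cheap enough for a sanity check even
though heavier queries against content.sqlite are slow. A minimal sketch (it
assumes the spider keeps one row per message in a table named Messages -
check the real schema in the SQLite browser if yours differs):

   import sqlite3
   conn = sqlite3.connect('content.sqlite')
   cur = conn.cursor()
   cur.execute('SELECT COUNT(*) FROM Messages')  # table name is an assumption
   print(cur.fetchone()[0], 'messages spidered so far')
   cur.close()
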
gmodel.py reads the rough/raw 93 | data from content.sqlite and produces a cleaned-up and well-modeled version of the 94 | data in the file index.sqlite. The file index.sqlite will be much smaller (often 10X 95 | smaller) than content.sqlite because it also compresses the header and body text. 96 | 97 | Each time gmodel.py runs - it completely wipes out and re-builds index.sqlite, allowing 98 | you to adjust its parameters and edit the mapping tables in content.sqlite to tweak the 99 | data cleaning process. 100 | 101 | Running gmodel.py works as follows: 102 | 103 | Mac: python3 gmodel.py 104 | Win: gmodel.py 105 | 106 | Loaded allsenders 1588 and mapping 28 dns mapping 1 107 | 1 2005-12-08T23:34:30-06:00 ggolden22@mac.com 108 | 251 2005-12-22T10:03:20-08:00 tpamsler@ucdavis.edu 109 | 501 2006-01-12T11:17:34-05:00 lance@indiana.edu 110 | 751 2006-01-24T11:13:28-08:00 vrajgopalan@ucmerced.edu 111 | ... 112 | 113 | The gmodel.py program does a number of data cleaing steps 114 | 115 | Domain names are truncated to two levels for .com, .org, .edu, and .net 116 | other domain names are truncated to three levels. So si.umich.edu becomes 117 | umich.edu and caret.cam.ac.uk becomes cam.ac.uk. Also mail addresses are 118 | forced to lower case and some of the @gmane.org address like the following 119 | 120 | arwhyte-63aXycvo3TyHXe+LvDLADg@public.gmane.org 121 | 122 | are converted to the real address whenever there is a matching real email 123 | address elsewhere in the message corpus. 124 | 125 | If you look in the content.sqlite database there are two tables that allow 126 | you to map both domain names and individual email addresses that change over 127 | the lifetime of the email list. For example, Steve Githens used the following 128 | email addresses over the life of the Sakai developer list: 129 | 130 | s-githens@northwestern.edu 131 | sgithens@cam.ac.uk 132 | swgithen@mtu.edu 133 | 134 | We can add two entries to the Mapping table 135 | 136 | s-githens@northwestern.edu -> swgithen@mtu.edu 137 | sgithens@cam.ac.uk -> swgithen@mtu.edu 138 | 139 | And so all the mail messages will be collected under one sender even if 140 | they used several email addresses over the lifetime of the mailing list. 141 | 142 | You can also make similar entries in the DNSMapping table if there are multiple 143 | DNS names you want mapped to a single DNS. In the Sakai data I add the following 144 | mapping: 145 | 146 | iupui.edu -> indiana.edu 147 | 148 | So all the folks from the various Indiana University campuses are tracked together 149 | 150 | You can re-run the gmodel.py over and over as you look at the data, and add mappings 151 | to make the data cleaner and cleaner. When you are done, you will have a nicely 152 | indexed version of the email in index.sqlite. This is the file to use to do data 153 | analysis. With this file, data analysis will be really quick. 154 | 155 | The first, simplest data analysis is to do a "who does the most" and "which 156 | organzation does the most"? This is done using gbasic.py: 157 | 158 | Mac: python3 gbasic.py 159 | Win: gbasic.py 160 | 161 | How many to dump? 
5 162 | Loaded messages= 51330 subjects= 25033 senders= 1584 163 | 164 | Top 5 Email list participants 165 | steve.swinsburg@gmail.com 2657 166 | azeckoski@unicon.net 1742 167 | ieb@tfd.co.uk 1591 168 | csev@umich.edu 1304 169 | david.horwitz@uct.ac.za 1184 170 | 171 | Top 5 Email list organizations 172 | gmail.com 7339 173 | umich.edu 6243 174 | uct.ac.za 2451 175 | indiana.edu 2258 176 | unicon.net 2055 177 | 178 | You can look at the data in index.sqlite and if you find a problem, you 179 | can update the Mapping table and DNSMapping table in content.sqlite and 180 | re-run gmodel.py. 181 | 182 | There is a simple vizualization of the word frequence in the subject lines 183 | in the file gword.py: 184 | 185 | Mac: python3 gword.py 186 | Win: gword.py 187 | 188 | Range of counts: 33229 129 189 | Output written to gword.js 190 | 191 | This produces the file gword.js which you can visualize using the file 192 | gword.htm. 193 | 194 | A second visualization is in gline.py. It visualizes email participation by 195 | organizations over time. 196 | 197 | Mac: python3 gline.py 198 | Win: gline.py 199 | 200 | Loaded messages= 51330 subjects= 25033 senders= 1584 201 | Top 10 Oranizations 202 | ['gmail.com', 'umich.edu', 'uct.ac.za', 'indiana.edu', 'unicon.net', 'tfd.co.uk', 'berkeley.edu', 'longsight.com', 'stanford.edu', 'ox.ac.uk'] 203 | Output written to gline.js 204 | 205 | Its output is written to gline.js which is visualized using gline.htm. 206 | 207 | Some URLs for visualization ideas: 208 | 209 | https://developers.google.com/chart/ 210 | 211 | https://developers.google.com/chart/interactive/docs/gallery/motionchart 212 | 213 | https://code.google.com/apis/ajax/playground/?type=visualization#motion_chart_time_formats 214 | 215 | https://developers.google.com/chart/interactive/docs/gallery/annotatedtimeline 216 | 217 | http://bost.ocks.org/mike/uberdata/ 218 | 219 | http://mbostock.github.io/d3/talk/20111018/calendar.html 220 | 221 | http://nltk.org/install.html 222 | 223 | As always - comments welcome. 224 | 225 | -- Dr. 
Chuck 226 | Sun Sep 29 00:11:01 EDT 2013 227 | 228 | -------------------------------------------------------------------------------- /Week4-6/content.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synsantss/Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python/1f9d632cf114f734c6e31fa9d3ef4c62ef69c1e1/Week4-6/content.sqlite -------------------------------------------------------------------------------- /Week4-6/d3.layout.cloud.js: -------------------------------------------------------------------------------- 1 | // Word cloud layout by Jason Davies, http://www.jasondavies.com/word-cloud/ 2 | // Algorithm due to Jonathan Feinberg, http://static.mrfeinberg.com/bv_ch03.pdf 3 | (function(exports) { 4 | function cloud() { 5 | var size = [256, 256], 6 | text = cloudText, 7 | font = cloudFont, 8 | fontSize = cloudFontSize, 9 | fontStyle = cloudFontNormal, 10 | fontWeight = cloudFontNormal, 11 | rotate = cloudRotate, 12 | padding = cloudPadding, 13 | spiral = archimedeanSpiral, 14 | words = [], 15 | timeInterval = Infinity, 16 | event = d3.dispatch("word", "end"), 17 | timer = null, 18 | cloud = {}; 19 | 20 | cloud.start = function() { 21 | var board = zeroArray((size[0] >> 5) * size[1]), 22 | bounds = null, 23 | n = words.length, 24 | i = -1, 25 | tags = [], 26 | data = words.map(function(d, i) { 27 | d.text = text.call(this, d, i); 28 | d.font = font.call(this, d, i); 29 | d.style = fontStyle.call(this, d, i); 30 | d.weight = fontWeight.call(this, d, i); 31 | d.rotate = rotate.call(this, d, i); 32 | d.size = ~~fontSize.call(this, d, i); 33 | d.padding = cloudPadding.call(this, d, i); 34 | return d; 35 | }).sort(function(a, b) { return b.size - a.size; }); 36 | 37 | if (timer) clearInterval(timer); 38 | timer = setInterval(step, 0); 39 | step(); 40 | 41 | return cloud; 42 | 43 | function step() { 44 | var start = +new Date, 45 | d; 46 | while (+new Date - start < timeInterval && ++i < n && timer) { 47 | d = data[i]; 48 | d.x = (size[0] * (Math.random() + .5)) >> 1; 49 | d.y = (size[1] * (Math.random() + .5)) >> 1; 50 | cloudSprite(d, data, i); 51 | if (place(board, d, bounds)) { 52 | tags.push(d); 53 | event.word(d); 54 | if (bounds) cloudBounds(bounds, d); 55 | else bounds = [{x: d.x + d.x0, y: d.y + d.y0}, {x: d.x + d.x1, y: d.y + d.y1}]; 56 | // Temporary hack 57 | d.x -= size[0] >> 1; 58 | d.y -= size[1] >> 1; 59 | } 60 | } 61 | if (i >= n) { 62 | cloud.stop(); 63 | event.end(tags, bounds); 64 | } 65 | } 66 | } 67 | 68 | cloud.stop = function() { 69 | if (timer) { 70 | clearInterval(timer); 71 | timer = null; 72 | } 73 | return cloud; 74 | }; 75 | 76 | cloud.timeInterval = function(x) { 77 | if (!arguments.length) return timeInterval; 78 | timeInterval = x == null ? Infinity : x; 79 | return cloud; 80 | }; 81 | 82 | function place(board, tag, bounds) { 83 | var perimeter = [{x: 0, y: 0}, {x: size[0], y: size[1]}], 84 | startX = tag.x, 85 | startY = tag.y, 86 | maxDelta = Math.sqrt(size[0] * size[0] + size[1] * size[1]), 87 | s = spiral(size), 88 | dt = Math.random() < .5 ? 1 : -1, 89 | t = -dt, 90 | dxdy, 91 | dx, 92 | dy; 93 | 94 | while (dxdy = s(t += dt)) { 95 | dx = ~~dxdy[0]; 96 | dy = ~~dxdy[1]; 97 | 98 | if (Math.min(dx, dy) > maxDelta) break; 99 | 100 | tag.x = startX + dx; 101 | tag.y = startY + dy; 102 | 103 | if (tag.x + tag.x0 < 0 || tag.y + tag.y0 < 0 || 104 | tag.x + tag.x1 > size[0] || tag.y + tag.y1 > size[1]) continue; 105 | // TODO only check for collisions within current bounds. 
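// The board is a bitmask of occupied pixels packed 32 pixels per integer;
// cloudCollide() ANDs the word's sprite bitmap against that mask. If nothing
// overlaps and the word touches the cloud's current bounding box (or no
// bounds exist yet), the sprite is OR-ed into the board below and the
// candidate position is accepted.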
106 | if (!bounds || !cloudCollide(tag, board, size[0])) { 107 | if (!bounds || collideRects(tag, bounds)) { 108 | var sprite = tag.sprite, 109 | w = tag.width >> 5, 110 | sw = size[0] >> 5, 111 | lx = tag.x - (w << 4), 112 | sx = lx & 0x7f, 113 | msx = 32 - sx, 114 | h = tag.y1 - tag.y0, 115 | x = (tag.y + tag.y0) * sw + (lx >> 5), 116 | last; 117 | for (var j = 0; j < h; j++) { 118 | last = 0; 119 | for (var i = 0; i <= w; i++) { 120 | board[x + i] |= (last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0); 121 | } 122 | x += sw; 123 | } 124 | delete tag.sprite; 125 | return true; 126 | } 127 | } 128 | } 129 | return false; 130 | } 131 | 132 | cloud.words = function(x) { 133 | if (!arguments.length) return words; 134 | words = x; 135 | return cloud; 136 | }; 137 | 138 | cloud.size = function(x) { 139 | if (!arguments.length) return size; 140 | size = [+x[0], +x[1]]; 141 | return cloud; 142 | }; 143 | 144 | cloud.font = function(x) { 145 | if (!arguments.length) return font; 146 | font = d3.functor(x); 147 | return cloud; 148 | }; 149 | 150 | cloud.fontStyle = function(x) { 151 | if (!arguments.length) return fontStyle; 152 | fontStyle = d3.functor(x); 153 | return cloud; 154 | }; 155 | 156 | cloud.fontWeight = function(x) { 157 | if (!arguments.length) return fontWeight; 158 | fontWeight = d3.functor(x); 159 | return cloud; 160 | }; 161 | 162 | cloud.rotate = function(x) { 163 | if (!arguments.length) return rotate; 164 | rotate = d3.functor(x); 165 | return cloud; 166 | }; 167 | 168 | cloud.text = function(x) { 169 | if (!arguments.length) return text; 170 | text = d3.functor(x); 171 | return cloud; 172 | }; 173 | 174 | cloud.spiral = function(x) { 175 | if (!arguments.length) return spiral; 176 | spiral = spirals[x + ""] || x; 177 | return cloud; 178 | }; 179 | 180 | cloud.fontSize = function(x) { 181 | if (!arguments.length) return fontSize; 182 | fontSize = d3.functor(x); 183 | return cloud; 184 | }; 185 | 186 | cloud.padding = function(x) { 187 | if (!arguments.length) return padding; 188 | padding = d3.functor(x); 189 | return cloud; 190 | }; 191 | 192 | return d3.rebind(cloud, event, "on"); 193 | } 194 | 195 | function cloudText(d) { 196 | return d.text; 197 | } 198 | 199 | function cloudFont() { 200 | return "serif"; 201 | } 202 | 203 | function cloudFontNormal() { 204 | return "normal"; 205 | } 206 | 207 | function cloudFontSize(d) { 208 | return Math.sqrt(d.value); 209 | } 210 | 211 | function cloudRotate() { 212 | return (~~(Math.random() * 6) - 3) * 30; 213 | } 214 | 215 | function cloudPadding() { 216 | return 1; 217 | } 218 | 219 | // Fetches a monochrome sprite bitmap for the specified text. 220 | // Load in batches for speed. 
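// (Each word is drawn to a hidden canvas, the pixels are read back with
// getImageData, and any pixel covered by text is packed into the per-word
// 32-bit masks that cloudCollide() tests against the board.)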
221 | function cloudSprite(d, data, di) { 222 | if (d.sprite) return; 223 | c.clearRect(0, 0, (cw << 5) / ratio, ch / ratio); 224 | var x = 0, 225 | y = 0, 226 | maxh = 0, 227 | n = data.length; 228 | di--; 229 | while (++di < n) { 230 | d = data[di]; 231 | c.save(); 232 | c.font = d.style + " " + d.weight + " " + ~~((d.size + 1) / ratio) + "px " + d.font; 233 | var w = c.measureText(d.text + "m").width * ratio, 234 | h = d.size << 1; 235 | if (d.rotate) { 236 | var sr = Math.sin(d.rotate * cloudRadians), 237 | cr = Math.cos(d.rotate * cloudRadians), 238 | wcr = w * cr, 239 | wsr = w * sr, 240 | hcr = h * cr, 241 | hsr = h * sr; 242 | w = (Math.max(Math.abs(wcr + hsr), Math.abs(wcr - hsr)) + 0x1f) >> 5 << 5; 243 | h = ~~Math.max(Math.abs(wsr + hcr), Math.abs(wsr - hcr)); 244 | } else { 245 | w = (w + 0x1f) >> 5 << 5; 246 | } 247 | if (h > maxh) maxh = h; 248 | if (x + w >= (cw << 5)) { 249 | x = 0; 250 | y += maxh; 251 | maxh = 0; 252 | } 253 | if (y + h >= ch) break; 254 | c.translate((x + (w >> 1)) / ratio, (y + (h >> 1)) / ratio); 255 | if (d.rotate) c.rotate(d.rotate * cloudRadians); 256 | c.fillText(d.text, 0, 0); 257 | c.restore(); 258 | d.width = w; 259 | d.height = h; 260 | d.xoff = x; 261 | d.yoff = y; 262 | d.x1 = w >> 1; 263 | d.y1 = h >> 1; 264 | d.x0 = -d.x1; 265 | d.y0 = -d.y1; 266 | x += w; 267 | } 268 | var pixels = c.getImageData(0, 0, (cw << 5) / ratio, ch / ratio).data, 269 | sprite = []; 270 | while (--di >= 0) { 271 | d = data[di]; 272 | var w = d.width, 273 | w32 = w >> 5, 274 | h = d.y1 - d.y0, 275 | p = d.padding; 276 | // Zero the buffer 277 | for (var i = 0; i < h * w32; i++) sprite[i] = 0; 278 | x = d.xoff; 279 | if (x == null) return; 280 | y = d.yoff; 281 | var seen = 0, 282 | seenRow = -1; 283 | for (var j = 0; j < h; j++) { 284 | for (var i = 0; i < w; i++) { 285 | var k = w32 * j + (i >> 5), 286 | m = pixels[((y + j) * (cw << 5) + (x + i)) << 2] ? 1 << (31 - (i % 32)) : 0; 287 | if (p) { 288 | if (j) sprite[k - w32] |= m; 289 | if (j < w - 1) sprite[k + w32] |= m; 290 | m |= (m << 1) | (m >> 1); 291 | } 292 | sprite[k] |= m; 293 | seen |= m; 294 | } 295 | if (seen) seenRow = j; 296 | else { 297 | d.y0++; 298 | h--; 299 | j--; 300 | y++; 301 | } 302 | } 303 | d.y1 = d.y0 + seenRow; 304 | d.sprite = sprite.slice(0, (d.y1 - d.y0) * w32); 305 | } 306 | } 307 | 308 | // Use mask-based collision detection. 309 | function cloudCollide(tag, board, sw) { 310 | sw >>= 5; 311 | var sprite = tag.sprite, 312 | w = tag.width >> 5, 313 | lx = tag.x - (w << 4), 314 | sx = lx & 0x7f, 315 | msx = 32 - sx, 316 | h = tag.y1 - tag.y0, 317 | x = (tag.y + tag.y0) * sw + (lx >> 5), 318 | last; 319 | for (var j = 0; j < h; j++) { 320 | last = 0; 321 | for (var i = 0; i <= w; i++) { 322 | if (((last << msx) | (i < w ? 
(last = sprite[j * w + i]) >>> sx : 0)) 323 | & board[x + i]) return true; 324 | } 325 | x += sw; 326 | } 327 | return false; 328 | } 329 | 330 | function cloudBounds(bounds, d) { 331 | var b0 = bounds[0], 332 | b1 = bounds[1]; 333 | if (d.x + d.x0 < b0.x) b0.x = d.x + d.x0; 334 | if (d.y + d.y0 < b0.y) b0.y = d.y + d.y0; 335 | if (d.x + d.x1 > b1.x) b1.x = d.x + d.x1; 336 | if (d.y + d.y1 > b1.y) b1.y = d.y + d.y1; 337 | } 338 | 339 | function collideRects(a, b) { 340 | return a.x + a.x1 > b[0].x && a.x + a.x0 < b[1].x && a.y + a.y1 > b[0].y && a.y + a.y0 < b[1].y; 341 | } 342 | 343 | function archimedeanSpiral(size) { 344 | var e = size[0] / size[1]; 345 | return function(t) { 346 | return [e * (t *= .1) * Math.cos(t), t * Math.sin(t)]; 347 | }; 348 | } 349 | 350 | function rectangularSpiral(size) { 351 | var dy = 4, 352 | dx = dy * size[0] / size[1], 353 | x = 0, 354 | y = 0; 355 | return function(t) { 356 | var sign = t < 0 ? -1 : 1; 357 | // See triangular numbers: T_n = n * (n + 1) / 2. 358 | switch ((Math.sqrt(1 + 4 * sign * t) - sign) & 3) { 359 | case 0: x += dx; break; 360 | case 1: y += dy; break; 361 | case 2: x -= dx; break; 362 | default: y -= dy; break; 363 | } 364 | return [x, y]; 365 | }; 366 | } 367 | 368 | // TODO reuse arrays? 369 | function zeroArray(n) { 370 | var a = [], 371 | i = -1; 372 | while (++i < n) a[i] = 0; 373 | return a; 374 | } 375 | 376 | var cloudRadians = Math.PI / 180, 377 | cw = 1 << 11 >> 5, 378 | ch = 1 << 11, 379 | canvas, 380 | ratio = 1; 381 | 382 | if (typeof document !== "undefined") { 383 | canvas = document.createElement("canvas"); 384 | canvas.width = 1; 385 | canvas.height = 1; 386 | ratio = Math.sqrt(canvas.getContext("2d").getImageData(0, 0, 1, 1).data.length >> 2); 387 | canvas.width = (cw << 5) / ratio; 388 | canvas.height = ch / ratio; 389 | } else { 390 | // node-canvas support 391 | var Canvas = require("canvas"); 392 | canvas = new Canvas(cw << 5, ch); 393 | } 394 | 395 | var c = canvas.getContext("2d"), 396 | spirals = { 397 | archimedean: archimedeanSpiral, 398 | rectangular: rectangularSpiral 399 | }; 400 | c.fillStyle = "red"; 401 | c.textAlign = "center"; 402 | 403 | exports.cloud = cloud; 404 | })(typeof exports === "undefined" ? d3.layout || (d3.layout = {}) : exports); 405 | -------------------------------------------------------------------------------- /Week4-6/gbasic.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | howmany = int(input("How many to dump? 
")) 6 | 7 | conn = sqlite3.connect('index.sqlite') 8 | cur = conn.cursor() 9 | 10 | cur.execute('SELECT id, sender FROM Senders') 11 | senders = dict() 12 | for message_row in cur : 13 | senders[message_row[0]] = message_row[1] 14 | 15 | cur.execute('SELECT id, subject FROM Subjects') 16 | subjects = dict() 17 | for message_row in cur : 18 | subjects[message_row[0]] = message_row[1] 19 | 20 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 21 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 22 | messages = dict() 23 | for message_row in cur : 24 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 25 | 26 | print("Loaded messages=",len(messages),"subjects=",len(subjects),"senders=",len(senders)) 27 | 28 | sendcounts = dict() 29 | sendorgs = dict() 30 | for (message_id, message) in list(messages.items()): 31 | sender = message[1] 32 | sendcounts[sender] = sendcounts.get(sender,0) + 1 33 | pieces = senders[sender].split("@") 34 | if len(pieces) != 2 : continue 35 | dns = pieces[1] 36 | sendorgs[dns] = sendorgs.get(dns,0) + 1 37 | 38 | print('') 39 | print('Top',howmany,'Email list participants') 40 | 41 | x = sorted(sendcounts, key=sendcounts.get, reverse=True) 42 | for k in x[:howmany]: 43 | print(senders[k], sendcounts[k]) 44 | if sendcounts[k] < 10 : break 45 | 46 | print('') 47 | print('Top',howmany,'Email list organizations') 48 | 49 | x = sorted(sendorgs, key=sendorgs.get, reverse=True) 50 | for k in x[:howmany]: 51 | print(k, sendorgs[k]) 52 | if sendorgs[k] < 10 : break 53 | -------------------------------------------------------------------------------- /Week4-6/gline.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 19 | 20 | 21 |
22 | 23 | 24 | -------------------------------------------------------------------------------- /Week4-6/gline.js: -------------------------------------------------------------------------------- 1 | gline = [ ['Year','mtu.edu','cam.ac.uk','unicon.net','virginia.edu','asu.edu','gmail.com','indiana.edu','suny.edu','columbia.edu','ucdavis.edu'], 2 | ['2005-12',2,2,2,2,1,1,1,1,1,1] 3 | ]; 4 | -------------------------------------------------------------------------------- /Week4-6/gline.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | 5 | conn = sqlite3.connect('index.sqlite') 6 | cur = conn.cursor() 7 | 8 | cur.execute('SELECT id, sender FROM Senders') 9 | senders = dict() 10 | for message_row in cur : 11 | senders[message_row[0]] = message_row[1] 12 | 13 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 14 | messages = dict() 15 | for message_row in cur : 16 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 17 | 18 | print("Loaded messages=",len(messages),"senders=",len(senders)) 19 | 20 | sendorgs = dict() 21 | for (message_id, message) in list(messages.items()): 22 | sender = message[1] 23 | pieces = senders[sender].split("@") 24 | if len(pieces) != 2 : continue 25 | dns = pieces[1] 26 | sendorgs[dns] = sendorgs.get(dns,0) + 1 27 | 28 | # pick the top schools 29 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 30 | orgs = orgs[:10] 31 | print("Top 10 Organizations") 32 | print(orgs) 33 | 34 | counts = dict() 35 | months = list() 36 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 37 | for (message_id, message) in list(messages.items()): 38 | sender = message[1] 39 | pieces = senders[sender].split("@") 40 | if len(pieces) != 2 : continue 41 | dns = pieces[1] 42 | if dns not in orgs : continue 43 | month = message[3][:7] 44 | if month not in months : months.append(month) 45 | key = (month, dns) 46 | counts[key] = counts.get(key,0) + 1 47 | 48 | months.sort() 49 | # print counts 50 | # print months 51 | 52 | fhand = open('gline.js','w') 53 | fhand.write("gline = [ ['Year'") 54 | for org in orgs: 55 | fhand.write(",'"+org+"'") 56 | fhand.write("]") 57 | 58 | for month in months: 59 | fhand.write(",\n['"+month+"'") 60 | for org in orgs: 61 | key = (month, org) 62 | val = counts.get(key,0) 63 | fhand.write(","+str(val)) 64 | fhand.write("]"); 65 | 66 | fhand.write("\n];\n") 67 | fhand.close() 68 | 69 | print("Output written to gline.js") 70 | print("Open gline.htm to visualize the data") 71 | -------------------------------------------------------------------------------- /Week4-6/gmane.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import ssl 4 | import urllib.request, urllib.parse, urllib.error 5 | from urllib.parse import urljoin 6 | from urllib.parse import urlparse 7 | import re 8 | from datetime import datetime, timedelta 9 | 10 | # Not all systems have this so conditionally define parser 11 | try: 12 | import dateutil.parser as parser 13 | except: 14 | pass 15 | 16 | def parsemaildate(md) : 17 | # See if we have dateutil 18 | try: 19 | pdate = parser.parse(tdate) 20 | test_at = pdate.isoformat() 21 | return test_at 22 | except: 23 | pass 24 | 25 | # Non-dateutil version - we try our best 26 | 27 | pieces = md.split() 28 | notz = " ".join(pieces[:4]).strip() 29 | 30 | # Try a bunch of format variations - strptime() is
*lame* 31 | dnotz = None 32 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 33 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 34 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 35 | try: 36 | dnotz = datetime.strptime(notz, form) 37 | break 38 | except: 39 | continue 40 | 41 | if dnotz is None : 42 | # print 'Bad Date:',md 43 | return None 44 | 45 | iso = dnotz.isoformat() 46 | 47 | tz = "+0000" 48 | try: 49 | tz = pieces[4] 50 | ival = int(tz) # Only want numeric timezone values 51 | if tz == '-0000' : tz = '+0000' 52 | tzh = tz[:3] 53 | tzm = tz[3:] 54 | tz = tzh+":"+tzm 55 | except: 56 | pass 57 | 58 | return iso+tz 59 | 60 | # Ignore SSL certificate errors 61 | ctx = ssl.create_default_context() 62 | ctx.check_hostname = False 63 | ctx.verify_mode = ssl.CERT_NONE 64 | 65 | conn = sqlite3.connect('content.sqlite') 66 | cur = conn.cursor() 67 | 68 | baseurl = "http://mbox.dr-chuck.net/sakai.devel/" 69 | 70 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 71 | (id INTEGER UNIQUE, email TEXT, sent_at TEXT, 72 | subject TEXT, headers TEXT, body TEXT)''') 73 | 74 | # Pick up where we left off 75 | start = None 76 | cur.execute('SELECT max(id) FROM Messages' ) 77 | try: 78 | row = cur.fetchone() 79 | if row is None : 80 | start = 0 81 | else: 82 | start = row[0] 83 | except: 84 | start = 0 85 | 86 | if start is None : start = 0 87 | 88 | many = 0 89 | count = 0 90 | fail = 0 91 | while True: 92 | if ( many < 1 ) : 93 | conn.commit() 94 | sval = input('How many messages:') 95 | if ( len(sval) < 1 ) : break 96 | many = int(sval) 97 | 98 | start = start + 1 99 | cur.execute('SELECT id FROM Messages WHERE id=?', (start,) ) 100 | try: 101 | row = cur.fetchone() 102 | if row is not None : continue 103 | except: 104 | row = None 105 | 106 | many = many - 1 107 | url = baseurl + str(start) + '/' + str(start + 1) 108 | 109 | text = "None" 110 | try: 111 | # Open with a timeout of 30 seconds 112 | document = urllib.request.urlopen(url, None, 30, context=ctx) 113 | text = document.read().decode() 114 | if document.getcode() != 200 : 115 | print("Error code=",document.getcode(), url) 116 | break 117 | except KeyboardInterrupt: 118 | print('') 119 | print('Program interrupted by user...') 120 | break 121 | except Exception as e: 122 | print("Unable to retrieve or parse page",url) 123 | print("Error",e) 124 | fail = fail + 1 125 | if fail > 5 : break 126 | continue 127 | 128 | print(url,len(text)) 129 | count = count + 1 130 | 131 | if not text.startswith("From "): 132 | print(text) 133 | print("Did not find From ") 134 | fail = fail + 1 135 | if fail > 5 : break 136 | continue 137 | 138 | pos = text.find("\n\n") 139 | if pos > 0 : 140 | hdr = text[:pos] 141 | body = text[pos+2:] 142 | else: 143 | print(text) 144 | print("Could not find break between headers and body") 145 | fail = fail + 1 146 | if fail > 5 : break 147 | continue 148 | 149 | email = None 150 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 151 | if len(x) == 1 : 152 | email = x[0]; 153 | email = email.strip().lower() 154 | email = email.replace("<","") 155 | else: 156 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 157 | if len(x) == 1 : 158 | email = x[0]; 159 | email = email.strip().lower() 160 | email = email.replace("<","") 161 | 162 | date = None 163 | y = re.findall('\Date: .*, (.*)\n', hdr) 164 | if len(y) == 1 : 165 | tdate = y[0] 166 | tdate = tdate[:26] 167 | try: 168 | sent_at = parsemaildate(tdate) 169 | except: 170 | print(text) 171 | print("Parse fail",tdate) 172 | fail = fail + 1 173 | 
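# The fail counter tracks consecutive pages that could not be fetched or
# parsed; once it exceeds 5 the loop breaks, and it is reset to zero as
# soon as a message parses cleanly and is written to the database.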
if fail > 5 : break 174 | continue 175 | 176 | subject = None 177 | z = re.findall('\Subject: (.*)\n', hdr) 178 | if len(z) == 1 : subject = z[0].strip().lower(); 179 | 180 | # Reset the fail counter 181 | fail = 0 182 | print(" ",email,sent_at,subject) 183 | cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body) 184 | VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body)) 185 | if count % 50 == 0 : conn.commit() 186 | if count % 100 == 0 : time.sleep(1) 187 | 188 | conn.commit() 189 | cur.close() 190 | -------------------------------------------------------------------------------- /Week4-6/gmodel.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import re 4 | import zlib 5 | from datetime import datetime, timedelta 6 | 7 | # Not all systems have this 8 | try: 9 | import dateutil.parser as parser 10 | except: 11 | pass 12 | 13 | dnsmapping = dict() 14 | mapping = dict() 15 | 16 | def fixsender(sender,allsenders=None) : 17 | global dnsmapping 18 | global mapping 19 | if sender is None : return None 20 | sender = sender.strip().lower() 21 | sender = sender.replace('<','').replace('>','') 22 | 23 | # Check if we have a hacked gmane.org from address 24 | if allsenders is not None and sender.endswith('gmane.org') : 25 | pieces = sender.split('-') 26 | realsender = None 27 | for s in allsenders: 28 | if s.startswith(pieces[0]) : 29 | realsender = sender 30 | sender = s 31 | # print(realsender, sender) 32 | break 33 | if realsender is None : 34 | for s in mapping: 35 | if s.startswith(pieces[0]) : 36 | realsender = sender 37 | sender = mapping[s] 38 | # print(realsender, sender) 39 | break 40 | if realsender is None : sender = pieces[0] 41 | 42 | mpieces = sender.split("@") 43 | if len(mpieces) != 2 : return sender 44 | dns = mpieces[1] 45 | x = dns 46 | pieces = dns.split(".") 47 | if dns.endswith(".edu") or dns.endswith(".com") or dns.endswith(".org") or dns.endswith(".net") : 48 | dns = ".".join(pieces[-2:]) 49 | else: 50 | dns = ".".join(pieces[-3:]) 51 | # if dns != x : print(x,dns) 52 | # if dns != dnsmapping.get(dns,dns) : print(dns,dnsmapping.get(dns,dns)) 53 | dns = dnsmapping.get(dns,dns) 54 | return mpieces[0] + '@' + dns 55 | 56 | def parsemaildate(md) : 57 | # See if we have dateutil 58 | try: 59 | pdate = parser.parse(tdate) 60 | test_at = pdate.isoformat() 61 | return test_at 62 | except: 63 | pass 64 | 65 | # Non-dateutil version - we try our best 66 | 67 | pieces = md.split() 68 | notz = " ".join(pieces[:4]).strip() 69 | 70 | # Try a bunch of format variations - strptime() is *lame* 71 | dnotz = None 72 | for form in [ '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M:%S', 73 | '%d %b %Y %H:%M', '%d %b %Y %H:%M', '%d %b %y %H:%M:%S', 74 | '%d %b %y %H:%M:%S', '%d %b %y %H:%M', '%d %b %y %H:%M' ] : 75 | try: 76 | dnotz = datetime.strptime(notz, form) 77 | break 78 | except: 79 | continue 80 | 81 | if dnotz is None : 82 | # print('Bad Date:',md) 83 | return None 84 | 85 | iso = dnotz.isoformat() 86 | 87 | tz = "+0000" 88 | try: 89 | tz = pieces[4] 90 | ival = int(tz) # Only want numeric timezone values 91 | if tz == '-0000' : tz = '+0000' 92 | tzh = tz[:3] 93 | tzm = tz[3:] 94 | tz = tzh+":"+tzm 95 | except: 96 | pass 97 | 98 | return iso+tz 99 | 100 | # Parse out the info... 
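# parseheader() pulls the sender address, date, subject, and Message-ID (used
# as the guid) out of a raw mail header block with regular expressions,
# normalizes the sender through fixsender(), and returns None unless all four
# fields are present, so malformed messages are silently skipped.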
101 | def parseheader(hdr, allsenders=None): 102 | if hdr is None or len(hdr) < 1 : return None 103 | sender = None 104 | x = re.findall('\nFrom: .* <(\S+@\S+)>\n', hdr) 105 | if len(x) >= 1 : 106 | sender = x[0] 107 | else: 108 | x = re.findall('\nFrom: (\S+@\S+)\n', hdr) 109 | if len(x) >= 1 : 110 | sender = x[0] 111 | 112 | # normalize the domain name of Email addresses 113 | sender = fixsender(sender, allsenders) 114 | 115 | date = None 116 | y = re.findall('\nDate: .*, (.*)\n', hdr) 117 | sent_at = None 118 | if len(y) >= 1 : 119 | tdate = y[0] 120 | tdate = tdate[:26] 121 | try: 122 | sent_at = parsemaildate(tdate) 123 | except Exception as e: 124 | # print('Date ignored ',tdate, e) 125 | return None 126 | 127 | subject = None 128 | z = re.findall('\nSubject: (.*)\n', hdr) 129 | if len(z) >= 1 : subject = z[0].strip().lower() 130 | 131 | guid = None 132 | z = re.findall('\nMessage-ID: (.*)\n', hdr) 133 | if len(z) >= 1 : guid = z[0].strip().lower() 134 | 135 | if sender is None or sent_at is None or subject is None or guid is None : 136 | return None 137 | return (guid, sender, subject, sent_at) 138 | 139 | conn = sqlite3.connect('index.sqlite') 140 | cur = conn.cursor() 141 | 142 | cur.execute('''DROP TABLE IF EXISTS Messages ''') 143 | cur.execute('''DROP TABLE IF EXISTS Senders ''') 144 | cur.execute('''DROP TABLE IF EXISTS Subjects ''') 145 | cur.execute('''DROP TABLE IF EXISTS Replies ''') 146 | 147 | cur.execute('''CREATE TABLE IF NOT EXISTS Messages 148 | (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER, 149 | sender_id INTEGER, subject_id INTEGER, 150 | headers BLOB, body BLOB)''') 151 | cur.execute('''CREATE TABLE IF NOT EXISTS Senders 152 | (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''') 153 | cur.execute('''CREATE TABLE IF NOT EXISTS Subjects 154 | (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''') 155 | cur.execute('''CREATE TABLE IF NOT EXISTS Replies 156 | (from_id INTEGER, to_id INTEGER)''') 157 | 158 | conn_1 = sqlite3.connect('mapping.sqlite') 159 | cur_1 = conn_1.cursor() 160 | 161 | cur_1.execute('''SELECT old,new FROM DNSMapping''') 162 | for message_row in cur_1 : 163 | dnsmapping[message_row[0].strip().lower()] = message_row[1].strip().lower() 164 | 165 | mapping = dict() 166 | cur_1.execute('''SELECT old,new FROM Mapping''') 167 | for message_row in cur_1 : 168 | old = fixsender(message_row[0]) 169 | new = fixsender(message_row[1]) 170 | mapping[old] = fixsender(new) 171 | 172 | # Done with mapping.sqlite 173 | conn_1.close() 174 | 175 | # Open the main content (Read only) 176 | conn_1 = sqlite3.connect('file:content.sqlite?mode=ro', uri=True) 177 | cur_1 = conn_1.cursor() 178 | 179 | allsenders = list() 180 | cur_1.execute('''SELECT email FROM Messages''') 181 | for message_row in cur_1 : 182 | sender = fixsender(message_row[0]) 183 | if sender is None : continue 184 | if 'gmane.org' in sender : continue 185 | if sender in allsenders: continue 186 | allsenders.append(sender) 187 | 188 | print("Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping)) 189 | 190 | cur_1.execute('''SELECT headers, body, sent_at 191 | FROM Messages ORDER BY sent_at''') 192 | 193 | senders = dict() 194 | subjects = dict() 195 | guids = dict() 196 | 197 | count = 0 198 | 199 | for message_row in cur_1 : 200 | hdr = message_row[0] 201 | parsed = parseheader(hdr, allsenders) 202 | if parsed is None: continue 203 | (guid, sender, subject, sent_at) = parsed 204 | 205 | # Apply the sender mapping 206 | sender = mapping.get(sender,sender) 207 | 
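# Each sender and subject is interned once: the text is inserted into its
# lookup table with INSERT OR IGNORE, the integer id is fetched back and
# cached in a local dict, and the Messages row then stores only those ids
# plus zlib-compressed copies of the headers and body.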
208 | count = count + 1 209 | if count % 250 == 1 : print(count,sent_at, sender) 210 | # print(guid, sender, subject, sent_at) 211 | 212 | if 'gmane.org' in sender: 213 | print("Error in sender ===", sender) 214 | 215 | sender_id = senders.get(sender,None) 216 | subject_id = subjects.get(subject,None) 217 | guid_id = guids.get(guid,None) 218 | 219 | if sender_id is None : 220 | cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) ) 221 | conn.commit() 222 | cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, )) 223 | try: 224 | row = cur.fetchone() 225 | sender_id = row[0] 226 | senders[sender] = sender_id 227 | except: 228 | print('Could not retrieve sender id',sender) 229 | break 230 | if subject_id is None : 231 | cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) ) 232 | conn.commit() 233 | cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, )) 234 | try: 235 | row = cur.fetchone() 236 | subject_id = row[0] 237 | subjects[subject] = subject_id 238 | except: 239 | print('Could not retrieve subject id',subject) 240 | break 241 | # print(sender_id, subject_id) 242 | cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )', 243 | ( guid, sender_id, subject_id, sent_at, 244 | zlib.compress(message_row[0].encode()), zlib.compress(message_row[1].encode())) ) 245 | conn.commit() 246 | cur.execute('SELECT id FROM Messages WHERE guid=? LIMIT 1', ( guid, )) 247 | try: 248 | row = cur.fetchone() 249 | message_id = row[0] 250 | guids[guid] = message_id 251 | except: 252 | print('Could not retrieve guid id',guid) 253 | break 254 | 255 | cur.close() 256 | cur_1.close() 257 | -------------------------------------------------------------------------------- /Week4-6/gword.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 37 | -------------------------------------------------------------------------------- /Week4-6/gword.js: -------------------------------------------------------------------------------- 1 | gword = [{text: 'sakai', size: 100}, 2 | {text: 'error', size: 50}, 3 | {text: 'sakaiportallogin', size: 50}, 4 | {text: 'presense', size: 50}, 5 | {text: 'rantscomments', size: 40}, 6 | {text: 'lmsvle', size: 40}, 7 | {text: 'memory', size: 40}, 8 | {text: 'with', size: 40}, 9 | {text: 'examples', size: 40}, 10 | {text: 'provider', size: 40}, 11 | {text: 'developers', size: 30}, 12 | {text: 'zero', size: 20}, 13 | {text: 'documentation', size: 20}, 14 | {text: 'lone', size: 20}, 15 | {text: 'conference', size: 20}, 16 | {text: 'authentication', size: 20}, 17 | {text: 'ldap', size: 20}, 18 | {text: 'song', size: 20}, 19 | {text: 'into', size: 20}, 20 | {text: 'break', size: 20}, 21 | {text: 'from', size: 20}, 22 | {text: 'report', size: 20}, 23 | {text: 'austin', size: 20}, 24 | {text: 'block', size: 20}, 25 | {text: 'logo', size: 20}, 26 | {text: 'call', size: 20}, 27 | {text: 'participation', size: 20} 28 | ]; 29 | -------------------------------------------------------------------------------- /Week4-6/gword.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import zlib 4 | import string 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, subject FROM Subjects') 10 | subjects = dict() 11 | for message_row in cur : 12 | subjects[message_row[0]] = 
message_row[1] 13 | 14 | # cur.execute('SELECT id, guid,sender_id,subject_id,headers,body FROM Messages') 15 | cur.execute('SELECT subject_id FROM Messages') 16 | counts = dict() 17 | for message_row in cur : 18 | text = subjects[message_row[0]] 19 | text = text.translate(str.maketrans('','',string.punctuation)) 20 | text = text.translate(str.maketrans('','','1234567890')) 21 | text = text.strip() 22 | text = text.lower() 23 | words = text.split() 24 | for word in words: 25 | if len(word) < 4 : continue 26 | counts[word] = counts.get(word,0) + 1 27 | 28 | x = sorted(counts, key=counts.get, reverse=True) 29 | highest = None 30 | lowest = None 31 | for k in x[:100]: 32 | if highest is None or highest < counts[k] : 33 | highest = counts[k] 34 | if lowest is None or lowest > counts[k] : 35 | lowest = counts[k] 36 | print('Range of counts:',highest,lowest) 37 | 38 | # Spread the font sizes across 20-100 based on the count 39 | bigsize = 80 40 | smallsize = 20 41 | 42 | fhand = open('gword.js','w') 43 | fhand.write("gword = [") 44 | first = True 45 | for k in x[:100]: 46 | if not first : fhand.write( ",\n") 47 | first = False 48 | size = counts[k] 49 | size = (size - lowest) / float(highest - lowest) 50 | size = int((size * bigsize) + smallsize) 51 | fhand.write("{text: '"+k+"', size: "+str(size)+"}") 52 | fhand.write( "\n];\n") 53 | fhand.close() 54 | 55 | print("Output written to gword.js") 56 | print("Open gword.htm in a browser to see the visualization") 57 | -------------------------------------------------------------------------------- /Week4-6/gyear.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import time 3 | import urllib.request, urllib.parse, urllib.error 4 | import zlib 5 | 6 | conn = sqlite3.connect('index.sqlite') 7 | cur = conn.cursor() 8 | 9 | cur.execute('SELECT id, sender FROM Senders') 10 | senders = dict() 11 | for message_row in cur : 12 | senders[message_row[0]] = message_row[1] 13 | 14 | cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 15 | messages = dict() 16 | for message_row in cur : 17 | messages[message_row[0]] = (message_row[1],message_row[2],message_row[3],message_row[4]) 18 | 19 | print("Loaded messages=",len(messages),"senders=",len(senders)) 20 | 21 | sendorgs = dict() 22 | for (message_id, message) in list(messages.items()): 23 | sender = message[1] 24 | pieces = senders[sender].split("@") 25 | if len(pieces) != 2 : continue 26 | dns = pieces[1] 27 | sendorgs[dns] = sendorgs.get(dns,0) + 1 28 | 29 | # pick the top schools 30 | orgs = sorted(sendorgs, key=sendorgs.get, reverse=True) 31 | orgs = orgs[:10] 32 | print("Top 10 Organizations") 33 | print(orgs) 34 | # orgs = ['total'] + orgs 35 | 36 | counts = dict() 37 | months = list() 38 | # cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages') 39 | for (message_id, message) in list(messages.items()): 40 | sender = message[1] 41 | pieces = senders[sender].split("@") 42 | if len(pieces) != 2 : continue 43 | dns = pieces[1] 44 | if dns not in orgs : continue 45 | month = message[3][:4] 46 | if month not in months : months.append(month) 47 | key = (month, dns) 48 | counts[key] = counts.get(key,0) + 1 49 | tkey = (month, 'total') 50 | counts[tkey] = counts.get(tkey,0) + 1 51 | 52 | months.sort() 53 | # print counts 54 | # print months 55 | 56 | fhand = open('gline.js','w') 57 | fhand.write("gline = [ ['Year'") 58 | for org in orgs: 59 | fhand.write(",'"+org+"'") 60 | fhand.write("]") 61 | 62 | for month in
months[1:-1]: 63 | fhand.write(",\n['"+month+"'") 64 | for org in orgs: 65 | key = (month, org) 66 | val = counts.get(key,0) 67 | fhand.write(","+str(val)) 68 | fhand.write("]"); 69 | 70 | fhand.write("\n];\n") 71 | fhand.close() 72 | 73 | print("Output written to gline.js") 74 | print("Open gline.htm to visualize the data") 75 | 76 | -------------------------------------------------------------------------------- /Week4-6/index.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synsantss/Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python/1f9d632cf114f734c6e31fa9d3ef4c62ef69c1e1/Week4-6/index.sqlite -------------------------------------------------------------------------------- /Week4-6/mapping.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/synsantss/Capstone-Retrieving-Processing-and-Visualizing-Data-with-Python/1f9d632cf114f734c6e31fa9d3ef4c62ef69c1e1/Week4-6/mapping.sqlite --------------------------------------------------------------------------------