├── .gitignore ├── BeautifulSoup.py ├── README.md ├── app.yaml ├── browse.py ├── bs4 ├── __init__.py ├── builder │ ├── __init__.py │ ├── _html5lib.py │ ├── _htmlparser.py │ └── _lxml.py ├── dammit.py ├── diagnose.py ├── element.py ├── testing.py └── tests │ ├── __init__.py │ ├── test_builder_registry.py │ ├── test_docs.py │ ├── test_html5lib.py │ ├── test_htmlparser.py │ ├── test_lxml.py │ ├── test_soup.py │ └── test_tree.py ├── cssselect ├── __init__.py ├── parser.py ├── tests.py └── xpath.py ├── document.py ├── document_old.py ├── favicon.ico ├── goose ├── __init__.py ├── article.py ├── cleaners.py ├── configuration.py ├── crawler.py ├── extractors.py ├── images │ ├── __init__.py │ ├── extractors.py │ ├── image.py │ └── utils.py ├── network.py ├── outputformatters.py ├── parsers.py ├── resources │ ├── images │ │ └── known-image-css.txt │ └── text │ │ ├── stopwords-ar.txt │ │ ├── stopwords-da.txt │ │ ├── stopwords-de.txt │ │ ├── stopwords-en.txt │ │ ├── stopwords-es.txt │ │ ├── stopwords-fi.txt │ │ ├── stopwords-fr.txt │ │ ├── stopwords-hu.txt │ │ ├── stopwords-id.txt │ │ ├── stopwords-it.txt │ │ ├── stopwords-ko.txt │ │ ├── stopwords-nb.txt │ │ ├── stopwords-nl.txt │ │ ├── stopwords-no.txt │ │ ├── stopwords-pl.txt │ │ ├── stopwords-pt.txt │ │ ├── stopwords-ru.txt │ │ ├── stopwords-sv.txt │ │ └── stopwords-zh.txt ├── text.py ├── utils │ ├── __init__.py │ └── encoding.py ├── version.py └── videos │ ├── __init__.py │ ├── extractors.py │ └── videos.py ├── html2text.py ├── httplib2 ├── __init__.py ├── cacerts.txt ├── iri2uri.py ├── socks.py └── test │ ├── __init__.py │ ├── brokensocket │ └── socket.py │ ├── functional │ └── test_proxies.py │ ├── miniserver.py │ ├── other_cacerts.txt │ ├── smoke_test.py │ └── test_no_socket.py ├── index.yaml ├── instructions.html ├── main.py ├── page.html ├── parse_command.py ├── pybing ├── __init__.py ├── bing.py ├── constants.py ├── query │ ├── __init__.py │ ├── mixin.py │ ├── pagable.py │ ├── query.py │ └── web.py ├── result.py └── 
resultset.py ├── requests ├── __init__.py ├── adapters.py ├── api.py ├── auth.py ├── cacert.pem ├── certs.py ├── compat.py ├── cookies.py ├── exceptions.py ├── hooks.py ├── models.py ├── packages │ ├── __init__.py │ ├── chardet │ │ ├── __init__.py │ │ ├── big5freq.py │ │ ├── big5prober.py │ │ ├── chardetect.py │ │ ├── chardistribution.py │ │ ├── charsetgroupprober.py │ │ ├── charsetprober.py │ │ ├── codingstatemachine.py │ │ ├── compat.py │ │ ├── constants.py │ │ ├── cp949prober.py │ │ ├── escprober.py │ │ ├── escsm.py │ │ ├── eucjpprober.py │ │ ├── euckrfreq.py │ │ ├── euckrprober.py │ │ ├── euctwfreq.py │ │ ├── euctwprober.py │ │ ├── gb2312freq.py │ │ ├── gb2312prober.py │ │ ├── hebrewprober.py │ │ ├── jisfreq.py │ │ ├── jpcntx.py │ │ ├── langbulgarianmodel.py │ │ ├── langcyrillicmodel.py │ │ ├── langgreekmodel.py │ │ ├── langhebrewmodel.py │ │ ├── langhungarianmodel.py │ │ ├── langthaimodel.py │ │ ├── latin1prober.py │ │ ├── mbcharsetprober.py │ │ ├── mbcsgroupprober.py │ │ ├── mbcssm.py │ │ ├── sbcharsetprober.py │ │ ├── sbcsgroupprober.py │ │ ├── sjisprober.py │ │ ├── universaldetector.py │ │ └── utf8prober.py │ └── urllib3 │ │ ├── __init__.py │ │ ├── _collections.py │ │ ├── connection.py │ │ ├── connectionpool.py │ │ ├── contrib │ │ ├── __init__.py │ │ ├── ntlmpool.py │ │ └── pyopenssl.py │ │ ├── exceptions.py │ │ ├── fields.py │ │ ├── filepost.py │ │ ├── packages │ │ ├── __init__.py │ │ ├── ordered_dict.py │ │ ├── six.py │ │ └── ssl_match_hostname │ │ │ ├── __init__.py │ │ │ └── _implementation.py │ │ ├── poolmanager.py │ │ ├── request.py │ │ ├── response.py │ │ └── util │ │ ├── __init__.py │ │ ├── connection.py │ │ ├── request.py │ │ ├── response.py │ │ ├── retry.py │ │ ├── ssl_.py │ │ ├── timeout.py │ │ └── url.py ├── sessions.py ├── status_codes.py ├── structures.py └── utils.py ├── search.py ├── txtfy.py └── wise.py /.gitignore: -------------------------------------------------------------------------------- 1 | secrets.py 2 | 3 | # Byte-compiled / 
optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | 45 | # Translations 46 | *.mo 47 | *.pot 48 | 49 | # Django stuff: 50 | *.log 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | # PyBuilder 56 | target/ 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Astrobot 2 | 3 | `Astrobot` is an SMS service that allows you to (sort of) browse the web entirely over SMS. Here's what it looks like: 4 | 5 | 1. You: help 6 | 2. Astro-bot:"Go to wsj.com", "search wikipedia for Azerbaijan". Text the # of a link to click it. 7 | 3. You: hackerschool.com 8 | 4. Astro-bot: Hacker School is a retreat 4 programmers. Want 2 spend three months coding & learning 9 | in New York w/ smart, friendly, & intellectually curious people? [Rea 10 | 5. You: more 11 | 6. Astro-bot:d on](1) . [APPLY NOW](2) New blog post: [Paper of the Week: Error Detecting & 12 | Error Correcting Codes](3) Hacker School is… Self-directed We value in 13 | 7. You: 3 more 14 | 8. Astro-bot:trinsic motivation & self-direction, & believe people learn best when they’re free 2 15 | explore their passions & interests. Accessible We’re free 4 everyone. 16 | 9. 
Astro-bot:We also offer need-based grants 4 living expenses 4 people from traditionally 17 | underrepresented groups in programming. Stimulating We look 4 smart, friendly, 18 | 10. Astro-bot: self-directed, & intellectually curious people who enjoy programming & want 2 get 19 | dramatically better. Exceptional We have exceptional residents including 20 | 11. You: click 1 21 | 12. Astro-bot: We review applications on a rolling basis. u should receive a confirmation email 22 | immediately after u apply, & we will review ur application within three weeks 23 | 13. You: 2 more 24 | 14. Astro-bot:. In general, clear & concise answers are better than long ones. Most answers have a 25 | 1500 character max length, but u don’t need 2 write that much. We strongl 26 | 15. Astro-bot:y encourage u 2 read our [about page](1) , [FAQ](2) , & [User’s Manual](3) b4 applying. 27 | Choose ur batch If you’re admitted but ur preferred batch is full, 28 | 16. You: click link 2 29 | 17. Astro-bot:What's that? I don't understand. Say 'help me' for help. 30 | 18. You: click 2 31 | 19. Astro-bot: Welcome 2 an unusual experiment Hacker School is unlike the rest of the world. This 32 | guide is designed 2 help u get settled in & get the most out of ur batch. 33 | 20. You: 3 more 34 | 21. Astro-bot: 1 of the things that makes Hacker School different is that it's largely self-directed . This 35 | means u won't have someone telling u what 2 do, learn, etc, while 36 | 22. Astro-bot: you're here (though we do have a few [social rules](1) ). This self-directedness is baked 37 | into the core structure of Hacker School, & is why we don't have grad 38 | 23. Astro-bot:es, exams, curricula, or even classes. It comes from our belief that people learn best when 39 | given the freedom 2 explore what most interests them. This doesn't 40 | 41 | 42 | `Astrobot` is a Google Appengine app that supports responding to incoming messages from Twilio. 
You can use a live version at [astro-bot.appspot.com](http://astro-bot.appspot.com) or by texting _646-576-7688_ . 43 | 44 | ## Running your own 45 | Download the Google Appengine launcher and just drag the repository folder into it. 46 | 47 | You've also got to **create a file called secrets.py**, which includes a variable `BING_API_KEY`, holding your Bing search API key (*not* a Simple Search API key—the full API key). You can get these for free. You'll need it for web search — otherwise, leave secrets.py empty and it'll all work *except* web search. 48 | 49 | -------------------------------------------------------------------------------- /app.yaml: -------------------------------------------------------------------------------- 1 | application: astro-bot 2 | version: 1 3 | runtime: python27 4 | api_version: 1 5 | threadsafe: yes 6 | 7 | handlers: 8 | - url: /favicon\.ico 9 | static_files: favicon.ico 10 | upload: favicon\.ico 11 | 12 | - url: .* 13 | script: main.app 14 | 15 | libraries: 16 | - name: webapp2 17 | version: "2.5.2" 18 | - name: lxml 19 | version: "latest" 20 | - name: PIL 21 | version: "latest" 22 | -------------------------------------------------------------------------------- /browse.py: -------------------------------------------------------------------------------- 1 | from wise import Phrase, parse_phrase 2 | import parse_command 3 | import document 4 | import urllib 5 | import search 6 | 7 | def interact(query, state): 8 | # query: String, state: Dictionary 9 | parsed = parse_command.parse_command(query) 10 | print parsed 11 | if 'BrowserState' in state: 12 | bstate = state['BrowserState'] 13 | else: 14 | state['BrowserState'] = document.BrowserState() 15 | bstate = state['BrowserState'] 16 | 17 | bstate.clean_up() 18 | 19 | if parsed.intent == 'url': 20 | bstate.navigate_to_url(parsed.get("*url", None)) 21 | return bstate.get_n_messages(1) 22 | elif parsed.intent in ('more_text', 'previous_text'): 23 | return bstate.get_n_messages(min(7, 
int(parsed.get('*number', '1'))), backwards=(parsed.intent=='previous_text')) 24 | elif parsed.intent == 'back_to_top': 25 | bstate.frame_stack[-1].offset = 0 26 | return bstate.get_n_messages(1) 27 | elif parsed.intent == 'navigate' and parsed.get('*number', None): 28 | if parsed.get('on_last_page', False): 29 | bstate.back() 30 | url = bstate.frame_stack[-1].document.links[int(parsed.get('*number', '0'))-1] 31 | bstate.navigate_to_url(url) 32 | return bstate.get_n_messages(1) 33 | elif parsed.intent == 'help': 34 | bstate.navigate_to_url('http://astro-bot.appspot.com/instructions') 35 | return bstate.get_n_messages(1) 36 | # return ['Try these: "Go to hackerschool.com", "search wikipedia for Azerbaijan". On a web page, type "2 more" to see more or text the # of a link to click it.'] 37 | elif parsed.intent == 'back': 38 | bstate.back() 39 | return bstate.resend_current_place() 40 | elif parsed.intent == 'contents': 41 | bstate.go_to_contents() 42 | return bstate.get_n_messages(1) 43 | elif parsed.intent == 'search': 44 | query = parsed.get("~query", "") 45 | if parsed.get('search_source/wikipedia', False): 46 | url = "http://en.wikipedia.org/w/index.php?search=" + urllib.quote_plus(query) 47 | bstate.navigate_to_url(url) 48 | return bstate.get_n_messages(1) 49 | else: 50 | bstate.frame_stack.append(document.Frame(search.document_from_query(query))) 51 | return bstate.get_n_messages(1) 52 | elif parsed.intent == 'whereami': 53 | if bstate.frame_stack == []: 54 | return ["You haven't loaded any page yet."] 55 | else: 56 | url = bstate.frame_stack[-1].document.url 57 | url_string = " ({0})".format(url) if url else "" 58 | return [u'You\'re reading "{0}"{1}'.format(bstate.frame_stack[-1].document.title, url_string)] 59 | else: 60 | return ["What's that? I don't understand. 
Say 'help me' for help."] 61 | -------------------------------------------------------------------------------- /bs4/tests/__init__.py: -------------------------------------------------------------------------------- 1 | "The beautifulsoup tests." 2 | -------------------------------------------------------------------------------- /bs4/tests/test_docs.py: -------------------------------------------------------------------------------- 1 | "Test harness for doctests." 2 | 3 | # pylint: disable-msg=E0611,W0142 4 | 5 | __metaclass__ = type 6 | __all__ = [ 7 | 'additional_tests', 8 | ] 9 | 10 | import atexit 11 | import doctest 12 | import os 13 | #from pkg_resources import ( 14 | # resource_filename, resource_exists, resource_listdir, cleanup_resources) 15 | import unittest 16 | 17 | DOCTEST_FLAGS = ( 18 | doctest.ELLIPSIS | 19 | doctest.NORMALIZE_WHITESPACE | 20 | doctest.REPORT_NDIFF) 21 | 22 | 23 | # def additional_tests(): 24 | # "Run the doc tests (README.txt and docs/*, if any exist)" 25 | # doctest_files = [ 26 | # os.path.abspath(resource_filename('bs4', 'README.txt'))] 27 | # if resource_exists('bs4', 'docs'): 28 | # for name in resource_listdir('bs4', 'docs'): 29 | # if name.endswith('.txt'): 30 | # doctest_files.append( 31 | # os.path.abspath( 32 | # resource_filename('bs4', 'docs/%s' % name))) 33 | # kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) 34 | # atexit.register(cleanup_resources) 35 | # return unittest.TestSuite(( 36 | # doctest.DocFileSuite(*doctest_files, **kwargs))) 37 | -------------------------------------------------------------------------------- /bs4/tests/test_html5lib.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" 2 | 3 | import warnings 4 | 5 | try: 6 | from bs4.builder import HTML5TreeBuilder 7 | HTML5LIB_PRESENT = True 8 | except ImportError, e: 9 | HTML5LIB_PRESENT = False 10 | from bs4.element import 
SoupStrainer 11 | from bs4.testing import ( 12 | HTML5TreeBuilderSmokeTest, 13 | SoupTest, 14 | skipIf, 15 | ) 16 | 17 | @skipIf( 18 | not HTML5LIB_PRESENT, 19 | "html5lib seems not to be present, not testing its tree builder.") 20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): 21 | """See ``HTML5TreeBuilderSmokeTest``.""" 22 | 23 | @property 24 | def default_builder(self): 25 | return HTML5TreeBuilder() 26 | 27 | def test_soupstrainer(self): 28 | # The html5lib tree builder does not support SoupStrainers. 29 | strainer = SoupStrainer("b") 30 | markup = "

A bold statement.

" 31 | with warnings.catch_warnings(record=True) as w: 32 | soup = self.soup(markup, parse_only=strainer) 33 | self.assertEqual( 34 | soup.decode(), self.document_for(markup)) 35 | 36 | self.assertTrue( 37 | "the html5lib tree builder doesn't support parse_only" in 38 | str(w[0].message)) 39 | 40 | def test_correctly_nested_tables(self): 41 | """html5lib inserts tags where other parsers don't.""" 42 | markup = ('' 43 | '' 44 | "') 48 | 49 | self.assertSoupEquals( 50 | markup, 51 | '
Here's another table:" 45 | '' 46 | '' 47 | '
foo
Here\'s another table:' 52 | '
foo
' 53 | '
') 54 | 55 | self.assertSoupEquals( 56 | "" 57 | "" 58 | "
Foo
Bar
Baz
") 59 | 60 | def test_xml_declaration_followed_by_doctype(self): 61 | markup = ''' 62 | 63 | 64 | 65 | 66 | 67 |

foo

68 | 69 | ''' 70 | soup = self.soup(markup) 71 | # Verify that we can reach the

tag; this means the tree is connected. 72 | self.assertEqual(b"

foo

", soup.p.encode()) 73 | 74 | def test_reparented_markup(self): 75 | markup = '

foo

\n

bar

' 76 | soup = self.soup(markup) 77 | self.assertEqual(u"

foo

\n

bar

", soup.body.decode()) 78 | self.assertEqual(2, len(soup.find_all('p'))) 79 | 80 | 81 | def test_reparented_markup_ends_with_whitespace(self): 82 | markup = '

foo

\n

bar

\n' 83 | soup = self.soup(markup) 84 | self.assertEqual(u"

foo

\n

bar

\n", soup.body.decode()) 85 | self.assertEqual(2, len(soup.find_all('p'))) 86 | -------------------------------------------------------------------------------- /bs4/tests/test_htmlparser.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the html.parser tree builder generates good 2 | trees.""" 3 | 4 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest 5 | from bs4.builder import HTMLParserTreeBuilder 6 | 7 | class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): 8 | 9 | @property 10 | def default_builder(self): 11 | return HTMLParserTreeBuilder() 12 | 13 | def test_namespaced_system_doctype(self): 14 | # html.parser can't handle namespaced doctypes, so skip this one. 15 | pass 16 | 17 | def test_namespaced_public_doctype(self): 18 | # html.parser can't handle namespaced doctypes, so skip this one. 19 | pass 20 | -------------------------------------------------------------------------------- /bs4/tests/test_lxml.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure that the lxml tree builder generates good trees.""" 2 | 3 | import re 4 | import warnings 5 | 6 | try: 7 | import lxml.etree 8 | LXML_PRESENT = True 9 | LXML_VERSION = lxml.etree.LXML_VERSION 10 | except ImportError, e: 11 | LXML_PRESENT = False 12 | LXML_VERSION = (0,) 13 | 14 | if LXML_PRESENT: 15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML 16 | 17 | from bs4 import ( 18 | BeautifulSoup, 19 | BeautifulStoneSoup, 20 | ) 21 | from bs4.element import Comment, Doctype, SoupStrainer 22 | from bs4.testing import skipIf 23 | from bs4.tests import test_htmlparser 24 | from bs4.testing import ( 25 | HTMLTreeBuilderSmokeTest, 26 | XMLTreeBuilderSmokeTest, 27 | SoupTest, 28 | skipIf, 29 | ) 30 | 31 | @skipIf( 32 | not LXML_PRESENT, 33 | "lxml seems not to be present, not testing its tree builder.") 34 | class LXMLTreeBuilderSmokeTest(SoupTest, 
HTMLTreeBuilderSmokeTest): 35 | """See ``HTMLTreeBuilderSmokeTest``.""" 36 | 37 | @property 38 | def default_builder(self): 39 | return LXMLTreeBuilder() 40 | 41 | def test_out_of_range_entity(self): 42 | self.assertSoupEquals( 43 | "

foo�bar

", "

foobar

") 44 | self.assertSoupEquals( 45 | "

foo�bar

", "

foobar

") 46 | self.assertSoupEquals( 47 | "

foo�bar

", "

foobar

") 48 | 49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this 50 | # test if an old version of lxml is installed. 51 | 52 | @skipIf( 53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0), 54 | "Skipping doctype test for old version of lxml to avoid segfault.") 55 | def test_empty_doctype(self): 56 | soup = self.soup("") 57 | doctype = soup.contents[0] 58 | self.assertEqual("", doctype.strip()) 59 | 60 | def test_beautifulstonesoup_is_xml_parser(self): 61 | # Make sure that the deprecated BSS class uses an xml builder 62 | # if one is installed. 63 | with warnings.catch_warnings(record=True) as w: 64 | soup = BeautifulStoneSoup("") 65 | self.assertEqual(u"", unicode(soup.b)) 66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) 67 | 68 | def test_real_xhtml_document(self): 69 | """lxml strips the XML definition from an XHTML doc, which is fine.""" 70 | markup = b""" 71 | 72 | 73 | Hello. 74 | Goodbye. 75 | """ 76 | soup = self.soup(markup) 77 | self.assertEqual( 78 | soup.encode("utf-8").replace(b"\n", b''), 79 | markup.replace(b'\n', b'').replace( 80 | b'', b'')) 81 | 82 | 83 | @skipIf( 84 | not LXML_PRESENT, 85 | "lxml seems not to be present, not testing its XML tree builder.") 86 | class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): 87 | """See ``HTMLTreeBuilderSmokeTest``.""" 88 | 89 | @property 90 | def default_builder(self): 91 | return LXMLTreeBuilderForXML() 92 | -------------------------------------------------------------------------------- /cssselect/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | """ 3 | CSS Selectors based on XPath 4 | ============================ 5 | 6 | This module supports selecting XML/HTML elements based on CSS selectors. 7 | See the `CSSSelector` class for details. 8 | 9 | 10 | :copyright: (c) 2007-2012 Ian Bicking and contributors. 11 | See AUTHORS for more details. 
12 | :license: BSD, see LICENSE for more details. 13 | 14 | """ 15 | 16 | from cssselect.parser import (parse, Selector, FunctionalPseudoElement, 17 | SelectorError, SelectorSyntaxError) 18 | from cssselect.xpath import GenericTranslator, HTMLTranslator, ExpressionError 19 | 20 | 21 | VERSION = '0.9.1' 22 | __version__ = VERSION 23 | -------------------------------------------------------------------------------- /document.py: -------------------------------------------------------------------------------- 1 | import urllib, urllib2 2 | import bs4 3 | from txtfy import txtfy 4 | from html2text import html2doc 5 | import urlparse 6 | 7 | SMS_LEN = 160 8 | 9 | def normalize_url(url): 10 | scheme = url.split("://")[0] 11 | if scheme not in ['http', 'https']: 12 | url = 'http://' + url 13 | return url 14 | 15 | # opera mini for dumbphones: 16 | USER_AGENT = "Opera/9.80 (J2ME/MIDP; Opera Mini/4.2.13337/34.818; U; en) Presto/2.8.119 Version/11.10" 17 | 18 | def get_content(url): 19 | opener = urllib2.build_opener() 20 | opener.addheaders = [('User-agent', USER_AGENT)] 21 | html = opener.open(url).read() 22 | title = bs4.BeautifulSoup(html).title.get_text() 23 | return (html, title) 24 | 25 | def get_content_ip(url): 26 | url = "http://instapaper.com/m?u=" + urllib.quote_plus(url) 27 | html = urllib2.urlopen(url).read() 28 | soup = bs4.BeautifulSoup(html) 29 | story = soup.find(id='story') 30 | return unicode(story), soup.title.get_text() 31 | 32 | NO_URL = "" 33 | 34 | class Document(object): 35 | url = None 36 | def __init__(self, url=None, html=None): 37 | if url: 38 | url = normalize_url(url) 39 | html, self.title = get_content(url) 40 | self.url = url 41 | if not isinstance(html, unicode): 42 | html = html.decode('utf-8') 43 | self.text, self.links, self.headers = html2doc(html, baseurl = url if url else "") 44 | print "HEADERS", self.headers 45 | 46 | class Frame(object): 47 | def __init__(self, doc): 48 | self.document = doc 49 | self.offset = 0 50 | 51 | class 
BrowserState(object): 52 | def __init__(self): 53 | self.frame_stack = [] 54 | 55 | def clean_up(self): 56 | while len(self.frame_stack) > 5: 57 | self.frame_stack = self.frame_stack[1:] 58 | 59 | def navigate_to_url(self, url): 60 | parsed = urlparse.urlparse(url) 61 | if parsed.scheme == 'go-to-offset': 62 | self.back() 63 | self.frame_stack[-1].offset = int(parsed.netloc) 64 | else: 65 | self.frame_stack.append(Frame(Document(url))) 66 | 67 | def back(self): 68 | if len(self.frame_stack): 69 | self.frame_stack = self.frame_stack[:-1] 70 | 71 | def resend_current_place(self): 72 | self.frame_stack[-1].offset = max(0, self.frame_stack[-1].offset - SMS_LEN) 73 | return self.get_n_messages(1) 74 | 75 | def go_to_contents(self): 76 | current_page_title = self.frame_stack[-1].document.title 77 | html = "Headings on {0}".format(current_page_title) + u"
".join([u"{1}".format(offset, heading) for heading, offset in self.frame_stack[-1].document.headers]) 78 | doc = Document(html = html) 79 | self.frame_stack.append(Frame(doc)) 80 | 81 | def get_n_messages(self, n, backwards=False): 82 | if backwards: 83 | self.frame_stack[-1].offset = max(0, self.frame_stack[-1].offset - 160) 84 | 85 | if not backwards and self.frame_stack[-1].offset >= len(self.frame_stack[-1].document.text): 86 | return [""] 87 | else: 88 | messages = [] 89 | for i in xrange(n): 90 | start_offset = self.frame_stack[-1].offset 91 | if backwards: 92 | start_offset = max(0, start_offset-160) 93 | end_offset = min(len(self.frame_stack[-1].document.text), start_offset + SMS_LEN) 94 | if end_offset - start_offset == 0: 95 | break 96 | messages.append(self.frame_stack[-1].document.text[start_offset : end_offset]) 97 | self.frame_stack[-1].offset = start_offset if backwards else end_offset 98 | if self.frame_stack[-1].offset == 0: 99 | break 100 | if backwards: 101 | self.frame_stack[-1].offset = min(len(self.frame_stack[-1].document.text), self.frame_stack[-1].offset + 160) 102 | return messages 103 | 104 | -------------------------------------------------------------------------------- /document_old.py: -------------------------------------------------------------------------------- 1 | import urllib, urllib2 2 | import bs4 3 | from txtfy import txtfy 4 | 5 | SMS_LEN = 160 6 | 7 | def normalize_url(url): 8 | scheme = url.split("://")[0] 9 | if scheme not in ['http', 'https']: 10 | url = 'http://' + url 11 | return url 12 | 13 | def get_content(url): 14 | html = urllib2.urlopen(url).read() 15 | title = bs4.BeautifulSoup(html).title.get_text() 16 | return (html, title) 17 | 18 | def get_content_ip(url): 19 | url = "http://instapaper.com/m?u=" + urllib.quote_plus(url) 20 | html = urllib2.urlopen(url).read() 21 | soup = bs4.BeautifulSoup(html) 22 | story = soup.find(id='story') 23 | return unicode(story), soup.title.get_text() 24 | 25 | NO_URL = "" 26 | 27 
| class Document(object): 28 | def __init__(self, url=None, html=None): 29 | self.url = url 30 | if url: 31 | html, self.title = get_content(normalize_url(url)) 32 | soup = bs4.BeautifulSoup(html) 33 | self.text = u"" 34 | self.links = [] 35 | self.headers = [] 36 | ignore_tags = set(['head', 'script', 'style']) 37 | def break_line(): 38 | if len(self.text) > 0 and self.text[-1] != '\n': 39 | self.text += '\n' 40 | def break_word(): 41 | if len(self.text) > 0 and self.text[-1] not in " \n": 42 | self.text += " " 43 | def emit_text(t): 44 | break_word() 45 | self.text += txtfy(t) 46 | def traverse(tag): 47 | if tag.name == 'a' and tag.has_attr('href'): 48 | self.links.append(tag['href']) 49 | emit_text(u'[{0}]({1}) '.format(tag.get_text(), len(self.links))) 50 | elif tag.name in ['h1', 'h2', 'h3', 'h4']: 51 | break_line() 52 | self.headers.append((tag.get_text(), len(self.text))) 53 | process_contents(tag) 54 | break_line() 55 | elif tag.name in ['li', 'p']: 56 | break_line() 57 | process_contents(tag) 58 | break_line() 59 | else: 60 | process_contents(tag) 61 | def process_contents(tag): 62 | for child in tag.contents: 63 | if isinstance(child, bs4.NavigableString): 64 | emit_text(unicode(child)) 65 | elif hasattr(child, 'name'): 66 | traverse(child) 67 | traverse(soup) 68 | break_line() 69 | emit_text("") 70 | 71 | class Frame(object): 72 | def __init__(self, doc): 73 | self.document = doc 74 | self.offset = 0 75 | 76 | class BrowserState(object): 77 | def __init__(self): 78 | self.frame_stack = [] 79 | 80 | def clean_up(self): 81 | while len(self.frame_stack) > 5: 82 | self.frame_stack = self.frame_stack[1:] 83 | 84 | def navigate_to_url(self, url): 85 | self.frame_stack.append(Frame(Document(url))) 86 | 87 | def back(self): 88 | if len(self.frame_stack): 89 | self.frame_stack = self.frame_stack[:-1] 90 | 91 | def resend_current_place(self): 92 | self.frame_stack[-1].offset = max(0, self.frame_stack[-1].offset - SMS_LEN) 93 | return self.get_n_messages(1) 94 | 95 
| def get_n_messages(self, n): 96 | if self.frame_stack[-1].offset >= len(self.frame_stack[-1].document.text): 97 | return [""] 98 | else: 99 | messages = [] 100 | for i in xrange(n): 101 | start_offset = self.frame_stack[-1].offset 102 | end_offset = min(len(self.frame_stack[-1].document.text), start_offset + SMS_LEN) 103 | if end_offset - start_offset == 0: 104 | break 105 | messages.append(self.frame_stack[-1].document.text[start_offset : end_offset]) 106 | self.frame_stack[-1].offset = end_offset 107 | return messages 108 | 109 | -------------------------------------------------------------------------------- /favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/favicon.ico -------------------------------------------------------------------------------- /goose/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """\ 3 | This is a python port of "Goose" orignialy licensed to Gravity.com 4 | under one or more contributor license agreements. See the NOTICE file 5 | distributed with this work for additional information 6 | regarding copyright ownership. 7 | 8 | Python port was written by Xavier Grangier for Recrutae 9 | 10 | Gravity.com licenses this file 11 | to you under the Apache License, Version 2.0 (the "License"); 12 | you may not use this file except in compliance 13 | with the License. You may obtain a copy of the License at 14 | 15 | http://www.apache.org/licenses/LICENSE-2.0 16 | 17 | Unless required by applicable law or agreed to in writing, software 18 | distributed under the License is distributed on an "AS IS" BASIS, 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | See the License for the specific language governing permissions and 21 | limitations under the License. 
# 22 | """   (end of the licence docstring that opens earlier in this dump)
import os
import platform
from tempfile import mkstemp

from goose.version import version_info, __version__
from goose.configuration import Configuration
from goose.crawler import CrawlCandidate
from goose.crawler import Crawler


class Goose(object):
    """\
    Entry point of the extractor: holds a Configuration and turns a URL
    or a raw HTML string into an article through a Crawler.
    """
    def __init__(self, config=None):
        """\
        @param config a Configuration instance, a dict of Configuration
        attribute overrides, or None to use the defaults
        """
        self.config = config or Configuration()
        self.extend_config()
        self.initialize()

    def extend_config(self):
        """\
        When the config was given as a dict, replace it by a default
        Configuration on which every key that names an existing
        attribute has been applied. Unknown keys are ignored.
        """
        if isinstance(self.config, dict):
            config = Configuration()
            for k, v in self.config.items():
                if hasattr(config, k):
                    setattr(config, k, v)
            self.config = config

    def extract(self, url=None, raw_html=None):
        """\
        Main method to extract an article object from a URL,
        pass in a url and get back a Article
        Both arguments are handed unchanged to CrawlCandidate.
        """
        cc = CrawlCandidate(self.config, url, raw_html)
        return self.crawl(cc)

    def shutdown_network(self):
        """Do nothing; kept as a no-op hook."""
        pass

    def crawl(self, crawl_candiate):
        """Run a new Crawler on the candidate and return its result."""
        crawler = Crawler(self.config)
        article = crawler.crawl(crawl_candiate)
        return article

    def initialize(self):
        """\
        Make sure the local storage directory exists and is writable.
        Nothing is checked when no storage path is configured or when
        image fetching is disabled.
        Raises Exception when the directory is missing or not writable.
        """
        storage_path = self.config.local_storage_path

        # we don't need to go further if image extractor or
        # local_storage is not set
        if not storage_path or not self.config.enable_image_fetching:
            return

        # create the directory when it is missing
        if not os.path.isdir(storage_path):
            try:
                os.makedirs(storage_path)
            except OSError:
                # somebody else may have created it between the check
                # and the call; only a directory that is still missing
                # is a real failure
                if not os.path.isdir(storage_path):
                    raise

        if not os.path.isdir(storage_path):
            raise Exception(storage_path +
                " directory does not seem to exist, "
                "you need to set this for image processing downloads"
            )

        # write a dummy file to the directory to check that it is
        # writable; mkstemp itself is what fails on a read-only
        # directory, so it has to sit inside the try block, and both
        # IOError and OSError have to be caught
        try:
            level, path = mkstemp(dir=storage_path)
            os.close(level)
            os.remove(path)
        except (IOError, OSError):
            raise Exception(storage_path +
                " directory is not writeble, "
                "you need to set this for image processing downloads"
            )
# --------------------------------------------------------------------------------
# /goose/article.py:
# --------------------------------------------------------------------------------
# 1 | # -*- coding: utf-8 -*-
# 2 | """\
# 3 | This is a python port of "Goose" orignialy licensed to Gravity.com
# 4 | under one or more contributor license agreements. See the NOTICE file
# 5 | distributed with this work for additional information
# 6 | regarding copyright ownership.
# 7 |
# 8 | Python port was written by Xavier Grangier for Recrutae
# 9 |
# 10 | Gravity.com licenses this file
# 11 | to you under the Apache License, Version 2.0 (the "License");
# 12 | you may not use this file except in compliance
# 13 | with the License. You may obtain a copy of the License at
# 14 |
# 15 | http://www.apache.org/licenses/LICENSE-2.0
# 16 |
# 17 | Unless required by applicable law or agreed to in writing, software
# 18 | distributed under the License is distributed on an "AS IS" BASIS,
# 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 20 | See the License for the specific language governing permissions and
# 21 | limitations under the License.
# 22 | """


class Article(object):
    """\
    Plain container for everything extracted from one page.
    Every field starts out empty and is filled in by other code.
    """

    def __init__(self):
        # title of the article
        self.title = None

        # the article text stripped of html and formatting:
        # raw text with paragraphs separated by newlines.
        # This is probably what you want to use.
        self.cleaned_text = u""

        # meta description field in HTML source
        self.meta_description = u""

        # meta lang field in HTML source
        self.meta_lang = u""

        # meta favicon field in HTML source
        self.meta_favicon = u""

        # meta keywords field in the HTML source
        self.meta_keywords = u""

        # the canonical link of this article if found in the meta data
        self.canonical_link = u""

        # the domain of the article being parsed
        self.domain = u""

        # the top Element we think
        # is a candidate for the main body of the article
        self.top_node = None

        # the top Image object that
        # we think represents this article
        self.top_image = None

        # a set of tags that may have
        # been in the article, these are not meta keywords
        self.tags = set()

        # a list of any movies
        # we found on the page like youtube, vimeo
        self.movies = []

        # the final URL that we're going to try
        # and fetch content against, this would be expanded if any
        self.final_url = u""

        # the MD5 hash of the url
        # to use for various identification tasks
        self.link_hash = ""

        # the RAW HTML
        # straight from the network connection
        self.raw_html = u""

        # the lxml Document object
        self.doc = None

        # the document built from the original HTML without any
        # cleaning done on it (the original comment, carried over
        # from the Java version, calls it a JSoup document)
        self.raw_doc = None

        # Sometimes useful to try and know when
        # the publish date of an article was
        self.publish_date = None

        # A property bucket for consumers of goose to store custom data extractions.
        self.additional_data = {}
# --------------------------------------------------------------------------------
# /goose/configuration.py:
# --------------------------------------------------------------------------------
# 1 | # -*- coding: utf-8 -*-
# 2 | """\
# 3 | This is a python port of "Goose" orignialy licensed to Gravity.com
# 4 | under one or more contributor license agreements. See the NOTICE file
# 5 | distributed with this work for additional information
# 6 | regarding copyright ownership.
# 7 |
# 8 | Python port was written by Xavier Grangier for Recrutae
# 9 |
# 10 | Gravity.com licenses this file
# 11 | to you under the Apache License, Version 2.0 (the "License");
# 12 | you may not use this file except in compliance
# 13 | with the License. You may obtain a copy of the License at
# 14 |
# 15 | http://www.apache.org/licenses/LICENSE-2.0
# 16 |
# 17 | Unless required by applicable law or agreed to in writing, software
# 18 | distributed under the License is distributed on an "AS IS" BASIS,
# 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 20 | See the License for the specific language governing permissions and
# 21 | limitations under the License.
# 22 | """   (end of the licence docstring opened on the previous line of this dump)
import os
import tempfile
from goose.text import StopWords
from goose.parsers import Parser
from goose.parsers import ParserSoup
from goose.version import __version__

HTTP_DEFAULT_TIMEOUT = 30


class Configuration(object):
    """\
    Settings bag read by the rest of the package. Every option is a
    plain attribute initialised to its default in __init__.
    """

    def __init__(self):
        # minimum size in bytes for an image to be accepted;
        # a lot of times we want to filter out the author's little
        # images in the beginning of the article
        self.images_min_bytes = 4500

        # set this to False if you don't care about getting images,
        # otherwise you can either use the default
        # image extractor to implement the ImageExtractor
        # interface to build your own
        self.enable_image_fetching = True

        # set this variable to False if you want to force
        # the article language. Otherwise it will attempt to
        # find meta language and use the correct stopwords dictionary
        self.use_meta_language = True

        # default language, used as fallback;
        # if use_meta_language is set to False, target_language
        # will be used
        self.target_language = 'en'

        # default stopwords class
        self.stopwords_class = StopWords

        # path to your imagemagick convert executable,
        # on the mac using mac ports this is the default listed
        self.imagemagick_convert_path = "/opt/local/bin/convert"

        # path to your imagemagick identify executable
        self.imagemagick_identify_path = "/opt/local/bin/identify"

        # user agent sent with the web requests made to extract an
        # article; a browser-like alternative that was left disabled:
        # self.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2)"\
        #                         " AppleWebKit/534.52.7 (KHTML, like Gecko) "\
        #                         "Version/5.1.2 Safari/534.52.7"
        self.browser_user_agent = 'Goose/%s' % __version__

        # debug mode
        # enable this to have additional debugging information
        # sent to stdout
        self.debug = False

        # TODO
        self.extract_publishdate = None

        # TODO
        self.additional_data_extractor = None

        # Parser type
        self.parser_class = 'lxml'

        # local storage path, a "goose" folder inside the system
        # temporary directory
        # make this configurable
        temp_root = tempfile.gettempdir()
        self.local_storage_path = os.path.join(temp_root, 'goose')

        # http timeout
        self.http_timeout = HTTP_DEFAULT_TIMEOUT

    def get_parser(self):
        """\
        Return the Parser class when parser_class is 'lxml',
        the ParserSoup class for any other value.
        """
        if self.parser_class == 'lxml':
            return Parser
        return ParserSoup

    def get_publishdate_extractor(self):
        """Return the configured publish date extractor, or None."""
        return self.extract_publishdate

    def set_publishdate_extractor(self, extractor):
        """\
        Pass in to extract article publish dates.
        @param extractor a concrete instance of PublishDateExtractor
        Raises ValueError when extractor is falsy.
        """
        if extractor:
            self.extract_publishdate = extractor
            return
        raise ValueError("extractor must not be null!")

    def get_additionaldata_extractor(self):
        """Return the configured additional data extractor, or None."""
        return self.additional_data_extractor

    def set_additionaldata_extractor(self, extractor):
        """\
        Pass in to extract any additional data not defined within
        @param extractor a concrete instance of AdditionalDataExtractor
        Raises ValueError when extractor is falsy.
        """
        if extractor:
            self.additional_data_extractor = extractor
            return
        raise ValueError("extractor must not be null!")
# --------------------------------------------------------------------------------
# /goose/images/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/goose/images/__init__.py
# --------------------------------------------------------------------------------
# /goose/images/image.py:
# --------------------------------------------------------------------------------
# 1 | # -*- coding: utf-8 -*-
# 2 | """\
# 3 | This is a python port of "Goose" orignialy licensed to Gravity.com
# 4 | under one or more contributor license agreements.
# See the NOTICE file
# 5 | distributed with this work for additional information
# 6 | regarding copyright ownership.
# 7 |
# 8 | Python port was written by Xavier Grangier for Recrutae
# 9 |
# 10 | Gravity.com licenses this file
# 11 | to you under the Apache License, Version 2.0 (the "License");
# 12 | you may not use this file except in compliance
# 13 | with the License. You may obtain a copy of the License at
# 14 |
# 15 | http://www.apache.org/licenses/LICENSE-2.0
# 16 |
# 17 | Unless required by applicable law or agreed to in writing, software
# 18 | distributed under the License is distributed on an "AS IS" BASIS,
# 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 20 | See the License for the specific language governing permissions and
# 21 | limitations under the License.
# 22 | """


class Image(object):
    """Container describing the image chosen for an article."""

    def __init__(self):
        # holds the Element node of the image we think is top dog
        self.top_image_node = None

        # holds the src of the image
        self.src = ""

        # how confident are we in this image extraction?
        # the most images generally the less confident
        self.confidence_score = 0.0

        # Height of the image in pixels
        self.height = 0

        # width of the image in pixels
        self.width = 0

        # what kind of image extraction was used for this?
        # bestGuess, linkTag, openGraph tags?
        self.extraction_type = "NA"

        # stores how many bytes this image is.
        # A plain 0 replaces the former long(0): integers are promoted
        # automatically, and the long builtin only exists on Python 2.
        self.bytes = 0

    def get_src(self):
        """Return the src of the image."""
        return self.src


class ImageDetails(object):
    """Width, height and mime type of one image."""

    def __init__(self):

        # the width of the image
        self.width = 0

        # height of the image
        self.height = 0

        # the mime_type of the image JPEG / PNG
        self.mime_type = None

    def get_width(self):
        """Return the stored width."""
        return self.width

    def set_width(self, width):
        """Store the width."""
        self.width = width

    def get_height(self):
        """Return the stored height."""
        return self.height

    def set_height(self, height):
        """Store the height."""
        self.height = height

    def get_mime_type(self):
        """Return the stored mime type, None when it was never set."""
        return self.mime_type

    def set_mime_type(self, mime_type):
        """Store the mime type."""
        self.mime_type = mime_type


class LocallyStoredImage(object):
    """Record describing an image file kept on the local disk."""

    def __init__(self, src='', local_filename='',
        link_hash='', bytes=0, file_extension='', height=0, width=0):
        # NOTE: the parameter named bytes shadows the builtin, but it is
        # part of the keyword interface used by callers, so it stays.
        # Its default is a plain 0 instead of long(0), which was
        # evaluated at definition time and fails where long is missing.
        self.src = src
        self.local_filename = local_filename
        self.link_hash = link_hash
        self.bytes = bytes
        self.file_extension = file_extension
        self.height = height
        self.width = width
# --------------------------------------------------------------------------------
# /goose/images/utils.py:
# --------------------------------------------------------------------------------
# 1 | # -*- coding: utf-8 -*-
# 2 | """\
# 3 | This is a python port of "Goose" orignialy licensed to Gravity.com
# 4 | under one or more contributor license agreements. See the NOTICE file
# 5 | distributed with this work for additional information
# 6 | regarding copyright ownership.
# 7 |
# 8 | Python port was written by Xavier Grangier for Recrutae
# 9 |
# 10 | Gravity.com licenses this file
# 11 | to you under the Apache License, Version 2.0 (the "License");
# 12 | you may not use this file except in compliance
# 13 | with the License.
# You may obtain a copy of the License at
# 14 |
# 15 | http://www.apache.org/licenses/LICENSE-2.0
# 16 |
# 17 | Unless required by applicable law or agreed to in writing, software
# 18 | distributed under the License is distributed on an "AS IS" BASIS,
# 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 20 | See the License for the specific language governing permissions and
# 21 | limitations under the License.
# 22 | """
import hashlib
import os
import urllib2
from PIL import Image
from goose.utils.encoding import smart_str
from goose.images.image import ImageDetails
from goose.images.image import LocallyStoredImage


class ImageUtils(object):
    """\
    Class-level helpers that download images and keep them in the
    directory named by config.local_storage_path.
    """

    @classmethod
    def get_image_dimensions(cls, identify_program, path):
        """\
        Open the file at path with PIL and return an ImageDetails
        holding its format, width and height.
        @param identify_program not used by this implementation
        """
        image = Image.open(path)
        image_details = ImageDetails()
        image_details.set_mime_type(image.format)
        width, height = image.size
        image_details.set_width(width)
        image_details.set_height(height)
        return image_details

    @classmethod
    def store_image(cls, http_client, link_hash, src, config):
        """\
        Return a LocallyStoredImage for src, taken from the local
        storage when the file is already there and downloaded
        otherwise. Returns None when nothing could be downloaded.
        """
        # check for a cache hit already on disk
        image = cls.read_localfile(link_hash, src, config)
        if image:
            return image

        # no cache found download the image
        data = cls.fetch(http_client, src)
        if not data:
            return None
        return cls.write_localfile(data, link_hash, src, config) or None

    @classmethod
    def get_mime_type(cls, image_details):
        """\
        Map the format stored in image_details to a file extension.
        Returns 'NA' for an unknown format, and also when no format
        is set at all (this used to raise AttributeError on None).
        """
        mime_type = image_details.get_mime_type()
        if not mime_type:
            return 'NA'
        mimes = {
            'png': '.png',
            'jpg': '.jpg',
            'jpeg': '.jpg',
            'gif': '.gif',
        }
        return mimes.get(mime_type.lower(), 'NA')

    @classmethod
    def read_localfile(cls, link_hash, src, config):
        """\
        Describe the locally stored copy of src.
        Returns a LocallyStoredImage, or None when no such file exists.
        """
        local_image_name = cls.get_localfile_name(link_hash, src, config)
        if not os.path.isfile(local_image_name):
            return None
        identify = config.imagemagick_identify_path
        image_details = cls.get_image_dimensions(identify, local_image_name)
        file_extension = cls.get_mime_type(image_details)
        size = os.path.getsize(local_image_name)
        return LocallyStoredImage(
            src=src,
            local_filename=local_image_name,
            link_hash=link_hash,
            bytes=size,
            file_extension=file_extension,
            height=image_details.get_height(),
            width=image_details.get_width()
        )

    @classmethod
    def write_localfile(cls, entity, link_hash, src, config):
        """\
        Write the downloaded data to the local storage and return the
        result of read_localfile for it.
        """
        local_path = cls.get_localfile_name(link_hash, src, config)
        # the with block closes the file even when write() fails
        with open(local_path, 'wb') as f:
            f.write(entity)
        return cls.read_localfile(link_hash, src, config)

    @classmethod
    def get_localfile_name(cls, link_hash, src, config):
        """\
        Return the storage path for src: the link hash and the md5 of
        src joined by an underscore, inside config.local_storage_path.
        """
        image_hash = hashlib.md5(smart_str(src)).hexdigest()
        return os.path.join(config.local_storage_path, '%s_%s' % (link_hash, image_hash))

    @classmethod
    def clean_src_string(cls, src):
        """Return src with every space replaced by %20."""
        return src.replace(" ", "%20")

    @classmethod
    def fetch(cls, http_client, src):
        """\
        Download src with urllib2 and return the body, or None when
        the download fails for any reason (best effort).
        @param http_client not used by this implementation
        """
        try:
            response = urllib2.urlopen(urllib2.Request(src))
            try:
                return response.read()
            finally:
                # always release the connection
                response.close()
        except Exception:
            # Exception instead of a bare except, so that
            # KeyboardInterrupt and SystemExit are not swallowed
            return None
# --------------------------------------------------------------------------------
# /goose/network.py:
# --------------------------------------------------------------------------------
# 1 | # -*- coding: utf-8 -*-
# 2 | """\
# 3 | This is a python port of "Goose" orignialy licensed to Gravity.com
# 4 | under one or more contributor license agreements. See the NOTICE file
# 5 | distributed with this work for additional information
# 6 | regarding copyright ownership.
# 7 |
# 8 | Python port was written by Xavier Grangier for Recrutae
# 9 |
# 10 | Gravity.com licenses this file
# 11 | to you under the Apache License, Version 2.0 (the "License");
# 12 | you may not use this file except in compliance
# 13 | with the License. You may obtain a copy of the License at
# 14 |
# 15 | http://www.apache.org/licenses/LICENSE-2.0
# 16 |
# 17 | Unless required by applicable law or agreed to in writing, software
# 18 | distributed under the License is distributed on an "AS IS" BASIS,
# 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 20 | See the License for the specific language governing permissions and
# 21 | limitations under the License.
# 22 | """
import urllib2


class HtmlFetcher(object):
    """\
    Fetches a page over HTTP with urllib2, using the user agent and
    the timeout found on the given configuration.
    """

    def __init__(self, config):
        """\
        @param config object providing browser_user_agent and
        http_timeout
        """
        self.config = config
        # set header
        self.headers = {'User-agent': self.config.browser_user_agent}
        # both are filled in by get_html; they are created here so
        # that get_url can be called first without an AttributeError
        self.request = None
        self.result = None

    def get_url(self):
        """\
        Return the url reported by the last response (geturl), or
        None when there is no response.
        """
        # if we have a result
        # get the final_url
        if self.result is not None:
            return self.result.geturl()
        return None

    def get_html(self, url):
        """\
        Request url and return the response body, or None when the
        request could not be opened.
        """
        # utf-8 encode unicode url
        if isinstance(url, unicode):
            url = url.encode('utf-8')

        # set request
        self.request = urllib2.Request(
                        url,
                        headers=self.headers)
        # do request
        try:
            self.result = urllib2.urlopen(
                        self.request,
                        timeout=self.config.http_timeout)
        except Exception:
            # best effort: a failed request yields None; Exception
            # rather than a bare except lets KeyboardInterrupt and
            # SystemExit through
            self.result = None

        # read the result content
        if self.result is not None:
            return self.result.read()
        return None
# --------------------------------------------------------------------------------
# /goose/resources/images/known-image-css.txt:
# --------------------------------------------------------------------------------
# 1 | latimes.com^thumbnail
# 2 | cnn.com^storytext|cnn_strycntntlft
# 3 | foxnews.com^entry-content
# 4 | msn.com^articleText
# 5 | go.com^mediaimage
# 6 | lefigaro.fr^photo center
# 7 | cadres.apec.fr^noFieldsTable
# 8 |
emploi.lesechos.fr^offerHeader 9 | linkfinance.fr^offerHeader -------------------------------------------------------------------------------- /goose/resources/text/stopwords-ar.txt: -------------------------------------------------------------------------------- 1 | فى 2 | في 3 | كل 4 | لم 5 | لن 6 | له 7 | من 8 | هو 9 | هي 10 | قوة 11 | كما 12 | لها 13 | منذ 14 | وقد 15 | ولا 16 | نفسه 17 | لقاء 18 | مقابل 19 | هناك 20 | وقال 21 | وكان 22 | نهاية 23 | وقالت 24 | وكانت 25 | للامم 26 | فيه 27 | كلم 28 | لكن 29 | وفي 30 | وقف 31 | ولم 32 | ومن 33 | وهو 34 | وهي 35 | يوم 36 | فيها 37 | منها 38 | مليار 39 | لوكالة 40 | يكون 41 | يمكن 42 | مليون 43 | حيث 44 | اكد 45 | الا 46 | اما 47 | امس 48 | السابق 49 | التى 50 | التي 51 | اكثر 52 | ايار 53 | ايضا 54 | ثلاثة 55 | الذاتي 56 | الاخيرة 57 | الثاني 58 | الثانية 59 | الذى 60 | الذي 61 | الان 62 | امام 63 | ايام 64 | خلال 65 | حوالى 66 | الذين 67 | الاول 68 | الاولى 69 | بين 70 | ذلك 71 | دون 72 | حول 73 | حين 74 | الف 75 | الى 76 | انه 77 | اول 78 | ضمن 79 | انها 80 | جميع 81 | الماضي 82 | الوقت 83 | المقبل 84 | اليوم 85 | ـ 86 | ف 87 | و 88 | و6 89 | قد 90 | لا 91 | ما 92 | مع 93 | مساء 94 | هذا 95 | واحد 96 | واضاف 97 | واضافت 98 | فان 99 | قبل 100 | قال 101 | كان 102 | لدى 103 | نحو 104 | هذه 105 | وان 106 | واكد 107 | كانت 108 | واوضح 109 | مايو 110 | ب 111 | ا 112 | أ 113 | ، 114 | عشر 115 | عدد 116 | عدة 117 | عشرة 118 | عدم 119 | عام 120 | عاما 121 | عن 122 | عند 123 | عندما 124 | على 125 | عليه 126 | عليها 127 | زيارة 128 | سنة 129 | سنوات 130 | تم 131 | ضد 132 | بعد 133 | بعض 134 | اعادة 135 | اعلنت 136 | بسبب 137 | حتى 138 | اذا 139 | احد 140 | اثر 141 | برس 142 | باسم 143 | غدا 144 | شخصا 145 | صباح 146 | اطار 147 | اربعة 148 | اخرى 149 | بان 150 | اجل 151 | غير 152 | بشكل 153 | حاليا 154 | بن 155 | به 156 | ثم 157 | اف 158 | ان 159 | او 160 | اي 161 | بها 162 | صفر -------------------------------------------------------------------------------- /goose/resources/text/stopwords-da.txt: 
-------------------------------------------------------------------------------- 1 | af 2 | alle 3 | andet 4 | andre 5 | at 6 | begge 7 | da 8 | de 9 | den 10 | denne 11 | der 12 | deres 13 | det 14 | dette 15 | dig 16 | din 17 | dog 18 | du 19 | ej 20 | eller 21 | en 22 | end 23 | ene 24 | eneste 25 | enhver 26 | et 27 | fem 28 | fire 29 | flere 30 | fleste 31 | for 32 | fordi 33 | forrige 34 | fra 35 | få 36 | før 37 | god 38 | han 39 | hans 40 | har 41 | hendes 42 | her 43 | hun 44 | hvad 45 | hvem 46 | hver 47 | hvilken 48 | hvis 49 | hvor 50 | hvordan 51 | hvorfor 52 | hvornår 53 | i 54 | ikke 55 | ind 56 | ingen 57 | intet 58 | jeg 59 | jeres 60 | kan 61 | kom 62 | kommer 63 | lav 64 | lidt 65 | lille 66 | man 67 | mand 68 | mange 69 | med 70 | meget 71 | men 72 | mens 73 | mere 74 | mig 75 | ned 76 | ni 77 | nogen 78 | noget 79 | ny 80 | nyt 81 | nær 82 | næste 83 | næsten 84 | og 85 | op 86 | otte 87 | over 88 | på 89 | se 90 | seks 91 | ses 92 | som 93 | stor 94 | store 95 | syv 96 | ti 97 | til 98 | to 99 | tre 100 | ud 101 | var 102 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-es.txt: -------------------------------------------------------------------------------- 1 | de 2 | la 3 | que 4 | el 5 | en 6 | y 7 | a 8 | los 9 | del 10 | se 11 | las 12 | por 13 | un 14 | para 15 | con 16 | no 17 | una 18 | su 19 | al 20 | lo 21 | como 22 | más 23 | pero 24 | sus 25 | le 26 | ya 27 | o 28 | este 29 | sí 30 | porque 31 | esta 32 | entre 33 | cuando 34 | muy 35 | sin 36 | sobre 37 | también 38 | me 39 | hasta 40 | hay 41 | donde 42 | quien 43 | desde 44 | todo 45 | nos 46 | durante 47 | todos 48 | uno 49 | les 50 | ni 51 | contra 52 | otros 53 | ese 54 | eso 55 | ante 56 | ellos 57 | e 58 | esto 59 | mí 60 | antes 61 | algunos 62 | qué 63 | unos 64 | yo 65 | otro 66 | otras 67 | otra 68 | él 69 | tanto 70 | esa 71 | estos 72 | mucho 73 | quienes 74 | nada 75 | muchos 76 | cual 77 | poco 78 | 
ella 79 | estar 80 | estas 81 | algunas 82 | algo 83 | nosotros 84 | mi 85 | mis 86 | tú 87 | te 88 | ti 89 | tu 90 | tus 91 | ellas 92 | nosotras 93 | vosotros 94 | vosotras 95 | os 96 | mío 97 | mía 98 | míos 99 | mías 100 | tuyo 101 | tuya 102 | tuyos 103 | tuyas 104 | suyo 105 | suya 106 | suyos 107 | suyas 108 | nuestro 109 | nuestra 110 | nuestros 111 | nuestras 112 | vuestro 113 | vuestra 114 | vuestros 115 | vuestras 116 | esos 117 | esas 118 | estoy 119 | estás 120 | está 121 | estamos 122 | estáis 123 | están 124 | esté 125 | estés 126 | estemos 127 | estéis 128 | estén 129 | estaré 130 | estarás 131 | estará 132 | estaremos 133 | estaréis 134 | estarán 135 | estaría 136 | estarías 137 | estaríamos 138 | estaríais 139 | estarían 140 | estaba 141 | estabas 142 | estábamos 143 | estabais 144 | estaban 145 | estuve 146 | estuviste 147 | estuvo 148 | estuvimos 149 | estuvisteis 150 | estuvieron 151 | estuviera 152 | estuvieras 153 | estuviéramos 154 | estuvierais 155 | estuvieran 156 | estuviese 157 | estuvieses 158 | estuviésemos 159 | estuvieseis 160 | estuviesen 161 | estando 162 | estado 163 | estada 164 | estados 165 | estadas 166 | estad 167 | he 168 | has 169 | ha 170 | hemos 171 | habéis 172 | han 173 | haya 174 | hayas 175 | hayamos 176 | hayáis 177 | hayan 178 | habré 179 | habrás 180 | habrá 181 | habremos 182 | habréis 183 | habrán 184 | habría 185 | habrías 186 | habríamos 187 | habríais 188 | habrían 189 | había 190 | habías 191 | habíamos 192 | habíais 193 | habían 194 | hube 195 | hubiste 196 | hubo 197 | hubimos 198 | hubisteis 199 | hubieron 200 | hubiera 201 | hubieras 202 | hubiéramos 203 | hubierais 204 | hubieran 205 | hubiese 206 | hubieses 207 | hubiésemos 208 | hubieseis 209 | hubiesen 210 | habiendo 211 | habido 212 | habida 213 | habidos 214 | habidas 215 | 216 | # forms of ser, to be (not including the infinitive): 217 | soy 218 | eres 219 | es 220 | somos 221 | sois 222 | son 223 | sea 224 | seas 225 | seamos 226 | seáis 227 | 
sean 228 | seré 229 | serás 230 | será 231 | seremos 232 | seréis 233 | serán 234 | sería 235 | serías 236 | seríamos 237 | seríais 238 | serían 239 | era 240 | eras 241 | éramos 242 | erais 243 | eran 244 | fui 245 | fuiste 246 | fue 247 | fuimos 248 | fuisteis 249 | fueron 250 | fuera 251 | fueras 252 | fuéramos 253 | fuerais 254 | fueran 255 | fuese 256 | fueses 257 | fuésemos 258 | fueseis 259 | fuesen 260 | siendo 261 | sido 262 | tengo 263 | tienes 264 | tiene 265 | tenemos 266 | tenéis 267 | tienen 268 | tenga 269 | tengas 270 | tengamos 271 | tengáis 272 | tengan 273 | tendré 274 | tendrás 275 | tendrá 276 | tendremos 277 | tendréis 278 | tendrán 279 | tendría 280 | tendrías 281 | tendríamos 282 | tendríais 283 | tendrían 284 | tenía 285 | tenías 286 | teníamos 287 | teníais 288 | tenían 289 | tuve 290 | tuviste 291 | tuvo 292 | tuvimos 293 | tuvisteis 294 | tuvieron 295 | tuviera 296 | tuvieras 297 | tuviéramos 298 | tuvierais 299 | tuvieran 300 | tuviese 301 | tuvieses 302 | tuviésemos 303 | tuvieseis 304 | tuviesen 305 | teniendo 306 | tenido 307 | tenida 308 | tenidos 309 | tenidas 310 | tened 311 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-fi.txt: -------------------------------------------------------------------------------- 1 | alla 2 | ansiosta 3 | ehkä 4 | ei 5 | enemmän 6 | ennen 7 | etessa 8 | f 9 | haikki 10 | he 11 | hitaasti 12 | hoikein 13 | hyvin 14 | hän 15 | ilman 16 | ja 17 | jos 18 | jälkeen 19 | kanssa 20 | kaukana 21 | kenties 22 | keskellä 23 | kesken 24 | koskaan 25 | kuinkan 26 | kukka 27 | kylliksi 28 | kyllä 29 | liian 30 | lla 31 | lla 32 | luona 33 | lähellä 34 | läpi 35 | me 36 | miksi 37 | mikä 38 | milloin 39 | milloinkan 40 | minä 41 | missä 42 | miten 43 | nopeasti 44 | nyt 45 | oikea 46 | oikealla 47 | paljon 48 | siellä 49 | sinä 50 | ssa 51 | sta 52 | suoraan 53 | tai 54 | takana 55 | takia 56 | tarpeeksi 57 | te 58 | tässä 59 | ulkopuolella 60 | 
vahemmän 61 | vasen 62 | vasenmalla 63 | vastan 64 | vielä 65 | vieressä 66 | vähän 67 | yhdessä 68 | ylös 69 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-fr.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | #----------------------------------------------------------------------- 17 | # a couple of test stopwords to test that the words are really being 18 | # configured from this file: 19 | stopworda 20 | stopwordb 21 | 22 | #Standard english stop words taken from Lucene's StopAnalyzer 23 | a 24 | an 25 | and 26 | are 27 | as 28 | at 29 | be 30 | but 31 | by 32 | for 33 | if 34 | in 35 | into 36 | is 37 | it 38 | no 39 | not 40 | of 41 | on 42 | or 43 | s 44 | such 45 | t 46 | that 47 | the 48 | their 49 | then 50 | there 51 | these 52 | they 53 | this 54 | to 55 | was 56 | will 57 | with 58 | au 59 | aux 60 | avec 61 | ce 62 | ces 63 | dans 64 | de 65 | des 66 | du 67 | elle 68 | en 69 | et 70 | eux 71 | il 72 | je 73 | la 74 | le 75 | leur 76 | lui 77 | ma 78 | mais 79 | me 80 | même 81 | mes 82 | moi 83 | mon 84 | ne 85 | nos 86 | notre 87 | nous 88 | on 89 | ou 90 | par 91 | pas 92 | pour 93 | qu 94 | que 95 | qui 96 | sa 97 | se 98 | ses 99 | son 100 | sur 101 | ta 102 | te 103 | tes 104 | toi 105 | ton 106 | tu 107 | un 108 | une 109 | vos 110 | votre 111 | vous 112 | c 113 | d 114 | j 115 | l 116 | à 117 | m 118 | n 119 | s 120 | t 121 | y 122 | été 123 | étée 124 | étées 125 | étés 126 | étant 127 | suis 128 | es 129 | est 130 | sommes 131 | êtes 132 | sont 133 | serai 134 | seras 135 | sera 136 | serons 137 | serez 138 | seront 139 | serais 140 | serait 141 | serions 142 | seriez 143 | seraient 144 | étais 145 | était 146 | étions 147 | étiez 148 | étaient 149 | fus 150 | fut 151 | fûmes 152 | fûtes 153 | furent 154 | sois 155 | soit 156 | soyons 157 | soyez 158 | soient 159 | fusse 160 | fusses 161 | fût 162 | fussions 163 | fussiez 164 | fussent 165 | ayant 166 | eu 167 | eue 168 | eues 169 | eus 170 | ai 171 | as 172 | avons 173 | avez 174 | ont 175 | aurai 176 | auras 177 | aura 178 | aurons 179 | aurez 180 | auront 181 | aurais 182 | aurait 183 | aurions 184 | auriez 185 | auraient 186 | avais 187 | avait 188 | avions 189 | aviez 190 | avaient 
191 | eut 192 | eûmes 193 | eûtes 194 | eurent 195 | aie 196 | aies 197 | ait 198 | ayons 199 | ayez 200 | aient 201 | eusse 202 | eusses 203 | eût 204 | eussions 205 | eussiez 206 | eussent 207 | ceci 208 | celà 209 | cet 210 | cette 211 | ici 212 | ils 213 | les 214 | leurs 215 | quel 216 | quels 217 | quelle 218 | quelles 219 | sans 220 | soi 221 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-hu.txt: -------------------------------------------------------------------------------- 1 | a 2 | á 3 | ahogy 4 | ahol 5 | aki 6 | akik 7 | akkor 8 | alatt 9 | által 10 | általában 11 | amely 12 | amelyek 13 | amelyekben 14 | amelyeket 15 | amelyet 16 | amelynek 17 | ami 18 | amit 19 | amolyan 20 | amp 21 | amíg 22 | amikor 23 | át 24 | abban 25 | ahhoz 26 | annak 27 | arra 28 | arról 29 | az 30 | azok 31 | azon 32 | azt 33 | azzal 34 | azért 35 | aztán 36 | azután 37 | azonban 38 | b 39 | bár 40 | be 41 | belül 42 | benne 43 | c 44 | cikk 45 | cikkek 46 | cikkeket 47 | csak 48 | d 49 | de 50 | e 51 | é 52 | eddig 53 | egész 54 | egy 55 | egyes 56 | egyetlen 57 | egyéb 58 | egyik 59 | egyre 60 | ekkor 61 | el 62 | elég 63 | ellen 64 | elő 65 | először 66 | előtt 67 | első 68 | én 69 | éppen 70 | ebben 71 | ehhez 72 | emilyen 73 | ennek 74 | erre 75 | ez 76 | ezt 77 | ezek 78 | ezen 79 | ezzel 80 | ezért 81 | és 82 | f 83 | fel 84 | felé 85 | g 86 | h 87 | hanem 88 | hiszen 89 | hogy 90 | hogyan 91 | i 92 | í 93 | igen 94 | így 95 | illetve 96 | ill. 
97 | ill 98 | ilyen 99 | ilyenkor 100 | is 101 | ison 102 | ismét 103 | itt 104 | j 105 | jó 106 | jól 107 | jobban 108 | k 109 | kell 110 | kellett 111 | keresztül 112 | keressünk 113 | ki 114 | kívül 115 | között 116 | közül 117 | l 118 | legalább 119 | lehet 120 | lehetett 121 | legyen 122 | lenne 123 | lenni 124 | lesz 125 | lett 126 | m 127 | maga 128 | magát 129 | majd 130 | majd 131 | már 132 | más 133 | másik 134 | meg 135 | még 136 | mellett 137 | mert 138 | mely 139 | melyek 140 | mi 141 | mit 142 | míg 143 | miért 144 | milyen 145 | mikor 146 | minden 147 | mindent 148 | mindenki 149 | mindig 150 | mint 151 | mintha 152 | mivel 153 | most 154 | n 155 | nagy 156 | nagyobb 157 | nagyon 158 | ne 159 | néha 160 | nekem 161 | neki 162 | nem 163 | néhány 164 | nélkül 165 | nincs 166 | o 167 | ó 168 | olyan 169 | ott 170 | össze 171 | ö 172 | ő 173 | ők 174 | őket 175 | p 176 | pedig 177 | persze 178 | q 179 | r 180 | rá 181 | s 182 | saját 183 | sem 184 | semmi 185 | sok 186 | sokat 187 | sokkal 188 | sz 189 | számára 190 | szemben 191 | szerint 192 | szinte 193 | t 194 | talán 195 | tehát 196 | teljes 197 | tovább 198 | továbbá 199 | több 200 | u 201 | ú 202 | úgy 203 | ugyanis 204 | új 205 | újabb 206 | újra 207 | után 208 | utána 209 | utolsó 210 | ü 211 | ű 212 | v 213 | vagy 214 | vagyis 215 | valaki 216 | valamely 217 | valami 218 | valamint 219 | való 220 | vagyok 221 | van 222 | vannak 223 | volt 224 | voltam 225 | voltak 226 | voltunk 227 | vissza 228 | vele 229 | viszont 230 | volna 231 | számolnak 232 | szólnak 233 | szól 234 | w 235 | x 236 | y 237 | z 238 | zs 239 | a 240 | ahogy 241 | ahol 242 | aki 243 | akkor 244 | alatt 245 | általában 246 | által 247 | amely 248 | amíg 249 | amikor 250 | ami 251 | amolyan 252 | arra 253 | át 254 | az 255 | azért 256 | azonban 257 | azon 258 | aztán 259 | azt 260 | azután 261 | azzal 262 | bár 263 | be 264 | belül 265 | benne 266 | cikk 267 | csak 268 | de 269 | eddig 270 | egész 271 | egy 272 | egyéb 273 | 
egyes 274 | egyetlen 275 | egyik 276 | egyre 277 | ekkor 278 | el 279 | elég 280 | ellen 281 | elő 282 | először 283 | előtt 284 | első 285 | emilyen 286 | én 287 | éppen 288 | erre 289 | és 290 | e 291 | ez 292 | ezen 293 | ezért 294 | ezzel 295 | fel 296 | felé 297 | hanem 298 | hiszen 299 | hogy 300 | hogyan 301 | igen 302 | így 303 | ill. 304 | illetve 305 | ill 306 | ilyen 307 | ilyenkor 308 | ismét 309 | ison 310 | itt 311 | jó 312 | jobban 313 | jól 314 | kell 315 | keres 316 | keresztül 317 | ki 318 | kívül 319 | között 320 | közül 321 | legalább 322 | legyen 323 | lehet 324 | lenni 325 | lett 326 | maga 327 | maga 328 | majd 329 | már 330 | más 331 | másik 332 | még 333 | meg 334 | mellett 335 | mely 336 | mert 337 | miért 338 | míg 339 | mikor 340 | milyen 341 | minden 342 | mindenki 343 | mindig 344 | mi 345 | mint 346 | mintha 347 | mivel 348 | most 349 | nagy 350 | nagyobb 351 | nagyon 352 | ne 353 | néha 354 | néhány 355 | neki 356 | nélkül 357 | nem 358 | nincs 359 | ők 360 | olyan 361 | ő 362 | össze 363 | ott 364 | pedig 365 | persze 366 | rá 367 | saját 368 | s 369 | sem 370 | semmi 371 | sokkal 372 | sok 373 | számára 374 | számol 375 | szemben 376 | szerint 377 | szinte 378 | szól 379 | talán 380 | tehát 381 | teljes 382 | továbbá 383 | tovább 384 | úgy 385 | ugyanis 386 | új 387 | újabb 388 | újra 389 | utána 390 | után 391 | utolsó 392 | vagy 393 | vagyis 394 | valaki 395 | valamely 396 | valami 397 | valamint 398 | való 399 | van 400 | vissza 401 | viszont 402 | volt 403 | 404 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-it.txt: -------------------------------------------------------------------------------- 1 | ad 2 | al 3 | allo 4 | ai 5 | agli 6 | all 7 | agl 8 | alla 9 | alle 10 | con 11 | col 12 | coi 13 | da 14 | dal 15 | dallo 16 | dai 17 | dagli 18 | dall 19 | dagl 20 | dalla 21 | dalle 22 | di 23 | del 24 | dello 25 | dei 26 | degli 27 | dell 28 | degl 29 | della 
30 | delle 31 | in 32 | nel 33 | nello 34 | nei 35 | negli 36 | nell 37 | negl 38 | nella 39 | nelle 40 | su 41 | sul 42 | sullo 43 | sui 44 | sugli 45 | sull 46 | sugl 47 | sulla 48 | sulle 49 | per 50 | tra 51 | contro 52 | io 53 | tu 54 | lui 55 | lei 56 | noi 57 | voi 58 | loro 59 | mio 60 | mia 61 | miei 62 | mie 63 | tuo 64 | tua 65 | tuoi 66 | tue 67 | suo 68 | sua 69 | suoi 70 | sue 71 | nostro 72 | nostra 73 | nostri 74 | nostre 75 | vostro 76 | vostra 77 | vostri 78 | vostre 79 | mi 80 | ti 81 | ci 82 | vi 83 | lo 84 | la 85 | li 86 | le 87 | gli 88 | ne 89 | il 90 | un 91 | uno 92 | una 93 | ma 94 | ed 95 | se 96 | perchè 97 | perché 98 | perche 99 | anche 100 | come 101 | dov 102 | dove 103 | che 104 | chi 105 | cui 106 | non 107 | più 108 | piu 109 | quale 110 | quanto 111 | quanti 112 | quanta 113 | quante 114 | quello 115 | quelli 116 | quella 117 | quelle 118 | questo 119 | questi 120 | questa 121 | queste 122 | si 123 | tutto 124 | tutti 125 | a 126 | c 127 | e 128 | i 129 | l 130 | o 131 | ho 132 | hai 133 | ha 134 | abbiamo 135 | avete 136 | hanno 137 | abbia 138 | abbiate 139 | abbiano 140 | avrò 141 | avro 142 | avrai 143 | avrà 144 | avra 145 | avremo 146 | avrete 147 | avranno 148 | avrei 149 | avresti 150 | avrebbe 151 | avremmo 152 | avreste 153 | avrebbero 154 | avevo 155 | avevi 156 | aveva 157 | avevamo 158 | avevate 159 | avevano 160 | ebbi 161 | avesti 162 | ebbe 163 | avemmo 164 | aveste 165 | ebbero 166 | avessi 167 | avesse 168 | avessimo 169 | avessero 170 | avendo 171 | avuto 172 | avuta 173 | avuti 174 | avute 175 | sono 176 | sei 177 | è 178 | é 179 | e 180 | siamo 181 | siete 182 | sia 183 | siate 184 | siano 185 | sarà 186 | sarai 187 | sarò 188 | saro 189 | saremo 190 | sarete 191 | saranno 192 | sarei 193 | saresti 194 | sarebbe 195 | saremmo 196 | sareste 197 | sarebbero 198 | ero 199 | eri 200 | era 201 | eravamo 202 | eravate 203 | erano 204 | fui 205 | fosti 206 | fu 207 | fummo 208 | foste 209 | furono 210 | fossi 211 | 
fosse 212 | fossimo 213 | fossero 214 | essendo 215 | faccio 216 | fai 217 | facciamo 218 | fanno 219 | faccia 220 | facciate 221 | facciano 222 | farà 223 | farai 224 | farò 225 | faremo 226 | farete 227 | faranno 228 | farei 229 | faresti 230 | farebbe 231 | faremmo 232 | fareste 233 | farebbero 234 | facevo 235 | facevi 236 | faceva 237 | facevamo 238 | facevate 239 | facevano 240 | feci 241 | facesti 242 | fece 243 | facemmo 244 | faceste 245 | fecero 246 | facessi 247 | facesse 248 | facessimo 249 | facessero 250 | facendo 251 | sto 252 | stai 253 | sta 254 | stiamo 255 | stanno 256 | stia 257 | stiate 258 | stiano 259 | starà 260 | starai 261 | starò 262 | staremo 263 | starete 264 | staranno 265 | starei 266 | staresti 267 | starebbe 268 | staremmo 269 | stareste 270 | starebbero 271 | stavo 272 | stavi 273 | stava 274 | stavamo 275 | stavate 276 | stavano 277 | stetti 278 | stesti 279 | stette 280 | stemmo 281 | steste 282 | stettero 283 | stessi 284 | stesse 285 | stessimo 286 | stessero 287 | stando 288 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-ko.txt: -------------------------------------------------------------------------------- 1 | 을 2 | 의 3 | 에 4 | 이 5 | 를 6 | 으로 7 | 은 8 | 는 9 | 가 10 | 로 11 | 하고 12 | 과 13 | 에서 14 | 도 15 | 와 16 | 이다 17 | 고 18 | 부터 19 | 까지 20 | 께 21 | 에는 22 | 이라고 23 | 만 24 | 라고 25 | 보다 26 | 에도 27 | 다 28 | 토록 29 | 에게 30 | 나 31 | 대로 32 | 에서는 33 | 이나 34 | 이며 35 | 요 36 | 든 37 | 으로써 38 | 같이 39 | 로는 40 | 밖에 41 | 과의 42 | 며 43 | 로부터 44 | 처럼 45 | 아 46 | 라 47 | 여 48 | 으로는 49 | 이고 50 | 에서의 51 | 이라는 52 | 만에 53 | 으로부터 54 | 에서도 55 | 와의 56 | 엔 57 | 만을 58 | 부터는 59 | 만의 60 | 야 61 | 까지의 62 | 과는 63 | 치고 64 | 과를 65 | 으로의 66 | 까지는 67 | 보다는 68 | 만이 69 | 에만 70 | 로의 -------------------------------------------------------------------------------- /goose/resources/text/stopwords-nb.txt: -------------------------------------------------------------------------------- 1 | alle 2 | andre 3 | 
arbeid 4 | av 5 | begge 6 | bort 7 | bra 8 | bruke 9 | da 10 | denne 11 | der 12 | deres 13 | det 14 | din 15 | disse 16 | du 17 | eller 18 | en 19 | ene 20 | eneste 21 | enhver 22 | enn 23 | er 24 | et 25 | folk 26 | for 27 | fordi 28 | forsøke 29 | fra 30 | få 31 | før 32 | først 33 | gjorde 34 | gjøre 35 | god 36 | gå 37 | ha 38 | hadde 39 | han 40 | hans 41 | hennes 42 | her 43 | hva 44 | hvem 45 | hver 46 | hvilken 47 | hvis 48 | hvor 49 | hvordan 50 | hvorfor 51 | ikke 52 | inn 53 | innen 54 | kan 55 | kunne 56 | lage 57 | lang 58 | lik 59 | like 60 | makt 61 | mange 62 | med 63 | meg 64 | meget 65 | men 66 | mens 67 | mer 68 | mest 69 | min 70 | mye 71 | må 72 | måte 73 | navn 74 | nei 75 | ny 76 | nå 77 | når 78 | og 79 | også 80 | om 81 | opp 82 | oss 83 | over 84 | part 85 | punkt 86 | på 87 | rett 88 | riktig 89 | samme 90 | sant 91 | si 92 | siden 93 | sist 94 | skulle 95 | slik 96 | slutt 97 | som 98 | start 99 | stille 100 | tid 101 | til 102 | tilbake 103 | tilstand 104 | under 105 | ut 106 | uten 107 | var 108 | ved 109 | verdi 110 | vi 111 | vil 112 | ville 113 | vite 114 | vår 115 | være 116 | vært 117 | å 118 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-nl.txt: -------------------------------------------------------------------------------- 1 | aan 2 | af 3 | al 4 | als 5 | bij 6 | dan 7 | dat 8 | die 9 | dit 10 | een 11 | en 12 | er 13 | had 14 | heb 15 | hem 16 | het 17 | hij 18 | hoe 19 | hun 20 | ik 21 | in 22 | is 23 | je 24 | kan 25 | me 26 | men 27 | met 28 | mij 29 | nog 30 | nu 31 | of 32 | ons 33 | ook 34 | te 35 | tot 36 | uit 37 | van 38 | was 39 | wat 40 | we 41 | wel 42 | wij 43 | zal 44 | ze 45 | zei 46 | zij 47 | zo 48 | zou 49 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-no.txt: -------------------------------------------------------------------------------- 1 | at 2 | av 3 | de 4 | den 5 | 
der 6 | det 7 | du 8 | en 9 | er 10 | et 11 | for 12 | fra 13 | før 14 | med 15 | og 16 | om 17 | over 18 | på 19 | som 20 | til 21 | ved 22 | år 23 | alle 24 | bare 25 | ble 26 | bort 27 | bra 28 | da 29 | deg 30 | dem 31 | denne 32 | dere 33 | deres 34 | det 35 | dette 36 | din 37 | disse 38 | dit 39 | ditt 40 | eller 41 | ene 42 | enn 43 | er 44 | et 45 | ett 46 | etter 47 | for 48 | fram 49 | først 50 | få 51 | god 52 | gå 53 | ha 54 | han 55 | hans 56 | har 57 | her 58 | hit 59 | hun 60 | hva 61 | hvem 62 | hver 63 | ikke 64 | inn 65 | ja 66 | jeg 67 | kan 68 | kom 69 | kun 70 | kunne 71 | lage 72 | lang 73 | lik 74 | like 75 | man 76 | mer 77 | min 78 | mot 79 | mye 80 | må 81 | måte 82 | ned 83 | nei 84 | noe 85 | noen 86 | ny 87 | nå 88 | når 89 | også 90 | opp 91 | oss 92 | seg 93 | selv 94 | si 95 | siden 96 | sin 97 | sine 98 | sist 99 | skal 100 | skulle 101 | slik 102 | som 103 | så 104 | sånn 105 | tid 106 | til 107 | under 108 | ut 109 | uten 110 | var 111 | ved 112 | vi 113 | vil 114 | vite 115 | vår 116 | å 117 | dei 118 | di 119 | då 120 | eg -------------------------------------------------------------------------------- /goose/resources/text/stopwords-pl.txt: -------------------------------------------------------------------------------- 1 | a 2 | aby 3 | ach 4 | acz 5 | aczkolwiek 6 | aj 7 | albo 8 | ale 9 | ależ 10 | ani 11 | aż 12 | bardziej 13 | bardzo 14 | bo 15 | bowiem 16 | by 17 | byli 18 | bynajmniej 19 | być 20 | był 21 | była 22 | było 23 | były 24 | będzie 25 | będą 26 | cali 27 | cała 28 | cały 29 | ci 30 | cię 31 | ciebie 32 | co 33 | cokolwiek 34 | coś 35 | czasami 36 | czasem 37 | czemu 38 | czy 39 | czyli 40 | daleko 41 | dla 42 | dlaczego 43 | dlatego 44 | do 45 | dobrze 46 | dokąd 47 | dość 48 | dużo 49 | dwa 50 | dwaj 51 | dwie 52 | dwoje 53 | dziś 54 | dzisiaj 55 | gdy 56 | gdyby 57 | gdyż 58 | gdzie 59 | gdziekolwiek 60 | gdzieś 61 | i 62 | ich 63 | ile 64 | im 65 | inna 66 | inne 67 | inny 68 | innych 69 | iż 70 | ja 71 | 
ją 72 | jak 73 | jakaś 74 | jakby 75 | jaki 76 | jakichś 77 | jakie 78 | jakiś 79 | jakiż 80 | jakkolwiek 81 | jako 82 | jakoś 83 | je 84 | jeden 85 | jedna 86 | jedno 87 | jednak 88 | jednakże 89 | jego 90 | jej 91 | jemu 92 | jest 93 | jestem 94 | jeszcze 95 | jeśli 96 | jeżeli 97 | już 98 | ją 99 | każdy 100 | kiedy 101 | kilka 102 | kimś 103 | kto 104 | ktokolwiek 105 | ktoś 106 | która 107 | które 108 | którego 109 | której 110 | który 111 | których 112 | którym 113 | którzy 114 | ku 115 | lat 116 | lecz 117 | lub 118 | ma 119 | mają 120 | mało 121 | mam 122 | mi 123 | mimo 124 | między 125 | mną 126 | mnie 127 | mogą 128 | moi 129 | moim 130 | moja 131 | moje 132 | może 133 | możliwe 134 | można 135 | mój 136 | mu 137 | musi 138 | my 139 | na 140 | nad 141 | nam 142 | nami 143 | nas 144 | nasi 145 | nasz 146 | nasza 147 | nasze 148 | naszego 149 | naszych 150 | natomiast 151 | natychmiast 152 | nawet 153 | nią 154 | nic 155 | nich 156 | nie 157 | niech 158 | niego 159 | niej 160 | niemu 161 | nigdy 162 | nim 163 | nimi 164 | niż 165 | no 166 | o 167 | obok 168 | od 169 | około 170 | on 171 | ona 172 | one 173 | oni 174 | ono 175 | oraz 176 | oto 177 | owszem 178 | pan 179 | pana 180 | pani 181 | po 182 | pod 183 | podczas 184 | pomimo 185 | ponad 186 | ponieważ 187 | powinien 188 | powinna 189 | powinni 190 | powinno 191 | poza 192 | prawie 193 | przecież 194 | przed 195 | przede 196 | przedtem 197 | przez 198 | przy 199 | roku 200 | również 201 | sam 202 | sama 203 | są 204 | się 205 | skąd 206 | sobie 207 | sobą 208 | sposób 209 | swoje 210 | ta 211 | tak 212 | taka 213 | taki 214 | takie 215 | także 216 | tam 217 | te 218 | tego 219 | tej 220 | temu 221 | ten 222 | teraz 223 | też 224 | to 225 | tobą 226 | tobie 227 | toteż 228 | trzeba 229 | tu 230 | tutaj 231 | twoi 232 | twoim 233 | twoja 234 | twoje 235 | twym 236 | twój 237 | ty 238 | tych 239 | tylko 240 | tym 241 | u 242 | w 243 | wam 244 | wami 245 | was 246 | wasz 247 | wasza 248 | wasze 249 | we 
250 | według 251 | wiele 252 | wielu 253 | więc 254 | więcej 255 | wszyscy 256 | wszystkich 257 | wszystkie 258 | wszystkim 259 | wszystko 260 | wtedy 261 | wy 262 | właśnie 263 | z 264 | za 265 | zapewne 266 | zawsze 267 | ze 268 | zł 269 | znowu 270 | znów 271 | został 272 | żaden 273 | żadna 274 | żadne 275 | żadnych 276 | że 277 | żeby -------------------------------------------------------------------------------- /goose/resources/text/stopwords-pt.txt: -------------------------------------------------------------------------------- 1 | último 2 | é 3 | acerca 4 | agora 5 | algmas 6 | alguns 7 | ali 8 | ambos 9 | antes 10 | apontar 11 | aquela 12 | aquelas 13 | aquele 14 | aqueles 15 | aqui 16 | atrás 17 | bem 18 | bom 19 | cada 20 | caminho 21 | cima 22 | com 23 | como 24 | comprido 25 | conhecido 26 | corrente 27 | das 28 | debaixo 29 | dentro 30 | desde 31 | desligado 32 | deve 33 | devem 34 | deverá 35 | direita 36 | diz 37 | dizer 38 | dois 39 | dos 40 | e 41 | ela 42 | ele 43 | eles 44 | em 45 | enquanto 46 | então 47 | está 48 | estão 49 | estado 50 | estar 51 | estará 52 | este 53 | estes 54 | esteve 55 | estive 56 | estivemos 57 | estiveram 58 | eu 59 | fará 60 | faz 61 | fazer 62 | fazia 63 | fez 64 | fim 65 | foi 66 | fora 67 | horas 68 | iniciar 69 | inicio 70 | ir 71 | irá 72 | ista 73 | iste 74 | isto 75 | ligado 76 | maioria 77 | maiorias 78 | mais 79 | mas 80 | mesmo 81 | meu 82 | muito 83 | muitos 84 | nós 85 | não 86 | nome 87 | nosso 88 | novo 89 | o 90 | onde 91 | os 92 | ou 93 | outro 94 | para 95 | parte 96 | pegar 97 | pelo 98 | pessoas 99 | pode 100 | poderá 101 | podia 102 | por 103 | porque 104 | povo 105 | promeiro 106 | quê 107 | qual 108 | qualquer 109 | quando 110 | quem 111 | quieto 112 | são 113 | saber 114 | sem 115 | ser 116 | seu 117 | somente 118 | têm 119 | tal 120 | também 121 | tem 122 | tempo 123 | tenho 124 | tentar 125 | tentaram 126 | tente 127 | tentei 128 | teu 129 | teve 130 | tipo 131 | tive 132 | todos 133 | 
trabalhar 134 | trabalho 135 | tu 136 | um 137 | uma 138 | umas 139 | uns 140 | usa 141 | usar 142 | valor 143 | veja 144 | ver 145 | verdade 146 | verdadeiro 147 | você 148 | -------------------------------------------------------------------------------- /goose/resources/text/stopwords-zh.txt: -------------------------------------------------------------------------------- 1 | 的 2 | 一 3 | 不 4 | 在 5 | 人 6 | 有 7 | 是 8 | 为 9 | 以 10 | 于 11 | 上 12 | 他 13 | 而 14 | 后 15 | 之 16 | 来 17 | 及 18 | 了 19 | 因 20 | 下 21 | 可 22 | 到 23 | 由 24 | 这 25 | 与 26 | 也 27 | 此 28 | 但 29 | 并 30 | 个 31 | 其 32 | 已 33 | 无 34 | 小 35 | 我 36 | 们 37 | 起 38 | 最 39 | 再 40 | 今 41 | 去 42 | 好 43 | 只 44 | 又 45 | 或 46 | 很 47 | 亦 48 | 某 49 | 把 50 | 那 51 | 你 52 | 乃 53 | 它 54 | 吧 55 | 被 56 | 比 57 | 别 58 | 趁 59 | 当 60 | 从 61 | 到 62 | 得 63 | 打 64 | 凡 65 | 儿 66 | 尔 67 | 该 68 | 各 69 | 给 70 | 跟 71 | 和 72 | 何 73 | 还 74 | 即 75 | 几 76 | 既 77 | 看 78 | 据 79 | 距 80 | 靠 81 | 啦 82 | 了 83 | 另 84 | 么 85 | 每 86 | 们 87 | 嘛 88 | 拿 89 | 哪 90 | 那 91 | 您 92 | 凭 93 | 且 94 | 却 95 | 让 96 | 仍 97 | 啥 98 | 如 99 | 若 100 | 使 101 | 谁 102 | 虽 103 | 随 104 | 同 105 | 所 106 | 她 107 | 哇 108 | 嗡 109 | 往 110 | 哪 111 | 些 112 | 向 113 | 沿 114 | 哟 115 | 用 116 | 于 117 | 咱 118 | 则 119 | 怎 120 | 曾 121 | 至 122 | 致 123 | 着 124 | 诸 125 | 自 126 | 為 127 | 於 128 | 後 129 | 這 130 | 與 131 | 並 132 | 個 133 | 無 134 | 們 135 | 當 136 | 從 137 | 兒 138 | 爾 139 | 該 140 | 給 141 | 還 142 | 幾 143 | 麼 144 | 憑 145 | 卻 146 | 讓 147 | 誰 148 | 雖 149 | 喲 150 | 則 151 | 諸 152 | -------------------------------------------------------------------------------- /goose/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """\ 3 | This is a python port of "Goose" orignialy licensed to Gravity.com 4 | under one or more contributor license agreements. See the NOTICE file 5 | distributed with this work for additional information 6 | regarding copyright ownership. 
class BuildURL(object):
    """Resolve the scheme and hostname of a possibly incomplete URL.

    ``url`` is the primary URL.  ``finalurl`` is an optional second URL
    that is consulted for any component ``url`` does not carry itself.
    """

    def __init__(self, url, finalurl=None):
        self.url = url
        self.finalurl = finalurl

    def getHostname(self, o):
        """Return the hostname of parse result ``o``.

        Falls back to the hostname of ``self.finalurl`` when ``o`` has
        none, and returns ``None`` when neither provides one.
        """
        if o.hostname:
            # The original returned ``o.hotname`` (typo -> AttributeError).
            return o.hostname
        if self.finalurl:
            # ``urlparse`` is imported as a module in this file, so the
            # parser has to be reached as ``urlparse.urlparse``.
            fallback = urlparse.urlparse(self.finalurl)
            if fallback.hostname:
                return fallback.hostname
        return None

    def getScheme(self, o):
        """Return the scheme of parse result ``o``.

        Falls back to the scheme of ``self.finalurl`` and finally to
        ``'http'``.
        """
        if o.scheme:
            return o.scheme
        if self.finalurl:
            fallback = urlparse.urlparse(self.finalurl)
            if fallback.scheme:
                return fallback.scheme
        return 'http'

    def getUrl(self):
        """Resolve scheme and hostname for ``self.url``.

        NOTE(review): the original never assembles or returns a URL from
        the resolved parts, so this still returns ``None``; confirm the
        intended result with the callers before completing it.
        """
        url_obj = urlparse.urlparse(self.url)
        scheme = self.getScheme(url_obj)
        hostname = self.getHostname(url_obj)


class FileHelper(object):
    """Helpers for reading files bundled with the package."""

    @classmethod
    def loadResourceFile(cls, filename):
        """Return the UTF-8 decoded content of ``filename``.

        A relative ``filename`` is looked up inside the ``resources``
        directory that sits next to the ``goose`` package; an absolute one
        is opened as given.

        Raises:
            IOError: if the file cannot be opened or read.
        """
        # The original tested the literal string 'filename', which is
        # never absolute, instead of the argument.
        if not os.path.isabs(filename):
            dirpath = os.path.dirname(goose.__file__)
            path = os.path.join(dirpath, 'resources', filename)
        else:
            path = filename
        try:
            # ``with`` closes the handle even when read() fails.
            with codecs.open(path, 'r', 'utf-8') as handle:
                return handle.read()
        except IOError:
            raise IOError("Couldn't open file %s" % path)
ParsingCandidate(object): 83 | 84 | def __init__(self, urlString, link_hash): 85 | self.urlString = self.url = urlString 86 | self.link_hash = link_hash 87 | 88 | 89 | class RawHelper(object): 90 | @classmethod 91 | def get_parsing_candidate(self, url, raw_html): 92 | if isinstance(raw_html, unicode): 93 | raw_html = raw_html.encode('utf-8') 94 | link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time()) 95 | return ParsingCandidate(url, link_hash) 96 | 97 | 98 | class URLHelper(object): 99 | @classmethod 100 | def get_parsing_candidate(self, url_to_crawl): 101 | # replace shebang is urls 102 | final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \ 103 | if '#!' in url_to_crawl else url_to_crawl 104 | link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time()) 105 | return ParsingCandidate(final_url, link_hash) 106 | 107 | 108 | class StringSplitter(object): 109 | """\ 110 | 111 | """ 112 | def __init__(self, pattern): 113 | self.pattern = re.compile(pattern) 114 | 115 | def split(self, string): 116 | if not string: 117 | return [] 118 | return self.pattern.split(string) 119 | 120 | 121 | class StringReplacement(object): 122 | 123 | def __init__(self, pattern, replaceWith): 124 | self.pattern = pattern 125 | self.replaceWith = replaceWith 126 | 127 | def replaceAll(self, string): 128 | if not string: 129 | return u'' 130 | return string.replace(self.pattern, self.replaceWith) 131 | 132 | 133 | class ReplaceSequence(object): 134 | 135 | def __init__(self): 136 | self.replacements = [] 137 | 138 | #@classmethod 139 | def create(self, firstPattern, replaceWith=None): 140 | result = StringReplacement(firstPattern, replaceWith or u'') 141 | self.replacements.append(result) 142 | return self 143 | 144 | def append(self, pattern, replaceWith=None): 145 | return self.create(pattern, replaceWith) 146 | 147 | def replaceAll(self, string): 148 | if not string: 149 | return u'' 150 | 151 | mutatedString = string 152 | 153 | for rp in 
self.replacements: 154 | mutatedString = rp.replaceAll(mutatedString) 155 | return mutatedString 156 | -------------------------------------------------------------------------------- /goose/version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """\ 3 | This is a python port of "Goose" orignialy licensed to Gravity.com 4 | under one or more contributor license agreements. See the NOTICE file 5 | distributed with this work for additional information 6 | regarding copyright ownership. 7 | 8 | Python port was written by Xavier Grangier for Recrutae 9 | 10 | Gravity.com licenses this file 11 | to you under the Apache License, Version 2.0 (the "License"); 12 | you may not use this file except in compliance 13 | with the License. You may obtain a copy of the License at 14 | 15 | http://www.apache.org/licenses/LICENSE-2.0 16 | 17 | Unless required by applicable law or agreed to in writing, software 18 | distributed under the License is distributed on an "AS IS" BASIS, 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | See the License for the specific language governing permissions and 21 | limitations under the License. 22 | """ 23 | 24 | version_info = (1, 0, 22) 25 | __version__ = ".".join(map(str, version_info)) 26 | -------------------------------------------------------------------------------- /goose/videos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/goose/videos/__init__.py -------------------------------------------------------------------------------- /goose/videos/videos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """\ 3 | This is a python port of "Goose" orignialy licensed to Gravity.com 4 | under one or more contributor license agreements. 
See the NOTICE file 5 | distributed with this work for additional information 6 | regarding copyright ownership. 7 | 8 | Python port was written by Xavier Grangier for Recrutae 9 | 10 | Gravity.com licenses this file 11 | to you under the Apache License, Version 2.0 (the "License"); 12 | you may not use this file except in compliance 13 | with the License. You may obtain a copy of the License at 14 | 15 | http://www.apache.org/licenses/LICENSE-2.0 16 | 17 | Unless required by applicable law or agreed to in writing, software 18 | distributed under the License is distributed on an "AS IS" BASIS, 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | See the License for the specific language governing permissions and 21 | limitations under the License. 22 | """ 23 | 24 | class Video(object): 25 | """\ 26 | Video object 27 | """ 28 | 29 | def __init__(self): 30 | 31 | # type of embed 32 | # embed, object, iframe 33 | self.embed_type = None 34 | 35 | # video provider name 36 | self.provider = None 37 | 38 | # width 39 | self.width = None 40 | 41 | # height 42 | self.height = None 43 | 44 | # embed code 45 | self.embed_code = None 46 | 47 | # src 48 | self.src = None 49 | -------------------------------------------------------------------------------- /httplib2/iri2uri.py: -------------------------------------------------------------------------------- 1 | """ 2 | iri2uri 3 | 4 | Converts an IRI to a URI. 
5 | 6 | """ 7 | __author__ = "Joe Gregorio (joe@bitworking.org)" 8 | __copyright__ = "Copyright 2006, Joe Gregorio" 9 | __contributors__ = [] 10 | __version__ = "1.0.0" 11 | __license__ = "MIT" 12 | __history__ = """ 13 | """ 14 | 15 | import urlparse 16 | 17 | 18 | # Convert an IRI to a URI following the rules in RFC 3987 19 | # 20 | # The characters we need to enocde and escape are defined in the spec: 21 | # 22 | # iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD 23 | # ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF 24 | # / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD 25 | # / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD 26 | # / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD 27 | # / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD 28 | # / %xD0000-DFFFD / %xE1000-EFFFD 29 | 30 | escape_range = [ 31 | (0xA0, 0xD7FF), 32 | (0xE000, 0xF8FF), 33 | (0xF900, 0xFDCF), 34 | (0xFDF0, 0xFFEF), 35 | (0x10000, 0x1FFFD), 36 | (0x20000, 0x2FFFD), 37 | (0x30000, 0x3FFFD), 38 | (0x40000, 0x4FFFD), 39 | (0x50000, 0x5FFFD), 40 | (0x60000, 0x6FFFD), 41 | (0x70000, 0x7FFFD), 42 | (0x80000, 0x8FFFD), 43 | (0x90000, 0x9FFFD), 44 | (0xA0000, 0xAFFFD), 45 | (0xB0000, 0xBFFFD), 46 | (0xC0000, 0xCFFFD), 47 | (0xD0000, 0xDFFFD), 48 | (0xE1000, 0xEFFFD), 49 | (0xF0000, 0xFFFFD), 50 | (0x100000, 0x10FFFD), 51 | ] 52 | 53 | def encode(c): 54 | retval = c 55 | i = ord(c) 56 | for low, high in escape_range: 57 | if i < low: 58 | break 59 | if i >= low and i <= high: 60 | retval = "".join(["%%%2X" % ord(o) for o in c.encode('utf-8')]) 61 | break 62 | return retval 63 | 64 | 65 | def iri2uri(uri): 66 | """Convert an IRI to a URI. Note that IRIs must be 67 | passed in a unicode strings. That is, do not utf-8 encode 68 | the IRI before passing it into the function.""" 69 | if isinstance(uri ,unicode): 70 | (scheme, authority, path, query, fragment) = urlparse.urlsplit(uri) 71 | authority = authority.encode('idna') 72 | # For each character in 'ucschar' or 'iprivate' 73 | # 1. 
encode as utf-8 74 | # 2. then %-encode each octet of that utf-8 75 | uri = urlparse.urlunsplit((scheme, authority, path, query, fragment)) 76 | uri = "".join([encode(c) for c in uri]) 77 | return uri 78 | 79 | if __name__ == "__main__": 80 | import unittest 81 | 82 | class Test(unittest.TestCase): 83 | 84 | def test_uris(self): 85 | """Test that URIs are invariant under the transformation.""" 86 | invariant = [ 87 | u"ftp://ftp.is.co.za/rfc/rfc1808.txt", 88 | u"http://www.ietf.org/rfc/rfc2396.txt", 89 | u"ldap://[2001:db8::7]/c=GB?objectClass?one", 90 | u"mailto:John.Doe@example.com", 91 | u"news:comp.infosystems.www.servers.unix", 92 | u"tel:+1-816-555-1212", 93 | u"telnet://192.0.2.16:80/", 94 | u"urn:oasis:names:specification:docbook:dtd:xml:4.1.2" ] 95 | for uri in invariant: 96 | self.assertEqual(uri, iri2uri(uri)) 97 | 98 | def test_iri(self): 99 | """ Test that the right type of escaping is done for each part of the URI.""" 100 | self.assertEqual("http://xn--o3h.com/%E2%98%84", iri2uri(u"http://\N{COMET}.com/\N{COMET}")) 101 | self.assertEqual("http://bitworking.org/?fred=%E2%98%84", iri2uri(u"http://bitworking.org/?fred=\N{COMET}")) 102 | self.assertEqual("http://bitworking.org/#%E2%98%84", iri2uri(u"http://bitworking.org/#\N{COMET}")) 103 | self.assertEqual("#%E2%98%84", iri2uri(u"#\N{COMET}")) 104 | self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}")) 105 | self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}"))) 106 | self.assertNotEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}".encode('utf-8'))) 107 | 108 | unittest.main() 109 | 110 | 111 | -------------------------------------------------------------------------------- /httplib2/test/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/httplib2/test/__init__.py -------------------------------------------------------------------------------- /httplib2/test/brokensocket/socket.py: -------------------------------------------------------------------------------- 1 | from realsocket import gaierror, error, getaddrinfo, SOCK_STREAM 2 | -------------------------------------------------------------------------------- /httplib2/test/functional/test_proxies.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import errno 3 | import os 4 | import signal 5 | import subprocess 6 | import tempfile 7 | 8 | import nose 9 | 10 | import httplib2 11 | from httplib2 import socks 12 | from httplib2.test import miniserver 13 | 14 | tinyproxy_cfg = """ 15 | User "%(user)s" 16 | Port %(port)s 17 | Listen 127.0.0.1 18 | PidFile "%(pidfile)s" 19 | LogFile "%(logfile)s" 20 | MaxClients 2 21 | StartServers 1 22 | LogLevel Info 23 | """ 24 | 25 | 26 | class FunctionalProxyHttpTest(unittest.TestCase): 27 | def setUp(self): 28 | if not socks: 29 | raise nose.SkipTest('socks module unavailable') 30 | if not subprocess: 31 | raise nose.SkipTest('subprocess module unavailable') 32 | 33 | # start a short-lived miniserver so we can get a likely port 34 | # for the proxy 35 | self.httpd, self.proxyport = miniserver.start_server( 36 | miniserver.ThisDirHandler) 37 | self.httpd.shutdown() 38 | self.httpd, self.port = miniserver.start_server( 39 | miniserver.ThisDirHandler) 40 | 41 | self.pidfile = tempfile.mktemp() 42 | self.logfile = tempfile.mktemp() 43 | fd, self.conffile = tempfile.mkstemp() 44 | f = os.fdopen(fd, 'w') 45 | our_cfg = tinyproxy_cfg % {'user': os.getlogin(), 46 | 'pidfile': self.pidfile, 47 | 'port': self.proxyport, 48 | 'logfile': self.logfile} 49 | f.write(our_cfg) 50 | f.close() 51 | try: 52 | # TODO use subprocess.check_call when 2.4 is dropped 53 | ret = 
subprocess.call(['tinyproxy', '-c', self.conffile]) 54 | self.assertEqual(0, ret) 55 | except OSError, e: 56 | if e.errno == errno.ENOENT: 57 | raise nose.SkipTest('tinyproxy not available') 58 | raise 59 | 60 | def tearDown(self): 61 | self.httpd.shutdown() 62 | try: 63 | pid = int(open(self.pidfile).read()) 64 | os.kill(pid, signal.SIGTERM) 65 | except OSError, e: 66 | if e.errno == errno.ESRCH: 67 | print '\n\n\nTinyProxy Failed to start, log follows:' 68 | print open(self.logfile).read() 69 | print 'end tinyproxy log\n\n\n' 70 | raise 71 | map(os.unlink, (self.pidfile, 72 | self.logfile, 73 | self.conffile)) 74 | 75 | def testSimpleProxy(self): 76 | proxy_info = httplib2.ProxyInfo(socks.PROXY_TYPE_HTTP, 77 | 'localhost', self.proxyport) 78 | client = httplib2.Http(proxy_info=proxy_info) 79 | src = 'miniserver.py' 80 | response, body = client.request('http://localhost:%d/%s' % 81 | (self.port, src)) 82 | self.assertEqual(response.status, 200) 83 | self.assertEqual(body, open(os.path.join(miniserver.HERE, src)).read()) 84 | lf = open(self.logfile).read() 85 | expect = ('Established connection to host "127.0.0.1" ' 86 | 'using file descriptor') 87 | self.assertTrue(expect in lf, 88 | 'tinyproxy did not proxy a request for miniserver') 89 | -------------------------------------------------------------------------------- /httplib2/test/miniserver.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import select 4 | import SimpleHTTPServer 5 | import SocketServer 6 | import threading 7 | 8 | HERE = os.path.dirname(__file__) 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class ThisDirHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): 13 | def translate_path(self, path): 14 | path = path.split('?', 1)[0].split('#', 1)[0] 15 | return os.path.join(HERE, *filter(None, path.split('/'))) 16 | 17 | def log_message(self, s, *args): 18 | # output via logging so nose can catch it 19 | logger.info(s, 
class ShutdownServer(SocketServer.TCPServer):
    """TCP server whose serve_forever loop can be shut down.

    The methods in this class are backported from SocketServer.py in the
    Python 2.6.4 standard library. They are unnecessary in 2.6 and later,
    when BaseServer supports the shutdown method directly.
    """

    def __init__(self, *args, **kwargs):
        SocketServer.TCPServer.__init__(self, *args, **kwargs)
        # Set once serve_forever() has left its loop; shutdown() waits on it.
        self.__is_shut_down = threading.Event()
        self.__serving = False

    def serve_forever(self, poll_interval=0.1):
        """Handle one request at a time until shutdown.

        Polls for shutdown every poll_interval seconds. Ignores
        self.timeout. If you need to do periodic tasks, do them in
        another thread.
        """
        self.__serving = True
        self.__is_shut_down.clear()
        try:
            while self.__serving:
                readable, _, _ = select.select(
                    [self.socket], [], [], poll_interval)
                if readable:
                    self._handle_request_noblock()
        finally:
            # Always release shutdown(), even when the loop dies with an
            # exception; otherwise the caller would block forever.
            self.__is_shut_down.set()

    def shutdown(self):
        """Stops the serve_forever loop.

        Blocks until the loop has finished. This must be called while
        serve_forever() is running in another thread, or it will deadlock.
        """
        self.__serving = False
        self.__is_shut_down.wait()

    def handle_request(self):
        """Handle one request, possibly blocking.

        Respects self.timeout.
        """
        # Support people who used socket.settimeout() to escape
        # handle_request before self.timeout was available.
        timeout = self.socket.gettimeout()
        if timeout is None:
            timeout = self.timeout
        elif self.timeout is not None:
            timeout = min(timeout, self.timeout)
        fd_sets = select.select([self], [], [], timeout)
        if not fd_sets[0]:
            self.handle_timeout()
            return
        self._handle_request_noblock()

    def _handle_request_noblock(self):
        """Handle one request, without blocking.

        I assume that select.select has returned that the socket is
        readable before this function was called, so there should be
        no risk of blocking in get_request().
        """
        # The module never imports ``socket``; without this import the
        # ``except`` clause below raised NameError instead of ignoring a
        # failed accept.
        import socket

        try:
            request, client_address = self.get_request()
        except socket.error:
            return
        if self.verify_request(request, client_address):
            try:
                self.process_request(request, client_address)
            except:
                # Deliberately broad: any handler failure is reported
                # through handle_error and the connection is closed.
                self.handle_error(request, client_address)
                self.close_request(request)


def start_server(handler):
    """Start a ShutdownServer on an OS-chosen port in a background thread.

    Returns a ``(httpd, port)`` tuple; call ``httpd.shutdown()`` to stop
    the serving thread.
    """
    httpd = ShutdownServer(("", 0), handler)
    threading.Thread(target=httpd.serve_forever).start()
    _, port = httpd.socket.getsockname()
    return httpd, port
23 | # Portions created by the Initial Developer are Copyright (C) 1994-2000 24 | # the Initial Developer. All Rights Reserved. 25 | # 26 | # Contributor(s): 27 | # 28 | # Alternatively, the contents of this file may be used under the terms of 29 | # either the GNU General Public License Version 2 or later (the "GPL"), or 30 | # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 31 | # in which case the provisions of the GPL or the LGPL are applicable instead 32 | # of those above. If you wish to allow use of your version of this file only 33 | # under the terms of either the GPL or the LGPL, and not to allow others to 34 | # use your version of this file under the terms of the MPL, indicate your 35 | # decision by deleting the provisions above and replace them with the notice 36 | # and other provisions required by the GPL or the LGPL. If you do not delete 37 | # the provisions above, a recipient may use your version of this file under 38 | # the terms of any one of the MPL, the GPL or the LGPL. 
39 | # 40 | # ***** END LICENSE BLOCK ***** 41 | 42 | 43 | Comodo CA Limited, CN=Trusted Certificate Services 44 | ================================================== 45 | 46 | -----BEGIN CERTIFICATE----- 47 | MIIEQzCCAyugAwIBAgIBATANBgkqhkiG9w0BAQUFADB/MQswCQYDVQQGEwJHQjEb 48 | MBkGA1UECAwSR3JlYXRlciBNYW5jaGVzdGVyMRAwDgYDVQQHDAdTYWxmb3JkMRow 49 | GAYDVQQKDBFDb21vZG8gQ0EgTGltaXRlZDElMCMGA1UEAwwcVHJ1c3RlZCBDZXJ0 50 | aWZpY2F0ZSBTZXJ2aWNlczAeFw0wNDAxMDEwMDAwMDBaFw0yODEyMzEyMzU5NTla 51 | MH8xCzAJBgNVBAYTAkdCMRswGQYDVQQIDBJHcmVhdGVyIE1hbmNoZXN0ZXIxEDAO 52 | BgNVBAcMB1NhbGZvcmQxGjAYBgNVBAoMEUNvbW9kbyBDQSBMaW1pdGVkMSUwIwYD 53 | VQQDDBxUcnVzdGVkIENlcnRpZmljYXRlIFNlcnZpY2VzMIIBIjANBgkqhkiG9w0B 54 | AQEFAAOCAQ8AMIIBCgKCAQEA33FvNlhTWvI2VFeAxHQIIO0Yfyod5jWaHiWsnOWW 55 | fnJSoBVC21ndZHoa0Lh73TkVvFVIxO06AOoxEbrycXQaZ7jPM8yoMa+j49d/vzMt 56 | TGo87IvDktJTdyR0nAducPy9C1t2ul/y/9c3S0pgePfw+spwtOpZqqPOSC+pw7IL 57 | fhdyFgymBwwbOM/JYrc/oJOlh0Hyt3BAd9i+FHzjqMB6juljatEPmsbS9Is6FARW 58 | 1O24zG71++IsWL1/T2sr92AkWCTOJu80kTrV44HQsvAEAtdbtz6SrGsSivnkBbA7 59 | kUlcsutT6vifR4buv5XAwAaf0lteERv0xwQ1KdJVXOTt6wIDAQABo4HJMIHGMB0G 60 | A1UdDgQWBBTFe1i97doladL3WRaoszLAeydb9DAOBgNVHQ8BAf8EBAMCAQYwDwYD 61 | VR0TAQH/BAUwAwEB/zCBgwYDVR0fBHwwejA8oDqgOIY2aHR0cDovL2NybC5jb21v 62 | ZG9jYS5jb20vVHJ1c3RlZENlcnRpZmljYXRlU2VydmljZXMuY3JsMDqgOKA2hjRo 63 | dHRwOi8vY3JsLmNvbW9kby5uZXQvVHJ1c3RlZENlcnRpZmljYXRlU2VydmljZXMu 64 | Y3JsMA0GCSqGSIb3DQEBBQUAA4IBAQDIk4E7ibSvuIQSTI3S8NtwuleGFTQQuS9/ 65 | HrCoiWChisJ3DFBKmwCL2Iv0QeLQg4pKHBQGsKNoBXAxMKdTmw7pSqBYaWcOrp32 66 | pSxBvzwGa+RZzG0Q8ZZvH9/0BAKkn0U+yNj6NkZEUD+Cl5EfKNsYEYwq5GWDVxIS 67 | jBc/lDb+XbDABHcTuPQV1T84zJQ6VdCsmPW6AF/ghhmBeC8owH7TzEIK9a5QoNE+ 68 | xqFx7D+gIIxmOom0jtTYsU0lR+4viMi14QVFwL4Ucd56/Y57fU0IlqUSc/Atyjcn 69 | dBInTMu2l+nZrghtWjlA3QVHdWpaIbOjGM9O9y5Xt5hwXsjEeLBi 70 | -----END CERTIFICATE----- 71 | -------------------------------------------------------------------------------- /httplib2/test/smoke_test.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | import httplib2 5 | 6 | from httplib2.test import miniserver 7 | 8 | 9 | class HttpSmokeTest(unittest.TestCase): 10 | def setUp(self): 11 | self.httpd, self.port = miniserver.start_server( 12 | miniserver.ThisDirHandler) 13 | 14 | def tearDown(self): 15 | self.httpd.shutdown() 16 | 17 | def testGetFile(self): 18 | client = httplib2.Http() 19 | src = 'miniserver.py' 20 | response, body = client.request('http://localhost:%d/%s' % 21 | (self.port, src)) 22 | self.assertEqual(response.status, 200) 23 | self.assertEqual(body, open(os.path.join(miniserver.HERE, src)).read()) 24 | -------------------------------------------------------------------------------- /httplib2/test/test_no_socket.py: -------------------------------------------------------------------------------- 1 | """Tests for httplib2 when the socket module is missing. 2 | 3 | This helps ensure compatibility with environments such as AppEngine. 4 | """ 5 | import os 6 | import sys 7 | import unittest 8 | 9 | import httplib2 10 | 11 | class MissingSocketTest(unittest.TestCase): 12 | def setUp(self): 13 | self._oldsocks = httplib2.socks 14 | httplib2.socks = None 15 | 16 | def tearDown(self): 17 | httplib2.socks = self._oldsocks 18 | 19 | def testProxyDisabled(self): 20 | proxy_info = httplib2.ProxyInfo('blah', 21 | 'localhost', 0) 22 | client = httplib2.Http(proxy_info=proxy_info) 23 | self.assertRaises(httplib2.ProxiesUnavailableError, 24 | client.request, 'http://localhost:-1/') 25 | -------------------------------------------------------------------------------- /index.yaml: -------------------------------------------------------------------------------- 1 | indexes: 2 | 3 | # AUTOGENERATED 4 | 5 | # This index.yaml is automatically updated whenever the dev_appserver 6 | # detects that a new type of query is run. 
If you want to manage the 7 | # index.yaml file manually, remove the above marker line (the line 8 | # saying "# AUTOGENERATED"). If you want to manage some indexes 9 | # manually, move them above the marker line. The index.yaml file is 10 | # automatically uploaded to the admin console when you next deploy 11 | # your application using appcfg.py. 12 | 13 | -------------------------------------------------------------------------------- /instructions.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Astrobot Instructions 6 | 15 | 16 | 17 | 18 |

Astrobot Instructions

19 |

Coming soon...

20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2007 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | import webapp2 18 | import uuid 19 | import os 20 | import base64 21 | import pickle 22 | from google.appengine.ext import ndb 23 | import browse 24 | from xml.sax import saxutils 25 | import json 26 | 27 | class State(ndb.Model): 28 | pickled = ndb.BlobProperty(compressed=True) 29 | 30 | def interact(query, stateid): 31 | state = State.get_or_insert(stateid) 32 | unpickled_state = pickle.loads(state.pickled) if state.pickled else {} 33 | messages = browse.interact(query, unpickled_state) 34 | print "MESSAGES", messages 35 | state.pickled = pickle.dumps(unpickled_state) 36 | state.put() 37 | return messages 38 | 39 | class MainHandler(webapp2.RequestHandler): 40 | def get(self): 41 | self.response.write(open('page.html').read()) 42 | 43 | class Interact(webapp2.RequestHandler): 44 | def post(self): 45 | query = self.request.get('query') 46 | stateid = self.request.cookies.get('stateid', None) 47 | if stateid == None: 48 | stateid = base64.b64encode(uuid.uuid4().bytes + os.urandom(64)) 49 | self.response.set_cookie('stateid', stateid, max_age=3600*20) 50 | self.response.write(json.dumps({"messages": interact(query, 
stateid)})) 51 | 52 | class Twilio(webapp2.RequestHandler): 53 | def post(self): 54 | from_phone = self.request.get('From') 55 | query = self.request.get('Body') 56 | messages = [] 57 | try: 58 | messages = interact(query, 'phone:'+from_phone) 59 | except Exception: 60 | messages.append("Oops, something went wrong.") 61 | self.response.content_type = 'text/xml' 62 | self.response.write('') 63 | for msg in messages: 64 | self.response.write(u"{0}".format(saxutils.escape(msg))) 65 | self.response.write("") 66 | 67 | 68 | class InstructionsHandler(webapp2.RequestHandler): 69 | def get(self): 70 | self.response.write(open('instructions.html').read()) 71 | 72 | app = webapp2.WSGIApplication([ 73 | ('/', MainHandler), 74 | ('/instructions', InstructionsHandler), 75 | ('/interact', Interact), 76 | ('/twilio', Twilio) 77 | ], debug=True) 78 | -------------------------------------------------------------------------------- /page.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Astrobot: browse the web with pure SMS 5 | 6 | 7 | 27 | 82 | 83 | 84 | 85 | 86 |

astro-bot

87 |

Browse the web over sms. Text 646-576-7688.

88 |

Things you can say...

89 |
    90 |
  • go to hackerschool.com
  • 91 |
  • 2 more
    sends 2 more messages of content
  • 92 |
  • 4
    clicks the link named '4'
  • 93 |
  • search the web for george harrison
  • 94 |
  • where am i?
    shows your current URL
  • 95 |
  • table of contents
    allows you to jump to specific headings on a page
  • 96 |
97 |
    98 | 99 |
100 |
101 | 102 | 103 |
# /parse_command.py
from wise import Phrase, parse_phrase

# Example utterances for every command the bot understands.  Each entry is
# Phrase(intent, items): `intent` names the command, and `items` spells out
# one sample wording as plain strings mixed with two-item [tag, sample text]
# lists.
# NOTE(review): how `wise` interprets the "*" and "~" tag prefixes is not
# visible from this file.  The "*"-prefixed tags ("*url", "*number") match
# the keys of `regexes` below; confirm the rest against the `wise` module.
examples = [
    # Opening a URL.
    Phrase("url", [["*url", "google.com"]]),
    Phrase("url", ["load", ["*url", "google.com"]]),
    Phrase("url", ["open", ["*url", "google.com"]]),
    Phrase("url", ["fetch", ["*url", "google.com"]]),
    Phrase("url", ["go to", ["*url", "google.com"]]),
    Phrase("url", ["show", ["*url", "google.com"]]),
    # Searching (the web, wikipedia, or the current site).
    Phrase("search", ["search", ["~query", "hacker school"]]),
    Phrase("search", ["google", ["~query", "weather 11215"]]),
    Phrase("search", ["search the web for", ["~query", "kanye west"]]),
    Phrase("search", ["search for", ["~query", "hello world"]]),
    Phrase("search", ["search", ["search_source/wikipedia", "wikipedia"], "for", ["~query", "praying mantis"]]),
    Phrase("search", [["search_source/wikipedia", "wikipedia"], ["~query", "android"]]),
    Phrase("search", [["search_source/wikipedia", "show me the wikipedia article for"], ["~query", "the grateful dead"]]),
    Phrase("search", ["search", ["search_source/this_site", "this site"], "for", ["~query", "contact us"]]),
    Phrase("search", ["find", ["~query", "support"], "on", ["search_source/this_site", "this site"]]),
    Phrase("search", [["~query", "barack obama"]]),
    # Paging through the text of the current page.
    Phrase("more_text", ["more"]),
    Phrase("more_text", [["*number", "2"], "more pages"]),
    Phrase("more_text", [["*number", "3"], "more pages"]),
    Phrase("more_text", ["next"]),
    Phrase("more_text", ["next", ["*number", "4"]]),
    Phrase("previous_text", ["previous"]),
    Phrase("previous_text", ["last", ["*number", "3"]]),
    Phrase("previous_text", ["previous", ["*number", "7"], "messages"]),
    Phrase("previous_text", ["last part"]),
    Phrase("back_to_top", ["back to top of page"]),
    # Following links.
    Phrase("navigate", ["click", ["*number", "6"]]),
    # NOTE(review): the bare-number example appears three times; kept as-is
    # in case the repetition is deliberate weighting -- confirm.
    Phrase("navigate", [["*number", "7"]]),
    Phrase("navigate", [["*number", "7"]]),
    Phrase("navigate", [["*number", "7"]]),
    Phrase("navigate", ["click link", ["target", "hvuiehguo"]]),
    Phrase("navigate", ["click", ["target", "ihenigo"], ["on_last_page", "on last page"]]),
    Phrase("navigate", ["load", ["target", "jegotghr"], ["on_last_page", "from previous page"]]),
    Phrase("show_navigation", ["show navigation"]),
    # Help.
    Phrase("help", ["help me"]),
    Phrase("help", ["what are the options"]),
    Phrase("help", ["what can I say?"]),
    # Summaries.
    Phrase("summarize", ["summarize this page"]),
    Phrase("summarize", ["summarize", ["*number", "2"]]),
    # Previously written as Phrase("show summary for", ["*number", "3"]),
    # which used the wording as the intent name and passed the tag and its
    # sample value as two plain strings.
    Phrase("summarize", ["show summary for", ["*number", "3"]]),
    # History, location and page outline.
    Phrase("back", ["back"]),
    Phrase("whereami", ["where am i?"]),
    Phrase("whereami", ["what page am i on?"]),
    Phrase("whereami", ["current site"]),
    Phrase("contents", ["show me the table of contents"]),
    Phrase("contents", ["zoom out"]),
    Phrase("contents", ["list the headings on the page"])
]

# Regular expressions keyed by tag name (without the "*" prefix).
regexes = {
    "url": r"[a-zA-Z0-9_\-\.]+\.[a-z]+(\/[^ ]*)?",
    "number": r"\-?[0-9]+(\.[0-9]+)?"
}


def parse_command(command_text):
    """Parse `command_text` against the example phrases.

    Returns whatever `wise.parse_phrase` returns for the given text,
    `examples` and `regexes`.
    """
    return parse_phrase(command_text, examples, regexes)
8 | 9 | from bing import Bing 10 | -------------------------------------------------------------------------------- /pybing/bing.py: -------------------------------------------------------------------------------- 1 | # This file is part of PyBing (http://pybing.googlecode.com). 2 | # 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/ 4 | # All rights reserved. 5 | # 6 | # This software is licensed as described in the file COPYING.txt, 7 | # which you should have received as part of this distribution. 8 | 9 | """ 10 | This module holds the Bing class which is used to create and execute queries 11 | against Bing. 12 | """ 13 | 14 | import urllib 15 | import urllib2 16 | 17 | # Issue #1 (http://code.google.com/p/pybing/issues/detail?id=1) 18 | # Python 2.6 has json built in, 2.5 needs simplejson 19 | try: 20 | import json 21 | except ImportError: 22 | import simplejson as json 23 | 24 | from pybing import constants 25 | 26 | class Bing(object): 27 | def __init__(self, app_id): 28 | self.app_id = app_id 29 | 30 | def search(self, query, source_type=None, api_version=None, extra_params=None, **kwargs): 31 | kwargs.update({ 32 | 'AppId': self.app_id, 33 | 'Version': api_version or constants.API_VERSION, 34 | 'Query': query, 35 | 'Sources': source_type or constants.DEFAULT_SOURCE_TYPE, 36 | }) 37 | 38 | if extra_params: 39 | kwargs.update(extra_params) 40 | 41 | query_string = urllib.urlencode(kwargs) 42 | contents = urllib2.urlopen(constants.JSON_ENDPOINT + '?' 
+ query_string) 43 | return json.loads(contents.read()) 44 | 45 | def search_web(self, query): 46 | return self.search(query, source_type=constants.WEB_SOURCE_TYPE) 47 | 48 | def search_image(self, query): 49 | return self.search(query, source_type=constants.IMAGE_SOURCE_TYPE) 50 | 51 | def search_news(self, query): 52 | return self.search(query, source_type=constants.NEWS_SOURCE_TYPE) 53 | 54 | def search_spell(self, query): 55 | return self.search(query, source_type=constants.SPELL_SOURCE_TYPE) 56 | 57 | def search_related(self, query): 58 | return self.search(query, source_type=constants.RELATED_SOURCE_TYPE) 59 | 60 | def search_phonebook(self, query): 61 | return self.search(query, source_type=constants.PHONEBOOK_SOURCE_TYPE) 62 | 63 | def search_answers(self, query): 64 | return self.search(query, source_type=constants.ANSWERS_SOURCE_TYPE) 65 | -------------------------------------------------------------------------------- /pybing/constants.py: -------------------------------------------------------------------------------- 1 | # This file is part of PyBing (http://pybing.googlecode.com). 2 | # 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/ 4 | # All rights reserved. 5 | # 6 | # This software is licensed as described in the file COPYING.txt, 7 | # which you should have received as part of this distribution. 8 | 9 | """ 10 | This module holds the constants used when querying Bing.
11 | """ 12 | 13 | API_VERSION = '2.0' 14 | JSON_ENDPOINT = 'http://api.search.live.net/json.aspx' 15 | MAX_PAGE_SIZE = 50 16 | MAX_RESULTS = 1000 17 | 18 | WEB_SOURCE_TYPE = 'Web' 19 | IMAGE_SOURCE_TYPE = 'Image' 20 | NEWS_SOURCE_TYPE = 'News' 21 | SPELL_SOURCE_TYPE = 'Spell' 22 | RELATED_SOURCE_TYPE = 'RelatedSearch' 23 | PHONEBOOK_SOURCE_TYPE = 'Phonebook' 24 | ANSWERS_SOURCE_TYPE = 'InstanceAnswer' 25 | 26 | SOURCE_TYPES = ( 27 | WEB_SOURCE_TYPE, 28 | IMAGE_SOURCE_TYPE, 29 | NEWS_SOURCE_TYPE, 30 | SPELL_SOURCE_TYPE, 31 | RELATED_SOURCE_TYPE, 32 | PHONEBOOK_SOURCE_TYPE, 33 | ANSWERS_SOURCE_TYPE, 34 | ) 35 | 36 | DEFAULT_SOURCE_TYPE = WEB_SOURCE_TYPE 37 | -------------------------------------------------------------------------------- /pybing/query/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is part of PyBing (http://pybing.googlecode.com). 2 | # 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/ 4 | # All rights reserved. 5 | # 6 | # This software is licensed as described in the file COPYING.txt, 7 | # which you should have received as part of this distribution. 8 | 9 | # Mixins 10 | from mixin import QueryMixin 11 | from pagable import Pagable 12 | 13 | # Base Query 14 | from query import BingQuery 15 | 16 | # Concrete Queries 17 | from web import WebQuery 18 | -------------------------------------------------------------------------------- /pybing/query/mixin.py: -------------------------------------------------------------------------------- 1 | # This file is part of PyBing (http://pybing.googlecode.com). 2 | # 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/ 4 | # All rights reserved. 5 | # 6 | # This software is licensed as described in the file COPYING.txt, 7 | # which you should have received as part of this distribution. 8 | 9 | """ 10 | This module holds the QueryMixin base class used for all queries. 
class QueryMixin(object):
    """
    Any methods that might be mixed into queries should extend this
    base class.

    It terminates the cooperative ``get_request_parameters`` chain: each
    class in a query's MRO calls ``super()`` and adds its own entries to the
    dict that comes back.
    """
    def get_request_parameters(self):
        """
        Return the request parameters contributed by the rest of the MRO,
        or an empty dict when nothing further along defines the method.
        """
        # Since we're mixing in, super() may or may not have the attribute
        sup = super(QueryMixin, self)
        if hasattr(sup, 'get_request_parameters'):
            return sup.get_request_parameters()

        return {}


# /pybing/query/pagable.py
# This file is part of PyBing (http://pybing.googlecode.com).
#
# Copyright (C) 2009 JJ Geewax http://geewax.org/
# All rights reserved.
#
# This software is licensed as described in the file COPYING.txt,
# which you should have received as part of this distribution.

"""
This module holds a mixin to specify a query class you can page through
using the count and offset parameter.
"""

# QueryMixin is defined directly above in this listing; as a stand-alone file
# pagable.py obtains it with ``from mixin import QueryMixin``.


class Pagable(QueryMixin):
    """
    This class is a mixin used with BingQuery classes to specify that
    queries can be paged through using the offset and count parameters.

    Some examples of Pagable requests are WebRequests and VideoRequests.
    Some non-Pagable requests are TranslationRequests and SearchRequests with
    the Spell source type.

    From the Bing API:
    - Count specifies the number of results to return per Request.
    - Offset specifies the offset requested, from zero, for the starting
      point of the result set to be returned for this Request.

    Note: This mixin currently supports only a single Source Type query.

    The class it is mixed into must provide ``SOURCE_TYPE`` (used to name the
    request parameters) and ``_clone()`` (used by the setters, which return a
    modified copy instead of mutating ``self``).
    """
    # Largest page size accepted by set_count().
    MAX_COUNT = 50
    # Largest offset accepted by set_offset(), and largest count + offset
    # accepted by execute().
    MAX_OFFSET = 1000

    def __init__(self, *args, **kwargs):
        self._count = None
        self._offset = 0
        super(Pagable, self).__init__(*args, **kwargs)

    def execute(self, *args, **kwargs):
        """
        Validate the paging window, then run the next ``execute`` in the MRO
        and return its result.

        Raises ValueError when count + offset exceeds ``MAX_OFFSET``.
        """
        if self.count and self.offset and self.count + self.offset > self.MAX_OFFSET:
            raise ValueError('Count + Offset must not exceed %d' % self.MAX_OFFSET)
        # The result used to be dropped here, so callers always got None.
        return super(Pagable, self).execute(*args, **kwargs)

    def get_request_parameters(self):
        """
        Add ``<SOURCE_TYPE>.Count`` / ``<SOURCE_TYPE>.Offset`` to the inherited
        parameters.  A count of None and an offset of 0 are left out.
        """
        params = super(Pagable, self).get_request_parameters()

        if self.count:
            params['%s.Count' % self.SOURCE_TYPE] = self.count

        if self.offset:
            params['%s.Offset' % self.SOURCE_TYPE] = self.offset

        return params

    @property
    def count(self):
        return self._count

    def set_count(self, value):
        """
        Return a clone of this query with its count set to `value`.

        `value` may be None (no explicit count) or an integer from 1 to
        ``MAX_COUNT`` inclusive; anything else raises ValueError.
        """
        if value is not None:
            if value < 1:
                raise ValueError('Count must be positive')

            elif value > self.MAX_COUNT:
                raise ValueError('Count must not exceed %d' % self.MAX_COUNT)

        obj = self._clone()
        obj._count = value
        return obj

    @property
    def offset(self):
        return self._offset

    def set_offset(self, value):
        """
        Return a clone of this query with its offset set to `value`.

        `value` must be between 0 and ``MAX_OFFSET`` inclusive; anything else
        raises ValueError.
        """
        if value < 0:
            raise ValueError('Offset must not be negative')

        elif value > self.MAX_OFFSET:
            raise ValueError('Offset must not exceed %d' % self.MAX_OFFSET)

        obj = self._clone()
        obj._offset = value
        return obj
import copy
import urllib

import httplib2

# Issue #1 (http://code.google.com/p/pybing/issues/detail?id=1)
# Python 2.6 has json built in, 2.5 needs simplejson
try:
    import json
except ImportError:
    import simplejson as json

from pybing import constants
from pybing.query.mixin import QueryMixin


class BingQuery(QueryMixin):
    """
    Base class for a query against the Bing JSON endpoint.

    Concrete queries set ``SOURCE_TYPE``.  Instances are treated as
    immutable: ``set_query`` returns a modified deep copy.
    """
    SOURCE_TYPE = None

    def __init__(self, app_id, query=None, version=None, *args, **kwargs):
        self.app_id = app_id
        self.version = version or constants.API_VERSION
        self._query = query

        # Needed for mixin's __init__'s to be called.
        super(BingQuery, self).__init__(*args, **kwargs)

    def set_query(self, query):
        """Return a clone of this query searching for `query`.

        Raises ValueError when `query` is empty or None.
        """
        if not query:
            raise ValueError('Query cannot be empty or None')

        obj = self._clone()
        obj._query = query
        return obj

    @property
    def query(self):
        return self._query

    def execute(self):
        """Return a BingResultSet wrapping this query.

        Raises ValueError when the query text or ``SOURCE_TYPE`` is unset.
        """
        # NOTE(review): this method does not call super().execute(), so for a
        # class declared as (BingQuery, Pagable) the count + offset check in
        # Pagable.execute is never reached -- confirm whether that is intended.
        if not self.query:
            raise ValueError('Query cannot be empty or None')

        elif not self.SOURCE_TYPE:
            raise ValueError('Source Type cannot be empty or None')

        # Imported here rather than at module level: pybing.resultset itself
        # imports BingQuery from pybing.query.
        from pybing.resultset import BingResultSet
        return BingResultSet(self)

    def get_request_parameters(self):
        """Return the inherited parameters plus AppId, Version, Query and Sources."""
        params = super(BingQuery, self).get_request_parameters()
        params.update({
            'AppId':    self.app_id,
            'Version':  self.version,
            'Query':    self.query,
            'Sources':  self.SOURCE_TYPE,
        })
        return params

    def get_request_url(self):
        """Return the JSON endpoint URL with the URL-encoded request parameters."""
        query_string = urllib.urlencode(self.get_request_parameters())
        return constants.JSON_ENDPOINT + '?' + query_string

    def get_search_response(self):
        """Fetch the request URL and return the decoded response section
        for this query's ``SOURCE_TYPE``."""
        contents = self._get_url_contents(self.get_request_url())
        return json.loads(contents)['SearchResponse'][self.SOURCE_TYPE]

    def get_search_results(self):
        """Return a list with one BingResult per entry of the response's 'Results'."""
        from pybing.result import BingResult
        response = self.get_search_response()
        return [BingResult(result) for result in response['Results']]

    def _get_url_contents(self, url):
        # The response object (status, headers) is not inspected.
        response, contents = httplib2.Http().request(url)
        return contents

    def _clone(self):
        """
        Do a deep copy of this object returning a clone that can be
        modified without affecting the old copy.
        """
        return copy.deepcopy(self)

    def __unicode__(self):
        return 'BingQuery: %s' % self.get_request_url()

    __str__ = __unicode__

    def __repr__(self):
        # Call __unicode__ directly rather than going through the
        # Python-2-only `unicode` builtin.
        return '<%s>' % self.__unicode__()


# /pybing/query/web.py
# This file is part of PyBing (http://pybing.googlecode.com).
#
# Copyright (C) 2009 JJ Geewax http://geewax.org/
# All rights reserved.
#
# This software is licensed as described in the file COPYING.txt,
# which you should have received as part of this distribution.

"""
This module holds the Bing WebQuery class used to do web searches against Bing.
"""

from pybing import constants
from pybing.query import BingQuery, Pagable


class WebQuery(BingQuery, Pagable):
    """A pagable BingQuery against the Web source type."""
    SOURCE_TYPE = constants.WEB_SOURCE_TYPE
class BingResult(object):
    """
    The base BingResult class corresponds to a single result from a Bing
    Query response.

    Every key of the result dict becomes a lower-cased attribute.
    """
    def __init__(self, result):
        if not isinstance(result, dict):
            raise TypeError('Invalid result type')

        self.load_from_dict(result)

    def load_from_dict(self, data):
        """Set one attribute per item of `data`, named after the lower-cased key."""
        # items() works on Python 2 and 3 dicts; iteritems() is Python 2 only.
        for key, value in data.items():
            setattr(self, key.lower(), value)

    def __repr__(self):
        # Used to return an empty string, which made results invisible in
        # logs and interactive sessions.
        return '<%s>' % type(self).__name__


# /pybing/resultset.py
# This file is part of PyBing (http://pybing.googlecode.com).
#
# Copyright (C) 2009 JJ Geewax http://geewax.org/
# All rights reserved.
#
# This software is licensed as described in the file COPYING.txt,
# which you should have received as part of this distribution.

"""
This module holds the logic for dealing with a set of results from a query.
"""

from pybing import constants
from pybing.query import BingQuery, Pagable


class BingResultSet(object):
    """
    This class corresponds to a set of results from a BingQuery.

    Results are fetched lazily: indexing requests a single result (cached in
    ``self.results`` by absolute offset) and iterating requests page by page.
    """
    def __init__(self, query, offset=0, count=None):
        if not isinstance(query, BingQuery):
            raise TypeError('query must be a BingQuery instance')

        self.query = query
        self.results = {}

        # These offset + count are used internally to signify whether or
        # not the query should be cut down (whether they've been sliced).
        self.offset, self.count = offset, count

    def get_offset(self, index=0):
        """Return the absolute offset of position `index` in this result set."""
        return self.query.offset + self.offset + index

    def __getitem__(self, key):
        """
        Allows you to grab an index or slice a query with array notation like
        resultset[4] or resultset[0:4]

        An integer key returns the result at that position (None when the
        request yields no result); a slice returns a new BingResultSet.
        Raises TypeError for non-Pagable queries or other key types, and
        IndexError for positions outside the result set.
        """
        if not isinstance(self.query, Pagable):
            raise TypeError('Array access only supported on Pagable Queries')

        if isinstance(key, int):
            # Negative positions would resolve to results before the start of
            # this (possibly sliced) set, and positions at or beyond a sliced
            # count to results after its end.
            if key < 0 or (self.count is not None and key >= self.count):
                raise IndexError(key)

            # The key has to take part in the offset; previously it was
            # ignored, so every index returned the first result.
            absolute_index = self.get_offset(key)
            if absolute_index < 0 or absolute_index >= constants.MAX_RESULTS:
                raise IndexError(key)

            if absolute_index not in self.results:
                # Make a copy of the query for only this one result:
                query = self.query.set_offset(absolute_index).set_count(1)
                results = query.get_search_results()
                if results:
                    self.results[absolute_index] = results[0]

            return self.results.get(absolute_index)

        elif isinstance(key, slice):
            # Return a new result set that is sliced internally (not the query)
            offset = key.start or 0
            if key.stop:
                count = key.stop - offset
            else:
                count = None
            return BingResultSet(self.query, self.offset + offset, count)

        else:
            raise TypeError('indices must be integers or slices')

    def __len__(self):
        """
        Returns the number of results if you were to iterate over this result set.
        This is at least 0 and at most 1000.
        """
        count = constants.MAX_RESULTS

        if self.count:
            count = self.count

        elif self.query.count:
            count = self.query.count

        if count > constants.MAX_RESULTS:
            count = constants.MAX_RESULTS

        # No explicit count: everything from the current offset to the cap.
        if count == constants.MAX_RESULTS:
            count = count - self.get_offset()

        return count

    def __iter__(self):
        """
        Allows you to iterate over the search results in the standard Python
        format such as
        for result in my_query.execute():
            print result.title, result.url
        """
        query = self.query.set_offset(self.get_offset())
        end_index = constants.MAX_RESULTS

        # If we've internally sliced out items
        if self.count:
            query = query.set_count(min(self.count, constants.MAX_PAGE_SIZE))
            end_index = self.get_offset() + self.count

            if end_index > constants.MAX_RESULTS:
                end_index = constants.MAX_RESULTS

        # If we want to just go until the end, grab them the most per page
        if not query.count:
            query = query.set_count(constants.MAX_PAGE_SIZE)

        while query.offset < end_index:
            # If we don't have a full page left, only grab up to the end
            remaining = end_index - query.offset
            if remaining and remaining < constants.MAX_PAGE_SIZE:
                query = query.set_count(remaining)

            results = query.get_search_results()
            if not results:
                # An empty page means the results are exhausted; without this
                # the loop kept issuing requests up to end_index.
                break

            # Yield back each result
            for result in results:
                yield result

            # Update the offset to move onto the next page
            query = query.set_offset(query.offset + query.count)

    def __repr__(self):
        # The format string used to be empty, so '%' raised TypeError
        # ("not all arguments converted") whenever repr() was called.
        return '<%s (%s)>' % (type(self).__name__, self.query)
~~~~~~~~~~~~~~~~~~~~~ 11 | 12 | Requests is an HTTP library, written in Python, for human beings. Basic GET 13 | usage: 14 | 15 | >>> import requests 16 | >>> r = requests.get('http://python.org') 17 | >>> r.status_code 18 | 200 19 | >>> 'Python is a programming language' in r.content 20 | True 21 | 22 | ... or POST: 23 | 24 | >>> payload = dict(key1='value1', key2='value2') 25 | >>> r = requests.post("http://httpbin.org/post", data=payload) 26 | >>> print(r.text) 27 | { 28 | ... 29 | "form": { 30 | "key2": "value2", 31 | "key1": "value1" 32 | }, 33 | ... 34 | } 35 | 36 | The other HTTP methods are supported - see `requests.api`. Full documentation 37 | is at . 38 | 39 | :copyright: (c) 2014 by Kenneth Reitz. 40 | :license: Apache 2.0, see LICENSE for more details. 41 | 42 | """ 43 | 44 | __title__ = 'requests' 45 | __version__ = '2.4.1' 46 | __build__ = 0x020401 47 | __author__ = 'Kenneth Reitz' 48 | __license__ = 'Apache 2.0' 49 | __copyright__ = 'Copyright 2014 Kenneth Reitz' 50 | 51 | # Attempt to enable urllib3's SNI support, if possible 52 | try: 53 | from .packages.urllib3.contrib import pyopenssl 54 | pyopenssl.inject_into_urllib3() 55 | except ImportError: 56 | pass 57 | 58 | from . import utils 59 | from .models import Request, Response, PreparedRequest 60 | from .api import request, get, head, post, patch, put, delete, options 61 | from .sessions import session, Session 62 | from .status_codes import codes 63 | from .exceptions import ( 64 | RequestException, Timeout, URLRequired, 65 | TooManyRedirects, HTTPError, ConnectionError 66 | ) 67 | 68 | # Set default logging handler to avoid "No handler found" warnings. 
69 | import logging 70 | try: # Python 2.7+ 71 | from logging import NullHandler 72 | except ImportError: 73 | class NullHandler(logging.Handler): 74 | def emit(self, record): 75 | pass 76 | 77 | logging.getLogger(__name__).addHandler(NullHandler()) 78 | -------------------------------------------------------------------------------- /requests/certs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | certs.py 6 | ~~~~~~~~ 7 | 8 | This module returns the preferred default CA certificate bundle. 9 | 10 | If you are packaging Requests, e.g., for a Linux distribution or a managed 11 | environment, you can change the definition of where() to return a separately 12 | packaged CA bundle. 13 | """ 14 | import os.path 15 | 16 | try: 17 | from certifi import where 18 | except ImportError: 19 | def where(): 20 | """Return the preferred certificate bundle.""" 21 | # vendored bundle inside Requests 22 | return os.path.join(os.path.dirname(__file__), 'cacert.pem') 23 | 24 | if __name__ == '__main__': 25 | print(where()) 26 | -------------------------------------------------------------------------------- /requests/compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | pythoncompat 5 | """ 6 | 7 | from .packages import chardet 8 | 9 | import sys 10 | 11 | # ------- 12 | # Pythons 13 | # ------- 14 | 15 | # Syntax sugar. 16 | _ver = sys.version_info 17 | 18 | #: Python 2.x? 19 | is_py2 = (_ver[0] == 2) 20 | 21 | #: Python 3.x? 
# ---------
# Platforms
# ---------


# Syntax sugar: lower-cased interpreter banner, searched for the name of the
# running Python implementation.
_ver = sys.version.lower()

is_pypy = ('pypy' in _ver)
is_jython = ('jython' in _ver)
is_ironpython = ('iron' in _ver)

# Assume CPython, if nothing else.
is_cpython = not any((is_pypy, is_jython, is_ironpython))

# Lower-cased platform identifier, computed once for all the checks below.
_platform = str(sys.platform).lower()

# Windows-based system.
is_windows = 'win32' in _platform

# Standard Linux 2+ system.
is_linux = ('linux' in _platform)
is_osx = ('darwin' in _platform)
is_hpux = ('hpux' in _platform)   # Complete guess.
# The previous test looked for the literal 'solar==', which can never occur in
# a platform identifier, so the flag was always False. Solaris reports itself
# as 'sunos5'; 'solaris' is accepted as well in case a port uses that name.
is_solaris = ('sunos' in _platform or 'solaris' in _platform)
class RequestException(IOError):
    """An ambiguous error occurred while handling a request.

    Instances expose the ``request`` and ``response`` objects that were
    passed to the constructor (``None`` when absent).
    """

    def __init__(self, *args, **kwargs):
        """Initialize the exception with optional `request`/`response` objects.

        Both keywords are removed from ``kwargs``; everything else is handed
        to ``IOError`` unchanged. When no truthy ``request`` is supplied, the
        ``request`` attribute of ``response`` is used if it has one.
        """
        resp = kwargs.pop('response', None)
        req = kwargs.pop('request', None)

        self.response = resp
        self.request = req

        # Fall back to the request attached to the response.
        if resp is not None and not req and hasattr(resp, 'request'):
            self.request = resp.request

        super(RequestException, self).__init__(*args, **kwargs)
""" 84 | 85 | 86 | class ChunkedEncodingError(RequestException): 87 | """The server declared chunked encoding but sent an invalid chunk.""" 88 | 89 | 90 | class ContentDecodingError(RequestException, BaseHTTPError): 91 | """Failed to decode response content""" 92 | -------------------------------------------------------------------------------- /requests/hooks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | requests.hooks 5 | ~~~~~~~~~~~~~~ 6 | 7 | This module provides the capabilities for the Requests hooks system. 8 | 9 | Available hooks: 10 | 11 | ``response``: 12 | The response generated from a Request. 13 | 14 | """ 15 | 16 | 17 | HOOKS = ['response'] 18 | 19 | 20 | def default_hooks(): 21 | hooks = {} 22 | for event in HOOKS: 23 | hooks[event] = [] 24 | return hooks 25 | 26 | # TODO: response is the only one 27 | 28 | 29 | def dispatch_hook(key, hooks, hook_data, **kwargs): 30 | """Dispatches a hook dictionary on a given piece of data.""" 31 | 32 | hooks = hooks or dict() 33 | 34 | if key in hooks: 35 | hooks = hooks.get(key) 36 | 37 | if hasattr(hooks, '__call__'): 38 | hooks = [hooks] 39 | 40 | for hook in hooks: 41 | _hook_data = hook(hook_data, **kwargs) 42 | if _hook_data is not None: 43 | hook_data = _hook_data 44 | 45 | return hook_data 46 | -------------------------------------------------------------------------------- /requests/packages/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from . 
def detect(aBuf):
    """Run a universal detector over ``aBuf`` and return its result.

    ``aBuf`` must be a byte string: on Python 2 a ``unicode`` object is
    rejected, on Python 3 anything that is not ``bytes`` is rejected.

    Raises ``ValueError`` when ``aBuf`` has the wrong type.
    """
    if version_info < (3, 0):
        wrong_type = isinstance(aBuf, unicode)
    else:
        wrong_type = not isinstance(aBuf, bytes)
    if wrong_type:
        raise ValueError('Expected a bytes object, not a unicode object')

    # Imported lazily, only once the input has been validated.
    from . import universaldetector
    detector = universaldetector.UniversalDetector()
    detector.reset()
    detector.feed(aBuf)
    detector.close()
    return detector.result
class Big5Prober(MultiByteCharSetProber):
    """Multi-byte prober wired up with the Big5 state machine and analyzer."""

    def __init__(self):
        """Attach the Big5 coding state machine and distribution analyzer."""
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(Big5SMModel)
        self._mDistributionAnalyzer = Big5DistributionAnalysis()
        # reset() is invoked last, once both collaborators are assigned.
        self.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober looks for."""
        return "Big5"
def description_of(file, name='stdin'):
    """Return a string describing the probable encoding of a file.

    :param file: iterable whose items are fed to the detector one by one,
        typically a file object opened in binary mode.
    :param name: label used in the returned description.
    :returns: ``'<name>: <encoding> with confidence <confidence>'``, or
        ``'<name>: no result'`` when the detector reports no encoding.
    """
    detector = UniversalDetector()
    for line in file:
        detector.feed(line)
    detector.close()
    result = detector.result
    if result['encoding']:
        return '%s: %s with confidence %s' % (name,
                                              result['encoding'],
                                              result['confidence'])
    return '%s: no result' % name


def main():
    """Print the detected encoding of each path in argv, or of stdin."""
    if len(argv) <= 1:
        # Files named on the command line are opened in binary mode, so read
        # stdin as bytes too. On Python 3 sys.stdin is a text stream whose
        # underlying binary stream is ``stdin.buffer``; Python 2's stdin has
        # no such attribute and already yields byte strings.
        print(description_of(getattr(stdin, 'buffer', stdin)))
    else:
        for path in argv[1:]:
            with open(path, 'rb') as f:
                print(description_of(f, path))


if __name__ == '__main__':
    main()
class CharSetGroupProber(CharSetProber):
    """Prober that fans input out to a list of child probers.

    It keeps count of the children that are still active and remembers the
    child that currently looks like the best guess.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        self._mActiveNum = 0
        self._mProbers = []
        self._mBestGuessProber = None

    def reset(self):
        """Reset and re-activate every child prober; forget the best guess."""
        CharSetProber.reset(self)
        self._mActiveNum = 0
        for child in self._mProbers:
            if not child:
                continue
            child.reset()
            child.active = True
            self._mActiveNum += 1
        self._mBestGuessProber = None

    def get_charset_name(self):
        """Return the best-guess child's charset name, or None without one."""
        best = self._mBestGuessProber
        if not best:
            # get_confidence() re-elects the best guess as a side effect.
            self.get_confidence()
            best = self._mBestGuessProber
        if not best:
            return None
        return best.get_charset_name()

    def feed(self, aBuf):
        """Feed ``aBuf`` to each active child and return this prober's state.

        A child reporting ``eFoundIt`` becomes the best guess and ends the
        pass. A child reporting ``eNotMe`` is deactivated; when no active
        child remains this prober's own state becomes ``eNotMe``.
        """
        for child in self._mProbers:
            if not child or not child.active:
                continue
            state = child.feed(aBuf)
            if not state:
                continue
            if state == constants.eFoundIt:
                self._mBestGuessProber = child
                return self.get_state()
            if state != constants.eNotMe:
                continue
            child.active = False
            self._mActiveNum -= 1
            if self._mActiveNum <= 0:
                self._mState = constants.eNotMe
                return self.get_state()
        return self.get_state()

    def get_confidence(self):
        """Return the highest confidence among the active child probers.

        Short-circuits to 0.99 / 0.01 when this prober's own state is
        ``eFoundIt`` / ``eNotMe``. Otherwise the best guess is re-elected
        from scratch and 0.0 is returned when no child qualifies.
        """
        state = self.get_state()
        if state == constants.eFoundIt:
            return 0.99
        if state == constants.eNotMe:
            return 0.01

        top = 0.0
        self._mBestGuessProber = None
        for child in self._mProbers:
            if not child:
                continue
            if not child.active:
                if constants._debug:
                    sys.stderr.write(child.get_charset_name()
                                     + ' not active\n')
                continue
            confidence = child.get_confidence()
            if constants._debug:
                sys.stderr.write('%s confidence = %s\n' %
                                 (child.get_charset_name(), confidence))
            if top < confidence:
                top = confidence
                self._mBestGuessProber = child
        return top if self._mBestGuessProber else 0.0
class CharSetProber:
    """Base prober: neutral defaults plus byte-filtering helpers."""

    def __init__(self):
        pass

    def reset(self):
        """Put the prober back into the ``eDetecting`` state."""
        self._mState = constants.eDetecting

    def get_charset_name(self):
        """Return the detected charset name; the base class has none."""
        return None

    def feed(self, aBuf):
        """Consume a chunk of input; the base class ignores it."""
        pass

    def get_state(self):
        """Return the state stored by reset() (or assigned by a subclass)."""
        return self._mState

    def get_confidence(self):
        """Return the detection confidence; always 0.0 in the base class."""
        return 0.0

    def filter_high_bit_only(self, aBuf):
        """Return ``aBuf`` with each run of bytes 0x00-0x7F made one space."""
        return re.sub(b'([\x00-\x7F])+', b' ', aBuf)

    def filter_without_english_letters(self, aBuf):
        """Return ``aBuf`` with each run of A-Z / a-z bytes made one space."""
        return re.sub(b'([A-Za-z])+', b' ', aBuf)

    def filter_with_english_letters(self, aBuf):
        """Return ``aBuf`` unchanged (filtering is not implemented)."""
        # TODO
        return aBuf
class CodingStateMachine:
    """Table-driven state machine advanced one input byte at a time.

    The model is a mapping that provides 'classTable', 'charLenTable',
    'classFactor', 'stateTable' and 'name'.
    """

    def __init__(self, sm):
        self._mModel = sm
        self._mCurrentBytePos = 0
        self._mCurrentCharLen = 0
        self.reset()

    def reset(self):
        """Return to the start state."""
        self._mCurrentState = eStart

    def next_state(self, c):
        """Consume byte ``c`` and return the state the machine moves to."""
        model = self._mModel
        # for each byte we get its class
        # if it is first byte, we also get byte length
        # PY3K: aBuf is a byte stream, so c is an int, not a byte
        byte_class = model['classTable'][wrap_ord(c)]
        if self._mCurrentState == eStart:
            self._mCurrentBytePos = 0
            self._mCurrentCharLen = model['charLenTable'][byte_class]
        # from byte's class and stateTable, we get its next state
        table_index = self._mCurrentState * model['classFactor'] + byte_class
        self._mCurrentState = model['stateTable'][table_index]
        self._mCurrentBytePos += 1
        return self._mCurrentState

    def get_current_charlen(self):
        """Return the length recorded for the character being read."""
        return self._mCurrentCharLen

    def get_coding_state_machine(self):
        """Return the 'name' entry of the model driving this machine."""
        return self._mModel['name']
# String types of the running interpreter; ``unicode`` is only looked up on
# Python 2, where it exists.
if sys.version_info >= (3, 0):
    base_str = (bytes, str)
else:
    base_str = (str, unicode)


def wrap_ord(a):
    """Return ``ord(a)`` for a Python 2 string, otherwise ``a`` unchanged.

    On Python 3 the argument is always returned as is.
    """
    if sys.version_info >= (3, 0) or not isinstance(a, base_str):
        return a
    return ord(a)
# Truthy value switches on the probers' diagnostic output.
_debug = 0

# States reported by the probers.
eDetecting, eFoundIt, eNotMe = 0, 1, 2

# States returned by the coding state machines.
eStart, eError, eItsMe = 0, 1, 2

# Confidence above which a prober may declare a match early.
SHORTCUT_THRESHOLD = 0.95
class CP949Prober(MultiByteCharSetProber):
    """Multi-byte prober wired up with the CP949 state machine."""

    def __init__(self):
        """Attach the CP949 state machine and the EUC-KR analyzer."""
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(CP949SMModel)
        # NOTE: CP949 is a superset of EUC-KR, so the distribution should be
        #       not different.
        self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
        self.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober looks for."""
        return "CP949"
3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from . 
class EscCharSetProber(CharSetProber):
    """Prober that runs the HZ and ISO-2022 (CN/JP/KR) state machines."""

    def __init__(self):
        CharSetProber.__init__(self)
        models = (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
                  ISO2022KRSMModel)
        self._mCodingSM = [CodingStateMachine(model) for model in models]
        self.reset()

    def reset(self):
        """Re-activate every state machine and clear the detected charset."""
        CharSetProber.reset(self)
        for machine in self._mCodingSM:
            if machine:
                machine.active = True
                machine.reset()
        self._mActiveSM = len(self._mCodingSM)
        self._mDetectedCharset = None

    def get_charset_name(self):
        """Return the detected charset name, or None before a detection."""
        return self._mDetectedCharset

    def get_confidence(self):
        """Return 0.99 once a charset has been detected, else 0.00."""
        return 0.99 if self._mDetectedCharset else 0.00

    def feed(self, aBuf):
        """Push every byte of ``aBuf`` through the active state machines.

        A machine that hits ``eError`` is deactivated; once none is left the
        prober state becomes ``eNotMe``. The first machine to report
        ``eItsMe`` sets the detected charset and the ``eFoundIt`` state.
        Returns the prober state.
        """
        for byte in aBuf:
            # PY3K: aBuf is a byte array, so c is an int, not a byte
            for machine in self._mCodingSM:
                if not machine or not machine.active:
                    continue
                outcome = machine.next_state(wrap_ord(byte))
                if outcome == constants.eError:
                    machine.active = False
                    self._mActiveSM -= 1
                    if self._mActiveSM <= 0:
                        self._mState = constants.eNotMe
                        return self.get_state()
                elif outcome == constants.eItsMe:
                    self._mState = constants.eFoundIt
                    self._mDetectedCharset = machine.get_coding_state_machine()  # nopep8
                    return self.get_state()

        return self.get_state()
######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | import sys 29 | from . 
import constants
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCJPDistributionAnalysis
from .jpcntx import EUCJPContextAnalysis
from .mbcssm import EUCJPSMModel


class EUCJPProber(MultiByteCharSetProber):
    """Multi-byte prober for EUC-JP.

    Uses the EUC-JP coding state machine together with a distribution
    analyzer and a context analyzer; the confidence reported is the larger
    of the two analyzers' confidences.
    """

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(EUCJPSMModel)
        self._mDistributionAnalyzer = EUCJPDistributionAnalysis()
        self._mContextAnalyzer = EUCJPContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the inherited prober state and the context analyzer."""
        MultiByteCharSetProber.reset(self)
        self._mContextAnalyzer.reset()

    def get_charset_name(self):
        """Return the fixed charset name ``"EUC-JP"``."""
        return "EUC-JP"

    def feed(self, aBuf):
        """Run ``aBuf`` through the state machine and both analyzers.

        :param aBuf: buffer to examine; it is indexed and sliced by position.
            An empty buffer is accepted and leaves the saved last byte alone.
        :returns: the prober state after the buffer has been consumed.
        """
        aLen = len(aBuf)
        for i in range(0, aLen):
            # PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
            codingState = self._mCodingSM.next_state(aBuf[i])
            if codingState == constants.eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name()
                                     + ' prober hit error at byte ' + str(i)
                                     + '\n')
                self._mState = constants.eNotMe
                break
            elif codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == constants.eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # Pair the first byte with the byte saved at the end of
                    # the previous call.
                    self._mLastChar[1] = aBuf[0]
                    self._mContextAnalyzer.feed(self._mLastChar, charLen)
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
                    self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
                                                     charLen)

        # An empty buffer has no last byte; indexing it raised IndexError.
        if aLen:
            self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            if (self._mContextAnalyzer.got_enough_data() and
                    (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the larger of the context and distribution confidences."""
        contxtCf = self._mContextAnalyzer.get_confidence()
        distribCf = self._mDistributionAnalyzer.get_confidence()
        return max(contxtCf, distribCf)

# --------------------------------------------------------------------------------
# /requests/packages/chardet/euckrprober.py:
# --------------------------------------------------------------------------------
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCKRDistributionAnalysis
from .mbcssm import EUCKRSMModel


class EUCKRProber(MultiByteCharSetProber):
    """Multi-byte prober wired to the EUC-KR state machine model."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        state_machine = CodingStateMachine(EUCKRSMModel)
        analyzer = EUCKRDistributionAnalysis()
        self._mCodingSM = state_machine
        self._mDistributionAnalyzer = analyzer
        self.reset()

    def get_charset_name(self):
        """Return the fixed charset name ``"EUC-KR"``."""
        return "EUC-KR"

# --------------------------------------------------------------------------------
# /requests/packages/chardet/euctwprober.py:
# --------------------------------------------------------------------------------
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCTWDistributionAnalysis
from .mbcssm import EUCTWSMModel


class EUCTWProber(MultiByteCharSetProber):
    """Multi-byte prober wired to the EUC-TW state machine model."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        state_machine = CodingStateMachine(EUCTWSMModel)
        analyzer = EUCTWDistributionAnalysis()
        self._mCodingSM = state_machine
        self._mDistributionAnalyzer = analyzer
        self.reset()

    def get_charset_name(self):
        """Return the fixed charset name ``"EUC-TW"``."""
        return "EUC-TW"

# --------------------------------------------------------------------------------
# /requests/packages/chardet/gb2312prober.py:
# --------------------------------------------------------------------------------
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import GB2312DistributionAnalysis
from .mbcssm import GB2312SMModel


class GB2312Prober(MultiByteCharSetProber):
    """Multi-byte prober wired to the GB2312 state machine model."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        state_machine = CodingStateMachine(GB2312SMModel)
        analyzer = GB2312DistributionAnalysis()
        self._mCodingSM = state_machine
        self._mDistributionAnalyzer = analyzer
        self.reset()

    def get_charset_name(self):
        """Return the fixed charset name ``"GB2312"``."""
        return "GB2312"

# --------------------------------------------------------------------------------
# /requests/packages/chardet/mbcharsetprober.py:
# --------------------------------------------------------------------------------
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
# Proofpoint, Inc.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

import sys
from . import constants
from .charsetprober import CharSetProber


class MultiByteCharSetProber(CharSetProber):
    """Base prober combining a coding state machine with a distribution analyzer.

    ``__init__`` leaves ``_mCodingSM`` and ``_mDistributionAnalyzer`` as
    None; ``feed`` and ``get_confidence`` use both, so they must be
    assigned before those methods are called.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        self._mDistributionAnalyzer = None
        self._mCodingSM = None
        # [byte saved from the previous feed() call, first byte of this one]
        self._mLastChar = [0, 0]

    def reset(self):
        """Reset the prober, whichever helpers are set, and the saved bytes."""
        CharSetProber.reset(self)
        if self._mCodingSM:
            self._mCodingSM.reset()
        if self._mDistributionAnalyzer:
            self._mDistributionAnalyzer.reset()
        self._mLastChar = [0, 0]

    def get_charset_name(self):
        """Placeholder that returns None."""
        pass

    def feed(self, aBuf):
        """Run ``aBuf`` through the state machine and distribution analyzer.

        :param aBuf: buffer to examine; it is indexed and sliced by position.
            An empty buffer is accepted and leaves the saved last byte alone.
        :returns: the prober state after the buffer has been consumed.
        """
        aLen = len(aBuf)
        for i in range(0, aLen):
            codingState = self._mCodingSM.next_state(aBuf[i])
            if codingState == constants.eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name()
                                     + ' prober hit error at byte ' + str(i)
                                     + '\n')
                self._mState = constants.eNotMe
                break
            elif codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == constants.eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # Pair the first byte with the byte saved at the end of
                    # the previous call.
                    self._mLastChar[1] = aBuf[0]
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
                                                     charLen)

        # An empty buffer has no last byte; indexing it raised IndexError.
        if aLen:
            self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            if (self._mDistributionAnalyzer.got_enough_data() and
                    (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the distribution analyzer's confidence."""
        return self._mDistributionAnalyzer.get_confidence()

# --------------------------------------------------------------------------------
# /requests/packages/chardet/mbcsgroupprober.py:
# --------------------------------------------------------------------------------
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
# Proofpoint, Inc.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from .charsetgroupprober import CharSetGroupProber
from .utf8prober import UTF8Prober
from .sjisprober import SJISProber
from .eucjpprober import EUCJPProber
from .gb2312prober import GB2312Prober
from .euckrprober import EUCKRProber
from .cp949prober import CP949Prober
from .big5prober import Big5Prober
from .euctwprober import EUCTWProber


class MBCSGroupProber(CharSetGroupProber):
    """Group prober holding one instance of each multi-byte prober."""

    def __init__(self):
        CharSetGroupProber.__init__(self)
        # Instantiated in this order; the order is part of the list built.
        prober_types = (
            UTF8Prober,
            SJISProber,
            EUCJPProber,
            GB2312Prober,
            EUCKRProber,
            CP949Prober,
            Big5Prober,
            EUCTWProber,
        )
        self._mProbers = [prober_type() for prober_type in prober_types]
        self.reset()

# --------------------------------------------------------------------------------
# /requests/packages/chardet/sbcsgroupprober.py:
# --------------------------------------------------------------------------------
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from .charsetgroupprober import CharSetGroupProber
from .sbcharsetprober import SingleByteCharSetProber
from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
                                Latin5CyrillicModel, MacCyrillicModel,
                                Ibm866Model, Ibm855Model)
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
from .langthaimodel import TIS620ThaiModel
from .langhebrewmodel import Win1255HebrewModel
from .hebrewprober import HebrewProber


class SBCSGroupProber(CharSetGroupProber):
    """Group prober holding one single-byte prober per language model.

    The Hebrew model is registered twice, once with ``False`` and once with
    ``True`` as the second constructor argument, and both of those probers
    share one ``HebrewProber``.
    """

    def __init__(self):
        CharSetGroupProber.__init__(self)
        language_models = (
            Win1251CyrillicModel,
            Koi8rModel,
            Latin5CyrillicModel,
            MacCyrillicModel,
            Ibm866Model,
            Ibm855Model,
            Latin7GreekModel,
            Win1253GreekModel,
            Latin5BulgarianModel,
            Win1251BulgarianModel,
            Latin2HungarianModel,
            Win1250HungarianModel,
            TIS620ThaiModel,
        )
        self._mProbers = [SingleByteCharSetProber(model)
                          for model in language_models]

        # The two Win1255 probers are handed to the shared HebrewProber.
        hebrew = HebrewProber()
        logical = SingleByteCharSetProber(Win1255HebrewModel, False, hebrew)
        visual = SingleByteCharSetProber(Win1255HebrewModel, True, hebrew)
        hebrew.set_model_probers(logical, visual)
        self._mProbers += [hebrew, logical, visual]

        self.reset()

# --------------------------------------------------------------------------------
# /requests/packages/chardet/sjisprober.py:
# --------------------------------------------------------------------------------
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | import sys 29 | from .mbcharsetprober import MultiByteCharSetProber 30 | from .codingstatemachine import CodingStateMachine 31 | from .chardistribution import SJISDistributionAnalysis 32 | from .jpcntx import SJISContextAnalysis 33 | from .mbcssm import SJISSMModel 34 | from . import constants 35 | 36 | 37 | class SJISProber(MultiByteCharSetProber): 38 | def __init__(self): 39 | MultiByteCharSetProber.__init__(self) 40 | self._mCodingSM = CodingStateMachine(SJISSMModel) 41 | self._mDistributionAnalyzer = SJISDistributionAnalysis() 42 | self._mContextAnalyzer = SJISContextAnalysis() 43 | self.reset() 44 | 45 | def reset(self): 46 | MultiByteCharSetProber.reset(self) 47 | self._mContextAnalyzer.reset() 48 | 49 | def get_charset_name(self): 50 | return "SHIFT_JIS" 51 | 52 | def feed(self, aBuf): 53 | aLen = len(aBuf) 54 | for i in range(0, aLen): 55 | codingState = self._mCodingSM.next_state(aBuf[i]) 56 | if codingState == constants.eError: 57 | if constants._debug: 58 | sys.stderr.write(self.get_charset_name() 59 | + ' prober hit error at byte ' + str(i) 60 | + '\n') 61 | self._mState = constants.eNotMe 62 | break 63 | elif codingState == constants.eItsMe: 64 | self._mState = constants.eFoundIt 65 | break 66 | elif codingState == constants.eStart: 67 | charLen = self._mCodingSM.get_current_charlen() 68 | if i == 0: 69 | self._mLastChar[1] = aBuf[0] 70 | self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:], 71 | charLen) 72 | self._mDistributionAnalyzer.feed(self._mLastChar, charLen) 73 | else: 74 | self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3 75 | - charLen], charLen) 76 | self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], 77 | 
charLen) 78 | 79 | self._mLastChar[0] = aBuf[aLen - 1] 80 | 81 | if self.get_state() == constants.eDetecting: 82 | if (self._mContextAnalyzer.got_enough_data() and 83 | (self.get_confidence() > constants.SHORTCUT_THRESHOLD)): 84 | self._mState = constants.eFoundIt 85 | 86 | return self.get_state() 87 | 88 | def get_confidence(self): 89 | contxtCf = self._mContextAnalyzer.get_confidence() 90 | distribCf = self._mDistributionAnalyzer.get_confidence() 91 | return max(contxtCf, distribCf) 92 | -------------------------------------------------------------------------------- /requests/packages/chardet/utf8prober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from . 
import constants 29 | from .charsetprober import CharSetProber 30 | from .codingstatemachine import CodingStateMachine 31 | from .mbcssm import UTF8SMModel 32 | 33 | ONE_CHAR_PROB = 0.5 34 | 35 | 36 | class UTF8Prober(CharSetProber): 37 | def __init__(self): 38 | CharSetProber.__init__(self) 39 | self._mCodingSM = CodingStateMachine(UTF8SMModel) 40 | self.reset() 41 | 42 | def reset(self): 43 | CharSetProber.reset(self) 44 | self._mCodingSM.reset() 45 | self._mNumOfMBChar = 0 46 | 47 | def get_charset_name(self): 48 | return "utf-8" 49 | 50 | def feed(self, aBuf): 51 | for c in aBuf: 52 | codingState = self._mCodingSM.next_state(c) 53 | if codingState == constants.eError: 54 | self._mState = constants.eNotMe 55 | break 56 | elif codingState == constants.eItsMe: 57 | self._mState = constants.eFoundIt 58 | break 59 | elif codingState == constants.eStart: 60 | if self._mCodingSM.get_current_charlen() >= 2: 61 | self._mNumOfMBChar += 1 62 | 63 | if self.get_state() == constants.eDetecting: 64 | if self.get_confidence() > constants.SHORTCUT_THRESHOLD: 65 | self._mState = constants.eFoundIt 66 | 67 | return self.get_state() 68 | 69 | def get_confidence(self): 70 | unlike = 0.99 71 | if self._mNumOfMBChar < 6: 72 | for i in range(0, self._mNumOfMBChar): 73 | unlike = unlike * ONE_CHAR_PROB 74 | return 1.0 - unlike 75 | else: 76 | return unlike 77 | -------------------------------------------------------------------------------- /requests/packages/urllib3/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib3 - Thread-safe connection pooling and re-using. 3 | """ 4 | 5 | __author__ = 'Andrey Petrov (andrey.petrov@shazow.net)' 6 | __license__ = 'MIT' 7 | __version__ = 'dev' 8 | 9 | 10 | from .connectionpool import ( 11 | HTTPConnectionPool, 12 | HTTPSConnectionPool, 13 | connection_from_url 14 | ) 15 | 16 | from . 
import exceptions 17 | from .filepost import encode_multipart_formdata 18 | from .poolmanager import PoolManager, ProxyManager, proxy_from_url 19 | from .response import HTTPResponse 20 | from .util.request import make_headers 21 | from .util.url import get_host 22 | from .util.timeout import Timeout 23 | from .util.retry import Retry 24 | 25 | 26 | # Set default logging handler to avoid "No handler found" warnings. 27 | import logging 28 | try: # Python 2.7+ 29 | from logging import NullHandler 30 | except ImportError: 31 | class NullHandler(logging.Handler): 32 | def emit(self, record): 33 | pass 34 | 35 | logging.getLogger(__name__).addHandler(NullHandler()) 36 | 37 | def add_stderr_logger(level=logging.DEBUG): 38 | """ 39 | Helper for quickly adding a StreamHandler to the logger. Useful for 40 | debugging. 41 | 42 | Returns the handler after adding it. 43 | """ 44 | # This method needs to be in this __init__.py to get the __name__ correct 45 | # even if urllib3 is vendored within another package. 46 | logger = logging.getLogger(__name__) 47 | handler = logging.StreamHandler() 48 | handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s')) 49 | logger.addHandler(handler) 50 | logger.setLevel(level) 51 | logger.debug('Added a stderr logging handler to logger: %s' % __name__) 52 | return handler 53 | 54 | # ... Clean up. 55 | del NullHandler 56 | 57 | 58 | # Set security warning to only go off once by default. 59 | import warnings 60 | warnings.simplefilter('module', exceptions.SecurityWarning) 61 | 62 | def disable_warnings(category=exceptions.HTTPWarning): 63 | """ 64 | Helper for quickly disabling all urllib3 warnings. 
65 | """ 66 | warnings.simplefilter('ignore', category) 67 | -------------------------------------------------------------------------------- /requests/packages/urllib3/contrib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/requests/packages/urllib3/contrib/__init__.py -------------------------------------------------------------------------------- /requests/packages/urllib3/exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | ## Base Exceptions 3 | 4 | class HTTPError(Exception): 5 | "Base exception used by this module." 6 | pass 7 | 8 | class HTTPWarning(Warning): 9 | "Base warning used by this module." 10 | pass 11 | 12 | 13 | 14 | class PoolError(HTTPError): 15 | "Base exception for errors caused within a pool." 16 | def __init__(self, pool, message): 17 | self.pool = pool 18 | HTTPError.__init__(self, "%s: %s" % (pool, message)) 19 | 20 | def __reduce__(self): 21 | # For pickling purposes. 22 | return self.__class__, (None, None) 23 | 24 | 25 | class RequestError(PoolError): 26 | "Base exception for PoolErrors that have associated URLs." 27 | def __init__(self, pool, url, message): 28 | self.url = url 29 | PoolError.__init__(self, pool, message) 30 | 31 | def __reduce__(self): 32 | # For pickling purposes. 33 | return self.__class__, (None, self.url, None) 34 | 35 | 36 | class SSLError(HTTPError): 37 | "Raised when SSL certificate fails in an HTTPS connection." 38 | pass 39 | 40 | 41 | class ProxyError(HTTPError): 42 | "Raised when the connection to a proxy fails." 43 | pass 44 | 45 | 46 | class DecodeError(HTTPError): 47 | "Raised when automatic decoding based on Content-Type fails." 48 | pass 49 | 50 | 51 | class ProtocolError(HTTPError): 52 | "Raised when something unexpected happens mid-request/response." 
53 | pass 54 | 55 | 56 | #: Renamed to ProtocolError but aliased for backwards compatibility. 57 | ConnectionError = ProtocolError 58 | 59 | 60 | ## Leaf Exceptions 61 | 62 | class MaxRetryError(RequestError): 63 | """Raised when the maximum number of retries is exceeded. 64 | 65 | :param pool: The connection pool 66 | :type pool: :class:`~urllib3.connectionpool.HTTPConnectionPool` 67 | :param string url: The requested Url 68 | :param exceptions.Exception reason: The underlying error 69 | 70 | """ 71 | 72 | def __init__(self, pool, url, reason=None): 73 | self.reason = reason 74 | 75 | message = "Max retries exceeded with url: %s" % url 76 | if reason: 77 | message += " (Caused by %r)" % reason 78 | else: 79 | message += " (Caused by redirect)" 80 | 81 | RequestError.__init__(self, pool, url, message) 82 | 83 | 84 | class HostChangedError(RequestError): 85 | "Raised when an existing pool gets a request for a foreign host." 86 | 87 | def __init__(self, pool, url, retries=3): 88 | message = "Tried to open a foreign host with url: %s" % url 89 | RequestError.__init__(self, pool, url, message) 90 | self.retries = retries 91 | 92 | 93 | class TimeoutStateError(HTTPError): 94 | """ Raised when passing an invalid state to a timeout """ 95 | pass 96 | 97 | 98 | class TimeoutError(HTTPError): 99 | """ Raised when a socket timeout error occurs. 100 | 101 | Catching this error will catch both :exc:`ReadTimeoutErrors 102 | ` and :exc:`ConnectTimeoutErrors `. 
103 | """ 104 | pass 105 | 106 | 107 | class ReadTimeoutError(TimeoutError, RequestError): 108 | "Raised when a socket timeout occurs while receiving data from a server" 109 | pass 110 | 111 | 112 | # This timeout error does not have a URL attached and needs to inherit from the 113 | # base HTTPError 114 | class ConnectTimeoutError(TimeoutError): 115 | "Raised when a socket timeout occurs while connecting to a server" 116 | pass 117 | 118 | 119 | class EmptyPoolError(PoolError): 120 | "Raised when a pool runs out of connections and no more are allowed." 121 | pass 122 | 123 | 124 | class ClosedPoolError(PoolError): 125 | "Raised when a request enters a pool after the pool has been closed." 126 | pass 127 | 128 | 129 | class LocationValueError(ValueError, HTTPError): 130 | "Raised when there is something wrong with a given URL input." 131 | pass 132 | 133 | 134 | class LocationParseError(LocationValueError): 135 | "Raised when get_host or similar fails to parse the URL input." 136 | 137 | def __init__(self, location): 138 | message = "Failed to parse: %s" % location 139 | HTTPError.__init__(self, message) 140 | 141 | self.location = location 142 | 143 | 144 | class SecurityWarning(HTTPWarning): 145 | "Warned when perfoming security reducing actions" 146 | pass 147 | 148 | 149 | class InsecureRequestWarning(SecurityWarning): 150 | "Warned when making an unverified HTTPS request." 
151 | pass 152 | 153 | 154 | class SystemTimeWarning(SecurityWarning): 155 | "Warned when system time is suspected to be wrong" 156 | pass 157 | -------------------------------------------------------------------------------- /requests/packages/urllib3/filepost.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | from uuid import uuid4 4 | from io import BytesIO 5 | 6 | from .packages import six 7 | from .packages.six import b 8 | from .fields import RequestField 9 | 10 | writer = codecs.lookup('utf-8')[3] 11 | 12 | 13 | def choose_boundary(): 14 | """ 15 | Our embarassingly-simple replacement for mimetools.choose_boundary. 16 | """ 17 | return uuid4().hex 18 | 19 | 20 | def iter_field_objects(fields): 21 | """ 22 | Iterate over fields. 23 | 24 | Supports list of (k, v) tuples and dicts, and lists of 25 | :class:`~urllib3.fields.RequestField`. 26 | 27 | """ 28 | if isinstance(fields, dict): 29 | i = six.iteritems(fields) 30 | else: 31 | i = iter(fields) 32 | 33 | for field in i: 34 | if isinstance(field, RequestField): 35 | yield field 36 | else: 37 | yield RequestField.from_tuples(*field) 38 | 39 | 40 | def iter_fields(fields): 41 | """ 42 | .. deprecated:: 1.6 43 | 44 | Iterate over fields. 45 | 46 | The addition of :class:`~urllib3.fields.RequestField` makes this function 47 | obsolete. Instead, use :func:`iter_field_objects`, which returns 48 | :class:`~urllib3.fields.RequestField` objects. 49 | 50 | Supports list of (k, v) tuples and dicts. 51 | """ 52 | if isinstance(fields, dict): 53 | return ((k, v) for k, v in six.iteritems(fields)) 54 | 55 | return ((k, v) for k, v in fields) 56 | 57 | 58 | def encode_multipart_formdata(fields, boundary=None): 59 | """ 60 | Encode a dictionary of ``fields`` using the multipart/form-data MIME format. 61 | 62 | :param fields: 63 | Dictionary of fields or list of (key, :class:`~urllib3.fields.RequestField`). 
64 | 65 | :param boundary: 66 | If not specified, then a random boundary will be generated using 67 | :func:`mimetools.choose_boundary`. 68 | """ 69 | body = BytesIO() 70 | if boundary is None: 71 | boundary = choose_boundary() 72 | 73 | for field in iter_field_objects(fields): 74 | body.write(b('--%s\r\n' % (boundary))) 75 | 76 | writer(body).write(field.render_headers()) 77 | data = field.data 78 | 79 | if isinstance(data, int): 80 | data = str(data) # Backwards compatibility 81 | 82 | if isinstance(data, six.text_type): 83 | writer(body).write(data) 84 | else: 85 | body.write(data) 86 | 87 | body.write(b'\r\n') 88 | 89 | body.write(b('--%s--\r\n' % (boundary))) 90 | 91 | content_type = str('multipart/form-data; boundary=%s' % boundary) 92 | 93 | return body.getvalue(), content_type 94 | -------------------------------------------------------------------------------- /requests/packages/urllib3/packages/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from . import ssl_match_hostname 4 | 5 | -------------------------------------------------------------------------------- /requests/packages/urllib3/packages/ssl_match_hostname/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | # Python 3.2+ 3 | from ssl import CertificateError, match_hostname 4 | except ImportError: 5 | try: 6 | # Backport of the function from a pypi module 7 | from backports.ssl_match_hostname import CertificateError, match_hostname 8 | except ImportError: 9 | # Our vendored copy 10 | from ._implementation import CertificateError, match_hostname 11 | 12 | # Not needed, but documenting what we provide. 
13 | __all__ = ('CertificateError', 'match_hostname') 14 | -------------------------------------------------------------------------------- /requests/packages/urllib3/packages/ssl_match_hostname/_implementation.py: -------------------------------------------------------------------------------- 1 | """The match_hostname() function from Python 3.3.3, essential when using SSL.""" 2 | 3 | # Note: This file is under the PSF license as the code comes from the python 4 | # stdlib. http://docs.python.org/3/license.html 5 | 6 | import re 7 | 8 | __version__ = '3.4.0.2' 9 | 10 | class CertificateError(ValueError): 11 | pass 12 | 13 | 14 | def _dnsname_match(dn, hostname, max_wildcards=1): 15 | """Matching according to RFC 6125, section 6.4.3 16 | 17 | http://tools.ietf.org/html/rfc6125#section-6.4.3 18 | """ 19 | pats = [] 20 | if not dn: 21 | return False 22 | 23 | # Ported from python3-syntax: 24 | # leftmost, *remainder = dn.split(r'.') 25 | parts = dn.split(r'.') 26 | leftmost = parts[0] 27 | remainder = parts[1:] 28 | 29 | wildcards = leftmost.count('*') 30 | if wildcards > max_wildcards: 31 | # Issue #17980: avoid denials of service by refusing more 32 | # than one wildcard per fragment. A survey of established 33 | # policy among SSL implementations showed it to be a 34 | # reasonable choice. 35 | raise CertificateError( 36 | "too many wildcards in certificate DNS name: " + repr(dn)) 37 | 38 | # speed up common case w/o wildcards 39 | if not wildcards: 40 | return dn.lower() == hostname.lower() 41 | 42 | # RFC 6125, section 6.4.3, subitem 1. 43 | # The client SHOULD NOT attempt to match a presented identifier in which 44 | # the wildcard character comprises a label other than the left-most label. 45 | if leftmost == '*': 46 | # When '*' is a fragment by itself, it matches a non-empty dotless 47 | # fragment. 48 | pats.append('[^.]+') 49 | elif leftmost.startswith('xn--') or hostname.startswith('xn--'): 50 | # RFC 6125, section 6.4.3, subitem 3. 
51 | # The client SHOULD NOT attempt to match a presented identifier 52 | # where the wildcard character is embedded within an A-label or 53 | # U-label of an internationalized domain name. 54 | pats.append(re.escape(leftmost)) 55 | else: 56 | # Otherwise, '*' matches any dotless string, e.g. www* 57 | pats.append(re.escape(leftmost).replace(r'\*', '[^.]*')) 58 | 59 | # add the remaining fragments, ignore any wildcards 60 | for frag in remainder: 61 | pats.append(re.escape(frag)) 62 | 63 | pat = re.compile(r'\A' + r'\.'.join(pats) + r'\Z', re.IGNORECASE) 64 | return pat.match(hostname) 65 | 66 | 67 | def match_hostname(cert, hostname): 68 | """Verify that *cert* (in decoded format as returned by 69 | SSLSocket.getpeercert()) matches the *hostname*. RFC 2818 and RFC 6125 70 | rules are followed, but IP addresses are not accepted for *hostname*. 71 | 72 | CertificateError is raised on failure. On success, the function 73 | returns nothing. 74 | """ 75 | if not cert: 76 | raise ValueError("empty or no certificate") 77 | dnsnames = [] 78 | san = cert.get('subjectAltName', ()) 79 | for key, value in san: 80 | if key == 'DNS': 81 | if _dnsname_match(value, hostname): 82 | return 83 | dnsnames.append(value) 84 | if not dnsnames: 85 | # The subject is only checked when there is no dNSName entry 86 | # in subjectAltName 87 | for sub in cert.get('subject', ()): 88 | for key, value in sub: 89 | # XXX according to RFC 2818, the most specific Common Name 90 | # must be used. 
91 | if key == 'commonName': 92 | if _dnsname_match(value, hostname): 93 | return 94 | dnsnames.append(value) 95 | if len(dnsnames) > 1: 96 | raise CertificateError("hostname %r " 97 | "doesn't match either of %s" 98 | % (hostname, ', '.join(map(repr, dnsnames)))) 99 | elif len(dnsnames) == 1: 100 | raise CertificateError("hostname %r " 101 | "doesn't match %r" 102 | % (hostname, dnsnames[0])) 103 | else: 104 | raise CertificateError("no appropriate commonName or " 105 | "subjectAltName fields were found") 106 | -------------------------------------------------------------------------------- /requests/packages/urllib3/util/__init__.py: -------------------------------------------------------------------------------- 1 | # For backwards compatibility, provide imports that used to be here. 2 | from .connection import is_connection_dropped 3 | from .request import make_headers 4 | from .response import is_fp_closed 5 | from .ssl_ import ( 6 | SSLContext, 7 | HAS_SNI, 8 | assert_fingerprint, 9 | resolve_cert_reqs, 10 | resolve_ssl_version, 11 | ssl_wrap_socket, 12 | ) 13 | from .timeout import ( 14 | current_time, 15 | Timeout, 16 | ) 17 | 18 | from .retry import Retry 19 | from .url import ( 20 | get_host, 21 | parse_url, 22 | split_first, 23 | Url, 24 | ) 25 | -------------------------------------------------------------------------------- /requests/packages/urllib3/util/connection.py: -------------------------------------------------------------------------------- 1 | import socket 2 | try: 3 | from select import poll, POLLIN 4 | except ImportError: # `poll` doesn't exist on OSX and other platforms 5 | poll = False 6 | try: 7 | from select import select 8 | except ImportError: # `select` doesn't exist on AppEngine. 9 | select = False 10 | 11 | 12 | def is_connection_dropped(conn): # Platform-specific 13 | """ 14 | Returns True if the connection is dropped and should be closed. 15 | 16 | :param conn: 17 | :class:`httplib.HTTPConnection` object. 
18 | 19 | Note: For platforms like AppEngine, this will always return ``False`` to 20 | let the platform handle connection recycling transparently for us. 21 | """ 22 | sock = getattr(conn, 'sock', False) 23 | if sock is False: # Platform-specific: AppEngine 24 | return False 25 | if sock is None: # Connection already closed (such as by httplib). 26 | return True 27 | 28 | if not poll: 29 | if not select: # Platform-specific: AppEngine 30 | return False 31 | 32 | try: 33 | return select([sock], [], [], 0.0)[0] 34 | except socket.error: 35 | return True 36 | 37 | # This version is better on platforms that support it. 38 | p = poll() 39 | p.register(sock, POLLIN) 40 | for (fno, ev) in p.poll(0.0): 41 | if fno == sock.fileno(): 42 | # Either data is buffered (bad), or the connection is dropped. 43 | return True 44 | 45 | 46 | # This function is copied from socket.py in the Python 2.7 standard 47 | # library test suite. Added to its signature is only `socket_options`. 48 | def create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, 49 | source_address=None, socket_options=None): 50 | """Connect to *address* and return the socket object. 51 | 52 | Convenience function. Connect to *address* (a 2-tuple ``(host, 53 | port)``) and return the socket object. Passing the optional 54 | *timeout* parameter will set the timeout on the socket instance 55 | before attempting to connect. If no *timeout* is supplied, the 56 | global default timeout setting returned by :func:`getdefaulttimeout` 57 | is used. If *source_address* is set it must be a tuple of (host, port) 58 | for the socket to bind as a source address before making the connection. 59 | An host of '' or port 0 tells the OS to use the default. 
60 | """ 61 | 62 | host, port = address 63 | err = None 64 | for res in socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM): 65 | af, socktype, proto, canonname, sa = res 66 | sock = None 67 | try: 68 | sock = socket.socket(af, socktype, proto) 69 | 70 | # If provided, set socket level options before connecting. 71 | # This is the only addition urllib3 makes to this function. 72 | _set_socket_options(sock, socket_options) 73 | 74 | if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: 75 | sock.settimeout(timeout) 76 | if source_address: 77 | sock.bind(source_address) 78 | sock.connect(sa) 79 | return sock 80 | 81 | except socket.error as _: 82 | err = _ 83 | if sock is not None: 84 | sock.close() 85 | 86 | if err is not None: 87 | raise err 88 | else: 89 | raise socket.error("getaddrinfo returns an empty list") 90 | 91 | 92 | def _set_socket_options(sock, options): 93 | if options is None: 94 | return 95 | 96 | for opt in options: 97 | sock.setsockopt(*opt) 98 | -------------------------------------------------------------------------------- /requests/packages/urllib3/util/request.py: -------------------------------------------------------------------------------- 1 | from base64 import b64encode 2 | 3 | from ..packages.six import b 4 | 5 | ACCEPT_ENCODING = 'gzip,deflate' 6 | 7 | 8 | def make_headers(keep_alive=None, accept_encoding=None, user_agent=None, 9 | basic_auth=None, proxy_basic_auth=None, disable_cache=None): 10 | """ 11 | Shortcuts for generating request headers. 12 | 13 | :param keep_alive: 14 | If ``True``, adds 'connection: keep-alive' header. 15 | 16 | :param accept_encoding: 17 | Can be a boolean, list, or string. 18 | ``True`` translates to 'gzip,deflate'. 19 | List will get joined by comma. 20 | String will be used as provided. 21 | 22 | :param user_agent: 23 | String representing the user-agent you want, such as 24 | "python-urllib3/0.6" 25 | 26 | :param basic_auth: 27 | Colon-separated username:password string for 'authorization: basic ...' 
28 | auth header. 29 | 30 | :param proxy_basic_auth: 31 | Colon-separated username:password string for 'proxy-authorization: basic ...' 32 | auth header. 33 | 34 | :param disable_cache: 35 | If ``True``, adds 'cache-control: no-cache' header. 36 | 37 | Example:: 38 | 39 | >>> make_headers(keep_alive=True, user_agent="Batman/1.0") 40 | {'connection': 'keep-alive', 'user-agent': 'Batman/1.0'} 41 | >>> make_headers(accept_encoding=True) 42 | {'accept-encoding': 'gzip,deflate'} 43 | """ 44 | headers = {} 45 | if accept_encoding: 46 | if isinstance(accept_encoding, str): 47 | pass 48 | elif isinstance(accept_encoding, list): 49 | accept_encoding = ','.join(accept_encoding) 50 | else: 51 | accept_encoding = ACCEPT_ENCODING 52 | headers['accept-encoding'] = accept_encoding 53 | 54 | if user_agent: 55 | headers['user-agent'] = user_agent 56 | 57 | if keep_alive: 58 | headers['connection'] = 'keep-alive' 59 | 60 | if basic_auth: 61 | headers['authorization'] = 'Basic ' + \ 62 | b64encode(b(basic_auth)).decode('utf-8') 63 | 64 | if proxy_basic_auth: 65 | headers['proxy-authorization'] = 'Basic ' + \ 66 | b64encode(b(proxy_basic_auth)).decode('utf-8') 67 | 68 | if disable_cache: 69 | headers['cache-control'] = 'no-cache' 70 | 71 | return headers 72 | -------------------------------------------------------------------------------- /requests/packages/urllib3/util/response.py: -------------------------------------------------------------------------------- 1 | def is_fp_closed(obj): 2 | """ 3 | Checks whether a given file-like object is closed. 4 | 5 | :param obj: 6 | The file-like object to check. 7 | """ 8 | 9 | try: 10 | # Check via the official file-like-object way. 11 | return obj.closed 12 | except AttributeError: 13 | pass 14 | 15 | try: 16 | # Check if the object is a container for another file-like object that 17 | # gets released on exhaustion (e.g. HTTPResponse). 
18 | return obj.fp is None 19 | except AttributeError: 20 | pass 21 | 22 | raise ValueError("Unable to determine whether fp is closed.") 23 | -------------------------------------------------------------------------------- /requests/packages/urllib3/util/ssl_.py: -------------------------------------------------------------------------------- 1 | from binascii import hexlify, unhexlify 2 | from hashlib import md5, sha1 3 | 4 | from ..exceptions import SSLError 5 | 6 | 7 | try: # Test for SSL features 8 | SSLContext = None 9 | HAS_SNI = False 10 | 11 | import ssl 12 | from ssl import wrap_socket, CERT_NONE, PROTOCOL_SSLv23 13 | from ssl import SSLContext # Modern SSL? 14 | from ssl import HAS_SNI # Has SNI? 15 | except ImportError: 16 | pass 17 | 18 | 19 | def assert_fingerprint(cert, fingerprint): 20 | """ 21 | Checks if given fingerprint matches the supplied certificate. 22 | 23 | :param cert: 24 | Certificate as bytes object. 25 | :param fingerprint: 26 | Fingerprint as string of hexdigits, can be interspersed by colons. 27 | """ 28 | 29 | # Maps the length of a digest to a possible hash function producing 30 | # this digest. 31 | hashfunc_map = { 32 | 16: md5, 33 | 20: sha1 34 | } 35 | 36 | fingerprint = fingerprint.replace(':', '').lower() 37 | digest_length, odd = divmod(len(fingerprint), 2) 38 | 39 | if odd or digest_length not in hashfunc_map: 40 | raise SSLError('Fingerprint is of invalid length.') 41 | 42 | # We need encode() here for py32; works on py2 and p33. 43 | fingerprint_bytes = unhexlify(fingerprint.encode()) 44 | 45 | hashfunc = hashfunc_map[digest_length] 46 | 47 | cert_digest = hashfunc(cert).digest() 48 | 49 | if not cert_digest == fingerprint_bytes: 50 | raise SSLError('Fingerprints did not match. Expected "{0}", got "{1}".' 
51 | .format(hexlify(fingerprint_bytes), 52 | hexlify(cert_digest))) 53 | 54 | 55 | def resolve_cert_reqs(candidate): 56 | """ 57 | Resolves the argument to a numeric constant, which can be passed to 58 | the wrap_socket function/method from the ssl module. 59 | Defaults to :data:`ssl.CERT_NONE`. 60 | If given a string it is assumed to be the name of the constant in the 61 | :mod:`ssl` module or its abbrevation. 62 | (So you can specify `REQUIRED` instead of `CERT_REQUIRED`. 63 | If it's neither `None` nor a string we assume it is already the numeric 64 | constant which can directly be passed to wrap_socket. 65 | """ 66 | if candidate is None: 67 | return CERT_NONE 68 | 69 | if isinstance(candidate, str): 70 | res = getattr(ssl, candidate, None) 71 | if res is None: 72 | res = getattr(ssl, 'CERT_' + candidate) 73 | return res 74 | 75 | return candidate 76 | 77 | 78 | def resolve_ssl_version(candidate): 79 | """ 80 | like resolve_cert_reqs 81 | """ 82 | if candidate is None: 83 | return PROTOCOL_SSLv23 84 | 85 | if isinstance(candidate, str): 86 | res = getattr(ssl, candidate, None) 87 | if res is None: 88 | res = getattr(ssl, 'PROTOCOL_' + candidate) 89 | return res 90 | 91 | return candidate 92 | 93 | 94 | if SSLContext is not None: # Python 3.2+ 95 | def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None, 96 | ca_certs=None, server_hostname=None, 97 | ssl_version=None): 98 | """ 99 | All arguments except `server_hostname` have the same meaning as for 100 | :func:`ssl.wrap_socket` 101 | 102 | :param server_hostname: 103 | Hostname of the expected certificate 104 | """ 105 | context = SSLContext(ssl_version) 106 | context.verify_mode = cert_reqs 107 | 108 | # Disable TLS compression to migitate CRIME attack (issue #309) 109 | OP_NO_COMPRESSION = 0x20000 110 | context.options |= OP_NO_COMPRESSION 111 | 112 | if ca_certs: 113 | try: 114 | context.load_verify_locations(ca_certs) 115 | # Py32 raises IOError 116 | # Py33 raises FileNotFoundError 117 | 
except Exception as e: # Reraise as SSLError 118 | raise SSLError(e) 119 | if certfile: 120 | # FIXME: This block needs a test. 121 | context.load_cert_chain(certfile, keyfile) 122 | if HAS_SNI: # Platform-specific: OpenSSL with enabled SNI 123 | return context.wrap_socket(sock, server_hostname=server_hostname) 124 | return context.wrap_socket(sock) 125 | 126 | else: # Python 3.1 and earlier 127 | def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None, 128 | ca_certs=None, server_hostname=None, 129 | ssl_version=None): 130 | return wrap_socket(sock, keyfile=keyfile, certfile=certfile, 131 | ca_certs=ca_certs, cert_reqs=cert_reqs, 132 | ssl_version=ssl_version) 133 | -------------------------------------------------------------------------------- /requests/status_codes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .structures import LookupDict 4 | 5 | _codes = { 6 | 7 | # Informational. 8 | 100: ('continue',), 9 | 101: ('switching_protocols',), 10 | 102: ('processing',), 11 | 103: ('checkpoint',), 12 | 122: ('uri_too_long', 'request_uri_too_long'), 13 | 200: ('ok', 'okay', 'all_ok', 'all_okay', 'all_good', '\\o/', '✓'), 14 | 201: ('created',), 15 | 202: ('accepted',), 16 | 203: ('non_authoritative_info', 'non_authoritative_information'), 17 | 204: ('no_content',), 18 | 205: ('reset_content', 'reset'), 19 | 206: ('partial_content', 'partial'), 20 | 207: ('multi_status', 'multiple_status', 'multi_stati', 'multiple_stati'), 21 | 208: ('already_reported',), 22 | 226: ('im_used',), 23 | 24 | # Redirection. 
25 | 300: ('multiple_choices',), 26 | 301: ('moved_permanently', 'moved', '\\o-'), 27 | 302: ('found',), 28 | 303: ('see_other', 'other'), 29 | 304: ('not_modified',), 30 | 305: ('use_proxy',), 31 | 306: ('switch_proxy',), 32 | 307: ('temporary_redirect', 'temporary_moved', 'temporary'), 33 | 308: ('permanent_redirect', 34 | 'resume_incomplete', 'resume',), # These 2 to be removed in 3.0 35 | 36 | # Client Error. 37 | 400: ('bad_request', 'bad'), 38 | 401: ('unauthorized',), 39 | 402: ('payment_required', 'payment'), 40 | 403: ('forbidden',), 41 | 404: ('not_found', '-o-'), 42 | 405: ('method_not_allowed', 'not_allowed'), 43 | 406: ('not_acceptable',), 44 | 407: ('proxy_authentication_required', 'proxy_auth', 'proxy_authentication'), 45 | 408: ('request_timeout', 'timeout'), 46 | 409: ('conflict',), 47 | 410: ('gone',), 48 | 411: ('length_required',), 49 | 412: ('precondition_failed', 'precondition'), 50 | 413: ('request_entity_too_large',), 51 | 414: ('request_uri_too_large',), 52 | 415: ('unsupported_media_type', 'unsupported_media', 'media_type'), 53 | 416: ('requested_range_not_satisfiable', 'requested_range', 'range_not_satisfiable'), 54 | 417: ('expectation_failed',), 55 | 418: ('im_a_teapot', 'teapot', 'i_am_a_teapot'), 56 | 422: ('unprocessable_entity', 'unprocessable'), 57 | 423: ('locked',), 58 | 424: ('failed_dependency', 'dependency'), 59 | 425: ('unordered_collection', 'unordered'), 60 | 426: ('upgrade_required', 'upgrade'), 61 | 428: ('precondition_required', 'precondition'), 62 | 429: ('too_many_requests', 'too_many'), 63 | 431: ('header_fields_too_large', 'fields_too_large'), 64 | 444: ('no_response', 'none'), 65 | 449: ('retry_with', 'retry'), 66 | 450: ('blocked_by_windows_parental_controls', 'parental_controls'), 67 | 451: ('unavailable_for_legal_reasons', 'legal_reasons'), 68 | 499: ('client_closed_request',), 69 | 70 | # Server Error. 
71 | 500: ('internal_server_error', 'server_error', '/o\\', '✗'), 72 | 501: ('not_implemented',), 73 | 502: ('bad_gateway',), 74 | 503: ('service_unavailable', 'unavailable'), 75 | 504: ('gateway_timeout',), 76 | 505: ('http_version_not_supported', 'http_version'), 77 | 506: ('variant_also_negotiates',), 78 | 507: ('insufficient_storage',), 79 | 509: ('bandwidth_limit_exceeded', 'bandwidth'), 80 | 510: ('not_extended',), 81 | } 82 | 83 | codes = LookupDict(name='status_codes') 84 | 85 | for (code, titles) in list(_codes.items()): 86 | for title in titles: 87 | setattr(codes, title, code) 88 | if not title.startswith('\\'): 89 | setattr(codes, title.upper(), code) 90 | -------------------------------------------------------------------------------- /requests/structures.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | requests.structures 5 | ~~~~~~~~~~~~~~~~~~~ 6 | 7 | Data structures that power Requests. 8 | 9 | """ 10 | 11 | import collections 12 | 13 | 14 | class CaseInsensitiveDict(collections.MutableMapping): 15 | """ 16 | A case-insensitive ``dict``-like object. 17 | 18 | Implements all methods and operations of 19 | ``collections.MutableMapping`` as well as dict's ``copy``. Also 20 | provides ``lower_items``. 21 | 22 | All keys are expected to be strings. The structure remembers the 23 | case of the last key to be set, and ``iter(instance)``, 24 | ``keys()``, ``items()``, ``iterkeys()``, and ``iteritems()`` 25 | will contain case-sensitive keys. However, querying and contains 26 | testing is case insensitive:: 27 | 28 | cid = CaseInsensitiveDict() 29 | cid['Accept'] = 'application/json' 30 | cid['aCCEPT'] == 'application/json' # True 31 | list(cid) == ['Accept'] # True 32 | 33 | For example, ``headers['content-encoding']`` will return the 34 | value of a ``'Content-Encoding'`` response header, regardless 35 | of how the header name was originally stored. 
36 | 37 | If the constructor, ``.update``, or equality comparison 38 | operations are given keys that have equal ``.lower()``s, the 39 | behavior is undefined. 40 | 41 | """ 42 | def __init__(self, data=None, **kwargs): 43 | self._store = dict() 44 | if data is None: 45 | data = {} 46 | self.update(data, **kwargs) 47 | 48 | def __setitem__(self, key, value): 49 | # Use the lowercased key for lookups, but store the actual 50 | # key alongside the value. 51 | self._store[key.lower()] = (key, value) 52 | 53 | def __getitem__(self, key): 54 | return self._store[key.lower()][1] 55 | 56 | def __delitem__(self, key): 57 | del self._store[key.lower()] 58 | 59 | def __iter__(self): 60 | return (casedkey for casedkey, mappedvalue in self._store.values()) 61 | 62 | def __len__(self): 63 | return len(self._store) 64 | 65 | def lower_items(self): 66 | """Like iteritems(), but with all lowercase keys.""" 67 | return ( 68 | (lowerkey, keyval[1]) 69 | for (lowerkey, keyval) 70 | in self._store.items() 71 | ) 72 | 73 | def __eq__(self, other): 74 | if isinstance(other, collections.Mapping): 75 | other = CaseInsensitiveDict(other) 76 | else: 77 | return NotImplemented 78 | # Compare insensitively 79 | return dict(self.lower_items()) == dict(other.lower_items()) 80 | 81 | # Copy is required 82 | def copy(self): 83 | return CaseInsensitiveDict(self._store.values()) 84 | 85 | def __repr__(self): 86 | return str(dict(self.items())) 87 | 88 | class LookupDict(dict): 89 | """Dictionary lookup object.""" 90 | 91 | def __init__(self, name=None): 92 | self.name = name 93 | super(LookupDict, self).__init__() 94 | 95 | def __repr__(self): 96 | return '' % (self.name) 97 | 98 | def __getitem__(self, key): 99 | # We allow fall-through here, so values default to None 100 | 101 | return self.__dict__.get(key, None) 102 | 103 | def get(self, key, default=None): 104 | return self.__dict__.get(key, default) 105 | -------------------------------------------------------------------------------- 
/search.py: -------------------------------------------------------------------------------- 1 | import document 2 | from urllib import urlencode 3 | from httplib2 import Http 4 | import json 5 | from base64 import b64encode 6 | import secrets 7 | 8 | def document_from_query(query): 9 | query_dict = {"$format": "json", "Query": "'{0}'".format(query)} 10 | url = "https://api.datamarket.azure.com/Bing/Search/Web?" + urlencode(query_dict) 11 | auth_string = b64encode("{0}:{0}".format(secrets.BING_API_KEY)) 12 | headers = {"Authorization": "Basic " + auth_string} 13 | response, content = Http().request(url, "GET", headers=headers) 14 | results = json.loads(content)['d']['results'] 15 | html = u"

Web search for '{0}'

".format(query) + u"
".join([u"{1} ({2})".format(r['Url'], r['Title'], r['DisplayUrl']) for r in results]) 16 | doc = document.Document(html = html) 17 | return doc 18 | -------------------------------------------------------------------------------- /txtfy.py: -------------------------------------------------------------------------------- 1 | def txtfy_word(w): 2 | maps = { 3 | "to": "2", 4 | "too": "2", 5 | "you": "u", 6 | "you'll": "u'll", 7 | "your": "ur", 8 | "for": "4", 9 | "and": "&", 10 | "at": "@", 11 | "with": "w/", 12 | "before": "b4", 13 | "one": "1", 14 | } 15 | if w.lower() in maps: 16 | return maps[w.lower()] 17 | return w 18 | 19 | def txtfy(text): 20 | tokens = text.split(" ") 21 | return u" ".join(map(txtfy_word, tokens)) 22 | --------------------------------------------------------------------------------