├── .gitignore
├── BeautifulSoup.py
├── README.md
├── app.yaml
├── browse.py
├── bs4
    ├── __init__.py
    ├── builder
    │   ├── __init__.py
    │   ├── _html5lib.py
    │   ├── _htmlparser.py
    │   └── _lxml.py
    ├── dammit.py
    ├── diagnose.py
    ├── element.py
    ├── testing.py
    └── tests
    │   ├── __init__.py
    │   ├── test_builder_registry.py
    │   ├── test_docs.py
    │   ├── test_html5lib.py
    │   ├── test_htmlparser.py
    │   ├── test_lxml.py
    │   ├── test_soup.py
    │   └── test_tree.py
├── cssselect
    ├── __init__.py
    ├── parser.py
    ├── tests.py
    └── xpath.py
├── document.py
├── document_old.py
├── favicon.ico
├── goose
    ├── __init__.py
    ├── article.py
    ├── cleaners.py
    ├── configuration.py
    ├── crawler.py
    ├── extractors.py
    ├── images
    │   ├── __init__.py
    │   ├── extractors.py
    │   ├── image.py
    │   └── utils.py
    ├── network.py
    ├── outputformatters.py
    ├── parsers.py
    ├── resources
    │   ├── images
    │   │   └── known-image-css.txt
    │   └── text
    │   │   ├── stopwords-ar.txt
    │   │   ├── stopwords-da.txt
    │   │   ├── stopwords-de.txt
    │   │   ├── stopwords-en.txt
    │   │   ├── stopwords-es.txt
    │   │   ├── stopwords-fi.txt
    │   │   ├── stopwords-fr.txt
    │   │   ├── stopwords-hu.txt
    │   │   ├── stopwords-id.txt
    │   │   ├── stopwords-it.txt
    │   │   ├── stopwords-ko.txt
    │   │   ├── stopwords-nb.txt
    │   │   ├── stopwords-nl.txt
    │   │   ├── stopwords-no.txt
    │   │   ├── stopwords-pl.txt
    │   │   ├── stopwords-pt.txt
    │   │   ├── stopwords-ru.txt
    │   │   ├── stopwords-sv.txt
    │   │   └── stopwords-zh.txt
    ├── text.py
    ├── utils
    │   ├── __init__.py
    │   └── encoding.py
    ├── version.py
    └── videos
    │   ├── __init__.py
    │   ├── extractors.py
    │   └── videos.py
├── html2text.py
├── httplib2
    ├── __init__.py
    ├── cacerts.txt
    ├── iri2uri.py
    ├── socks.py
    └── test
    │   ├── __init__.py
    │   ├── brokensocket
    │       └── socket.py
    │   ├── functional
    │       └── test_proxies.py
    │   ├── miniserver.py
    │   ├── other_cacerts.txt
    │   ├── smoke_test.py
    │   └── test_no_socket.py
├── index.yaml
├── instructions.html
├── main.py
├── page.html
├── parse_command.py
├── pybing
    ├── __init__.py
    ├── bing.py
    ├── constants.py
    ├── query
    │   ├── __init__.py
    │   ├── mixin.py
    │   ├── pagable.py
    │   ├── query.py
    │   └── web.py
    ├── result.py
    └── resultset.py
├── requests
    ├── __init__.py
    ├── adapters.py
    ├── api.py
    ├── auth.py
    ├── cacert.pem
    ├── certs.py
    ├── compat.py
    ├── cookies.py
    ├── exceptions.py
    ├── hooks.py
    ├── models.py
    ├── packages
    │   ├── __init__.py
    │   ├── chardet
    │   │   ├── __init__.py
    │   │   ├── big5freq.py
    │   │   ├── big5prober.py
    │   │   ├── chardetect.py
    │   │   ├── chardistribution.py
    │   │   ├── charsetgroupprober.py
    │   │   ├── charsetprober.py
    │   │   ├── codingstatemachine.py
    │   │   ├── compat.py
    │   │   ├── constants.py
    │   │   ├── cp949prober.py
    │   │   ├── escprober.py
    │   │   ├── escsm.py
    │   │   ├── eucjpprober.py
    │   │   ├── euckrfreq.py
    │   │   ├── euckrprober.py
    │   │   ├── euctwfreq.py
    │   │   ├── euctwprober.py
    │   │   ├── gb2312freq.py
    │   │   ├── gb2312prober.py
    │   │   ├── hebrewprober.py
    │   │   ├── jisfreq.py
    │   │   ├── jpcntx.py
    │   │   ├── langbulgarianmodel.py
    │   │   ├── langcyrillicmodel.py
    │   │   ├── langgreekmodel.py
    │   │   ├── langhebrewmodel.py
    │   │   ├── langhungarianmodel.py
    │   │   ├── langthaimodel.py
    │   │   ├── latin1prober.py
    │   │   ├── mbcharsetprober.py
    │   │   ├── mbcsgroupprober.py
    │   │   ├── mbcssm.py
    │   │   ├── sbcharsetprober.py
    │   │   ├── sbcsgroupprober.py
    │   │   ├── sjisprober.py
    │   │   ├── universaldetector.py
    │   │   └── utf8prober.py
    │   └── urllib3
    │   │   ├── __init__.py
    │   │   ├── _collections.py
    │   │   ├── connection.py
    │   │   ├── connectionpool.py
    │   │   ├── contrib
    │   │       ├── __init__.py
    │   │       ├── ntlmpool.py
    │   │       └── pyopenssl.py
    │   │   ├── exceptions.py
    │   │   ├── fields.py
    │   │   ├── filepost.py
    │   │   ├── packages
    │   │       ├── __init__.py
    │   │       ├── ordered_dict.py
    │   │       ├── six.py
    │   │       └── ssl_match_hostname
    │   │       │   ├── __init__.py
    │   │       │   └── _implementation.py
    │   │   ├── poolmanager.py
    │   │   ├── request.py
    │   │   ├── response.py
    │   │   └── util
    │   │       ├── __init__.py
    │   │       ├── connection.py
    │   │       ├── request.py
    │   │       ├── response.py
    │   │       ├── retry.py
    │   │       ├── ssl_.py
    │   │       ├── timeout.py
    │   │       └── url.py
    ├── sessions.py
    ├── status_codes.py
    ├── structures.py
    └── utils.py
├── search.py
├── txtfy.py
└── wise.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | secrets.py
 2 | 
 3 | # Byte-compiled / optimized / DLL files
 4 | __pycache__/
 5 | *.py[cod]
 6 | 
 7 | # C extensions
 8 | *.so
 9 | 
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | 
45 | # Translations
46 | *.mo
47 | *.pot
48 | 
49 | # Django stuff:
50 | *.log
51 | 
52 | # Sphinx documentation
53 | docs/_build/
54 | 
55 | # PyBuilder
56 | target/
57 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Astrobot
 2 | 
 3 | `Astrobot` is an SMS service that allows you to (sort of) browse the web entirely over SMS. Here's what it looks like:
 4 | 
 5 | 	1. You: help
 6 | 	2. Astro-bot:"Go to wsj.com", "search wikipedia for Azerbaijan". Text the # of a link to click it.
 7 | 	3. You: hackerschool.com
 8 | 	4. Astro-bot: Hacker School is a retreat 4 programmers. Want 2 spend three months coding & learning
 9 | 	in New York w/ smart, friendly, & intellectually curious people? [Rea
10 | 	5. You: more
11 | 	6. Astro-bot:d on](1) . [APPLY NOW](2) New blog post: [Paper of the Week: Error Detecting &
12 | 	Error Correcting Codes](3) Hacker School is… Self-directed We value in
13 | 	7. You: 3 more
14 | 	8. Astro-bot:trinsic motivation & self-direction, & believe people learn best when they’re free 2
15 | 	explore their passions & interests. Accessible We’re free 4 everyone.
16 | 	9. Astro-bot:We also offer need-based grants 4 living expenses 4 people from traditionally
17 | 	underrepresented groups in programming. Stimulating We look 4 smart, friendly,
18 | 	10. Astro-bot: self-directed, & intellectually curious people who enjoy programming & want 2 get
19 | 	dramatically better. Exceptional We have exceptional residents including
20 | 	11. You: click 1
21 | 	12. Astro-bot: We review applications on a rolling basis. u should receive a confirmation email
22 | 	immediately after u apply, & we will review ur application within three weeks
23 | 	13. You: 2 more
24 | 	14. Astro-bot:. In general, clear & concise answers are better than long ones. Most answers have a
25 | 	1500 character max length, but u don’t need 2 write that much. We strongl
26 | 	15. Astro-bot:y encourage u 2 read our [about page](1) , [FAQ](2) , & [User’s Manual](3) b4 applying.
27 | 	Choose ur batch If you’re admitted but ur preferred batch is full,
28 | 	16. You: click link 2
29 | 	17. Astro-bot:What's that? I don't understand. Say 'help me' for help.
30 | 	18. You: click 2
31 | 	19. Astro-bot: Welcome 2 an unusual experiment Hacker School is unlike the rest of the world. This
32 | 	guide is designed 2 help u get settled in & get the most out of ur batch.
33 | 	20. You: 3 more
34 | 	21. Astro-bot: 1 of the things that makes Hacker School different is that it's largely self-directed . This
35 | 	means u won't have someone telling u what 2 do, learn, etc, while
36 | 	22. Astro-bot: you're here (though we do have a few [social rules](1) ). This self-directedness is baked
37 | 	into the core structure of Hacker School, & is why we don't have grad
38 | 	23. Astro-bot:es, exams, curricula, or even classes. It comes from our belief that people learn best when
39 | 	given the freedom 2 explore what most interests them. This doesn't
40 | 
41 | 
42 | `Astrobot` is a Google Appengine app that supports responding to incoming messages from Twilio. You can use a live version at [astro-bot.appspot.com](http://astro-bot.appspot.com) or by texting _646-576-7688_ .
43 | 
44 | ## Running your own
45 | Download the Google Appengine launcher and just drag the repository folder into it.
46 | 
47 | You've also got to **create a file called secrets.py**, which includes a variable `BING_API_KEY`, holding your Bing search API key (*not* a Simple Search API key—the full API key). You can get these for free. You'll need it for web search — otherwise, leave secrets.py empty and it'll all work *except* web search.
48 | 
49 | 


--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
 1 | application: astro-bot
 2 | version: 1
 3 | runtime: python27
 4 | api_version: 1
 5 | threadsafe: yes
 6 | 
 7 | handlers:
 8 | - url: /favicon\.ico
 9 |   static_files: favicon.ico
10 |   upload: favicon\.ico
11 | 
12 | - url: .*
13 |   script: main.app
14 | 
15 | libraries:
16 | - name: webapp2
17 |   version: "2.5.2"
18 | - name: lxml
19 |   version: "latest"
20 | - name: PIL
21 |   version: "latest"
22 | 


--------------------------------------------------------------------------------
/browse.py:
--------------------------------------------------------------------------------
 1 | from wise import Phrase, parse_phrase
 2 | import parse_command
 3 | import document
 4 | import urllib
 5 | import search
 6 | 
 7 | def interact(query, state):
 8 | 	# query: String, state: Dictionary
 9 | 	parsed = parse_command.parse_command(query)
10 | 	print parsed
11 | 	if 'BrowserState' in state:
12 | 		bstate = state['BrowserState']
13 | 	else:
14 | 		state['BrowserState'] = document.BrowserState()
15 | 		bstate = state['BrowserState']
16 | 
17 | 	bstate.clean_up()
18 | 
19 | 	if parsed.intent == 'url':
20 | 		bstate.navigate_to_url(parsed.get("*url", None))
21 | 		return bstate.get_n_messages(1)
22 | 	elif parsed.intent in ('more_text', 'previous_text'):
23 | 		return bstate.get_n_messages(min(7, int(parsed.get('*number', '1'))), backwards=(parsed.intent=='previous_text'))
24 | 	elif parsed.intent == 'back_to_top':
25 | 		bstate.frame_stack[-1].offset = 0
26 | 		return bstate.get_n_messages(1)
27 | 	elif parsed.intent == 'navigate' and parsed.get('*number', None):
28 | 		if parsed.get('on_last_page', False):
29 | 			bstate.back()
30 | 		url = bstate.frame_stack[-1].document.links[int(parsed.get('*number', '0'))-1]
31 | 		bstate.navigate_to_url(url)
32 | 		return bstate.get_n_messages(1)
33 | 	elif parsed.intent == 'help':
34 | 		bstate.navigate_to_url('http://astro-bot.appspot.com/instructions')
35 | 		return bstate.get_n_messages(1)
36 | 		# return ['Try these: "Go to hackerschool.com", "search wikipedia for Azerbaijan". On a web page, type "2 more" to see more or text the # of a link to click it.']
37 | 	elif parsed.intent == 'back':
38 | 		bstate.back()
39 | 		return bstate.resend_current_place()
40 | 	elif parsed.intent == 'contents':
41 | 		bstate.go_to_contents()
42 | 		return bstate.get_n_messages(1)
43 | 	elif parsed.intent == 'search':
44 | 		query = parsed.get("~query", "")
45 | 		if parsed.get('search_source/wikipedia', False):
46 | 			url = "http://en.wikipedia.org/w/index.php?search=" + urllib.quote_plus(query)
47 | 			bstate.navigate_to_url(url)
48 | 			return bstate.get_n_messages(1)
49 | 		else:
50 | 			bstate.frame_stack.append(document.Frame(search.document_from_query(query)))
51 | 			return bstate.get_n_messages(1)
52 | 	elif parsed.intent == 'whereami':
53 | 		if bstate.frame_stack == []:
54 | 			return ["You haven't loaded any page yet."]
55 | 		else:
56 | 			url = bstate.frame_stack[-1].document.url
57 | 			url_string = " ({0})".format(url) if url else ""
58 | 			return [u'You\'re reading "{0}"{1}'.format(bstate.frame_stack[-1].document.title, url_string)]
59 | 	else:
60 | 		return ["What's that? I don't understand. Say 'help me' for help."]
61 | 


--------------------------------------------------------------------------------
/bs4/tests/__init__.py:
--------------------------------------------------------------------------------
1 | "The beautifulsoup tests."
2 | 


--------------------------------------------------------------------------------
/bs4/tests/test_docs.py:
--------------------------------------------------------------------------------
 1 | "Test harness for doctests."
 2 | 
 3 | # pylint: disable-msg=E0611,W0142
 4 | 
 5 | __metaclass__ = type
 6 | __all__ = [
 7 |     'additional_tests',
 8 |     ]
 9 | 
10 | import atexit
11 | import doctest
12 | import os
13 | #from pkg_resources import (
14 | #    resource_filename, resource_exists, resource_listdir, cleanup_resources)
15 | import unittest
16 | 
# Option flags handed to doctest: allow "..." wildcards, ignore whitespace
# differences, and report failures as an ndiff.
DOCTEST_FLAGS = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF
21 | 
22 | 
23 | # def additional_tests():
24 | #     "Run the doc tests (README.txt and docs/*, if any exist)"
25 | #     doctest_files = [
26 | #         os.path.abspath(resource_filename('bs4', 'README.txt'))]
27 | #     if resource_exists('bs4', 'docs'):
28 | #         for name in resource_listdir('bs4', 'docs'):
29 | #             if name.endswith('.txt'):
30 | #                 doctest_files.append(
31 | #                     os.path.abspath(
32 | #                         resource_filename('bs4', 'docs/%s' % name)))
33 | #     kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
34 | #     atexit.register(cleanup_resources)
35 | #     return unittest.TestSuite((
36 | #         doctest.DocFileSuite(*doctest_files, **kwargs)))
37 | 


--------------------------------------------------------------------------------
/bs4/tests/test_html5lib.py:
--------------------------------------------------------------------------------
 1 | """Tests to ensure that the html5lib tree builder generates good trees."""
 2 | 
 3 | import warnings
 4 | 
 5 | try:
 6 |     from bs4.builder import HTML5TreeBuilder
 7 |     HTML5LIB_PRESENT = True
 8 | except ImportError, e:
 9 |     HTML5LIB_PRESENT = False
10 | from bs4.element import SoupStrainer
11 | from bs4.testing import (
12 |     HTML5TreeBuilderSmokeTest,
13 |     SoupTest,
14 |     skipIf,
15 | )
16 | 
@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        # The html5lib tree builder does not support SoupStrainers.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as w:
            # Record every warning, including one already issued from the
            # same location earlier in the run; otherwise the list can be
            # empty depending on test order.
            warnings.simplefilter("always")
            soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(
            soup.decode(), self.document_for(markup))

        # Search all recorded warnings instead of assuming the expected one
        # is first.
        self.assertTrue(
            any("the html5lib tree builder doesn't support parse_only" in
                str(warning.message) for warning in w))

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_xml_declaration_followed_by_doctype(self):
        markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
  <head>
  </head>
  <body>
   <p>foo</p>
  </body>
</html>'''
        soup = self.soup(markup)
        # Verify that we can reach the <p> tag; this means the tree is connected.
        self.assertEqual(b"<p>foo</p>", soup.p.encode())

    def test_reparented_markup(self):
        # The unclosed <em> is expected to be re-opened inside each <p>.
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
        soup = self.soup(markup)
        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))

    def test_reparented_markup_ends_with_whitespace(self):
        # Same as above, with a trailing newline that must stay in <body>.
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
        soup = self.soup(markup)
        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))
86 | 


--------------------------------------------------------------------------------
/bs4/tests/test_htmlparser.py:
--------------------------------------------------------------------------------
 1 | """Tests to ensure that the html.parser tree builder generates good
 2 | trees."""
 3 | 
 4 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
 5 | from bs4.builder import HTMLParserTreeBuilder
 6 | 
 7 | class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
 8 | 
 9 |     @property
10 |     def default_builder(self):
11 |         return HTMLParserTreeBuilder()
12 | 
13 |     def test_namespaced_system_doctype(self):
14 |         # html.parser can't handle namespaced doctypes, so skip this one.
15 |         pass
16 | 
17 |     def test_namespaced_public_doctype(self):
18 |         # html.parser can't handle namespaced doctypes, so skip this one.
19 |         pass
20 | 


--------------------------------------------------------------------------------
/bs4/tests/test_lxml.py:
--------------------------------------------------------------------------------
 1 | """Tests to ensure that the lxml tree builder generates good trees."""
 2 | 
 3 | import re
 4 | import warnings
 5 | 
 6 | try:
 7 |     import lxml.etree
 8 |     LXML_PRESENT = True
 9 |     LXML_VERSION = lxml.etree.LXML_VERSION
10 | except ImportError, e:
11 |     LXML_PRESENT = False
12 |     LXML_VERSION = (0,)
13 | 
14 | if LXML_PRESENT:
15 |     from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
16 | 
17 | from bs4 import (
18 |     BeautifulSoup,
19 |     BeautifulStoneSoup,
20 |     )
21 | from bs4.element import Comment, Doctype, SoupStrainer
22 | from bs4.testing import skipIf
23 | from bs4.tests import test_htmlparser
24 | from bs4.testing import (
25 |     HTMLTreeBuilderSmokeTest,
26 |     XMLTreeBuilderSmokeTest,
27 |     SoupTest,
28 |     skipIf,
29 | )
30 | 
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """See ``HTMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        return LXMLTreeBuilder()

    def test_out_of_range_entity(self):
        # Numeric entities too large to be characters are expected to vanish.
        self.assertSoupEquals(
            "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
    # test if an old version of lxml is installed.

    @skipIf(
        not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
        "Skipping doctype test for old version of lxml to avoid segfault.")
    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_beautifulstonesoup_is_xml_parser(self):
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
        with warnings.catch_warnings(record=True) as w:
            # Record every warning, including one already issued from the
            # same location earlier in the run; otherwise the list can be
            # empty depending on test order.
            warnings.simplefilter("always")
            soup = BeautifulStoneSoup("<b />")
        self.assertEqual(u"<b/>", unicode(soup.b))
        # Search all recorded warnings instead of assuming the deprecation
        # notice is first.
        self.assertTrue(
            any("BeautifulStoneSoup class is deprecated" in str(warning.message)
                for warning in w))

    def test_real_xhtml_document(self):
        """lxml strips the XML definition from an XHTML doc, which is fine."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b''),
            markup.replace(b'\n', b'').replace(
                b'<?xml version="1.0" encoding="utf-8"?>', b''))
81 | 
82 | 
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``XMLTreeBuilderSmokeTest``; runs it against lxml's XML builder."""

    def _make_builder(self):
        return LXMLTreeBuilderForXML()

    # Exposed as a read-only property, built fresh on every access.
    default_builder = property(_make_builder)
92 | 


--------------------------------------------------------------------------------
/cssselect/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf8
 2 | """
 3 |     CSS Selectors based on XPath
 4 |     ============================
 5 | 
 6 |     This module supports selecting XML/HTML elements based on CSS selectors.
 7 |     See the `CSSSelector` class for details.
 8 | 
 9 | 
10 |     :copyright: (c) 2007-2012 Ian Bicking and contributors.
11 |                 See AUTHORS for more details.
12 |     :license: BSD, see LICENSE for more details.
13 | 
14 | """
15 | 
16 | from cssselect.parser import (parse, Selector, FunctionalPseudoElement,
17 |                               SelectorError, SelectorSyntaxError)
18 | from cssselect.xpath import GenericTranslator, HTMLTranslator, ExpressionError
19 | 
20 | 
# Package version, published under both names.
__version__ = VERSION = '0.9.1'
23 | 


--------------------------------------------------------------------------------
/document.py:
--------------------------------------------------------------------------------
  1 | import urllib, urllib2
  2 | import bs4
  3 | from txtfy import txtfy
  4 | from html2text import html2doc
  5 | import urlparse
  6 | 
  7 | SMS_LEN = 160
  8 | 
  9 | def normalize_url(url):
 10 | 	scheme = url.split("://")[0]
 11 | 	if scheme not in ['http', 'https']:
 12 | 		url = 'http://' + url
 13 | 	return url
 14 | 
 15 | # opera mini for dumbphones:
 16 | USER_AGENT = "Opera/9.80 (J2ME/MIDP; Opera Mini/4.2.13337/34.818; U; en) Presto/2.8.119 Version/11.10"
 17 | 
 18 | def get_content(url):
 19 | 	opener = urllib2.build_opener()
 20 | 	opener.addheaders = [('User-agent', USER_AGENT)]
 21 | 	html = opener.open(url).read()
 22 | 	title = bs4.BeautifulSoup(html).title.get_text()
 23 | 	return (html, title)
 24 | 
 25 | def get_content_ip(url):
 26 | 	url = "http://instapaper.com/m?u=" + urllib.quote_plus(url)
 27 | 	html = urllib2.urlopen(url).read()
 28 | 	soup = bs4.BeautifulSoup(html)
 29 | 	story = soup.find(id='story')
 30 | 	return unicode(story), soup.title.get_text()
 31 | 
 32 | NO_URL = ""
 33 | 
 34 | class Document(object):
 35 | 	url = None
 36 | 	def __init__(self, url=None, html=None):
 37 | 		if url:
 38 | 			url = normalize_url(url)
 39 | 			html, self.title = get_content(url)
 40 | 			self.url = url
 41 | 		if not isinstance(html, unicode):
 42 | 			html = html.decode('utf-8')
 43 | 		self.text, self.links, self.headers = html2doc(html, baseurl = url if url else "")
 44 | 		print "HEADERS", self.headers
 45 | 
 46 | class Frame(object):
 47 | 	def __init__(self, doc):
 48 | 		self.document = doc
 49 | 		self.offset = 0
 50 | 
 51 | class BrowserState(object):
 52 | 	def __init__(self):
 53 | 		self.frame_stack = []
 54 | 
 55 | 	def clean_up(self):
 56 | 		while len(self.frame_stack) > 5:
 57 | 			self.frame_stack = self.frame_stack[1:]
 58 | 
 59 | 	def navigate_to_url(self, url):
 60 | 		parsed = urlparse.urlparse(url)
 61 | 		if parsed.scheme == 'go-to-offset':
 62 | 			self.back()
 63 | 			self.frame_stack[-1].offset = int(parsed.netloc)
 64 | 		else:
 65 | 			self.frame_stack.append(Frame(Document(url)))
 66 | 
 67 | 	def back(self):
 68 | 		if len(self.frame_stack):
 69 | 			self.frame_stack = self.frame_stack[:-1]
 70 | 
 71 | 	def resend_current_place(self):
 72 | 		self.frame_stack[-1].offset = max(0, self.frame_stack[-1].offset - SMS_LEN)
 73 | 		return self.get_n_messages(1)
 74 | 
 75 | 	def go_to_contents(self):
 76 | 		current_page_title = self.frame_stack[-1].document.title
 77 | 		html = "<title>Headings on {0}</title>".format(current_page_title) + u"<br/>".join([u"<a href='go-to-offset://{0}'>{1}</a>".format(offset, heading) for heading, offset in self.frame_stack[-1].document.headers])
 78 | 		doc = Document(html = html)
 79 | 		self.frame_stack.append(Frame(doc))
 80 | 
 81 | 	def get_n_messages(self, n, backwards=False):
 82 | 		if backwards:
 83 | 			self.frame_stack[-1].offset = max(0, self.frame_stack[-1].offset - 160)
 84 | 
 85 | 		if not backwards and self.frame_stack[-1].offset >= len(self.frame_stack[-1].document.text):
 86 | 			return ["<end of page>"]
 87 | 		else:
 88 | 			messages = []
 89 | 			for i in xrange(n):
 90 | 				start_offset = self.frame_stack[-1].offset
 91 | 				if backwards:
 92 | 					start_offset = max(0, start_offset-160)
 93 | 				end_offset = min(len(self.frame_stack[-1].document.text), start_offset + SMS_LEN)
 94 | 				if end_offset - start_offset == 0:
 95 | 					break
 96 | 				messages.append(self.frame_stack[-1].document.text[start_offset : end_offset])
 97 | 				self.frame_stack[-1].offset = start_offset if backwards else end_offset
 98 | 				if self.frame_stack[-1].offset == 0:
 99 | 					break
100 | 		if backwards:
101 | 			self.frame_stack[-1].offset = min(len(self.frame_stack[-1].document.text), self.frame_stack[-1].offset + 160)
102 | 		return messages
103 | 
104 | 


--------------------------------------------------------------------------------
/document_old.py:
--------------------------------------------------------------------------------
  1 | import urllib, urllib2
  2 | import bs4
  3 | from txtfy import txtfy
  4 | 
  5 | SMS_LEN = 160
  6 | 
  7 | def normalize_url(url):
  8 | 	scheme = url.split("://")[0]
  9 | 	if scheme not in ['http', 'https']:
 10 | 		url = 'http://' + url
 11 | 	return url
 12 | 
 13 | def get_content(url):
 14 | 	html = urllib2.urlopen(url).read()
 15 | 	title = bs4.BeautifulSoup(html).title.get_text()
 16 | 	return (html, title)
 17 | 
 18 | def get_content_ip(url):
 19 | 	url = "http://instapaper.com/m?u=" + urllib.quote_plus(url)
 20 | 	html = urllib2.urlopen(url).read()
 21 | 	soup = bs4.BeautifulSoup(html)
 22 | 	story = soup.find(id='story')
 23 | 	return unicode(story), soup.title.get_text()
 24 | 
 25 | NO_URL = ""
 26 | 
 27 | class Document(object):
 28 | 	def __init__(self, url=None, html=None):
 29 | 		self.url = url
 30 | 		if url:
 31 | 			html, self.title = get_content(normalize_url(url))
 32 | 		soup = bs4.BeautifulSoup(html)
 33 | 		self.text = u""
 34 | 		self.links = []
 35 | 		self.headers = []
 36 | 		ignore_tags = set(['head', 'script', 'style'])
 37 | 		def break_line():
 38 | 			if len(self.text) > 0 and self.text[-1] != '\n':
 39 | 				self.text += '\n'
 40 | 		def break_word():
 41 | 			if len(self.text) > 0 and self.text[-1] not in " \n":
 42 | 				self.text += " "
 43 | 		def emit_text(t):
 44 | 			break_word()
 45 | 			self.text += txtfy(t)
 46 | 		def traverse(tag):
 47 | 			if tag.name == 'a' and tag.has_attr('href'):
 48 | 				self.links.append(tag['href'])
 49 | 				emit_text(u'[{0}]({1}) '.format(tag.get_text(), len(self.links)))
 50 | 			elif tag.name in ['h1', 'h2', 'h3', 'h4']:
 51 | 				break_line()
 52 | 				self.headers.append((tag.get_text(), len(self.text)))
 53 | 				process_contents(tag)
 54 | 				break_line()
 55 | 			elif tag.name in ['li', 'p']:
 56 | 				break_line()
 57 | 				process_contents(tag)
 58 | 				break_line()
 59 | 			else:
 60 | 				process_contents(tag)
 61 | 		def process_contents(tag):
 62 | 			for child in tag.contents:
 63 | 				if isinstance(child, bs4.NavigableString):
 64 | 					emit_text(unicode(child))
 65 | 				elif hasattr(child, 'name'):
 66 | 					traverse(child)
 67 | 		traverse(soup)
 68 | 		break_line()
 69 | 		emit_text("<end of page>")
 70 | 
 71 | class Frame(object):
 72 | 	def __init__(self, doc):
 73 | 		self.document = doc
 74 | 		self.offset = 0
 75 | 
 76 | class BrowserState(object):
 77 | 	def __init__(self):
 78 | 		self.frame_stack = []
 79 | 
 80 | 	def clean_up(self):
 81 | 		while len(self.frame_stack) > 5:
 82 | 			self.frame_stack = self.frame_stack[1:]
 83 | 
 84 | 	def navigate_to_url(self, url):
 85 | 		self.frame_stack.append(Frame(Document(url)))
 86 | 
 87 | 	def back(self):
 88 | 		if len(self.frame_stack):
 89 | 			self.frame_stack = self.frame_stack[:-1]
 90 | 
 91 | 	def resend_current_place(self):
 92 | 		self.frame_stack[-1].offset = max(0, self.frame_stack[-1].offset - SMS_LEN)
 93 | 		return self.get_n_messages(1)
 94 | 
 95 | 	def get_n_messages(self, n):
 96 | 		if self.frame_stack[-1].offset >= len(self.frame_stack[-1].document.text):
 97 | 			return ["<end of page>"]
 98 | 		else:
 99 | 			messages = []
100 | 			for i in xrange(n):
101 | 				start_offset = self.frame_stack[-1].offset
102 | 				end_offset = min(len(self.frame_stack[-1].document.text), start_offset + SMS_LEN)
103 | 				if end_offset - start_offset == 0:
104 | 					break
105 | 				messages.append(self.frame_stack[-1].document.text[start_offset : end_offset])
106 | 				self.frame_stack[-1].offset = end_offset
107 | 			return messages
108 | 
109 | 


--------------------------------------------------------------------------------
/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/favicon.ico


--------------------------------------------------------------------------------
/goose/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" orignialy licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | import os
24 | import platform
25 | from tempfile import mkstemp
26 | 
27 | from goose.version import version_info, __version__
28 | from goose.configuration import Configuration
29 | from goose.crawler import CrawlCandidate
30 | from goose.crawler import Crawler
31 | 
32 | 
class Goose(object):
    """\
    Front door of the package: holds a Configuration and turns a URL
    (or raw HTML) into an article by delegating to Crawler.
    """
    def __init__(self, config=None):
        """\
        @param config a Configuration instance, a dict of configuration
        attribute overrides, or None for the defaults
        """
        self.config = config or Configuration()
        self.extend_config()
        self.initialize()

    def extend_config(self):
        """\
        When the configuration was supplied as a dict, replace it with
        a Configuration carrying those values. Keys that are not
        existing Configuration attributes are ignored.
        """
        if isinstance(self.config, dict):
            config = Configuration()
            for k, v in self.config.items():
                if hasattr(config, k):
                    setattr(config, k, v)
            self.config = config

    def extract(self, url=None, raw_html=None):
        """\
        Main method to extract an article object from a URL,
        pass in a url (and/or the raw html) and get back the result
        of crawling it
        """
        cc = CrawlCandidate(self.config, url, raw_html)
        return self.crawl(cc)

    def shutdown_network(self):
        """\
        Currently a no-op.
        """
        pass

    def crawl(self, crawl_candiate):
        """\
        Run a new Crawler built from the current configuration on the
        given candidate and return what it produces.
        """
        crawler = Crawler(self.config)
        article = crawler.crawl(crawl_candiate)
        return article

    def initialize(self):
        """\
        Make sure the local image storage directory exists and is
        writable. Skipped entirely when no storage path is configured
        or image fetching is disabled.
        Raises Exception when the directory cannot be used.
        """
        storage_path = self.config.local_storage_path

        # we don't need to go further if image extractor or
        # local_storage is not set
        if not storage_path or not self.config.enable_image_fetching:
            return

        # create the storage directory when it is missing
        if not os.path.isdir(storage_path):
            os.makedirs(storage_path)

        if not os.path.isdir(storage_path):
            raise Exception(storage_path +
                " directory does not seem to exist, "
                "you need to set this for image processing downloads"
            )

        # create and remove a dummy file to check that the directory
        # is writable. mkstemp itself is what fails on a read-only
        # directory, and it raises OSError (distinct from IOError on
        # Python 2), so it has to be inside the try and both types
        # have to be caught.
        try:
            level, path = mkstemp(dir=storage_path)
            try:
                os.close(level)
            finally:
                os.remove(path)
        except (IOError, OSError):
            raise Exception(storage_path +
                " directory is not writeble, "
                "you need to set this for image processing downloads"
            )
95 | 


--------------------------------------------------------------------------------
/goose/article.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" orignialy licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | 
class Article(object):
    """\
    Plain container for everything extracted from one page.
    Every field starts out empty and is filled in from outside.
    """

    def __init__(self):
        # article title
        self.title = None

        # the article body as plain text: no html, no formatting,
        # paragraphs separated by newlines.
        # Most consumers want this field.
        self.cleaned_text = u""

        # values of the corresponding meta fields in the HTML source:
        # description, language, favicon and keywords
        self.meta_description = u""
        self.meta_lang = u""
        self.meta_favicon = u""
        self.meta_keywords = u""

        # canonical link of the article, when the meta data has one
        self.canonical_link = u""

        # domain of the article being parsed
        self.domain = u""

        # the Element considered the best candidate
        # for the main body of the article
        self.top_node = None

        # the Image object considered to represent the article
        self.top_image = None

        # tags found in the article itself
        # (distinct from the meta keywords)
        self.tags = set()

        # movies found on the page, like youtube or vimeo
        self.movies = []

        # the final URL content is fetched against,
        # after any expansion
        self.final_url = u""

        # MD5 hash of the url, used for identification tasks
        self.link_hash = ""

        # the RAW HTML exactly as received from the network
        self.raw_html = u""

        # the lxml Document object
        self.doc = None

        # the document parsed from the original HTML,
        # before any cleaning has been applied to it
        self.raw_doc = None

        # publish date of the article, when it could be determined
        self.publish_date = None

        # property bucket where consumers of goose
        # can store custom data extractions
        self.additional_data = {}
97 | 


--------------------------------------------------------------------------------
/goose/configuration.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """\
  3 | This is a python port of "Goose" orignialy licensed to Gravity.com
  4 | under one or more contributor license agreements.  See the NOTICE file
  5 | distributed with this work for additional information
  6 | regarding copyright ownership.
  7 | 
  8 | Python port was written by Xavier Grangier for Recrutae
  9 | 
 10 | Gravity.com licenses this file
 11 | to you under the Apache License, Version 2.0 (the "License");
 12 | you may not use this file except in compliance
 13 | with the License.  You may obtain a copy of the License at
 14 | 
 15 | http://www.apache.org/licenses/LICENSE-2.0
 16 | 
 17 | Unless required by applicable law or agreed to in writing, software
 18 | distributed under the License is distributed on an "AS IS" BASIS,
 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 20 | See the License for the specific language governing permissions and
 21 | limitations under the License.
 22 | """
 23 | import os
 24 | import tempfile
 25 | from goose.text import StopWords
 26 | from goose.parsers import Parser
 27 | from goose.parsers import ParserSoup
 28 | from goose.version import __version__
 29 | 
 30 | HTTP_DEFAULT_TIMEOUT = 30
 31 | 
 32 | 
 33 | class Configuration(object):
 34 | 
 35 |     def __init__(self):
 36 |         # What's the minimum bytes for an image we'd accept is,
 37 |         # alot of times we want to filter out the author's little images
 38 |         # in the beginning of the article
 39 |         self.images_min_bytes = 4500
 40 | 
 41 |         # set this guy to false if you don't care about getting images,
 42 |         # otherwise you can either use the default
 43 |         # image extractor to implement the ImageExtractor
 44 |         # interface to build your own
 45 |         self.enable_image_fetching = True
 46 | 
 47 |         # set this valriable to False if you want to force
 48 |         # the article language. OtherWise it will attempt to
 49 |         # find meta language and use the correct stopwords dictionary
 50 |         self.use_meta_language = True
 51 | 
 52 |         # default language
 53 |         # it will be use as fallback
 54 |         # if use_meta_language is set to false, targetlanguage will
 55 |         # be use
 56 |         self.target_language = 'en'
 57 | 
 58 |         # defautl stopwrods class
 59 |         self.stopwords_class = StopWords
 60 | 
 61 |         # path to your imagemagick convert executable,
 62 |         # on the mac using mac ports this is the default listed
 63 |         self.imagemagick_convert_path = "/opt/local/bin/convert"
 64 | 
 65 |         # path to your imagemagick identify executable
 66 |         self.imagemagick_identify_path = "/opt/local/bin/identify"
 67 | 
 68 |         # used as the user agent that
 69 |         # is sent with your web requests to extract an article
 70 |         # self.browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2)"\
 71 |         #                         " AppleWebKit/534.52.7 (KHTML, like Gecko) "\
 72 |         #                         "Version/5.1.2 Safari/534.52.7"
 73 |         self.browser_user_agent = 'Goose/%s' % __version__
 74 | 
 75 |         # debug mode
 76 |         # enable this to have additional debugging information
 77 |         # sent to stdout
 78 |         self.debug = False
 79 | 
 80 |         # TODO
 81 |         self.extract_publishdate = None
 82 | 
 83 |         # TODO
 84 |         self.additional_data_extractor = None
 85 | 
 86 |         # Parser type
 87 |         self.parser_class = 'lxml'
 88 | 
 89 |         # set the local storage path
 90 |         # make this configurable
 91 |         self.local_storage_path = os.path.join(tempfile.gettempdir(), 'goose')
 92 | 
 93 |         # http timeout
 94 |         self.http_timeout = HTTP_DEFAULT_TIMEOUT
 95 | 
 96 |     def get_parser(self):
 97 |         return Parser if self.parser_class == 'lxml' else ParserSoup
 98 | 
 99 |     def get_publishdate_extractor(self):
100 |         return self.extract_publishdate
101 | 
102 |     def set_publishdate_extractor(self, extractor):
103 |         """\
104 |         Pass in to extract article publish dates.
105 |         @param extractor a concrete instance of PublishDateExtractor
106 |         """
107 |         if not extractor:
108 |             raise ValueError("extractor must not be null!")
109 |         self.extract_publishdate = extractor
110 | 
111 |     def get_additionaldata_extractor(self):
112 |         return self.additional_data_extractor
113 | 
114 |     def set_additionaldata_extractor(self, extractor):
115 |         """\
116 |         Pass in to extract any additional data not defined within
117 |         @param extractor a concrete instance of AdditionalDataExtractor
118 |         """
119 |         if not extractor:
120 |             raise ValueError("extractor must not be null!")
121 |         self.additional_data_extractor = extractor
122 | 


--------------------------------------------------------------------------------
/goose/images/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/goose/images/__init__.py


--------------------------------------------------------------------------------
/goose/images/image.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" orignialy licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
24 | 
class Image(object):
    """\
    Description of the image chosen to represent an article.
    """

    def __init__(self):
        # holds the Element node of the image we think is top dog
        self.top_image_node = None

        # holds the src of the image
        self.src = ""

        # how confident are we in this image extraction?
        # the more images generally the less confident
        self.confidence_score = float(0.0)

        # Height of the image in pixels
        self.height = 0

        # width of the image in pixels
        self.width = 0

        # what kind of image extraction was used for this?
        # bestGuess, linkTag, openGraph tags?
        self.extraction_type = "NA"

        # stores how many bytes this image is.
        # A plain int literal is used instead of long(0): on Python 2
        # ints promote to long automatically, and the `long` builtin
        # does not exist on Python 3.
        self.bytes = 0

    def get_src(self):
        """\
        Return the src of the image.
        """
        return self.src
53 | 
54 | 
class ImageDetails(object):
    """\
    Size and format of a single image, with accessor methods.
    """

    def __init__(self):
        # the mime_type of the image JPEG / PNG
        self.mime_type = None

        # height of the image
        self.height = 0

        # the width of the image
        self.width = 0

    # -- width ---------------------------------------------------------

    def get_width(self):
        """Return the stored width."""
        return self.width

    def set_width(self, width):
        """Store `width`."""
        self.width = width

    # -- height --------------------------------------------------------

    def get_height(self):
        """Return the stored height."""
        return self.height

    def set_height(self, height):
        """Store `height`."""
        self.height = height

    # -- mime type -----------------------------------------------------

    def get_mime_type(self):
        """Return the stored mime type (None until one is set)."""
        return self.mime_type

    def set_mime_type(self, mime_type):
        """Store `mime_type`."""
        self.mime_type = mime_type
85 | 
86 | 
class LocallyStoredImage(object):
    """\
    Record describing an image that has been saved to local storage:
    where it came from, where it is stored, and its size and dimensions.
    """

    # The default for `bytes` is a plain 0 rather than long(0): the
    # default is evaluated when the class is defined, and the `long`
    # builtin does not exist on Python 3. On Python 2 an int behaves
    # the same here.
    def __init__(self, src='', local_filename='',
        link_hash='', bytes=0, file_extension='', height=0, width=0):
        self.src = src
        self.local_filename = local_filename
        self.link_hash = link_hash
        self.bytes = bytes
        self.file_extension = file_extension
        self.height = height
        self.width = width
98 | 


--------------------------------------------------------------------------------
/goose/images/utils.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """\
  3 | This is a python port of "Goose" orignialy licensed to Gravity.com
  4 | under one or more contributor license agreements.  See the NOTICE file
  5 | distributed with this work for additional information
  6 | regarding copyright ownership.
  7 | 
  8 | Python port was written by Xavier Grangier for Recrutae
  9 | 
 10 | Gravity.com licenses this file
 11 | to you under the Apache License, Version 2.0 (the "License");
 12 | you may not use this file except in compliance
 13 | with the License.  You may obtain a copy of the License at
 14 | 
 15 | http://www.apache.org/licenses/LICENSE-2.0
 16 | 
 17 | Unless required by applicable law or agreed to in writing, software
 18 | distributed under the License is distributed on an "AS IS" BASIS,
 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 20 | See the License for the specific language governing permissions and
 21 | limitations under the License.
 22 | """
 23 | import hashlib
 24 | import os
 25 | import urllib2
 26 | from PIL import Image
 27 | from goose.utils.encoding import smart_str
 28 | from goose.images.image import ImageDetails
 29 | from goose.images.image import LocallyStoredImage
 30 | 
 31 | 
 32 | class ImageUtils(object):
 33 | 
 34 |     @classmethod
 35 |     def get_image_dimensions(self, identify_program, path):
 36 |         image = Image.open(path)
 37 |         image_details = ImageDetails()
 38 |         image_details.set_mime_type(image.format)
 39 |         width, height = image.size
 40 |         image_details.set_width(width)
 41 |         image_details.set_height(height)
 42 |         return image_details
 43 | 
 44 |     @classmethod
 45 |     def store_image(self, http_client, link_hash, src, config):
 46 |         """\
 47 |         Writes an image src http string to disk as a temporary file
 48 |         and returns the LocallyStoredImage object
 49 |         that has the info you should need on the image
 50 |         """
 51 |         # check for a cache hit already on disk
 52 |         image = self.read_localfile(link_hash, src, config)
 53 |         if image:
 54 |             return image
 55 | 
 56 |         # no cache found download the image
 57 |         data = self.fetch(http_client, src)
 58 |         if data:
 59 |             image = self.write_localfile(data, link_hash, src, config)
 60 |             if image:
 61 |                 return image
 62 | 
 63 |         return None
 64 | 
 65 |     @classmethod
 66 |     def get_mime_type(self, image_details):
 67 |         mime_type = image_details.get_mime_type().lower()
 68 |         mimes = {
 69 |             'png': '.png',
 70 |             'jpg': '.jpg',
 71 |             'jpeg': '.jpg',
 72 |             'gif': '.gif',
 73 |         }
 74 |         return mimes.get(mime_type, 'NA')
 75 | 
 76 |     @classmethod
 77 |     def read_localfile(self, link_hash, src, config):
 78 |         local_image_name = self.get_localfile_name(link_hash, src, config)
 79 |         if os.path.isfile(local_image_name):
 80 |             identify = config.imagemagick_identify_path
 81 |             image_details = self.get_image_dimensions(identify, local_image_name)
 82 |             file_extension = self.get_mime_type(image_details)
 83 |             bytes = os.path.getsize(local_image_name)
 84 |             return LocallyStoredImage(
 85 |                 src=src,
 86 |                 local_filename=local_image_name,
 87 |                 link_hash=link_hash,
 88 |                 bytes=bytes,
 89 |                 file_extension=file_extension,
 90 |                 height=image_details.get_height(),
 91 |                 width=image_details.get_width()
 92 |             )
 93 |         return None
 94 | 
 95 |     @classmethod
 96 |     def write_localfile(self, entity, link_hash, src, config):
 97 |         local_path = self.get_localfile_name(link_hash, src, config)
 98 |         f = open(local_path, 'wb')
 99 |         f.write(entity)
100 |         f.close()
101 |         return self.read_localfile(link_hash, src, config)
102 | 
103 |     @classmethod
104 |     def get_localfile_name(self, link_hash, src, config):
105 |         image_hash = hashlib.md5(smart_str(src)).hexdigest()
106 |         return os.path.join(config.local_storage_path, '%s_%s' % (link_hash, image_hash))
107 | 
108 |     @classmethod
109 |     def clean_src_string(self, src):
110 |         return src.replace(" ", "%20")
111 | 
112 |     @classmethod
113 |     def fetch(self, http_client, src):
114 |         try:
115 |             req = urllib2.Request(src)
116 |             f = urllib2.urlopen(req)
117 |             data = f.read()
118 |             return data
119 |         except:
120 |             return None
121 | 


--------------------------------------------------------------------------------
/goose/network.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" orignialy licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | import urllib2
24 | 
25 | 
class HtmlFetcher(object):
    """\
    Fetches the raw content of a URL with urllib2, using the user
    agent and timeout from the configuration.
    """

    def __init__(self, config):
        self.config = config
        # set header
        self.headers = {'User-agent': self.config.browser_user_agent}
        # request and response of the most recent get_html call.
        # Initialised here so that get_url can be called safely
        # before any fetch has happened.
        self.request = None
        self.result = None

    def get_url(self):
        """\
        Return the final URL of the last successful fetch,
        or None when there is no result.
        """
        # if we have a result
        # get the final_url
        if self.result is not None:
            return self.result.geturl()
        return None

    def get_html(self, url):
        """\
        Fetch `url` and return the response body,
        or None when the request failed.
        """
        # utf-8 encode unicode url
        if isinstance(url, unicode):
            url = url.encode('utf-8')

        # set request
        self.request = urllib2.Request(
                        url,
                        headers=self.headers)
        # do request; any failure is reported as "no result".
        # Exception rather than a bare except, so that
        # KeyboardInterrupt and SystemExit are not swallowed
        try:
            self.result = urllib2.urlopen(
                            self.request,
                            timeout=self.config.http_timeout)
        except Exception:
            self.result = None

        # read the result content
        if self.result is not None:
            return self.result.read()
        return None
61 | 


--------------------------------------------------------------------------------
/goose/resources/images/known-image-css.txt:
--------------------------------------------------------------------------------
1 | latimes.com^thumbnail
2 | cnn.com^storytext|cnn_strycntntlft
3 | foxnews.com^entry-content
4 | msn.com^articleText
5 | go.com^mediaimage
6 | lefigaro.fr^photo center
7 | cadres.apec.fr^noFieldsTable
8 | emploi.lesechos.fr^offerHeader
9 | linkfinance.fr^offerHeader


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-ar.txt:
--------------------------------------------------------------------------------
  1 | ﻿فى
  2 | في
  3 | كل
  4 | لم
  5 | لن
  6 | له
  7 | من
  8 | هو
  9 | هي
 10 | قوة
 11 | كما
 12 | لها
 13 | منذ
 14 | وقد
 15 | ولا
 16 | نفسه
 17 | لقاء
 18 | مقابل
 19 | هناك
 20 | وقال
 21 | وكان
 22 | نهاية
 23 | وقالت
 24 | وكانت
 25 | للامم
 26 | فيه
 27 | كلم
 28 | لكن
 29 | وفي
 30 | وقف
 31 | ولم
 32 | ومن
 33 | وهو
 34 | وهي
 35 | يوم
 36 | فيها
 37 | منها
 38 | مليار
 39 | لوكالة
 40 | يكون
 41 | يمكن
 42 | مليون
 43 | حيث
 44 | اكد
 45 | الا
 46 | اما
 47 | امس
 48 | السابق
 49 | التى
 50 | التي
 51 | اكثر
 52 | ايار
 53 | ايضا
 54 | ثلاثة
 55 | الذاتي
 56 | الاخيرة
 57 | الثاني
 58 | الثانية
 59 | الذى
 60 | الذي
 61 | الان
 62 | امام
 63 | ايام
 64 | خلال
 65 | حوالى
 66 | الذين
 67 | الاول
 68 | الاولى
 69 | بين
 70 | ذلك
 71 | دون
 72 | حول
 73 | حين
 74 | الف
 75 | الى
 76 | انه
 77 | اول
 78 | ضمن
 79 | انها
 80 | جميع
 81 | الماضي
 82 | الوقت
 83 | المقبل
 84 | اليوم
 85 | ـ
 86 | ف
 87 | و
 88 | و6
 89 | قد
 90 | لا
 91 | ما
 92 | مع
 93 | مساء
 94 | هذا
 95 | واحد
 96 | واضاف
 97 | واضافت
 98 | فان
 99 | قبل
100 | قال
101 | كان
102 | لدى
103 | نحو
104 | هذه
105 | وان
106 | واكد
107 | كانت
108 | واوضح
109 | مايو
110 | ب
111 | ا
112 | أ
113 | ،
114 | عشر
115 | عدد
116 | عدة
117 | عشرة
118 | عدم
119 | عام
120 | عاما
121 | عن
122 | عند
123 | عندما
124 | على
125 | عليه
126 | عليها
127 | زيارة
128 | سنة
129 | سنوات
130 | تم
131 | ضد
132 | بعد
133 | بعض
134 | اعادة
135 | اعلنت
136 | بسبب
137 | حتى
138 | اذا
139 | احد
140 | اثر
141 | برس
142 | باسم
143 | غدا
144 | شخصا
145 | صباح
146 | اطار
147 | اربعة
148 | اخرى
149 | بان
150 | اجل
151 | غير
152 | بشكل
153 | حاليا
154 | بن
155 | به
156 | ثم
157 | اف
158 | ان
159 | او
160 | اي
161 | بها
162 | صفر


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-da.txt:
--------------------------------------------------------------------------------
  1 | af
  2 | alle
  3 | andet
  4 | andre
  5 | at
  6 | begge
  7 | da
  8 | de
  9 | den
 10 | denne
 11 | der
 12 | deres
 13 | det
 14 | dette
 15 | dig
 16 | din
 17 | dog
 18 | du
 19 | ej
 20 | eller
 21 | en
 22 | end
 23 | ene
 24 | eneste
 25 | enhver
 26 | et
 27 | fem
 28 | fire
 29 | flere
 30 | fleste
 31 | for
 32 | fordi
 33 | forrige
 34 | fra
 35 | få
 36 | før
 37 | god
 38 | han
 39 | hans
 40 | har
 41 | hendes
 42 | her
 43 | hun
 44 | hvad
 45 | hvem
 46 | hver
 47 | hvilken
 48 | hvis
 49 | hvor
 50 | hvordan
 51 | hvorfor
 52 | hvornår
 53 | i
 54 | ikke
 55 | ind
 56 | ingen
 57 | intet
 58 | jeg
 59 | jeres
 60 | kan
 61 | kom
 62 | kommer
 63 | lav
 64 | lidt
 65 | lille
 66 | man
 67 | mand
 68 | mange
 69 | med
 70 | meget
 71 | men
 72 | mens
 73 | mere
 74 | mig
 75 | ned
 76 | ni
 77 | nogen
 78 | noget
 79 | ny
 80 | nyt
 81 | nær
 82 | næste
 83 | næsten
 84 | og
 85 | op
 86 | otte
 87 | over
 88 | på
 89 | se
 90 | seks
 91 | ses
 92 | som
 93 | stor
 94 | store
 95 | syv
 96 | ti
 97 | til
 98 | to
 99 | tre
100 | ud
101 | var
102 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-es.txt:
--------------------------------------------------------------------------------
  1 | de
  2 | la
  3 | que
  4 | el
  5 | en
  6 | y
  7 | a
  8 | los
  9 | del
 10 | se
 11 | las
 12 | por
 13 | un
 14 | para
 15 | con
 16 | no
 17 | una
 18 | su
 19 | al
 20 | lo
 21 | como
 22 | más
 23 | pero
 24 | sus
 25 | le
 26 | ya
 27 | o
 28 | este
 29 | sí
 30 | porque
 31 | esta
 32 | entre
 33 | cuando
 34 | muy
 35 | sin
 36 | sobre
 37 | también
 38 | me
 39 | hasta
 40 | hay
 41 | donde
 42 | quien
 43 | desde
 44 | todo
 45 | nos
 46 | durante
 47 | todos
 48 | uno
 49 | les
 50 | ni
 51 | contra
 52 | otros
 53 | ese
 54 | eso
 55 | ante
 56 | ellos
 57 | e
 58 | esto
 59 | mí
 60 | antes
 61 | algunos
 62 | qué
 63 | unos
 64 | yo
 65 | otro
 66 | otras
 67 | otra
 68 | él
 69 | tanto
 70 | esa
 71 | estos
 72 | mucho
 73 | quienes
 74 | nada
 75 | muchos
 76 | cual
 77 | poco
 78 | ella
 79 | estar
 80 | estas
 81 | algunas
 82 | algo
 83 | nosotros
 84 | mi
 85 | mis
 86 | tú
 87 | te
 88 | ti
 89 | tu
 90 | tus
 91 | ellas
 92 | nosotras
 93 | vosotros
 94 | vosotras
 95 | os
 96 | mío
 97 | mía
 98 | míos
 99 | mías
100 | tuyo
101 | tuya
102 | tuyos
103 | tuyas
104 | suyo
105 | suya
106 | suyos
107 | suyas
108 | nuestro
109 | nuestra
110 | nuestros
111 | nuestras
112 | vuestro
113 | vuestra
114 | vuestros
115 | vuestras
116 | esos
117 | esas
118 | estoy
119 | estás
120 | está
121 | estamos
122 | estáis
123 | están
124 | esté
125 | estés
126 | estemos
127 | estéis
128 | estén
129 | estaré
130 | estarás
131 | estará
132 | estaremos
133 | estaréis
134 | estarán
135 | estaría
136 | estarías
137 | estaríamos
138 | estaríais
139 | estarían
140 | estaba
141 | estabas
142 | estábamos
143 | estabais
144 | estaban
145 | estuve
146 | estuviste
147 | estuvo
148 | estuvimos
149 | estuvisteis
150 | estuvieron
151 | estuviera
152 | estuvieras
153 | estuviéramos
154 | estuvierais
155 | estuvieran
156 | estuviese
157 | estuvieses
158 | estuviésemos
159 | estuvieseis
160 | estuviesen
161 | estando
162 | estado
163 | estada
164 | estados
165 | estadas
166 | estad
167 | he
168 | has
169 | ha
170 | hemos
171 | habéis
172 | han
173 | haya
174 | hayas
175 | hayamos
176 | hayáis
177 | hayan
178 | habré
179 | habrás
180 | habrá
181 | habremos
182 | habréis
183 | habrán
184 | habría
185 | habrías
186 | habríamos
187 | habríais
188 | habrían
189 | había
190 | habías
191 | habíamos
192 | habíais
193 | habían
194 | hube
195 | hubiste
196 | hubo
197 | hubimos
198 | hubisteis
199 | hubieron
200 | hubiera
201 | hubieras
202 | hubiéramos
203 | hubierais
204 | hubieran
205 | hubiese
206 | hubieses
207 | hubiésemos
208 | hubieseis
209 | hubiesen
210 | habiendo
211 | habido
212 | habida
213 | habidos
214 | habidas
215 | 
216 | # forms of ser, to be (not including the infinitive):
217 | soy
218 | eres
219 | es
220 | somos
221 | sois
222 | son
223 | sea
224 | seas
225 | seamos
226 | seáis
227 | sean
228 | seré
229 | serás
230 | será
231 | seremos
232 | seréis
233 | serán
234 | sería
235 | serías
236 | seríamos
237 | seríais
238 | serían
239 | era
240 | eras
241 | éramos
242 | erais
243 | eran
244 | fui
245 | fuiste
246 | fue
247 | fuimos
248 | fuisteis
249 | fueron
250 | fuera
251 | fueras
252 | fuéramos
253 | fuerais
254 | fueran
255 | fuese
256 | fueses
257 | fuésemos
258 | fueseis
259 | fuesen
260 | siendo
261 | sido
262 | tengo
263 | tienes
264 | tiene
265 | tenemos
266 | tenéis
267 | tienen
268 | tenga
269 | tengas
270 | tengamos
271 | tengáis
272 | tengan
273 | tendré
274 | tendrás
275 | tendrá
276 | tendremos
277 | tendréis
278 | tendrán
279 | tendría
280 | tendrías
281 | tendríamos
282 | tendríais
283 | tendrían
284 | tenía
285 | tenías
286 | teníamos
287 | teníais
288 | tenían
289 | tuve
290 | tuviste
291 | tuvo
292 | tuvimos
293 | tuvisteis
294 | tuvieron
295 | tuviera
296 | tuvieras
297 | tuviéramos
298 | tuvierais
299 | tuvieran
300 | tuviese
301 | tuvieses
302 | tuviésemos
303 | tuvieseis
304 | tuviesen
305 | teniendo
306 | tenido
307 | tenida
308 | tenidos
309 | tenidas
310 | tened
311 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-fi.txt:
--------------------------------------------------------------------------------
 1 | alla
 2 | ansiosta
 3 | ehkä
 4 | ei
 5 | enemmän
 6 | ennen
 7 | etessa
 8 | f
 9 | haikki
10 | he
11 | hitaasti
12 | hoikein
13 | hyvin
14 | hän
15 | ilman
16 | ja
17 | jos
18 | jälkeen
19 | kanssa
20 | kaukana
21 | kenties
22 | keskellä
23 | kesken
24 | koskaan
25 | kuinkan
26 | kukka
27 | kylliksi
28 | kyllä
29 | liian
30 | lla
31 | lla
32 | luona
33 | lähellä
34 | läpi
35 | me
36 | miksi
37 | mikä
38 | milloin
39 | milloinkan
40 | minä
41 | missä
42 | miten
43 | nopeasti
44 | nyt
45 | oikea
46 | oikealla
47 | paljon
48 | siellä
49 | sinä
50 | ssa
51 | sta
52 | suoraan
53 | tai
54 | takana
55 | takia
56 | tarpeeksi
57 | te
58 | tässä
59 | ulkopuolella
60 | vahemmän
61 | vasen
62 | vasenmalla
63 | vastan
64 | vielä
65 | vieressä
66 | vähän
67 | yhdessä
68 | ylös
69 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-fr.txt:
--------------------------------------------------------------------------------
  1 | # Licensed to the Apache Software Foundation (ASF) under one or more
  2 | # contributor license agreements.  See the NOTICE file distributed with
  3 | # this work for additional information regarding copyright ownership.
  4 | # The ASF licenses this file to You under the Apache License, Version 2.0
  5 | # (the "License"); you may not use this file except in compliance with
  6 | # the License.  You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | #-----------------------------------------------------------------------
 17 | # a couple of test stopwords to test that the words are really being
 18 | # configured from this file:
 19 | stopworda
 20 | stopwordb
 21 | 
 22 | #Standard english stop words taken from Lucene's StopAnalyzer
 23 | a
 24 | an
 25 | and
 26 | are
 27 | as
 28 | at
 29 | be
 30 | but
 31 | by
 32 | for
 33 | if
 34 | in
 35 | into
 36 | is
 37 | it
 38 | no
 39 | not
 40 | of
 41 | on
 42 | or
 43 | s
 44 | such
 45 | t
 46 | that
 47 | the
 48 | their
 49 | then
 50 | there
 51 | these
 52 | they
 53 | this
 54 | to
 55 | was
 56 | will
 57 | with
 58 | au
 59 | aux
 60 | avec
 61 | ce
 62 | ces
 63 | dans
 64 | de
 65 | des
 66 | du
 67 | elle
 68 | en
 69 | et
 70 | eux
 71 | il
 72 | je
 73 | la
 74 | le
 75 | leur
 76 | lui
 77 | ma
 78 | mais
 79 | me
 80 | même
 81 | mes
 82 | moi
 83 | mon
 84 | ne
 85 | nos
 86 | notre
 87 | nous
 88 | on
 89 | ou
 90 | par
 91 | pas
 92 | pour
 93 | qu
 94 | que
 95 | qui
 96 | sa
 97 | se
 98 | ses
 99 | son
100 | sur
101 | ta
102 | te
103 | tes
104 | toi
105 | ton
106 | tu
107 | un
108 | une
109 | vos
110 | votre
111 | vous
112 | c
113 | d
114 | j
115 | l
116 | à
117 | m
118 | n
119 | s
120 | t
121 | y
122 | été
123 | étée
124 | étées
125 | étés
126 | étant
127 | suis
128 | es
129 | est
130 | sommes
131 | êtes
132 | sont
133 | serai
134 | seras
135 | sera
136 | serons
137 | serez
138 | seront
139 | serais
140 | serait
141 | serions
142 | seriez
143 | seraient
144 | étais
145 | était
146 | étions
147 | étiez
148 | étaient
149 | fus
150 | fut
151 | fûmes
152 | fûtes
153 | furent
154 | sois
155 | soit
156 | soyons
157 | soyez
158 | soient
159 | fusse
160 | fusses
161 | fût
162 | fussions
163 | fussiez
164 | fussent
165 | ayant
166 | eu
167 | eue
168 | eues
169 | eus
170 | ai
171 | as
172 | avons
173 | avez
174 | ont
175 | aurai
176 | auras
177 | aura
178 | aurons
179 | aurez
180 | auront
181 | aurais
182 | aurait
183 | aurions
184 | auriez
185 | auraient
186 | avais
187 | avait
188 | avions
189 | aviez
190 | avaient
191 | eut
192 | eûmes
193 | eûtes
194 | eurent
195 | aie
196 | aies
197 | ait
198 | ayons
199 | ayez
200 | aient
201 | eusse
202 | eusses
203 | eût
204 | eussions
205 | eussiez
206 | eussent
207 | ceci
208 | celà
209 | cet
210 | cette
211 | ici
212 | ils
213 | les
214 | leurs
215 | quel
216 | quels
217 | quelle
218 | quelles
219 | sans
220 | soi
221 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-hu.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | á
  3 | ahogy
  4 | ahol
  5 | aki
  6 | akik
  7 | akkor
  8 | alatt
  9 | által
 10 | általában
 11 | amely
 12 | amelyek
 13 | amelyekben
 14 | amelyeket
 15 | amelyet
 16 | amelynek
 17 | ami
 18 | amit
 19 | amolyan
 20 | amp
 21 | amíg
 22 | amikor
 23 | át
 24 | abban
 25 | ahhoz
 26 | annak
 27 | arra
 28 | arról
 29 | az
 30 | azok
 31 | azon
 32 | azt
 33 | azzal
 34 | azért
 35 | aztán
 36 | azután
 37 | azonban
 38 | b
 39 | bár
 40 | be
 41 | belül
 42 | benne
 43 | c
 44 | cikk
 45 | cikkek
 46 | cikkeket
 47 | csak
 48 | d
 49 | de
 50 | e
 51 | é
 52 | eddig
 53 | egész
 54 | egy
 55 | egyes
 56 | egyetlen
 57 | egyéb
 58 | egyik
 59 | egyre
 60 | ekkor
 61 | el
 62 | elég
 63 | ellen
 64 | elő
 65 | először
 66 | előtt
 67 | első
 68 | én
 69 | éppen
 70 | ebben
 71 | ehhez
 72 | emilyen
 73 | ennek
 74 | erre
 75 | ez
 76 | ezt
 77 | ezek
 78 | ezen
 79 | ezzel
 80 | ezért
 81 | és
 82 | f
 83 | fel
 84 | felé
 85 | g
 86 | h
 87 | hanem
 88 | hiszen
 89 | hogy
 90 | hogyan
 91 | i
 92 | í
 93 | igen
 94 | így
 95 | illetve
 96 | ill.
 97 | ill
 98 | ilyen
 99 | ilyenkor
100 | is
101 | ison
102 | ismét
103 | itt
104 | j
105 | jó
106 | jól
107 | jobban
108 | k
109 | kell
110 | kellett
111 | keresztül
112 | keressünk
113 | ki
114 | kívül
115 | között
116 | közül
117 | l
118 | legalább
119 | lehet
120 | lehetett
121 | legyen
122 | lenne
123 | lenni
124 | lesz
125 | lett
126 | m
127 | maga
128 | magát
129 | majd
130 | majd
131 | már
132 | más
133 | másik
134 | meg
135 | még
136 | mellett
137 | mert
138 | mely
139 | melyek
140 | mi
141 | mit
142 | míg
143 | miért
144 | milyen
145 | mikor
146 | minden
147 | mindent
148 | mindenki
149 | mindig
150 | mint
151 | mintha
152 | mivel
153 | most
154 | n
155 | nagy
156 | nagyobb
157 | nagyon
158 | ne
159 | néha
160 | nekem
161 | neki
162 | nem
163 | néhány
164 | nélkül
165 | nincs
166 | o
167 | ó
168 | olyan
169 | ott
170 | össze
171 | ö
172 | ő
173 | ők
174 | őket
175 | p
176 | pedig
177 | persze
178 | q
179 | r
180 | rá
181 | s
182 | saját
183 | sem
184 | semmi
185 | sok
186 | sokat
187 | sokkal
188 | sz
189 | számára
190 | szemben
191 | szerint
192 | szinte
193 | t
194 | talán
195 | tehát
196 | teljes
197 | tovább
198 | továbbá
199 | több
200 | u
201 | ú
202 | úgy
203 | ugyanis
204 | új
205 | újabb
206 | újra
207 | után
208 | utána
209 | utolsó
210 | ü
211 | ű
212 | v
213 | vagy
214 | vagyis
215 | valaki
216 | valamely
217 | valami
218 | valamint
219 | való
220 | vagyok
221 | van
222 | vannak
223 | volt
224 | voltam
225 | voltak
226 | voltunk
227 | vissza
228 | vele
229 | viszont
230 | volna
231 | számolnak
232 | szólnak
233 | szól
234 | w
235 | x
236 | y
237 | z
238 | zs
239 | a
240 | ahogy
241 | ahol
242 | aki
243 | akkor
244 | alatt
245 | általában
246 | által
247 | amely
248 | amíg
249 | amikor
250 | ami
251 | amolyan
252 | arra
253 | át
254 | az
255 | azért
256 | azonban
257 | azon
258 | aztán
259 | azt
260 | azután
261 | azzal
262 | bár
263 | be
264 | belül
265 | benne
266 | cikk
267 | csak
268 | de
269 | eddig
270 | egész
271 | egy
272 | egyéb
273 | egyes
274 | egyetlen
275 | egyik
276 | egyre
277 | ekkor
278 | el
279 | elég
280 | ellen
281 | elő
282 | először
283 | előtt
284 | első
285 | emilyen
286 | én
287 | éppen
288 | erre
289 | és
290 | e
291 | ez
292 | ezen
293 | ezért
294 | ezzel
295 | fel
296 | felé
297 | hanem
298 | hiszen
299 | hogy
300 | hogyan
301 | igen
302 | így
303 | ill.
304 | illetve
305 | ill
306 | ilyen
307 | ilyenkor
308 | ismét
309 | ison
310 | itt
311 | jó
312 | jobban
313 | jól
314 | kell
315 | keres
316 | keresztül
317 | ki
318 | kívül
319 | között
320 | közül
321 | legalább
322 | legyen
323 | lehet
324 | lenni
325 | lett
326 | maga
327 | maga
328 | majd
329 | már
330 | más
331 | másik
332 | még
333 | meg
334 | mellett
335 | mely
336 | mert
337 | miért
338 | míg
339 | mikor
340 | milyen
341 | minden
342 | mindenki
343 | mindig
344 | mi
345 | mint
346 | mintha
347 | mivel
348 | most
349 | nagy
350 | nagyobb
351 | nagyon
352 | ne
353 | néha
354 | néhány
355 | neki
356 | nélkül
357 | nem
358 | nincs
359 | ők
360 | olyan
361 | ő
362 | össze
363 | ott
364 | pedig
365 | persze
366 | rá
367 | saját
368 | s
369 | sem
370 | semmi
371 | sokkal
372 | sok
373 | számára
374 | számol
375 | szemben
376 | szerint
377 | szinte
378 | szól
379 | talán
380 | tehát
381 | teljes
382 | továbbá
383 | tovább
384 | úgy
385 | ugyanis
386 | új
387 | újabb
388 | újra
389 | utána
390 | után
391 | utolsó
392 | vagy
393 | vagyis
394 | valaki
395 | valamely
396 | valami
397 | valamint
398 | való
399 | van
400 | vissza
401 | viszont
402 | volt
403 | 
404 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-it.txt:
--------------------------------------------------------------------------------
  1 | ad            
  2 | al            
  3 | allo          
  4 | ai            
  5 | agli          
  6 | all           
  7 | agl           
  8 | alla          
  9 | alle          
 10 | con           
 11 | col           
 12 | coi           
 13 | da            
 14 | dal           
 15 | dallo         
 16 | dai           
 17 | dagli         
 18 | dall          
 19 | dagl          
 20 | dalla         
 21 | dalle         
 22 | di            
 23 | del           
 24 | dello         
 25 | dei           
 26 | degli         
 27 | dell          
 28 | degl          
 29 | della         
 30 | delle         
 31 | in            
 32 | nel           
 33 | nello         
 34 | nei           
 35 | negli         
 36 | nell          
 37 | negl          
 38 | nella         
 39 | nelle         
 40 | su            
 41 | sul           
 42 | sullo         
 43 | sui           
 44 | sugli         
 45 | sull          
 46 | sugl          
 47 | sulla         
 48 | sulle         
 49 | per           
 50 | tra           
 51 | contro        
 52 | io            
 53 | tu            
 54 | lui           
 55 | lei           
 56 | noi           
 57 | voi           
 58 | loro          
 59 | mio           
 60 | mia           
 61 | miei          
 62 | mie           
 63 | tuo           
 64 | tua           
 65 | tuoi          
 66 | tue           
 67 | suo           
 68 | sua           
 69 | suoi          
 70 | sue           
 71 | nostro        
 72 | nostra        
 73 | nostri        
 74 | nostre        
 75 | vostro        
 76 | vostra        
 77 | vostri        
 78 | vostre        
 79 | mi            
 80 | ti            
 81 | ci            
 82 | vi            
 83 | lo            
 84 | la            
 85 | li            
 86 | le            
 87 | gli           
 88 | ne            
 89 | il            
 90 | un            
 91 | uno           
 92 | una           
 93 | ma            
 94 | ed            
 95 | se            
 96 | perchè        
 97 | perché
 98 | perche
 99 | anche         
100 | come          
101 | dov           
102 | dove          
103 | che           
104 | chi           
105 | cui           
106 | non           
107 | più           
108 | piu
109 | quale         
110 | quanto        
111 | quanti        
112 | quanta        
113 | quante        
114 | quello        
115 | quelli        
116 | quella        
117 | quelle        
118 | questo        
119 | questi        
120 | questa        
121 | queste        
122 | si            
123 | tutto         
124 | tutti         
125 | a             
126 | c             
127 | e             
128 | i             
129 | l             
130 | o             
131 | ho
132 | hai
133 | ha
134 | abbiamo
135 | avete
136 | hanno
137 | abbia
138 | abbiate
139 | abbiano
140 | avrò
141 | avro
142 | avrai
143 | avrà
144 | avra
145 | avremo
146 | avrete
147 | avranno
148 | avrei
149 | avresti
150 | avrebbe
151 | avremmo
152 | avreste
153 | avrebbero
154 | avevo
155 | avevi
156 | aveva
157 | avevamo
158 | avevate
159 | avevano
160 | ebbi
161 | avesti
162 | ebbe
163 | avemmo
164 | aveste
165 | ebbero
166 | avessi
167 | avesse
168 | avessimo
169 | avessero
170 | avendo
171 | avuto
172 | avuta
173 | avuti
174 | avute
175 | sono
176 | sei
177 | è
178 | é
179 | e
180 | siamo
181 | siete
182 | sia
183 | siate
184 | siano
185 | sarà
186 | sarai
187 | sarò
188 | saro
189 | saremo
190 | sarete
191 | saranno
192 | sarei
193 | saresti
194 | sarebbe
195 | saremmo
196 | sareste
197 | sarebbero
198 | ero
199 | eri
200 | era
201 | eravamo
202 | eravate
203 | erano
204 | fui
205 | fosti
206 | fu
207 | fummo
208 | foste
209 | furono
210 | fossi
211 | fosse
212 | fossimo
213 | fossero
214 | essendo
215 | faccio
216 | fai
217 | facciamo
218 | fanno
219 | faccia
220 | facciate
221 | facciano
222 | farà
223 | farai
224 | farò
225 | faremo
226 | farete
227 | faranno
228 | farei
229 | faresti
230 | farebbe
231 | faremmo
232 | fareste
233 | farebbero
234 | facevo
235 | facevi
236 | faceva
237 | facevamo
238 | facevate
239 | facevano
240 | feci
241 | facesti
242 | fece
243 | facemmo
244 | faceste
245 | fecero
246 | facessi
247 | facesse
248 | facessimo
249 | facessero
250 | facendo
251 | sto
252 | stai
253 | sta
254 | stiamo
255 | stanno
256 | stia
257 | stiate
258 | stiano
259 | starà
260 | starai
261 | starò
262 | staremo
263 | starete
264 | staranno
265 | starei
266 | staresti
267 | starebbe
268 | staremmo
269 | stareste
270 | starebbero
271 | stavo
272 | stavi
273 | stava
274 | stavamo
275 | stavate
276 | stavano
277 | stetti
278 | stesti
279 | stette
280 | stemmo
281 | steste
282 | stettero
283 | stessi
284 | stesse
285 | stessimo
286 | stessero
287 | stando
288 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-ko.txt:
--------------------------------------------------------------------------------
 1 | 을
 2 | 의
 3 | 에
 4 | 이
 5 | 를
 6 | 으로
 7 | 은
 8 | 는
 9 | 가
10 | 로
11 | 하고
12 | 과
13 | 에서
14 | 도
15 | 와
16 | 이다
17 | 고
18 | 부터
19 | 까지
20 | 께
21 | 에는
22 | 이라고
23 | 만
24 | 라고
25 | 보다
26 | 에도
27 | 다
28 | 토록
29 | 에게
30 | 나
31 | 대로
32 | 에서는
33 | 이나
34 | 이며
35 | 요
36 | 든
37 | 으로써
38 | 같이
39 | 로는
40 | 밖에
41 | 과의
42 | 며
43 | 로부터
44 | 처럼
45 | 아
46 | 라
47 | 여
48 | 으로는
49 | 이고
50 | 에서의
51 | 이라는
52 | 만에
53 | 으로부터
54 | 에서도
55 | 와의
56 | 엔
57 | 만을
58 | 부터는
59 | 만의
60 | 야
61 | 까지의
62 | 과는
63 | 치고
64 | 과를
65 | 으로의
66 | 까지는
67 | 보다는
68 | 만이
69 | 에만
70 | 로의


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-nb.txt:
--------------------------------------------------------------------------------
  1 | alle
  2 | andre
  3 | arbeid
  4 | av
  5 | begge
  6 | bort
  7 | bra
  8 | bruke
  9 | da
 10 | denne
 11 | der
 12 | deres
 13 | det
 14 | din
 15 | disse
 16 | du
 17 | eller
 18 | en
 19 | ene
 20 | eneste
 21 | enhver
 22 | enn
 23 | er
 24 | et
 25 | folk
 26 | for
 27 | fordi
 28 | forsøke
 29 | fra
 30 | få
 31 | før
 32 | først
 33 | gjorde
 34 | gjøre
 35 | god
 36 | gå
 37 | ha
 38 | hadde
 39 | han
 40 | hans
 41 | hennes
 42 | her
 43 | hva
 44 | hvem
 45 | hver
 46 | hvilken
 47 | hvis
 48 | hvor
 49 | hvordan
 50 | hvorfor
 51 | ikke
 52 | inn
 53 | innen
 54 | kan
 55 | kunne
 56 | lage
 57 | lang
 58 | lik
 59 | like
 60 | makt
 61 | mange
 62 | med
 63 | meg
 64 | meget
 65 | men
 66 | mens
 67 | mer
 68 | mest
 69 | min
 70 | mye
 71 | må
 72 | måte
 73 | navn
 74 | nei
 75 | ny
 76 | nå
 77 | når
 78 | og
 79 | også
 80 | om
 81 | opp
 82 | oss
 83 | over
 84 | part
 85 | punkt
 86 | på
 87 | rett
 88 | riktig
 89 | samme
 90 | sant
 91 | si
 92 | siden
 93 | sist
 94 | skulle
 95 | slik
 96 | slutt
 97 | som
 98 | start
 99 | stille
100 | tid
101 | til
102 | tilbake
103 | tilstand
104 | under
105 | ut
106 | uten
107 | var
108 | ved
109 | verdi
110 | vi
111 | vil
112 | ville
113 | vite
114 | vår
115 | være
116 | vært
117 | å
118 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-nl.txt:
--------------------------------------------------------------------------------
 1 | aan
 2 | af
 3 | al
 4 | als
 5 | bij
 6 | dan
 7 | dat
 8 | die
 9 | dit
10 | een
11 | en
12 | er
13 | had
14 | heb
15 | hem
16 | het
17 | hij
18 | hoe
19 | hun
20 | ik
21 | in
22 | is
23 | je
24 | kan
25 | me
26 | men
27 | met
28 | mij
29 | nog
30 | nu
31 | of
32 | ons
33 | ook
34 | te
35 | tot
36 | uit
37 | van
38 | was
39 | wat
40 | we
41 | wel
42 | wij
43 | zal
44 | ze
45 | zei
46 | zij
47 | zo
48 | zou
49 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-no.txt:
--------------------------------------------------------------------------------
  1 | at
  2 | av
  3 | de
  4 | den
  5 | der
  6 | det
  7 | du
  8 | en
  9 | er
 10 | et
 11 | for
 12 | fra
 13 | før
 14 | med
 15 | og
 16 | om
 17 | over
 18 | på
 19 | som
 20 | til
 21 | ved
 22 | år
 23 | alle
 24 | bare
 25 | ble
 26 | bort
 27 | bra
 28 | da
 29 | deg
 30 | dem
 31 | denne
 32 | dere
 33 | deres
 34 | det
 35 | dette
 36 | din
 37 | disse
 38 | dit
 39 | ditt
 40 | eller
 41 | ene
 42 | enn
 43 | er
 44 | et
 45 | ett
 46 | etter
 47 | for
 48 | fram
 49 | først
 50 | få
 51 | god
 52 | gå
 53 | ha
 54 | han
 55 | hans
 56 | har
 57 | her
 58 | hit
 59 | hun
 60 | hva
 61 | hvem
 62 | hver
 63 | ikke
 64 | inn
 65 | ja
 66 | jeg
 67 | kan
 68 | kom
 69 | kun
 70 | kunne
 71 | lage
 72 | lang
 73 | lik
 74 | like
 75 | man
 76 | mer
 77 | min
 78 | mot
 79 | mye
 80 | må
 81 | måte
 82 | ned
 83 | nei
 84 | noe
 85 | noen
 86 | ny
 87 | nå
 88 | når
 89 | også
 90 | opp
 91 | oss
 92 | seg
 93 | selv
 94 | si
 95 | siden
 96 | sin
 97 | sine
 98 | sist
 99 | skal
100 | skulle
101 | slik
102 | som
103 | så
104 | sånn
105 | tid
106 | til
107 | under
108 | ut
109 | uten
110 | var
111 | ved
112 | vi
113 | vil
114 | vite
115 | vår
116 | å
117 | dei
118 | di
119 | då
120 | eg


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-pl.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | aby
  3 | ach
  4 | acz
  5 | aczkolwiek
  6 | aj
  7 | albo
  8 | ale
  9 | ależ
 10 | ani
 11 | aż
 12 | bardziej
 13 | bardzo
 14 | bo
 15 | bowiem
 16 | by
 17 | byli
 18 | bynajmniej
 19 | być
 20 | był
 21 | była
 22 | było
 23 | były
 24 | będzie
 25 | będą
 26 | cali
 27 | cała
 28 | cały
 29 | ci
 30 | cię
 31 | ciebie
 32 | co
 33 | cokolwiek
 34 | coś
 35 | czasami
 36 | czasem
 37 | czemu
 38 | czy
 39 | czyli
 40 | daleko
 41 | dla
 42 | dlaczego
 43 | dlatego
 44 | do
 45 | dobrze
 46 | dokąd
 47 | dość
 48 | dużo
 49 | dwa
 50 | dwaj
 51 | dwie
 52 | dwoje
 53 | dziś
 54 | dzisiaj
 55 | gdy
 56 | gdyby
 57 | gdyż
 58 | gdzie
 59 | gdziekolwiek
 60 | gdzieś
 61 | i
 62 | ich
 63 | ile
 64 | im
 65 | inna
 66 | inne
 67 | inny
 68 | innych
 69 | iż
 70 | ja
 71 | ją
 72 | jak
 73 | jakaś
 74 | jakby
 75 | jaki
 76 | jakichś
 77 | jakie
 78 | jakiś
 79 | jakiż
 80 | jakkolwiek
 81 | jako
 82 | jakoś
 83 | je
 84 | jeden
 85 | jedna
 86 | jedno
 87 | jednak
 88 | jednakże
 89 | jego
 90 | jej
 91 | jemu
 92 | jest
 93 | jestem
 94 | jeszcze
 95 | jeśli
 96 | jeżeli
 97 | już
 98 | ją
 99 | każdy
100 | kiedy
101 | kilka
102 | kimś
103 | kto
104 | ktokolwiek
105 | ktoś
106 | która
107 | które
108 | którego
109 | której
110 | który
111 | których
112 | którym
113 | którzy
114 | ku
115 | lat
116 | lecz
117 | lub
118 | ma
119 | mają
120 | mało
121 | mam
122 | mi
123 | mimo
124 | między
125 | mną
126 | mnie
127 | mogą
128 | moi
129 | moim
130 | moja
131 | moje
132 | może
133 | możliwe
134 | można
135 | mój
136 | mu
137 | musi
138 | my
139 | na
140 | nad
141 | nam
142 | nami
143 | nas
144 | nasi
145 | nasz
146 | nasza
147 | nasze
148 | naszego
149 | naszych
150 | natomiast
151 | natychmiast
152 | nawet
153 | nią
154 | nic
155 | nich
156 | nie
157 | niech
158 | niego
159 | niej
160 | niemu
161 | nigdy
162 | nim
163 | nimi
164 | niż
165 | no
166 | o
167 | obok
168 | od
169 | około
170 | on
171 | ona
172 | one
173 | oni
174 | ono
175 | oraz
176 | oto
177 | owszem
178 | pan
179 | pana
180 | pani
181 | po
182 | pod
183 | podczas
184 | pomimo
185 | ponad
186 | ponieważ
187 | powinien
188 | powinna
189 | powinni
190 | powinno
191 | poza
192 | prawie
193 | przecież
194 | przed
195 | przede
196 | przedtem
197 | przez
198 | przy
199 | roku
200 | również
201 | sam
202 | sama
203 | są
204 | się
205 | skąd
206 | sobie
207 | sobą
208 | sposób
209 | swoje
210 | ta
211 | tak
212 | taka
213 | taki
214 | takie
215 | także
216 | tam
217 | te
218 | tego
219 | tej
220 | temu
221 | ten
222 | teraz
223 | też
224 | to
225 | tobą
226 | tobie
227 | toteż
228 | trzeba
229 | tu
230 | tutaj
231 | twoi
232 | twoim
233 | twoja
234 | twoje
235 | twym
236 | twój
237 | ty
238 | tych
239 | tylko
240 | tym
241 | u
242 | w
243 | wam
244 | wami
245 | was
246 | wasz
247 | wasza
248 | wasze
249 | we
250 | według
251 | wiele
252 | wielu
253 | więc
254 | więcej
255 | wszyscy
256 | wszystkich
257 | wszystkie
258 | wszystkim
259 | wszystko
260 | wtedy
261 | wy
262 | właśnie
263 | z
264 | za
265 | zapewne
266 | zawsze
267 | ze
268 | zł
269 | znowu
270 | znów
271 | został
272 | żaden
273 | żadna
274 | żadne
275 | żadnych
276 | że
277 | żeby


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-pt.txt:
--------------------------------------------------------------------------------
  1 | último
  2 | é
  3 | acerca
  4 | agora
  5 | algumas
  6 | alguns
  7 | ali
  8 | ambos
  9 | antes
 10 | apontar
 11 | aquela
 12 | aquelas
 13 | aquele
 14 | aqueles
 15 | aqui
 16 | atrás
 17 | bem
 18 | bom
 19 | cada
 20 | caminho
 21 | cima
 22 | com
 23 | como
 24 | comprido
 25 | conhecido
 26 | corrente
 27 | das
 28 | debaixo
 29 | dentro
 30 | desde
 31 | desligado
 32 | deve
 33 | devem
 34 | deverá
 35 | direita
 36 | diz
 37 | dizer
 38 | dois
 39 | dos
 40 | e
 41 | ela
 42 | ele
 43 | eles
 44 | em
 45 | enquanto
 46 | então
 47 | está
 48 | estão
 49 | estado
 50 | estar
 51 | estará
 52 | este
 53 | estes
 54 | esteve
 55 | estive
 56 | estivemos
 57 | estiveram
 58 | eu
 59 | fará
 60 | faz
 61 | fazer
 62 | fazia
 63 | fez
 64 | fim
 65 | foi
 66 | fora
 67 | horas
 68 | iniciar
 69 | inicio
 70 | ir
 71 | irá
 72 | ista
 73 | iste
 74 | isto
 75 | ligado
 76 | maioria
 77 | maiorias
 78 | mais
 79 | mas
 80 | mesmo
 81 | meu
 82 | muito
 83 | muitos
 84 | nós
 85 | não
 86 | nome
 87 | nosso
 88 | novo
 89 | o
 90 | onde
 91 | os
 92 | ou
 93 | outro
 94 | para
 95 | parte
 96 | pegar
 97 | pelo
 98 | pessoas
 99 | pode
100 | poderá
101 | podia
102 | por
103 | porque
104 | povo
105 | primeiro
106 | quê
107 | qual
108 | qualquer
109 | quando
110 | quem
111 | quieto
112 | são
113 | saber
114 | sem
115 | ser
116 | seu
117 | somente
118 | têm
119 | tal
120 | também
121 | tem
122 | tempo
123 | tenho
124 | tentar
125 | tentaram
126 | tente
127 | tentei
128 | teu
129 | teve
130 | tipo
131 | tive
132 | todos
133 | trabalhar
134 | trabalho
135 | tu
136 | um
137 | uma
138 | umas
139 | uns
140 | usa
141 | usar
142 | valor
143 | veja
144 | ver
145 | verdade
146 | verdadeiro
147 | você
148 | 


--------------------------------------------------------------------------------
/goose/resources/text/stopwords-zh.txt:
--------------------------------------------------------------------------------
  1 | 的
  2 | 一
  3 | 不
  4 | 在
  5 | 人
  6 | 有
  7 | 是
  8 | 为
  9 | 以
 10 | 于
 11 | 上
 12 | 他
 13 | 而
 14 | 后
 15 | 之
 16 | 来
 17 | 及
 18 | 了
 19 | 因
 20 | 下
 21 | 可
 22 | 到
 23 | 由
 24 | 这
 25 | 与
 26 | 也
 27 | 此
 28 | 但
 29 | 并
 30 | 个
 31 | 其
 32 | 已
 33 | 无
 34 | 小
 35 | 我
 36 | 们
 37 | 起
 38 | 最
 39 | 再
 40 | 今
 41 | 去
 42 | 好
 43 | 只
 44 | 又
 45 | 或
 46 | 很
 47 | 亦
 48 | 某
 49 | 把
 50 | 那
 51 | 你
 52 | 乃
 53 | 它
 54 | 吧
 55 | 被
 56 | 比
 57 | 别
 58 | 趁
 59 | 当
 60 | 从
 61 | 到
 62 | 得
 63 | 打
 64 | 凡
 65 | 儿
 66 | 尔
 67 | 该
 68 | 各
 69 | 给
 70 | 跟
 71 | 和
 72 | 何
 73 | 还
 74 | 即
 75 | 几
 76 | 既
 77 | 看
 78 | 据
 79 | 距
 80 | 靠
 81 | 啦
 82 | 了
 83 | 另
 84 | 么
 85 | 每
 86 | 们
 87 | 嘛
 88 | 拿
 89 | 哪
 90 | 那
 91 | 您
 92 | 凭
 93 | 且
 94 | 却
 95 | 让
 96 | 仍
 97 | 啥
 98 | 如
 99 | 若
100 | 使
101 | 谁
102 | 虽
103 | 随
104 | 同
105 | 所
106 | 她
107 | 哇
108 | 嗡
109 | 往
110 | 哪
111 | 些
112 | 向
113 | 沿
114 | 哟
115 | 用
116 | 于
117 | 咱
118 | 则
119 | 怎
120 | 曾
121 | 至
122 | 致
123 | 着
124 | 诸
125 | 自
126 | 為
127 | 於
128 | 後
129 | 這
130 | 與
131 | 並
132 | 個
133 | 無
134 | 們
135 | 當
136 | 從
137 | 兒
138 | 爾
139 | 該
140 | 給
141 | 還
142 | 幾
143 | 麼
144 | 憑
145 | 卻
146 | 讓
147 | 誰
148 | 雖
149 | 喲
150 | 則
151 | 諸
152 | 


--------------------------------------------------------------------------------
/goose/utils/__init__.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """\
  3 | This is a python port of "Goose" originally licensed to Gravity.com
  4 | under one or more contributor license agreements.  See the NOTICE file
  5 | distributed with this work for additional information
  6 | regarding copyright ownership.
  7 | 
  8 | Python port was written by Xavier Grangier for Recrutae
  9 | 
 10 | Gravity.com licenses this file
 11 | to you under the Apache License, Version 2.0 (the "License");
 12 | you may not use this file except in compliance
 13 | with the License.  You may obtain a copy of the License at
 14 | 
 15 | http://www.apache.org/licenses/LICENSE-2.0
 16 | 
 17 | Unless required by applicable law or agreed to in writing, software
 18 | distributed under the License is distributed on an "AS IS" BASIS,
 19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 20 | See the License for the specific language governing permissions and
 21 | limitations under the License.
 22 | """
 23 | import time
 24 | import hashlib
 25 | import re
 26 | import os
 27 | import goose
 28 | import codecs
 29 | import urlparse
 30 | 
 31 | 
 32 | class BuildURL(object):
 33 |     def __init__(self, url, finalurl=None):
 34 |         self.url = url
 35 |         self.finalurl = finalurl
 36 | 
 37 |     def getHostname(self, o):
 38 |         if o.hostname:
 39 |             return o.hotname
 40 |         elif self.finalurl:
 41 |             oo = urlparse(self.finalurl)
 42 |             if oo.hostname:
 43 |                 return oo.hostname
 44 |         return None
 45 | 
 46 |     def getScheme(self, o):
 47 |         if o.scheme:
 48 |             return o.scheme
 49 |         elif self.finalurl:
 50 |             oo = urlparse(self.finalurl)
 51 |             if oo.scheme:
 52 |                 return oo.scheme
 53 |         return 'http'
 54 | 
 55 |     def getUrl(self):
 56 |         """\
 57 | 
 58 |         """
 59 |         url_obj = urlparse(self.url)
 60 |         scheme = self.getScheme(url_obj)
 61 |         hostname = self.getHostname(url_obj)
 62 | 
 63 | 
 64 | class FileHelper(object):
 65 | 
 66 |     @classmethod
 67 |     def loadResourceFile(self, filename):
 68 |         if not os.path.isabs('filename'):
 69 |             dirpath = os.path.dirname(goose.__file__)
 70 |             path = os.path.join(dirpath, 'resources', filename)
 71 |         else:
 72 |             path = filename
 73 |         try:
 74 |             f = codecs.open(path, 'r', 'utf-8')
 75 |             content = f.read()
 76 |             f.close()
 77 |             return content
 78 |         except IOError:
 79 |             raise IOError("Couldn't open file %s" % path)
 80 | 
 81 | 
 82 | class ParsingCandidate(object):
 83 | 
 84 |     def __init__(self, urlString, link_hash):
 85 |         self.urlString = self.url = urlString
 86 |         self.link_hash = link_hash
 87 | 
 88 | 
 89 | class RawHelper(object):
 90 |     @classmethod
 91 |     def get_parsing_candidate(self, url, raw_html):
 92 |         if isinstance(raw_html, unicode):
 93 |             raw_html = raw_html.encode('utf-8')
 94 |         link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time())
 95 |         return ParsingCandidate(url, link_hash)
 96 | 
 97 | 
 98 | class URLHelper(object):
 99 |     @classmethod
100 |     def get_parsing_candidate(self, url_to_crawl):
101 |         # replace shebang is urls
102 |         final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \
103 |                     if '#!' in url_to_crawl else url_to_crawl
104 |         link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time())
105 |         return ParsingCandidate(final_url, link_hash)
106 | 
107 | 
class StringSplitter(object):
    """\
    Splits strings on a regular expression compiled once at construction.
    """
    def __init__(self, pattern):
        self.pattern = re.compile(pattern)

    def split(self, string):
        """Return the pieces of ``string`` between matches; [] if falsy."""
        return self.pattern.split(string) if string else []
119 | 
120 | 
class StringReplacement(object):
    """A single literal (non-regex) substring replacement."""

    def __init__(self, pattern, replaceWith):
        self.replaceWith = replaceWith
        self.pattern = pattern

    def replaceAll(self, string):
        """Return ``string`` with every ``pattern`` occurrence replaced.

        Falsy input (``None`` or an empty string) yields ``u''``.
        """
        if string:
            return string.replace(self.pattern, self.replaceWith)
        return u''
131 | 
132 | 
class ReplaceSequence(object):
    """Ordered chain of string replacements applied one after another."""

    def __init__(self):
        self.replacements = []

    def create(self, firstPattern, replaceWith=None):
        """Queue a replacement of ``firstPattern`` and return this sequence.

        A falsy ``replaceWith`` means the pattern is replaced by ``u''``.
        """
        substitute = replaceWith if replaceWith else u''
        self.replacements.append(StringReplacement(firstPattern, substitute))
        return self

    def append(self, pattern, replaceWith=None):
        """Alias of ``create``; returns this sequence so calls can chain."""
        return self.create(pattern, replaceWith)

    def replaceAll(self, string):
        """Run ``string`` through every queued replacement, in order.

        Falsy input (``None`` or an empty string) yields ``u''``.
        """
        if not string:
            return u''

        current = string
        for step in self.replacements:
            current = step.replaceAll(current)
        return current
156 | 


--------------------------------------------------------------------------------
/goose/version.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
# Version as a tuple of integers; __version__ is its dotted string form.
version_info = (1, 0, 22)
__version__ = ".".join(str(part) for part in version_info)
26 | 


--------------------------------------------------------------------------------
/goose/videos/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/goose/videos/__init__.py


--------------------------------------------------------------------------------
/goose/videos/videos.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """\
 3 | This is a python port of "Goose" originally licensed to Gravity.com
 4 | under one or more contributor license agreements.  See the NOTICE file
 5 | distributed with this work for additional information
 6 | regarding copyright ownership.
 7 | 
 8 | Python port was written by Xavier Grangier for Recrutae
 9 | 
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License.  You may obtain a copy of the License at
14 | 
15 | http://www.apache.org/licenses/LICENSE-2.0
16 | 
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | 
class Video(object):
    """\
    Plain record describing one video; every field starts out as None.
    """

    def __init__(self):
        self.embed_type = None  # type of embed: embed, object, iframe
        self.provider = None    # video provider name
        self.width = None       # width
        self.height = None      # height
        self.embed_code = None  # embed code
        self.src = None         # src
49 | 


--------------------------------------------------------------------------------
/httplib2/iri2uri.py:
--------------------------------------------------------------------------------
  1 | """
  2 | iri2uri
  3 | 
  4 | Converts an IRI to a URI.
  5 | 
  6 | """
  7 | __author__ = "Joe Gregorio (joe@bitworking.org)"
  8 | __copyright__ = "Copyright 2006, Joe Gregorio"
  9 | __contributors__ = []
 10 | __version__ = "1.0.0"
 11 | __license__ = "MIT"
 12 | __history__ = """
 13 | """
 14 | 
 15 | import urlparse
 16 | 
 17 | 
 18 | # Convert an IRI to a URI following the rules in RFC 3987
 19 | #
 20 | # The characters we need to encode and escape are defined in the spec:
 21 | #
 22 | # iprivate =  %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
 23 | # ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
 24 | #         / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
 25 | #         / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
 26 | #         / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
 27 | #         / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
 28 | #         / %xD0000-DFFFD / %xE1000-EFFFD
 29 | 
 30 | escape_range = [
 31 |     (0xA0, 0xD7FF),
 32 |     (0xE000, 0xF8FF),
 33 |     (0xF900, 0xFDCF),
 34 |     (0xFDF0, 0xFFEF),
 35 |     (0x10000, 0x1FFFD),
 36 |     (0x20000, 0x2FFFD),
 37 |     (0x30000, 0x3FFFD),
 38 |     (0x40000, 0x4FFFD),
 39 |     (0x50000, 0x5FFFD),
 40 |     (0x60000, 0x6FFFD),
 41 |     (0x70000, 0x7FFFD),
 42 |     (0x80000, 0x8FFFD),
 43 |     (0x90000, 0x9FFFD),
 44 |     (0xA0000, 0xAFFFD),
 45 |     (0xB0000, 0xBFFFD),
 46 |     (0xC0000, 0xCFFFD),
 47 |     (0xD0000, 0xDFFFD),
 48 |     (0xE1000, 0xEFFFD),
 49 |     (0xF0000, 0xFFFFD),
 50 |     (0x100000, 0x10FFFD),
 51 | ]
 52 | 
 53 | def encode(c):
 54 |     retval = c
 55 |     i = ord(c)
 56 |     for low, high in escape_range:
 57 |         if i < low:
 58 |             break
 59 |         if i >= low and i <= high:
 60 |             retval = "".join(["%%%2X" % ord(o) for o in c.encode('utf-8')])
 61 |             break
 62 |     return retval
 63 | 
 64 | 
 65 | def iri2uri(uri):
 66 |     """Convert an IRI to a URI. Note that IRIs must be
 67 |     passed in a unicode strings. That is, do not utf-8 encode
 68 |     the IRI before passing it into the function."""
 69 |     if isinstance(uri ,unicode):
 70 |         (scheme, authority, path, query, fragment) = urlparse.urlsplit(uri)
 71 |         authority = authority.encode('idna')
 72 |         # For each character in 'ucschar' or 'iprivate'
 73 |         #  1. encode as utf-8
 74 |         #  2. then %-encode each octet of that utf-8
 75 |         uri = urlparse.urlunsplit((scheme, authority, path, query, fragment))
 76 |         uri = "".join([encode(c) for c in uri])
 77 |     return uri
 78 | 
 79 | if __name__ == "__main__":
 80 |     import unittest
 81 | 
 82 |     class Test(unittest.TestCase):
 83 | 
 84 |         def test_uris(self):
 85 |             """Test that URIs are invariant under the transformation."""
 86 |             invariant = [
 87 |                 u"ftp://ftp.is.co.za/rfc/rfc1808.txt",
 88 |                 u"http://www.ietf.org/rfc/rfc2396.txt",
 89 |                 u"ldap://[2001:db8::7]/c=GB?objectClass?one",
 90 |                 u"mailto:John.Doe@example.com",
 91 |                 u"news:comp.infosystems.www.servers.unix",
 92 |                 u"tel:+1-816-555-1212",
 93 |                 u"telnet://192.0.2.16:80/",
 94 |                 u"urn:oasis:names:specification:docbook:dtd:xml:4.1.2" ]
 95 |             for uri in invariant:
 96 |                 self.assertEqual(uri, iri2uri(uri))
 97 | 
 98 |         def test_iri(self):
 99 |             """ Test that the right type of escaping is done for each part of the URI."""
100 |             self.assertEqual("http://xn--o3h.com/%E2%98%84", iri2uri(u"http://\N{COMET}.com/\N{COMET}"))
101 |             self.assertEqual("http://bitworking.org/?fred=%E2%98%84", iri2uri(u"http://bitworking.org/?fred=\N{COMET}"))
102 |             self.assertEqual("http://bitworking.org/#%E2%98%84", iri2uri(u"http://bitworking.org/#\N{COMET}"))
103 |             self.assertEqual("#%E2%98%84", iri2uri(u"#\N{COMET}"))
104 |             self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}"))
105 |             self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}")))
106 |             self.assertNotEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}".encode('utf-8')))
107 | 
108 |     unittest.main()
109 | 
110 | 
111 | 


--------------------------------------------------------------------------------
/httplib2/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/httplib2/test/__init__.py


--------------------------------------------------------------------------------
/httplib2/test/brokensocket/socket.py:
--------------------------------------------------------------------------------
1 | from realsocket import gaierror, error, getaddrinfo, SOCK_STREAM
2 | 


--------------------------------------------------------------------------------
/httplib2/test/functional/test_proxies.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import errno
 3 | import os
 4 | import signal
 5 | import subprocess
 6 | import tempfile
 7 | 
 8 | import nose
 9 | 
10 | import httplib2
11 | from httplib2 import socks
12 | from httplib2.test import miniserver
13 | 
14 | tinyproxy_cfg = """
15 | User "%(user)s"
16 | Port %(port)s
17 | Listen 127.0.0.1
18 | PidFile "%(pidfile)s"
19 | LogFile "%(logfile)s"
20 | MaxClients 2
21 | StartServers 1
22 | LogLevel Info
23 | """
24 | 
25 | 
26 | class FunctionalProxyHttpTest(unittest.TestCase):
27 |     def setUp(self):
28 |         if not socks:
29 |             raise nose.SkipTest('socks module unavailable')
30 |         if not subprocess:
31 |             raise nose.SkipTest('subprocess module unavailable')
32 | 
33 |         # start a short-lived miniserver so we can get a likely port
34 |         # for the proxy
35 |         self.httpd, self.proxyport = miniserver.start_server(
36 |             miniserver.ThisDirHandler)
37 |         self.httpd.shutdown()
38 |         self.httpd, self.port = miniserver.start_server(
39 |             miniserver.ThisDirHandler)
40 | 
41 |         self.pidfile = tempfile.mktemp()
42 |         self.logfile = tempfile.mktemp()
43 |         fd, self.conffile = tempfile.mkstemp()
44 |         f = os.fdopen(fd, 'w')
45 |         our_cfg = tinyproxy_cfg % {'user': os.getlogin(),
46 |                                    'pidfile': self.pidfile,
47 |                                    'port': self.proxyport,
48 |                                    'logfile': self.logfile}
49 |         f.write(our_cfg)
50 |         f.close()
51 |         try:
52 |             # TODO use subprocess.check_call when 2.4 is dropped
53 |             ret = subprocess.call(['tinyproxy', '-c', self.conffile])
54 |             self.assertEqual(0, ret)
55 |         except OSError, e:
56 |             if e.errno == errno.ENOENT:
57 |                 raise nose.SkipTest('tinyproxy not available')
58 |             raise
59 | 
60 |     def tearDown(self):
61 |         self.httpd.shutdown()
62 |         try:
63 |             pid = int(open(self.pidfile).read())
64 |             os.kill(pid, signal.SIGTERM)
65 |         except OSError, e:
66 |             if e.errno == errno.ESRCH:
67 |                 print '\n\n\nTinyProxy Failed to start, log follows:'
68 |                 print open(self.logfile).read()
69 |                 print 'end tinyproxy log\n\n\n'
70 |             raise
71 |         map(os.unlink, (self.pidfile,
72 |                         self.logfile,
73 |                         self.conffile))
74 | 
75 |     def testSimpleProxy(self):
76 |         proxy_info = httplib2.ProxyInfo(socks.PROXY_TYPE_HTTP,
77 |                                         'localhost', self.proxyport)
78 |         client = httplib2.Http(proxy_info=proxy_info)
79 |         src = 'miniserver.py'
80 |         response, body = client.request('http://localhost:%d/%s' %
81 |                                         (self.port, src))
82 |         self.assertEqual(response.status, 200)
83 |         self.assertEqual(body, open(os.path.join(miniserver.HERE, src)).read())
84 |         lf = open(self.logfile).read()
85 |         expect = ('Established connection to host "127.0.0.1" '
86 |                   'using file descriptor')
87 |         self.assertTrue(expect in lf,
88 |                         'tinyproxy did not proxy a request for miniserver')
89 | 


--------------------------------------------------------------------------------
/httplib2/test/miniserver.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import select
  4 | import SimpleHTTPServer
  5 | import SocketServer
  6 | import threading
  7 | 
  8 | HERE = os.path.dirname(__file__)
  9 | logger = logging.getLogger(__name__)
 10 | 
 11 | 
 12 | class ThisDirHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
 13 |     def translate_path(self, path):
 14 |         path = path.split('?', 1)[0].split('#', 1)[0]
 15 |         return os.path.join(HERE, *filter(None, path.split('/')))
 16 | 
 17 |     def log_message(self, s, *args):
 18 |         # output via logging so nose can catch it
 19 |         logger.info(s, *args)
 20 | 
 21 | 
 22 | class ShutdownServer(SocketServer.TCPServer):
 23 |     """Mixin that allows serve_forever to be shut down.
 24 | 
 25 |     The methods in this mixin are backported from SocketServer.py in the Python
 26 |     2.6.4 standard library. The mixin is unnecessary in 2.6 and later, when
 27 |     BaseServer supports the shutdown method directly.
 28 |     """
 29 | 
 30 |     def __init__(self, *args, **kwargs):
 31 |         SocketServer.TCPServer.__init__(self, *args, **kwargs)
 32 |         self.__is_shut_down = threading.Event()
 33 |         self.__serving = False
 34 | 
 35 |     def serve_forever(self, poll_interval=0.1):
 36 |         """Handle one request at a time until shutdown.
 37 | 
 38 |         Polls for shutdown every poll_interval seconds. Ignores
 39 |         self.timeout. If you need to do periodic tasks, do them in
 40 |         another thread.
 41 |         """
 42 |         self.__serving = True
 43 |         self.__is_shut_down.clear()
 44 |         while self.__serving:
 45 |             r, w, e = select.select([self.socket], [], [], poll_interval)
 46 |             if r:
 47 |                 self._handle_request_noblock()
 48 |         self.__is_shut_down.set()
 49 | 
 50 |     def shutdown(self):
 51 |         """Stops the serve_forever loop.
 52 | 
 53 |         Blocks until the loop has finished. This must be called while
 54 |         serve_forever() is running in another thread, or it will deadlock.
 55 |         """
 56 |         self.__serving = False
 57 |         self.__is_shut_down.wait()
 58 | 
 59 |     def handle_request(self):
 60 |         """Handle one request, possibly blocking.
 61 | 
 62 |         Respects self.timeout.
 63 |         """
 64 |         # Support people who used socket.settimeout() to escape
 65 |         # handle_request before self.timeout was available.
 66 |         timeout = self.socket.gettimeout()
 67 |         if timeout is None:
 68 |             timeout = self.timeout
 69 |         elif self.timeout is not None:
 70 |             timeout = min(timeout, self.timeout)
 71 |         fd_sets = select.select([self], [], [], timeout)
 72 |         if not fd_sets[0]:
 73 |             self.handle_timeout()
 74 |             return
 75 |         self._handle_request_noblock()
 76 | 
 77 |     def _handle_request_noblock(self):
 78 |         """Handle one request, without blocking.
 79 | 
 80 |         I assume that select.select has returned that the socket is
 81 |         readable before this function was called, so there should be
 82 |         no risk of blocking in get_request().
 83 |         """
 84 |         try:
 85 |             request, client_address = self.get_request()
 86 |         except socket.error:
 87 |             return
 88 |         if self.verify_request(request, client_address):
 89 |             try:
 90 |                 self.process_request(request, client_address)
 91 |             except:
 92 |                 self.handle_error(request, client_address)
 93 |                 self.close_request(request)
 94 | 
 95 | 
 96 | def start_server(handler):
 97 |     httpd = ShutdownServer(("", 0), handler)
 98 |     threading.Thread(target=httpd.serve_forever).start()
 99 |     _, port = httpd.socket.getsockname()
100 |     return httpd, port
101 | 


--------------------------------------------------------------------------------
/httplib2/test/other_cacerts.txt:
--------------------------------------------------------------------------------
 1 | # Certificate Authority certificates for validating SSL connections.
 2 | #
 3 | # This file contains PEM format certificates generated from
 4 | # http://mxr.mozilla.org/seamonkey/source/security/nss/lib/ckfw/builtins/certdata.txt
 5 | #
 6 | # ***** BEGIN LICENSE BLOCK *****
 7 | # Version: MPL 1.1/GPL 2.0/LGPL 2.1
 8 | #
 9 | # The contents of this file are subject to the Mozilla Public License Version
10 | # 1.1 (the "License"); you may not use this file except in compliance with
11 | # the License. You may obtain a copy of the License at
12 | # http://www.mozilla.org/MPL/
13 | #
14 | # Software distributed under the License is distributed on an "AS IS" basis,
15 | # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
16 | # for the specific language governing rights and limitations under the
17 | # License.
18 | #
19 | # The Original Code is the Netscape security libraries.
20 | #
21 | # The Initial Developer of the Original Code is
22 | # Netscape Communications Corporation.
23 | # Portions created by the Initial Developer are Copyright (C) 1994-2000
24 | # the Initial Developer. All Rights Reserved.
25 | #
26 | # Contributor(s):
27 | #
28 | # Alternatively, the contents of this file may be used under the terms of
29 | # either the GNU General Public License Version 2 or later (the "GPL"), or
30 | # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
31 | # in which case the provisions of the GPL or the LGPL are applicable instead
32 | # of those above. If you wish to allow use of your version of this file only
33 | # under the terms of either the GPL or the LGPL, and not to allow others to
34 | # use your version of this file under the terms of the MPL, indicate your
35 | # decision by deleting the provisions above and replace them with the notice
36 | # and other provisions required by the GPL or the LGPL. If you do not delete
37 | # the provisions above, a recipient may use your version of this file under
38 | # the terms of any one of the MPL, the GPL or the LGPL.
39 | #
40 | # ***** END LICENSE BLOCK *****
41 | 
42 | 
43 | Comodo CA Limited, CN=Trusted Certificate Services
44 | ==================================================
45 | 
46 | -----BEGIN CERTIFICATE-----
47 | MIIEQzCCAyugAwIBAgIBATANBgkqhkiG9w0BAQUFADB/MQswCQYDVQQGEwJHQjEb
48 | MBkGA1UECAwSR3JlYXRlciBNYW5jaGVzdGVyMRAwDgYDVQQHDAdTYWxmb3JkMRow
49 | GAYDVQQKDBFDb21vZG8gQ0EgTGltaXRlZDElMCMGA1UEAwwcVHJ1c3RlZCBDZXJ0
50 | aWZpY2F0ZSBTZXJ2aWNlczAeFw0wNDAxMDEwMDAwMDBaFw0yODEyMzEyMzU5NTla
51 | MH8xCzAJBgNVBAYTAkdCMRswGQYDVQQIDBJHcmVhdGVyIE1hbmNoZXN0ZXIxEDAO
52 | BgNVBAcMB1NhbGZvcmQxGjAYBgNVBAoMEUNvbW9kbyBDQSBMaW1pdGVkMSUwIwYD
53 | VQQDDBxUcnVzdGVkIENlcnRpZmljYXRlIFNlcnZpY2VzMIIBIjANBgkqhkiG9w0B
54 | AQEFAAOCAQ8AMIIBCgKCAQEA33FvNlhTWvI2VFeAxHQIIO0Yfyod5jWaHiWsnOWW
55 | fnJSoBVC21ndZHoa0Lh73TkVvFVIxO06AOoxEbrycXQaZ7jPM8yoMa+j49d/vzMt
56 | TGo87IvDktJTdyR0nAducPy9C1t2ul/y/9c3S0pgePfw+spwtOpZqqPOSC+pw7IL
57 | fhdyFgymBwwbOM/JYrc/oJOlh0Hyt3BAd9i+FHzjqMB6juljatEPmsbS9Is6FARW
58 | 1O24zG71++IsWL1/T2sr92AkWCTOJu80kTrV44HQsvAEAtdbtz6SrGsSivnkBbA7
59 | kUlcsutT6vifR4buv5XAwAaf0lteERv0xwQ1KdJVXOTt6wIDAQABo4HJMIHGMB0G
60 | A1UdDgQWBBTFe1i97doladL3WRaoszLAeydb9DAOBgNVHQ8BAf8EBAMCAQYwDwYD
61 | VR0TAQH/BAUwAwEB/zCBgwYDVR0fBHwwejA8oDqgOIY2aHR0cDovL2NybC5jb21v
62 | ZG9jYS5jb20vVHJ1c3RlZENlcnRpZmljYXRlU2VydmljZXMuY3JsMDqgOKA2hjRo
63 | dHRwOi8vY3JsLmNvbW9kby5uZXQvVHJ1c3RlZENlcnRpZmljYXRlU2VydmljZXMu
64 | Y3JsMA0GCSqGSIb3DQEBBQUAA4IBAQDIk4E7ibSvuIQSTI3S8NtwuleGFTQQuS9/
65 | HrCoiWChisJ3DFBKmwCL2Iv0QeLQg4pKHBQGsKNoBXAxMKdTmw7pSqBYaWcOrp32
66 | pSxBvzwGa+RZzG0Q8ZZvH9/0BAKkn0U+yNj6NkZEUD+Cl5EfKNsYEYwq5GWDVxIS
67 | jBc/lDb+XbDABHcTuPQV1T84zJQ6VdCsmPW6AF/ghhmBeC8owH7TzEIK9a5QoNE+
68 | xqFx7D+gIIxmOom0jtTYsU0lR+4viMi14QVFwL4Ucd56/Y57fU0IlqUSc/Atyjcn
69 | dBInTMu2l+nZrghtWjlA3QVHdWpaIbOjGM9O9y5Xt5hwXsjEeLBi
70 | -----END CERTIFICATE-----
71 | 


--------------------------------------------------------------------------------
/httplib2/test/smoke_test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | 
 4 | import httplib2
 5 | 
 6 | from httplib2.test import miniserver
 7 | 
 8 | 
 9 | class HttpSmokeTest(unittest.TestCase):
10 |     def setUp(self):
11 |         self.httpd, self.port = miniserver.start_server(
12 |             miniserver.ThisDirHandler)
13 | 
14 |     def tearDown(self):
15 |         self.httpd.shutdown()
16 | 
17 |     def testGetFile(self):
18 |         client = httplib2.Http()
19 |         src = 'miniserver.py'
20 |         response, body = client.request('http://localhost:%d/%s' %
21 |                                         (self.port, src))
22 |         self.assertEqual(response.status, 200)
23 |         self.assertEqual(body, open(os.path.join(miniserver.HERE, src)).read())
24 | 


--------------------------------------------------------------------------------
/httplib2/test/test_no_socket.py:
--------------------------------------------------------------------------------
 1 | """Tests for httplib2 when the socket module is missing.
 2 | 
 3 | This helps ensure compatibility with environments such as AppEngine.
 4 | """
 5 | import os
 6 | import sys
 7 | import unittest
 8 | 
 9 | import httplib2
10 | 
class MissingSocketTest(unittest.TestCase):
    """Checks proxy handling while ``httplib2.socks`` is set to None."""

    def setUp(self):
        # remember the real value and replace it with None for the test
        self._oldsocks, httplib2.socks = httplib2.socks, None

    def tearDown(self):
        httplib2.socks = self._oldsocks

    def testProxyDisabled(self):
        """A request through a proxy must raise ProxiesUnavailableError."""
        client = httplib2.Http(
            proxy_info=httplib2.ProxyInfo('blah', 'localhost', 0))
        self.assertRaises(httplib2.ProxiesUnavailableError,
                          client.request, 'http://localhost:-1/')
25 | 


--------------------------------------------------------------------------------
/index.yaml:
--------------------------------------------------------------------------------
 1 | indexes:
 2 | 
 3 | # AUTOGENERATED
 4 | 
 5 | # This index.yaml is automatically updated whenever the dev_appserver
 6 | # detects that a new type of query is run.  If you want to manage the
 7 | # index.yaml file manually, remove the above marker line (the line
 8 | # saying "# AUTOGENERATED").  If you want to manage some indexes
 9 | # manually, move them above the marker line.  The index.yaml file is
10 | # automatically uploaded to the admin console when you next deploy
11 | # your application using appcfg.py.
12 | 
13 | 


--------------------------------------------------------------------------------
/instructions.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | 
 3 | <html>
 4 | <head>
 5 | <title>Astrobot Instructions</title>
 6 | <style>
 7 | body {
 8 | 	line-height: 1.4em;
 9 | 	font-family: sans-serif;
10 | 	padding: 2em;
11 | 	max-width: 600px;
12 | 	margin: auto;
13 | }
14 | </style>
15 | </head>
16 | <body>
17 | 
18 | <h1>Astrobot Instructions</h1>
19 | <p>Coming soon...</p>
20 | 
21 | </body>
22 | </html>
23 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # Copyright 2007 Google Inc.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | import webapp2
18 | import uuid
19 | import os
20 | import base64
21 | import pickle
22 | from google.appengine.ext import ndb
23 | import browse
24 | from xml.sax import saxutils
25 | import json
26 | 
class State(ndb.Model):
	"""Datastore entity holding one pickled session state."""
	# pickled state bytes, stored compressed
	pickled = ndb.BlobProperty(compressed=True)
29 | 
30 | def interact(query, stateid):
31 | 	state = State.get_or_insert(stateid)
32 | 	unpickled_state = pickle.loads(state.pickled) if state.pickled else {}
33 | 	messages = browse.interact(query, unpickled_state)
34 | 	print "MESSAGES", messages
35 | 	state.pickled = pickle.dumps(unpickled_state)
36 | 	state.put()
37 | 	return messages
38 | 
class MainHandler(webapp2.RequestHandler):
    """Serves page.html for GET requests."""

    def get(self):
        # the with-block closes the file handle as soon as it has been read
        with open('page.html') as page:
            self.response.write(page.read())
42 | 
class Interact(webapp2.RequestHandler):
	"""Handles a POSTed query from the web page and replies with JSON.

	The session is identified by the 'stateid' cookie; a fresh random id is
	issued when no usable cookie is present. The response body is a JSON
	object of the form {"messages": [...]}.
	"""
	def post(self):
		query = self.request.get('query')
		stateid = self.request.cookies.get('stateid', None)
		# The cookie is client-controlled. Ids starting with 'phone:' are the
		# keys the Twilio handler uses for SMS sessions, so accepting one here
		# would let a web client read and drive somebody else's SMS session.
		# Generated ids are base64 and therefore never contain ':'.
		if not stateid or stateid.startswith('phone:'):
			stateid = base64.b64encode(uuid.uuid4().bytes + os.urandom(64))
			self.response.set_cookie('stateid', stateid, max_age=3600*20)
		self.response.write(json.dumps({"messages": interact(query, stateid)}))
51 | 
class Twilio(webapp2.RequestHandler):
	"""Handles a POST carrying 'From' and 'Body' fields and replies with XML.

	The session key is 'phone:' followed by the 'From' value. Each reply
	message is XML-escaped and wrapped in a <Message> element inside one
	<Response> document.
	"""
	def post(self):
		# imported locally so this block does not depend on a module-level import
		import logging

		from_phone = self.request.get('From')
		query = self.request.get('Body')
		try:
			messages = interact(query, 'phone:'+from_phone)
		except Exception:
			# keep answering with a friendly message, but leave a traceback
			# in the log instead of dropping the error silently
			logging.exception("interact() failed for a Twilio request")
			messages = ["Oops, something went wrong."]
		self.response.content_type = 'text/xml'
		self.response.write('<?xml version="1.0" encoding="UTF-8" ?><Response>')
		for msg in messages:
			self.response.write(u"<Message>{0}</Message>".format(saxutils.escape(msg)))
		self.response.write("</Response>")
66 | 
67 | 
class InstructionsHandler(webapp2.RequestHandler):
	"""Serves instructions.html for GET requests."""
	def get(self):
		# the with-block closes the file handle as soon as it has been read
		with open('instructions.html') as page:
			self.response.write(page.read())
71 | 
# WSGI application object: maps each URL path to its request handler class.
app = webapp2.WSGIApplication(
    [
        ('/', MainHandler),
        ('/instructions', InstructionsHandler),
        ('/interact', Interact),
        ('/twilio', Twilio),
    ],
    debug=True)
78 | 


--------------------------------------------------------------------------------
/page.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 | 	<title>Astrobot: browse the web with pure SMS</title>
  5 | 	<meta name="viewport" content="width=device-width, user-scalable=no"/>
  6 | 	<script src='//ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js'></script>
  7 | 	<script>
  8 | 	$(document).ready(function() {
  9 | 		$("#form").submit(function() {
 10 | 			$("<li></li>").text($("#form input[name=query]").val()).addClass("sent").appendTo("#messages");
 11 | 			$.ajax({
 12 | 				url: '/interact',
 13 | 				method: 'POST',
 14 | 				data: $('#form').serialize(),
 15 | 				success: function(result) {
 16 | 					JSON.parse(result).messages.forEach(function(msg) {
 17 | 						console.log(msg)
 18 | 						$("<li></li>").text(msg).appendTo("#messages");
 19 | 					});
 20 | 				}
 21 | 			})
 22 | 			$("#form input[name=query]").val("");
 23 | 			return false;
 24 | 		})
 25 | 	})
 26 | 	</script>
 27 | 	<style>
 28 | 	body {
 29 | 		margin: 20px;
 30 | 		font-family: sans-serif;
 31 | 		max-width: 600px;
 32 | 		width: 90%;
 33 | 		margin: auto;
 34 | 		text-align: center;
 35 | 		padding-bottom: 2em;
 36 | 	}
 37 | 	#messages {
 38 | 		list-style-type: none;
 39 | 		padding: 0px;
 40 | 	}
 41 | 	#messages li {
 42 | 		white-space: pre-wrap;
 43 | 		padding: 15px;
 44 | 		border-radius: 10px;
 45 | 		color: white;
 46 | 		background-color: #1882ff;
 47 | 		margin: 20px;
 48 | 		text-align: left;
 49 | 	}
 50 | 	#messages li.sent {
 51 | 		color: black;
 52 | 		background-color: #dbd9e1;
 53 | 		text-align: right;
 54 | 	}
 55 | 	#form {
 56 | 		display: inline-block;
 57 | 	}
 58 | 	#commands {
 59 | 		padding: 0;
 60 | 		text-align: center;
 61 | 	}
 62 | 	#commands li {
 63 | 		display: inline-block;
 64 | 		padding: 10px;
 65 | 		list-style-type: none;
 66 | 		max-width: 40%
 67 | 	}
 68 | 	#commands li div:before {
 69 | 		content: "“";
 70 | 		font-size: large;
 71 | 	}
 72 | 	#commands li div:after {
 73 | 		content: "”";
 74 | 		font-size: large
 75 | 	}
 76 | 	#commands li span {
 77 | 		font-size: small;
 78 | 		display: block;
 79 | 		font-style: italic;
 80 | 	}
 81 | 	</style>
 82 | </head>
 83 | <body>
 84 | <a href='http://github.com/nate-parrott/astro-bot'><img alt='Fork me on GitHub' style='display: block; position: absolute; top: 0px; right: 0px; width: 149px' src='http://aral.github.io/fork-me-on-github-retina-ribbons/right-turquoise@2x.png'/></a>
 85 | 
 86 | 	<h1>astro-bot</h1>
 87 | 	<p>Browse the web over sms. Text <strong>646-576-7688</strong>.</p>
 88 | 	<h4>Things you can say...</h4>
 89 | 	<ul id='commands'>
 90 | 		<li><div>go to hackerschool.com</div></li>
 91 | 		<li><div>2 more</div> <span>sends 2 more messages of content</span></li>
 92 | 		<li><div>4</div> <span>clicks the link named '4'</span></li>
 93 | 		<li><div>search the web for george harrison</div></li>
 94 | 		<li><div>where am i?</div> <span>shows your current URL</span></li>
 95 | 		<li><div>table of contents</div> <span>allows you to jump to specific headings on a page</span></li>
 96 | 	</ul>
 97 | 	<ol id='messages'>
 98 | 
 99 | 	</ol>
100 | 	<form id='form'>
101 | 		<input name='query' placeholder='Ask something...' />
102 | 		<input type='submit' />
103 | 	</form>
104 | </body>
105 | </html>
106 | 


--------------------------------------------------------------------------------
/parse_command.py:
--------------------------------------------------------------------------------
 1 | from wise import Phrase, parse_phrase
 2 | 
 3 | examples = [
 4 | 	Phrase("url", [["*url", "google.com"]]),
 5 | 	Phrase("url", ["load", ["*url", "google.com"]]),
 6 | 	Phrase("url", ["open", ["*url", "google.com"]]),
 7 | 	Phrase("url", ["fetch", ["*url", "google.com"]]),
 8 | 	Phrase("url", ["go to", ["*url", "google.com"]]),
 9 | 	Phrase("url", ["show", ["*url", "google.com"]]),
10 | 	Phrase("search", ["search", ["~query", "hacker school"]]),
11 | 	Phrase("search", ["google", ["~query", "weather 11215"]]),
12 | 	Phrase("search", ["search the web for", ["~query", "kanye west"]]),
13 | 	Phrase("search", ["search for", ["~query", "hello world"]]),
14 | 	Phrase("search", ["search", ["search_source/wikipedia", "wikipedia"], "for", ["~query", "praying mantis"]]),
15 | 	Phrase("search", [["search_source/wikipedia", "wikipedia"], ["~query", "android"]]),
16 | 	Phrase("search", [["search_source/wikipedia", "show me the wikipedia article for"], ["~query", "the grateful dead"]]),
17 | 	Phrase("search", ["search", ["search_source/this_site", "this site"], "for", ["~query", "contact us"]]),
18 | 	Phrase("search", ["find", ["~query", "support"], "on", ["search_source/this_site", "this site"]]),
19 | 	Phrase("search", [["~query", "barack obama"]]),
20 | 	Phrase("more_text", ["more"]),
21 | 	Phrase("more_text", [["*number", "2"], "more pages"]),
22 | 	Phrase("more_text", [["*number", "3"], "more pages"]),
23 | 	Phrase("more_text", ["next"]),
24 | 	Phrase("more_text", ["next", ["*number", "4"]]),
25 | 	Phrase("previous_text", ["previous"]),
26 | 	Phrase("previous_text", ["last", ["*number", "3"]]),
27 | 	Phrase("previous_text", ["previous", ["*number", "7"], "messages"]),
28 | 	Phrase("previous_text", ["last part"]),
29 | 	Phrase("back_to_top", ["back to top of page"]),
30 | 	Phrase("navigate", ["click", ["*number", "6"]]),
31 | 	Phrase("navigate", [["*number", "7"]]),
32 | 	Phrase("navigate", [["*number", "7"]]),
33 | 	Phrase("navigate", [["*number", "7"]]),
34 | 	Phrase("navigate", ["click link", ["target", "hvuiehguo"]]),
35 | 	Phrase("navigate", ["click", ["target", "ihenigo"], ["on_last_page", "on last page"]]),
36 | 	Phrase("navigate", ["load", ["target", "jegotghr"], ["on_last_page", "from previous page"]]),
37 | 	Phrase("show_navigation", ["show navigation"]),
38 | 	Phrase("help", ["help me"]),
39 | 	Phrase("help", ["what are the options"]),
40 | 	Phrase("help", ["what can I say?"]),
41 | 	Phrase("summarize", ["summarize this page"]),
42 | 	Phrase("summarize", ["summarize", ["*number", "2"]]),
43 | 	Phrase("show summary for", ["*number", "3"]),
44 | 	Phrase("back", ["back"]),
45 | 	Phrase("whereami", ["where am i?"]),
46 | 	Phrase("whereami", ["what page am i on?"]),
47 | 	Phrase("whereami", ["current site"]),
48 | 	Phrase("contents", ["show me the table of contents"]),
49 | 	Phrase("contents", ["zoom out"]),
50 | 	Phrase("contents", ["list the headings on the page"])
51 | ]
# Regular expressions keyed by name; presumably applied to the "*url" and
# "*number" tags used in the example phrases -- confirm against wise.
regexes = dict(
	url=r"[a-zA-Z0-9_\-\.]+\.[a-z]+(\/[^ ]*)?",
	number=r"\-?[0-9]+(\.[0-9]+)?",
)
56 | 
def parse_command(command_text):
	"""Return wise.parse_phrase's result for ``command_text``.

	The module-level ``examples`` and ``regexes`` are passed along unchanged.
	"""
	parsed = parse_phrase(command_text, examples, regexes)
	return parsed
59 | 


--------------------------------------------------------------------------------
/pybing/__init__.py:
--------------------------------------------------------------------------------
 1 | # This file is part of PyBing (http://pybing.googlecode.com).
 2 | # 
 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
 4 | # All rights reserved.
 5 | # 
 6 | # This software is licensed as described in the file COPYING.txt,
 7 | # which you should have received as part of this distribution.
 8 | 
 9 | from bing import Bing
10 | 


--------------------------------------------------------------------------------
/pybing/bing.py:
--------------------------------------------------------------------------------
 1 | # This file is part of PyBing (http://pybing.googlecode.com).
 2 | # 
 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
 4 | # All rights reserved.
 5 | # 
 6 | # This software is licensed as described in the file COPYING.txt,
 7 | # which you should have received as part of this distribution.
 8 | 
 9 | """
10 | This module holds the Bing class which is used to create and execute queries
11 | against Bing.
12 | """
13 | 
14 | import urllib
15 | import urllib2
16 | 
17 | # Issue #1 (http://code.google.com/p/pybing/issues/detail?id=1)
18 | # Python 2.6 has json built in, 2.5 needs simplejson
19 | try:
20 |     import json
21 | except ImportError:
22 |     import simplejson as json
23 | 
24 | from pybing import constants
25 | 
class Bing(object):
    """
    Small client that sends search requests to the Bing JSON endpoint.

    The application id given at construction time is sent as the ``AppId``
    parameter of every request.
    """
    def __init__(self, app_id):
        self.app_id = app_id

    def search(self, query, source_type=None, api_version=None, extra_params=None, **kwargs):
        """
        Run a search and return the decoded JSON response.

        query        -- sent as the ``Query`` parameter.
        source_type  -- sent as ``Sources``; defaults to
                        constants.DEFAULT_SOURCE_TYPE.
        api_version  -- sent as ``Version``; defaults to constants.API_VERSION.
        extra_params -- optional dict merged in last, so it can override any
                        other parameter.
        **kwargs     -- additional request parameters; the four standard
                        parameters above take precedence over them.
        """
        kwargs.update({
            'AppId':    self.app_id,
            'Version':  api_version or constants.API_VERSION,
            'Query':    query,
            'Sources':  source_type or constants.DEFAULT_SOURCE_TYPE,
        })

        if extra_params:
            kwargs.update(extra_params)

        query_string = urllib.urlencode(kwargs)
        response = urllib2.urlopen(constants.JSON_ENDPOINT + '?' + query_string)
        # The object returned by urllib2.urlopen is not a context manager on
        # Python 2, so release the connection explicitly.
        try:
            return json.loads(response.read())
        finally:
            response.close()

    def search_web(self, query):
        """Search with the Web source type."""
        return self.search(query, source_type=constants.WEB_SOURCE_TYPE)

    def search_image(self, query):
        """Search with the Image source type."""
        return self.search(query, source_type=constants.IMAGE_SOURCE_TYPE)

    def search_news(self, query):
        """Search with the News source type."""
        return self.search(query, source_type=constants.NEWS_SOURCE_TYPE)

    def search_spell(self, query):
        """Search with the Spell source type."""
        return self.search(query, source_type=constants.SPELL_SOURCE_TYPE)

    def search_related(self, query):
        """Search with the RelatedSearch source type."""
        return self.search(query, source_type=constants.RELATED_SOURCE_TYPE)

    def search_phonebook(self, query):
        """Search with the Phonebook source type."""
        return self.search(query, source_type=constants.PHONEBOOK_SOURCE_TYPE)

    def search_answers(self, query):
        """Search with the answers source type (constants.ANSWERS_SOURCE_TYPE)."""
        return self.search(query, source_type=constants.ANSWERS_SOURCE_TYPE)
65 | 


--------------------------------------------------------------------------------
/pybing/constants.py:
--------------------------------------------------------------------------------
 1 | # This file is part of PyBing (http://pybing.googlecode.com).
 2 | # 
 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
 4 | # All rights reserved.
 5 | # 
 6 | # This software is licensed as described in the file COPYING.txt,
 7 | # which you should have received as part of this distribution.
 8 | 
 9 | """
10 | This module holds the constants used when querying Bing.
11 | """
12 | 
# Default value for the ``Version`` request parameter.
API_VERSION = '2.0'
# URL that query strings are appended to.
JSON_ENDPOINT = 'http://api.search.live.net/json.aspx'
# Largest page requested at once when iterating over a result set.
MAX_PAGE_SIZE = 50
# Upper bound on the absolute index of a result that will be requested.
MAX_RESULTS = 1000

# Values for the ``Sources`` request parameter.
WEB_SOURCE_TYPE = 'Web'
IMAGE_SOURCE_TYPE = 'Image'
NEWS_SOURCE_TYPE = 'News'
SPELL_SOURCE_TYPE = 'Spell'
RELATED_SOURCE_TYPE = 'RelatedSearch'
PHONEBOOK_SOURCE_TYPE = 'Phonebook'
# The API names this source type "InstantAnswer"; the previous value
# 'InstanceAnswer' was a typo.
ANSWERS_SOURCE_TYPE = 'InstantAnswer'

SOURCE_TYPES = (
    WEB_SOURCE_TYPE,
    IMAGE_SOURCE_TYPE,
    NEWS_SOURCE_TYPE,
    SPELL_SOURCE_TYPE,
    RELATED_SOURCE_TYPE,
    PHONEBOOK_SOURCE_TYPE,
    ANSWERS_SOURCE_TYPE,
)

DEFAULT_SOURCE_TYPE = WEB_SOURCE_TYPE
37 | 


--------------------------------------------------------------------------------
/pybing/query/__init__.py:
--------------------------------------------------------------------------------
 1 | # This file is part of PyBing (http://pybing.googlecode.com).
 2 | # 
 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
 4 | # All rights reserved.
 5 | # 
 6 | # This software is licensed as described in the file COPYING.txt,
 7 | # which you should have received as part of this distribution.
 8 | 
 9 | # Mixins
10 | from mixin import QueryMixin
11 | from pagable import Pagable
12 | 
13 | # Base Query
14 | from query import BingQuery
15 | 
16 | # Concrete Queries
17 | from web import WebQuery
18 | 


--------------------------------------------------------------------------------
/pybing/query/mixin.py:
--------------------------------------------------------------------------------
 1 | # This file is part of PyBing (http://pybing.googlecode.com).
 2 | # 
 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
 4 | # All rights reserved.
 5 | # 
 6 | # This software is licensed as described in the file COPYING.txt,
 7 | # which you should have received as part of this distribution.
 8 | 
 9 | """
10 | This module holds the QueryMixin base class used for all queries.
11 | """
12 | 
class QueryMixin(object):
    """
    Common base class for anything that is mixed into a query class.
    """
    def get_request_parameters(self):
        """
        Return the parameters supplied by the next class in the MRO, or a
        new empty dict when no further class defines this method.
        """
        # As a mixin we cannot know whether anything after us in the MRO
        # implements get_request_parameters, so look it up defensively.
        try:
            next_in_chain = super(QueryMixin, self).get_request_parameters
        except AttributeError:
            return {}
        return next_in_chain()
27 | 


--------------------------------------------------------------------------------
/pybing/query/pagable.py:
--------------------------------------------------------------------------------
 1 | # This file is part of PyBing (http://pybing.googlecode.com).
 2 | # 
 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
 4 | # All rights reserved.
 5 | # 
 6 | # This software is licensed as described in the file COPYING.txt,
 7 | # which you should have received as part of this distribution.
 8 | 
 9 | """
10 | This module holds a mixin to specify a query class you can page through
11 | using the count and offset parameter.
12 | """
13 | 
14 | from mixin import QueryMixin
15 | 
class Pagable(QueryMixin):
    """
    This class is a mixin used with BingQuery classes to specify that
    queries can be paged through using the offset and count parameters.

    Some examples of Pagable requests are WebRequests and VideoRequests.
    Some non-Pagable requests are TranslationRequests and SearchRequests with
    the Spell source type.

    From the Bing API:
    - Count specifies the number of results to return per Request.
    - Offset specifies the offset requested, from zero, for the starting
      point of the result set to be returned for this Request.

    ``count`` and ``offset`` are read-only properties; set_count() and
    set_offset() leave the query untouched and return a modified clone made
    with self._clone(), which the class this is mixed into must provide.

    Note: This mixin currently supports only a single Source Type query.
    """
    def __init__(self, *args, **kwargs):
        self._count = None
        self._offset = 0
        super(Pagable, self).__init__(*args, **kwargs)

    def execute(self, *args, **kwargs):
        """
        Validate the paging window, then delegate to the next execute() in
        the MRO and return its result.

        Raises ValueError when count + offset exceeds 1000.
        """
        if self.count and self.offset and self.count + self.offset > 1000:
            raise ValueError("Count + Offset must be less than 1000")
        # The result has to be passed back, otherwise callers get None.
        return super(Pagable, self).execute(*args, **kwargs)

    def get_request_parameters(self):
        """
        Extend the inherited parameters with ``<SOURCE_TYPE>.Count`` and
        ``<SOURCE_TYPE>.Offset``; each is only added when its value is truthy.
        """
        params = super(Pagable, self).get_request_parameters()

        if self.count:
            params['%s.Count' % self.SOURCE_TYPE] = self.count

        if self.offset:
            params['%s.Offset' % self.SOURCE_TYPE] = self.offset

        return params

    @property
    def count(self):
        return self._count

    def set_count(self, value):
        """
        Return a clone of this query whose count is ``value``.

        None clears the count. Raises ValueError when value is below 1 or
        above 50.
        """
        if value is not None:
            if value < 1:
                raise ValueError('Count must be positive')

            elif value > 50:
                raise ValueError('Count must be less than 50')

        obj = self._clone()
        obj._count = value
        return obj

    @property
    def offset(self):
        return self._offset

    def set_offset(self, value):
        """
        Return a clone of this query whose offset is ``value``.

        Raises ValueError when value is negative or above 1000.
        """
        if value < 0:
            raise ValueError('Offset must be positive')

        elif value > 1000:
            raise ValueError('Offset must be less than 1000')

        obj = self._clone()
        obj._offset = value
        return obj
83 | 


--------------------------------------------------------------------------------
/pybing/query/query.py:
--------------------------------------------------------------------------------
 1 | # This file is part of PyBing (http://pybing.googlecode.com).
 2 | # 
 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
 4 | # All rights reserved.
 5 | # 
 6 | # This software is licensed as described in the file COPYING.txt,
 7 | # which you should have received as part of this distribution.
 8 | 
 9 | """
10 | This module holds the base Query class used by the various types of Bing queries.
11 | """
12 | 
13 | import copy, urllib, httplib2
14 | 
15 | # Issue #1 (http://code.google.com/p/pybing/issues/detail?id=1)
16 | # Python 2.6 has json built in, 2.5 needs simplejson
17 | try: import json
18 | except ImportError: import simplejson as json
19 | 
20 | from pybing import constants
21 | from pybing.query.mixin import QueryMixin
22 | 
class BingQuery(QueryMixin):
    """
    Base class for Bing queries.

    Subclasses set SOURCE_TYPE. A query is treated as immutable: set_query()
    returns a modified deep copy instead of changing the instance.
    """
    SOURCE_TYPE = None

    def __init__(self, app_id, query=None, version=None, *args, **kwargs):
        self.app_id = app_id
        self.version = version or constants.API_VERSION
        self._query = query

        # Needed for mixin's __init__'s to be called.
        super(BingQuery, self).__init__(*args, **kwargs)

    def set_query(self, query):
        """Return a clone with the given query text; ValueError if it is empty."""
        if not query:
            raise ValueError('Query cannot be empty or None')

        obj = self._clone()
        obj._query = query
        return obj

    @property
    def query(self):
        return self._query

    def execute(self):
        """
        Return a BingResultSet wrapping this query.

        Raises ValueError when the query text or SOURCE_TYPE is missing.
        """
        if not self.query:
            raise ValueError('Query cannot be empty or None')

        elif not self.SOURCE_TYPE:
            raise ValueError('Source Type cannot be empty or None')

        # Imported here because pybing.resultset itself imports this package.
        from pybing.resultset import BingResultSet
        return BingResultSet(self)

    def get_request_parameters(self):
        """Return the inherited parameters plus AppId, Version, Query and Sources."""
        params = super(BingQuery, self).get_request_parameters()
        params.update({
            'AppId':    self.app_id,
            'Version':  self.version,
            'Query':    self.query,
            'Sources':  self.SOURCE_TYPE,
        })
        return params

    def get_request_url(self):
        """Return the endpoint URL with the url-encoded request parameters."""
        query_string = urllib.urlencode(self.get_request_parameters())
        return constants.JSON_ENDPOINT + '?' + query_string

    def get_search_response(self):
        """Fetch the request URL and return the decoded section for SOURCE_TYPE."""
        contents = self._get_url_contents(self.get_request_url())
        return json.loads(contents)['SearchResponse'][self.SOURCE_TYPE]

    def get_search_results(self):
        """Return the response's results as a list of BingResult objects."""
        from pybing.result import BingResult
        response = self.get_search_response()
        # A response without a 'Results' entry is treated as "no results"
        # instead of failing with a KeyError.
        return [BingResult(result) for result in response.get('Results', [])]

    def _get_url_contents(self, url):
        response, contents = httplib2.Http().request(url)
        return contents

    def _clone(self):
        """
        Do a deep copy of this object returning a clone that can be
        modified without affecting the old copy.
        """
        return copy.deepcopy(self)

    def __unicode__(self):
        return 'BingQuery: %s' % self.get_request_url()

    __str__ = __unicode__

    def __repr__(self):
        # Call __unicode__ directly: the unicode() builtin only exists on
        # Python 2.
        return '<%s>' % self.__unicode__()
97 | 


--------------------------------------------------------------------------------
/pybing/query/web.py:
--------------------------------------------------------------------------------
 1 | # This file is part of PyBing (http://pybing.googlecode.com).
 2 | # 
 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
 4 | # All rights reserved.
 5 | # 
 6 | # This software is licensed as described in the file COPYING.txt,
 7 | # which you should have received as part of this distribution.
 8 | 
 9 | """
10 | This module holds the Bing WebQuery class used to do web searches against Bing.
11 | """
12 | 
13 | from pybing import constants
14 | from pybing.query import BingQuery, Pagable
15 | 
class WebQuery(BingQuery, Pagable):
    """
    Pagable query bound to the Web source type.
    """
    SOURCE_TYPE = constants.WEB_SOURCE_TYPE
18 | 


--------------------------------------------------------------------------------
/pybing/result.py:
--------------------------------------------------------------------------------
 1 | # This file is part of PyBing (http://pybing.googlecode.com).
 2 | # 
 3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
 4 | # All rights reserved.
 5 | # 
 6 | # This software is licensed as described in the file COPYING.txt,
 7 | # which you should have received as part of this distribution.
 8 | 
 9 | """
10 | This module holds the base BingResult class.
11 | """
12 | 
class BingResult(object):
    """
    The base BingResult class corresponds to a single result from a Bing
    Query response.

    Every key of the result dict becomes an attribute named after the
    lower-cased key.
    """
    def __init__(self, result):
        """Load the attributes from ``result``; TypeError if it is not a dict."""
        if not isinstance(result, dict):
            raise TypeError('Invalid result type')

        self.load_from_dict(result)

    def load_from_dict(self, data):
        """Set one attribute per item of ``data``, lower-casing the key."""
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        for key, value in data.items():
            setattr(self, key.lower(), value)

    def __repr__(self):
        return '<BingResult>'
31 | 


--------------------------------------------------------------------------------
/pybing/resultset.py:
--------------------------------------------------------------------------------
  1 | # This file is part of PyBing (http://pybing.googlecode.com).
  2 | # 
  3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
  4 | # All rights reserved.
  5 | # 
  6 | # This software is licensed as described in the file COPYING.txt,
  7 | # which you should have received as part of this distribution.
  8 | 
  9 | """
 10 | This module holds the logic for dealing with a set of results from a query.
 11 | """
 12 | 
 13 | from pybing import constants
 14 | from pybing.query import BingQuery, Pagable
 15 | 
 16 | class BingResultSet(object):
 17 |     """
 18 |     This class corresponds to a set of results from a BingQuery.
 19 |     """
 20 |     def __init__(self, query, offset=0, count=None):
 21 |         if not isinstance(query, BingQuery):
 22 |             raise TypeError, 'query must be a BingQuery instance'
 23 |         
 24 |         self.query = query
 25 |         self.results = {}
 26 |         
 27 |         # These offset + count are used internally to signify whether or
 28 |         # not the query should be cut down (whether they've been sliced).
 29 |         self.offset, self.count = offset, count
 30 |     
 31 |     def get_offset(self, index=0):
 32 |         return self.query.offset + self.offset + index
 33 |     
 34 |     def __getitem__(self, key):
 35 |         """
 36 |         Allows you to grab an index or slice a query with array notation like
 37 |         resultset[4] or resultset[0:4]
 38 |         """
 39 |         if not isinstance(self.query, Pagable):
 40 |             raise TypeError, 'Array access only supported on Pagable Queries'
 41 |         
 42 |         if isinstance(key, int):
 43 |             absolute_index = self.get_offset()
 44 |             if absolute_index < 0 or absolute_index >= constants.MAX_RESULTS:
 45 |                 raise IndexError
 46 |             
 47 |             if absolute_index not in self.results:
 48 |                 # Make a copy of the query for only this one result:
 49 |                 query = self.query.set_offset(absolute_index).set_count(1)
 50 |                 results = query.get_search_results()
 51 |                 if results:
 52 |                     self.results[absolute_index] = results[0]
 53 |             
 54 |             return self.results.get(absolute_index)
 55 |         
 56 |         elif isinstance(key, slice):
 57 |             # Return a new result set that is sliced internally (not the query)
 58 |             offset = key.start or 0
 59 |             if key.stop: count = key.stop - offset
 60 |             else: count = None
 61 |             return BingResultSet(self.query, self.offset + offset, count)
 62 |         
 63 |         else: 
 64 |             raise TypeError
 65 |     
 66 |     def __len__(self):
 67 |         """
 68 |         Returns the number of results if you were to iterate over this result set.
 69 |         This is at least 0 and at most 1000.
 70 |         """
 71 |         count = constants.MAX_RESULTS
 72 |         
 73 |         if self.count:
 74 |             count = self.count
 75 |         
 76 |         elif self.query.count:
 77 |             count = self.query.count
 78 |         
 79 |         if count > constants.MAX_RESULTS:
 80 |             count = constants.MAX_RESULTS
 81 |         
 82 |         if count == constants.MAX_RESULTS:
 83 |             count = count - self.get_offset()
 84 |         
 85 |         return count
 86 |     
 87 |     def __iter__(self):
 88 |         """
 89 |         Allows you to iterate over the search results in the standard Python
 90 |         format such as
 91 |         for result in my_query.execute():
 92 |             print result.title, result.url
 93 |         """
 94 |         query = self.query.set_offset(self.get_offset())
 95 |         end_index = constants.MAX_RESULTS
 96 |         
 97 |         # If we've internally sliced out items
 98 |         if self.count:
 99 |             query = query.set_count(self.count if self.count < constants.MAX_PAGE_SIZE else constants.MAX_PAGE_SIZE)
100 |             end_index = self.get_offset() + self.count
101 |             
102 |             if end_index > constants.MAX_RESULTS:
103 |                 end_index = constants.MAX_RESULTS
104 |         
105 |         # If we want to just go until the end, grab them the most per page
106 |         if not query.count:
107 |             query = query.set_count(constants.MAX_PAGE_SIZE)
108 |         
109 |         while query.offset < end_index:
110 |             # If we don't have a full page left, only grab up to the end
111 |             count = end_index - query.offset
112 |             if count and count < constants.MAX_PAGE_SIZE:
113 |                 query = query.set_count(count)
114 |             
115 |             # Yield back each result
116 |             for result in query.get_search_results():
117 |                 yield result
118 |             
119 |             # Update the offset to move onto the next page
120 |             query = query.set_offset(query.offset + query.count)
121 |     
122 |     def __repr__(self):
123 |         return '<BingResultSet (%s)>' % self.query
124 | 


--------------------------------------------------------------------------------
/requests/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | #   __
 4 | #  /__)  _  _     _   _ _/   _
 5 | # / (   (- (/ (/ (- _)  /  _)
 6 | #          /
 7 | 
 8 | """
 9 | requests HTTP library
10 | ~~~~~~~~~~~~~~~~~~~~~
11 | 
12 | Requests is an HTTP library, written in Python, for human beings. Basic GET
13 | usage:
14 | 
15 |    >>> import requests
16 |    >>> r = requests.get('http://python.org')
17 |    >>> r.status_code
18 |    200
19 |    >>> 'Python is a programming language' in r.content
20 |    True
21 | 
22 | ... or POST:
23 | 
24 |    >>> payload = dict(key1='value1', key2='value2')
25 |    >>> r = requests.post("http://httpbin.org/post", data=payload)
26 |    >>> print(r.text)
27 |    {
28 |      ...
29 |      "form": {
30 |        "key2": "value2",
31 |        "key1": "value1"
32 |      },
33 |      ...
34 |    }
35 | 
36 | The other HTTP methods are supported - see `requests.api`. Full documentation
37 | is at <http://python-requests.org>.
38 | 
39 | :copyright: (c) 2014 by Kenneth Reitz.
40 | :license: Apache 2.0, see LICENSE for more details.
41 | 
42 | """
43 | 
44 | __title__ = 'requests'
45 | __version__ = '2.4.1'
46 | __build__ = 0x020401
47 | __author__ = 'Kenneth Reitz'
48 | __license__ = 'Apache 2.0'
49 | __copyright__ = 'Copyright 2014 Kenneth Reitz'
50 | 
51 | # Attempt to enable urllib3's SNI support, if possible
52 | try:
53 |     from .packages.urllib3.contrib import pyopenssl
54 |     pyopenssl.inject_into_urllib3()
55 | except ImportError:
56 |     pass
57 | 
58 | from . import utils
59 | from .models import Request, Response, PreparedRequest
60 | from .api import request, get, head, post, patch, put, delete, options
61 | from .sessions import session, Session
62 | from .status_codes import codes
63 | from .exceptions import (
64 |     RequestException, Timeout, URLRequired,
65 |     TooManyRedirects, HTTPError, ConnectionError
66 | )
67 | 
# Give this package's logger a do-nothing handler so that applications which
# have not configured logging do not see "No handler found" warnings.
import logging

try:
    # logging.NullHandler exists from Python 2.7 on.
    from logging import NullHandler
except ImportError:
    class NullHandler(logging.Handler):
        """Fallback handler that silently discards every record."""

        def emit(self, record):
            pass

logging.getLogger(__name__).addHandler(NullHandler())
78 | 


--------------------------------------------------------------------------------
/requests/certs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | certs.py
 6 | ~~~~~~~~
 7 | 
 8 | This module returns the preferred default CA certificate bundle.
 9 | 
10 | If you are packaging Requests, e.g., for a Linux distribution or a managed
11 | environment, you can change the definition of where() to return a separately
12 | packaged CA bundle.
13 | """
14 | import os.path
15 | 
try:
    from certifi import where
except ImportError:
    # certifi is not installed: use the bundle vendored inside Requests.
    def where():
        """Return the path of the ``cacert.pem`` located next to this module."""
        package_dir = os.path.dirname(__file__)
        return os.path.join(package_dir, 'cacert.pem')

if __name__ == '__main__':
    bundle_path = where()
    print(bundle_path)
26 | 


--------------------------------------------------------------------------------
/requests/compat.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 | pythoncompat
  5 | """
  6 | 
  7 | from .packages import chardet
  8 | 
  9 | import sys
 10 | 
 11 | # -------
 12 | # Pythons
 13 | # -------
 14 | 
 15 | # Syntax sugar.
 16 | _ver = sys.version_info
 17 | 
 18 | #: Python 2.x?
 19 | is_py2 = (_ver[0] == 2)
 20 | 
 21 | #: Python 3.x?
 22 | is_py3 = (_ver[0] == 3)
 23 | 
 24 | #: Python 3.0.x
 25 | is_py30 = (is_py3 and _ver[1] == 0)
 26 | 
 27 | #: Python 3.1.x
 28 | is_py31 = (is_py3 and _ver[1] == 1)
 29 | 
 30 | #: Python 3.2.x
 31 | is_py32 = (is_py3 and _ver[1] == 2)
 32 | 
 33 | #: Python 3.3.x
 34 | is_py33 = (is_py3 and _ver[1] == 3)
 35 | 
 36 | #: Python 3.4.x
 37 | is_py34 = (is_py3 and _ver[1] == 4)
 38 | 
 39 | #: Python 2.7.x
 40 | is_py27 = (is_py2 and _ver[1] == 7)
 41 | 
 42 | #: Python 2.6.x
 43 | is_py26 = (is_py2 and _ver[1] == 6)
 44 | 
 45 | #: Python 2.5.x
 46 | is_py25 = (is_py2 and _ver[1] == 5)
 47 | 
 48 | #: Python 2.4.x
 49 | is_py24 = (is_py2 and _ver[1] == 4)   # I'm assuming this is not by choice.
 50 | 
 51 | 
 52 | # ---------
 53 | # Platforms
 54 | # ---------
 55 | 
 56 | 
 57 | # Syntax sugar.
 58 | _ver = sys.version.lower()
 59 | 
 60 | is_pypy = ('pypy' in _ver)
 61 | is_jython = ('jython' in _ver)
 62 | is_ironpython = ('iron' in _ver)
 63 | 
 64 | # Assume CPython, if nothing else.
 65 | is_cpython = not any((is_pypy, is_jython, is_ironpython))
 66 | 
 67 | # Windows-based system.
 68 | is_windows = 'win32' in str(sys.platform).lower()
 69 | 
 70 | # Standard Linux 2+ system.
 71 | is_linux = ('linux' in str(sys.platform).lower())
 72 | is_osx = ('darwin' in str(sys.platform).lower())
 73 | is_hpux = ('hpux' in str(sys.platform).lower())   # Complete guess.
 74 | is_solaris = ('solar==' in str(sys.platform).lower())   # Complete guess.
 75 | 
 76 | try:
 77 |     import simplejson as json
 78 | except (ImportError, SyntaxError):
  79 |     # simplejson does not support Python 3.2, it throws a SyntaxError
 80 |     # because of u'...' Unicode literals.
 81 |     import json
 82 | 
 83 | # ---------
 84 | # Specifics
 85 | # ---------
 86 | 
# Expose one set of names (URL helpers, cookie classes, StringIO, OrderedDict
# and the text/bytes type aliases) regardless of the interpreter's major
# version.
if is_py2:
    from urllib import quote, unquote, quote_plus, unquote_plus, urlencode, getproxies, proxy_bypass
    from urlparse import urlparse, urlunparse, urljoin, urlsplit, urldefrag
    from urllib2 import parse_http_list
    import cookielib
    from Cookie import Morsel
    from StringIO import StringIO
    from .packages.urllib3.packages.ordered_dict import OrderedDict

    # Order matters: `builtin_str` and `bytes` must capture the native `str`
    # before `str` itself is rebound to `unicode`.
    builtin_str = str
    bytes = str
    str = unicode
    basestring = basestring
    numeric_types = (int, long, float)


elif is_py3:
    from urllib.parse import urlparse, urlunparse, urljoin, urlsplit, urlencode, quote, unquote, quote_plus, unquote_plus, urldefrag
    from urllib.request import parse_http_list, getproxies, proxy_bypass
    from http import cookiejar as cookielib
    from http.cookies import Morsel
    from io import StringIO
    from collections import OrderedDict

    # The native types already have the desired meaning; `basestring` does
    # not exist on Python 3 and is emulated with a tuple usable in isinstance().
    builtin_str = str
    str = str
    bytes = bytes
    basestring = (str, bytes)
    numeric_types = (int, float)
116 | 


--------------------------------------------------------------------------------
/requests/exceptions.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """
 4 | requests.exceptions
 5 | ~~~~~~~~~~~~~~~~~~~
 6 | 
 7 | This module contains the set of Requests' exceptions.
 8 | 
 9 | """
10 | from .packages.urllib3.exceptions import HTTPError as BaseHTTPError
11 | 
12 | 
class RequestException(IOError):
    """There was an ambiguous exception that occurred while handling your
    request."""

    def __init__(self, *args, **kwargs):
        """
        Initialize RequestException with `request` and `response` objects.

        Both are optional keyword arguments and are removed from ``kwargs``
        before the remaining arguments are passed on to IOError. When only a
        response is given, its ``request`` attribute (if any) is used as the
        request.
        """
        self.response = kwargs.pop('response', None)
        self.request = kwargs.pop('request', None)

        # Fall back to the request attached to the response.
        lacks_request = self.response is not None and not self.request
        if lacks_request and hasattr(self.response, 'request'):
            self.request = self.response.request

        super(RequestException, self).__init__(*args, **kwargs)
28 | 
29 | 
class HTTPError(RequestException):
    """An HTTP error occurred."""


class ConnectionError(RequestException):
    """A Connection error occurred."""


class ProxyError(ConnectionError):
    """A proxy error occurred.

    Subclass of :exc:`ConnectionError`.
    """


class SSLError(ConnectionError):
    """An SSL error occurred.

    Subclass of :exc:`ConnectionError`.
    """


class Timeout(RequestException):
    """The request timed out.

    Catching this error will catch both
    :exc:`~requests.exceptions.ConnectTimeout` and
    :exc:`~requests.exceptions.ReadTimeout` errors.
    """


class ConnectTimeout(ConnectionError, Timeout):
    """The request timed out while trying to connect to the remote server.

    Requests that produced this error are safe to retry.

    Derives from both :exc:`ConnectionError` and :exc:`Timeout`, so a handler
    for either of them catches it.
    """


class ReadTimeout(Timeout):
    """The server did not send any data in the allotted amount of time."""


class URLRequired(RequestException):
    """A valid URL is required to make a request."""


class TooManyRedirects(RequestException):
    """Too many redirects."""


class MissingSchema(RequestException, ValueError):
    """The URL schema (e.g. http or https) is missing.

    Also a :exc:`ValueError`.
    """


class InvalidSchema(RequestException, ValueError):
    """See defaults.py for valid schemas.

    Also a :exc:`ValueError`.
    """


class InvalidURL(RequestException, ValueError):
    """The URL provided was somehow invalid.

    Also a :exc:`ValueError`.
    """


class ChunkedEncodingError(RequestException):
    """The server declared chunked encoding but sent an invalid chunk."""


class ContentDecodingError(RequestException, BaseHTTPError):
    """Failed to decode response content.

    Also derives from urllib3's ``HTTPError`` (imported as BaseHTTPError).
    """
92 | 


--------------------------------------------------------------------------------
/requests/hooks.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """
 4 | requests.hooks
 5 | ~~~~~~~~~~~~~~
 6 | 
 7 | This module provides the capabilities for the Requests hooks system.
 8 | 
 9 | Available hooks:
10 | 
11 | ``response``:
12 |     The response generated from a Request.
13 | 
14 | """
15 | 
16 | 
# Names of the events a hook can be registered for.
HOOKS = ['response']


def default_hooks():
    """Return a new dict mapping every event in HOOKS to its own empty list."""
    return dict((event, []) for event in HOOKS)
25 | 
26 | # TODO: response is the only one
27 | 
28 | 
def dispatch_hook(key, hooks, hook_data, **kwargs):
    """Dispatches a hook dictionary on a given piece of data.

    Runs the hook(s) registered under ``key`` in ``hooks`` one after another
    on ``hook_data``, passing ``kwargs`` through, and returns the resulting
    data. ``hook_data`` is returned as is when ``hooks`` is empty or has no
    entry for ``key``.
    """
    registry = hooks or dict()
    if key not in registry:
        return hook_data

    callbacks = registry.get(key)

    # A single callable is accepted in place of a list of callables.
    if hasattr(callbacks, '__call__'):
        callbacks = [callbacks]

    for callback in callbacks:
        replacement = callback(hook_data, **kwargs)
        # A hook that returns None leaves the data as it was.
        if replacement is not None:
            hook_data = replacement

    return hook_data
46 | 


--------------------------------------------------------------------------------
/requests/packages/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | from . import urllib3
4 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/__init__.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # This library is free software; you can redistribute it and/or
 3 | # modify it under the terms of the GNU Lesser General Public
 4 | # License as published by the Free Software Foundation; either
 5 | # version 2.1 of the License, or (at your option) any later version.
 6 | #
 7 | # This library is distributed in the hope that it will be useful,
 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
10 | # Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public
13 | # License along with this library; if not, write to the Free Software
14 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
15 | # 02110-1301  USA
16 | ######################### END LICENSE BLOCK #########################
17 | 
__version__ = "2.2.1"
from sys import version_info


def detect(aBuf):
    """Guess the character encoding of a byte string.

    ``aBuf`` must be a byte string: on Python 2 a ``unicode`` object is
    rejected, on Python 3 anything that is not ``bytes`` is rejected, in
    both cases with ``ValueError``.

    Returns the ``result`` attribute of a ``UniversalDetector`` that has
    been reset, fed the whole buffer and closed.
    """
    if version_info < (3, 0):
        is_text = isinstance(aBuf, unicode)
    else:
        is_text = not isinstance(aBuf, bytes)
    if is_text:
        raise ValueError('Expected a bytes object, not a unicode object')

    # Imported lazily, only once the argument has been validated.
    from . import universaldetector
    detector = universaldetector.UniversalDetector()
    detector.reset()
    detector.feed(aBuf)
    detector.close()
    return detector.result
33 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/big5prober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Communicator client code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from .mbcharsetprober import MultiByteCharSetProber
29 | from .codingstatemachine import CodingStateMachine
30 | from .chardistribution import Big5DistributionAnalysis
31 | from .mbcssm import Big5SMModel
32 | 
33 | 
class Big5Prober(MultiByteCharSetProber):
    """Multi-byte prober configured with the Big5 state machine model and
    the Big5 character distribution analyser."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mDistributionAnalyzer = Big5DistributionAnalysis()
        self._mCodingSM = CodingStateMachine(Big5SMModel)
        # reset() runs last, once both collaborators have been assigned.
        self.reset()

    def get_charset_name(self):
        """Return the charset name this prober reports."""
        return "Big5"
43 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/chardetect.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | Script which takes one or more file paths and reports on their detected
 4 | encodings
 5 | 
 6 | Example::
 7 | 
 8 |     % chardetect somefile someotherfile
 9 |     somefile: windows-1252 with confidence 0.5
10 |     someotherfile: ascii with confidence 1.0
11 | 
12 | If no paths are provided, it takes its input from stdin.
13 | 
14 | """
15 | from io import open
16 | from sys import argv, stdin
17 | 
18 | from chardet.universaldetector import UniversalDetector
19 | 
20 | 
def description_of(file, name='stdin'):
    """Return a string describing the probable encoding of a file.

    ``file`` is iterated and every item it yields is fed to a fresh
    ``UniversalDetector``. ``name`` is the label that prefixes the
    returned description.
    """
    detector = UniversalDetector()
    for chunk in file:
        detector.feed(chunk)
    detector.close()

    outcome = detector.result
    if not outcome['encoding']:
        return '%s: no result' % name
    return '%s: %s with confidence %s' % (
        name, outcome['encoding'], outcome['confidence'])
34 | 
35 | 
def main():
    """Print the detected encoding of every path given on the command line.

    When no path is given, standard input is analysed instead.
    """
    if len(argv) <= 1:
        # Files below are read in binary mode, so stdin must be read as
        # bytes too. On Python 3 ``stdin`` is a text stream whose raw bytes
        # live on ``stdin.buffer``; Python 2's stdin has no such attribute
        # and already yields byte strings.
        print(description_of(getattr(stdin, 'buffer', stdin)))
    else:
        for path in argv[1:]:
            with open(path, 'rb') as f:
                print(description_of(f, path))


if __name__ == '__main__':
    main()
47 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/charsetgroupprober.py:
--------------------------------------------------------------------------------
  1 | ######################## BEGIN LICENSE BLOCK ########################
  2 | # The Original Code is Mozilla Communicator client code.
  3 | # 
  4 | # The Initial Developer of the Original Code is
  5 | # Netscape Communications Corporation.
  6 | # Portions created by the Initial Developer are Copyright (C) 1998
  7 | # the Initial Developer. All Rights Reserved.
  8 | # 
  9 | # Contributor(s):
 10 | #   Mark Pilgrim - port to Python
 11 | #
 12 | # This library is free software; you can redistribute it and/or
 13 | # modify it under the terms of the GNU Lesser General Public
 14 | # License as published by the Free Software Foundation; either
 15 | # version 2.1 of the License, or (at your option) any later version.
 16 | # 
 17 | # This library is distributed in the hope that it will be useful,
 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 20 | # Lesser General Public License for more details.
 21 | # 
 22 | # You should have received a copy of the GNU Lesser General Public
 23 | # License along with this library; if not, write to the Free Software
 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 25 | # 02110-1301  USA
 26 | ######################### END LICENSE BLOCK #########################
 27 | 
 28 | from . import constants
 29 | import sys
 30 | from .charsetprober import CharSetProber
 31 | 
 32 | 
 33 | class CharSetGroupProber(CharSetProber):
 34 |     def __init__(self):
 35 |         CharSetProber.__init__(self)
 36 |         self._mActiveNum = 0
 37 |         self._mProbers = []
 38 |         self._mBestGuessProber = None
 39 | 
 40 |     def reset(self):
 41 |         CharSetProber.reset(self)
 42 |         self._mActiveNum = 0
 43 |         for prober in self._mProbers:
 44 |             if prober:
 45 |                 prober.reset()
 46 |                 prober.active = True
 47 |                 self._mActiveNum += 1
 48 |         self._mBestGuessProber = None
 49 | 
 50 |     def get_charset_name(self):
 51 |         if not self._mBestGuessProber:
 52 |             self.get_confidence()
 53 |             if not self._mBestGuessProber:
 54 |                 return None
 55 | #                self._mBestGuessProber = self._mProbers[0]
 56 |         return self._mBestGuessProber.get_charset_name()
 57 | 
 58 |     def feed(self, aBuf):
 59 |         for prober in self._mProbers:
 60 |             if not prober:
 61 |                 continue
 62 |             if not prober.active:
 63 |                 continue
 64 |             st = prober.feed(aBuf)
 65 |             if not st:
 66 |                 continue
 67 |             if st == constants.eFoundIt:
 68 |                 self._mBestGuessProber = prober
 69 |                 return self.get_state()
 70 |             elif st == constants.eNotMe:
 71 |                 prober.active = False
 72 |                 self._mActiveNum -= 1
 73 |                 if self._mActiveNum <= 0:
 74 |                     self._mState = constants.eNotMe
 75 |                     return self.get_state()
 76 |         return self.get_state()
 77 | 
 78 |     def get_confidence(self):
 79 |         st = self.get_state()
 80 |         if st == constants.eFoundIt:
 81 |             return 0.99
 82 |         elif st == constants.eNotMe:
 83 |             return 0.01
 84 |         bestConf = 0.0
 85 |         self._mBestGuessProber = None
 86 |         for prober in self._mProbers:
 87 |             if not prober:
 88 |                 continue
 89 |             if not prober.active:
 90 |                 if constants._debug:
 91 |                     sys.stderr.write(prober.get_charset_name()
 92 |                                      + ' not active\n')
 93 |                 continue
 94 |             cf = prober.get_confidence()
 95 |             if constants._debug:
 96 |                 sys.stderr.write('%s confidence = %s\n' %
 97 |                                  (prober.get_charset_name(), cf))
 98 |             if bestConf < cf:
 99 |                 bestConf = cf
100 |                 self._mBestGuessProber = prober
101 |         if not self._mBestGuessProber:
102 |             return 0.0
103 |         return bestConf
104 | #        else:
105 | #            self._mBestGuessProber = self._mProbers[0]
106 | #            return self._mBestGuessProber.get_confidence()
107 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/charsetprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Universal charset detector code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 2001
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #   Shy Shalom - original C code
12 | #
13 | # This library is free software; you can redistribute it and/or
14 | # modify it under the terms of the GNU Lesser General Public
15 | # License as published by the Free Software Foundation; either
16 | # version 2.1 of the License, or (at your option) any later version.
17 | #
18 | # This library is distributed in the hope that it will be useful,
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21 | # Lesser General Public License for more details.
22 | #
23 | # You should have received a copy of the GNU Lesser General Public
24 | # License along with this library; if not, write to the Free Software
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 | # 02110-1301  USA
27 | ######################### END LICENSE BLOCK #########################
28 | 
29 | from . import constants
30 | import re
31 | 
32 | 
class CharSetProber:
    """Base class of the charset probers.

    Every method here is a neutral default: no charset name, no confidence,
    and ``feed`` ignores its input.
    """

    def __init__(self):
        pass

    def reset(self):
        """Put the prober back into the ``eDetecting`` state."""
        self._mState = constants.eDetecting

    def get_state(self):
        """Return the state stored by the last ``reset`` or state change."""
        return self._mState

    def get_charset_name(self):
        """Return the detected charset name; the base class has none."""
        return None

    def get_confidence(self):
        """Return the confidence of the guess; always 0.0 in the base."""
        return 0.0

    def feed(self, aBuf):
        """Consume a chunk of input; the base class ignores it."""
        pass

    def filter_high_bit_only(self, aBuf):
        """Collapse every run of bytes in 0x00-0x7F into one space."""
        return re.sub(b'([\x00-\x7F])+', b' ', aBuf)

    def filter_without_english_letters(self, aBuf):
        """Collapse every run of ASCII letters into one space."""
        return re.sub(b'([A-Za-z])+', b' ', aBuf)

    def filter_with_english_letters(self, aBuf):
        """Return ``aBuf`` unchanged (filtering is not implemented)."""
        # TODO
        return aBuf
63 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/codingstatemachine.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from .constants import eStart
29 | from .compat import wrap_ord
30 | 
31 | 
class CodingStateMachine:
    """Table-driven state machine that consumes input one byte at a time.

    The model passed to the constructor is a mapping that provides
    ``classTable``, ``charLenTable``, ``classFactor``, ``stateTable`` and
    ``name``.
    """

    def __init__(self, sm):
        self._mModel = sm
        self._mCurrentBytePos = 0
        self._mCurrentCharLen = 0
        self.reset()

    def reset(self):
        """Return the machine to the ``eStart`` state."""
        self._mCurrentState = eStart

    def next_state(self, c):
        """Advance the machine by one byte and return the new state."""
        model = self._mModel
        # Every byte is first mapped to its class.
        # PY3K: the input is a byte stream, so c is an int, not a byte.
        byte_class = model['classTable'][wrap_ord(c)]
        if self._mCurrentState == eStart:
            # First byte of a character: its class also gives the length.
            self._mCurrentBytePos = 0
            self._mCurrentCharLen = model['charLenTable'][byte_class]
        # The state table is indexed by (state * classFactor + class).
        row_offset = self._mCurrentState * model['classFactor']
        self._mCurrentState = model['stateTable'][row_offset + byte_class]
        self._mCurrentBytePos += 1
        return self._mCurrentState

    def get_current_charlen(self):
        """Return the character length recorded at the last start state."""
        return self._mCurrentCharLen

    def get_coding_state_machine(self):
        """Return the ``name`` entry of the model."""
        return self._mModel['name']
62 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/compat.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # Contributor(s):
 3 | #   Ian Cordasco - port to Python
 4 | #
 5 | # This library is free software; you can redistribute it and/or
 6 | # modify it under the terms of the GNU Lesser General Public
 7 | # License as published by the Free Software Foundation; either
 8 | # version 2.1 of the License, or (at your option) any later version.
 9 | #
10 | # This library is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public
16 | # License along with this library; if not, write to the Free Software
17 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
18 | # 02110-1301  USA
19 | ######################### END LICENSE BLOCK #########################
20 | 
21 | import sys
22 | 
23 | 
# Types treated as string-like on the running interpreter. The conditional
# expression keeps ``unicode`` from being evaluated on Python 3.
base_str = (str, unicode) if sys.version_info < (3, 0) else (bytes, str)


def wrap_ord(a):
    """Return ``ord(a)`` for string input on Python 2.

    On Python 3, or when ``a`` is not one of the ``base_str`` types, ``a``
    is returned unchanged.
    """
    if sys.version_info >= (3, 0) or not isinstance(a, base_str):
        return a
    return ord(a)
35 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/constants.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Universal charset detector code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 2001
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #   Shy Shalom - original C code
12 | #
13 | # This library is free software; you can redistribute it and/or
14 | # modify it under the terms of the GNU Lesser General Public
15 | # License as published by the Free Software Foundation; either
16 | # version 2.1 of the License, or (at your option) any later version.
17 | # 
18 | # This library is distributed in the hope that it will be useful,
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21 | # Lesser General Public License for more details.
22 | # 
23 | # You should have received a copy of the GNU Lesser General Public
24 | # License along with this library; if not, write to the Free Software
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 | # 02110-1301  USA
27 | ######################### END LICENSE BLOCK #########################
28 | 
# Non-zero makes the probers write diagnostics to stderr.
_debug = 0

# States reported by the probers.
eDetecting, eFoundIt, eNotMe = 0, 1, 2

# States of the coding state machines.
eStart, eError, eItsMe = 0, 1, 2

# Confidence a prober has to exceed before it may declare a match early.
SHORTCUT_THRESHOLD = 0.95
40 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/cp949prober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from .mbcharsetprober import MultiByteCharSetProber
29 | from .codingstatemachine import CodingStateMachine
30 | from .chardistribution import EUCKRDistributionAnalysis
31 | from .mbcssm import CP949SMModel
32 | 
33 | 
class CP949Prober(MultiByteCharSetProber):
    """Multi-byte prober configured with the CP949 state machine model."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        # NOTE: CP949 is a superset of EUC-KR, so the character distribution
        #       should not differ; the EUC-KR analyser is reused.
        self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
        self._mCodingSM = CodingStateMachine(CP949SMModel)
        # reset() runs last, once both collaborators have been assigned.
        self.reset()

    def get_charset_name(self):
        """Return the charset name this prober reports."""
        return "CP949"
45 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/escprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from . import constants
29 | from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
30 |                     ISO2022KRSMModel)
31 | from .charsetprober import CharSetProber
32 | from .codingstatemachine import CodingStateMachine
33 | from .compat import wrap_ord
34 | 
35 | 
class EscCharSetProber(CharSetProber):
    """Prober that runs the HZ and ISO-2022-CN/JP/KR coding state machines
    side by side over the input."""

    def __init__(self):
        CharSetProber.__init__(self)
        models = (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
                  ISO2022KRSMModel)
        self._mCodingSM = [CodingStateMachine(model) for model in models]
        self.reset()

    def reset(self):
        """Reactivate and reset every state machine and forget any
        previously detected charset."""
        CharSetProber.reset(self)
        for machine in self._mCodingSM:
            if machine:
                machine.active = True
                machine.reset()
        self._mActiveSM = len(self._mCodingSM)
        self._mDetectedCharset = None

    def get_charset_name(self):
        """Return the detected charset name, or ``None`` if none yet."""
        return self._mDetectedCharset

    def get_confidence(self):
        """Return 0.99 once a charset has been detected, else 0.00."""
        return 0.99 if self._mDetectedCharset else 0.00

    def feed(self, aBuf):
        """Run every active state machine over ``aBuf`` and return the
        prober state.

        A machine that reports ``eError`` is deactivated; when none is left
        the prober becomes ``eNotMe``.  The first machine that reports
        ``eItsMe`` decides the charset and the prober becomes ``eFoundIt``.
        """
        for c in aBuf:
            # PY3K: aBuf is a byte array, so c is an int, not a byte
            byte = wrap_ord(c)
            for machine in self._mCodingSM:
                if not machine or not machine.active:
                    continue
                outcome = machine.next_state(byte)
                if outcome == constants.eError:
                    machine.active = False
                    self._mActiveSM -= 1
                    if self._mActiveSM <= 0:
                        self._mState = constants.eNotMe
                        return self.get_state()
                elif outcome == constants.eItsMe:
                    self._mState = constants.eFoundIt
                    self._mDetectedCharset = machine.get_coding_state_machine()  # nopep8
                    return self.get_state()

        return self.get_state()
87 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/eucjpprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | import sys
29 | from . import constants
30 | from .mbcharsetprober import MultiByteCharSetProber
31 | from .codingstatemachine import CodingStateMachine
32 | from .chardistribution import EUCJPDistributionAnalysis
33 | from .jpcntx import EUCJPContextAnalysis
34 | from .mbcssm import EUCJPSMModel
35 | 
36 | 
class EUCJPProber(MultiByteCharSetProber):
    """Prober for EUC-JP.

    Combines the EUC-JP coding state machine with a character distribution
    analyser and a context analyser; the reported confidence is the larger
    of the two analysers' confidences.
    """

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(EUCJPSMModel)
        self._mDistributionAnalyzer = EUCJPDistributionAnalysis()
        self._mContextAnalyzer = EUCJPContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the base prober state and the context analyser."""
        MultiByteCharSetProber.reset(self)
        self._mContextAnalyzer.reset()

    def get_charset_name(self):
        """Return the charset name this prober reports."""
        return "EUC-JP"

    def feed(self, aBuf):
        """Consume ``aBuf`` and return the prober state.

        Every byte drives the coding state machine.  An ``eError`` state
        makes the prober ``eNotMe``, ``eItsMe`` makes it ``eFoundIt``, and
        each time the machine returns to ``eStart`` the byte pair that
        ends there is handed to both analysers.
        """
        aLen = len(aBuf)
        if not aLen:
            # Nothing to analyse; without this guard the last-byte
            # bookkeeping below would index an empty buffer and raise
            # IndexError.
            return self.get_state()

        for i in range(0, aLen):
            # PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
            codingState = self._mCodingSM.next_state(aBuf[i])
            if codingState == constants.eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name()
                                     + ' prober hit error at byte ' + str(i)
                                     + '\n')
                self._mState = constants.eNotMe
                break
            elif codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == constants.eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # The pair may straddle two buffers: slot 0 holds the
                    # last byte of the previous call.
                    self._mLastChar[1] = aBuf[0]
                    self._mContextAnalyzer.feed(self._mLastChar, charLen)
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
                    self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
                                                     charLen)

        # Remember the final byte for the next call.
        self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            # Declare a match early once there is enough context data and
            # the confidence is above the shortcut threshold.
            if (self._mContextAnalyzer.got_enough_data() and
               (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the larger of the context and distribution confidences."""
        contxtCf = self._mContextAnalyzer.get_confidence()
        distribCf = self._mDistributionAnalyzer.get_confidence()
        return max(contxtCf, distribCf)
91 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/euckrprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from .mbcharsetprober import MultiByteCharSetProber
29 | from .codingstatemachine import CodingStateMachine
30 | from .chardistribution import EUCKRDistributionAnalysis
31 | from .mbcssm import EUCKRSMModel
32 | 
33 | 
class EUCKRProber(MultiByteCharSetProber):
    """Multi-byte prober configured with the EUC-KR state machine model
    and the EUC-KR character distribution analyser."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
        self._mCodingSM = CodingStateMachine(EUCKRSMModel)
        # reset() runs last, once both collaborators have been assigned.
        self.reset()

    def get_charset_name(self):
        """Return the charset name this prober reports."""
        return "EUC-KR"
43 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/euctwprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from .mbcharsetprober import MultiByteCharSetProber
29 | from .codingstatemachine import CodingStateMachine
30 | from .chardistribution import EUCTWDistributionAnalysis
31 | from .mbcssm import EUCTWSMModel
32 | 
class EUCTWProber(MultiByteCharSetProber):
    """Multi-byte prober wired to the EUC-TW state machine model and the
    EUC-TW character distribution analysis."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        # Both collaborators are assigned before reset() so that the reset
        # also covers them.
        self._mDistributionAnalyzer = EUCTWDistributionAnalysis()
        self._mCodingSM = CodingStateMachine(EUCTWSMModel)
        self.reset()

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "EUC-TW"
42 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/gb2312prober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from .mbcharsetprober import MultiByteCharSetProber
29 | from .codingstatemachine import CodingStateMachine
30 | from .chardistribution import GB2312DistributionAnalysis
31 | from .mbcssm import GB2312SMModel
32 | 
class GB2312Prober(MultiByteCharSetProber):
    """Multi-byte prober wired to the GB2312 state machine model and the
    GB2312 character distribution analysis."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        # Both collaborators are assigned before reset() so that the reset
        # also covers them.
        self._mDistributionAnalyzer = GB2312DistributionAnalysis()
        self._mCodingSM = CodingStateMachine(GB2312SMModel)
        self.reset()

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "GB2312"
42 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/mbcharsetprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Universal charset detector code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 2001
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #   Shy Shalom - original C code
12 | #   Proofpoint, Inc.
13 | #
14 | # This library is free software; you can redistribute it and/or
15 | # modify it under the terms of the GNU Lesser General Public
16 | # License as published by the Free Software Foundation; either
17 | # version 2.1 of the License, or (at your option) any later version.
18 | #
19 | # This library is distributed in the hope that it will be useful,
20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22 | # Lesser General Public License for more details.
23 | #
24 | # You should have received a copy of the GNU Lesser General Public
25 | # License along with this library; if not, write to the Free Software
26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 | # 02110-1301  USA
28 | ######################### END LICENSE BLOCK #########################
29 | 
30 | import sys
31 | from . import constants
32 | from .charsetprober import CharSetProber
33 | 
34 | 
class MultiByteCharSetProber(CharSetProber):
    """Shared feed/confidence logic for multi-byte charset probers.

    Subclasses assign ``_mCodingSM`` (a coding state machine) and
    ``_mDistributionAnalyzer`` (a character distribution analysis) and
    override ``get_charset_name``.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        self._mDistributionAnalyzer = None
        self._mCodingSM = None
        # Two-slot buffer: slot 0 keeps the last byte of the previous feed()
        # call, slot 1 receives the first byte of the current one when a
        # character completes at index 0.
        self._mLastChar = [0, 0]

    def reset(self):
        """Reset the prober and whichever collaborators are already set."""
        CharSetProber.reset(self)
        if self._mCodingSM:
            self._mCodingSM.reset()
        if self._mDistributionAnalyzer:
            self._mDistributionAnalyzer.reset()
        self._mLastChar = [0, 0]

    def get_charset_name(self):
        """Placeholder; returns None here and is overridden by subclasses."""
        pass

    def feed(self, aBuf):
        """Run ``aBuf`` through the state machine and distribution analysis.

        :param aBuf: the next chunk of input to examine.
        :returns: the prober state after processing the chunk.

        An empty chunk is ignored and the current state is returned.
        """
        aLen = len(aBuf)
        if not aLen:
            # Nothing to analyse; without this guard the ``aBuf[aLen - 1]``
            # lookup below would raise IndexError on an empty buffer.
            return self.get_state()

        for i in range(0, aLen):
            codingState = self._mCodingSM.next_state(aBuf[i])
            if codingState == constants.eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name()
                                     + ' prober hit error at byte ' + str(i)
                                     + '\n')
                self._mState = constants.eNotMe
                break
            elif codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == constants.eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # The character may have started in the previous chunk:
                    # pair its saved last byte with this chunk's first byte.
                    self._mLastChar[1] = aBuf[0]
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
                                                     charLen)

        # Remember the final byte for the next call.
        self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            if (self._mDistributionAnalyzer.got_enough_data() and
                    (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the confidence reported by the distribution analyzer."""
        return self._mDistributionAnalyzer.get_confidence()
87 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/mbcsgroupprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Universal charset detector code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 2001
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #   Shy Shalom - original C code
12 | #   Proofpoint, Inc.
13 | #
14 | # This library is free software; you can redistribute it and/or
15 | # modify it under the terms of the GNU Lesser General Public
16 | # License as published by the Free Software Foundation; either
17 | # version 2.1 of the License, or (at your option) any later version.
18 | #
19 | # This library is distributed in the hope that it will be useful,
20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22 | # Lesser General Public License for more details.
23 | #
24 | # You should have received a copy of the GNU Lesser General Public
25 | # License along with this library; if not, write to the Free Software
26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 | # 02110-1301  USA
28 | ######################### END LICENSE BLOCK #########################
29 | 
30 | from .charsetgroupprober import CharSetGroupProber
31 | from .utf8prober import UTF8Prober
32 | from .sjisprober import SJISProber
33 | from .eucjpprober import EUCJPProber
34 | from .gb2312prober import GB2312Prober
35 | from .euckrprober import EUCKRProber
36 | from .cp949prober import CP949Prober
37 | from .big5prober import Big5Prober
38 | from .euctwprober import EUCTWProber
39 | 
40 | 
class MBCSGroupProber(CharSetGroupProber):
    """Group prober holding one instance of each multi-byte prober."""

    def __init__(self):
        CharSetGroupProber.__init__(self)
        # Instantiated in this order; the list order is preserved.
        prober_classes = (
            UTF8Prober,
            SJISProber,
            EUCJPProber,
            GB2312Prober,
            EUCKRProber,
            CP949Prober,
            Big5Prober,
            EUCTWProber,
        )
        self._mProbers = [prober_class() for prober_class in prober_classes]
        self.reset()
55 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/sbcsgroupprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Universal charset detector code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 2001
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #   Shy Shalom - original C code
12 | #
13 | # This library is free software; you can redistribute it and/or
14 | # modify it under the terms of the GNU Lesser General Public
15 | # License as published by the Free Software Foundation; either
16 | # version 2.1 of the License, or (at your option) any later version.
17 | #
18 | # This library is distributed in the hope that it will be useful,
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21 | # Lesser General Public License for more details.
22 | #
23 | # You should have received a copy of the GNU Lesser General Public
24 | # License along with this library; if not, write to the Free Software
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 | # 02110-1301  USA
27 | ######################### END LICENSE BLOCK #########################
28 | 
29 | from .charsetgroupprober import CharSetGroupProber
30 | from .sbcharsetprober import SingleByteCharSetProber
31 | from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
32 |                                 Latin5CyrillicModel, MacCyrillicModel,
33 |                                 Ibm866Model, Ibm855Model)
34 | from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
35 | from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
36 | from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
37 | from .langthaimodel import TIS620ThaiModel
38 | from .langhebrewmodel import Win1255HebrewModel
39 | from .hebrewprober import HebrewProber
40 | 
41 | 
class SBCSGroupProber(CharSetGroupProber):
    """Group prober holding the single-byte probers plus the Hebrew trio."""

    def __init__(self):
        CharSetGroupProber.__init__(self)

        # Models that only need a plain SingleByteCharSetProber, in order.
        simple_models = (
            Win1251CyrillicModel,
            Koi8rModel,
            Latin5CyrillicModel,
            MacCyrillicModel,
            Ibm866Model,
            Ibm855Model,
            Latin7GreekModel,
            Win1253GreekModel,
            Latin5BulgarianModel,
            Win1251BulgarianModel,
            Latin2HungarianModel,
            Win1250HungarianModel,
            TIS620ThaiModel,
        )
        probers = [SingleByteCharSetProber(model) for model in simple_models]

        # Two Win1255 model probers (second argument False / True) share one
        # HebrewProber, which is told about both of them.
        hebrew = HebrewProber()
        logical = SingleByteCharSetProber(Win1255HebrewModel, False, hebrew)
        visual = SingleByteCharSetProber(Win1255HebrewModel, True, hebrew)
        hebrew.set_model_probers(logical, visual)
        probers += [hebrew, logical, visual]

        self._mProbers = probers
        self.reset()
70 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/sjisprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | import sys
29 | from .mbcharsetprober import MultiByteCharSetProber
30 | from .codingstatemachine import CodingStateMachine
31 | from .chardistribution import SJISDistributionAnalysis
32 | from .jpcntx import SJISContextAnalysis
33 | from .mbcssm import SJISSMModel
34 | from . import constants
35 | 
36 | 
class SJISProber(MultiByteCharSetProber):
    """Prober for SHIFT_JIS that combines a character distribution analysis
    with a context analysis."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(SJISSMModel)
        self._mDistributionAnalyzer = SJISDistributionAnalysis()
        self._mContextAnalyzer = SJISContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the base prober state and the context analyzer."""
        MultiByteCharSetProber.reset(self)
        self._mContextAnalyzer.reset()

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "SHIFT_JIS"

    def feed(self, aBuf):
        """Run ``aBuf`` through the state machine and both analyzers.

        :param aBuf: the next chunk of input to examine.
        :returns: the prober state after processing the chunk.

        An empty chunk is ignored and the current state is returned.
        """
        aLen = len(aBuf)
        if not aLen:
            # Nothing to analyse; without this guard the ``aBuf[aLen - 1]``
            # lookup below would raise IndexError on an empty buffer.
            return self.get_state()

        for i in range(0, aLen):
            codingState = self._mCodingSM.next_state(aBuf[i])
            if codingState == constants.eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name()
                                     + ' prober hit error at byte ' + str(i)
                                     + '\n')
                self._mState = constants.eNotMe
                break
            elif codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == constants.eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # The character may have started in the previous chunk:
                    # pair its saved last byte with this chunk's first byte.
                    self._mLastChar[1] = aBuf[0]
                    self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:],
                                                charLen)
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3
                                                     - charLen], charLen)
                    self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
                                                     charLen)

        # Remember the final byte for the next call.
        self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            if (self._mContextAnalyzer.got_enough_data() and
               (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the larger of the context and distribution confidences."""
        contxtCf = self._mContextAnalyzer.get_confidence()
        distribCf = self._mDistributionAnalyzer.get_confidence()
        return max(contxtCf, distribCf)
92 | 


--------------------------------------------------------------------------------
/requests/packages/chardet/utf8prober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from . import constants
29 | from .charsetprober import CharSetProber
30 | from .codingstatemachine import CodingStateMachine
31 | from .mbcssm import UTF8SMModel
32 | 
# Factor applied to the "unlikely" estimate once per multi-byte character.
ONE_CHAR_PROB = 0.5


class UTF8Prober(CharSetProber):
    """Prober that validates input against the UTF-8 state machine and
    counts how many multi-byte characters it has seen."""

    def __init__(self):
        CharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(UTF8SMModel)
        self.reset()

    def reset(self):
        """Reset prober state, the state machine and the character count."""
        CharSetProber.reset(self)
        self._mCodingSM.reset()
        self._mNumOfMBChar = 0

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "utf-8"

    def feed(self, aBuf):
        """Feed ``aBuf`` to the state machine and return the prober state."""
        for unit in aBuf:
            outcome = self._mCodingSM.next_state(unit)
            if outcome == constants.eError:
                self._mState = constants.eNotMe
                break
            if outcome == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            # A completed character of two or more bytes counts as evidence.
            if (outcome == constants.eStart and
                    self._mCodingSM.get_current_charlen() >= 2):
                self._mNumOfMBChar += 1

        if (self.get_state() == constants.eDetecting and
                self.get_confidence() > constants.SHORTCUT_THRESHOLD):
            self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return a confidence derived from the multi-byte character count."""
        seen = self._mNumOfMBChar
        if seen >= 6:
            return 0.99
        # Same value as multiplying 0.99 by ONE_CHAR_PROB ``seen`` times.
        return 1.0 - 0.99 * ONE_CHAR_PROB ** seen
77 | 


--------------------------------------------------------------------------------
/requests/packages/urllib3/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | urllib3 - Thread-safe connection pooling and re-using.
 3 | """
 4 | 
 5 | __author__ = 'Andrey Petrov (andrey.petrov@shazow.net)'
 6 | __license__ = 'MIT'
 7 | __version__ = 'dev'
 8 | 
 9 | 
10 | from .connectionpool import (
11 |     HTTPConnectionPool,
12 |     HTTPSConnectionPool,
13 |     connection_from_url
14 | )
15 | 
16 | from . import exceptions
17 | from .filepost import encode_multipart_formdata
18 | from .poolmanager import PoolManager, ProxyManager, proxy_from_url
19 | from .response import HTTPResponse
20 | from .util.request import make_headers
21 | from .util.url import get_host
22 | from .util.timeout import Timeout
23 | from .util.retry import Retry
24 | 
25 | 
# Set default logging handler to avoid "No handler found" warnings.
import logging
try:  # Python 2.7+
    from logging import NullHandler
except ImportError:
    # Fallback when ``logging`` does not provide NullHandler: a handler
    # whose emit() discards every record.
    class NullHandler(logging.Handler):
        def emit(self, record):
            pass

# Attach the do-nothing handler to this package's own logger.
logging.getLogger(__name__).addHandler(NullHandler())
36 | 
def add_stderr_logger(level=logging.DEBUG):
    """
    Helper for quickly adding a StreamHandler to the logger. Useful for
    debugging.

    :param level:
        Level set on this package's logger (``logging.DEBUG`` by default).

    Returns the handler after adding it.
    """
    # This method needs to be in this __init__.py to get the __name__ correct
    # even if urllib3 is vendored within another package.
    logger = logging.getLogger(__name__)
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    logger.setLevel(level)
    # Hand the name over as a logging argument so that interpolation is left
    # to the logging framework instead of being done eagerly here.
    logger.debug('Added a stderr logging handler to logger: %s', __name__)
    return handler
53 | 
# ... Clean up.
# The handler instance is already attached above, so the NullHandler name is
# removed from this module's namespace.
del NullHandler


# Set security warning to only go off once by default.
# (The 'module' action applies to exceptions.SecurityWarning.)
import warnings
warnings.simplefilter('module', exceptions.SecurityWarning)
61 | 
def disable_warnings(category=exceptions.HTTPWarning):
    """
    Install an 'ignore' filter for ``category``.

    With the default category (``exceptions.HTTPWarning``) this silences all
    urllib3 warnings.
    """
    warnings.simplefilter(action='ignore', category=category)
67 | 


--------------------------------------------------------------------------------
/requests/packages/urllib3/contrib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/requests/packages/urllib3/contrib/__init__.py


--------------------------------------------------------------------------------
/requests/packages/urllib3/exceptions.py:
--------------------------------------------------------------------------------
  1 | 
  2 | ## Base Exceptions
  3 | 
  4 | class HTTPError(Exception):
  5 |     "Base exception used by this module."
  6 |     pass
  7 | 
  8 | class HTTPWarning(Warning):
  9 |     "Base warning used by this module."
 10 |     pass
 11 | 
 12 | 
 13 | 
 14 | class PoolError(HTTPError):
 15 |     "Base exception for errors caused within a pool."
 16 |     def __init__(self, pool, message):
 17 |         self.pool = pool
 18 |         HTTPError.__init__(self, "%s: %s" % (pool, message))
 19 | 
 20 |     def __reduce__(self):
 21 |         # For pickling purposes.
 22 |         return self.__class__, (None, None)
 23 | 
 24 | 
 25 | class RequestError(PoolError):
 26 |     "Base exception for PoolErrors that have associated URLs."
 27 |     def __init__(self, pool, url, message):
 28 |         self.url = url
 29 |         PoolError.__init__(self, pool, message)
 30 | 
 31 |     def __reduce__(self):
 32 |         # For pickling purposes.
 33 |         return self.__class__, (None, self.url, None)
 34 | 
 35 | 
 36 | class SSLError(HTTPError):
 37 |     "Raised when SSL certificate fails in an HTTPS connection."
 38 |     pass
 39 | 
 40 | 
 41 | class ProxyError(HTTPError):
 42 |     "Raised when the connection to a proxy fails."
 43 |     pass
 44 | 
 45 | 
 46 | class DecodeError(HTTPError):
 47 |     "Raised when automatic decoding based on Content-Type fails."
 48 |     pass
 49 | 
 50 | 
 51 | class ProtocolError(HTTPError):
 52 |     "Raised when something unexpected happens mid-request/response."
 53 |     pass
 54 | 
 55 | 
 56 | #: Renamed to ProtocolError but aliased for backwards compatibility.
 57 | ConnectionError = ProtocolError
 58 | 
 59 | 
 60 | ## Leaf Exceptions
 61 | 
 62 | class MaxRetryError(RequestError):
 63 |     """Raised when the maximum number of retries is exceeded.
 64 | 
 65 |     :param pool: The connection pool
 66 |     :type pool: :class:`~urllib3.connectionpool.HTTPConnectionPool`
 67 |     :param string url: The requested Url
 68 |     :param exceptions.Exception reason: The underlying error
 69 | 
 70 |     """
 71 | 
 72 |     def __init__(self, pool, url, reason=None):
 73 |         self.reason = reason
 74 | 
 75 |         message = "Max retries exceeded with url: %s" % url
 76 |         if reason:
 77 |             message += " (Caused by %r)" % reason
 78 |         else:
 79 |             message += " (Caused by redirect)"
 80 | 
 81 |         RequestError.__init__(self, pool, url, message)
 82 | 
 83 | 
 84 | class HostChangedError(RequestError):
 85 |     "Raised when an existing pool gets a request for a foreign host."
 86 | 
 87 |     def __init__(self, pool, url, retries=3):
 88 |         message = "Tried to open a foreign host with url: %s" % url
 89 |         RequestError.__init__(self, pool, url, message)
 90 |         self.retries = retries
 91 | 
 92 | 
 93 | class TimeoutStateError(HTTPError):
 94 |     """ Raised when passing an invalid state to a timeout """
 95 |     pass
 96 | 
 97 | 
 98 | class TimeoutError(HTTPError):
 99 |     """ Raised when a socket timeout error occurs.
100 | 
101 |     Catching this error will catch both :exc:`ReadTimeoutErrors
102 |     <ReadTimeoutError>` and :exc:`ConnectTimeoutErrors <ConnectTimeoutError>`.
103 |     """
104 |     pass
105 | 
106 | 
class ReadTimeoutError(TimeoutError, RequestError):
    """Raised when a socket timeout occurs while receiving data from a
    server."""
110 | 
111 | 
112 | # This timeout error does not have a URL attached and needs to inherit from the
113 | # base HTTPError
class ConnectTimeoutError(TimeoutError):
    """Raised when a socket timeout occurs while connecting to a server."""
117 | 
118 | 
class EmptyPoolError(PoolError):
    """Raised when a pool runs out of connections and no more are allowed."""
122 | 
123 | 
class ClosedPoolError(PoolError):
    """Raised when a request enters a pool after the pool has been closed."""
127 | 
128 | 
class LocationValueError(ValueError, HTTPError):
    """Raised when there is something wrong with a given URL input."""
132 | 
133 | 
class LocationParseError(LocationValueError):
    """Raised when get_host or similar fails to parse the URL input.

    The offending input is kept on ``self.location``.
    """

    def __init__(self, location):
        HTTPError.__init__(self, "Failed to parse: %s" % location)
        self.location = location
142 | 
143 | 
class SecurityWarning(HTTPWarning):
    """Warned when performing security reducing actions."""
147 | 
148 | 
class InsecureRequestWarning(SecurityWarning):
    """Warned when making an unverified HTTPS request."""
152 | 
153 | 
class SystemTimeWarning(SecurityWarning):
    """Warned when system time is suspected to be wrong."""
157 | 


--------------------------------------------------------------------------------
/requests/packages/urllib3/filepost.py:
--------------------------------------------------------------------------------
 1 | import codecs
 2 | 
 3 | from uuid import uuid4
 4 | from io import BytesIO
 5 | 
 6 | from .packages import six
 7 | from .packages.six import b
 8 | from .fields import RequestField
 9 | 
# Entry 3 of the UTF-8 codec lookup result; used below as ``writer(stream)``
# to obtain an object whose write() accepts text for that stream.
writer = codecs.lookup('utf-8')[3]
11 | 
12 | 
def choose_boundary():
    """
    Our embarrassingly-simple replacement for mimetools.choose_boundary.

    Returns the hex form of a freshly generated ``uuid4``.
    """
    fresh_id = uuid4()
    return fresh_id.hex
18 | 
19 | 
def iter_field_objects(fields):
    """
    Yield a :class:`~urllib3.fields.RequestField` for every entry of
    ``fields``.

    ``fields`` may be a dict, a list of (k, v) tuples, or a list of
    :class:`~urllib3.fields.RequestField` objects. Entries that are already
    RequestField instances are yielded as they are; anything else is unpacked
    into :meth:`RequestField.from_tuples`.

    """
    entries = six.iteritems(fields) if isinstance(fields, dict) else iter(fields)

    for entry in entries:
        if not isinstance(entry, RequestField):
            entry = RequestField.from_tuples(*entry)
        yield entry
38 | 
39 | 
def iter_fields(fields):
    """
    .. deprecated:: 1.6

    Return a generator of (k, v) pairs taken from ``fields``.

    The addition of :class:`~urllib3.fields.RequestField` makes this function
    obsolete. Instead, use :func:`iter_field_objects`, which returns
    :class:`~urllib3.fields.RequestField` objects.

    Supports list of (k, v) tuples and dicts.
    """
    pairs = six.iteritems(fields) if isinstance(fields, dict) else fields
    return ((k, v) for k, v in pairs)
56 | 
57 | 
def encode_multipart_formdata(fields, boundary=None):
    """
    Encode ``fields`` using the multipart/form-data MIME format.

    :param fields:
        Anything accepted by :func:`iter_field_objects`: a dictionary of
        fields, a list of (k, v) tuples, or a list of
        :class:`~urllib3.fields.RequestField`.

    :param boundary:
        If not specified, then a random boundary will be generated using
        :func:`choose_boundary`.

    Returns a ``(body, content_type)`` tuple: the encoded body and the
    matching ``multipart/form-data`` header value.
    """
    if boundary is None:
        boundary = choose_boundary()

    body = BytesIO()
    # Text goes through the UTF-8 writer; binary data is written directly.
    text_out = writer(body)

    for field in iter_field_objects(fields):
        body.write(b('--%s\r\n' % (boundary)))
        text_out.write(field.render_headers())

        payload = field.data
        if isinstance(payload, int):
            payload = str(payload)  # Backwards compatibility

        if isinstance(payload, six.text_type):
            text_out.write(payload)
        else:
            body.write(payload)

        body.write(b'\r\n')

    # Closing delimiter.
    body.write(b('--%s--\r\n' % (boundary)))

    content_type = str('multipart/form-data; boundary=%s' % boundary)
    return body.getvalue(), content_type
94 | 


--------------------------------------------------------------------------------
/requests/packages/urllib3/packages/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | from . import ssl_match_hostname
4 | 
5 | 


--------------------------------------------------------------------------------
/requests/packages/urllib3/packages/ssl_match_hostname/__init__.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     # Python 3.2+
 3 |     from ssl import CertificateError, match_hostname
 4 | except ImportError:
 5 |     try:
 6 |         # Backport of the function from a pypi module
 7 |         from backports.ssl_match_hostname import CertificateError, match_hostname
 8 |     except ImportError:
 9 |         # Our vendored copy
10 |         from ._implementation import CertificateError, match_hostname
11 | 
12 | # Not needed, but documenting what we provide.
13 | __all__ = ('CertificateError', 'match_hostname')
14 | 


--------------------------------------------------------------------------------
/requests/packages/urllib3/packages/ssl_match_hostname/_implementation.py:
--------------------------------------------------------------------------------
  1 | """The match_hostname() function from Python 3.3.3, essential when using SSL."""
  2 | 
  3 | # Note: This file is under the PSF license as the code comes from the python
  4 | # stdlib.   http://docs.python.org/3/license.html
  5 | 
  6 | import re
  7 | 
  8 | __version__ = '3.4.0.2'
  9 | 
class CertificateError(ValueError):
    """Raised by this module when a certificate cannot be accepted for a
    hostname (no matching name, or a DNS name with too many wildcards)."""
    pass
 12 | 
 13 | 
 14 | def _dnsname_match(dn, hostname, max_wildcards=1):
 15 |     """Matching according to RFC 6125, section 6.4.3
 16 | 
 17 |     http://tools.ietf.org/html/rfc6125#section-6.4.3
 18 |     """
 19 |     pats = []
 20 |     if not dn:
 21 |         return False
 22 | 
 23 |     # Ported from python3-syntax:
 24 |     # leftmost, *remainder = dn.split(r'.')
 25 |     parts = dn.split(r'.')
 26 |     leftmost = parts[0]
 27 |     remainder = parts[1:]
 28 | 
 29 |     wildcards = leftmost.count('*')
 30 |     if wildcards > max_wildcards:
 31 |         # Issue #17980: avoid denials of service by refusing more
 32 |         # than one wildcard per fragment.  A survey of established
 33 |         # policy among SSL implementations showed it to be a
 34 |         # reasonable choice.
 35 |         raise CertificateError(
 36 |             "too many wildcards in certificate DNS name: " + repr(dn))
 37 | 
 38 |     # speed up common case w/o wildcards
 39 |     if not wildcards:
 40 |         return dn.lower() == hostname.lower()
 41 | 
 42 |     # RFC 6125, section 6.4.3, subitem 1.
 43 |     # The client SHOULD NOT attempt to match a presented identifier in which
 44 |     # the wildcard character comprises a label other than the left-most label.
 45 |     if leftmost == '*':
 46 |         # When '*' is a fragment by itself, it matches a non-empty dotless
 47 |         # fragment.
 48 |         pats.append('[^.]+')
 49 |     elif leftmost.startswith('xn--') or hostname.startswith('xn--'):
 50 |         # RFC 6125, section 6.4.3, subitem 3.
 51 |         # The client SHOULD NOT attempt to match a presented identifier
 52 |         # where the wildcard character is embedded within an A-label or
 53 |         # U-label of an internationalized domain name.
 54 |         pats.append(re.escape(leftmost))
 55 |     else:
 56 |         # Otherwise, '*' matches any dotless string, e.g. www*
 57 |         pats.append(re.escape(leftmost).replace(r'\*', '[^.]*'))
 58 | 
 59 |     # add the remaining fragments, ignore any wildcards
 60 |     for frag in remainder:
 61 |         pats.append(re.escape(frag))
 62 | 
 63 |     pat = re.compile(r'\A' + r'\.'.join(pats) + r'\Z', re.IGNORECASE)
 64 |     return pat.match(hostname)
 65 | 
 66 | 
 67 | def match_hostname(cert, hostname):
 68 |     """Verify that *cert* (in decoded format as returned by
 69 |     SSLSocket.getpeercert()) matches the *hostname*.  RFC 2818 and RFC 6125
 70 |     rules are followed, but IP addresses are not accepted for *hostname*.
 71 | 
 72 |     CertificateError is raised on failure. On success, the function
 73 |     returns nothing.
 74 |     """
 75 |     if not cert:
 76 |         raise ValueError("empty or no certificate")
 77 |     dnsnames = []
 78 |     san = cert.get('subjectAltName', ())
 79 |     for key, value in san:
 80 |         if key == 'DNS':
 81 |             if _dnsname_match(value, hostname):
 82 |                 return
 83 |             dnsnames.append(value)
 84 |     if not dnsnames:
 85 |         # The subject is only checked when there is no dNSName entry
 86 |         # in subjectAltName
 87 |         for sub in cert.get('subject', ()):
 88 |             for key, value in sub:
 89 |                 # XXX according to RFC 2818, the most specific Common Name
 90 |                 # must be used.
 91 |                 if key == 'commonName':
 92 |                     if _dnsname_match(value, hostname):
 93 |                         return
 94 |                     dnsnames.append(value)
 95 |     if len(dnsnames) > 1:
 96 |         raise CertificateError("hostname %r "
 97 |             "doesn't match either of %s"
 98 |             % (hostname, ', '.join(map(repr, dnsnames))))
 99 |     elif len(dnsnames) == 1:
100 |         raise CertificateError("hostname %r "
101 |             "doesn't match %r"
102 |             % (hostname, dnsnames[0]))
103 |     else:
104 |         raise CertificateError("no appropriate commonName or "
105 |             "subjectAltName fields were found")
106 | 


--------------------------------------------------------------------------------
/requests/packages/urllib3/util/__init__.py:
--------------------------------------------------------------------------------
 1 | # For backwards compatibility, provide imports that used to be here.
 2 | from .connection import is_connection_dropped
 3 | from .request import make_headers
 4 | from .response import is_fp_closed
 5 | from .ssl_ import (
 6 |     SSLContext,
 7 |     HAS_SNI,
 8 |     assert_fingerprint,
 9 |     resolve_cert_reqs,
10 |     resolve_ssl_version,
11 |     ssl_wrap_socket,
12 | )
13 | from .timeout import (
14 |     current_time,
15 |     Timeout,
16 | )
17 | 
18 | from .retry import Retry
19 | from .url import (
20 |     get_host,
21 |     parse_url,
22 |     split_first,
23 |     Url,
24 | )
25 | 


--------------------------------------------------------------------------------
/requests/packages/urllib3/util/connection.py:
--------------------------------------------------------------------------------
 1 | import socket
 2 | try:
 3 |     from select import poll, POLLIN
 4 | except ImportError:  # `poll` doesn't exist on OSX and other platforms
 5 |     poll = False
 6 |     try:
 7 |         from select import select
 8 |     except ImportError:  # `select` doesn't exist on AppEngine.
 9 |         select = False
10 | 
11 | 
def is_connection_dropped(conn):  # Platform-specific
    """
    Returns True if the connection is dropped and should be closed.

    :param conn:
        :class:`httplib.HTTPConnection` object.

    :returns:
        Always a ``bool``. ``True`` when ``conn.sock`` is ``None``, when the
        socket reports as readable without blocking, or when probing it with
        ``select`` raises ``socket.error``; ``False`` otherwise.

    Note: For platforms like AppEngine, this will always return ``False`` to
    let the platform handle connection recycling transparently for us.
    """
    sock = getattr(conn, 'sock', False)
    if sock is False:  # Platform-specific: AppEngine
        return False
    if sock is None:  # Connection already closed (such as by httplib).
        return True

    if not poll:
        if not select:  # Platform-specific: AppEngine
            return False

        try:
            # select() returns the list of readable sockets; collapse it to
            # a bool so callers get the documented return type.
            return bool(select([sock], [], [], 0.0)[0])
        except socket.error:
            return True

    # This version is better on platforms that support it.
    p = poll()
    p.register(sock, POLLIN)
    fileno = sock.fileno()
    # A reported event means either data is buffered (bad), or the
    # connection is dropped. No event yields False rather than None.
    return any(fno == fileno for fno, _event in p.poll(0.0))
44 | 
45 | 
46 | # This function is copied from socket.py in the Python 2.7 standard
47 | # library test suite. Added to its signature is only `socket_options`.
def create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
                      source_address=None, socket_options=None):
    """Open a stream socket to *address* and return it.

    *address* is a 2-tuple ``(host, port)``.  Every result of
    ``getaddrinfo`` is tried in turn and the first socket that connects is
    returned.  When *timeout* is supplied it is set on the socket before
    connecting; otherwise the socket's timeout is left untouched.  If
    *source_address* is set it must be a tuple of (host, port) for the
    socket to bind to before connecting.  *socket_options*, when given, are
    applied to each socket before it connects.

    Raises the last ``socket.error`` encountered, or a ``socket.error``
    saying that getaddrinfo returned an empty list.
    """
    host, port = address
    last_error = None

    for af, socktype, proto, _canonname, sa in socket.getaddrinfo(
            host, port, 0, socket.SOCK_STREAM):
        sock = None
        try:
            sock = socket.socket(af, socktype, proto)

            # Socket-level options must be in place before connecting.
            # This is the only addition urllib3 makes to this function.
            _set_socket_options(sock, socket_options)

            if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                sock.settimeout(timeout)
            if source_address:
                sock.bind(source_address)
            sock.connect(sa)
            return sock

        except socket.error as exc:
            # Remember the failure, release the socket and try the next
            # address.
            last_error = exc
            if sock is not None:
                sock.close()

    if last_error is None:
        raise socket.error("getaddrinfo returns an empty list")
    raise last_error
90 | 
91 | 
def _set_socket_options(sock, options):
    """Apply each entry of *options* to *sock* via ``sock.setsockopt(*entry)``.

    Does nothing when *options* is ``None``.
    """
    if options is not None:
        for option_args in options:
            sock.setsockopt(*option_args)
98 | 


--------------------------------------------------------------------------------
/requests/packages/urllib3/util/request.py:
--------------------------------------------------------------------------------
 1 | from base64 import b64encode
 2 | 
 3 | from ..packages.six import b
 4 | 
 5 | ACCEPT_ENCODING = 'gzip,deflate'
 6 | 
 7 | 
 8 | def make_headers(keep_alive=None, accept_encoding=None, user_agent=None,
 9 |                  basic_auth=None, proxy_basic_auth=None, disable_cache=None):
10 |     """
11 |     Shortcuts for generating request headers.
12 | 
13 |     :param keep_alive:
14 |         If ``True``, adds 'connection: keep-alive' header.
15 | 
16 |     :param accept_encoding:
17 |         Can be a boolean, list, or string.
18 |         ``True`` translates to 'gzip,deflate'.
19 |         List will get joined by comma.
20 |         String will be used as provided.
21 | 
22 |     :param user_agent:
23 |         String representing the user-agent you want, such as
24 |         "python-urllib3/0.6"
25 | 
26 |     :param basic_auth:
27 |         Colon-separated username:password string for 'authorization: basic ...'
28 |         auth header.
29 | 
30 |     :param proxy_basic_auth:
31 |         Colon-separated username:password string for 'proxy-authorization: basic ...'
32 |         auth header.
33 | 
34 |     :param disable_cache:
35 |         If ``True``, adds 'cache-control: no-cache' header.
36 | 
37 |     Example::
38 | 
39 |         >>> make_headers(keep_alive=True, user_agent="Batman/1.0")
40 |         {'connection': 'keep-alive', 'user-agent': 'Batman/1.0'}
41 |         >>> make_headers(accept_encoding=True)
42 |         {'accept-encoding': 'gzip,deflate'}
43 |     """
44 |     headers = {}
45 |     if accept_encoding:
46 |         if isinstance(accept_encoding, str):
47 |             pass
48 |         elif isinstance(accept_encoding, list):
49 |             accept_encoding = ','.join(accept_encoding)
50 |         else:
51 |             accept_encoding = ACCEPT_ENCODING
52 |         headers['accept-encoding'] = accept_encoding
53 | 
54 |     if user_agent:
55 |         headers['user-agent'] = user_agent
56 | 
57 |     if keep_alive:
58 |         headers['connection'] = 'keep-alive'
59 | 
60 |     if basic_auth:
61 |         headers['authorization'] = 'Basic ' + \
62 |             b64encode(b(basic_auth)).decode('utf-8')
63 | 
64 |     if proxy_basic_auth:
65 |         headers['proxy-authorization'] = 'Basic ' + \
66 |             b64encode(b(proxy_basic_auth)).decode('utf-8')
67 | 
68 |     if disable_cache:
69 |         headers['cache-control'] = 'no-cache'
70 | 
71 |     return headers
72 | 


--------------------------------------------------------------------------------
/requests/packages/urllib3/util/response.py:
--------------------------------------------------------------------------------
 1 | def is_fp_closed(obj):
 2 |     """
 3 |     Checks whether a given file-like object is closed.
 4 | 
 5 |     :param obj:
 6 |         The file-like object to check.
 7 |     """
 8 | 
 9 |     try:
10 |         # Check via the official file-like-object way.
11 |         return obj.closed
12 |     except AttributeError:
13 |         pass
14 | 
15 |     try:
16 |         # Check if the object is a container for another file-like object that
17 |         # gets released on exhaustion (e.g. HTTPResponse).
18 |         return obj.fp is None
19 |     except AttributeError:
20 |         pass
21 | 
22 |     raise ValueError("Unable to determine whether fp is closed.")
23 | 


--------------------------------------------------------------------------------
/requests/packages/urllib3/util/ssl_.py:
--------------------------------------------------------------------------------
  1 | from binascii import hexlify, unhexlify
  2 | from hashlib import md5, sha1
  3 | 
  4 | from ..exceptions import SSLError
  5 | 
  6 | 
  7 | try:  # Test for SSL features
  8 |     SSLContext = None
  9 |     HAS_SNI = False
 10 | 
 11 |     import ssl
 12 |     from ssl import wrap_socket, CERT_NONE, PROTOCOL_SSLv23
 13 |     from ssl import SSLContext  # Modern SSL?
 14 |     from ssl import HAS_SNI  # Has SNI?
 15 | except ImportError:
 16 |     pass
 17 | 
 18 | 
 19 | def assert_fingerprint(cert, fingerprint):
 20 |     """
 21 |     Checks if given fingerprint matches the supplied certificate.
 22 | 
 23 |     :param cert:
 24 |         Certificate as bytes object.
 25 |     :param fingerprint:
 26 |         Fingerprint as string of hexdigits, can be interspersed by colons.
 27 |     """
 28 | 
 29 |     # Maps the length of a digest to a possible hash function producing
 30 |     # this digest.
 31 |     hashfunc_map = {
 32 |         16: md5,
 33 |         20: sha1
 34 |     }
 35 | 
 36 |     fingerprint = fingerprint.replace(':', '').lower()
 37 |     digest_length, odd = divmod(len(fingerprint), 2)
 38 | 
 39 |     if odd or digest_length not in hashfunc_map:
 40 |         raise SSLError('Fingerprint is of invalid length.')
 41 | 
 42 |     # We need encode() here for py32; works on py2 and p33.
 43 |     fingerprint_bytes = unhexlify(fingerprint.encode())
 44 | 
 45 |     hashfunc = hashfunc_map[digest_length]
 46 | 
 47 |     cert_digest = hashfunc(cert).digest()
 48 | 
 49 |     if not cert_digest == fingerprint_bytes:
 50 |         raise SSLError('Fingerprints did not match. Expected "{0}", got "{1}".'
 51 |                        .format(hexlify(fingerprint_bytes),
 52 |                                hexlify(cert_digest)))
 53 | 
 54 | 
 55 | def resolve_cert_reqs(candidate):
 56 |     """
 57 |     Resolves the argument to a numeric constant, which can be passed to
 58 |     the wrap_socket function/method from the ssl module.
 59 |     Defaults to :data:`ssl.CERT_NONE`.
 60 |     If given a string it is assumed to be the name of the constant in the
 61 |     :mod:`ssl` module or its abbrevation.
 62 |     (So you can specify `REQUIRED` instead of `CERT_REQUIRED`.
 63 |     If it's neither `None` nor a string we assume it is already the numeric
 64 |     constant which can directly be passed to wrap_socket.
 65 |     """
 66 |     if candidate is None:
 67 |         return CERT_NONE
 68 | 
 69 |     if isinstance(candidate, str):
 70 |         res = getattr(ssl, candidate, None)
 71 |         if res is None:
 72 |             res = getattr(ssl, 'CERT_' + candidate)
 73 |         return res
 74 | 
 75 |     return candidate
 76 | 
 77 | 
 78 | def resolve_ssl_version(candidate):
 79 |     """
 80 |     like resolve_cert_reqs
 81 |     """
 82 |     if candidate is None:
 83 |         return PROTOCOL_SSLv23
 84 | 
 85 |     if isinstance(candidate, str):
 86 |         res = getattr(ssl, candidate, None)
 87 |         if res is None:
 88 |             res = getattr(ssl, 'PROTOCOL_' + candidate)
 89 |         return res
 90 | 
 91 |     return candidate
 92 | 
 93 | 
 94 | if SSLContext is not None:  # Python 3.2+
 95 |     def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None,
 96 |                         ca_certs=None, server_hostname=None,
 97 |                         ssl_version=None):
 98 |         """
 99 |         All arguments except `server_hostname` have the same meaning as for
100 |         :func:`ssl.wrap_socket`
101 | 
102 |         :param server_hostname:
103 |             Hostname of the expected certificate
104 |         """
105 |         context = SSLContext(ssl_version)
106 |         context.verify_mode = cert_reqs
107 | 
108 |         # Disable TLS compression to migitate CRIME attack (issue #309)
109 |         OP_NO_COMPRESSION = 0x20000
110 |         context.options |= OP_NO_COMPRESSION
111 | 
112 |         if ca_certs:
113 |             try:
114 |                 context.load_verify_locations(ca_certs)
115 |             # Py32 raises IOError
116 |             # Py33 raises FileNotFoundError
117 |             except Exception as e:  # Reraise as SSLError
118 |                 raise SSLError(e)
119 |         if certfile:
120 |             # FIXME: This block needs a test.
121 |             context.load_cert_chain(certfile, keyfile)
122 |         if HAS_SNI:  # Platform-specific: OpenSSL with enabled SNI
123 |             return context.wrap_socket(sock, server_hostname=server_hostname)
124 |         return context.wrap_socket(sock)
125 | 
126 | else:  # Python 3.1 and earlier
127 |     def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None,
128 |                         ca_certs=None, server_hostname=None,
129 |                         ssl_version=None):
130 |         return wrap_socket(sock, keyfile=keyfile, certfile=certfile,
131 |                            ca_certs=ca_certs, cert_reqs=cert_reqs,
132 |                            ssl_version=ssl_version)
133 | 


--------------------------------------------------------------------------------
/requests/status_codes.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from .structures import LookupDict
 4 | 
# Maps each numeric status code to a tuple of alias names. The loop at the
# bottom of this module sets every alias (and, for aliases not starting with
# a backslash, its upper-cased form) as an attribute on ``codes``.
#
# NOTE(review): the alias 'precondition' is listed under both 412 and 428, so
# ``codes.precondition`` resolves to whichever entry is processed last --
# confirm which one is intended.
_codes = {

    # Informational.
    100: ('continue',),
    101: ('switching_protocols',),
    102: ('processing',),
    103: ('checkpoint',),
    122: ('uri_too_long', 'request_uri_too_long'),
    200: ('ok', 'okay', 'all_ok', 'all_okay', 'all_good', '\\o/', '✓'),
    201: ('created',),
    202: ('accepted',),
    203: ('non_authoritative_info', 'non_authoritative_information'),
    204: ('no_content',),
    205: ('reset_content', 'reset'),
    206: ('partial_content', 'partial'),
    207: ('multi_status', 'multiple_status', 'multi_stati', 'multiple_stati'),
    208: ('already_reported',),
    226: ('im_used',),

    # Redirection.
    300: ('multiple_choices',),
    301: ('moved_permanently', 'moved', '\\o-'),
    302: ('found',),
    303: ('see_other', 'other'),
    304: ('not_modified',),
    305: ('use_proxy',),
    306: ('switch_proxy',),
    307: ('temporary_redirect', 'temporary_moved', 'temporary'),
    308: ('permanent_redirect',
          'resume_incomplete', 'resume',), # These 2 to be removed in 3.0

    # Client Error.
    400: ('bad_request', 'bad'),
    401: ('unauthorized',),
    402: ('payment_required', 'payment'),
    403: ('forbidden',),
    404: ('not_found', '-o-'),
    405: ('method_not_allowed', 'not_allowed'),
    406: ('not_acceptable',),
    407: ('proxy_authentication_required', 'proxy_auth', 'proxy_authentication'),
    408: ('request_timeout', 'timeout'),
    409: ('conflict',),
    410: ('gone',),
    411: ('length_required',),
    412: ('precondition_failed', 'precondition'),
    413: ('request_entity_too_large',),
    414: ('request_uri_too_large',),
    415: ('unsupported_media_type', 'unsupported_media', 'media_type'),
    416: ('requested_range_not_satisfiable', 'requested_range', 'range_not_satisfiable'),
    417: ('expectation_failed',),
    418: ('im_a_teapot', 'teapot', 'i_am_a_teapot'),
    422: ('unprocessable_entity', 'unprocessable'),
    423: ('locked',),
    424: ('failed_dependency', 'dependency'),
    425: ('unordered_collection', 'unordered'),
    426: ('upgrade_required', 'upgrade'),
    428: ('precondition_required', 'precondition'),
    429: ('too_many_requests', 'too_many'),
    431: ('header_fields_too_large', 'fields_too_large'),
    444: ('no_response', 'none'),
    449: ('retry_with', 'retry'),
    450: ('blocked_by_windows_parental_controls', 'parental_controls'),
    451: ('unavailable_for_legal_reasons', 'legal_reasons'),
    499: ('client_closed_request',),

    # Server Error.
    500: ('internal_server_error', 'server_error', '/o\\', '✗'),
    501: ('not_implemented',),
    502: ('bad_gateway',),
    503: ('service_unavailable', 'unavailable'),
    504: ('gateway_timeout',),
    505: ('http_version_not_supported', 'http_version'),
    506: ('variant_also_negotiates',),
    507: ('insufficient_storage',),
    509: ('bandwidth_limit_exceeded', 'bandwidth'),
    510: ('not_extended',),
}
82 | 
codes = LookupDict(name='status_codes')

# Expose every alias as an attribute of ``codes``. Aliases that do not begin
# with a backslash are also exposed under their upper-cased spelling.
for (code, titles) in list(_codes.items()):
    for title in titles:
        setattr(codes, title, code)
        if title.startswith('\\'):
            continue
        setattr(codes, title.upper(), code)
90 | 


--------------------------------------------------------------------------------
/requests/structures.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 | requests.structures
  5 | ~~~~~~~~~~~~~~~~~~~
  6 | 
  7 | Data structures that power Requests.
  8 | 
  9 | """
 10 | 
import collections

try:  # Python 3.3+: the container ABCs live in collections.abc
    from collections.abc import Mapping, MutableMapping
except ImportError:  # Python 2
    from collections import Mapping, MutableMapping
 12 | 
 13 | 
 14 | class CaseInsensitiveDict(collections.MutableMapping):
 15 |     """
 16 |     A case-insensitive ``dict``-like object.
 17 | 
 18 |     Implements all methods and operations of
 19 |     ``collections.MutableMapping`` as well as dict's ``copy``. Also
 20 |     provides ``lower_items``.
 21 | 
 22 |     All keys are expected to be strings. The structure remembers the
 23 |     case of the last key to be set, and ``iter(instance)``,
 24 |     ``keys()``, ``items()``, ``iterkeys()``, and ``iteritems()``
 25 |     will contain case-sensitive keys. However, querying and contains
 26 |     testing is case insensitive::
 27 | 
 28 |         cid = CaseInsensitiveDict()
 29 |         cid['Accept'] = 'application/json'
 30 |         cid['aCCEPT'] == 'application/json'  # True
 31 |         list(cid) == ['Accept']  # True
 32 | 
 33 |     For example, ``headers['content-encoding']`` will return the
 34 |     value of a ``'Content-Encoding'`` response header, regardless
 35 |     of how the header name was originally stored.
 36 | 
 37 |     If the constructor, ``.update``, or equality comparison
 38 |     operations are given keys that have equal ``.lower()``s, the
 39 |     behavior is undefined.
 40 | 
 41 |     """
 42 |     def __init__(self, data=None, **kwargs):
 43 |         self._store = dict()
 44 |         if data is None:
 45 |             data = {}
 46 |         self.update(data, **kwargs)
 47 | 
 48 |     def __setitem__(self, key, value):
 49 |         # Use the lowercased key for lookups, but store the actual
 50 |         # key alongside the value.
 51 |         self._store[key.lower()] = (key, value)
 52 | 
 53 |     def __getitem__(self, key):
 54 |         return self._store[key.lower()][1]
 55 | 
 56 |     def __delitem__(self, key):
 57 |         del self._store[key.lower()]
 58 | 
 59 |     def __iter__(self):
 60 |         return (casedkey for casedkey, mappedvalue in self._store.values())
 61 | 
 62 |     def __len__(self):
 63 |         return len(self._store)
 64 | 
 65 |     def lower_items(self):
 66 |         """Like iteritems(), but with all lowercase keys."""
 67 |         return (
 68 |             (lowerkey, keyval[1])
 69 |             for (lowerkey, keyval)
 70 |             in self._store.items()
 71 |         )
 72 | 
 73 |     def __eq__(self, other):
 74 |         if isinstance(other, collections.Mapping):
 75 |             other = CaseInsensitiveDict(other)
 76 |         else:
 77 |             return NotImplemented
 78 |         # Compare insensitively
 79 |         return dict(self.lower_items()) == dict(other.lower_items())
 80 | 
 81 |     # Copy is required
 82 |     def copy(self):
 83 |         return CaseInsensitiveDict(self._store.values())
 84 | 
 85 |     def __repr__(self):
 86 |         return str(dict(self.items()))
 87 | 
 88 | class LookupDict(dict):
 89 |     """Dictionary lookup object."""
 90 | 
 91 |     def __init__(self, name=None):
 92 |         self.name = name
 93 |         super(LookupDict, self).__init__()
 94 | 
 95 |     def __repr__(self):
 96 |         return '<lookup \'%s\'>' % (self.name)
 97 | 
 98 |     def __getitem__(self, key):
 99 |         # We allow fall-through here, so values default to None
100 | 
101 |         return self.__dict__.get(key, None)
102 | 
103 |     def get(self, key, default=None):
104 |         return self.__dict__.get(key, default)
105 | 


--------------------------------------------------------------------------------
/search.py:
--------------------------------------------------------------------------------
 1 | import document
 2 | from urllib import urlencode
 3 | from httplib2 import Http
 4 | import json
 5 | from base64 import b64encode
 6 | import secrets
 7 | 
 8 | def document_from_query(query):
 9 | 	query_dict = {"$format": "json", "Query": "'{0}'".format(query)}
10 | 	url = "https://api.datamarket.azure.com/Bing/Search/Web?" + urlencode(query_dict)
11 | 	auth_string = b64encode("{0}:{0}".format(secrets.BING_API_KEY))
12 | 	headers = {"Authorization": "Basic " + auth_string}
13 | 	response, content = Http().request(url, "GET", headers=headers)
14 | 	results = json.loads(content)['d']['results']
15 | 	html = u"<h1>Web search for '{0}'</h1>".format(query) + u"<br/>".join([u"<a href='{0}'>{1} ({2})</a>".format(r['Url'], r['Title'], r['DisplayUrl']) for r in results])
16 | 	doc = document.Document(html = html)
17 | 	return doc
18 | 


--------------------------------------------------------------------------------
/txtfy.py:
--------------------------------------------------------------------------------
 1 | def txtfy_word(w):
 2 | 	maps = {
 3 | 	"to": "2",
 4 | 	"too": "2",
 5 | 	"you": "u",
 6 | 	"you'll": "u'll",
 7 | 	"your": "ur",
 8 | 	"for": "4",
 9 | 	"and": "&",
10 | 	"at": "@",
11 | 	"with": "w/",
12 | 	"before": "b4",
13 | 	"one": "1",
14 | 	}
15 | 	if w.lower() in maps:
16 | 		return maps[w.lower()]
17 | 	return w
18 | 
def txtfy(text):
	"""Apply txtfy_word to each space-separated token of text and rejoin with spaces."""
	return u" ".join(txtfy_word(token) for token in text.split(" "))
22 | 


--------------------------------------------------------------------------------