├── .gitignore
├── BeautifulSoup.py
├── README.md
├── app.yaml
├── browse.py
├── bs4
├── __init__.py
├── builder
│ ├── __init__.py
│ ├── _html5lib.py
│ ├── _htmlparser.py
│ └── _lxml.py
├── dammit.py
├── diagnose.py
├── element.py
├── testing.py
└── tests
│ ├── __init__.py
│ ├── test_builder_registry.py
│ ├── test_docs.py
│ ├── test_html5lib.py
│ ├── test_htmlparser.py
│ ├── test_lxml.py
│ ├── test_soup.py
│ └── test_tree.py
├── cssselect
├── __init__.py
├── parser.py
├── tests.py
└── xpath.py
├── document.py
├── document_old.py
├── favicon.ico
├── goose
├── __init__.py
├── article.py
├── cleaners.py
├── configuration.py
├── crawler.py
├── extractors.py
├── images
│ ├── __init__.py
│ ├── extractors.py
│ ├── image.py
│ └── utils.py
├── network.py
├── outputformatters.py
├── parsers.py
├── resources
│ ├── images
│ │ └── known-image-css.txt
│ └── text
│ │ ├── stopwords-ar.txt
│ │ ├── stopwords-da.txt
│ │ ├── stopwords-de.txt
│ │ ├── stopwords-en.txt
│ │ ├── stopwords-es.txt
│ │ ├── stopwords-fi.txt
│ │ ├── stopwords-fr.txt
│ │ ├── stopwords-hu.txt
│ │ ├── stopwords-id.txt
│ │ ├── stopwords-it.txt
│ │ ├── stopwords-ko.txt
│ │ ├── stopwords-nb.txt
│ │ ├── stopwords-nl.txt
│ │ ├── stopwords-no.txt
│ │ ├── stopwords-pl.txt
│ │ ├── stopwords-pt.txt
│ │ ├── stopwords-ru.txt
│ │ ├── stopwords-sv.txt
│ │ └── stopwords-zh.txt
├── text.py
├── utils
│ ├── __init__.py
│ └── encoding.py
├── version.py
└── videos
│ ├── __init__.py
│ ├── extractors.py
│ └── videos.py
├── html2text.py
├── httplib2
├── __init__.py
├── cacerts.txt
├── iri2uri.py
├── socks.py
└── test
│ ├── __init__.py
│ ├── brokensocket
│ └── socket.py
│ ├── functional
│ └── test_proxies.py
│ ├── miniserver.py
│ ├── other_cacerts.txt
│ ├── smoke_test.py
│ └── test_no_socket.py
├── index.yaml
├── instructions.html
├── main.py
├── page.html
├── parse_command.py
├── pybing
├── __init__.py
├── bing.py
├── constants.py
├── query
│ ├── __init__.py
│ ├── mixin.py
│ ├── pagable.py
│ ├── query.py
│ └── web.py
├── result.py
└── resultset.py
├── requests
├── __init__.py
├── adapters.py
├── api.py
├── auth.py
├── cacert.pem
├── certs.py
├── compat.py
├── cookies.py
├── exceptions.py
├── hooks.py
├── models.py
├── packages
│ ├── __init__.py
│ ├── chardet
│ │ ├── __init__.py
│ │ ├── big5freq.py
│ │ ├── big5prober.py
│ │ ├── chardetect.py
│ │ ├── chardistribution.py
│ │ ├── charsetgroupprober.py
│ │ ├── charsetprober.py
│ │ ├── codingstatemachine.py
│ │ ├── compat.py
│ │ ├── constants.py
│ │ ├── cp949prober.py
│ │ ├── escprober.py
│ │ ├── escsm.py
│ │ ├── eucjpprober.py
│ │ ├── euckrfreq.py
│ │ ├── euckrprober.py
│ │ ├── euctwfreq.py
│ │ ├── euctwprober.py
│ │ ├── gb2312freq.py
│ │ ├── gb2312prober.py
│ │ ├── hebrewprober.py
│ │ ├── jisfreq.py
│ │ ├── jpcntx.py
│ │ ├── langbulgarianmodel.py
│ │ ├── langcyrillicmodel.py
│ │ ├── langgreekmodel.py
│ │ ├── langhebrewmodel.py
│ │ ├── langhungarianmodel.py
│ │ ├── langthaimodel.py
│ │ ├── latin1prober.py
│ │ ├── mbcharsetprober.py
│ │ ├── mbcsgroupprober.py
│ │ ├── mbcssm.py
│ │ ├── sbcharsetprober.py
│ │ ├── sbcsgroupprober.py
│ │ ├── sjisprober.py
│ │ ├── universaldetector.py
│ │ └── utf8prober.py
│ └── urllib3
│ │ ├── __init__.py
│ │ ├── _collections.py
│ │ ├── connection.py
│ │ ├── connectionpool.py
│ │ ├── contrib
│ │ ├── __init__.py
│ │ ├── ntlmpool.py
│ │ └── pyopenssl.py
│ │ ├── exceptions.py
│ │ ├── fields.py
│ │ ├── filepost.py
│ │ ├── packages
│ │ ├── __init__.py
│ │ ├── ordered_dict.py
│ │ ├── six.py
│ │ └── ssl_match_hostname
│ │ │ ├── __init__.py
│ │ │ └── _implementation.py
│ │ ├── poolmanager.py
│ │ ├── request.py
│ │ ├── response.py
│ │ └── util
│ │ ├── __init__.py
│ │ ├── connection.py
│ │ ├── request.py
│ │ ├── response.py
│ │ ├── retry.py
│ │ ├── ssl_.py
│ │ ├── timeout.py
│ │ └── url.py
├── sessions.py
├── status_codes.py
├── structures.py
└── utils.py
├── search.py
├── txtfy.py
└── wise.py
/.gitignore:
--------------------------------------------------------------------------------
1 | secrets.py
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 |
45 | # Translations
46 | *.mo
47 | *.pot
48 |
49 | # Django stuff:
50 | *.log
51 |
52 | # Sphinx documentation
53 | docs/_build/
54 |
55 | # PyBuilder
56 | target/
57 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Astrobot
2 |
3 | `Astrobot` is an SMS service that allows you to (sort of) browse the web entirely over SMS. Here's what it looks like:
4 |
5 | 1. You: help
6 | 2. Astro-bot:"Go to wsj.com", "search wikipedia for Azerbaijan". Text the # of a link to click it.
7 | 3. You: hackerschool.com
8 | 4. Astro-bot: Hacker School is a retreat 4 programmers. Want 2 spend three months coding & learning
9 | in New York w/ smart, friendly, & intellectually curious people? [Rea
10 | 5. You: more
11 | 6. Astro-bot:d on](1) . [APPLY NOW](2) New blog post: [Paper of the Week: Error Detecting &
12 | Error Correcting Codes](3) Hacker School is… Self-directed We value in
13 | 7. You: 3 more
14 | 8. Astro-bot:trinsic motivation & self-direction, & believe people learn best when they’re free 2
15 | explore their passions & interests. Accessible We’re free 4 everyone.
16 | 9. Astro-bot:We also offer need-based grants 4 living expenses 4 people from traditionally
17 | underrepresented groups in programming. Stimulating We look 4 smart, friendly,
18 | 10. Astro-bot: self-directed, & intellectually curious people who enjoy programming & want 2 get
19 | dramatically better. Exceptional We have exceptional residents including
20 | 11. You: click 1
21 | 12. Astro-bot: We review applications on a rolling basis. u should receive a confirmation email
22 | immediately after u apply, & we will review ur application within three weeks
23 | 13. You: 2 more
24 | 14. Astro-bot:. In general, clear & concise answers are better than long ones. Most answers have a
25 | 1500 character max length, but u don’t need 2 write that much. We strongl
26 | 15. Astro-bot:y encourage u 2 read our [about page](1) , [FAQ](2) , & [User’s Manual](3) b4 applying.
27 | Choose ur batch If you’re admitted but ur preferred batch is full,
28 | 16. You: click link 2
29 | 17. Astro-bot:What's that? I don't understand. Say 'help me' for help.
30 | 18. You: click 2
31 | 19. Astro-bot: Welcome 2 an unusual experiment Hacker School is unlike the rest of the world. This
32 | guide is designed 2 help u get settled in & get the most out of ur batch.
33 | 20. You: 3 more
34 | 21. Astro-bot: 1 of the things that makes Hacker School different is that it's largely self-directed . This
35 | means u won't have someone telling u what 2 do, learn, etc, while
36 | 22. Astro-bot: you're here (though we do have a few [social rules](1) ). This self-directedness is baked
37 | into the core structure of Hacker School, & is why we don't have grad
38 | 23. Astro-bot:es, exams, curricula, or even classes. It comes from our belief that people learn best when
39 | given the freedom 2 explore what most interests them. This doesn't
40 |
41 |
42 | `Astrobot` is a Google Appengine app that supports responding to incoming messages from Twilio. You can use a live version at [astro-bot.appspot.com](http://astro-bot.appspot.com) or by texting _646-576-7688_ .
43 |
44 | ## Running your own
45 | Download the Google Appengine launcher and just drag the repository folder into it.
46 |
47 | You've also got to **create a file called secrets.py**, which includes a variable `BING_API_KEY`, holding your Bing search API key (*not* a Simple Search API key—the full API key). You can get these for free. You'll need it for web search — otherwise, leave secrets.py empty and it'll all work *except* web search.
48 |
49 |
--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
1 | application: astro-bot
2 | version: 1
3 | runtime: python27
4 | api_version: 1
5 | threadsafe: yes
6 |
7 | handlers:
8 | - url: /favicon\.ico
9 | static_files: favicon.ico
10 | upload: favicon\.ico
11 |
12 | - url: .*
13 | script: main.app
14 |
15 | libraries:
16 | - name: webapp2
17 | version: "2.5.2"
18 | - name: lxml
19 | version: "latest"
20 | - name: PIL
21 | version: "latest"
22 |
--------------------------------------------------------------------------------
/browse.py:
--------------------------------------------------------------------------------
1 | from wise import Phrase, parse_phrase
2 | import parse_command
3 | import document
4 | import urllib
5 | import search
6 |
def interact(query, state):
    """Run one incoming text command against the user's browsing session.

    Args:
        query: Raw text of the incoming message.
        state: Mutable per-user dictionary. The session's
            ``document.BrowserState`` lives under the ``'BrowserState'`` key
            and is created here on first use.

    Returns:
        A list of strings to send back, one per outgoing message.
    """
    import logging  # local import keeps this block self-contained

    no_page = ["You haven't loaded any page yet."]

    parsed = parse_command.parse_command(query)
    logging.debug("parsed command: %r", parsed)

    if 'BrowserState' not in state:
        state['BrowserState'] = document.BrowserState()
    bstate = state['BrowserState']

    # Keep the history bounded before handling the command.
    bstate.clean_up()

    intent = parsed.intent

    if intent == 'url':
        bstate.navigate_to_url(parsed.get("*url", None))
        return bstate.get_n_messages(1)

    if intent in ('more_text', 'previous_text'):
        if not bstate.frame_stack:
            return no_page
        # Never send more than 7 messages for a single request.
        count = min(7, int(parsed.get('*number', '1')))
        return bstate.get_n_messages(
            count, backwards=(intent == 'previous_text'))

    if intent == 'back_to_top':
        if not bstate.frame_stack:
            return no_page
        bstate.frame_stack[-1].offset = 0
        return bstate.get_n_messages(1)

    if intent == 'navigate' and parsed.get('*number', None):
        if parsed.get('on_last_page', False):
            bstate.back()
        if not bstate.frame_stack:
            return no_page
        number = int(parsed.get('*number', '0'))
        links = bstate.frame_stack[-1].document.links
        # Link numbers shown to the user are 1-based. Reject 0 explicitly:
        # it would otherwise index -1 and silently open the last link.
        if not 1 <= number <= len(links):
            return [u"There's no link #{0} on this page.".format(number)]
        bstate.navigate_to_url(links[number - 1])
        return bstate.get_n_messages(1)

    if intent == 'help':
        bstate.navigate_to_url('http://astro-bot.appspot.com/instructions')
        return bstate.get_n_messages(1)

    if intent == 'back':
        # Going back needs a page underneath the current one; popping the
        # only frame would leave nothing to resend.
        if len(bstate.frame_stack) < 2:
            return ["There's no previous page to go back to."]
        bstate.back()
        return bstate.resend_current_place()

    if intent == 'contents':
        if not bstate.frame_stack:
            return no_page
        bstate.go_to_contents()
        return bstate.get_n_messages(1)

    if intent == 'search':
        terms = parsed.get("~query", "")
        if parsed.get('search_source/wikipedia', False):
            url = ("http://en.wikipedia.org/w/index.php?search="
                   + urllib.quote_plus(terms))
            bstate.navigate_to_url(url)
        else:
            results = search.document_from_query(terms)
            bstate.frame_stack.append(document.Frame(results))
        return bstate.get_n_messages(1)

    if intent == 'whereami':
        if not bstate.frame_stack:
            return no_page
        current = bstate.frame_stack[-1].document
        # Unicode templates: a byte-string .format() raises on non-ASCII
        # URLs or titles under Python 2.
        url_string = u" ({0})".format(current.url) if current.url else u""
        return [u'You\'re reading "{0}"{1}'.format(current.title, url_string)]

    return ["What's that? I don't understand. Say 'help me' for help."]
61 |
--------------------------------------------------------------------------------
/bs4/tests/__init__.py:
--------------------------------------------------------------------------------
1 | "The beautifulsoup tests."
2 |
--------------------------------------------------------------------------------
/bs4/tests/test_docs.py:
--------------------------------------------------------------------------------
1 | "Test harness for doctests."
2 |
3 | # pylint: disable-msg=E0611,W0142
4 |
5 | __metaclass__ = type
6 | __all__ = [
7 | 'additional_tests',
8 | ]
9 |
10 | import atexit
11 | import doctest
12 | import os
13 | #from pkg_resources import (
14 | # resource_filename, resource_exists, resource_listdir, cleanup_resources)
15 | import unittest
16 |
17 | DOCTEST_FLAGS = (
18 | doctest.ELLIPSIS |
19 | doctest.NORMALIZE_WHITESPACE |
20 | doctest.REPORT_NDIFF)
21 |
22 |
23 | # def additional_tests():
24 | # "Run the doc tests (README.txt and docs/*, if any exist)"
25 | # doctest_files = [
26 | # os.path.abspath(resource_filename('bs4', 'README.txt'))]
27 | # if resource_exists('bs4', 'docs'):
28 | # for name in resource_listdir('bs4', 'docs'):
29 | # if name.endswith('.txt'):
30 | # doctest_files.append(
31 | # os.path.abspath(
32 | # resource_filename('bs4', 'docs/%s' % name)))
33 | # kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
34 | # atexit.register(cleanup_resources)
35 | # return unittest.TestSuite((
36 | # doctest.DocFileSuite(*doctest_files, **kwargs)))
37 |
--------------------------------------------------------------------------------
/bs4/tests/test_html5lib.py:
--------------------------------------------------------------------------------
1 | """Tests to ensure that the html5lib tree builder generates good trees."""
2 |
3 | import warnings
4 |
5 | try:
6 | from bs4.builder import HTML5TreeBuilder
7 | HTML5LIB_PRESENT = True
8 | except ImportError, e:
9 | HTML5LIB_PRESENT = False
10 | from bs4.element import SoupStrainer
11 | from bs4.testing import (
12 | HTML5TreeBuilderSmokeTest,
13 | SoupTest,
14 | skipIf,
15 | )
16 |
17 | @skipIf(
18 | not HTML5LIB_PRESENT,
19 | "html5lib seems not to be present, not testing its tree builder.")
20 | class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
21 | """See ``HTML5TreeBuilderSmokeTest``."""
22 |
23 | @property
24 | def default_builder(self):
25 | return HTML5TreeBuilder()
26 |
27 | def test_soupstrainer(self):
28 | # The html5lib tree builder does not support SoupStrainers.
29 | strainer = SoupStrainer("b")
30 | markup = "
A bold statement.
"
31 | with warnings.catch_warnings(record=True) as w:
32 | soup = self.soup(markup, parse_only=strainer)
33 | self.assertEqual(
34 | soup.decode(), self.document_for(markup))
35 |
36 | self.assertTrue(
37 | "the html5lib tree builder doesn't support parse_only" in
38 | str(w[0].message))
39 |
40 | def test_correctly_nested_tables(self):
41 | """html5lib inserts tags where other parsers don't."""
42 | markup = (''
43 | ''
44 | "Here's another table:"
45 | ' | ')
48 |
49 | self.assertSoupEquals(
50 | markup,
51 | 'Here\'s another table:'
52 | ''
53 | ' |
')
54 |
55 | self.assertSoupEquals(
56 | "Foo |
"
57 | "Bar |
"
58 | "Baz |
")
59 |
60 | def test_xml_declaration_followed_by_doctype(self):
61 | markup = '''
62 |
63 |
64 |
65 |
66 |
67 | foo
68 |
69 | '''
70 | soup = self.soup(markup)
71 | # Verify that we can reach the tag; this means the tree is connected.
72 | self.assertEqual(b"
foo
", soup.p.encode())
73 |
74 | def test_reparented_markup(self):
75 | markup = 'foo
\nbar
'
76 | soup = self.soup(markup)
77 | self.assertEqual(u"foo
\nbar
", soup.body.decode())
78 | self.assertEqual(2, len(soup.find_all('p')))
79 |
80 |
81 | def test_reparented_markup_ends_with_whitespace(self):
82 | markup = 'foo
\nbar
\n'
83 | soup = self.soup(markup)
84 | self.assertEqual(u"foo
\nbar
\n", soup.body.decode())
85 | self.assertEqual(2, len(soup.find_all('p')))
86 |
--------------------------------------------------------------------------------
/bs4/tests/test_htmlparser.py:
--------------------------------------------------------------------------------
1 | """Tests to ensure that the html.parser tree builder generates good
2 | trees."""
3 |
4 | from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
5 | from bs4.builder import HTMLParserTreeBuilder
6 |
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Smoke tests run with ``HTMLParserTreeBuilder`` as the builder."""

    @property
    def default_builder(self):
        # A new builder instance is created on every access.
        builder = HTMLParserTreeBuilder()
        return builder

    # html.parser can't handle namespaced doctypes, so the two tests below
    # are deliberately empty (presumably overriding inherited smoke tests).

    def test_namespaced_system_doctype(self):
        pass

    def test_namespaced_public_doctype(self):
        pass
20 |
--------------------------------------------------------------------------------
/bs4/tests/test_lxml.py:
--------------------------------------------------------------------------------
1 | """Tests to ensure that the lxml tree builder generates good trees."""
2 |
3 | import re
4 | import warnings
5 |
6 | try:
7 | import lxml.etree
8 | LXML_PRESENT = True
9 | LXML_VERSION = lxml.etree.LXML_VERSION
10 | except ImportError, e:
11 | LXML_PRESENT = False
12 | LXML_VERSION = (0,)
13 |
14 | if LXML_PRESENT:
15 | from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
16 |
17 | from bs4 import (
18 | BeautifulSoup,
19 | BeautifulStoneSoup,
20 | )
21 | from bs4.element import Comment, Doctype, SoupStrainer
22 | from bs4.testing import skipIf
23 | from bs4.tests import test_htmlparser
24 | from bs4.testing import (
25 | HTMLTreeBuilderSmokeTest,
26 | XMLTreeBuilderSmokeTest,
27 | SoupTest,
28 | skipIf,
29 | )
30 |
31 | @skipIf(
32 | not LXML_PRESENT,
33 | "lxml seems not to be present, not testing its tree builder.")
34 | class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
35 | """See ``HTMLTreeBuilderSmokeTest``."""
36 |
37 | @property
38 | def default_builder(self):
39 | return LXMLTreeBuilder()
40 |
41 | def test_out_of_range_entity(self):
42 | self.assertSoupEquals(
43 | "foobar
", "foobar
")
44 | self.assertSoupEquals(
45 | "foobar
", "foobar
")
46 | self.assertSoupEquals(
47 | "foobar
", "foobar
")
48 |
49 | # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
50 | # test if an old version of lxml is installed.
51 |
52 | @skipIf(
53 | not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
54 | "Skipping doctype test for old version of lxml to avoid segfault.")
55 | def test_empty_doctype(self):
56 | soup = self.soup("")
57 | doctype = soup.contents[0]
58 | self.assertEqual("", doctype.strip())
59 |
60 | def test_beautifulstonesoup_is_xml_parser(self):
61 | # Make sure that the deprecated BSS class uses an xml builder
62 | # if one is installed.
63 | with warnings.catch_warnings(record=True) as w:
64 | soup = BeautifulStoneSoup("")
65 | self.assertEqual(u"", unicode(soup.b))
66 | self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
67 |
68 | def test_real_xhtml_document(self):
69 | """lxml strips the XML definition from an XHTML doc, which is fine."""
70 | markup = b"""
71 |
72 |
73 | Hello.
74 | Goodbye.
75 | """
76 | soup = self.soup(markup)
77 | self.assertEqual(
78 | soup.encode("utf-8").replace(b"\n", b''),
79 | markup.replace(b'\n', b'').replace(
80 | b'', b''))
81 |
82 |
@skipIf(
    not LXML_PRESENT,
    "lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    """See ``XMLTreeBuilderSmokeTest``."""

    @property
    def default_builder(self):
        # A new XML builder instance is created on every access.
        builder = LXMLTreeBuilderForXML()
        return builder
92 |
--------------------------------------------------------------------------------
/cssselect/__init__.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | """
3 | CSS Selectors based on XPath
4 | ============================
5 |
6 | This module supports selecting XML/HTML elements based on CSS selectors.
7 | See the `CSSSelector` class for details.
8 |
9 |
10 | :copyright: (c) 2007-2012 Ian Bicking and contributors.
11 | See AUTHORS for more details.
12 | :license: BSD, see LICENSE for more details.
13 |
14 | """
15 |
16 | from cssselect.parser import (parse, Selector, FunctionalPseudoElement,
17 | SelectorError, SelectorSyntaxError)
18 | from cssselect.xpath import GenericTranslator, HTMLTranslator, ExpressionError
19 |
20 |
21 | VERSION = '0.9.1'
22 | __version__ = VERSION
23 |
--------------------------------------------------------------------------------
/document.py:
--------------------------------------------------------------------------------
1 | import urllib, urllib2
2 | import bs4
3 | from txtfy import txtfy
4 | from html2text import html2doc
5 | import urlparse
6 |
7 | SMS_LEN = 160
8 |
def normalize_url(url):
    """Return *url* with ``http://`` prepended unless it is already http(s).

    Surrounding whitespace is stripped and the scheme is compared
    case-insensitively, so ``HTTP://example.com`` is left alone instead of
    becoming ``http://HTTP://example.com``.
    """
    url = url.strip()
    scheme = url.split("://")[0].lower()
    if scheme not in ('http', 'https'):
        url = 'http://' + url
    return url
14 |
15 | # opera mini for dumbphones:
16 | USER_AGENT = "Opera/9.80 (J2ME/MIDP; Opera Mini/4.2.13337/34.818; U; en) Presto/2.8.119 Version/11.10"
17 |
def get_content(url):
    """Fetch *url* with the module's ``USER_AGENT`` and return ``(html, title)``.

    ``html`` is the raw response body. ``title`` is the text of the page's
    ``<title>`` element, or ``u""`` when the page has none (previously this
    raised ``AttributeError``).
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', USER_AGENT)]
    response = opener.open(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    title_tag = bs4.BeautifulSoup(html).title
    title = title_tag.get_text() if title_tag is not None else u""
    return (html, title)
24 |
def get_content_ip(url):
    """Fetch *url* through Instapaper's mobile view; return ``(story, title)``.

    ``story`` is the markup of the element with ``id="story"`` as unicode,
    and ``title`` is the text of the ``<title>`` element. Either one is
    ``u""`` when the element is missing; previously a missing story came
    back as the text ``u"None"`` and a missing title raised
    ``AttributeError``.
    """
    url = "http://instapaper.com/m?u=" + urllib.quote_plus(url)
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    soup = bs4.BeautifulSoup(html)
    story = soup.find(id='story')
    title_tag = soup.title
    story_html = unicode(story) if story is not None else u""
    title = title_tag.get_text() if title_tag is not None else u""
    return story_html, title
31 |
32 | NO_URL = ""
33 |
class Document(object):
    """A page converted by ``html2doc`` into text, links and headers.

    Attributes:
        url: Normalized URL the page was fetched from, or ``None`` when the
            document was built from supplied HTML.
        title: Page title; ``u""`` when built from supplied HTML.
        text, links, headers: The three values returned by ``html2doc``.
            Elsewhere in this module ``text`` is sliced by character
            offset, ``links`` is indexed by link number, and ``headers``
            is iterated as ``(heading, offset)`` pairs.
    """
    url = None
    # Class-level default so html-only documents still have a title
    # (previously reading .title on them raised AttributeError).
    title = u""

    def __init__(self, url=None, html=None):
        """Build the document from *url* (fetched) or from *html*.

        When *url* is given it takes precedence and *html* is ignored.

        Raises:
            ValueError: if neither *url* nor *html* is supplied.
        """
        import logging  # local import keeps this block self-contained

        if url:
            url = normalize_url(url)
            html, self.title = get_content(url)
            self.url = url
        if html is None:
            raise ValueError("Document needs either a url or html")
        if not isinstance(html, unicode):
            # Pages are not guaranteed to be valid UTF-8; substitute
            # undecodable bytes rather than failing the whole request.
            html = html.decode('utf-8', 'replace')
        self.text, self.links, self.headers = html2doc(
            html, baseurl=url if url else "")
        logging.debug("HEADERS %r", self.headers)
45 |
class Frame(object):
    """One history entry: a document plus the current read offset."""

    def __init__(self, doc):
        # A freshly opened document is read from offset 0.
        self.offset = 0
        self.document = doc
50 |
class BrowserState(object):
    """Browsing session state: a history stack of ``Frame`` objects.

    ``frame_stack[-1]`` is the page currently being read. Each frame's
    ``offset`` is a character position in its document's ``text``.
    """

    # Number of most-recent frames kept by clean_up().
    MAX_FRAMES = 5

    def __init__(self):
        self.frame_stack = []

    def clean_up(self):
        """Drop the oldest frames so at most ``MAX_FRAMES`` remain."""
        if len(self.frame_stack) > self.MAX_FRAMES:
            self.frame_stack = self.frame_stack[-self.MAX_FRAMES:]

    def navigate_to_url(self, url):
        """Open *url* as a new frame, or follow a ``go-to-offset`` link.

        A ``go-to-offset://N`` URL pops the current frame and sets the
        offset of the frame underneath it to ``N``.
        """
        parsed = urlparse.urlparse(url)
        if parsed.scheme == 'go-to-offset':
            self.back()
            if self.frame_stack:
                self.frame_stack[-1].offset = int(parsed.netloc)
        else:
            self.frame_stack.append(Frame(Document(url)))

    def back(self):
        """Pop the current frame, if there is one."""
        if self.frame_stack:
            self.frame_stack = self.frame_stack[:-1]

    def resend_current_place(self):
        """Step back one message length and return that message again."""
        if self.frame_stack:
            frame = self.frame_stack[-1]
            frame.offset = max(0, frame.offset - SMS_LEN)
        return self.get_n_messages(1)

    def go_to_contents(self):
        """Push a generated page that links to each heading of the current page.

        Does nothing when no page is loaded.
        """
        if not self.frame_stack:
            return
        current = self.frame_stack[-1].document
        # NOTE(review): the markup in these literals was lost in the copy
        # this edit was written against. It is reconstructed from the
        # go-to-offset handling in navigate_to_url; confirm against the
        # repository version.
        # Unicode templates: a byte-string .format() raises on non-ASCII
        # titles under Python 2.
        heading_links = u"<br/>".join(
            u"<a href='go-to-offset://{0}'>{1}</a>".format(offset, heading)
            for heading, offset in current.headers)
        page_title = u"Headings on {0}".format(current.title)
        html = u"<h1>{0}</h1>".format(page_title) + heading_links
        doc = Document(html=html)
        # Give the generated page a title so code that reads .title on the
        # current document keeps working.
        doc.title = page_title
        self.frame_stack.append(Frame(doc))

    def get_n_messages(self, n, backwards=False):
        """Return up to *n* chunks of at most ``SMS_LEN`` characters.

        Reading forwards starts at the current offset and advances it past
        each chunk. Reading backwards first steps back one chunk, then
        returns the chunks before that point (nearest first), and finally
        moves the offset one chunk forward again so that it sits just after
        the last chunk returned.

        Returns ``[""]`` when no page is loaded or when reading forwards
        from the end of the text.
        """
        if not self.frame_stack:
            return [""]
        frame = self.frame_stack[-1]
        text = frame.document.text

        if backwards:
            frame.offset = max(0, frame.offset - SMS_LEN)
        elif frame.offset >= len(text):
            return [""]

        messages = []
        for _ in xrange(n):
            start = frame.offset
            if backwards:
                start = max(0, start - SMS_LEN)
            end = min(len(text), start + SMS_LEN)
            if end == start:
                break
            messages.append(text[start:end])
            frame.offset = start if backwards else end
            if frame.offset == 0:
                # Reached the top of the document.
                break
        if backwards:
            frame.offset = min(len(text), frame.offset + SMS_LEN)
        return messages
103 |
104 |
--------------------------------------------------------------------------------
/document_old.py:
--------------------------------------------------------------------------------
1 | import urllib, urllib2
2 | import bs4
3 | from txtfy import txtfy
4 |
5 | SMS_LEN = 160
6 |
def normalize_url(url):
    """Return *url* with ``http://`` prepended unless it is already http(s).

    Surrounding whitespace is stripped and the scheme is compared
    case-insensitively, so ``HTTP://example.com`` is left alone instead of
    becoming ``http://HTTP://example.com``.
    """
    url = url.strip()
    scheme = url.split("://")[0].lower()
    if scheme not in ('http', 'https'):
        url = 'http://' + url
    return url
12 |
def get_content(url):
    """Fetch *url* and return ``(html, title)``.

    ``html`` is the raw response body. ``title`` is the text of the page's
    ``<title>`` element, or ``u""`` when the page has none (previously this
    raised ``AttributeError``).
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    title_tag = bs4.BeautifulSoup(html).title
    title = title_tag.get_text() if title_tag is not None else u""
    return (html, title)
17 |
def get_content_ip(url):
    """Fetch *url* through Instapaper's mobile view; return ``(story, title)``.

    ``story`` is the markup of the element with ``id="story"`` as unicode,
    and ``title`` is the text of the ``<title>`` element. Either one is
    ``u""`` when the element is missing; previously a missing story came
    back as the text ``u"None"`` and a missing title raised
    ``AttributeError``.
    """
    url = "http://instapaper.com/m?u=" + urllib.quote_plus(url)
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    soup = bs4.BeautifulSoup(html)
    story = soup.find(id='story')
    title_tag = soup.title
    story_html = unicode(story) if story is not None else u""
    title = title_tag.get_text() if title_tag is not None else u""
    return story_html, title
24 |
25 | NO_URL = ""
26 |
class Document(object):
    """A page flattened to plain text by walking its BeautifulSoup tree.

    Attributes:
        url: The URL passed in (not normalized), or ``None``.
        title: Page title when fetched from a URL; ``u""`` otherwise.
        text: The accumulated plain text of the page.
        links: ``href`` values in document order. A link is rendered in
            ``text`` as ``[label](N)`` where ``N`` is its 1-based position
            in this list.
        headers: ``(heading_text, offset)`` pairs for h1-h4 elements,
            where ``offset`` is ``len(text)`` when the heading was reached.
    """
    # Class-level default so html-only documents still have a title.
    title = u""

    def __init__(self, url=None, html=None):
        """Build the document from *url* (fetched) or from *html*."""
        self.url = url
        if url:
            html, self.title = get_content(normalize_url(url))
        soup = bs4.BeautifulSoup(html)
        self.text = u""
        self.links = []
        self.headers = []
        # Elements whose contents must not appear in the text. The
        # original built this set but never consulted it.
        ignore_tags = set(['head', 'script', 'style'])

        def break_line():
            # End the current line unless the text is empty or already
            # ends with a newline.
            if len(self.text) > 0 and self.text[-1] != '\n':
                self.text += '\n'

        def break_word():
            # Separate from the previous word unless already separated.
            if len(self.text) > 0 and self.text[-1] not in " \n":
                self.text += " "

        def emit_text(t):
            break_word()
            self.text += txtfy(t)

        def traverse(tag):
            if tag.name in ignore_tags:
                return
            if tag.name == 'a' and tag.has_attr('href'):
                self.links.append(tag['href'])
                emit_text(u'[{0}]({1}) '.format(tag.get_text(), len(self.links)))
            elif tag.name in ['h1', 'h2', 'h3', 'h4']:
                break_line()
                self.headers.append((tag.get_text(), len(self.text)))
                process_contents(tag)
                break_line()
            elif tag.name in ['li', 'p']:
                break_line()
                process_contents(tag)
                break_line()
            else:
                process_contents(tag)

        def process_contents(tag):
            for child in tag.contents:
                if isinstance(child, bs4.NavigableString):
                    emit_text(unicode(child))
                elif hasattr(child, 'name'):
                    traverse(child)

        traverse(soup)
        break_line()
        emit_text("")
70 |
class Frame(object):
    """A document paired with the offset reached while reading it."""

    def __init__(self, doc):
        # Reading starts at offset 0.
        self.offset = 0
        self.document = doc
75 |
class BrowserState(object):
    """Browsing session state: a history stack of ``Frame`` objects.

    The last element of ``frame_stack`` is the page currently being read.
    """

    def __init__(self):
        self.frame_stack = []

    def clean_up(self):
        """Keep only the five most recent frames."""
        excess = len(self.frame_stack) - 5
        if excess > 0:
            self.frame_stack = self.frame_stack[excess:]

    def navigate_to_url(self, url):
        """Load *url* and push it as the current frame."""
        page = Document(url)
        self.frame_stack.append(Frame(page))

    def back(self):
        """Pop the current frame, if there is one."""
        if self.frame_stack:
            self.frame_stack = self.frame_stack[:-1]

    def resend_current_place(self):
        """Step back one message length and return that message again."""
        frame = self.frame_stack[-1]
        frame.offset = max(0, frame.offset - SMS_LEN)
        return self.get_n_messages(1)

    def get_n_messages(self, n):
        """Return up to *n* chunks of at most ``SMS_LEN`` characters.

        Reading starts at the current frame's offset, which is advanced
        past each chunk. Returns ``[""]`` when the offset is already at or
        beyond the end of the text.
        """
        frame = self.frame_stack[-1]
        text = frame.document.text
        if frame.offset >= len(text):
            return [""]
        chunks = []
        for _ in xrange(n):
            begin = frame.offset
            end = min(len(text), begin + SMS_LEN)
            if end == begin:
                break
            chunks.append(text[begin:end])
            frame.offset = end
        return chunks
108 |
109 |
--------------------------------------------------------------------------------
/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/favicon.ico
--------------------------------------------------------------------------------
/goose/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """\
3 | This is a python port of "Goose" originally licensed to Gravity.com
4 | under one or more contributor license agreements. See the NOTICE file
5 | distributed with this work for additional information
6 | regarding copyright ownership.
7 |
8 | Python port was written by Xavier Grangier for Recrutae
9 |
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License. You may obtain a copy of the License at
14 |
15 | http://www.apache.org/licenses/LICENSE-2.0
16 |
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | import os
24 | import platform
25 | from tempfile import mkstemp
26 |
27 | from goose.version import version_info, __version__
28 | from goose.configuration import Configuration
29 | from goose.crawler import CrawlCandidate
30 | from goose.crawler import Crawler
31 |
32 |
class Goose(object):
    """Entry point for article extraction.

    Holds a configuration object and hands crawl requests to a ``Crawler``.
    """

    def __init__(self, config=None):
        """Store *config* (or a default ``Configuration``) and validate it.

        *config* may be a ``Configuration`` instance or a plain ``dict`` of
        option names to values; see ``extend_config``.
        """
        self.config = config or Configuration()
        self.extend_config()
        self.initialize()

    def extend_config(self):
        """Turn a ``dict`` config into a ``Configuration`` instance.

        Only keys that name an existing ``Configuration`` attribute are
        applied; other keys are ignored. A non-dict config is left as is.
        """
        if isinstance(self.config, dict):
            config = Configuration()
            for k, v in self.config.items():
                if hasattr(config, k):
                    setattr(config, k, v)
            self.config = config

    def extract(self, url=None, raw_html=None):
        """\
        Main method to extract an article object from a URL,
        pass in a url and get back a Article
        """
        cc = CrawlCandidate(self.config, url, raw_html)
        return self.crawl(cc)

    def shutdown_network(self):
        """Do nothing."""
        pass

    def crawl(self, crawl_candiate):
        """Run a new ``Crawler`` on *crawl_candiate* and return its result."""
        crawler = Crawler(self.config)
        article = crawler.crawl(crawl_candiate)
        return article

    def initialize(self):
        """Make sure the local storage directory exists and is writable.

        Does nothing unless both ``config.local_storage_path`` and
        ``config.enable_image_fetching`` are set.

        Raises:
            Exception: if the directory is missing after trying to create
                it, or a temporary file cannot be created in it.
        """
        storage_path = self.config.local_storage_path
        # we don't need to go further if image extractor or
        # local_storage is not set
        if not storage_path or not self.config.enable_image_fetching:
            return

        if not os.path.isdir(storage_path):
            os.makedirs(storage_path)

        if not os.path.isdir(storage_path):
            raise Exception(storage_path +
                " directory does not seem to exist, "
                "you need to set this for image processing downloads"
            )

        # Probe writability by creating and removing a temporary file.
        # mkstemp reports an unwritable directory with OSError, so the call
        # has to sit inside the try and OSError has to be caught too.
        try:
            handle, path = mkstemp(dir=storage_path)
            os.close(handle)
            os.remove(path)
        except (IOError, OSError):
            raise Exception(storage_path +
                " directory is not writeble, "
                "you need to set this for image processing downloads"
            )
95 |
--------------------------------------------------------------------------------
/goose/article.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """\
3 | This is a python port of "Goose" originally licensed to Gravity.com
4 | under one or more contributor license agreements. See the NOTICE file
5 | distributed with this work for additional information
6 | regarding copyright ownership.
7 |
8 | Python port was written by Xavier Grangier for Recrutae
9 |
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License. You may obtain a copy of the License at
14 |
15 | http://www.apache.org/licenses/LICENSE-2.0
16 |
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 |
24 |
class Article(object):
    """Container for the data gathered about one article.

    A new instance holds only empty defaults; nothing in this class
    fills the fields in.
    """

    def __init__(self):
        # Article title.
        self.title = None

        # Plain text of the article with HTML and formatting stripped,
        # paragraphs separated by newlines. Usually the field you want.
        self.cleaned_text = u""

        # Fields read from the HTML source's meta information, in order:
        # description, language, favicon, keywords.
        self.meta_description = self.meta_lang = u""
        self.meta_favicon = self.meta_keywords = u""

        # Canonical link, when one is found in the meta data.
        self.canonical_link = u""

        # Domain of the article being parsed.
        self.domain = u""

        # Element considered the best candidate for the main article body.
        self.top_node = None

        # Image object believed to represent the article.
        self.top_image = None

        # Tags that may have appeared in the article; these are not the
        # meta keywords.
        self.tags = set()

        # Movies found on the page (youtube, vimeo, ...).
        self.movies = []

        # URL the content is finally fetched from (expanded if needed).
        self.final_url = u""

        # MD5 hash of the url, used for identification tasks.
        self.link_hash = ""

        # Raw HTML exactly as it came from the network connection.
        self.raw_html = u""

        # The lxml Document object.
        self.doc = None

        # Untouched document built from the original HTML, before any
        # cleaning is applied.
        # NOTE(review): the original comment calls this a "JSoup document",
        # wording inherited from the Java version -- confirm the actual type.
        self.raw_doc = None

        # Publish date of the article, when it could be determined.
        self.publish_date = None

        # Free-form bucket where consumers of goose can store their own
        # custom extractions.
        self.additional_data = {}
97 |
--------------------------------------------------------------------------------
/goose/configuration.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """\
3 | This is a python port of "Goose" originally licensed to Gravity.com
4 | under one or more contributor license agreements. See the NOTICE file
5 | distributed with this work for additional information
6 | regarding copyright ownership.
7 |
8 | Python port was written by Xavier Grangier for Recrutae
9 |
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License. You may obtain a copy of the License at
14 |
15 | http://www.apache.org/licenses/LICENSE-2.0
16 |
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | import os
24 | import tempfile
25 | from goose.text import StopWords
26 | from goose.parsers import Parser
27 | from goose.parsers import ParserSoup
28 | from goose.version import __version__
29 |
HTTP_DEFAULT_TIMEOUT = 30


class Configuration(object):
    """Holds the tunable options of goose, each with a default value."""

    def __init__(self):
        # Minimum size, in bytes, an image must have to be accepted. This
        # filters out things like the small author pictures that often
        # sit at the beginning of an article.
        self.images_min_bytes = 4500

        # Set to False if you do not care about getting images.
        self.enable_image_fetching = True

        # When True, goose tries to find the meta language and use the
        # matching stopwords dictionary. Set to False to force the
        # article language to ``target_language``.
        self.use_meta_language = True

        # Default language: the fallback, and the language used when
        # ``use_meta_language`` is False.
        self.target_language = 'en'

        # Default stopwords class.
        self.stopwords_class = StopWords

        # Paths to the ImageMagick ``convert`` and ``identify``
        # executables (the MacPorts default locations on a Mac).
        self.imagemagick_convert_path = "/opt/local/bin/convert"
        self.imagemagick_identify_path = "/opt/local/bin/identify"

        # User agent sent with the web requests made to extract an
        # article. A browser-like alternative that was previously used:
        #   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2)"
        #   " AppleWebKit/534.52.7 (KHTML, like Gecko) "
        #   "Version/5.1.2 Safari/534.52.7"
        self.browser_user_agent = 'Goose/%s' % __version__

        # Debug mode: enable to get additional debugging information
        # on stdout.
        self.debug = False

        # TODO: publish date extractor (see the accessors below).
        self.extract_publishdate = None

        # TODO: additional data extractor (see the accessors below).
        self.additional_data_extractor = None

        # Parser type; 'lxml' selects Parser, anything else ParserSoup.
        self.parser_class = 'lxml'

        # Local storage path, a 'goose' directory under the system
        # temporary directory by default.
        temp_root = tempfile.gettempdir()
        self.local_storage_path = os.path.join(temp_root, 'goose')

        # HTTP timeout.
        self.http_timeout = HTTP_DEFAULT_TIMEOUT

    @staticmethod
    def _checked(extractor):
        """Return ``extractor`` unchanged, or raise ValueError if it is falsy."""
        if not extractor:
            raise ValueError("extractor must not be null!")
        return extractor

    def get_parser(self):
        """Return the parser class selected by ``parser_class``."""
        if self.parser_class == 'lxml':
            return Parser
        return ParserSoup

    def get_publishdate_extractor(self):
        """Return the configured publish date extractor (None by default)."""
        return self.extract_publishdate

    def set_publishdate_extractor(self, extractor):
        """\
        Pass in to extract article publish dates.
        @param extractor a concrete instance of PublishDateExtractor
        Raises ValueError when the extractor is falsy.
        """
        self.extract_publishdate = self._checked(extractor)

    def get_additionaldata_extractor(self):
        """Return the configured additional data extractor (None by default)."""
        return self.additional_data_extractor

    def set_additionaldata_extractor(self, extractor):
        """\
        Pass in to extract any additional data not defined within
        @param extractor a concrete instance of AdditionalDataExtractor
        Raises ValueError when the extractor is falsy.
        """
        self.additional_data_extractor = self._checked(extractor)
122 |
--------------------------------------------------------------------------------
/goose/images/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/goose/images/__init__.py
--------------------------------------------------------------------------------
/goose/images/image.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """\
3 | This is a python port of "Goose" originally licensed to Gravity.com
4 | under one or more contributor license agreements. See the NOTICE file
5 | distributed with this work for additional information
6 | regarding copyright ownership.
7 |
8 | Python port was written by Xavier Grangier for Recrutae
9 |
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License. You may obtain a copy of the License at
14 |
15 | http://www.apache.org/licenses/LICENSE-2.0
16 |
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 |
24 |
class Image(object):
    """Describes the image chosen to represent an article."""

    def __init__(self):
        # Element node of the image we think is top dog.
        self.top_image_node = None

        # The src of the image.
        self.src = ""

        # How confident we are in this image extraction; generally the
        # more images there are, the less confident.
        self.confidence_score = 0.0

        # Height of the image in pixels.
        self.height = 0

        # Width of the image in pixels.
        self.width = 0

        # Which kind of image extraction was used:
        # bestGuess, linkTag, openGraph tags?
        self.extraction_type = "NA"

        # Size of the image in bytes. A plain ``0`` replaces the former
        # ``long(0)``: ``long`` does not exist on Python 3, and on
        # Python 2 ints are promoted to long automatically when needed.
        self.bytes = 0

    def get_src(self):
        """Return the src of the image."""
        return self.src
53 |
54 |
class ImageDetails(object):
    """Width, height and mime type of an image, with simple accessors."""

    def __init__(self):
        # Width and height start at 0; the mime type (JPEG / PNG ...)
        # is unknown until it is set.
        self.width, self.height, self.mime_type = 0, 0, None

    # -- width ---------------------------------------------------------

    def get_width(self):
        """Return the width of the image."""
        return self.width

    def set_width(self, width):
        """Store the width of the image."""
        self.width = width

    # -- height --------------------------------------------------------

    def get_height(self):
        """Return the height of the image."""
        return self.height

    def set_height(self, height):
        """Store the height of the image."""
        self.height = height

    # -- mime type -----------------------------------------------------

    def get_mime_type(self):
        """Return the mime type of the image (None until set)."""
        return self.mime_type

    def set_mime_type(self, mime_type):
        """Store the mime type of the image."""
        self.mime_type = mime_type
85 |
86 |
class LocallyStoredImage(object):
    """Record describing an image that has been stored on local disk.

    All arguments are optional and are stored unchanged as attributes of
    the same name: ``src``, ``local_filename``, ``link_hash``, ``bytes``
    (size of the file), ``file_extension``, ``height`` and ``width``.
    """

    # The ``bytes`` default is a plain ``0`` rather than ``long(0)``:
    # default values are evaluated when the class is defined, so the
    # Python-2-only ``long`` made this module fail to import on Python 3.
    # The parameter keeps its name (it shadows the builtin) because
    # callers pass it by keyword.
    def __init__(self, src='', local_filename='',
                 link_hash='', bytes=0, file_extension='', height=0, width=0):
        self.src = src
        self.local_filename = local_filename
        self.link_hash = link_hash
        self.bytes = bytes
        self.file_extension = file_extension
        self.height = height
        self.width = width
98 |
--------------------------------------------------------------------------------
/goose/images/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """\
3 | This is a python port of "Goose" originally licensed to Gravity.com
4 | under one or more contributor license agreements. See the NOTICE file
5 | distributed with this work for additional information
6 | regarding copyright ownership.
7 |
8 | Python port was written by Xavier Grangier for Recrutae
9 |
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License. You may obtain a copy of the License at
14 |
15 | http://www.apache.org/licenses/LICENSE-2.0
16 |
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | import hashlib
24 | import os
25 | import urllib2
26 | from PIL import Image
27 | from goose.utils.encoding import smart_str
28 | from goose.images.image import ImageDetails
29 | from goose.images.image import LocallyStoredImage
30 |
31 |
class ImageUtils(object):
    """Class-level helpers for downloading images and caching them on disk."""

    @classmethod
    def get_image_dimensions(cls, identify_program, path):
        """Return an ImageDetails holding the format and size of ``path``.

        The image is opened with PIL. ``identify_program`` is kept for
        interface compatibility; it is not used by this implementation.
        """
        image = Image.open(path)
        image_details = ImageDetails()
        image_details.set_mime_type(image.format)
        width, height = image.size
        image_details.set_width(width)
        image_details.set_height(height)
        return image_details

    @classmethod
    def store_image(cls, http_client, link_hash, src, config):
        """\
        Writes an image src http string to disk as a temporary file
        and returns the LocallyStoredImage object
        that has the info you should need on the image.
        Returns None when the image could not be downloaded.
        """
        # check for a cache hit already on disk
        image = cls.read_localfile(link_hash, src, config)
        if image:
            return image

        # no cache found, download the image
        data = cls.fetch(http_client, src)
        if not data:
            return None
        return cls.write_localfile(data, link_hash, src, config) or None

    @classmethod
    def get_mime_type(cls, image_details):
        """Map the image format to a file extension, or 'NA' if unknown.

        A missing (None) format is treated as unknown instead of failing
        on ``None.lower()``.
        """
        mime_type = (image_details.get_mime_type() or '').lower()
        mimes = {
            'png': '.png',
            'jpg': '.jpg',
            'jpeg': '.jpg',
            'gif': '.gif',
        }
        return mimes.get(mime_type, 'NA')

    @classmethod
    def read_localfile(cls, link_hash, src, config):
        """Return a LocallyStoredImage for the cached file, or None.

        None is returned when no file exists at the cache location.
        """
        local_image_name = cls.get_localfile_name(link_hash, src, config)
        if not os.path.isfile(local_image_name):
            return None
        identify = config.imagemagick_identify_path
        image_details = cls.get_image_dimensions(identify, local_image_name)
        return LocallyStoredImage(
            src=src,
            local_filename=local_image_name,
            link_hash=link_hash,
            bytes=os.path.getsize(local_image_name),
            file_extension=cls.get_mime_type(image_details),
            height=image_details.get_height(),
            width=image_details.get_width()
        )

    @classmethod
    def write_localfile(cls, entity, link_hash, src, config):
        """Write ``entity`` to the cache location and read it back.

        Returns whatever ``read_localfile`` returns for the new file.
        """
        local_path = cls.get_localfile_name(link_hash, src, config)
        # ``with`` guarantees the handle is closed even if write() fails.
        with open(local_path, 'wb') as f:
            f.write(entity)
        return cls.read_localfile(link_hash, src, config)

    @classmethod
    def get_localfile_name(cls, link_hash, src, config):
        """Return the cache path: <local_storage_path>/<link_hash>_<md5 of src>."""
        image_hash = hashlib.md5(smart_str(src)).hexdigest()
        return os.path.join(config.local_storage_path, '%s_%s' % (link_hash, image_hash))

    @classmethod
    def clean_src_string(cls, src):
        """Return ``src`` with every space replaced by ``%20``."""
        return src.replace(" ", "%20")

    @classmethod
    def fetch(cls, http_client, src):
        """Download ``src`` and return the response body, or None on failure.

        ``http_client`` is kept for interface compatibility; the request
        is made with urllib2 directly.
        """
        # ``except Exception`` keeps the best-effort behaviour for network
        # and URL errors while letting KeyboardInterrupt / SystemExit
        # through, which the former bare ``except:`` swallowed.
        try:
            response = urllib2.urlopen(urllib2.Request(src))
        except Exception:
            return None
        try:
            return response.read()
        except Exception:
            return None
        finally:
            # always release the connection
            response.close()
121 |
--------------------------------------------------------------------------------
/goose/network.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """\
3 | This is a python port of "Goose" originally licensed to Gravity.com
4 | under one or more contributor license agreements. See the NOTICE file
5 | distributed with this work for additional information
6 | regarding copyright ownership.
7 |
8 | Python port was written by Xavier Grangier for Recrutae
9 |
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License. You may obtain a copy of the License at
14 |
15 | http://www.apache.org/licenses/LICENSE-2.0
16 |
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | import urllib2
24 |
25 |
class HtmlFetcher(object):
    """Fetches the HTML of a url using the settings found in ``config``."""

    def __init__(self, config):
        self.config = config
        # set header
        self.headers = {'User-agent': self.config.browser_user_agent}
        # Initialised here so that get_url() is safe to call before
        # get_html(); previously that raised AttributeError.
        self.request = None
        self.result = None

    def get_url(self):
        """Return the final url of the last response, or None.

        None is returned when nothing has been fetched yet or when the
        last request failed.
        """
        # if we have a result
        # get the final_url
        if self.result is not None:
            return self.result.geturl()
        return None

    def get_html(self, url):
        """Fetch ``url`` and return the response body, or None on failure.

        The request and the response are kept on ``self.request`` and
        ``self.result``; ``self.result`` is None when the request failed.
        """
        # utf-8 encode unicode url
        if isinstance(url, unicode):
            url = url.encode('utf-8')

        # set request
        self.request = urllib2.Request(
            url,
            headers=self.headers)
        # do request; ``except Exception`` keeps the best-effort
        # behaviour for network errors without swallowing
        # KeyboardInterrupt / SystemExit like a bare ``except:`` does
        try:
            self.result = urllib2.urlopen(
                self.request,
                timeout=self.config.http_timeout)
        except Exception:
            self.result = None

        if self.result is None:
            return None

        # read the result content, then release the connection;
        # the response object is kept so get_url() still works
        try:
            return self.result.read()
        finally:
            self.result.close()
61 |
--------------------------------------------------------------------------------
/goose/resources/images/known-image-css.txt:
--------------------------------------------------------------------------------
1 | latimes.com^thumbnail
2 | cnn.com^storytext|cnn_strycntntlft
3 | foxnews.com^entry-content
4 | msn.com^articleText
5 | go.com^mediaimage
6 | lefigaro.fr^photo center
7 | cadres.apec.fr^noFieldsTable
8 | emploi.lesechos.fr^offerHeader
9 | linkfinance.fr^offerHeader
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-ar.txt:
--------------------------------------------------------------------------------
1 | فى
2 | في
3 | كل
4 | لم
5 | لن
6 | له
7 | من
8 | هو
9 | هي
10 | قوة
11 | كما
12 | لها
13 | منذ
14 | وقد
15 | ولا
16 | نفسه
17 | لقاء
18 | مقابل
19 | هناك
20 | وقال
21 | وكان
22 | نهاية
23 | وقالت
24 | وكانت
25 | للامم
26 | فيه
27 | كلم
28 | لكن
29 | وفي
30 | وقف
31 | ولم
32 | ومن
33 | وهو
34 | وهي
35 | يوم
36 | فيها
37 | منها
38 | مليار
39 | لوكالة
40 | يكون
41 | يمكن
42 | مليون
43 | حيث
44 | اكد
45 | الا
46 | اما
47 | امس
48 | السابق
49 | التى
50 | التي
51 | اكثر
52 | ايار
53 | ايضا
54 | ثلاثة
55 | الذاتي
56 | الاخيرة
57 | الثاني
58 | الثانية
59 | الذى
60 | الذي
61 | الان
62 | امام
63 | ايام
64 | خلال
65 | حوالى
66 | الذين
67 | الاول
68 | الاولى
69 | بين
70 | ذلك
71 | دون
72 | حول
73 | حين
74 | الف
75 | الى
76 | انه
77 | اول
78 | ضمن
79 | انها
80 | جميع
81 | الماضي
82 | الوقت
83 | المقبل
84 | اليوم
85 | ـ
86 | ف
87 | و
88 | و6
89 | قد
90 | لا
91 | ما
92 | مع
93 | مساء
94 | هذا
95 | واحد
96 | واضاف
97 | واضافت
98 | فان
99 | قبل
100 | قال
101 | كان
102 | لدى
103 | نحو
104 | هذه
105 | وان
106 | واكد
107 | كانت
108 | واوضح
109 | مايو
110 | ب
111 | ا
112 | أ
113 | ،
114 | عشر
115 | عدد
116 | عدة
117 | عشرة
118 | عدم
119 | عام
120 | عاما
121 | عن
122 | عند
123 | عندما
124 | على
125 | عليه
126 | عليها
127 | زيارة
128 | سنة
129 | سنوات
130 | تم
131 | ضد
132 | بعد
133 | بعض
134 | اعادة
135 | اعلنت
136 | بسبب
137 | حتى
138 | اذا
139 | احد
140 | اثر
141 | برس
142 | باسم
143 | غدا
144 | شخصا
145 | صباح
146 | اطار
147 | اربعة
148 | اخرى
149 | بان
150 | اجل
151 | غير
152 | بشكل
153 | حاليا
154 | بن
155 | به
156 | ثم
157 | اف
158 | ان
159 | او
160 | اي
161 | بها
162 | صفر
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-da.txt:
--------------------------------------------------------------------------------
1 | af
2 | alle
3 | andet
4 | andre
5 | at
6 | begge
7 | da
8 | de
9 | den
10 | denne
11 | der
12 | deres
13 | det
14 | dette
15 | dig
16 | din
17 | dog
18 | du
19 | ej
20 | eller
21 | en
22 | end
23 | ene
24 | eneste
25 | enhver
26 | et
27 | fem
28 | fire
29 | flere
30 | fleste
31 | for
32 | fordi
33 | forrige
34 | fra
35 | få
36 | før
37 | god
38 | han
39 | hans
40 | har
41 | hendes
42 | her
43 | hun
44 | hvad
45 | hvem
46 | hver
47 | hvilken
48 | hvis
49 | hvor
50 | hvordan
51 | hvorfor
52 | hvornår
53 | i
54 | ikke
55 | ind
56 | ingen
57 | intet
58 | jeg
59 | jeres
60 | kan
61 | kom
62 | kommer
63 | lav
64 | lidt
65 | lille
66 | man
67 | mand
68 | mange
69 | med
70 | meget
71 | men
72 | mens
73 | mere
74 | mig
75 | ned
76 | ni
77 | nogen
78 | noget
79 | ny
80 | nyt
81 | nær
82 | næste
83 | næsten
84 | og
85 | op
86 | otte
87 | over
88 | på
89 | se
90 | seks
91 | ses
92 | som
93 | stor
94 | store
95 | syv
96 | ti
97 | til
98 | to
99 | tre
100 | ud
101 | var
102 |
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-es.txt:
--------------------------------------------------------------------------------
1 | de
2 | la
3 | que
4 | el
5 | en
6 | y
7 | a
8 | los
9 | del
10 | se
11 | las
12 | por
13 | un
14 | para
15 | con
16 | no
17 | una
18 | su
19 | al
20 | lo
21 | como
22 | más
23 | pero
24 | sus
25 | le
26 | ya
27 | o
28 | este
29 | sí
30 | porque
31 | esta
32 | entre
33 | cuando
34 | muy
35 | sin
36 | sobre
37 | también
38 | me
39 | hasta
40 | hay
41 | donde
42 | quien
43 | desde
44 | todo
45 | nos
46 | durante
47 | todos
48 | uno
49 | les
50 | ni
51 | contra
52 | otros
53 | ese
54 | eso
55 | ante
56 | ellos
57 | e
58 | esto
59 | mí
60 | antes
61 | algunos
62 | qué
63 | unos
64 | yo
65 | otro
66 | otras
67 | otra
68 | él
69 | tanto
70 | esa
71 | estos
72 | mucho
73 | quienes
74 | nada
75 | muchos
76 | cual
77 | poco
78 | ella
79 | estar
80 | estas
81 | algunas
82 | algo
83 | nosotros
84 | mi
85 | mis
86 | tú
87 | te
88 | ti
89 | tu
90 | tus
91 | ellas
92 | nosotras
93 | vosotros
94 | vosotras
95 | os
96 | mío
97 | mía
98 | míos
99 | mías
100 | tuyo
101 | tuya
102 | tuyos
103 | tuyas
104 | suyo
105 | suya
106 | suyos
107 | suyas
108 | nuestro
109 | nuestra
110 | nuestros
111 | nuestras
112 | vuestro
113 | vuestra
114 | vuestros
115 | vuestras
116 | esos
117 | esas
118 | estoy
119 | estás
120 | está
121 | estamos
122 | estáis
123 | están
124 | esté
125 | estés
126 | estemos
127 | estéis
128 | estén
129 | estaré
130 | estarás
131 | estará
132 | estaremos
133 | estaréis
134 | estarán
135 | estaría
136 | estarías
137 | estaríamos
138 | estaríais
139 | estarían
140 | estaba
141 | estabas
142 | estábamos
143 | estabais
144 | estaban
145 | estuve
146 | estuviste
147 | estuvo
148 | estuvimos
149 | estuvisteis
150 | estuvieron
151 | estuviera
152 | estuvieras
153 | estuviéramos
154 | estuvierais
155 | estuvieran
156 | estuviese
157 | estuvieses
158 | estuviésemos
159 | estuvieseis
160 | estuviesen
161 | estando
162 | estado
163 | estada
164 | estados
165 | estadas
166 | estad
167 | he
168 | has
169 | ha
170 | hemos
171 | habéis
172 | han
173 | haya
174 | hayas
175 | hayamos
176 | hayáis
177 | hayan
178 | habré
179 | habrás
180 | habrá
181 | habremos
182 | habréis
183 | habrán
184 | habría
185 | habrías
186 | habríamos
187 | habríais
188 | habrían
189 | había
190 | habías
191 | habíamos
192 | habíais
193 | habían
194 | hube
195 | hubiste
196 | hubo
197 | hubimos
198 | hubisteis
199 | hubieron
200 | hubiera
201 | hubieras
202 | hubiéramos
203 | hubierais
204 | hubieran
205 | hubiese
206 | hubieses
207 | hubiésemos
208 | hubieseis
209 | hubiesen
210 | habiendo
211 | habido
212 | habida
213 | habidos
214 | habidas
215 |
216 | # forms of ser, to be (not including the infinitive):
217 | soy
218 | eres
219 | es
220 | somos
221 | sois
222 | son
223 | sea
224 | seas
225 | seamos
226 | seáis
227 | sean
228 | seré
229 | serás
230 | será
231 | seremos
232 | seréis
233 | serán
234 | sería
235 | serías
236 | seríamos
237 | seríais
238 | serían
239 | era
240 | eras
241 | éramos
242 | erais
243 | eran
244 | fui
245 | fuiste
246 | fue
247 | fuimos
248 | fuisteis
249 | fueron
250 | fuera
251 | fueras
252 | fuéramos
253 | fuerais
254 | fueran
255 | fuese
256 | fueses
257 | fuésemos
258 | fueseis
259 | fuesen
260 | siendo
261 | sido
262 | tengo
263 | tienes
264 | tiene
265 | tenemos
266 | tenéis
267 | tienen
268 | tenga
269 | tengas
270 | tengamos
271 | tengáis
272 | tengan
273 | tendré
274 | tendrás
275 | tendrá
276 | tendremos
277 | tendréis
278 | tendrán
279 | tendría
280 | tendrías
281 | tendríamos
282 | tendríais
283 | tendrían
284 | tenía
285 | tenías
286 | teníamos
287 | teníais
288 | tenían
289 | tuve
290 | tuviste
291 | tuvo
292 | tuvimos
293 | tuvisteis
294 | tuvieron
295 | tuviera
296 | tuvieras
297 | tuviéramos
298 | tuvierais
299 | tuvieran
300 | tuviese
301 | tuvieses
302 | tuviésemos
303 | tuvieseis
304 | tuviesen
305 | teniendo
306 | tenido
307 | tenida
308 | tenidos
309 | tenidas
310 | tened
311 |
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-fi.txt:
--------------------------------------------------------------------------------
1 | alla
2 | ansiosta
3 | ehkä
4 | ei
5 | enemmän
6 | ennen
7 | etessa
8 | f
9 | haikki
10 | he
11 | hitaasti
12 | hoikein
13 | hyvin
14 | hän
15 | ilman
16 | ja
17 | jos
18 | jälkeen
19 | kanssa
20 | kaukana
21 | kenties
22 | keskellä
23 | kesken
24 | koskaan
25 | kuinkan
26 | kukka
27 | kylliksi
28 | kyllä
29 | liian
30 | lla
31 | lla
32 | luona
33 | lähellä
34 | läpi
35 | me
36 | miksi
37 | mikä
38 | milloin
39 | milloinkan
40 | minä
41 | missä
42 | miten
43 | nopeasti
44 | nyt
45 | oikea
46 | oikealla
47 | paljon
48 | siellä
49 | sinä
50 | ssa
51 | sta
52 | suoraan
53 | tai
54 | takana
55 | takia
56 | tarpeeksi
57 | te
58 | tässä
59 | ulkopuolella
60 | vahemmän
61 | vasen
62 | vasenmalla
63 | vastan
64 | vielä
65 | vieressä
66 | vähän
67 | yhdessä
68 | ylös
69 |
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-fr.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | #-----------------------------------------------------------------------
17 | # a couple of test stopwords to test that the words are really being
18 | # configured from this file:
19 | stopworda
20 | stopwordb
21 |
22 | #Standard english stop words taken from Lucene's StopAnalyzer
23 | a
24 | an
25 | and
26 | are
27 | as
28 | at
29 | be
30 | but
31 | by
32 | for
33 | if
34 | in
35 | into
36 | is
37 | it
38 | no
39 | not
40 | of
41 | on
42 | or
43 | s
44 | such
45 | t
46 | that
47 | the
48 | their
49 | then
50 | there
51 | these
52 | they
53 | this
54 | to
55 | was
56 | will
57 | with
58 | au
59 | aux
60 | avec
61 | ce
62 | ces
63 | dans
64 | de
65 | des
66 | du
67 | elle
68 | en
69 | et
70 | eux
71 | il
72 | je
73 | la
74 | le
75 | leur
76 | lui
77 | ma
78 | mais
79 | me
80 | même
81 | mes
82 | moi
83 | mon
84 | ne
85 | nos
86 | notre
87 | nous
88 | on
89 | ou
90 | par
91 | pas
92 | pour
93 | qu
94 | que
95 | qui
96 | sa
97 | se
98 | ses
99 | son
100 | sur
101 | ta
102 | te
103 | tes
104 | toi
105 | ton
106 | tu
107 | un
108 | une
109 | vos
110 | votre
111 | vous
112 | c
113 | d
114 | j
115 | l
116 | à
117 | m
118 | n
119 | s
120 | t
121 | y
122 | été
123 | étée
124 | étées
125 | étés
126 | étant
127 | suis
128 | es
129 | est
130 | sommes
131 | êtes
132 | sont
133 | serai
134 | seras
135 | sera
136 | serons
137 | serez
138 | seront
139 | serais
140 | serait
141 | serions
142 | seriez
143 | seraient
144 | étais
145 | était
146 | étions
147 | étiez
148 | étaient
149 | fus
150 | fut
151 | fûmes
152 | fûtes
153 | furent
154 | sois
155 | soit
156 | soyons
157 | soyez
158 | soient
159 | fusse
160 | fusses
161 | fût
162 | fussions
163 | fussiez
164 | fussent
165 | ayant
166 | eu
167 | eue
168 | eues
169 | eus
170 | ai
171 | as
172 | avons
173 | avez
174 | ont
175 | aurai
176 | auras
177 | aura
178 | aurons
179 | aurez
180 | auront
181 | aurais
182 | aurait
183 | aurions
184 | auriez
185 | auraient
186 | avais
187 | avait
188 | avions
189 | aviez
190 | avaient
191 | eut
192 | eûmes
193 | eûtes
194 | eurent
195 | aie
196 | aies
197 | ait
198 | ayons
199 | ayez
200 | aient
201 | eusse
202 | eusses
203 | eût
204 | eussions
205 | eussiez
206 | eussent
207 | ceci
208 | celà
209 | cet
210 | cette
211 | ici
212 | ils
213 | les
214 | leurs
215 | quel
216 | quels
217 | quelle
218 | quelles
219 | sans
220 | soi
221 |
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-hu.txt:
--------------------------------------------------------------------------------
1 | a
2 | á
3 | ahogy
4 | ahol
5 | aki
6 | akik
7 | akkor
8 | alatt
9 | által
10 | általában
11 | amely
12 | amelyek
13 | amelyekben
14 | amelyeket
15 | amelyet
16 | amelynek
17 | ami
18 | amit
19 | amolyan
20 | amp
21 | amíg
22 | amikor
23 | át
24 | abban
25 | ahhoz
26 | annak
27 | arra
28 | arról
29 | az
30 | azok
31 | azon
32 | azt
33 | azzal
34 | azért
35 | aztán
36 | azután
37 | azonban
38 | b
39 | bár
40 | be
41 | belül
42 | benne
43 | c
44 | cikk
45 | cikkek
46 | cikkeket
47 | csak
48 | d
49 | de
50 | e
51 | é
52 | eddig
53 | egész
54 | egy
55 | egyes
56 | egyetlen
57 | egyéb
58 | egyik
59 | egyre
60 | ekkor
61 | el
62 | elég
63 | ellen
64 | elő
65 | először
66 | előtt
67 | első
68 | én
69 | éppen
70 | ebben
71 | ehhez
72 | emilyen
73 | ennek
74 | erre
75 | ez
76 | ezt
77 | ezek
78 | ezen
79 | ezzel
80 | ezért
81 | és
82 | f
83 | fel
84 | felé
85 | g
86 | h
87 | hanem
88 | hiszen
89 | hogy
90 | hogyan
91 | i
92 | í
93 | igen
94 | így
95 | illetve
96 | ill.
97 | ill
98 | ilyen
99 | ilyenkor
100 | is
101 | ison
102 | ismét
103 | itt
104 | j
105 | jó
106 | jól
107 | jobban
108 | k
109 | kell
110 | kellett
111 | keresztül
112 | keressünk
113 | ki
114 | kívül
115 | között
116 | közül
117 | l
118 | legalább
119 | lehet
120 | lehetett
121 | legyen
122 | lenne
123 | lenni
124 | lesz
125 | lett
126 | m
127 | maga
128 | magát
129 | majd
130 | majd
131 | már
132 | más
133 | másik
134 | meg
135 | még
136 | mellett
137 | mert
138 | mely
139 | melyek
140 | mi
141 | mit
142 | míg
143 | miért
144 | milyen
145 | mikor
146 | minden
147 | mindent
148 | mindenki
149 | mindig
150 | mint
151 | mintha
152 | mivel
153 | most
154 | n
155 | nagy
156 | nagyobb
157 | nagyon
158 | ne
159 | néha
160 | nekem
161 | neki
162 | nem
163 | néhány
164 | nélkül
165 | nincs
166 | o
167 | ó
168 | olyan
169 | ott
170 | össze
171 | ö
172 | ő
173 | ők
174 | őket
175 | p
176 | pedig
177 | persze
178 | q
179 | r
180 | rá
181 | s
182 | saját
183 | sem
184 | semmi
185 | sok
186 | sokat
187 | sokkal
188 | sz
189 | számára
190 | szemben
191 | szerint
192 | szinte
193 | t
194 | talán
195 | tehát
196 | teljes
197 | tovább
198 | továbbá
199 | több
200 | u
201 | ú
202 | úgy
203 | ugyanis
204 | új
205 | újabb
206 | újra
207 | után
208 | utána
209 | utolsó
210 | ü
211 | ű
212 | v
213 | vagy
214 | vagyis
215 | valaki
216 | valamely
217 | valami
218 | valamint
219 | való
220 | vagyok
221 | van
222 | vannak
223 | volt
224 | voltam
225 | voltak
226 | voltunk
227 | vissza
228 | vele
229 | viszont
230 | volna
231 | számolnak
232 | szólnak
233 | szól
234 | w
235 | x
236 | y
237 | z
238 | zs
239 | a
240 | ahogy
241 | ahol
242 | aki
243 | akkor
244 | alatt
245 | általában
246 | által
247 | amely
248 | amíg
249 | amikor
250 | ami
251 | amolyan
252 | arra
253 | át
254 | az
255 | azért
256 | azonban
257 | azon
258 | aztán
259 | azt
260 | azután
261 | azzal
262 | bár
263 | be
264 | belül
265 | benne
266 | cikk
267 | csak
268 | de
269 | eddig
270 | egész
271 | egy
272 | egyéb
273 | egyes
274 | egyetlen
275 | egyik
276 | egyre
277 | ekkor
278 | el
279 | elég
280 | ellen
281 | elő
282 | először
283 | előtt
284 | első
285 | emilyen
286 | én
287 | éppen
288 | erre
289 | és
290 | e
291 | ez
292 | ezen
293 | ezért
294 | ezzel
295 | fel
296 | felé
297 | hanem
298 | hiszen
299 | hogy
300 | hogyan
301 | igen
302 | így
303 | ill.
304 | illetve
305 | ill
306 | ilyen
307 | ilyenkor
308 | ismét
309 | ison
310 | itt
311 | jó
312 | jobban
313 | jól
314 | kell
315 | keres
316 | keresztül
317 | ki
318 | kívül
319 | között
320 | közül
321 | legalább
322 | legyen
323 | lehet
324 | lenni
325 | lett
326 | maga
327 | maga
328 | majd
329 | már
330 | más
331 | másik
332 | még
333 | meg
334 | mellett
335 | mely
336 | mert
337 | miért
338 | míg
339 | mikor
340 | milyen
341 | minden
342 | mindenki
343 | mindig
344 | mi
345 | mint
346 | mintha
347 | mivel
348 | most
349 | nagy
350 | nagyobb
351 | nagyon
352 | ne
353 | néha
354 | néhány
355 | neki
356 | nélkül
357 | nem
358 | nincs
359 | ők
360 | olyan
361 | ő
362 | össze
363 | ott
364 | pedig
365 | persze
366 | rá
367 | saját
368 | s
369 | sem
370 | semmi
371 | sokkal
372 | sok
373 | számára
374 | számol
375 | szemben
376 | szerint
377 | szinte
378 | szól
379 | talán
380 | tehát
381 | teljes
382 | továbbá
383 | tovább
384 | úgy
385 | ugyanis
386 | új
387 | újabb
388 | újra
389 | utána
390 | után
391 | utolsó
392 | vagy
393 | vagyis
394 | valaki
395 | valamely
396 | valami
397 | valamint
398 | való
399 | van
400 | vissza
401 | viszont
402 | volt
403 |
404 |
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-it.txt:
--------------------------------------------------------------------------------
1 | ad
2 | al
3 | allo
4 | ai
5 | agli
6 | all
7 | agl
8 | alla
9 | alle
10 | con
11 | col
12 | coi
13 | da
14 | dal
15 | dallo
16 | dai
17 | dagli
18 | dall
19 | dagl
20 | dalla
21 | dalle
22 | di
23 | del
24 | dello
25 | dei
26 | degli
27 | dell
28 | degl
29 | della
30 | delle
31 | in
32 | nel
33 | nello
34 | nei
35 | negli
36 | nell
37 | negl
38 | nella
39 | nelle
40 | su
41 | sul
42 | sullo
43 | sui
44 | sugli
45 | sull
46 | sugl
47 | sulla
48 | sulle
49 | per
50 | tra
51 | contro
52 | io
53 | tu
54 | lui
55 | lei
56 | noi
57 | voi
58 | loro
59 | mio
60 | mia
61 | miei
62 | mie
63 | tuo
64 | tua
65 | tuoi
66 | tue
67 | suo
68 | sua
69 | suoi
70 | sue
71 | nostro
72 | nostra
73 | nostri
74 | nostre
75 | vostro
76 | vostra
77 | vostri
78 | vostre
79 | mi
80 | ti
81 | ci
82 | vi
83 | lo
84 | la
85 | li
86 | le
87 | gli
88 | ne
89 | il
90 | un
91 | uno
92 | una
93 | ma
94 | ed
95 | se
96 | perchè
97 | perché
98 | perche
99 | anche
100 | come
101 | dov
102 | dove
103 | che
104 | chi
105 | cui
106 | non
107 | più
108 | piu
109 | quale
110 | quanto
111 | quanti
112 | quanta
113 | quante
114 | quello
115 | quelli
116 | quella
117 | quelle
118 | questo
119 | questi
120 | questa
121 | queste
122 | si
123 | tutto
124 | tutti
125 | a
126 | c
127 | e
128 | i
129 | l
130 | o
131 | ho
132 | hai
133 | ha
134 | abbiamo
135 | avete
136 | hanno
137 | abbia
138 | abbiate
139 | abbiano
140 | avrò
141 | avro
142 | avrai
143 | avrà
144 | avra
145 | avremo
146 | avrete
147 | avranno
148 | avrei
149 | avresti
150 | avrebbe
151 | avremmo
152 | avreste
153 | avrebbero
154 | avevo
155 | avevi
156 | aveva
157 | avevamo
158 | avevate
159 | avevano
160 | ebbi
161 | avesti
162 | ebbe
163 | avemmo
164 | aveste
165 | ebbero
166 | avessi
167 | avesse
168 | avessimo
169 | avessero
170 | avendo
171 | avuto
172 | avuta
173 | avuti
174 | avute
175 | sono
176 | sei
177 | è
178 | é
179 | e
180 | siamo
181 | siete
182 | sia
183 | siate
184 | siano
185 | sarà
186 | sarai
187 | sarò
188 | saro
189 | saremo
190 | sarete
191 | saranno
192 | sarei
193 | saresti
194 | sarebbe
195 | saremmo
196 | sareste
197 | sarebbero
198 | ero
199 | eri
200 | era
201 | eravamo
202 | eravate
203 | erano
204 | fui
205 | fosti
206 | fu
207 | fummo
208 | foste
209 | furono
210 | fossi
211 | fosse
212 | fossimo
213 | fossero
214 | essendo
215 | faccio
216 | fai
217 | facciamo
218 | fanno
219 | faccia
220 | facciate
221 | facciano
222 | farà
223 | farai
224 | farò
225 | faremo
226 | farete
227 | faranno
228 | farei
229 | faresti
230 | farebbe
231 | faremmo
232 | fareste
233 | farebbero
234 | facevo
235 | facevi
236 | faceva
237 | facevamo
238 | facevate
239 | facevano
240 | feci
241 | facesti
242 | fece
243 | facemmo
244 | faceste
245 | fecero
246 | facessi
247 | facesse
248 | facessimo
249 | facessero
250 | facendo
251 | sto
252 | stai
253 | sta
254 | stiamo
255 | stanno
256 | stia
257 | stiate
258 | stiano
259 | starà
260 | starai
261 | starò
262 | staremo
263 | starete
264 | staranno
265 | starei
266 | staresti
267 | starebbe
268 | staremmo
269 | stareste
270 | starebbero
271 | stavo
272 | stavi
273 | stava
274 | stavamo
275 | stavate
276 | stavano
277 | stetti
278 | stesti
279 | stette
280 | stemmo
281 | steste
282 | stettero
283 | stessi
284 | stesse
285 | stessimo
286 | stessero
287 | stando
288 |
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-ko.txt:
--------------------------------------------------------------------------------
1 | 을
2 | 의
3 | 에
4 | 이
5 | 를
6 | 으로
7 | 은
8 | 는
9 | 가
10 | 로
11 | 하고
12 | 과
13 | 에서
14 | 도
15 | 와
16 | 이다
17 | 고
18 | 부터
19 | 까지
20 | 께
21 | 에는
22 | 이라고
23 | 만
24 | 라고
25 | 보다
26 | 에도
27 | 다
28 | 토록
29 | 에게
30 | 나
31 | 대로
32 | 에서는
33 | 이나
34 | 이며
35 | 요
36 | 든
37 | 으로써
38 | 같이
39 | 로는
40 | 밖에
41 | 과의
42 | 며
43 | 로부터
44 | 처럼
45 | 아
46 | 라
47 | 여
48 | 으로는
49 | 이고
50 | 에서의
51 | 이라는
52 | 만에
53 | 으로부터
54 | 에서도
55 | 와의
56 | 엔
57 | 만을
58 | 부터는
59 | 만의
60 | 야
61 | 까지의
62 | 과는
63 | 치고
64 | 과를
65 | 으로의
66 | 까지는
67 | 보다는
68 | 만이
69 | 에만
70 | 로의
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-nb.txt:
--------------------------------------------------------------------------------
1 | alle
2 | andre
3 | arbeid
4 | av
5 | begge
6 | bort
7 | bra
8 | bruke
9 | da
10 | denne
11 | der
12 | deres
13 | det
14 | din
15 | disse
16 | du
17 | eller
18 | en
19 | ene
20 | eneste
21 | enhver
22 | enn
23 | er
24 | et
25 | folk
26 | for
27 | fordi
28 | forsøke
29 | fra
30 | få
31 | før
32 | først
33 | gjorde
34 | gjøre
35 | god
36 | gå
37 | ha
38 | hadde
39 | han
40 | hans
41 | hennes
42 | her
43 | hva
44 | hvem
45 | hver
46 | hvilken
47 | hvis
48 | hvor
49 | hvordan
50 | hvorfor
51 | ikke
52 | inn
53 | innen
54 | kan
55 | kunne
56 | lage
57 | lang
58 | lik
59 | like
60 | makt
61 | mange
62 | med
63 | meg
64 | meget
65 | men
66 | mens
67 | mer
68 | mest
69 | min
70 | mye
71 | må
72 | måte
73 | navn
74 | nei
75 | ny
76 | nå
77 | når
78 | og
79 | også
80 | om
81 | opp
82 | oss
83 | over
84 | part
85 | punkt
86 | på
87 | rett
88 | riktig
89 | samme
90 | sant
91 | si
92 | siden
93 | sist
94 | skulle
95 | slik
96 | slutt
97 | som
98 | start
99 | stille
100 | tid
101 | til
102 | tilbake
103 | tilstand
104 | under
105 | ut
106 | uten
107 | var
108 | ved
109 | verdi
110 | vi
111 | vil
112 | ville
113 | vite
114 | vår
115 | være
116 | vært
117 | å
118 |
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-nl.txt:
--------------------------------------------------------------------------------
1 | aan
2 | af
3 | al
4 | als
5 | bij
6 | dan
7 | dat
8 | die
9 | dit
10 | een
11 | en
12 | er
13 | had
14 | heb
15 | hem
16 | het
17 | hij
18 | hoe
19 | hun
20 | ik
21 | in
22 | is
23 | je
24 | kan
25 | me
26 | men
27 | met
28 | mij
29 | nog
30 | nu
31 | of
32 | ons
33 | ook
34 | te
35 | tot
36 | uit
37 | van
38 | was
39 | wat
40 | we
41 | wel
42 | wij
43 | zal
44 | ze
45 | zei
46 | zij
47 | zo
48 | zou
49 |
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-no.txt:
--------------------------------------------------------------------------------
1 | at
2 | av
3 | de
4 | den
5 | der
6 | det
7 | du
8 | en
9 | er
10 | et
11 | for
12 | fra
13 | før
14 | med
15 | og
16 | om
17 | over
18 | på
19 | som
20 | til
21 | ved
22 | år
23 | alle
24 | bare
25 | ble
26 | bort
27 | bra
28 | da
29 | deg
30 | dem
31 | denne
32 | dere
33 | deres
34 | det
35 | dette
36 | din
37 | disse
38 | dit
39 | ditt
40 | eller
41 | ene
42 | enn
43 | er
44 | et
45 | ett
46 | etter
47 | for
48 | fram
49 | først
50 | få
51 | god
52 | gå
53 | ha
54 | han
55 | hans
56 | har
57 | her
58 | hit
59 | hun
60 | hva
61 | hvem
62 | hver
63 | ikke
64 | inn
65 | ja
66 | jeg
67 | kan
68 | kom
69 | kun
70 | kunne
71 | lage
72 | lang
73 | lik
74 | like
75 | man
76 | mer
77 | min
78 | mot
79 | mye
80 | må
81 | måte
82 | ned
83 | nei
84 | noe
85 | noen
86 | ny
87 | nå
88 | når
89 | også
90 | opp
91 | oss
92 | seg
93 | selv
94 | si
95 | siden
96 | sin
97 | sine
98 | sist
99 | skal
100 | skulle
101 | slik
102 | som
103 | så
104 | sånn
105 | tid
106 | til
107 | under
108 | ut
109 | uten
110 | var
111 | ved
112 | vi
113 | vil
114 | vite
115 | vår
116 | å
117 | dei
118 | di
119 | då
120 | eg
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-pl.txt:
--------------------------------------------------------------------------------
1 | a
2 | aby
3 | ach
4 | acz
5 | aczkolwiek
6 | aj
7 | albo
8 | ale
9 | ależ
10 | ani
11 | aż
12 | bardziej
13 | bardzo
14 | bo
15 | bowiem
16 | by
17 | byli
18 | bynajmniej
19 | być
20 | był
21 | była
22 | było
23 | były
24 | będzie
25 | będą
26 | cali
27 | cała
28 | cały
29 | ci
30 | cię
31 | ciebie
32 | co
33 | cokolwiek
34 | coś
35 | czasami
36 | czasem
37 | czemu
38 | czy
39 | czyli
40 | daleko
41 | dla
42 | dlaczego
43 | dlatego
44 | do
45 | dobrze
46 | dokąd
47 | dość
48 | dużo
49 | dwa
50 | dwaj
51 | dwie
52 | dwoje
53 | dziś
54 | dzisiaj
55 | gdy
56 | gdyby
57 | gdyż
58 | gdzie
59 | gdziekolwiek
60 | gdzieś
61 | i
62 | ich
63 | ile
64 | im
65 | inna
66 | inne
67 | inny
68 | innych
69 | iż
70 | ja
71 | ją
72 | jak
73 | jakaś
74 | jakby
75 | jaki
76 | jakichś
77 | jakie
78 | jakiś
79 | jakiż
80 | jakkolwiek
81 | jako
82 | jakoś
83 | je
84 | jeden
85 | jedna
86 | jedno
87 | jednak
88 | jednakże
89 | jego
90 | jej
91 | jemu
92 | jest
93 | jestem
94 | jeszcze
95 | jeśli
96 | jeżeli
97 | już
98 | ją
99 | każdy
100 | kiedy
101 | kilka
102 | kimś
103 | kto
104 | ktokolwiek
105 | ktoś
106 | która
107 | które
108 | którego
109 | której
110 | który
111 | których
112 | którym
113 | którzy
114 | ku
115 | lat
116 | lecz
117 | lub
118 | ma
119 | mają
120 | mało
121 | mam
122 | mi
123 | mimo
124 | między
125 | mną
126 | mnie
127 | mogą
128 | moi
129 | moim
130 | moja
131 | moje
132 | może
133 | możliwe
134 | można
135 | mój
136 | mu
137 | musi
138 | my
139 | na
140 | nad
141 | nam
142 | nami
143 | nas
144 | nasi
145 | nasz
146 | nasza
147 | nasze
148 | naszego
149 | naszych
150 | natomiast
151 | natychmiast
152 | nawet
153 | nią
154 | nic
155 | nich
156 | nie
157 | niech
158 | niego
159 | niej
160 | niemu
161 | nigdy
162 | nim
163 | nimi
164 | niż
165 | no
166 | o
167 | obok
168 | od
169 | około
170 | on
171 | ona
172 | one
173 | oni
174 | ono
175 | oraz
176 | oto
177 | owszem
178 | pan
179 | pana
180 | pani
181 | po
182 | pod
183 | podczas
184 | pomimo
185 | ponad
186 | ponieważ
187 | powinien
188 | powinna
189 | powinni
190 | powinno
191 | poza
192 | prawie
193 | przecież
194 | przed
195 | przede
196 | przedtem
197 | przez
198 | przy
199 | roku
200 | również
201 | sam
202 | sama
203 | są
204 | się
205 | skąd
206 | sobie
207 | sobą
208 | sposób
209 | swoje
210 | ta
211 | tak
212 | taka
213 | taki
214 | takie
215 | także
216 | tam
217 | te
218 | tego
219 | tej
220 | temu
221 | ten
222 | teraz
223 | też
224 | to
225 | tobą
226 | tobie
227 | toteż
228 | trzeba
229 | tu
230 | tutaj
231 | twoi
232 | twoim
233 | twoja
234 | twoje
235 | twym
236 | twój
237 | ty
238 | tych
239 | tylko
240 | tym
241 | u
242 | w
243 | wam
244 | wami
245 | was
246 | wasz
247 | wasza
248 | wasze
249 | we
250 | według
251 | wiele
252 | wielu
253 | więc
254 | więcej
255 | wszyscy
256 | wszystkich
257 | wszystkie
258 | wszystkim
259 | wszystko
260 | wtedy
261 | wy
262 | właśnie
263 | z
264 | za
265 | zapewne
266 | zawsze
267 | ze
268 | zł
269 | znowu
270 | znów
271 | został
272 | żaden
273 | żadna
274 | żadne
275 | żadnych
276 | że
277 | żeby
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-pt.txt:
--------------------------------------------------------------------------------
1 | último
2 | é
3 | acerca
4 | agora
5 | algumas
6 | alguns
7 | ali
8 | ambos
9 | antes
10 | apontar
11 | aquela
12 | aquelas
13 | aquele
14 | aqueles
15 | aqui
16 | atrás
17 | bem
18 | bom
19 | cada
20 | caminho
21 | cima
22 | com
23 | como
24 | comprido
25 | conhecido
26 | corrente
27 | das
28 | debaixo
29 | dentro
30 | desde
31 | desligado
32 | deve
33 | devem
34 | deverá
35 | direita
36 | diz
37 | dizer
38 | dois
39 | dos
40 | e
41 | ela
42 | ele
43 | eles
44 | em
45 | enquanto
46 | então
47 | está
48 | estão
49 | estado
50 | estar
51 | estará
52 | este
53 | estes
54 | esteve
55 | estive
56 | estivemos
57 | estiveram
58 | eu
59 | fará
60 | faz
61 | fazer
62 | fazia
63 | fez
64 | fim
65 | foi
66 | fora
67 | horas
68 | iniciar
69 | inicio
70 | ir
71 | irá
72 | ista
73 | iste
74 | isto
75 | ligado
76 | maioria
77 | maiorias
78 | mais
79 | mas
80 | mesmo
81 | meu
82 | muito
83 | muitos
84 | nós
85 | não
86 | nome
87 | nosso
88 | novo
89 | o
90 | onde
91 | os
92 | ou
93 | outro
94 | para
95 | parte
96 | pegar
97 | pelo
98 | pessoas
99 | pode
100 | poderá
101 | podia
102 | por
103 | porque
104 | povo
105 | primeiro
106 | quê
107 | qual
108 | qualquer
109 | quando
110 | quem
111 | quieto
112 | são
113 | saber
114 | sem
115 | ser
116 | seu
117 | somente
118 | têm
119 | tal
120 | também
121 | tem
122 | tempo
123 | tenho
124 | tentar
125 | tentaram
126 | tente
127 | tentei
128 | teu
129 | teve
130 | tipo
131 | tive
132 | todos
133 | trabalhar
134 | trabalho
135 | tu
136 | um
137 | uma
138 | umas
139 | uns
140 | usa
141 | usar
142 | valor
143 | veja
144 | ver
145 | verdade
146 | verdadeiro
147 | você
148 |
--------------------------------------------------------------------------------
/goose/resources/text/stopwords-zh.txt:
--------------------------------------------------------------------------------
1 | 的
2 | 一
3 | 不
4 | 在
5 | 人
6 | 有
7 | 是
8 | 为
9 | 以
10 | 于
11 | 上
12 | 他
13 | 而
14 | 后
15 | 之
16 | 来
17 | 及
18 | 了
19 | 因
20 | 下
21 | 可
22 | 到
23 | 由
24 | 这
25 | 与
26 | 也
27 | 此
28 | 但
29 | 并
30 | 个
31 | 其
32 | 已
33 | 无
34 | 小
35 | 我
36 | 们
37 | 起
38 | 最
39 | 再
40 | 今
41 | 去
42 | 好
43 | 只
44 | 又
45 | 或
46 | 很
47 | 亦
48 | 某
49 | 把
50 | 那
51 | 你
52 | 乃
53 | 它
54 | 吧
55 | 被
56 | 比
57 | 别
58 | 趁
59 | 当
60 | 从
61 | 到
62 | 得
63 | 打
64 | 凡
65 | 儿
66 | 尔
67 | 该
68 | 各
69 | 给
70 | 跟
71 | 和
72 | 何
73 | 还
74 | 即
75 | 几
76 | 既
77 | 看
78 | 据
79 | 距
80 | 靠
81 | 啦
82 | 了
83 | 另
84 | 么
85 | 每
86 | 们
87 | 嘛
88 | 拿
89 | 哪
90 | 那
91 | 您
92 | 凭
93 | 且
94 | 却
95 | 让
96 | 仍
97 | 啥
98 | 如
99 | 若
100 | 使
101 | 谁
102 | 虽
103 | 随
104 | 同
105 | 所
106 | 她
107 | 哇
108 | 嗡
109 | 往
110 | 哪
111 | 些
112 | 向
113 | 沿
114 | 哟
115 | 用
116 | 于
117 | 咱
118 | 则
119 | 怎
120 | 曾
121 | 至
122 | 致
123 | 着
124 | 诸
125 | 自
126 | 為
127 | 於
128 | 後
129 | 這
130 | 與
131 | 並
132 | 個
133 | 無
134 | 們
135 | 當
136 | 從
137 | 兒
138 | 爾
139 | 該
140 | 給
141 | 還
142 | 幾
143 | 麼
144 | 憑
145 | 卻
146 | 讓
147 | 誰
148 | 雖
149 | 喲
150 | 則
151 | 諸
152 |
--------------------------------------------------------------------------------
/goose/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """\
3 | This is a python port of "Goose" originally licensed to Gravity.com
4 | under one or more contributor license agreements. See the NOTICE file
5 | distributed with this work for additional information
6 | regarding copyright ownership.
7 |
8 | Python port was written by Xavier Grangier for Recrutae
9 |
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License. You may obtain a copy of the License at
14 |
15 | http://www.apache.org/licenses/LICENSE-2.0
16 |
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 | import time
24 | import hashlib
25 | import re
26 | import os
27 | import goose
28 | import codecs
29 | import urlparse
30 |
31 |
class BuildURL(object):
    """Resolve URL components for ``url``, falling back to ``finalurl``.

    ``url`` is the (possibly relative) URL of interest and ``finalurl`` an
    optional second URL that supplies the hostname or scheme when ``url``
    itself does not carry one.
    """

    def __init__(self, url, finalurl=None):
        self.url = url
        self.finalurl = finalurl

    def getHostname(self, o):
        """Return the hostname of the parse result ``o``.

        When ``o`` has no hostname, the hostname of ``self.finalurl`` is
        used instead; ``None`` is returned when neither provides one.
        """
        if o.hostname:
            return o.hostname
        if self.finalurl:
            # ``urlparse`` is the module; the parser is ``urlparse.urlparse``.
            fallback = urlparse.urlparse(self.finalurl)
            if fallback.hostname:
                return fallback.hostname
        return None

    def getScheme(self, o):
        """Return the scheme of the parse result ``o``.

        When ``o`` has no scheme, the scheme of ``self.finalurl`` is used
        instead; ``'http'`` is the default when neither provides one.
        """
        if o.scheme:
            return o.scheme
        if self.finalurl:
            fallback = urlparse.urlparse(self.finalurl)
            if fallback.scheme:
                return fallback.scheme
        return 'http'

    def getUrl(self):
        """Parse ``self.url`` and resolve its scheme and hostname.

        NOTE(review): this method looks unfinished -- it computes the
        scheme and hostname but does not assemble or return a URL, so it
        always returns ``None``.
        """
        url_obj = urlparse.urlparse(self.url)
        scheme = self.getScheme(url_obj)
        hostname = self.getHostname(url_obj)
63 |
class FileHelper(object):
    """Helper for reading text resources."""

    @classmethod
    def loadResourceFile(cls, filename):
        """Return the UTF-8 decoded content of ``filename``.

        An absolute ``filename`` is opened as given; a relative one is
        looked up inside the ``resources`` directory that sits next to the
        ``goose`` package.

        Raises ``IOError`` (with the resolved path in the message) when the
        file cannot be opened or read.
        """
        # The check must use the argument itself, not the literal string
        # 'filename' (which is never absolute).
        if os.path.isabs(filename):
            path = filename
        else:
            dirpath = os.path.dirname(goose.__file__)
            path = os.path.join(dirpath, 'resources', filename)
        try:
            # ``with`` guarantees the handle is closed even if read() fails.
            with codecs.open(path, 'r', 'utf-8') as f:
                return f.read()
        except IOError:
            raise IOError("Couldn't open file %s" % path)
80 |
81 |
class ParsingCandidate(object):
    """Pair of a URL and the link hash computed for it."""

    def __init__(self, urlString, link_hash):
        # The URL is exposed under two attribute names.
        self.url = urlString
        self.urlString = urlString
        self.link_hash = link_hash
87 |
88 |
class RawHelper(object):
    """Builds a ParsingCandidate from HTML that is already in memory."""

    @classmethod
    def get_parsing_candidate(cls, url, raw_html):
        """Return a ParsingCandidate for ``url`` hashed from ``raw_html``.

        The link hash is the MD5 hex digest of the HTML (unicode input is
        UTF-8 encoded first) followed by a dot and the current timestamp.
        """
        html_bytes = raw_html
        if isinstance(html_bytes, unicode):
            html_bytes = html_bytes.encode('utf-8')
        digest = hashlib.md5(html_bytes).hexdigest()
        return ParsingCandidate(url, '%s.%s' % (digest, time.time()))
96 |
97 |
class URLHelper(object):
    """Builds a ParsingCandidate from a URL that is about to be crawled."""

    @classmethod
    def get_parsing_candidate(cls, url_to_crawl):
        """Return a ParsingCandidate for ``url_to_crawl``.

        Every ``#!`` in the URL is rewritten to ``?_escaped_fragment_=``.
        The link hash is the MD5 hex digest of the rewritten URL followed
        by a dot and the current timestamp.
        """
        # replace the shebang in urls
        if '#!' in url_to_crawl:
            final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=')
        else:
            final_url = url_to_crawl
        digest = hashlib.md5(final_url).hexdigest()
        return ParsingCandidate(final_url, '%s.%s' % (digest, time.time()))
106 |
107 |
class StringSplitter(object):
    """Splits strings on a regular expression compiled once up front."""

    def __init__(self, pattern):
        self.pattern = re.compile(pattern)

    def split(self, string):
        """Return the pieces of ``string``; a falsy input yields ``[]``."""
        return self.pattern.split(string) if string else []
119 |
120 |
class StringReplacement(object):
    """A single literal (non-regex) substring replacement."""

    def __init__(self, pattern, replaceWith):
        self.pattern = pattern
        self.replaceWith = replaceWith

    def replaceAll(self, string):
        """Replace every occurrence of the pattern in ``string``.

        A falsy ``string`` yields an empty unicode string.
        """
        if string:
            return string.replace(self.pattern, self.replaceWith)
        return u''
131 |
132 |
class ReplaceSequence(object):
    """Ordered chain of StringReplacement steps applied one after another."""

    def __init__(self):
        self.replacements = []

    def create(self, firstPattern, replaceWith=None):
        """Add a replacement step and return ``self`` so calls can chain.

        A missing or falsy ``replaceWith`` means "replace with nothing".
        """
        substitute = replaceWith or u''
        self.replacements.append(StringReplacement(firstPattern, substitute))
        return self

    def append(self, pattern, replaceWith=None):
        """Alias of ``create``."""
        return self.create(pattern, replaceWith)

    def replaceAll(self, string):
        """Run ``string`` through every step in insertion order.

        A falsy ``string`` yields an empty unicode string.
        """
        if not string:
            return u''

        current = string
        for step in self.replacements:
            current = step.replaceAll(current)
        return current
156 |
--------------------------------------------------------------------------------
/goose/version.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """\
3 | This is a python port of "Goose" originally licensed to Gravity.com
4 | under one or more contributor license agreements. See the NOTICE file
5 | distributed with this work for additional information
6 | regarding copyright ownership.
7 |
8 | Python port was written by Xavier Grangier for Recrutae
9 |
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License. You may obtain a copy of the License at
14 |
15 | http://www.apache.org/licenses/LICENSE-2.0
16 |
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 |
# Version as a (major, minor, patch) tuple and as a dotted string.
version_info = (1, 0, 22)
__version__ = ".".join(str(part) for part in version_info)
26 |
--------------------------------------------------------------------------------
/goose/videos/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/goose/videos/__init__.py
--------------------------------------------------------------------------------
/goose/videos/videos.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """\
3 | This is a python port of "Goose" originally licensed to Gravity.com
4 | under one or more contributor license agreements. See the NOTICE file
5 | distributed with this work for additional information
6 | regarding copyright ownership.
7 |
8 | Python port was written by Xavier Grangier for Recrutae
9 |
10 | Gravity.com licenses this file
11 | to you under the Apache License, Version 2.0 (the "License");
12 | you may not use this file except in compliance
13 | with the License. You may obtain a copy of the License at
14 |
15 | http://www.apache.org/licenses/LICENSE-2.0
16 |
17 | Unless required by applicable law or agreed to in writing, software
18 | distributed under the License is distributed on an "AS IS" BASIS,
19 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20 | See the License for the specific language governing permissions and
21 | limitations under the License.
22 | """
23 |
class Video(object):
    """\
    Plain record describing one embedded video; every field starts as None.
    """

    def __init__(self):
        # embed_type: kind of embed (embed, object, iframe)
        # provider:   video provider name
        self.embed_type = self.provider = None

        # dimensions
        self.width = self.height = None

        # embed_code: the embed code; src: the video source
        self.embed_code = self.src = None
49 |
--------------------------------------------------------------------------------
/httplib2/iri2uri.py:
--------------------------------------------------------------------------------
1 | """
2 | iri2uri
3 |
4 | Converts an IRI to a URI.
5 |
6 | """
7 | __author__ = "Joe Gregorio (joe@bitworking.org)"
8 | __copyright__ = "Copyright 2006, Joe Gregorio"
9 | __contributors__ = []
10 | __version__ = "1.0.0"
11 | __license__ = "MIT"
12 | __history__ = """
13 | """
14 |
15 | import urlparse
16 |
17 |
18 | # Convert an IRI to a URI following the rules in RFC 3987
19 | #
20 | # The characters we need to encode and escape are defined in the spec:
21 | #
22 | # iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
23 | # ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
24 | # / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
25 | # / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
26 | # / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
27 | # / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
28 | # / %xD0000-DFFFD / %xE1000-EFFFD
29 |
# Inclusive (low, high) code point ranges that must be percent-escaped.
# Keep this list sorted by ``low``: encode() stops scanning at the first
# range that starts above the code point being tested.
escape_range = [
    (0xA0, 0xD7FF),
    (0xE000, 0xF8FF),
    (0xF900, 0xFDCF),
    (0xFDF0, 0xFFEF),
    (0x10000, 0x1FFFD),
    (0x20000, 0x2FFFD),
    (0x30000, 0x3FFFD),
    (0x40000, 0x4FFFD),
    (0x50000, 0x5FFFD),
    (0x60000, 0x6FFFD),
    (0x70000, 0x7FFFD),
    (0x80000, 0x8FFFD),
    (0x90000, 0x9FFFD),
    (0xA0000, 0xAFFFD),
    (0xB0000, 0xBFFFD),
    (0xC0000, 0xCFFFD),
    (0xD0000, 0xDFFFD),
    (0xE1000, 0xEFFFD),
    (0xF0000, 0xFFFFD),
    (0x100000, 0x10FFFD),
]


def encode(c):
    """Return ``c`` percent-escaped if it falls inside ``escape_range``.

    ``c`` is a single character. When its code point lies in one of the
    ranges, the character is UTF-8 encoded and each resulting octet is
    written as ``%XX`` (upper-case hex); otherwise ``c`` is returned
    unchanged.
    """
    retval = c
    i = ord(c)
    for low, high in escape_range:
        if i < low:
            # Ranges are sorted, so no later range can match either.
            break
        if low <= i <= high:
            # bytearray yields integer octets on every Python version, and
            # %02X zero-pads so each octet is always two hex digits.
            octets = bytearray(c.encode('utf-8'))
            retval = "".join(["%%%02X" % octet for octet in octets])
            break
    return retval
63 |
64 |
def iri2uri(uri):
    """Convert an IRI to a URI.

    The IRI must be passed in as a unicode string; do not UTF-8 encode it
    first. Anything that is not a unicode string is returned unchanged.
    """
    if not isinstance(uri, unicode):
        return uri

    scheme, authority, path, query, fragment = urlparse.urlsplit(uri)
    # The host part is converted with the 'idna' codec.
    idna_authority = authority.encode('idna')
    rebuilt = urlparse.urlunsplit(
        (scheme, idna_authority, path, query, fragment))
    # Each character in 'ucschar' or 'iprivate' is UTF-8 encoded and every
    # resulting octet is %-encoded; encode() leaves other characters alone.
    escaped = [encode(character) for character in rebuilt]
    return "".join(escaped)
78 |
if __name__ == "__main__":
    import unittest

    class Test(unittest.TestCase):

        def test_uris(self):
            """Test that URIs are invariant under the transformation."""
            # Plain ASCII URIs contain nothing that needs escaping.
            unchanged = (
                u"ftp://ftp.is.co.za/rfc/rfc1808.txt",
                u"http://www.ietf.org/rfc/rfc2396.txt",
                u"ldap://[2001:db8::7]/c=GB?objectClass?one",
                u"mailto:John.Doe@example.com",
                u"news:comp.infosystems.www.servers.unix",
                u"tel:+1-816-555-1212",
                u"telnet://192.0.2.16:80/",
                u"urn:oasis:names:specification:docbook:dtd:xml:4.1.2",
            )
            for candidate in unchanged:
                self.assertEqual(candidate, iri2uri(candidate))

        def test_iri(self):
            """ Test that the right type of escaping is done for each part of the URI."""
            # (expected URI, input IRI) pairs covering host, path, query
            # and fragment.
            cases = [
                ("http://xn--o3h.com/%E2%98%84", u"http://\N{COMET}.com/\N{COMET}"),
                ("http://bitworking.org/?fred=%E2%98%84", u"http://bitworking.org/?fred=\N{COMET}"),
                ("http://bitworking.org/#%E2%98%84", u"http://bitworking.org/#\N{COMET}"),
                ("#%E2%98%84", u"#\N{COMET}"),
                ("/fred?bar=%E2%98%9A#%E2%98%84", u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}"),
            ]
            for expected, iri in cases:
                self.assertEqual(expected, iri2uri(iri))
            # Converting twice gives the same result as converting once.
            self.assertEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}")))
            # A pre-encoded byte string is not converted.
            self.assertNotEqual("/fred?bar=%E2%98%9A#%E2%98%84", iri2uri(u"/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}".encode('utf-8')))

    unittest.main()
109 |
110 |
111 |
--------------------------------------------------------------------------------
/httplib2/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/httplib2/test/__init__.py
--------------------------------------------------------------------------------
/httplib2/test/brokensocket/socket.py:
--------------------------------------------------------------------------------
1 | from realsocket import gaierror, error, getaddrinfo, SOCK_STREAM
2 |
--------------------------------------------------------------------------------
/httplib2/test/functional/test_proxies.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import errno
3 | import os
4 | import signal
5 | import subprocess
6 | import tempfile
7 |
8 | import nose
9 |
10 | import httplib2
11 | from httplib2 import socks
12 | from httplib2.test import miniserver
13 |
# Template for the tinyproxy configuration file; filled in with a mapping
# providing the 'user', 'port', 'pidfile' and 'logfile' keys.
tinyproxy_cfg = """
User "%(user)s"
Port %(port)s
Listen 127.0.0.1
PidFile "%(pidfile)s"
LogFile "%(logfile)s"
MaxClients 2
StartServers 1
LogLevel Info
"""
24 |
25 |
class FunctionalProxyHttpTest(unittest.TestCase):
    """Functional test that sends an httplib2 request through tinyproxy.

    setUp starts a local miniserver and a tinyproxy process configured from
    ``tinyproxy_cfg``; tearDown stops both and removes the temporary files.
    """

    def setUp(self):
        """Start the miniserver and tinyproxy.

        Raises nose.SkipTest when the socks or subprocess module is falsy,
        or when the tinyproxy executable cannot be found.
        """
        if not socks:
            raise nose.SkipTest('socks module unavailable')
        if not subprocess:
            raise nose.SkipTest('subprocess module unavailable')

        # start a short-lived miniserver so we can get a likely port
        # for the proxy
        self.httpd, self.proxyport = miniserver.start_server(
            miniserver.ThisDirHandler)
        self.httpd.shutdown()
        # This second server is the one the test actually requests from.
        self.httpd, self.port = miniserver.start_server(
            miniserver.ThisDirHandler)

        # Only names are reserved for the pid and log files; the config
        # file is created here and written below.
        self.pidfile = tempfile.mktemp()
        self.logfile = tempfile.mktemp()
        fd, self.conffile = tempfile.mkstemp()
        f = os.fdopen(fd, 'w')
        our_cfg = tinyproxy_cfg % {'user': os.getlogin(),
                                   'pidfile': self.pidfile,
                                   'port': self.proxyport,
                                   'logfile': self.logfile}
        f.write(our_cfg)
        f.close()
        try:
            # TODO use subprocess.check_call when 2.4 is dropped
            ret = subprocess.call(['tinyproxy', '-c', self.conffile])
            self.assertEqual(0, ret)
        except OSError, e:
            # ENOENT means the tinyproxy executable was not found.
            if e.errno == errno.ENOENT:
                raise nose.SkipTest('tinyproxy not available')
            raise

    def tearDown(self):
        """Stop the miniserver and tinyproxy, then delete the temp files."""
        self.httpd.shutdown()
        try:
            # The pid is read from the PidFile named in the configuration.
            pid = int(open(self.pidfile).read())
            os.kill(pid, signal.SIGTERM)
        except OSError, e:
            # ESRCH: no process with that pid; dump the proxy log to help
            # diagnose why it is not running.
            if e.errno == errno.ESRCH:
                print '\n\n\nTinyProxy Failed to start, log follows:'
                print open(self.logfile).read()
                print 'end tinyproxy log\n\n\n'
            raise
        map(os.unlink, (self.pidfile,
                        self.logfile,
                        self.conffile))

    def testSimpleProxy(self):
        """Fetch miniserver.py through the proxy and check body and log."""
        proxy_info = httplib2.ProxyInfo(socks.PROXY_TYPE_HTTP,
                                        'localhost', self.proxyport)
        client = httplib2.Http(proxy_info=proxy_info)
        src = 'miniserver.py'
        response, body = client.request('http://localhost:%d/%s' %
                                        (self.port, src))
        self.assertEqual(response.status, 200)
        self.assertEqual(body, open(os.path.join(miniserver.HERE, src)).read())
        # The proxy log must show that the request went through tinyproxy.
        lf = open(self.logfile).read()
        expect = ('Established connection to host "127.0.0.1" '
                  'using file descriptor')
        self.assertTrue(expect in lf,
                        'tinyproxy did not proxy a request for miniserver')
89 |
--------------------------------------------------------------------------------
/httplib2/test/miniserver.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import select
4 | import SimpleHTTPServer
5 | import SocketServer
6 | import threading
7 |
# Directory containing this module; requests are served relative to it.
HERE = os.path.dirname(__file__)
logger = logging.getLogger(__name__)


class ThisDirHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Request handler that serves files from this module's directory."""

    def translate_path(self, path):
        """Map a request path onto a filesystem path below ``HERE``.

        The query string and fragment are dropped and empty path segments
        are ignored.
        """
        without_query = path.partition('?')[0]
        bare_path = without_query.partition('#')[0]
        segments = [part for part in bare_path.split('/') if part]
        return os.path.join(HERE, *segments)

    def log_message(self, s, *args):
        # output via logging so nose can catch it
        logger.info(s, *args)
20 |
21 |
class ShutdownServer(SocketServer.TCPServer):
    """TCPServer whose serve_forever() loop can be stopped via shutdown().

    The methods in this class are backported from SocketServer.py in the
    Python 2.6.4 standard library. The class is unnecessary in 2.6 and
    later, when BaseServer supports the shutdown method directly.
    """

    def __init__(self, *args, **kwargs):
        SocketServer.TCPServer.__init__(self, *args, **kwargs)
        # Set once the serve_forever() loop has exited.
        self.__is_shut_down = threading.Event()
        # Loop flag; shutdown() clears it to stop serve_forever().
        self.__serving = False

    def serve_forever(self, poll_interval=0.1):
        """Handle one request at a time until shutdown.

        Polls for shutdown every poll_interval seconds. Ignores
        self.timeout. If you need to do periodic tasks, do them in
        another thread.
        """
        self.__serving = True
        self.__is_shut_down.clear()
        try:
            while self.__serving:
                r, w, e = select.select([self.socket], [], [], poll_interval)
                if r:
                    self._handle_request_noblock()
        finally:
            # Always signal completion, even when the loop dies with an
            # exception; otherwise shutdown() would block forever.
            self.__is_shut_down.set()

    def shutdown(self):
        """Stops the serve_forever loop.

        Blocks until the loop has finished. This must be called while
        serve_forever() is running in another thread, or it will deadlock.
        """
        self.__serving = False
        self.__is_shut_down.wait()

    def handle_request(self):
        """Handle one request, possibly blocking.

        Respects self.timeout.
        """
        # Support people who used socket.settimeout() to escape
        # handle_request before self.timeout was available.
        timeout = self.socket.gettimeout()
        if timeout is None:
            timeout = self.timeout
        elif self.timeout is not None:
            timeout = min(timeout, self.timeout)
        fd_sets = select.select([self], [], [], timeout)
        if not fd_sets[0]:
            self.handle_timeout()
            return
        self._handle_request_noblock()

    def _handle_request_noblock(self):
        """Handle one request, without blocking.

        I assume that select.select has returned that the socket is
        readable before this function was called, so there should be
        no risk of blocking in get_request().
        """
        # Imported here because this module's import block does not
        # include ``socket``.
        import socket
        try:
            request, client_address = self.get_request()
        except socket.error:
            # Nothing could be accepted; give up on this request.
            return
        if self.verify_request(request, client_address):
            try:
                self.process_request(request, client_address)
            except:
                # Deliberately broad: any failure while processing is
                # reported through handle_error and the request is closed.
                self.handle_error(request, client_address)
                self.close_request(request)
94 |
95 |
def start_server(handler):
    """Start a ShutdownServer for *handler* in a background thread.

    The server binds to ("", 0), so the operating system picks the port.
    Returns the (server, port) pair.
    """
    server = ShutdownServer(("", 0), handler)
    worker = threading.Thread(target=server.serve_forever)
    worker.start()
    _, bound_port = server.socket.getsockname()
    return server, bound_port
101 |
--------------------------------------------------------------------------------
/httplib2/test/other_cacerts.txt:
--------------------------------------------------------------------------------
1 | # Certificate Authority certificates for validating SSL connections.
2 | #
3 | # This file contains PEM format certificates generated from
4 | # http://mxr.mozilla.org/seamonkey/source/security/nss/lib/ckfw/builtins/certdata.txt
5 | #
6 | # ***** BEGIN LICENSE BLOCK *****
7 | # Version: MPL 1.1/GPL 2.0/LGPL 2.1
8 | #
9 | # The contents of this file are subject to the Mozilla Public License Version
10 | # 1.1 (the "License"); you may not use this file except in compliance with
11 | # the License. You may obtain a copy of the License at
12 | # http://www.mozilla.org/MPL/
13 | #
14 | # Software distributed under the License is distributed on an "AS IS" basis,
15 | # WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
16 | # for the specific language governing rights and limitations under the
17 | # License.
18 | #
19 | # The Original Code is the Netscape security libraries.
20 | #
21 | # The Initial Developer of the Original Code is
22 | # Netscape Communications Corporation.
23 | # Portions created by the Initial Developer are Copyright (C) 1994-2000
24 | # the Initial Developer. All Rights Reserved.
25 | #
26 | # Contributor(s):
27 | #
28 | # Alternatively, the contents of this file may be used under the terms of
29 | # either the GNU General Public License Version 2 or later (the "GPL"), or
30 | # the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
31 | # in which case the provisions of the GPL or the LGPL are applicable instead
32 | # of those above. If you wish to allow use of your version of this file only
33 | # under the terms of either the GPL or the LGPL, and not to allow others to
34 | # use your version of this file under the terms of the MPL, indicate your
35 | # decision by deleting the provisions above and replace them with the notice
36 | # and other provisions required by the GPL or the LGPL. If you do not delete
37 | # the provisions above, a recipient may use your version of this file under
38 | # the terms of any one of the MPL, the GPL or the LGPL.
39 | #
40 | # ***** END LICENSE BLOCK *****
41 |
42 |
43 | Comodo CA Limited, CN=Trusted Certificate Services
44 | ==================================================
45 |
46 | -----BEGIN CERTIFICATE-----
47 | MIIEQzCCAyugAwIBAgIBATANBgkqhkiG9w0BAQUFADB/MQswCQYDVQQGEwJHQjEb
48 | MBkGA1UECAwSR3JlYXRlciBNYW5jaGVzdGVyMRAwDgYDVQQHDAdTYWxmb3JkMRow
49 | GAYDVQQKDBFDb21vZG8gQ0EgTGltaXRlZDElMCMGA1UEAwwcVHJ1c3RlZCBDZXJ0
50 | aWZpY2F0ZSBTZXJ2aWNlczAeFw0wNDAxMDEwMDAwMDBaFw0yODEyMzEyMzU5NTla
51 | MH8xCzAJBgNVBAYTAkdCMRswGQYDVQQIDBJHcmVhdGVyIE1hbmNoZXN0ZXIxEDAO
52 | BgNVBAcMB1NhbGZvcmQxGjAYBgNVBAoMEUNvbW9kbyBDQSBMaW1pdGVkMSUwIwYD
53 | VQQDDBxUcnVzdGVkIENlcnRpZmljYXRlIFNlcnZpY2VzMIIBIjANBgkqhkiG9w0B
54 | AQEFAAOCAQ8AMIIBCgKCAQEA33FvNlhTWvI2VFeAxHQIIO0Yfyod5jWaHiWsnOWW
55 | fnJSoBVC21ndZHoa0Lh73TkVvFVIxO06AOoxEbrycXQaZ7jPM8yoMa+j49d/vzMt
56 | TGo87IvDktJTdyR0nAducPy9C1t2ul/y/9c3S0pgePfw+spwtOpZqqPOSC+pw7IL
57 | fhdyFgymBwwbOM/JYrc/oJOlh0Hyt3BAd9i+FHzjqMB6juljatEPmsbS9Is6FARW
58 | 1O24zG71++IsWL1/T2sr92AkWCTOJu80kTrV44HQsvAEAtdbtz6SrGsSivnkBbA7
59 | kUlcsutT6vifR4buv5XAwAaf0lteERv0xwQ1KdJVXOTt6wIDAQABo4HJMIHGMB0G
60 | A1UdDgQWBBTFe1i97doladL3WRaoszLAeydb9DAOBgNVHQ8BAf8EBAMCAQYwDwYD
61 | VR0TAQH/BAUwAwEB/zCBgwYDVR0fBHwwejA8oDqgOIY2aHR0cDovL2NybC5jb21v
62 | ZG9jYS5jb20vVHJ1c3RlZENlcnRpZmljYXRlU2VydmljZXMuY3JsMDqgOKA2hjRo
63 | dHRwOi8vY3JsLmNvbW9kby5uZXQvVHJ1c3RlZENlcnRpZmljYXRlU2VydmljZXMu
64 | Y3JsMA0GCSqGSIb3DQEBBQUAA4IBAQDIk4E7ibSvuIQSTI3S8NtwuleGFTQQuS9/
65 | HrCoiWChisJ3DFBKmwCL2Iv0QeLQg4pKHBQGsKNoBXAxMKdTmw7pSqBYaWcOrp32
66 | pSxBvzwGa+RZzG0Q8ZZvH9/0BAKkn0U+yNj6NkZEUD+Cl5EfKNsYEYwq5GWDVxIS
67 | jBc/lDb+XbDABHcTuPQV1T84zJQ6VdCsmPW6AF/ghhmBeC8owH7TzEIK9a5QoNE+
68 | xqFx7D+gIIxmOom0jtTYsU0lR+4viMi14QVFwL4Ucd56/Y57fU0IlqUSc/Atyjcn
69 | dBInTMu2l+nZrghtWjlA3QVHdWpaIbOjGM9O9y5Xt5hwXsjEeLBi
70 | -----END CERTIFICATE-----
71 |
--------------------------------------------------------------------------------
/httplib2/test/smoke_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | import httplib2
5 |
6 | from httplib2.test import miniserver
7 |
8 |
class HttpSmokeTest(unittest.TestCase):
    """Fetches a file through httplib2 from a local miniserver instance."""

    def setUp(self):
        # Start a fresh server for every test.
        self.httpd, self.port = miniserver.start_server(
            miniserver.ThisDirHandler)

    def tearDown(self):
        self.httpd.shutdown()

    def testGetFile(self):
        """The served body must equal the file's contents on disk."""
        client = httplib2.Http()
        src = 'miniserver.py'
        response, body = client.request('http://localhost:%d/%s' %
                                        (self.port, src))
        self.assertEqual(response.status, 200)
        # Read the expected contents with a context manager so the file
        # handle is closed instead of being left to the garbage collector.
        with open(os.path.join(miniserver.HERE, src)) as expected_file:
            expected = expected_file.read()
        self.assertEqual(body, expected)
24 |
--------------------------------------------------------------------------------
/httplib2/test/test_no_socket.py:
--------------------------------------------------------------------------------
1 | """Tests for httplib2 when the socket module is missing.
2 |
3 | This helps ensure compatibility with environments such as AppEngine.
4 | """
5 | import os
6 | import sys
7 | import unittest
8 |
9 | import httplib2
10 |
class MissingSocketTest(unittest.TestCase):
    """Checks httplib2's behaviour when its socks module is unavailable."""

    def setUp(self):
        # Remember the real module, then hide it for the test's duration.
        self._oldsocks = httplib2.socks
        httplib2.socks = None

    def tearDown(self):
        httplib2.socks = self._oldsocks

    def testProxyDisabled(self):
        """A proxied request must raise ProxiesUnavailableError."""
        proxy = httplib2.ProxyInfo('blah', 'localhost', 0)
        http = httplib2.Http(proxy_info=proxy)
        expected_error = httplib2.ProxiesUnavailableError
        self.assertRaises(expected_error,
                          http.request, 'http://localhost:-1/')
25 |
--------------------------------------------------------------------------------
/index.yaml:
--------------------------------------------------------------------------------
1 | indexes:
2 |
3 | # AUTOGENERATED
4 |
5 | # This index.yaml is automatically updated whenever the dev_appserver
6 | # detects that a new type of query is run. If you want to manage the
7 | # index.yaml file manually, remove the above marker line (the line
8 | # saying "# AUTOGENERATED"). If you want to manage some indexes
9 | # manually, move them above the marker line. The index.yaml file is
10 | # automatically uploaded to the admin console when you next deploy
11 | # your application using appcfg.py.
12 |
13 |
--------------------------------------------------------------------------------
/instructions.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Astrobot Instructions
6 |
15 |
16 |
17 |
18 | Astrobot Instructions
19 | Coming soon...
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2007 Google Inc.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | import webapp2
18 | import uuid
19 | import os
20 | import base64
21 | import pickle
22 | from google.appengine.ext import ndb
23 | import browse
24 | from xml.sax import saxutils
25 | import json
26 |
class State(ndb.Model):
    """Datastore entity holding one conversation's pickled state."""
    # Compressed blob; interact() stores pickle.dumps() output here.
    pickled = ndb.BlobProperty(compressed=True)
29 |
def interact(query, stateid):
    """Run one query against the stored state for *stateid*.

    Loads (or creates) the State entity keyed by *stateid*, unpickles its
    state dict (an empty dict when nothing is stored yet), passes it to
    browse.interact together with *query*, then pickles the dict back and
    saves the entity.

    Returns whatever browse.interact returned.
    """
    # Function-scope import so the module's import block is left untouched.
    import logging

    state = State.get_or_insert(stateid)
    # NOTE: pickle.loads is only applied to data this function wrote itself.
    unpickled_state = pickle.loads(state.pickled) if state.pickled else {}
    messages = browse.interact(query, unpickled_state)
    # Was a bare Python-2 print statement left over from debugging.
    logging.debug("MESSAGES %s", messages)
    state.pickled = pickle.dumps(unpickled_state)
    state.put()
    return messages
38 |
class MainHandler(webapp2.RequestHandler):
    """Serves the contents of page.html."""

    def get(self):
        # Context manager closes the file handle as soon as it is read.
        with open('page.html') as page:
            self.response.write(page.read())
42 |
class Interact(webapp2.RequestHandler):
    """JSON endpoint that runs a query for the session in the cookie."""

    def post(self):
        """Answer the 'query' form field with {"messages": [...]} as JSON.

        The session is identified by the 'stateid' cookie; when the cookie
        is absent a new random id is generated and set on the response.
        """
        query = self.request.get('query')
        stateid = self.request.cookies.get('stateid', None)
        if stateid is None:
            raw_token = uuid.uuid4().bytes + os.urandom(64)
            stateid = base64.b64encode(raw_token)
            self.response.set_cookie('stateid', stateid, max_age=3600*20)
        payload = {"messages": interact(query, stateid)}
        self.response.write(json.dumps(payload))
51 |
class Twilio(webapp2.RequestHandler):
    """Webhook that answers an incoming SMS with the interact() messages."""

    def post(self):
        """Run the 'Body' field as a query for the sender in 'From'.

        The state id is 'phone:' followed by the sender's number. If
        interact() raises, the error is logged and a single apology message
        is sent instead.
        """
        # Function-scope import so the module's import block is left untouched.
        import logging

        from_phone = self.request.get('From')
        query = self.request.get('Body')
        try:
            messages = interact(query, 'phone:'+from_phone)
        except Exception:
            # Keep the best-effort reply, but record the traceback instead
            # of swallowing the failure silently.
            logging.exception("interact() failed for an incoming SMS")
            messages = ["Oops, something went wrong."]
        self.response.content_type = 'text/xml'
        # NOTE(review): the three literals below carry no markup in this view;
        # presumably the XML wrapper tags were lost -- confirm against the
        # deployed file before relying on this output.
        self.response.write('')
        for msg in messages:
            self.response.write(u"{0}".format(saxutils.escape(msg)))
        self.response.write("")
66 |
67 |
class InstructionsHandler(webapp2.RequestHandler):
    """Serves the contents of instructions.html."""

    def get(self):
        # Context manager closes the file handle as soon as it is read.
        with open('instructions.html') as page:
            self.response.write(page.read())
71 |
# WSGI application object: maps each URL path to its request handler.
# NOTE(review): debug=True is presumably intended for development only --
# confirm before deploying.
app = webapp2.WSGIApplication([
    ('/', MainHandler),
    ('/instructions', InstructionsHandler),
    ('/interact', Interact),
    ('/twilio', Twilio)
], debug=True)
78 |
--------------------------------------------------------------------------------
/page.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Astrobot: browse the web with pure SMS
5 |
6 |
7 |
27 |
82 |
83 |
84 |
85 |
86 | astro-bot
87 | Browse the web over sms. Text 646-576-7688.
88 | Things you can say...
89 |
97 |
98 |
99 |
100 |
104 |
105 |
106 |
--------------------------------------------------------------------------------
/parse_command.py:
--------------------------------------------------------------------------------
1 | from wise import Phrase, parse_phrase
2 |
# Example phrases handed to parse_phrase. Each Phrase pairs an intent name
# with a list of tokens; a token is either literal text or a [tag, text] pair.
examples = [
    Phrase("url", [["*url", "google.com"]]),
    Phrase("url", ["load", ["*url", "google.com"]]),
    Phrase("url", ["open", ["*url", "google.com"]]),
    Phrase("url", ["fetch", ["*url", "google.com"]]),
    Phrase("url", ["go to", ["*url", "google.com"]]),
    Phrase("url", ["show", ["*url", "google.com"]]),
    Phrase("search", ["search", ["~query", "hacker school"]]),
    Phrase("search", ["google", ["~query", "weather 11215"]]),
    Phrase("search", ["search the web for", ["~query", "kanye west"]]),
    Phrase("search", ["search for", ["~query", "hello world"]]),
    Phrase("search", ["search", ["search_source/wikipedia", "wikipedia"], "for", ["~query", "praying mantis"]]),
    Phrase("search", [["search_source/wikipedia", "wikipedia"], ["~query", "android"]]),
    Phrase("search", [["search_source/wikipedia", "show me the wikipedia article for"], ["~query", "the grateful dead"]]),
    Phrase("search", ["search", ["search_source/this_site", "this site"], "for", ["~query", "contact us"]]),
    Phrase("search", ["find", ["~query", "support"], "on", ["search_source/this_site", "this site"]]),
    Phrase("search", [["~query", "barack obama"]]),
    Phrase("more_text", ["more"]),
    Phrase("more_text", [["*number", "2"], "more pages"]),
    Phrase("more_text", [["*number", "3"], "more pages"]),
    Phrase("more_text", ["next"]),
    Phrase("more_text", ["next", ["*number", "4"]]),
    Phrase("previous_text", ["previous"]),
    Phrase("previous_text", ["last", ["*number", "3"]]),
    Phrase("previous_text", ["previous", ["*number", "7"], "messages"]),
    Phrase("previous_text", ["last part"]),
    Phrase("back_to_top", ["back to top of page"]),
    Phrase("navigate", ["click", ["*number", "6"]]),
    Phrase("navigate", [["*number", "7"]]),
    Phrase("navigate", [["*number", "7"]]),
    Phrase("navigate", [["*number", "7"]]),
    Phrase("navigate", ["click link", ["target", "hvuiehguo"]]),
    Phrase("navigate", ["click", ["target", "ihenigo"], ["on_last_page", "on last page"]]),
    Phrase("navigate", ["load", ["target", "jegotghr"], ["on_last_page", "from previous page"]]),
    Phrase("show_navigation", ["show navigation"]),
    Phrase("help", ["help me"]),
    Phrase("help", ["what are the options"]),
    Phrase("help", ["what can I say?"]),
    Phrase("summarize", ["summarize this page"]),
    Phrase("summarize", ["summarize", ["*number", "2"]]),
    # Fixed: the trigger text used to sit in the intent slot and the
    # [tag, text] pair was passed as two bare tokens.
    Phrase("summarize", ["show summary for", ["*number", "3"]]),
    Phrase("back", ["back"]),
    Phrase("whereami", ["where am i?"]),
    Phrase("whereami", ["what page am i on?"]),
    Phrase("whereami", ["current site"]),
    Phrase("contents", ["show me the table of contents"]),
    Phrase("contents", ["zoom out"]),
    Phrase("contents", ["list the headings on the page"])
]
# Regular expressions passed to parse_phrase together with the examples.
# The keys match the names used by the "*url" and "*number" tags
# (presumably "*"-prefixed tags are matched by these patterns -- confirm
# against the wise module).
regexes = {
    "url": r"[a-zA-Z0-9_\-\.]+\.[a-z]+(\/[^ ]*)?",
    "number": r"\-?[0-9]+(\.[0-9]+)?"
}
56 |
def parse_command(command_text):
    """Parse *command_text* with parse_phrase using this module's examples and regexes."""
    return parse_phrase(command_text, examples, regexes)
59 |
--------------------------------------------------------------------------------
/pybing/__init__.py:
--------------------------------------------------------------------------------
1 | # This file is part of PyBing (http://pybing.googlecode.com).
2 | #
3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
4 | # All rights reserved.
5 | #
6 | # This software is licensed as described in the file COPYING.txt,
7 | # which you should have received as part of this distribution.
8 |
9 | from bing import Bing
10 |
--------------------------------------------------------------------------------
/pybing/bing.py:
--------------------------------------------------------------------------------
1 | # This file is part of PyBing (http://pybing.googlecode.com).
2 | #
3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
4 | # All rights reserved.
5 | #
6 | # This software is licensed as described in the file COPYING.txt,
7 | # which you should have received as part of this distribution.
8 |
9 | """
10 | This module holds the Bing class which is used to create and execute queries
11 | against Bing.
12 | """
13 |
14 | import urllib
15 | import urllib2
16 |
17 | # Issue #1 (http://code.google.com/p/pybing/issues/detail?id=1)
18 | # Python 2.6 has json built in, 2.5 needs simplejson
19 | try:
20 | import json
21 | except ImportError:
22 | import simplejson as json
23 |
24 | from pybing import constants
25 |
class Bing(object):
    """Thin client that runs queries against the Bing JSON endpoint."""

    def __init__(self, app_id):
        # Sent as the 'AppId' parameter of every request.
        self.app_id = app_id

    def search(self, query, source_type=None, api_version=None, extra_params=None, **kwargs):
        """Run *query* and return the decoded JSON response.

        source_type and api_version fall back to the defaults in
        pybing.constants. Additional keyword arguments and the optional
        extra_params mapping are sent as request parameters; extra_params
        is applied last and therefore wins on conflicts.
        """
        kwargs.update({
            'AppId': self.app_id,
            'Version': api_version or constants.API_VERSION,
            'Query': query,
            'Sources': source_type or constants.DEFAULT_SOURCE_TYPE,
        })

        if extra_params:
            kwargs.update(extra_params)

        query_string = urllib.urlencode(kwargs)
        contents = urllib2.urlopen(constants.JSON_ENDPOINT + '?' + query_string)
        try:
            return json.loads(contents.read())
        finally:
            # Release the connection even when reading or decoding fails.
            contents.close()

    def search_web(self, query):
        """Search with the Web source type."""
        return self.search(query, source_type=constants.WEB_SOURCE_TYPE)

    def search_image(self, query):
        """Search with the Image source type."""
        return self.search(query, source_type=constants.IMAGE_SOURCE_TYPE)

    def search_news(self, query):
        """Search with the News source type."""
        return self.search(query, source_type=constants.NEWS_SOURCE_TYPE)

    def search_spell(self, query):
        """Search with the Spell source type."""
        return self.search(query, source_type=constants.SPELL_SOURCE_TYPE)

    def search_related(self, query):
        """Search with the RelatedSearch source type."""
        return self.search(query, source_type=constants.RELATED_SOURCE_TYPE)

    def search_phonebook(self, query):
        """Search with the Phonebook source type."""
        return self.search(query, source_type=constants.PHONEBOOK_SOURCE_TYPE)

    def search_answers(self, query):
        """Search with the InstanceAnswer source type."""
        return self.search(query, source_type=constants.ANSWERS_SOURCE_TYPE)
65 |
--------------------------------------------------------------------------------
/pybing/constants.py:
--------------------------------------------------------------------------------
1 | # This file is part of PyBing (http://pybing.googlecode.com).
2 | #
3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
4 | # All rights reserved.
5 | #
6 | # This software is licensed as described in the file COPYING.txt,
7 | # which you should have received as part of this distribution.
8 |
9 | """
10 | This module holds any constants used when querying Bing.
11 | """
12 |
# Default value for the 'Version' request parameter.
API_VERSION = '2.0'
# Base URL; the encoded query string is appended after a '?'.
JSON_ENDPOINT = 'http://api.search.live.net/json.aspx'
# Largest per-request count used when paging through results.
MAX_PAGE_SIZE = 50
# Upper bound on the absolute result index that is ever requested.
MAX_RESULTS = 1000

# Values for the 'Sources' request parameter.
WEB_SOURCE_TYPE = 'Web'
IMAGE_SOURCE_TYPE = 'Image'
NEWS_SOURCE_TYPE = 'News'
SPELL_SOURCE_TYPE = 'Spell'
RELATED_SOURCE_TYPE = 'RelatedSearch'
PHONEBOOK_SOURCE_TYPE = 'Phonebook'
ANSWERS_SOURCE_TYPE = 'InstanceAnswer'

# Every source type defined above.
SOURCE_TYPES = (
    WEB_SOURCE_TYPE,
    IMAGE_SOURCE_TYPE,
    NEWS_SOURCE_TYPE,
    SPELL_SOURCE_TYPE,
    RELATED_SOURCE_TYPE,
    PHONEBOOK_SOURCE_TYPE,
    ANSWERS_SOURCE_TYPE,
)

# Used when a caller does not name a source type.
DEFAULT_SOURCE_TYPE = WEB_SOURCE_TYPE
37 |
--------------------------------------------------------------------------------
/pybing/query/__init__.py:
--------------------------------------------------------------------------------
1 | # This file is part of PyBing (http://pybing.googlecode.com).
2 | #
3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
4 | # All rights reserved.
5 | #
6 | # This software is licensed as described in the file COPYING.txt,
7 | # which you should have received as part of this distribution.
8 |
9 | # Mixins
10 | from mixin import QueryMixin
11 | from pagable import Pagable
12 |
13 | # Base Query
14 | from query import BingQuery
15 |
16 | # Concrete Queries
17 | from web import WebQuery
18 |
--------------------------------------------------------------------------------
/pybing/query/mixin.py:
--------------------------------------------------------------------------------
1 | # This file is part of PyBing (http://pybing.googlecode.com).
2 | #
3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
4 | # All rights reserved.
5 | #
6 | # This software is licensed as described in the file COPYING.txt,
7 | # which you should have received as part of this distribution.
8 |
9 | """
10 | This module holds the QueryMixin base class used for all queries.
11 | """
12 |
class QueryMixin(object):
    """
    Base class for anything that is mixed into a query class.
    """
    def get_request_parameters(self):
        """
        Return the request parameters contributed by the classes after
        this one in the MRO, or an empty dict when none of them define
        get_request_parameters.
        """
        # Because this is a mixin, the next class in line may or may not
        # implement the method.
        parent = super(QueryMixin, self)
        if not hasattr(parent, 'get_request_parameters'):
            return {}

        return parent.get_request_parameters()
27 |
--------------------------------------------------------------------------------
/pybing/query/pagable.py:
--------------------------------------------------------------------------------
1 | # This file is part of PyBing (http://pybing.googlecode.com).
2 | #
3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
4 | # All rights reserved.
5 | #
6 | # This software is licensed as described in the file COPYING.txt,
7 | # which you should have received as part of this distribution.
8 |
9 | """
10 | This module holds a mixin to specify a query class you can page through
11 | using the count and offset parameter.
12 | """
13 |
14 | from mixin import QueryMixin
15 |
class Pagable(QueryMixin):
    """
    This class is a mixin used with BingQuery classes to specify that
    queries can be paged through using the offset and count parameters.

    Some examples of Pagable requests are WebRequests and VideoRequests.
    Some non-Pagable requests are TranslationRequests and SearchRequests with
    the Spell source type.

    From the Bing API:
    - Count specifies the number of results to return per Request.
    - Offset specifies the offset requested, from zero, for the starting
      point of the result set to be returned for this Request.

    Note: This mixin currently supports only a single Source Type query.
    """
    def __init__(self, *args, **kwargs):
        self._count = None
        self._offset = 0
        super(Pagable, self).__init__(*args, **kwargs)

    def execute(self, *args, **kwargs):
        """
        Validate count + offset, then delegate to the next execute() in
        the MRO and return its result.

        Raises ValueError when count + offset exceeds 1000.
        """
        if self.count and self.offset and self.count + self.offset > 1000:
            raise ValueError("Count + Offset must be less than 1000")
        # The result used to be dropped here, so callers received None.
        return super(Pagable, self).execute(*args, **kwargs)

    def get_request_parameters(self):
        """
        Extend the inherited parameters with '<SOURCE_TYPE>.Count' and
        '<SOURCE_TYPE>.Offset' when those values are set (non-zero).
        """
        params = super(Pagable, self).get_request_parameters()

        if self.count:
            params['%s.Count' % self.SOURCE_TYPE] = self.count

        if self.offset:
            params['%s.Offset' % self.SOURCE_TYPE] = self.offset

        return params

    @property
    def count(self):
        return self._count

    def set_count(self, value):
        """
        Return a clone of this query with its count set to *value*.

        None clears the count. Raises ValueError when *value* is below 1
        or above 50.
        """
        if value is not None:
            if value < 1:
                raise ValueError('Count must be positive')

            elif value > 50:
                raise ValueError('Count must be less than 50')

        obj = self._clone()
        obj._count = value
        return obj

    @property
    def offset(self):
        return self._offset

    def set_offset(self, value):
        """
        Return a clone of this query with its offset set to *value*.

        Raises ValueError when *value* is negative or above 1000.
        """
        if value < 0:
            raise ValueError('Offset must be positive')

        elif value > 1000:
            raise ValueError('Offset must be less than 1000')

        obj = self._clone()
        obj._offset = value
        return obj
83 |
--------------------------------------------------------------------------------
/pybing/query/query.py:
--------------------------------------------------------------------------------
1 | # This file is part of PyBing (http://pybing.googlecode.com).
2 | #
3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
4 | # All rights reserved.
5 | #
6 | # This software is licensed as described in the file COPYING.txt,
7 | # which you should have received as part of this distribution.
8 |
9 | """
10 | This module holds the base Query class used by the various types of Bing queries.
11 | """
12 |
13 | import copy, urllib, httplib2
14 |
15 | # Issue #1 (http://code.google.com/p/pybing/issues/detail?id=1)
16 | # Python 2.6 has json built in, 2.5 needs simplejson
17 | try: import json
18 | except ImportError: import simplejson as json
19 |
20 | from pybing import constants
21 | from pybing.query.mixin import QueryMixin
22 |
class BingQuery(QueryMixin):
    """
    Base class for Bing queries. Subclasses set SOURCE_TYPE; the setter
    methods return modified clones instead of mutating the query.
    """
    SOURCE_TYPE = None

    def __init__(self, app_id, query=None, version=None, *args, **kwargs):
        self.app_id = app_id
        self.version = version or constants.API_VERSION
        self._query = query

        # Needed for mixin's __init__'s to be called.
        super(BingQuery, self).__init__(*args, **kwargs)

    def set_query(self, query):
        """
        Return a clone of this query with its search terms set to *query*.

        Raises ValueError when *query* is empty or None.
        """
        if not query:
            raise ValueError('Query cannot be empty or None')

        obj = self._clone()
        obj._query = query
        return obj

    @property
    def query(self):
        return self._query

    def execute(self):
        """
        Return a BingResultSet wrapping this query.

        Raises ValueError when the query text or SOURCE_TYPE is missing.
        """
        if not self.query:
            raise ValueError('Query cannot be empty or None')

        elif not self.SOURCE_TYPE:
            raise ValueError('Source Type cannot be empty or None')

        # Imported here because pybing.resultset imports this package.
        from pybing.resultset import BingResultSet
        return BingResultSet(self)

    def get_request_parameters(self):
        """Return the inherited parameters plus AppId, Version, Query and Sources."""
        params = super(BingQuery, self).get_request_parameters()
        params.update({
            'AppId': self.app_id,
            'Version': self.version,
            'Query': self.query,
            'Sources': self.SOURCE_TYPE,
        })
        return params

    def get_request_url(self):
        """Return the endpoint URL with the encoded request parameters."""
        query_string = urllib.urlencode(self.get_request_parameters())
        return constants.JSON_ENDPOINT + '?' + query_string

    def get_search_response(self):
        """Fetch the URL and return the decoded section for SOURCE_TYPE."""
        contents = self._get_url_contents(self.get_request_url())
        return json.loads(contents)['SearchResponse'][self.SOURCE_TYPE]

    def get_search_results(self):
        """Return one BingResult per entry of the response's 'Results' list."""
        from pybing.result import BingResult
        response = self.get_search_response()
        # A response without a 'Results' key is treated as "no results"
        # rather than raising KeyError.
        return [BingResult(result) for result in response.get('Results', [])]

    def _get_url_contents(self, url):
        response, contents = httplib2.Http().request(url)
        return contents

    def _clone(self):
        """
        Do a deep copy of this object returning a clone that can be
        modified without affecting the old copy.
        """
        return copy.deepcopy(self)

    def __unicode__(self):
        return 'BingQuery: %s' % self.get_request_url()

    __str__ = __unicode__

    def __repr__(self):
        return '<%s>' % unicode(self)
97 |
--------------------------------------------------------------------------------
/pybing/query/web.py:
--------------------------------------------------------------------------------
1 | # This file is part of PyBing (http://pybing.googlecode.com).
2 | #
3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
4 | # All rights reserved.
5 | #
6 | # This software is licensed as described in the file COPYING.txt,
7 | # which you should have received as part of this distribution.
8 |
9 | """
10 | This module holds the Bing WebQuery class used to do web searches against Bing.
11 | """
12 |
13 | from pybing import constants
14 | from pybing.query import BingQuery, Pagable
15 |
class WebQuery(BingQuery, Pagable):
    """Pagable BingQuery bound to the Web source type."""
    SOURCE_TYPE = constants.WEB_SOURCE_TYPE
18 |
--------------------------------------------------------------------------------
/pybing/result.py:
--------------------------------------------------------------------------------
1 | # This file is part of PyBing (http://pybing.googlecode.com).
2 | #
3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
4 | # All rights reserved.
5 | #
6 | # This software is licensed as described in the file COPYING.txt,
7 | # which you should have received as part of this distribution.
8 |
9 | """
10 | This module holds the base BingResult class.
11 | """
12 |
class BingResult(object):
    """
    The base BingResult class corresponds to a single result from a Bing
    Query response.
    """
    def __init__(self, result):
        """
        Build a result from a response dict.

        Raises TypeError when *result* is not a dict.
        """
        if isinstance(result, dict):
            self.load_from_dict(result)

        else:
            raise TypeError('Invalid result type')

    def load_from_dict(self, data):
        """Set one attribute per key of *data*, using the lower-cased key as its name."""
        for key, value in data.items():
            setattr(self, key.lower(), value)

    def __repr__(self):
        # An empty repr is indistinguishable from nothing in logs and
        # interactive sessions, so name the class.
        return '<BingResult>'
31 |
--------------------------------------------------------------------------------
/pybing/resultset.py:
--------------------------------------------------------------------------------
1 | # This file is part of PyBing (http://pybing.googlecode.com).
2 | #
3 | # Copyright (C) 2009 JJ Geewax http://geewax.org/
4 | # All rights reserved.
5 | #
6 | # This software is licensed as described in the file COPYING.txt,
7 | # which you should have received as part of this distribution.
8 |
9 | """
10 | This module holds the logic for dealing with a set of results from a query.
11 | """
12 |
13 | from pybing import constants
14 | from pybing.query import BingQuery, Pagable
15 |
class BingResultSet(object):
    """
    This class corresponds to a set of results from a BingQuery.
    """
    def __init__(self, query, offset=0, count=None):
        if not isinstance(query, BingQuery):
            raise TypeError('query must be a BingQuery instance')

        self.query = query
        # Cache of fetched results, keyed by absolute offset.
        self.results = {}

        # These offset + count are used internally to signify whether or
        # not the query should be cut down (whether they've been sliced).
        self.offset, self.count = offset, count

    def get_offset(self, index=0):
        """Return the absolute offset of *index* within the underlying query."""
        return self.query.offset + self.offset + index

    def __getitem__(self, key):
        """
        Allows you to grab an index or slice a query with array notation like
        resultset[4] or resultset[0:4]
        """
        if not isinstance(self.query, Pagable):
            raise TypeError('Array access only supported on Pagable Queries')

        if isinstance(key, int):
            # The key used to be ignored, so every index returned the
            # first result of the set.
            absolute_index = self.get_offset(key)
            if absolute_index < 0 or absolute_index >= constants.MAX_RESULTS:
                raise IndexError

            if absolute_index not in self.results:
                # Make a copy of the query for only this one result:
                query = self.query.set_offset(absolute_index).set_count(1)
                results = query.get_search_results()
                if results:
                    self.results[absolute_index] = results[0]

            return self.results.get(absolute_index)

        elif isinstance(key, slice):
            # Return a new result set that is sliced internally (not the query)
            offset = key.start or 0
            if key.stop:
                count = key.stop - offset
            else:
                count = None
            return BingResultSet(self.query, self.offset + offset, count)

        else:
            raise TypeError

    def __len__(self):
        """
        Returns the number of results if you were to iterate over this result set.
        This is at least 0 and at most 1000.
        """
        count = constants.MAX_RESULTS

        if self.count:
            count = self.count

        elif self.query.count:
            count = self.query.count

        if count > constants.MAX_RESULTS:
            count = constants.MAX_RESULTS

        if count == constants.MAX_RESULTS:
            count = count - self.get_offset()

        return count

    def __iter__(self):
        """
        Allows you to iterate over the search results in the standard Python
        format such as
        for result in my_query.execute():
            print result.title, result.url
        """
        query = self.query.set_offset(self.get_offset())
        end_index = constants.MAX_RESULTS

        # If we've internally sliced out items
        if self.count:
            query = query.set_count(self.count if self.count < constants.MAX_PAGE_SIZE else constants.MAX_PAGE_SIZE)
            end_index = self.get_offset() + self.count

        if end_index > constants.MAX_RESULTS:
            end_index = constants.MAX_RESULTS

        # If we want to just go until the end, grab them the most per page
        if not query.count:
            query = query.set_count(constants.MAX_PAGE_SIZE)

        while query.offset < end_index:
            # If we don't have a full page left, only grab up to the end
            count = end_index - query.offset
            if count and count < constants.MAX_PAGE_SIZE:
                query = query.set_count(count)

            results = query.get_search_results()
            if not results:
                # An empty page means the results are exhausted; without
                # this the loop kept issuing requests up to end_index.
                break

            # Yield back each result
            for result in results:
                yield result

            # Update the offset to move onto the next page
            query = query.set_offset(query.offset + query.count)

    def __repr__(self):
        # The format string had no placeholder, so repr() raised TypeError.
        return '<BingResultSet (%s)>' % self.query
124 |
--------------------------------------------------------------------------------
/requests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # __
4 | # /__) _ _ _ _ _/ _
5 | # / ( (- (/ (/ (- _) / _)
6 | # /
7 |
8 | """
9 | requests HTTP library
10 | ~~~~~~~~~~~~~~~~~~~~~
11 |
12 | Requests is an HTTP library, written in Python, for human beings. Basic GET
13 | usage:
14 |
15 | >>> import requests
16 | >>> r = requests.get('http://python.org')
17 | >>> r.status_code
18 | 200
19 | >>> 'Python is a programming language' in r.content
20 | True
21 |
22 | ... or POST:
23 |
24 | >>> payload = dict(key1='value1', key2='value2')
25 | >>> r = requests.post("http://httpbin.org/post", data=payload)
26 | >>> print(r.text)
27 | {
28 | ...
29 | "form": {
30 | "key2": "value2",
31 | "key1": "value1"
32 | },
33 | ...
34 | }
35 |
36 | The other HTTP methods are supported - see `requests.api`. Full documentation
37 | is at <http://python-requests.org>.
38 |
39 | :copyright: (c) 2014 by Kenneth Reitz.
40 | :license: Apache 2.0, see LICENSE for more details.
41 |
42 | """
43 |
44 | __title__ = 'requests'
45 | __version__ = '2.4.1'
46 | __build__ = 0x020401
47 | __author__ = 'Kenneth Reitz'
48 | __license__ = 'Apache 2.0'
49 | __copyright__ = 'Copyright 2014 Kenneth Reitz'
50 |
51 | # Attempt to enable urllib3's SNI support, if possible
52 | try:
53 | from .packages.urllib3.contrib import pyopenssl
54 | pyopenssl.inject_into_urllib3()
55 | except ImportError:
56 | pass
57 |
58 | from . import utils
59 | from .models import Request, Response, PreparedRequest
60 | from .api import request, get, head, post, patch, put, delete, options
61 | from .sessions import session, Session
62 | from .status_codes import codes
63 | from .exceptions import (
64 | RequestException, Timeout, URLRequired,
65 | TooManyRedirects, HTTPError, ConnectionError
66 | )
67 |
68 | # Set default logging handler to avoid "No handler found" warnings.
69 | import logging
try:
    # logging.NullHandler is available from Python 2.7 onwards.
    from logging import NullHandler
except ImportError:
    class NullHandler(logging.Handler):
        """Fallback handler that silently discards every log record."""

        def emit(self, record):
            """Ignore `record`."""
            return None
76 |
77 | logging.getLogger(__name__).addHandler(NullHandler())
78 |
--------------------------------------------------------------------------------
/requests/certs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | certs.py
6 | ~~~~~~~~
7 |
8 | This module returns the preferred default CA certificate bundle.
9 |
10 | If you are packaging Requests, e.g., for a Linux distribution or a managed
11 | environment, you can change the definition of where() to return a separately
12 | packaged CA bundle.
13 | """
14 | import os.path
15 |
try:
    from certifi import where
except ImportError:
    def where():
        """Locate the CA certificate bundle vendored inside Requests."""
        package_dir = os.path.dirname(__file__)
        return os.path.join(package_dir, 'cacert.pem')
23 |
24 | if __name__ == '__main__':
25 | print(where())
26 |
--------------------------------------------------------------------------------
/requests/compat.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | pythoncompat
5 | """
6 |
7 | from .packages import chardet
8 |
9 | import sys
10 |
11 | # -------
12 | # Pythons
13 | # -------
14 |
15 | # Syntax sugar.
_ver = sys.version_info

#: Python 2.x?
is_py2 = _ver[0] == 2

#: Python 3.x?
is_py3 = _ver[0] == 3

# Each minor-version flag compares the (major, minor) pair in one step.

#: Python 3.0.x
is_py30 = _ver[:2] == (3, 0)

#: Python 3.1.x
is_py31 = _ver[:2] == (3, 1)

#: Python 3.2.x
is_py32 = _ver[:2] == (3, 2)

#: Python 3.3.x
is_py33 = _ver[:2] == (3, 3)

#: Python 3.4.x
is_py34 = _ver[:2] == (3, 4)

#: Python 2.7.x
is_py27 = _ver[:2] == (2, 7)

#: Python 2.6.x
is_py26 = _ver[:2] == (2, 6)

#: Python 2.5.x
is_py25 = _ver[:2] == (2, 5)

#: Python 2.4.x
is_py24 = _ver[:2] == (2, 4)  # I'm assuming this is not by choice.
50 |
51 |
52 | # ---------
53 | # Platforms
54 | # ---------
55 |
56 |
57 | # Syntax sugar.
_ver = sys.version.lower()

# Interpreter implementation, sniffed from the version banner.
is_pypy = ('pypy' in _ver)
is_jython = ('jython' in _ver)
is_ironpython = ('iron' in _ver)

# Assume CPython, if nothing else.
is_cpython = not any((is_pypy, is_jython, is_ironpython))

# Lower-cased platform identifier shared by the checks below.
_platform = str(sys.platform).lower()

# Windows-based system.
is_windows = 'win32' in _platform

# Standard Linux 2+ system.
is_linux = ('linux' in _platform)
is_osx = ('darwin' in _platform)
# HP-UX is understood to report 'hp-ux<N>'; the old 'hpux' spelling is kept
# as well so nothing that matched before stops matching.
is_hpux = ('hp-ux' in _platform or 'hpux' in _platform)
# Solaris reports 'sunos<N>'.  The previous test looked for the literal
# 'solar==' (a typo), which could never match any platform string.
is_solaris = ('sunos' in _platform or 'solaris' in _platform)
75 |
76 | try:
77 | import simplejson as json
78 | except (ImportError, SyntaxError):
79 | # simplejson does not support Python 3.2, it throws a SyntaxError
80 | # because of u'...' Unicode literals.
81 | import json
82 |
83 | # ---------
84 | # Specifics
85 | # ---------
86 |
87 | if is_py2:
88 | from urllib import quote, unquote, quote_plus, unquote_plus, urlencode, getproxies, proxy_bypass
89 | from urlparse import urlparse, urlunparse, urljoin, urlsplit, urldefrag
90 | from urllib2 import parse_http_list
91 | import cookielib
92 | from Cookie import Morsel
93 | from StringIO import StringIO
94 | from .packages.urllib3.packages.ordered_dict import OrderedDict
95 |
96 | builtin_str = str
97 | bytes = str
98 | str = unicode
99 | basestring = basestring
100 | numeric_types = (int, long, float)
101 |
102 |
103 | elif is_py3:
104 | from urllib.parse import urlparse, urlunparse, urljoin, urlsplit, urlencode, quote, unquote, quote_plus, unquote_plus, urldefrag
105 | from urllib.request import parse_http_list, getproxies, proxy_bypass
106 | from http import cookiejar as cookielib
107 | from http.cookies import Morsel
108 | from io import StringIO
109 | from collections import OrderedDict
110 |
111 | builtin_str = str
112 | str = str
113 | bytes = bytes
114 | basestring = (str, bytes)
115 | numeric_types = (int, float)
116 |
--------------------------------------------------------------------------------
/requests/exceptions.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | requests.exceptions
5 | ~~~~~~~~~~~~~~~~~~~
6 |
7 | This module contains the set of Requests' exceptions.
8 |
9 | """
10 | from .packages.urllib3.exceptions import HTTPError as BaseHTTPError
11 |
12 |
class RequestException(IOError):
    """Base error for any ambiguous failure that occurs while handling a
    request."""

    def __init__(self, *args, **kwargs):
        """
        Set up the exception, capturing the optional `request` and `response`
        keyword arguments; everything else is passed on to IOError.
        """
        self.response = kwargs.pop('response', None)
        self.request = kwargs.pop('request', None)

        # No explicit request given: borrow the one attached to the response.
        if (self.response is not None
                and not self.request
                and hasattr(self.response, 'request')):
            self.request = self.response.request

        super(RequestException, self).__init__(*args, **kwargs)
28 |
29 |
class HTTPError(RequestException):
    """Raised when an HTTP error occurs."""
32 |
33 |
class ConnectionError(RequestException):
    """Raised when a connection error occurs."""
36 |
37 |
class ProxyError(ConnectionError):
    """Raised when a proxy error occurs."""
40 |
41 |
class SSLError(ConnectionError):
    """Raised when an SSL error occurs."""
44 |
45 |
class Timeout(RequestException):
    """Raised when the request times out.

    Handling this exception covers both
    :exc:`~requests.exceptions.ConnectTimeout` and
    :exc:`~requests.exceptions.ReadTimeout`.
    """
53 |
54 |
class ConnectTimeout(ConnectionError, Timeout):
    """Raised when the request times out while connecting to the remote server.

    It is safe to retry a request that failed with this error.
    """
60 |
61 |
class ReadTimeout(Timeout):
    """Raised when the server sends no data within the allotted time."""
64 |
65 |
class URLRequired(RequestException):
    """Raised when a request is made without the valid URL it requires."""
68 |
69 |
class TooManyRedirects(RequestException):
    """Raised when there are too many redirects."""
72 |
73 |
class MissingSchema(RequestException, ValueError):
    """Raised when the URL lacks a schema (e.g. http or https)."""
76 |
77 |
class InvalidSchema(RequestException, ValueError):
    """Raised for an invalid URL schema; see defaults.py for valid schemas."""
80 |
81 |
class InvalidURL(RequestException, ValueError):
    """Raised when the supplied URL is invalid in some way."""
84 |
85 |
class ChunkedEncodingError(RequestException):
    """Raised when the server declares chunked encoding but sends an invalid
    chunk."""
88 |
89 |
class ContentDecodingError(RequestException, BaseHTTPError):
    """Raised when the response content cannot be decoded."""
92 |
--------------------------------------------------------------------------------
/requests/hooks.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | requests.hooks
5 | ~~~~~~~~~~~~~~
6 |
7 | This module provides the capabilities for the Requests hooks system.
8 |
9 | Available hooks:
10 |
11 | ``response``:
12 | The response generated from a Request.
13 |
14 | """
15 |
16 |
HOOKS = ['response']


def default_hooks():
    """Build a fresh mapping from every known hook event to an empty list."""
    return dict((event, []) for event in HOOKS)
25 |
26 | # TODO: response is the only one
27 |
28 |
def dispatch_hook(key, hooks, hook_data, **kwargs):
    """Dispatches a hook dictionary on a given piece of data.

    :param key: name of the hook event to dispatch (e.g. ``'response'``).
    :param hooks: mapping of event name to a callable or an iterable of
        callables; may be ``None`` or empty.
    :param hook_data: the value handed to each hook in turn.
    :param kwargs: extra keyword arguments forwarded to every hook.
    :returns: ``hook_data`` after each hook ran; a hook that returns a
        non-``None`` value replaces the data passed to the following hooks.
    """
    hooks = hooks or dict()

    handlers = hooks.get(key) if key in hooks else None

    # An event registered with an explicit None means "no hooks"; previously
    # this fell through to `for hook in None` and raised TypeError.
    if handlers is None:
        return hook_data

    # A single callable is treated as a one-element list of hooks.
    if hasattr(handlers, '__call__'):
        handlers = [handlers]

    for hook in handlers:
        _hook_data = hook(hook_data, **kwargs)
        if _hook_data is not None:
            hook_data = _hook_data

    return hook_data
46 |
--------------------------------------------------------------------------------
/requests/packages/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | from . import urllib3
4 |
--------------------------------------------------------------------------------
/requests/packages/chardet/__init__.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # This library is free software; you can redistribute it and/or
3 | # modify it under the terms of the GNU Lesser General Public
4 | # License as published by the Free Software Foundation; either
5 | # version 2.1 of the License, or (at your option) any later version.
6 | #
7 | # This library is distributed in the hope that it will be useful,
8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 | # Lesser General Public License for more details.
11 | #
12 | # You should have received a copy of the GNU Lesser General Public
13 | # License along with this library; if not, write to the Free Software
14 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
15 | # 02110-1301 USA
16 | ######################### END LICENSE BLOCK #########################
17 |
18 | __version__ = "2.2.1"
19 | from sys import version_info
20 |
21 |
def detect(aBuf):
    """Feed `aBuf` to a fresh UniversalDetector and return its result.

    Raises ValueError when `aBuf` is text rather than bytes (``unicode`` on
    Python 2, anything other than ``bytes`` on Python 3).
    """
    if version_info < (3, 0):
        is_text = isinstance(aBuf, unicode)
    else:
        is_text = not isinstance(aBuf, bytes)
    if is_text:
        raise ValueError('Expected a bytes object, not a unicode object')

    # Imported here rather than at module level, as in the original.
    from . import universaldetector
    detector = universaldetector.UniversalDetector()
    detector.reset()
    detector.feed(aBuf)
    detector.close()
    return detector.result
33 |
--------------------------------------------------------------------------------
/requests/packages/chardet/big5prober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is Mozilla Communicator client code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 1998
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301 USA
26 | ######################### END LICENSE BLOCK #########################
27 |
28 | from .mbcharsetprober import MultiByteCharSetProber
29 | from .codingstatemachine import CodingStateMachine
30 | from .chardistribution import Big5DistributionAnalysis
31 | from .mbcssm import Big5SMModel
32 |
33 |
class Big5Prober(MultiByteCharSetProber):
    """Multi-byte prober wired up with the Big5 state machine and analysis."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        # Both collaborators must be in place before reset() is called.
        self._mDistributionAnalyzer = Big5DistributionAnalysis()
        self._mCodingSM = CodingStateMachine(Big5SMModel)
        self.reset()

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "Big5"
43 |
--------------------------------------------------------------------------------
/requests/packages/chardet/chardetect.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Script which takes one or more file paths and reports on their detected
4 | encodings
5 |
6 | Example::
7 |
8 | % chardetect somefile someotherfile
9 | somefile: windows-1252 with confidence 0.5
10 | someotherfile: ascii with confidence 1.0
11 |
12 | If no paths are provided, it takes its input from stdin.
13 |
14 | """
15 | from io import open
16 | from sys import argv, stdin
17 |
18 | from chardet.universaldetector import UniversalDetector
19 |
20 |
def description_of(file, name='stdin'):
    """Describe the probable encoding of `file`, labelled with `name`.

    Every item produced by iterating `file` is fed to a UniversalDetector;
    the returned string reports the detected encoding and its confidence, or
    ``"<name>: no result"`` when no encoding was detected.
    """
    detector = UniversalDetector()
    for chunk in file:
        detector.feed(chunk)
    detector.close()

    outcome = detector.result
    if not outcome['encoding']:
        return '%s: no result' % name
    return '%s: %s with confidence %s' % (name,
                                          outcome['encoding'],
                                          outcome['confidence'])
34 |
35 |
def main():
    """Print the detected encoding of each path in argv, or of stdin.

    With no command-line arguments the data is read from standard input;
    otherwise every argument is treated as a file path and opened in binary
    mode.
    """
    if len(argv) <= 1:
        # Files below are opened in 'rb' mode, so stdin should be read as
        # bytes too.  On Python 3 sys.stdin is a text stream whose binary
        # counterpart is `stdin.buffer`; Python 2's stdin has no such
        # attribute and is used as-is.
        print(description_of(getattr(stdin, 'buffer', stdin)))
    else:
        for path in argv[1:]:
            with open(path, 'rb') as f:
                print(description_of(f, path))
43 |
44 |
45 | if __name__ == '__main__':
46 | main()
47 |
--------------------------------------------------------------------------------
/requests/packages/chardet/charsetgroupprober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is Mozilla Communicator client code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 1998
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301 USA
26 | ######################### END LICENSE BLOCK #########################
27 |
28 | from . import constants
29 | import sys
30 | from .charsetprober import CharSetProber
31 |
32 |
class CharSetGroupProber(CharSetProber):
    """Prober that feeds a list of child probers and tracks the best guess."""

    def __init__(self):
        CharSetProber.__init__(self)
        self._mActiveNum = 0
        self._mProbers = []
        self._mBestGuessProber = None

    def reset(self):
        """Reset this prober and re-activate every child prober."""
        CharSetProber.reset(self)
        self._mActiveNum = 0
        for child in self._mProbers:
            if not child:
                continue
            child.reset()
            child.active = True
            self._mActiveNum += 1
        self._mBestGuessProber = None

    def get_charset_name(self):
        """Return the best-guess child's charset name, or None without one."""
        if not self._mBestGuessProber:
            # get_confidence() picks the best-guess prober as a side effect.
            self.get_confidence()
            if not self._mBestGuessProber:
                return None
        return self._mBestGuessProber.get_charset_name()

    def feed(self, aBuf):
        """Feed `aBuf` to every active child and return this prober's state."""
        for child in self._mProbers:
            if not child or not child.active:
                continue
            status = child.feed(aBuf)
            if not status:
                continue
            if status == constants.eFoundIt:
                self._mBestGuessProber = child
                return self.get_state()
            if status == constants.eNotMe:
                # This child ruled itself out; give up once none are left.
                child.active = False
                self._mActiveNum -= 1
                if self._mActiveNum <= 0:
                    self._mState = constants.eNotMe
                    return self.get_state()
        return self.get_state()

    def get_confidence(self):
        """Return the highest confidence among the active child probers."""
        state = self.get_state()
        if state == constants.eFoundIt:
            return 0.99
        if state == constants.eNotMe:
            return 0.01

        top_confidence = 0.0
        self._mBestGuessProber = None
        for child in self._mProbers:
            if not child:
                continue
            if not child.active:
                if constants._debug:
                    sys.stderr.write(child.get_charset_name()
                                     + ' not active\n')
                continue
            confidence = child.get_confidence()
            if constants._debug:
                sys.stderr.write('%s confidence = %s\n' %
                                 (child.get_charset_name(), confidence))
            if top_confidence < confidence:
                top_confidence = confidence
                self._mBestGuessProber = child

        if not self._mBestGuessProber:
            return 0.0
        return top_confidence
104 | # else:
105 | # self._mBestGuessProber = self._mProbers[0]
106 | # return self._mBestGuessProber.get_confidence()
107 |
--------------------------------------------------------------------------------
/requests/packages/chardet/charsetprober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is Mozilla Universal charset detector code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 2001
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | # Shy Shalom - original C code
12 | #
13 | # This library is free software; you can redistribute it and/or
14 | # modify it under the terms of the GNU Lesser General Public
15 | # License as published by the Free Software Foundation; either
16 | # version 2.1 of the License, or (at your option) any later version.
17 | #
18 | # This library is distributed in the hope that it will be useful,
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 | # Lesser General Public License for more details.
22 | #
23 | # You should have received a copy of the GNU Lesser General Public
24 | # License along with this library; if not, write to the Free Software
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 | # 02110-1301 USA
27 | ######################### END LICENSE BLOCK #########################
28 |
29 | from . import constants
30 | import re
31 |
32 |
class CharSetProber:
    """Base prober providing default no-op behaviour and buffer filters."""

    def __init__(self):
        pass

    def reset(self):
        """Put the prober back into the detecting state."""
        self._mState = constants.eDetecting

    def get_charset_name(self):
        """Return the detected charset name; the base class has none."""
        return None

    def feed(self, aBuf):
        """Consume `aBuf`; the base class ignores it."""
        pass

    def get_state(self):
        """Return the current prober state."""
        return self._mState

    def get_confidence(self):
        """Return the detection confidence; always 0.0 for the base class."""
        return 0.0

    def filter_high_bit_only(self, aBuf):
        """Replace every run of bytes 0x00-0x7F in `aBuf` with one space."""
        return re.sub(b'([\x00-\x7F])+', b' ', aBuf)

    def filter_without_english_letters(self, aBuf):
        """Replace every run of ASCII letters in `aBuf` with one space."""
        return re.sub(b'([A-Za-z])+', b' ', aBuf)

    def filter_with_english_letters(self, aBuf):
        """Return `aBuf` unchanged (filtering is not implemented yet)."""
        # TODO
        return aBuf
63 |
--------------------------------------------------------------------------------
/requests/packages/chardet/codingstatemachine.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is mozilla.org code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 1998
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301 USA
26 | ######################### END LICENSE BLOCK #########################
27 |
28 | from .constants import eStart
29 | from .compat import wrap_ord
30 |
31 |
class CodingStateMachine:
    """Table-driven state machine that consumes one byte at a time.

    The model `sm` is a mapping read through the keys 'classTable',
    'charLenTable', 'classFactor', 'stateTable' and 'name'.
    """

    def __init__(self, sm):
        self._mModel = sm
        self._mCurrentBytePos = 0
        self._mCurrentCharLen = 0
        self.reset()

    def reset(self):
        """Return the machine to its start state."""
        self._mCurrentState = eStart

    def next_state(self, c):
        """Advance the machine by byte `c` and return the new state."""
        model = self._mModel
        # Every byte is first mapped to its class.
        # PY3K: aBuf is a byte stream, so c is an int, not a byte
        byte_class = model['classTable'][wrap_ord(c)]
        if self._mCurrentState == eStart:
            # First byte of a character: its class also gives the length.
            self._mCurrentBytePos = 0
            self._mCurrentCharLen = model['charLenTable'][byte_class]
        # The state table is indexed by (current state, byte class).
        table_index = self._mCurrentState * model['classFactor'] + byte_class
        self._mCurrentState = model['stateTable'][table_index]
        self._mCurrentBytePos += 1
        return self._mCurrentState

    def get_current_charlen(self):
        """Return the length recorded for the current character."""
        return self._mCurrentCharLen

    def get_coding_state_machine(self):
        """Return the name stored in the model."""
        return self._mModel['name']
62 |
--------------------------------------------------------------------------------
/requests/packages/chardet/compat.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # Contributor(s):
3 | # Ian Cordasco - port to Python
4 | #
5 | # This library is free software; you can redistribute it and/or
6 | # modify it under the terms of the GNU Lesser General Public
7 | # License as published by the Free Software Foundation; either
8 | # version 2.1 of the License, or (at your option) any later version.
9 | #
10 | # This library is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | # Lesser General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU Lesser General Public
16 | # License along with this library; if not, write to the Free Software
17 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
18 | # 02110-1301 USA
19 | ######################### END LICENSE BLOCK #########################
20 |
21 | import sys
22 |
23 |
# String types of the running interpreter; `unicode` is only evaluated on
# Python 2, where it exists.
base_str = (str, unicode) if sys.version_info < (3, 0) else (bytes, str)


def wrap_ord(a):
    """Return ``ord(a)`` for a Python 2 string, otherwise `a` unchanged."""
    needs_ord = sys.version_info < (3, 0) and isinstance(a, base_str)
    return ord(a) if needs_ord else a
35 |
--------------------------------------------------------------------------------
/requests/packages/chardet/constants.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is Mozilla Universal charset detector code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 2001
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | # Shy Shalom - original C code
12 | #
13 | # This library is free software; you can redistribute it and/or
14 | # modify it under the terms of the GNU Lesser General Public
15 | # License as published by the Free Software Foundation; either
16 | # version 2.1 of the License, or (at your option) any later version.
17 | #
18 | # This library is distributed in the hope that it will be useful,
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 | # Lesser General Public License for more details.
22 | #
23 | # You should have received a copy of the GNU Lesser General Public
24 | # License along with this library; if not, write to the Free Software
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 | # 02110-1301 USA
27 | ######################### END LICENSE BLOCK #########################
28 |
# When true, CharSetGroupProber.get_confidence() writes diagnostics to stderr.
_debug = 0

# Prober states (CharSetProber.reset() starts in eDetecting).
eDetecting = 0
eFoundIt = 1
eNotMe = 2

# Coding state machine states (CodingStateMachine.reset() starts in eStart).
eStart = 0
eError = 1
eItsMe = 2

# NOTE(review): presumably the confidence above which detection can stop
# early -- confirm against the probers that read it.
SHORTCUT_THRESHOLD = 0.95
40 |
--------------------------------------------------------------------------------
/requests/packages/chardet/cp949prober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is mozilla.org code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 1998
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301 USA
26 | ######################### END LICENSE BLOCK #########################
27 |
28 | from .mbcharsetprober import MultiByteCharSetProber
29 | from .codingstatemachine import CodingStateMachine
30 | from .chardistribution import EUCKRDistributionAnalysis
31 | from .mbcssm import CP949SMModel
32 |
33 |
class CP949Prober(MultiByteCharSetProber):
    """Multi-byte prober wired up with the CP949 state machine."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        # NOTE: CP949 is a superset of EUC-KR, so the EUC-KR distribution
        # analysis is reused rather than a CP949-specific one.
        self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
        self._mCodingSM = CodingStateMachine(CP949SMModel)
        self.reset()

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "CP949"
45 |
--------------------------------------------------------------------------------
/requests/packages/chardet/escprober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is mozilla.org code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 1998
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301 USA
26 | ######################### END LICENSE BLOCK #########################
27 |
28 | from . import constants
29 | from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
30 | ISO2022KRSMModel)
31 | from .charsetprober import CharSetProber
32 | from .codingstatemachine import CodingStateMachine
33 | from .compat import wrap_ord
34 |
35 |
class EscCharSetProber(CharSetProber):
    """Prober that runs the HZ and ISO-2022-CN/JP/KR state machines in
    parallel over the input."""

    def __init__(self):
        CharSetProber.__init__(self)
        models = (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
                  ISO2022KRSMModel)
        self._mCodingSM = [CodingStateMachine(model) for model in models]
        self.reset()

    def reset(self):
        """Re-activate every state machine and forget any detected charset."""
        CharSetProber.reset(self)
        for machine in self._mCodingSM:
            if not machine:
                continue
            machine.active = True
            machine.reset()
        self._mActiveSM = len(self._mCodingSM)
        self._mDetectedCharset = None

    def get_charset_name(self):
        """Return the detected charset name, or None if nothing matched yet."""
        return self._mDetectedCharset

    def get_confidence(self):
        """Return 0.99 once a charset has been detected, otherwise 0.00."""
        return 0.99 if self._mDetectedCharset else 0.00

    def feed(self, aBuf):
        """Run each byte of `aBuf` through the active machines and return
        the resulting prober state."""
        for c in aBuf:
            # PY3K: aBuf is a byte array, so c is an int, not a byte
            byte = wrap_ord(c)
            for machine in self._mCodingSM:
                if not machine or not machine.active:
                    continue
                outcome = machine.next_state(byte)
                if outcome == constants.eError:
                    # This machine rejected the input; stop once all have.
                    machine.active = False
                    self._mActiveSM -= 1
                    if self._mActiveSM <= 0:
                        self._mState = constants.eNotMe
                        return self.get_state()
                elif outcome == constants.eItsMe:
                    self._mState = constants.eFoundIt
                    self._mDetectedCharset = machine.get_coding_state_machine()  # nopep8
                    return self.get_state()

        return self.get_state()
87 |
--------------------------------------------------------------------------------
/requests/packages/chardet/eucjpprober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is mozilla.org code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 1998
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301 USA
26 | ######################### END LICENSE BLOCK #########################
27 |
28 | import sys
29 | from . import constants
30 | from .mbcharsetprober import MultiByteCharSetProber
31 | from .codingstatemachine import CodingStateMachine
32 | from .chardistribution import EUCJPDistributionAnalysis
33 | from .jpcntx import EUCJPContextAnalysis
34 | from .mbcssm import EUCJPSMModel
35 |
36 |
class EUCJPProber(MultiByteCharSetProber):
    """Multi-byte prober that reports itself as "EUC-JP".

    Combines a coding state machine built from ``EUCJPSMModel`` with a
    character-distribution analyzer and a context analyzer; the confidence
    is the larger of the two analyzers' scores.
    """

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(EUCJPSMModel)
        self._mDistributionAnalyzer = EUCJPDistributionAnalysis()
        self._mContextAnalyzer = EUCJPContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the base prober state and the context analyzer."""
        MultiByteCharSetProber.reset(self)
        self._mContextAnalyzer.reset()

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "EUC-JP"

    def feed(self, aBuf):
        """Run ``aBuf`` through the state machine and both analyzers.

        Returns the prober state after the buffer has been processed.
        """
        aLen = len(aBuf)
        for i, unit in enumerate(aBuf):
            # PY3K: aBuf is a byte array, so each element is an int, not a byte
            codingState = self._mCodingSM.next_state(unit)
            if codingState == constants.eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name()
                                     + ' prober hit error at byte ' + str(i)
                                     + '\n')
                self._mState = constants.eNotMe
                break
            if codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            if codingState == constants.eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # Pair the element remembered from the previous call
                    # with the first element of this buffer.
                    self._mLastChar[1] = aBuf[0]
                    window = self._mLastChar
                else:
                    window = aBuf[i - 1:i + 1]
                self._mContextAnalyzer.feed(window, charLen)
                self._mDistributionAnalyzer.feed(window, charLen)

        # Remember the final element for the next call to feed().
        self._mLastChar[0] = aBuf[aLen - 1]

        if (self.get_state() == constants.eDetecting
                and self._mContextAnalyzer.got_enough_data()
                and self.get_confidence() > constants.SHORTCUT_THRESHOLD):
            self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the higher of the context and distribution confidences."""
        contextScore = self._mContextAnalyzer.get_confidence()
        distributionScore = self._mDistributionAnalyzer.get_confidence()
        return max(contextScore, distributionScore)
91 |
--------------------------------------------------------------------------------
/requests/packages/chardet/euckrprober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is mozilla.org code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 1998
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301 USA
26 | ######################### END LICENSE BLOCK #########################
27 |
28 | from .mbcharsetprober import MultiByteCharSetProber
29 | from .codingstatemachine import CodingStateMachine
30 | from .chardistribution import EUCKRDistributionAnalysis
31 | from .mbcssm import EUCKRSMModel
32 |
33 |
class EUCKRProber(MultiByteCharSetProber):
    """Multi-byte prober that reports itself as "EUC-KR".

    Uses a coding state machine built from ``EUCKRSMModel`` together with
    ``EUCKRDistributionAnalysis``.
    """

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
        self._mCodingSM = CodingStateMachine(EUCKRSMModel)
        self.reset()

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "EUC-KR"
43 |
--------------------------------------------------------------------------------
/requests/packages/chardet/euctwprober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is mozilla.org code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 1998
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301 USA
26 | ######################### END LICENSE BLOCK #########################
27 |
28 | from .mbcharsetprober import MultiByteCharSetProber
29 | from .codingstatemachine import CodingStateMachine
30 | from .chardistribution import EUCTWDistributionAnalysis
31 | from .mbcssm import EUCTWSMModel
32 |
class EUCTWProber(MultiByteCharSetProber):
    """Multi-byte prober that reports itself as "EUC-TW".

    Uses a coding state machine built from ``EUCTWSMModel`` together with
    ``EUCTWDistributionAnalysis``.
    """

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mDistributionAnalyzer = EUCTWDistributionAnalysis()
        self._mCodingSM = CodingStateMachine(EUCTWSMModel)
        self.reset()

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "EUC-TW"
42 |
--------------------------------------------------------------------------------
/requests/packages/chardet/gb2312prober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is mozilla.org code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 1998
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301 USA
26 | ######################### END LICENSE BLOCK #########################
27 |
28 | from .mbcharsetprober import MultiByteCharSetProber
29 | from .codingstatemachine import CodingStateMachine
30 | from .chardistribution import GB2312DistributionAnalysis
31 | from .mbcssm import GB2312SMModel
32 |
class GB2312Prober(MultiByteCharSetProber):
    """Multi-byte prober that reports itself as "GB2312".

    Uses a coding state machine built from ``GB2312SMModel`` together with
    ``GB2312DistributionAnalysis``.
    """

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mDistributionAnalyzer = GB2312DistributionAnalysis()
        self._mCodingSM = CodingStateMachine(GB2312SMModel)
        self.reset()

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "GB2312"
42 |
--------------------------------------------------------------------------------
/requests/packages/chardet/mbcharsetprober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is Mozilla Universal charset detector code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 2001
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | # Shy Shalom - original C code
12 | # Proofpoint, Inc.
13 | #
14 | # This library is free software; you can redistribute it and/or
15 | # modify it under the terms of the GNU Lesser General Public
16 | # License as published by the Free Software Foundation; either
17 | # version 2.1 of the License, or (at your option) any later version.
18 | #
19 | # This library is distributed in the hope that it will be useful,
20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 | # Lesser General Public License for more details.
23 | #
24 | # You should have received a copy of the GNU Lesser General Public
25 | # License along with this library; if not, write to the Free Software
26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 | # 02110-1301 USA
28 | ######################### END LICENSE BLOCK #########################
29 |
30 | import sys
31 | from . import constants
32 | from .charsetprober import CharSetProber
33 |
34 |
class MultiByteCharSetProber(CharSetProber):
    """Shared feed/confidence logic for multi-byte charset probers.

    ``__init__`` leaves ``_mCodingSM`` and ``_mDistributionAnalyzer`` as
    None; ``feed`` and ``get_confidence`` use both, so they must be assigned
    before those methods are called.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        self._mDistributionAnalyzer = None
        self._mCodingSM = None
        self._mLastChar = [0, 0]

    def reset(self):
        """Reset the base state, any configured components and the carry."""
        CharSetProber.reset(self)
        for component in (self._mCodingSM, self._mDistributionAnalyzer):
            if component:
                component.reset()
        self._mLastChar = [0, 0]

    def get_charset_name(self):
        """Return None; this base implementation names no charset."""
        return None

    def feed(self, aBuf):
        """Run ``aBuf`` through the state machine and distribution analyzer.

        Returns the prober state after the buffer has been processed.
        """
        aLen = len(aBuf)
        for i, unit in enumerate(aBuf):
            codingState = self._mCodingSM.next_state(unit)
            if codingState == constants.eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name()
                                     + ' prober hit error at byte ' + str(i)
                                     + '\n')
                self._mState = constants.eNotMe
                break
            if codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            if codingState == constants.eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # Pair the element remembered from the previous call
                    # with the first element of this buffer.
                    self._mLastChar[1] = aBuf[0]
                    window = self._mLastChar
                else:
                    window = aBuf[i - 1:i + 1]
                self._mDistributionAnalyzer.feed(window, charLen)

        # Remember the final element for the next call to feed().
        self._mLastChar[0] = aBuf[aLen - 1]

        if (self.get_state() == constants.eDetecting
                and self._mDistributionAnalyzer.got_enough_data()
                and self.get_confidence() > constants.SHORTCUT_THRESHOLD):
            self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the distribution analyzer's confidence."""
        analyzer = self._mDistributionAnalyzer
        return analyzer.get_confidence()
87 |
--------------------------------------------------------------------------------
/requests/packages/chardet/mbcsgroupprober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is Mozilla Universal charset detector code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 2001
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | # Shy Shalom - original C code
12 | # Proofpoint, Inc.
13 | #
14 | # This library is free software; you can redistribute it and/or
15 | # modify it under the terms of the GNU Lesser General Public
16 | # License as published by the Free Software Foundation; either
17 | # version 2.1 of the License, or (at your option) any later version.
18 | #
19 | # This library is distributed in the hope that it will be useful,
20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 | # Lesser General Public License for more details.
23 | #
24 | # You should have received a copy of the GNU Lesser General Public
25 | # License along with this library; if not, write to the Free Software
26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 | # 02110-1301 USA
28 | ######################### END LICENSE BLOCK #########################
29 |
30 | from .charsetgroupprober import CharSetGroupProber
31 | from .utf8prober import UTF8Prober
32 | from .sjisprober import SJISProber
33 | from .eucjpprober import EUCJPProber
34 | from .gb2312prober import GB2312Prober
35 | from .euckrprober import EUCKRProber
36 | from .cp949prober import CP949Prober
37 | from .big5prober import Big5Prober
38 | from .euctwprober import EUCTWProber
39 |
40 |
class MBCSGroupProber(CharSetGroupProber):
    """Group prober holding one instance of each multi-byte prober."""

    def __init__(self):
        CharSetGroupProber.__init__(self)
        # Instantiated in this order; the list order is preserved.
        proberClasses = (
            UTF8Prober,
            SJISProber,
            EUCJPProber,
            GB2312Prober,
            EUCKRProber,
            CP949Prober,
            Big5Prober,
            EUCTWProber,
        )
        self._mProbers = [proberClass() for proberClass in proberClasses]
        self.reset()
55 |
--------------------------------------------------------------------------------
/requests/packages/chardet/sbcsgroupprober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is Mozilla Universal charset detector code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 2001
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | # Shy Shalom - original C code
12 | #
13 | # This library is free software; you can redistribute it and/or
14 | # modify it under the terms of the GNU Lesser General Public
15 | # License as published by the Free Software Foundation; either
16 | # version 2.1 of the License, or (at your option) any later version.
17 | #
18 | # This library is distributed in the hope that it will be useful,
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 | # Lesser General Public License for more details.
22 | #
23 | # You should have received a copy of the GNU Lesser General Public
24 | # License along with this library; if not, write to the Free Software
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 | # 02110-1301 USA
27 | ######################### END LICENSE BLOCK #########################
28 |
29 | from .charsetgroupprober import CharSetGroupProber
30 | from .sbcharsetprober import SingleByteCharSetProber
31 | from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
32 | Latin5CyrillicModel, MacCyrillicModel,
33 | Ibm866Model, Ibm855Model)
34 | from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
35 | from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
36 | from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
37 | from .langthaimodel import TIS620ThaiModel
38 | from .langhebrewmodel import Win1255HebrewModel
39 | from .hebrewprober import HebrewProber
40 |
41 |
class SBCSGroupProber(CharSetGroupProber):
    """Group prober holding the single-byte probers and the Hebrew trio.

    One ``SingleByteCharSetProber`` is created per language model, followed
    by a ``HebrewProber`` and the two ``Win1255HebrewModel`` probers that
    are registered with it.
    """

    def __init__(self):
        CharSetGroupProber.__init__(self)
        models = (
            Win1251CyrillicModel,
            Koi8rModel,
            Latin5CyrillicModel,
            MacCyrillicModel,
            Ibm866Model,
            Ibm855Model,
            Latin7GreekModel,
            Win1253GreekModel,
            Latin5BulgarianModel,
            Win1251BulgarianModel,
            Latin2HungarianModel,
            Win1250HungarianModel,
            TIS620ThaiModel,
        )
        self._mProbers = [SingleByteCharSetProber(model) for model in models]

        # The two Win1255 probers differ only in their second argument and
        # both receive the shared HebrewProber, which is then told about
        # them via set_model_probers().
        hebrewProber = HebrewProber()
        logicalProber = SingleByteCharSetProber(Win1255HebrewModel, False,
                                                hebrewProber)
        visualProber = SingleByteCharSetProber(Win1255HebrewModel, True,
                                               hebrewProber)
        hebrewProber.set_model_probers(logicalProber, visualProber)
        for prober in (hebrewProber, logicalProber, visualProber):
            self._mProbers.append(prober)

        self.reset()
70 |
--------------------------------------------------------------------------------
/requests/packages/chardet/sjisprober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is mozilla.org code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 1998
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301 USA
26 | ######################### END LICENSE BLOCK #########################
27 |
28 | import sys
29 | from .mbcharsetprober import MultiByteCharSetProber
30 | from .codingstatemachine import CodingStateMachine
31 | from .chardistribution import SJISDistributionAnalysis
32 | from .jpcntx import SJISContextAnalysis
33 | from .mbcssm import SJISSMModel
34 | from . import constants
35 |
36 |
class SJISProber(MultiByteCharSetProber):
    """Multi-byte prober that reports itself as "SHIFT_JIS".

    Combines a coding state machine built from ``SJISSMModel`` with a
    character-distribution analyzer and a context analyzer; the confidence
    is the larger of the two analyzers' scores.
    """

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(SJISSMModel)
        self._mDistributionAnalyzer = SJISDistributionAnalysis()
        self._mContextAnalyzer = SJISContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the base prober state and the context analyzer."""
        MultiByteCharSetProber.reset(self)
        self._mContextAnalyzer.reset()

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "SHIFT_JIS"

    def feed(self, aBuf):
        """Run ``aBuf`` through the state machine and both analyzers.

        Returns the prober state after the buffer has been processed.
        """
        aLen = len(aBuf)
        for i, unit in enumerate(aBuf):
            codingState = self._mCodingSM.next_state(unit)
            if codingState == constants.eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name()
                                     + ' prober hit error at byte ' + str(i)
                                     + '\n')
                self._mState = constants.eNotMe
                break
            if codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            if codingState == constants.eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # Pair the element remembered from the previous call
                    # with the first element of this buffer.
                    self._mLastChar[1] = aBuf[0]
                    contextWindow = self._mLastChar[2 - charLen:]
                    distributionWindow = self._mLastChar
                else:
                    # The context window starts charLen - 1 elements
                    # before position i and spans two elements.
                    start = i + 1 - charLen
                    contextWindow = aBuf[start:start + 2]
                    distributionWindow = aBuf[i - 1:i + 1]
                self._mContextAnalyzer.feed(contextWindow, charLen)
                self._mDistributionAnalyzer.feed(distributionWindow, charLen)

        # Remember the final element for the next call to feed().
        self._mLastChar[0] = aBuf[aLen - 1]

        if (self.get_state() == constants.eDetecting
                and self._mContextAnalyzer.got_enough_data()
                and self.get_confidence() > constants.SHORTCUT_THRESHOLD):
            self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the higher of the context and distribution confidences."""
        contextScore = self._mContextAnalyzer.get_confidence()
        distributionScore = self._mDistributionAnalyzer.get_confidence()
        return max(contextScore, distributionScore)
92 |
--------------------------------------------------------------------------------
/requests/packages/chardet/utf8prober.py:
--------------------------------------------------------------------------------
1 | ######################## BEGIN LICENSE BLOCK ########################
2 | # The Original Code is mozilla.org code.
3 | #
4 | # The Initial Developer of the Original Code is
5 | # Netscape Communications Corporation.
6 | # Portions created by the Initial Developer are Copyright (C) 1998
7 | # the Initial Developer. All Rights Reserved.
8 | #
9 | # Contributor(s):
10 | # Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | #
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 | # Lesser General Public License for more details.
21 | #
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301 USA
26 | ######################### END LICENSE BLOCK #########################
27 |
28 | from . import constants
29 | from .charsetprober import CharSetProber
30 | from .codingstatemachine import CodingStateMachine
31 | from .mbcssm import UTF8SMModel
32 |
33 | ONE_CHAR_PROB = 0.5
34 |
35 |
class UTF8Prober(CharSetProber):
    """Prober that reports itself as "utf-8".

    Drives a coding state machine built from ``UTF8SMModel`` and counts the
    characters of length two or more that it completes.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(UTF8SMModel)
        self.reset()

    def reset(self):
        """Reset the base state, the state machine and the counter."""
        CharSetProber.reset(self)
        self._mCodingSM.reset()
        self._mNumOfMBChar = 0

    def get_charset_name(self):
        """Return the charset name reported by this prober."""
        return "utf-8"

    def feed(self, aBuf):
        """Run ``aBuf`` through the state machine and return the state."""
        for c in aBuf:
            outcome = self._mCodingSM.next_state(c)
            if outcome == constants.eError:
                self._mState = constants.eNotMe
                break
            if outcome == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            if (outcome == constants.eStart
                    and self._mCodingSM.get_current_charlen() >= 2):
                self._mNumOfMBChar += 1

        if (self.get_state() == constants.eDetecting
                and self.get_confidence() > constants.SHORTCUT_THRESHOLD):
            self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return a confidence derived from the multi-byte character count.

        With six or more counted characters the result is 0.99. Below that,
        0.99 is multiplied by ``ONE_CHAR_PROB`` once per counted character
        and the product is subtracted from 1.0.
        """
        unlike = 0.99
        if self._mNumOfMBChar >= 6:
            return unlike
        for _ in range(self._mNumOfMBChar):
            unlike *= ONE_CHAR_PROB
        return 1.0 - unlike
77 |
--------------------------------------------------------------------------------
/requests/packages/urllib3/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | urllib3 - Thread-safe connection pooling and re-using.
3 | """
4 |
5 | __author__ = 'Andrey Petrov (andrey.petrov@shazow.net)'
6 | __license__ = 'MIT'
7 | __version__ = 'dev'
8 |
9 |
10 | from .connectionpool import (
11 | HTTPConnectionPool,
12 | HTTPSConnectionPool,
13 | connection_from_url
14 | )
15 |
16 | from . import exceptions
17 | from .filepost import encode_multipart_formdata
18 | from .poolmanager import PoolManager, ProxyManager, proxy_from_url
19 | from .response import HTTPResponse
20 | from .util.request import make_headers
21 | from .util.url import get_host
22 | from .util.timeout import Timeout
23 | from .util.retry import Retry
24 |
25 |
26 | # Set default logging handler to avoid "No handler found" warnings.
27 | import logging
try:  # Python 2.7+
    from logging import NullHandler
except ImportError:
    class NullHandler(logging.Handler):
        """Fallback handler that discards every record."""

        def emit(self, record):
            """Do nothing with ``record``."""
            return None

# Attach the do-nothing handler to this package's logger.
logging.getLogger(__name__).addHandler(NullHandler())
36 |
def add_stderr_logger(level=logging.DEBUG):
    """
    Helper for quickly adding a StreamHandler to the logger. Useful for
    debugging.

    :param level: Level set on this package's logger; defaults to
        ``logging.DEBUG``.

    Returns the handler after adding it.
    """
    # This method needs to be in this __init__.py to get the __name__ correct
    # even if urllib3 is vendored within another package.
    logger = logging.getLogger(__name__)
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    logger.setLevel(level)
    # Pass the name as a logging argument so that interpolation is left to
    # the logging framework instead of being done eagerly here.
    logger.debug('Added a stderr logging handler to logger: %s', __name__)
    return handler
53 |
# The handler instance is already attached to the logger; remove the class
# name from the package namespace.
del NullHandler


# Register the 'module' action for SecurityWarning so that, by default, it
# is not repeated on every occurrence.
import warnings
warnings.simplefilter(action='module', category=exceptions.SecurityWarning)
61 |
def disable_warnings(category=exceptions.HTTPWarning):
    """
    Helper for quickly disabling all urllib3 warnings.

    Installs an ``'ignore'`` filter for ``category``, which defaults to
    ``exceptions.HTTPWarning``.
    """
    warnings.simplefilter(action='ignore', category=category)
67 |
--------------------------------------------------------------------------------
/requests/packages/urllib3/contrib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0dayCTF/astro-bot/be6dabba5e57676a4ea193d878a7e1bbc588f1ce/requests/packages/urllib3/contrib/__init__.py
--------------------------------------------------------------------------------
/requests/packages/urllib3/exceptions.py:
--------------------------------------------------------------------------------
1 |
2 | ## Base Exceptions
3 |
class HTTPError(Exception):
    """Base exception used by this module."""
7 |
class HTTPWarning(Warning):
    """Base warning used by this module."""
11 |
12 |
13 |
class PoolError(HTTPError):
    """Base exception for errors caused within a pool.

    The pool is kept on ``self.pool`` and is also prefixed to the message.
    """

    def __init__(self, pool, message):
        self.pool = pool
        text = "%s: %s" % (pool, message)
        HTTPError.__init__(self, text)

    def __reduce__(self):
        # For pickling purposes: rebuild with placeholder arguments.
        return (self.__class__, (None, None))
23 |
24 |
class RequestError(PoolError):
    """Base exception for PoolErrors that have associated URLs.

    The URL is kept on ``self.url``.
    """

    def __init__(self, pool, url, message):
        self.url = url
        PoolError.__init__(self, pool, message)

    def __reduce__(self):
        # For pickling purposes: only the URL is carried over.
        rebuildArgs = (None, self.url, None)
        return (self.__class__, rebuildArgs)
34 |
35 |
class SSLError(HTTPError):
    """Raised when SSL certificate fails in an HTTPS connection."""
39 |
40 |
class ProxyError(HTTPError):
    """Raised when the connection to a proxy fails."""
44 |
45 |
class DecodeError(HTTPError):
    """Raised when automatic decoding based on Content-Type fails."""
49 |
50 |
class ProtocolError(HTTPError):
    """Raised when something unexpected happens mid-request/response."""


#: Renamed to ProtocolError but aliased for backwards compatibility.
ConnectionError = ProtocolError
58 |
59 |
60 | ## Leaf Exceptions
61 |
class MaxRetryError(RequestError):
    """Raised when the maximum number of retries is exceeded.

    :param pool: The connection pool
    :type pool: :class:`~urllib3.connectionpool.HTTPConnectionPool`
    :param string url: The requested Url
    :param exceptions.Exception reason: The underlying error; kept on
        ``self.reason``. A falsy reason is reported as a redirect.

    """

    def __init__(self, pool, url, reason=None):
        self.reason = reason

        message = "Max retries exceeded with url: %s" % url
        cause = (" (Caused by %r)" % reason if reason
                 else " (Caused by redirect)")

        RequestError.__init__(self, pool, url, message + cause)
82 |
83 |
class HostChangedError(RequestError):
    """Raised when an existing pool gets a request for a foreign host.

    ``retries`` is kept on ``self.retries``.
    """

    def __init__(self, pool, url, retries=3):
        RequestError.__init__(
            self, pool, url,
            "Tried to open a foreign host with url: %s" % url)
        self.retries = retries
91 |
92 |
class TimeoutStateError(HTTPError):
    """Raised when passing an invalid state to a timeout."""
96 |
97 |
class TimeoutError(HTTPError):
    """Raised when a socket timeout error occurs.

    Catching this error will catch both :exc:`ReadTimeoutErrors
    <ReadTimeoutError>` and :exc:`ConnectTimeoutErrors <ConnectTimeoutError>`.
    """
105 |
106 |
class ReadTimeoutError(TimeoutError, RequestError):
    """Raised when a socket timeout occurs while receiving data from a server."""
110 |
111 |
112 | # This timeout error does not have a URL attached and needs to inherit from the
113 | # base HTTPError
class ConnectTimeoutError(TimeoutError):
    """Raised when a socket timeout occurs while connecting to a server."""
117 |
118 |
class EmptyPoolError(PoolError):
    """Raised when a pool runs out of connections and no more are allowed."""
122 |
123 |
class ClosedPoolError(PoolError):
    """Raised when a request enters a pool after the pool has been closed."""
127 |
128 |
class LocationValueError(ValueError, HTTPError):
    """Raised when there is something wrong with a given URL input."""
132 |
133 |
class LocationParseError(LocationValueError):
    """Raised when get_host or similar fails to parse the URL input.

    The offending input is kept on the instance as ``location``.
    """

    def __init__(self, location):
        HTTPError.__init__(self, "Failed to parse: %s" % location)

        self.location = location
142 |
143 |
class SecurityWarning(HTTPWarning):
    """Warned when performing security-reducing actions."""
147 |
148 |
class InsecureRequestWarning(SecurityWarning):
    """Warned when making an unverified HTTPS request."""
152 |
153 |
class SystemTimeWarning(SecurityWarning):
    """Warned when the system time is suspected to be wrong."""
157 |
--------------------------------------------------------------------------------
/requests/packages/urllib3/filepost.py:
--------------------------------------------------------------------------------
1 | import codecs
2 |
3 | from uuid import uuid4
4 | from io import BytesIO
5 |
6 | from .packages import six
7 | from .packages.six import b
8 | from .fields import RequestField
9 |
# UTF-8 stream-writer factory: element 3 of the tuple returned by
# codecs.lookup() is the codec's StreamWriter class. Calling ``writer(stream)``
# wraps ``stream`` so that text written to it is encoded as UTF-8.
writer = codecs.lookup('utf-8')[3]
11 |
12 |
def choose_boundary():
    """
    Return a fresh multipart boundary string.

    This is a deliberately simple replacement for
    ``mimetools.choose_boundary``: the hex form of a random UUID.
    """
    token = uuid4()
    return token.hex
18 |
19 |
def iter_field_objects(fields):
    """
    Yield a :class:`~urllib3.fields.RequestField` for every entry in ``fields``.

    ``fields`` may be a dict, a list of (k, v) tuples, or a list of
    :class:`~urllib3.fields.RequestField` objects. Entries that are already
    ``RequestField`` instances are yielded unchanged; anything else is
    unpacked into ``RequestField.from_tuples``.
    """
    source = six.iteritems(fields) if isinstance(fields, dict) else iter(fields)

    for entry in source:
        if not isinstance(entry, RequestField):
            entry = RequestField.from_tuples(*entry)
        yield entry
38 |
39 |
def iter_fields(fields):
    """
    .. deprecated:: 1.6

    Return a generator of (key, value) pairs from ``fields``.

    :class:`~urllib3.fields.RequestField` makes this function obsolete; use
    :func:`iter_field_objects` instead, which returns
    :class:`~urllib3.fields.RequestField` objects.

    ``fields`` may be a dict or a list of (k, v) tuples.
    """
    pairs = six.iteritems(fields) if isinstance(fields, dict) else fields
    return ((key, value) for key, value in pairs)
56 |
57 |
def encode_multipart_formdata(fields, boundary=None):
    """
    Encode ``fields`` using the multipart/form-data MIME format.

    :param fields:
        Dictionary of fields or list of (key, :class:`~urllib3.fields.RequestField`).

    :param boundary:
        If not specified, then a random boundary will be generated using
        :func:`choose_boundary`.

    :returns:
        A ``(body, content_type)`` tuple: the encoded body bytes and the
        matching ``multipart/form-data`` content-type string.
    """
    buf = BytesIO()
    if boundary is None:
        boundary = choose_boundary()

    for part in iter_field_objects(fields):
        # Opening delimiter for this part, followed by its rendered headers.
        buf.write(b('--%s\r\n' % (boundary)))
        writer(buf).write(part.render_headers())

        payload = part.data
        if isinstance(payload, int):
            payload = str(payload)  # Backwards compatibility

        # Text goes through the UTF-8 writer; anything else is written raw.
        if isinstance(payload, six.text_type):
            writer(buf).write(payload)
        else:
            buf.write(payload)

        buf.write(b'\r\n')

    # Closing delimiter.
    buf.write(b('--%s--\r\n' % (boundary)))

    content_type = str('multipart/form-data; boundary=%s' % boundary)
    return buf.getvalue(), content_type
94 |
--------------------------------------------------------------------------------
/requests/packages/urllib3/packages/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | from . import ssl_match_hostname
4 |
5 |
--------------------------------------------------------------------------------
/requests/packages/urllib3/packages/ssl_match_hostname/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 | # Python 3.2+
3 | from ssl import CertificateError, match_hostname
4 | except ImportError:
5 | try:
6 | # Backport of the function from a pypi module
7 | from backports.ssl_match_hostname import CertificateError, match_hostname
8 | except ImportError:
9 | # Our vendored copy
10 | from ._implementation import CertificateError, match_hostname
11 |
12 | # Not needed, but documenting what we provide.
13 | __all__ = ('CertificateError', 'match_hostname')
14 |
--------------------------------------------------------------------------------
/requests/packages/urllib3/packages/ssl_match_hostname/_implementation.py:
--------------------------------------------------------------------------------
1 | """The match_hostname() function from Python 3.3.3, essential when using SSL."""
2 |
3 | # Note: This file is under the PSF license as the code comes from the python
4 | # stdlib. http://docs.python.org/3/license.html
5 |
6 | import re
7 |
8 | __version__ = '3.4.0.2'
9 |
class CertificateError(ValueError):
    """Raised when a certificate cannot be matched against a hostname."""
12 |
13 |
14 | def _dnsname_match(dn, hostname, max_wildcards=1):
15 | """Matching according to RFC 6125, section 6.4.3
16 |
17 | http://tools.ietf.org/html/rfc6125#section-6.4.3
18 | """
19 | pats = []
20 | if not dn:
21 | return False
22 |
23 | # Ported from python3-syntax:
24 | # leftmost, *remainder = dn.split(r'.')
25 | parts = dn.split(r'.')
26 | leftmost = parts[0]
27 | remainder = parts[1:]
28 |
29 | wildcards = leftmost.count('*')
30 | if wildcards > max_wildcards:
31 | # Issue #17980: avoid denials of service by refusing more
32 | # than one wildcard per fragment. A survey of established
33 | # policy among SSL implementations showed it to be a
34 | # reasonable choice.
35 | raise CertificateError(
36 | "too many wildcards in certificate DNS name: " + repr(dn))
37 |
38 | # speed up common case w/o wildcards
39 | if not wildcards:
40 | return dn.lower() == hostname.lower()
41 |
42 | # RFC 6125, section 6.4.3, subitem 1.
43 | # The client SHOULD NOT attempt to match a presented identifier in which
44 | # the wildcard character comprises a label other than the left-most label.
45 | if leftmost == '*':
46 | # When '*' is a fragment by itself, it matches a non-empty dotless
47 | # fragment.
48 | pats.append('[^.]+')
49 | elif leftmost.startswith('xn--') or hostname.startswith('xn--'):
50 | # RFC 6125, section 6.4.3, subitem 3.
51 | # The client SHOULD NOT attempt to match a presented identifier
52 | # where the wildcard character is embedded within an A-label or
53 | # U-label of an internationalized domain name.
54 | pats.append(re.escape(leftmost))
55 | else:
56 | # Otherwise, '*' matches any dotless string, e.g. www*
57 | pats.append(re.escape(leftmost).replace(r'\*', '[^.]*'))
58 |
59 | # add the remaining fragments, ignore any wildcards
60 | for frag in remainder:
61 | pats.append(re.escape(frag))
62 |
63 | pat = re.compile(r'\A' + r'\.'.join(pats) + r'\Z', re.IGNORECASE)
64 | return pat.match(hostname)
65 |
66 |
def match_hostname(cert, hostname):
    """Verify that *cert* (in decoded format as returned by
    SSLSocket.getpeercert()) matches the *hostname*. RFC 2818 and RFC 6125
    rules are followed, but IP addresses are not accepted for *hostname*.

    Returns nothing on success. Raises ValueError for an empty or missing
    certificate and CertificateError when no name matches.
    """
    if not cert:
        raise ValueError("empty or no certificate")

    candidates = []

    # First pass: dNSName entries of subjectAltName.
    for kind, name in cert.get('subjectAltName', ()):
        if kind != 'DNS':
            continue
        if _dnsname_match(name, hostname):
            return
        candidates.append(name)

    if not candidates:
        # The subject is only checked when there is no dNSName entry
        # in subjectAltName
        for rdn in cert.get('subject', ()):
            for kind, name in rdn:
                # XXX according to RFC 2818, the most specific Common Name
                # must be used.
                if kind != 'commonName':
                    continue
                if _dnsname_match(name, hostname):
                    return
                candidates.append(name)

    # Nothing matched: pick the error message by how many names were seen.
    if not candidates:
        raise CertificateError("no appropriate commonName or "
                               "subjectAltName fields were found")
    if len(candidates) == 1:
        raise CertificateError("hostname %r "
                               "doesn't match %r"
                               % (hostname, candidates[0]))
    raise CertificateError("hostname %r "
                           "doesn't match either of %s"
                           % (hostname, ', '.join(map(repr, candidates))))
106 |
--------------------------------------------------------------------------------
/requests/packages/urllib3/util/__init__.py:
--------------------------------------------------------------------------------
1 | # For backwards compatibility, provide imports that used to be here.
2 | from .connection import is_connection_dropped
3 | from .request import make_headers
4 | from .response import is_fp_closed
5 | from .ssl_ import (
6 | SSLContext,
7 | HAS_SNI,
8 | assert_fingerprint,
9 | resolve_cert_reqs,
10 | resolve_ssl_version,
11 | ssl_wrap_socket,
12 | )
13 | from .timeout import (
14 | current_time,
15 | Timeout,
16 | )
17 |
18 | from .retry import Retry
19 | from .url import (
20 | get_host,
21 | parse_url,
22 | split_first,
23 | Url,
24 | )
25 |
--------------------------------------------------------------------------------
/requests/packages/urllib3/util/connection.py:
--------------------------------------------------------------------------------
1 | import socket
2 | try:
3 | from select import poll, POLLIN
4 | except ImportError: # `poll` doesn't exist on OSX and other platforms
5 | poll = False
6 | try:
7 | from select import select
8 | except ImportError: # `select` doesn't exist on AppEngine.
9 | select = False
10 |
11 |
def is_connection_dropped(conn):  # Platform-specific
    """
    Returns True if the connection is dropped and should be closed.

    :param conn:
        :class:`httplib.HTTPConnection` object.

    :returns:
        Always a real ``bool``. ``False`` means the socket has nothing
        readable; ``True`` means it is closed, errored, or has data waiting.

    Note: For platforms like AppEngine, this will always return ``False`` to
    let the platform handle connection recycling transparently for us.
    """
    sock = getattr(conn, 'sock', False)
    if sock is False:  # Platform-specific: AppEngine
        return False
    if sock is None:  # Connection already closed (such as by httplib).
        return True

    if not poll:
        if not select:  # Platform-specific: AppEngine
            return False

        try:
            # select() hands back the list of readable sockets; collapse it
            # to a bool so callers get the documented return type.
            return bool(select([sock], [], [], 0.0)[0])
        except socket.error:
            return True

    # This version is better on platforms that support it.
    p = poll()
    p.register(sock, POLLIN)
    for (fno, _event) in p.poll(0.0):
        if fno == sock.fileno():
            # Either data is buffered (bad), or the connection is dropped.
            return True

    # Nothing readable: the connection is still usable. Returned explicitly
    # rather than falling off the end with None.
    return False
44 |
45 |
46 | # This function is copied from socket.py in the Python 2.7 standard
47 | # library test suite. Added to its signature is only `socket_options`.
def create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
                      source_address=None, socket_options=None):
    """Connect to *address* and return the socket object.

    *address* is a 2-tuple ``(host, port)``. Each result of
    ``getaddrinfo`` is tried in turn and the first socket that connects is
    returned. If *timeout* is given it is set on the socket before
    connecting; otherwise the global default timeout setting returned by
    :func:`getdefaulttimeout` is used. If *source_address* is set it must
    be a tuple of (host, port) for the socket to bind as a source address
    before making the connection. An host of '' or port 0 tells the OS to
    use the default. *socket_options*, if given, are applied to each socket
    before it connects.

    If every attempt fails, the last ``socket.error`` is re-raised; if
    ``getaddrinfo`` returned nothing, a ``socket.error`` is raised.
    """

    host, port = address
    last_error = None
    for info in socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM):
        family, kind, proto, _canonname, sockaddr = info
        candidate = None
        try:
            candidate = socket.socket(family, kind, proto)

            # If provided, set socket level options before connecting.
            # This is the only addition urllib3 makes to this function.
            _set_socket_options(candidate, socket_options)

            if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                candidate.settimeout(timeout)
            if source_address:
                candidate.bind(source_address)
            candidate.connect(sockaddr)
            return candidate

        except socket.error as exc:
            # Remember the failure and release the half-built socket before
            # moving on to the next address.
            last_error = exc
            if candidate is not None:
                candidate.close()

    if last_error is None:
        raise socket.error("getaddrinfo returns an empty list")
    raise last_error
90 |
91 |
92 | def _set_socket_options(sock, options):
93 | if options is None:
94 | return
95 |
96 | for opt in options:
97 | sock.setsockopt(*opt)
98 |
--------------------------------------------------------------------------------
/requests/packages/urllib3/util/request.py:
--------------------------------------------------------------------------------
1 | from base64 import b64encode
2 |
3 | from ..packages.six import b
4 |
# Value make_headers() uses for the 'accept-encoding' header when its
# ``accept_encoding`` argument is truthy but neither a str nor a list.
ACCEPT_ENCODING = 'gzip,deflate'
6 |
7 |
def make_headers(keep_alive=None, accept_encoding=None, user_agent=None,
                 basic_auth=None, proxy_basic_auth=None, disable_cache=None):
    """
    Build a dict of common request headers from simple flags and values.

    :param keep_alive:
        If ``True``, adds 'connection: keep-alive' header.

    :param accept_encoding:
        Can be a boolean, list, or string.
        ``True`` translates to 'gzip,deflate'.
        List will get joined by comma.
        String will be used as provided.

    :param user_agent:
        String representing the user-agent you want, such as
        "python-urllib3/0.6"

    :param basic_auth:
        Colon-separated username:password string for 'authorization: basic ...'
        auth header.

    :param proxy_basic_auth:
        Colon-separated username:password string for 'proxy-authorization: basic ...'
        auth header.

    :param disable_cache:
        If ``True``, adds 'cache-control: no-cache' header.

    Example::

        >>> make_headers(keep_alive=True, user_agent="Batman/1.0")
        {'connection': 'keep-alive', 'user-agent': 'Batman/1.0'}
        >>> make_headers(accept_encoding=True)
        {'accept-encoding': 'gzip,deflate'}
    """
    def basic_credentials(userpass):
        # Base64-encode "user:password" into a Basic auth header value.
        return 'Basic ' + b64encode(b(userpass)).decode('utf-8')

    result = {}

    if accept_encoding:
        if isinstance(accept_encoding, list):
            accept_encoding = ','.join(accept_encoding)
        elif not isinstance(accept_encoding, str):
            # Any other truthy value (e.g. True) selects the default.
            accept_encoding = ACCEPT_ENCODING
        result['accept-encoding'] = accept_encoding

    if user_agent:
        result['user-agent'] = user_agent

    if keep_alive:
        result['connection'] = 'keep-alive'

    if basic_auth:
        result['authorization'] = basic_credentials(basic_auth)

    if proxy_basic_auth:
        result['proxy-authorization'] = basic_credentials(proxy_basic_auth)

    if disable_cache:
        result['cache-control'] = 'no-cache'

    return result
72 |
--------------------------------------------------------------------------------
/requests/packages/urllib3/util/response.py:
--------------------------------------------------------------------------------
def is_fp_closed(obj):
    """
    Report whether a file-like object is closed.

    :param obj:
        The file-like object to check.

    Raises ValueError when neither probe below applies to ``obj``.
    """
    probes = (
        # The official file-like-object way.
        lambda candidate: candidate.closed,
        # A container for another file-like object that gets released on
        # exhaustion (e.g. HTTPResponse).
        lambda candidate: candidate.fp is None,
    )

    for probe in probes:
        try:
            return probe(obj)
        except AttributeError:
            continue

    raise ValueError("Unable to determine whether fp is closed.")
23 |
--------------------------------------------------------------------------------
/requests/packages/urllib3/util/ssl_.py:
--------------------------------------------------------------------------------
1 | from binascii import hexlify, unhexlify
2 | from hashlib import md5, sha1
3 |
4 | from ..exceptions import SSLError
5 |
6 |
7 | try: # Test for SSL features
8 | SSLContext = None
9 | HAS_SNI = False
10 |
11 | import ssl
12 | from ssl import wrap_socket, CERT_NONE, PROTOCOL_SSLv23
13 | from ssl import SSLContext # Modern SSL?
14 | from ssl import HAS_SNI # Has SNI?
15 | except ImportError:
16 | pass
17 |
18 |
def assert_fingerprint(cert, fingerprint):
    """
    Check that ``fingerprint`` matches the digest of ``cert``.

    :param cert:
        Certificate as bytes object.
    :param fingerprint:
        Fingerprint as string of hexdigits, can be interspersed by colons.

    Returns nothing on success. Raises SSLError when the fingerprint has an
    unsupported length or does not match the certificate digest.
    """

    # The digest length (in bytes) selects the hash function to use.
    digest_for_length = {16: md5, 20: sha1}

    normalized = fingerprint.replace(':', '').lower()
    digest_length, leftover = divmod(len(normalized), 2)

    if leftover or digest_length not in digest_for_length:
        raise SSLError('Fingerprint is of invalid length.')

    # We need encode() here for py32; works on py2 and p33.
    expected = unhexlify(normalized.encode())

    actual = digest_for_length[digest_length](cert).digest()

    if actual != expected:
        raise SSLError('Fingerprints did not match. Expected "{0}", got "{1}".'
                       .format(hexlify(expected),
                               hexlify(actual)))
53 |
54 |
def resolve_cert_reqs(candidate):
    """
    Resolve ``candidate`` to a numeric constant that can be passed to the
    wrap_socket function/method from the ssl module.

    ``None`` resolves to :data:`ssl.CERT_NONE`. A string is taken as the
    name of a constant in the :mod:`ssl` module, or its abbreviation (so
    `REQUIRED` works as well as `CERT_REQUIRED`). Anything else is assumed
    to already be the numeric constant and is returned unchanged.
    """
    if candidate is None:
        return CERT_NONE

    if not isinstance(candidate, str):
        return candidate

    # Try the name as given first, then with the 'CERT_' prefix.
    resolved = getattr(ssl, candidate, None)
    if resolved is None:
        resolved = getattr(ssl, 'CERT_' + candidate)
    return resolved
76 |
77 |
def resolve_ssl_version(candidate):
    """
    Resolve ``candidate`` to an SSL protocol constant, like resolve_cert_reqs.

    ``None`` resolves to ``PROTOCOL_SSLv23``. A string is looked up on the
    :mod:`ssl` module, first as given and then with a ``PROTOCOL_`` prefix.
    Anything else is returned unchanged.
    """
    if candidate is None:
        return PROTOCOL_SSLv23

    if not isinstance(candidate, str):
        return candidate

    resolved = getattr(ssl, candidate, None)
    if resolved is None:
        resolved = getattr(ssl, 'PROTOCOL_' + candidate)
    return resolved
92 |
93 |
if SSLContext is not None:  # Python 3.2+
    def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None,
                        ca_certs=None, server_hostname=None,
                        ssl_version=None):
        """
        Wrap ``sock`` using an ``SSLContext``.

        All arguments except `server_hostname` have the same meaning as for
        :func:`ssl.wrap_socket`

        :param server_hostname:
            Hostname of the expected certificate

        Any exception raised while loading ``ca_certs`` is re-raised as
        SSLError.
        """
        ctx = SSLContext(ssl_version)
        ctx.verify_mode = cert_reqs

        # Disable TLS compression to mitigate CRIME attack (issue #309)
        no_compression_flag = 0x20000
        ctx.options |= no_compression_flag

        if ca_certs:
            try:
                ctx.load_verify_locations(ca_certs)
            # Py32 raises IOError
            # Py33 raises FileNotFoundError
            except Exception as exc:  # Reraise as SSLError
                raise SSLError(exc)

        if certfile:
            # FIXME: This block needs a test.
            ctx.load_cert_chain(certfile, keyfile)

        if not HAS_SNI:
            return ctx.wrap_socket(sock)
        # Platform-specific: OpenSSL with enabled SNI
        return ctx.wrap_socket(sock, server_hostname=server_hostname)

else:  # Python 3.1 and earlier
    def ssl_wrap_socket(sock, keyfile=None, certfile=None, cert_reqs=None,
                        ca_certs=None, server_hostname=None,
                        ssl_version=None):
        """
        Wrap ``sock`` with the module-level ``wrap_socket``.

        ``server_hostname`` is accepted for signature compatibility but is
        not passed on.
        """
        options = dict(keyfile=keyfile, certfile=certfile,
                       ca_certs=ca_certs, cert_reqs=cert_reqs,
                       ssl_version=ssl_version)
        return wrap_socket(sock, **options)
133 |
--------------------------------------------------------------------------------
/requests/status_codes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .structures import LookupDict
4 |
# Maps each HTTP status code to a tuple of alias names. The loop that follows
# this table exposes every alias as an attribute on ``codes``.
# NOTE: the alias 'precondition' is listed under both 412 and 428, so that
# attribute ends up bound to whichever of the two is assigned last.
_codes = {

    # Informational.
    100: ('continue',),
    101: ('switching_protocols',),
    102: ('processing',),
    103: ('checkpoint',),
    122: ('uri_too_long', 'request_uri_too_long'),
    # Success (2xx).
    200: ('ok', 'okay', 'all_ok', 'all_okay', 'all_good', '\\o/', '✓'),
    201: ('created',),
    202: ('accepted',),
    203: ('non_authoritative_info', 'non_authoritative_information'),
    204: ('no_content',),
    205: ('reset_content', 'reset'),
    206: ('partial_content', 'partial'),
    207: ('multi_status', 'multiple_status', 'multi_stati', 'multiple_stati'),
    208: ('already_reported',),
    226: ('im_used',),

    # Redirection.
    300: ('multiple_choices',),
    301: ('moved_permanently', 'moved', '\\o-'),
    302: ('found',),
    303: ('see_other', 'other'),
    304: ('not_modified',),
    305: ('use_proxy',),
    306: ('switch_proxy',),
    307: ('temporary_redirect', 'temporary_moved', 'temporary'),
    308: ('permanent_redirect',
          'resume_incomplete', 'resume',), # These 2 to be removed in 3.0

    # Client Error.
    400: ('bad_request', 'bad'),
    401: ('unauthorized',),
    402: ('payment_required', 'payment'),
    403: ('forbidden',),
    404: ('not_found', '-o-'),
    405: ('method_not_allowed', 'not_allowed'),
    406: ('not_acceptable',),
    407: ('proxy_authentication_required', 'proxy_auth', 'proxy_authentication'),
    408: ('request_timeout', 'timeout'),
    409: ('conflict',),
    410: ('gone',),
    411: ('length_required',),
    412: ('precondition_failed', 'precondition'),
    413: ('request_entity_too_large',),
    414: ('request_uri_too_large',),
    415: ('unsupported_media_type', 'unsupported_media', 'media_type'),
    416: ('requested_range_not_satisfiable', 'requested_range', 'range_not_satisfiable'),
    417: ('expectation_failed',),
    418: ('im_a_teapot', 'teapot', 'i_am_a_teapot'),
    422: ('unprocessable_entity', 'unprocessable'),
    423: ('locked',),
    424: ('failed_dependency', 'dependency'),
    425: ('unordered_collection', 'unordered'),
    426: ('upgrade_required', 'upgrade'),
    428: ('precondition_required', 'precondition'),
    429: ('too_many_requests', 'too_many'),
    431: ('header_fields_too_large', 'fields_too_large'),
    444: ('no_response', 'none'),
    449: ('retry_with', 'retry'),
    450: ('blocked_by_windows_parental_controls', 'parental_controls'),
    451: ('unavailable_for_legal_reasons', 'legal_reasons'),
    499: ('client_closed_request',),

    # Server Error.
    500: ('internal_server_error', 'server_error', '/o\\', '✗'),
    501: ('not_implemented',),
    502: ('bad_gateway',),
    503: ('service_unavailable', 'unavailable'),
    504: ('gateway_timeout',),
    505: ('http_version_not_supported', 'http_version'),
    506: ('variant_also_negotiates',),
    507: ('insufficient_storage',),
    509: ('bandwidth_limit_exceeded', 'bandwidth'),
    510: ('not_extended',),
}
82 |
# Lookup object exposing every alias in ``_codes`` as an attribute whose value
# is the numeric status code.
codes = LookupDict(name='status_codes')

for status, aliases in _codes.items():
    for alias in aliases:
        setattr(codes, alias, status)
        # Aliases starting with a backslash get no upper-case twin.
        if alias.startswith('\\'):
            continue
        setattr(codes, alias.upper(), status)
90 |
--------------------------------------------------------------------------------
/requests/structures.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | requests.structures
5 | ~~~~~~~~~~~~~~~~~~~
6 |
7 | Data structures that power Requests.
8 |
9 | """
10 |
11 | import collections
12 |
13 |
# The ``collections.Mapping`` / ``collections.MutableMapping`` aliases were
# removed in Python 3.10; resolve the ABCs from ``collections.abc`` and fall
# back to ``collections`` on Python 2.
try:
    from collections.abc import Mapping as _Mapping
    from collections.abc import MutableMapping as _MutableMapping
except ImportError:  # Python 2
    from collections import Mapping as _Mapping
    from collections import MutableMapping as _MutableMapping


class CaseInsensitiveDict(_MutableMapping):
    """
    A case-insensitive ``dict``-like object.

    Implements all methods and operations of
    ``collections.MutableMapping`` as well as dict's ``copy``. Also
    provides ``lower_items``.

    All keys are expected to be strings. The structure remembers the
    case of the last key to be set, and ``iter(instance)``,
    ``keys()``, ``items()``, ``iterkeys()``, and ``iteritems()``
    will contain case-sensitive keys. However, querying and contains
    testing is case insensitive::

        cid = CaseInsensitiveDict()
        cid['Accept'] = 'application/json'
        cid['aCCEPT'] == 'application/json'  # True
        list(cid) == ['Accept']  # True

    For example, ``headers['content-encoding']`` will return the
    value of a ``'Content-Encoding'`` response header, regardless
    of how the header name was originally stored.

    If the constructor, ``.update``, or equality comparison
    operations are given keys that have equal ``.lower()``s, the
    behavior is undefined.

    """
    def __init__(self, data=None, **kwargs):
        # Maps lowercased key -> (original key, value).
        self._store = dict()
        if data is None:
            data = {}
        self.update(data, **kwargs)

    def __setitem__(self, key, value):
        # Use the lowercased key for lookups, but store the actual
        # key alongside the value.
        self._store[key.lower()] = (key, value)

    def __getitem__(self, key):
        return self._store[key.lower()][1]

    def __delitem__(self, key):
        del self._store[key.lower()]

    def __iter__(self):
        # Yield keys with the casing they were last set with.
        return (casedkey for casedkey, mappedvalue in self._store.values())

    def __len__(self):
        return len(self._store)

    def lower_items(self):
        """Like iteritems(), but with all lowercase keys."""
        return (
            (lowerkey, keyval[1])
            for (lowerkey, keyval)
            in self._store.items()
        )

    def __eq__(self, other):
        if isinstance(other, _Mapping):
            other = CaseInsensitiveDict(other)
        else:
            return NotImplemented
        # Compare insensitively
        return dict(self.lower_items()) == dict(other.lower_items())

    # Copy is required
    def copy(self):
        """Return a new CaseInsensitiveDict built from the stored (key, value) pairs."""
        return CaseInsensitiveDict(self._store.values())

    def __repr__(self):
        return str(dict(self.items()))
87 |
class LookupDict(dict):
    """Dictionary lookup object.

    Item access and ``get`` read from the instance's attributes
    (``__dict__``), not from the underlying dict storage, so values set
    with ``setattr`` are reachable as ``obj[name]``.
    """

    def __init__(self, name=None):
        self.name = name
        super(LookupDict, self).__init__()

    def __repr__(self):
        # The previous format string had no placeholder, so ``'' % name``
        # raised TypeError; include the name in the representation.
        return '<lookup \'%s\'>' % (self.name)

    def __getitem__(self, key):
        # We allow fall-through here, so values default to None

        return self.__dict__.get(key, None)

    def get(self, key, default=None):
        """Return the attribute named ``key``, or ``default`` if it is not set."""
        return self.__dict__.get(key, default)
105 |
--------------------------------------------------------------------------------
/search.py:
--------------------------------------------------------------------------------
1 | import document
2 | from urllib import urlencode
3 | from httplib2 import Http
4 | import json
5 | from base64 import b64encode
6 | import secrets
7 |
def document_from_query(query):
    """Run a Bing web search for ``query`` and return the results as a document.

    Sends a GET request to the Azure DataMarket Bing Search endpoint using
    HTTP Basic auth built from ``secrets.BING_API_KEY``, reads the result
    list from ``['d']['results']`` of the JSON response, renders it as a
    small HTML page and wraps that in ``document.Document``.

    :param query: The search terms.
    :returns: A ``document.Document`` built from the generated HTML.
    """
    query_dict = {"$format": "json", "Query": "'{0}'".format(query)}
    url = "https://api.datamarket.azure.com/Bing/Search/Web?" + urlencode(query_dict)
    # The API key is used as both halves of the Basic auth credentials.
    auth_string = b64encode("{0}:{0}".format(secrets.BING_API_KEY))
    headers = {"Authorization": "Basic " + auth_string}
    response, content = Http().request(url, "GET", headers=headers)
    results = json.loads(content)['d']['results']
    # NOTE(review): the markup in these literals was lost from the file (the
    # strings were split across lines and no longer parsed). The tags below
    # are a reconstruction: the format call passes Url as {0}, which the
    # damaged template never used, so it is presumed to have been a link
    # target. Confirm against version control.
    heading = u"<h1>Web search for '{0}'</h1>".format(query)
    links = [
        u"<a href='{0}'>{1}</a> ({2})".format(r['Url'], r['Title'], r['DisplayUrl'])
        for r in results
    ]
    html = heading + u"<br/>".join(links)
    doc = document.Document(html = html)
    return doc
18 |
--------------------------------------------------------------------------------
/txtfy.py:
--------------------------------------------------------------------------------
def txtfy_word(w):
    """Return the text-speak abbreviation for ``w``, or ``w`` itself.

    The lookup is case-insensitive; words without an abbreviation are
    returned unchanged, original casing included.
    """
    abbreviations = {
        "to": "2",
        "too": "2",
        "you": "u",
        "you'll": "u'll",
        "your": "ur",
        "for": "4",
        "and": "&",
        "at": "@",
        "with": "w/",
        "before": "b4",
        "one": "1",
    }
    return abbreviations.get(w.lower(), w)
18 |
def txtfy(text):
    """Abbreviate every space-separated word of ``text`` with txtfy_word.

    The text is split on single spaces and re-joined with single spaces, so
    the spacing of the input is preserved.
    """
    return u" ".join(txtfy_word(token) for token in text.split(" "))
22 |
--------------------------------------------------------------------------------